// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
#include "zoned.h"

#define BTRFS_BLOCK_GROUP_STRIPE_MASK	(BTRFS_BLOCK_GROUP_RAID0 | \
					 BTRFS_BLOCK_GROUP_RAID10 | \
					 BTRFS_BLOCK_GROUP_RAID56_MASK)

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes = 2,
		.dev_stripes = 1,
		.devs_max = 0,	/* 0 == as many as possible */
		.devs_min = 2,
		.tolerated_failures = 1,
		.devs_increment = 2,
		.ncopies = 2,
		.nparity = 0,
		.raid_name = "raid10",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 2,
		.devs_min = 2,
		.tolerated_failures = 1,
		.devs_increment = 2,
		.ncopies = 2,
		.nparity = 0,
		.raid_name = "raid1",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 3,
		.devs_min = 3,
		.tolerated_failures = 2,
		.devs_increment = 3,
		.ncopies = 3,
		.nparity = 0,
		.raid_name = "raid1c3",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 4,
		.devs_min = 4,
		.tolerated_failures = 3,
		.devs_increment = 4,
		.ncopies = 4,
		.nparity = 0,
		.raid_name = "raid1c4",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes = 1,
		.dev_stripes = 2,
		.devs_max = 1,
		.devs_min = 1,
		.tolerated_failures = 0,
		.devs_increment = 1,
		.ncopies = 2,
		.nparity = 0,
		.raid_name = "dup",
		.bg_flag = BTRFS_BLOCK_GROUP_DUP,
		.mindev_error = 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 0,
		.devs_min = 1,
		.tolerated_failures = 0,
		.devs_increment = 1,
		.ncopies = 1,
		.nparity = 0,
		.raid_name = "raid0",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error = 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 1,
		.devs_min = 1,
		.tolerated_failures = 0,
		.devs_increment = 1,
		.ncopies = 1,
		.nparity = 0,
		.raid_name = "single",
		.bg_flag = 0,
		.mindev_error = 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 0,
		.devs_min = 2,
		.tolerated_failures = 1,
		.devs_increment = 1,
		.ncopies = 1,
		.nparity = 1,
		.raid_name = "raid5",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 0,
		.devs_min = 3,
		.tolerated_failures = 2,
		.devs_increment = 1,
		.ncopies = 1,
		.nparity = 2,
		.raid_name = "raid6",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};

/*
 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
 * can be used as an index to access btrfs_raid_array[].
 */
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
{
	if (flags & BTRFS_BLOCK_GROUP_RAID10)
		return BTRFS_RAID_RAID10;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
		return BTRFS_RAID_RAID1;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
		return BTRFS_RAID_RAID1C3;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
		return BTRFS_RAID_RAID1C4;
	else if (flags & BTRFS_BLOCK_GROUP_DUP)
		return BTRFS_RAID_DUP;
	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
		return BTRFS_RAID_RAID0;
	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
		return BTRFS_RAID_RAID5;
	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
		return BTRFS_RAID_RAID6;

	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}

const char *btrfs_bg_type_to_raid_name(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	if (index >= BTRFS_NR_RAID_TYPES)
		return NULL;

	return btrfs_raid_array[index].raid_name;
}
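
/*
 * Illustrative example (added sketch, not upstream code): callers look up
 * profile attributes by converting a block group flag to a table index, e.g.
 *
 *	enum btrfs_raid_types index;
 *
 *	index = btrfs_bg_flags_to_raid_index(BTRFS_BLOCK_GROUP_RAID10);
 *	pr_debug("%s keeps %d copies\n",
 *		 btrfs_bg_type_to_raid_name(BTRFS_BLOCK_GROUP_RAID10),
 *		 btrfs_raid_array[index].ncopies);
 */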

/*
 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 * bytes including terminating null byte.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
	int i;
	int ret;
	char *bp = buf;
	u64 flags = bg_flags;
	u32 size_bp = size_buf;

	if (!flags) {
		strcpy(bp, "NONE");
		return;
	}

#define DESCRIBE_FLAG(flag, desc)					\
	do {								\
		if (flags & (flag)) {					\
			ret = snprintf(bp, size_bp, "%s|", (desc));	\
			if (ret < 0 || ret >= size_bp)			\
				goto out_overflow;			\
			size_bp -= ret;					\
			bp += ret;					\
			flags &= ~(flag);				\
		}							\
	} while (0)

	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
			      btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

	if (flags) {
		ret = snprintf(bp, size_bp, "0x%llx|", flags);
		size_bp -= ret;
	}

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

	/*
	 * The text is trimmed, it's up to the caller to provide a sufficiently
	 * large buffer.
	 */
out_overflow:;
}
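
/*
 * Illustrative usage (added sketch, hypothetical caller): the description is
 * pipe-separated and always NUL-terminated within @size_buf, so a small
 * on-stack buffer suffices:
 *
 *	char buf[64];
 *
 *	btrfs_describe_block_groups(BTRFS_BLOCK_GROUP_DATA |
 *				    BTRFS_BLOCK_GROUP_RAID1, buf, sizeof(buf));
 *	// buf now contains "data|raid1"
 */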

static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_io_context **bioc_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list in general
 * but in mount context it could be used to exclude list modifications by eg.
 * scan ioctl
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * Is not required at mount and close times, because our device list is
 * protected by the uuid_mutex at that point.
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   device_list_mutex
 *     chunk_mutex
 *   balance_mutex
 *
 *
 * Exclusive operations
 * ====================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The status of exclusive operation is set and cleared atomically.
 * During the course of Paused state, fs_info::exclusive_operation remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * The exclusive status is cleared when the device operation is canceled or
 * completed.
 */
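
/*
 * Illustrative sketch (added example, hypothetical call site): when more than
 * one of the locks above is needed, they are taken top-down in the nesting
 * order documented here and released in reverse:
 *
 *	mutex_lock(&uuid_mutex);
 *	mutex_lock(&fs_devices->device_list_mutex);
 *	mutex_lock(&fs_info->chunk_mutex);
 *	... manipulate devices and chunks ...
 *	mutex_unlock(&fs_info->chunk_mutex);
 *	mutex_unlock(&fs_devices->device_list_mutex);
 *	mutex_unlock(&uuid_mutex);
 */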

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
						 const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	INIT_LIST_HEAD(&fs_devs->seed_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	if (metadata_fsid)
		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
	else if (fsid)
		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

void btrfs_free_device(struct btrfs_device *device)
{
	WARN_ON(!list_empty(&device->post_commit_list));
	rcu_string_free(device->name);
	extent_io_tree_release(&device->alloc_state);
	bio_put(device->flush_bio);
	btrfs_destroy_dev_zone_info(device);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}
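
/*
 * Illustrative allocation pattern (added sketch, hypothetical caller)
 * following the contract documented above: the result must be checked with
 * IS_ERR(), and as long as the struct is not linked anywhere it can be torn
 * down again with free_fs_devices() (or plain kfree()):
 *
 *	struct btrfs_fs_devices *fs_devs;
 *
 *	fs_devs = alloc_fs_devices(fsid, NULL);
 *	if (IS_ERR(fs_devs))
 *		return PTR_ERR(fs_devs);
 *	...
 *	free_fs_devices(fs_devs);
 */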

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

static noinline struct btrfs_fs_devices *find_fsid(
		const u8 *fsid, const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devices;

	ASSERT(fsid);

	/* Handle non-split brain cases */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (metadata_fsid) {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
				      BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		} else {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		}
	}
	return NULL;
}

static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by first scanning
	 * a device which didn't have its fsid/metadata_uuid changed
	 * at all and the CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}
	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by a device that
	 * has an outdated pair of fsid/metadata_uuid and
	 * CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(fs_devices->metadata_uuid,
			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct btrfs_super_block **disk_super)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		sync_blockdev(*bdev);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*disk_super = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*disk_super)) {
		ret = PTR_ERR(*disk_super);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	return ret;
}

/*
 * Check if the device whose path is recorded in @device refers to the same
 * block device as @dev_new.
 *
 * Returns:
 *   true  if it is the same device.
 *   false if it is not the same device or on error.
 */
static bool device_matched(const struct btrfs_device *device, dev_t dev_new)
{
	char *device_name;
	dev_t dev_old;
	int ret;

	/*
	 * If we are looking for a device with the matching dev_t, then skip
	 * device without a name (a missing device).
	 */
	if (!device->name)
		return false;

	device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
	if (!device_name)
		return false;

	rcu_read_lock();
	scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name));
	rcu_read_unlock();

	ret = lookup_bdev(device_name, &dev_old);
	kfree(device_name);
	if (ret)
		return false;

	if (dev_old == dev_new)
		return true;

	return false;
}

/**
 * Search and remove all stale devices (which are not mounted).
 * When both inputs are NULL, it will search and release all stale devices.
 *
 * @devt:	 Optional. When provided, it will release only the unmounted
 *		 devices matching this devt.
 * @skip_device: Optional. Will skip this device when searching for the stale
 *		 devices.
 *
 * Return:	0 for success or if @devt is 0.
 *		-EBUSY if @devt is a mounted device.
 *		-ENOENT if @devt does not match any device in the list.
 */
static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
	int ret = 0;

	lockdep_assert_held(&uuid_mutex);

	if (devt)
		ret = -ENOENT;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {

		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
				continue;
			if (devt && !device_matched(device, devt))
				continue;
			if (fs_devices->opened) {
				/* for an already deleted device return 0 */
				if (devt && ret != 0)
					ret = -EBUSY;
				break;
			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			ret = 0;
		}
		mutex_unlock(&fs_devices->device_list_mutex);

		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}

	return ret;
}

/*
 * This is only used on mount, and we are protected from competing things
 * messing with our fs_devices by the uuid_mutex, thus we do not need the
 * fs_devices->device_list_mutex here.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
				 struct btrfs_device *device, fmode_t flags,
				 void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &disk_super);
	if (ret)
		return ret;

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_free_page;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_free_page;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		if (btrfs_super_incompat_flags(disk_super) &
		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
			pr_err(
		"BTRFS: Invalid seeding and uuid-changed device detected\n");
			goto error_free_page;
		}

		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = true;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = true;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	btrfs_release_disk_super(disk_super);

	return 0;

error_free_page:
	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, flags);

	return -EINVAL;
}

/*
 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
 * being created with a disk that has already completed its fsid change. Such
 * disk can belong to an fs which has its FSID changed or to one which doesn't.
 * Handle both cases here.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, NULL);
}

static struct btrfs_fs_devices *find_fsid_changed(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handles the case where the scanned device is part of an fs that had
	 * multiple successful changes of FSID but the current device didn't
	 * observe it, meaning its fsid will be different from theirs. We need
	 * to handle two subcases:
	 * 1 - The fs still continues to have different METADATA/FSID uuids.
	 * 2 - The fs is switched back to its original FSID (METADATA/FSID
	 *     are equal).
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		/* Changed UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->fsid,
			   BTRFS_FSID_SIZE) != 0)
			return fs_devices;

		/* Unchanged UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}

	return NULL;
}

static struct btrfs_fs_devices *find_fsid_reverted_metadata(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle the case where the scanned device is part of an fs whose last
	 * metadata UUID change reverted it to the original FSID. At the same
	 * time fs_devices was first created by another constituent device
	 * which didn't fully observe the operation. This results in a
	 * btrfs_fs_devices created with metadata/fsid different AND
	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
	 * fs_devices equal to the FSID of the disk.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    fs_devices->fsid_change)
			return fs_devices;
	}

	return NULL;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

	if (fsid_change_in_progress) {
		if (!has_metadata_uuid)
			fs_devices = find_fsid_inprogress(disk_super);
		else
			fs_devices = find_fsid_changed(disk_super);
	} else if (has_metadata_uuid) {
		fs_devices = find_fsid_with_metadata_uuid(disk_super);
	} else {
		fs_devices = find_fsid_reverted_metadata(disk_super);
		if (!fs_devices)
			fs_devices = find_fsid(disk_super->fsid, NULL);
	}

	if (!fs_devices) {
		if (has_metadata_uuid)
			fs_devices = alloc_fs_devices(disk_super->fsid,
						      disk_super->metadata_uuid);
		else
			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		struct btrfs_dev_lookup_args args = {
			.devid = devid,
			.uuid = disk_super->dev_item.uuid,
		};

		mutex_lock(&fs_devices->device_list_mutex);
		device = btrfs_find_device(fs_devices, &args);

		/*
		 * If this disk has been pulled into a fs_devices created by
		 * a device which had the CHANGING_FSID_V2 flag then replace the
		 * metadata_uuid/fsid values of the fs_devices.
		 */
		if (fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
			       BTRFS_FSID_SIZE);

			if (has_metadata_uuid)
				memcpy(fs_devices->metadata_uuid,
				       disk_super->metadata_uuid,
				       BTRFS_FSID_SIZE);
			else
				memcpy(fs_devices->metadata_uuid,
				       disk_super->fsid, BTRFS_FSID_SIZE);

			fs_devices->fsid_change = false;
		}
	}

	if (!device) {
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info(
	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
		else
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *	a. The same device disappeared and reappeared with a
		 *	   different name, or
		 *	b. The missing-disk-which-was-replaced has
		 *	   reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted. We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid. We keep the one
			 * with larger generation number or the last-in if
			 * generations are equal.
			 */
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EEXIST);
		}

		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted
		 */
		if (device->bdev) {
			int error;
			dev_t path_dev;

			error = lookup_bdev(path, &path_dev);
			if (error) {
				mutex_unlock(&fs_devices->device_list_mutex);
				return ERR_PTR(error);
			}

			if (device->bdev->bd_dev != path_dev) {
				mutex_unlock(&fs_devices->device_list_mutex);
				/*
				 * device->fs_info may not be reliable here, so
				 * pass in a NULL instead. This avoids a
				 * possible use-after-free when the fs_info and
				 * fs_info->sb are already torn down.
				 */
				btrfs_warn_in_rcu(NULL,
	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
						  path, devid, found_transid,
						  current->comm,
						  task_pid_nr(current));
				return ERR_PTR(-EEXIST);
			}
			btrfs_info_in_rcu(device->fs_info,
	"devid %llu device path %s changed to %s scanned by %s (%d)",
					  devid, rcu_str_deref(device->name),
					  path, current->comm,
					  task_pid_nr(current));
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened) {
		device->generation = found_transid;
		fs_devices->latest_generation = max_t(u64, found_transid,
						fs_devices->latest_generation);
	}

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	mutex_unlock(&fs_devices->device_list_mutex);
	return device;
}
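
/*
 * Illustrative caller sketch (added example, mirroring what
 * btrfs_scan_one_device() below does under uuid_mutex): the return value is
 * either a valid device or an error pointer, and *new_device_added tells the
 * caller whether stale entries for the same block device may need cleanup:
 *
 *	bool new_device_added = false;
 *	struct btrfs_device *device;
 *
 *	device = device_list_add(path, disk_super, &new_device_added);
 *	if (IS_ERR(device))
 *		return ERR_CAST(device);
 *	if (new_device_added)
 *		btrfs_free_stale_devices(devt, device);
 */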

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
	int ret = 0;

	lockdep_assert_held(&uuid_mutex);

	fs_devices = alloc_fs_devices(orig->fsid, NULL);
	if (IS_ERR(fs_devices))
		return fs_devices;

	fs_devices->total_devices = orig->total_devices;

	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device)) {
			ret = PTR_ERR(device);
			goto error;
		}

		/*
		 * This is ok to do without the RCU read lock held because we
		 * hold the uuid mutex so nothing we touch in here is going to
		 * disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
						 GFP_KERNEL);
			if (!name) {
				btrfs_free_device(device);
				ret = -ENOMEM;
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	return fs_devices;
error:
	free_fs_devices(fs_devices);
	return ERR_PTR(ret);
}

static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
				      struct btrfs_device **latest_dev)
{
	struct btrfs_device *device, *next;

	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    !test_bit(BTRFS_DEV_STATE_MISSING,
				      &device->dev_state) &&
			    (!*latest_dev ||
			     device->generation > (*latest_dev)->generation)) {
				*latest_dev = device;
			}
			continue;
		}

		/*
		 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID
		 * in btrfs_init_dev_replace() so just continue.
		 */
		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
			continue;

		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}
}

/*
 * After we have read the system tree and know devids belonging to this
 * filesystem, remove any device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_fs_devices *seed_dev;

	mutex_lock(&uuid_mutex);
	__btrfs_free_extra_devids(fs_devices, &latest_dev);

	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
		__btrfs_free_extra_devids(seed_dev, &latest_dev);

	fs_devices->latest_dev = latest_dev;

	mutex_unlock(&uuid_mutex);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

static void btrfs_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (device->devid == BTRFS_DEV_REPLACE_DEVID)
		clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
		clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		fs_devices->missing_devices--;
	}

	btrfs_close_bdev(device);
	if (device->bdev) {
		fs_devices->open_devices--;
		device->bdev = NULL;
	}
	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	btrfs_destroy_dev_zone_info(device);

	device->fs_info = NULL;
	atomic_set(&device->dev_stats_ccnt, 0);
	extent_io_tree_release(&device->alloc_state);

	/*
	 * Reset the flush error record. We might have a transient flush error
	 * in this mount, and if so we aborted the current transaction and set
	 * the fs to an error state, guaranteeing no super blocks can be further
	 * committed. However that error might be transient and if we unmount the
	 * filesystem and mount it again, we should allow the mount to succeed
	 * (btrfs_check_rw_degradable() should not fail) - if after mounting the
	 * filesystem again we still get flush errors, then we will again abort
	 * any transaction and set the error state, guaranteeing no commits of
	 * unsafe super blocks.
	 */
	device->last_flush_error = 0;

	/* Verify the device is back in a pristine state */
	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	ASSERT(list_empty(&device->dev_alloc_list));
	ASSERT(list_empty(&device->post_commit_list));
}

static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;

	lockdep_assert_held(&uuid_mutex);

	if (--fs_devices->opened > 0)
		return;

	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
		btrfs_close_one_device(device);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = false;
	fs_devices->fs_info = NULL;
}

void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	LIST_HEAD(list);
	struct btrfs_fs_devices *tmp;

	mutex_lock(&uuid_mutex);
	close_fs_devices(fs_devices);
	if (!fs_devices->opened)
		list_splice_init(&fs_devices->seed_list, &list);

	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
		close_fs_devices(fs_devices);
		list_del(&fs_devices->seed_list);
		free_fs_devices(fs_devices);
	}
	mutex_unlock(&uuid_mutex);
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
			   fmode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_device *tmp_device;

	flags |= FMODE_EXCL;

	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
				 dev_list) {
		int ret;

		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
		if (ret == 0 &&
		    (!latest_dev || device->generation > latest_dev->generation)) {
			latest_dev = device;
		} else if (ret == -ENODATA) {
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);
		}
	}
	if (fs_devices->open_devices == 0)
		return -EINVAL;

	fs_devices->opened = 1;
	fs_devices->latest_dev = latest_dev;
	fs_devices->total_rw_bytes = 0;
	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
	fs_devices->read_policy = BTRFS_READ_POLICY_PID;

	return 0;
}

static int devid_cmp(void *priv, const struct list_head *a,
		     const struct list_head *b)
{
	const struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	lockdep_assert_held(&uuid_mutex);
	/*
	 * The device_list_mutex cannot be taken here in case opening the
	 * underlying device takes further locks like open_mutex.
	 *
	 * We also don't need the lock here as this is called during mount and
	 * exclusion is provided by uuid_mutex
	 */

	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = open_fs_devices(fs_devices, flags, holder);
	}

	return ret;
}

void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	struct page *page = virt_to_page(super);

	put_page(page);
}

static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
						       u64 bytenr, u64 bytenr_orig)
{
	struct btrfs_super_block *disk_super;
	struct page *page;
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
		return ERR_PTR(-EINVAL);

	/* make sure our super fits in the page */
	if (sizeof(*disk_super) > PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
		return ERR_PTR(-EINVAL);

	/* pull in the page with our super */
	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);

	if (IS_ERR(page))
		return ERR_CAST(page);

	p = page_address(page);

	/* align our pointer to the offset of the super block */
	disk_super = p + offset_in_page(bytenr);

	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(p);
		return ERR_PTR(-EINVAL);
	}

	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;

	return disk_super;
}

int btrfs_forget_devices(dev_t devt)
{
	int ret;

	mutex_lock(&uuid_mutex);
	ret = btrfs_free_stale_devices(devt, NULL);
	mutex_unlock(&uuid_mutex);

	return ret;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount
 * path and we are not allowed to call set_blocksize during the scan. The
 * superblock is read via the pagecache.
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
					   void *holder)
{
	struct btrfs_super_block *disk_super;
	bool new_device_added = false;
	struct btrfs_device *device = NULL;
	struct block_device *bdev;
	u64 bytenr, bytenr_orig;
	int ret;

	lockdep_assert_held(&uuid_mutex);

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	flags |= FMODE_EXCL;

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);

	bytenr_orig = btrfs_sb_offset(0);
	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
	if (ret) {
		device = ERR_PTR(ret);
		goto error_bdev_put;
	}

	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
	if (IS_ERR(disk_super)) {
		device = ERR_CAST(disk_super);
		goto error_bdev_put;
	}

	device = device_list_add(path, disk_super, &new_device_added);
	if (!IS_ERR(device) && new_device_added) {
		dev_t devt;

		/*
		 * It is ok to ignore if we fail to free the stale device (if
		 * any). As there is nothing much that can be done about it.
		 */
		if (lookup_bdev(path, &devt) == 0)
			btrfs_free_stale_devices(devt, device);
	}

	btrfs_release_disk_super(disk_super);

error_bdev_put:
	blkdev_put(bdev, flags);

	return device;
}

/*
 * Try to find a chunk that intersects [start, start + len] range and when one
 * such is found, record the end of it in *start
 */
static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
				    u64 len)
{
	u64 physical_start, physical_end;

	lockdep_assert_held(&device->fs_info->chunk_mutex);

	if (!find_first_extent_bit(&device->alloc_state, *start,
				   &physical_start, &physical_end,
				   CHUNK_ALLOCATED, NULL)) {

		if (in_range(physical_start, *start, len) ||
		    in_range(*start, physical_start,
			     physical_end - physical_start)) {
			*start = physical_end + 1;
			return true;
		}
	}
	return false;
}

static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
{
	switch (device->fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		/*
		 * We don't want to overwrite the superblock on the drive nor
		 * any area used by the boot loader (grub for example), so we
		 * make sure to start at an offset of at least 1MB.
		 */
		return max_t(u64, start, SZ_1M);
	case BTRFS_CHUNK_ALLOC_ZONED:
		/*
		 * We don't care about the starting region like regular
		 * allocator, because we anyway use/reserve the first two zones
		 * for superblock logging.
		 */
		return ALIGN(start, device->zone_info->zone_size);
	default:
		BUG();
	}
}
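
/*
 * Illustrative example (added note, hypothetical values): under
 * BTRFS_CHUNK_ALLOC_REGULAR the returned offset never falls into the first
 * megabyte, e.g.
 *
 *	start = dev_extent_search_start(device, SZ_4K);
 *
 * yields SZ_1M, while any @start already above 1MiB is returned unchanged.
 */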

static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
					u64 *hole_start, u64 *hole_size,
					u64 num_bytes)
{
	u64 zone_size = device->zone_info->zone_size;
	u64 pos;
	int ret;
	bool changed = false;

	ASSERT(IS_ALIGNED(*hole_start, zone_size));

	while (*hole_size > 0) {
		pos = btrfs_find_allocatable_zones(device, *hole_start,
						   *hole_start + *hole_size,
						   num_bytes);
		if (pos != *hole_start) {
			*hole_size = *hole_start + *hole_size - pos;
			*hole_start = pos;
			changed = true;
			if (*hole_size < num_bytes)
				break;
		}

		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);

		/* Range is ensured to be empty */
		if (!ret)
			return changed;

		/* Given hole range was invalid (outside of device) */
		if (ret == -ERANGE) {
			*hole_start += *hole_size;
			*hole_size = 0;
			return true;
		}

		*hole_start += zone_size;
		*hole_size -= zone_size;
		changed = true;
	}

	return changed;
}

/**
 * dev_extent_hole_check - check if specified hole is suitable for allocation
 * @device:	the device which we have the hole
 * @hole_start: starting position of the hole
 * @hole_size:	the size of the hole
 * @num_bytes:	the size of the free space that we need
 *
 * This function may modify @hole_start and @hole_size to reflect the suitable
 * position for allocation. Returns true if the hole position was updated,
 * false otherwise.
 */
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
				  u64 *hole_size, u64 num_bytes)
{
	bool changed = false;
	u64 hole_end = *hole_start + *hole_size;

	for (;;) {
		/*
		 * Check before we set max_hole_start, otherwise we could end up
		 * sending back this offset anyway.
		 */
		if (contains_pending_extent(device, hole_start, *hole_size)) {
			if (hole_end >= *hole_start)
				*hole_size = hole_end - *hole_start;
			else
				*hole_size = 0;
			changed = true;
		}

		switch (device->fs_devices->chunk_alloc_policy) {
		case BTRFS_CHUNK_ALLOC_REGULAR:
			/* No extra check */
			break;
		case BTRFS_CHUNK_ALLOC_ZONED:
			if (dev_extent_hole_check_zoned(device, hole_start,
							hole_size, num_bytes)) {
				changed = true;
				/*
				 * The changed hole can contain pending extent.
				 * Loop again to check that.
				 */
				continue;
			}
			break;
		default:
			BUG();
		}

		break;
	}

	return changed;
}

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space that we find, or the size of the
 *		  max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find one. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 *
 * NOTE: This function will search *commit* root of device tree, and does extra
 * check to ensure dev extents are not double allocated.
 * This makes the function safe to allocate dev extents but may not report
 * correct usable device space, as device extent freed in current transaction
 * is not reported as available.
 */
static int find_free_dev_extent_start(struct btrfs_device *device,
				      u64 num_bytes, u64 search_start, u64 *start,
				      u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	search_start = dev_extent_search_start(device, search_start);

	WARN_ON(device->zone_info &&
		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_backwards(root, &key, path);
	if (ret < 0)
		goto out;

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;
			dev_extent_hole_check(device, &search_start, &hole_size,
					      num_bytes);

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;
		if (dev_extent_hole_check(device, &search_start, &hole_size,
					  num_bytes)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
}
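
/*
 * Illustrative caller sketch (added example, hypothetical sizes; chunk_mutex
 * must be held, as contains_pending_extent() asserts): -ENOSPC means no hole
 * of the requested size exists, and *start/*len describe the best hole found:
 *
 *	u64 start = 0;
 *	u64 len = 0;
 *	int ret;
 *
 *	ret = find_free_dev_extent(device, SZ_1G, &start, &len);
 *	if (ret == 0)
 *		; // [start, start + SZ_1G) is free in the commit root
 *	else if (ret == -ENOSPC)
 *		; // the largest hole is only "len" bytes, at "start"
 */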

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				 struct btrfs_device *device,
				 u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret == 0)
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
out:
	btrfs_free_path(path);
	return ret;
}

static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map.rb_root);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}

static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	if (ret == 0) {
		/* Corruption */
		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
		ret = -EUCLEAN;
		goto error;
	}

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
			      struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	btrfs_reserve_chunk_metadata(trans, true);
	ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
				      &key, sizeof(*dev_item));
	btrfs_trans_release_chunk_metadata(trans);
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
			    ptr, BTRFS_FSID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
1934 */ 1935 static void update_dev_time(const char *device_path) 1936 { 1937 struct path path; 1938 struct timespec64 now; 1939 int ret; 1940 1941 ret = kern_path(device_path, LOOKUP_FOLLOW, &path); 1942 if (ret) 1943 return; 1944 1945 now = current_time(d_inode(path.dentry)); 1946 inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME); 1947 path_put(&path); 1948 } 1949 1950 static int btrfs_rm_dev_item(struct btrfs_device *device) 1951 { 1952 struct btrfs_root *root = device->fs_info->chunk_root; 1953 int ret; 1954 struct btrfs_path *path; 1955 struct btrfs_key key; 1956 struct btrfs_trans_handle *trans; 1957 1958 path = btrfs_alloc_path(); 1959 if (!path) 1960 return -ENOMEM; 1961 1962 trans = btrfs_start_transaction(root, 0); 1963 if (IS_ERR(trans)) { 1964 btrfs_free_path(path); 1965 return PTR_ERR(trans); 1966 } 1967 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1968 key.type = BTRFS_DEV_ITEM_KEY; 1969 key.offset = device->devid; 1970 1971 btrfs_reserve_chunk_metadata(trans, false); 1972 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1973 btrfs_trans_release_chunk_metadata(trans); 1974 if (ret) { 1975 if (ret > 0) 1976 ret = -ENOENT; 1977 btrfs_abort_transaction(trans, ret); 1978 btrfs_end_transaction(trans); 1979 goto out; 1980 } 1981 1982 ret = btrfs_del_item(trans, root, path); 1983 if (ret) { 1984 btrfs_abort_transaction(trans, ret); 1985 btrfs_end_transaction(trans); 1986 } 1987 1988 out: 1989 btrfs_free_path(path); 1990 if (!ret) 1991 ret = btrfs_commit_transaction(trans); 1992 return ret; 1993 } 1994 1995 /* 1996 * Verify that @num_devices satisfies the RAID profile constraints in the whole 1997 * filesystem. It's up to the caller to adjust that number regarding eg. device 1998 * replace. 1999 */ 2000 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, 2001 u64 num_devices) 2002 { 2003 u64 all_avail; 2004 unsigned seq; 2005 int i; 2006 2007 do { 2008 seq = read_seqbegin(&fs_info->profiles_lock); 2009 2010 all_avail = fs_info->avail_data_alloc_bits | 2011 fs_info->avail_system_alloc_bits | 2012 fs_info->avail_metadata_alloc_bits; 2013 } while (read_seqretry(&fs_info->profiles_lock, seq)); 2014 2015 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 2016 if (!(all_avail & btrfs_raid_array[i].bg_flag)) 2017 continue; 2018 2019 if (num_devices < btrfs_raid_array[i].devs_min) 2020 return btrfs_raid_array[i].mindev_error; 2021 } 2022 2023 return 0; 2024 } 2025 2026 static struct btrfs_device * btrfs_find_next_active_device( 2027 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) 2028 { 2029 struct btrfs_device *next_device; 2030 2031 list_for_each_entry(next_device, &fs_devs->devices, dev_list) { 2032 if (next_device != device && 2033 !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) 2034 && next_device->bdev) 2035 return next_device; 2036 } 2037 2038 return NULL; 2039 } 2040 2041 /* 2042 * Helper function to check if the given device is part of s_bdev / latest_dev 2043 * and replace it with the provided or the next active device, in the context 2044 * where this function called, there should be always be another device (or 2045 * this_dev) which is active. 
2046 */ 2047 void __cold btrfs_assign_next_active_device(struct btrfs_device *device, 2048 struct btrfs_device *next_device) 2049 { 2050 struct btrfs_fs_info *fs_info = device->fs_info; 2051 2052 if (!next_device) 2053 next_device = btrfs_find_next_active_device(fs_info->fs_devices, 2054 device); 2055 ASSERT(next_device); 2056 2057 if (fs_info->sb->s_bdev && 2058 (fs_info->sb->s_bdev == device->bdev)) 2059 fs_info->sb->s_bdev = next_device->bdev; 2060 2061 if (fs_info->fs_devices->latest_dev->bdev == device->bdev) 2062 fs_info->fs_devices->latest_dev = next_device; 2063 } 2064 2065 /* 2066 * Return btrfs_fs_devices::num_devices excluding the device that's being 2067 * currently replaced. 2068 */ 2069 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) 2070 { 2071 u64 num_devices = fs_info->fs_devices->num_devices; 2072 2073 down_read(&fs_info->dev_replace.rwsem); 2074 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 2075 ASSERT(num_devices > 1); 2076 num_devices--; 2077 } 2078 up_read(&fs_info->dev_replace.rwsem); 2079 2080 return num_devices; 2081 } 2082 2083 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, 2084 struct block_device *bdev, 2085 const char *device_path) 2086 { 2087 struct btrfs_super_block *disk_super; 2088 int copy_num; 2089 2090 if (!bdev) 2091 return; 2092 2093 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { 2094 struct page *page; 2095 int ret; 2096 2097 disk_super = btrfs_read_dev_one_super(bdev, copy_num); 2098 if (IS_ERR(disk_super)) 2099 continue; 2100 2101 if (bdev_is_zoned(bdev)) { 2102 btrfs_reset_sb_log_zones(bdev, copy_num); 2103 continue; 2104 } 2105 2106 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 2107 2108 page = virt_to_page(disk_super); 2109 set_page_dirty(page); 2110 lock_page(page); 2111 /* write_on_page() unlocks the page */ 2112 ret = write_one_page(page); 2113 if (ret) 2114 btrfs_warn(fs_info, 2115 "error clearing superblock number %d (%d)", 2116 copy_num, ret); 2117 btrfs_release_disk_super(disk_super); 2118 2119 } 2120 2121 /* Notify udev that device has changed */ 2122 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 2123 2124 /* Update ctime/mtime for device path for libblkid */ 2125 update_dev_time(device_path); 2126 } 2127 2128 int btrfs_rm_device(struct btrfs_fs_info *fs_info, 2129 struct btrfs_dev_lookup_args *args, 2130 struct block_device **bdev, fmode_t *mode) 2131 { 2132 struct btrfs_device *device; 2133 struct btrfs_fs_devices *cur_devices; 2134 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2135 u64 num_devices; 2136 int ret = 0; 2137 2138 /* 2139 * The device list in fs_devices is accessed without locks (neither 2140 * uuid_mutex nor device_list_mutex) as it won't change on a mounted 2141 * filesystem and another device rm cannot run. 
2142 */ 2143 num_devices = btrfs_num_devices(fs_info); 2144 2145 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 2146 if (ret) 2147 goto out; 2148 2149 device = btrfs_find_device(fs_info->fs_devices, args); 2150 if (!device) { 2151 if (args->missing) 2152 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2153 else 2154 ret = -ENOENT; 2155 goto out; 2156 } 2157 2158 if (btrfs_pinned_by_swapfile(fs_info, device)) { 2159 btrfs_warn_in_rcu(fs_info, 2160 "cannot remove device %s (devid %llu) due to active swapfile", 2161 rcu_str_deref(device->name), device->devid); 2162 ret = -ETXTBSY; 2163 goto out; 2164 } 2165 2166 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2167 ret = BTRFS_ERROR_DEV_TGT_REPLACE; 2168 goto out; 2169 } 2170 2171 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 2172 fs_info->fs_devices->rw_devices == 1) { 2173 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; 2174 goto out; 2175 } 2176 2177 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2178 mutex_lock(&fs_info->chunk_mutex); 2179 list_del_init(&device->dev_alloc_list); 2180 device->fs_devices->rw_devices--; 2181 mutex_unlock(&fs_info->chunk_mutex); 2182 } 2183 2184 ret = btrfs_shrink_device(device, 0); 2185 if (ret) 2186 goto error_undo; 2187 2188 /* 2189 * TODO: the superblock still includes this device in its num_devices 2190 * counter although write_all_supers() is not locked out. This 2191 * could give a filesystem state which requires a degraded mount. 2192 */ 2193 ret = btrfs_rm_dev_item(device); 2194 if (ret) 2195 goto error_undo; 2196 2197 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2198 btrfs_scrub_cancel_dev(device); 2199 2200 /* 2201 * the device list mutex makes sure that we don't change 2202 * the device list while someone else is writing out all 2203 * the device supers. Whoever is writing all supers, should 2204 * lock the device list mutex before getting the number of 2205 * devices in the super block (super_copy). Conversely, 2206 * whoever updates the number of devices in the super block 2207 * (super_copy) should hold the device list mutex. 2208 */ 2209 2210 /* 2211 * In normal cases the cur_devices == fs_devices. But in case 2212 * of deleting a seed device, the cur_devices should point to 2213 * its own fs_devices listed under the fs_devices->seed_list. 2214 */ 2215 cur_devices = device->fs_devices; 2216 mutex_lock(&fs_devices->device_list_mutex); 2217 list_del_rcu(&device->dev_list); 2218 2219 cur_devices->num_devices--; 2220 cur_devices->total_devices--; 2221 /* Update total_devices of the parent fs_devices if it's seed */ 2222 if (cur_devices != fs_devices) 2223 fs_devices->total_devices--; 2224 2225 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 2226 cur_devices->missing_devices--; 2227 2228 btrfs_assign_next_active_device(device, NULL); 2229 2230 if (device->bdev) { 2231 cur_devices->open_devices--; 2232 /* remove sysfs entry */ 2233 btrfs_sysfs_remove_device(device); 2234 } 2235 2236 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 2237 btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 2238 mutex_unlock(&fs_devices->device_list_mutex); 2239 2240 /* 2241 * At this point, the device is zero sized and detached from the 2242 * devices list. All that's left is to zero out the old supers and 2243 * free the device. 
2244 * 2245 * We cannot call btrfs_close_bdev() here because we're holding the sb 2246 * write lock, and blkdev_put() will pull in the ->open_mutex on the 2247 * block device and it's dependencies. Instead just flush the device 2248 * and let the caller do the final blkdev_put. 2249 */ 2250 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2251 btrfs_scratch_superblocks(fs_info, device->bdev, 2252 device->name->str); 2253 if (device->bdev) { 2254 sync_blockdev(device->bdev); 2255 invalidate_bdev(device->bdev); 2256 } 2257 } 2258 2259 *bdev = device->bdev; 2260 *mode = device->mode; 2261 synchronize_rcu(); 2262 btrfs_free_device(device); 2263 2264 /* 2265 * This can happen if cur_devices is the private seed devices list. We 2266 * cannot call close_fs_devices() here because it expects the uuid_mutex 2267 * to be held, but in fact we don't need that for the private 2268 * seed_devices, we can simply decrement cur_devices->opened and then 2269 * remove it from our list and free the fs_devices. 2270 */ 2271 if (cur_devices->num_devices == 0) { 2272 list_del_init(&cur_devices->seed_list); 2273 ASSERT(cur_devices->opened == 1); 2274 cur_devices->opened--; 2275 free_fs_devices(cur_devices); 2276 } 2277 2278 out: 2279 return ret; 2280 2281 error_undo: 2282 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2283 mutex_lock(&fs_info->chunk_mutex); 2284 list_add(&device->dev_alloc_list, 2285 &fs_devices->alloc_list); 2286 device->fs_devices->rw_devices++; 2287 mutex_unlock(&fs_info->chunk_mutex); 2288 } 2289 goto out; 2290 } 2291 2292 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) 2293 { 2294 struct btrfs_fs_devices *fs_devices; 2295 2296 lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex); 2297 2298 /* 2299 * in case of fs with no seed, srcdev->fs_devices will point 2300 * to fs_devices of fs_info. However when the dev being replaced is 2301 * a seed dev it will point to the seed's local fs_devices. In short 2302 * srcdev will have its correct fs_devices in both the cases. 2303 */ 2304 fs_devices = srcdev->fs_devices; 2305 2306 list_del_rcu(&srcdev->dev_list); 2307 list_del(&srcdev->dev_alloc_list); 2308 fs_devices->num_devices--; 2309 if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) 2310 fs_devices->missing_devices--; 2311 2312 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) 2313 fs_devices->rw_devices--; 2314 2315 if (srcdev->bdev) 2316 fs_devices->open_devices--; 2317 } 2318 2319 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) 2320 { 2321 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 2322 2323 mutex_lock(&uuid_mutex); 2324 2325 btrfs_close_bdev(srcdev); 2326 synchronize_rcu(); 2327 btrfs_free_device(srcdev); 2328 2329 /* if this is no devs we rather delete the fs_devices */ 2330 if (!fs_devices->num_devices) { 2331 /* 2332 * On a mounted FS, num_devices can't be zero unless it's a 2333 * seed. In case of a seed device being replaced, the replace 2334 * target added to the sprout FS, so there will be no more 2335 * device left under the seed FS. 
2336 */ 2337 ASSERT(fs_devices->seeding); 2338 2339 list_del_init(&fs_devices->seed_list); 2340 close_fs_devices(fs_devices); 2341 free_fs_devices(fs_devices); 2342 } 2343 mutex_unlock(&uuid_mutex); 2344 } 2345 2346 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) 2347 { 2348 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; 2349 2350 mutex_lock(&fs_devices->device_list_mutex); 2351 2352 btrfs_sysfs_remove_device(tgtdev); 2353 2354 if (tgtdev->bdev) 2355 fs_devices->open_devices--; 2356 2357 fs_devices->num_devices--; 2358 2359 btrfs_assign_next_active_device(tgtdev, NULL); 2360 2361 list_del_rcu(&tgtdev->dev_list); 2362 2363 mutex_unlock(&fs_devices->device_list_mutex); 2364 2365 btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, 2366 tgtdev->name->str); 2367 2368 btrfs_close_bdev(tgtdev); 2369 synchronize_rcu(); 2370 btrfs_free_device(tgtdev); 2371 } 2372 2373 /** 2374 * Populate args from device at path 2375 * 2376 * @fs_info: the filesystem 2377 * @args: the args to populate 2378 * @path: the path to the device 2379 * 2380 * This will read the super block of the device at @path and populate @args with 2381 * the devid, fsid, and uuid. This is meant to be used for ioctls that need to 2382 * lookup a device to operate on, but need to do it before we take any locks. 2383 * This properly handles the special case of "missing" that a user may pass in, 2384 * and does some basic sanity checks. The caller must make sure that @path is 2385 * properly NUL terminated before calling in, and must call 2386 * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and 2387 * uuid buffers. 2388 * 2389 * Return: 0 for success, -errno for failure 2390 */ 2391 int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, 2392 struct btrfs_dev_lookup_args *args, 2393 const char *path) 2394 { 2395 struct btrfs_super_block *disk_super; 2396 struct block_device *bdev; 2397 int ret; 2398 2399 if (!path || !path[0]) 2400 return -EINVAL; 2401 if (!strcmp(path, "missing")) { 2402 args->missing = true; 2403 return 0; 2404 } 2405 2406 args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL); 2407 args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL); 2408 if (!args->uuid || !args->fsid) { 2409 btrfs_put_dev_args_from_path(args); 2410 return -ENOMEM; 2411 } 2412 2413 ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0, 2414 &bdev, &disk_super); 2415 if (ret) 2416 return ret; 2417 args->devid = btrfs_stack_device_id(&disk_super->dev_item); 2418 memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); 2419 if (btrfs_fs_incompat(fs_info, METADATA_UUID)) 2420 memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); 2421 else 2422 memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 2423 btrfs_release_disk_super(disk_super); 2424 blkdev_put(bdev, FMODE_READ); 2425 return 0; 2426 } 2427 2428 /* 2429 * Only use this jointly with btrfs_get_dev_args_from_path() because we will 2430 * allocate our ->uuid and ->fsid pointers, everybody else uses local variables 2431 * that don't need to be freed. 
2432 */ 2433 void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args) 2434 { 2435 kfree(args->uuid); 2436 kfree(args->fsid); 2437 args->uuid = NULL; 2438 args->fsid = NULL; 2439 } 2440 2441 struct btrfs_device *btrfs_find_device_by_devspec( 2442 struct btrfs_fs_info *fs_info, u64 devid, 2443 const char *device_path) 2444 { 2445 BTRFS_DEV_LOOKUP_ARGS(args); 2446 struct btrfs_device *device; 2447 int ret; 2448 2449 if (devid) { 2450 args.devid = devid; 2451 device = btrfs_find_device(fs_info->fs_devices, &args); 2452 if (!device) 2453 return ERR_PTR(-ENOENT); 2454 return device; 2455 } 2456 2457 ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path); 2458 if (ret) 2459 return ERR_PTR(ret); 2460 device = btrfs_find_device(fs_info->fs_devices, &args); 2461 btrfs_put_dev_args_from_path(&args); 2462 if (!device) 2463 return ERR_PTR(-ENOENT); 2464 return device; 2465 } 2466 2467 static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info) 2468 { 2469 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2470 struct btrfs_fs_devices *old_devices; 2471 struct btrfs_fs_devices *seed_devices; 2472 2473 lockdep_assert_held(&uuid_mutex); 2474 if (!fs_devices->seeding) 2475 return ERR_PTR(-EINVAL); 2476 2477 /* 2478 * Private copy of the seed devices, anchored at 2479 * fs_info->fs_devices->seed_list 2480 */ 2481 seed_devices = alloc_fs_devices(NULL, NULL); 2482 if (IS_ERR(seed_devices)) 2483 return seed_devices; 2484 2485 /* 2486 * It's necessary to retain a copy of the original seed fs_devices in 2487 * fs_uuids so that filesystems which have been seeded can successfully 2488 * reference the seed device from open_seed_devices. This also supports 2489 * multiple fs seed. 2490 */ 2491 old_devices = clone_fs_devices(fs_devices); 2492 if (IS_ERR(old_devices)) { 2493 kfree(seed_devices); 2494 return old_devices; 2495 } 2496 2497 list_add(&old_devices->fs_list, &fs_uuids); 2498 2499 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2500 seed_devices->opened = 1; 2501 INIT_LIST_HEAD(&seed_devices->devices); 2502 INIT_LIST_HEAD(&seed_devices->alloc_list); 2503 mutex_init(&seed_devices->device_list_mutex); 2504 2505 return seed_devices; 2506 } 2507 2508 /* 2509 * Splice seed devices into the sprout fs_devices. 2510 * Generate a new fsid for the sprouted read-write filesystem. 2511 */ 2512 static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info, 2513 struct btrfs_fs_devices *seed_devices) 2514 { 2515 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2516 struct btrfs_super_block *disk_super = fs_info->super_copy; 2517 struct btrfs_device *device; 2518 u64 super_flags; 2519 2520 /* 2521 * We are updating the fsid, the thread leading to device_list_add() 2522 * could race, so uuid_mutex is needed. 2523 */ 2524 lockdep_assert_held(&uuid_mutex); 2525 2526 /* 2527 * The threads listed below may traverse dev_list but can do that without 2528 * device_list_mutex: 2529 * - All device ops and balance - as we are in btrfs_exclop_start. 2530 * - Various dev_list readers - are using RCU. 2531 * - btrfs_ioctl_fitrim() - is using RCU. 
2532 * 2533 * For-read threads as below are using device_list_mutex: 2534 * - Readonly scrub btrfs_scrub_dev() 2535 * - Readonly scrub btrfs_scrub_progress() 2536 * - btrfs_get_dev_stats() 2537 */ 2538 lockdep_assert_held(&fs_devices->device_list_mutex); 2539 2540 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 2541 synchronize_rcu); 2542 list_for_each_entry(device, &seed_devices->devices, dev_list) 2543 device->fs_devices = seed_devices; 2544 2545 fs_devices->seeding = false; 2546 fs_devices->num_devices = 0; 2547 fs_devices->open_devices = 0; 2548 fs_devices->missing_devices = 0; 2549 fs_devices->rotating = false; 2550 list_add(&seed_devices->seed_list, &fs_devices->seed_list); 2551 2552 generate_random_uuid(fs_devices->fsid); 2553 memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); 2554 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2555 2556 super_flags = btrfs_super_flags(disk_super) & 2557 ~BTRFS_SUPER_FLAG_SEEDING; 2558 btrfs_set_super_flags(disk_super, super_flags); 2559 } 2560 2561 /* 2562 * Store the expected generation for seed devices in device items. 2563 */ 2564 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) 2565 { 2566 BTRFS_DEV_LOOKUP_ARGS(args); 2567 struct btrfs_fs_info *fs_info = trans->fs_info; 2568 struct btrfs_root *root = fs_info->chunk_root; 2569 struct btrfs_path *path; 2570 struct extent_buffer *leaf; 2571 struct btrfs_dev_item *dev_item; 2572 struct btrfs_device *device; 2573 struct btrfs_key key; 2574 u8 fs_uuid[BTRFS_FSID_SIZE]; 2575 u8 dev_uuid[BTRFS_UUID_SIZE]; 2576 int ret; 2577 2578 path = btrfs_alloc_path(); 2579 if (!path) 2580 return -ENOMEM; 2581 2582 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2583 key.offset = 0; 2584 key.type = BTRFS_DEV_ITEM_KEY; 2585 2586 while (1) { 2587 btrfs_reserve_chunk_metadata(trans, false); 2588 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2589 btrfs_trans_release_chunk_metadata(trans); 2590 if (ret < 0) 2591 goto error; 2592 2593 leaf = path->nodes[0]; 2594 next_slot: 2595 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2596 ret = btrfs_next_leaf(root, path); 2597 if (ret > 0) 2598 break; 2599 if (ret < 0) 2600 goto error; 2601 leaf = path->nodes[0]; 2602 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2603 btrfs_release_path(path); 2604 continue; 2605 } 2606 2607 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2608 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2609 key.type != BTRFS_DEV_ITEM_KEY) 2610 break; 2611 2612 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2613 struct btrfs_dev_item); 2614 args.devid = btrfs_device_id(leaf, dev_item); 2615 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2616 BTRFS_UUID_SIZE); 2617 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2618 BTRFS_FSID_SIZE); 2619 args.uuid = dev_uuid; 2620 args.fsid = fs_uuid; 2621 device = btrfs_find_device(fs_info->fs_devices, &args); 2622 BUG_ON(!device); /* Logic error */ 2623 2624 if (device->fs_devices->seeding) { 2625 btrfs_set_device_generation(leaf, dev_item, 2626 device->generation); 2627 btrfs_mark_buffer_dirty(leaf); 2628 } 2629 2630 path->slots[0]++; 2631 goto next_slot; 2632 } 2633 ret = 0; 2634 error: 2635 btrfs_free_path(path); 2636 return ret; 2637 } 2638 2639 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2640 { 2641 struct btrfs_root *root = fs_info->dev_root; 2642 struct request_queue *q; 2643 struct btrfs_trans_handle *trans; 2644 struct btrfs_device *device; 2645 struct 
block_device *bdev; 2646 struct super_block *sb = fs_info->sb; 2647 struct rcu_string *name; 2648 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2649 struct btrfs_fs_devices *seed_devices; 2650 u64 orig_super_total_bytes; 2651 u64 orig_super_num_devices; 2652 int ret = 0; 2653 bool seeding_dev = false; 2654 bool locked = false; 2655 dev_t devt; 2656 2657 if (sb_rdonly(sb) && !fs_devices->seeding) 2658 return -EROFS; 2659 2660 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2661 fs_info->bdev_holder); 2662 if (IS_ERR(bdev)) 2663 return PTR_ERR(bdev); 2664 2665 if (!btrfs_check_device_zone_type(fs_info, bdev)) { 2666 ret = -EINVAL; 2667 goto error; 2668 } 2669 2670 if (fs_devices->seeding) { 2671 seeding_dev = true; 2672 down_write(&sb->s_umount); 2673 mutex_lock(&uuid_mutex); 2674 locked = true; 2675 } 2676 2677 sync_blockdev(bdev); 2678 2679 rcu_read_lock(); 2680 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { 2681 if (device->bdev == bdev) { 2682 ret = -EEXIST; 2683 rcu_read_unlock(); 2684 goto error; 2685 } 2686 } 2687 rcu_read_unlock(); 2688 2689 device = btrfs_alloc_device(fs_info, NULL, NULL); 2690 if (IS_ERR(device)) { 2691 /* we can safely leave the fs_devices entry around */ 2692 ret = PTR_ERR(device); 2693 goto error; 2694 } 2695 2696 name = rcu_string_strdup(device_path, GFP_KERNEL); 2697 if (!name) { 2698 ret = -ENOMEM; 2699 goto error_free_device; 2700 } 2701 rcu_assign_pointer(device->name, name); 2702 2703 device->fs_info = fs_info; 2704 device->bdev = bdev; 2705 2706 ret = btrfs_get_dev_zone_info(device, false); 2707 if (ret) 2708 goto error_free_device; 2709 2710 trans = btrfs_start_transaction(root, 0); 2711 if (IS_ERR(trans)) { 2712 ret = PTR_ERR(trans); 2713 goto error_free_zone; 2714 } 2715 2716 q = bdev_get_queue(bdev); 2717 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2718 device->generation = trans->transid; 2719 device->io_width = fs_info->sectorsize; 2720 device->io_align = fs_info->sectorsize; 2721 device->sector_size = fs_info->sectorsize; 2722 device->total_bytes = 2723 round_down(bdev_nr_bytes(bdev), fs_info->sectorsize); 2724 device->disk_total_bytes = device->total_bytes; 2725 device->commit_total_bytes = device->total_bytes; 2726 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2727 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2728 device->mode = FMODE_EXCL; 2729 device->dev_stats_valid = 1; 2730 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2731 2732 if (seeding_dev) { 2733 btrfs_clear_sb_rdonly(sb); 2734 2735 /* GFP_KERNEL allocation must not be under device_list_mutex */ 2736 seed_devices = btrfs_init_sprout(fs_info); 2737 if (IS_ERR(seed_devices)) { 2738 ret = PTR_ERR(seed_devices); 2739 btrfs_abort_transaction(trans, ret); 2740 goto error_trans; 2741 } 2742 } 2743 2744 mutex_lock(&fs_devices->device_list_mutex); 2745 if (seeding_dev) { 2746 btrfs_setup_sprout(fs_info, seed_devices); 2747 btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev, 2748 device); 2749 } 2750 2751 device->fs_devices = fs_devices; 2752 2753 mutex_lock(&fs_info->chunk_mutex); 2754 list_add_rcu(&device->dev_list, &fs_devices->devices); 2755 list_add(&device->dev_alloc_list, &fs_devices->alloc_list); 2756 fs_devices->num_devices++; 2757 fs_devices->open_devices++; 2758 fs_devices->rw_devices++; 2759 fs_devices->total_devices++; 2760 fs_devices->total_rw_bytes += device->total_bytes; 2761 2762 atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 2763 2764 if 
(!blk_queue_nonrot(q)) 2765 fs_devices->rotating = true; 2766 2767 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 2768 btrfs_set_super_total_bytes(fs_info->super_copy, 2769 round_down(orig_super_total_bytes + device->total_bytes, 2770 fs_info->sectorsize)); 2771 2772 orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); 2773 btrfs_set_super_num_devices(fs_info->super_copy, 2774 orig_super_num_devices + 1); 2775 2776 /* 2777 * We've got more storage, clear any full flags on the space 2778 * infos. 2779 */ 2780 btrfs_clear_space_info_full(fs_info); 2781 2782 mutex_unlock(&fs_info->chunk_mutex); 2783 2784 /* Add sysfs device entry */ 2785 btrfs_sysfs_add_device(device); 2786 2787 mutex_unlock(&fs_devices->device_list_mutex); 2788 2789 if (seeding_dev) { 2790 mutex_lock(&fs_info->chunk_mutex); 2791 ret = init_first_rw_device(trans); 2792 mutex_unlock(&fs_info->chunk_mutex); 2793 if (ret) { 2794 btrfs_abort_transaction(trans, ret); 2795 goto error_sysfs; 2796 } 2797 } 2798 2799 ret = btrfs_add_dev_item(trans, device); 2800 if (ret) { 2801 btrfs_abort_transaction(trans, ret); 2802 goto error_sysfs; 2803 } 2804 2805 if (seeding_dev) { 2806 ret = btrfs_finish_sprout(trans); 2807 if (ret) { 2808 btrfs_abort_transaction(trans, ret); 2809 goto error_sysfs; 2810 } 2811 2812 /* 2813 * fs_devices now represents the newly sprouted filesystem and 2814 * its fsid has been changed by btrfs_setup_sprout(). 2815 */ 2816 btrfs_sysfs_update_sprout_fsid(fs_devices); 2817 } 2818 2819 ret = btrfs_commit_transaction(trans); 2820 2821 if (seeding_dev) { 2822 mutex_unlock(&uuid_mutex); 2823 up_write(&sb->s_umount); 2824 locked = false; 2825 2826 if (ret) /* transaction commit */ 2827 return ret; 2828 2829 ret = btrfs_relocate_sys_chunks(fs_info); 2830 if (ret < 0) 2831 btrfs_handle_fs_error(fs_info, ret, 2832 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2833 trans = btrfs_attach_transaction(root); 2834 if (IS_ERR(trans)) { 2835 if (PTR_ERR(trans) == -ENOENT) 2836 return 0; 2837 ret = PTR_ERR(trans); 2838 trans = NULL; 2839 goto error_sysfs; 2840 } 2841 ret = btrfs_commit_transaction(trans); 2842 } 2843 2844 /* 2845 * Now that we have written a new super block to this device, check all 2846 * other fs_devices lists to see if device_path alienates any other 2847 * scanned device. 2848 * Skip btrfs_forget_devices() if lookup_bdev() fails as there is nothing 2849 * much that can be done about it. 2850 * We can ignore the return value as it typically returns -EINVAL and 2851 * only succeeds if the device was an alien. 
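 * ("Alien" meaning the device still appears in some other scanned
 * fs_devices on the fs_uuids list from a previous life.)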
2852 */ 2853 if (lookup_bdev(device_path, &devt) == 0) 2854 btrfs_forget_devices(devt); 2855 2856 /* Update ctime/mtime for blkid or udev */ 2857 update_dev_time(device_path); 2858 2859 return ret; 2860 2861 error_sysfs: 2862 btrfs_sysfs_remove_device(device); 2863 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2864 mutex_lock(&fs_info->chunk_mutex); 2865 list_del_rcu(&device->dev_list); 2866 list_del(&device->dev_alloc_list); 2867 fs_info->fs_devices->num_devices--; 2868 fs_info->fs_devices->open_devices--; 2869 fs_info->fs_devices->rw_devices--; 2870 fs_info->fs_devices->total_devices--; 2871 fs_info->fs_devices->total_rw_bytes -= device->total_bytes; 2872 atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); 2873 btrfs_set_super_total_bytes(fs_info->super_copy, 2874 orig_super_total_bytes); 2875 btrfs_set_super_num_devices(fs_info->super_copy, 2876 orig_super_num_devices); 2877 mutex_unlock(&fs_info->chunk_mutex); 2878 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2879 error_trans: 2880 if (seeding_dev) 2881 btrfs_set_sb_rdonly(sb); 2882 if (trans) 2883 btrfs_end_transaction(trans); 2884 error_free_zone: 2885 btrfs_destroy_dev_zone_info(device); 2886 error_free_device: 2887 btrfs_free_device(device); 2888 error: 2889 blkdev_put(bdev, FMODE_EXCL); 2890 if (locked) { 2891 mutex_unlock(&uuid_mutex); 2892 up_write(&sb->s_umount); 2893 } 2894 return ret; 2895 } 2896 2897 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2898 struct btrfs_device *device) 2899 { 2900 int ret; 2901 struct btrfs_path *path; 2902 struct btrfs_root *root = device->fs_info->chunk_root; 2903 struct btrfs_dev_item *dev_item; 2904 struct extent_buffer *leaf; 2905 struct btrfs_key key; 2906 2907 path = btrfs_alloc_path(); 2908 if (!path) 2909 return -ENOMEM; 2910 2911 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2912 key.type = BTRFS_DEV_ITEM_KEY; 2913 key.offset = device->devid; 2914 2915 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2916 if (ret < 0) 2917 goto out; 2918 2919 if (ret > 0) { 2920 ret = -ENOENT; 2921 goto out; 2922 } 2923 2924 leaf = path->nodes[0]; 2925 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2926 2927 btrfs_set_device_id(leaf, dev_item, device->devid); 2928 btrfs_set_device_type(leaf, dev_item, device->type); 2929 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2930 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2931 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2932 btrfs_set_device_total_bytes(leaf, dev_item, 2933 btrfs_device_get_disk_total_bytes(device)); 2934 btrfs_set_device_bytes_used(leaf, dev_item, 2935 btrfs_device_get_bytes_used(device)); 2936 btrfs_mark_buffer_dirty(leaf); 2937 2938 out: 2939 btrfs_free_path(path); 2940 return ret; 2941 } 2942 2943 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2944 struct btrfs_device *device, u64 new_size) 2945 { 2946 struct btrfs_fs_info *fs_info = device->fs_info; 2947 struct btrfs_super_block *super_copy = fs_info->super_copy; 2948 u64 old_total; 2949 u64 diff; 2950 int ret; 2951 2952 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2953 return -EACCES; 2954 2955 new_size = round_down(new_size, fs_info->sectorsize); 2956 2957 mutex_lock(&fs_info->chunk_mutex); 2958 old_total = btrfs_super_total_bytes(super_copy); 2959 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 2960 2961 if (new_size <= device->total_bytes || 2962 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 
&device->dev_state)) { 2963 mutex_unlock(&fs_info->chunk_mutex); 2964 return -EINVAL; 2965 } 2966 2967 btrfs_set_super_total_bytes(super_copy, 2968 round_down(old_total + diff, fs_info->sectorsize)); 2969 device->fs_devices->total_rw_bytes += diff; 2970 2971 btrfs_device_set_total_bytes(device, new_size); 2972 btrfs_device_set_disk_total_bytes(device, new_size); 2973 btrfs_clear_space_info_full(device->fs_info); 2974 if (list_empty(&device->post_commit_list)) 2975 list_add_tail(&device->post_commit_list, 2976 &trans->transaction->dev_update_list); 2977 mutex_unlock(&fs_info->chunk_mutex); 2978 2979 btrfs_reserve_chunk_metadata(trans, false); 2980 ret = btrfs_update_device(trans, device); 2981 btrfs_trans_release_chunk_metadata(trans); 2982 2983 return ret; 2984 } 2985 2986 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2987 { 2988 struct btrfs_fs_info *fs_info = trans->fs_info; 2989 struct btrfs_root *root = fs_info->chunk_root; 2990 int ret; 2991 struct btrfs_path *path; 2992 struct btrfs_key key; 2993 2994 path = btrfs_alloc_path(); 2995 if (!path) 2996 return -ENOMEM; 2997 2998 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2999 key.offset = chunk_offset; 3000 key.type = BTRFS_CHUNK_ITEM_KEY; 3001 3002 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3003 if (ret < 0) 3004 goto out; 3005 else if (ret > 0) { /* Logic error or corruption */ 3006 btrfs_handle_fs_error(fs_info, -ENOENT, 3007 "Failed lookup while freeing chunk."); 3008 ret = -ENOENT; 3009 goto out; 3010 } 3011 3012 ret = btrfs_del_item(trans, root, path); 3013 if (ret < 0) 3014 btrfs_handle_fs_error(fs_info, ret, 3015 "Failed to delete chunk item."); 3016 out: 3017 btrfs_free_path(path); 3018 return ret; 3019 } 3020 3021 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 3022 { 3023 struct btrfs_super_block *super_copy = fs_info->super_copy; 3024 struct btrfs_disk_key *disk_key; 3025 struct btrfs_chunk *chunk; 3026 u8 *ptr; 3027 int ret = 0; 3028 u32 num_stripes; 3029 u32 array_size; 3030 u32 len = 0; 3031 u32 cur; 3032 struct btrfs_key key; 3033 3034 lockdep_assert_held(&fs_info->chunk_mutex); 3035 array_size = btrfs_super_sys_array_size(super_copy); 3036 3037 ptr = super_copy->sys_chunk_array; 3038 cur = 0; 3039 3040 while (cur < array_size) { 3041 disk_key = (struct btrfs_disk_key *)ptr; 3042 btrfs_disk_key_to_cpu(&key, disk_key); 3043 3044 len = sizeof(*disk_key); 3045 3046 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 3047 chunk = (struct btrfs_chunk *)(ptr + len); 3048 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 3049 len += btrfs_chunk_item_size(num_stripes); 3050 } else { 3051 ret = -EIO; 3052 break; 3053 } 3054 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 3055 key.offset == chunk_offset) { 3056 memmove(ptr, ptr + len, array_size - (cur + len)); 3057 array_size -= len; 3058 btrfs_set_super_sys_array_size(super_copy, array_size); 3059 } else { 3060 ptr += len; 3061 cur += len; 3062 } 3063 } 3064 return ret; 3065 } 3066 3067 /* 3068 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. 3069 * @logical: Logical block offset in bytes. 3070 * @length: Length of extent in bytes. 3071 * 3072 * Return: Chunk mapping or ERR_PTR. 
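 *
 * Typical use, as in btrfs_remove_chunk() below (sketch):
 *
 *	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	map = em->map_lookup;
 *	...
 *	free_extent_map(em);	(drops the reference we were given)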
3073 */ 3074 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, 3075 u64 logical, u64 length) 3076 { 3077 struct extent_map_tree *em_tree; 3078 struct extent_map *em; 3079 3080 em_tree = &fs_info->mapping_tree; 3081 read_lock(&em_tree->lock); 3082 em = lookup_extent_mapping(em_tree, logical, length); 3083 read_unlock(&em_tree->lock); 3084 3085 if (!em) { 3086 btrfs_crit(fs_info, "unable to find logical %llu length %llu", 3087 logical, length); 3088 return ERR_PTR(-EINVAL); 3089 } 3090 3091 if (em->start > logical || em->start + em->len < logical) { 3092 btrfs_crit(fs_info, 3093 "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 3094 logical, length, em->start, em->start + em->len); 3095 free_extent_map(em); 3096 return ERR_PTR(-EINVAL); 3097 } 3098 3099 /* callers are responsible for dropping em's ref. */ 3100 return em; 3101 } 3102 3103 static int remove_chunk_item(struct btrfs_trans_handle *trans, 3104 struct map_lookup *map, u64 chunk_offset) 3105 { 3106 int i; 3107 3108 /* 3109 * Removing chunk items and updating the device items in the chunks btree 3110 * requires holding the chunk_mutex. 3111 * See the comment at btrfs_chunk_alloc() for the details. 3112 */ 3113 lockdep_assert_held(&trans->fs_info->chunk_mutex); 3114 3115 for (i = 0; i < map->num_stripes; i++) { 3116 int ret; 3117 3118 ret = btrfs_update_device(trans, map->stripes[i].dev); 3119 if (ret) 3120 return ret; 3121 } 3122 3123 return btrfs_free_chunk(trans, chunk_offset); 3124 } 3125 3126 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 3127 { 3128 struct btrfs_fs_info *fs_info = trans->fs_info; 3129 struct extent_map *em; 3130 struct map_lookup *map; 3131 u64 dev_extent_len = 0; 3132 int i, ret = 0; 3133 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 3134 3135 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 3136 if (IS_ERR(em)) { 3137 /* 3138 * This is a logic error, but we don't want to just rely on the 3139 * user having built with ASSERT enabled, so if ASSERT doesn't 3140 * do anything we still error out. 3141 */ 3142 ASSERT(0); 3143 return PTR_ERR(em); 3144 } 3145 map = em->map_lookup; 3146 3147 /* 3148 * First delete the device extent items from the devices btree. 3149 * We take the device_list_mutex to avoid racing with the finishing phase 3150 * of a device replace operation. See the comment below before acquiring 3151 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex 3152 * because that can result in a deadlock when deleting the device extent 3153 * items from the devices btree - COWing an extent buffer from the btree 3154 * may result in allocating a new metadata chunk, which would attempt to 3155 * lock again fs_info->chunk_mutex. 
3156 */ 3157 mutex_lock(&fs_devices->device_list_mutex); 3158 for (i = 0; i < map->num_stripes; i++) { 3159 struct btrfs_device *device = map->stripes[i].dev; 3160 ret = btrfs_free_dev_extent(trans, device, 3161 map->stripes[i].physical, 3162 &dev_extent_len); 3163 if (ret) { 3164 mutex_unlock(&fs_devices->device_list_mutex); 3165 btrfs_abort_transaction(trans, ret); 3166 goto out; 3167 } 3168 3169 if (device->bytes_used > 0) { 3170 mutex_lock(&fs_info->chunk_mutex); 3171 btrfs_device_set_bytes_used(device, 3172 device->bytes_used - dev_extent_len); 3173 atomic64_add(dev_extent_len, &fs_info->free_chunk_space); 3174 btrfs_clear_space_info_full(fs_info); 3175 mutex_unlock(&fs_info->chunk_mutex); 3176 } 3177 } 3178 mutex_unlock(&fs_devices->device_list_mutex); 3179 3180 /* 3181 * We acquire fs_info->chunk_mutex for 2 reasons: 3182 * 3183 * 1) Just like with the first phase of the chunk allocation, we must 3184 * reserve system space, do all chunk btree updates and deletions, and 3185 * update the system chunk array in the superblock while holding this 3186 * mutex. This is for similar reasons as explained on the comment at 3187 * the top of btrfs_chunk_alloc(); 3188 * 3189 * 2) Prevent races with the final phase of a device replace operation 3190 * that replaces the device object associated with the map's stripes, 3191 * because the device object's id can change at any time during that 3192 * final phase of the device replace operation 3193 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 3194 * replaced device and then see it with an ID of 3195 * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating 3196 * the device item, which does not exists on the chunk btree. 3197 * The finishing phase of device replace acquires both the 3198 * device_list_mutex and the chunk_mutex, in that order, so we are 3199 * safe by just acquiring the chunk_mutex. 3200 */ 3201 trans->removing_chunk = true; 3202 mutex_lock(&fs_info->chunk_mutex); 3203 3204 check_system_chunk(trans, map->type); 3205 3206 ret = remove_chunk_item(trans, map, chunk_offset); 3207 /* 3208 * Normally we should not get -ENOSPC since we reserved space before 3209 * through the call to check_system_chunk(). 3210 * 3211 * Despite our system space_info having enough free space, we may not 3212 * be able to allocate extents from its block groups, because all have 3213 * an incompatible profile, which will force us to allocate a new system 3214 * block group with the right profile, or right after we called 3215 * check_system_space() above, a scrub turned the only system block group 3216 * with enough free space into RO mode. 3217 * This is explained with more detail at do_chunk_alloc(). 3218 * 3219 * So if we get -ENOSPC, allocate a new system chunk and retry once. 
3220 */ 3221 if (ret == -ENOSPC) { 3222 const u64 sys_flags = btrfs_system_alloc_profile(fs_info); 3223 struct btrfs_block_group *sys_bg; 3224 3225 sys_bg = btrfs_create_chunk(trans, sys_flags); 3226 if (IS_ERR(sys_bg)) { 3227 ret = PTR_ERR(sys_bg); 3228 btrfs_abort_transaction(trans, ret); 3229 goto out; 3230 } 3231 3232 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); 3233 if (ret) { 3234 btrfs_abort_transaction(trans, ret); 3235 goto out; 3236 } 3237 3238 ret = remove_chunk_item(trans, map, chunk_offset); 3239 if (ret) { 3240 btrfs_abort_transaction(trans, ret); 3241 goto out; 3242 } 3243 } else if (ret) { 3244 btrfs_abort_transaction(trans, ret); 3245 goto out; 3246 } 3247 3248 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 3249 3250 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3251 ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 3252 if (ret) { 3253 btrfs_abort_transaction(trans, ret); 3254 goto out; 3255 } 3256 } 3257 3258 mutex_unlock(&fs_info->chunk_mutex); 3259 trans->removing_chunk = false; 3260 3261 /* 3262 * We are done with chunk btree updates and deletions, so release the 3263 * system space we previously reserved (with check_system_chunk()). 3264 */ 3265 btrfs_trans_release_chunk_metadata(trans); 3266 3267 ret = btrfs_remove_block_group(trans, chunk_offset, em); 3268 if (ret) { 3269 btrfs_abort_transaction(trans, ret); 3270 goto out; 3271 } 3272 3273 out: 3274 if (trans->removing_chunk) { 3275 mutex_unlock(&fs_info->chunk_mutex); 3276 trans->removing_chunk = false; 3277 } 3278 /* once for us */ 3279 free_extent_map(em); 3280 return ret; 3281 } 3282 3283 int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 3284 { 3285 struct btrfs_root *root = fs_info->chunk_root; 3286 struct btrfs_trans_handle *trans; 3287 struct btrfs_block_group *block_group; 3288 u64 length; 3289 int ret; 3290 3291 /* 3292 * Prevent races with automatic removal of unused block groups. 3293 * After we relocate and before we remove the chunk with offset 3294 * chunk_offset, automatic removal of the block group can kick in, 3295 * resulting in a failure when calling btrfs_remove_chunk() below. 3296 * 3297 * Make sure to acquire this mutex before doing a tree search (dev 3298 * or chunk trees) to find chunks. Otherwise the cleaner kthread might 3299 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 3300 * we release the path used to search the chunk/dev tree and before 3301 * the current task acquires this mutex and calls us. 3302 */ 3303 lockdep_assert_held(&fs_info->reclaim_bgs_lock); 3304 3305 /* step one, relocate all the extents inside this chunk */ 3306 btrfs_scrub_pause(fs_info); 3307 ret = btrfs_relocate_block_group(fs_info, chunk_offset); 3308 btrfs_scrub_continue(fs_info); 3309 if (ret) 3310 return ret; 3311 3312 block_group = btrfs_lookup_block_group(fs_info, chunk_offset); 3313 if (!block_group) 3314 return -ENOENT; 3315 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); 3316 length = block_group->length; 3317 btrfs_put_block_group(block_group); 3318 3319 /* 3320 * On a zoned file system, discard the whole block group, this will 3321 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If 3322 * resetting the zone fails, don't treat it as a fatal problem from the 3323 * filesystem's point of view. 
3324 */ 3325 if (btrfs_is_zoned(fs_info)) { 3326 ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); 3327 if (ret) 3328 btrfs_info(fs_info, 3329 "failed to reset zone %llu after relocation", 3330 chunk_offset); 3331 } 3332 3333 trans = btrfs_start_trans_remove_block_group(root->fs_info, 3334 chunk_offset); 3335 if (IS_ERR(trans)) { 3336 ret = PTR_ERR(trans); 3337 btrfs_handle_fs_error(root->fs_info, ret, NULL); 3338 return ret; 3339 } 3340 3341 /* 3342 * step two, delete the device extents and the 3343 * chunk tree entries 3344 */ 3345 ret = btrfs_remove_chunk(trans, chunk_offset); 3346 btrfs_end_transaction(trans); 3347 return ret; 3348 } 3349 3350 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 3351 { 3352 struct btrfs_root *chunk_root = fs_info->chunk_root; 3353 struct btrfs_path *path; 3354 struct extent_buffer *leaf; 3355 struct btrfs_chunk *chunk; 3356 struct btrfs_key key; 3357 struct btrfs_key found_key; 3358 u64 chunk_type; 3359 bool retried = false; 3360 int failed = 0; 3361 int ret; 3362 3363 path = btrfs_alloc_path(); 3364 if (!path) 3365 return -ENOMEM; 3366 3367 again: 3368 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3369 key.offset = (u64)-1; 3370 key.type = BTRFS_CHUNK_ITEM_KEY; 3371 3372 while (1) { 3373 mutex_lock(&fs_info->reclaim_bgs_lock); 3374 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3375 if (ret < 0) { 3376 mutex_unlock(&fs_info->reclaim_bgs_lock); 3377 goto error; 3378 } 3379 BUG_ON(ret == 0); /* Corruption */ 3380 3381 ret = btrfs_previous_item(chunk_root, path, key.objectid, 3382 key.type); 3383 if (ret) 3384 mutex_unlock(&fs_info->reclaim_bgs_lock); 3385 if (ret < 0) 3386 goto error; 3387 if (ret > 0) 3388 break; 3389 3390 leaf = path->nodes[0]; 3391 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3392 3393 chunk = btrfs_item_ptr(leaf, path->slots[0], 3394 struct btrfs_chunk); 3395 chunk_type = btrfs_chunk_type(leaf, chunk); 3396 btrfs_release_path(path); 3397 3398 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 3399 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3400 if (ret == -ENOSPC) 3401 failed++; 3402 else 3403 BUG_ON(ret); 3404 } 3405 mutex_unlock(&fs_info->reclaim_bgs_lock); 3406 3407 if (found_key.offset == 0) 3408 break; 3409 key.offset = found_key.offset - 1; 3410 } 3411 ret = 0; 3412 if (failed && !retried) { 3413 failed = 0; 3414 retried = true; 3415 goto again; 3416 } else if (WARN_ON(failed && retried)) { 3417 ret = -ENOSPC; 3418 } 3419 error: 3420 btrfs_free_path(path); 3421 return ret; 3422 } 3423 3424 /* 3425 * return 1 : allocate a data chunk successfully, 3426 * return <0: errors during allocating a data chunk, 3427 * return 0 : no need to allocate a data chunk. 
3428 */ 3429 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 3430 u64 chunk_offset) 3431 { 3432 struct btrfs_block_group *cache; 3433 u64 bytes_used; 3434 u64 chunk_type; 3435 3436 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3437 ASSERT(cache); 3438 chunk_type = cache->flags; 3439 btrfs_put_block_group(cache); 3440 3441 if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) 3442 return 0; 3443 3444 spin_lock(&fs_info->data_sinfo->lock); 3445 bytes_used = fs_info->data_sinfo->bytes_used; 3446 spin_unlock(&fs_info->data_sinfo->lock); 3447 3448 if (!bytes_used) { 3449 struct btrfs_trans_handle *trans; 3450 int ret; 3451 3452 trans = btrfs_join_transaction(fs_info->tree_root); 3453 if (IS_ERR(trans)) 3454 return PTR_ERR(trans); 3455 3456 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); 3457 btrfs_end_transaction(trans); 3458 if (ret < 0) 3459 return ret; 3460 return 1; 3461 } 3462 3463 return 0; 3464 } 3465 3466 static int insert_balance_item(struct btrfs_fs_info *fs_info, 3467 struct btrfs_balance_control *bctl) 3468 { 3469 struct btrfs_root *root = fs_info->tree_root; 3470 struct btrfs_trans_handle *trans; 3471 struct btrfs_balance_item *item; 3472 struct btrfs_disk_balance_args disk_bargs; 3473 struct btrfs_path *path; 3474 struct extent_buffer *leaf; 3475 struct btrfs_key key; 3476 int ret, err; 3477 3478 path = btrfs_alloc_path(); 3479 if (!path) 3480 return -ENOMEM; 3481 3482 trans = btrfs_start_transaction(root, 0); 3483 if (IS_ERR(trans)) { 3484 btrfs_free_path(path); 3485 return PTR_ERR(trans); 3486 } 3487 3488 key.objectid = BTRFS_BALANCE_OBJECTID; 3489 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3490 key.offset = 0; 3491 3492 ret = btrfs_insert_empty_item(trans, root, path, &key, 3493 sizeof(*item)); 3494 if (ret) 3495 goto out; 3496 3497 leaf = path->nodes[0]; 3498 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3499 3500 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3501 3502 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 3503 btrfs_set_balance_data(leaf, item, &disk_bargs); 3504 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 3505 btrfs_set_balance_meta(leaf, item, &disk_bargs); 3506 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 3507 btrfs_set_balance_sys(leaf, item, &disk_bargs); 3508 3509 btrfs_set_balance_flags(leaf, item, bctl->flags); 3510 3511 btrfs_mark_buffer_dirty(leaf); 3512 out: 3513 btrfs_free_path(path); 3514 err = btrfs_commit_transaction(trans); 3515 if (err && !ret) 3516 ret = err; 3517 return ret; 3518 } 3519 3520 static int del_balance_item(struct btrfs_fs_info *fs_info) 3521 { 3522 struct btrfs_root *root = fs_info->tree_root; 3523 struct btrfs_trans_handle *trans; 3524 struct btrfs_path *path; 3525 struct btrfs_key key; 3526 int ret, err; 3527 3528 path = btrfs_alloc_path(); 3529 if (!path) 3530 return -ENOMEM; 3531 3532 trans = btrfs_start_transaction_fallback_global_rsv(root, 0); 3533 if (IS_ERR(trans)) { 3534 btrfs_free_path(path); 3535 return PTR_ERR(trans); 3536 } 3537 3538 key.objectid = BTRFS_BALANCE_OBJECTID; 3539 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3540 key.offset = 0; 3541 3542 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3543 if (ret < 0) 3544 goto out; 3545 if (ret > 0) { 3546 ret = -ENOENT; 3547 goto out; 3548 } 3549 3550 ret = btrfs_del_item(trans, root, path); 3551 out: 3552 btrfs_free_path(path); 3553 err = btrfs_commit_transaction(trans); 3554 if (err && !ret) 3555 ret = err; 3556 return ret; 3557 } 3558 3559 /* 3560 * This is a 
heuristic used to reduce the number of chunks balanced on 3561 * resume after balance was interrupted. 3562 */ 3563 static void update_balance_args(struct btrfs_balance_control *bctl) 3564 { 3565 /* 3566 * Turn on soft mode for chunk types that were being converted. 3567 */ 3568 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 3569 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 3570 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 3571 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 3572 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 3573 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 3574 3575 /* 3576 * Turn on the usage filter if it is not already used. The idea is 3577 * that chunks that we have already balanced should be 3578 * reasonably full. Don't do it for chunks that are being 3579 * converted - that will keep us from relocating unconverted 3580 * (albeit full) chunks. 3581 */ 3582 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 3583 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3584 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3585 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 3586 bctl->data.usage = 90; 3587 } 3588 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 3589 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3590 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3591 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 3592 bctl->sys.usage = 90; 3593 } 3594 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 3595 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3596 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3597 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 3598 bctl->meta.usage = 90; 3599 } 3600 } 3601 3602 /* 3603 * Clear the balance status in fs_info and delete the balance item from disk. 3604 */ 3605 static void reset_balance_state(struct btrfs_fs_info *fs_info) 3606 { 3607 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3608 int ret; 3609 3610 BUG_ON(!fs_info->balance_ctl); 3611 3612 spin_lock(&fs_info->balance_lock); 3613 fs_info->balance_ctl = NULL; 3614 spin_unlock(&fs_info->balance_lock); 3615 3616 kfree(bctl); 3617 ret = del_balance_item(fs_info); 3618 if (ret) 3619 btrfs_handle_fs_error(fs_info, ret, NULL); 3620 } 3621 3622 /* 3623 * Balance filters. Return 1 if chunk should be filtered out 3624 * (should not be balanced). 
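 * For example, with "btrfs balance start -dusage=90" the usage filter
 * below returns 0 (balance it) for a data chunk that is less than 90%
 * full and 1 (skip it) otherwise.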
3625 */ 3626 static int chunk_profiles_filter(u64 chunk_type, 3627 struct btrfs_balance_args *bargs) 3628 { 3629 chunk_type = chunk_to_extended(chunk_type) & 3630 BTRFS_EXTENDED_PROFILE_MASK; 3631 3632 if (bargs->profiles & chunk_type) 3633 return 0; 3634 3635 return 1; 3636 } 3637 3638 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3639 struct btrfs_balance_args *bargs) 3640 { 3641 struct btrfs_block_group *cache; 3642 u64 chunk_used; 3643 u64 user_thresh_min; 3644 u64 user_thresh_max; 3645 int ret = 1; 3646 3647 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3648 chunk_used = cache->used; 3649 3650 if (bargs->usage_min == 0) 3651 user_thresh_min = 0; 3652 else 3653 user_thresh_min = div_factor_fine(cache->length, 3654 bargs->usage_min); 3655 3656 if (bargs->usage_max == 0) 3657 user_thresh_max = 1; 3658 else if (bargs->usage_max > 100) 3659 user_thresh_max = cache->length; 3660 else 3661 user_thresh_max = div_factor_fine(cache->length, 3662 bargs->usage_max); 3663 3664 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3665 ret = 0; 3666 3667 btrfs_put_block_group(cache); 3668 return ret; 3669 } 3670 3671 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3672 u64 chunk_offset, struct btrfs_balance_args *bargs) 3673 { 3674 struct btrfs_block_group *cache; 3675 u64 chunk_used, user_thresh; 3676 int ret = 1; 3677 3678 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3679 chunk_used = cache->used; 3680 3681 if (bargs->usage_min == 0) 3682 user_thresh = 1; 3683 else if (bargs->usage > 100) 3684 user_thresh = cache->length; 3685 else 3686 user_thresh = div_factor_fine(cache->length, bargs->usage); 3687 3688 if (chunk_used < user_thresh) 3689 ret = 0; 3690 3691 btrfs_put_block_group(cache); 3692 return ret; 3693 } 3694 3695 static int chunk_devid_filter(struct extent_buffer *leaf, 3696 struct btrfs_chunk *chunk, 3697 struct btrfs_balance_args *bargs) 3698 { 3699 struct btrfs_stripe *stripe; 3700 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3701 int i; 3702 3703 for (i = 0; i < num_stripes; i++) { 3704 stripe = btrfs_stripe_nr(chunk, i); 3705 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3706 return 0; 3707 } 3708 3709 return 1; 3710 } 3711 3712 static u64 calc_data_stripes(u64 type, int num_stripes) 3713 { 3714 const int index = btrfs_bg_flags_to_raid_index(type); 3715 const int ncopies = btrfs_raid_array[index].ncopies; 3716 const int nparity = btrfs_raid_array[index].nparity; 3717 3718 return (num_stripes - nparity) / ncopies; 3719 } 3720 3721 /* [pstart, pend) */ 3722 static int chunk_drange_filter(struct extent_buffer *leaf, 3723 struct btrfs_chunk *chunk, 3724 struct btrfs_balance_args *bargs) 3725 { 3726 struct btrfs_stripe *stripe; 3727 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3728 u64 stripe_offset; 3729 u64 stripe_length; 3730 u64 type; 3731 int factor; 3732 int i; 3733 3734 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3735 return 0; 3736 3737 type = btrfs_chunk_type(leaf, chunk); 3738 factor = calc_data_stripes(type, num_stripes); 3739 3740 for (i = 0; i < num_stripes; i++) { 3741 stripe = btrfs_stripe_nr(chunk, i); 3742 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3743 continue; 3744 3745 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3746 stripe_length = btrfs_chunk_length(leaf, chunk); 3747 stripe_length = div_u64(stripe_length, factor); 3748 3749 if (stripe_offset < bargs->pend && 3750 stripe_offset + stripe_length > bargs->pstart) 3751 return 0; 
3752 } 3753 3754 return 1; 3755 } 3756 3757 /* [vstart, vend) */ 3758 static int chunk_vrange_filter(struct extent_buffer *leaf, 3759 struct btrfs_chunk *chunk, 3760 u64 chunk_offset, 3761 struct btrfs_balance_args *bargs) 3762 { 3763 if (chunk_offset < bargs->vend && 3764 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3765 /* at least part of the chunk is inside this vrange */ 3766 return 0; 3767 3768 return 1; 3769 } 3770 3771 static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3772 struct btrfs_chunk *chunk, 3773 struct btrfs_balance_args *bargs) 3774 { 3775 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3776 3777 if (bargs->stripes_min <= num_stripes 3778 && num_stripes <= bargs->stripes_max) 3779 return 0; 3780 3781 return 1; 3782 } 3783 3784 static int chunk_soft_convert_filter(u64 chunk_type, 3785 struct btrfs_balance_args *bargs) 3786 { 3787 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3788 return 0; 3789 3790 chunk_type = chunk_to_extended(chunk_type) & 3791 BTRFS_EXTENDED_PROFILE_MASK; 3792 3793 if (bargs->target == chunk_type) 3794 return 1; 3795 3796 return 0; 3797 } 3798 3799 static int should_balance_chunk(struct extent_buffer *leaf, 3800 struct btrfs_chunk *chunk, u64 chunk_offset) 3801 { 3802 struct btrfs_fs_info *fs_info = leaf->fs_info; 3803 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3804 struct btrfs_balance_args *bargs = NULL; 3805 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3806 3807 /* type filter */ 3808 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3809 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3810 return 0; 3811 } 3812 3813 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3814 bargs = &bctl->data; 3815 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3816 bargs = &bctl->sys; 3817 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3818 bargs = &bctl->meta; 3819 3820 /* profiles filter */ 3821 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3822 chunk_profiles_filter(chunk_type, bargs)) { 3823 return 0; 3824 } 3825 3826 /* usage filter */ 3827 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3828 chunk_usage_filter(fs_info, chunk_offset, bargs)) { 3829 return 0; 3830 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3831 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3832 return 0; 3833 } 3834 3835 /* devid filter */ 3836 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3837 chunk_devid_filter(leaf, chunk, bargs)) { 3838 return 0; 3839 } 3840 3841 /* drange filter, makes sense only with devid filter */ 3842 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3843 chunk_drange_filter(leaf, chunk, bargs)) { 3844 return 0; 3845 } 3846 3847 /* vrange filter */ 3848 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3849 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3850 return 0; 3851 } 3852 3853 /* stripes filter */ 3854 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3855 chunk_stripes_range_filter(leaf, chunk, bargs)) { 3856 return 0; 3857 } 3858 3859 /* soft profile changing mode */ 3860 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3861 chunk_soft_convert_filter(chunk_type, bargs)) { 3862 return 0; 3863 } 3864 3865 /* 3866 * limited by count, must be the last filter 3867 */ 3868 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3869 if (bargs->limit == 0) 3870 return 0; 3871 else 3872 bargs->limit--; 3873 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 3874 /* 3875 * Same logic as the 'limit' filter; the minimum cannot be 3876 * determined here 
because we do not have the global information 3877 * about the count of all chunks that satisfy the filters. 3878 */ 3879 if (bargs->limit_max == 0) 3880 return 0; 3881 else 3882 bargs->limit_max--; 3883 } 3884 3885 return 1; 3886 } 3887 3888 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3889 { 3890 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3891 struct btrfs_root *chunk_root = fs_info->chunk_root; 3892 u64 chunk_type; 3893 struct btrfs_chunk *chunk; 3894 struct btrfs_path *path = NULL; 3895 struct btrfs_key key; 3896 struct btrfs_key found_key; 3897 struct extent_buffer *leaf; 3898 int slot; 3899 int ret; 3900 int enospc_errors = 0; 3901 bool counting = true; 3902 /* The single value limit and min/max limits use the same bytes in the args (a union), so save them before the counting pass consumes them */ 3903 u64 limit_data = bctl->data.limit; 3904 u64 limit_meta = bctl->meta.limit; 3905 u64 limit_sys = bctl->sys.limit; 3906 u32 count_data = 0; 3907 u32 count_meta = 0; 3908 u32 count_sys = 0; 3909 int chunk_reserved = 0; 3910 3911 path = btrfs_alloc_path(); 3912 if (!path) { 3913 ret = -ENOMEM; 3914 goto error; 3915 } 3916 3917 /* zero out stat counters */ 3918 spin_lock(&fs_info->balance_lock); 3919 memset(&bctl->stat, 0, sizeof(bctl->stat)); 3920 spin_unlock(&fs_info->balance_lock); 3921 again: 3922 if (!counting) { 3923 /* 3924 * The single value limit and min/max limits use the same bytes 3925 * in the args (a union), so restore the saved values for the real pass 3926 */ 3927 bctl->data.limit = limit_data; 3928 bctl->meta.limit = limit_meta; 3929 bctl->sys.limit = limit_sys; 3930 } 3931 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3932 key.offset = (u64)-1; 3933 key.type = BTRFS_CHUNK_ITEM_KEY; 3934 3935 while (1) { 3936 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3937 atomic_read(&fs_info->balance_cancel_req)) { 3938 ret = -ECANCELED; 3939 goto error; 3940 } 3941 3942 mutex_lock(&fs_info->reclaim_bgs_lock); 3943 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3944 if (ret < 0) { 3945 mutex_unlock(&fs_info->reclaim_bgs_lock); 3946 goto error; 3947 } 3948 3949 /* 3950 * this shouldn't happen, it means the last relocate 3951 * failed 3952 */ 3953 if (ret == 0) 3954 BUG(); /* FIXME break ?
*/ 3955 3956 ret = btrfs_previous_item(chunk_root, path, 0, 3957 BTRFS_CHUNK_ITEM_KEY); 3958 if (ret) { 3959 mutex_unlock(&fs_info->reclaim_bgs_lock); 3960 ret = 0; 3961 break; 3962 } 3963 3964 leaf = path->nodes[0]; 3965 slot = path->slots[0]; 3966 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3967 3968 if (found_key.objectid != key.objectid) { 3969 mutex_unlock(&fs_info->reclaim_bgs_lock); 3970 break; 3971 } 3972 3973 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3974 chunk_type = btrfs_chunk_type(leaf, chunk); 3975 3976 if (!counting) { 3977 spin_lock(&fs_info->balance_lock); 3978 bctl->stat.considered++; 3979 spin_unlock(&fs_info->balance_lock); 3980 } 3981 3982 ret = should_balance_chunk(leaf, chunk, found_key.offset); 3983 3984 btrfs_release_path(path); 3985 if (!ret) { 3986 mutex_unlock(&fs_info->reclaim_bgs_lock); 3987 goto loop; 3988 } 3989 3990 if (counting) { 3991 mutex_unlock(&fs_info->reclaim_bgs_lock); 3992 spin_lock(&fs_info->balance_lock); 3993 bctl->stat.expected++; 3994 spin_unlock(&fs_info->balance_lock); 3995 3996 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3997 count_data++; 3998 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3999 count_sys++; 4000 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 4001 count_meta++; 4002 4003 goto loop; 4004 } 4005 4006 /* 4007 * Apply limit_min filter, no need to check if the LIMITS 4008 * filter is used, limit_min is 0 by default 4009 */ 4010 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 4011 count_data < bctl->data.limit_min) 4012 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 4013 count_meta < bctl->meta.limit_min) 4014 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 4015 count_sys < bctl->sys.limit_min)) { 4016 mutex_unlock(&fs_info->reclaim_bgs_lock); 4017 goto loop; 4018 } 4019 4020 if (!chunk_reserved) { 4021 /* 4022 * We may be relocating the only data chunk we have, 4023 * which could potentially end up losing the data's 4024 * raid profile, so let's allocate an empty one in 4025 * advance. 4026 */ 4027 ret = btrfs_may_alloc_data_chunk(fs_info, 4028 found_key.offset); 4029 if (ret < 0) { 4030 mutex_unlock(&fs_info->reclaim_bgs_lock); 4031 goto error; 4032 } else if (ret == 1) { 4033 chunk_reserved = 1; 4034 } 4035 } 4036 4037 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 4038 mutex_unlock(&fs_info->reclaim_bgs_lock); 4039 if (ret == -ENOSPC) { 4040 enospc_errors++; 4041 } else if (ret == -ETXTBSY) { 4042 btrfs_info(fs_info, 4043 "skipping relocation of block group %llu due to active swapfile", 4044 found_key.offset); 4045 ret = 0; 4046 } else if (ret) { 4047 goto error; 4048 } else { 4049 spin_lock(&fs_info->balance_lock); 4050 bctl->stat.completed++; 4051 spin_unlock(&fs_info->balance_lock); 4052 } 4053 loop: 4054 if (found_key.offset == 0) 4055 break; 4056 key.offset = found_key.offset - 1; 4057 } 4058 4059 if (counting) { 4060 btrfs_release_path(path); 4061 counting = false; 4062 goto again; 4063 } 4064 error: 4065 btrfs_free_path(path); 4066 if (enospc_errors) { 4067 btrfs_info(fs_info, "%d enospc errors during balance", 4068 enospc_errors); 4069 if (!ret) 4070 ret = -ENOSPC; 4071 } 4072 4073 return ret; 4074 } 4075 4076 /** 4077 * alloc_profile_is_valid - see if a given profile is valid and reduced 4078 * @flags: profile to validate 4079 * @extended: if true @flags is treated as an extended profile 4080 */ 4081 static int alloc_profile_is_valid(u64 flags, int extended) 4082 { 4083 u64 mask = (extended ?
BTRFS_EXTENDED_PROFILE_MASK : 4084 BTRFS_BLOCK_GROUP_PROFILE_MASK); 4085 4086 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 4087 4088 /* 1) check that all other bits are zeroed */ 4089 if (flags & ~mask) 4090 return 0; 4091 4092 /* 2) see if profile is reduced */ 4093 if (flags == 0) 4094 return !extended; /* "0" is valid for usual profiles */ 4095 4096 return has_single_bit_set(flags); 4097 } 4098 4099 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 4100 { 4101 /* cancel requested || normal exit path */ 4102 return atomic_read(&fs_info->balance_cancel_req) || 4103 (atomic_read(&fs_info->balance_pause_req) == 0 && 4104 atomic_read(&fs_info->balance_cancel_req) == 0); 4105 } 4106 4107 /* 4108 * Validate target profile against allowed profiles and return true if it's OK. 4109 * Otherwise print the error message and return false. 4110 */ 4111 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, 4112 const struct btrfs_balance_args *bargs, 4113 u64 allowed, const char *type) 4114 { 4115 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 4116 return true; 4117 4118 if (fs_info->sectorsize < PAGE_SIZE && 4119 bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) { 4120 btrfs_err(fs_info, 4121 "RAID56 is not yet supported for sectorsize %u with page size %lu", 4122 fs_info->sectorsize, PAGE_SIZE); 4123 return false; 4124 } 4125 /* Profile is valid and does not have bits outside of the allowed set */ 4126 if (alloc_profile_is_valid(bargs->target, 1) && 4127 (bargs->target & ~allowed) == 0) 4128 return true; 4129 4130 btrfs_err(fs_info, "balance: invalid convert %s profile %s", 4131 type, btrfs_bg_type_to_raid_name(bargs->target)); 4132 return false; 4133 } 4134 4135 /* 4136 * Fill @buf with textual description of balance filter flags @bargs, up to 4137 * @size_buf including the terminating null. The output may be trimmed if it 4138 * does not fit into the provided buffer. 
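 *
 * As a purely illustrative example (hypothetical args, not output from a
 * real balance): flags with CONVERT, SOFT and USAGE set would render as
 * "convert=raid1,soft,usage=50", with the trailing comma stripped by the
 * cleanup at the end of the function.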
4139 */ 4140 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, 4141 u32 size_buf) 4142 { 4143 int ret; 4144 u32 size_bp = size_buf; 4145 char *bp = buf; 4146 u64 flags = bargs->flags; 4147 char tmp_buf[128] = {'\0'}; 4148 4149 if (!flags) 4150 return; 4151 4152 #define CHECK_APPEND_NOARG(a) \ 4153 do { \ 4154 ret = snprintf(bp, size_bp, (a)); \ 4155 if (ret < 0 || ret >= size_bp) \ 4156 goto out_overflow; \ 4157 size_bp -= ret; \ 4158 bp += ret; \ 4159 } while (0) 4160 4161 #define CHECK_APPEND_1ARG(a, v1) \ 4162 do { \ 4163 ret = snprintf(bp, size_bp, (a), (v1)); \ 4164 if (ret < 0 || ret >= size_bp) \ 4165 goto out_overflow; \ 4166 size_bp -= ret; \ 4167 bp += ret; \ 4168 } while (0) 4169 4170 #define CHECK_APPEND_2ARG(a, v1, v2) \ 4171 do { \ 4172 ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ 4173 if (ret < 0 || ret >= size_bp) \ 4174 goto out_overflow; \ 4175 size_bp -= ret; \ 4176 bp += ret; \ 4177 } while (0) 4178 4179 if (flags & BTRFS_BALANCE_ARGS_CONVERT) 4180 CHECK_APPEND_1ARG("convert=%s,", 4181 btrfs_bg_type_to_raid_name(bargs->target)); 4182 4183 if (flags & BTRFS_BALANCE_ARGS_SOFT) 4184 CHECK_APPEND_NOARG("soft,"); 4185 4186 if (flags & BTRFS_BALANCE_ARGS_PROFILES) { 4187 btrfs_describe_block_groups(bargs->profiles, tmp_buf, 4188 sizeof(tmp_buf)); 4189 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); 4190 } 4191 4192 if (flags & BTRFS_BALANCE_ARGS_USAGE) 4193 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); 4194 4195 if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) 4196 CHECK_APPEND_2ARG("usage=%u..%u,", 4197 bargs->usage_min, bargs->usage_max); 4198 4199 if (flags & BTRFS_BALANCE_ARGS_DEVID) 4200 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); 4201 4202 if (flags & BTRFS_BALANCE_ARGS_DRANGE) 4203 CHECK_APPEND_2ARG("drange=%llu..%llu,", 4204 bargs->pstart, bargs->pend); 4205 4206 if (flags & BTRFS_BALANCE_ARGS_VRANGE) 4207 CHECK_APPEND_2ARG("vrange=%llu..%llu,", 4208 bargs->vstart, bargs->vend); 4209 4210 if (flags & BTRFS_BALANCE_ARGS_LIMIT) 4211 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); 4212 4213 if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) 4214 CHECK_APPEND_2ARG("limit=%u..%u,", 4215 bargs->limit_min, bargs->limit_max); 4216 4217 if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) 4218 CHECK_APPEND_2ARG("stripes=%u..%u,", 4219 bargs->stripes_min, bargs->stripes_max); 4220 4221 #undef CHECK_APPEND_2ARG 4222 #undef CHECK_APPEND_1ARG 4223 #undef CHECK_APPEND_NOARG 4224 4225 out_overflow: 4226 4227 if (size_bp < size_buf) 4228 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ 4229 else 4230 buf[0] = '\0'; 4231 } 4232 4233 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) 4234 { 4235 u32 size_buf = 1024; 4236 char tmp_buf[192] = {'\0'}; 4237 char *buf; 4238 char *bp; 4239 u32 size_bp = size_buf; 4240 int ret; 4241 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4242 4243 buf = kzalloc(size_buf, GFP_KERNEL); 4244 if (!buf) 4245 return; 4246 4247 bp = buf; 4248 4249 #define CHECK_APPEND_1ARG(a, v1) \ 4250 do { \ 4251 ret = snprintf(bp, size_bp, (a), (v1)); \ 4252 if (ret < 0 || ret >= size_bp) \ 4253 goto out_overflow; \ 4254 size_bp -= ret; \ 4255 bp += ret; \ 4256 } while (0) 4257 4258 if (bctl->flags & BTRFS_BALANCE_FORCE) 4259 CHECK_APPEND_1ARG("%s", "-f "); 4260 4261 if (bctl->flags & BTRFS_BALANCE_DATA) { 4262 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); 4263 CHECK_APPEND_1ARG("-d%s ", tmp_buf); 4264 } 4265 4266 if (bctl->flags & BTRFS_BALANCE_METADATA) { 4267 describe_balance_args(&bctl->meta, 
tmp_buf, sizeof(tmp_buf)); 4268 CHECK_APPEND_1ARG("-m%s ", tmp_buf); 4269 } 4270 4271 if (bctl->flags & BTRFS_BALANCE_SYSTEM) { 4272 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); 4273 CHECK_APPEND_1ARG("-s%s ", tmp_buf); 4274 } 4275 4276 #undef CHECK_APPEND_1ARG 4277 4278 out_overflow: 4279 4280 if (size_bp < size_buf) 4281 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ 4282 btrfs_info(fs_info, "balance: %s %s", 4283 (bctl->flags & BTRFS_BALANCE_RESUME) ? 4284 "resume" : "start", buf); 4285 4286 kfree(buf); 4287 } 4288 4289 /* 4290 * Should be called with the balance mutex held 4291 */ 4292 int btrfs_balance(struct btrfs_fs_info *fs_info, 4293 struct btrfs_balance_control *bctl, 4294 struct btrfs_ioctl_balance_args *bargs) 4295 { 4296 u64 meta_target, data_target; 4297 u64 allowed; 4298 int mixed = 0; 4299 int ret; 4300 u64 num_devices; 4301 unsigned seq; 4302 bool reducing_redundancy; 4303 int i; 4304 4305 if (btrfs_fs_closing(fs_info) || 4306 atomic_read(&fs_info->balance_pause_req) || 4307 btrfs_should_cancel_balance(fs_info)) { 4308 ret = -EINVAL; 4309 goto out; 4310 } 4311 4312 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 4313 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 4314 mixed = 1; 4315 4316 /* 4317 * In case of mixed groups both data and meta should be picked, 4318 * and identical options should be given for both of them. 4319 */ 4320 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 4321 if (mixed && (bctl->flags & allowed)) { 4322 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 4323 !(bctl->flags & BTRFS_BALANCE_METADATA) || 4324 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 4325 btrfs_err(fs_info, 4326 "balance: mixed groups data and metadata options must be the same"); 4327 ret = -EINVAL; 4328 goto out; 4329 } 4330 } 4331 4332 /* 4333 * rw_devices will not change at the moment, device add/delete/replace 4334 * are exclusive 4335 */ 4336 num_devices = fs_info->fs_devices->rw_devices; 4337 4338 /* 4339 * SINGLE profile on-disk has no profile bit, but in-memory we have a 4340 * special bit for it, to make it easier to distinguish. Thus we need 4341 * to set it manually, or balance would refuse the profile.
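 *
 * E.g. a conversion target of "single" reaches this function as the
 * in-memory BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, which is why the loop below
 * seeds 'allowed' with that bit before OR-ing in the bg_flag of every
 * profile whose devs_min fits the current number of rw devices.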
4342 */ 4343 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 4344 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) 4345 if (num_devices >= btrfs_raid_array[i].devs_min) 4346 allowed |= btrfs_raid_array[i].bg_flag; 4347 4348 if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || 4349 !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || 4350 !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { 4351 ret = -EINVAL; 4352 goto out; 4353 } 4354 4355 /* 4356 * Allow reducing metadata or system integrity only if force is set for 4357 * profiles with redundancy (copies, parity) 4358 */ 4359 allowed = 0; 4360 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { 4361 if (btrfs_raid_array[i].ncopies >= 2 || 4362 btrfs_raid_array[i].tolerated_failures >= 1) 4363 allowed |= btrfs_raid_array[i].bg_flag; 4364 } 4365 do { 4366 seq = read_seqbegin(&fs_info->profiles_lock); 4367 4368 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4369 (fs_info->avail_system_alloc_bits & allowed) && 4370 !(bctl->sys.target & allowed)) || 4371 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4372 (fs_info->avail_metadata_alloc_bits & allowed) && 4373 !(bctl->meta.target & allowed))) 4374 reducing_redundancy = true; 4375 else 4376 reducing_redundancy = false; 4377 4378 /* if we're not converting, the target field is uninitialized */ 4379 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4380 bctl->meta.target : fs_info->avail_metadata_alloc_bits; 4381 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4382 bctl->data.target : fs_info->avail_data_alloc_bits; 4383 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4384 4385 if (reducing_redundancy) { 4386 if (bctl->flags & BTRFS_BALANCE_FORCE) { 4387 btrfs_info(fs_info, 4388 "balance: force reducing metadata redundancy"); 4389 } else { 4390 btrfs_err(fs_info, 4391 "balance: reduces metadata redundancy, use --force if you want this"); 4392 ret = -EINVAL; 4393 goto out; 4394 } 4395 } 4396 4397 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 4398 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 4399 btrfs_warn(fs_info, 4400 "balance: metadata profile %s has lower redundancy than data profile %s", 4401 btrfs_bg_type_to_raid_name(meta_target), 4402 btrfs_bg_type_to_raid_name(data_target)); 4403 } 4404 4405 ret = insert_balance_item(fs_info, bctl); 4406 if (ret && ret != -EEXIST) 4407 goto out; 4408 4409 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 4410 BUG_ON(ret == -EEXIST); 4411 BUG_ON(fs_info->balance_ctl); 4412 spin_lock(&fs_info->balance_lock); 4413 fs_info->balance_ctl = bctl; 4414 spin_unlock(&fs_info->balance_lock); 4415 } else { 4416 BUG_ON(ret != -EEXIST); 4417 spin_lock(&fs_info->balance_lock); 4418 update_balance_args(bctl); 4419 spin_unlock(&fs_info->balance_lock); 4420 } 4421 4422 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4423 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4424 describe_balance_start_or_resume(fs_info); 4425 mutex_unlock(&fs_info->balance_mutex); 4426 4427 ret = __btrfs_balance(fs_info); 4428 4429 mutex_lock(&fs_info->balance_mutex); 4430 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { 4431 btrfs_info(fs_info, "balance: paused"); 4432 btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); 4433 } 4434 /* 4435 * Balance can be canceled by: 4436 * 4437 * - Regular cancel request 4438 * Then ret == -ECANCELED and balance_cancel_req > 0 4439 * 4440 * - Fatal signal to "btrfs" process 4441 *
Either the signal is caught by wait_reserve_ticket() and the callers 4442 * get -EINTR, or it is caught by btrfs_should_cancel_balance() and 4443 * they get -ECANCELED. 4444 * Either way, in this case balance_cancel_req == 0, and 4445 * ret == -EINTR or ret == -ECANCELED. 4446 * 4447 * So here we only check the return value to catch canceled balance. 4448 */ 4449 else if (ret == -ECANCELED || ret == -EINTR) 4450 btrfs_info(fs_info, "balance: canceled"); 4451 else 4452 btrfs_info(fs_info, "balance: ended with status: %d", ret); 4453 4454 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4455 4456 if (bargs) { 4457 memset(bargs, 0, sizeof(*bargs)); 4458 btrfs_update_ioctl_balance_args(fs_info, bargs); 4459 } 4460 4461 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 4462 balance_need_close(fs_info)) { 4463 reset_balance_state(fs_info); 4464 btrfs_exclop_finish(fs_info); 4465 } 4466 4467 wake_up(&fs_info->balance_wait_q); 4468 4469 return ret; 4470 out: 4471 if (bctl->flags & BTRFS_BALANCE_RESUME) 4472 reset_balance_state(fs_info); 4473 else 4474 kfree(bctl); 4475 btrfs_exclop_finish(fs_info); 4476 4477 return ret; 4478 } 4479 4480 static int balance_kthread(void *data) 4481 { 4482 struct btrfs_fs_info *fs_info = data; 4483 int ret = 0; 4484 4485 mutex_lock(&fs_info->balance_mutex); 4486 if (fs_info->balance_ctl) 4487 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); 4488 mutex_unlock(&fs_info->balance_mutex); 4489 4490 return ret; 4491 } 4492 4493 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 4494 { 4495 struct task_struct *tsk; 4496 4497 mutex_lock(&fs_info->balance_mutex); 4498 if (!fs_info->balance_ctl) { 4499 mutex_unlock(&fs_info->balance_mutex); 4500 return 0; 4501 } 4502 mutex_unlock(&fs_info->balance_mutex); 4503 4504 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 4505 btrfs_info(fs_info, "balance: resume skipped"); 4506 return 0; 4507 } 4508 4509 spin_lock(&fs_info->super_lock); 4510 ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); 4511 fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; 4512 spin_unlock(&fs_info->super_lock); 4513 /* 4514 * A ro->rw remount sequence should continue with the paused balance 4515 * regardless of who paused it, the system or the user as of now, so set 4516 * the resume flag.
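 *
 * E.g. a "mount -o remount,rw" is expected to land here and, unless the
 * skip_balance mount option was given, restart the paused balance via
 * the kthread started below.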
4517 */ 4518 spin_lock(&fs_info->balance_lock); 4519 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; 4520 spin_unlock(&fs_info->balance_lock); 4521 4522 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 4523 return PTR_ERR_OR_ZERO(tsk); 4524 } 4525 4526 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 4527 { 4528 struct btrfs_balance_control *bctl; 4529 struct btrfs_balance_item *item; 4530 struct btrfs_disk_balance_args disk_bargs; 4531 struct btrfs_path *path; 4532 struct extent_buffer *leaf; 4533 struct btrfs_key key; 4534 int ret; 4535 4536 path = btrfs_alloc_path(); 4537 if (!path) 4538 return -ENOMEM; 4539 4540 key.objectid = BTRFS_BALANCE_OBJECTID; 4541 key.type = BTRFS_TEMPORARY_ITEM_KEY; 4542 key.offset = 0; 4543 4544 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4545 if (ret < 0) 4546 goto out; 4547 if (ret > 0) { /* ret = -ENOENT; */ 4548 ret = 0; 4549 goto out; 4550 } 4551 4552 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 4553 if (!bctl) { 4554 ret = -ENOMEM; 4555 goto out; 4556 } 4557 4558 leaf = path->nodes[0]; 4559 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 4560 4561 bctl->flags = btrfs_balance_flags(leaf, item); 4562 bctl->flags |= BTRFS_BALANCE_RESUME; 4563 4564 btrfs_balance_data(leaf, item, &disk_bargs); 4565 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 4566 btrfs_balance_meta(leaf, item, &disk_bargs); 4567 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 4568 btrfs_balance_sys(leaf, item, &disk_bargs); 4569 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 4570 4571 /* 4572 * This should never happen, as the paused balance state is recovered 4573 * during mount without any chance for other exclusive ops to collide. 4574 * 4575 * This gives the exclusive op status to balance and keeps it in paused 4576 * state until user intervention (cancel or umount). If the ownership 4577 * cannot be assigned, show a message but do not fail. The balance 4578 * is in a paused state and must have fs_info::balance_ctl properly 4579 * set up.
4580 */ 4581 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED)) 4582 btrfs_warn(fs_info, 4583 "balance: cannot set exclusive op status, resume manually"); 4584 4585 btrfs_release_path(path); 4586 4587 mutex_lock(&fs_info->balance_mutex); 4588 BUG_ON(fs_info->balance_ctl); 4589 spin_lock(&fs_info->balance_lock); 4590 fs_info->balance_ctl = bctl; 4591 spin_unlock(&fs_info->balance_lock); 4592 mutex_unlock(&fs_info->balance_mutex); 4593 out: 4594 btrfs_free_path(path); 4595 return ret; 4596 } 4597 4598 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4599 { 4600 int ret = 0; 4601 4602 mutex_lock(&fs_info->balance_mutex); 4603 if (!fs_info->balance_ctl) { 4604 mutex_unlock(&fs_info->balance_mutex); 4605 return -ENOTCONN; 4606 } 4607 4608 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4609 atomic_inc(&fs_info->balance_pause_req); 4610 mutex_unlock(&fs_info->balance_mutex); 4611 4612 wait_event(fs_info->balance_wait_q, 4613 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4614 4615 mutex_lock(&fs_info->balance_mutex); 4616 /* we are good with balance_ctl ripped off from under us */ 4617 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4618 atomic_dec(&fs_info->balance_pause_req); 4619 } else { 4620 ret = -ENOTCONN; 4621 } 4622 4623 mutex_unlock(&fs_info->balance_mutex); 4624 return ret; 4625 } 4626 4627 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4628 { 4629 mutex_lock(&fs_info->balance_mutex); 4630 if (!fs_info->balance_ctl) { 4631 mutex_unlock(&fs_info->balance_mutex); 4632 return -ENOTCONN; 4633 } 4634 4635 /* 4636 * A paused balance with the item stored on disk can be resumed at 4637 * mount time if the mount is read-write. Otherwise it's still paused 4638 * and we must not allow cancelling as it deletes the item. 4639 */ 4640 if (sb_rdonly(fs_info->sb)) { 4641 mutex_unlock(&fs_info->balance_mutex); 4642 return -EROFS; 4643 } 4644 4645 atomic_inc(&fs_info->balance_cancel_req); 4646 /* 4647 * if we are running just wait and return, balance item is 4648 * deleted in btrfs_balance in this case 4649 */ 4650 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4651 mutex_unlock(&fs_info->balance_mutex); 4652 wait_event(fs_info->balance_wait_q, 4653 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4654 mutex_lock(&fs_info->balance_mutex); 4655 } else { 4656 mutex_unlock(&fs_info->balance_mutex); 4657 /* 4658 * Lock released to allow other waiters to continue, we'll 4659 * reexamine the status again. 
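 * E.g. a racing resume attempt may see balance_cancel_req, fail with
 * -EINVAL and tear the state down itself, in which case balance_ctl is
 * NULL once we retake the mutex below and there is nothing left to do.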
4660 */ 4661 mutex_lock(&fs_info->balance_mutex); 4662 4663 if (fs_info->balance_ctl) { 4664 reset_balance_state(fs_info); 4665 btrfs_exclop_finish(fs_info); 4666 btrfs_info(fs_info, "balance: canceled"); 4667 } 4668 } 4669 4670 BUG_ON(fs_info->balance_ctl || 4671 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4672 atomic_dec(&fs_info->balance_cancel_req); 4673 mutex_unlock(&fs_info->balance_mutex); 4674 return 0; 4675 } 4676 4677 int btrfs_uuid_scan_kthread(void *data) 4678 { 4679 struct btrfs_fs_info *fs_info = data; 4680 struct btrfs_root *root = fs_info->tree_root; 4681 struct btrfs_key key; 4682 struct btrfs_path *path = NULL; 4683 int ret = 0; 4684 struct extent_buffer *eb; 4685 int slot; 4686 struct btrfs_root_item root_item; 4687 u32 item_size; 4688 struct btrfs_trans_handle *trans = NULL; 4689 bool closing = false; 4690 4691 path = btrfs_alloc_path(); 4692 if (!path) { 4693 ret = -ENOMEM; 4694 goto out; 4695 } 4696 4697 key.objectid = 0; 4698 key.type = BTRFS_ROOT_ITEM_KEY; 4699 key.offset = 0; 4700 4701 while (1) { 4702 if (btrfs_fs_closing(fs_info)) { 4703 closing = true; 4704 break; 4705 } 4706 ret = btrfs_search_forward(root, &key, path, 4707 BTRFS_OLDEST_GENERATION); 4708 if (ret) { 4709 if (ret > 0) 4710 ret = 0; 4711 break; 4712 } 4713 4714 if (key.type != BTRFS_ROOT_ITEM_KEY || 4715 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4716 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4717 key.objectid > BTRFS_LAST_FREE_OBJECTID) 4718 goto skip; 4719 4720 eb = path->nodes[0]; 4721 slot = path->slots[0]; 4722 item_size = btrfs_item_size(eb, slot); 4723 if (item_size < sizeof(root_item)) 4724 goto skip; 4725 4726 read_extent_buffer(eb, &root_item, 4727 btrfs_item_ptr_offset(eb, slot), 4728 (int)sizeof(root_item)); 4729 if (btrfs_root_refs(&root_item) == 0) 4730 goto skip; 4731 4732 if (!btrfs_is_empty_uuid(root_item.uuid) || 4733 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4734 if (trans) 4735 goto update_tree; 4736 4737 btrfs_release_path(path); 4738 /* 4739 * 1 - subvol uuid item 4740 * 1 - received_subvol uuid item 4741 */ 4742 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4743 if (IS_ERR(trans)) { 4744 ret = PTR_ERR(trans); 4745 break; 4746 } 4747 continue; 4748 } else { 4749 goto skip; 4750 } 4751 update_tree: 4752 btrfs_release_path(path); 4753 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4754 ret = btrfs_uuid_tree_add(trans, root_item.uuid, 4755 BTRFS_UUID_KEY_SUBVOL, 4756 key.objectid); 4757 if (ret < 0) { 4758 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4759 ret); 4760 break; 4761 } 4762 } 4763 4764 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4765 ret = btrfs_uuid_tree_add(trans, 4766 root_item.received_uuid, 4767 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4768 key.objectid); 4769 if (ret < 0) { 4770 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4771 ret); 4772 break; 4773 } 4774 } 4775 4776 skip: 4777 btrfs_release_path(path); 4778 if (trans) { 4779 ret = btrfs_end_transaction(trans); 4780 trans = NULL; 4781 if (ret) 4782 break; 4783 } 4784 4785 if (key.offset < (u64)-1) { 4786 key.offset++; 4787 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4788 key.offset = 0; 4789 key.type = BTRFS_ROOT_ITEM_KEY; 4790 } else if (key.objectid < (u64)-1) { 4791 key.offset = 0; 4792 key.type = BTRFS_ROOT_ITEM_KEY; 4793 key.objectid++; 4794 } else { 4795 break; 4796 } 4797 cond_resched(); 4798 } 4799 4800 out: 4801 btrfs_free_path(path); 4802 if (trans && !IS_ERR(trans)) 4803 btrfs_end_transaction(trans); 4804 if (ret) 4805 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread 
failed %d", ret); 4806 else if (!closing) 4807 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4808 up(&fs_info->uuid_tree_rescan_sem); 4809 return 0; 4810 } 4811 4812 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4813 { 4814 struct btrfs_trans_handle *trans; 4815 struct btrfs_root *tree_root = fs_info->tree_root; 4816 struct btrfs_root *uuid_root; 4817 struct task_struct *task; 4818 int ret; 4819 4820 /* 4821 * 1 - root node 4822 * 1 - root item 4823 */ 4824 trans = btrfs_start_transaction(tree_root, 2); 4825 if (IS_ERR(trans)) 4826 return PTR_ERR(trans); 4827 4828 uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); 4829 if (IS_ERR(uuid_root)) { 4830 ret = PTR_ERR(uuid_root); 4831 btrfs_abort_transaction(trans, ret); 4832 btrfs_end_transaction(trans); 4833 return ret; 4834 } 4835 4836 fs_info->uuid_root = uuid_root; 4837 4838 ret = btrfs_commit_transaction(trans); 4839 if (ret) 4840 return ret; 4841 4842 down(&fs_info->uuid_tree_rescan_sem); 4843 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4844 if (IS_ERR(task)) { 4845 /* fs_info->update_uuid_tree_gen remains 0 in all error cases */ 4846 btrfs_warn(fs_info, "failed to start uuid_scan task"); 4847 up(&fs_info->uuid_tree_rescan_sem); 4848 return PTR_ERR(task); 4849 } 4850 4851 return 0; 4852 } 4853 4854 /* 4855 * Shrinking a device means finding all of the device extents past 4856 * the new size, and then following the back refs to the chunks. 4857 * The chunk relocation code actually frees the device extent. 4858 */ 4859 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 4860 { 4861 struct btrfs_fs_info *fs_info = device->fs_info; 4862 struct btrfs_root *root = fs_info->dev_root; 4863 struct btrfs_trans_handle *trans; 4864 struct btrfs_dev_extent *dev_extent = NULL; 4865 struct btrfs_path *path; 4866 u64 length; 4867 u64 chunk_offset; 4868 int ret; 4869 int slot; 4870 int failed = 0; 4871 bool retried = false; 4872 struct extent_buffer *l; 4873 struct btrfs_key key; 4874 struct btrfs_super_block *super_copy = fs_info->super_copy; 4875 u64 old_total = btrfs_super_total_bytes(super_copy); 4876 u64 old_size = btrfs_device_get_total_bytes(device); 4877 u64 diff; 4878 u64 start; 4879 4880 new_size = round_down(new_size, fs_info->sectorsize); 4881 start = new_size; 4882 diff = round_down(old_size - new_size, fs_info->sectorsize); 4883 4884 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4885 return -EINVAL; 4886 4887 path = btrfs_alloc_path(); 4888 if (!path) 4889 return -ENOMEM; 4890 4891 path->reada = READA_BACK; 4892 4893 trans = btrfs_start_transaction(root, 0); 4894 if (IS_ERR(trans)) { 4895 btrfs_free_path(path); 4896 return PTR_ERR(trans); 4897 } 4898 4899 mutex_lock(&fs_info->chunk_mutex); 4900 4901 btrfs_device_set_total_bytes(device, new_size); 4902 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4903 device->fs_devices->total_rw_bytes -= diff; 4904 atomic64_sub(diff, &fs_info->free_chunk_space); 4905 } 4906 4907 /* 4908 * Once the device's size has been set to the new size, ensure all 4909 * in-memory chunks are synced to disk so that the loop below sees them 4910 * and relocates them accordingly.
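 *
 * E.g. a chunk created earlier in this transaction only has its device
 * extent recorded in memory; committing the transaction in the
 * contains_pending_extent() branch below persists it so that the search
 * loop can find and relocate it.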
4911 */ 4912 if (contains_pending_extent(device, &start, diff)) { 4913 mutex_unlock(&fs_info->chunk_mutex); 4914 ret = btrfs_commit_transaction(trans); 4915 if (ret) 4916 goto done; 4917 } else { 4918 mutex_unlock(&fs_info->chunk_mutex); 4919 btrfs_end_transaction(trans); 4920 } 4921 4922 again: 4923 key.objectid = device->devid; 4924 key.offset = (u64)-1; 4925 key.type = BTRFS_DEV_EXTENT_KEY; 4926 4927 do { 4928 mutex_lock(&fs_info->reclaim_bgs_lock); 4929 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4930 if (ret < 0) { 4931 mutex_unlock(&fs_info->reclaim_bgs_lock); 4932 goto done; 4933 } 4934 4935 ret = btrfs_previous_item(root, path, 0, key.type); 4936 if (ret) { 4937 mutex_unlock(&fs_info->reclaim_bgs_lock); 4938 if (ret < 0) 4939 goto done; 4940 ret = 0; 4941 btrfs_release_path(path); 4942 break; 4943 } 4944 4945 l = path->nodes[0]; 4946 slot = path->slots[0]; 4947 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4948 4949 if (key.objectid != device->devid) { 4950 mutex_unlock(&fs_info->reclaim_bgs_lock); 4951 btrfs_release_path(path); 4952 break; 4953 } 4954 4955 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4956 length = btrfs_dev_extent_length(l, dev_extent); 4957 4958 if (key.offset + length <= new_size) { 4959 mutex_unlock(&fs_info->reclaim_bgs_lock); 4960 btrfs_release_path(path); 4961 break; 4962 } 4963 4964 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4965 btrfs_release_path(path); 4966 4967 /* 4968 * We may be relocating the only data chunk we have, 4969 * which could potentially end up losing the data's 4970 * raid profile, so let's allocate an empty one in 4971 * advance. 4972 */ 4973 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 4974 if (ret < 0) { 4975 mutex_unlock(&fs_info->reclaim_bgs_lock); 4976 goto done; 4977 } 4978 4979 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 4980 mutex_unlock(&fs_info->reclaim_bgs_lock); 4981 if (ret == -ENOSPC) { 4982 failed++; 4983 } else if (ret) { 4984 if (ret == -ETXTBSY) { 4985 btrfs_warn(fs_info, 4986 "could not shrink block group %llu due to active swapfile", 4987 chunk_offset); 4988 } 4989 goto done; 4990 } 4991 } while (key.offset-- > 0); 4992 4993 if (failed && !retried) { 4994 failed = 0; 4995 retried = true; 4996 goto again; 4997 } else if (failed && retried) { 4998 ret = -ENOSPC; 4999 goto done; 5000 } 5001 5002 /* Shrinking succeeded, else we would be at "done". */ 5003 trans = btrfs_start_transaction(root, 0); 5004 if (IS_ERR(trans)) { 5005 ret = PTR_ERR(trans); 5006 goto done; 5007 } 5008 5009 mutex_lock(&fs_info->chunk_mutex); 5010 /* Clear all state bits beyond the shrunk device size */ 5011 clear_extent_bits(&device->alloc_state, new_size, (u64)-1, 5012 CHUNK_STATE_MASK); 5013 5014 btrfs_device_set_disk_total_bytes(device, new_size); 5015 if (list_empty(&device->post_commit_list)) 5016 list_add_tail(&device->post_commit_list, 5017 &trans->transaction->dev_update_list); 5018 5019 WARN_ON(diff > old_total); 5020 btrfs_set_super_total_bytes(super_copy, 5021 round_down(old_total - diff, fs_info->sectorsize)); 5022 mutex_unlock(&fs_info->chunk_mutex); 5023 5024 btrfs_reserve_chunk_metadata(trans, false); 5025 /* Now btrfs_update_device() will change the on-disk size.
*/ 5026 ret = btrfs_update_device(trans, device); 5027 btrfs_trans_release_chunk_metadata(trans); 5028 if (ret < 0) { 5029 btrfs_abort_transaction(trans, ret); 5030 btrfs_end_transaction(trans); 5031 } else { 5032 ret = btrfs_commit_transaction(trans); 5033 } 5034 done: 5035 btrfs_free_path(path); 5036 if (ret) { 5037 mutex_lock(&fs_info->chunk_mutex); 5038 btrfs_device_set_total_bytes(device, old_size); 5039 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 5040 device->fs_devices->total_rw_bytes += diff; 5041 atomic64_add(diff, &fs_info->free_chunk_space); 5042 mutex_unlock(&fs_info->chunk_mutex); 5043 } 5044 return ret; 5045 } 5046 5047 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 5048 struct btrfs_key *key, 5049 struct btrfs_chunk *chunk, int item_size) 5050 { 5051 struct btrfs_super_block *super_copy = fs_info->super_copy; 5052 struct btrfs_disk_key disk_key; 5053 u32 array_size; 5054 u8 *ptr; 5055 5056 lockdep_assert_held(&fs_info->chunk_mutex); 5057 5058 array_size = btrfs_super_sys_array_size(super_copy); 5059 if (array_size + item_size + sizeof(disk_key) 5060 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 5061 return -EFBIG; 5062 5063 ptr = super_copy->sys_chunk_array + array_size; 5064 btrfs_cpu_key_to_disk(&disk_key, key); 5065 memcpy(ptr, &disk_key, sizeof(disk_key)); 5066 ptr += sizeof(disk_key); 5067 memcpy(ptr, chunk, item_size); 5068 item_size += sizeof(disk_key); 5069 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 5070 5071 return 0; 5072 } 5073 5074 /* 5075 * sort the devices in descending order by max_avail, total_avail 5076 */ 5077 static int btrfs_cmp_device_info(const void *a, const void *b) 5078 { 5079 const struct btrfs_device_info *di_a = a; 5080 const struct btrfs_device_info *di_b = b; 5081 5082 if (di_a->max_avail > di_b->max_avail) 5083 return -1; 5084 if (di_a->max_avail < di_b->max_avail) 5085 return 1; 5086 if (di_a->total_avail > di_b->total_avail) 5087 return -1; 5088 if (di_a->total_avail < di_b->total_avail) 5089 return 1; 5090 return 0; 5091 } 5092 5093 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 5094 { 5095 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 5096 return; 5097 5098 btrfs_set_fs_incompat(info, RAID56); 5099 } 5100 5101 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) 5102 { 5103 if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) 5104 return; 5105 5106 btrfs_set_fs_incompat(info, RAID1C34); 5107 } 5108 5109 /* 5110 * Structure used internally for btrfs_create_chunk() function. 5111 * Wraps needed parameters. 
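 *
 * For instance, a raid10 chunk would start from the btrfs_raid_array
 * values (sub_stripes == 2, dev_stripes == 1, ncopies == 2, nparity == 0)
 * and have num_stripes, stripe_size and chunk_size filled in later by the
 * gather/decide helpers below.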
5112 */ 5113 struct alloc_chunk_ctl { 5114 u64 start; 5115 u64 type; 5116 /* Total number of stripes to allocate */ 5117 int num_stripes; 5118 /* sub_stripes info for map */ 5119 int sub_stripes; 5120 /* Stripes per device */ 5121 int dev_stripes; 5122 /* Maximum number of devices to use */ 5123 int devs_max; 5124 /* Minimum number of devices to use */ 5125 int devs_min; 5126 /* ndevs has to be a multiple of this */ 5127 int devs_increment; 5128 /* Number of copies */ 5129 int ncopies; 5130 /* Number of stripes worth of bytes to store parity information */ 5131 int nparity; 5132 u64 max_stripe_size; 5133 u64 max_chunk_size; 5134 u64 dev_extent_min; 5135 u64 stripe_size; 5136 u64 chunk_size; 5137 int ndevs; 5138 }; 5139 5140 static void init_alloc_chunk_ctl_policy_regular( 5141 struct btrfs_fs_devices *fs_devices, 5142 struct alloc_chunk_ctl *ctl) 5143 { 5144 u64 type = ctl->type; 5145 5146 if (type & BTRFS_BLOCK_GROUP_DATA) { 5147 ctl->max_stripe_size = SZ_1G; 5148 ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; 5149 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 5150 /* For larger filesystems, use larger metadata chunks */ 5151 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) 5152 ctl->max_stripe_size = SZ_1G; 5153 else 5154 ctl->max_stripe_size = SZ_256M; 5155 ctl->max_chunk_size = ctl->max_stripe_size; 5156 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 5157 ctl->max_stripe_size = SZ_32M; 5158 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 5159 ctl->devs_max = min_t(int, ctl->devs_max, 5160 BTRFS_MAX_DEVS_SYS_CHUNK); 5161 } else { 5162 BUG(); 5163 } 5164 5165 /* We don't want a chunk larger than 10% of writable space */ 5166 ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 5167 ctl->max_chunk_size); 5168 ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; 5169 } 5170 5171 static void init_alloc_chunk_ctl_policy_zoned( 5172 struct btrfs_fs_devices *fs_devices, 5173 struct alloc_chunk_ctl *ctl) 5174 { 5175 u64 zone_size = fs_devices->fs_info->zone_size; 5176 u64 limit; 5177 int min_num_stripes = ctl->devs_min * ctl->dev_stripes; 5178 int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; 5179 u64 min_chunk_size = min_data_stripes * zone_size; 5180 u64 type = ctl->type; 5181 5182 ctl->max_stripe_size = zone_size; 5183 if (type & BTRFS_BLOCK_GROUP_DATA) { 5184 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, 5185 zone_size); 5186 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 5187 ctl->max_chunk_size = ctl->max_stripe_size; 5188 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 5189 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 5190 ctl->devs_max = min_t(int, ctl->devs_max, 5191 BTRFS_MAX_DEVS_SYS_CHUNK); 5192 } else { 5193 BUG(); 5194 } 5195 5196 /* We don't want a chunk larger than 10% of writable space */ 5197 limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1), 5198 zone_size), 5199 min_chunk_size); 5200 ctl->max_chunk_size = min(limit, ctl->max_chunk_size); 5201 ctl->dev_extent_min = zone_size * ctl->dev_stripes; 5202 } 5203 5204 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, 5205 struct alloc_chunk_ctl *ctl) 5206 { 5207 int index = btrfs_bg_flags_to_raid_index(ctl->type); 5208 5209 ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; 5210 ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; 5211 ctl->devs_max = btrfs_raid_array[index].devs_max; 5212 if (!ctl->devs_max) 5213 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); 5214 ctl->devs_min = btrfs_raid_array[index].devs_min; 5215 
ctl->devs_increment = btrfs_raid_array[index].devs_increment; 5216 ctl->ncopies = btrfs_raid_array[index].ncopies; 5217 ctl->nparity = btrfs_raid_array[index].nparity; 5218 ctl->ndevs = 0; 5219 5220 switch (fs_devices->chunk_alloc_policy) { 5221 case BTRFS_CHUNK_ALLOC_REGULAR: 5222 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); 5223 break; 5224 case BTRFS_CHUNK_ALLOC_ZONED: 5225 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); 5226 break; 5227 default: 5228 BUG(); 5229 } 5230 } 5231 5232 static int gather_device_info(struct btrfs_fs_devices *fs_devices, 5233 struct alloc_chunk_ctl *ctl, 5234 struct btrfs_device_info *devices_info) 5235 { 5236 struct btrfs_fs_info *info = fs_devices->fs_info; 5237 struct btrfs_device *device; 5238 u64 total_avail; 5239 u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; 5240 int ret; 5241 int ndevs = 0; 5242 u64 max_avail; 5243 u64 dev_offset; 5244 5245 /* 5246 * in the first pass through the devices list, we gather information 5247 * about the available holes on each device. 5248 */ 5249 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 5250 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 5251 WARN(1, KERN_ERR 5252 "BTRFS: read-only device in alloc_list\n"); 5253 continue; 5254 } 5255 5256 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 5257 &device->dev_state) || 5258 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 5259 continue; 5260 5261 if (device->total_bytes > device->bytes_used) 5262 total_avail = device->total_bytes - device->bytes_used; 5263 else 5264 total_avail = 0; 5265 5266 /* If there is no space on this device, skip it. */ 5267 if (total_avail < ctl->dev_extent_min) 5268 continue; 5269 5270 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, 5271 &max_avail); 5272 if (ret && ret != -ENOSPC) 5273 return ret; 5274 5275 if (ret == 0) 5276 max_avail = dev_extent_want; 5277 5278 if (max_avail < ctl->dev_extent_min) { 5279 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5280 btrfs_debug(info, 5281 "%s: devid %llu has no free space, have=%llu want=%llu", 5282 __func__, device->devid, max_avail, 5283 ctl->dev_extent_min); 5284 continue; 5285 } 5286 5287 if (ndevs == fs_devices->rw_devices) { 5288 WARN(1, "%s: found more than %llu devices\n", 5289 __func__, fs_devices->rw_devices); 5290 break; 5291 } 5292 devices_info[ndevs].dev_offset = dev_offset; 5293 devices_info[ndevs].max_avail = max_avail; 5294 devices_info[ndevs].total_avail = total_avail; 5295 devices_info[ndevs].dev = device; 5296 ++ndevs; 5297 } 5298 ctl->ndevs = ndevs; 5299 5300 /* 5301 * now sort the devices by hole size / available space 5302 */ 5303 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 5304 btrfs_cmp_device_info, NULL); 5305 5306 return 0; 5307 } 5308 5309 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, 5310 struct btrfs_device_info *devices_info) 5311 { 5312 /* Number of stripes that count for block group size */ 5313 int data_stripes; 5314 5315 /* 5316 * The primary goal is to maximize the number of stripes, so use as 5317 * many devices as possible, even if the stripes are not maximum sized. 5318 * 5319 * The DUP profile stores more than one stripe per device; max_avail 5320 * is the total size, so we have to adjust.
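 *
 * E.g. for DUP (dev_stripes == 2) a device hole of 10GiB yields a
 * stripe_size of 5GiB below, so that both stripes of the pair fit into
 * the same hole.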
5321 */ 5322 ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, 5323 ctl->dev_stripes); 5324 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5325 5326 /* This will have to be fixed for RAID1 and RAID10 over more drives */ 5327 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5328 5329 /* 5330 * Use the number of data stripes to figure out how big this chunk is 5331 * really going to be in terms of logical address space, and compare 5332 * that answer with the max chunk size. If it's higher, we try to 5333 * reduce stripe_size. 5334 */ 5335 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 5336 /* 5337 * Reduce stripe_size, round it up to a 16MB boundary again and 5338 * then use it, unless it ends up being even bigger than the 5339 * previous value we had already. 5340 */ 5341 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, 5342 data_stripes), SZ_16M), 5343 ctl->stripe_size); 5344 } 5345 5346 /* Align to BTRFS_STRIPE_LEN */ 5347 ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); 5348 ctl->chunk_size = ctl->stripe_size * data_stripes; 5349 5350 return 0; 5351 } 5352 5353 static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, 5354 struct btrfs_device_info *devices_info) 5355 { 5356 u64 zone_size = devices_info[0].dev->zone_info->zone_size; 5357 /* Number of stripes that count for block group size */ 5358 int data_stripes; 5359 5360 /* 5361 * It should hold because: 5362 * dev_extent_min == dev_extent_want == zone_size * dev_stripes 5363 */ 5364 ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); 5365 5366 ctl->stripe_size = zone_size; 5367 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5368 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5369 5370 /* stripe_size is fixed in a zoned filesystem. Reduce ndevs instead. */ 5371 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 5372 ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, 5373 ctl->stripe_size) + ctl->nparity, 5374 ctl->dev_stripes); 5375 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5376 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5377 ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); 5378 } 5379 5380 ctl->chunk_size = ctl->stripe_size * data_stripes; 5381 5382 return 0; 5383 } 5384 5385 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, 5386 struct alloc_chunk_ctl *ctl, 5387 struct btrfs_device_info *devices_info) 5388 { 5389 struct btrfs_fs_info *info = fs_devices->fs_info; 5390 5391 /* 5392 * Round down to the number of usable stripes; devs_increment can be 5393 * any number, so we can't use round_down(), which requires a power of 5394 * 2, while rounddown() is safe for any value.
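 *
 * E.g. raid10 has devs_increment == 2, so 5 devices with enough free
 * space are rounded down to 4 usable ones here.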
5395 */ 5396 ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); 5397 5398 if (ctl->ndevs < ctl->devs_min) { 5399 if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 5400 btrfs_debug(info, 5401 "%s: not enough devices with free space: have=%d minimum required=%d", 5402 __func__, ctl->ndevs, ctl->devs_min); 5403 } 5404 return -ENOSPC; 5405 } 5406 5407 ctl->ndevs = min(ctl->ndevs, ctl->devs_max); 5408 5409 switch (fs_devices->chunk_alloc_policy) { 5410 case BTRFS_CHUNK_ALLOC_REGULAR: 5411 return decide_stripe_size_regular(ctl, devices_info); 5412 case BTRFS_CHUNK_ALLOC_ZONED: 5413 return decide_stripe_size_zoned(ctl, devices_info); 5414 default: 5415 BUG(); 5416 } 5417 } 5418 5419 static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, 5420 struct alloc_chunk_ctl *ctl, 5421 struct btrfs_device_info *devices_info) 5422 { 5423 struct btrfs_fs_info *info = trans->fs_info; 5424 struct map_lookup *map = NULL; 5425 struct extent_map_tree *em_tree; 5426 struct btrfs_block_group *block_group; 5427 struct extent_map *em; 5428 u64 start = ctl->start; 5429 u64 type = ctl->type; 5430 int ret; 5431 int i; 5432 int j; 5433 5434 map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); 5435 if (!map) 5436 return ERR_PTR(-ENOMEM); 5437 map->num_stripes = ctl->num_stripes; 5438 5439 for (i = 0; i < ctl->ndevs; ++i) { 5440 for (j = 0; j < ctl->dev_stripes; ++j) { 5441 int s = i * ctl->dev_stripes + j; 5442 map->stripes[s].dev = devices_info[i].dev; 5443 map->stripes[s].physical = devices_info[i].dev_offset + 5444 j * ctl->stripe_size; 5445 } 5446 } 5447 map->stripe_len = BTRFS_STRIPE_LEN; 5448 map->io_align = BTRFS_STRIPE_LEN; 5449 map->io_width = BTRFS_STRIPE_LEN; 5450 map->type = type; 5451 map->sub_stripes = ctl->sub_stripes; 5452 5453 trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); 5454 5455 em = alloc_extent_map(); 5456 if (!em) { 5457 kfree(map); 5458 return ERR_PTR(-ENOMEM); 5459 } 5460 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 5461 em->map_lookup = map; 5462 em->start = start; 5463 em->len = ctl->chunk_size; 5464 em->block_start = 0; 5465 em->block_len = em->len; 5466 em->orig_block_len = ctl->stripe_size; 5467 5468 em_tree = &info->mapping_tree; 5469 write_lock(&em_tree->lock); 5470 ret = add_extent_mapping(em_tree, em, 0); 5471 if (ret) { 5472 write_unlock(&em_tree->lock); 5473 free_extent_map(em); 5474 return ERR_PTR(ret); 5475 } 5476 write_unlock(&em_tree->lock); 5477 5478 block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); 5479 if (IS_ERR(block_group)) 5480 goto error_del_extent; 5481 5482 for (i = 0; i < map->num_stripes; i++) { 5483 struct btrfs_device *dev = map->stripes[i].dev; 5484 5485 btrfs_device_set_bytes_used(dev, 5486 dev->bytes_used + ctl->stripe_size); 5487 if (list_empty(&dev->post_commit_list)) 5488 list_add_tail(&dev->post_commit_list, 5489 &trans->transaction->dev_update_list); 5490 } 5491 5492 atomic64_sub(ctl->stripe_size * map->num_stripes, 5493 &info->free_chunk_space); 5494 5495 free_extent_map(em); 5496 check_raid56_incompat_flag(info, type); 5497 check_raid1c34_incompat_flag(info, type); 5498 5499 return block_group; 5500 5501 error_del_extent: 5502 write_lock(&em_tree->lock); 5503 remove_extent_mapping(em_tree, em); 5504 write_unlock(&em_tree->lock); 5505 5506 /* One for our allocation */ 5507 free_extent_map(em); 5508 /* One for the tree reference */ 5509 free_extent_map(em); 5510 5511 return block_group; 5512 } 5513 5514 struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, 5515 u64 
type) 5516 { 5517 struct btrfs_fs_info *info = trans->fs_info; 5518 struct btrfs_fs_devices *fs_devices = info->fs_devices; 5519 struct btrfs_device_info *devices_info = NULL; 5520 struct alloc_chunk_ctl ctl; 5521 struct btrfs_block_group *block_group; 5522 int ret; 5523 5524 lockdep_assert_held(&info->chunk_mutex); 5525 5526 if (!alloc_profile_is_valid(type, 0)) { 5527 ASSERT(0); 5528 return ERR_PTR(-EINVAL); 5529 } 5530 5531 if (list_empty(&fs_devices->alloc_list)) { 5532 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5533 btrfs_debug(info, "%s: no writable device", __func__); 5534 return ERR_PTR(-ENOSPC); 5535 } 5536 5537 if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 5538 btrfs_err(info, "invalid chunk type 0x%llx requested", type); 5539 ASSERT(0); 5540 return ERR_PTR(-EINVAL); 5541 } 5542 5543 ctl.start = find_next_chunk(info); 5544 ctl.type = type; 5545 init_alloc_chunk_ctl(fs_devices, &ctl); 5546 5547 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 5548 GFP_NOFS); 5549 if (!devices_info) 5550 return ERR_PTR(-ENOMEM); 5551 5552 ret = gather_device_info(fs_devices, &ctl, devices_info); 5553 if (ret < 0) { 5554 block_group = ERR_PTR(ret); 5555 goto out; 5556 } 5557 5558 ret = decide_stripe_size(fs_devices, &ctl, devices_info); 5559 if (ret < 0) { 5560 block_group = ERR_PTR(ret); 5561 goto out; 5562 } 5563 5564 block_group = create_chunk(trans, &ctl, devices_info); 5565 5566 out: 5567 kfree(devices_info); 5568 return block_group; 5569 } 5570 5571 /* 5572 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to 5573 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating 5574 * system chunks. 5575 * 5576 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 5577 * phases. 5578 */ 5579 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, 5580 struct btrfs_block_group *bg) 5581 { 5582 struct btrfs_fs_info *fs_info = trans->fs_info; 5583 struct btrfs_root *chunk_root = fs_info->chunk_root; 5584 struct btrfs_key key; 5585 struct btrfs_chunk *chunk; 5586 struct btrfs_stripe *stripe; 5587 struct extent_map *em; 5588 struct map_lookup *map; 5589 size_t item_size; 5590 int i; 5591 int ret; 5592 5593 /* 5594 * We take the chunk_mutex for 2 reasons: 5595 * 5596 * 1) Updates and insertions in the chunk btree must be done while holding 5597 * the chunk_mutex, as well as updating the system chunk array in the 5598 * superblock. See the comment on top of btrfs_chunk_alloc() for the 5599 * details; 5600 * 5601 * 2) To prevent races with the final phase of a device replace operation 5602 * that replaces the device object associated with the map's stripes, 5603 * because the device object's id can change at any time during that 5604 * final phase of the device replace operation 5605 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 5606 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, 5607 * which would cause a failure when updating the device item, which does 5608 * not exist, or when persisting a stripe of the chunk item with such an 5609 * ID. Here we can't use the device_list_mutex because our caller already 5610 * has locked the chunk_mutex, and the final phase of device replace 5611 * acquires both mutexes - first the device_list_mutex and then the 5612 * chunk_mutex. Using any of those two mutexes protects us from a 5613 * concurrent device replace.
5614 */ 5615 lockdep_assert_held(&fs_info->chunk_mutex); 5616 5617 em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); 5618 if (IS_ERR(em)) { 5619 ret = PTR_ERR(em); 5620 btrfs_abort_transaction(trans, ret); 5621 return ret; 5622 } 5623 5624 map = em->map_lookup; 5625 item_size = btrfs_chunk_item_size(map->num_stripes); 5626 5627 chunk = kzalloc(item_size, GFP_NOFS); 5628 if (!chunk) { 5629 ret = -ENOMEM; 5630 btrfs_abort_transaction(trans, ret); 5631 goto out; 5632 } 5633 5634 for (i = 0; i < map->num_stripes; i++) { 5635 struct btrfs_device *device = map->stripes[i].dev; 5636 5637 ret = btrfs_update_device(trans, device); 5638 if (ret) 5639 goto out; 5640 } 5641 5642 stripe = &chunk->stripe; 5643 for (i = 0; i < map->num_stripes; i++) { 5644 struct btrfs_device *device = map->stripes[i].dev; 5645 const u64 dev_offset = map->stripes[i].physical; 5646 5647 btrfs_set_stack_stripe_devid(stripe, device->devid); 5648 btrfs_set_stack_stripe_offset(stripe, dev_offset); 5649 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 5650 stripe++; 5651 } 5652 5653 btrfs_set_stack_chunk_length(chunk, bg->length); 5654 btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); 5655 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 5656 btrfs_set_stack_chunk_type(chunk, map->type); 5657 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 5658 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 5659 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 5660 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 5661 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 5662 5663 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 5664 key.type = BTRFS_CHUNK_ITEM_KEY; 5665 key.offset = bg->start; 5666 5667 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 5668 if (ret) 5669 goto out; 5670 5671 bg->chunk_item_inserted = 1; 5672 5673 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 5674 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 5675 if (ret) 5676 goto out; 5677 } 5678 5679 out: 5680 kfree(chunk); 5681 free_extent_map(em); 5682 return ret; 5683 } 5684 5685 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) 5686 { 5687 struct btrfs_fs_info *fs_info = trans->fs_info; 5688 u64 alloc_profile; 5689 struct btrfs_block_group *meta_bg; 5690 struct btrfs_block_group *sys_bg; 5691 5692 /* 5693 * When adding a new device for sprouting, the seed device is read-only 5694 * so we must first allocate a metadata and a system chunk. But before 5695 * adding the block group items to the extent, device and chunk btrees, 5696 * we must first: 5697 * 5698 * 1) Create both chunks without doing any changes to the btrees, as 5699 * otherwise we would get -ENOSPC since the block groups from the 5700 * seed device are read-only; 5701 * 5702 * 2) Add the device item for the new sprout device - finishing the setup 5703 * of a new block group requires updating the device item in the chunk 5704 * btree, so it must exist when we attempt to do it. The previous step 5705 * ensures this does not fail with -ENOSPC. 5706 * 5707 * After that we can add the block group items to their btrees: 5708 * update existing device item in the chunk btree, add a new block group 5709 * item to the extent btree, add a new chunk item to the chunk btree and 5710 * finally add the new device extent items to the devices btree. 
5711 */ 5712 5713 alloc_profile = btrfs_metadata_alloc_profile(fs_info); 5714 meta_bg = btrfs_create_chunk(trans, alloc_profile); 5715 if (IS_ERR(meta_bg)) 5716 return PTR_ERR(meta_bg); 5717 5718 alloc_profile = btrfs_system_alloc_profile(fs_info); 5719 sys_bg = btrfs_create_chunk(trans, alloc_profile); 5720 if (IS_ERR(sys_bg)) 5721 return PTR_ERR(sys_bg); 5722 5723 return 0; 5724 } 5725 5726 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5727 { 5728 const int index = btrfs_bg_flags_to_raid_index(map->type); 5729 5730 return btrfs_raid_array[index].tolerated_failures; 5731 } 5732 5733 bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5734 { 5735 struct extent_map *em; 5736 struct map_lookup *map; 5737 int miss_ndevs = 0; 5738 int i; 5739 bool ret = true; 5740 5741 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 5742 if (IS_ERR(em)) 5743 return false; 5744 5745 map = em->map_lookup; 5746 for (i = 0; i < map->num_stripes; i++) { 5747 if (test_bit(BTRFS_DEV_STATE_MISSING, 5748 &map->stripes[i].dev->dev_state)) { 5749 miss_ndevs++; 5750 continue; 5751 } 5752 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5753 &map->stripes[i].dev->dev_state)) { 5754 ret = false; 5755 goto end; 5756 } 5757 } 5758 5759 /* 5760 * If the number of missing devices is larger than max errors, we 5761 * cannot write the data into that chunk successfully. 5762 */ 5763 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5764 ret = false; 5765 end: 5766 free_extent_map(em); 5767 return ret; 5768 } 5769 5770 void btrfs_mapping_tree_free(struct extent_map_tree *tree) 5771 { 5772 struct extent_map *em; 5773 5774 while (1) { 5775 write_lock(&tree->lock); 5776 em = lookup_extent_mapping(tree, 0, (u64)-1); 5777 if (em) 5778 remove_extent_mapping(tree, em); 5779 write_unlock(&tree->lock); 5780 if (!em) 5781 break; 5782 /* once for us */ 5783 free_extent_map(em); 5784 /* once for the tree */ 5785 free_extent_map(em); 5786 } 5787 } 5788 5789 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5790 { 5791 struct extent_map *em; 5792 struct map_lookup *map; 5793 int ret; 5794 5795 em = btrfs_get_chunk_map(fs_info, logical, len); 5796 if (IS_ERR(em)) 5797 /* 5798 * We could return errors for these cases, but that could get 5799 * ugly and we'd probably do the same thing, which is to just 5800 * exit without doing anything else, so return 1 so the callers 5801 * don't try to use other copies. 5802 */ 5803 return 1; 5804 5805 map = em->map_lookup; 5806 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) 5807 ret = map->num_stripes; 5808 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5809 ret = map->sub_stripes; 5810 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5811 ret = 2; 5812 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5813 /* 5814 * There could be two corrupted data stripes, we need 5815 * to loop retry in order to rebuild the correct data. 5816 * 5817 * Fail a stripe at a time on every retry except the 5818 * stripe under reconstruction.
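 *
 * E.g. a raid6 chunk striped over 6 devices reports 6 copies here,
 * one per retry pass, even though only 2 stripes worth of parity
 * exist.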
5819 */ 5820 ret = map->num_stripes; 5821 else 5822 ret = 1; 5823 free_extent_map(em); 5824 5825 down_read(&fs_info->dev_replace.rwsem); 5826 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 5827 fs_info->dev_replace.tgtdev) 5828 ret++; 5829 up_read(&fs_info->dev_replace.rwsem); 5830 5831 return ret; 5832 } 5833 5834 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 5835 u64 logical) 5836 { 5837 struct extent_map *em; 5838 struct map_lookup *map; 5839 unsigned long len = fs_info->sectorsize; 5840 5841 em = btrfs_get_chunk_map(fs_info, logical, len); 5842 5843 if (!WARN_ON(IS_ERR(em))) { 5844 map = em->map_lookup; 5845 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5846 len = map->stripe_len * nr_data_stripes(map); 5847 free_extent_map(em); 5848 } 5849 return len; 5850 } 5851 5852 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5853 { 5854 struct extent_map *em; 5855 struct map_lookup *map; 5856 int ret = 0; 5857 5858 em = btrfs_get_chunk_map(fs_info, logical, len); 5859 5860 if(!WARN_ON(IS_ERR(em))) { 5861 map = em->map_lookup; 5862 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5863 ret = 1; 5864 free_extent_map(em); 5865 } 5866 return ret; 5867 } 5868 5869 static int find_live_mirror(struct btrfs_fs_info *fs_info, 5870 struct map_lookup *map, int first, 5871 int dev_replace_is_ongoing) 5872 { 5873 int i; 5874 int num_stripes; 5875 int preferred_mirror; 5876 int tolerance; 5877 struct btrfs_device *srcdev; 5878 5879 ASSERT((map->type & 5880 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); 5881 5882 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5883 num_stripes = map->sub_stripes; 5884 else 5885 num_stripes = map->num_stripes; 5886 5887 switch (fs_info->fs_devices->read_policy) { 5888 default: 5889 /* Shouldn't happen, just warn and use pid instead of failing */ 5890 btrfs_warn_rl(fs_info, 5891 "unknown read_policy type %u, reset to pid", 5892 fs_info->fs_devices->read_policy); 5893 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; 5894 fallthrough; 5895 case BTRFS_READ_POLICY_PID: 5896 preferred_mirror = first + (current->pid % num_stripes); 5897 break; 5898 } 5899 5900 if (dev_replace_is_ongoing && 5901 fs_info->dev_replace.cont_reading_from_srcdev_mode == 5902 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 5903 srcdev = fs_info->dev_replace.srcdev; 5904 else 5905 srcdev = NULL; 5906 5907 /* 5908 * try to avoid the drive that is the source drive for a 5909 * dev-replace procedure, only choose it if no other non-missing 5910 * mirror is available 5911 */ 5912 for (tolerance = 0; tolerance < 2; tolerance++) { 5913 if (map->stripes[preferred_mirror].dev->bdev && 5914 (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 5915 return preferred_mirror; 5916 for (i = first; i < first + num_stripes; i++) { 5917 if (map->stripes[i].dev->bdev && 5918 (tolerance || map->stripes[i].dev != srcdev)) 5919 return i; 5920 } 5921 } 5922 5923 /* we couldn't find one that doesn't fail. 
Just return something 5924 * and the io error handling code will clean up eventually 5925 */ 5926 return preferred_mirror; 5927 } 5928 5929 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 5930 static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes) 5931 { 5932 int i; 5933 int again = 1; 5934 5935 while (again) { 5936 again = 0; 5937 for (i = 0; i < num_stripes - 1; i++) { 5938 /* Swap if parity is on a smaller index */ 5939 if (bioc->raid_map[i] > bioc->raid_map[i + 1]) { 5940 swap(bioc->stripes[i], bioc->stripes[i + 1]); 5941 swap(bioc->raid_map[i], bioc->raid_map[i + 1]); 5942 again = 1; 5943 } 5944 } 5945 } 5946 } 5947 5948 static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, 5949 int total_stripes, 5950 int real_stripes) 5951 { 5952 struct btrfs_io_context *bioc = kzalloc( 5953 /* The size of btrfs_io_context */ 5954 sizeof(struct btrfs_io_context) + 5955 /* Plus the variable array for the stripes */ 5956 sizeof(struct btrfs_io_stripe) * (total_stripes) + 5957 /* Plus the variable array for the tgt dev */ 5958 sizeof(int) * (real_stripes) + 5959 /* 5960 * Plus the raid_map, which includes both the tgt dev 5961 * and the stripes. 5962 */ 5963 sizeof(u64) * (total_stripes), 5964 GFP_NOFS|__GFP_NOFAIL); 5965 5966 atomic_set(&bioc->error, 0); 5967 refcount_set(&bioc->refs, 1); 5968 5969 bioc->fs_info = fs_info; 5970 bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes); 5971 bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes); 5972 5973 return bioc; 5974 } 5975 5976 void btrfs_get_bioc(struct btrfs_io_context *bioc) 5977 { 5978 WARN_ON(!refcount_read(&bioc->refs)); 5979 refcount_inc(&bioc->refs); 5980 } 5981 5982 void btrfs_put_bioc(struct btrfs_io_context *bioc) 5983 { 5984 if (!bioc) 5985 return; 5986 if (refcount_dec_and_test(&bioc->refs)) 5987 kfree(bioc); 5988 } 5989 5990 /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ 5991 /* 5992 * Please note that, discard won't be sent to target device of device 5993 * replace. 5994 */ 5995 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, 5996 u64 logical, u64 *length_ret, 5997 struct btrfs_io_context **bioc_ret) 5998 { 5999 struct extent_map *em; 6000 struct map_lookup *map; 6001 struct btrfs_io_context *bioc; 6002 u64 length = *length_ret; 6003 u64 offset; 6004 u64 stripe_nr; 6005 u64 stripe_nr_end; 6006 u64 stripe_end_offset; 6007 u64 stripe_cnt; 6008 u64 stripe_len; 6009 u64 stripe_offset; 6010 u64 num_stripes; 6011 u32 stripe_index; 6012 u32 factor = 0; 6013 u32 sub_stripes = 0; 6014 u64 stripes_per_dev = 0; 6015 u32 remaining_stripes = 0; 6016 u32 last_stripe = 0; 6017 int ret = 0; 6018 int i; 6019 6020 /* Discard always returns a bioc. 
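 *
 * Worked example (illustrative): discarding 256K at offset 0 of a RAID0
 * chunk over two devices with a 64K stripe_len gives stripe_cnt = 4 and
 * num_stripes = 2, and each device receives one 128K range (two 64K
 * stripes glued together).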
*/ 6021 ASSERT(bioc_ret); 6022 6023 em = btrfs_get_chunk_map(fs_info, logical, length); 6024 if (IS_ERR(em)) 6025 return PTR_ERR(em); 6026 6027 map = em->map_lookup; 6028 /* we don't discard raid56 yet */ 6029 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6030 ret = -EOPNOTSUPP; 6031 goto out; 6032 } 6033 6034 offset = logical - em->start; 6035 length = min_t(u64, em->start + em->len - logical, length); 6036 *length_ret = length; 6037 6038 stripe_len = map->stripe_len; 6039 /* 6040 * stripe_nr counts the total number of stripes we have to stride 6041 * to get to this block 6042 */ 6043 stripe_nr = div64_u64(offset, stripe_len); 6044 6045 /* stripe_offset is the offset of this block in its stripe */ 6046 stripe_offset = offset - stripe_nr * stripe_len; 6047 6048 stripe_nr_end = round_up(offset + length, map->stripe_len); 6049 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 6050 stripe_cnt = stripe_nr_end - stripe_nr; 6051 stripe_end_offset = stripe_nr_end * map->stripe_len - 6052 (offset + length); 6053 /* 6054 * after this, stripe_nr is the number of stripes on this 6055 * device we have to walk to find the data, and stripe_index is 6056 * the number of our device in the stripe array 6057 */ 6058 num_stripes = 1; 6059 stripe_index = 0; 6060 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 6061 BTRFS_BLOCK_GROUP_RAID10)) { 6062 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 6063 sub_stripes = 1; 6064 else 6065 sub_stripes = map->sub_stripes; 6066 6067 factor = map->num_stripes / sub_stripes; 6068 num_stripes = min_t(u64, map->num_stripes, 6069 sub_stripes * stripe_cnt); 6070 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6071 stripe_index *= sub_stripes; 6072 stripes_per_dev = div_u64_rem(stripe_cnt, factor, 6073 &remaining_stripes); 6074 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 6075 last_stripe *= sub_stripes; 6076 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | 6077 BTRFS_BLOCK_GROUP_DUP)) { 6078 num_stripes = map->num_stripes; 6079 } else { 6080 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6081 &stripe_index); 6082 } 6083 6084 bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0); 6085 if (!bioc) { 6086 ret = -ENOMEM; 6087 goto out; 6088 } 6089 6090 for (i = 0; i < num_stripes; i++) { 6091 bioc->stripes[i].physical = 6092 map->stripes[stripe_index].physical + 6093 stripe_offset + stripe_nr * map->stripe_len; 6094 bioc->stripes[i].dev = map->stripes[stripe_index].dev; 6095 6096 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 6097 BTRFS_BLOCK_GROUP_RAID10)) { 6098 bioc->stripes[i].length = stripes_per_dev * 6099 map->stripe_len; 6100 6101 if (i / sub_stripes < remaining_stripes) 6102 bioc->stripes[i].length += map->stripe_len; 6103 6104 /* 6105 * Special for the first stripe and 6106 * the last stripe: 6107 * 6108 * |-------|...|-------| 6109 * |----------| 6110 * off end_off 6111 */ 6112 if (i < sub_stripes) 6113 bioc->stripes[i].length -= stripe_offset; 6114 6115 if (stripe_index >= last_stripe && 6116 stripe_index <= (last_stripe + 6117 sub_stripes - 1)) 6118 bioc->stripes[i].length -= stripe_end_offset; 6119 6120 if (i == sub_stripes - 1) 6121 stripe_offset = 0; 6122 } else { 6123 bioc->stripes[i].length = length; 6124 } 6125 6126 stripe_index++; 6127 if (stripe_index == map->num_stripes) { 6128 stripe_index = 0; 6129 stripe_nr++; 6130 } 6131 } 6132 6133 *bioc_ret = bioc; 6134 bioc->map_type = map->type; 6135 bioc->num_stripes = num_stripes; 6136 out: 6137 free_extent_map(em); 6138 return ret; 6139 } 6140 6141 /* 6142 * In dev-replace case, for 
repair case (that's the only case where the mirror
6143 * is selected explicitly when calling btrfs_map_block), blocks left of the
6144 * left cursor can also be read from the target drive.
6145 *
6146 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
6147 * array of stripes.
6148 * For READ, it also needs to be supported using the same mirror number.
6149 *
6150 * If the requested block is not left of the left cursor, EIO is returned. This
6151 * can happen because btrfs_num_copies() returns one more in the dev-replace
6152 * case.
6153 */
6154 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
6155 					 u64 logical, u64 length,
6156 					 u64 srcdev_devid, int *mirror_num,
6157 					 u64 *physical)
6158 {
6159 	struct btrfs_io_context *bioc = NULL;
6160 	int num_stripes;
6161 	int index_srcdev = 0;
6162 	int found = 0;
6163 	u64 physical_of_found = 0;
6164 	int i;
6165 	int ret = 0;
6166 
6167 	ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
6168 				logical, &length, &bioc, 0, 0);
6169 	if (ret) {
6170 		ASSERT(bioc == NULL);
6171 		return ret;
6172 	}
6173 
6174 	num_stripes = bioc->num_stripes;
6175 	if (*mirror_num > num_stripes) {
6176 		/*
6177 		 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
6178 		 * which means that the requested area is not left of the
6179 		 * left cursor.
6180 		 */
6181 		btrfs_put_bioc(bioc);
6182 		return -EIO;
6183 	}
6184 
6185 	/*
6186 	 * Process the rest of the function using the mirror_num of the source
6187 	 * drive. Therefore look it up first. At the end, patch the device
6188 	 * pointer to that of the target drive.
6189 	 */
6190 	for (i = 0; i < num_stripes; i++) {
6191 		if (bioc->stripes[i].dev->devid != srcdev_devid)
6192 			continue;
6193 
6194 		/*
6195 		 * In case of DUP, in order to keep it simple, only add the
6196 		 * mirror with the lowest physical address.
6197 		 */
6198 		if (found &&
6199 		    physical_of_found <= bioc->stripes[i].physical)
6200 			continue;
6201 
6202 		index_srcdev = i;
6203 		found = 1;
6204 		physical_of_found = bioc->stripes[i].physical;
6205 	}
6206 
6207 	btrfs_put_bioc(bioc);
6208 
6209 	ASSERT(found);
6210 	if (!found)
6211 		return -EIO;
6212 
6213 	*mirror_num = index_srcdev + 1;
6214 	*physical = physical_of_found;
6215 	return ret;
6216 }
6217 
6218 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
6219 {
6220 	struct btrfs_block_group *cache;
6221 	bool ret;
6222 
6223 	/* Non-zoned filesystems do not use the "to_copy" flag */
6224 	if (!btrfs_is_zoned(fs_info))
6225 		return false;
6226 
6227 	cache = btrfs_lookup_block_group(fs_info, logical);
6228 
6229 	spin_lock(&cache->lock);
6230 	ret = cache->to_copy;
6231 	spin_unlock(&cache->lock);
6232 
6233 	btrfs_put_block_group(cache);
6234 	return ret;
6235 }
6236 
6237 static void handle_ops_on_dev_replace(enum btrfs_map_op op,
6238 				      struct btrfs_io_context **bioc_ret,
6239 				      struct btrfs_dev_replace *dev_replace,
6240 				      u64 logical,
6241 				      int *num_stripes_ret, int *max_errors_ret)
6242 {
6243 	struct btrfs_io_context *bioc = *bioc_ret;
6244 	u64 srcdev_devid = dev_replace->srcdev->devid;
6245 	int tgtdev_indexes = 0;
6246 	int num_stripes = *num_stripes_ret;
6247 	int max_errors = *max_errors_ret;
6248 	int i;
6249 
6250 	if (op == BTRFS_MAP_WRITE) {
6251 		int index_where_to_add;
6252 
6253 		/*
6254 		 * A block group which has "to_copy" set will eventually be
6255 		 * copied by the dev-replace process. We can avoid cloning IO here.
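		 *
		 * (Illustrative: on zoned filesystems the replace process
		 * copies whole block groups sequentially, so a write landing
		 * in a block group still marked "to_copy" will be picked up
		 * by that copy pass anyway and does not need an extra stripe
		 * here.)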
6256 */ 6257 if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) 6258 return; 6259 6260 /* 6261 * duplicate the write operations while the dev replace 6262 * procedure is running. Since the copying of the old disk to 6263 * the new disk takes place at run time while the filesystem is 6264 * mounted writable, the regular write operations to the old 6265 * disk have to be duplicated to go to the new disk as well. 6266 * 6267 * Note that device->missing is handled by the caller, and that 6268 * the write to the old disk is already set up in the stripes 6269 * array. 6270 */ 6271 index_where_to_add = num_stripes; 6272 for (i = 0; i < num_stripes; i++) { 6273 if (bioc->stripes[i].dev->devid == srcdev_devid) { 6274 /* write to new disk, too */ 6275 struct btrfs_io_stripe *new = 6276 bioc->stripes + index_where_to_add; 6277 struct btrfs_io_stripe *old = 6278 bioc->stripes + i; 6279 6280 new->physical = old->physical; 6281 new->length = old->length; 6282 new->dev = dev_replace->tgtdev; 6283 bioc->tgtdev_map[i] = index_where_to_add; 6284 index_where_to_add++; 6285 max_errors++; 6286 tgtdev_indexes++; 6287 } 6288 } 6289 num_stripes = index_where_to_add; 6290 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 6291 int index_srcdev = 0; 6292 int found = 0; 6293 u64 physical_of_found = 0; 6294 6295 /* 6296 * During the dev-replace procedure, the target drive can also 6297 * be used to read data in case it is needed to repair a corrupt 6298 * block elsewhere. This is possible if the requested area is 6299 * left of the left cursor. In this area, the target drive is a 6300 * full copy of the source drive. 6301 */ 6302 for (i = 0; i < num_stripes; i++) { 6303 if (bioc->stripes[i].dev->devid == srcdev_devid) { 6304 /* 6305 * In case of DUP, in order to keep it simple, 6306 * only add the mirror with the lowest physical 6307 * address 6308 */ 6309 if (found && 6310 physical_of_found <= bioc->stripes[i].physical) 6311 continue; 6312 index_srcdev = i; 6313 found = 1; 6314 physical_of_found = bioc->stripes[i].physical; 6315 } 6316 } 6317 if (found) { 6318 struct btrfs_io_stripe *tgtdev_stripe = 6319 bioc->stripes + num_stripes; 6320 6321 tgtdev_stripe->physical = physical_of_found; 6322 tgtdev_stripe->length = 6323 bioc->stripes[index_srcdev].length; 6324 tgtdev_stripe->dev = dev_replace->tgtdev; 6325 bioc->tgtdev_map[index_srcdev] = num_stripes; 6326 6327 tgtdev_indexes++; 6328 num_stripes++; 6329 } 6330 } 6331 6332 *num_stripes_ret = num_stripes; 6333 *max_errors_ret = max_errors; 6334 bioc->num_tgtdevs = tgtdev_indexes; 6335 *bioc_ret = bioc; 6336 } 6337 6338 static bool need_full_stripe(enum btrfs_map_op op) 6339 { 6340 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 6341 } 6342 6343 /* 6344 * Calculate the geometry of a particular (address, len) tuple. This 6345 * information is used to calculate how big a particular bio can get before it 6346 * straddles a stripe. 6347 * 6348 * @fs_info: the filesystem 6349 * @em: mapping containing the logical extent 6350 * @op: type of operation - write or read 6351 * @logical: address that we want to figure out the geometry of 6352 * @io_geom: pointer used to return values 6353 * 6354 * Returns < 0 in case a chunk for the given logical address cannot be found, 6355 * usually shouldn't happen unless @logical is corrupted, 0 otherwise. 
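 *
 * Worked example (illustrative): for a stripe-based profile with a 64K
 * stripe_len and @logical 200K into the chunk, stripe_nr = 3,
 * stripe_offset = 8K, and a read's len is capped at 56K so the bio does
 * not straddle a stripe boundary.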
6356 */ 6357 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, 6358 enum btrfs_map_op op, u64 logical, 6359 struct btrfs_io_geometry *io_geom) 6360 { 6361 struct map_lookup *map; 6362 u64 len; 6363 u64 offset; 6364 u64 stripe_offset; 6365 u64 stripe_nr; 6366 u64 stripe_len; 6367 u64 raid56_full_stripe_start = (u64)-1; 6368 int data_stripes; 6369 6370 ASSERT(op != BTRFS_MAP_DISCARD); 6371 6372 map = em->map_lookup; 6373 /* Offset of this logical address in the chunk */ 6374 offset = logical - em->start; 6375 /* Len of a stripe in a chunk */ 6376 stripe_len = map->stripe_len; 6377 /* Stripe where this block falls in */ 6378 stripe_nr = div64_u64(offset, stripe_len); 6379 /* Offset of stripe in the chunk */ 6380 stripe_offset = stripe_nr * stripe_len; 6381 if (offset < stripe_offset) { 6382 btrfs_crit(fs_info, 6383 "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", 6384 stripe_offset, offset, em->start, logical, stripe_len); 6385 return -EINVAL; 6386 } 6387 6388 /* stripe_offset is the offset of this block in its stripe */ 6389 stripe_offset = offset - stripe_offset; 6390 data_stripes = nr_data_stripes(map); 6391 6392 /* Only stripe based profiles needs to check against stripe length. */ 6393 if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) { 6394 u64 max_len = stripe_len - stripe_offset; 6395 6396 /* 6397 * In case of raid56, we need to know the stripe aligned start 6398 */ 6399 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6400 unsigned long full_stripe_len = stripe_len * data_stripes; 6401 raid56_full_stripe_start = offset; 6402 6403 /* 6404 * Allow a write of a full stripe, but make sure we 6405 * don't allow straddling of stripes 6406 */ 6407 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 6408 full_stripe_len); 6409 raid56_full_stripe_start *= full_stripe_len; 6410 6411 /* 6412 * For writes to RAID[56], allow a full stripeset across 6413 * all disks. For other RAID types and for RAID[56] 6414 * reads, just allow a single stripe (on a single disk). 
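		 *
		 * E.g. (illustrative): RAID5 over 4 devices has 3 data
		 * stripes; with a 64K stripe_len the full stripe is 192K,
		 * so a write at chunk offset 200K gets
		 * raid56_full_stripe_start = 192K and max_len = 184K.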
6415 */ 6416 if (op == BTRFS_MAP_WRITE) { 6417 max_len = stripe_len * data_stripes - 6418 (offset - raid56_full_stripe_start); 6419 } 6420 } 6421 len = min_t(u64, em->len - offset, max_len); 6422 } else { 6423 len = em->len - offset; 6424 } 6425 6426 io_geom->len = len; 6427 io_geom->offset = offset; 6428 io_geom->stripe_len = stripe_len; 6429 io_geom->stripe_nr = stripe_nr; 6430 io_geom->stripe_offset = stripe_offset; 6431 io_geom->raid56_stripe_offset = raid56_full_stripe_start; 6432 6433 return 0; 6434 } 6435 6436 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 6437 enum btrfs_map_op op, 6438 u64 logical, u64 *length, 6439 struct btrfs_io_context **bioc_ret, 6440 int mirror_num, int need_raid_map) 6441 { 6442 struct extent_map *em; 6443 struct map_lookup *map; 6444 u64 stripe_offset; 6445 u64 stripe_nr; 6446 u64 stripe_len; 6447 u32 stripe_index; 6448 int data_stripes; 6449 int i; 6450 int ret = 0; 6451 int num_stripes; 6452 int max_errors = 0; 6453 int tgtdev_indexes = 0; 6454 struct btrfs_io_context *bioc = NULL; 6455 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 6456 int dev_replace_is_ongoing = 0; 6457 int num_alloc_stripes; 6458 int patch_the_first_stripe_for_dev_replace = 0; 6459 u64 physical_to_patch_in_first_stripe = 0; 6460 u64 raid56_full_stripe_start = (u64)-1; 6461 struct btrfs_io_geometry geom; 6462 6463 ASSERT(bioc_ret); 6464 ASSERT(op != BTRFS_MAP_DISCARD); 6465 6466 em = btrfs_get_chunk_map(fs_info, logical, *length); 6467 ASSERT(!IS_ERR(em)); 6468 6469 ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); 6470 if (ret < 0) 6471 return ret; 6472 6473 map = em->map_lookup; 6474 6475 *length = geom.len; 6476 stripe_len = geom.stripe_len; 6477 stripe_nr = geom.stripe_nr; 6478 stripe_offset = geom.stripe_offset; 6479 raid56_full_stripe_start = geom.raid56_stripe_offset; 6480 data_stripes = nr_data_stripes(map); 6481 6482 down_read(&dev_replace->rwsem); 6483 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 6484 /* 6485 * Hold the semaphore for read during the whole operation, write is 6486 * requested at commit time but must wait. 
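	 *
	 * (Illustrative: the replace-finishing code takes this rwsem for
	 * write when it swaps the target device in, so holding it for read
	 * here keeps the stripe list stable for the whole mapping.)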
6487 */ 6488 if (!dev_replace_is_ongoing) 6489 up_read(&dev_replace->rwsem); 6490 6491 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 6492 !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 6493 ret = get_extra_mirror_from_replace(fs_info, logical, *length, 6494 dev_replace->srcdev->devid, 6495 &mirror_num, 6496 &physical_to_patch_in_first_stripe); 6497 if (ret) 6498 goto out; 6499 else 6500 patch_the_first_stripe_for_dev_replace = 1; 6501 } else if (mirror_num > map->num_stripes) { 6502 mirror_num = 0; 6503 } 6504 6505 num_stripes = 1; 6506 stripe_index = 0; 6507 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 6508 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6509 &stripe_index); 6510 if (!need_full_stripe(op)) 6511 mirror_num = 1; 6512 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { 6513 if (need_full_stripe(op)) 6514 num_stripes = map->num_stripes; 6515 else if (mirror_num) 6516 stripe_index = mirror_num - 1; 6517 else { 6518 stripe_index = find_live_mirror(fs_info, map, 0, 6519 dev_replace_is_ongoing); 6520 mirror_num = stripe_index + 1; 6521 } 6522 6523 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 6524 if (need_full_stripe(op)) { 6525 num_stripes = map->num_stripes; 6526 } else if (mirror_num) { 6527 stripe_index = mirror_num - 1; 6528 } else { 6529 mirror_num = 1; 6530 } 6531 6532 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 6533 u32 factor = map->num_stripes / map->sub_stripes; 6534 6535 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6536 stripe_index *= map->sub_stripes; 6537 6538 if (need_full_stripe(op)) 6539 num_stripes = map->sub_stripes; 6540 else if (mirror_num) 6541 stripe_index += mirror_num - 1; 6542 else { 6543 int old_stripe_index = stripe_index; 6544 stripe_index = find_live_mirror(fs_info, map, 6545 stripe_index, 6546 dev_replace_is_ongoing); 6547 mirror_num = stripe_index - old_stripe_index + 1; 6548 } 6549 6550 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6551 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 6552 /* push stripe_nr back to the start of the full stripe */ 6553 stripe_nr = div64_u64(raid56_full_stripe_start, 6554 stripe_len * data_stripes); 6555 6556 /* RAID[56] write or recovery. Return all stripes */ 6557 num_stripes = map->num_stripes; 6558 max_errors = nr_parity_stripes(map); 6559 6560 *length = map->stripe_len; 6561 stripe_index = 0; 6562 stripe_offset = 0; 6563 } else { 6564 /* 6565 * Mirror #0 or #1 means the original data block. 6566 * Mirror #2 is RAID5 parity block. 6567 * Mirror #3 is RAID6 Q block. 
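			 *
			 * E.g. (illustrative): on RAID6, mirror_num 3 yields
			 * stripe_index = data_stripes + 1, the Q stripe,
			 * before the rotation below spreads parity across
			 * the devices.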
6568 */ 6569 stripe_nr = div_u64_rem(stripe_nr, 6570 data_stripes, &stripe_index); 6571 if (mirror_num > 1) 6572 stripe_index = data_stripes + mirror_num - 2; 6573 6574 /* We distribute the parity blocks across stripes */ 6575 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 6576 &stripe_index); 6577 if (!need_full_stripe(op) && mirror_num <= 1) 6578 mirror_num = 1; 6579 } 6580 } else { 6581 /* 6582 * after this, stripe_nr is the number of stripes on this 6583 * device we have to walk to find the data, and stripe_index is 6584 * the number of our device in the stripe array 6585 */ 6586 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6587 &stripe_index); 6588 mirror_num = stripe_index + 1; 6589 } 6590 if (stripe_index >= map->num_stripes) { 6591 btrfs_crit(fs_info, 6592 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 6593 stripe_index, map->num_stripes); 6594 ret = -EINVAL; 6595 goto out; 6596 } 6597 6598 num_alloc_stripes = num_stripes; 6599 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 6600 if (op == BTRFS_MAP_WRITE) 6601 num_alloc_stripes <<= 1; 6602 if (op == BTRFS_MAP_GET_READ_MIRRORS) 6603 num_alloc_stripes++; 6604 tgtdev_indexes = num_stripes; 6605 } 6606 6607 bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); 6608 if (!bioc) { 6609 ret = -ENOMEM; 6610 goto out; 6611 } 6612 6613 for (i = 0; i < num_stripes; i++) { 6614 bioc->stripes[i].physical = map->stripes[stripe_index].physical + 6615 stripe_offset + stripe_nr * map->stripe_len; 6616 bioc->stripes[i].dev = map->stripes[stripe_index].dev; 6617 stripe_index++; 6618 } 6619 6620 /* Build raid_map */ 6621 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 6622 (need_full_stripe(op) || mirror_num > 1)) { 6623 u64 tmp; 6624 unsigned rot; 6625 6626 /* Work out the disk rotation on this stripe-set */ 6627 div_u64_rem(stripe_nr, num_stripes, &rot); 6628 6629 /* Fill in the logical address of each stripe */ 6630 tmp = stripe_nr * data_stripes; 6631 for (i = 0; i < data_stripes; i++) 6632 bioc->raid_map[(i + rot) % num_stripes] = 6633 em->start + (tmp + i) * map->stripe_len; 6634 6635 bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE; 6636 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 6637 bioc->raid_map[(i + rot + 1) % num_stripes] = 6638 RAID6_Q_STRIPE; 6639 6640 sort_parity_stripes(bioc, num_stripes); 6641 } 6642 6643 if (need_full_stripe(op)) 6644 max_errors = btrfs_chunk_max_errors(map); 6645 6646 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 6647 need_full_stripe(op)) { 6648 handle_ops_on_dev_replace(op, &bioc, dev_replace, logical, 6649 &num_stripes, &max_errors); 6650 } 6651 6652 *bioc_ret = bioc; 6653 bioc->map_type = map->type; 6654 bioc->num_stripes = num_stripes; 6655 bioc->max_errors = max_errors; 6656 bioc->mirror_num = mirror_num; 6657 6658 /* 6659 * this is the case that REQ_READ && dev_replace_is_ongoing && 6660 * mirror_num == num_stripes + 1 && dev_replace target drive is 6661 * available as a mirror 6662 */ 6663 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 6664 WARN_ON(num_stripes > 1); 6665 bioc->stripes[0].dev = dev_replace->tgtdev; 6666 bioc->stripes[0].physical = physical_to_patch_in_first_stripe; 6667 bioc->mirror_num = map->num_stripes + 1; 6668 } 6669 out: 6670 if (dev_replace_is_ongoing) { 6671 lockdep_assert_held(&dev_replace->rwsem); 6672 /* Unlock and let waiting writers proceed */ 6673 up_read(&dev_replace->rwsem); 6674 } 6675 free_extent_map(em); 6676 return ret; 6677 } 
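/*
 * Usage sketch (illustrative only, not part of any call path): a simple
 * read caller would do something like
 *
 *	u64 map_length = fs_info->sectorsize;
 *	struct btrfs_io_context *bioc = NULL;
 *	int ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
 *				  &map_length, &bioc, 0);
 *
 * and, when ret is 0, submit its bio to bioc->stripes[0].dev at
 * bioc->stripes[0].physical, dropping the reference with btrfs_put_bioc()
 * once the IO completes.
 */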
6678 6679 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6680 u64 logical, u64 *length, 6681 struct btrfs_io_context **bioc_ret, int mirror_num) 6682 { 6683 if (op == BTRFS_MAP_DISCARD) 6684 return __btrfs_map_block_for_discard(fs_info, logical, 6685 length, bioc_ret); 6686 6687 return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 6688 mirror_num, 0); 6689 } 6690 6691 /* For Scrub/replace */ 6692 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6693 u64 logical, u64 *length, 6694 struct btrfs_io_context **bioc_ret) 6695 { 6696 return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1); 6697 } 6698 6699 static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio) 6700 { 6701 bio->bi_private = bioc->private; 6702 bio->bi_end_io = bioc->end_io; 6703 bio_endio(bio); 6704 6705 btrfs_put_bioc(bioc); 6706 } 6707 6708 static void btrfs_end_bio(struct bio *bio) 6709 { 6710 struct btrfs_io_context *bioc = bio->bi_private; 6711 int is_orig_bio = 0; 6712 6713 if (bio->bi_status) { 6714 atomic_inc(&bioc->error); 6715 if (bio->bi_status == BLK_STS_IOERR || 6716 bio->bi_status == BLK_STS_TARGET) { 6717 struct btrfs_device *dev = btrfs_bio(bio)->device; 6718 6719 ASSERT(dev->bdev); 6720 if (btrfs_op(bio) == BTRFS_MAP_WRITE) 6721 btrfs_dev_stat_inc_and_print(dev, 6722 BTRFS_DEV_STAT_WRITE_ERRS); 6723 else if (!(bio->bi_opf & REQ_RAHEAD)) 6724 btrfs_dev_stat_inc_and_print(dev, 6725 BTRFS_DEV_STAT_READ_ERRS); 6726 if (bio->bi_opf & REQ_PREFLUSH) 6727 btrfs_dev_stat_inc_and_print(dev, 6728 BTRFS_DEV_STAT_FLUSH_ERRS); 6729 } 6730 } 6731 6732 if (bio == bioc->orig_bio) 6733 is_orig_bio = 1; 6734 6735 btrfs_bio_counter_dec(bioc->fs_info); 6736 6737 if (atomic_dec_and_test(&bioc->stripes_pending)) { 6738 if (!is_orig_bio) { 6739 bio_put(bio); 6740 bio = bioc->orig_bio; 6741 } 6742 6743 btrfs_bio(bio)->mirror_num = bioc->mirror_num; 6744 /* only send an error to the higher layers if it is 6745 * beyond the tolerance of the btrfs bio 6746 */ 6747 if (atomic_read(&bioc->error) > bioc->max_errors) { 6748 bio->bi_status = BLK_STS_IOERR; 6749 } else { 6750 /* 6751 * this bio is actually up to date, we didn't 6752 * go over the max number of errors 6753 */ 6754 bio->bi_status = BLK_STS_OK; 6755 } 6756 6757 btrfs_end_bioc(bioc, bio); 6758 } else if (!is_orig_bio) { 6759 bio_put(bio); 6760 } 6761 } 6762 6763 static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, 6764 u64 physical, struct btrfs_device *dev) 6765 { 6766 struct btrfs_fs_info *fs_info = bioc->fs_info; 6767 6768 bio->bi_private = bioc; 6769 btrfs_bio(bio)->device = dev; 6770 bio->bi_end_io = btrfs_end_bio; 6771 bio->bi_iter.bi_sector = physical >> 9; 6772 /* 6773 * For zone append writing, bi_sector must point the beginning of the 6774 * zone 6775 */ 6776 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 6777 if (btrfs_dev_is_sequential(dev, physical)) { 6778 u64 zone_start = round_down(physical, fs_info->zone_size); 6779 6780 bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; 6781 } else { 6782 bio->bi_opf &= ~REQ_OP_ZONE_APPEND; 6783 bio->bi_opf |= REQ_OP_WRITE; 6784 } 6785 } 6786 btrfs_debug_in_rcu(fs_info, 6787 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6788 bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, 6789 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), 6790 dev->devid, bio->bi_iter.bi_size); 6791 bio_set_dev(bio, dev->bdev); 6792 6793 btrfs_bio_counter_inc_noblocked(fs_info); 6794 6795 
btrfsic_submit_bio(bio); 6796 } 6797 6798 static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical) 6799 { 6800 atomic_inc(&bioc->error); 6801 if (atomic_dec_and_test(&bioc->stripes_pending)) { 6802 /* Should be the original bio. */ 6803 WARN_ON(bio != bioc->orig_bio); 6804 6805 btrfs_bio(bio)->mirror_num = bioc->mirror_num; 6806 bio->bi_iter.bi_sector = logical >> 9; 6807 if (atomic_read(&bioc->error) > bioc->max_errors) 6808 bio->bi_status = BLK_STS_IOERR; 6809 else 6810 bio->bi_status = BLK_STS_OK; 6811 btrfs_end_bioc(bioc, bio); 6812 } 6813 } 6814 6815 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 6816 int mirror_num) 6817 { 6818 struct btrfs_device *dev; 6819 struct bio *first_bio = bio; 6820 u64 logical = bio->bi_iter.bi_sector << 9; 6821 u64 length = 0; 6822 u64 map_length; 6823 int ret; 6824 int dev_nr; 6825 int total_devs; 6826 struct btrfs_io_context *bioc = NULL; 6827 6828 length = bio->bi_iter.bi_size; 6829 map_length = length; 6830 6831 btrfs_bio_counter_inc_blocked(fs_info); 6832 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, 6833 &map_length, &bioc, mirror_num, 1); 6834 if (ret) { 6835 btrfs_bio_counter_dec(fs_info); 6836 return errno_to_blk_status(ret); 6837 } 6838 6839 total_devs = bioc->num_stripes; 6840 bioc->orig_bio = first_bio; 6841 bioc->private = first_bio->bi_private; 6842 bioc->end_io = first_bio->bi_end_io; 6843 atomic_set(&bioc->stripes_pending, bioc->num_stripes); 6844 6845 if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 6846 ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { 6847 /* In this case, map_length has been set to the length of 6848 a single stripe; not the whole write */ 6849 if (btrfs_op(bio) == BTRFS_MAP_WRITE) { 6850 ret = raid56_parity_write(bio, bioc, map_length); 6851 } else { 6852 ret = raid56_parity_recover(bio, bioc, map_length, 6853 mirror_num, 1); 6854 } 6855 6856 btrfs_bio_counter_dec(fs_info); 6857 return errno_to_blk_status(ret); 6858 } 6859 6860 if (map_length < length) { 6861 btrfs_crit(fs_info, 6862 "mapping failed logical %llu bio len %llu len %llu", 6863 logical, length, map_length); 6864 BUG(); 6865 } 6866 6867 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { 6868 dev = bioc->stripes[dev_nr].dev; 6869 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, 6870 &dev->dev_state) || 6871 (btrfs_op(first_bio) == BTRFS_MAP_WRITE && 6872 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { 6873 bioc_error(bioc, first_bio, logical); 6874 continue; 6875 } 6876 6877 if (dev_nr < total_devs - 1) 6878 bio = btrfs_bio_clone(first_bio); 6879 else 6880 bio = first_bio; 6881 6882 submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev); 6883 } 6884 btrfs_bio_counter_dec(fs_info); 6885 return BLK_STS_OK; 6886 } 6887 6888 static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, 6889 const struct btrfs_fs_devices *fs_devices) 6890 { 6891 if (args->fsid == NULL) 6892 return true; 6893 if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0) 6894 return true; 6895 return false; 6896 } 6897 6898 static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args, 6899 const struct btrfs_device *device) 6900 { 6901 ASSERT((args->devid != (u64)-1) || args->missing); 6902 6903 if ((args->devid != (u64)-1) && device->devid != args->devid) 6904 return false; 6905 if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0) 6906 return false; 6907 if (!args->missing) 6908 return true; 6909 if 
(test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && 6910 !device->bdev) 6911 return true; 6912 return false; 6913 } 6914 6915 /* 6916 * Find a device specified by @devid or @uuid in the list of @fs_devices, or 6917 * return NULL. 6918 * 6919 * If devid and uuid are both specified, the match must be exact, otherwise 6920 * only devid is used. 6921 */ 6922 struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, 6923 const struct btrfs_dev_lookup_args *args) 6924 { 6925 struct btrfs_device *device; 6926 struct btrfs_fs_devices *seed_devs; 6927 6928 if (dev_args_match_fs_devices(args, fs_devices)) { 6929 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6930 if (dev_args_match_device(args, device)) 6931 return device; 6932 } 6933 } 6934 6935 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 6936 if (!dev_args_match_fs_devices(args, seed_devs)) 6937 continue; 6938 list_for_each_entry(device, &seed_devs->devices, dev_list) { 6939 if (dev_args_match_device(args, device)) 6940 return device; 6941 } 6942 } 6943 6944 return NULL; 6945 } 6946 6947 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6948 u64 devid, u8 *dev_uuid) 6949 { 6950 struct btrfs_device *device; 6951 unsigned int nofs_flag; 6952 6953 /* 6954 * We call this under the chunk_mutex, so we want to use NOFS for this 6955 * allocation, however we don't want to change btrfs_alloc_device() to 6956 * always do NOFS because we use it in a lot of other GFP_KERNEL safe 6957 * places. 6958 */ 6959 nofs_flag = memalloc_nofs_save(); 6960 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6961 memalloc_nofs_restore(nofs_flag); 6962 if (IS_ERR(device)) 6963 return device; 6964 6965 list_add(&device->dev_list, &fs_devices->devices); 6966 device->fs_devices = fs_devices; 6967 fs_devices->num_devices++; 6968 6969 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6970 fs_devices->missing_devices++; 6971 6972 return device; 6973 } 6974 6975 /** 6976 * btrfs_alloc_device - allocate struct btrfs_device 6977 * @fs_info: used only for generating a new devid, can be NULL if 6978 * devid is provided (i.e. @devid != NULL). 6979 * @devid: a pointer to devid for this device. If NULL a new devid 6980 * is generated. 6981 * @uuid: a pointer to UUID for this device. If NULL a new UUID 6982 * is generated. 6983 * 6984 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 6985 * on error. Returned struct is not linked onto any lists and must be 6986 * destroyed with btrfs_free_device. 
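 *
 * Example (illustrative): allocating a device with a freshly generated
 * devid and UUID:
 *
 *	device = btrfs_alloc_device(fs_info, NULL, NULL);
 *	if (IS_ERR(device))
 *		return PTR_ERR(device);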
6987 */
6988 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6989 					const u64 *devid,
6990 					const u8 *uuid)
6991 {
6992 	struct btrfs_device *dev;
6993 	u64 tmp;
6994 
6995 	if (WARN_ON(!devid && !fs_info))
6996 		return ERR_PTR(-EINVAL);
6997 
6998 	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
6999 	if (!dev)
7000 		return ERR_PTR(-ENOMEM);
7001 
7002 	/*
7003 	 * Preallocate a bio that's always going to be used for flushing device
7004 	 * barriers and matches the device lifespan.
7005 	 */
7006 	dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
7007 	if (!dev->flush_bio) {
7008 		kfree(dev);
7009 		return ERR_PTR(-ENOMEM);
7010 	}
7011 
7012 	INIT_LIST_HEAD(&dev->dev_list);
7013 	INIT_LIST_HEAD(&dev->dev_alloc_list);
7014 	INIT_LIST_HEAD(&dev->post_commit_list);
7015 
7016 	atomic_set(&dev->dev_stats_ccnt, 0);
7017 	btrfs_device_data_ordered_init(dev);
7018 	extent_io_tree_init(fs_info, &dev->alloc_state,
7019 			    IO_TREE_DEVICE_ALLOC_STATE, NULL);
7020 
7021 	if (devid)
7022 		tmp = *devid;
7023 	else {
7024 		int ret;
7025 
7026 		ret = find_next_devid(fs_info, &tmp);
7027 		if (ret) {
7028 			btrfs_free_device(dev);
7029 			return ERR_PTR(ret);
7030 		}
7031 	}
7032 	dev->devid = tmp;
7033 
7034 	if (uuid)
7035 		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
7036 	else
7037 		generate_random_uuid(dev->uuid);
7038 
7039 	return dev;
7040 }
7041 
7042 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
7043 					u64 devid, u8 *uuid, bool error)
7044 {
7045 	if (error)
7046 		btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
7047 			      devid, uuid);
7048 	else
7049 		btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
7050 			      devid, uuid);
7051 }
7052 
7053 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
7054 {
7055 	const int data_stripes = calc_data_stripes(type, num_stripes);
7056 
7057 	return div_u64(chunk_len, data_stripes);
7058 }
7059 
7060 #if BITS_PER_LONG == 32
7061 /*
7062  * Due to the page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
7063  * can't be accessed on 32-bit systems.
7064  *
7065  * This function does a mount-time check to reject the fs if it already has
7066  * a metadata chunk beyond that limit.
7067  */
7068 static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
7069 				  u64 logical, u64 length, u64 type)
7070 {
7071 	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
7072 		return 0;
7073 
7074 	if (logical + length < MAX_LFS_FILESIZE)
7075 		return 0;
7076 
7077 	btrfs_err_32bit_limit(fs_info);
7078 	return -EOVERFLOW;
7079 }
7080 
7081 /*
7082  * This is to give an early warning for any metadata chunk reaching
7083  * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
7084  * Although we can still access the metadata, it's not going to be possible
7085  * once the limit is reached.
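 *
 * (Illustrative: a metadata chunk ending above the threshold only warns
 * here, while one crossing MAX_LFS_FILESIZE is rejected outright with
 * -EOVERFLOW by check_32bit_meta_chunk() above.)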
7086 */ 7087 static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 7088 u64 logical, u64 length, u64 type) 7089 { 7090 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 7091 return; 7092 7093 if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD) 7094 return; 7095 7096 btrfs_warn_32bit_limit(fs_info); 7097 } 7098 #endif 7099 7100 static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info, 7101 u64 devid, u8 *uuid) 7102 { 7103 struct btrfs_device *dev; 7104 7105 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7106 btrfs_report_missing_device(fs_info, devid, uuid, true); 7107 return ERR_PTR(-ENOENT); 7108 } 7109 7110 dev = add_missing_dev(fs_info->fs_devices, devid, uuid); 7111 if (IS_ERR(dev)) { 7112 btrfs_err(fs_info, "failed to init missing device %llu: %ld", 7113 devid, PTR_ERR(dev)); 7114 return dev; 7115 } 7116 btrfs_report_missing_device(fs_info, devid, uuid, false); 7117 7118 return dev; 7119 } 7120 7121 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, 7122 struct btrfs_chunk *chunk) 7123 { 7124 BTRFS_DEV_LOOKUP_ARGS(args); 7125 struct btrfs_fs_info *fs_info = leaf->fs_info; 7126 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7127 struct map_lookup *map; 7128 struct extent_map *em; 7129 u64 logical; 7130 u64 length; 7131 u64 devid; 7132 u64 type; 7133 u8 uuid[BTRFS_UUID_SIZE]; 7134 int num_stripes; 7135 int ret; 7136 int i; 7137 7138 logical = key->offset; 7139 length = btrfs_chunk_length(leaf, chunk); 7140 type = btrfs_chunk_type(leaf, chunk); 7141 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 7142 7143 #if BITS_PER_LONG == 32 7144 ret = check_32bit_meta_chunk(fs_info, logical, length, type); 7145 if (ret < 0) 7146 return ret; 7147 warn_32bit_meta_chunk(fs_info, logical, length, type); 7148 #endif 7149 7150 /* 7151 * Only need to verify chunk item if we're reading from sys chunk array, 7152 * as chunk item in tree block is already verified by tree-checker. 7153 */ 7154 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { 7155 ret = btrfs_check_chunk_valid(leaf, chunk, logical); 7156 if (ret) 7157 return ret; 7158 } 7159 7160 read_lock(&map_tree->lock); 7161 em = lookup_extent_mapping(map_tree, logical, 1); 7162 read_unlock(&map_tree->lock); 7163 7164 /* already mapped? 
*/ 7165 if (em && em->start <= logical && em->start + em->len > logical) { 7166 free_extent_map(em); 7167 return 0; 7168 } else if (em) { 7169 free_extent_map(em); 7170 } 7171 7172 em = alloc_extent_map(); 7173 if (!em) 7174 return -ENOMEM; 7175 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 7176 if (!map) { 7177 free_extent_map(em); 7178 return -ENOMEM; 7179 } 7180 7181 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 7182 em->map_lookup = map; 7183 em->start = logical; 7184 em->len = length; 7185 em->orig_start = 0; 7186 em->block_start = 0; 7187 em->block_len = em->len; 7188 7189 map->num_stripes = num_stripes; 7190 map->io_width = btrfs_chunk_io_width(leaf, chunk); 7191 map->io_align = btrfs_chunk_io_align(leaf, chunk); 7192 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 7193 map->type = type; 7194 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 7195 map->verified_stripes = 0; 7196 em->orig_block_len = calc_stripe_length(type, em->len, 7197 map->num_stripes); 7198 for (i = 0; i < num_stripes; i++) { 7199 map->stripes[i].physical = 7200 btrfs_stripe_offset_nr(leaf, chunk, i); 7201 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 7202 args.devid = devid; 7203 read_extent_buffer(leaf, uuid, (unsigned long) 7204 btrfs_stripe_dev_uuid_nr(chunk, i), 7205 BTRFS_UUID_SIZE); 7206 args.uuid = uuid; 7207 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); 7208 if (!map->stripes[i].dev) { 7209 map->stripes[i].dev = handle_missing_device(fs_info, 7210 devid, uuid); 7211 if (IS_ERR(map->stripes[i].dev)) { 7212 free_extent_map(em); 7213 return PTR_ERR(map->stripes[i].dev); 7214 } 7215 } 7216 7217 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 7218 &(map->stripes[i].dev->dev_state)); 7219 } 7220 7221 write_lock(&map_tree->lock); 7222 ret = add_extent_mapping(map_tree, em, 0); 7223 write_unlock(&map_tree->lock); 7224 if (ret < 0) { 7225 btrfs_err(fs_info, 7226 "failed to add chunk map, start=%llu len=%llu: %d", 7227 em->start, em->len, ret); 7228 } 7229 free_extent_map(em); 7230 7231 return ret; 7232 } 7233 7234 static void fill_device_from_item(struct extent_buffer *leaf, 7235 struct btrfs_dev_item *dev_item, 7236 struct btrfs_device *device) 7237 { 7238 unsigned long ptr; 7239 7240 device->devid = btrfs_device_id(leaf, dev_item); 7241 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 7242 device->total_bytes = device->disk_total_bytes; 7243 device->commit_total_bytes = device->disk_total_bytes; 7244 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 7245 device->commit_bytes_used = device->bytes_used; 7246 device->type = btrfs_device_type(leaf, dev_item); 7247 device->io_align = btrfs_device_io_align(leaf, dev_item); 7248 device->io_width = btrfs_device_io_width(leaf, dev_item); 7249 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 7250 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 7251 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 7252 7253 ptr = btrfs_device_uuid(dev_item); 7254 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 7255 } 7256 7257 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 7258 u8 *fsid) 7259 { 7260 struct btrfs_fs_devices *fs_devices; 7261 int ret; 7262 7263 lockdep_assert_held(&uuid_mutex); 7264 ASSERT(fsid); 7265 7266 /* This will match only for multi-device seed fs */ 7267 list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) 7268 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 7269 return fs_devices; 7270 7271 
7272 fs_devices = find_fsid(fsid, NULL); 7273 if (!fs_devices) { 7274 if (!btrfs_test_opt(fs_info, DEGRADED)) 7275 return ERR_PTR(-ENOENT); 7276 7277 fs_devices = alloc_fs_devices(fsid, NULL); 7278 if (IS_ERR(fs_devices)) 7279 return fs_devices; 7280 7281 fs_devices->seeding = true; 7282 fs_devices->opened = 1; 7283 return fs_devices; 7284 } 7285 7286 /* 7287 * Upon first call for a seed fs fsid, just create a private copy of the 7288 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list 7289 */ 7290 fs_devices = clone_fs_devices(fs_devices); 7291 if (IS_ERR(fs_devices)) 7292 return fs_devices; 7293 7294 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); 7295 if (ret) { 7296 free_fs_devices(fs_devices); 7297 return ERR_PTR(ret); 7298 } 7299 7300 if (!fs_devices->seeding) { 7301 close_fs_devices(fs_devices); 7302 free_fs_devices(fs_devices); 7303 return ERR_PTR(-EINVAL); 7304 } 7305 7306 list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); 7307 7308 return fs_devices; 7309 } 7310 7311 static int read_one_dev(struct extent_buffer *leaf, 7312 struct btrfs_dev_item *dev_item) 7313 { 7314 BTRFS_DEV_LOOKUP_ARGS(args); 7315 struct btrfs_fs_info *fs_info = leaf->fs_info; 7316 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7317 struct btrfs_device *device; 7318 u64 devid; 7319 int ret; 7320 u8 fs_uuid[BTRFS_FSID_SIZE]; 7321 u8 dev_uuid[BTRFS_UUID_SIZE]; 7322 7323 devid = args.devid = btrfs_device_id(leaf, dev_item); 7324 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 7325 BTRFS_UUID_SIZE); 7326 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 7327 BTRFS_FSID_SIZE); 7328 args.uuid = dev_uuid; 7329 args.fsid = fs_uuid; 7330 7331 if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { 7332 fs_devices = open_seed_devices(fs_info, fs_uuid); 7333 if (IS_ERR(fs_devices)) 7334 return PTR_ERR(fs_devices); 7335 } 7336 7337 device = btrfs_find_device(fs_info->fs_devices, &args); 7338 if (!device) { 7339 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7340 btrfs_report_missing_device(fs_info, devid, 7341 dev_uuid, true); 7342 return -ENOENT; 7343 } 7344 7345 device = add_missing_dev(fs_devices, devid, dev_uuid); 7346 if (IS_ERR(device)) { 7347 btrfs_err(fs_info, 7348 "failed to add missing dev %llu: %ld", 7349 devid, PTR_ERR(device)); 7350 return PTR_ERR(device); 7351 } 7352 btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 7353 } else { 7354 if (!device->bdev) { 7355 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7356 btrfs_report_missing_device(fs_info, 7357 devid, dev_uuid, true); 7358 return -ENOENT; 7359 } 7360 btrfs_report_missing_device(fs_info, devid, 7361 dev_uuid, false); 7362 } 7363 7364 if (!device->bdev && 7365 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 7366 /* 7367 * this happens when a device that was properly setup 7368 * in the device info lists suddenly goes bad. 
7369 			 * device->bdev is NULL, and so we have to set the
7370 			 * BTRFS_DEV_STATE_MISSING bit here
7371 			 */
7372 			device->fs_devices->missing_devices++;
7373 			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
7374 		}
7375 
7376 		/* Move the device to its own fs_devices */
7377 		if (device->fs_devices != fs_devices) {
7378 			ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
7379 					&device->dev_state));
7380 
7381 			list_move(&device->dev_list, &fs_devices->devices);
7382 			device->fs_devices->num_devices--;
7383 			fs_devices->num_devices++;
7384 
7385 			device->fs_devices->missing_devices--;
7386 			fs_devices->missing_devices++;
7387 
7388 			device->fs_devices = fs_devices;
7389 		}
7390 	}
7391 
7392 	if (device->fs_devices != fs_info->fs_devices) {
7393 		BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
7394 		if (device->generation !=
7395 		    btrfs_device_generation(leaf, dev_item))
7396 			return -EINVAL;
7397 	}
7398 
7399 	fill_device_from_item(leaf, dev_item, device);
7400 	if (device->bdev) {
7401 		u64 max_total_bytes = bdev_nr_bytes(device->bdev);
7402 
7403 		if (device->total_bytes > max_total_bytes) {
7404 			btrfs_err(fs_info,
7405 			"device total_bytes should be at most %llu but found %llu",
7406 				  max_total_bytes, device->total_bytes);
7407 			return -EINVAL;
7408 		}
7409 	}
7410 	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
7411 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
7412 	    !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
7413 		device->fs_devices->total_rw_bytes += device->total_bytes;
7414 		atomic64_add(device->total_bytes - device->bytes_used,
7415 				&fs_info->free_chunk_space);
7416 	}
7417 	ret = 0;
7418 	return ret;
7419 }
7420 
7421 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
7422 {
7423 	struct btrfs_root *root = fs_info->tree_root;
7424 	struct btrfs_super_block *super_copy = fs_info->super_copy;
7425 	struct extent_buffer *sb;
7426 	struct btrfs_disk_key *disk_key;
7427 	struct btrfs_chunk *chunk;
7428 	u8 *array_ptr;
7429 	unsigned long sb_array_offset;
7430 	int ret = 0;
7431 	u32 num_stripes;
7432 	u32 array_size;
7433 	u32 len = 0;
7434 	u32 cur_offset;
7435 	u64 type;
7436 	struct btrfs_key key;
7437 
7438 	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
7439 	/*
7440 	 * This will create an extent buffer of nodesize, while the superblock
7441 	 * size is fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this
7442 	 * will overallocate, but we can keep it as-is since only the first page is used.
7443 	 */
7444 	sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
7445 					  root->root_key.objectid, 0);
7446 	if (IS_ERR(sb))
7447 		return PTR_ERR(sb);
7448 	set_extent_buffer_uptodate(sb);
7449 	/*
7450 	 * The sb extent buffer is artificial and just used to read the system array.
7451 	 * The set_extent_buffer_uptodate() call does not properly mark all its
7452 	 * pages up-to-date when the page is larger: the extent does not cover the
7453 	 * whole page and consequently check_page_uptodate does not find all
7454 	 * the page's extents up-to-date (the hole beyond sb), so
7455 	 * write_extent_buffer then triggers a WARN_ON.
7456 	 *
7457 	 * Regular short extents go through the mark_extent_buffer_dirty/writeback
7458 	 * cycle, but sb spans only this function. Add an explicit SetPageUptodate
7459 	 * call to silence the warning, e.g. on PowerPC 64.
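	 *
	 * (Illustrative: with 64K pages and the fixed 4K BTRFS_SUPER_INFO_SIZE
	 * the buffer covers only the first part of the page, which is exactly
	 * the PAGE_SIZE > BTRFS_SUPER_INFO_SIZE case handled below.)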
7460 */ 7461 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 7462 SetPageUptodate(sb->pages[0]); 7463 7464 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 7465 array_size = btrfs_super_sys_array_size(super_copy); 7466 7467 array_ptr = super_copy->sys_chunk_array; 7468 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 7469 cur_offset = 0; 7470 7471 while (cur_offset < array_size) { 7472 disk_key = (struct btrfs_disk_key *)array_ptr; 7473 len = sizeof(*disk_key); 7474 if (cur_offset + len > array_size) 7475 goto out_short_read; 7476 7477 btrfs_disk_key_to_cpu(&key, disk_key); 7478 7479 array_ptr += len; 7480 sb_array_offset += len; 7481 cur_offset += len; 7482 7483 if (key.type != BTRFS_CHUNK_ITEM_KEY) { 7484 btrfs_err(fs_info, 7485 "unexpected item type %u in sys_array at offset %u", 7486 (u32)key.type, cur_offset); 7487 ret = -EIO; 7488 break; 7489 } 7490 7491 chunk = (struct btrfs_chunk *)sb_array_offset; 7492 /* 7493 * At least one btrfs_chunk with one stripe must be present, 7494 * exact stripe count check comes afterwards 7495 */ 7496 len = btrfs_chunk_item_size(1); 7497 if (cur_offset + len > array_size) 7498 goto out_short_read; 7499 7500 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 7501 if (!num_stripes) { 7502 btrfs_err(fs_info, 7503 "invalid number of stripes %u in sys_array at offset %u", 7504 num_stripes, cur_offset); 7505 ret = -EIO; 7506 break; 7507 } 7508 7509 type = btrfs_chunk_type(sb, chunk); 7510 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 7511 btrfs_err(fs_info, 7512 "invalid chunk type %llu in sys_array at offset %u", 7513 type, cur_offset); 7514 ret = -EIO; 7515 break; 7516 } 7517 7518 len = btrfs_chunk_item_size(num_stripes); 7519 if (cur_offset + len > array_size) 7520 goto out_short_read; 7521 7522 ret = read_one_chunk(&key, sb, chunk); 7523 if (ret) 7524 break; 7525 7526 array_ptr += len; 7527 sb_array_offset += len; 7528 cur_offset += len; 7529 } 7530 clear_extent_buffer_uptodate(sb); 7531 free_extent_buffer_stale(sb); 7532 return ret; 7533 7534 out_short_read: 7535 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 7536 len, cur_offset); 7537 clear_extent_buffer_uptodate(sb); 7538 free_extent_buffer_stale(sb); 7539 return -EIO; 7540 } 7541 7542 /* 7543 * Check if all chunks in the fs are OK for read-write degraded mount 7544 * 7545 * If the @failing_dev is specified, it's accounted as missing. 7546 * 7547 * Return true if all chunks meet the minimal RW mount requirements. 7548 * Return false if any chunk doesn't meet the minimal RW mount requirements. 7549 */ 7550 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 7551 struct btrfs_device *failing_dev) 7552 { 7553 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7554 struct extent_map *em; 7555 u64 next_start = 0; 7556 bool ret = true; 7557 7558 read_lock(&map_tree->lock); 7559 em = lookup_extent_mapping(map_tree, 0, (u64)-1); 7560 read_unlock(&map_tree->lock); 7561 /* No chunk at all? 
Return false anyway */
7562 	if (!em) {
7563 		ret = false;
7564 		goto out;
7565 	}
7566 	while (em) {
7567 		struct map_lookup *map;
7568 		int missing = 0;
7569 		int max_tolerated;
7570 		int i;
7571 
7572 		map = em->map_lookup;
7573 		max_tolerated =
7574 			btrfs_get_num_tolerated_disk_barrier_failures(
7575 					map->type);
7576 		for (i = 0; i < map->num_stripes; i++) {
7577 			struct btrfs_device *dev = map->stripes[i].dev;
7578 
7579 			if (!dev || !dev->bdev ||
7580 			    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
7581 			    dev->last_flush_error)
7582 				missing++;
7583 			else if (failing_dev && failing_dev == dev)
7584 				missing++;
7585 		}
7586 		if (missing > max_tolerated) {
7587 			if (!failing_dev)
7588 				btrfs_warn(fs_info,
7589 	"chunk %llu missing %d devices, max tolerance is %d for writable mount",
7590 				   em->start, missing, max_tolerated);
7591 			free_extent_map(em);
7592 			ret = false;
7593 			goto out;
7594 		}
7595 		next_start = extent_map_end(em);
7596 		free_extent_map(em);
7597 
7598 		read_lock(&map_tree->lock);
7599 		em = lookup_extent_mapping(map_tree, next_start,
7600 					   (u64)(-1) - next_start);
7601 		read_unlock(&map_tree->lock);
7602 	}
7603 out:
7604 	return ret;
7605 }
7606 
7607 static void readahead_tree_node_children(struct extent_buffer *node)
7608 {
7609 	int i;
7610 	const int nr_items = btrfs_header_nritems(node);
7611 
7612 	for (i = 0; i < nr_items; i++)
7613 		btrfs_readahead_node_child(node, i);
7614 }
7615 
7616 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
7617 {
7618 	struct btrfs_root *root = fs_info->chunk_root;
7619 	struct btrfs_path *path;
7620 	struct extent_buffer *leaf;
7621 	struct btrfs_key key;
7622 	struct btrfs_key found_key;
7623 	int ret;
7624 	int slot;
7625 	u64 total_dev = 0;
7626 	u64 last_ra_node = 0;
7627 
7628 	path = btrfs_alloc_path();
7629 	if (!path)
7630 		return -ENOMEM;
7631 
7632 	/*
7633 	 * uuid_mutex is needed only if we are mounting a sprout FS;
7634 	 * otherwise we don't need it.
7635 	 */
7636 	mutex_lock(&uuid_mutex);
7637 
7638 	/*
7639 	 * It is possible for mount and umount to race in such a way that
7640 	 * we execute this code path, but open_fs_devices failed to clear
7641 	 * total_rw_bytes. We certainly want it cleared before reading the
7642 	 * device items, so clear it here.
7643 	 */
7644 	fs_info->fs_devices->total_rw_bytes = 0;
7645 
7646 	/*
7647 	 * Lockdep complains about a possible circular locking dependency between
7648 	 * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
7649 	 * used for freeze protection of a fs (struct super_block.s_writers),
7650 	 * which we take when starting a transaction, and extent buffers of the
7651 	 * chunk tree if we call read_one_dev() while holding a lock on an
7652 	 * extent buffer of the chunk tree. Since we are mounting the filesystem
7653 	 * and at this point there can't be any concurrent task modifying the
7654 	 * chunk tree, to keep it simple, just skip locking on the chunk tree.
7655 	 */
7656 	ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
7657 	path->skip_locking = 1;
7658 
7659 	/*
7660 	 * Read all device items, and then all the chunk items. All
7661 	 * device items are found before any chunk item (their object id
7662 	 * is smaller than the lowest possible object id for a chunk
7663 	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
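	 *
	 * (Illustrative: device items use objectid BTRFS_DEV_ITEMS_OBJECTID,
	 * which is 1, while chunk items live at objectid
	 * BTRFS_FIRST_CHUNK_TREE_OBJECTID, which is 256, so a single forward
	 * search visits every device item before the first chunk item.)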
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;
	u64 total_dev = 0;
	u64 last_ra_node = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * uuid_mutex is needed only if we are mounting a sprout FS;
	 * otherwise we don't need it.
	 */
	mutex_lock(&uuid_mutex);

	/*
	 * It is possible for mount and umount to race in such a way that
	 * we execute this code path, but open_fs_devices failed to clear
	 * total_rw_bytes. We certainly want it cleared before reading the
	 * device items, so clear it here.
	 */
	fs_info->fs_devices->total_rw_bytes = 0;

	/*
	 * Lockdep complains about a possible circular locking dependency
	 * between a disk's open_mutex (struct gendisk.open_mutex), the rw
	 * semaphores used for freeze protection of a fs (struct
	 * super_block.s_writers), which we take when starting a transaction,
	 * and extent buffers of the chunk tree if we call read_one_dev()
	 * while holding a lock on an extent buffer of the chunk tree. Since
	 * we are mounting the filesystem and at this point there can't be any
	 * concurrent task modifying the chunk tree, to keep it simple, just
	 * skip locking on the chunk tree.
	 */
	ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
	path->skip_locking = 1;

	/*
	 * Read all device items, and then all the chunk items. All
	 * device items are found before any chunk item (their object id
	 * is smaller than the lowest possible object id for a chunk
	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	while (1) {
		struct extent_buffer *node;

		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			break;
		}
		node = path->nodes[1];
		if (node) {
			if (last_ra_node != node->start) {
				readahead_tree_node_children(node);
				last_ra_node = node->start;
			}
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
			struct btrfs_dev_item *dev_item;

			dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
			ret = read_one_dev(leaf, dev_item);
			if (ret)
				goto error;
			total_dev++;
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;

			/*
			 * We are only called at mount time, so no need to take
			 * fs_info->chunk_mutex. Plus, to avoid lockdep
			 * warnings, we always take fs_info->chunk_mutex before
			 * acquiring any locks on the chunk tree. This is a
			 * requirement for chunk allocation, see the comment on
			 * top of btrfs_chunk_alloc() for details.
			 */
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(&found_key, leaf, chunk);
			if (ret)
				goto error;
		}
		path->slots[0]++;
	}

	/*
	 * After loading the chunk tree, we've got all device information,
	 * so do another round of validation checks.
	 */
	if (total_dev != fs_info->fs_devices->total_devices) {
		btrfs_err(fs_info,
	   "super_num_devices %llu mismatch with num_devices %llu found here",
			  btrfs_super_num_devices(fs_info->super_copy),
			  total_dev);
		ret = -EINVAL;
		goto error;
	}
	if (btrfs_super_total_bytes(fs_info->super_copy) <
	    fs_info->fs_devices->total_rw_bytes) {
		btrfs_err(fs_info,
	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
			  btrfs_super_total_bytes(fs_info->super_copy),
			  fs_info->fs_devices->total_rw_bytes);
		ret = -EINVAL;
		goto error;
	}
	ret = 0;
error:
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}

void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	struct btrfs_device *device;

	fs_devices->fs_info = fs_info;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list)
		device->fs_info = fs_info;

	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		list_for_each_entry(device, &seed_devs->devices, dev_list)
			device->fs_info = fs_info;

		seed_devs->fs_info = fs_info;
	}
	mutex_unlock(&fs_devices->device_list_mutex);
}

static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
				 const struct btrfs_dev_stats_item *ptr,
				 int index)
{
	u64 val;

	read_extent_buffer(eb, &val,
			   offsetof(struct btrfs_dev_stats_item, values) +
			   ((unsigned long)ptr) + (index * sizeof(u64)),
			   sizeof(val));
	return val;
}
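/*
 * The helper above and its counterpart below read and write one counter of a
 * btrfs_dev_stats_item, which is essentially an array of __le64 values
 * indexed by the BTRFS_DEV_STAT_* enum: the byte offset within the leaf is
 * the item start (ptr) plus offsetof(values) plus index * sizeof(u64).
 */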
static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
				      struct btrfs_dev_stats_item *ptr,
				      int index, u64 val)
{
	write_extent_buffer(eb, &val,
			    offsetof(struct btrfs_dev_stats_item, values) +
			    ((unsigned long)ptr) + (index * sizeof(u64)),
			    sizeof(val));
}

static int btrfs_device_init_dev_stats(struct btrfs_device *device,
				       struct btrfs_path *path)
{
	struct btrfs_dev_stats_item *ptr;
	struct extent_buffer *eb;
	struct btrfs_key key;
	int item_size;
	int i, ret, slot;

	if (!device->fs_info->dev_root)
		return 0;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;
	ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
	if (ret) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			btrfs_dev_stat_set(device, i, 0);
		device->dev_stats_valid = 1;
		btrfs_release_path(path);
		return ret < 0 ? ret : 0;
	}
	slot = path->slots[0];
	eb = path->nodes[0];
	item_size = btrfs_item_size(eb, slot);

	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
		if (item_size >= (1 + i) * sizeof(__le64))
			btrfs_dev_stat_set(device, i,
					   btrfs_dev_stats_value(eb, ptr, i));
		else
			btrfs_dev_stat_set(device, i, 0);
	}

	device->dev_stats_valid = 1;
	btrfs_dev_stat_print_on_load(device);
	btrfs_release_path(path);

	return 0;
}

int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		ret = btrfs_device_init_dev_stats(device, path);
		if (ret)
			goto out;
	}
	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		list_for_each_entry(device, &seed_devs->devices, dev_list) {
			ret = btrfs_device_init_dev_stats(device, path);
			if (ret)
				goto out;
		}
	}
out:
	mutex_unlock(&fs_devices->device_list_mutex);

	btrfs_free_path(path);
	return ret;
}
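/*
 * update_dev_stat_item() below follows the usual search-then-insert pattern:
 * btrfs_search_slot() with ins_len == -1 and cow == 1 returns 0 when the item
 * exists and 1 when it does not. An existing but too small item (presumably
 * written by an older kernel that knew fewer counters) is deleted and
 * re-created at the current size before all counters are written back.
 */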
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Called from commit_transaction. Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;

		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values which requires
		 * reading the in-memory counters. Such control dependencies
		 * require explicit read memory barriers.
		 *
		 * This memory barrier pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stats_read_and_reset.
		 */
		smp_rmb();

		ret = update_dev_stat_item(trans, device);
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			  rcu_str_deref(dev->name),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}
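/*
 * btrfs_get_dev_stats() below backs the BTRFS_IOC_GET_DEV_STATS ioctl. With
 * BTRFS_DEV_STATS_RESET set it hands the counters to user space and zeroes
 * them atomically via btrfs_dev_stat_read_and_reset(); otherwise it is a
 * plain read. nr_items is clamped at the end so user space can tell how many
 * counters the kernel actually knows about.
 */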
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	args.devid = stats->devid;
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_set(dev, i, 0);
		}
		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
			   current->comm, task_pid_nr(current));
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}

/*
 * Update the size and bytes used for each device where it changed. This is
 * delayed since we would otherwise get errors while writing out the
 * superblocks.
 *
 * Must be invoked during transaction commit.
 */
void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
	struct btrfs_device *curr, *next;

	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);

	if (list_empty(&trans->dev_update_list))
		return;

	/*
	 * We don't need the device_list_mutex here. This list is owned by the
	 * transaction and the transaction must complete before the device is
	 * released.
	 */
	mutex_lock(&trans->fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
				 post_commit_list) {
		list_del_init(&curr->post_commit_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
		curr->commit_bytes_used = curr->bytes_used;
	}
	mutex_unlock(&trans->fs_info->chunk_mutex);
}
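/*
 * A small worked example for the factor returned below (a sketch, not an
 * exhaustive list): with a 2-copy profile, a block group of logical size
 * 1GiB consumes 2GiB of raw device space, so callers multiply or divide by
 * ncopies when converting between logical and raw byte counts.
 */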
/*
 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
 */
int btrfs_bg_type_to_factor(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	return btrfs_raid_array[index].ncopies;
}

static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
				 u64 chunk_offset, u64 devid,
				 u64 physical_offset, u64 physical_len)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	u64 stripe_len;
	bool found = false;
	int ret = 0;
	int i;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
		goto out;
	}

	map = em->map_lookup;
	stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
	if (physical_len != stripe_len) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
			  physical_offset, devid, em->start, physical_len,
			  stripe_len);
		ret = -EUCLEAN;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		if (map->stripes[i].dev->devid == devid &&
		    map->stripes[i].physical == physical_offset) {
			found = true;
			if (map->verified_stripes >= map->num_stripes) {
				btrfs_err(fs_info,
				"too many dev extents for chunk %llu found",
					  em->start);
				ret = -EUCLEAN;
				goto out;
			}
			map->verified_stripes++;
			break;
		}
	}
	if (!found) {
		btrfs_err(fs_info,
	"dev extent physical offset %llu devid %llu has no corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
	}

	/* Make sure no dev extent is beyond device boundary */
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (!dev) {
		btrfs_err(fs_info, "failed to find devid %llu", devid);
		ret = -EUCLEAN;
		goto out;
	}

	if (physical_offset + physical_len > dev->disk_total_bytes) {
		btrfs_err(fs_info,
	"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
			  devid, physical_offset, physical_len,
			  dev->disk_total_bytes);
		ret = -EUCLEAN;
		goto out;
	}

	if (dev->zone_info) {
		u64 zone_size = dev->zone_info->zone_size;

		if (!IS_ALIGNED(physical_offset, zone_size) ||
		    !IS_ALIGNED(physical_len, zone_size)) {
			btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
				  devid, physical_offset, physical_len);
			ret = -EUCLEAN;
			goto out;
		}
	}

out:
	free_extent_map(em);
	return ret;
}
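/*
 * Cross-check bookkeeping: every dev extent matched to a stripe above bumps
 * map->verified_stripes. verify_chunk_dev_extent_mapping() below is the
 * other half of the check and complains about any chunk whose stripe count
 * was never fully covered by dev extents.
 */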
static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct rb_node *node;
	int ret = 0;

	read_lock(&em_tree->lock);
	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
		em = rb_entry(node, struct extent_map, rb_node);
		if (em->map_lookup->num_stripes !=
		    em->map_lookup->verified_stripes) {
			btrfs_err(fs_info,
			"chunk %llu has missing dev extent, have %d expect %d",
				  em->start,
				  em->map_lookup->verified_stripes,
				  em->map_lookup->num_stripes);
			ret = -EUCLEAN;
			goto out;
		}
	}
out:
	read_unlock(&em_tree->lock);
	return ret;
}

/*
 * Ensure that all dev extents are mapped to the correct chunk, otherwise
 * later chunk allocation/free would cause unexpected behavior.
 *
 * NOTE: This will iterate through the whole device tree, which should be of
 * roughly the same size as the chunk tree. This slightly increases mount time.
 */
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	u64 prev_devid = 0;
	u64 prev_dev_ext_end = 0;
	int ret = 0;

	/*
	 * We don't have a dev_root because we mounted with ignorebadroots and
	 * failed to load the root, so we want to skip the verification in this
	 * case for sure.
	 *
	 * However if the dev root is fine, but the tree itself is corrupted
	 * we'd still fail to mount. This verification is only to make sure
	 * writes can happen safely, so instead just bypass this check
	 * completely in the case of IGNOREBADROOTS.
	 */
	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
		return 0;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	while (1) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_dev_extent *dext;
		int slot = path->slots[0];
		u64 chunk_offset;
		u64 physical_offset;
		u64 physical_len;
		u64 devid;

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type != BTRFS_DEV_EXTENT_KEY)
			break;
		devid = key.objectid;
		physical_offset = key.offset;

		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
		physical_len = btrfs_dev_extent_length(leaf, dext);

		/* Check if this dev extent overlaps with the previous one */
		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
			btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
				  devid, physical_offset, prev_dev_ext_end);
			ret = -EUCLEAN;
			goto out;
		}

		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
					    physical_offset, physical_len);
		if (ret < 0)
			goto out;
		prev_devid = devid;
		prev_dev_ext_end = physical_offset + physical_len;

		ret = btrfs_next_item(root, path);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = 0;
			break;
		}
	}

	/* Ensure all chunks have corresponding dev extents */
	ret = verify_chunk_dev_extent_mapping(fs_info);
out:
	btrfs_free_path(path);
	return ret;
}
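/*
 * The overlap detection above leans on the device tree's key order: dev
 * extent keys are (devid, BTRFS_DEV_EXTENT_KEY, physical_offset), so a
 * forward iteration sees each device's extents in ascending physical order
 * and only has to remember the previous extent's end (prev_dev_ext_end).
 */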
/*
 * Check whether the given block group or device is pinned by any inode being
 * used as a swapfile.
 */
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
	struct btrfs_swapfile_pin *sp;
	struct rb_node *node;

	spin_lock(&fs_info->swapfile_pins_lock);
	node = fs_info->swapfile_pins.rb_node;
	while (node) {
		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (ptr < sp->ptr)
			node = node->rb_left;
		else if (ptr > sp->ptr)
			node = node->rb_right;
		else
			break;
	}
	spin_unlock(&fs_info->swapfile_pins_lock);
	return node != NULL;
}

static int relocating_repair_kthread(void *data)
{
	struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 target;
	int ret = 0;

	target = cache->start;
	btrfs_put_block_group(cache);

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		btrfs_info(fs_info,
			   "zoned: skip relocating block group %llu to repair: EBUSY",
			   target);
		return -EBUSY;
	}

	mutex_lock(&fs_info->reclaim_bgs_lock);

	/* Ensure block group still exists */
	cache = btrfs_lookup_block_group(fs_info, target);
	if (!cache)
		goto out;

	if (!cache->relocating_repair)
		goto out;

	ret = btrfs_may_alloc_data_chunk(fs_info, target);
	if (ret < 0)
		goto out;

	btrfs_info(fs_info,
		   "zoned: relocating block group %llu to repair IO failure",
		   target);
	ret = btrfs_relocate_chunk(fs_info, target);

out:
	if (cache)
		btrfs_put_block_group(cache);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_exclop_finish(fs_info);

	return ret;
}

bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
{
	struct btrfs_block_group *cache;

	if (!btrfs_is_zoned(fs_info))
		return false;

	/* Do not attempt to repair in degraded state */
	if (btrfs_test_opt(fs_info, DEGRADED))
		return true;

	cache = btrfs_lookup_block_group(fs_info, logical);
	if (!cache)
		return true;

	spin_lock(&cache->lock);
	if (cache->relocating_repair) {
		spin_unlock(&cache->lock);
		btrfs_put_block_group(cache);
		return true;
	}
	cache->relocating_repair = 1;
	spin_unlock(&cache->lock);

	kthread_run(relocating_repair_kthread, cache,
		    "btrfs-relocating-repair");

	return true;
}