// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
#include "zoned.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes = 2,
		.dev_stripes = 1,
		.devs_max = 0,	/* 0 == as many as possible */
		.devs_min = 2,
		.tolerated_failures = 1,
		.devs_increment = 2,
		.ncopies = 2,
		.nparity = 0,
		.raid_name = "raid10",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 2,
		.devs_min = 2,
		.tolerated_failures = 1,
		.devs_increment = 2,
		.ncopies = 2,
		.nparity = 0,
		.raid_name = "raid1",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 3,
		.devs_min = 3,
		.tolerated_failures = 2,
		.devs_increment = 3,
		.ncopies = 3,
		.nparity = 0,
		.raid_name = "raid1c3",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 4,
		.devs_min = 4,
		.tolerated_failures = 3,
		.devs_increment = 4,
		.ncopies = 4,
		.nparity = 0,
		.raid_name = "raid1c4",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes = 1,
		.dev_stripes = 2,
		.devs_max = 1,
		.devs_min = 1,
		.tolerated_failures = 0,
		.devs_increment = 1,
		.ncopies = 2,
		.nparity = 0,
		.raid_name = "dup",
		.bg_flag = BTRFS_BLOCK_GROUP_DUP,
		.mindev_error = 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 0,
		.devs_min = 1,
		.tolerated_failures = 0,
		.devs_increment = 1,
		.ncopies = 1,
		.nparity = 0,
		.raid_name = "raid0",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error = 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 1,
		.devs_min = 1,
		.tolerated_failures = 0,
		.devs_increment = 1,
		.ncopies = 1,
		.nparity = 0,
		.raid_name = "single",
		.bg_flag = 0,
		.mindev_error = 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 0,
		.devs_min = 2,
		.tolerated_failures = 1,
		.devs_increment = 1,
		.ncopies = 1,
		.nparity = 1,
		.raid_name = "raid5",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 0,
		.devs_min = 3,
		.tolerated_failures = 2,
		.devs_increment = 1,
		.ncopies = 1,
		.nparity = 2,
		.raid_name = "raid6",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};
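/*
 * Illustrative usage sketch (not a call site in this file): the table above
 * is meant to be indexed by a btrfs_raid_types value, typically derived from
 * block group flags:
 *
 *	const struct btrfs_raid_attr *attr;
 *
 *	attr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(bg_flags)];
 *	if (num_devices < attr->devs_min)
 *		return attr->mindev_error;
 *
 * btrfs_check_raid_min_devices() further down follows essentially this
 * pattern, iterating over all profiles.
 */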
/*
 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
 * can be used as index to access btrfs_raid_array[].
 */
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
{
	if (flags & BTRFS_BLOCK_GROUP_RAID10)
		return BTRFS_RAID_RAID10;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
		return BTRFS_RAID_RAID1;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
		return BTRFS_RAID_RAID1C3;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
		return BTRFS_RAID_RAID1C4;
	else if (flags & BTRFS_BLOCK_GROUP_DUP)
		return BTRFS_RAID_DUP;
	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
		return BTRFS_RAID_RAID0;
	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
		return BTRFS_RAID_RAID5;
	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
		return BTRFS_RAID_RAID6;

	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}

const char *btrfs_bg_type_to_raid_name(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	if (index >= BTRFS_NR_RAID_TYPES)
		return NULL;

	return btrfs_raid_array[index].raid_name;
}

/*
 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 * bytes including the terminating null byte.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
	int i;
	int ret;
	char *bp = buf;
	u64 flags = bg_flags;
	u32 size_bp = size_buf;

	if (!flags) {
		strcpy(bp, "NONE");
		return;
	}

#define DESCRIBE_FLAG(flag, desc)					\
	do {								\
		if (flags & (flag)) {					\
			ret = snprintf(bp, size_bp, "%s|", (desc));	\
			if (ret < 0 || ret >= size_bp)			\
				goto out_overflow;			\
			size_bp -= ret;					\
			bp += ret;					\
			flags &= ~(flag);				\
		}							\
	} while (0)

	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
			      btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

	if (flags) {
		ret = snprintf(bp, size_bp, "0x%llx|", flags);
		size_bp -= ret;
	}

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

	/*
	 * The text is trimmed, it's up to the caller to provide a sufficiently
	 * large buffer.
	 */
out_overflow:;
}
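/*
 * A sketch of the resulting format (values invented): for
 * bg_flags == (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1) the buffer
 * receives "data|raid1", the trailing '|' having been replaced by the
 * terminating null. Any residual unknown bits are emitted as one hex value,
 * e.g. "data|0x4000000".
 */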
static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list in general
 * but in mount context it could be used to exclude list modifications by eg.
 * scan ioctl
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * Is not required at mount and close times, because our device list is
 * protected by the uuid_mutex at that point.
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   device_list_mutex
 *     chunk_mutex
 *   balance_mutex
 *
 *
 * Exclusive operations
 * ====================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The status of exclusive operation is set and cleared atomically.
 * During the course of Paused state, fs_info::exclusive_operation remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * The exclusive status is cleared when the device operation is canceled or
 * completed.
 */
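/*
 * Illustrative nesting only (not a real call chain in this file): a thread
 * that needs several of the locks above must take them in the documented
 * order, e.g.:
 *
 *	mutex_lock(&uuid_mutex);
 *	mutex_lock(&fs_devices->device_list_mutex);
 *	mutex_lock(&fs_info->chunk_mutex);
 *	...
 *	mutex_unlock(&fs_info->chunk_mutex);
 *	mutex_unlock(&fs_devices->device_list_mutex);
 *	mutex_unlock(&uuid_mutex);
 *
 * Taking them in any other order risks an ABBA deadlock.
 */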
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
						 const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	INIT_LIST_HEAD(&fs_devs->seed_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	if (metadata_fsid)
		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
	else if (fsid)
		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

void btrfs_free_device(struct btrfs_device *device)
{
	WARN_ON(!list_empty(&device->post_commit_list));
	rcu_string_free(device->name);
	extent_io_tree_release(&device->alloc_state);
	bio_put(device->flush_bio);
	btrfs_destroy_dev_zone_info(device);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

static noinline struct btrfs_fs_devices *find_fsid(
		const u8 *fsid, const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devices;

	ASSERT(fsid);

	/* Handle non-split brain cases */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (metadata_fsid) {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
				      BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		} else {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		}
	}
	return NULL;
}
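/*
 * Lookup sketch (illustrative): find_fsid() is only meaningful with
 * uuid_mutex held, since it walks the global fs_uuids list:
 *
 *	lockdep_assert_held(&uuid_mutex);
 *	fs_devices = find_fsid(disk_super->fsid, NULL);
 *	if (!fs_devices) {
 *		fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
 *		if (IS_ERR(fs_devices))
 *			return ERR_CAST(fs_devices);
 *	}
 *
 * device_list_add() below follows this pattern for the common case where no
 * fsid change is in progress.
 */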
static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by first scanning
	 * a device which didn't have its fsid/metadata_uuid changed
	 * at all and the CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}
	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by a device that
	 * has an outdated pair of fsid/metadata_uuid and
	 * CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(fs_devices->metadata_uuid,
			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}


static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct btrfs_super_block **disk_super)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*disk_super = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*disk_super)) {
		ret = PTR_ERR(*disk_super);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	return ret;
}

static bool device_path_matched(const char *path, struct btrfs_device *device)
{
	int found;

	rcu_read_lock();
	found = strcmp(rcu_str_deref(device->name), path);
	rcu_read_unlock();

	return found == 0;
}
/*
 * Search and remove all stale devices (devices which are not mounted).
 * When both inputs are NULL, it will search and release all stale devices.
 *
 * @path:	 Optional. When provided, it will release only the unmounted
 *		 devices matching this path.
 * @skip_device: Optional. Will skip this device when searching for the stale
 *		 devices.
 *
 * Return:	0 for success or if @path is NULL.
 *		-EBUSY if @path is a mounted device.
 *		-ENOENT if @path does not match any device in the list.
 */
static int btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
	int ret = 0;

	lockdep_assert_held(&uuid_mutex);

	if (path)
		ret = -ENOENT;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {

		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
				continue;
			if (path && !device->name)
				continue;
			if (path && !device_path_matched(path, device))
				continue;
			if (fs_devices->opened) {
				/* for an already deleted device return 0 */
				if (path && ret != 0)
					ret = -EBUSY;
				break;
			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			ret = 0;
		}
		mutex_unlock(&fs_devices->device_list_mutex);

		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}

	return ret;
}

/*
 * This is only used on mount, and we are protected from competing things
 * messing with our fs_devices by the uuid_mutex, thus we do not need the
 * fs_devices->device_list_mutex here.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &disk_super);
	if (ret)
		return ret;

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_free_page;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_free_page;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		if (btrfs_super_incompat_flags(disk_super) &
		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
			pr_err(
		"BTRFS: Invalid seeding and uuid-changed device detected\n");
			goto error_free_page;
		}

		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = true;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = true;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	btrfs_release_disk_super(disk_super);

	return 0;

error_free_page:
	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, flags);

	return -EINVAL;
}
/*
 * Handle scanned device having its CHANGING_FSID_V2 flag set and the
 * fs_devices being created with a disk that has already completed its fsid
 * change. Such a disk can belong to an fs which has its FSID changed or to
 * one which doesn't. Handle both cases here.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, NULL);
}


static struct btrfs_fs_devices *find_fsid_changed(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handles the case where the scanned device is part of an fs that had
	 * multiple successful changes of FSID but the currently scanned device
	 * didn't observe it. Meaning our fsid will be different from theirs.
	 * We need to handle two subcases:
	 *  1 - The fs still continues to have different METADATA/FSID uuids.
	 *  2 - The fs is switched back to its original FSID (METADATA/FSID
	 *  are equal).
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		/* Changed UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->fsid,
			   BTRFS_FSID_SIZE) != 0)
			return fs_devices;

		/* Unchanged UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}

	return NULL;
}
static struct btrfs_fs_devices *find_fsid_reverted_metadata(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle the case where the scanned device is part of an fs whose last
	 * metadata UUID change reverted it to the original FSID. At the same
	 * time fs_devices was first created by another constituent device
	 * which didn't fully observe the operation. This results in a
	 * btrfs_fs_devices created with metadata/fsid different AND
	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
	 * fs_devices equal to the FSID of the disk.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    fs_devices->fsid_change)
			return fs_devices;
	}

	return NULL;
}
/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

	if (fsid_change_in_progress) {
		if (!has_metadata_uuid)
			fs_devices = find_fsid_inprogress(disk_super);
		else
			fs_devices = find_fsid_changed(disk_super);
	} else if (has_metadata_uuid) {
		fs_devices = find_fsid_with_metadata_uuid(disk_super);
	} else {
		fs_devices = find_fsid_reverted_metadata(disk_super);
		if (!fs_devices)
			fs_devices = find_fsid(disk_super->fsid, NULL);
	}


	if (!fs_devices) {
		if (has_metadata_uuid)
			fs_devices = alloc_fs_devices(disk_super->fsid,
						      disk_super->metadata_uuid);
		else
			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		mutex_lock(&fs_devices->device_list_mutex);
		device = btrfs_find_device(fs_devices, devid,
				disk_super->dev_item.uuid, NULL);

		/*
		 * If this disk has been pulled into an fs devices created by
		 * a device which had the CHANGING_FSID_V2 flag then replace the
		 * metadata_uuid/fsid values of the fs_devices.
		 */
		if (fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
					BTRFS_FSID_SIZE);

			if (has_metadata_uuid)
				memcpy(fs_devices->metadata_uuid,
				       disk_super->metadata_uuid,
				       BTRFS_FSID_SIZE);
			else
				memcpy(fs_devices->metadata_uuid,
				       disk_super->fsid, BTRFS_FSID_SIZE);

			fs_devices->fsid_change = false;
		}
	}

	if (!device) {
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info(
	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
		else
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *	a. The same device disappeared and reappeared with
		 *	   different name. or
		 *	b. The missing-disk-which-was-replaced, has
		 *	   reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted. We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with the same uuid and devid. We keep the one
			 * with larger generation number or the last-in if
			 * generations are equal.
			 */
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EEXIST);
		}

		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted
		 */
		if (device->bdev) {
			int error;
			dev_t path_dev;

			error = lookup_bdev(path, &path_dev);
			if (error) {
				mutex_unlock(&fs_devices->device_list_mutex);
				return ERR_PTR(error);
			}

			if (device->bdev->bd_dev != path_dev) {
				mutex_unlock(&fs_devices->device_list_mutex);
				/*
				 * device->fs_info may not be reliable here, so
				 * pass in a NULL instead. This avoids a
				 * possible use-after-free when the fs_info and
				 * fs_info->sb are already torn down.
				 */
				btrfs_warn_in_rcu(NULL,
	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
						  path, devid, found_transid,
						  current->comm,
						  task_pid_nr(current));
				return ERR_PTR(-EEXIST);
			}
			btrfs_info_in_rcu(device->fs_info,
	"devid %llu device path %s changed to %s scanned by %s (%d)",
					  devid, rcu_str_deref(device->name),
					  path, current->comm,
					  task_pid_nr(current));
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened) {
		device->generation = found_transid;
		fs_devices->latest_generation = max_t(u64, found_transid,
						fs_devices->latest_generation);
	}

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	mutex_unlock(&fs_devices->device_list_mutex);
	return device;
}
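/*
 * Worked example of the duplicate-path handling above (values invented): if
 * /dev/sdx was registered with generation 100 and a later scan finds a device
 * with the same devid/uuid but generation 90 while the fs is not mounted, the
 * scan is rejected with -EEXIST; with generation 110 the stored path and
 * generation are updated instead.
 */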
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
	int ret = 0;

	lockdep_assert_held(&uuid_mutex);

	fs_devices = alloc_fs_devices(orig->fsid, NULL);
	if (IS_ERR(fs_devices))
		return fs_devices;

	fs_devices->total_devices = orig->total_devices;

	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device)) {
			ret = PTR_ERR(device);
			goto error;
		}

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
			if (!name) {
				btrfs_free_device(device);
				ret = -ENOMEM;
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	return fs_devices;
error:
	free_fs_devices(fs_devices);
	return ERR_PTR(ret);
}
static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
				      struct btrfs_device **latest_dev)
{
	struct btrfs_device *device, *next;

	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    !test_bit(BTRFS_DEV_STATE_MISSING,
				      &device->dev_state) &&
			    (!*latest_dev ||
			     device->generation > (*latest_dev)->generation)) {
				*latest_dev = device;
			}
			continue;
		}

		/*
		 * We have already validated the presence of
		 * BTRFS_DEV_REPLACE_DEVID in btrfs_init_dev_replace() so just
		 * continue.
		 */
		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
			continue;

		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}

}

/*
 * After we have read the system tree and know devids belonging to this
 * filesystem, remove the device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_fs_devices *seed_dev;

	mutex_lock(&uuid_mutex);
	__btrfs_free_extra_devids(fs_devices, &latest_dev);

	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
		__btrfs_free_extra_devids(seed_dev, &latest_dev);

	fs_devices->latest_dev = latest_dev;

	mutex_unlock(&uuid_mutex);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

static void btrfs_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (device->devid == BTRFS_DEV_REPLACE_DEVID)
		clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		fs_devices->missing_devices--;

	btrfs_close_bdev(device);
	if (device->bdev) {
		fs_devices->open_devices--;
		device->bdev = NULL;
	}
	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	btrfs_destroy_dev_zone_info(device);

	device->fs_info = NULL;
	atomic_set(&device->dev_stats_ccnt, 0);
	extent_io_tree_release(&device->alloc_state);

	/*
	 * Reset the flush error record. We might have a transient flush error
	 * in this mount, and if so we aborted the current transaction and set
	 * the fs to an error state, guaranteeing no super blocks can be further
	 * committed. However that error might be transient and if we unmount the
	 * filesystem and mount it again, we should allow the mount to succeed
	 * (btrfs_check_rw_degradable() should not fail) - if after mounting the
	 * filesystem again we still get flush errors, then we will again abort
	 * any transaction and set the error state, guaranteeing no commits of
	 * unsafe super blocks.
	 */
	device->last_flush_error = 0;

	/* Verify the device is back in a pristine state */
	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	ASSERT(list_empty(&device->dev_alloc_list));
	ASSERT(list_empty(&device->post_commit_list));
	ASSERT(atomic_read(&device->reada_in_flight) == 0);
}
static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;

	lockdep_assert_held(&uuid_mutex);

	if (--fs_devices->opened > 0)
		return;

	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
		btrfs_close_one_device(device);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = false;
	fs_devices->fs_info = NULL;
}

void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	LIST_HEAD(list);
	struct btrfs_fs_devices *tmp;

	mutex_lock(&uuid_mutex);
	close_fs_devices(fs_devices);
	if (!fs_devices->opened)
		list_splice_init(&fs_devices->seed_list, &list);

	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
		close_fs_devices(fs_devices);
		list_del(&fs_devices->seed_list);
		free_fs_devices(fs_devices);
	}
	mutex_unlock(&uuid_mutex);
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_device *tmp_device;

	flags |= FMODE_EXCL;

	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
				 dev_list) {
		int ret;

		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
		if (ret == 0 &&
		    (!latest_dev || device->generation > latest_dev->generation)) {
			latest_dev = device;
		} else if (ret == -ENODATA) {
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);
		}
	}
	if (fs_devices->open_devices == 0)
		return -EINVAL;

	fs_devices->opened = 1;
	fs_devices->latest_dev = latest_dev;
	fs_devices->total_rw_bytes = 0;
	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
	fs_devices->read_policy = BTRFS_READ_POLICY_PID;

	return 0;
}

static int devid_cmp(void *priv, const struct list_head *a,
		     const struct list_head *b)
{
	const struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	lockdep_assert_held(&uuid_mutex);
	/*
	 * The device_list_mutex cannot be taken here in case opening the
	 * underlying device takes further locks like open_mutex.
	 *
	 * We also don't need the lock here as this is called during mount and
	 * exclusion is provided by uuid_mutex
	 */

	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = open_fs_devices(fs_devices, flags, holder);
	}

	return ret;
}
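/*
 * Typical mount-path usage (sketch, error handling trimmed):
 *
 *	mutex_lock(&uuid_mutex);
 *	ret = btrfs_open_devices(fs_devices, FMODE_READ, holder);
 *	mutex_unlock(&uuid_mutex);
 *
 * Repeated opens only bump fs_devices->opened; the devices are actually
 * opened (and the list sorted by devid) on the first call only.
 */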
void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	struct page *page = virt_to_page(super);

	put_page(page);
}

static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
						       u64 bytenr, u64 bytenr_orig)
{
	struct btrfs_super_block *disk_super;
	struct page *page;
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return ERR_PTR(-EINVAL);

	/* make sure our super fits in the page */
	if (sizeof(*disk_super) > PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
		return ERR_PTR(-EINVAL);

	/* pull in the page with our super */
	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);

	if (IS_ERR(page))
		return ERR_CAST(page);

	p = page_address(page);

	/* align our pointer to the offset of the super block */
	disk_super = p + offset_in_page(bytenr);

	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(p);
		return ERR_PTR(-EINVAL);
	}

	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;

	return disk_super;
}

int btrfs_forget_devices(const char *path)
{
	int ret;

	mutex_lock(&uuid_mutex);
	ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
	mutex_unlock(&uuid_mutex);

	return ret;
}
/*
 * Look for a btrfs signature on a device. This may be called out of the mount
 * path and we are not allowed to call set_blocksize during the scan. The
 * superblock is read via pagecache.
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
					   void *holder)
{
	struct btrfs_super_block *disk_super;
	bool new_device_added = false;
	struct btrfs_device *device = NULL;
	struct block_device *bdev;
	u64 bytenr, bytenr_orig;
	int ret;

	lockdep_assert_held(&uuid_mutex);

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	flags |= FMODE_EXCL;

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);

	bytenr_orig = btrfs_sb_offset(0);
	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
	if (ret)
		return ERR_PTR(ret);

	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
	if (IS_ERR(disk_super)) {
		device = ERR_CAST(disk_super);
		goto error_bdev_put;
	}

	device = device_list_add(path, disk_super, &new_device_added);
	if (!IS_ERR(device)) {
		if (new_device_added)
			btrfs_free_stale_devices(path, device);
	}

	btrfs_release_disk_super(disk_super);

error_bdev_put:
	blkdev_put(bdev, flags);

	return device;
}

/*
 * Try to find a chunk that intersects [start, start + len] range and when one
 * such is found, record the end of it in *start
 */
static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
				    u64 len)
{
	u64 physical_start, physical_end;

	lockdep_assert_held(&device->fs_info->chunk_mutex);

	if (!find_first_extent_bit(&device->alloc_state, *start,
				   &physical_start, &physical_end,
				   CHUNK_ALLOCATED, NULL)) {

		if (in_range(physical_start, *start, len) ||
		    in_range(*start, physical_start,
			     physical_end - physical_start)) {
			*start = physical_end + 1;
			return true;
		}
	}
	return false;
}
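/*
 * Worked example (invented numbers): with a CHUNK_ALLOCATED range of
 * [16M, 32M - 1] recorded in device->alloc_state, a search starting at
 * *start = 20M with len = 8M intersects that range, so *start is advanced
 * to 32M and true is returned.
 */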
static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
{
	switch (device->fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		/*
		 * We don't want to overwrite the superblock on the drive nor
		 * any area used by the boot loader (grub for example), so we
		 * make sure to start at an offset of at least 1MB.
		 */
		return max_t(u64, start, SZ_1M);
	case BTRFS_CHUNK_ALLOC_ZONED:
		/*
		 * We don't care about the starting region like the regular
		 * allocator, because we anyway use/reserve the first two zones
		 * for superblock logging.
		 */
		return ALIGN(start, device->zone_info->zone_size);
	default:
		BUG();
	}
}

static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
					u64 *hole_start, u64 *hole_size,
					u64 num_bytes)
{
	u64 zone_size = device->zone_info->zone_size;
	u64 pos;
	int ret;
	bool changed = false;

	ASSERT(IS_ALIGNED(*hole_start, zone_size));

	while (*hole_size > 0) {
		pos = btrfs_find_allocatable_zones(device, *hole_start,
						   *hole_start + *hole_size,
						   num_bytes);
		if (pos != *hole_start) {
			*hole_size = *hole_start + *hole_size - pos;
			*hole_start = pos;
			changed = true;
			if (*hole_size < num_bytes)
				break;
		}

		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);

		/* Range is ensured to be empty */
		if (!ret)
			return changed;

		/* Given hole range was invalid (outside of device) */
		if (ret == -ERANGE) {
			*hole_start += *hole_size;
			*hole_size = 0;
			return true;
		}

		*hole_start += zone_size;
		*hole_size -= zone_size;
		changed = true;
	}

	return changed;
}

/**
 * dev_extent_hole_check - check if specified hole is suitable for allocation
 * @device:	the device which we have the hole
 * @hole_start: starting position of the hole
 * @hole_size:	the size of the hole
 * @num_bytes:	the size of the free space that we need
 *
 * This function may modify @hole_start and @hole_size to reflect the suitable
 * position for allocation. Returns true if the hole position was updated,
 * false otherwise.
 */
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
				  u64 *hole_size, u64 num_bytes)
{
	bool changed = false;
	u64 hole_end = *hole_start + *hole_size;

	for (;;) {
		/*
		 * Check before we set max_hole_start, otherwise we could end up
		 * sending back this offset anyway.
		 */
		if (contains_pending_extent(device, hole_start, *hole_size)) {
			if (hole_end >= *hole_start)
				*hole_size = hole_end - *hole_start;
			else
				*hole_size = 0;
			changed = true;
		}

		switch (device->fs_devices->chunk_alloc_policy) {
		case BTRFS_CHUNK_ALLOC_REGULAR:
			/* No extra check */
			break;
		case BTRFS_CHUNK_ALLOC_ZONED:
			if (dev_extent_hole_check_zoned(device, hole_start,
							hole_size, num_bytes)) {
				changed = true;
				/*
				 * The changed hole can contain pending extent.
				 * Loop again to check that.
				 */
				continue;
			}
			break;
		default:
			BUG();
		}

		break;
	}

	return changed;
}
/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 *
 * NOTE: This function will search *commit* root of device tree, and does extra
 * check to ensure dev extents are not double allocated.
 * This makes the function safe to allocate dev extents but may not report
 * correct usable device space, as device extent freed in current transaction
 * is not reported as available.
 */
static int find_free_dev_extent_start(struct btrfs_device *device,
				u64 num_bytes, u64 search_start, u64 *start,
				u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	search_start = dev_extent_search_start(device, search_start);

	WARN_ON(device->zone_info &&
		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_backwards(root, &key, path);
	if (ret < 0)
		goto out;

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;
			dev_extent_hole_check(device, &search_start, &hole_size,
					      num_bytes);

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;
		if (dev_extent_hole_check(device, &search_start, &hole_size,
					  num_bytes)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}
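/*
 * Worked example (invented layout): on a 100G device with dev extents at
 * [1M, 1M + 10G) and [20G, 30G), a search for num_bytes = 16G finds the hole
 * between the extents (just under 10G) too small, but the trailing hole
 * [30G, 100G) is large enough, so *start is set to 30G and 0 is returned.
 */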
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
}

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_device *device,
			  u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret == 0)
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
out:
	btrfs_free_path(path);
	return ret;
}

static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map.rb_root);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}

static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	if (ret == 0) {
		/* Corruption */
		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
		ret = -EUCLEAN;
		goto error;
	}

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}
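/*
 * Key-space sketch for find_next_devid(): searching the chunk tree for
 * (BTRFS_DEV_ITEMS_OBJECTID, BTRFS_DEV_ITEM_KEY, (u64)-1) can never match
 * exactly, so btrfs_previous_item() lands on the highest existing DEV_ITEM.
 * With devices 1, 2 and 3 present, found_key.offset is 3 and *devid_ret
 * becomes 4; when no previous item exists, *devid_ret defaults to 1.
 */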
matched"); 1804 ret = -EUCLEAN; 1805 goto error; 1806 } 1807 1808 ret = btrfs_previous_item(fs_info->chunk_root, path, 1809 BTRFS_DEV_ITEMS_OBJECTID, 1810 BTRFS_DEV_ITEM_KEY); 1811 if (ret) { 1812 *devid_ret = 1; 1813 } else { 1814 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1815 path->slots[0]); 1816 *devid_ret = found_key.offset + 1; 1817 } 1818 ret = 0; 1819 error: 1820 btrfs_free_path(path); 1821 return ret; 1822 } 1823 1824 /* 1825 * the device information is stored in the chunk root 1826 * the btrfs_device struct should be fully filled in 1827 */ 1828 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, 1829 struct btrfs_device *device) 1830 { 1831 int ret; 1832 struct btrfs_path *path; 1833 struct btrfs_dev_item *dev_item; 1834 struct extent_buffer *leaf; 1835 struct btrfs_key key; 1836 unsigned long ptr; 1837 1838 path = btrfs_alloc_path(); 1839 if (!path) 1840 return -ENOMEM; 1841 1842 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1843 key.type = BTRFS_DEV_ITEM_KEY; 1844 key.offset = device->devid; 1845 1846 ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, 1847 &key, sizeof(*dev_item)); 1848 if (ret) 1849 goto out; 1850 1851 leaf = path->nodes[0]; 1852 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1853 1854 btrfs_set_device_id(leaf, dev_item, device->devid); 1855 btrfs_set_device_generation(leaf, dev_item, 0); 1856 btrfs_set_device_type(leaf, dev_item, device->type); 1857 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1858 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1859 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1860 btrfs_set_device_total_bytes(leaf, dev_item, 1861 btrfs_device_get_disk_total_bytes(device)); 1862 btrfs_set_device_bytes_used(leaf, dev_item, 1863 btrfs_device_get_bytes_used(device)); 1864 btrfs_set_device_group(leaf, dev_item, 0); 1865 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1866 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1867 btrfs_set_device_start_offset(leaf, dev_item, 0); 1868 1869 ptr = btrfs_device_uuid(dev_item); 1870 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1871 ptr = btrfs_device_fsid(dev_item); 1872 write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, 1873 ptr, BTRFS_FSID_SIZE); 1874 btrfs_mark_buffer_dirty(leaf); 1875 1876 ret = 0; 1877 out: 1878 btrfs_free_path(path); 1879 return ret; 1880 } 1881 1882 /* 1883 * Function to update ctime/mtime for a given device path. 1884 * Mainly used for ctime/mtime based probe like libblkid. 1885 */ 1886 static void update_dev_time(struct block_device *bdev) 1887 { 1888 struct inode *inode = bdev->bd_inode; 1889 struct timespec64 now; 1890 1891 /* Shouldn't happen but just in case. 
static int btrfs_rm_dev_item(struct btrfs_device *device)
{
	struct btrfs_root *root = device->fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);
	if (!ret)
		ret = btrfs_commit_transaction(trans);
	return ret;
}

/*
 * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number regarding eg. device
 * replace.
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
		u64 num_devices)
{
	u64 all_avail;
	unsigned seq;
	int i;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		all_avail = fs_info->avail_data_alloc_bits |
			    fs_info->avail_system_alloc_bits |
			    fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
		if (!(all_avail & btrfs_raid_array[i].bg_flag))
			continue;

		if (num_devices < btrfs_raid_array[i].devs_min)
			return btrfs_raid_array[i].mindev_error;
	}

	return 0;
}

static struct btrfs_device * btrfs_find_next_active_device(
		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
	struct btrfs_device *next_device;

	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
		if (next_device != device &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
		    && next_device->bdev)
			return next_device;
	}

	return NULL;
}

/*
 * Helper function to check if the given device is part of s_bdev / latest_dev
 * and replace it with the provided or the next active device. In the context
 * where this function is called, there should always be another device (or
 * this_dev) which is active.
 */
void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
					    struct btrfs_device *next_device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;

	if (!next_device)
		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
							    device);
	ASSERT(next_device);

	if (fs_info->sb->s_bdev &&
	    (fs_info->sb->s_bdev == device->bdev))
		fs_info->sb->s_bdev = next_device->bdev;

	if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
		fs_info->fs_devices->latest_dev = next_device;
}
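/*
 * Sketch of the replacement logic above: if the device being removed is
 * currently fs_info->sb->s_bdev or fs_devices->latest_dev, both pointers are
 * redirected to the next device that has a bdev and is not missing, so later
 * superblock writes keep going to a live device.
 */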
void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
			       struct block_device *bdev,
			       const char *device_path)
{
	struct btrfs_super_block *disk_super;
	int copy_num;

	if (!bdev)
		return;

	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
		struct page *page;
		int ret;

		disk_super = btrfs_read_dev_one_super(bdev, copy_num);
		if (IS_ERR(disk_super))
			continue;

		if (bdev_is_zoned(bdev)) {
			btrfs_reset_sb_log_zones(bdev, copy_num);
			continue;
		}

		memset(&disk_super->magic, 0, sizeof(disk_super->magic));

		page = virt_to_page(disk_super);
		set_page_dirty(page);
		lock_page(page);
		/* write_one_page() unlocks the page */
		ret = write_one_page(page);
		if (ret)
			btrfs_warn(fs_info,
				"error clearing superblock number %d (%d)",
				copy_num, ret);
		btrfs_release_disk_super(disk_super);

	}

	/* Notify udev that device has changed */
	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);

	/* Update ctime/mtime for device path for libblkid */
	update_dev_time(bdev);
}
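
/*
 * Remove a device from the filesystem: shrink it to zero, delete its dev
 * item and dev extents, drop it from the device lists and scratch its
 * superblocks. The caller does the final blkdev_put() with the returned
 * *bdev and *mode.
 */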
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
		    u64 devid, struct block_device **bdev, fmode_t *mode)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 num_devices;
	int ret = 0;

	mutex_lock(&uuid_mutex);

	num_devices = btrfs_num_devices(fs_info);

	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
	if (ret)
		goto out;

	device = btrfs_find_device_by_devspec(fs_info, devid, device_path);

	if (IS_ERR(device)) {
		if (PTR_ERR(device) == -ENOENT &&
		    device_path && strcmp(device_path, "missing") == 0)
			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
		else
			ret = PTR_ERR(device);
		goto out;
	}

	if (btrfs_pinned_by_swapfile(fs_info, device)) {
		btrfs_warn_in_rcu(fs_info,
		  "cannot remove device %s (devid %llu) due to active swapfile",
				  rcu_str_deref(device->name), device->devid);
		ret = -ETXTBSY;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    fs_info->fs_devices->rw_devices == 1) {
		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_del_init(&device->dev_alloc_list);
		device->fs_devices->rw_devices--;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	mutex_unlock(&uuid_mutex);
	ret = btrfs_shrink_device(device, 0);
	if (!ret)
		btrfs_reada_remove_dev(device);
	mutex_lock(&uuid_mutex);
	if (ret)
		goto error_undo;

	/*
	 * TODO: the superblock still includes this device in its num_devices
	 * counter although write_all_supers() is not locked out. This could
	 * give a filesystem state which requires a degraded mount.
	 */
	ret = btrfs_rm_dev_item(device);
	if (ret)
		goto error_undo;

	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	btrfs_scrub_cancel_dev(device);

	/*
	 * The device list mutex makes sure that we don't change the device
	 * list while someone else is writing out all the device supers.
	 * Whoever is writing all supers should lock the device list mutex
	 * before getting the number of devices in the super block
	 * (super_copy). Conversely, whoever updates the number of devices in
	 * the super block (super_copy) should hold the device list mutex.
	 */

	/*
	 * In normal cases cur_devices == fs_devices. But in case of deleting
	 * a seed device, cur_devices should point to its own fs_devices
	 * listed under fs_devices->seed_list.
	 */
	cur_devices = device->fs_devices;
	mutex_lock(&fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	cur_devices->num_devices--;
	cur_devices->total_devices--;
	/* Update total_devices of the parent fs_devices if it's seed */
	if (cur_devices != fs_devices)
		fs_devices->total_devices--;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		cur_devices->missing_devices--;

	btrfs_assign_next_active_device(device, NULL);

	if (device->bdev) {
		cur_devices->open_devices--;
		/* remove sysfs entry */
		btrfs_sysfs_remove_device(device);
	}

	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * At this point, the device is zero sized and detached from the
	 * devices list. All that's left is to zero out the old supers and
	 * free the device.
	 *
	 * We cannot call btrfs_close_bdev() here because we're holding the sb
	 * write lock, and blkdev_put() will pull in the ->open_mutex on the
	 * block device and its dependencies. Instead just flush the device
	 * and let the caller do the final blkdev_put.
	 */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		btrfs_scratch_superblocks(fs_info, device->bdev,
					  device->name->str);
		if (device->bdev) {
			sync_blockdev(device->bdev);
			invalidate_bdev(device->bdev);
		}
	}

	*bdev = device->bdev;
	*mode = device->mode;
	synchronize_rcu();
	btrfs_free_device(device);

	if (cur_devices->open_devices == 0) {
		list_del_init(&cur_devices->seed_list);
		close_fs_devices(cur_devices);
		free_fs_devices(cur_devices);
	}

out:
	mutex_unlock(&uuid_mutex);
	return ret;

error_undo:
	btrfs_reada_undo_remove_dev(device);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_add(&device->dev_alloc_list,
			 &fs_devices->alloc_list);
		device->fs_devices->rw_devices++;
		mutex_unlock(&fs_info->chunk_mutex);
	}
	goto out;
}
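
/*
 * Drop the source device of a dev-replace from the in-memory device lists
 * and counters. Must be called with the device_list_mutex held; freeing the
 * device itself is done separately in btrfs_rm_dev_replace_free_srcdev().
 */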
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices;

	lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);

	/*
	 * In case of fs with no seed, srcdev->fs_devices will point to
	 * fs_devices of fs_info. However when the dev being replaced is a
	 * seed dev it will point to the seed's local fs_devices. In short,
	 * srcdev will have its correct fs_devices in both the cases.
	 */
	fs_devices = srcdev->fs_devices;

	list_del_rcu(&srcdev->dev_list);
	list_del(&srcdev->dev_alloc_list);
	fs_devices->num_devices--;
	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
		fs_devices->missing_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
		fs_devices->rw_devices--;

	if (srcdev->bdev)
		fs_devices->open_devices--;
}

void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;

	mutex_lock(&uuid_mutex);

	btrfs_close_bdev(srcdev);
	synchronize_rcu();
	btrfs_free_device(srcdev);

	/* If there are no more devices, delete the fs_devices */
	if (!fs_devices->num_devices) {
		/*
		 * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target is added to the sprout FS, so there will be no
		 * devices left under the seed FS.
		 */
		ASSERT(fs_devices->seeding);

		list_del_init(&fs_devices->seed_list);
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	mutex_unlock(&uuid_mutex);
}

void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
{
	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;

	mutex_lock(&fs_devices->device_list_mutex);

	btrfs_sysfs_remove_device(tgtdev);

	if (tgtdev->bdev)
		fs_devices->open_devices--;

	fs_devices->num_devices--;

	btrfs_assign_next_active_device(tgtdev, NULL);

	list_del_rcu(&tgtdev->dev_list);

	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * The update_dev_time() within btrfs_scratch_superblocks() may lead
	 * to a call to btrfs_show_devname() which will try to hold
	 * device_list_mutex. As this device is already out of the device
	 * list, we don't have to hold the device_list_mutex lock here.
	 */
	btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
				  tgtdev->name->str);

	btrfs_close_bdev(tgtdev);
	synchronize_rcu();
	btrfs_free_device(tgtdev);
}

static struct btrfs_device *btrfs_find_device_by_path(
		struct btrfs_fs_info *fs_info, const char *device_path)
{
	int ret = 0;
	struct btrfs_super_block *disk_super;
	u64 devid;
	u8 *dev_uuid;
	struct block_device *bdev;
	struct btrfs_device *device;

	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
				    fs_info->bdev_holder, 0, &bdev, &disk_super);
	if (ret)
		return ERR_PTR(ret);

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	dev_uuid = disk_super->dev_item.uuid;
	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
					   disk_super->metadata_uuid);
	else
		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
					   disk_super->fsid);

	btrfs_release_disk_super(disk_super);
	if (!device)
		device = ERR_PTR(-ENOENT);
	blkdev_put(bdev, FMODE_READ);
	return device;
}

/*
 * Lookup a device given by device id, or the path if the id is 0.
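 * The special path "missing" matches the first device that is present in the
 * filesystem metadata but has no attached block device.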
 */
struct btrfs_device *btrfs_find_device_by_devspec(
		struct btrfs_fs_info *fs_info, u64 devid,
		const char *device_path)
{
	struct btrfs_device *device;

	if (devid) {
		device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
					   NULL);
		if (!device)
			return ERR_PTR(-ENOENT);
		return device;
	}

	if (!device_path || !device_path[0])
		return ERR_PTR(-EINVAL);

	if (strcmp(device_path, "missing") == 0) {
		/* Find first missing device */
		list_for_each_entry(device, &fs_info->fs_devices->devices,
				    dev_list) {
			if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				     &device->dev_state) && !device->bdev)
				return device;
		}
		return ERR_PTR(-ENOENT);
	}

	return btrfs_find_device_by_path(fs_info, device_path);
}

/*
 * Does all the dirty work required for changing the filesystem's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	lockdep_assert_held(&uuid_mutex);
	if (!fs_devices->seeding)
		return -EINVAL;

	/*
	 * Private copy of the seed devices, anchored at
	 * fs_info->fs_devices->seed_list
	 */
	seed_devices = alloc_fs_devices(NULL, NULL);
	if (IS_ERR(seed_devices))
		return PTR_ERR(seed_devices);

	/*
	 * It's necessary to retain a copy of the original seed fs_devices in
	 * fs_uuids so that filesystems which have been seeded can successfully
	 * reference the seed device from open_seed_devices. This also supports
	 * multiple seed filesystems.
	 */
	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->fs_list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	mutex_lock(&fs_devices->device_list_mutex);
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			     synchronize_rcu);
	list_for_each_entry(device, &seed_devices->devices, dev_list)
		device->fs_devices = seed_devices;

	fs_devices->seeding = false;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->missing_devices = 0;
	fs_devices->rotating = false;
	list_add(&seed_devices->seed_list, &fs_devices->seed_list);

	generate_random_uuid(fs_devices->fsid);
	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	mutex_unlock(&fs_devices->device_list_mutex);

	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}

/*
 * Store the expected generation for seed devices in device items.
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dev_item *dev_item;
	struct btrfs_device *device;
	struct btrfs_key key;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];
	u64 devid;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_DEV_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		if (ret < 0)
			goto error;

		leaf = path->nodes[0];
next_slot:
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;
			if (ret < 0)
				goto error;
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			btrfs_release_path(path);
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
		    key.type != BTRFS_DEV_ITEM_KEY)
			break;

		dev_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_dev_item);
		devid = btrfs_device_id(leaf, dev_item);
		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
				   BTRFS_UUID_SIZE);
		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
				   BTRFS_FSID_SIZE);
		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
					   fs_uuid);
		BUG_ON(!device); /* Logic error */

		if (device->fs_devices->seeding) {
			btrfs_set_device_generation(leaf, dev_item,
						    device->generation);
			btrfs_mark_buffer_dirty(leaf);
		}

		path->slots[0]++;
		goto next_slot;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}
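
/*
 * Add a new device to the filesystem given by the path to its block device.
 * If the filesystem is a seed filesystem, this also sprouts a new writable
 * filesystem on top of it.
 */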
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
	struct btrfs_root *root = fs_info->dev_root;
	struct request_queue *q;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct super_block *sb = fs_info->sb;
	struct rcu_string *name;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 orig_super_total_bytes;
	u64 orig_super_num_devices;
	int seeding_dev = 0;
	int ret = 0;
	bool locked = false;

	if (sb_rdonly(sb) && !fs_devices->seeding)
		return -EROFS;

	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
				  fs_info->bdev_holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	if (!btrfs_check_device_zone_type(fs_info, bdev)) {
		ret = -EINVAL;
		goto error;
	}

	if (fs_devices->seeding) {
		seeding_dev = 1;
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
		locked = true;
	}

	sync_blockdev(bdev);

	rcu_read_lock();
	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
		if (device->bdev == bdev) {
			ret = -EEXIST;
			rcu_read_unlock();
			goto error;
		}
	}
	rcu_read_unlock();

	device = btrfs_alloc_device(fs_info, NULL, NULL);
	if (IS_ERR(device)) {
		/* we can safely leave the fs_devices entry around */
		ret = PTR_ERR(device);
		goto error;
	}

	name = rcu_string_strdup(device_path, GFP_KERNEL);
	if (!name) {
		ret = -ENOMEM;
		goto error_free_device;
	}
	rcu_assign_pointer(device->name, name);

	device->fs_info = fs_info;
	device->bdev = bdev;

	ret = btrfs_get_dev_zone_info(device);
	if (ret)
		goto error_free_device;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error_free_zone;
	}

	q = bdev_get_queue(bdev);
	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	device->generation = trans->transid;
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
	device->total_bytes = round_down(i_size_read(bdev->bd_inode),
					 fs_info->sectorsize);
	device->disk_total_bytes = device->total_bytes;
	device->commit_total_bytes = device->total_bytes;
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
	device->mode = FMODE_EXCL;
	device->dev_stats_valid = 1;
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);

	if (seeding_dev) {
		btrfs_clear_sb_rdonly(sb);
		ret = btrfs_prepare_sprout(fs_info);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_trans;
		}
		btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
						device);
	}

	device->fs_devices = fs_devices;

	mutex_lock(&fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_add_rcu(&device->dev_list, &fs_devices->devices);
	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
	fs_devices->num_devices++;
	fs_devices->open_devices++;
	fs_devices->rw_devices++;
	fs_devices->total_devices++;
	fs_devices->total_rw_bytes += device->total_bytes;

	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);

	if (!blk_queue_nonrot(q))
		fs_devices->rotating = true;

	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
	btrfs_set_super_total_bytes(fs_info->super_copy,
		round_down(orig_super_total_bytes + device->total_bytes,
			   fs_info->sectorsize));

	orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices + 1);

	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
	btrfs_clear_space_info_full(fs_info);

	mutex_unlock(&fs_info->chunk_mutex);

	/* Add sysfs device entry */
	btrfs_sysfs_add_device(device);

	mutex_unlock(&fs_devices->device_list_mutex);

	if (seeding_dev) {
		mutex_lock(&fs_info->chunk_mutex);
		ret = init_first_rw_device(trans);
		mutex_unlock(&fs_info->chunk_mutex);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}
	}

	ret = btrfs_add_dev_item(trans, device);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error_sysfs;
	}

	if (seeding_dev) {
		ret = btrfs_finish_sprout(trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}

		/*
		 * fs_devices now represents the newly sprouted filesystem and
		 * its fsid has been changed by btrfs_prepare_sprout
		 */
		btrfs_sysfs_update_sprout_fsid(fs_devices);
	}

	ret = btrfs_commit_transaction(trans);
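
	/*
	 * On a sprouted filesystem, drop the locks taken for seeding before
	 * relocating the system chunks, then commit any transaction left
	 * running via btrfs_attach_transaction().
	 */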
	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
		locked = false;

		if (ret) /* transaction commit */
			return ret;

		ret = btrfs_relocate_sys_chunks(fs_info);
		if (ret < 0)
			btrfs_handle_fs_error(fs_info, ret,
				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
			ret = PTR_ERR(trans);
			trans = NULL;
			goto error_sysfs;
		}
		ret = btrfs_commit_transaction(trans);
	}

	/*
	 * Now that we have written a new super block to this device, check all
	 * other fs_devices list if device_path alienates any other scanned
	 * device.
	 * We can ignore the return value as it typically returns -EINVAL and
	 * only succeeds if the device was an alien.
	 */
	btrfs_forget_devices(device_path);

	/* Update ctime/mtime for blkid or udev */
	update_dev_time(bdev);

	return ret;

error_sysfs:
	btrfs_sysfs_remove_device(device);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_del_rcu(&device->dev_list);
	list_del(&device->dev_alloc_list);
	fs_info->fs_devices->num_devices--;
	fs_info->fs_devices->open_devices--;
	fs_info->fs_devices->rw_devices--;
	fs_info->fs_devices->total_devices--;
	fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
	atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
	btrfs_set_super_total_bytes(fs_info->super_copy,
				    orig_super_total_bytes);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices);
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
	if (seeding_dev)
		btrfs_set_sb_rdonly(sb);
	if (trans)
		btrfs_end_transaction(trans);
error_free_zone:
	btrfs_destroy_dev_zone_info(device);
error_free_device:
	btrfs_free_device(device);
error:
	blkdev_put(bdev, FMODE_EXCL);
	if (locked) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
	return ret;
}
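
/*
 * Write the in-memory device sizes and parameters back to the device item in
 * the chunk tree.
 */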
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
					struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->fs_info->chunk_root;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_mark_buffer_dirty(leaf);

out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_grow_device(struct btrfs_trans_handle *trans,
		      struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	u64 old_total;
	u64 diff;

	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return -EACCES;

	new_size = round_down(new_size, fs_info->sectorsize);

	mutex_lock(&fs_info->chunk_mutex);
	old_total = btrfs_super_total_bytes(super_copy);
	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);

	if (new_size <= device->total_bytes ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		mutex_unlock(&fs_info->chunk_mutex);
		return -EINVAL;
	}

	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total + diff, fs_info->sectorsize));
	device->fs_devices->total_rw_bytes += diff;

	btrfs_device_set_total_bytes(device, new_size);
	btrfs_device_set_disk_total_bytes(device, new_size);
	btrfs_clear_space_info_full(device->fs_info);
	if (list_empty(&device->post_commit_list))
		list_add_tail(&device->post_commit_list,
			      &trans->transaction->dev_update_list);
	mutex_unlock(&fs_info->chunk_mutex);

	return btrfs_update_device(trans, device);
}

static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = chunk_offset;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	else if (ret > 0) { /* Logic error or corruption */
		btrfs_handle_fs_error(fs_info, -ENOENT,
				      "Failed lookup while freeing chunk.");
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret < 0)
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to delete chunk item.");
out:
	btrfs_free_path(path);
	return ret;
}
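
/*
 * Remove the entry for the given chunk from the sys_chunk_array embedded in
 * the in-memory copy of the super block. Must be called with the chunk_mutex
 * held.
 */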
static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	lockdep_assert_held(&fs_info->chunk_mutex);
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)(ptr + len);
			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			len += btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
		    key.offset == chunk_offset) {
			memmove(ptr, ptr + len, array_size - (cur + len));
			array_size -= len;
			btrfs_set_super_sys_array_size(super_copy, array_size);
		} else {
			ptr += len;
			cur += len;
		}
	}
	return ret;
}

/*
 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
 * @fs_info: The filesystem.
 * @logical: Logical block offset in bytes.
 * @length:  Length of extent in bytes.
 *
 * Return: Chunk mapping or ERR_PTR.
 */
struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
				       u64 logical, u64 length)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;

	em_tree = &fs_info->mapping_tree;
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, length);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_crit(fs_info, "unable to find logical %llu length %llu",
			   logical, length);
		return ERR_PTR(-EINVAL);
	}

	if (em->start > logical || em->start + em->len < logical) {
		btrfs_crit(fs_info,
			   "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
			   logical, length, em->start, em->start + em->len);
		free_extent_map(em);
		return ERR_PTR(-EINVAL);
	}

	/* Callers are responsible for dropping em's ref. */
	return em;
}

static int remove_chunk_item(struct btrfs_trans_handle *trans,
			     struct map_lookup *map, u64 chunk_offset)
{
	int i;

	/*
	 * Removing chunk items and updating the device items in the chunks
	 * btree requires holding the chunk_mutex.
	 * See the comment at btrfs_chunk_alloc() for the details.
	 */
	lockdep_assert_held(&trans->fs_info->chunk_mutex);

	for (i = 0; i < map->num_stripes; i++) {
		int ret;

		ret = btrfs_update_device(trans, map->stripes[i].dev);
		if (ret)
			return ret;
	}

	return btrfs_free_chunk(trans, chunk_offset);
}
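
/*
 * Remove a chunk from the filesystem: delete the device extents, the chunk
 * item (and its copy in the superblock's sys_chunk_array for system chunks)
 * and finally the block group itself.
 */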
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct extent_map *em;
	struct map_lookup *map;
	u64 dev_extent_len = 0;
	int i, ret = 0;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em)) {
		/*
		 * This is a logic error, but we don't want to just rely on the
		 * user having built with ASSERT enabled, so if ASSERT doesn't
		 * do anything we still error out.
		 */
		ASSERT(0);
		return PTR_ERR(em);
	}
	map = em->map_lookup;

	/*
	 * First delete the device extent items from the devices btree.
	 * We take the device_list_mutex to avoid racing with the finishing phase
	 * of a device replace operation. See the comment below before acquiring
	 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
	 * because that can result in a deadlock when deleting the device extent
	 * items from the devices btree - COWing an extent buffer from the btree
	 * may result in allocating a new metadata chunk, which would attempt to
	 * lock again fs_info->chunk_mutex.
	 */
	mutex_lock(&fs_devices->device_list_mutex);
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		ret = btrfs_free_dev_extent(trans, device,
					    map->stripes[i].physical,
					    &dev_extent_len);
		if (ret) {
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		if (device->bytes_used > 0) {
			mutex_lock(&fs_info->chunk_mutex);
			btrfs_device_set_bytes_used(device,
					device->bytes_used - dev_extent_len);
			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
			btrfs_clear_space_info_full(fs_info);
			mutex_unlock(&fs_info->chunk_mutex);
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * We acquire fs_info->chunk_mutex for 2 reasons:
	 *
	 * 1) Just like with the first phase of the chunk allocation, we must
	 *    reserve system space, do all chunk btree updates and deletions, and
	 *    update the system chunk array in the superblock while holding this
	 *    mutex. This is for similar reasons as explained on the comment at
	 *    the top of btrfs_chunk_alloc();
	 *
	 * 2) Prevent races with the final phase of a device replace operation
	 *    that replaces the device object associated with the map's stripes,
	 *    because the device object's id can change at any time during that
	 *    final phase of the device replace operation
	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
	 *    replaced device and then see it with an ID of
	 *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
	 *    the device item, which does not exist in the chunk btree.
	 *    The finishing phase of device replace acquires both the
	 *    device_list_mutex and the chunk_mutex, in that order, so we are
	 *    safe by just acquiring the chunk_mutex.
	 */
	trans->removing_chunk = true;
	mutex_lock(&fs_info->chunk_mutex);

	check_system_chunk(trans, map->type);

	ret = remove_chunk_item(trans, map, chunk_offset);
	/*
	 * Normally we should not get -ENOSPC since we reserved space before
	 * through the call to check_system_chunk().
	 *
	 * Despite our system space_info having enough free space, we may not
	 * be able to allocate extents from its block groups, because all have
	 * an incompatible profile, which will force us to allocate a new system
	 * block group with the right profile, or right after we called
	 * check_system_chunk() above, a scrub turned the only system block group
	 * with enough free space into RO mode.
	 * This is explained with more detail at do_chunk_alloc().
	 *
	 * So if we get -ENOSPC, allocate a new system chunk and retry once.
	 */
	if (ret == -ENOSPC) {
		const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
		struct btrfs_block_group *sys_bg;

		sys_bg = btrfs_create_chunk(trans, sys_flags);
		if (IS_ERR(sys_bg)) {
			ret = PTR_ERR(sys_bg);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = remove_chunk_item(trans, map, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	} else if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}

	mutex_unlock(&fs_info->chunk_mutex);
	trans->removing_chunk = false;

	/*
	 * We are done with chunk btree updates and deletions, so release the
	 * system space we previously reserved (with check_system_chunk()).
	 */
	btrfs_trans_release_chunk_metadata(trans);

	ret = btrfs_remove_block_group(trans, chunk_offset, em);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

out:
	if (trans->removing_chunk) {
		mutex_unlock(&fs_info->chunk_mutex);
		trans->removing_chunk = false;
	}
	/* once for us */
	free_extent_map(em);
	return ret;
}

int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_block_group *block_group;
	u64 length;
	int ret;

	/*
	 * Prevent races with automatic removal of unused block groups.
	 * After we relocate and before we remove the chunk with offset
	 * chunk_offset, automatic removal of the block group can kick in,
	 * resulting in a failure when calling btrfs_remove_chunk() below.
	 *
	 * Make sure to acquire this mutex before doing a tree search (dev
	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
	 * we release the path used to search the chunk/dev tree and before
	 * the current task acquires this mutex and calls us.
	 */
	lockdep_assert_held(&fs_info->reclaim_bgs_lock);

	/* step one, relocate all the extents inside this chunk */
	btrfs_scrub_pause(fs_info);
	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
	btrfs_scrub_continue(fs_info);
	if (ret)
		return ret;

	block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
	if (!block_group)
		return -ENOENT;
	btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
	length = block_group->length;
	btrfs_put_block_group(block_group);

	/*
	 * On a zoned file system, discard the whole block group, this will
	 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
	 * resetting the zone fails, don't treat it as a fatal problem from the
	 * filesystem's point of view.
	 */
	if (btrfs_is_zoned(fs_info)) {
		ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
		if (ret)
			btrfs_info(fs_info,
				   "failed to reset zone %llu after relocation",
				   chunk_offset);
	}

	trans = btrfs_start_trans_remove_block_group(root->fs_info,
						     chunk_offset);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_handle_fs_error(root->fs_info, ret, NULL);
		return ret;
	}

	/*
	 * step two, delete the device extents and the
	 * chunk tree entries
	 */
	ret = btrfs_remove_chunk(trans, chunk_offset);
	btrfs_end_transaction(trans);
	return ret;
}
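
/*
 * Relocate all SYSTEM chunks. Walks the chunk tree from the end, relocating
 * every system chunk it finds, and retries once if some relocations failed
 * with -ENOSPC.
 */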
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_type;
	bool retried = false;
	int failed = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

again:
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto error;
		}
		BUG_ON(ret == 0); /* Corruption */

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
		if (ret)
			mutex_unlock(&fs_info->reclaim_bgs_lock);
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
		btrfs_release_path(path);

		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
			if (ret == -ENOSPC)
				failed++;
			else
				BUG_ON(ret);
		}
		mutex_unlock(&fs_info->reclaim_bgs_lock);

		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	ret = 0;
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (WARN_ON(failed && retried)) {
		ret = -ENOSPC;
	}
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * return 1 : allocated a data chunk successfully,
 * return <0: errors during allocating a data chunk,
 * return 0 : no need to allocate a data chunk.
 */
static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
				      u64 chunk_offset)
{
	struct btrfs_block_group *cache;
	u64 bytes_used;
	u64 chunk_type;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	ASSERT(cache);
	chunk_type = cache->flags;
	btrfs_put_block_group(cache);

	if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
		return 0;

	spin_lock(&fs_info->data_sinfo->lock);
	bytes_used = fs_info->data_sinfo->bytes_used;
	spin_unlock(&fs_info->data_sinfo->lock);

	if (!bytes_used) {
		struct btrfs_trans_handle *trans;
		int ret;

		trans = btrfs_join_transaction(fs_info->tree_root);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
		btrfs_end_transaction(trans);
		if (ret < 0)
			return ret;
		return 1;
	}

	return 0;
}

static int insert_balance_item(struct btrfs_fs_info *fs_info,
			       struct btrfs_balance_control *bctl)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));

	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
	btrfs_set_balance_data(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
	btrfs_set_balance_meta(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
	btrfs_set_balance_sys(leaf, item, &disk_bargs);

	btrfs_set_balance_flags(leaf, item, bctl->flags);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}

static int del_balance_item(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}

/*
 * This is a heuristic used to reduce the number of chunks balanced on
 * resume after balance was interrupted.
 */
static void update_balance_args(struct btrfs_balance_control *bctl)
{
	/*
	 * Turn on soft mode for chunk types that were being converted.
	 */
	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;

	/*
	 * Turn on usage filter if it is not already used. The idea is
	 * that chunks that we have already balanced should be
	 * reasonably full. Don't do it for chunks that are being
	 * converted - that will keep us from relocating unconverted
	 * (albeit full) chunks.
	 */
	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->data.usage = 90;
	}
	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->sys.usage = 90;
	}
	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->meta.usage = 90;
	}
}

/*
 * Clear the balance status in fs_info and delete the balance item from disk.
 */
static void reset_balance_state(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	int ret;

	BUG_ON(!fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = NULL;
	spin_unlock(&fs_info->balance_lock);

	kfree(bctl);
	ret = del_balance_item(fs_info);
	if (ret)
		btrfs_handle_fs_error(fs_info, ret, NULL);
}

/*
 * Balance filters. Return 1 if chunk should be filtered out
 * (should not be balanced).
 */
static int chunk_profiles_filter(u64 chunk_type,
				 struct btrfs_balance_args *bargs)
{
	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->profiles & chunk_type)
		return 0;

	return 1;
}

static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
				    struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group *cache;
	u64 chunk_used;
	u64 user_thresh_min;
	u64 user_thresh_max;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = cache->used;

	if (bargs->usage_min == 0)
		user_thresh_min = 0;
	else
		user_thresh_min = div_factor_fine(cache->length,
						  bargs->usage_min);

	if (bargs->usage_max == 0)
		user_thresh_max = 1;
	else if (bargs->usage_max > 100)
		user_thresh_max = cache->length;
	else
		user_thresh_max = div_factor_fine(cache->length,
						  bargs->usage_max);

	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}

static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
			      u64 chunk_offset, struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group *cache;
	u64 chunk_used, user_thresh;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = cache->used;

	if (bargs->usage_min == 0)
		user_thresh = 1;
	else if (bargs->usage > 100)
		user_thresh = cache->length;
	else
		user_thresh = div_factor_fine(cache->length, bargs->usage);

	if (chunk_used < user_thresh)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}

static int chunk_devid_filter(struct extent_buffer *leaf,
			      struct btrfs_chunk *chunk,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	int i;

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
			return 0;
	}

	return 1;
}
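
/*
 * Number of data stripes in a chunk, ie. not counting parity and extra
 * copies. For example raid10 with 4 stripes has (4 - 0) / 2 = 2 data
 * stripes, raid6 with 6 stripes has (6 - 2) / 1 = 4.
 */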
static u64 calc_data_stripes(u64 type, int num_stripes)
{
	const int index = btrfs_bg_flags_to_raid_index(type);
	const int ncopies = btrfs_raid_array[index].ncopies;
	const int nparity = btrfs_raid_array[index].nparity;

	return (num_stripes - nparity) / ncopies;
}

/* [pstart, pend) */
static int chunk_drange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	u64 stripe_offset;
	u64 stripe_length;
	u64 type;
	int factor;
	int i;

	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
		return 0;

	type = btrfs_chunk_type(leaf, chunk);
	factor = calc_data_stripes(type, num_stripes);

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
			continue;

		stripe_offset = btrfs_stripe_offset(leaf, stripe);
		stripe_length = btrfs_chunk_length(leaf, chunk);
		stripe_length = div_u64(stripe_length, factor);

		if (stripe_offset < bargs->pend &&
		    stripe_offset + stripe_length > bargs->pstart)
			return 0;
	}

	return 1;
}

/* [vstart, vend) */
static int chunk_vrange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       u64 chunk_offset,
			       struct btrfs_balance_args *bargs)
{
	if (chunk_offset < bargs->vend &&
	    chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
		/* at least part of the chunk is inside this vrange */
		return 0;

	return 1;
}

static int chunk_stripes_range_filter(struct extent_buffer *leaf,
				      struct btrfs_chunk *chunk,
				      struct btrfs_balance_args *bargs)
{
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

	if (bargs->stripes_min <= num_stripes
	    && num_stripes <= bargs->stripes_max)
		return 0;

	return 1;
}

static int chunk_soft_convert_filter(u64 chunk_type,
				     struct btrfs_balance_args *bargs)
{
	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
		return 0;

	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->target == chunk_type)
		return 1;

	return 0;
}
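
/*
 * Apply all configured balance filters to a chunk. Returns 1 if the chunk
 * should be relocated, 0 if it is filtered out.
 */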
static int should_balance_chunk(struct extent_buffer *leaf,
				struct btrfs_chunk *chunk, u64 chunk_offset)
{
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_balance_args *bargs = NULL;
	u64 chunk_type = btrfs_chunk_type(leaf, chunk);

	/* type filter */
	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
		return 0;
	}

	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
		bargs = &bctl->data;
	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
		bargs = &bctl->sys;
	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
		bargs = &bctl->meta;

	/* profiles filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
	    chunk_profiles_filter(chunk_type, bargs)) {
		return 0;
	}

	/* usage filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	}

	/* devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
	    chunk_devid_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* drange filter, makes sense only with devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
	    chunk_drange_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* vrange filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
		return 0;
	}

	/* stripes filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* soft profile changing mode */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
	    chunk_soft_convert_filter(chunk_type, bargs)) {
		return 0;
	}

	/*
	 * limited by count, must be the last filter
	 */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
		if (bargs->limit == 0)
			return 0;
		else
			bargs->limit--;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
		/*
		 * Same logic as the 'limit' filter; the minimum cannot be
		 * determined here because we do not have the global
		 * information about the count of all chunks that satisfy
		 * the filters.
		 */
		if (bargs->limit_max == 0)
			return 0;
		else
			bargs->limit_max--;
	}

	return 1;
}

static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	u64 chunk_type;
	struct btrfs_chunk *chunk;
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int slot;
	int ret;
	int enospc_errors = 0;
	bool counting = true;
	/*
	 * The single value limit and min/max limits use the same bytes in
	 * the btrfs_balance_args, so save the original limits here.
	 */
	u64 limit_data = bctl->data.limit;
	u64 limit_meta = bctl->meta.limit;
	u64 limit_sys = bctl->sys.limit;
	u32 count_data = 0;
	u32 count_meta = 0;
	u32 count_sys = 0;
	int chunk_reserved = 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}

	/* zero out stat counters */
	spin_lock(&fs_info->balance_lock);
	memset(&bctl->stat, 0, sizeof(bctl->stat));
	spin_unlock(&fs_info->balance_lock);
again:
	if (!counting) {
		/*
		 * The single value limit and min/max limits use the same
		 * bytes in the btrfs_balance_args, restore the original
		 * values here.
		 */
		bctl->data.limit = limit_data;
		bctl->meta.limit = limit_meta;
		bctl->sys.limit = limit_sys;
	}
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
		    atomic_read(&fs_info->balance_cancel_req)) {
			ret = -ECANCELED;
			goto error;
		}

		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto error;
		}

		/*
		 * this shouldn't happen, it means the last relocate
		 * failed
		 */
		if (ret == 0)
			BUG(); /* FIXME break ? */

		ret = btrfs_previous_item(chunk_root, path, 0,
					  BTRFS_CHUNK_ITEM_KEY);
		if (ret) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			ret = 0;
			break;
		}

		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid != key.objectid) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			break;
		}

		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);

		if (!counting) {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.considered++;
			spin_unlock(&fs_info->balance_lock);
		}

		ret = should_balance_chunk(leaf, chunk, found_key.offset);

		btrfs_release_path(path);
		if (!ret) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto loop;
		}

		if (counting) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			spin_lock(&fs_info->balance_lock);
			bctl->stat.expected++;
			spin_unlock(&fs_info->balance_lock);

			if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				count_data++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				count_sys++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				count_meta++;

			goto loop;
		}

		/*
		 * Apply limit_min filter, no need to check if the LIMITS
		 * filter is used, limit_min is 0 by default
		 */
		if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
		     count_data < bctl->data.limit_min)
		    || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
			count_meta < bctl->meta.limit_min)
		    || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
			count_sys < bctl->sys.limit_min)) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto loop;
		}

		if (!chunk_reserved) {
			/*
			 * We may be relocating the only data chunk we have,
			 * which could potentially end up with losing data's
			 * raid profile, so lets allocate an empty one in
			 * advance.
			 */
			ret = btrfs_may_alloc_data_chunk(fs_info,
							 found_key.offset);
			if (ret < 0) {
				mutex_unlock(&fs_info->reclaim_bgs_lock);
				goto error;
			} else if (ret == 1) {
				chunk_reserved = 1;
			}
		}

		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
		mutex_unlock(&fs_info->reclaim_bgs_lock);
		if (ret == -ENOSPC) {
			enospc_errors++;
		} else if (ret == -ETXTBSY) {
			btrfs_info(fs_info,
	   "skipping relocation of block group %llu due to active swapfile",
				   found_key.offset);
			ret = 0;
		} else if (ret) {
			goto error;
		} else {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.completed++;
			spin_unlock(&fs_info->balance_lock);
		}
loop:
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}

	if (counting) {
		btrfs_release_path(path);
		counting = false;
		goto again;
	}
error:
	btrfs_free_path(path);
	if (enospc_errors) {
		btrfs_info(fs_info, "%d enospc errors during balance",
			   enospc_errors);
		if (!ret)
			ret = -ENOSPC;
	}

	return ret;
}
BTRFS_EXTENDED_PROFILE_MASK : 3960 BTRFS_BLOCK_GROUP_PROFILE_MASK); 3961 3962 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 3963 3964 /* 1) check that all other bits are zeroed */ 3965 if (flags & ~mask) 3966 return 0; 3967 3968 /* 2) see if profile is reduced */ 3969 if (flags == 0) 3970 return !extended; /* "0" is valid for usual profiles */ 3971 3972 return has_single_bit_set(flags); 3973 } 3974 3975 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3976 { 3977 /* cancel requested || normal exit path */ 3978 return atomic_read(&fs_info->balance_cancel_req) || 3979 (atomic_read(&fs_info->balance_pause_req) == 0 && 3980 atomic_read(&fs_info->balance_cancel_req) == 0); 3981 } 3982 3983 /* 3984 * Validate target profile against allowed profiles and return true if it's OK. 3985 * Otherwise print the error message and return false. 3986 */ 3987 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, 3988 const struct btrfs_balance_args *bargs, 3989 u64 allowed, const char *type) 3990 { 3991 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3992 return true; 3993 3994 if (fs_info->sectorsize < PAGE_SIZE && 3995 bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3996 btrfs_err(fs_info, 3997 "RAID56 is not yet supported for sectorsize %u with page size %lu", 3998 fs_info->sectorsize, PAGE_SIZE); 3999 return false; 4000 } 4001 /* Profile is valid and does not have bits outside of the allowed set */ 4002 if (alloc_profile_is_valid(bargs->target, 1) && 4003 (bargs->target & ~allowed) == 0) 4004 return true; 4005 4006 btrfs_err(fs_info, "balance: invalid convert %s profile %s", 4007 type, btrfs_bg_type_to_raid_name(bargs->target)); 4008 return false; 4009 } 4010 4011 /* 4012 * Fill @buf with textual description of balance filter flags @bargs, up to 4013 * @size_buf including the terminating null. The output may be trimmed if it 4014 * does not fit into the provided buffer. 
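 *
 * For example (an illustrative combination, not taken from a real run):
 * with the CONVERT and SOFT flags set and a raid1 target, the buffer
 * would hold "convert=raid1,soft," until the trailing comma is trimmed
 * at out_overflow below.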
4015 */ 4016 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, 4017 u32 size_buf) 4018 { 4019 int ret; 4020 u32 size_bp = size_buf; 4021 char *bp = buf; 4022 u64 flags = bargs->flags; 4023 char tmp_buf[128] = {'\0'}; 4024 4025 if (!flags) 4026 return; 4027 4028 #define CHECK_APPEND_NOARG(a) \ 4029 do { \ 4030 ret = snprintf(bp, size_bp, (a)); \ 4031 if (ret < 0 || ret >= size_bp) \ 4032 goto out_overflow; \ 4033 size_bp -= ret; \ 4034 bp += ret; \ 4035 } while (0) 4036 4037 #define CHECK_APPEND_1ARG(a, v1) \ 4038 do { \ 4039 ret = snprintf(bp, size_bp, (a), (v1)); \ 4040 if (ret < 0 || ret >= size_bp) \ 4041 goto out_overflow; \ 4042 size_bp -= ret; \ 4043 bp += ret; \ 4044 } while (0) 4045 4046 #define CHECK_APPEND_2ARG(a, v1, v2) \ 4047 do { \ 4048 ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ 4049 if (ret < 0 || ret >= size_bp) \ 4050 goto out_overflow; \ 4051 size_bp -= ret; \ 4052 bp += ret; \ 4053 } while (0) 4054 4055 if (flags & BTRFS_BALANCE_ARGS_CONVERT) 4056 CHECK_APPEND_1ARG("convert=%s,", 4057 btrfs_bg_type_to_raid_name(bargs->target)); 4058 4059 if (flags & BTRFS_BALANCE_ARGS_SOFT) 4060 CHECK_APPEND_NOARG("soft,"); 4061 4062 if (flags & BTRFS_BALANCE_ARGS_PROFILES) { 4063 btrfs_describe_block_groups(bargs->profiles, tmp_buf, 4064 sizeof(tmp_buf)); 4065 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); 4066 } 4067 4068 if (flags & BTRFS_BALANCE_ARGS_USAGE) 4069 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); 4070 4071 if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) 4072 CHECK_APPEND_2ARG("usage=%u..%u,", 4073 bargs->usage_min, bargs->usage_max); 4074 4075 if (flags & BTRFS_BALANCE_ARGS_DEVID) 4076 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); 4077 4078 if (flags & BTRFS_BALANCE_ARGS_DRANGE) 4079 CHECK_APPEND_2ARG("drange=%llu..%llu,", 4080 bargs->pstart, bargs->pend); 4081 4082 if (flags & BTRFS_BALANCE_ARGS_VRANGE) 4083 CHECK_APPEND_2ARG("vrange=%llu..%llu,", 4084 bargs->vstart, bargs->vend); 4085 4086 if (flags & BTRFS_BALANCE_ARGS_LIMIT) 4087 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); 4088 4089 if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) 4090 CHECK_APPEND_2ARG("limit=%u..%u,", 4091 bargs->limit_min, bargs->limit_max); 4092 4093 if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) 4094 CHECK_APPEND_2ARG("stripes=%u..%u,", 4095 bargs->stripes_min, bargs->stripes_max); 4096 4097 #undef CHECK_APPEND_2ARG 4098 #undef CHECK_APPEND_1ARG 4099 #undef CHECK_APPEND_NOARG 4100 4101 out_overflow: 4102 4103 if (size_bp < size_buf) 4104 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ 4105 else 4106 buf[0] = '\0'; 4107 } 4108 4109 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) 4110 { 4111 u32 size_buf = 1024; 4112 char tmp_buf[192] = {'\0'}; 4113 char *buf; 4114 char *bp; 4115 u32 size_bp = size_buf; 4116 int ret; 4117 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4118 4119 buf = kzalloc(size_buf, GFP_KERNEL); 4120 if (!buf) 4121 return; 4122 4123 bp = buf; 4124 4125 #define CHECK_APPEND_1ARG(a, v1) \ 4126 do { \ 4127 ret = snprintf(bp, size_bp, (a), (v1)); \ 4128 if (ret < 0 || ret >= size_bp) \ 4129 goto out_overflow; \ 4130 size_bp -= ret; \ 4131 bp += ret; \ 4132 } while (0) 4133 4134 if (bctl->flags & BTRFS_BALANCE_FORCE) 4135 CHECK_APPEND_1ARG("%s", "-f "); 4136 4137 if (bctl->flags & BTRFS_BALANCE_DATA) { 4138 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); 4139 CHECK_APPEND_1ARG("-d%s ", tmp_buf); 4140 } 4141 4142 if (bctl->flags & BTRFS_BALANCE_METADATA) { 4143 describe_balance_args(&bctl->meta, 
tmp_buf, sizeof(tmp_buf));
4144 CHECK_APPEND_1ARG("-m%s ", tmp_buf);
4145 }
4146
4147 if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
4148 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
4149 CHECK_APPEND_1ARG("-s%s ", tmp_buf);
4150 }
4151
4152 #undef CHECK_APPEND_1ARG
4153
4154 out_overflow:
4155
4156 if (size_bp < size_buf)
4157 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
4158 btrfs_info(fs_info, "balance: %s %s",
4159 (bctl->flags & BTRFS_BALANCE_RESUME) ?
4160 "resume" : "start", buf);
4161
4162 kfree(buf);
4163 }
4164
4165 /*
4166 * Should be called with balance mutex held
4167 */
4168 int btrfs_balance(struct btrfs_fs_info *fs_info,
4169 struct btrfs_balance_control *bctl,
4170 struct btrfs_ioctl_balance_args *bargs)
4171 {
4172 u64 meta_target, data_target;
4173 u64 allowed;
4174 int mixed = 0;
4175 int ret;
4176 u64 num_devices;
4177 unsigned seq;
4178 bool reducing_redundancy;
4179 int i;
4180
4181 if (btrfs_fs_closing(fs_info) ||
4182 atomic_read(&fs_info->balance_pause_req) ||
4183 btrfs_should_cancel_balance(fs_info)) {
4184 ret = -EINVAL;
4185 goto out;
4186 }
4187
4188 allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4189 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4190 mixed = 1;
4191
4192 /*
4193 * In case of mixed groups both data and meta should be picked,
4194 * and identical options should be given for both of them.
4195 */
4196 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4197 if (mixed && (bctl->flags & allowed)) {
4198 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4199 !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4200 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
4201 btrfs_err(fs_info,
4202 "balance: mixed groups data and metadata options must be the same");
4203 ret = -EINVAL;
4204 goto out;
4205 }
4206 }
4207
4208 /*
4209 * rw_devices will not change at the moment, device add/delete/replace
4210 * are exclusive
4211 */
4212 num_devices = fs_info->fs_devices->rw_devices;
4213
4214 /*
4215 * SINGLE profile on-disk has no profile bit, but in-memory we have a
4216 * special bit for it, to make it easier to distinguish. Thus we need
4217 * to set it manually, or balance would refuse the profile.
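 *
 * For example, with only two rw devices the loop below allows raid1
 * (devs_min == 2) but not raid1c3 (devs_min == 3), so a convert to
 * raid1c3 would be rejected by validate_convert_profile().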
4218 */ 4219 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 4220 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) 4221 if (num_devices >= btrfs_raid_array[i].devs_min) 4222 allowed |= btrfs_raid_array[i].bg_flag; 4223 4224 if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || 4225 !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || 4226 !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { 4227 ret = -EINVAL; 4228 goto out; 4229 } 4230 4231 /* 4232 * Allow to reduce metadata or system integrity only if force set for 4233 * profiles with redundancy (copies, parity) 4234 */ 4235 allowed = 0; 4236 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { 4237 if (btrfs_raid_array[i].ncopies >= 2 || 4238 btrfs_raid_array[i].tolerated_failures >= 1) 4239 allowed |= btrfs_raid_array[i].bg_flag; 4240 } 4241 do { 4242 seq = read_seqbegin(&fs_info->profiles_lock); 4243 4244 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4245 (fs_info->avail_system_alloc_bits & allowed) && 4246 !(bctl->sys.target & allowed)) || 4247 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4248 (fs_info->avail_metadata_alloc_bits & allowed) && 4249 !(bctl->meta.target & allowed))) 4250 reducing_redundancy = true; 4251 else 4252 reducing_redundancy = false; 4253 4254 /* if we're not converting, the target field is uninitialized */ 4255 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4256 bctl->meta.target : fs_info->avail_metadata_alloc_bits; 4257 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4258 bctl->data.target : fs_info->avail_data_alloc_bits; 4259 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4260 4261 if (reducing_redundancy) { 4262 if (bctl->flags & BTRFS_BALANCE_FORCE) { 4263 btrfs_info(fs_info, 4264 "balance: force reducing metadata redundancy"); 4265 } else { 4266 btrfs_err(fs_info, 4267 "balance: reduces metadata redundancy, use --force if you want this"); 4268 ret = -EINVAL; 4269 goto out; 4270 } 4271 } 4272 4273 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 4274 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 4275 btrfs_warn(fs_info, 4276 "balance: metadata profile %s has lower redundancy than data profile %s", 4277 btrfs_bg_type_to_raid_name(meta_target), 4278 btrfs_bg_type_to_raid_name(data_target)); 4279 } 4280 4281 ret = insert_balance_item(fs_info, bctl); 4282 if (ret && ret != -EEXIST) 4283 goto out; 4284 4285 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 4286 BUG_ON(ret == -EEXIST); 4287 BUG_ON(fs_info->balance_ctl); 4288 spin_lock(&fs_info->balance_lock); 4289 fs_info->balance_ctl = bctl; 4290 spin_unlock(&fs_info->balance_lock); 4291 } else { 4292 BUG_ON(ret != -EEXIST); 4293 spin_lock(&fs_info->balance_lock); 4294 update_balance_args(bctl); 4295 spin_unlock(&fs_info->balance_lock); 4296 } 4297 4298 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4299 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4300 describe_balance_start_or_resume(fs_info); 4301 mutex_unlock(&fs_info->balance_mutex); 4302 4303 ret = __btrfs_balance(fs_info); 4304 4305 mutex_lock(&fs_info->balance_mutex); 4306 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) 4307 btrfs_info(fs_info, "balance: paused"); 4308 /* 4309 * Balance can be canceled by: 4310 * 4311 * - Regular cancel request 4312 * Then ret == -ECANCELED and balance_cancel_req > 0 4313 * 4314 * - Fatal signal to "btrfs" process 4315 * Either the signal caught by wait_reserve_ticket() and callers 4316 * got 
-EINTR, or caught by btrfs_should_cancel_balance() and 4317 * got -ECANCELED. 4318 * Either way, in this case balance_cancel_req = 0, and 4319 * ret == -EINTR or ret == -ECANCELED. 4320 * 4321 * So here we only check the return value to catch canceled balance. 4322 */ 4323 else if (ret == -ECANCELED || ret == -EINTR) 4324 btrfs_info(fs_info, "balance: canceled"); 4325 else 4326 btrfs_info(fs_info, "balance: ended with status: %d", ret); 4327 4328 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4329 4330 if (bargs) { 4331 memset(bargs, 0, sizeof(*bargs)); 4332 btrfs_update_ioctl_balance_args(fs_info, bargs); 4333 } 4334 4335 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 4336 balance_need_close(fs_info)) { 4337 reset_balance_state(fs_info); 4338 btrfs_exclop_finish(fs_info); 4339 } 4340 4341 wake_up(&fs_info->balance_wait_q); 4342 4343 return ret; 4344 out: 4345 if (bctl->flags & BTRFS_BALANCE_RESUME) 4346 reset_balance_state(fs_info); 4347 else 4348 kfree(bctl); 4349 btrfs_exclop_finish(fs_info); 4350 4351 return ret; 4352 } 4353 4354 static int balance_kthread(void *data) 4355 { 4356 struct btrfs_fs_info *fs_info = data; 4357 int ret = 0; 4358 4359 mutex_lock(&fs_info->balance_mutex); 4360 if (fs_info->balance_ctl) 4361 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); 4362 mutex_unlock(&fs_info->balance_mutex); 4363 4364 return ret; 4365 } 4366 4367 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 4368 { 4369 struct task_struct *tsk; 4370 4371 mutex_lock(&fs_info->balance_mutex); 4372 if (!fs_info->balance_ctl) { 4373 mutex_unlock(&fs_info->balance_mutex); 4374 return 0; 4375 } 4376 mutex_unlock(&fs_info->balance_mutex); 4377 4378 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 4379 btrfs_info(fs_info, "balance: resume skipped"); 4380 return 0; 4381 } 4382 4383 /* 4384 * A ro->rw remount sequence should continue with the paused balance 4385 * regardless of who pauses it, system or the user as of now, so set 4386 * the resume flag. 
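 *
 * Note that kthread_run() returns an ERR_PTR() on failure, hence the
 * PTR_ERR_OR_ZERO() below instead of a plain NULL check.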
4387 */ 4388 spin_lock(&fs_info->balance_lock); 4389 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; 4390 spin_unlock(&fs_info->balance_lock); 4391 4392 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 4393 return PTR_ERR_OR_ZERO(tsk); 4394 } 4395 4396 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 4397 { 4398 struct btrfs_balance_control *bctl; 4399 struct btrfs_balance_item *item; 4400 struct btrfs_disk_balance_args disk_bargs; 4401 struct btrfs_path *path; 4402 struct extent_buffer *leaf; 4403 struct btrfs_key key; 4404 int ret; 4405 4406 path = btrfs_alloc_path(); 4407 if (!path) 4408 return -ENOMEM; 4409 4410 key.objectid = BTRFS_BALANCE_OBJECTID; 4411 key.type = BTRFS_TEMPORARY_ITEM_KEY; 4412 key.offset = 0; 4413 4414 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4415 if (ret < 0) 4416 goto out; 4417 if (ret > 0) { /* ret = -ENOENT; */ 4418 ret = 0; 4419 goto out; 4420 } 4421 4422 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 4423 if (!bctl) { 4424 ret = -ENOMEM; 4425 goto out; 4426 } 4427 4428 leaf = path->nodes[0]; 4429 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 4430 4431 bctl->flags = btrfs_balance_flags(leaf, item); 4432 bctl->flags |= BTRFS_BALANCE_RESUME; 4433 4434 btrfs_balance_data(leaf, item, &disk_bargs); 4435 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 4436 btrfs_balance_meta(leaf, item, &disk_bargs); 4437 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 4438 btrfs_balance_sys(leaf, item, &disk_bargs); 4439 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 4440 4441 /* 4442 * This should never happen, as the paused balance state is recovered 4443 * during mount without any chance of other exclusive ops to collide. 4444 * 4445 * This gives the exclusive op status to balance and keeps in paused 4446 * state until user intervention (cancel or umount). If the ownership 4447 * cannot be assigned, show a message but do not fail. The balance 4448 * is in a paused state and must have fs_info::balance_ctl properly 4449 * set up. 
4450 */ 4451 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) 4452 btrfs_warn(fs_info, 4453 "balance: cannot set exclusive op status, resume manually"); 4454 4455 btrfs_release_path(path); 4456 4457 mutex_lock(&fs_info->balance_mutex); 4458 BUG_ON(fs_info->balance_ctl); 4459 spin_lock(&fs_info->balance_lock); 4460 fs_info->balance_ctl = bctl; 4461 spin_unlock(&fs_info->balance_lock); 4462 mutex_unlock(&fs_info->balance_mutex); 4463 out: 4464 btrfs_free_path(path); 4465 return ret; 4466 } 4467 4468 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4469 { 4470 int ret = 0; 4471 4472 mutex_lock(&fs_info->balance_mutex); 4473 if (!fs_info->balance_ctl) { 4474 mutex_unlock(&fs_info->balance_mutex); 4475 return -ENOTCONN; 4476 } 4477 4478 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4479 atomic_inc(&fs_info->balance_pause_req); 4480 mutex_unlock(&fs_info->balance_mutex); 4481 4482 wait_event(fs_info->balance_wait_q, 4483 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4484 4485 mutex_lock(&fs_info->balance_mutex); 4486 /* we are good with balance_ctl ripped off from under us */ 4487 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4488 atomic_dec(&fs_info->balance_pause_req); 4489 } else { 4490 ret = -ENOTCONN; 4491 } 4492 4493 mutex_unlock(&fs_info->balance_mutex); 4494 return ret; 4495 } 4496 4497 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4498 { 4499 mutex_lock(&fs_info->balance_mutex); 4500 if (!fs_info->balance_ctl) { 4501 mutex_unlock(&fs_info->balance_mutex); 4502 return -ENOTCONN; 4503 } 4504 4505 /* 4506 * A paused balance with the item stored on disk can be resumed at 4507 * mount time if the mount is read-write. Otherwise it's still paused 4508 * and we must not allow cancelling as it deletes the item. 4509 */ 4510 if (sb_rdonly(fs_info->sb)) { 4511 mutex_unlock(&fs_info->balance_mutex); 4512 return -EROFS; 4513 } 4514 4515 atomic_inc(&fs_info->balance_cancel_req); 4516 /* 4517 * if we are running just wait and return, balance item is 4518 * deleted in btrfs_balance in this case 4519 */ 4520 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4521 mutex_unlock(&fs_info->balance_mutex); 4522 wait_event(fs_info->balance_wait_q, 4523 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4524 mutex_lock(&fs_info->balance_mutex); 4525 } else { 4526 mutex_unlock(&fs_info->balance_mutex); 4527 /* 4528 * Lock released to allow other waiters to continue, we'll 4529 * reexamine the status again. 
4530 */ 4531 mutex_lock(&fs_info->balance_mutex); 4532 4533 if (fs_info->balance_ctl) { 4534 reset_balance_state(fs_info); 4535 btrfs_exclop_finish(fs_info); 4536 btrfs_info(fs_info, "balance: canceled"); 4537 } 4538 } 4539 4540 BUG_ON(fs_info->balance_ctl || 4541 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4542 atomic_dec(&fs_info->balance_cancel_req); 4543 mutex_unlock(&fs_info->balance_mutex); 4544 return 0; 4545 } 4546 4547 int btrfs_uuid_scan_kthread(void *data) 4548 { 4549 struct btrfs_fs_info *fs_info = data; 4550 struct btrfs_root *root = fs_info->tree_root; 4551 struct btrfs_key key; 4552 struct btrfs_path *path = NULL; 4553 int ret = 0; 4554 struct extent_buffer *eb; 4555 int slot; 4556 struct btrfs_root_item root_item; 4557 u32 item_size; 4558 struct btrfs_trans_handle *trans = NULL; 4559 bool closing = false; 4560 4561 path = btrfs_alloc_path(); 4562 if (!path) { 4563 ret = -ENOMEM; 4564 goto out; 4565 } 4566 4567 key.objectid = 0; 4568 key.type = BTRFS_ROOT_ITEM_KEY; 4569 key.offset = 0; 4570 4571 while (1) { 4572 if (btrfs_fs_closing(fs_info)) { 4573 closing = true; 4574 break; 4575 } 4576 ret = btrfs_search_forward(root, &key, path, 4577 BTRFS_OLDEST_GENERATION); 4578 if (ret) { 4579 if (ret > 0) 4580 ret = 0; 4581 break; 4582 } 4583 4584 if (key.type != BTRFS_ROOT_ITEM_KEY || 4585 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4586 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4587 key.objectid > BTRFS_LAST_FREE_OBJECTID) 4588 goto skip; 4589 4590 eb = path->nodes[0]; 4591 slot = path->slots[0]; 4592 item_size = btrfs_item_size_nr(eb, slot); 4593 if (item_size < sizeof(root_item)) 4594 goto skip; 4595 4596 read_extent_buffer(eb, &root_item, 4597 btrfs_item_ptr_offset(eb, slot), 4598 (int)sizeof(root_item)); 4599 if (btrfs_root_refs(&root_item) == 0) 4600 goto skip; 4601 4602 if (!btrfs_is_empty_uuid(root_item.uuid) || 4603 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4604 if (trans) 4605 goto update_tree; 4606 4607 btrfs_release_path(path); 4608 /* 4609 * 1 - subvol uuid item 4610 * 1 - received_subvol uuid item 4611 */ 4612 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4613 if (IS_ERR(trans)) { 4614 ret = PTR_ERR(trans); 4615 break; 4616 } 4617 continue; 4618 } else { 4619 goto skip; 4620 } 4621 update_tree: 4622 btrfs_release_path(path); 4623 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4624 ret = btrfs_uuid_tree_add(trans, root_item.uuid, 4625 BTRFS_UUID_KEY_SUBVOL, 4626 key.objectid); 4627 if (ret < 0) { 4628 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4629 ret); 4630 break; 4631 } 4632 } 4633 4634 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4635 ret = btrfs_uuid_tree_add(trans, 4636 root_item.received_uuid, 4637 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4638 key.objectid); 4639 if (ret < 0) { 4640 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4641 ret); 4642 break; 4643 } 4644 } 4645 4646 skip: 4647 btrfs_release_path(path); 4648 if (trans) { 4649 ret = btrfs_end_transaction(trans); 4650 trans = NULL; 4651 if (ret) 4652 break; 4653 } 4654 4655 if (key.offset < (u64)-1) { 4656 key.offset++; 4657 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4658 key.offset = 0; 4659 key.type = BTRFS_ROOT_ITEM_KEY; 4660 } else if (key.objectid < (u64)-1) { 4661 key.offset = 0; 4662 key.type = BTRFS_ROOT_ITEM_KEY; 4663 key.objectid++; 4664 } else { 4665 break; 4666 } 4667 cond_resched(); 4668 } 4669 4670 out: 4671 btrfs_free_path(path); 4672 if (trans && !IS_ERR(trans)) 4673 btrfs_end_transaction(trans); 4674 if (ret) 4675 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread 
failed %d", ret);
4676 else if (!closing)
4677 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4678 up(&fs_info->uuid_tree_rescan_sem);
4679 return 0;
4680 }
4681
4682 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4683 {
4684 struct btrfs_trans_handle *trans;
4685 struct btrfs_root *tree_root = fs_info->tree_root;
4686 struct btrfs_root *uuid_root;
4687 struct task_struct *task;
4688 int ret;
4689
4690 /*
4691 * 1 - root node
4692 * 1 - root item
4693 */
4694 trans = btrfs_start_transaction(tree_root, 2);
4695 if (IS_ERR(trans))
4696 return PTR_ERR(trans);
4697
4698 uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4699 if (IS_ERR(uuid_root)) {
4700 ret = PTR_ERR(uuid_root);
4701 btrfs_abort_transaction(trans, ret);
4702 btrfs_end_transaction(trans);
4703 return ret;
4704 }
4705
4706 fs_info->uuid_root = uuid_root;
4707
4708 ret = btrfs_commit_transaction(trans);
4709 if (ret)
4710 return ret;
4711
4712 down(&fs_info->uuid_tree_rescan_sem);
4713 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4714 if (IS_ERR(task)) {
4715 /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
4716 btrfs_warn(fs_info, "failed to start uuid_scan task");
4717 up(&fs_info->uuid_tree_rescan_sem);
4718 return PTR_ERR(task);
4719 }
4720
4721 return 0;
4722 }
4723
4724 /*
4725 * Shrinking a device means finding all of the device extents past
4726 * the new size, and then following the back refs to the chunks.
4727 * The chunk relocation code actually frees the device extents.
4728 */
4729 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4730 {
4731 struct btrfs_fs_info *fs_info = device->fs_info;
4732 struct btrfs_root *root = fs_info->dev_root;
4733 struct btrfs_trans_handle *trans;
4734 struct btrfs_dev_extent *dev_extent = NULL;
4735 struct btrfs_path *path;
4736 u64 length;
4737 u64 chunk_offset;
4738 int ret;
4739 int slot;
4740 int failed = 0;
4741 bool retried = false;
4742 struct extent_buffer *l;
4743 struct btrfs_key key;
4744 struct btrfs_super_block *super_copy = fs_info->super_copy;
4745 u64 old_total = btrfs_super_total_bytes(super_copy);
4746 u64 old_size = btrfs_device_get_total_bytes(device);
4747 u64 diff;
4748 u64 start;
4749
4750 new_size = round_down(new_size, fs_info->sectorsize);
4751 start = new_size;
4752 diff = round_down(old_size - new_size, fs_info->sectorsize);
4753
4754 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4755 return -EINVAL;
4756
4757 path = btrfs_alloc_path();
4758 if (!path)
4759 return -ENOMEM;
4760
4761 path->reada = READA_BACK;
4762
4763 trans = btrfs_start_transaction(root, 0);
4764 if (IS_ERR(trans)) {
4765 btrfs_free_path(path);
4766 return PTR_ERR(trans);
4767 }
4768
4769 mutex_lock(&fs_info->chunk_mutex);
4770
4771 btrfs_device_set_total_bytes(device, new_size);
4772 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4773 device->fs_devices->total_rw_bytes -= diff;
4774 atomic64_sub(diff, &fs_info->free_chunk_space);
4775 }
4776
4777 /*
4778 * Once the device's size has been set to the new size, ensure all
4779 * in-memory chunks are synced to disk so that the loop below sees them
4780 * and relocates them accordingly.
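 *
 * This is why a pending extent beyond the new size triggers a full
 * transaction commit below, while the other branch only ends the
 * transaction.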
4781 */ 4782 if (contains_pending_extent(device, &start, diff)) { 4783 mutex_unlock(&fs_info->chunk_mutex); 4784 ret = btrfs_commit_transaction(trans); 4785 if (ret) 4786 goto done; 4787 } else { 4788 mutex_unlock(&fs_info->chunk_mutex); 4789 btrfs_end_transaction(trans); 4790 } 4791 4792 again: 4793 key.objectid = device->devid; 4794 key.offset = (u64)-1; 4795 key.type = BTRFS_DEV_EXTENT_KEY; 4796 4797 do { 4798 mutex_lock(&fs_info->reclaim_bgs_lock); 4799 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4800 if (ret < 0) { 4801 mutex_unlock(&fs_info->reclaim_bgs_lock); 4802 goto done; 4803 } 4804 4805 ret = btrfs_previous_item(root, path, 0, key.type); 4806 if (ret) { 4807 mutex_unlock(&fs_info->reclaim_bgs_lock); 4808 if (ret < 0) 4809 goto done; 4810 ret = 0; 4811 btrfs_release_path(path); 4812 break; 4813 } 4814 4815 l = path->nodes[0]; 4816 slot = path->slots[0]; 4817 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4818 4819 if (key.objectid != device->devid) { 4820 mutex_unlock(&fs_info->reclaim_bgs_lock); 4821 btrfs_release_path(path); 4822 break; 4823 } 4824 4825 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4826 length = btrfs_dev_extent_length(l, dev_extent); 4827 4828 if (key.offset + length <= new_size) { 4829 mutex_unlock(&fs_info->reclaim_bgs_lock); 4830 btrfs_release_path(path); 4831 break; 4832 } 4833 4834 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4835 btrfs_release_path(path); 4836 4837 /* 4838 * We may be relocating the only data chunk we have, 4839 * which could potentially end up with losing data's 4840 * raid profile, so lets allocate an empty one in 4841 * advance. 4842 */ 4843 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 4844 if (ret < 0) { 4845 mutex_unlock(&fs_info->reclaim_bgs_lock); 4846 goto done; 4847 } 4848 4849 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 4850 mutex_unlock(&fs_info->reclaim_bgs_lock); 4851 if (ret == -ENOSPC) { 4852 failed++; 4853 } else if (ret) { 4854 if (ret == -ETXTBSY) { 4855 btrfs_warn(fs_info, 4856 "could not shrink block group %llu due to active swapfile", 4857 chunk_offset); 4858 } 4859 goto done; 4860 } 4861 } while (key.offset-- > 0); 4862 4863 if (failed && !retried) { 4864 failed = 0; 4865 retried = true; 4866 goto again; 4867 } else if (failed && retried) { 4868 ret = -ENOSPC; 4869 goto done; 4870 } 4871 4872 /* Shrinking succeeded, else we would be at "done". */ 4873 trans = btrfs_start_transaction(root, 0); 4874 if (IS_ERR(trans)) { 4875 ret = PTR_ERR(trans); 4876 goto done; 4877 } 4878 4879 mutex_lock(&fs_info->chunk_mutex); 4880 /* Clear all state bits beyond the shrunk device size */ 4881 clear_extent_bits(&device->alloc_state, new_size, (u64)-1, 4882 CHUNK_STATE_MASK); 4883 4884 btrfs_device_set_disk_total_bytes(device, new_size); 4885 if (list_empty(&device->post_commit_list)) 4886 list_add_tail(&device->post_commit_list, 4887 &trans->transaction->dev_update_list); 4888 4889 WARN_ON(diff > old_total); 4890 btrfs_set_super_total_bytes(super_copy, 4891 round_down(old_total - diff, fs_info->sectorsize)); 4892 mutex_unlock(&fs_info->chunk_mutex); 4893 4894 /* Now btrfs_update_device() will change the on-disk size. 
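 * The in-memory super block copy was already shrunk under chunk_mutex
 * above, so the new total size and the device item update should land
 * in the same transaction commit below.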
*/ 4895 ret = btrfs_update_device(trans, device); 4896 if (ret < 0) { 4897 btrfs_abort_transaction(trans, ret); 4898 btrfs_end_transaction(trans); 4899 } else { 4900 ret = btrfs_commit_transaction(trans); 4901 } 4902 done: 4903 btrfs_free_path(path); 4904 if (ret) { 4905 mutex_lock(&fs_info->chunk_mutex); 4906 btrfs_device_set_total_bytes(device, old_size); 4907 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 4908 device->fs_devices->total_rw_bytes += diff; 4909 atomic64_add(diff, &fs_info->free_chunk_space); 4910 mutex_unlock(&fs_info->chunk_mutex); 4911 } 4912 return ret; 4913 } 4914 4915 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 4916 struct btrfs_key *key, 4917 struct btrfs_chunk *chunk, int item_size) 4918 { 4919 struct btrfs_super_block *super_copy = fs_info->super_copy; 4920 struct btrfs_disk_key disk_key; 4921 u32 array_size; 4922 u8 *ptr; 4923 4924 lockdep_assert_held(&fs_info->chunk_mutex); 4925 4926 array_size = btrfs_super_sys_array_size(super_copy); 4927 if (array_size + item_size + sizeof(disk_key) 4928 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 4929 return -EFBIG; 4930 4931 ptr = super_copy->sys_chunk_array + array_size; 4932 btrfs_cpu_key_to_disk(&disk_key, key); 4933 memcpy(ptr, &disk_key, sizeof(disk_key)); 4934 ptr += sizeof(disk_key); 4935 memcpy(ptr, chunk, item_size); 4936 item_size += sizeof(disk_key); 4937 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 4938 4939 return 0; 4940 } 4941 4942 /* 4943 * sort the devices in descending order by max_avail, total_avail 4944 */ 4945 static int btrfs_cmp_device_info(const void *a, const void *b) 4946 { 4947 const struct btrfs_device_info *di_a = a; 4948 const struct btrfs_device_info *di_b = b; 4949 4950 if (di_a->max_avail > di_b->max_avail) 4951 return -1; 4952 if (di_a->max_avail < di_b->max_avail) 4953 return 1; 4954 if (di_a->total_avail > di_b->total_avail) 4955 return -1; 4956 if (di_a->total_avail < di_b->total_avail) 4957 return 1; 4958 return 0; 4959 } 4960 4961 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 4962 { 4963 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 4964 return; 4965 4966 btrfs_set_fs_incompat(info, RAID56); 4967 } 4968 4969 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) 4970 { 4971 if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) 4972 return; 4973 4974 btrfs_set_fs_incompat(info, RAID1C34); 4975 } 4976 4977 /* 4978 * Structure used internally for btrfs_create_chunk() function. 4979 * Wraps needed parameters. 
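 *
 * Example values (derived from btrfs_raid_array): a raid10 chunk over
 * four devices has sub_stripes = 2, dev_stripes = 1, devs_increment = 2
 * and ncopies = 2, which yields num_stripes = ndevs * dev_stripes = 4.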
4980 */ 4981 struct alloc_chunk_ctl { 4982 u64 start; 4983 u64 type; 4984 /* Total number of stripes to allocate */ 4985 int num_stripes; 4986 /* sub_stripes info for map */ 4987 int sub_stripes; 4988 /* Stripes per device */ 4989 int dev_stripes; 4990 /* Maximum number of devices to use */ 4991 int devs_max; 4992 /* Minimum number of devices to use */ 4993 int devs_min; 4994 /* ndevs has to be a multiple of this */ 4995 int devs_increment; 4996 /* Number of copies */ 4997 int ncopies; 4998 /* Number of stripes worth of bytes to store parity information */ 4999 int nparity; 5000 u64 max_stripe_size; 5001 u64 max_chunk_size; 5002 u64 dev_extent_min; 5003 u64 stripe_size; 5004 u64 chunk_size; 5005 int ndevs; 5006 }; 5007 5008 static void init_alloc_chunk_ctl_policy_regular( 5009 struct btrfs_fs_devices *fs_devices, 5010 struct alloc_chunk_ctl *ctl) 5011 { 5012 u64 type = ctl->type; 5013 5014 if (type & BTRFS_BLOCK_GROUP_DATA) { 5015 ctl->max_stripe_size = SZ_1G; 5016 ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; 5017 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 5018 /* For larger filesystems, use larger metadata chunks */ 5019 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) 5020 ctl->max_stripe_size = SZ_1G; 5021 else 5022 ctl->max_stripe_size = SZ_256M; 5023 ctl->max_chunk_size = ctl->max_stripe_size; 5024 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 5025 ctl->max_stripe_size = SZ_32M; 5026 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 5027 ctl->devs_max = min_t(int, ctl->devs_max, 5028 BTRFS_MAX_DEVS_SYS_CHUNK); 5029 } else { 5030 BUG(); 5031 } 5032 5033 /* We don't want a chunk larger than 10% of writable space */ 5034 ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 5035 ctl->max_chunk_size); 5036 ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; 5037 } 5038 5039 static void init_alloc_chunk_ctl_policy_zoned( 5040 struct btrfs_fs_devices *fs_devices, 5041 struct alloc_chunk_ctl *ctl) 5042 { 5043 u64 zone_size = fs_devices->fs_info->zone_size; 5044 u64 limit; 5045 int min_num_stripes = ctl->devs_min * ctl->dev_stripes; 5046 int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; 5047 u64 min_chunk_size = min_data_stripes * zone_size; 5048 u64 type = ctl->type; 5049 5050 ctl->max_stripe_size = zone_size; 5051 if (type & BTRFS_BLOCK_GROUP_DATA) { 5052 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, 5053 zone_size); 5054 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 5055 ctl->max_chunk_size = ctl->max_stripe_size; 5056 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 5057 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 5058 ctl->devs_max = min_t(int, ctl->devs_max, 5059 BTRFS_MAX_DEVS_SYS_CHUNK); 5060 } else { 5061 BUG(); 5062 } 5063 5064 /* We don't want a chunk larger than 10% of writable space */ 5065 limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1), 5066 zone_size), 5067 min_chunk_size); 5068 ctl->max_chunk_size = min(limit, ctl->max_chunk_size); 5069 ctl->dev_extent_min = zone_size * ctl->dev_stripes; 5070 } 5071 5072 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, 5073 struct alloc_chunk_ctl *ctl) 5074 { 5075 int index = btrfs_bg_flags_to_raid_index(ctl->type); 5076 5077 ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; 5078 ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; 5079 ctl->devs_max = btrfs_raid_array[index].devs_max; 5080 if (!ctl->devs_max) 5081 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); 5082 ctl->devs_min = btrfs_raid_array[index].devs_min; 5083 
ctl->devs_increment = btrfs_raid_array[index].devs_increment; 5084 ctl->ncopies = btrfs_raid_array[index].ncopies; 5085 ctl->nparity = btrfs_raid_array[index].nparity; 5086 ctl->ndevs = 0; 5087 5088 switch (fs_devices->chunk_alloc_policy) { 5089 case BTRFS_CHUNK_ALLOC_REGULAR: 5090 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); 5091 break; 5092 case BTRFS_CHUNK_ALLOC_ZONED: 5093 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); 5094 break; 5095 default: 5096 BUG(); 5097 } 5098 } 5099 5100 static int gather_device_info(struct btrfs_fs_devices *fs_devices, 5101 struct alloc_chunk_ctl *ctl, 5102 struct btrfs_device_info *devices_info) 5103 { 5104 struct btrfs_fs_info *info = fs_devices->fs_info; 5105 struct btrfs_device *device; 5106 u64 total_avail; 5107 u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; 5108 int ret; 5109 int ndevs = 0; 5110 u64 max_avail; 5111 u64 dev_offset; 5112 5113 /* 5114 * in the first pass through the devices list, we gather information 5115 * about the available holes on each device. 5116 */ 5117 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 5118 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 5119 WARN(1, KERN_ERR 5120 "BTRFS: read-only device in alloc_list\n"); 5121 continue; 5122 } 5123 5124 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 5125 &device->dev_state) || 5126 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 5127 continue; 5128 5129 if (device->total_bytes > device->bytes_used) 5130 total_avail = device->total_bytes - device->bytes_used; 5131 else 5132 total_avail = 0; 5133 5134 /* If there is no space on this device, skip it. */ 5135 if (total_avail < ctl->dev_extent_min) 5136 continue; 5137 5138 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, 5139 &max_avail); 5140 if (ret && ret != -ENOSPC) 5141 return ret; 5142 5143 if (ret == 0) 5144 max_avail = dev_extent_want; 5145 5146 if (max_avail < ctl->dev_extent_min) { 5147 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5148 btrfs_debug(info, 5149 "%s: devid %llu has no free space, have=%llu want=%llu", 5150 __func__, device->devid, max_avail, 5151 ctl->dev_extent_min); 5152 continue; 5153 } 5154 5155 if (ndevs == fs_devices->rw_devices) { 5156 WARN(1, "%s: found more than %llu devices\n", 5157 __func__, fs_devices->rw_devices); 5158 break; 5159 } 5160 devices_info[ndevs].dev_offset = dev_offset; 5161 devices_info[ndevs].max_avail = max_avail; 5162 devices_info[ndevs].total_avail = total_avail; 5163 devices_info[ndevs].dev = device; 5164 ++ndevs; 5165 } 5166 ctl->ndevs = ndevs; 5167 5168 /* 5169 * now sort the devices by hole size / available space 5170 */ 5171 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 5172 btrfs_cmp_device_info, NULL); 5173 5174 return 0; 5175 } 5176 5177 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, 5178 struct btrfs_device_info *devices_info) 5179 { 5180 /* Number of stripes that count for block group size */ 5181 int data_stripes; 5182 5183 /* 5184 * The primary goal is to maximize the number of stripes, so use as 5185 * many devices as possible, even if the stripes are not maximum sized. 5186 * 5187 * The DUP profile stores more than one stripe per device, the 5188 * max_avail is the total size so we have to adjust. 
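 *
 * Illustrative numbers: for DUP (dev_stripes == 2), if the smallest
 * selected device has a 2GiB hole, the initial stripe_size below is
 * 1GiB, before the max_chunk_size clamp is applied.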
5189 */
5190 ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
5191 ctl->dev_stripes);
5192 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5193
5194 /* This will have to be fixed for RAID1 and RAID10 over more drives */
5195 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5196
5197 /*
5198 * Use the number of data stripes to figure out how big this chunk is
5199 * really going to be in terms of logical address space, and compare
5200 * that answer with the max chunk size. If it's higher, we try to
5201 * reduce stripe_size.
5202 */
5203 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5204 /*
5205 * Reduce stripe_size, round it up to a 16MB boundary again and
5206 * then use it, unless it ends up being even bigger than the
5207 * previous value we had already.
5208 */
5209 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5210 data_stripes), SZ_16M),
5211 ctl->stripe_size);
5212 }
5213
5214 /* Align to BTRFS_STRIPE_LEN */
5215 ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
5216 ctl->chunk_size = ctl->stripe_size * data_stripes;
5217
5218 return 0;
5219 }
5220
5221 static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
5222 struct btrfs_device_info *devices_info)
5223 {
5224 u64 zone_size = devices_info[0].dev->zone_info->zone_size;
5225 /* Number of stripes that count for block group size */
5226 int data_stripes;
5227
5228 /*
5229 * It should hold because:
5230 * dev_extent_min == dev_extent_want == zone_size * dev_stripes
5231 */
5232 ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
5233
5234 ctl->stripe_size = zone_size;
5235 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5236 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5237
5238 /* stripe_size is fixed in a zoned filesystem. Reduce ndevs instead. */
5239 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5240 ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
5241 ctl->stripe_size) + ctl->nparity,
5242 ctl->dev_stripes);
5243 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5244 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5245 ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
5246 }
5247
5248 ctl->chunk_size = ctl->stripe_size * data_stripes;
5249
5250 return 0;
5251 }
5252
5253 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5254 struct alloc_chunk_ctl *ctl,
5255 struct btrfs_device_info *devices_info)
5256 {
5257 struct btrfs_fs_info *info = fs_devices->fs_info;
5258
5259 /*
5260 * Round down to number of usable stripes, devs_increment can be any
5261 * number so we can't use round_down() that requires power of 2, while
5262 * rounddown is safe.
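 *
 * For example, raid10 has devs_increment == 2, so five usable devices
 * get rounded down to four here.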
5263 */ 5264 ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); 5265 5266 if (ctl->ndevs < ctl->devs_min) { 5267 if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 5268 btrfs_debug(info, 5269 "%s: not enough devices with free space: have=%d minimum required=%d", 5270 __func__, ctl->ndevs, ctl->devs_min); 5271 } 5272 return -ENOSPC; 5273 } 5274 5275 ctl->ndevs = min(ctl->ndevs, ctl->devs_max); 5276 5277 switch (fs_devices->chunk_alloc_policy) { 5278 case BTRFS_CHUNK_ALLOC_REGULAR: 5279 return decide_stripe_size_regular(ctl, devices_info); 5280 case BTRFS_CHUNK_ALLOC_ZONED: 5281 return decide_stripe_size_zoned(ctl, devices_info); 5282 default: 5283 BUG(); 5284 } 5285 } 5286 5287 static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, 5288 struct alloc_chunk_ctl *ctl, 5289 struct btrfs_device_info *devices_info) 5290 { 5291 struct btrfs_fs_info *info = trans->fs_info; 5292 struct map_lookup *map = NULL; 5293 struct extent_map_tree *em_tree; 5294 struct btrfs_block_group *block_group; 5295 struct extent_map *em; 5296 u64 start = ctl->start; 5297 u64 type = ctl->type; 5298 int ret; 5299 int i; 5300 int j; 5301 5302 map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); 5303 if (!map) 5304 return ERR_PTR(-ENOMEM); 5305 map->num_stripes = ctl->num_stripes; 5306 5307 for (i = 0; i < ctl->ndevs; ++i) { 5308 for (j = 0; j < ctl->dev_stripes; ++j) { 5309 int s = i * ctl->dev_stripes + j; 5310 map->stripes[s].dev = devices_info[i].dev; 5311 map->stripes[s].physical = devices_info[i].dev_offset + 5312 j * ctl->stripe_size; 5313 } 5314 } 5315 map->stripe_len = BTRFS_STRIPE_LEN; 5316 map->io_align = BTRFS_STRIPE_LEN; 5317 map->io_width = BTRFS_STRIPE_LEN; 5318 map->type = type; 5319 map->sub_stripes = ctl->sub_stripes; 5320 5321 trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); 5322 5323 em = alloc_extent_map(); 5324 if (!em) { 5325 kfree(map); 5326 return ERR_PTR(-ENOMEM); 5327 } 5328 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 5329 em->map_lookup = map; 5330 em->start = start; 5331 em->len = ctl->chunk_size; 5332 em->block_start = 0; 5333 em->block_len = em->len; 5334 em->orig_block_len = ctl->stripe_size; 5335 5336 em_tree = &info->mapping_tree; 5337 write_lock(&em_tree->lock); 5338 ret = add_extent_mapping(em_tree, em, 0); 5339 if (ret) { 5340 write_unlock(&em_tree->lock); 5341 free_extent_map(em); 5342 return ERR_PTR(ret); 5343 } 5344 write_unlock(&em_tree->lock); 5345 5346 block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); 5347 if (IS_ERR(block_group)) 5348 goto error_del_extent; 5349 5350 for (i = 0; i < map->num_stripes; i++) { 5351 struct btrfs_device *dev = map->stripes[i].dev; 5352 5353 btrfs_device_set_bytes_used(dev, 5354 dev->bytes_used + ctl->stripe_size); 5355 if (list_empty(&dev->post_commit_list)) 5356 list_add_tail(&dev->post_commit_list, 5357 &trans->transaction->dev_update_list); 5358 } 5359 5360 atomic64_sub(ctl->stripe_size * map->num_stripes, 5361 &info->free_chunk_space); 5362 5363 free_extent_map(em); 5364 check_raid56_incompat_flag(info, type); 5365 check_raid1c34_incompat_flag(info, type); 5366 5367 return block_group; 5368 5369 error_del_extent: 5370 write_lock(&em_tree->lock); 5371 remove_extent_mapping(em_tree, em); 5372 write_unlock(&em_tree->lock); 5373 5374 /* One for our allocation */ 5375 free_extent_map(em); 5376 /* One for the tree reference */ 5377 free_extent_map(em); 5378 5379 return block_group; 5380 } 5381 5382 struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, 5383 u64 
type)
5384 {
5385 struct btrfs_fs_info *info = trans->fs_info;
5386 struct btrfs_fs_devices *fs_devices = info->fs_devices;
5387 struct btrfs_device_info *devices_info = NULL;
5388 struct alloc_chunk_ctl ctl;
5389 struct btrfs_block_group *block_group;
5390 int ret;
5391
5392 lockdep_assert_held(&info->chunk_mutex);
5393
5394 if (!alloc_profile_is_valid(type, 0)) {
5395 ASSERT(0);
5396 return ERR_PTR(-EINVAL);
5397 }
5398
5399 if (list_empty(&fs_devices->alloc_list)) {
5400 if (btrfs_test_opt(info, ENOSPC_DEBUG))
5401 btrfs_debug(info, "%s: no writable device", __func__);
5402 return ERR_PTR(-ENOSPC);
5403 }
5404
5405 if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5406 btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5407 ASSERT(0);
5408 return ERR_PTR(-EINVAL);
5409 }
5410
5411 ctl.start = find_next_chunk(info);
5412 ctl.type = type;
5413 init_alloc_chunk_ctl(fs_devices, &ctl);
5414
5415 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5416 GFP_NOFS);
5417 if (!devices_info)
5418 return ERR_PTR(-ENOMEM);
5419
5420 ret = gather_device_info(fs_devices, &ctl, devices_info);
5421 if (ret < 0) {
5422 block_group = ERR_PTR(ret);
5423 goto out;
5424 }
5425
5426 ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5427 if (ret < 0) {
5428 block_group = ERR_PTR(ret);
5429 goto out;
5430 }
5431
5432 block_group = create_chunk(trans, &ctl, devices_info);
5433
5434 out:
5435 kfree(devices_info);
5436 return block_group;
5437 }
5438
5439 /*
5440 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to
5441 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
5442 * chunks.
5443 *
5444 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
5445 * phases.
5446 */
5447 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
5448 struct btrfs_block_group *bg)
5449 {
5450 struct btrfs_fs_info *fs_info = trans->fs_info;
5451 struct btrfs_root *extent_root = fs_info->extent_root;
5452 struct btrfs_root *chunk_root = fs_info->chunk_root;
5453 struct btrfs_key key;
5454 struct btrfs_chunk *chunk;
5455 struct btrfs_stripe *stripe;
5456 struct extent_map *em;
5457 struct map_lookup *map;
5458 size_t item_size;
5459 int i;
5460 int ret;
5461
5462 /*
5463 * We take the chunk_mutex for 2 reasons:
5464 *
5465 * 1) Updates and insertions in the chunk btree must be done while holding
5466 * the chunk_mutex, as well as updating the system chunk array in the
5467 * superblock. See the comment on top of btrfs_chunk_alloc() for the
5468 * details;
5469 *
5470 * 2) To prevent races with the final phase of a device replace operation
5471 * that replaces the device object associated with the map's stripes,
5472 * because the device object's id can change at any time during that
5473 * final phase of the device replace operation
5474 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
5475 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
5476 * which would cause a failure when updating the device item, which does
5477 * not exist, or persisting a stripe of the chunk item with such an ID.
5478 * Here we can't use the device_list_mutex because our caller already
5479 * has locked the chunk_mutex, and the final phase of device replace
5480 * acquires both mutexes - first the device_list_mutex and then the
5481 * chunk_mutex. Using any of those two mutexes protects us from a
5482 * concurrent device replace.
5483 */ 5484 lockdep_assert_held(&fs_info->chunk_mutex); 5485 5486 em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); 5487 if (IS_ERR(em)) { 5488 ret = PTR_ERR(em); 5489 btrfs_abort_transaction(trans, ret); 5490 return ret; 5491 } 5492 5493 map = em->map_lookup; 5494 item_size = btrfs_chunk_item_size(map->num_stripes); 5495 5496 chunk = kzalloc(item_size, GFP_NOFS); 5497 if (!chunk) { 5498 ret = -ENOMEM; 5499 btrfs_abort_transaction(trans, ret); 5500 goto out; 5501 } 5502 5503 for (i = 0; i < map->num_stripes; i++) { 5504 struct btrfs_device *device = map->stripes[i].dev; 5505 5506 ret = btrfs_update_device(trans, device); 5507 if (ret) 5508 goto out; 5509 } 5510 5511 stripe = &chunk->stripe; 5512 for (i = 0; i < map->num_stripes; i++) { 5513 struct btrfs_device *device = map->stripes[i].dev; 5514 const u64 dev_offset = map->stripes[i].physical; 5515 5516 btrfs_set_stack_stripe_devid(stripe, device->devid); 5517 btrfs_set_stack_stripe_offset(stripe, dev_offset); 5518 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 5519 stripe++; 5520 } 5521 5522 btrfs_set_stack_chunk_length(chunk, bg->length); 5523 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 5524 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 5525 btrfs_set_stack_chunk_type(chunk, map->type); 5526 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 5527 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 5528 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 5529 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 5530 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 5531 5532 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 5533 key.type = BTRFS_CHUNK_ITEM_KEY; 5534 key.offset = bg->start; 5535 5536 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 5537 if (ret) 5538 goto out; 5539 5540 bg->chunk_item_inserted = 1; 5541 5542 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 5543 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 5544 if (ret) 5545 goto out; 5546 } 5547 5548 out: 5549 kfree(chunk); 5550 free_extent_map(em); 5551 return ret; 5552 } 5553 5554 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) 5555 { 5556 struct btrfs_fs_info *fs_info = trans->fs_info; 5557 u64 alloc_profile; 5558 struct btrfs_block_group *meta_bg; 5559 struct btrfs_block_group *sys_bg; 5560 5561 /* 5562 * When adding a new device for sprouting, the seed device is read-only 5563 * so we must first allocate a metadata and a system chunk. But before 5564 * adding the block group items to the extent, device and chunk btrees, 5565 * we must first: 5566 * 5567 * 1) Create both chunks without doing any changes to the btrees, as 5568 * otherwise we would get -ENOSPC since the block groups from the 5569 * seed device are read-only; 5570 * 5571 * 2) Add the device item for the new sprout device - finishing the setup 5572 * of a new block group requires updating the device item in the chunk 5573 * btree, so it must exist when we attempt to do it. The previous step 5574 * ensures this does not fail with -ENOSPC. 5575 * 5576 * After that we can add the block group items to their btrees: 5577 * update existing device item in the chunk btree, add a new block group 5578 * item to the extent btree, add a new chunk item to the chunk btree and 5579 * finally add the new device extent items to the devices btree. 
5580 */ 5581 5582 alloc_profile = btrfs_metadata_alloc_profile(fs_info); 5583 meta_bg = btrfs_create_chunk(trans, alloc_profile); 5584 if (IS_ERR(meta_bg)) 5585 return PTR_ERR(meta_bg); 5586 5587 alloc_profile = btrfs_system_alloc_profile(fs_info); 5588 sys_bg = btrfs_create_chunk(trans, alloc_profile); 5589 if (IS_ERR(sys_bg)) 5590 return PTR_ERR(sys_bg); 5591 5592 return 0; 5593 } 5594 5595 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5596 { 5597 const int index = btrfs_bg_flags_to_raid_index(map->type); 5598 5599 return btrfs_raid_array[index].tolerated_failures; 5600 } 5601 5602 bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5603 { 5604 struct extent_map *em; 5605 struct map_lookup *map; 5606 int miss_ndevs = 0; 5607 int i; 5608 bool ret = true; 5609 5610 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 5611 if (IS_ERR(em)) 5612 return false; 5613 5614 map = em->map_lookup; 5615 for (i = 0; i < map->num_stripes; i++) { 5616 if (test_bit(BTRFS_DEV_STATE_MISSING, 5617 &map->stripes[i].dev->dev_state)) { 5618 miss_ndevs++; 5619 continue; 5620 } 5621 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5622 &map->stripes[i].dev->dev_state)) { 5623 ret = false; 5624 goto end; 5625 } 5626 } 5627 5628 /* 5629 * If the number of missing devices is larger than max errors, we can 5630 * not write the data into that chunk successfully. 5631 */ 5632 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5633 ret = false; 5634 end: 5635 free_extent_map(em); 5636 return ret; 5637 } 5638 5639 void btrfs_mapping_tree_free(struct extent_map_tree *tree) 5640 { 5641 struct extent_map *em; 5642 5643 while (1) { 5644 write_lock(&tree->lock); 5645 em = lookup_extent_mapping(tree, 0, (u64)-1); 5646 if (em) 5647 remove_extent_mapping(tree, em); 5648 write_unlock(&tree->lock); 5649 if (!em) 5650 break; 5651 /* once for us */ 5652 free_extent_map(em); 5653 /* once for the tree */ 5654 free_extent_map(em); 5655 } 5656 } 5657 5658 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5659 { 5660 struct extent_map *em; 5661 struct map_lookup *map; 5662 int ret; 5663 5664 em = btrfs_get_chunk_map(fs_info, logical, len); 5665 if (IS_ERR(em)) 5666 /* 5667 * We could return errors for these cases, but that could get 5668 * ugly and we'd probably do the same thing which is just not do 5669 * anything else and exit, so return 1 so the callers don't try 5670 * to use other copies. 5671 */ 5672 return 1; 5673 5674 map = em->map_lookup; 5675 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) 5676 ret = map->num_stripes; 5677 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5678 ret = map->sub_stripes; 5679 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5680 ret = 2; 5681 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5682 /* 5683 * There could be two corrupted data stripes, we need 5684 * to loop retry in order to rebuild the correct data. 5685 * 5686 * Fail a stripe at a time on every retry except the 5687 * stripe under reconstruction. 
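 *
 * E.g. a raid6 chunk with N stripes reports N copies here, giving the
 * retry logic enough mirror numbers to exclude a different stripe on
 * each reconstruction attempt.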
5688 */ 5689 ret = map->num_stripes; 5690 else 5691 ret = 1; 5692 free_extent_map(em); 5693 5694 down_read(&fs_info->dev_replace.rwsem); 5695 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 5696 fs_info->dev_replace.tgtdev) 5697 ret++; 5698 up_read(&fs_info->dev_replace.rwsem); 5699 5700 return ret; 5701 } 5702 5703 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 5704 u64 logical) 5705 { 5706 struct extent_map *em; 5707 struct map_lookup *map; 5708 unsigned long len = fs_info->sectorsize; 5709 5710 em = btrfs_get_chunk_map(fs_info, logical, len); 5711 5712 if (!WARN_ON(IS_ERR(em))) { 5713 map = em->map_lookup; 5714 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5715 len = map->stripe_len * nr_data_stripes(map); 5716 free_extent_map(em); 5717 } 5718 return len; 5719 } 5720 5721 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5722 { 5723 struct extent_map *em; 5724 struct map_lookup *map; 5725 int ret = 0; 5726 5727 em = btrfs_get_chunk_map(fs_info, logical, len); 5728 5729 if(!WARN_ON(IS_ERR(em))) { 5730 map = em->map_lookup; 5731 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5732 ret = 1; 5733 free_extent_map(em); 5734 } 5735 return ret; 5736 } 5737 5738 static int find_live_mirror(struct btrfs_fs_info *fs_info, 5739 struct map_lookup *map, int first, 5740 int dev_replace_is_ongoing) 5741 { 5742 int i; 5743 int num_stripes; 5744 int preferred_mirror; 5745 int tolerance; 5746 struct btrfs_device *srcdev; 5747 5748 ASSERT((map->type & 5749 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); 5750 5751 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5752 num_stripes = map->sub_stripes; 5753 else 5754 num_stripes = map->num_stripes; 5755 5756 switch (fs_info->fs_devices->read_policy) { 5757 default: 5758 /* Shouldn't happen, just warn and use pid instead of failing */ 5759 btrfs_warn_rl(fs_info, 5760 "unknown read_policy type %u, reset to pid", 5761 fs_info->fs_devices->read_policy); 5762 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; 5763 fallthrough; 5764 case BTRFS_READ_POLICY_PID: 5765 preferred_mirror = first + (current->pid % num_stripes); 5766 break; 5767 } 5768 5769 if (dev_replace_is_ongoing && 5770 fs_info->dev_replace.cont_reading_from_srcdev_mode == 5771 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 5772 srcdev = fs_info->dev_replace.srcdev; 5773 else 5774 srcdev = NULL; 5775 5776 /* 5777 * try to avoid the drive that is the source drive for a 5778 * dev-replace procedure, only choose it if no other non-missing 5779 * mirror is available 5780 */ 5781 for (tolerance = 0; tolerance < 2; tolerance++) { 5782 if (map->stripes[preferred_mirror].dev->bdev && 5783 (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 5784 return preferred_mirror; 5785 for (i = first; i < first + num_stripes; i++) { 5786 if (map->stripes[i].dev->bdev && 5787 (tolerance || map->stripes[i].dev != srcdev)) 5788 return i; 5789 } 5790 } 5791 5792 /* we couldn't find one that doesn't fail. 
Just return something 5793 * and the io error handling code will clean up eventually 5794 */ 5795 return preferred_mirror; 5796 } 5797 5798 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 5799 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) 5800 { 5801 int i; 5802 int again = 1; 5803 5804 while (again) { 5805 again = 0; 5806 for (i = 0; i < num_stripes - 1; i++) { 5807 /* Swap if parity is on a smaller index */ 5808 if (bbio->raid_map[i] > bbio->raid_map[i + 1]) { 5809 swap(bbio->stripes[i], bbio->stripes[i + 1]); 5810 swap(bbio->raid_map[i], bbio->raid_map[i + 1]); 5811 again = 1; 5812 } 5813 } 5814 } 5815 } 5816 5817 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) 5818 { 5819 struct btrfs_bio *bbio = kzalloc( 5820 /* the size of the btrfs_bio */ 5821 sizeof(struct btrfs_bio) + 5822 /* plus the variable array for the stripes */ 5823 sizeof(struct btrfs_bio_stripe) * (total_stripes) + 5824 /* plus the variable array for the tgt dev */ 5825 sizeof(int) * (real_stripes) + 5826 /* 5827 * plus the raid_map, which includes both the tgt dev 5828 * and the stripes 5829 */ 5830 sizeof(u64) * (total_stripes), 5831 GFP_NOFS|__GFP_NOFAIL); 5832 5833 atomic_set(&bbio->error, 0); 5834 refcount_set(&bbio->refs, 1); 5835 5836 bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes); 5837 bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes); 5838 5839 return bbio; 5840 } 5841 5842 void btrfs_get_bbio(struct btrfs_bio *bbio) 5843 { 5844 WARN_ON(!refcount_read(&bbio->refs)); 5845 refcount_inc(&bbio->refs); 5846 } 5847 5848 void btrfs_put_bbio(struct btrfs_bio *bbio) 5849 { 5850 if (!bbio) 5851 return; 5852 if (refcount_dec_and_test(&bbio->refs)) 5853 kfree(bbio); 5854 } 5855 5856 /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ 5857 /* 5858 * Please note that, discard won't be sent to target device of device 5859 * replace. 
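 *
 * Worked example of the stripe math below (illustrative numbers): with
 * stripe_len == 64K and offset == 96K into the chunk, stripe_nr == 1
 * and stripe_offset == 32K, i.e. the discard begins 32K into stripe 1.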
5860 */ 5861 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, 5862 u64 logical, u64 *length_ret, 5863 struct btrfs_bio **bbio_ret) 5864 { 5865 struct extent_map *em; 5866 struct map_lookup *map; 5867 struct btrfs_bio *bbio; 5868 u64 length = *length_ret; 5869 u64 offset; 5870 u64 stripe_nr; 5871 u64 stripe_nr_end; 5872 u64 stripe_end_offset; 5873 u64 stripe_cnt; 5874 u64 stripe_len; 5875 u64 stripe_offset; 5876 u64 num_stripes; 5877 u32 stripe_index; 5878 u32 factor = 0; 5879 u32 sub_stripes = 0; 5880 u64 stripes_per_dev = 0; 5881 u32 remaining_stripes = 0; 5882 u32 last_stripe = 0; 5883 int ret = 0; 5884 int i; 5885 5886 /* Discard always returns a bbio */ 5887 ASSERT(bbio_ret); 5888 5889 em = btrfs_get_chunk_map(fs_info, logical, length); 5890 if (IS_ERR(em)) 5891 return PTR_ERR(em); 5892 5893 map = em->map_lookup; 5894 /* we don't discard raid56 yet */ 5895 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5896 ret = -EOPNOTSUPP; 5897 goto out; 5898 } 5899 5900 offset = logical - em->start; 5901 length = min_t(u64, em->start + em->len - logical, length); 5902 *length_ret = length; 5903 5904 stripe_len = map->stripe_len; 5905 /* 5906 * stripe_nr counts the total number of stripes we have to stride 5907 * to get to this block 5908 */ 5909 stripe_nr = div64_u64(offset, stripe_len); 5910 5911 /* stripe_offset is the offset of this block in its stripe */ 5912 stripe_offset = offset - stripe_nr * stripe_len; 5913 5914 stripe_nr_end = round_up(offset + length, map->stripe_len); 5915 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 5916 stripe_cnt = stripe_nr_end - stripe_nr; 5917 stripe_end_offset = stripe_nr_end * map->stripe_len - 5918 (offset + length); 5919 /* 5920 * after this, stripe_nr is the number of stripes on this 5921 * device we have to walk to find the data, and stripe_index is 5922 * the number of our device in the stripe array 5923 */ 5924 num_stripes = 1; 5925 stripe_index = 0; 5926 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5927 BTRFS_BLOCK_GROUP_RAID10)) { 5928 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5929 sub_stripes = 1; 5930 else 5931 sub_stripes = map->sub_stripes; 5932 5933 factor = map->num_stripes / sub_stripes; 5934 num_stripes = min_t(u64, map->num_stripes, 5935 sub_stripes * stripe_cnt); 5936 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5937 stripe_index *= sub_stripes; 5938 stripes_per_dev = div_u64_rem(stripe_cnt, factor, 5939 &remaining_stripes); 5940 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 5941 last_stripe *= sub_stripes; 5942 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | 5943 BTRFS_BLOCK_GROUP_DUP)) { 5944 num_stripes = map->num_stripes; 5945 } else { 5946 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5947 &stripe_index); 5948 } 5949 5950 bbio = alloc_btrfs_bio(num_stripes, 0); 5951 if (!bbio) { 5952 ret = -ENOMEM; 5953 goto out; 5954 } 5955 5956 for (i = 0; i < num_stripes; i++) { 5957 bbio->stripes[i].physical = 5958 map->stripes[stripe_index].physical + 5959 stripe_offset + stripe_nr * map->stripe_len; 5960 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 5961 5962 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5963 BTRFS_BLOCK_GROUP_RAID10)) { 5964 bbio->stripes[i].length = stripes_per_dev * 5965 map->stripe_len; 5966 5967 if (i / sub_stripes < remaining_stripes) 5968 bbio->stripes[i].length += 5969 map->stripe_len; 5970 5971 /* 5972 * Special for the first stripe and 5973 * the last stripe: 5974 * 5975 * |-------|...|-------| 5976 * |----------| 5977 * off end_off 5978 */ 5979 if
(i < sub_stripes) 5980 bbio->stripes[i].length -= 5981 stripe_offset; 5982 5983 if (stripe_index >= last_stripe && 5984 stripe_index <= (last_stripe + 5985 sub_stripes - 1)) 5986 bbio->stripes[i].length -= 5987 stripe_end_offset; 5988 5989 if (i == sub_stripes - 1) 5990 stripe_offset = 0; 5991 } else { 5992 bbio->stripes[i].length = length; 5993 } 5994 5995 stripe_index++; 5996 if (stripe_index == map->num_stripes) { 5997 stripe_index = 0; 5998 stripe_nr++; 5999 } 6000 } 6001 6002 *bbio_ret = bbio; 6003 bbio->map_type = map->type; 6004 bbio->num_stripes = num_stripes; 6005 out: 6006 free_extent_map(em); 6007 return ret; 6008 } 6009 6010 /* 6011 * In dev-replace case, for repair case (that's the only case where the mirror 6012 * is selected explicitly when calling btrfs_map_block), blocks left of the 6013 * left cursor can also be read from the target drive. 6014 * 6015 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the 6016 * array of stripes. 6017 * For READ, it also needs to be supported using the same mirror number. 6018 * 6019 * If the requested block is not left of the left cursor, EIO is returned. This 6020 * can happen because btrfs_num_copies() returns one more in the dev-replace 6021 * case. 6022 */ 6023 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, 6024 u64 logical, u64 length, 6025 u64 srcdev_devid, int *mirror_num, 6026 u64 *physical) 6027 { 6028 struct btrfs_bio *bbio = NULL; 6029 int num_stripes; 6030 int index_srcdev = 0; 6031 int found = 0; 6032 u64 physical_of_found = 0; 6033 int i; 6034 int ret = 0; 6035 6036 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 6037 logical, &length, &bbio, 0, 0); 6038 if (ret) { 6039 ASSERT(bbio == NULL); 6040 return ret; 6041 } 6042 6043 num_stripes = bbio->num_stripes; 6044 if (*mirror_num > num_stripes) { 6045 /* 6046 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, 6047 * that means that the requested area is not left of the left 6048 * cursor 6049 */ 6050 btrfs_put_bbio(bbio); 6051 return -EIO; 6052 } 6053 6054 /* 6055 * process the rest of the function using the mirror_num of the source 6056 * drive. Therefore look it up first. At the end, patch the device 6057 * pointer to the one of the target drive. 
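 *
 * For example (hypothetical DUP layout): if the source drive holds two
 * stripes of the block at physical offsets P1 < P2, only the stripe at P1
 * is picked, matching the lowest-physical-address rule in the loop below.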
6058 */ 6059 for (i = 0; i < num_stripes; i++) { 6060 if (bbio->stripes[i].dev->devid != srcdev_devid) 6061 continue; 6062 6063 /* 6064 * In case of DUP, in order to keep it simple, only add the 6065 * mirror with the lowest physical address 6066 */ 6067 if (found && 6068 physical_of_found <= bbio->stripes[i].physical) 6069 continue; 6070 6071 index_srcdev = i; 6072 found = 1; 6073 physical_of_found = bbio->stripes[i].physical; 6074 } 6075 6076 btrfs_put_bbio(bbio); 6077 6078 ASSERT(found); 6079 if (!found) 6080 return -EIO; 6081 6082 *mirror_num = index_srcdev + 1; 6083 *physical = physical_of_found; 6084 return ret; 6085 } 6086 6087 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) 6088 { 6089 struct btrfs_block_group *cache; 6090 bool ret; 6091 6092 /* A non-zoned filesystem does not use the "to_copy" flag */ 6093 if (!btrfs_is_zoned(fs_info)) 6094 return false; 6095 6096 cache = btrfs_lookup_block_group(fs_info, logical); 6097 6098 spin_lock(&cache->lock); 6099 ret = cache->to_copy; 6100 spin_unlock(&cache->lock); 6101 6102 btrfs_put_block_group(cache); 6103 return ret; 6104 } 6105 6106 static void handle_ops_on_dev_replace(enum btrfs_map_op op, 6107 struct btrfs_bio **bbio_ret, 6108 struct btrfs_dev_replace *dev_replace, 6109 u64 logical, 6110 int *num_stripes_ret, int *max_errors_ret) 6111 { 6112 struct btrfs_bio *bbio = *bbio_ret; 6113 u64 srcdev_devid = dev_replace->srcdev->devid; 6114 int tgtdev_indexes = 0; 6115 int num_stripes = *num_stripes_ret; 6116 int max_errors = *max_errors_ret; 6117 int i; 6118 6119 if (op == BTRFS_MAP_WRITE) { 6120 int index_where_to_add; 6121 6122 /* 6123 * A block group which has "to_copy" set will eventually be 6124 * copied by the dev-replace process. We can avoid cloning the IO here. 6125 */ 6126 if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) 6127 return; 6128 6129 /* 6130 * Duplicate the write operations while the dev replace 6131 * procedure is running. Since the copying of the old disk to 6132 * the new disk takes place at run time while the filesystem is 6133 * mounted writable, the regular write operations to the old 6134 * disk have to be duplicated to go to the new disk as well. 6135 * 6136 * Note that device->missing is handled by the caller, and that 6137 * the write to the old disk is already set up in the stripes 6138 * array. 6139 */ 6140 index_where_to_add = num_stripes; 6141 for (i = 0; i < num_stripes; i++) { 6142 if (bbio->stripes[i].dev->devid == srcdev_devid) { 6143 /* write to new disk, too */ 6144 struct btrfs_bio_stripe *new = 6145 bbio->stripes + index_where_to_add; 6146 struct btrfs_bio_stripe *old = 6147 bbio->stripes + i; 6148 6149 new->physical = old->physical; 6150 new->length = old->length; 6151 new->dev = dev_replace->tgtdev; 6152 bbio->tgtdev_map[i] = index_where_to_add; 6153 index_where_to_add++; 6154 max_errors++; 6155 tgtdev_indexes++; 6156 } 6157 } 6158 num_stripes = index_where_to_add; 6159 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 6160 int index_srcdev = 0; 6161 int found = 0; 6162 u64 physical_of_found = 0; 6163 6164 /* 6165 * During the dev-replace procedure, the target drive can also 6166 * be used to read data in case it is needed to repair a corrupt 6167 * block elsewhere. This is possible if the requested area is 6168 * left of the left cursor. In this area, the target drive is a 6169 * full copy of the source drive.
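 *
 * Illustration (hypothetical): if the replace procedure has already
 * copied, say, the first 100GiB of the source device, a block stored in
 * that region has a valid copy on the target drive and can be handed out
 * as an extra read mirror below; blocks beyond the cursor cannot.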
6170 */ 6171 for (i = 0; i < num_stripes; i++) { 6172 if (bbio->stripes[i].dev->devid == srcdev_devid) { 6173 /* 6174 * In case of DUP, in order to keep it simple, 6175 * only add the mirror with the lowest physical 6176 * address 6177 */ 6178 if (found && 6179 physical_of_found <= 6180 bbio->stripes[i].physical) 6181 continue; 6182 index_srcdev = i; 6183 found = 1; 6184 physical_of_found = bbio->stripes[i].physical; 6185 } 6186 } 6187 if (found) { 6188 struct btrfs_bio_stripe *tgtdev_stripe = 6189 bbio->stripes + num_stripes; 6190 6191 tgtdev_stripe->physical = physical_of_found; 6192 tgtdev_stripe->length = 6193 bbio->stripes[index_srcdev].length; 6194 tgtdev_stripe->dev = dev_replace->tgtdev; 6195 bbio->tgtdev_map[index_srcdev] = num_stripes; 6196 6197 tgtdev_indexes++; 6198 num_stripes++; 6199 } 6200 } 6201 6202 *num_stripes_ret = num_stripes; 6203 *max_errors_ret = max_errors; 6204 bbio->num_tgtdevs = tgtdev_indexes; 6205 *bbio_ret = bbio; 6206 } 6207 6208 static bool need_full_stripe(enum btrfs_map_op op) 6209 { 6210 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 6211 } 6212 6213 /* 6214 * Calculate the geometry of a particular (address, len) tuple. This 6215 * information is used to calculate how big a particular bio can get before it 6216 * straddles a stripe. 6217 * 6218 * @fs_info: the filesystem 6219 * @em: mapping containing the logical extent 6220 * @op: type of operation - write or read 6221 * @logical: address that we want to figure out the geometry of 6222 * @io_geom: pointer used to return values 6223 * 6224 * Returns < 0 in case a chunk for the given logical address cannot be found, 6225 * usually shouldn't happen unless @logical is corrupted, 0 otherwise. 6226 */ 6227 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, 6228 enum btrfs_map_op op, u64 logical, 6229 struct btrfs_io_geometry *io_geom) 6230 { 6231 struct map_lookup *map; 6232 u64 len; 6233 u64 offset; 6234 u64 stripe_offset; 6235 u64 stripe_nr; 6236 u64 stripe_len; 6237 u64 raid56_full_stripe_start = (u64)-1; 6238 int data_stripes; 6239 6240 ASSERT(op != BTRFS_MAP_DISCARD); 6241 6242 map = em->map_lookup; 6243 /* Offset of this logical address in the chunk */ 6244 offset = logical - em->start; 6245 /* Len of a stripe in a chunk */ 6246 stripe_len = map->stripe_len; 6247 /* Stripe where this block falls in */ 6248 stripe_nr = div64_u64(offset, stripe_len); 6249 /* Offset of stripe in the chunk */ 6250 stripe_offset = stripe_nr * stripe_len; 6251 if (offset < stripe_offset) { 6252 btrfs_crit(fs_info, 6253 "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", 6254 stripe_offset, offset, em->start, logical, stripe_len); 6255 return -EINVAL; 6256 } 6257 6258 /* stripe_offset is the offset of this block in its stripe */ 6259 stripe_offset = offset - stripe_offset; 6260 data_stripes = nr_data_stripes(map); 6261 6262 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 6263 u64 max_len = stripe_len - stripe_offset; 6264 6265 /* 6266 * In case of raid56, we need to know the stripe aligned start 6267 */ 6268 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6269 unsigned long full_stripe_len = stripe_len * data_stripes; 6270 raid56_full_stripe_start = offset; 6271 6272 /* 6273 * Allow a write of a full stripe, but make sure we 6274 * don't allow straddling of stripes 6275 */ 6276 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 6277 full_stripe_len); 6278 raid56_full_stripe_start *= full_stripe_len; 6279 6280 /* 
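 * Worked example (illustrative numbers): with stripe_len = 64K and 3 data
 * stripes, full_stripe_len = 192K; an offset of 200K into the chunk rounds
 * down to raid56_full_stripe_start = 192K, so a RAID56 write below may
 * extend to max_len = 192K - (200K - 192K) = 184K.
 *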
6281 * For writes to RAID[56], allow a full stripeset across 6282 * all disks. For other RAID types and for RAID[56] 6283 * reads, just allow a single stripe (on a single disk). 6284 */ 6285 if (op == BTRFS_MAP_WRITE) { 6286 max_len = stripe_len * data_stripes - 6287 (offset - raid56_full_stripe_start); 6288 } 6289 } 6290 len = min_t(u64, em->len - offset, max_len); 6291 } else { 6292 len = em->len - offset; 6293 } 6294 6295 io_geom->len = len; 6296 io_geom->offset = offset; 6297 io_geom->stripe_len = stripe_len; 6298 io_geom->stripe_nr = stripe_nr; 6299 io_geom->stripe_offset = stripe_offset; 6300 io_geom->raid56_stripe_offset = raid56_full_stripe_start; 6301 6302 return 0; 6303 } 6304 6305 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 6306 enum btrfs_map_op op, 6307 u64 logical, u64 *length, 6308 struct btrfs_bio **bbio_ret, 6309 int mirror_num, int need_raid_map) 6310 { 6311 struct extent_map *em; 6312 struct map_lookup *map; 6313 u64 stripe_offset; 6314 u64 stripe_nr; 6315 u64 stripe_len; 6316 u32 stripe_index; 6317 int data_stripes; 6318 int i; 6319 int ret = 0; 6320 int num_stripes; 6321 int max_errors = 0; 6322 int tgtdev_indexes = 0; 6323 struct btrfs_bio *bbio = NULL; 6324 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 6325 int dev_replace_is_ongoing = 0; 6326 int num_alloc_stripes; 6327 int patch_the_first_stripe_for_dev_replace = 0; 6328 u64 physical_to_patch_in_first_stripe = 0; 6329 u64 raid56_full_stripe_start = (u64)-1; 6330 struct btrfs_io_geometry geom; 6331 6332 ASSERT(bbio_ret); 6333 ASSERT(op != BTRFS_MAP_DISCARD); 6334 6335 em = btrfs_get_chunk_map(fs_info, logical, *length); 6336 ASSERT(!IS_ERR(em)); 6337 6338 ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); 6339 if (ret < 0) 6340 return ret; 6341 6342 map = em->map_lookup; 6343 6344 *length = geom.len; 6345 stripe_len = geom.stripe_len; 6346 stripe_nr = geom.stripe_nr; 6347 stripe_offset = geom.stripe_offset; 6348 raid56_full_stripe_start = geom.raid56_stripe_offset; 6349 data_stripes = nr_data_stripes(map); 6350 6351 down_read(&dev_replace->rwsem); 6352 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 6353 /* 6354 * Hold the semaphore for read during the whole operation, write is 6355 * requested at commit time but must wait. 
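 * A writer takes the rwsem when dev-replace commits its state, so it can
 * only proceed once this mapping operation drops its read lock at the end.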
6356 */ 6357 if (!dev_replace_is_ongoing) 6358 up_read(&dev_replace->rwsem); 6359 6360 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 6361 !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 6362 ret = get_extra_mirror_from_replace(fs_info, logical, *length, 6363 dev_replace->srcdev->devid, 6364 &mirror_num, 6365 &physical_to_patch_in_first_stripe); 6366 if (ret) 6367 goto out; 6368 else 6369 patch_the_first_stripe_for_dev_replace = 1; 6370 } else if (mirror_num > map->num_stripes) { 6371 mirror_num = 0; 6372 } 6373 6374 num_stripes = 1; 6375 stripe_index = 0; 6376 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 6377 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6378 &stripe_index); 6379 if (!need_full_stripe(op)) 6380 mirror_num = 1; 6381 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { 6382 if (need_full_stripe(op)) 6383 num_stripes = map->num_stripes; 6384 else if (mirror_num) 6385 stripe_index = mirror_num - 1; 6386 else { 6387 stripe_index = find_live_mirror(fs_info, map, 0, 6388 dev_replace_is_ongoing); 6389 mirror_num = stripe_index + 1; 6390 } 6391 6392 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 6393 if (need_full_stripe(op)) { 6394 num_stripes = map->num_stripes; 6395 } else if (mirror_num) { 6396 stripe_index = mirror_num - 1; 6397 } else { 6398 mirror_num = 1; 6399 } 6400 6401 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 6402 u32 factor = map->num_stripes / map->sub_stripes; 6403 6404 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6405 stripe_index *= map->sub_stripes; 6406 6407 if (need_full_stripe(op)) 6408 num_stripes = map->sub_stripes; 6409 else if (mirror_num) 6410 stripe_index += mirror_num - 1; 6411 else { 6412 int old_stripe_index = stripe_index; 6413 stripe_index = find_live_mirror(fs_info, map, 6414 stripe_index, 6415 dev_replace_is_ongoing); 6416 mirror_num = stripe_index - old_stripe_index + 1; 6417 } 6418 6419 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6420 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 6421 /* push stripe_nr back to the start of the full stripe */ 6422 stripe_nr = div64_u64(raid56_full_stripe_start, 6423 stripe_len * data_stripes); 6424 6425 /* RAID[56] write or recovery. Return all stripes */ 6426 num_stripes = map->num_stripes; 6427 max_errors = nr_parity_stripes(map); 6428 6429 *length = map->stripe_len; 6430 stripe_index = 0; 6431 stripe_offset = 0; 6432 } else { 6433 /* 6434 * Mirror #0 or #1 means the original data block. 6435 * Mirror #2 is RAID5 parity block. 6436 * Mirror #3 is RAID6 Q block. 
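 *
 * For instance, on a 4-device RAID6 chunk (data_stripes = 2), mirror_num 2
 * gives stripe_index = data_stripes + 2 - 2 = 2 (the P stripe) and
 * mirror_num 3 gives stripe_index = 3 (the Q stripe), before the parity
 * rotation below is applied.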
6437 */ 6438 stripe_nr = div_u64_rem(stripe_nr, 6439 data_stripes, &stripe_index); 6440 if (mirror_num > 1) 6441 stripe_index = data_stripes + mirror_num - 2; 6442 6443 /* We distribute the parity blocks across stripes */ 6444 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 6445 &stripe_index); 6446 if (!need_full_stripe(op) && mirror_num <= 1) 6447 mirror_num = 1; 6448 } 6449 } else { 6450 /* 6451 * after this, stripe_nr is the number of stripes on this 6452 * device we have to walk to find the data, and stripe_index is 6453 * the number of our device in the stripe array 6454 */ 6455 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6456 &stripe_index); 6457 mirror_num = stripe_index + 1; 6458 } 6459 if (stripe_index >= map->num_stripes) { 6460 btrfs_crit(fs_info, 6461 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 6462 stripe_index, map->num_stripes); 6463 ret = -EINVAL; 6464 goto out; 6465 } 6466 6467 num_alloc_stripes = num_stripes; 6468 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 6469 if (op == BTRFS_MAP_WRITE) 6470 num_alloc_stripes <<= 1; 6471 if (op == BTRFS_MAP_GET_READ_MIRRORS) 6472 num_alloc_stripes++; 6473 tgtdev_indexes = num_stripes; 6474 } 6475 6476 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); 6477 if (!bbio) { 6478 ret = -ENOMEM; 6479 goto out; 6480 } 6481 6482 for (i = 0; i < num_stripes; i++) { 6483 bbio->stripes[i].physical = map->stripes[stripe_index].physical + 6484 stripe_offset + stripe_nr * map->stripe_len; 6485 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 6486 stripe_index++; 6487 } 6488 6489 /* build raid_map */ 6490 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 6491 (need_full_stripe(op) || mirror_num > 1)) { 6492 u64 tmp; 6493 unsigned rot; 6494 6495 /* Work out the disk rotation on this stripe-set */ 6496 div_u64_rem(stripe_nr, num_stripes, &rot); 6497 6498 /* Fill in the logical address of each stripe */ 6499 tmp = stripe_nr * data_stripes; 6500 for (i = 0; i < data_stripes; i++) 6501 bbio->raid_map[(i+rot) % num_stripes] = 6502 em->start + (tmp + i) * map->stripe_len; 6503 6504 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 6505 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 6506 bbio->raid_map[(i+rot+1) % num_stripes] = 6507 RAID6_Q_STRIPE; 6508 6509 sort_parity_stripes(bbio, num_stripes); 6510 } 6511 6512 if (need_full_stripe(op)) 6513 max_errors = btrfs_chunk_max_errors(map); 6514 6515 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 6516 need_full_stripe(op)) { 6517 handle_ops_on_dev_replace(op, &bbio, dev_replace, logical, 6518 &num_stripes, &max_errors); 6519 } 6520 6521 *bbio_ret = bbio; 6522 bbio->map_type = map->type; 6523 bbio->num_stripes = num_stripes; 6524 bbio->max_errors = max_errors; 6525 bbio->mirror_num = mirror_num; 6526 6527 /* 6528 * this is the case that REQ_READ && dev_replace_is_ongoing && 6529 * mirror_num == num_stripes + 1 && dev_replace target drive is 6530 * available as a mirror 6531 */ 6532 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 6533 WARN_ON(num_stripes > 1); 6534 bbio->stripes[0].dev = dev_replace->tgtdev; 6535 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 6536 bbio->mirror_num = map->num_stripes + 1; 6537 } 6538 out: 6539 if (dev_replace_is_ongoing) { 6540 lockdep_assert_held(&dev_replace->rwsem); 6541 /* Unlock and let waiting writers proceed */ 6542 up_read(&dev_replace->rwsem); 6543 } 6544 free_extent_map(em); 6545 return ret; 6546 } 6547 6548 int 
btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6549 u64 logical, u64 *length, 6550 struct btrfs_bio **bbio_ret, int mirror_num) 6551 { 6552 if (op == BTRFS_MAP_DISCARD) 6553 return __btrfs_map_block_for_discard(fs_info, logical, 6554 length, bbio_ret); 6555 6556 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 6557 mirror_num, 0); 6558 } 6559 6560 /* For Scrub/replace */ 6561 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6562 u64 logical, u64 *length, 6563 struct btrfs_bio **bbio_ret) 6564 { 6565 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); 6566 } 6567 6568 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) 6569 { 6570 bio->bi_private = bbio->private; 6571 bio->bi_end_io = bbio->end_io; 6572 bio_endio(bio); 6573 6574 btrfs_put_bbio(bbio); 6575 } 6576 6577 static void btrfs_end_bio(struct bio *bio) 6578 { 6579 struct btrfs_bio *bbio = bio->bi_private; 6580 int is_orig_bio = 0; 6581 6582 if (bio->bi_status) { 6583 atomic_inc(&bbio->error); 6584 if (bio->bi_status == BLK_STS_IOERR || 6585 bio->bi_status == BLK_STS_TARGET) { 6586 struct btrfs_device *dev = btrfs_io_bio(bio)->device; 6587 6588 ASSERT(dev->bdev); 6589 if (btrfs_op(bio) == BTRFS_MAP_WRITE) 6590 btrfs_dev_stat_inc_and_print(dev, 6591 BTRFS_DEV_STAT_WRITE_ERRS); 6592 else if (!(bio->bi_opf & REQ_RAHEAD)) 6593 btrfs_dev_stat_inc_and_print(dev, 6594 BTRFS_DEV_STAT_READ_ERRS); 6595 if (bio->bi_opf & REQ_PREFLUSH) 6596 btrfs_dev_stat_inc_and_print(dev, 6597 BTRFS_DEV_STAT_FLUSH_ERRS); 6598 } 6599 } 6600 6601 if (bio == bbio->orig_bio) 6602 is_orig_bio = 1; 6603 6604 btrfs_bio_counter_dec(bbio->fs_info); 6605 6606 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6607 if (!is_orig_bio) { 6608 bio_put(bio); 6609 bio = bbio->orig_bio; 6610 } 6611 6612 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6613 /* Only send an error to the higher layers if it is 6614 * beyond the tolerance of the btrfs bio 6615 */ 6616 if (atomic_read(&bbio->error) > bbio->max_errors) { 6617 bio->bi_status = BLK_STS_IOERR; 6618 } else { 6619 /* 6620 * this bio is actually up to date, we didn't 6621 * go over the max number of errors 6622 */ 6623 bio->bi_status = BLK_STS_OK; 6624 } 6625 6626 btrfs_end_bbio(bbio, bio); 6627 } else if (!is_orig_bio) { 6628 bio_put(bio); 6629 } 6630 } 6631 6632 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, 6633 u64 physical, struct btrfs_device *dev) 6634 { 6635 struct btrfs_fs_info *fs_info = bbio->fs_info; 6636 6637 bio->bi_private = bbio; 6638 btrfs_io_bio(bio)->device = dev; 6639 bio->bi_end_io = btrfs_end_bio; 6640 bio->bi_iter.bi_sector = physical >> 9; 6641 /* 6642 * For zone append writing, bi_sector must point to the beginning of the 6643 * zone 6644 */ 6645 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 6646 if (btrfs_dev_is_sequential(dev, physical)) { 6647 u64 zone_start = round_down(physical, fs_info->zone_size); 6648 6649 bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; 6650 } else { 6651 bio->bi_opf &= ~REQ_OP_ZONE_APPEND; 6652 bio->bi_opf |= REQ_OP_WRITE; 6653 } 6654 } 6655 btrfs_debug_in_rcu(fs_info, 6656 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6657 bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, 6658 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), 6659 dev->devid, bio->bi_iter.bi_size); 6660 bio_set_dev(bio, dev->bdev); 6661 6662 btrfs_bio_counter_inc_noblocked(fs_info); 6663 6664 btrfsic_submit_bio(bio); 6665 } 6666 6667 static
void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 6668 { 6669 atomic_inc(&bbio->error); 6670 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6671 /* Should be the original bio. */ 6672 WARN_ON(bio != bbio->orig_bio); 6673 6674 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6675 bio->bi_iter.bi_sector = logical >> 9; 6676 if (atomic_read(&bbio->error) > bbio->max_errors) 6677 bio->bi_status = BLK_STS_IOERR; 6678 else 6679 bio->bi_status = BLK_STS_OK; 6680 btrfs_end_bbio(bbio, bio); 6681 } 6682 } 6683 6684 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 6685 int mirror_num) 6686 { 6687 struct btrfs_device *dev; 6688 struct bio *first_bio = bio; 6689 u64 logical = bio->bi_iter.bi_sector << 9; 6690 u64 length = 0; 6691 u64 map_length; 6692 int ret; 6693 int dev_nr; 6694 int total_devs; 6695 struct btrfs_bio *bbio = NULL; 6696 6697 length = bio->bi_iter.bi_size; 6698 map_length = length; 6699 6700 btrfs_bio_counter_inc_blocked(fs_info); 6701 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, 6702 &map_length, &bbio, mirror_num, 1); 6703 if (ret) { 6704 btrfs_bio_counter_dec(fs_info); 6705 return errno_to_blk_status(ret); 6706 } 6707 6708 total_devs = bbio->num_stripes; 6709 bbio->orig_bio = first_bio; 6710 bbio->private = first_bio->bi_private; 6711 bbio->end_io = first_bio->bi_end_io; 6712 bbio->fs_info = fs_info; 6713 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 6714 6715 if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 6716 ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { 6717 /* In this case, map_length has been set to the length of 6718 a single stripe; not the whole write */ 6719 if (btrfs_op(bio) == BTRFS_MAP_WRITE) { 6720 ret = raid56_parity_write(fs_info, bio, bbio, 6721 map_length); 6722 } else { 6723 ret = raid56_parity_recover(fs_info, bio, bbio, 6724 map_length, mirror_num, 1); 6725 } 6726 6727 btrfs_bio_counter_dec(fs_info); 6728 return errno_to_blk_status(ret); 6729 } 6730 6731 if (map_length < length) { 6732 btrfs_crit(fs_info, 6733 "mapping failed logical %llu bio len %llu len %llu", 6734 logical, length, map_length); 6735 BUG(); 6736 } 6737 6738 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { 6739 dev = bbio->stripes[dev_nr].dev; 6740 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, 6741 &dev->dev_state) || 6742 (btrfs_op(first_bio) == BTRFS_MAP_WRITE && 6743 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { 6744 bbio_error(bbio, first_bio, logical); 6745 continue; 6746 } 6747 6748 if (dev_nr < total_devs - 1) 6749 bio = btrfs_bio_clone(first_bio); 6750 else 6751 bio = first_bio; 6752 6753 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev); 6754 } 6755 btrfs_bio_counter_dec(fs_info); 6756 return BLK_STS_OK; 6757 } 6758 6759 /* 6760 * Find a device specified by @devid or @uuid in the list of @fs_devices, or 6761 * return NULL. 6762 * 6763 * If devid and uuid are both specified, the match must be exact, otherwise 6764 * only devid is used. 
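 *
 * E.g. btrfs_find_device(fs_devices, 3, NULL, NULL) returns the first
 * device with devid 3, searching the main device list and then any seed
 * device lists, since neither a uuid nor an fsid filter is given.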
6765 */ 6766 struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, 6767 u64 devid, u8 *uuid, u8 *fsid) 6768 { 6769 struct btrfs_device *device; 6770 struct btrfs_fs_devices *seed_devs; 6771 6772 if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { 6773 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6774 if (device->devid == devid && 6775 (!uuid || memcmp(device->uuid, uuid, 6776 BTRFS_UUID_SIZE) == 0)) 6777 return device; 6778 } 6779 } 6780 6781 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 6782 if (!fsid || 6783 !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { 6784 list_for_each_entry(device, &seed_devs->devices, 6785 dev_list) { 6786 if (device->devid == devid && 6787 (!uuid || memcmp(device->uuid, uuid, 6788 BTRFS_UUID_SIZE) == 0)) 6789 return device; 6790 } 6791 } 6792 } 6793 6794 return NULL; 6795 } 6796 6797 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6798 u64 devid, u8 *dev_uuid) 6799 { 6800 struct btrfs_device *device; 6801 unsigned int nofs_flag; 6802 6803 /* 6804 * We call this under the chunk_mutex, so we want to use NOFS for this 6805 * allocation, however we don't want to change btrfs_alloc_device() to 6806 * always do NOFS because we use it in a lot of other GFP_KERNEL safe 6807 * places. 6808 */ 6809 nofs_flag = memalloc_nofs_save(); 6810 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6811 memalloc_nofs_restore(nofs_flag); 6812 if (IS_ERR(device)) 6813 return device; 6814 6815 list_add(&device->dev_list, &fs_devices->devices); 6816 device->fs_devices = fs_devices; 6817 fs_devices->num_devices++; 6818 6819 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6820 fs_devices->missing_devices++; 6821 6822 return device; 6823 } 6824 6825 /** 6826 * btrfs_alloc_device - allocate struct btrfs_device 6827 * @fs_info: used only for generating a new devid, can be NULL if 6828 * devid is provided (i.e. @devid != NULL). 6829 * @devid: a pointer to devid for this device. If NULL a new devid 6830 * is generated. 6831 * @uuid: a pointer to UUID for this device. If NULL a new UUID 6832 * is generated. 6833 * 6834 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 6835 * on error. Returned struct is not linked onto any lists and must be 6836 * destroyed with btrfs_free_device. 
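 *
 * Typical use (sketch): btrfs_alloc_device(fs_info, NULL, NULL) allocates a
 * device with the next free devid (via find_next_devid(), which requires a
 * non-NULL @fs_info) and a freshly generated uuid; the result must be
 * checked with IS_ERR() before use.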
6837 */ 6838 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 6839 const u64 *devid, 6840 const u8 *uuid) 6841 { 6842 struct btrfs_device *dev; 6843 u64 tmp; 6844 6845 if (WARN_ON(!devid && !fs_info)) 6846 return ERR_PTR(-EINVAL); 6847 6848 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 6849 if (!dev) 6850 return ERR_PTR(-ENOMEM); 6851 6852 /* 6853 * Preallocate a bio that's always going to be used for flushing device 6854 * barriers and matches the device lifespan 6855 */ 6856 dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0); 6857 if (!dev->flush_bio) { 6858 kfree(dev); 6859 return ERR_PTR(-ENOMEM); 6860 } 6861 6862 INIT_LIST_HEAD(&dev->dev_list); 6863 INIT_LIST_HEAD(&dev->dev_alloc_list); 6864 INIT_LIST_HEAD(&dev->post_commit_list); 6865 6866 atomic_set(&dev->reada_in_flight, 0); 6867 atomic_set(&dev->dev_stats_ccnt, 0); 6868 btrfs_device_data_ordered_init(dev); 6869 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 6870 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 6871 extent_io_tree_init(fs_info, &dev->alloc_state, 6872 IO_TREE_DEVICE_ALLOC_STATE, NULL); 6873 6874 if (devid) 6875 tmp = *devid; 6876 else { 6877 int ret; 6878 6879 ret = find_next_devid(fs_info, &tmp); 6880 if (ret) { 6881 btrfs_free_device(dev); 6882 return ERR_PTR(ret); 6883 } 6884 } 6885 dev->devid = tmp; 6886 6887 if (uuid) 6888 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 6889 else 6890 generate_random_uuid(dev->uuid); 6891 6892 return dev; 6893 } 6894 6895 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 6896 u64 devid, u8 *uuid, bool error) 6897 { 6898 if (error) 6899 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 6900 devid, uuid); 6901 else 6902 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 6903 devid, uuid); 6904 } 6905 6906 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) 6907 { 6908 const int data_stripes = calc_data_stripes(type, num_stripes); 6909 6910 return div_u64(chunk_len, data_stripes); 6911 } 6912 6913 #if BITS_PER_LONG == 32 6914 /* 6915 * Due to the page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE 6916 * can't be accessed on 32bit systems. 6917 * 6918 * This function does a mount time check to reject the fs if it already has 6919 * a metadata chunk beyond that limit. 6920 */ 6921 static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 6922 u64 logical, u64 length, u64 type) 6923 { 6924 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 6925 return 0; 6926 6927 if (logical + length < MAX_LFS_FILESIZE) 6928 return 0; 6929 6930 btrfs_err_32bit_limit(fs_info); 6931 return -EOVERFLOW; 6932 } 6933 6934 /* 6935 * This is to give early warning for any metadata chunk reaching 6936 * BTRFS_32BIT_EARLY_WARN_THRESHOLD. 6937 * Although we can still access the metadata, it's not going to be possible 6938 * once the limit is reached.
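 * The warning threshold sits below the hard limit enforced by
 * check_32bit_meta_chunk() above, so the administrator gets notice before
 * mounts start being rejected.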
6939 */ 6940 static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 6941 u64 logical, u64 length, u64 type) 6942 { 6943 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 6944 return; 6945 6946 if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD) 6947 return; 6948 6949 btrfs_warn_32bit_limit(fs_info); 6950 } 6951 #endif 6952 6953 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, 6954 struct btrfs_chunk *chunk) 6955 { 6956 struct btrfs_fs_info *fs_info = leaf->fs_info; 6957 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 6958 struct map_lookup *map; 6959 struct extent_map *em; 6960 u64 logical; 6961 u64 length; 6962 u64 devid; 6963 u64 type; 6964 u8 uuid[BTRFS_UUID_SIZE]; 6965 int num_stripes; 6966 int ret; 6967 int i; 6968 6969 logical = key->offset; 6970 length = btrfs_chunk_length(leaf, chunk); 6971 type = btrfs_chunk_type(leaf, chunk); 6972 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6973 6974 #if BITS_PER_LONG == 32 6975 ret = check_32bit_meta_chunk(fs_info, logical, length, type); 6976 if (ret < 0) 6977 return ret; 6978 warn_32bit_meta_chunk(fs_info, logical, length, type); 6979 #endif 6980 6981 /* 6982 * Only need to verify chunk item if we're reading from sys chunk array, 6983 * as chunk item in tree block is already verified by tree-checker. 6984 */ 6985 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { 6986 ret = btrfs_check_chunk_valid(leaf, chunk, logical); 6987 if (ret) 6988 return ret; 6989 } 6990 6991 read_lock(&map_tree->lock); 6992 em = lookup_extent_mapping(map_tree, logical, 1); 6993 read_unlock(&map_tree->lock); 6994 6995 /* already mapped? */ 6996 if (em && em->start <= logical && em->start + em->len > logical) { 6997 free_extent_map(em); 6998 return 0; 6999 } else if (em) { 7000 free_extent_map(em); 7001 } 7002 7003 em = alloc_extent_map(); 7004 if (!em) 7005 return -ENOMEM; 7006 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 7007 if (!map) { 7008 free_extent_map(em); 7009 return -ENOMEM; 7010 } 7011 7012 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 7013 em->map_lookup = map; 7014 em->start = logical; 7015 em->len = length; 7016 em->orig_start = 0; 7017 em->block_start = 0; 7018 em->block_len = em->len; 7019 7020 map->num_stripes = num_stripes; 7021 map->io_width = btrfs_chunk_io_width(leaf, chunk); 7022 map->io_align = btrfs_chunk_io_align(leaf, chunk); 7023 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 7024 map->type = type; 7025 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 7026 map->verified_stripes = 0; 7027 em->orig_block_len = calc_stripe_length(type, em->len, 7028 map->num_stripes); 7029 for (i = 0; i < num_stripes; i++) { 7030 map->stripes[i].physical = 7031 btrfs_stripe_offset_nr(leaf, chunk, i); 7032 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 7033 read_extent_buffer(leaf, uuid, (unsigned long) 7034 btrfs_stripe_dev_uuid_nr(chunk, i), 7035 BTRFS_UUID_SIZE); 7036 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, 7037 devid, uuid, NULL); 7038 if (!map->stripes[i].dev && 7039 !btrfs_test_opt(fs_info, DEGRADED)) { 7040 free_extent_map(em); 7041 btrfs_report_missing_device(fs_info, devid, uuid, true); 7042 return -ENOENT; 7043 } 7044 if (!map->stripes[i].dev) { 7045 map->stripes[i].dev = 7046 add_missing_dev(fs_info->fs_devices, devid, 7047 uuid); 7048 if (IS_ERR(map->stripes[i].dev)) { 7049 free_extent_map(em); 7050 btrfs_err(fs_info, 7051 "failed to init missing dev %llu: %ld", 7052 devid, PTR_ERR(map->stripes[i].dev)); 7053 return PTR_ERR(map->stripes[i].dev); 7054 } 7055 
btrfs_report_missing_device(fs_info, devid, uuid, false); 7056 } 7057 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 7058 &(map->stripes[i].dev->dev_state)); 7059 7060 } 7061 7062 write_lock(&map_tree->lock); 7063 ret = add_extent_mapping(map_tree, em, 0); 7064 write_unlock(&map_tree->lock); 7065 if (ret < 0) { 7066 btrfs_err(fs_info, 7067 "failed to add chunk map, start=%llu len=%llu: %d", 7068 em->start, em->len, ret); 7069 } 7070 free_extent_map(em); 7071 7072 return ret; 7073 } 7074 7075 static void fill_device_from_item(struct extent_buffer *leaf, 7076 struct btrfs_dev_item *dev_item, 7077 struct btrfs_device *device) 7078 { 7079 unsigned long ptr; 7080 7081 device->devid = btrfs_device_id(leaf, dev_item); 7082 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 7083 device->total_bytes = device->disk_total_bytes; 7084 device->commit_total_bytes = device->disk_total_bytes; 7085 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 7086 device->commit_bytes_used = device->bytes_used; 7087 device->type = btrfs_device_type(leaf, dev_item); 7088 device->io_align = btrfs_device_io_align(leaf, dev_item); 7089 device->io_width = btrfs_device_io_width(leaf, dev_item); 7090 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 7091 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 7092 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 7093 7094 ptr = btrfs_device_uuid(dev_item); 7095 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 7096 } 7097 7098 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 7099 u8 *fsid) 7100 { 7101 struct btrfs_fs_devices *fs_devices; 7102 int ret; 7103 7104 lockdep_assert_held(&uuid_mutex); 7105 ASSERT(fsid); 7106 7107 /* This will match only for multi-device seed fs */ 7108 list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) 7109 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 7110 return fs_devices; 7111 7112 7113 fs_devices = find_fsid(fsid, NULL); 7114 if (!fs_devices) { 7115 if (!btrfs_test_opt(fs_info, DEGRADED)) 7116 return ERR_PTR(-ENOENT); 7117 7118 fs_devices = alloc_fs_devices(fsid, NULL); 7119 if (IS_ERR(fs_devices)) 7120 return fs_devices; 7121 7122 fs_devices->seeding = true; 7123 fs_devices->opened = 1; 7124 return fs_devices; 7125 } 7126 7127 /* 7128 * Upon first call for a seed fs fsid, just create a private copy of the 7129 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list 7130 */ 7131 fs_devices = clone_fs_devices(fs_devices); 7132 if (IS_ERR(fs_devices)) 7133 return fs_devices; 7134 7135 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); 7136 if (ret) { 7137 free_fs_devices(fs_devices); 7138 return ERR_PTR(ret); 7139 } 7140 7141 if (!fs_devices->seeding) { 7142 close_fs_devices(fs_devices); 7143 free_fs_devices(fs_devices); 7144 return ERR_PTR(-EINVAL); 7145 } 7146 7147 list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); 7148 7149 return fs_devices; 7150 } 7151 7152 static int read_one_dev(struct extent_buffer *leaf, 7153 struct btrfs_dev_item *dev_item) 7154 { 7155 struct btrfs_fs_info *fs_info = leaf->fs_info; 7156 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7157 struct btrfs_device *device; 7158 u64 devid; 7159 int ret; 7160 u8 fs_uuid[BTRFS_FSID_SIZE]; 7161 u8 dev_uuid[BTRFS_UUID_SIZE]; 7162 7163 devid = btrfs_device_id(leaf, dev_item); 7164 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 7165 BTRFS_UUID_SIZE); 7166 read_extent_buffer(leaf, 
fs_uuid, btrfs_device_fsid(dev_item), 7167 BTRFS_FSID_SIZE); 7168 7169 if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { 7170 fs_devices = open_seed_devices(fs_info, fs_uuid); 7171 if (IS_ERR(fs_devices)) 7172 return PTR_ERR(fs_devices); 7173 } 7174 7175 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 7176 fs_uuid); 7177 if (!device) { 7178 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7179 btrfs_report_missing_device(fs_info, devid, 7180 dev_uuid, true); 7181 return -ENOENT; 7182 } 7183 7184 device = add_missing_dev(fs_devices, devid, dev_uuid); 7185 if (IS_ERR(device)) { 7186 btrfs_err(fs_info, 7187 "failed to add missing dev %llu: %ld", 7188 devid, PTR_ERR(device)); 7189 return PTR_ERR(device); 7190 } 7191 btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 7192 } else { 7193 if (!device->bdev) { 7194 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7195 btrfs_report_missing_device(fs_info, 7196 devid, dev_uuid, true); 7197 return -ENOENT; 7198 } 7199 btrfs_report_missing_device(fs_info, devid, 7200 dev_uuid, false); 7201 } 7202 7203 if (!device->bdev && 7204 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 7205 /* 7206 * This happens when a device that was properly set up 7207 * in the device info lists suddenly goes bad. 7208 * device->bdev is NULL, so we have to set the 7209 * BTRFS_DEV_STATE_MISSING bit here 7210 */ 7211 device->fs_devices->missing_devices++; 7212 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 7213 } 7214 7215 /* Move the device to its own fs_devices */ 7216 if (device->fs_devices != fs_devices) { 7217 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, 7218 &device->dev_state)); 7219 7220 list_move(&device->dev_list, &fs_devices->devices); 7221 device->fs_devices->num_devices--; 7222 fs_devices->num_devices++; 7223 7224 device->fs_devices->missing_devices--; 7225 fs_devices->missing_devices++; 7226 7227 device->fs_devices = fs_devices; 7228 } 7229 } 7230 7231 if (device->fs_devices != fs_info->fs_devices) { 7232 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); 7233 if (device->generation != 7234 btrfs_device_generation(leaf, dev_item)) 7235 return -EINVAL; 7236 } 7237 7238 fill_device_from_item(leaf, dev_item, device); 7239 if (device->bdev) { 7240 u64 max_total_bytes = i_size_read(device->bdev->bd_inode); 7241 7242 if (device->total_bytes > max_total_bytes) { 7243 btrfs_err(fs_info, 7244 "device total_bytes should be at most %llu but found %llu", 7245 max_total_bytes, device->total_bytes); 7246 return -EINVAL; 7247 } 7248 } 7249 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 7250 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 7251 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 7252 device->fs_devices->total_rw_bytes += device->total_bytes; 7253 atomic64_add(device->total_bytes - device->bytes_used, 7254 &fs_info->free_chunk_space); 7255 } 7256 ret = 0; 7257 return ret; 7258 } 7259 7260 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 7261 { 7262 struct btrfs_root *root = fs_info->tree_root; 7263 struct btrfs_super_block *super_copy = fs_info->super_copy; 7264 struct extent_buffer *sb; 7265 struct btrfs_disk_key *disk_key; 7266 struct btrfs_chunk *chunk; 7267 u8 *array_ptr; 7268 unsigned long sb_array_offset; 7269 int ret = 0; 7270 u32 num_stripes; 7271 u32 array_size; 7272 u32 len = 0; 7273 u32 cur_offset; 7274 u64 type; 7275 struct btrfs_key key; 7276 7277 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 7278 /* 7279 * This will create an extent buffer of nodesize; the
superblock size is 7280 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will 7281 * overallocate but we can keep it as-is, only the first page is used. 7282 */ 7283 sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET, 7284 root->root_key.objectid, 0); 7285 if (IS_ERR(sb)) 7286 return PTR_ERR(sb); 7287 set_extent_buffer_uptodate(sb); 7288 /* 7289 * The sb extent buffer is artificial and just used to read the system array. 7290 * The set_extent_buffer_uptodate() call does not properly mark all its 7291 * pages up-to-date when the page is larger: extent does not cover the 7292 * whole page and consequently check_page_uptodate does not find all 7293 * the page's extents up-to-date (the hole beyond sb), 7294 * write_extent_buffer then triggers a WARN_ON. 7295 * 7296 * Regular short extents go through the mark_extent_buffer_dirty/writeback cycle, 7297 * but sb spans only this function. Add an explicit SetPageUptodate call 7298 * to silence the warning e.g. on PowerPC 64. 7299 */ 7300 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 7301 SetPageUptodate(sb->pages[0]); 7302 7303 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 7304 array_size = btrfs_super_sys_array_size(super_copy); 7305 7306 array_ptr = super_copy->sys_chunk_array; 7307 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 7308 cur_offset = 0; 7309 7310 while (cur_offset < array_size) { 7311 disk_key = (struct btrfs_disk_key *)array_ptr; 7312 len = sizeof(*disk_key); 7313 if (cur_offset + len > array_size) 7314 goto out_short_read; 7315 7316 btrfs_disk_key_to_cpu(&key, disk_key); 7317 7318 array_ptr += len; 7319 sb_array_offset += len; 7320 cur_offset += len; 7321 7322 if (key.type != BTRFS_CHUNK_ITEM_KEY) { 7323 btrfs_err(fs_info, 7324 "unexpected item type %u in sys_array at offset %u", 7325 (u32)key.type, cur_offset); 7326 ret = -EIO; 7327 break; 7328 } 7329 7330 chunk = (struct btrfs_chunk *)sb_array_offset; 7331 /* 7332 * At least one btrfs_chunk with one stripe must be present, 7333 * exact stripe count check comes afterwards 7334 */ 7335 len = btrfs_chunk_item_size(1); 7336 if (cur_offset + len > array_size) 7337 goto out_short_read; 7338 7339 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 7340 if (!num_stripes) { 7341 btrfs_err(fs_info, 7342 "invalid number of stripes %u in sys_array at offset %u", 7343 num_stripes, cur_offset); 7344 ret = -EIO; 7345 break; 7346 } 7347 7348 type = btrfs_chunk_type(sb, chunk); 7349 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 7350 btrfs_err(fs_info, 7351 "invalid chunk type %llu in sys_array at offset %u", 7352 type, cur_offset); 7353 ret = -EIO; 7354 break; 7355 } 7356 7357 len = btrfs_chunk_item_size(num_stripes); 7358 if (cur_offset + len > array_size) 7359 goto out_short_read; 7360 7361 ret = read_one_chunk(&key, sb, chunk); 7362 if (ret) 7363 break; 7364 7365 array_ptr += len; 7366 sb_array_offset += len; 7367 cur_offset += len; 7368 } 7369 clear_extent_buffer_uptodate(sb); 7370 free_extent_buffer_stale(sb); 7371 return ret; 7372 7373 out_short_read: 7374 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 7375 len, cur_offset); 7376 clear_extent_buffer_uptodate(sb); 7377 free_extent_buffer_stale(sb); 7378 return -EIO; 7379 } 7380 7381 /* 7382 * Check if all chunks in the fs are OK for read-write degraded mount 7383 * 7384 * If the @failing_dev is specified, it's accounted as missing. 7385 * 7386 * Return true if all chunks meet the minimal RW mount requirements.
7387 * Return false if any chunk doesn't meet the minimal RW mount requirements. 7388 */ 7389 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 7390 struct btrfs_device *failing_dev) 7391 { 7392 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7393 struct extent_map *em; 7394 u64 next_start = 0; 7395 bool ret = true; 7396 7397 read_lock(&map_tree->lock); 7398 em = lookup_extent_mapping(map_tree, 0, (u64)-1); 7399 read_unlock(&map_tree->lock); 7400 /* No chunk at all? Return false anyway */ 7401 if (!em) { 7402 ret = false; 7403 goto out; 7404 } 7405 while (em) { 7406 struct map_lookup *map; 7407 int missing = 0; 7408 int max_tolerated; 7409 int i; 7410 7411 map = em->map_lookup; 7412 max_tolerated = 7413 btrfs_get_num_tolerated_disk_barrier_failures( 7414 map->type); 7415 for (i = 0; i < map->num_stripes; i++) { 7416 struct btrfs_device *dev = map->stripes[i].dev; 7417 7418 if (!dev || !dev->bdev || 7419 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 7420 dev->last_flush_error) 7421 missing++; 7422 else if (failing_dev && failing_dev == dev) 7423 missing++; 7424 } 7425 if (missing > max_tolerated) { 7426 if (!failing_dev) 7427 btrfs_warn(fs_info, 7428 "chunk %llu missing %d devices, max tolerance is %d for writable mount", 7429 em->start, missing, max_tolerated); 7430 free_extent_map(em); 7431 ret = false; 7432 goto out; 7433 } 7434 next_start = extent_map_end(em); 7435 free_extent_map(em); 7436 7437 read_lock(&map_tree->lock); 7438 em = lookup_extent_mapping(map_tree, next_start, 7439 (u64)(-1) - next_start); 7440 read_unlock(&map_tree->lock); 7441 } 7442 out: 7443 return ret; 7444 } 7445 7446 static void readahead_tree_node_children(struct extent_buffer *node) 7447 { 7448 int i; 7449 const int nr_items = btrfs_header_nritems(node); 7450 7451 for (i = 0; i < nr_items; i++) 7452 btrfs_readahead_node_child(node, i); 7453 } 7454 7455 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 7456 { 7457 struct btrfs_root *root = fs_info->chunk_root; 7458 struct btrfs_path *path; 7459 struct extent_buffer *leaf; 7460 struct btrfs_key key; 7461 struct btrfs_key found_key; 7462 int ret; 7463 int slot; 7464 u64 total_dev = 0; 7465 u64 last_ra_node = 0; 7466 7467 path = btrfs_alloc_path(); 7468 if (!path) 7469 return -ENOMEM; 7470 7471 /* 7472 * uuid_mutex is needed only if we are mounting a sprout FS 7473 * otherwise we don't need it. 7474 */ 7475 mutex_lock(&uuid_mutex); 7476 7477 /* 7478 * It is possible for mount and umount to race in such a way that 7479 * we execute this code path, but open_fs_devices failed to clear 7480 * total_rw_bytes. We certainly want it cleared before reading the 7481 * device items, so clear it here. 7482 */ 7483 fs_info->fs_devices->total_rw_bytes = 0; 7484 7485 /* 7486 * Read all device items, and then all the chunk items. All 7487 * device items are found before any chunk item (their object id 7488 * is smaller than the lowest possible object id for a chunk 7489 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 
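 * For example, device item keys look like (BTRFS_DEV_ITEMS_OBJECTID = 1,
 * BTRFS_DEV_ITEM_KEY, devid) while chunk item keys look like
 * (BTRFS_FIRST_CHUNK_TREE_OBJECTID = 256, BTRFS_CHUNK_ITEM_KEY, offset), so
 * the search below visits every device item before the first chunk item.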
7490 */ 7491 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 7492 key.offset = 0; 7493 key.type = 0; 7494 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7495 if (ret < 0) 7496 goto error; 7497 while (1) { 7498 struct extent_buffer *node; 7499 7500 leaf = path->nodes[0]; 7501 slot = path->slots[0]; 7502 if (slot >= btrfs_header_nritems(leaf)) { 7503 ret = btrfs_next_leaf(root, path); 7504 if (ret == 0) 7505 continue; 7506 if (ret < 0) 7507 goto error; 7508 break; 7509 } 7510 /* 7511 * The nodes on level 1 are not locked but we don't need to do 7512 * that during mount time as nothing else can access the tree 7513 */ 7514 node = path->nodes[1]; 7515 if (node) { 7516 if (last_ra_node != node->start) { 7517 readahead_tree_node_children(node); 7518 last_ra_node = node->start; 7519 } 7520 } 7521 btrfs_item_key_to_cpu(leaf, &found_key, slot); 7522 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 7523 struct btrfs_dev_item *dev_item; 7524 dev_item = btrfs_item_ptr(leaf, slot, 7525 struct btrfs_dev_item); 7526 ret = read_one_dev(leaf, dev_item); 7527 if (ret) 7528 goto error; 7529 total_dev++; 7530 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 7531 struct btrfs_chunk *chunk; 7532 7533 /* 7534 * We are only called at mount time, so no need to take 7535 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings, 7536 * we always lock first fs_info->chunk_mutex before 7537 * acquiring any locks on the chunk tree. This is a 7538 * requirement for chunk allocation, see the comment on 7539 * top of btrfs_chunk_alloc() for details. 7540 */ 7541 ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); 7542 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 7543 ret = read_one_chunk(&found_key, leaf, chunk); 7544 if (ret) 7545 goto error; 7546 } 7547 path->slots[0]++; 7548 } 7549 7550 /* 7551 * After loading chunk tree, we've got all device information, 7552 * do another round of validation checks. 
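 * (namely that the number of device items found matches the superblock's
 * num_devices, and that total_rw_bytes does not exceed super_total_bytes).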
7553 */ 7554 if (total_dev != fs_info->fs_devices->total_devices) { 7555 btrfs_err(fs_info, 7556 "super_num_devices %llu mismatch with num_devices %llu found here", 7557 btrfs_super_num_devices(fs_info->super_copy), 7558 total_dev); 7559 ret = -EINVAL; 7560 goto error; 7561 } 7562 if (btrfs_super_total_bytes(fs_info->super_copy) < 7563 fs_info->fs_devices->total_rw_bytes) { 7564 btrfs_err(fs_info, 7565 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", 7566 btrfs_super_total_bytes(fs_info->super_copy), 7567 fs_info->fs_devices->total_rw_bytes); 7568 ret = -EINVAL; 7569 goto error; 7570 } 7571 ret = 0; 7572 error: 7573 mutex_unlock(&uuid_mutex); 7574 7575 btrfs_free_path(path); 7576 return ret; 7577 } 7578 7579 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 7580 { 7581 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 7582 struct btrfs_device *device; 7583 7584 fs_devices->fs_info = fs_info; 7585 7586 mutex_lock(&fs_devices->device_list_mutex); 7587 list_for_each_entry(device, &fs_devices->devices, dev_list) 7588 device->fs_info = fs_info; 7589 7590 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 7591 list_for_each_entry(device, &seed_devs->devices, dev_list) 7592 device->fs_info = fs_info; 7593 7594 seed_devs->fs_info = fs_info; 7595 } 7596 mutex_unlock(&fs_devices->device_list_mutex); 7597 } 7598 7599 static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, 7600 const struct btrfs_dev_stats_item *ptr, 7601 int index) 7602 { 7603 u64 val; 7604 7605 read_extent_buffer(eb, &val, 7606 offsetof(struct btrfs_dev_stats_item, values) + 7607 ((unsigned long)ptr) + (index * sizeof(u64)), 7608 sizeof(val)); 7609 return val; 7610 } 7611 7612 static void btrfs_set_dev_stats_value(struct extent_buffer *eb, 7613 struct btrfs_dev_stats_item *ptr, 7614 int index, u64 val) 7615 { 7616 write_extent_buffer(eb, &val, 7617 offsetof(struct btrfs_dev_stats_item, values) + 7618 ((unsigned long)ptr) + (index * sizeof(u64)), 7619 sizeof(val)); 7620 } 7621 7622 static int btrfs_device_init_dev_stats(struct btrfs_device *device, 7623 struct btrfs_path *path) 7624 { 7625 struct btrfs_dev_stats_item *ptr; 7626 struct extent_buffer *eb; 7627 struct btrfs_key key; 7628 int item_size; 7629 int i, ret, slot; 7630 7631 if (!device->fs_info->dev_root) 7632 return 0; 7633 7634 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7635 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7636 key.offset = device->devid; 7637 ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); 7638 if (ret) { 7639 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7640 btrfs_dev_stat_set(device, i, 0); 7641 device->dev_stats_valid = 1; 7642 btrfs_release_path(path); 7643 return ret < 0 ? 
ret : 0; 7644 } 7645 slot = path->slots[0]; 7646 eb = path->nodes[0]; 7647 item_size = btrfs_item_size_nr(eb, slot); 7648 7649 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); 7650 7651 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7652 if (item_size >= (1 + i) * sizeof(__le64)) 7653 btrfs_dev_stat_set(device, i, 7654 btrfs_dev_stats_value(eb, ptr, i)); 7655 else 7656 btrfs_dev_stat_set(device, i, 0); 7657 } 7658 7659 device->dev_stats_valid = 1; 7660 btrfs_dev_stat_print_on_load(device); 7661 btrfs_release_path(path); 7662 7663 return 0; 7664 } 7665 7666 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 7667 { 7668 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 7669 struct btrfs_device *device; 7670 struct btrfs_path *path = NULL; 7671 int ret = 0; 7672 7673 path = btrfs_alloc_path(); 7674 if (!path) 7675 return -ENOMEM; 7676 7677 mutex_lock(&fs_devices->device_list_mutex); 7678 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7679 ret = btrfs_device_init_dev_stats(device, path); 7680 if (ret) 7681 goto out; 7682 } 7683 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 7684 list_for_each_entry(device, &seed_devs->devices, dev_list) { 7685 ret = btrfs_device_init_dev_stats(device, path); 7686 if (ret) 7687 goto out; 7688 } 7689 } 7690 out: 7691 mutex_unlock(&fs_devices->device_list_mutex); 7692 7693 btrfs_free_path(path); 7694 return ret; 7695 } 7696 7697 static int update_dev_stat_item(struct btrfs_trans_handle *trans, 7698 struct btrfs_device *device) 7699 { 7700 struct btrfs_fs_info *fs_info = trans->fs_info; 7701 struct btrfs_root *dev_root = fs_info->dev_root; 7702 struct btrfs_path *path; 7703 struct btrfs_key key; 7704 struct extent_buffer *eb; 7705 struct btrfs_dev_stats_item *ptr; 7706 int ret; 7707 int i; 7708 7709 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7710 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7711 key.offset = device->devid; 7712 7713 path = btrfs_alloc_path(); 7714 if (!path) 7715 return -ENOMEM; 7716 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 7717 if (ret < 0) { 7718 btrfs_warn_in_rcu(fs_info, 7719 "error %d while searching for dev_stats item for device %s", 7720 ret, rcu_str_deref(device->name)); 7721 goto out; 7722 } 7723 7724 if (ret == 0 && 7725 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 7726 /* need to delete old one and insert a new one */ 7727 ret = btrfs_del_item(trans, dev_root, path); 7728 if (ret != 0) { 7729 btrfs_warn_in_rcu(fs_info, 7730 "delete too small dev_stats item for device %s failed %d", 7731 rcu_str_deref(device->name), ret); 7732 goto out; 7733 } 7734 ret = 1; 7735 } 7736 7737 if (ret == 1) { 7738 /* need to insert a new item */ 7739 btrfs_release_path(path); 7740 ret = btrfs_insert_empty_item(trans, dev_root, path, 7741 &key, sizeof(*ptr)); 7742 if (ret < 0) { 7743 btrfs_warn_in_rcu(fs_info, 7744 "insert dev_stats item for device %s failed %d", 7745 rcu_str_deref(device->name), ret); 7746 goto out; 7747 } 7748 } 7749 7750 eb = path->nodes[0]; 7751 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 7752 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7753 btrfs_set_dev_stats_value(eb, ptr, i, 7754 btrfs_dev_stat_read(device, i)); 7755 btrfs_mark_buffer_dirty(eb); 7756 7757 out: 7758 btrfs_free_path(path); 7759 return ret; 7760 } 7761 7762 /* 7763 * called from commit_transaction. Writes all changed device stats to disk. 
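 * Only devices with a non-zero dev_stats_ccnt (i.e. counters that changed
 * since the last commit) are written back; see the check at the top of the
 * loop.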
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		ret = btrfs_device_init_dev_stats(device, path);
		if (ret)
			goto out;
	}
	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		list_for_each_entry(device, &seed_devs->devices, dev_list) {
			ret = btrfs_device_init_dev_stats(device, path);
			if (ret)
				goto out;
		}
	}
out:
	mutex_unlock(&fs_devices->device_list_mutex);

	btrfs_free_path(path);
	return ret;
}

static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* Need to delete the old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* Need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Called from commit_transaction. Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;

		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values, which
		 * requires reading the in-memory counters. Such control
		 * dependencies require explicit read memory barriers.
		 *
		 * This memory barrier pairs with smp_mb__before_atomic() in
		 * btrfs_dev_stat_inc()/btrfs_dev_stat_set() and with the full
		 * barrier implied by atomic_xchg() in
		 * btrfs_dev_stats_read_and_reset().
		 */
		smp_rmb();

		ret = update_dev_stat_item(trans, device);
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}
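
/*
 * Illustrative sketch only, mirroring the writer side that the smp_rmb()
 * in btrfs_run_dev_stats() pairs with (the real helpers,
 * btrfs_dev_stat_inc()/btrfs_dev_stat_set(), live in volumes.h; the field
 * layout is assumed from the dev_stats_ccnt usage above): the counter
 * update must be ordered before the dev_stats_ccnt update, so a reader
 * that observed a non-zero ccnt also observes counters at least that new.
 */
static void __maybe_unused example_dev_stat_writer(struct btrfs_device *dev,
						   int index)
{
	atomic_inc(dev->dev_stat_values + index);
	/* Pairs with the smp_rmb() in btrfs_run_dev_stats() */
	smp_mb__before_atomic();
	atomic_inc(&dev->dev_stats_ccnt);
}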
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			  rcu_str_deref(dev->name),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			  btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_set(dev, i, 0);
		}
		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
			   current->comm, task_pid_nr(current));
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}

/*
 * Update the size and bytes used for each device where it changed. This is
 * delayed since we would otherwise get errors while writing out the
 * superblocks.
 *
 * Must be invoked during transaction commit.
 */
void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
	struct btrfs_device *curr, *next;

	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);

	if (list_empty(&trans->dev_update_list))
		return;

	/*
	 * We don't need the device_list_mutex here. This list is owned by the
	 * transaction and the transaction must complete before the device is
	 * released.
	 */
	mutex_lock(&trans->fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
				 post_commit_list) {
		list_del_init(&curr->post_commit_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
		curr->commit_bytes_used = curr->bytes_used;
	}
	mutex_unlock(&trans->fs_info->chunk_mutex);
}

/*
 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
 */
int btrfs_bg_type_to_factor(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	return btrfs_raid_array[index].ncopies;
}
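
/*
 * Illustrative sketch only (hypothetical helper, not part of btrfs): a
 * typical use of the factor, converting a logical byte count into the raw
 * bytes it consumes. 1GiB of RAID1 data (ncopies == 2) occupies 2GiB of
 * raw disk space.
 */
static u64 __maybe_unused example_raw_bytes(u64 logical_bytes, u64 bg_flags)
{
	return logical_bytes * btrfs_bg_type_to_factor(bg_flags);
}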
static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
				 u64 chunk_offset, u64 devid,
				 u64 physical_offset, u64 physical_len)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	u64 stripe_len;
	bool found = false;
	int ret = 0;
	int i;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
		goto out;
	}

	map = em->map_lookup;
	stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
	if (physical_len != stripe_len) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
			  physical_offset, devid, em->start, physical_len,
			  stripe_len);
		ret = -EUCLEAN;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		if (map->stripes[i].dev->devid == devid &&
		    map->stripes[i].physical == physical_offset) {
			found = true;
			if (map->verified_stripes >= map->num_stripes) {
				btrfs_err(fs_info,
					"too many dev extents for chunk %llu found",
					  em->start);
				ret = -EUCLEAN;
				goto out;
			}
			map->verified_stripes++;
			break;
		}
	}
	if (!found) {
		btrfs_err(fs_info,
	"dev extent physical offset %llu devid %llu has no corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
	}

	/* Make sure no dev extent is beyond device boundary */
	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
	if (!dev) {
		btrfs_err(fs_info, "failed to find devid %llu", devid);
		ret = -EUCLEAN;
		goto out;
	}

	if (physical_offset + physical_len > dev->disk_total_bytes) {
		btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
			  devid, physical_offset, physical_len,
			  dev->disk_total_bytes);
		ret = -EUCLEAN;
		goto out;
	}

	if (dev->zone_info) {
		u64 zone_size = dev->zone_info->zone_size;

		if (!IS_ALIGNED(physical_offset, zone_size) ||
		    !IS_ALIGNED(physical_len, zone_size)) {
			btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
				  devid, physical_offset, physical_len);
			ret = -EUCLEAN;
			goto out;
		}
	}

out:
	free_extent_map(em);
	return ret;
}

static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct rb_node *node;
	int ret = 0;

	read_lock(&em_tree->lock);
	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
		em = rb_entry(node, struct extent_map, rb_node);
		if (em->map_lookup->num_stripes !=
		    em->map_lookup->verified_stripes) {
			btrfs_err(fs_info,
			"chunk %llu has missing dev extent, have %d expect %d",
				  em->start, em->map_lookup->verified_stripes,
				  em->map_lookup->num_stripes);
			ret = -EUCLEAN;
			goto out;
		}
	}
out:
	read_unlock(&em_tree->lock);
	return ret;
}
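
/*
 * Illustrative sketch only (hypothetical helper, assuming the
 * calc_stripe_length() semantics relied on above): every dev extent of a
 * chunk covers chunk_len / data_stripes bytes. A 2GiB RAID0 chunk across
 * two devices is backed by two 1GiB dev extents, while a RAID1 chunk is
 * backed by a full-size dev extent on each mirror (data_stripes == 1).
 */
static u64 __maybe_unused example_expected_dev_extent_len(u64 chunk_len,
							  int data_stripes)
{
	return div_u64(chunk_len, data_stripes);
}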
/*
 * Ensure that all dev extents are mapped to the correct chunk, otherwise
 * later chunk allocation/free would cause unexpected behavior.
 *
 * NOTE: This will iterate through the whole device tree, which should be
 * about the same size as the chunk tree. This slightly increases mount time.
 */
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	u64 prev_devid = 0;
	u64 prev_dev_ext_end = 0;
	int ret = 0;

	/*
	 * We don't have a dev_root because we mounted with ignorebadroots and
	 * failed to load the root, so we want to skip the verification in this
	 * case for sure.
	 *
	 * However if the dev root is fine, but the tree itself is corrupted
	 * we'd still fail to mount. This verification is only to make sure
	 * writes can happen safely, so instead just bypass this check
	 * completely in the case of IGNOREBADROOTS.
	 */
	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
		return 0;

	/* devids start at 1, so begin the scan there */
	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	while (1) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_dev_extent *dext;
		int slot = path->slots[0];
		u64 chunk_offset;
		u64 physical_offset;
		u64 physical_len;
		u64 devid;

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type != BTRFS_DEV_EXTENT_KEY)
			break;
		devid = key.objectid;
		physical_offset = key.offset;

		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
		physical_len = btrfs_dev_extent_length(leaf, dext);

		/* Check if this dev extent overlaps with the previous one */
		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
			btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
				  devid, physical_offset, prev_dev_ext_end);
			ret = -EUCLEAN;
			goto out;
		}

		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
					    physical_offset, physical_len);
		if (ret < 0)
			goto out;
		prev_devid = devid;
		prev_dev_ext_end = physical_offset + physical_len;

		ret = btrfs_next_item(root, path);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = 0;
			break;
		}
	}

	/* Ensure all chunks have corresponding dev extents */
	ret = verify_chunk_dev_extent_mapping(fs_info);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Check whether the given block group or device is pinned by any inode being
 * used as a swapfile.
 */
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
	struct btrfs_swapfile_pin *sp;
	struct rb_node *node;

	spin_lock(&fs_info->swapfile_pins_lock);
	node = fs_info->swapfile_pins.rb_node;
	while (node) {
		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (ptr < sp->ptr)
			node = node->rb_left;
		else if (ptr > sp->ptr)
			node = node->rb_right;
		else
			break;
	}
	spin_unlock(&fs_info->swapfile_pins_lock);
	return node != NULL;
}
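
/*
 * Illustrative sketch only (hypothetical wrapper, not part of btrfs): the
 * pin tree is keyed by the raw pointer value, so the same lookup answers
 * the question for a block group and a device alike.
 */
static bool __maybe_unused example_bg_pinned(struct btrfs_fs_info *fs_info,
					     struct btrfs_block_group *bg)
{
	return btrfs_pinned_by_swapfile(fs_info, bg);
}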
static int relocating_repair_kthread(void *data)
{
	struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 target;
	int ret = 0;

	target = cache->start;
	btrfs_put_block_group(cache);

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		btrfs_info(fs_info,
			   "zoned: skip relocating block group %llu to repair: EBUSY",
			   target);
		return -EBUSY;
	}

	mutex_lock(&fs_info->reclaim_bgs_lock);

	/* Ensure the block group still exists */
	cache = btrfs_lookup_block_group(fs_info, target);
	if (!cache)
		goto out;

	if (!cache->relocating_repair)
		goto out;

	ret = btrfs_may_alloc_data_chunk(fs_info, target);
	if (ret < 0)
		goto out;

	btrfs_info(fs_info,
		   "zoned: relocating block group %llu to repair IO failure",
		   target);
	ret = btrfs_relocate_chunk(fs_info, target);

out:
	if (cache)
		btrfs_put_block_group(cache);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_exclop_finish(fs_info);

	return ret;
}

int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
{
	struct btrfs_block_group *cache;

	/* Do not attempt to repair in degraded state */
	if (btrfs_test_opt(fs_info, DEGRADED))
		return 0;

	cache = btrfs_lookup_block_group(fs_info, logical);
	if (!cache)
		return 0;

	spin_lock(&cache->lock);
	if (cache->relocating_repair) {
		spin_unlock(&cache->lock);
		btrfs_put_block_group(cache);
		return 0;
	}
	cache->relocating_repair = 1;
	spin_unlock(&cache->lock);

	/* The kthread takes over the block group reference held here */
	kthread_run(relocating_repair_kthread, cache,
		    "btrfs-relocating-repair");

	return 0;
}