1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/sched/mm.h> 8 #include <linux/bio.h> 9 #include <linux/slab.h> 10 #include <linux/blkdev.h> 11 #include <linux/ratelimit.h> 12 #include <linux/kthread.h> 13 #include <linux/raid/pq.h> 14 #include <linux/semaphore.h> 15 #include <linux/uuid.h> 16 #include <linux/list_sort.h> 17 #include "misc.h" 18 #include "ctree.h" 19 #include "extent_map.h" 20 #include "disk-io.h" 21 #include "transaction.h" 22 #include "print-tree.h" 23 #include "volumes.h" 24 #include "raid56.h" 25 #include "async-thread.h" 26 #include "check-integrity.h" 27 #include "rcu-string.h" 28 #include "dev-replace.h" 29 #include "sysfs.h" 30 #include "tree-checker.h" 31 #include "space-info.h" 32 #include "block-group.h" 33 #include "discard.h" 34 #include "zoned.h" 35 36 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 37 [BTRFS_RAID_RAID10] = { 38 .sub_stripes = 2, 39 .dev_stripes = 1, 40 .devs_max = 0, /* 0 == as many as possible */ 41 .devs_min = 2, 42 .tolerated_failures = 1, 43 .devs_increment = 2, 44 .ncopies = 2, 45 .nparity = 0, 46 .raid_name = "raid10", 47 .bg_flag = BTRFS_BLOCK_GROUP_RAID10, 48 .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, 49 }, 50 [BTRFS_RAID_RAID1] = { 51 .sub_stripes = 1, 52 .dev_stripes = 1, 53 .devs_max = 2, 54 .devs_min = 2, 55 .tolerated_failures = 1, 56 .devs_increment = 2, 57 .ncopies = 2, 58 .nparity = 0, 59 .raid_name = "raid1", 60 .bg_flag = BTRFS_BLOCK_GROUP_RAID1, 61 .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, 62 }, 63 [BTRFS_RAID_RAID1C3] = { 64 .sub_stripes = 1, 65 .dev_stripes = 1, 66 .devs_max = 3, 67 .devs_min = 3, 68 .tolerated_failures = 2, 69 .devs_increment = 3, 70 .ncopies = 3, 71 .nparity = 0, 72 .raid_name = "raid1c3", 73 .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3, 74 .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, 75 }, 76 [BTRFS_RAID_RAID1C4] = { 77 .sub_stripes = 1, 78 .dev_stripes = 1, 79 .devs_max = 4, 80 .devs_min = 4, 81 .tolerated_failures = 3, 82 .devs_increment = 4, 83 .ncopies = 4, 84 .nparity = 0, 85 .raid_name = "raid1c4", 86 .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4, 87 .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, 88 }, 89 [BTRFS_RAID_DUP] = { 90 .sub_stripes = 1, 91 .dev_stripes = 2, 92 .devs_max = 1, 93 .devs_min = 1, 94 .tolerated_failures = 0, 95 .devs_increment = 1, 96 .ncopies = 2, 97 .nparity = 0, 98 .raid_name = "dup", 99 .bg_flag = BTRFS_BLOCK_GROUP_DUP, 100 .mindev_error = 0, 101 }, 102 [BTRFS_RAID_RAID0] = { 103 .sub_stripes = 1, 104 .dev_stripes = 1, 105 .devs_max = 0, 106 .devs_min = 1, 107 .tolerated_failures = 0, 108 .devs_increment = 1, 109 .ncopies = 1, 110 .nparity = 0, 111 .raid_name = "raid0", 112 .bg_flag = BTRFS_BLOCK_GROUP_RAID0, 113 .mindev_error = 0, 114 }, 115 [BTRFS_RAID_SINGLE] = { 116 .sub_stripes = 1, 117 .dev_stripes = 1, 118 .devs_max = 1, 119 .devs_min = 1, 120 .tolerated_failures = 0, 121 .devs_increment = 1, 122 .ncopies = 1, 123 .nparity = 0, 124 .raid_name = "single", 125 .bg_flag = 0, 126 .mindev_error = 0, 127 }, 128 [BTRFS_RAID_RAID5] = { 129 .sub_stripes = 1, 130 .dev_stripes = 1, 131 .devs_max = 0, 132 .devs_min = 2, 133 .tolerated_failures = 1, 134 .devs_increment = 1, 135 .ncopies = 1, 136 .nparity = 1, 137 .raid_name = "raid5", 138 .bg_flag = BTRFS_BLOCK_GROUP_RAID5, 139 .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, 140 }, 141 [BTRFS_RAID_RAID6] = { 142 .sub_stripes = 1, 143 .dev_stripes = 1, 144 .devs_max = 0, 145 .devs_min = 3, 
146 .tolerated_failures = 2, 147 .devs_increment = 1, 148 .ncopies = 1, 149 .nparity = 2, 150 .raid_name = "raid6", 151 .bg_flag = BTRFS_BLOCK_GROUP_RAID6, 152 .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, 153 }, 154 }; 155 156 /* 157 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which 158 * can be used as index to access btrfs_raid_array[]. 159 */ 160 enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) 161 { 162 if (flags & BTRFS_BLOCK_GROUP_RAID10) 163 return BTRFS_RAID_RAID10; 164 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 165 return BTRFS_RAID_RAID1; 166 else if (flags & BTRFS_BLOCK_GROUP_RAID1C3) 167 return BTRFS_RAID_RAID1C3; 168 else if (flags & BTRFS_BLOCK_GROUP_RAID1C4) 169 return BTRFS_RAID_RAID1C4; 170 else if (flags & BTRFS_BLOCK_GROUP_DUP) 171 return BTRFS_RAID_DUP; 172 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 173 return BTRFS_RAID_RAID0; 174 else if (flags & BTRFS_BLOCK_GROUP_RAID5) 175 return BTRFS_RAID_RAID5; 176 else if (flags & BTRFS_BLOCK_GROUP_RAID6) 177 return BTRFS_RAID_RAID6; 178 179 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 180 } 181 182 const char *btrfs_bg_type_to_raid_name(u64 flags) 183 { 184 const int index = btrfs_bg_flags_to_raid_index(flags); 185 186 if (index >= BTRFS_NR_RAID_TYPES) 187 return NULL; 188 189 return btrfs_raid_array[index].raid_name; 190 } 191 192 /* 193 * Fill @buf with textual description of @bg_flags, no more than @size_buf 194 * bytes including terminating null byte. 195 */ 196 void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) 197 { 198 int i; 199 int ret; 200 char *bp = buf; 201 u64 flags = bg_flags; 202 u32 size_bp = size_buf; 203 204 if (!flags) { 205 strcpy(bp, "NONE"); 206 return; 207 } 208 209 #define DESCRIBE_FLAG(flag, desc) \ 210 do { \ 211 if (flags & (flag)) { \ 212 ret = snprintf(bp, size_bp, "%s|", (desc)); \ 213 if (ret < 0 || ret >= size_bp) \ 214 goto out_overflow; \ 215 size_bp -= ret; \ 216 bp += ret; \ 217 flags &= ~(flag); \ 218 } \ 219 } while (0) 220 221 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); 222 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); 223 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); 224 225 DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); 226 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 227 DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, 228 btrfs_raid_array[i].raid_name); 229 #undef DESCRIBE_FLAG 230 231 if (flags) { 232 ret = snprintf(bp, size_bp, "0x%llx|", flags); 233 size_bp -= ret; 234 } 235 236 if (size_bp < size_buf) 237 buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ 238 239 /* 240 * The text is trimmed, it's up to the caller to provide sufficiently 241 * large buffer 242 */ 243 out_overflow:; 244 } 245 246 static int init_first_rw_device(struct btrfs_trans_handle *trans); 247 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); 248 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); 249 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); 250 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 251 enum btrfs_map_op op, 252 u64 logical, u64 *length, 253 struct btrfs_bio **bbio_ret, 254 int mirror_num, int need_raid_map); 255 256 /* 257 * Device locking 258 * ============== 259 * 260 * There are several mutexes that protect manipulation of devices and low-level 261 * structures like chunks but not block groups, extents or files 262 * 263 * uuid_mutex (global lock) 264 * ------------------------ 265 * protects the 
fs_uuids list that tracks all per-fs fs_devices, resulting from 266 * the SCAN_DEV ioctl registration or from mount either implicitly (the first 267 * device) or requested by the device= mount option 268 * 269 * the mutex can be very coarse and can cover long-running operations 270 * 271 * protects: updates to fs_devices counters like missing devices, rw devices, 272 * seeding, structure cloning, opening/closing devices at mount/umount time 273 * 274 * global::fs_devs - add, remove, updates to the global list 275 * 276 * does not protect: manipulation of the fs_devices::devices list in general 277 * but in mount context it could be used to exclude list modifications by eg. 278 * scan ioctl 279 * 280 * btrfs_device::name - renames (write side), read is RCU 281 * 282 * fs_devices::device_list_mutex (per-fs, with RCU) 283 * ------------------------------------------------ 284 * protects updates to fs_devices::devices, ie. adding and deleting 285 * 286 * simple list traversal with read-only actions can be done with RCU protection 287 * 288 * may be used to exclude some operations from running concurrently without any 289 * modifications to the list (see write_all_supers) 290 * 291 * Is not required at mount and close times, because our device list is 292 * protected by the uuid_mutex at that point. 293 * 294 * balance_mutex 295 * ------------- 296 * protects balance structures (status, state) and context accessed from 297 * several places (internally, ioctl) 298 * 299 * chunk_mutex 300 * ----------- 301 * protects chunks, adding or removing during allocation, trim or when a new 302 * device is added/removed. Additionally it also protects post_commit_list of 303 * individual devices, since they can be added to the transaction's 304 * post_commit_list only with chunk_mutex held. 305 * 306 * cleaner_mutex 307 * ------------- 308 * a big lock that is held by the cleaner thread and prevents running subvolume 309 * cleaning together with relocation or delayed iputs 310 * 311 * 312 * Lock nesting 313 * ============ 314 * 315 * uuid_mutex 316 * device_list_mutex 317 * chunk_mutex 318 * balance_mutex 319 * 320 * 321 * Exclusive operations 322 * ==================== 323 * 324 * Maintains the exclusivity of the following operations that apply to the 325 * whole filesystem and cannot run in parallel. 326 * 327 * - Balance (*) 328 * - Device add 329 * - Device remove 330 * - Device replace (*) 331 * - Resize 332 * 333 * The device operations (as above) can be in one of the following states: 334 * 335 * - Running state 336 * - Paused state 337 * - Completed state 338 * 339 * Only device operations marked with (*) can go into the Paused state for the 340 * following reasons: 341 * 342 * - ioctl (only Balance can be Paused through ioctl) 343 * - filesystem remounted as read-only 344 * - filesystem unmounted and mounted as read-only 345 * - system power-cycle and filesystem mounted as read-only 346 * - filesystem or device errors leading to forced read-only 347 * 348 * The status of exclusive operation is set and cleared atomically. 349 * During the course of Paused state, fs_info::exclusive_operation remains set. 350 * A device operation in Paused or Running state can be canceled or resumed 351 * either by ioctl (Balance only) or when remounted as read-write. 352 * The exclusive status is cleared when the device operation is canceled or 353 * completed. 
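 *
 * The currently running exclusive operation is recorded in
 * fs_info::exclusive_operation; it is set and cleared through
 * btrfs_exclop_start() and btrfs_exclop_finish().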
354 */ 355 356 DEFINE_MUTEX(uuid_mutex); 357 static LIST_HEAD(fs_uuids); 358 struct list_head * __attribute_const__ btrfs_get_fs_uuids(void) 359 { 360 return &fs_uuids; 361 } 362 363 /* 364 * alloc_fs_devices - allocate struct btrfs_fs_devices 365 * @fsid: if not NULL, copy the UUID to fs_devices::fsid 366 * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid 367 * 368 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). 369 * The returned struct is not linked onto any lists and can be destroyed with 370 * kfree() right away. 371 */ 372 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, 373 const u8 *metadata_fsid) 374 { 375 struct btrfs_fs_devices *fs_devs; 376 377 fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); 378 if (!fs_devs) 379 return ERR_PTR(-ENOMEM); 380 381 mutex_init(&fs_devs->device_list_mutex); 382 383 INIT_LIST_HEAD(&fs_devs->devices); 384 INIT_LIST_HEAD(&fs_devs->alloc_list); 385 INIT_LIST_HEAD(&fs_devs->fs_list); 386 INIT_LIST_HEAD(&fs_devs->seed_list); 387 if (fsid) 388 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); 389 390 if (metadata_fsid) 391 memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE); 392 else if (fsid) 393 memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); 394 395 return fs_devs; 396 } 397 398 void btrfs_free_device(struct btrfs_device *device) 399 { 400 WARN_ON(!list_empty(&device->post_commit_list)); 401 rcu_string_free(device->name); 402 extent_io_tree_release(&device->alloc_state); 403 bio_put(device->flush_bio); 404 btrfs_destroy_dev_zone_info(device); 405 kfree(device); 406 } 407 408 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 409 { 410 struct btrfs_device *device; 411 WARN_ON(fs_devices->opened); 412 while (!list_empty(&fs_devices->devices)) { 413 device = list_entry(fs_devices->devices.next, 414 struct btrfs_device, dev_list); 415 list_del(&device->dev_list); 416 btrfs_free_device(device); 417 } 418 kfree(fs_devices); 419 } 420 421 void __exit btrfs_cleanup_fs_uuids(void) 422 { 423 struct btrfs_fs_devices *fs_devices; 424 425 while (!list_empty(&fs_uuids)) { 426 fs_devices = list_entry(fs_uuids.next, 427 struct btrfs_fs_devices, fs_list); 428 list_del(&fs_devices->fs_list); 429 free_fs_devices(fs_devices); 430 } 431 } 432 433 static noinline struct btrfs_fs_devices *find_fsid( 434 const u8 *fsid, const u8 *metadata_fsid) 435 { 436 struct btrfs_fs_devices *fs_devices; 437 438 ASSERT(fsid); 439 440 /* Handle non-split brain cases */ 441 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 442 if (metadata_fsid) { 443 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0 444 && memcmp(metadata_fsid, fs_devices->metadata_uuid, 445 BTRFS_FSID_SIZE) == 0) 446 return fs_devices; 447 } else { 448 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 449 return fs_devices; 450 } 451 } 452 return NULL; 453 } 454 455 static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( 456 struct btrfs_super_block *disk_super) 457 { 458 459 struct btrfs_fs_devices *fs_devices; 460 461 /* 462 * Handle scanned device having completed its fsid change but 463 * belonging to a fs_devices that was created by first scanning 464 * a device which didn't have its fsid/metadata_uuid changed 465 * at all and the CHANGING_FSID_V2 flag set. 
466 */ 467 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 468 if (fs_devices->fsid_change && 469 memcmp(disk_super->metadata_uuid, fs_devices->fsid, 470 BTRFS_FSID_SIZE) == 0 && 471 memcmp(fs_devices->fsid, fs_devices->metadata_uuid, 472 BTRFS_FSID_SIZE) == 0) { 473 return fs_devices; 474 } 475 } 476 /* 477 * Handle scanned device having completed its fsid change but 478 * belonging to a fs_devices that was created by a device that 479 * has an outdated pair of fsid/metadata_uuid and 480 * CHANGING_FSID_V2 flag set. 481 */ 482 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 483 if (fs_devices->fsid_change && 484 memcmp(fs_devices->metadata_uuid, 485 fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && 486 memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid, 487 BTRFS_FSID_SIZE) == 0) { 488 return fs_devices; 489 } 490 } 491 492 return find_fsid(disk_super->fsid, disk_super->metadata_uuid); 493 } 494 495 496 static int 497 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, 498 int flush, struct block_device **bdev, 499 struct btrfs_super_block **disk_super) 500 { 501 int ret; 502 503 *bdev = blkdev_get_by_path(device_path, flags, holder); 504 505 if (IS_ERR(*bdev)) { 506 ret = PTR_ERR(*bdev); 507 goto error; 508 } 509 510 if (flush) 511 filemap_write_and_wait((*bdev)->bd_inode->i_mapping); 512 ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); 513 if (ret) { 514 blkdev_put(*bdev, flags); 515 goto error; 516 } 517 invalidate_bdev(*bdev); 518 *disk_super = btrfs_read_dev_super(*bdev); 519 if (IS_ERR(*disk_super)) { 520 ret = PTR_ERR(*disk_super); 521 blkdev_put(*bdev, flags); 522 goto error; 523 } 524 525 return 0; 526 527 error: 528 *bdev = NULL; 529 return ret; 530 } 531 532 static bool device_path_matched(const char *path, struct btrfs_device *device) 533 { 534 int found; 535 536 rcu_read_lock(); 537 found = strcmp(rcu_str_deref(device->name), path); 538 rcu_read_unlock(); 539 540 return found == 0; 541 } 542 543 /* 544 * Search and remove all stale (devices which are not mounted) devices. 545 * When both inputs are NULL, it will search and release all stale devices. 546 * path: Optional. When provided will it release all unmounted devices 547 * matching this path only. 548 * skip_dev: Optional. Will skip this device when searching for the stale 549 * devices. 550 * Return: 0 for success or if @path is NULL. 551 * -EBUSY if @path is a mounted device. 552 * -ENOENT if @path does not match any device in the list. 
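 *
 * Callers are expected to hold uuid_mutex, since the global fs_uuids list
 * is traversed and modified here.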
553 */ 554 static int btrfs_free_stale_devices(const char *path, 555 struct btrfs_device *skip_device) 556 { 557 struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; 558 struct btrfs_device *device, *tmp_device; 559 int ret = 0; 560 561 if (path) 562 ret = -ENOENT; 563 564 list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { 565 566 mutex_lock(&fs_devices->device_list_mutex); 567 list_for_each_entry_safe(device, tmp_device, 568 &fs_devices->devices, dev_list) { 569 if (skip_device && skip_device == device) 570 continue; 571 if (path && !device->name) 572 continue; 573 if (path && !device_path_matched(path, device)) 574 continue; 575 if (fs_devices->opened) { 576 /* for an already deleted device return 0 */ 577 if (path && ret != 0) 578 ret = -EBUSY; 579 break; 580 } 581 582 /* delete the stale device */ 583 fs_devices->num_devices--; 584 list_del(&device->dev_list); 585 btrfs_free_device(device); 586 587 ret = 0; 588 } 589 mutex_unlock(&fs_devices->device_list_mutex); 590 591 if (fs_devices->num_devices == 0) { 592 btrfs_sysfs_remove_fsid(fs_devices); 593 list_del(&fs_devices->fs_list); 594 free_fs_devices(fs_devices); 595 } 596 } 597 598 return ret; 599 } 600 601 /* 602 * This is only used on mount, and we are protected from competing things 603 * messing with our fs_devices by the uuid_mutex, thus we do not need the 604 * fs_devices->device_list_mutex here. 605 */ 606 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, 607 struct btrfs_device *device, fmode_t flags, 608 void *holder) 609 { 610 struct request_queue *q; 611 struct block_device *bdev; 612 struct btrfs_super_block *disk_super; 613 u64 devid; 614 int ret; 615 616 if (device->bdev) 617 return -EINVAL; 618 if (!device->name) 619 return -EINVAL; 620 621 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, 622 &bdev, &disk_super); 623 if (ret) 624 return ret; 625 626 devid = btrfs_stack_device_id(&disk_super->dev_item); 627 if (devid != device->devid) 628 goto error_free_page; 629 630 if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) 631 goto error_free_page; 632 633 device->generation = btrfs_super_generation(disk_super); 634 635 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 636 if (btrfs_super_incompat_flags(disk_super) & 637 BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { 638 pr_err( 639 "BTRFS: Invalid seeding and uuid-changed device detected\n"); 640 goto error_free_page; 641 } 642 643 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 644 fs_devices->seeding = true; 645 } else { 646 if (bdev_read_only(bdev)) 647 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 648 else 649 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 650 } 651 652 q = bdev_get_queue(bdev); 653 if (!blk_queue_nonrot(q)) 654 fs_devices->rotating = true; 655 656 device->bdev = bdev; 657 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 658 device->mode = flags; 659 660 fs_devices->open_devices++; 661 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 662 device->devid != BTRFS_DEV_REPLACE_DEVID) { 663 fs_devices->rw_devices++; 664 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); 665 } 666 btrfs_release_disk_super(disk_super); 667 668 return 0; 669 670 error_free_page: 671 btrfs_release_disk_super(disk_super); 672 blkdev_put(bdev, flags); 673 674 return -EINVAL; 675 } 676 677 /* 678 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices 679 * being created with a disk that has already 
completed its fsid change. Such
 * disk can belong to an fs which has its FSID changed or to one which doesn't.
 * Handle both cases here.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, NULL);
}

static struct btrfs_fs_devices *find_fsid_changed(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handles the case where the scanned device is part of an fs that had
	 * multiple successful changes of FSID but the currently scanned device
	 * didn't observe it. Meaning our fsid will be different than theirs.
	 * We need to handle two subcases:
	 *  1 - The fs still continues to have different METADATA/FSID uuids.
	 *  2 - The fs is switched back to its original FSID (METADATA/FSID
	 *      are equal).
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		/* Changed UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->fsid,
			   BTRFS_FSID_SIZE) != 0)
			return fs_devices;

		/* Unchanged UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}

	return NULL;
}

static struct btrfs_fs_devices *find_fsid_reverted_metadata(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle the case where the scanned device is part of an fs whose last
	 * metadata UUID change reverted it to the original FSID. At the same
	 * time fs_devices was first created by another constituent device
	 * which didn't fully observe the operation. This results in a
	 * btrfs_fs_devices created with metadata/fsid different AND
	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
	 * fs_devices equal to the FSID of the disk.
749 */ 750 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 751 if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid, 752 BTRFS_FSID_SIZE) != 0 && 753 memcmp(fs_devices->metadata_uuid, disk_super->fsid, 754 BTRFS_FSID_SIZE) == 0 && 755 fs_devices->fsid_change) 756 return fs_devices; 757 } 758 759 return NULL; 760 } 761 /* 762 * Add new device to list of registered devices 763 * 764 * Returns: 765 * device pointer which was just added or updated when successful 766 * error pointer when failed 767 */ 768 static noinline struct btrfs_device *device_list_add(const char *path, 769 struct btrfs_super_block *disk_super, 770 bool *new_device_added) 771 { 772 struct btrfs_device *device; 773 struct btrfs_fs_devices *fs_devices = NULL; 774 struct rcu_string *name; 775 u64 found_transid = btrfs_super_generation(disk_super); 776 u64 devid = btrfs_stack_device_id(&disk_super->dev_item); 777 bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & 778 BTRFS_FEATURE_INCOMPAT_METADATA_UUID); 779 bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & 780 BTRFS_SUPER_FLAG_CHANGING_FSID_V2); 781 782 if (fsid_change_in_progress) { 783 if (!has_metadata_uuid) 784 fs_devices = find_fsid_inprogress(disk_super); 785 else 786 fs_devices = find_fsid_changed(disk_super); 787 } else if (has_metadata_uuid) { 788 fs_devices = find_fsid_with_metadata_uuid(disk_super); 789 } else { 790 fs_devices = find_fsid_reverted_metadata(disk_super); 791 if (!fs_devices) 792 fs_devices = find_fsid(disk_super->fsid, NULL); 793 } 794 795 796 if (!fs_devices) { 797 if (has_metadata_uuid) 798 fs_devices = alloc_fs_devices(disk_super->fsid, 799 disk_super->metadata_uuid); 800 else 801 fs_devices = alloc_fs_devices(disk_super->fsid, NULL); 802 803 if (IS_ERR(fs_devices)) 804 return ERR_CAST(fs_devices); 805 806 fs_devices->fsid_change = fsid_change_in_progress; 807 808 mutex_lock(&fs_devices->device_list_mutex); 809 list_add(&fs_devices->fs_list, &fs_uuids); 810 811 device = NULL; 812 } else { 813 mutex_lock(&fs_devices->device_list_mutex); 814 device = btrfs_find_device(fs_devices, devid, 815 disk_super->dev_item.uuid, NULL); 816 817 /* 818 * If this disk has been pulled into an fs devices created by 819 * a device which had the CHANGING_FSID_V2 flag then replace the 820 * metadata_uuid/fsid values of the fs_devices. 
		 */
		if (fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
					BTRFS_FSID_SIZE);

			if (has_metadata_uuid)
				memcpy(fs_devices->metadata_uuid,
				       disk_super->metadata_uuid,
				       BTRFS_FSID_SIZE);
			else
				memcpy(fs_devices->metadata_uuid,
				       disk_super->fsid, BTRFS_FSID_SIZE);

			fs_devices->fsid_change = false;
		}
	}

	if (!device) {
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info(
	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
		else
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at the time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         a different name, or
		 *      b. The missing-disk-which-was-replaced has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further, in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transactions when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted. We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is, if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with the same uuid and devid. We keep the
			 * one with the larger generation number or the
			 * last-in if the generations are equal.
912 */ 913 mutex_unlock(&fs_devices->device_list_mutex); 914 return ERR_PTR(-EEXIST); 915 } 916 917 /* 918 * We are going to replace the device path for a given devid, 919 * make sure it's the same device if the device is mounted 920 */ 921 if (device->bdev) { 922 int error; 923 dev_t path_dev; 924 925 error = lookup_bdev(path, &path_dev); 926 if (error) { 927 mutex_unlock(&fs_devices->device_list_mutex); 928 return ERR_PTR(error); 929 } 930 931 if (device->bdev->bd_dev != path_dev) { 932 mutex_unlock(&fs_devices->device_list_mutex); 933 /* 934 * device->fs_info may not be reliable here, so 935 * pass in a NULL instead. This avoids a 936 * possible use-after-free when the fs_info and 937 * fs_info->sb are already torn down. 938 */ 939 btrfs_warn_in_rcu(NULL, 940 "duplicate device %s devid %llu generation %llu scanned by %s (%d)", 941 path, devid, found_transid, 942 current->comm, 943 task_pid_nr(current)); 944 return ERR_PTR(-EEXIST); 945 } 946 btrfs_info_in_rcu(device->fs_info, 947 "devid %llu device path %s changed to %s scanned by %s (%d)", 948 devid, rcu_str_deref(device->name), 949 path, current->comm, 950 task_pid_nr(current)); 951 } 952 953 name = rcu_string_strdup(path, GFP_NOFS); 954 if (!name) { 955 mutex_unlock(&fs_devices->device_list_mutex); 956 return ERR_PTR(-ENOMEM); 957 } 958 rcu_string_free(device->name); 959 rcu_assign_pointer(device->name, name); 960 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 961 fs_devices->missing_devices--; 962 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 963 } 964 } 965 966 /* 967 * Unmount does not free the btrfs_device struct but would zero 968 * generation along with most of the other members. So just update 969 * it back. We need it to pick the disk with largest generation 970 * (as above). 971 */ 972 if (!fs_devices->opened) { 973 device->generation = found_transid; 974 fs_devices->latest_generation = max_t(u64, found_transid, 975 fs_devices->latest_generation); 976 } 977 978 fs_devices->total_devices = btrfs_super_num_devices(disk_super); 979 980 mutex_unlock(&fs_devices->device_list_mutex); 981 return device; 982 } 983 984 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 985 { 986 struct btrfs_fs_devices *fs_devices; 987 struct btrfs_device *device; 988 struct btrfs_device *orig_dev; 989 int ret = 0; 990 991 fs_devices = alloc_fs_devices(orig->fsid, NULL); 992 if (IS_ERR(fs_devices)) 993 return fs_devices; 994 995 mutex_lock(&orig->device_list_mutex); 996 fs_devices->total_devices = orig->total_devices; 997 998 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 999 struct rcu_string *name; 1000 1001 device = btrfs_alloc_device(NULL, &orig_dev->devid, 1002 orig_dev->uuid); 1003 if (IS_ERR(device)) { 1004 ret = PTR_ERR(device); 1005 goto error; 1006 } 1007 1008 /* 1009 * This is ok to do without rcu read locked because we hold the 1010 * uuid mutex so nothing we touch in here is going to disappear. 
1011 */ 1012 if (orig_dev->name) { 1013 name = rcu_string_strdup(orig_dev->name->str, 1014 GFP_KERNEL); 1015 if (!name) { 1016 btrfs_free_device(device); 1017 ret = -ENOMEM; 1018 goto error; 1019 } 1020 rcu_assign_pointer(device->name, name); 1021 } 1022 1023 list_add(&device->dev_list, &fs_devices->devices); 1024 device->fs_devices = fs_devices; 1025 fs_devices->num_devices++; 1026 } 1027 mutex_unlock(&orig->device_list_mutex); 1028 return fs_devices; 1029 error: 1030 mutex_unlock(&orig->device_list_mutex); 1031 free_fs_devices(fs_devices); 1032 return ERR_PTR(ret); 1033 } 1034 1035 static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, 1036 struct btrfs_device **latest_dev) 1037 { 1038 struct btrfs_device *device, *next; 1039 1040 /* This is the initialized path, it is safe to release the devices. */ 1041 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 1042 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { 1043 if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 1044 &device->dev_state) && 1045 !test_bit(BTRFS_DEV_STATE_MISSING, 1046 &device->dev_state) && 1047 (!*latest_dev || 1048 device->generation > (*latest_dev)->generation)) { 1049 *latest_dev = device; 1050 } 1051 continue; 1052 } 1053 1054 /* 1055 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID, 1056 * in btrfs_init_dev_replace() so just continue. 1057 */ 1058 if (device->devid == BTRFS_DEV_REPLACE_DEVID) 1059 continue; 1060 1061 if (device->bdev) { 1062 blkdev_put(device->bdev, device->mode); 1063 device->bdev = NULL; 1064 fs_devices->open_devices--; 1065 } 1066 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 1067 list_del_init(&device->dev_alloc_list); 1068 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 1069 fs_devices->rw_devices--; 1070 } 1071 list_del_init(&device->dev_list); 1072 fs_devices->num_devices--; 1073 btrfs_free_device(device); 1074 } 1075 1076 } 1077 1078 /* 1079 * After we have read the system tree and know devids belonging to this 1080 * filesystem, remove the device which does not belong there. 
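 *
 * Devices without BTRFS_DEV_STATE_IN_FS_METADATA set are closed and freed
 * here; of the remaining devices, the one with the highest generation is
 * recorded as fs_devices::latest_bdev.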
1081 */ 1082 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices) 1083 { 1084 struct btrfs_device *latest_dev = NULL; 1085 struct btrfs_fs_devices *seed_dev; 1086 1087 mutex_lock(&uuid_mutex); 1088 __btrfs_free_extra_devids(fs_devices, &latest_dev); 1089 1090 list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) 1091 __btrfs_free_extra_devids(seed_dev, &latest_dev); 1092 1093 fs_devices->latest_bdev = latest_dev->bdev; 1094 1095 mutex_unlock(&uuid_mutex); 1096 } 1097 1098 static void btrfs_close_bdev(struct btrfs_device *device) 1099 { 1100 if (!device->bdev) 1101 return; 1102 1103 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 1104 sync_blockdev(device->bdev); 1105 invalidate_bdev(device->bdev); 1106 } 1107 1108 blkdev_put(device->bdev, device->mode); 1109 } 1110 1111 static void btrfs_close_one_device(struct btrfs_device *device) 1112 { 1113 struct btrfs_fs_devices *fs_devices = device->fs_devices; 1114 1115 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 1116 device->devid != BTRFS_DEV_REPLACE_DEVID) { 1117 list_del_init(&device->dev_alloc_list); 1118 fs_devices->rw_devices--; 1119 } 1120 1121 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 1122 fs_devices->missing_devices--; 1123 1124 btrfs_close_bdev(device); 1125 if (device->bdev) { 1126 fs_devices->open_devices--; 1127 device->bdev = NULL; 1128 } 1129 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 1130 btrfs_destroy_dev_zone_info(device); 1131 1132 device->fs_info = NULL; 1133 atomic_set(&device->dev_stats_ccnt, 0); 1134 extent_io_tree_release(&device->alloc_state); 1135 1136 /* Verify the device is back in a pristine state */ 1137 ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); 1138 ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); 1139 ASSERT(list_empty(&device->dev_alloc_list)); 1140 ASSERT(list_empty(&device->post_commit_list)); 1141 ASSERT(atomic_read(&device->reada_in_flight) == 0); 1142 } 1143 1144 static void close_fs_devices(struct btrfs_fs_devices *fs_devices) 1145 { 1146 struct btrfs_device *device, *tmp; 1147 1148 lockdep_assert_held(&uuid_mutex); 1149 1150 if (--fs_devices->opened > 0) 1151 return; 1152 1153 list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) 1154 btrfs_close_one_device(device); 1155 1156 WARN_ON(fs_devices->open_devices); 1157 WARN_ON(fs_devices->rw_devices); 1158 fs_devices->opened = 0; 1159 fs_devices->seeding = false; 1160 fs_devices->fs_info = NULL; 1161 } 1162 1163 void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 1164 { 1165 LIST_HEAD(list); 1166 struct btrfs_fs_devices *tmp; 1167 1168 mutex_lock(&uuid_mutex); 1169 close_fs_devices(fs_devices); 1170 if (!fs_devices->opened) 1171 list_splice_init(&fs_devices->seed_list, &list); 1172 1173 list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { 1174 close_fs_devices(fs_devices); 1175 list_del(&fs_devices->seed_list); 1176 free_fs_devices(fs_devices); 1177 } 1178 mutex_unlock(&uuid_mutex); 1179 } 1180 1181 static int open_fs_devices(struct btrfs_fs_devices *fs_devices, 1182 fmode_t flags, void *holder) 1183 { 1184 struct btrfs_device *device; 1185 struct btrfs_device *latest_dev = NULL; 1186 struct btrfs_device *tmp_device; 1187 1188 flags |= FMODE_EXCL; 1189 1190 list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, 1191 dev_list) { 1192 int ret; 1193 1194 ret = btrfs_open_one_device(fs_devices, device, flags, holder); 1195 if (ret == 0 && 1196 (!latest_dev || device->generation > 
latest_dev->generation)) { 1197 latest_dev = device; 1198 } else if (ret == -ENODATA) { 1199 fs_devices->num_devices--; 1200 list_del(&device->dev_list); 1201 btrfs_free_device(device); 1202 } 1203 } 1204 if (fs_devices->open_devices == 0) 1205 return -EINVAL; 1206 1207 fs_devices->opened = 1; 1208 fs_devices->latest_bdev = latest_dev->bdev; 1209 fs_devices->total_rw_bytes = 0; 1210 fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; 1211 fs_devices->read_policy = BTRFS_READ_POLICY_PID; 1212 1213 return 0; 1214 } 1215 1216 static int devid_cmp(void *priv, const struct list_head *a, 1217 const struct list_head *b) 1218 { 1219 const struct btrfs_device *dev1, *dev2; 1220 1221 dev1 = list_entry(a, struct btrfs_device, dev_list); 1222 dev2 = list_entry(b, struct btrfs_device, dev_list); 1223 1224 if (dev1->devid < dev2->devid) 1225 return -1; 1226 else if (dev1->devid > dev2->devid) 1227 return 1; 1228 return 0; 1229 } 1230 1231 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 1232 fmode_t flags, void *holder) 1233 { 1234 int ret; 1235 1236 lockdep_assert_held(&uuid_mutex); 1237 /* 1238 * The device_list_mutex cannot be taken here in case opening the 1239 * underlying device takes further locks like open_mutex. 1240 * 1241 * We also don't need the lock here as this is called during mount and 1242 * exclusion is provided by uuid_mutex 1243 */ 1244 1245 if (fs_devices->opened) { 1246 fs_devices->opened++; 1247 ret = 0; 1248 } else { 1249 list_sort(NULL, &fs_devices->devices, devid_cmp); 1250 ret = open_fs_devices(fs_devices, flags, holder); 1251 } 1252 1253 return ret; 1254 } 1255 1256 void btrfs_release_disk_super(struct btrfs_super_block *super) 1257 { 1258 struct page *page = virt_to_page(super); 1259 1260 put_page(page); 1261 } 1262 1263 static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, 1264 u64 bytenr, u64 bytenr_orig) 1265 { 1266 struct btrfs_super_block *disk_super; 1267 struct page *page; 1268 void *p; 1269 pgoff_t index; 1270 1271 /* make sure our super fits in the device */ 1272 if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) 1273 return ERR_PTR(-EINVAL); 1274 1275 /* make sure our super fits in the page */ 1276 if (sizeof(*disk_super) > PAGE_SIZE) 1277 return ERR_PTR(-EINVAL); 1278 1279 /* make sure our super doesn't straddle pages on disk */ 1280 index = bytenr >> PAGE_SHIFT; 1281 if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) 1282 return ERR_PTR(-EINVAL); 1283 1284 /* pull in the page with our super */ 1285 page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL); 1286 1287 if (IS_ERR(page)) 1288 return ERR_CAST(page); 1289 1290 p = page_address(page); 1291 1292 /* align our pointer to the offset of the super block */ 1293 disk_super = p + offset_in_page(bytenr); 1294 1295 if (btrfs_super_bytenr(disk_super) != bytenr_orig || 1296 btrfs_super_magic(disk_super) != BTRFS_MAGIC) { 1297 btrfs_release_disk_super(p); 1298 return ERR_PTR(-EINVAL); 1299 } 1300 1301 if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) 1302 disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; 1303 1304 return disk_super; 1305 } 1306 1307 int btrfs_forget_devices(const char *path) 1308 { 1309 int ret; 1310 1311 mutex_lock(&uuid_mutex); 1312 ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL); 1313 mutex_unlock(&uuid_mutex); 1314 1315 return ret; 1316 } 1317 1318 /* 1319 * Look for a btrfs signature on a device. 
This may be called out of the mount path 1320 * and we are not allowed to call set_blocksize during the scan. The superblock 1321 * is read via pagecache 1322 */ 1323 struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, 1324 void *holder) 1325 { 1326 struct btrfs_super_block *disk_super; 1327 bool new_device_added = false; 1328 struct btrfs_device *device = NULL; 1329 struct block_device *bdev; 1330 u64 bytenr, bytenr_orig; 1331 int ret; 1332 1333 lockdep_assert_held(&uuid_mutex); 1334 1335 /* 1336 * we would like to check all the supers, but that would make 1337 * a btrfs mount succeed after a mkfs from a different FS. 1338 * So, we need to add a special mount option to scan for 1339 * later supers, using BTRFS_SUPER_MIRROR_MAX instead 1340 */ 1341 flags |= FMODE_EXCL; 1342 1343 bdev = blkdev_get_by_path(path, flags, holder); 1344 if (IS_ERR(bdev)) 1345 return ERR_CAST(bdev); 1346 1347 bytenr_orig = btrfs_sb_offset(0); 1348 ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr); 1349 if (ret) 1350 return ERR_PTR(ret); 1351 1352 disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig); 1353 if (IS_ERR(disk_super)) { 1354 device = ERR_CAST(disk_super); 1355 goto error_bdev_put; 1356 } 1357 1358 device = device_list_add(path, disk_super, &new_device_added); 1359 if (!IS_ERR(device)) { 1360 if (new_device_added) 1361 btrfs_free_stale_devices(path, device); 1362 } 1363 1364 btrfs_release_disk_super(disk_super); 1365 1366 error_bdev_put: 1367 blkdev_put(bdev, flags); 1368 1369 return device; 1370 } 1371 1372 /* 1373 * Try to find a chunk that intersects [start, start + len] range and when one 1374 * such is found, record the end of it in *start 1375 */ 1376 static bool contains_pending_extent(struct btrfs_device *device, u64 *start, 1377 u64 len) 1378 { 1379 u64 physical_start, physical_end; 1380 1381 lockdep_assert_held(&device->fs_info->chunk_mutex); 1382 1383 if (!find_first_extent_bit(&device->alloc_state, *start, 1384 &physical_start, &physical_end, 1385 CHUNK_ALLOCATED, NULL)) { 1386 1387 if (in_range(physical_start, *start, len) || 1388 in_range(*start, physical_start, 1389 physical_end - physical_start)) { 1390 *start = physical_end + 1; 1391 return true; 1392 } 1393 } 1394 return false; 1395 } 1396 1397 static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) 1398 { 1399 switch (device->fs_devices->chunk_alloc_policy) { 1400 case BTRFS_CHUNK_ALLOC_REGULAR: 1401 /* 1402 * We don't want to overwrite the superblock on the drive nor 1403 * any area used by the boot loader (grub for example), so we 1404 * make sure to start at an offset of at least 1MB. 1405 */ 1406 return max_t(u64, start, SZ_1M); 1407 case BTRFS_CHUNK_ALLOC_ZONED: 1408 /* 1409 * We don't care about the starting region like regular 1410 * allocator, because we anyway use/reserve the first two zones 1411 * for superblock logging. 
1412 */ 1413 return ALIGN(start, device->zone_info->zone_size); 1414 default: 1415 BUG(); 1416 } 1417 } 1418 1419 static bool dev_extent_hole_check_zoned(struct btrfs_device *device, 1420 u64 *hole_start, u64 *hole_size, 1421 u64 num_bytes) 1422 { 1423 u64 zone_size = device->zone_info->zone_size; 1424 u64 pos; 1425 int ret; 1426 bool changed = false; 1427 1428 ASSERT(IS_ALIGNED(*hole_start, zone_size)); 1429 1430 while (*hole_size > 0) { 1431 pos = btrfs_find_allocatable_zones(device, *hole_start, 1432 *hole_start + *hole_size, 1433 num_bytes); 1434 if (pos != *hole_start) { 1435 *hole_size = *hole_start + *hole_size - pos; 1436 *hole_start = pos; 1437 changed = true; 1438 if (*hole_size < num_bytes) 1439 break; 1440 } 1441 1442 ret = btrfs_ensure_empty_zones(device, pos, num_bytes); 1443 1444 /* Range is ensured to be empty */ 1445 if (!ret) 1446 return changed; 1447 1448 /* Given hole range was invalid (outside of device) */ 1449 if (ret == -ERANGE) { 1450 *hole_start += *hole_size; 1451 *hole_size = 0; 1452 return true; 1453 } 1454 1455 *hole_start += zone_size; 1456 *hole_size -= zone_size; 1457 changed = true; 1458 } 1459 1460 return changed; 1461 } 1462 1463 /** 1464 * dev_extent_hole_check - check if specified hole is suitable for allocation 1465 * @device: the device which we have the hole 1466 * @hole_start: starting position of the hole 1467 * @hole_size: the size of the hole 1468 * @num_bytes: the size of the free space that we need 1469 * 1470 * This function may modify @hole_start and @hole_size to reflect the suitable 1471 * position for allocation. Returns 1 if hole position is updated, 0 otherwise. 1472 */ 1473 static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, 1474 u64 *hole_size, u64 num_bytes) 1475 { 1476 bool changed = false; 1477 u64 hole_end = *hole_start + *hole_size; 1478 1479 for (;;) { 1480 /* 1481 * Check before we set max_hole_start, otherwise we could end up 1482 * sending back this offset anyway. 1483 */ 1484 if (contains_pending_extent(device, hole_start, *hole_size)) { 1485 if (hole_end >= *hole_start) 1486 *hole_size = hole_end - *hole_start; 1487 else 1488 *hole_size = 0; 1489 changed = true; 1490 } 1491 1492 switch (device->fs_devices->chunk_alloc_policy) { 1493 case BTRFS_CHUNK_ALLOC_REGULAR: 1494 /* No extra check */ 1495 break; 1496 case BTRFS_CHUNK_ALLOC_ZONED: 1497 if (dev_extent_hole_check_zoned(device, hole_start, 1498 hole_size, num_bytes)) { 1499 changed = true; 1500 /* 1501 * The changed hole can contain pending extent. 1502 * Loop again to check that. 1503 */ 1504 continue; 1505 } 1506 break; 1507 default: 1508 BUG(); 1509 } 1510 1511 break; 1512 } 1513 1514 return changed; 1515 } 1516 1517 /* 1518 * find_free_dev_extent_start - find free space in the specified device 1519 * @device: the device which we search the free space in 1520 * @num_bytes: the size of the free space that we need 1521 * @search_start: the position from which to begin the search 1522 * @start: store the start of the free space. 1523 * @len: the size of the free space. that we find, or the size 1524 * of the max free space if we don't find suitable free space 1525 * 1526 * this uses a pretty simple search, the expectation is that it is 1527 * called very infrequently and that a given device has a small number 1528 * of extents 1529 * 1530 * @start is used to store the start of the free space if we find. But if we 1531 * don't find suitable free space, it will be used to store the start position 1532 * of the max free space. 
1533 * 1534 * @len is used to store the size of the free space that we find. 1535 * But if we don't find suitable free space, it is used to store the size of 1536 * the max free space. 1537 * 1538 * NOTE: This function will search *commit* root of device tree, and does extra 1539 * check to ensure dev extents are not double allocated. 1540 * This makes the function safe to allocate dev extents but may not report 1541 * correct usable device space, as device extent freed in current transaction 1542 * is not reported as available. 1543 */ 1544 static int find_free_dev_extent_start(struct btrfs_device *device, 1545 u64 num_bytes, u64 search_start, u64 *start, 1546 u64 *len) 1547 { 1548 struct btrfs_fs_info *fs_info = device->fs_info; 1549 struct btrfs_root *root = fs_info->dev_root; 1550 struct btrfs_key key; 1551 struct btrfs_dev_extent *dev_extent; 1552 struct btrfs_path *path; 1553 u64 hole_size; 1554 u64 max_hole_start; 1555 u64 max_hole_size; 1556 u64 extent_end; 1557 u64 search_end = device->total_bytes; 1558 int ret; 1559 int slot; 1560 struct extent_buffer *l; 1561 1562 search_start = dev_extent_search_start(device, search_start); 1563 1564 WARN_ON(device->zone_info && 1565 !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); 1566 1567 path = btrfs_alloc_path(); 1568 if (!path) 1569 return -ENOMEM; 1570 1571 max_hole_start = search_start; 1572 max_hole_size = 0; 1573 1574 again: 1575 if (search_start >= search_end || 1576 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 1577 ret = -ENOSPC; 1578 goto out; 1579 } 1580 1581 path->reada = READA_FORWARD; 1582 path->search_commit_root = 1; 1583 path->skip_locking = 1; 1584 1585 key.objectid = device->devid; 1586 key.offset = search_start; 1587 key.type = BTRFS_DEV_EXTENT_KEY; 1588 1589 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1590 if (ret < 0) 1591 goto out; 1592 if (ret > 0) { 1593 ret = btrfs_previous_item(root, path, key.objectid, key.type); 1594 if (ret < 0) 1595 goto out; 1596 } 1597 1598 while (1) { 1599 l = path->nodes[0]; 1600 slot = path->slots[0]; 1601 if (slot >= btrfs_header_nritems(l)) { 1602 ret = btrfs_next_leaf(root, path); 1603 if (ret == 0) 1604 continue; 1605 if (ret < 0) 1606 goto out; 1607 1608 break; 1609 } 1610 btrfs_item_key_to_cpu(l, &key, slot); 1611 1612 if (key.objectid < device->devid) 1613 goto next; 1614 1615 if (key.objectid > device->devid) 1616 break; 1617 1618 if (key.type != BTRFS_DEV_EXTENT_KEY) 1619 goto next; 1620 1621 if (key.offset > search_start) { 1622 hole_size = key.offset - search_start; 1623 dev_extent_hole_check(device, &search_start, &hole_size, 1624 num_bytes); 1625 1626 if (hole_size > max_hole_size) { 1627 max_hole_start = search_start; 1628 max_hole_size = hole_size; 1629 } 1630 1631 /* 1632 * If this free space is greater than which we need, 1633 * it must be the max free space that we have found 1634 * until now, so max_hole_start must point to the start 1635 * of this free space and the length of this free space 1636 * is stored in max_hole_size. Thus, we return 1637 * max_hole_start and max_hole_size and go back to the 1638 * caller. 
1639 */ 1640 if (hole_size >= num_bytes) { 1641 ret = 0; 1642 goto out; 1643 } 1644 } 1645 1646 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1647 extent_end = key.offset + btrfs_dev_extent_length(l, 1648 dev_extent); 1649 if (extent_end > search_start) 1650 search_start = extent_end; 1651 next: 1652 path->slots[0]++; 1653 cond_resched(); 1654 } 1655 1656 /* 1657 * At this point, search_start should be the end of 1658 * allocated dev extents, and when shrinking the device, 1659 * search_end may be smaller than search_start. 1660 */ 1661 if (search_end > search_start) { 1662 hole_size = search_end - search_start; 1663 if (dev_extent_hole_check(device, &search_start, &hole_size, 1664 num_bytes)) { 1665 btrfs_release_path(path); 1666 goto again; 1667 } 1668 1669 if (hole_size > max_hole_size) { 1670 max_hole_start = search_start; 1671 max_hole_size = hole_size; 1672 } 1673 } 1674 1675 /* See above. */ 1676 if (max_hole_size < num_bytes) 1677 ret = -ENOSPC; 1678 else 1679 ret = 0; 1680 1681 out: 1682 btrfs_free_path(path); 1683 *start = max_hole_start; 1684 if (len) 1685 *len = max_hole_size; 1686 return ret; 1687 } 1688 1689 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 1690 u64 *start, u64 *len) 1691 { 1692 /* FIXME use last free of some kind */ 1693 return find_free_dev_extent_start(device, num_bytes, 0, start, len); 1694 } 1695 1696 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1697 struct btrfs_device *device, 1698 u64 start, u64 *dev_extent_len) 1699 { 1700 struct btrfs_fs_info *fs_info = device->fs_info; 1701 struct btrfs_root *root = fs_info->dev_root; 1702 int ret; 1703 struct btrfs_path *path; 1704 struct btrfs_key key; 1705 struct btrfs_key found_key; 1706 struct extent_buffer *leaf = NULL; 1707 struct btrfs_dev_extent *extent = NULL; 1708 1709 path = btrfs_alloc_path(); 1710 if (!path) 1711 return -ENOMEM; 1712 1713 key.objectid = device->devid; 1714 key.offset = start; 1715 key.type = BTRFS_DEV_EXTENT_KEY; 1716 again: 1717 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1718 if (ret > 0) { 1719 ret = btrfs_previous_item(root, path, key.objectid, 1720 BTRFS_DEV_EXTENT_KEY); 1721 if (ret) 1722 goto out; 1723 leaf = path->nodes[0]; 1724 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1725 extent = btrfs_item_ptr(leaf, path->slots[0], 1726 struct btrfs_dev_extent); 1727 BUG_ON(found_key.offset > start || found_key.offset + 1728 btrfs_dev_extent_length(leaf, extent) < start); 1729 key = found_key; 1730 btrfs_release_path(path); 1731 goto again; 1732 } else if (ret == 0) { 1733 leaf = path->nodes[0]; 1734 extent = btrfs_item_ptr(leaf, path->slots[0], 1735 struct btrfs_dev_extent); 1736 } else { 1737 goto out; 1738 } 1739 1740 *dev_extent_len = btrfs_dev_extent_length(leaf, extent); 1741 1742 ret = btrfs_del_item(trans, root, path); 1743 if (ret == 0) 1744 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); 1745 out: 1746 btrfs_free_path(path); 1747 return ret; 1748 } 1749 1750 static u64 find_next_chunk(struct btrfs_fs_info *fs_info) 1751 { 1752 struct extent_map_tree *em_tree; 1753 struct extent_map *em; 1754 struct rb_node *n; 1755 u64 ret = 0; 1756 1757 em_tree = &fs_info->mapping_tree; 1758 read_lock(&em_tree->lock); 1759 n = rb_last(&em_tree->map.rb_root); 1760 if (n) { 1761 em = rb_entry(n, struct extent_map, rb_node); 1762 ret = em->start + em->len; 1763 } 1764 read_unlock(&em_tree->lock); 1765 1766 return ret; 1767 } 1768 1769 static noinline int find_next_devid(struct btrfs_fs_info 
*fs_info, 1770 u64 *devid_ret) 1771 { 1772 int ret; 1773 struct btrfs_key key; 1774 struct btrfs_key found_key; 1775 struct btrfs_path *path; 1776 1777 path = btrfs_alloc_path(); 1778 if (!path) 1779 return -ENOMEM; 1780 1781 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1782 key.type = BTRFS_DEV_ITEM_KEY; 1783 key.offset = (u64)-1; 1784 1785 ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); 1786 if (ret < 0) 1787 goto error; 1788 1789 if (ret == 0) { 1790 /* Corruption */ 1791 btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); 1792 ret = -EUCLEAN; 1793 goto error; 1794 } 1795 1796 ret = btrfs_previous_item(fs_info->chunk_root, path, 1797 BTRFS_DEV_ITEMS_OBJECTID, 1798 BTRFS_DEV_ITEM_KEY); 1799 if (ret) { 1800 *devid_ret = 1; 1801 } else { 1802 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1803 path->slots[0]); 1804 *devid_ret = found_key.offset + 1; 1805 } 1806 ret = 0; 1807 error: 1808 btrfs_free_path(path); 1809 return ret; 1810 } 1811 1812 /* 1813 * the device information is stored in the chunk root 1814 * the btrfs_device struct should be fully filled in 1815 */ 1816 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, 1817 struct btrfs_device *device) 1818 { 1819 int ret; 1820 struct btrfs_path *path; 1821 struct btrfs_dev_item *dev_item; 1822 struct extent_buffer *leaf; 1823 struct btrfs_key key; 1824 unsigned long ptr; 1825 1826 path = btrfs_alloc_path(); 1827 if (!path) 1828 return -ENOMEM; 1829 1830 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1831 key.type = BTRFS_DEV_ITEM_KEY; 1832 key.offset = device->devid; 1833 1834 ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, 1835 &key, sizeof(*dev_item)); 1836 if (ret) 1837 goto out; 1838 1839 leaf = path->nodes[0]; 1840 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1841 1842 btrfs_set_device_id(leaf, dev_item, device->devid); 1843 btrfs_set_device_generation(leaf, dev_item, 0); 1844 btrfs_set_device_type(leaf, dev_item, device->type); 1845 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1846 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1847 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1848 btrfs_set_device_total_bytes(leaf, dev_item, 1849 btrfs_device_get_disk_total_bytes(device)); 1850 btrfs_set_device_bytes_used(leaf, dev_item, 1851 btrfs_device_get_bytes_used(device)); 1852 btrfs_set_device_group(leaf, dev_item, 0); 1853 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1854 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1855 btrfs_set_device_start_offset(leaf, dev_item, 0); 1856 1857 ptr = btrfs_device_uuid(dev_item); 1858 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1859 ptr = btrfs_device_fsid(dev_item); 1860 write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, 1861 ptr, BTRFS_FSID_SIZE); 1862 btrfs_mark_buffer_dirty(leaf); 1863 1864 ret = 0; 1865 out: 1866 btrfs_free_path(path); 1867 return ret; 1868 } 1869 1870 /* 1871 * Function to update ctime/mtime for a given device path. 1872 * Mainly used for ctime/mtime based probe like libblkid. 
 */
static void update_dev_time(const char *path_name)
{
	struct file *filp;

	filp = filp_open(path_name, O_RDWR, 0);
	if (IS_ERR(filp))
		return;
	file_update_time(filp);
	filp_close(filp, NULL);
}

static int btrfs_rm_dev_item(struct btrfs_device *device)
{
	struct btrfs_root *root = device->fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);
	if (!ret)
		ret = btrfs_commit_transaction(trans);
	return ret;
}

/*
 * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number regarding eg. device
 * replace.
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
		u64 num_devices)
{
	u64 all_avail;
	unsigned seq;
	int i;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		all_avail = fs_info->avail_data_alloc_bits |
			    fs_info->avail_system_alloc_bits |
			    fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
		if (!(all_avail & btrfs_raid_array[i].bg_flag))
			continue;

		if (num_devices < btrfs_raid_array[i].devs_min)
			return btrfs_raid_array[i].mindev_error;
	}

	return 0;
}

static struct btrfs_device *btrfs_find_next_active_device(
		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
	struct btrfs_device *next_device;

	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
		if (next_device != device &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
		    && next_device->bdev)
			return next_device;
	}

	return NULL;
}

/*
 * Helper function to check if the given device is part of s_bdev / latest_bdev
 * and replace it with the provided or the next active device. In the context
 * where this function is called, there should always be another device (or
 * this_dev) which is active.
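 *
 * For example btrfs_rm_device() below relies on this to keep
 * fs_info->sb->s_bdev and fs_devices::latest_bdev pointing at a device
 * that is still open once @device goes away.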
1979 */ 1980 void __cold btrfs_assign_next_active_device(struct btrfs_device *device, 1981 struct btrfs_device *next_device) 1982 { 1983 struct btrfs_fs_info *fs_info = device->fs_info; 1984 1985 if (!next_device) 1986 next_device = btrfs_find_next_active_device(fs_info->fs_devices, 1987 device); 1988 ASSERT(next_device); 1989 1990 if (fs_info->sb->s_bdev && 1991 (fs_info->sb->s_bdev == device->bdev)) 1992 fs_info->sb->s_bdev = next_device->bdev; 1993 1994 if (fs_info->fs_devices->latest_bdev == device->bdev) 1995 fs_info->fs_devices->latest_bdev = next_device->bdev; 1996 } 1997 1998 /* 1999 * Return btrfs_fs_devices::num_devices excluding the device that's being 2000 * currently replaced. 2001 */ 2002 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) 2003 { 2004 u64 num_devices = fs_info->fs_devices->num_devices; 2005 2006 down_read(&fs_info->dev_replace.rwsem); 2007 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 2008 ASSERT(num_devices > 1); 2009 num_devices--; 2010 } 2011 up_read(&fs_info->dev_replace.rwsem); 2012 2013 return num_devices; 2014 } 2015 2016 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, 2017 struct block_device *bdev, 2018 const char *device_path) 2019 { 2020 struct btrfs_super_block *disk_super; 2021 int copy_num; 2022 2023 if (!bdev) 2024 return; 2025 2026 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { 2027 struct page *page; 2028 int ret; 2029 2030 disk_super = btrfs_read_dev_one_super(bdev, copy_num); 2031 if (IS_ERR(disk_super)) 2032 continue; 2033 2034 if (bdev_is_zoned(bdev)) { 2035 btrfs_reset_sb_log_zones(bdev, copy_num); 2036 continue; 2037 } 2038 2039 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 2040 2041 page = virt_to_page(disk_super); 2042 set_page_dirty(page); 2043 lock_page(page); 2044 /* write_on_page() unlocks the page */ 2045 ret = write_one_page(page); 2046 if (ret) 2047 btrfs_warn(fs_info, 2048 "error clearing superblock number %d (%d)", 2049 copy_num, ret); 2050 btrfs_release_disk_super(disk_super); 2051 2052 } 2053 2054 /* Notify udev that device has changed */ 2055 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 2056 2057 /* Update ctime/mtime for device path for libblkid */ 2058 update_dev_time(device_path); 2059 } 2060 2061 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, 2062 u64 devid) 2063 { 2064 struct btrfs_device *device; 2065 struct btrfs_fs_devices *cur_devices; 2066 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2067 u64 num_devices; 2068 int ret = 0; 2069 2070 mutex_lock(&uuid_mutex); 2071 2072 num_devices = btrfs_num_devices(fs_info); 2073 2074 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 2075 if (ret) 2076 goto out; 2077 2078 device = btrfs_find_device_by_devspec(fs_info, devid, device_path); 2079 2080 if (IS_ERR(device)) { 2081 if (PTR_ERR(device) == -ENOENT && 2082 strcmp(device_path, "missing") == 0) 2083 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2084 else 2085 ret = PTR_ERR(device); 2086 goto out; 2087 } 2088 2089 if (btrfs_pinned_by_swapfile(fs_info, device)) { 2090 btrfs_warn_in_rcu(fs_info, 2091 "cannot remove device %s (devid %llu) due to active swapfile", 2092 rcu_str_deref(device->name), device->devid); 2093 ret = -ETXTBSY; 2094 goto out; 2095 } 2096 2097 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2098 ret = BTRFS_ERROR_DEV_TGT_REPLACE; 2099 goto out; 2100 } 2101 2102 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 2103 fs_info->fs_devices->rw_devices == 1) { 2104 ret = 
BTRFS_ERROR_DEV_ONLY_WRITABLE; 2105 goto out; 2106 } 2107 2108 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2109 mutex_lock(&fs_info->chunk_mutex); 2110 list_del_init(&device->dev_alloc_list); 2111 device->fs_devices->rw_devices--; 2112 mutex_unlock(&fs_info->chunk_mutex); 2113 } 2114 2115 mutex_unlock(&uuid_mutex); 2116 ret = btrfs_shrink_device(device, 0); 2117 if (!ret) 2118 btrfs_reada_remove_dev(device); 2119 mutex_lock(&uuid_mutex); 2120 if (ret) 2121 goto error_undo; 2122 2123 /* 2124 * TODO: the superblock still includes this device in its num_devices 2125 * counter although write_all_supers() is not locked out. This 2126 * could give a filesystem state which requires a degraded mount. 2127 */ 2128 ret = btrfs_rm_dev_item(device); 2129 if (ret) 2130 goto error_undo; 2131 2132 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2133 btrfs_scrub_cancel_dev(device); 2134 2135 /* 2136 * the device list mutex makes sure that we don't change 2137 * the device list while someone else is writing out all 2138 * the device supers. Whoever is writing all supers, should 2139 * lock the device list mutex before getting the number of 2140 * devices in the super block (super_copy). Conversely, 2141 * whoever updates the number of devices in the super block 2142 * (super_copy) should hold the device list mutex. 2143 */ 2144 2145 /* 2146 * In normal cases the cur_devices == fs_devices. But in case 2147 * of deleting a seed device, the cur_devices should point to 2148 * its own fs_devices listed under the fs_devices->seed. 2149 */ 2150 cur_devices = device->fs_devices; 2151 mutex_lock(&fs_devices->device_list_mutex); 2152 list_del_rcu(&device->dev_list); 2153 2154 cur_devices->num_devices--; 2155 cur_devices->total_devices--; 2156 /* Update total_devices of the parent fs_devices if it's seed */ 2157 if (cur_devices != fs_devices) 2158 fs_devices->total_devices--; 2159 2160 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 2161 cur_devices->missing_devices--; 2162 2163 btrfs_assign_next_active_device(device, NULL); 2164 2165 if (device->bdev) { 2166 cur_devices->open_devices--; 2167 /* remove sysfs entry */ 2168 btrfs_sysfs_remove_device(device); 2169 } 2170 2171 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 2172 btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 2173 mutex_unlock(&fs_devices->device_list_mutex); 2174 2175 /* 2176 * at this point, the device is zero sized and detached from 2177 * the devices list. All that's left is to zero out the old 2178 * supers and free the device. 
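	 * (For reference: btrfs_scratch_superblocks() wipes the magic from
	 * each superblock copy, or resets the superblock log zones on zoned
	 * devices, then notifies udev and bumps the path's timestamps for
	 * blkid via update_dev_time().)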
2179 */ 2180 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2181 btrfs_scratch_superblocks(fs_info, device->bdev, 2182 device->name->str); 2183 2184 btrfs_close_bdev(device); 2185 synchronize_rcu(); 2186 btrfs_free_device(device); 2187 2188 if (cur_devices->open_devices == 0) { 2189 list_del_init(&cur_devices->seed_list); 2190 close_fs_devices(cur_devices); 2191 free_fs_devices(cur_devices); 2192 } 2193 2194 out: 2195 mutex_unlock(&uuid_mutex); 2196 return ret; 2197 2198 error_undo: 2199 btrfs_reada_undo_remove_dev(device); 2200 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2201 mutex_lock(&fs_info->chunk_mutex); 2202 list_add(&device->dev_alloc_list, 2203 &fs_devices->alloc_list); 2204 device->fs_devices->rw_devices++; 2205 mutex_unlock(&fs_info->chunk_mutex); 2206 } 2207 goto out; 2208 } 2209 2210 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) 2211 { 2212 struct btrfs_fs_devices *fs_devices; 2213 2214 lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex); 2215 2216 /* 2217 * in case of fs with no seed, srcdev->fs_devices will point 2218 * to fs_devices of fs_info. However when the dev being replaced is 2219 * a seed dev it will point to the seed's local fs_devices. In short 2220 * srcdev will have its correct fs_devices in both the cases. 2221 */ 2222 fs_devices = srcdev->fs_devices; 2223 2224 list_del_rcu(&srcdev->dev_list); 2225 list_del(&srcdev->dev_alloc_list); 2226 fs_devices->num_devices--; 2227 if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) 2228 fs_devices->missing_devices--; 2229 2230 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) 2231 fs_devices->rw_devices--; 2232 2233 if (srcdev->bdev) 2234 fs_devices->open_devices--; 2235 } 2236 2237 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) 2238 { 2239 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 2240 2241 mutex_lock(&uuid_mutex); 2242 2243 btrfs_close_bdev(srcdev); 2244 synchronize_rcu(); 2245 btrfs_free_device(srcdev); 2246 2247 /* if this is no devs we rather delete the fs_devices */ 2248 if (!fs_devices->num_devices) { 2249 /* 2250 * On a mounted FS, num_devices can't be zero unless it's a 2251 * seed. In case of a seed device being replaced, the replace 2252 * target added to the sprout FS, so there will be no more 2253 * device left under the seed FS. 2254 */ 2255 ASSERT(fs_devices->seeding); 2256 2257 list_del_init(&fs_devices->seed_list); 2258 close_fs_devices(fs_devices); 2259 free_fs_devices(fs_devices); 2260 } 2261 mutex_unlock(&uuid_mutex); 2262 } 2263 2264 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) 2265 { 2266 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; 2267 2268 mutex_lock(&fs_devices->device_list_mutex); 2269 2270 btrfs_sysfs_remove_device(tgtdev); 2271 2272 if (tgtdev->bdev) 2273 fs_devices->open_devices--; 2274 2275 fs_devices->num_devices--; 2276 2277 btrfs_assign_next_active_device(tgtdev, NULL); 2278 2279 list_del_rcu(&tgtdev->dev_list); 2280 2281 mutex_unlock(&fs_devices->device_list_mutex); 2282 2283 /* 2284 * The update_dev_time() with in btrfs_scratch_superblocks() 2285 * may lead to a call to btrfs_show_devname() which will try 2286 * to hold device_list_mutex. And here this device 2287 * is already out of device list, so we don't have to hold 2288 * the device_list_mutex lock. 
2289 */ 2290 btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, 2291 tgtdev->name->str); 2292 2293 btrfs_close_bdev(tgtdev); 2294 synchronize_rcu(); 2295 btrfs_free_device(tgtdev); 2296 } 2297 2298 static struct btrfs_device *btrfs_find_device_by_path( 2299 struct btrfs_fs_info *fs_info, const char *device_path) 2300 { 2301 int ret = 0; 2302 struct btrfs_super_block *disk_super; 2303 u64 devid; 2304 u8 *dev_uuid; 2305 struct block_device *bdev; 2306 struct btrfs_device *device; 2307 2308 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 2309 fs_info->bdev_holder, 0, &bdev, &disk_super); 2310 if (ret) 2311 return ERR_PTR(ret); 2312 2313 devid = btrfs_stack_device_id(&disk_super->dev_item); 2314 dev_uuid = disk_super->dev_item.uuid; 2315 if (btrfs_fs_incompat(fs_info, METADATA_UUID)) 2316 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 2317 disk_super->metadata_uuid); 2318 else 2319 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 2320 disk_super->fsid); 2321 2322 btrfs_release_disk_super(disk_super); 2323 if (!device) 2324 device = ERR_PTR(-ENOENT); 2325 blkdev_put(bdev, FMODE_READ); 2326 return device; 2327 } 2328 2329 /* 2330 * Lookup a device given by device id, or the path if the id is 0. 2331 */ 2332 struct btrfs_device *btrfs_find_device_by_devspec( 2333 struct btrfs_fs_info *fs_info, u64 devid, 2334 const char *device_path) 2335 { 2336 struct btrfs_device *device; 2337 2338 if (devid) { 2339 device = btrfs_find_device(fs_info->fs_devices, devid, NULL, 2340 NULL); 2341 if (!device) 2342 return ERR_PTR(-ENOENT); 2343 return device; 2344 } 2345 2346 if (!device_path || !device_path[0]) 2347 return ERR_PTR(-EINVAL); 2348 2349 if (strcmp(device_path, "missing") == 0) { 2350 /* Find first missing device */ 2351 list_for_each_entry(device, &fs_info->fs_devices->devices, 2352 dev_list) { 2353 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 2354 &device->dev_state) && !device->bdev) 2355 return device; 2356 } 2357 return ERR_PTR(-ENOENT); 2358 } 2359 2360 return btrfs_find_device_by_path(fs_info, device_path); 2361 } 2362 2363 /* 2364 * does all the dirty work required for changing file system's UUID. 2365 */ 2366 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) 2367 { 2368 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2369 struct btrfs_fs_devices *old_devices; 2370 struct btrfs_fs_devices *seed_devices; 2371 struct btrfs_super_block *disk_super = fs_info->super_copy; 2372 struct btrfs_device *device; 2373 u64 super_flags; 2374 2375 lockdep_assert_held(&uuid_mutex); 2376 if (!fs_devices->seeding) 2377 return -EINVAL; 2378 2379 /* 2380 * Private copy of the seed devices, anchored at 2381 * fs_info->fs_devices->seed_list 2382 */ 2383 seed_devices = alloc_fs_devices(NULL, NULL); 2384 if (IS_ERR(seed_devices)) 2385 return PTR_ERR(seed_devices); 2386 2387 /* 2388 * It's necessary to retain a copy of the original seed fs_devices in 2389 * fs_uuids so that filesystems which have been seeded can successfully 2390 * reference the seed device from open_seed_devices. This also supports 2391 * multiple fs seed. 
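	 *
	 * Concretely (summarizing the code below): the clone keeps the old
	 * fsid on the global fs_uuids list, the current devices are spliced
	 * onto the new seed_devices copy, and fs_devices itself receives a
	 * freshly generated fsid for the sprouted filesystem.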
2392 */ 2393 old_devices = clone_fs_devices(fs_devices); 2394 if (IS_ERR(old_devices)) { 2395 kfree(seed_devices); 2396 return PTR_ERR(old_devices); 2397 } 2398 2399 list_add(&old_devices->fs_list, &fs_uuids); 2400 2401 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2402 seed_devices->opened = 1; 2403 INIT_LIST_HEAD(&seed_devices->devices); 2404 INIT_LIST_HEAD(&seed_devices->alloc_list); 2405 mutex_init(&seed_devices->device_list_mutex); 2406 2407 mutex_lock(&fs_devices->device_list_mutex); 2408 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 2409 synchronize_rcu); 2410 list_for_each_entry(device, &seed_devices->devices, dev_list) 2411 device->fs_devices = seed_devices; 2412 2413 fs_devices->seeding = false; 2414 fs_devices->num_devices = 0; 2415 fs_devices->open_devices = 0; 2416 fs_devices->missing_devices = 0; 2417 fs_devices->rotating = false; 2418 list_add(&seed_devices->seed_list, &fs_devices->seed_list); 2419 2420 generate_random_uuid(fs_devices->fsid); 2421 memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); 2422 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2423 mutex_unlock(&fs_devices->device_list_mutex); 2424 2425 super_flags = btrfs_super_flags(disk_super) & 2426 ~BTRFS_SUPER_FLAG_SEEDING; 2427 btrfs_set_super_flags(disk_super, super_flags); 2428 2429 return 0; 2430 } 2431 2432 /* 2433 * Store the expected generation for seed devices in device items. 2434 */ 2435 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) 2436 { 2437 struct btrfs_fs_info *fs_info = trans->fs_info; 2438 struct btrfs_root *root = fs_info->chunk_root; 2439 struct btrfs_path *path; 2440 struct extent_buffer *leaf; 2441 struct btrfs_dev_item *dev_item; 2442 struct btrfs_device *device; 2443 struct btrfs_key key; 2444 u8 fs_uuid[BTRFS_FSID_SIZE]; 2445 u8 dev_uuid[BTRFS_UUID_SIZE]; 2446 u64 devid; 2447 int ret; 2448 2449 path = btrfs_alloc_path(); 2450 if (!path) 2451 return -ENOMEM; 2452 2453 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2454 key.offset = 0; 2455 key.type = BTRFS_DEV_ITEM_KEY; 2456 2457 while (1) { 2458 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2459 if (ret < 0) 2460 goto error; 2461 2462 leaf = path->nodes[0]; 2463 next_slot: 2464 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2465 ret = btrfs_next_leaf(root, path); 2466 if (ret > 0) 2467 break; 2468 if (ret < 0) 2469 goto error; 2470 leaf = path->nodes[0]; 2471 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2472 btrfs_release_path(path); 2473 continue; 2474 } 2475 2476 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2477 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2478 key.type != BTRFS_DEV_ITEM_KEY) 2479 break; 2480 2481 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2482 struct btrfs_dev_item); 2483 devid = btrfs_device_id(leaf, dev_item); 2484 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2485 BTRFS_UUID_SIZE); 2486 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2487 BTRFS_FSID_SIZE); 2488 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 2489 fs_uuid); 2490 BUG_ON(!device); /* Logic error */ 2491 2492 if (device->fs_devices->seeding) { 2493 btrfs_set_device_generation(leaf, dev_item, 2494 device->generation); 2495 btrfs_mark_buffer_dirty(leaf); 2496 } 2497 2498 path->slots[0]++; 2499 goto next_slot; 2500 } 2501 ret = 0; 2502 error: 2503 btrfs_free_path(path); 2504 return ret; 2505 } 2506 2507 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2508 { 2509 
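	/*
	 * Descriptive summary of the flow below (added for clarity): open the
	 * block device exclusively, reject a device that already belongs to
	 * this filesystem, allocate and fill a new btrfs_device, start a
	 * transaction, optionally turn a mounted seed filesystem into a
	 * sprout, link the device into the fs_devices lists and superblock
	 * counters, insert its DEV_ITEM, and for the seeding case rewrite the
	 * seed device generations before committing.
	 */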
struct btrfs_root *root = fs_info->dev_root; 2510 struct request_queue *q; 2511 struct btrfs_trans_handle *trans; 2512 struct btrfs_device *device; 2513 struct block_device *bdev; 2514 struct super_block *sb = fs_info->sb; 2515 struct rcu_string *name; 2516 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2517 u64 orig_super_total_bytes; 2518 u64 orig_super_num_devices; 2519 int seeding_dev = 0; 2520 int ret = 0; 2521 bool locked = false; 2522 2523 if (sb_rdonly(sb) && !fs_devices->seeding) 2524 return -EROFS; 2525 2526 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2527 fs_info->bdev_holder); 2528 if (IS_ERR(bdev)) 2529 return PTR_ERR(bdev); 2530 2531 if (!btrfs_check_device_zone_type(fs_info, bdev)) { 2532 ret = -EINVAL; 2533 goto error; 2534 } 2535 2536 if (fs_devices->seeding) { 2537 seeding_dev = 1; 2538 down_write(&sb->s_umount); 2539 mutex_lock(&uuid_mutex); 2540 locked = true; 2541 } 2542 2543 sync_blockdev(bdev); 2544 2545 rcu_read_lock(); 2546 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { 2547 if (device->bdev == bdev) { 2548 ret = -EEXIST; 2549 rcu_read_unlock(); 2550 goto error; 2551 } 2552 } 2553 rcu_read_unlock(); 2554 2555 device = btrfs_alloc_device(fs_info, NULL, NULL); 2556 if (IS_ERR(device)) { 2557 /* we can safely leave the fs_devices entry around */ 2558 ret = PTR_ERR(device); 2559 goto error; 2560 } 2561 2562 name = rcu_string_strdup(device_path, GFP_KERNEL); 2563 if (!name) { 2564 ret = -ENOMEM; 2565 goto error_free_device; 2566 } 2567 rcu_assign_pointer(device->name, name); 2568 2569 device->fs_info = fs_info; 2570 device->bdev = bdev; 2571 2572 ret = btrfs_get_dev_zone_info(device); 2573 if (ret) 2574 goto error_free_device; 2575 2576 trans = btrfs_start_transaction(root, 0); 2577 if (IS_ERR(trans)) { 2578 ret = PTR_ERR(trans); 2579 goto error_free_zone; 2580 } 2581 2582 q = bdev_get_queue(bdev); 2583 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2584 device->generation = trans->transid; 2585 device->io_width = fs_info->sectorsize; 2586 device->io_align = fs_info->sectorsize; 2587 device->sector_size = fs_info->sectorsize; 2588 device->total_bytes = round_down(i_size_read(bdev->bd_inode), 2589 fs_info->sectorsize); 2590 device->disk_total_bytes = device->total_bytes; 2591 device->commit_total_bytes = device->total_bytes; 2592 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2593 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2594 device->mode = FMODE_EXCL; 2595 device->dev_stats_valid = 1; 2596 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2597 2598 if (seeding_dev) { 2599 btrfs_clear_sb_rdonly(sb); 2600 ret = btrfs_prepare_sprout(fs_info); 2601 if (ret) { 2602 btrfs_abort_transaction(trans, ret); 2603 goto error_trans; 2604 } 2605 } 2606 2607 device->fs_devices = fs_devices; 2608 2609 mutex_lock(&fs_devices->device_list_mutex); 2610 mutex_lock(&fs_info->chunk_mutex); 2611 list_add_rcu(&device->dev_list, &fs_devices->devices); 2612 list_add(&device->dev_alloc_list, &fs_devices->alloc_list); 2613 fs_devices->num_devices++; 2614 fs_devices->open_devices++; 2615 fs_devices->rw_devices++; 2616 fs_devices->total_devices++; 2617 fs_devices->total_rw_bytes += device->total_bytes; 2618 2619 atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 2620 2621 if (!blk_queue_nonrot(q)) 2622 fs_devices->rotating = true; 2623 2624 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 2625 btrfs_set_super_total_bytes(fs_info->super_copy, 2626 
round_down(orig_super_total_bytes + device->total_bytes, 2627 fs_info->sectorsize)); 2628 2629 orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); 2630 btrfs_set_super_num_devices(fs_info->super_copy, 2631 orig_super_num_devices + 1); 2632 2633 /* 2634 * we've got more storage, clear any full flags on the space 2635 * infos 2636 */ 2637 btrfs_clear_space_info_full(fs_info); 2638 2639 mutex_unlock(&fs_info->chunk_mutex); 2640 2641 /* Add sysfs device entry */ 2642 btrfs_sysfs_add_device(device); 2643 2644 mutex_unlock(&fs_devices->device_list_mutex); 2645 2646 if (seeding_dev) { 2647 mutex_lock(&fs_info->chunk_mutex); 2648 ret = init_first_rw_device(trans); 2649 mutex_unlock(&fs_info->chunk_mutex); 2650 if (ret) { 2651 btrfs_abort_transaction(trans, ret); 2652 goto error_sysfs; 2653 } 2654 } 2655 2656 ret = btrfs_add_dev_item(trans, device); 2657 if (ret) { 2658 btrfs_abort_transaction(trans, ret); 2659 goto error_sysfs; 2660 } 2661 2662 if (seeding_dev) { 2663 ret = btrfs_finish_sprout(trans); 2664 if (ret) { 2665 btrfs_abort_transaction(trans, ret); 2666 goto error_sysfs; 2667 } 2668 2669 /* 2670 * fs_devices now represents the newly sprouted filesystem and 2671 * its fsid has been changed by btrfs_prepare_sprout 2672 */ 2673 btrfs_sysfs_update_sprout_fsid(fs_devices); 2674 } 2675 2676 ret = btrfs_commit_transaction(trans); 2677 2678 if (seeding_dev) { 2679 mutex_unlock(&uuid_mutex); 2680 up_write(&sb->s_umount); 2681 locked = false; 2682 2683 if (ret) /* transaction commit */ 2684 return ret; 2685 2686 ret = btrfs_relocate_sys_chunks(fs_info); 2687 if (ret < 0) 2688 btrfs_handle_fs_error(fs_info, ret, 2689 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2690 trans = btrfs_attach_transaction(root); 2691 if (IS_ERR(trans)) { 2692 if (PTR_ERR(trans) == -ENOENT) 2693 return 0; 2694 ret = PTR_ERR(trans); 2695 trans = NULL; 2696 goto error_sysfs; 2697 } 2698 ret = btrfs_commit_transaction(trans); 2699 } 2700 2701 /* 2702 * Now that we have written a new super block to this device, check all 2703 * other fs_devices list if device_path alienates any other scanned 2704 * device. 2705 * We can ignore the return value as it typically returns -EINVAL and 2706 * only succeeds if the device was an alien. 
2707 */ 2708 btrfs_forget_devices(device_path); 2709 2710 /* Update ctime/mtime for blkid or udev */ 2711 update_dev_time(device_path); 2712 2713 return ret; 2714 2715 error_sysfs: 2716 btrfs_sysfs_remove_device(device); 2717 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2718 mutex_lock(&fs_info->chunk_mutex); 2719 list_del_rcu(&device->dev_list); 2720 list_del(&device->dev_alloc_list); 2721 fs_info->fs_devices->num_devices--; 2722 fs_info->fs_devices->open_devices--; 2723 fs_info->fs_devices->rw_devices--; 2724 fs_info->fs_devices->total_devices--; 2725 fs_info->fs_devices->total_rw_bytes -= device->total_bytes; 2726 atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); 2727 btrfs_set_super_total_bytes(fs_info->super_copy, 2728 orig_super_total_bytes); 2729 btrfs_set_super_num_devices(fs_info->super_copy, 2730 orig_super_num_devices); 2731 mutex_unlock(&fs_info->chunk_mutex); 2732 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2733 error_trans: 2734 if (seeding_dev) 2735 btrfs_set_sb_rdonly(sb); 2736 if (trans) 2737 btrfs_end_transaction(trans); 2738 error_free_zone: 2739 btrfs_destroy_dev_zone_info(device); 2740 error_free_device: 2741 btrfs_free_device(device); 2742 error: 2743 blkdev_put(bdev, FMODE_EXCL); 2744 if (locked) { 2745 mutex_unlock(&uuid_mutex); 2746 up_write(&sb->s_umount); 2747 } 2748 return ret; 2749 } 2750 2751 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2752 struct btrfs_device *device) 2753 { 2754 int ret; 2755 struct btrfs_path *path; 2756 struct btrfs_root *root = device->fs_info->chunk_root; 2757 struct btrfs_dev_item *dev_item; 2758 struct extent_buffer *leaf; 2759 struct btrfs_key key; 2760 2761 path = btrfs_alloc_path(); 2762 if (!path) 2763 return -ENOMEM; 2764 2765 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2766 key.type = BTRFS_DEV_ITEM_KEY; 2767 key.offset = device->devid; 2768 2769 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2770 if (ret < 0) 2771 goto out; 2772 2773 if (ret > 0) { 2774 ret = -ENOENT; 2775 goto out; 2776 } 2777 2778 leaf = path->nodes[0]; 2779 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2780 2781 btrfs_set_device_id(leaf, dev_item, device->devid); 2782 btrfs_set_device_type(leaf, dev_item, device->type); 2783 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2784 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2785 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2786 btrfs_set_device_total_bytes(leaf, dev_item, 2787 btrfs_device_get_disk_total_bytes(device)); 2788 btrfs_set_device_bytes_used(leaf, dev_item, 2789 btrfs_device_get_bytes_used(device)); 2790 btrfs_mark_buffer_dirty(leaf); 2791 2792 out: 2793 btrfs_free_path(path); 2794 return ret; 2795 } 2796 2797 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2798 struct btrfs_device *device, u64 new_size) 2799 { 2800 struct btrfs_fs_info *fs_info = device->fs_info; 2801 struct btrfs_super_block *super_copy = fs_info->super_copy; 2802 u64 old_total; 2803 u64 diff; 2804 2805 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2806 return -EACCES; 2807 2808 new_size = round_down(new_size, fs_info->sectorsize); 2809 2810 mutex_lock(&fs_info->chunk_mutex); 2811 old_total = btrfs_super_total_bytes(super_copy); 2812 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 2813 2814 if (new_size <= device->total_bytes || 2815 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2816 mutex_unlock(&fs_info->chunk_mutex); 2817 
return -EINVAL; 2818 } 2819 2820 btrfs_set_super_total_bytes(super_copy, 2821 round_down(old_total + diff, fs_info->sectorsize)); 2822 device->fs_devices->total_rw_bytes += diff; 2823 2824 btrfs_device_set_total_bytes(device, new_size); 2825 btrfs_device_set_disk_total_bytes(device, new_size); 2826 btrfs_clear_space_info_full(device->fs_info); 2827 if (list_empty(&device->post_commit_list)) 2828 list_add_tail(&device->post_commit_list, 2829 &trans->transaction->dev_update_list); 2830 mutex_unlock(&fs_info->chunk_mutex); 2831 2832 return btrfs_update_device(trans, device); 2833 } 2834 2835 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2836 { 2837 struct btrfs_fs_info *fs_info = trans->fs_info; 2838 struct btrfs_root *root = fs_info->chunk_root; 2839 int ret; 2840 struct btrfs_path *path; 2841 struct btrfs_key key; 2842 2843 path = btrfs_alloc_path(); 2844 if (!path) 2845 return -ENOMEM; 2846 2847 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2848 key.offset = chunk_offset; 2849 key.type = BTRFS_CHUNK_ITEM_KEY; 2850 2851 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2852 if (ret < 0) 2853 goto out; 2854 else if (ret > 0) { /* Logic error or corruption */ 2855 btrfs_handle_fs_error(fs_info, -ENOENT, 2856 "Failed lookup while freeing chunk."); 2857 ret = -ENOENT; 2858 goto out; 2859 } 2860 2861 ret = btrfs_del_item(trans, root, path); 2862 if (ret < 0) 2863 btrfs_handle_fs_error(fs_info, ret, 2864 "Failed to delete chunk item."); 2865 out: 2866 btrfs_free_path(path); 2867 return ret; 2868 } 2869 2870 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2871 { 2872 struct btrfs_super_block *super_copy = fs_info->super_copy; 2873 struct btrfs_disk_key *disk_key; 2874 struct btrfs_chunk *chunk; 2875 u8 *ptr; 2876 int ret = 0; 2877 u32 num_stripes; 2878 u32 array_size; 2879 u32 len = 0; 2880 u32 cur; 2881 struct btrfs_key key; 2882 2883 lockdep_assert_held(&fs_info->chunk_mutex); 2884 array_size = btrfs_super_sys_array_size(super_copy); 2885 2886 ptr = super_copy->sys_chunk_array; 2887 cur = 0; 2888 2889 while (cur < array_size) { 2890 disk_key = (struct btrfs_disk_key *)ptr; 2891 btrfs_disk_key_to_cpu(&key, disk_key); 2892 2893 len = sizeof(*disk_key); 2894 2895 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 2896 chunk = (struct btrfs_chunk *)(ptr + len); 2897 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 2898 len += btrfs_chunk_item_size(num_stripes); 2899 } else { 2900 ret = -EIO; 2901 break; 2902 } 2903 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 2904 key.offset == chunk_offset) { 2905 memmove(ptr, ptr + len, array_size - (cur + len)); 2906 array_size -= len; 2907 btrfs_set_super_sys_array_size(super_copy, array_size); 2908 } else { 2909 ptr += len; 2910 cur += len; 2911 } 2912 } 2913 return ret; 2914 } 2915 2916 /* 2917 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. 2918 * @logical: Logical block offset in bytes. 2919 * @length: Length of extent in bytes. 2920 * 2921 * Return: Chunk mapping or ERR_PTR. 
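 *
 * Typical caller pattern (an illustrative sketch mirroring
 * btrfs_remove_chunk() below):
 *
 *	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	map = em->map_lookup;
 *	...
 *	free_extent_map(em);	// drop the reference taken by the lookup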
2922 */ 2923 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, 2924 u64 logical, u64 length) 2925 { 2926 struct extent_map_tree *em_tree; 2927 struct extent_map *em; 2928 2929 em_tree = &fs_info->mapping_tree; 2930 read_lock(&em_tree->lock); 2931 em = lookup_extent_mapping(em_tree, logical, length); 2932 read_unlock(&em_tree->lock); 2933 2934 if (!em) { 2935 btrfs_crit(fs_info, "unable to find logical %llu length %llu", 2936 logical, length); 2937 return ERR_PTR(-EINVAL); 2938 } 2939 2940 if (em->start > logical || em->start + em->len < logical) { 2941 btrfs_crit(fs_info, 2942 "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 2943 logical, length, em->start, em->start + em->len); 2944 free_extent_map(em); 2945 return ERR_PTR(-EINVAL); 2946 } 2947 2948 /* callers are responsible for dropping em's ref. */ 2949 return em; 2950 } 2951 2952 static int remove_chunk_item(struct btrfs_trans_handle *trans, 2953 struct map_lookup *map, u64 chunk_offset) 2954 { 2955 int i; 2956 2957 /* 2958 * Removing chunk items and updating the device items in the chunks btree 2959 * requires holding the chunk_mutex. 2960 * See the comment at btrfs_chunk_alloc() for the details. 2961 */ 2962 lockdep_assert_held(&trans->fs_info->chunk_mutex); 2963 2964 for (i = 0; i < map->num_stripes; i++) { 2965 int ret; 2966 2967 ret = btrfs_update_device(trans, map->stripes[i].dev); 2968 if (ret) 2969 return ret; 2970 } 2971 2972 return btrfs_free_chunk(trans, chunk_offset); 2973 } 2974 2975 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2976 { 2977 struct btrfs_fs_info *fs_info = trans->fs_info; 2978 struct extent_map *em; 2979 struct map_lookup *map; 2980 u64 dev_extent_len = 0; 2981 int i, ret = 0; 2982 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2983 2984 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 2985 if (IS_ERR(em)) { 2986 /* 2987 * This is a logic error, but we don't want to just rely on the 2988 * user having built with ASSERT enabled, so if ASSERT doesn't 2989 * do anything we still error out. 2990 */ 2991 ASSERT(0); 2992 return PTR_ERR(em); 2993 } 2994 map = em->map_lookup; 2995 2996 /* 2997 * First delete the device extent items from the devices btree. 2998 * We take the device_list_mutex to avoid racing with the finishing phase 2999 * of a device replace operation. See the comment below before acquiring 3000 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex 3001 * because that can result in a deadlock when deleting the device extent 3002 * items from the devices btree - COWing an extent buffer from the btree 3003 * may result in allocating a new metadata chunk, which would attempt to 3004 * lock again fs_info->chunk_mutex. 
3005 */ 3006 mutex_lock(&fs_devices->device_list_mutex); 3007 for (i = 0; i < map->num_stripes; i++) { 3008 struct btrfs_device *device = map->stripes[i].dev; 3009 ret = btrfs_free_dev_extent(trans, device, 3010 map->stripes[i].physical, 3011 &dev_extent_len); 3012 if (ret) { 3013 mutex_unlock(&fs_devices->device_list_mutex); 3014 btrfs_abort_transaction(trans, ret); 3015 goto out; 3016 } 3017 3018 if (device->bytes_used > 0) { 3019 mutex_lock(&fs_info->chunk_mutex); 3020 btrfs_device_set_bytes_used(device, 3021 device->bytes_used - dev_extent_len); 3022 atomic64_add(dev_extent_len, &fs_info->free_chunk_space); 3023 btrfs_clear_space_info_full(fs_info); 3024 mutex_unlock(&fs_info->chunk_mutex); 3025 } 3026 } 3027 mutex_unlock(&fs_devices->device_list_mutex); 3028 3029 /* 3030 * We acquire fs_info->chunk_mutex for 2 reasons: 3031 * 3032 * 1) Just like with the first phase of the chunk allocation, we must 3033 * reserve system space, do all chunk btree updates and deletions, and 3034 * update the system chunk array in the superblock while holding this 3035 * mutex. This is for similar reasons as explained on the comment at 3036 * the top of btrfs_chunk_alloc(); 3037 * 3038 * 2) Prevent races with the final phase of a device replace operation 3039 * that replaces the device object associated with the map's stripes, 3040 * because the device object's id can change at any time during that 3041 * final phase of the device replace operation 3042 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 3043 * replaced device and then see it with an ID of 3044 * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating 3045 * the device item, which does not exists on the chunk btree. 3046 * The finishing phase of device replace acquires both the 3047 * device_list_mutex and the chunk_mutex, in that order, so we are 3048 * safe by just acquiring the chunk_mutex. 3049 */ 3050 trans->removing_chunk = true; 3051 mutex_lock(&fs_info->chunk_mutex); 3052 3053 check_system_chunk(trans, map->type); 3054 3055 ret = remove_chunk_item(trans, map, chunk_offset); 3056 /* 3057 * Normally we should not get -ENOSPC since we reserved space before 3058 * through the call to check_system_chunk(). 3059 * 3060 * Despite our system space_info having enough free space, we may not 3061 * be able to allocate extents from its block groups, because all have 3062 * an incompatible profile, which will force us to allocate a new system 3063 * block group with the right profile, or right after we called 3064 * check_system_space() above, a scrub turned the only system block group 3065 * with enough free space into RO mode. 3066 * This is explained with more detail at do_chunk_alloc(). 3067 * 3068 * So if we get -ENOSPC, allocate a new system chunk and retry once. 
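	 *
	 * The retry below is, in outline: allocate a new SYSTEM block group
	 * with btrfs_alloc_chunk(), insert its chunk item with
	 * btrfs_chunk_alloc_add_chunk_item(), then call remove_chunk_item()
	 * one more time.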
3069 */ 3070 if (ret == -ENOSPC) { 3071 const u64 sys_flags = btrfs_system_alloc_profile(fs_info); 3072 struct btrfs_block_group *sys_bg; 3073 3074 sys_bg = btrfs_alloc_chunk(trans, sys_flags); 3075 if (IS_ERR(sys_bg)) { 3076 ret = PTR_ERR(sys_bg); 3077 btrfs_abort_transaction(trans, ret); 3078 goto out; 3079 } 3080 3081 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); 3082 if (ret) { 3083 btrfs_abort_transaction(trans, ret); 3084 goto out; 3085 } 3086 3087 ret = remove_chunk_item(trans, map, chunk_offset); 3088 if (ret) { 3089 btrfs_abort_transaction(trans, ret); 3090 goto out; 3091 } 3092 } else if (ret) { 3093 btrfs_abort_transaction(trans, ret); 3094 goto out; 3095 } 3096 3097 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 3098 3099 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3100 ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 3101 if (ret) { 3102 btrfs_abort_transaction(trans, ret); 3103 goto out; 3104 } 3105 } 3106 3107 mutex_unlock(&fs_info->chunk_mutex); 3108 trans->removing_chunk = false; 3109 3110 /* 3111 * We are done with chunk btree updates and deletions, so release the 3112 * system space we previously reserved (with check_system_chunk()). 3113 */ 3114 btrfs_trans_release_chunk_metadata(trans); 3115 3116 ret = btrfs_remove_block_group(trans, chunk_offset, em); 3117 if (ret) { 3118 btrfs_abort_transaction(trans, ret); 3119 goto out; 3120 } 3121 3122 out: 3123 if (trans->removing_chunk) { 3124 mutex_unlock(&fs_info->chunk_mutex); 3125 trans->removing_chunk = false; 3126 } 3127 /* once for us */ 3128 free_extent_map(em); 3129 return ret; 3130 } 3131 3132 int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 3133 { 3134 struct btrfs_root *root = fs_info->chunk_root; 3135 struct btrfs_trans_handle *trans; 3136 struct btrfs_block_group *block_group; 3137 u64 length; 3138 int ret; 3139 3140 /* 3141 * Prevent races with automatic removal of unused block groups. 3142 * After we relocate and before we remove the chunk with offset 3143 * chunk_offset, automatic removal of the block group can kick in, 3144 * resulting in a failure when calling btrfs_remove_chunk() below. 3145 * 3146 * Make sure to acquire this mutex before doing a tree search (dev 3147 * or chunk trees) to find chunks. Otherwise the cleaner kthread might 3148 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 3149 * we release the path used to search the chunk/dev tree and before 3150 * the current task acquires this mutex and calls us. 3151 */ 3152 lockdep_assert_held(&fs_info->reclaim_bgs_lock); 3153 3154 /* step one, relocate all the extents inside this chunk */ 3155 btrfs_scrub_pause(fs_info); 3156 ret = btrfs_relocate_block_group(fs_info, chunk_offset); 3157 btrfs_scrub_continue(fs_info); 3158 if (ret) 3159 return ret; 3160 3161 block_group = btrfs_lookup_block_group(fs_info, chunk_offset); 3162 if (!block_group) 3163 return -ENOENT; 3164 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); 3165 length = block_group->length; 3166 btrfs_put_block_group(block_group); 3167 3168 /* 3169 * On a zoned file system, discard the whole block group, this will 3170 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If 3171 * resetting the zone fails, don't treat it as a fatal problem from the 3172 * filesystem's point of view. 
3173 */ 3174 if (btrfs_is_zoned(fs_info)) { 3175 ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); 3176 if (ret) 3177 btrfs_info(fs_info, 3178 "failed to reset zone %llu after relocation", 3179 chunk_offset); 3180 } 3181 3182 trans = btrfs_start_trans_remove_block_group(root->fs_info, 3183 chunk_offset); 3184 if (IS_ERR(trans)) { 3185 ret = PTR_ERR(trans); 3186 btrfs_handle_fs_error(root->fs_info, ret, NULL); 3187 return ret; 3188 } 3189 3190 /* 3191 * step two, delete the device extents and the 3192 * chunk tree entries 3193 */ 3194 ret = btrfs_remove_chunk(trans, chunk_offset); 3195 btrfs_end_transaction(trans); 3196 return ret; 3197 } 3198 3199 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 3200 { 3201 struct btrfs_root *chunk_root = fs_info->chunk_root; 3202 struct btrfs_path *path; 3203 struct extent_buffer *leaf; 3204 struct btrfs_chunk *chunk; 3205 struct btrfs_key key; 3206 struct btrfs_key found_key; 3207 u64 chunk_type; 3208 bool retried = false; 3209 int failed = 0; 3210 int ret; 3211 3212 path = btrfs_alloc_path(); 3213 if (!path) 3214 return -ENOMEM; 3215 3216 again: 3217 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3218 key.offset = (u64)-1; 3219 key.type = BTRFS_CHUNK_ITEM_KEY; 3220 3221 while (1) { 3222 mutex_lock(&fs_info->reclaim_bgs_lock); 3223 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3224 if (ret < 0) { 3225 mutex_unlock(&fs_info->reclaim_bgs_lock); 3226 goto error; 3227 } 3228 BUG_ON(ret == 0); /* Corruption */ 3229 3230 ret = btrfs_previous_item(chunk_root, path, key.objectid, 3231 key.type); 3232 if (ret) 3233 mutex_unlock(&fs_info->reclaim_bgs_lock); 3234 if (ret < 0) 3235 goto error; 3236 if (ret > 0) 3237 break; 3238 3239 leaf = path->nodes[0]; 3240 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3241 3242 chunk = btrfs_item_ptr(leaf, path->slots[0], 3243 struct btrfs_chunk); 3244 chunk_type = btrfs_chunk_type(leaf, chunk); 3245 btrfs_release_path(path); 3246 3247 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 3248 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3249 if (ret == -ENOSPC) 3250 failed++; 3251 else 3252 BUG_ON(ret); 3253 } 3254 mutex_unlock(&fs_info->reclaim_bgs_lock); 3255 3256 if (found_key.offset == 0) 3257 break; 3258 key.offset = found_key.offset - 1; 3259 } 3260 ret = 0; 3261 if (failed && !retried) { 3262 failed = 0; 3263 retried = true; 3264 goto again; 3265 } else if (WARN_ON(failed && retried)) { 3266 ret = -ENOSPC; 3267 } 3268 error: 3269 btrfs_free_path(path); 3270 return ret; 3271 } 3272 3273 /* 3274 * return 1 : allocate a data chunk successfully, 3275 * return <0: errors during allocating a data chunk, 3276 * return 0 : no need to allocate a data chunk. 
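 *
 * Illustrative use (a sketch of how __btrfs_balance() consumes the return
 * value before relocating a data chunk):
 *
 *	ret = btrfs_may_alloc_data_chunk(fs_info, found_key.offset);
 *	if (ret < 0)
 *		goto error;
 *	else if (ret == 1)
 *		chunk_reserved = 1;	// an empty data chunk now exists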
3277 */ 3278 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 3279 u64 chunk_offset) 3280 { 3281 struct btrfs_block_group *cache; 3282 u64 bytes_used; 3283 u64 chunk_type; 3284 3285 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3286 ASSERT(cache); 3287 chunk_type = cache->flags; 3288 btrfs_put_block_group(cache); 3289 3290 if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) 3291 return 0; 3292 3293 spin_lock(&fs_info->data_sinfo->lock); 3294 bytes_used = fs_info->data_sinfo->bytes_used; 3295 spin_unlock(&fs_info->data_sinfo->lock); 3296 3297 if (!bytes_used) { 3298 struct btrfs_trans_handle *trans; 3299 int ret; 3300 3301 trans = btrfs_join_transaction(fs_info->tree_root); 3302 if (IS_ERR(trans)) 3303 return PTR_ERR(trans); 3304 3305 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); 3306 btrfs_end_transaction(trans); 3307 if (ret < 0) 3308 return ret; 3309 return 1; 3310 } 3311 3312 return 0; 3313 } 3314 3315 static int insert_balance_item(struct btrfs_fs_info *fs_info, 3316 struct btrfs_balance_control *bctl) 3317 { 3318 struct btrfs_root *root = fs_info->tree_root; 3319 struct btrfs_trans_handle *trans; 3320 struct btrfs_balance_item *item; 3321 struct btrfs_disk_balance_args disk_bargs; 3322 struct btrfs_path *path; 3323 struct extent_buffer *leaf; 3324 struct btrfs_key key; 3325 int ret, err; 3326 3327 path = btrfs_alloc_path(); 3328 if (!path) 3329 return -ENOMEM; 3330 3331 trans = btrfs_start_transaction(root, 0); 3332 if (IS_ERR(trans)) { 3333 btrfs_free_path(path); 3334 return PTR_ERR(trans); 3335 } 3336 3337 key.objectid = BTRFS_BALANCE_OBJECTID; 3338 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3339 key.offset = 0; 3340 3341 ret = btrfs_insert_empty_item(trans, root, path, &key, 3342 sizeof(*item)); 3343 if (ret) 3344 goto out; 3345 3346 leaf = path->nodes[0]; 3347 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3348 3349 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3350 3351 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 3352 btrfs_set_balance_data(leaf, item, &disk_bargs); 3353 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 3354 btrfs_set_balance_meta(leaf, item, &disk_bargs); 3355 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 3356 btrfs_set_balance_sys(leaf, item, &disk_bargs); 3357 3358 btrfs_set_balance_flags(leaf, item, bctl->flags); 3359 3360 btrfs_mark_buffer_dirty(leaf); 3361 out: 3362 btrfs_free_path(path); 3363 err = btrfs_commit_transaction(trans); 3364 if (err && !ret) 3365 ret = err; 3366 return ret; 3367 } 3368 3369 static int del_balance_item(struct btrfs_fs_info *fs_info) 3370 { 3371 struct btrfs_root *root = fs_info->tree_root; 3372 struct btrfs_trans_handle *trans; 3373 struct btrfs_path *path; 3374 struct btrfs_key key; 3375 int ret, err; 3376 3377 path = btrfs_alloc_path(); 3378 if (!path) 3379 return -ENOMEM; 3380 3381 trans = btrfs_start_transaction_fallback_global_rsv(root, 0); 3382 if (IS_ERR(trans)) { 3383 btrfs_free_path(path); 3384 return PTR_ERR(trans); 3385 } 3386 3387 key.objectid = BTRFS_BALANCE_OBJECTID; 3388 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3389 key.offset = 0; 3390 3391 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3392 if (ret < 0) 3393 goto out; 3394 if (ret > 0) { 3395 ret = -ENOENT; 3396 goto out; 3397 } 3398 3399 ret = btrfs_del_item(trans, root, path); 3400 out: 3401 btrfs_free_path(path); 3402 err = btrfs_commit_transaction(trans); 3403 if (err && !ret) 3404 ret = err; 3405 return ret; 3406 } 3407 3408 /* 3409 * This is a 
heuristic used to reduce the number of chunks balanced on 3410 * resume after balance was interrupted. 3411 */ 3412 static void update_balance_args(struct btrfs_balance_control *bctl) 3413 { 3414 /* 3415 * Turn on soft mode for chunk types that were being converted. 3416 */ 3417 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 3418 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 3419 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 3420 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 3421 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 3422 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 3423 3424 /* 3425 * Turn on usage filter if is not already used. The idea is 3426 * that chunks that we have already balanced should be 3427 * reasonably full. Don't do it for chunks that are being 3428 * converted - that will keep us from relocating unconverted 3429 * (albeit full) chunks. 3430 */ 3431 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 3432 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3433 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3434 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 3435 bctl->data.usage = 90; 3436 } 3437 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 3438 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3439 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3440 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 3441 bctl->sys.usage = 90; 3442 } 3443 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 3444 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3445 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3446 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 3447 bctl->meta.usage = 90; 3448 } 3449 } 3450 3451 /* 3452 * Clear the balance status in fs_info and delete the balance item from disk. 3453 */ 3454 static void reset_balance_state(struct btrfs_fs_info *fs_info) 3455 { 3456 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3457 int ret; 3458 3459 BUG_ON(!fs_info->balance_ctl); 3460 3461 spin_lock(&fs_info->balance_lock); 3462 fs_info->balance_ctl = NULL; 3463 spin_unlock(&fs_info->balance_lock); 3464 3465 kfree(bctl); 3466 ret = del_balance_item(fs_info); 3467 if (ret) 3468 btrfs_handle_fs_error(fs_info, ret, NULL); 3469 } 3470 3471 /* 3472 * Balance filters. Return 1 if chunk should be filtered out 3473 * (should not be balanced). 
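 *
 * Each filter below follows the same convention: return 0 to keep the
 * chunk, 1 to skip it.  should_balance_chunk() only relocates a chunk if
 * every filter enabled in the balance args returns 0.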
3474 */ 3475 static int chunk_profiles_filter(u64 chunk_type, 3476 struct btrfs_balance_args *bargs) 3477 { 3478 chunk_type = chunk_to_extended(chunk_type) & 3479 BTRFS_EXTENDED_PROFILE_MASK; 3480 3481 if (bargs->profiles & chunk_type) 3482 return 0; 3483 3484 return 1; 3485 } 3486 3487 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3488 struct btrfs_balance_args *bargs) 3489 { 3490 struct btrfs_block_group *cache; 3491 u64 chunk_used; 3492 u64 user_thresh_min; 3493 u64 user_thresh_max; 3494 int ret = 1; 3495 3496 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3497 chunk_used = cache->used; 3498 3499 if (bargs->usage_min == 0) 3500 user_thresh_min = 0; 3501 else 3502 user_thresh_min = div_factor_fine(cache->length, 3503 bargs->usage_min); 3504 3505 if (bargs->usage_max == 0) 3506 user_thresh_max = 1; 3507 else if (bargs->usage_max > 100) 3508 user_thresh_max = cache->length; 3509 else 3510 user_thresh_max = div_factor_fine(cache->length, 3511 bargs->usage_max); 3512 3513 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3514 ret = 0; 3515 3516 btrfs_put_block_group(cache); 3517 return ret; 3518 } 3519 3520 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3521 u64 chunk_offset, struct btrfs_balance_args *bargs) 3522 { 3523 struct btrfs_block_group *cache; 3524 u64 chunk_used, user_thresh; 3525 int ret = 1; 3526 3527 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3528 chunk_used = cache->used; 3529 3530 if (bargs->usage_min == 0) 3531 user_thresh = 1; 3532 else if (bargs->usage > 100) 3533 user_thresh = cache->length; 3534 else 3535 user_thresh = div_factor_fine(cache->length, bargs->usage); 3536 3537 if (chunk_used < user_thresh) 3538 ret = 0; 3539 3540 btrfs_put_block_group(cache); 3541 return ret; 3542 } 3543 3544 static int chunk_devid_filter(struct extent_buffer *leaf, 3545 struct btrfs_chunk *chunk, 3546 struct btrfs_balance_args *bargs) 3547 { 3548 struct btrfs_stripe *stripe; 3549 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3550 int i; 3551 3552 for (i = 0; i < num_stripes; i++) { 3553 stripe = btrfs_stripe_nr(chunk, i); 3554 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3555 return 0; 3556 } 3557 3558 return 1; 3559 } 3560 3561 static u64 calc_data_stripes(u64 type, int num_stripes) 3562 { 3563 const int index = btrfs_bg_flags_to_raid_index(type); 3564 const int ncopies = btrfs_raid_array[index].ncopies; 3565 const int nparity = btrfs_raid_array[index].nparity; 3566 3567 return (num_stripes - nparity) / ncopies; 3568 } 3569 3570 /* [pstart, pend) */ 3571 static int chunk_drange_filter(struct extent_buffer *leaf, 3572 struct btrfs_chunk *chunk, 3573 struct btrfs_balance_args *bargs) 3574 { 3575 struct btrfs_stripe *stripe; 3576 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3577 u64 stripe_offset; 3578 u64 stripe_length; 3579 u64 type; 3580 int factor; 3581 int i; 3582 3583 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3584 return 0; 3585 3586 type = btrfs_chunk_type(leaf, chunk); 3587 factor = calc_data_stripes(type, num_stripes); 3588 3589 for (i = 0; i < num_stripes; i++) { 3590 stripe = btrfs_stripe_nr(chunk, i); 3591 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3592 continue; 3593 3594 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3595 stripe_length = btrfs_chunk_length(leaf, chunk); 3596 stripe_length = div_u64(stripe_length, factor); 3597 3598 if (stripe_offset < bargs->pend && 3599 stripe_offset + stripe_length > bargs->pstart) 3600 return 0; 
3601 } 3602 3603 return 1; 3604 } 3605 3606 /* [vstart, vend) */ 3607 static int chunk_vrange_filter(struct extent_buffer *leaf, 3608 struct btrfs_chunk *chunk, 3609 u64 chunk_offset, 3610 struct btrfs_balance_args *bargs) 3611 { 3612 if (chunk_offset < bargs->vend && 3613 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3614 /* at least part of the chunk is inside this vrange */ 3615 return 0; 3616 3617 return 1; 3618 } 3619 3620 static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3621 struct btrfs_chunk *chunk, 3622 struct btrfs_balance_args *bargs) 3623 { 3624 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3625 3626 if (bargs->stripes_min <= num_stripes 3627 && num_stripes <= bargs->stripes_max) 3628 return 0; 3629 3630 return 1; 3631 } 3632 3633 static int chunk_soft_convert_filter(u64 chunk_type, 3634 struct btrfs_balance_args *bargs) 3635 { 3636 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3637 return 0; 3638 3639 chunk_type = chunk_to_extended(chunk_type) & 3640 BTRFS_EXTENDED_PROFILE_MASK; 3641 3642 if (bargs->target == chunk_type) 3643 return 1; 3644 3645 return 0; 3646 } 3647 3648 static int should_balance_chunk(struct extent_buffer *leaf, 3649 struct btrfs_chunk *chunk, u64 chunk_offset) 3650 { 3651 struct btrfs_fs_info *fs_info = leaf->fs_info; 3652 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3653 struct btrfs_balance_args *bargs = NULL; 3654 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3655 3656 /* type filter */ 3657 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3658 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3659 return 0; 3660 } 3661 3662 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3663 bargs = &bctl->data; 3664 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3665 bargs = &bctl->sys; 3666 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3667 bargs = &bctl->meta; 3668 3669 /* profiles filter */ 3670 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3671 chunk_profiles_filter(chunk_type, bargs)) { 3672 return 0; 3673 } 3674 3675 /* usage filter */ 3676 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3677 chunk_usage_filter(fs_info, chunk_offset, bargs)) { 3678 return 0; 3679 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3680 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3681 return 0; 3682 } 3683 3684 /* devid filter */ 3685 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3686 chunk_devid_filter(leaf, chunk, bargs)) { 3687 return 0; 3688 } 3689 3690 /* drange filter, makes sense only with devid filter */ 3691 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3692 chunk_drange_filter(leaf, chunk, bargs)) { 3693 return 0; 3694 } 3695 3696 /* vrange filter */ 3697 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3698 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3699 return 0; 3700 } 3701 3702 /* stripes filter */ 3703 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3704 chunk_stripes_range_filter(leaf, chunk, bargs)) { 3705 return 0; 3706 } 3707 3708 /* soft profile changing mode */ 3709 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3710 chunk_soft_convert_filter(chunk_type, bargs)) { 3711 return 0; 3712 } 3713 3714 /* 3715 * limited by count, must be the last filter 3716 */ 3717 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3718 if (bargs->limit == 0) 3719 return 0; 3720 else 3721 bargs->limit--; 3722 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 3723 /* 3724 * Same logic as the 'limit' filter; the minimum cannot be 3725 * determined here 
because we do not have the global information 3726 * about the count of all chunks that satisfy the filters. 3727 */ 3728 if (bargs->limit_max == 0) 3729 return 0; 3730 else 3731 bargs->limit_max--; 3732 } 3733 3734 return 1; 3735 } 3736 3737 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3738 { 3739 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3740 struct btrfs_root *chunk_root = fs_info->chunk_root; 3741 u64 chunk_type; 3742 struct btrfs_chunk *chunk; 3743 struct btrfs_path *path = NULL; 3744 struct btrfs_key key; 3745 struct btrfs_key found_key; 3746 struct extent_buffer *leaf; 3747 int slot; 3748 int ret; 3749 int enospc_errors = 0; 3750 bool counting = true; 3751 /* The single value limit and min/max limits use the same bytes in the */ 3752 u64 limit_data = bctl->data.limit; 3753 u64 limit_meta = bctl->meta.limit; 3754 u64 limit_sys = bctl->sys.limit; 3755 u32 count_data = 0; 3756 u32 count_meta = 0; 3757 u32 count_sys = 0; 3758 int chunk_reserved = 0; 3759 3760 path = btrfs_alloc_path(); 3761 if (!path) { 3762 ret = -ENOMEM; 3763 goto error; 3764 } 3765 3766 /* zero out stat counters */ 3767 spin_lock(&fs_info->balance_lock); 3768 memset(&bctl->stat, 0, sizeof(bctl->stat)); 3769 spin_unlock(&fs_info->balance_lock); 3770 again: 3771 if (!counting) { 3772 /* 3773 * The single value limit and min/max limits use the same bytes 3774 * in the 3775 */ 3776 bctl->data.limit = limit_data; 3777 bctl->meta.limit = limit_meta; 3778 bctl->sys.limit = limit_sys; 3779 } 3780 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3781 key.offset = (u64)-1; 3782 key.type = BTRFS_CHUNK_ITEM_KEY; 3783 3784 while (1) { 3785 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3786 atomic_read(&fs_info->balance_cancel_req)) { 3787 ret = -ECANCELED; 3788 goto error; 3789 } 3790 3791 mutex_lock(&fs_info->reclaim_bgs_lock); 3792 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3793 if (ret < 0) { 3794 mutex_unlock(&fs_info->reclaim_bgs_lock); 3795 goto error; 3796 } 3797 3798 /* 3799 * this shouldn't happen, it means the last relocate 3800 * failed 3801 */ 3802 if (ret == 0) 3803 BUG(); /* FIXME break ? 
*/ 3804 3805 ret = btrfs_previous_item(chunk_root, path, 0, 3806 BTRFS_CHUNK_ITEM_KEY); 3807 if (ret) { 3808 mutex_unlock(&fs_info->reclaim_bgs_lock); 3809 ret = 0; 3810 break; 3811 } 3812 3813 leaf = path->nodes[0]; 3814 slot = path->slots[0]; 3815 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3816 3817 if (found_key.objectid != key.objectid) { 3818 mutex_unlock(&fs_info->reclaim_bgs_lock); 3819 break; 3820 } 3821 3822 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3823 chunk_type = btrfs_chunk_type(leaf, chunk); 3824 3825 if (!counting) { 3826 spin_lock(&fs_info->balance_lock); 3827 bctl->stat.considered++; 3828 spin_unlock(&fs_info->balance_lock); 3829 } 3830 3831 ret = should_balance_chunk(leaf, chunk, found_key.offset); 3832 3833 btrfs_release_path(path); 3834 if (!ret) { 3835 mutex_unlock(&fs_info->reclaim_bgs_lock); 3836 goto loop; 3837 } 3838 3839 if (counting) { 3840 mutex_unlock(&fs_info->reclaim_bgs_lock); 3841 spin_lock(&fs_info->balance_lock); 3842 bctl->stat.expected++; 3843 spin_unlock(&fs_info->balance_lock); 3844 3845 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3846 count_data++; 3847 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3848 count_sys++; 3849 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3850 count_meta++; 3851 3852 goto loop; 3853 } 3854 3855 /* 3856 * Apply limit_min filter, no need to check if the LIMITS 3857 * filter is used, limit_min is 0 by default 3858 */ 3859 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 3860 count_data < bctl->data.limit_min) 3861 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 3862 count_meta < bctl->meta.limit_min) 3863 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 3864 count_sys < bctl->sys.limit_min)) { 3865 mutex_unlock(&fs_info->reclaim_bgs_lock); 3866 goto loop; 3867 } 3868 3869 if (!chunk_reserved) { 3870 /* 3871 * We may be relocating the only data chunk we have, 3872 * which could potentially end up with losing data's 3873 * raid profile, so lets allocate an empty one in 3874 * advance. 3875 */ 3876 ret = btrfs_may_alloc_data_chunk(fs_info, 3877 found_key.offset); 3878 if (ret < 0) { 3879 mutex_unlock(&fs_info->reclaim_bgs_lock); 3880 goto error; 3881 } else if (ret == 1) { 3882 chunk_reserved = 1; 3883 } 3884 } 3885 3886 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3887 mutex_unlock(&fs_info->reclaim_bgs_lock); 3888 if (ret == -ENOSPC) { 3889 enospc_errors++; 3890 } else if (ret == -ETXTBSY) { 3891 btrfs_info(fs_info, 3892 "skipping relocation of block group %llu due to active swapfile", 3893 found_key.offset); 3894 ret = 0; 3895 } else if (ret) { 3896 goto error; 3897 } else { 3898 spin_lock(&fs_info->balance_lock); 3899 bctl->stat.completed++; 3900 spin_unlock(&fs_info->balance_lock); 3901 } 3902 loop: 3903 if (found_key.offset == 0) 3904 break; 3905 key.offset = found_key.offset - 1; 3906 } 3907 3908 if (counting) { 3909 btrfs_release_path(path); 3910 counting = false; 3911 goto again; 3912 } 3913 error: 3914 btrfs_free_path(path); 3915 if (enospc_errors) { 3916 btrfs_info(fs_info, "%d enospc errors during balance", 3917 enospc_errors); 3918 if (!ret) 3919 ret = -ENOSPC; 3920 } 3921 3922 return ret; 3923 } 3924 3925 /** 3926 * alloc_profile_is_valid - see if a given profile is valid and reduced 3927 * @flags: profile to validate 3928 * @extended: if true @flags is treated as an extended profile 3929 */ 3930 static int alloc_profile_is_valid(u64 flags, int extended) 3931 { 3932 u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 3933 BTRFS_BLOCK_GROUP_PROFILE_MASK); 3934 3935 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 3936 3937 /* 1) check that all other bits are zeroed */ 3938 if (flags & ~mask) 3939 return 0; 3940 3941 /* 2) see if profile is reduced */ 3942 if (flags == 0) 3943 return !extended; /* "0" is valid for usual profiles */ 3944 3945 return has_single_bit_set(flags); 3946 } 3947 3948 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3949 { 3950 /* cancel requested || normal exit path */ 3951 return atomic_read(&fs_info->balance_cancel_req) || 3952 (atomic_read(&fs_info->balance_pause_req) == 0 && 3953 atomic_read(&fs_info->balance_cancel_req) == 0); 3954 } 3955 3956 /* 3957 * Validate target profile against allowed profiles and return true if it's OK. 3958 * Otherwise print the error message and return false. 3959 */ 3960 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, 3961 const struct btrfs_balance_args *bargs, 3962 u64 allowed, const char *type) 3963 { 3964 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3965 return true; 3966 3967 if (fs_info->sectorsize < PAGE_SIZE && 3968 bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3969 btrfs_err(fs_info, 3970 "RAID56 is not yet supported for sectorsize %u with page size %lu", 3971 fs_info->sectorsize, PAGE_SIZE); 3972 return false; 3973 } 3974 /* Profile is valid and does not have bits outside of the allowed set */ 3975 if (alloc_profile_is_valid(bargs->target, 1) && 3976 (bargs->target & ~allowed) == 0) 3977 return true; 3978 3979 btrfs_err(fs_info, "balance: invalid convert %s profile %s", 3980 type, btrfs_bg_type_to_raid_name(bargs->target)); 3981 return false; 3982 } 3983 3984 /* 3985 * Fill @buf with textual description of balance filter flags @bargs, up to 3986 * @size_buf including the terminating null. The output may be trimmed if it 3987 * does not fit into the provided buffer. 
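* For example (illustrative values, not from a real run): a convert filter combined with the soft and usage filters would be rendered as "convert=raid1,soft,usage=90"; the trailing ',' appended by the helpers is stripped at out_overflow.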
3988 */ 3989 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, 3990 u32 size_buf) 3991 { 3992 int ret; 3993 u32 size_bp = size_buf; 3994 char *bp = buf; 3995 u64 flags = bargs->flags; 3996 char tmp_buf[128] = {'\0'}; 3997 3998 if (!flags) 3999 return; 4000 4001 #define CHECK_APPEND_NOARG(a) \ 4002 do { \ 4003 ret = snprintf(bp, size_bp, (a)); \ 4004 if (ret < 0 || ret >= size_bp) \ 4005 goto out_overflow; \ 4006 size_bp -= ret; \ 4007 bp += ret; \ 4008 } while (0) 4009 4010 #define CHECK_APPEND_1ARG(a, v1) \ 4011 do { \ 4012 ret = snprintf(bp, size_bp, (a), (v1)); \ 4013 if (ret < 0 || ret >= size_bp) \ 4014 goto out_overflow; \ 4015 size_bp -= ret; \ 4016 bp += ret; \ 4017 } while (0) 4018 4019 #define CHECK_APPEND_2ARG(a, v1, v2) \ 4020 do { \ 4021 ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ 4022 if (ret < 0 || ret >= size_bp) \ 4023 goto out_overflow; \ 4024 size_bp -= ret; \ 4025 bp += ret; \ 4026 } while (0) 4027 4028 if (flags & BTRFS_BALANCE_ARGS_CONVERT) 4029 CHECK_APPEND_1ARG("convert=%s,", 4030 btrfs_bg_type_to_raid_name(bargs->target)); 4031 4032 if (flags & BTRFS_BALANCE_ARGS_SOFT) 4033 CHECK_APPEND_NOARG("soft,"); 4034 4035 if (flags & BTRFS_BALANCE_ARGS_PROFILES) { 4036 btrfs_describe_block_groups(bargs->profiles, tmp_buf, 4037 sizeof(tmp_buf)); 4038 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); 4039 } 4040 4041 if (flags & BTRFS_BALANCE_ARGS_USAGE) 4042 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); 4043 4044 if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) 4045 CHECK_APPEND_2ARG("usage=%u..%u,", 4046 bargs->usage_min, bargs->usage_max); 4047 4048 if (flags & BTRFS_BALANCE_ARGS_DEVID) 4049 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); 4050 4051 if (flags & BTRFS_BALANCE_ARGS_DRANGE) 4052 CHECK_APPEND_2ARG("drange=%llu..%llu,", 4053 bargs->pstart, bargs->pend); 4054 4055 if (flags & BTRFS_BALANCE_ARGS_VRANGE) 4056 CHECK_APPEND_2ARG("vrange=%llu..%llu,", 4057 bargs->vstart, bargs->vend); 4058 4059 if (flags & BTRFS_BALANCE_ARGS_LIMIT) 4060 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); 4061 4062 if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) 4063 CHECK_APPEND_2ARG("limit=%u..%u,", 4064 bargs->limit_min, bargs->limit_max); 4065 4066 if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) 4067 CHECK_APPEND_2ARG("stripes=%u..%u,", 4068 bargs->stripes_min, bargs->stripes_max); 4069 4070 #undef CHECK_APPEND_2ARG 4071 #undef CHECK_APPEND_1ARG 4072 #undef CHECK_APPEND_NOARG 4073 4074 out_overflow: 4075 4076 if (size_bp < size_buf) 4077 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ 4078 else 4079 buf[0] = '\0'; 4080 } 4081 4082 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) 4083 { 4084 u32 size_buf = 1024; 4085 char tmp_buf[192] = {'\0'}; 4086 char *buf; 4087 char *bp; 4088 u32 size_bp = size_buf; 4089 int ret; 4090 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4091 4092 buf = kzalloc(size_buf, GFP_KERNEL); 4093 if (!buf) 4094 return; 4095 4096 bp = buf; 4097 4098 #define CHECK_APPEND_1ARG(a, v1) \ 4099 do { \ 4100 ret = snprintf(bp, size_bp, (a), (v1)); \ 4101 if (ret < 0 || ret >= size_bp) \ 4102 goto out_overflow; \ 4103 size_bp -= ret; \ 4104 bp += ret; \ 4105 } while (0) 4106 4107 if (bctl->flags & BTRFS_BALANCE_FORCE) 4108 CHECK_APPEND_1ARG("%s", "-f "); 4109 4110 if (bctl->flags & BTRFS_BALANCE_DATA) { 4111 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); 4112 CHECK_APPEND_1ARG("-d%s ", tmp_buf); 4113 } 4114 4115 if (bctl->flags & BTRFS_BALANCE_METADATA) { 4116 describe_balance_args(&bctl->meta, 
tmp_buf, sizeof(tmp_buf)); 4117 CHECK_APPEND_1ARG("-m%s ", tmp_buf); 4118 } 4119 4120 if (bctl->flags & BTRFS_BALANCE_SYSTEM) { 4121 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); 4122 CHECK_APPEND_1ARG("-s%s ", tmp_buf); 4123 } 4124 4125 #undef CHECK_APPEND_1ARG 4126 4127 out_overflow: 4128 4129 if (size_bp < size_buf) 4130 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ 4131 btrfs_info(fs_info, "balance: %s %s", 4132 (bctl->flags & BTRFS_BALANCE_RESUME) ? 4133 "resume" : "start", buf); 4134 4135 kfree(buf); 4136 } 4137 4138 /* 4139 * Should be called with balance mutex held 4140 */ 4141 int btrfs_balance(struct btrfs_fs_info *fs_info, 4142 struct btrfs_balance_control *bctl, 4143 struct btrfs_ioctl_balance_args *bargs) 4144 { 4145 u64 meta_target, data_target; 4146 u64 allowed; 4147 int mixed = 0; 4148 int ret; 4149 u64 num_devices; 4150 unsigned seq; 4151 bool reducing_redundancy; 4152 int i; 4153 4154 if (btrfs_fs_closing(fs_info) || 4155 atomic_read(&fs_info->balance_pause_req) || 4156 btrfs_should_cancel_balance(fs_info)) { 4157 ret = -EINVAL; 4158 goto out; 4159 } 4160 4161 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 4162 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 4163 mixed = 1; 4164 4165 /* 4166 * In case of mixed groups both data and meta should be picked, 4167 * and identical options should be given for both of them. 4168 */ 4169 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 4170 if (mixed && (bctl->flags & allowed)) { 4171 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 4172 !(bctl->flags & BTRFS_BALANCE_METADATA) || 4173 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 4174 btrfs_err(fs_info, 4175 "balance: mixed groups data and metadata options must be the same"); 4176 ret = -EINVAL; 4177 goto out; 4178 } 4179 } 4180 4181 /* 4182 * rw_devices will not change at the moment, device add/delete/replace 4183 * are exclusive 4184 */ 4185 num_devices = fs_info->fs_devices->rw_devices; 4186 4187 /* 4188 * SINGLE profile on-disk has no profile bit, but in-memory we have a 4189 * special bit for it, to make it easier to distinguish. Thus we need 4190 * to set it manually, or balance would refuse the profile.
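* (BTRFS_AVAIL_ALLOC_BIT_SINGLE below is that in-memory-only bit; seeding it into the allowed mask keeps a convert to single valid even though no on-disk profile bit exists for it.)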
4191 */ 4192 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 4193 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) 4194 if (num_devices >= btrfs_raid_array[i].devs_min) 4195 allowed |= btrfs_raid_array[i].bg_flag; 4196 4197 if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || 4198 !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || 4199 !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { 4200 ret = -EINVAL; 4201 goto out; 4202 } 4203 4204 /* 4205 * Allow to reduce metadata or system integrity only if force set for 4206 * profiles with redundancy (copies, parity) 4207 */ 4208 allowed = 0; 4209 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { 4210 if (btrfs_raid_array[i].ncopies >= 2 || 4211 btrfs_raid_array[i].tolerated_failures >= 1) 4212 allowed |= btrfs_raid_array[i].bg_flag; 4213 } 4214 do { 4215 seq = read_seqbegin(&fs_info->profiles_lock); 4216 4217 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4218 (fs_info->avail_system_alloc_bits & allowed) && 4219 !(bctl->sys.target & allowed)) || 4220 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4221 (fs_info->avail_metadata_alloc_bits & allowed) && 4222 !(bctl->meta.target & allowed))) 4223 reducing_redundancy = true; 4224 else 4225 reducing_redundancy = false; 4226 4227 /* if we're not converting, the target field is uninitialized */ 4228 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4229 bctl->meta.target : fs_info->avail_metadata_alloc_bits; 4230 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4231 bctl->data.target : fs_info->avail_data_alloc_bits; 4232 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4233 4234 if (reducing_redundancy) { 4235 if (bctl->flags & BTRFS_BALANCE_FORCE) { 4236 btrfs_info(fs_info, 4237 "balance: force reducing metadata redundancy"); 4238 } else { 4239 btrfs_err(fs_info, 4240 "balance: reduces metadata redundancy, use --force if you want this"); 4241 ret = -EINVAL; 4242 goto out; 4243 } 4244 } 4245 4246 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 4247 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 4248 btrfs_warn(fs_info, 4249 "balance: metadata profile %s has lower redundancy than data profile %s", 4250 btrfs_bg_type_to_raid_name(meta_target), 4251 btrfs_bg_type_to_raid_name(data_target)); 4252 } 4253 4254 ret = insert_balance_item(fs_info, bctl); 4255 if (ret && ret != -EEXIST) 4256 goto out; 4257 4258 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 4259 BUG_ON(ret == -EEXIST); 4260 BUG_ON(fs_info->balance_ctl); 4261 spin_lock(&fs_info->balance_lock); 4262 fs_info->balance_ctl = bctl; 4263 spin_unlock(&fs_info->balance_lock); 4264 } else { 4265 BUG_ON(ret != -EEXIST); 4266 spin_lock(&fs_info->balance_lock); 4267 update_balance_args(bctl); 4268 spin_unlock(&fs_info->balance_lock); 4269 } 4270 4271 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4272 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4273 describe_balance_start_or_resume(fs_info); 4274 mutex_unlock(&fs_info->balance_mutex); 4275 4276 ret = __btrfs_balance(fs_info); 4277 4278 mutex_lock(&fs_info->balance_mutex); 4279 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) 4280 btrfs_info(fs_info, "balance: paused"); 4281 /* 4282 * Balance can be canceled by: 4283 * 4284 * - Regular cancel request 4285 * Then ret == -ECANCELED and balance_cancel_req > 0 4286 * 4287 * - Fatal signal to "btrfs" process 4288 * Either the signal caught by wait_reserve_ticket() and callers 4289 * got 
-EINTR, or caught by btrfs_should_cancel_balance() and 4290 * got -ECANCELED. 4291 * Either way, in this case balance_cancel_req = 0, and 4292 * ret == -EINTR or ret == -ECANCELED. 4293 * 4294 * So here we only check the return value to catch canceled balance. 4295 */ 4296 else if (ret == -ECANCELED || ret == -EINTR) 4297 btrfs_info(fs_info, "balance: canceled"); 4298 else 4299 btrfs_info(fs_info, "balance: ended with status: %d", ret); 4300 4301 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4302 4303 if (bargs) { 4304 memset(bargs, 0, sizeof(*bargs)); 4305 btrfs_update_ioctl_balance_args(fs_info, bargs); 4306 } 4307 4308 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 4309 balance_need_close(fs_info)) { 4310 reset_balance_state(fs_info); 4311 btrfs_exclop_finish(fs_info); 4312 } 4313 4314 wake_up(&fs_info->balance_wait_q); 4315 4316 return ret; 4317 out: 4318 if (bctl->flags & BTRFS_BALANCE_RESUME) 4319 reset_balance_state(fs_info); 4320 else 4321 kfree(bctl); 4322 btrfs_exclop_finish(fs_info); 4323 4324 return ret; 4325 } 4326 4327 static int balance_kthread(void *data) 4328 { 4329 struct btrfs_fs_info *fs_info = data; 4330 int ret = 0; 4331 4332 mutex_lock(&fs_info->balance_mutex); 4333 if (fs_info->balance_ctl) 4334 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); 4335 mutex_unlock(&fs_info->balance_mutex); 4336 4337 return ret; 4338 } 4339 4340 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 4341 { 4342 struct task_struct *tsk; 4343 4344 mutex_lock(&fs_info->balance_mutex); 4345 if (!fs_info->balance_ctl) { 4346 mutex_unlock(&fs_info->balance_mutex); 4347 return 0; 4348 } 4349 mutex_unlock(&fs_info->balance_mutex); 4350 4351 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 4352 btrfs_info(fs_info, "balance: resume skipped"); 4353 return 0; 4354 } 4355 4356 /* 4357 * A ro->rw remount sequence should continue with the paused balance 4358 * regardless of who pauses it, system or the user as of now, so set 4359 * the resume flag. 
4360 */ 4361 spin_lock(&fs_info->balance_lock); 4362 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; 4363 spin_unlock(&fs_info->balance_lock); 4364 4365 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 4366 return PTR_ERR_OR_ZERO(tsk); 4367 } 4368 4369 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 4370 { 4371 struct btrfs_balance_control *bctl; 4372 struct btrfs_balance_item *item; 4373 struct btrfs_disk_balance_args disk_bargs; 4374 struct btrfs_path *path; 4375 struct extent_buffer *leaf; 4376 struct btrfs_key key; 4377 int ret; 4378 4379 path = btrfs_alloc_path(); 4380 if (!path) 4381 return -ENOMEM; 4382 4383 key.objectid = BTRFS_BALANCE_OBJECTID; 4384 key.type = BTRFS_TEMPORARY_ITEM_KEY; 4385 key.offset = 0; 4386 4387 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4388 if (ret < 0) 4389 goto out; 4390 if (ret > 0) { /* ret = -ENOENT; */ 4391 ret = 0; 4392 goto out; 4393 } 4394 4395 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 4396 if (!bctl) { 4397 ret = -ENOMEM; 4398 goto out; 4399 } 4400 4401 leaf = path->nodes[0]; 4402 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 4403 4404 bctl->flags = btrfs_balance_flags(leaf, item); 4405 bctl->flags |= BTRFS_BALANCE_RESUME; 4406 4407 btrfs_balance_data(leaf, item, &disk_bargs); 4408 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 4409 btrfs_balance_meta(leaf, item, &disk_bargs); 4410 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 4411 btrfs_balance_sys(leaf, item, &disk_bargs); 4412 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 4413 4414 /* 4415 * This should never happen, as the paused balance state is recovered 4416 * during mount without any chance of other exclusive ops to collide. 4417 * 4418 * This gives the exclusive op status to balance and keeps in paused 4419 * state until user intervention (cancel or umount). If the ownership 4420 * cannot be assigned, show a message but do not fail. The balance 4421 * is in a paused state and must have fs_info::balance_ctl properly 4422 * set up. 
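* (This is also why the btrfs_exclop_start() failure below is only a warning: the balance item is still on disk, so the user can resume or cancel it manually.)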
4423 */ 4424 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) 4425 btrfs_warn(fs_info, 4426 "balance: cannot set exclusive op status, resume manually"); 4427 4428 btrfs_release_path(path); 4429 4430 mutex_lock(&fs_info->balance_mutex); 4431 BUG_ON(fs_info->balance_ctl); 4432 spin_lock(&fs_info->balance_lock); 4433 fs_info->balance_ctl = bctl; 4434 spin_unlock(&fs_info->balance_lock); 4435 mutex_unlock(&fs_info->balance_mutex); 4436 out: 4437 btrfs_free_path(path); 4438 return ret; 4439 } 4440 4441 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4442 { 4443 int ret = 0; 4444 4445 mutex_lock(&fs_info->balance_mutex); 4446 if (!fs_info->balance_ctl) { 4447 mutex_unlock(&fs_info->balance_mutex); 4448 return -ENOTCONN; 4449 } 4450 4451 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4452 atomic_inc(&fs_info->balance_pause_req); 4453 mutex_unlock(&fs_info->balance_mutex); 4454 4455 wait_event(fs_info->balance_wait_q, 4456 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4457 4458 mutex_lock(&fs_info->balance_mutex); 4459 /* we are good with balance_ctl ripped off from under us */ 4460 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4461 atomic_dec(&fs_info->balance_pause_req); 4462 } else { 4463 ret = -ENOTCONN; 4464 } 4465 4466 mutex_unlock(&fs_info->balance_mutex); 4467 return ret; 4468 } 4469 4470 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4471 { 4472 mutex_lock(&fs_info->balance_mutex); 4473 if (!fs_info->balance_ctl) { 4474 mutex_unlock(&fs_info->balance_mutex); 4475 return -ENOTCONN; 4476 } 4477 4478 /* 4479 * A paused balance with the item stored on disk can be resumed at 4480 * mount time if the mount is read-write. Otherwise it's still paused 4481 * and we must not allow cancelling as it deletes the item. 4482 */ 4483 if (sb_rdonly(fs_info->sb)) { 4484 mutex_unlock(&fs_info->balance_mutex); 4485 return -EROFS; 4486 } 4487 4488 atomic_inc(&fs_info->balance_cancel_req); 4489 /* 4490 * if we are running just wait and return, balance item is 4491 * deleted in btrfs_balance in this case 4492 */ 4493 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4494 mutex_unlock(&fs_info->balance_mutex); 4495 wait_event(fs_info->balance_wait_q, 4496 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4497 mutex_lock(&fs_info->balance_mutex); 4498 } else { 4499 mutex_unlock(&fs_info->balance_mutex); 4500 /* 4501 * Lock released to allow other waiters to continue, we'll 4502 * reexamine the status again. 
4503 */ 4504 mutex_lock(&fs_info->balance_mutex); 4505 4506 if (fs_info->balance_ctl) { 4507 reset_balance_state(fs_info); 4508 btrfs_exclop_finish(fs_info); 4509 btrfs_info(fs_info, "balance: canceled"); 4510 } 4511 } 4512 4513 BUG_ON(fs_info->balance_ctl || 4514 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4515 atomic_dec(&fs_info->balance_cancel_req); 4516 mutex_unlock(&fs_info->balance_mutex); 4517 return 0; 4518 } 4519 4520 int btrfs_uuid_scan_kthread(void *data) 4521 { 4522 struct btrfs_fs_info *fs_info = data; 4523 struct btrfs_root *root = fs_info->tree_root; 4524 struct btrfs_key key; 4525 struct btrfs_path *path = NULL; 4526 int ret = 0; 4527 struct extent_buffer *eb; 4528 int slot; 4529 struct btrfs_root_item root_item; 4530 u32 item_size; 4531 struct btrfs_trans_handle *trans = NULL; 4532 bool closing = false; 4533 4534 path = btrfs_alloc_path(); 4535 if (!path) { 4536 ret = -ENOMEM; 4537 goto out; 4538 } 4539 4540 key.objectid = 0; 4541 key.type = BTRFS_ROOT_ITEM_KEY; 4542 key.offset = 0; 4543 4544 while (1) { 4545 if (btrfs_fs_closing(fs_info)) { 4546 closing = true; 4547 break; 4548 } 4549 ret = btrfs_search_forward(root, &key, path, 4550 BTRFS_OLDEST_GENERATION); 4551 if (ret) { 4552 if (ret > 0) 4553 ret = 0; 4554 break; 4555 } 4556 4557 if (key.type != BTRFS_ROOT_ITEM_KEY || 4558 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4559 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4560 key.objectid > BTRFS_LAST_FREE_OBJECTID) 4561 goto skip; 4562 4563 eb = path->nodes[0]; 4564 slot = path->slots[0]; 4565 item_size = btrfs_item_size_nr(eb, slot); 4566 if (item_size < sizeof(root_item)) 4567 goto skip; 4568 4569 read_extent_buffer(eb, &root_item, 4570 btrfs_item_ptr_offset(eb, slot), 4571 (int)sizeof(root_item)); 4572 if (btrfs_root_refs(&root_item) == 0) 4573 goto skip; 4574 4575 if (!btrfs_is_empty_uuid(root_item.uuid) || 4576 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4577 if (trans) 4578 goto update_tree; 4579 4580 btrfs_release_path(path); 4581 /* 4582 * 1 - subvol uuid item 4583 * 1 - received_subvol uuid item 4584 */ 4585 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4586 if (IS_ERR(trans)) { 4587 ret = PTR_ERR(trans); 4588 break; 4589 } 4590 continue; 4591 } else { 4592 goto skip; 4593 } 4594 update_tree: 4595 btrfs_release_path(path); 4596 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4597 ret = btrfs_uuid_tree_add(trans, root_item.uuid, 4598 BTRFS_UUID_KEY_SUBVOL, 4599 key.objectid); 4600 if (ret < 0) { 4601 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4602 ret); 4603 break; 4604 } 4605 } 4606 4607 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4608 ret = btrfs_uuid_tree_add(trans, 4609 root_item.received_uuid, 4610 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4611 key.objectid); 4612 if (ret < 0) { 4613 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4614 ret); 4615 break; 4616 } 4617 } 4618 4619 skip: 4620 btrfs_release_path(path); 4621 if (trans) { 4622 ret = btrfs_end_transaction(trans); 4623 trans = NULL; 4624 if (ret) 4625 break; 4626 } 4627 4628 if (key.offset < (u64)-1) { 4629 key.offset++; 4630 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4631 key.offset = 0; 4632 key.type = BTRFS_ROOT_ITEM_KEY; 4633 } else if (key.objectid < (u64)-1) { 4634 key.offset = 0; 4635 key.type = BTRFS_ROOT_ITEM_KEY; 4636 key.objectid++; 4637 } else { 4638 break; 4639 } 4640 cond_resched(); 4641 } 4642 4643 out: 4644 btrfs_free_path(path); 4645 if (trans && !IS_ERR(trans)) 4646 btrfs_end_transaction(trans); 4647 if (ret) 4648 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread 
failed %d", ret); 4649 else if (!closing) 4650 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4651 up(&fs_info->uuid_tree_rescan_sem); 4652 return 0; 4653 } 4654 4655 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4656 { 4657 struct btrfs_trans_handle *trans; 4658 struct btrfs_root *tree_root = fs_info->tree_root; 4659 struct btrfs_root *uuid_root; 4660 struct task_struct *task; 4661 int ret; 4662 4663 /* 4664 * 1 - root node 4665 * 1 - root item 4666 */ 4667 trans = btrfs_start_transaction(tree_root, 2); 4668 if (IS_ERR(trans)) 4669 return PTR_ERR(trans); 4670 4671 uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); 4672 if (IS_ERR(uuid_root)) { 4673 ret = PTR_ERR(uuid_root); 4674 btrfs_abort_transaction(trans, ret); 4675 btrfs_end_transaction(trans); 4676 return ret; 4677 } 4678 4679 fs_info->uuid_root = uuid_root; 4680 4681 ret = btrfs_commit_transaction(trans); 4682 if (ret) 4683 return ret; 4684 4685 down(&fs_info->uuid_tree_rescan_sem); 4686 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4687 if (IS_ERR(task)) { 4688 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4689 btrfs_warn(fs_info, "failed to start uuid_scan task"); 4690 up(&fs_info->uuid_tree_rescan_sem); 4691 return PTR_ERR(task); 4692 } 4693 4694 return 0; 4695 } 4696 4697 /* 4698 * shrinking a device means finding all of the device extents past 4699 * the new size, and then following the back refs to the chunks. 4700 * The chunk relocation code actually frees the device extent 4701 */ 4702 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 4703 { 4704 struct btrfs_fs_info *fs_info = device->fs_info; 4705 struct btrfs_root *root = fs_info->dev_root; 4706 struct btrfs_trans_handle *trans; 4707 struct btrfs_dev_extent *dev_extent = NULL; 4708 struct btrfs_path *path; 4709 u64 length; 4710 u64 chunk_offset; 4711 int ret; 4712 int slot; 4713 int failed = 0; 4714 bool retried = false; 4715 struct extent_buffer *l; 4716 struct btrfs_key key; 4717 struct btrfs_super_block *super_copy = fs_info->super_copy; 4718 u64 old_total = btrfs_super_total_bytes(super_copy); 4719 u64 old_size = btrfs_device_get_total_bytes(device); 4720 u64 diff; 4721 u64 start; 4722 4723 new_size = round_down(new_size, fs_info->sectorsize); 4724 start = new_size; 4725 diff = round_down(old_size - new_size, fs_info->sectorsize); 4726 4727 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4728 return -EINVAL; 4729 4730 path = btrfs_alloc_path(); 4731 if (!path) 4732 return -ENOMEM; 4733 4734 path->reada = READA_BACK; 4735 4736 trans = btrfs_start_transaction(root, 0); 4737 if (IS_ERR(trans)) { 4738 btrfs_free_path(path); 4739 return PTR_ERR(trans); 4740 } 4741 4742 mutex_lock(&fs_info->chunk_mutex); 4743 4744 btrfs_device_set_total_bytes(device, new_size); 4745 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4746 device->fs_devices->total_rw_bytes -= diff; 4747 atomic64_sub(diff, &fs_info->free_chunk_space); 4748 } 4749 4750 /* 4751 * Once the device's size has been set to the new size, ensure all 4752 * in-memory chunks are synced to disk so that the loop below sees them 4753 * and relocates them accordingly. 
4754 */ 4755 if (contains_pending_extent(device, &start, diff)) { 4756 mutex_unlock(&fs_info->chunk_mutex); 4757 ret = btrfs_commit_transaction(trans); 4758 if (ret) 4759 goto done; 4760 } else { 4761 mutex_unlock(&fs_info->chunk_mutex); 4762 btrfs_end_transaction(trans); 4763 } 4764 4765 again: 4766 key.objectid = device->devid; 4767 key.offset = (u64)-1; 4768 key.type = BTRFS_DEV_EXTENT_KEY; 4769 4770 do { 4771 mutex_lock(&fs_info->reclaim_bgs_lock); 4772 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4773 if (ret < 0) { 4774 mutex_unlock(&fs_info->reclaim_bgs_lock); 4775 goto done; 4776 } 4777 4778 ret = btrfs_previous_item(root, path, 0, key.type); 4779 if (ret) { 4780 mutex_unlock(&fs_info->reclaim_bgs_lock); 4781 if (ret < 0) 4782 goto done; 4783 ret = 0; 4784 btrfs_release_path(path); 4785 break; 4786 } 4787 4788 l = path->nodes[0]; 4789 slot = path->slots[0]; 4790 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4791 4792 if (key.objectid != device->devid) { 4793 mutex_unlock(&fs_info->reclaim_bgs_lock); 4794 btrfs_release_path(path); 4795 break; 4796 } 4797 4798 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4799 length = btrfs_dev_extent_length(l, dev_extent); 4800 4801 if (key.offset + length <= new_size) { 4802 mutex_unlock(&fs_info->reclaim_bgs_lock); 4803 btrfs_release_path(path); 4804 break; 4805 } 4806 4807 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4808 btrfs_release_path(path); 4809 4810 /* 4811 * We may be relocating the only data chunk we have, 4812 * which could potentially end up with losing data's 4813 * raid profile, so lets allocate an empty one in 4814 * advance. 4815 */ 4816 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 4817 if (ret < 0) { 4818 mutex_unlock(&fs_info->reclaim_bgs_lock); 4819 goto done; 4820 } 4821 4822 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 4823 mutex_unlock(&fs_info->reclaim_bgs_lock); 4824 if (ret == -ENOSPC) { 4825 failed++; 4826 } else if (ret) { 4827 if (ret == -ETXTBSY) { 4828 btrfs_warn(fs_info, 4829 "could not shrink block group %llu due to active swapfile", 4830 chunk_offset); 4831 } 4832 goto done; 4833 } 4834 } while (key.offset-- > 0); 4835 4836 if (failed && !retried) { 4837 failed = 0; 4838 retried = true; 4839 goto again; 4840 } else if (failed && retried) { 4841 ret = -ENOSPC; 4842 goto done; 4843 } 4844 4845 /* Shrinking succeeded, else we would be at "done". */ 4846 trans = btrfs_start_transaction(root, 0); 4847 if (IS_ERR(trans)) { 4848 ret = PTR_ERR(trans); 4849 goto done; 4850 } 4851 4852 mutex_lock(&fs_info->chunk_mutex); 4853 /* Clear all state bits beyond the shrunk device size */ 4854 clear_extent_bits(&device->alloc_state, new_size, (u64)-1, 4855 CHUNK_STATE_MASK); 4856 4857 btrfs_device_set_disk_total_bytes(device, new_size); 4858 if (list_empty(&device->post_commit_list)) 4859 list_add_tail(&device->post_commit_list, 4860 &trans->transaction->dev_update_list); 4861 4862 WARN_ON(diff > old_total); 4863 btrfs_set_super_total_bytes(super_copy, 4864 round_down(old_total - diff, fs_info->sectorsize)); 4865 mutex_unlock(&fs_info->chunk_mutex); 4866 4867 /* Now btrfs_update_device() will change the on-disk size. 
*/ 4868 ret = btrfs_update_device(trans, device); 4869 if (ret < 0) { 4870 btrfs_abort_transaction(trans, ret); 4871 btrfs_end_transaction(trans); 4872 } else { 4873 ret = btrfs_commit_transaction(trans); 4874 } 4875 done: 4876 btrfs_free_path(path); 4877 if (ret) { 4878 mutex_lock(&fs_info->chunk_mutex); 4879 btrfs_device_set_total_bytes(device, old_size); 4880 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 4881 device->fs_devices->total_rw_bytes += diff; 4882 atomic64_add(diff, &fs_info->free_chunk_space); 4883 mutex_unlock(&fs_info->chunk_mutex); 4884 } 4885 return ret; 4886 } 4887 4888 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 4889 struct btrfs_key *key, 4890 struct btrfs_chunk *chunk, int item_size) 4891 { 4892 struct btrfs_super_block *super_copy = fs_info->super_copy; 4893 struct btrfs_disk_key disk_key; 4894 u32 array_size; 4895 u8 *ptr; 4896 4897 lockdep_assert_held(&fs_info->chunk_mutex); 4898 4899 array_size = btrfs_super_sys_array_size(super_copy); 4900 if (array_size + item_size + sizeof(disk_key) 4901 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 4902 return -EFBIG; 4903 4904 ptr = super_copy->sys_chunk_array + array_size; 4905 btrfs_cpu_key_to_disk(&disk_key, key); 4906 memcpy(ptr, &disk_key, sizeof(disk_key)); 4907 ptr += sizeof(disk_key); 4908 memcpy(ptr, chunk, item_size); 4909 item_size += sizeof(disk_key); 4910 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 4911 4912 return 0; 4913 } 4914 4915 /* 4916 * sort the devices in descending order by max_avail, total_avail 4917 */ 4918 static int btrfs_cmp_device_info(const void *a, const void *b) 4919 { 4920 const struct btrfs_device_info *di_a = a; 4921 const struct btrfs_device_info *di_b = b; 4922 4923 if (di_a->max_avail > di_b->max_avail) 4924 return -1; 4925 if (di_a->max_avail < di_b->max_avail) 4926 return 1; 4927 if (di_a->total_avail > di_b->total_avail) 4928 return -1; 4929 if (di_a->total_avail < di_b->total_avail) 4930 return 1; 4931 return 0; 4932 } 4933 4934 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 4935 { 4936 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 4937 return; 4938 4939 btrfs_set_fs_incompat(info, RAID56); 4940 } 4941 4942 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) 4943 { 4944 if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) 4945 return; 4946 4947 btrfs_set_fs_incompat(info, RAID1C34); 4948 } 4949 4950 /* 4951 * Structure used internally for __btrfs_alloc_chunk() function. 4952 * Wraps needed parameters. 
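* Most of the fields are seeded from btrfs_raid_array by init_alloc_chunk_ctl() and then refined by the policy specific helpers below.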
4953 */ 4954 struct alloc_chunk_ctl { 4955 u64 start; 4956 u64 type; 4957 /* Total number of stripes to allocate */ 4958 int num_stripes; 4959 /* sub_stripes info for map */ 4960 int sub_stripes; 4961 /* Stripes per device */ 4962 int dev_stripes; 4963 /* Maximum number of devices to use */ 4964 int devs_max; 4965 /* Minimum number of devices to use */ 4966 int devs_min; 4967 /* ndevs has to be a multiple of this */ 4968 int devs_increment; 4969 /* Number of copies */ 4970 int ncopies; 4971 /* Number of stripes worth of bytes to store parity information */ 4972 int nparity; 4973 u64 max_stripe_size; 4974 u64 max_chunk_size; 4975 u64 dev_extent_min; 4976 u64 stripe_size; 4977 u64 chunk_size; 4978 int ndevs; 4979 }; 4980 4981 static void init_alloc_chunk_ctl_policy_regular( 4982 struct btrfs_fs_devices *fs_devices, 4983 struct alloc_chunk_ctl *ctl) 4984 { 4985 u64 type = ctl->type; 4986 4987 if (type & BTRFS_BLOCK_GROUP_DATA) { 4988 ctl->max_stripe_size = SZ_1G; 4989 ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; 4990 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 4991 /* For larger filesystems, use larger metadata chunks */ 4992 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) 4993 ctl->max_stripe_size = SZ_1G; 4994 else 4995 ctl->max_stripe_size = SZ_256M; 4996 ctl->max_chunk_size = ctl->max_stripe_size; 4997 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 4998 ctl->max_stripe_size = SZ_32M; 4999 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 5000 ctl->devs_max = min_t(int, ctl->devs_max, 5001 BTRFS_MAX_DEVS_SYS_CHUNK); 5002 } else { 5003 BUG(); 5004 } 5005 5006 /* We don't want a chunk larger than 10% of writable space */ 5007 ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 5008 ctl->max_chunk_size); 5009 ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; 5010 } 5011 5012 static void init_alloc_chunk_ctl_policy_zoned( 5013 struct btrfs_fs_devices *fs_devices, 5014 struct alloc_chunk_ctl *ctl) 5015 { 5016 u64 zone_size = fs_devices->fs_info->zone_size; 5017 u64 limit; 5018 int min_num_stripes = ctl->devs_min * ctl->dev_stripes; 5019 int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; 5020 u64 min_chunk_size = min_data_stripes * zone_size; 5021 u64 type = ctl->type; 5022 5023 ctl->max_stripe_size = zone_size; 5024 if (type & BTRFS_BLOCK_GROUP_DATA) { 5025 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, 5026 zone_size); 5027 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 5028 ctl->max_chunk_size = ctl->max_stripe_size; 5029 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 5030 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 5031 ctl->devs_max = min_t(int, ctl->devs_max, 5032 BTRFS_MAX_DEVS_SYS_CHUNK); 5033 } else { 5034 BUG(); 5035 } 5036 5037 /* We don't want a chunk larger than 10% of writable space */ 5038 limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1), 5039 zone_size), 5040 min_chunk_size); 5041 ctl->max_chunk_size = min(limit, ctl->max_chunk_size); 5042 ctl->dev_extent_min = zone_size * ctl->dev_stripes; 5043 } 5044 5045 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, 5046 struct alloc_chunk_ctl *ctl) 5047 { 5048 int index = btrfs_bg_flags_to_raid_index(ctl->type); 5049 5050 ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; 5051 ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; 5052 ctl->devs_max = btrfs_raid_array[index].devs_max; 5053 if (!ctl->devs_max) 5054 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); 5055 ctl->devs_min = btrfs_raid_array[index].devs_min; 5056 
ctl->devs_increment = btrfs_raid_array[index].devs_increment; 5057 ctl->ncopies = btrfs_raid_array[index].ncopies; 5058 ctl->nparity = btrfs_raid_array[index].nparity; 5059 ctl->ndevs = 0; 5060 5061 switch (fs_devices->chunk_alloc_policy) { 5062 case BTRFS_CHUNK_ALLOC_REGULAR: 5063 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); 5064 break; 5065 case BTRFS_CHUNK_ALLOC_ZONED: 5066 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); 5067 break; 5068 default: 5069 BUG(); 5070 } 5071 } 5072 5073 static int gather_device_info(struct btrfs_fs_devices *fs_devices, 5074 struct alloc_chunk_ctl *ctl, 5075 struct btrfs_device_info *devices_info) 5076 { 5077 struct btrfs_fs_info *info = fs_devices->fs_info; 5078 struct btrfs_device *device; 5079 u64 total_avail; 5080 u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; 5081 int ret; 5082 int ndevs = 0; 5083 u64 max_avail; 5084 u64 dev_offset; 5085 5086 /* 5087 * in the first pass through the devices list, we gather information 5088 * about the available holes on each device. 5089 */ 5090 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 5091 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 5092 WARN(1, KERN_ERR 5093 "BTRFS: read-only device in alloc_list\n"); 5094 continue; 5095 } 5096 5097 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 5098 &device->dev_state) || 5099 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 5100 continue; 5101 5102 if (device->total_bytes > device->bytes_used) 5103 total_avail = device->total_bytes - device->bytes_used; 5104 else 5105 total_avail = 0; 5106 5107 /* If there is no space on this device, skip it. */ 5108 if (total_avail < ctl->dev_extent_min) 5109 continue; 5110 5111 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, 5112 &max_avail); 5113 if (ret && ret != -ENOSPC) 5114 return ret; 5115 5116 if (ret == 0) 5117 max_avail = dev_extent_want; 5118 5119 if (max_avail < ctl->dev_extent_min) { 5120 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5121 btrfs_debug(info, 5122 "%s: devid %llu has no free space, have=%llu want=%llu", 5123 __func__, device->devid, max_avail, 5124 ctl->dev_extent_min); 5125 continue; 5126 } 5127 5128 if (ndevs == fs_devices->rw_devices) { 5129 WARN(1, "%s: found more than %llu devices\n", 5130 __func__, fs_devices->rw_devices); 5131 break; 5132 } 5133 devices_info[ndevs].dev_offset = dev_offset; 5134 devices_info[ndevs].max_avail = max_avail; 5135 devices_info[ndevs].total_avail = total_avail; 5136 devices_info[ndevs].dev = device; 5137 ++ndevs; 5138 } 5139 ctl->ndevs = ndevs; 5140 5141 /* 5142 * now sort the devices by hole size / available space 5143 */ 5144 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 5145 btrfs_cmp_device_info, NULL); 5146 5147 return 0; 5148 } 5149 5150 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, 5151 struct btrfs_device_info *devices_info) 5152 { 5153 /* Number of stripes that count for block group size */ 5154 int data_stripes; 5155 5156 /* 5157 * The primary goal is to maximize the number of stripes, so use as 5158 * many devices as possible, even if the stripes are not maximum sized. 5159 * 5160 * The DUP profile stores more than one stripe per device, the 5161 * max_avail is the total size so we have to adjust. 
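* E.g. for DUP (dev_stripes == 2) a 10GiB hole on the smallest usable device yields a 5GiB stripe_size below (illustrative numbers).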
5162 */ 5163 ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, 5164 ctl->dev_stripes); 5165 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5166 5167 /* This will have to be fixed for RAID1 and RAID10 over more drives */ 5168 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5169 5170 /* 5171 * Use the number of data stripes to figure out how big this chunk is 5172 * really going to be in terms of logical address space, and compare 5173 * that answer with the max chunk size. If it's higher, we try to 5174 * reduce stripe_size. 5175 */ 5176 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 5177 /* 5178 * Reduce stripe_size, round it up to a 16MB boundary again and 5179 * then use it, unless it ends up being even bigger than the 5180 * previous value we had already. 5181 */ 5182 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, 5183 data_stripes), SZ_16M), 5184 ctl->stripe_size); 5185 } 5186 5187 /* Align to BTRFS_STRIPE_LEN */ 5188 ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); 5189 ctl->chunk_size = ctl->stripe_size * data_stripes; 5190 5191 return 0; 5192 } 5193 5194 static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, 5195 struct btrfs_device_info *devices_info) 5196 { 5197 u64 zone_size = devices_info[0].dev->zone_info->zone_size; 5198 /* Number of stripes that count for block group size */ 5199 int data_stripes; 5200 5201 /* 5202 * It should hold because: 5203 * dev_extent_min == dev_extent_want == zone_size * dev_stripes 5204 */ 5205 ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); 5206 5207 ctl->stripe_size = zone_size; 5208 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5209 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5210 5211 /* stripe_size is fixed in a zoned filesystem. Reduce ndevs instead. */ 5212 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 5213 ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, 5214 ctl->stripe_size) + ctl->nparity, 5215 ctl->dev_stripes); 5216 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5217 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5218 ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); 5219 } 5220 5221 ctl->chunk_size = ctl->stripe_size * data_stripes; 5222 5223 return 0; 5224 } 5225 5226 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, 5227 struct alloc_chunk_ctl *ctl, 5228 struct btrfs_device_info *devices_info) 5229 { 5230 struct btrfs_fs_info *info = fs_devices->fs_info; 5231 5232 /* 5233 * Round down to number of usable stripes, devs_increment can be any 5234 * number so we can't use round_down() that requires power of 2, while 5235 * rounddown is safe.
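* E.g. rounddown(7, 3) == 6, which the mask based round_down() could not compute because it requires a power of two alignment.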
5236 */ 5237 ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); 5238 5239 if (ctl->ndevs < ctl->devs_min) { 5240 if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 5241 btrfs_debug(info, 5242 "%s: not enough devices with free space: have=%d minimum required=%d", 5243 __func__, ctl->ndevs, ctl->devs_min); 5244 } 5245 return -ENOSPC; 5246 } 5247 5248 ctl->ndevs = min(ctl->ndevs, ctl->devs_max); 5249 5250 switch (fs_devices->chunk_alloc_policy) { 5251 case BTRFS_CHUNK_ALLOC_REGULAR: 5252 return decide_stripe_size_regular(ctl, devices_info); 5253 case BTRFS_CHUNK_ALLOC_ZONED: 5254 return decide_stripe_size_zoned(ctl, devices_info); 5255 default: 5256 BUG(); 5257 } 5258 } 5259 5260 static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, 5261 struct alloc_chunk_ctl *ctl, 5262 struct btrfs_device_info *devices_info) 5263 { 5264 struct btrfs_fs_info *info = trans->fs_info; 5265 struct map_lookup *map = NULL; 5266 struct extent_map_tree *em_tree; 5267 struct btrfs_block_group *block_group; 5268 struct extent_map *em; 5269 u64 start = ctl->start; 5270 u64 type = ctl->type; 5271 int ret; 5272 int i; 5273 int j; 5274 5275 map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); 5276 if (!map) 5277 return ERR_PTR(-ENOMEM); 5278 map->num_stripes = ctl->num_stripes; 5279 5280 for (i = 0; i < ctl->ndevs; ++i) { 5281 for (j = 0; j < ctl->dev_stripes; ++j) { 5282 int s = i * ctl->dev_stripes + j; 5283 map->stripes[s].dev = devices_info[i].dev; 5284 map->stripes[s].physical = devices_info[i].dev_offset + 5285 j * ctl->stripe_size; 5286 } 5287 } 5288 map->stripe_len = BTRFS_STRIPE_LEN; 5289 map->io_align = BTRFS_STRIPE_LEN; 5290 map->io_width = BTRFS_STRIPE_LEN; 5291 map->type = type; 5292 map->sub_stripes = ctl->sub_stripes; 5293 5294 trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); 5295 5296 em = alloc_extent_map(); 5297 if (!em) { 5298 kfree(map); 5299 return ERR_PTR(-ENOMEM); 5300 } 5301 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 5302 em->map_lookup = map; 5303 em->start = start; 5304 em->len = ctl->chunk_size; 5305 em->block_start = 0; 5306 em->block_len = em->len; 5307 em->orig_block_len = ctl->stripe_size; 5308 5309 em_tree = &info->mapping_tree; 5310 write_lock(&em_tree->lock); 5311 ret = add_extent_mapping(em_tree, em, 0); 5312 if (ret) { 5313 write_unlock(&em_tree->lock); 5314 free_extent_map(em); 5315 return ERR_PTR(ret); 5316 } 5317 write_unlock(&em_tree->lock); 5318 5319 block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); 5320 if (IS_ERR(block_group)) 5321 goto error_del_extent; 5322 5323 for (i = 0; i < map->num_stripes; i++) { 5324 struct btrfs_device *dev = map->stripes[i].dev; 5325 5326 btrfs_device_set_bytes_used(dev, 5327 dev->bytes_used + ctl->stripe_size); 5328 if (list_empty(&dev->post_commit_list)) 5329 list_add_tail(&dev->post_commit_list, 5330 &trans->transaction->dev_update_list); 5331 } 5332 5333 atomic64_sub(ctl->stripe_size * map->num_stripes, 5334 &info->free_chunk_space); 5335 5336 free_extent_map(em); 5337 check_raid56_incompat_flag(info, type); 5338 check_raid1c34_incompat_flag(info, type); 5339 5340 return block_group; 5341 5342 error_del_extent: 5343 write_lock(&em_tree->lock); 5344 remove_extent_mapping(em_tree, em); 5345 write_unlock(&em_tree->lock); 5346 5347 /* One for our allocation */ 5348 free_extent_map(em); 5349 /* One for the tree reference */ 5350 free_extent_map(em); 5351 5352 return block_group; 5353 } 5354 5355 struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 5356 u64 
type) 5357 { 5358 struct btrfs_fs_info *info = trans->fs_info; 5359 struct btrfs_fs_devices *fs_devices = info->fs_devices; 5360 struct btrfs_device_info *devices_info = NULL; 5361 struct alloc_chunk_ctl ctl; 5362 struct btrfs_block_group *block_group; 5363 int ret; 5364 5365 lockdep_assert_held(&info->chunk_mutex); 5366 5367 if (!alloc_profile_is_valid(type, 0)) { 5368 ASSERT(0); 5369 return ERR_PTR(-EINVAL); 5370 } 5371 5372 if (list_empty(&fs_devices->alloc_list)) { 5373 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5374 btrfs_debug(info, "%s: no writable device", __func__); 5375 return ERR_PTR(-ENOSPC); 5376 } 5377 5378 if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 5379 btrfs_err(info, "invalid chunk type 0x%llx requested", type); 5380 ASSERT(0); 5381 return ERR_PTR(-EINVAL); 5382 } 5383 5384 ctl.start = find_next_chunk(info); 5385 ctl.type = type; 5386 init_alloc_chunk_ctl(fs_devices, &ctl); 5387 5388 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 5389 GFP_NOFS); 5390 if (!devices_info) 5391 return ERR_PTR(-ENOMEM); 5392 5393 ret = gather_device_info(fs_devices, &ctl, devices_info); 5394 if (ret < 0) { 5395 block_group = ERR_PTR(ret); 5396 goto out; 5397 } 5398 5399 ret = decide_stripe_size(fs_devices, &ctl, devices_info); 5400 if (ret < 0) { 5401 block_group = ERR_PTR(ret); 5402 goto out; 5403 } 5404 5405 block_group = create_chunk(trans, &ctl, devices_info); 5406 5407 out: 5408 kfree(devices_info); 5409 return block_group; 5410 } 5411 5412 /* 5413 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to 5414 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system 5415 * chunks. 5416 * 5417 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 5418 * phases. 5419 */ 5420 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, 5421 struct btrfs_block_group *bg) 5422 { 5423 struct btrfs_fs_info *fs_info = trans->fs_info; 5424 struct btrfs_root *extent_root = fs_info->extent_root; 5425 struct btrfs_root *chunk_root = fs_info->chunk_root; 5426 struct btrfs_key key; 5427 struct btrfs_chunk *chunk; 5428 struct btrfs_stripe *stripe; 5429 struct extent_map *em; 5430 struct map_lookup *map; 5431 size_t item_size; 5432 int i; 5433 int ret; 5434 5435 /* 5436 * We take the chunk_mutex for 2 reasons: 5437 * 5438 * 1) Updates and insertions in the chunk btree must be done while holding 5439 * the chunk_mutex, as well as updating the system chunk array in the 5440 * superblock. See the comment on top of btrfs_chunk_alloc() for the 5441 * details; 5442 * 5443 * 2) To prevent races with the final phase of a device replace operation 5444 * that replaces the device object associated with the map's stripes, 5445 * because the device object's id can change at any time during that 5446 * final phase of the device replace operation 5447 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 5448 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, 5449 * which would cause a failure when updating the device item, which does 5450 * not exist, or persisting a stripe of the chunk item with such an ID. 5451 * Here we can't use the device_list_mutex because our caller already 5452 * has locked the chunk_mutex, and the final phase of device replace 5453 * acquires both mutexes - first the device_list_mutex and then the 5454 * chunk_mutex. Using any of those two mutexes protects us from a 5455 * concurrent device replace.
5456 */ 5457 lockdep_assert_held(&fs_info->chunk_mutex); 5458 5459 em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); 5460 if (IS_ERR(em)) { 5461 ret = PTR_ERR(em); 5462 btrfs_abort_transaction(trans, ret); 5463 return ret; 5464 } 5465 5466 map = em->map_lookup; 5467 item_size = btrfs_chunk_item_size(map->num_stripes); 5468 5469 chunk = kzalloc(item_size, GFP_NOFS); 5470 if (!chunk) { 5471 ret = -ENOMEM; 5472 btrfs_abort_transaction(trans, ret); 5473 goto out; 5474 } 5475 5476 for (i = 0; i < map->num_stripes; i++) { 5477 struct btrfs_device *device = map->stripes[i].dev; 5478 5479 ret = btrfs_update_device(trans, device); 5480 if (ret) 5481 goto out; 5482 } 5483 5484 stripe = &chunk->stripe; 5485 for (i = 0; i < map->num_stripes; i++) { 5486 struct btrfs_device *device = map->stripes[i].dev; 5487 const u64 dev_offset = map->stripes[i].physical; 5488 5489 btrfs_set_stack_stripe_devid(stripe, device->devid); 5490 btrfs_set_stack_stripe_offset(stripe, dev_offset); 5491 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 5492 stripe++; 5493 } 5494 5495 btrfs_set_stack_chunk_length(chunk, bg->length); 5496 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 5497 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 5498 btrfs_set_stack_chunk_type(chunk, map->type); 5499 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 5500 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 5501 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 5502 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 5503 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 5504 5505 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 5506 key.type = BTRFS_CHUNK_ITEM_KEY; 5507 key.offset = bg->start; 5508 5509 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 5510 if (ret) 5511 goto out; 5512 5513 bg->chunk_item_inserted = 1; 5514 5515 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 5516 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 5517 if (ret) 5518 goto out; 5519 } 5520 5521 out: 5522 kfree(chunk); 5523 free_extent_map(em); 5524 return ret; 5525 } 5526 5527 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) 5528 { 5529 struct btrfs_fs_info *fs_info = trans->fs_info; 5530 u64 alloc_profile; 5531 struct btrfs_block_group *meta_bg; 5532 struct btrfs_block_group *sys_bg; 5533 5534 /* 5535 * When adding a new device for sprouting, the seed device is read-only 5536 * so we must first allocate a metadata and a system chunk. But before 5537 * adding the block group items to the extent, device and chunk btrees, 5538 * we must first: 5539 * 5540 * 1) Create both chunks without doing any changes to the btrees, as 5541 * otherwise we would get -ENOSPC since the block groups from the 5542 * seed device are read-only; 5543 * 5544 * 2) Add the device item for the new sprout device - finishing the setup 5545 * of a new block group requires updating the device item in the chunk 5546 * btree, so it must exist when we attempt to do it. The previous step 5547 * ensures this does not fail with -ENOSPC. 5548 * 5549 * After that we can add the block group items to their btrees: 5550 * update existing device item in the chunk btree, add a new block group 5551 * item to the extent btree, add a new chunk item to the chunk btree and 5552 * finally add the new device extent items to the devices btree. 
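* (This helper therefore only creates the two chunks here; their items are inserted later through the regular chunk allocation phases.)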
5553 */ 5554 5555 alloc_profile = btrfs_metadata_alloc_profile(fs_info); 5556 meta_bg = btrfs_alloc_chunk(trans, alloc_profile); 5557 if (IS_ERR(meta_bg)) 5558 return PTR_ERR(meta_bg); 5559 5560 alloc_profile = btrfs_system_alloc_profile(fs_info); 5561 sys_bg = btrfs_alloc_chunk(trans, alloc_profile); 5562 if (IS_ERR(sys_bg)) 5563 return PTR_ERR(sys_bg); 5564 5565 return 0; 5566 } 5567 5568 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5569 { 5570 const int index = btrfs_bg_flags_to_raid_index(map->type); 5571 5572 return btrfs_raid_array[index].tolerated_failures; 5573 } 5574 5575 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5576 { 5577 struct extent_map *em; 5578 struct map_lookup *map; 5579 int readonly = 0; 5580 int miss_ndevs = 0; 5581 int i; 5582 5583 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 5584 if (IS_ERR(em)) 5585 return 1; 5586 5587 map = em->map_lookup; 5588 for (i = 0; i < map->num_stripes; i++) { 5589 if (test_bit(BTRFS_DEV_STATE_MISSING, 5590 &map->stripes[i].dev->dev_state)) { 5591 miss_ndevs++; 5592 continue; 5593 } 5594 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5595 &map->stripes[i].dev->dev_state)) { 5596 readonly = 1; 5597 goto end; 5598 } 5599 } 5600 5601 /* 5602 * If the number of missing devices is larger than max errors, 5603 * we can not write the data into that chunk successfully, so 5604 * set it readonly. 5605 */ 5606 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5607 readonly = 1; 5608 end: 5609 free_extent_map(em); 5610 return readonly; 5611 } 5612 5613 void btrfs_mapping_tree_free(struct extent_map_tree *tree) 5614 { 5615 struct extent_map *em; 5616 5617 while (1) { 5618 write_lock(&tree->lock); 5619 em = lookup_extent_mapping(tree, 0, (u64)-1); 5620 if (em) 5621 remove_extent_mapping(tree, em); 5622 write_unlock(&tree->lock); 5623 if (!em) 5624 break; 5625 /* once for us */ 5626 free_extent_map(em); 5627 /* once for the tree */ 5628 free_extent_map(em); 5629 } 5630 } 5631 5632 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5633 { 5634 struct extent_map *em; 5635 struct map_lookup *map; 5636 int ret; 5637 5638 em = btrfs_get_chunk_map(fs_info, logical, len); 5639 if (IS_ERR(em)) 5640 /* 5641 * We could return errors for these cases, but that could get 5642 * ugly and we'd probably do the same thing which is just not do 5643 * anything else and exit, so return 1 so the callers don't try 5644 * to use other copies. 5645 */ 5646 return 1; 5647 5648 map = em->map_lookup; 5649 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) 5650 ret = map->num_stripes; 5651 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5652 ret = map->sub_stripes; 5653 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5654 ret = 2; 5655 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5656 /* 5657 * There could be two corrupted data stripes, we need 5658 * to loop retry in order to rebuild the correct data. 5659 * 5660 * Fail a stripe at a time on every retry except the 5661 * stripe under reconstruction. 
5662 */ 5663 ret = map->num_stripes; 5664 else 5665 ret = 1; 5666 free_extent_map(em); 5667 5668 down_read(&fs_info->dev_replace.rwsem); 5669 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 5670 fs_info->dev_replace.tgtdev) 5671 ret++; 5672 up_read(&fs_info->dev_replace.rwsem); 5673 5674 return ret; 5675 } 5676 5677 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 5678 u64 logical) 5679 { 5680 struct extent_map *em; 5681 struct map_lookup *map; 5682 unsigned long len = fs_info->sectorsize; 5683 5684 em = btrfs_get_chunk_map(fs_info, logical, len); 5685 5686 if (!WARN_ON(IS_ERR(em))) { 5687 map = em->map_lookup; 5688 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5689 len = map->stripe_len * nr_data_stripes(map); 5690 free_extent_map(em); 5691 } 5692 return len; 5693 } 5694 5695 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5696 { 5697 struct extent_map *em; 5698 struct map_lookup *map; 5699 int ret = 0; 5700 5701 em = btrfs_get_chunk_map(fs_info, logical, len); 5702 5703 if(!WARN_ON(IS_ERR(em))) { 5704 map = em->map_lookup; 5705 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5706 ret = 1; 5707 free_extent_map(em); 5708 } 5709 return ret; 5710 } 5711 5712 static int find_live_mirror(struct btrfs_fs_info *fs_info, 5713 struct map_lookup *map, int first, 5714 int dev_replace_is_ongoing) 5715 { 5716 int i; 5717 int num_stripes; 5718 int preferred_mirror; 5719 int tolerance; 5720 struct btrfs_device *srcdev; 5721 5722 ASSERT((map->type & 5723 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); 5724 5725 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5726 num_stripes = map->sub_stripes; 5727 else 5728 num_stripes = map->num_stripes; 5729 5730 switch (fs_info->fs_devices->read_policy) { 5731 default: 5732 /* Shouldn't happen, just warn and use pid instead of failing */ 5733 btrfs_warn_rl(fs_info, 5734 "unknown read_policy type %u, reset to pid", 5735 fs_info->fs_devices->read_policy); 5736 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; 5737 fallthrough; 5738 case BTRFS_READ_POLICY_PID: 5739 preferred_mirror = first + (current->pid % num_stripes); 5740 break; 5741 } 5742 5743 if (dev_replace_is_ongoing && 5744 fs_info->dev_replace.cont_reading_from_srcdev_mode == 5745 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 5746 srcdev = fs_info->dev_replace.srcdev; 5747 else 5748 srcdev = NULL; 5749 5750 /* 5751 * try to avoid the drive that is the source drive for a 5752 * dev-replace procedure, only choose it if no other non-missing 5753 * mirror is available 5754 */ 5755 for (tolerance = 0; tolerance < 2; tolerance++) { 5756 if (map->stripes[preferred_mirror].dev->bdev && 5757 (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 5758 return preferred_mirror; 5759 for (i = first; i < first + num_stripes; i++) { 5760 if (map->stripes[i].dev->bdev && 5761 (tolerance || map->stripes[i].dev != srcdev)) 5762 return i; 5763 } 5764 } 5765 5766 /* we couldn't find one that doesn't fail. 
Just return something 5767 * and the io error handling code will clean up eventually 5768 */ 5769 return preferred_mirror; 5770 } 5771 5772 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 5773 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) 5774 { 5775 int i; 5776 int again = 1; 5777 5778 while (again) { 5779 again = 0; 5780 for (i = 0; i < num_stripes - 1; i++) { 5781 /* Swap if parity is on a smaller index */ 5782 if (bbio->raid_map[i] > bbio->raid_map[i + 1]) { 5783 swap(bbio->stripes[i], bbio->stripes[i + 1]); 5784 swap(bbio->raid_map[i], bbio->raid_map[i + 1]); 5785 again = 1; 5786 } 5787 } 5788 } 5789 } 5790 5791 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) 5792 { 5793 struct btrfs_bio *bbio = kzalloc( 5794 /* the size of the btrfs_bio */ 5795 sizeof(struct btrfs_bio) + 5796 /* plus the variable array for the stripes */ 5797 sizeof(struct btrfs_bio_stripe) * (total_stripes) + 5798 /* plus the variable array for the tgt dev */ 5799 sizeof(int) * (real_stripes) + 5800 /* 5801 * plus the raid_map, which includes both the tgt dev 5802 * and the stripes 5803 */ 5804 sizeof(u64) * (total_stripes), 5805 GFP_NOFS|__GFP_NOFAIL); 5806 5807 atomic_set(&bbio->error, 0); 5808 refcount_set(&bbio->refs, 1); 5809 5810 bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes); 5811 bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes); 5812 5813 return bbio; 5814 } 5815 5816 void btrfs_get_bbio(struct btrfs_bio *bbio) 5817 { 5818 WARN_ON(!refcount_read(&bbio->refs)); 5819 refcount_inc(&bbio->refs); 5820 } 5821 5822 void btrfs_put_bbio(struct btrfs_bio *bbio) 5823 { 5824 if (!bbio) 5825 return; 5826 if (refcount_dec_and_test(&bbio->refs)) 5827 kfree(bbio); 5828 } 5829 5830 /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ 5831 /* 5832 * Please note that, discard won't be sent to target device of device 5833 * replace. 
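* (That is also why alloc_btrfs_bio() is called below with real_stripes == 0, leaving no room for a target device mapping in the discard bbio.)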
5834 */ 5835 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, 5836 u64 logical, u64 *length_ret, 5837 struct btrfs_bio **bbio_ret) 5838 { 5839 struct extent_map *em; 5840 struct map_lookup *map; 5841 struct btrfs_bio *bbio; 5842 u64 length = *length_ret; 5843 u64 offset; 5844 u64 stripe_nr; 5845 u64 stripe_nr_end; 5846 u64 stripe_end_offset; 5847 u64 stripe_cnt; 5848 u64 stripe_len; 5849 u64 stripe_offset; 5850 u64 num_stripes; 5851 u32 stripe_index; 5852 u32 factor = 0; 5853 u32 sub_stripes = 0; 5854 u64 stripes_per_dev = 0; 5855 u32 remaining_stripes = 0; 5856 u32 last_stripe = 0; 5857 int ret = 0; 5858 int i; 5859 5860 /* discard always return a bbio */ 5861 ASSERT(bbio_ret); 5862 5863 em = btrfs_get_chunk_map(fs_info, logical, length); 5864 if (IS_ERR(em)) 5865 return PTR_ERR(em); 5866 5867 map = em->map_lookup; 5868 /* we don't discard raid56 yet */ 5869 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5870 ret = -EOPNOTSUPP; 5871 goto out; 5872 } 5873 5874 offset = logical - em->start; 5875 length = min_t(u64, em->start + em->len - logical, length); 5876 *length_ret = length; 5877 5878 stripe_len = map->stripe_len; 5879 /* 5880 * stripe_nr counts the total number of stripes we have to stride 5881 * to get to this block 5882 */ 5883 stripe_nr = div64_u64(offset, stripe_len); 5884 5885 /* stripe_offset is the offset of this block in its stripe */ 5886 stripe_offset = offset - stripe_nr * stripe_len; 5887 5888 stripe_nr_end = round_up(offset + length, map->stripe_len); 5889 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 5890 stripe_cnt = stripe_nr_end - stripe_nr; 5891 stripe_end_offset = stripe_nr_end * map->stripe_len - 5892 (offset + length); 5893 /* 5894 * after this, stripe_nr is the number of stripes on this 5895 * device we have to walk to find the data, and stripe_index is 5896 * the number of our device in the stripe array 5897 */ 5898 num_stripes = 1; 5899 stripe_index = 0; 5900 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5901 BTRFS_BLOCK_GROUP_RAID10)) { 5902 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5903 sub_stripes = 1; 5904 else 5905 sub_stripes = map->sub_stripes; 5906 5907 factor = map->num_stripes / sub_stripes; 5908 num_stripes = min_t(u64, map->num_stripes, 5909 sub_stripes * stripe_cnt); 5910 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5911 stripe_index *= sub_stripes; 5912 stripes_per_dev = div_u64_rem(stripe_cnt, factor, 5913 &remaining_stripes); 5914 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 5915 last_stripe *= sub_stripes; 5916 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | 5917 BTRFS_BLOCK_GROUP_DUP)) { 5918 num_stripes = map->num_stripes; 5919 } else { 5920 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5921 &stripe_index); 5922 } 5923 5924 bbio = alloc_btrfs_bio(num_stripes, 0); 5925 if (!bbio) { 5926 ret = -ENOMEM; 5927 goto out; 5928 } 5929 5930 for (i = 0; i < num_stripes; i++) { 5931 bbio->stripes[i].physical = 5932 map->stripes[stripe_index].physical + 5933 stripe_offset + stripe_nr * map->stripe_len; 5934 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 5935 5936 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5937 BTRFS_BLOCK_GROUP_RAID10)) { 5938 bbio->stripes[i].length = stripes_per_dev * 5939 map->stripe_len; 5940 5941 if (i / sub_stripes < remaining_stripes) 5942 bbio->stripes[i].length += 5943 map->stripe_len; 5944 5945 /* 5946 * Special for the first stripe and 5947 * the last stripe: 5948 * 5949 * |-------|...|-------| 5950 * |----------| 5951 * off end_off 5952 */ 5953 if 
(i < sub_stripes) 5954 bbio->stripes[i].length -= 5955 stripe_offset; 5956 5957 if (stripe_index >= last_stripe && 5958 stripe_index <= (last_stripe + 5959 sub_stripes - 1)) 5960 bbio->stripes[i].length -= 5961 stripe_end_offset; 5962 5963 if (i == sub_stripes - 1) 5964 stripe_offset = 0; 5965 } else { 5966 bbio->stripes[i].length = length; 5967 } 5968 5969 stripe_index++; 5970 if (stripe_index == map->num_stripes) { 5971 stripe_index = 0; 5972 stripe_nr++; 5973 } 5974 } 5975 5976 *bbio_ret = bbio; 5977 bbio->map_type = map->type; 5978 bbio->num_stripes = num_stripes; 5979 out: 5980 free_extent_map(em); 5981 return ret; 5982 } 5983 5984 /* 5985 * In dev-replace case, for repair case (that's the only case where the mirror 5986 * is selected explicitly when calling btrfs_map_block), blocks left of the 5987 * left cursor can also be read from the target drive. 5988 * 5989 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the 5990 * array of stripes. 5991 * For READ, it also needs to be supported using the same mirror number. 5992 * 5993 * If the requested block is not left of the left cursor, EIO is returned. This 5994 * can happen because btrfs_num_copies() returns one more in the dev-replace 5995 * case. 5996 */ 5997 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, 5998 u64 logical, u64 length, 5999 u64 srcdev_devid, int *mirror_num, 6000 u64 *physical) 6001 { 6002 struct btrfs_bio *bbio = NULL; 6003 int num_stripes; 6004 int index_srcdev = 0; 6005 int found = 0; 6006 u64 physical_of_found = 0; 6007 int i; 6008 int ret = 0; 6009 6010 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 6011 logical, &length, &bbio, 0, 0); 6012 if (ret) { 6013 ASSERT(bbio == NULL); 6014 return ret; 6015 } 6016 6017 num_stripes = bbio->num_stripes; 6018 if (*mirror_num > num_stripes) { 6019 /* 6020 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, 6021 * that means that the requested area is not left of the left 6022 * cursor 6023 */ 6024 btrfs_put_bbio(bbio); 6025 return -EIO; 6026 } 6027 6028 /* 6029 * process the rest of the function using the mirror_num of the source 6030 * drive. Therefore look it up first. At the end, patch the device 6031 * pointer to the one of the target drive. 
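 * The loop below scans the stripes returned for BTRFS_MAP_GET_READ_MIRRORS
 * for the source devid; for DUP only the copy with the lowest physical
 * address is kept. The resulting mirror_num and physical offset are what
 * the caller later patches to point at the target device.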
6032 */ 6033 for (i = 0; i < num_stripes; i++) { 6034 if (bbio->stripes[i].dev->devid != srcdev_devid) 6035 continue; 6036 6037 /* 6038 * In case of DUP, in order to keep it simple, only add the 6039 * mirror with the lowest physical address 6040 */ 6041 if (found && 6042 physical_of_found <= bbio->stripes[i].physical) 6043 continue; 6044 6045 index_srcdev = i; 6046 found = 1; 6047 physical_of_found = bbio->stripes[i].physical; 6048 } 6049 6050 btrfs_put_bbio(bbio); 6051 6052 ASSERT(found); 6053 if (!found) 6054 return -EIO; 6055 6056 *mirror_num = index_srcdev + 1; 6057 *physical = physical_of_found; 6058 return ret; 6059 } 6060 6061 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) 6062 { 6063 struct btrfs_block_group *cache; 6064 bool ret; 6065 6066 /* A non-zoned filesystem does not use the "to_copy" flag */ 6067 if (!btrfs_is_zoned(fs_info)) 6068 return false; 6069 6070 cache = btrfs_lookup_block_group(fs_info, logical); 6071 6072 spin_lock(&cache->lock); 6073 ret = cache->to_copy; 6074 spin_unlock(&cache->lock); 6075 6076 btrfs_put_block_group(cache); 6077 return ret; 6078 } 6079 6080 static void handle_ops_on_dev_replace(enum btrfs_map_op op, 6081 struct btrfs_bio **bbio_ret, 6082 struct btrfs_dev_replace *dev_replace, 6083 u64 logical, 6084 int *num_stripes_ret, int *max_errors_ret) 6085 { 6086 struct btrfs_bio *bbio = *bbio_ret; 6087 u64 srcdev_devid = dev_replace->srcdev->devid; 6088 int tgtdev_indexes = 0; 6089 int num_stripes = *num_stripes_ret; 6090 int max_errors = *max_errors_ret; 6091 int i; 6092 6093 if (op == BTRFS_MAP_WRITE) { 6094 int index_where_to_add; 6095 6096 /* 6097 * A block group which has "to_copy" set will eventually be 6098 * copied by the dev-replace process. We can avoid cloning the IO here. 6099 */ 6100 if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) 6101 return; 6102 6103 /* 6104 * Duplicate the write operations while the dev replace 6105 * procedure is running. Since the copying of the old disk to 6106 * the new disk takes place at run time while the filesystem is 6107 * mounted writable, the regular write operations to the old 6108 * disk have to be duplicated to go to the new disk as well. 6109 * 6110 * Note that device->missing is handled by the caller, and that 6111 * the write to the old disk is already set up in the stripes 6112 * array. 6113 */ 6114 index_where_to_add = num_stripes; 6115 for (i = 0; i < num_stripes; i++) { 6116 if (bbio->stripes[i].dev->devid == srcdev_devid) { 6117 /* write to new disk, too */ 6118 struct btrfs_bio_stripe *new = 6119 bbio->stripes + index_where_to_add; 6120 struct btrfs_bio_stripe *old = 6121 bbio->stripes + i; 6122 6123 new->physical = old->physical; 6124 new->length = old->length; 6125 new->dev = dev_replace->tgtdev; 6126 bbio->tgtdev_map[i] = index_where_to_add; 6127 index_where_to_add++; 6128 max_errors++; 6129 tgtdev_indexes++; 6130 } 6131 } 6132 num_stripes = index_where_to_add; 6133 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 6134 int index_srcdev = 0; 6135 int found = 0; 6136 u64 physical_of_found = 0; 6137 6138 /* 6139 * During the dev-replace procedure, the target drive can also 6140 * be used to read data in case it is needed to repair a corrupt 6141 * block elsewhere. This is possible if the requested area is 6142 * left of the left cursor. In this area, the target drive is a 6143 * full copy of the source drive.
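 * So, if a stripe of the source device is found below, one extra stripe
 * pointing at the same physical offset on the target device is appended
 * to the stripe array and recorded in tgtdev_map.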
6144 */ 6145 for (i = 0; i < num_stripes; i++) { 6146 if (bbio->stripes[i].dev->devid == srcdev_devid) { 6147 /* 6148 * In case of DUP, in order to keep it simple, 6149 * only add the mirror with the lowest physical 6150 * address 6151 */ 6152 if (found && 6153 physical_of_found <= 6154 bbio->stripes[i].physical) 6155 continue; 6156 index_srcdev = i; 6157 found = 1; 6158 physical_of_found = bbio->stripes[i].physical; 6159 } 6160 } 6161 if (found) { 6162 struct btrfs_bio_stripe *tgtdev_stripe = 6163 bbio->stripes + num_stripes; 6164 6165 tgtdev_stripe->physical = physical_of_found; 6166 tgtdev_stripe->length = 6167 bbio->stripes[index_srcdev].length; 6168 tgtdev_stripe->dev = dev_replace->tgtdev; 6169 bbio->tgtdev_map[index_srcdev] = num_stripes; 6170 6171 tgtdev_indexes++; 6172 num_stripes++; 6173 } 6174 } 6175 6176 *num_stripes_ret = num_stripes; 6177 *max_errors_ret = max_errors; 6178 bbio->num_tgtdevs = tgtdev_indexes; 6179 *bbio_ret = bbio; 6180 } 6181 6182 static bool need_full_stripe(enum btrfs_map_op op) 6183 { 6184 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 6185 } 6186 6187 /* 6188 * Calculate the geometry of a particular (address, len) tuple. This 6189 * information is used to calculate how big a particular bio can get before it 6190 * straddles a stripe. 6191 * 6192 * @fs_info: the filesystem 6193 * @em: mapping containing the logical extent 6194 * @op: type of operation - write or read 6195 * @logical: address that we want to figure out the geometry of 6196 * @io_geom: pointer used to return values 6197 * 6198 * Returns < 0 in case a chunk for the given logical address cannot be found, 6199 * usually shouldn't happen unless @logical is corrupted, 0 otherwise. 6200 */ 6201 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, 6202 enum btrfs_map_op op, u64 logical, 6203 struct btrfs_io_geometry *io_geom) 6204 { 6205 struct map_lookup *map; 6206 u64 len; 6207 u64 offset; 6208 u64 stripe_offset; 6209 u64 stripe_nr; 6210 u64 stripe_len; 6211 u64 raid56_full_stripe_start = (u64)-1; 6212 int data_stripes; 6213 6214 ASSERT(op != BTRFS_MAP_DISCARD); 6215 6216 map = em->map_lookup; 6217 /* Offset of this logical address in the chunk */ 6218 offset = logical - em->start; 6219 /* Len of a stripe in a chunk */ 6220 stripe_len = map->stripe_len; 6221 /* Stripe where this block falls in */ 6222 stripe_nr = div64_u64(offset, stripe_len); 6223 /* Offset of stripe in the chunk */ 6224 stripe_offset = stripe_nr * stripe_len; 6225 if (offset < stripe_offset) { 6226 btrfs_crit(fs_info, 6227 "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", 6228 stripe_offset, offset, em->start, logical, stripe_len); 6229 return -EINVAL; 6230 } 6231 6232 /* stripe_offset is the offset of this block in its stripe */ 6233 stripe_offset = offset - stripe_offset; 6234 data_stripes = nr_data_stripes(map); 6235 6236 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 6237 u64 max_len = stripe_len - stripe_offset; 6238 6239 /* 6240 * In case of raid56, we need to know the stripe aligned start 6241 */ 6242 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6243 unsigned long full_stripe_len = stripe_len * data_stripes; 6244 raid56_full_stripe_start = offset; 6245 6246 /* 6247 * Allow a write of a full stripe, but make sure we 6248 * don't allow straddling of stripes 6249 */ 6250 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 6251 full_stripe_len); 6252 raid56_full_stripe_start *= full_stripe_len; 6253 6254 /* 
6255 * For writes to RAID[56], allow a full stripeset across 6256 * all disks. For other RAID types and for RAID[56] 6257 * reads, just allow a single stripe (on a single disk). 6258 */ 6259 if (op == BTRFS_MAP_WRITE) { 6260 max_len = stripe_len * data_stripes - 6261 (offset - raid56_full_stripe_start); 6262 } 6263 } 6264 len = min_t(u64, em->len - offset, max_len); 6265 } else { 6266 len = em->len - offset; 6267 } 6268 6269 io_geom->len = len; 6270 io_geom->offset = offset; 6271 io_geom->stripe_len = stripe_len; 6272 io_geom->stripe_nr = stripe_nr; 6273 io_geom->stripe_offset = stripe_offset; 6274 io_geom->raid56_stripe_offset = raid56_full_stripe_start; 6275 6276 return 0; 6277 } 6278 6279 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 6280 enum btrfs_map_op op, 6281 u64 logical, u64 *length, 6282 struct btrfs_bio **bbio_ret, 6283 int mirror_num, int need_raid_map) 6284 { 6285 struct extent_map *em; 6286 struct map_lookup *map; 6287 u64 stripe_offset; 6288 u64 stripe_nr; 6289 u64 stripe_len; 6290 u32 stripe_index; 6291 int data_stripes; 6292 int i; 6293 int ret = 0; 6294 int num_stripes; 6295 int max_errors = 0; 6296 int tgtdev_indexes = 0; 6297 struct btrfs_bio *bbio = NULL; 6298 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 6299 int dev_replace_is_ongoing = 0; 6300 int num_alloc_stripes; 6301 int patch_the_first_stripe_for_dev_replace = 0; 6302 u64 physical_to_patch_in_first_stripe = 0; 6303 u64 raid56_full_stripe_start = (u64)-1; 6304 struct btrfs_io_geometry geom; 6305 6306 ASSERT(bbio_ret); 6307 ASSERT(op != BTRFS_MAP_DISCARD); 6308 6309 em = btrfs_get_chunk_map(fs_info, logical, *length); 6310 ASSERT(!IS_ERR(em)); 6311 6312 ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); 6313 if (ret < 0) 6314 return ret; 6315 6316 map = em->map_lookup; 6317 6318 *length = geom.len; 6319 stripe_len = geom.stripe_len; 6320 stripe_nr = geom.stripe_nr; 6321 stripe_offset = geom.stripe_offset; 6322 raid56_full_stripe_start = geom.raid56_stripe_offset; 6323 data_stripes = nr_data_stripes(map); 6324 6325 down_read(&dev_replace->rwsem); 6326 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 6327 /* 6328 * Hold the semaphore for read during the whole operation, write is 6329 * requested at commit time but must wait. 
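 * The semaphore is released right below when no replace is running;
 * otherwise it stays held until the out label at the end of this
 * function.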
6330 */ 6331 if (!dev_replace_is_ongoing) 6332 up_read(&dev_replace->rwsem); 6333 6334 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 6335 !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 6336 ret = get_extra_mirror_from_replace(fs_info, logical, *length, 6337 dev_replace->srcdev->devid, 6338 &mirror_num, 6339 &physical_to_patch_in_first_stripe); 6340 if (ret) 6341 goto out; 6342 else 6343 patch_the_first_stripe_for_dev_replace = 1; 6344 } else if (mirror_num > map->num_stripes) { 6345 mirror_num = 0; 6346 } 6347 6348 num_stripes = 1; 6349 stripe_index = 0; 6350 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 6351 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6352 &stripe_index); 6353 if (!need_full_stripe(op)) 6354 mirror_num = 1; 6355 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { 6356 if (need_full_stripe(op)) 6357 num_stripes = map->num_stripes; 6358 else if (mirror_num) 6359 stripe_index = mirror_num - 1; 6360 else { 6361 stripe_index = find_live_mirror(fs_info, map, 0, 6362 dev_replace_is_ongoing); 6363 mirror_num = stripe_index + 1; 6364 } 6365 6366 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 6367 if (need_full_stripe(op)) { 6368 num_stripes = map->num_stripes; 6369 } else if (mirror_num) { 6370 stripe_index = mirror_num - 1; 6371 } else { 6372 mirror_num = 1; 6373 } 6374 6375 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 6376 u32 factor = map->num_stripes / map->sub_stripes; 6377 6378 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6379 stripe_index *= map->sub_stripes; 6380 6381 if (need_full_stripe(op)) 6382 num_stripes = map->sub_stripes; 6383 else if (mirror_num) 6384 stripe_index += mirror_num - 1; 6385 else { 6386 int old_stripe_index = stripe_index; 6387 stripe_index = find_live_mirror(fs_info, map, 6388 stripe_index, 6389 dev_replace_is_ongoing); 6390 mirror_num = stripe_index - old_stripe_index + 1; 6391 } 6392 6393 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6394 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 6395 /* push stripe_nr back to the start of the full stripe */ 6396 stripe_nr = div64_u64(raid56_full_stripe_start, 6397 stripe_len * data_stripes); 6398 6399 /* RAID[56] write or recovery. Return all stripes */ 6400 num_stripes = map->num_stripes; 6401 max_errors = nr_parity_stripes(map); 6402 6403 *length = map->stripe_len; 6404 stripe_index = 0; 6405 stripe_offset = 0; 6406 } else { 6407 /* 6408 * Mirror #0 or #1 means the original data block. 6409 * Mirror #2 is RAID5 parity block. 6410 * Mirror #3 is RAID6 Q block. 
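 * That is why, for mirror_num > 1, stripe_index is first pointed past the
 * data stripes (data_stripes + mirror_num - 2) and only then rotated
 * together with the data stripes across the devices.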
6411 */ 6412 stripe_nr = div_u64_rem(stripe_nr, 6413 data_stripes, &stripe_index); 6414 if (mirror_num > 1) 6415 stripe_index = data_stripes + mirror_num - 2; 6416 6417 /* We distribute the parity blocks across stripes */ 6418 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 6419 &stripe_index); 6420 if (!need_full_stripe(op) && mirror_num <= 1) 6421 mirror_num = 1; 6422 } 6423 } else { 6424 /* 6425 * after this, stripe_nr is the number of stripes on this 6426 * device we have to walk to find the data, and stripe_index is 6427 * the number of our device in the stripe array 6428 */ 6429 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6430 &stripe_index); 6431 mirror_num = stripe_index + 1; 6432 } 6433 if (stripe_index >= map->num_stripes) { 6434 btrfs_crit(fs_info, 6435 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 6436 stripe_index, map->num_stripes); 6437 ret = -EINVAL; 6438 goto out; 6439 } 6440 6441 num_alloc_stripes = num_stripes; 6442 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 6443 if (op == BTRFS_MAP_WRITE) 6444 num_alloc_stripes <<= 1; 6445 if (op == BTRFS_MAP_GET_READ_MIRRORS) 6446 num_alloc_stripes++; 6447 tgtdev_indexes = num_stripes; 6448 } 6449 6450 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); 6451 if (!bbio) { 6452 ret = -ENOMEM; 6453 goto out; 6454 } 6455 6456 for (i = 0; i < num_stripes; i++) { 6457 bbio->stripes[i].physical = map->stripes[stripe_index].physical + 6458 stripe_offset + stripe_nr * map->stripe_len; 6459 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 6460 stripe_index++; 6461 } 6462 6463 /* build raid_map */ 6464 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 6465 (need_full_stripe(op) || mirror_num > 1)) { 6466 u64 tmp; 6467 unsigned rot; 6468 6469 /* Work out the disk rotation on this stripe-set */ 6470 div_u64_rem(stripe_nr, num_stripes, &rot); 6471 6472 /* Fill in the logical address of each stripe */ 6473 tmp = stripe_nr * data_stripes; 6474 for (i = 0; i < data_stripes; i++) 6475 bbio->raid_map[(i+rot) % num_stripes] = 6476 em->start + (tmp + i) * map->stripe_len; 6477 6478 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 6479 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 6480 bbio->raid_map[(i+rot+1) % num_stripes] = 6481 RAID6_Q_STRIPE; 6482 6483 sort_parity_stripes(bbio, num_stripes); 6484 } 6485 6486 if (need_full_stripe(op)) 6487 max_errors = btrfs_chunk_max_errors(map); 6488 6489 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 6490 need_full_stripe(op)) { 6491 handle_ops_on_dev_replace(op, &bbio, dev_replace, logical, 6492 &num_stripes, &max_errors); 6493 } 6494 6495 *bbio_ret = bbio; 6496 bbio->map_type = map->type; 6497 bbio->num_stripes = num_stripes; 6498 bbio->max_errors = max_errors; 6499 bbio->mirror_num = mirror_num; 6500 6501 /* 6502 * this is the case that REQ_READ && dev_replace_is_ongoing && 6503 * mirror_num == num_stripes + 1 && dev_replace target drive is 6504 * available as a mirror 6505 */ 6506 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 6507 WARN_ON(num_stripes > 1); 6508 bbio->stripes[0].dev = dev_replace->tgtdev; 6509 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 6510 bbio->mirror_num = map->num_stripes + 1; 6511 } 6512 out: 6513 if (dev_replace_is_ongoing) { 6514 lockdep_assert_held(&dev_replace->rwsem); 6515 /* Unlock and let waiting writers proceed */ 6516 up_read(&dev_replace->rwsem); 6517 } 6518 free_extent_map(em); 6519 return ret; 6520 } 6521 6522 int 
btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6523 u64 logical, u64 *length, 6524 struct btrfs_bio **bbio_ret, int mirror_num) 6525 { 6526 if (op == BTRFS_MAP_DISCARD) 6527 return __btrfs_map_block_for_discard(fs_info, logical, 6528 length, bbio_ret); 6529 6530 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 6531 mirror_num, 0); 6532 } 6533 6534 /* For Scrub/replace */ 6535 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6536 u64 logical, u64 *length, 6537 struct btrfs_bio **bbio_ret) 6538 { 6539 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); 6540 } 6541 6542 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) 6543 { 6544 bio->bi_private = bbio->private; 6545 bio->bi_end_io = bbio->end_io; 6546 bio_endio(bio); 6547 6548 btrfs_put_bbio(bbio); 6549 } 6550 6551 static void btrfs_end_bio(struct bio *bio) 6552 { 6553 struct btrfs_bio *bbio = bio->bi_private; 6554 int is_orig_bio = 0; 6555 6556 if (bio->bi_status) { 6557 atomic_inc(&bbio->error); 6558 if (bio->bi_status == BLK_STS_IOERR || 6559 bio->bi_status == BLK_STS_TARGET) { 6560 struct btrfs_device *dev = btrfs_io_bio(bio)->device; 6561 6562 ASSERT(dev->bdev); 6563 if (btrfs_op(bio) == BTRFS_MAP_WRITE) 6564 btrfs_dev_stat_inc_and_print(dev, 6565 BTRFS_DEV_STAT_WRITE_ERRS); 6566 else if (!(bio->bi_opf & REQ_RAHEAD)) 6567 btrfs_dev_stat_inc_and_print(dev, 6568 BTRFS_DEV_STAT_READ_ERRS); 6569 if (bio->bi_opf & REQ_PREFLUSH) 6570 btrfs_dev_stat_inc_and_print(dev, 6571 BTRFS_DEV_STAT_FLUSH_ERRS); 6572 } 6573 } 6574 6575 if (bio == bbio->orig_bio) 6576 is_orig_bio = 1; 6577 6578 btrfs_bio_counter_dec(bbio->fs_info); 6579 6580 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6581 if (!is_orig_bio) { 6582 bio_put(bio); 6583 bio = bbio->orig_bio; 6584 } 6585 6586 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6587 /* only send an error to the higher layers if it is 6588 * beyond the tolerance of the btrfs bio 6589 */ 6590 if (atomic_read(&bbio->error) > bbio->max_errors) { 6591 bio->bi_status = BLK_STS_IOERR; 6592 } else { 6593 /* 6594 * this bio is actually up to date, we didn't 6595 * go over the max number of errors 6596 */ 6597 bio->bi_status = BLK_STS_OK; 6598 } 6599 6600 btrfs_end_bbio(bbio, bio); 6601 } else if (!is_orig_bio) { 6602 bio_put(bio); 6603 } 6604 } 6605 6606 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, 6607 u64 physical, struct btrfs_device *dev) 6608 { 6609 struct btrfs_fs_info *fs_info = bbio->fs_info; 6610 6611 bio->bi_private = bbio; 6612 btrfs_io_bio(bio)->device = dev; 6613 bio->bi_end_io = btrfs_end_bio; 6614 bio->bi_iter.bi_sector = physical >> 9; 6615 /* 6616 * For zone append writing, bi_sector must point the beginning of the 6617 * zone 6618 */ 6619 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 6620 if (btrfs_dev_is_sequential(dev, physical)) { 6621 u64 zone_start = round_down(physical, fs_info->zone_size); 6622 6623 bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; 6624 } else { 6625 bio->bi_opf &= ~REQ_OP_ZONE_APPEND; 6626 bio->bi_opf |= REQ_OP_WRITE; 6627 } 6628 } 6629 btrfs_debug_in_rcu(fs_info, 6630 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6631 bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, 6632 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), 6633 dev->devid, bio->bi_iter.bi_size); 6634 bio_set_dev(bio, dev->bdev); 6635 6636 btrfs_bio_counter_inc_noblocked(fs_info); 6637 6638 btrfsic_submit_bio(bio); 6639 } 6640 6641 static 
void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 6642 { 6643 atomic_inc(&bbio->error); 6644 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6645 /* Should be the original bio. */ 6646 WARN_ON(bio != bbio->orig_bio); 6647 6648 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6649 bio->bi_iter.bi_sector = logical >> 9; 6650 if (atomic_read(&bbio->error) > bbio->max_errors) 6651 bio->bi_status = BLK_STS_IOERR; 6652 else 6653 bio->bi_status = BLK_STS_OK; 6654 btrfs_end_bbio(bbio, bio); 6655 } 6656 } 6657 6658 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 6659 int mirror_num) 6660 { 6661 struct btrfs_device *dev; 6662 struct bio *first_bio = bio; 6663 u64 logical = bio->bi_iter.bi_sector << 9; 6664 u64 length = 0; 6665 u64 map_length; 6666 int ret; 6667 int dev_nr; 6668 int total_devs; 6669 struct btrfs_bio *bbio = NULL; 6670 6671 length = bio->bi_iter.bi_size; 6672 map_length = length; 6673 6674 btrfs_bio_counter_inc_blocked(fs_info); 6675 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, 6676 &map_length, &bbio, mirror_num, 1); 6677 if (ret) { 6678 btrfs_bio_counter_dec(fs_info); 6679 return errno_to_blk_status(ret); 6680 } 6681 6682 total_devs = bbio->num_stripes; 6683 bbio->orig_bio = first_bio; 6684 bbio->private = first_bio->bi_private; 6685 bbio->end_io = first_bio->bi_end_io; 6686 bbio->fs_info = fs_info; 6687 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 6688 6689 if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 6690 ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { 6691 /* In this case, map_length has been set to the length of 6692 a single stripe; not the whole write */ 6693 if (btrfs_op(bio) == BTRFS_MAP_WRITE) { 6694 ret = raid56_parity_write(fs_info, bio, bbio, 6695 map_length); 6696 } else { 6697 ret = raid56_parity_recover(fs_info, bio, bbio, 6698 map_length, mirror_num, 1); 6699 } 6700 6701 btrfs_bio_counter_dec(fs_info); 6702 return errno_to_blk_status(ret); 6703 } 6704 6705 if (map_length < length) { 6706 btrfs_crit(fs_info, 6707 "mapping failed logical %llu bio len %llu len %llu", 6708 logical, length, map_length); 6709 BUG(); 6710 } 6711 6712 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { 6713 dev = bbio->stripes[dev_nr].dev; 6714 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, 6715 &dev->dev_state) || 6716 (btrfs_op(first_bio) == BTRFS_MAP_WRITE && 6717 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { 6718 bbio_error(bbio, first_bio, logical); 6719 continue; 6720 } 6721 6722 if (dev_nr < total_devs - 1) 6723 bio = btrfs_bio_clone(first_bio); 6724 else 6725 bio = first_bio; 6726 6727 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev); 6728 } 6729 btrfs_bio_counter_dec(fs_info); 6730 return BLK_STS_OK; 6731 } 6732 6733 /* 6734 * Find a device specified by @devid or @uuid in the list of @fs_devices, or 6735 * return NULL. 6736 * 6737 * If devid and uuid are both specified, the match must be exact, otherwise 6738 * only devid is used. 
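 * If @fsid is given, only fs_devices (including seed fs_devices) whose
 * metadata_uuid matches it are searched.
 *
 * Illustrative lookup by devid and device uuid, as done by read_one_chunk()
 * further down:
 *
 *   dev = btrfs_find_device(fs_info->fs_devices, devid, uuid, NULL);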
6739 */ 6740 struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, 6741 u64 devid, u8 *uuid, u8 *fsid) 6742 { 6743 struct btrfs_device *device; 6744 struct btrfs_fs_devices *seed_devs; 6745 6746 if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { 6747 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6748 if (device->devid == devid && 6749 (!uuid || memcmp(device->uuid, uuid, 6750 BTRFS_UUID_SIZE) == 0)) 6751 return device; 6752 } 6753 } 6754 6755 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 6756 if (!fsid || 6757 !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { 6758 list_for_each_entry(device, &seed_devs->devices, 6759 dev_list) { 6760 if (device->devid == devid && 6761 (!uuid || memcmp(device->uuid, uuid, 6762 BTRFS_UUID_SIZE) == 0)) 6763 return device; 6764 } 6765 } 6766 } 6767 6768 return NULL; 6769 } 6770 6771 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6772 u64 devid, u8 *dev_uuid) 6773 { 6774 struct btrfs_device *device; 6775 unsigned int nofs_flag; 6776 6777 /* 6778 * We call this under the chunk_mutex, so we want to use NOFS for this 6779 * allocation, however we don't want to change btrfs_alloc_device() to 6780 * always do NOFS because we use it in a lot of other GFP_KERNEL safe 6781 * places. 6782 */ 6783 nofs_flag = memalloc_nofs_save(); 6784 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6785 memalloc_nofs_restore(nofs_flag); 6786 if (IS_ERR(device)) 6787 return device; 6788 6789 list_add(&device->dev_list, &fs_devices->devices); 6790 device->fs_devices = fs_devices; 6791 fs_devices->num_devices++; 6792 6793 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6794 fs_devices->missing_devices++; 6795 6796 return device; 6797 } 6798 6799 /** 6800 * btrfs_alloc_device - allocate struct btrfs_device 6801 * @fs_info: used only for generating a new devid, can be NULL if 6802 * devid is provided (i.e. @devid != NULL). 6803 * @devid: a pointer to devid for this device. If NULL a new devid 6804 * is generated. 6805 * @uuid: a pointer to UUID for this device. If NULL a new UUID 6806 * is generated. 6807 * 6808 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 6809 * on error. Returned struct is not linked onto any lists and must be 6810 * destroyed with btrfs_free_device. 
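 * A minimal usage sketch, mirroring add_missing_dev() above:
 *
 *   device = btrfs_alloc_device(NULL, &devid, dev_uuid);
 *   if (IS_ERR(device))
 *           return device;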
6811 */ 6812 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 6813 const u64 *devid, 6814 const u8 *uuid) 6815 { 6816 struct btrfs_device *dev; 6817 u64 tmp; 6818 6819 if (WARN_ON(!devid && !fs_info)) 6820 return ERR_PTR(-EINVAL); 6821 6822 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 6823 if (!dev) 6824 return ERR_PTR(-ENOMEM); 6825 6826 /* 6827 * Preallocate a bio that's always going to be used for flushing device 6828 * barriers and matches the device lifespan 6829 */ 6830 dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0); 6831 if (!dev->flush_bio) { 6832 kfree(dev); 6833 return ERR_PTR(-ENOMEM); 6834 } 6835 6836 INIT_LIST_HEAD(&dev->dev_list); 6837 INIT_LIST_HEAD(&dev->dev_alloc_list); 6838 INIT_LIST_HEAD(&dev->post_commit_list); 6839 6840 atomic_set(&dev->reada_in_flight, 0); 6841 atomic_set(&dev->dev_stats_ccnt, 0); 6842 btrfs_device_data_ordered_init(dev); 6843 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 6844 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 6845 extent_io_tree_init(fs_info, &dev->alloc_state, 6846 IO_TREE_DEVICE_ALLOC_STATE, NULL); 6847 6848 if (devid) 6849 tmp = *devid; 6850 else { 6851 int ret; 6852 6853 ret = find_next_devid(fs_info, &tmp); 6854 if (ret) { 6855 btrfs_free_device(dev); 6856 return ERR_PTR(ret); 6857 } 6858 } 6859 dev->devid = tmp; 6860 6861 if (uuid) 6862 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 6863 else 6864 generate_random_uuid(dev->uuid); 6865 6866 return dev; 6867 } 6868 6869 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 6870 u64 devid, u8 *uuid, bool error) 6871 { 6872 if (error) 6873 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 6874 devid, uuid); 6875 else 6876 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 6877 devid, uuid); 6878 } 6879 6880 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) 6881 { 6882 const int data_stripes = calc_data_stripes(type, num_stripes); 6883 6884 return div_u64(chunk_len, data_stripes); 6885 } 6886 6887 #if BITS_PER_LONG == 32 6888 /* 6889 * Due to the page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE 6890 * can't be accessed on 32-bit systems. 6891 * 6892 * This function does a mount-time check to reject the fs if it already has 6893 * a metadata chunk beyond that limit. 6894 */ 6895 static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 6896 u64 logical, u64 length, u64 type) 6897 { 6898 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 6899 return 0; 6900 6901 if (logical + length < MAX_LFS_FILESIZE) 6902 return 0; 6903 6904 btrfs_err_32bit_limit(fs_info); 6905 return -EOVERFLOW; 6906 } 6907 6908 /* 6909 * This is to give an early warning for any metadata chunk reaching 6910 * BTRFS_32BIT_EARLY_WARN_THRESHOLD. 6911 * Although we can still access the metadata, it's not going to be possible 6912 * once the limit is reached.
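 * Unlike check_32bit_meta_chunk() above, this helper only warns via
 * btrfs_warn_32bit_limit() and never fails the mount.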
6913 */ 6914 static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 6915 u64 logical, u64 length, u64 type) 6916 { 6917 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 6918 return; 6919 6920 if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD) 6921 return; 6922 6923 btrfs_warn_32bit_limit(fs_info); 6924 } 6925 #endif 6926 6927 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, 6928 struct btrfs_chunk *chunk) 6929 { 6930 struct btrfs_fs_info *fs_info = leaf->fs_info; 6931 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 6932 struct map_lookup *map; 6933 struct extent_map *em; 6934 u64 logical; 6935 u64 length; 6936 u64 devid; 6937 u64 type; 6938 u8 uuid[BTRFS_UUID_SIZE]; 6939 int num_stripes; 6940 int ret; 6941 int i; 6942 6943 logical = key->offset; 6944 length = btrfs_chunk_length(leaf, chunk); 6945 type = btrfs_chunk_type(leaf, chunk); 6946 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6947 6948 #if BITS_PER_LONG == 32 6949 ret = check_32bit_meta_chunk(fs_info, logical, length, type); 6950 if (ret < 0) 6951 return ret; 6952 warn_32bit_meta_chunk(fs_info, logical, length, type); 6953 #endif 6954 6955 /* 6956 * Only need to verify chunk item if we're reading from sys chunk array, 6957 * as chunk item in tree block is already verified by tree-checker. 6958 */ 6959 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { 6960 ret = btrfs_check_chunk_valid(leaf, chunk, logical); 6961 if (ret) 6962 return ret; 6963 } 6964 6965 read_lock(&map_tree->lock); 6966 em = lookup_extent_mapping(map_tree, logical, 1); 6967 read_unlock(&map_tree->lock); 6968 6969 /* already mapped? */ 6970 if (em && em->start <= logical && em->start + em->len > logical) { 6971 free_extent_map(em); 6972 return 0; 6973 } else if (em) { 6974 free_extent_map(em); 6975 } 6976 6977 em = alloc_extent_map(); 6978 if (!em) 6979 return -ENOMEM; 6980 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 6981 if (!map) { 6982 free_extent_map(em); 6983 return -ENOMEM; 6984 } 6985 6986 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 6987 em->map_lookup = map; 6988 em->start = logical; 6989 em->len = length; 6990 em->orig_start = 0; 6991 em->block_start = 0; 6992 em->block_len = em->len; 6993 6994 map->num_stripes = num_stripes; 6995 map->io_width = btrfs_chunk_io_width(leaf, chunk); 6996 map->io_align = btrfs_chunk_io_align(leaf, chunk); 6997 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6998 map->type = type; 6999 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 7000 map->verified_stripes = 0; 7001 em->orig_block_len = calc_stripe_length(type, em->len, 7002 map->num_stripes); 7003 for (i = 0; i < num_stripes; i++) { 7004 map->stripes[i].physical = 7005 btrfs_stripe_offset_nr(leaf, chunk, i); 7006 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 7007 read_extent_buffer(leaf, uuid, (unsigned long) 7008 btrfs_stripe_dev_uuid_nr(chunk, i), 7009 BTRFS_UUID_SIZE); 7010 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, 7011 devid, uuid, NULL); 7012 if (!map->stripes[i].dev && 7013 !btrfs_test_opt(fs_info, DEGRADED)) { 7014 free_extent_map(em); 7015 btrfs_report_missing_device(fs_info, devid, uuid, true); 7016 return -ENOENT; 7017 } 7018 if (!map->stripes[i].dev) { 7019 map->stripes[i].dev = 7020 add_missing_dev(fs_info->fs_devices, devid, 7021 uuid); 7022 if (IS_ERR(map->stripes[i].dev)) { 7023 free_extent_map(em); 7024 btrfs_err(fs_info, 7025 "failed to init missing dev %llu: %ld", 7026 devid, PTR_ERR(map->stripes[i].dev)); 7027 return PTR_ERR(map->stripes[i].dev); 7028 } 7029 
btrfs_report_missing_device(fs_info, devid, uuid, false); 7030 } 7031 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 7032 &(map->stripes[i].dev->dev_state)); 7033 7034 } 7035 7036 write_lock(&map_tree->lock); 7037 ret = add_extent_mapping(map_tree, em, 0); 7038 write_unlock(&map_tree->lock); 7039 if (ret < 0) { 7040 btrfs_err(fs_info, 7041 "failed to add chunk map, start=%llu len=%llu: %d", 7042 em->start, em->len, ret); 7043 } 7044 free_extent_map(em); 7045 7046 return ret; 7047 } 7048 7049 static void fill_device_from_item(struct extent_buffer *leaf, 7050 struct btrfs_dev_item *dev_item, 7051 struct btrfs_device *device) 7052 { 7053 unsigned long ptr; 7054 7055 device->devid = btrfs_device_id(leaf, dev_item); 7056 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 7057 device->total_bytes = device->disk_total_bytes; 7058 device->commit_total_bytes = device->disk_total_bytes; 7059 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 7060 device->commit_bytes_used = device->bytes_used; 7061 device->type = btrfs_device_type(leaf, dev_item); 7062 device->io_align = btrfs_device_io_align(leaf, dev_item); 7063 device->io_width = btrfs_device_io_width(leaf, dev_item); 7064 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 7065 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 7066 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 7067 7068 ptr = btrfs_device_uuid(dev_item); 7069 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 7070 } 7071 7072 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 7073 u8 *fsid) 7074 { 7075 struct btrfs_fs_devices *fs_devices; 7076 int ret; 7077 7078 lockdep_assert_held(&uuid_mutex); 7079 ASSERT(fsid); 7080 7081 /* This will match only for multi-device seed fs */ 7082 list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) 7083 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 7084 return fs_devices; 7085 7086 7087 fs_devices = find_fsid(fsid, NULL); 7088 if (!fs_devices) { 7089 if (!btrfs_test_opt(fs_info, DEGRADED)) 7090 return ERR_PTR(-ENOENT); 7091 7092 fs_devices = alloc_fs_devices(fsid, NULL); 7093 if (IS_ERR(fs_devices)) 7094 return fs_devices; 7095 7096 fs_devices->seeding = true; 7097 fs_devices->opened = 1; 7098 return fs_devices; 7099 } 7100 7101 /* 7102 * Upon first call for a seed fs fsid, just create a private copy of the 7103 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list 7104 */ 7105 fs_devices = clone_fs_devices(fs_devices); 7106 if (IS_ERR(fs_devices)) 7107 return fs_devices; 7108 7109 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); 7110 if (ret) { 7111 free_fs_devices(fs_devices); 7112 return ERR_PTR(ret); 7113 } 7114 7115 if (!fs_devices->seeding) { 7116 close_fs_devices(fs_devices); 7117 free_fs_devices(fs_devices); 7118 return ERR_PTR(-EINVAL); 7119 } 7120 7121 list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); 7122 7123 return fs_devices; 7124 } 7125 7126 static int read_one_dev(struct extent_buffer *leaf, 7127 struct btrfs_dev_item *dev_item) 7128 { 7129 struct btrfs_fs_info *fs_info = leaf->fs_info; 7130 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7131 struct btrfs_device *device; 7132 u64 devid; 7133 int ret; 7134 u8 fs_uuid[BTRFS_FSID_SIZE]; 7135 u8 dev_uuid[BTRFS_UUID_SIZE]; 7136 7137 devid = btrfs_device_id(leaf, dev_item); 7138 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 7139 BTRFS_UUID_SIZE); 7140 read_extent_buffer(leaf, 
fs_uuid, btrfs_device_fsid(dev_item), 7141 BTRFS_FSID_SIZE); 7142 7143 if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { 7144 fs_devices = open_seed_devices(fs_info, fs_uuid); 7145 if (IS_ERR(fs_devices)) 7146 return PTR_ERR(fs_devices); 7147 } 7148 7149 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 7150 fs_uuid); 7151 if (!device) { 7152 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7153 btrfs_report_missing_device(fs_info, devid, 7154 dev_uuid, true); 7155 return -ENOENT; 7156 } 7157 7158 device = add_missing_dev(fs_devices, devid, dev_uuid); 7159 if (IS_ERR(device)) { 7160 btrfs_err(fs_info, 7161 "failed to add missing dev %llu: %ld", 7162 devid, PTR_ERR(device)); 7163 return PTR_ERR(device); 7164 } 7165 btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 7166 } else { 7167 if (!device->bdev) { 7168 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7169 btrfs_report_missing_device(fs_info, 7170 devid, dev_uuid, true); 7171 return -ENOENT; 7172 } 7173 btrfs_report_missing_device(fs_info, devid, 7174 dev_uuid, false); 7175 } 7176 7177 if (!device->bdev && 7178 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 7179 /* 7180 * this happens when a device that was properly setup 7181 * in the device info lists suddenly goes bad. 7182 * device->bdev is NULL, and so we have to set 7183 * device->missing to one here 7184 */ 7185 device->fs_devices->missing_devices++; 7186 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 7187 } 7188 7189 /* Move the device to its own fs_devices */ 7190 if (device->fs_devices != fs_devices) { 7191 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, 7192 &device->dev_state)); 7193 7194 list_move(&device->dev_list, &fs_devices->devices); 7195 device->fs_devices->num_devices--; 7196 fs_devices->num_devices++; 7197 7198 device->fs_devices->missing_devices--; 7199 fs_devices->missing_devices++; 7200 7201 device->fs_devices = fs_devices; 7202 } 7203 } 7204 7205 if (device->fs_devices != fs_info->fs_devices) { 7206 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); 7207 if (device->generation != 7208 btrfs_device_generation(leaf, dev_item)) 7209 return -EINVAL; 7210 } 7211 7212 fill_device_from_item(leaf, dev_item, device); 7213 if (device->bdev) { 7214 u64 max_total_bytes = i_size_read(device->bdev->bd_inode); 7215 7216 if (device->total_bytes > max_total_bytes) { 7217 btrfs_err(fs_info, 7218 "device total_bytes should be at most %llu but found %llu", 7219 max_total_bytes, device->total_bytes); 7220 return -EINVAL; 7221 } 7222 } 7223 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 7224 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 7225 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 7226 device->fs_devices->total_rw_bytes += device->total_bytes; 7227 atomic64_add(device->total_bytes - device->bytes_used, 7228 &fs_info->free_chunk_space); 7229 } 7230 ret = 0; 7231 return ret; 7232 } 7233 7234 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 7235 { 7236 struct btrfs_root *root = fs_info->tree_root; 7237 struct btrfs_super_block *super_copy = fs_info->super_copy; 7238 struct extent_buffer *sb; 7239 struct btrfs_disk_key *disk_key; 7240 struct btrfs_chunk *chunk; 7241 u8 *array_ptr; 7242 unsigned long sb_array_offset; 7243 int ret = 0; 7244 u32 num_stripes; 7245 u32 array_size; 7246 u32 len = 0; 7247 u32 cur_offset; 7248 u64 type; 7249 struct btrfs_key key; 7250 7251 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 7252 /* 7253 * This will create extent buffer of nodesize, 
superblock size is 7254 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will 7255 * overallocate but we can keep it as-is, only the first page is used. 7256 */ 7257 sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET, 7258 root->root_key.objectid, 0); 7259 if (IS_ERR(sb)) 7260 return PTR_ERR(sb); 7261 set_extent_buffer_uptodate(sb); 7262 /* 7263 * The sb extent buffer is artificial and just used to read the system array. 7264 * set_extent_buffer_uptodate() call does not properly mark all it's 7265 * pages up-to-date when the page is larger: extent does not cover the 7266 * whole page and consequently check_page_uptodate does not find all 7267 * the page's extents up-to-date (the hole beyond sb), 7268 * write_extent_buffer then triggers a WARN_ON. 7269 * 7270 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, 7271 * but sb spans only this function. Add an explicit SetPageUptodate call 7272 * to silence the warning eg. on PowerPC 64. 7273 */ 7274 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 7275 SetPageUptodate(sb->pages[0]); 7276 7277 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 7278 array_size = btrfs_super_sys_array_size(super_copy); 7279 7280 array_ptr = super_copy->sys_chunk_array; 7281 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 7282 cur_offset = 0; 7283 7284 while (cur_offset < array_size) { 7285 disk_key = (struct btrfs_disk_key *)array_ptr; 7286 len = sizeof(*disk_key); 7287 if (cur_offset + len > array_size) 7288 goto out_short_read; 7289 7290 btrfs_disk_key_to_cpu(&key, disk_key); 7291 7292 array_ptr += len; 7293 sb_array_offset += len; 7294 cur_offset += len; 7295 7296 if (key.type != BTRFS_CHUNK_ITEM_KEY) { 7297 btrfs_err(fs_info, 7298 "unexpected item type %u in sys_array at offset %u", 7299 (u32)key.type, cur_offset); 7300 ret = -EIO; 7301 break; 7302 } 7303 7304 chunk = (struct btrfs_chunk *)sb_array_offset; 7305 /* 7306 * At least one btrfs_chunk with one stripe must be present, 7307 * exact stripe count check comes afterwards 7308 */ 7309 len = btrfs_chunk_item_size(1); 7310 if (cur_offset + len > array_size) 7311 goto out_short_read; 7312 7313 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 7314 if (!num_stripes) { 7315 btrfs_err(fs_info, 7316 "invalid number of stripes %u in sys_array at offset %u", 7317 num_stripes, cur_offset); 7318 ret = -EIO; 7319 break; 7320 } 7321 7322 type = btrfs_chunk_type(sb, chunk); 7323 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 7324 btrfs_err(fs_info, 7325 "invalid chunk type %llu in sys_array at offset %u", 7326 type, cur_offset); 7327 ret = -EIO; 7328 break; 7329 } 7330 7331 len = btrfs_chunk_item_size(num_stripes); 7332 if (cur_offset + len > array_size) 7333 goto out_short_read; 7334 7335 ret = read_one_chunk(&key, sb, chunk); 7336 if (ret) 7337 break; 7338 7339 array_ptr += len; 7340 sb_array_offset += len; 7341 cur_offset += len; 7342 } 7343 clear_extent_buffer_uptodate(sb); 7344 free_extent_buffer_stale(sb); 7345 return ret; 7346 7347 out_short_read: 7348 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 7349 len, cur_offset); 7350 clear_extent_buffer_uptodate(sb); 7351 free_extent_buffer_stale(sb); 7352 return -EIO; 7353 } 7354 7355 /* 7356 * Check if all chunks in the fs are OK for read-write degraded mount 7357 * 7358 * If the @failing_dev is specified, it's accounted as missing. 7359 * 7360 * Return true if all chunks meet the minimal RW mount requirements. 
7361 * Return false if any chunk doesn't meet the minimal RW mount requirements. 7362 */ 7363 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 7364 struct btrfs_device *failing_dev) 7365 { 7366 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7367 struct extent_map *em; 7368 u64 next_start = 0; 7369 bool ret = true; 7370 7371 read_lock(&map_tree->lock); 7372 em = lookup_extent_mapping(map_tree, 0, (u64)-1); 7373 read_unlock(&map_tree->lock); 7374 /* No chunk at all? Return false anyway */ 7375 if (!em) { 7376 ret = false; 7377 goto out; 7378 } 7379 while (em) { 7380 struct map_lookup *map; 7381 int missing = 0; 7382 int max_tolerated; 7383 int i; 7384 7385 map = em->map_lookup; 7386 max_tolerated = 7387 btrfs_get_num_tolerated_disk_barrier_failures( 7388 map->type); 7389 for (i = 0; i < map->num_stripes; i++) { 7390 struct btrfs_device *dev = map->stripes[i].dev; 7391 7392 if (!dev || !dev->bdev || 7393 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 7394 dev->last_flush_error) 7395 missing++; 7396 else if (failing_dev && failing_dev == dev) 7397 missing++; 7398 } 7399 if (missing > max_tolerated) { 7400 if (!failing_dev) 7401 btrfs_warn(fs_info, 7402 "chunk %llu missing %d devices, max tolerance is %d for writable mount", 7403 em->start, missing, max_tolerated); 7404 free_extent_map(em); 7405 ret = false; 7406 goto out; 7407 } 7408 next_start = extent_map_end(em); 7409 free_extent_map(em); 7410 7411 read_lock(&map_tree->lock); 7412 em = lookup_extent_mapping(map_tree, next_start, 7413 (u64)(-1) - next_start); 7414 read_unlock(&map_tree->lock); 7415 } 7416 out: 7417 return ret; 7418 } 7419 7420 static void readahead_tree_node_children(struct extent_buffer *node) 7421 { 7422 int i; 7423 const int nr_items = btrfs_header_nritems(node); 7424 7425 for (i = 0; i < nr_items; i++) 7426 btrfs_readahead_node_child(node, i); 7427 } 7428 7429 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 7430 { 7431 struct btrfs_root *root = fs_info->chunk_root; 7432 struct btrfs_path *path; 7433 struct extent_buffer *leaf; 7434 struct btrfs_key key; 7435 struct btrfs_key found_key; 7436 int ret; 7437 int slot; 7438 u64 total_dev = 0; 7439 u64 last_ra_node = 0; 7440 7441 path = btrfs_alloc_path(); 7442 if (!path) 7443 return -ENOMEM; 7444 7445 /* 7446 * uuid_mutex is needed only if we are mounting a sprout FS 7447 * otherwise we don't need it. 7448 */ 7449 mutex_lock(&uuid_mutex); 7450 7451 /* 7452 * It is possible for mount and umount to race in such a way that 7453 * we execute this code path, but open_fs_devices failed to clear 7454 * total_rw_bytes. We certainly want it cleared before reading the 7455 * device items, so clear it here. 7456 */ 7457 fs_info->fs_devices->total_rw_bytes = 0; 7458 7459 /* 7460 * Read all device items, and then all the chunk items. All 7461 * device items are found before any chunk item (their object id 7462 * is smaller than the lowest possible object id for a chunk 7463 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 
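 * A single search starting at key (BTRFS_DEV_ITEMS_OBJECTID, 0, 0) and
 * advanced with btrfs_next_leaf() therefore walks both groups of items in
 * one pass, which is what the loop below does.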
7464 */ 7465 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 7466 key.offset = 0; 7467 key.type = 0; 7468 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7469 if (ret < 0) 7470 goto error; 7471 while (1) { 7472 struct extent_buffer *node; 7473 7474 leaf = path->nodes[0]; 7475 slot = path->slots[0]; 7476 if (slot >= btrfs_header_nritems(leaf)) { 7477 ret = btrfs_next_leaf(root, path); 7478 if (ret == 0) 7479 continue; 7480 if (ret < 0) 7481 goto error; 7482 break; 7483 } 7484 /* 7485 * The nodes on level 1 are not locked but we don't need to do 7486 * that during mount time as nothing else can access the tree 7487 */ 7488 node = path->nodes[1]; 7489 if (node) { 7490 if (last_ra_node != node->start) { 7491 readahead_tree_node_children(node); 7492 last_ra_node = node->start; 7493 } 7494 } 7495 btrfs_item_key_to_cpu(leaf, &found_key, slot); 7496 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 7497 struct btrfs_dev_item *dev_item; 7498 dev_item = btrfs_item_ptr(leaf, slot, 7499 struct btrfs_dev_item); 7500 ret = read_one_dev(leaf, dev_item); 7501 if (ret) 7502 goto error; 7503 total_dev++; 7504 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 7505 struct btrfs_chunk *chunk; 7506 7507 /* 7508 * We are only called at mount time, so no need to take 7509 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings, 7510 * we always lock first fs_info->chunk_mutex before 7511 * acquiring any locks on the chunk tree. This is a 7512 * requirement for chunk allocation, see the comment on 7513 * top of btrfs_chunk_alloc() for details. 7514 */ 7515 ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); 7516 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 7517 ret = read_one_chunk(&found_key, leaf, chunk); 7518 if (ret) 7519 goto error; 7520 } 7521 path->slots[0]++; 7522 } 7523 7524 /* 7525 * After loading chunk tree, we've got all device information, 7526 * do another round of validation checks. 
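 * Two cross-checks follow: the number of device items found here must
 * match the device count recorded in the super block, and total_rw_bytes
 * must not exceed super_total_bytes.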
7527 */ 7528 if (total_dev != fs_info->fs_devices->total_devices) { 7529 btrfs_err(fs_info, 7530 "super_num_devices %llu mismatch with num_devices %llu found here", 7531 btrfs_super_num_devices(fs_info->super_copy), 7532 total_dev); 7533 ret = -EINVAL; 7534 goto error; 7535 } 7536 if (btrfs_super_total_bytes(fs_info->super_copy) < 7537 fs_info->fs_devices->total_rw_bytes) { 7538 btrfs_err(fs_info, 7539 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", 7540 btrfs_super_total_bytes(fs_info->super_copy), 7541 fs_info->fs_devices->total_rw_bytes); 7542 ret = -EINVAL; 7543 goto error; 7544 } 7545 ret = 0; 7546 error: 7547 mutex_unlock(&uuid_mutex); 7548 7549 btrfs_free_path(path); 7550 return ret; 7551 } 7552 7553 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 7554 { 7555 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 7556 struct btrfs_device *device; 7557 7558 fs_devices->fs_info = fs_info; 7559 7560 mutex_lock(&fs_devices->device_list_mutex); 7561 list_for_each_entry(device, &fs_devices->devices, dev_list) 7562 device->fs_info = fs_info; 7563 7564 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 7565 list_for_each_entry(device, &seed_devs->devices, dev_list) 7566 device->fs_info = fs_info; 7567 7568 seed_devs->fs_info = fs_info; 7569 } 7570 mutex_unlock(&fs_devices->device_list_mutex); 7571 } 7572 7573 static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, 7574 const struct btrfs_dev_stats_item *ptr, 7575 int index) 7576 { 7577 u64 val; 7578 7579 read_extent_buffer(eb, &val, 7580 offsetof(struct btrfs_dev_stats_item, values) + 7581 ((unsigned long)ptr) + (index * sizeof(u64)), 7582 sizeof(val)); 7583 return val; 7584 } 7585 7586 static void btrfs_set_dev_stats_value(struct extent_buffer *eb, 7587 struct btrfs_dev_stats_item *ptr, 7588 int index, u64 val) 7589 { 7590 write_extent_buffer(eb, &val, 7591 offsetof(struct btrfs_dev_stats_item, values) + 7592 ((unsigned long)ptr) + (index * sizeof(u64)), 7593 sizeof(val)); 7594 } 7595 7596 static int btrfs_device_init_dev_stats(struct btrfs_device *device, 7597 struct btrfs_path *path) 7598 { 7599 struct btrfs_dev_stats_item *ptr; 7600 struct extent_buffer *eb; 7601 struct btrfs_key key; 7602 int item_size; 7603 int i, ret, slot; 7604 7605 if (!device->fs_info->dev_root) 7606 return 0; 7607 7608 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7609 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7610 key.offset = device->devid; 7611 ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); 7612 if (ret) { 7613 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7614 btrfs_dev_stat_set(device, i, 0); 7615 device->dev_stats_valid = 1; 7616 btrfs_release_path(path); 7617 return ret < 0 ? 
ret : 0; 7618 } 7619 slot = path->slots[0]; 7620 eb = path->nodes[0]; 7621 item_size = btrfs_item_size_nr(eb, slot); 7622 7623 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); 7624 7625 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7626 if (item_size >= (1 + i) * sizeof(__le64)) 7627 btrfs_dev_stat_set(device, i, 7628 btrfs_dev_stats_value(eb, ptr, i)); 7629 else 7630 btrfs_dev_stat_set(device, i, 0); 7631 } 7632 7633 device->dev_stats_valid = 1; 7634 btrfs_dev_stat_print_on_load(device); 7635 btrfs_release_path(path); 7636 7637 return 0; 7638 } 7639 7640 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 7641 { 7642 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 7643 struct btrfs_device *device; 7644 struct btrfs_path *path = NULL; 7645 int ret = 0; 7646 7647 path = btrfs_alloc_path(); 7648 if (!path) 7649 return -ENOMEM; 7650 7651 mutex_lock(&fs_devices->device_list_mutex); 7652 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7653 ret = btrfs_device_init_dev_stats(device, path); 7654 if (ret) 7655 goto out; 7656 } 7657 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 7658 list_for_each_entry(device, &seed_devs->devices, dev_list) { 7659 ret = btrfs_device_init_dev_stats(device, path); 7660 if (ret) 7661 goto out; 7662 } 7663 } 7664 out: 7665 mutex_unlock(&fs_devices->device_list_mutex); 7666 7667 btrfs_free_path(path); 7668 return ret; 7669 } 7670 7671 static int update_dev_stat_item(struct btrfs_trans_handle *trans, 7672 struct btrfs_device *device) 7673 { 7674 struct btrfs_fs_info *fs_info = trans->fs_info; 7675 struct btrfs_root *dev_root = fs_info->dev_root; 7676 struct btrfs_path *path; 7677 struct btrfs_key key; 7678 struct extent_buffer *eb; 7679 struct btrfs_dev_stats_item *ptr; 7680 int ret; 7681 int i; 7682 7683 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7684 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7685 key.offset = device->devid; 7686 7687 path = btrfs_alloc_path(); 7688 if (!path) 7689 return -ENOMEM; 7690 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 7691 if (ret < 0) { 7692 btrfs_warn_in_rcu(fs_info, 7693 "error %d while searching for dev_stats item for device %s", 7694 ret, rcu_str_deref(device->name)); 7695 goto out; 7696 } 7697 7698 if (ret == 0 && 7699 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 7700 /* need to delete old one and insert a new one */ 7701 ret = btrfs_del_item(trans, dev_root, path); 7702 if (ret != 0) { 7703 btrfs_warn_in_rcu(fs_info, 7704 "delete too small dev_stats item for device %s failed %d", 7705 rcu_str_deref(device->name), ret); 7706 goto out; 7707 } 7708 ret = 1; 7709 } 7710 7711 if (ret == 1) { 7712 /* need to insert a new item */ 7713 btrfs_release_path(path); 7714 ret = btrfs_insert_empty_item(trans, dev_root, path, 7715 &key, sizeof(*ptr)); 7716 if (ret < 0) { 7717 btrfs_warn_in_rcu(fs_info, 7718 "insert dev_stats item for device %s failed %d", 7719 rcu_str_deref(device->name), ret); 7720 goto out; 7721 } 7722 } 7723 7724 eb = path->nodes[0]; 7725 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 7726 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7727 btrfs_set_dev_stats_value(eb, ptr, i, 7728 btrfs_dev_stat_read(device, i)); 7729 btrfs_mark_buffer_dirty(eb); 7730 7731 out: 7732 btrfs_free_path(path); 7733 return ret; 7734 } 7735 7736 /* 7737 * called from commit_transaction. Writes all changed device stats to disk. 
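 * Only devices with a non-zero dev_stats_ccnt are written out; on success
 * the snapshot taken before the update is subtracted from the counter, so
 * increments that happened in the meantime stay pending for the next
 * commit.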
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		ret = btrfs_device_init_dev_stats(device, path);
		if (ret)
			goto out;
	}
	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		list_for_each_entry(device, &seed_devs->devices, dev_list) {
			ret = btrfs_device_init_dev_stats(device, path);
			if (ret)
				goto out;
		}
	}
out:
	mutex_unlock(&fs_devices->device_list_mutex);

	btrfs_free_path(path);
	return ret;
}

static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Called from commit_transaction. Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;

		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values which requires
		 * reading the in-memory counters. Such control dependencies
		 * require explicit read memory barriers.
		 *
		 * This memory barrier pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stat_read_and_reset.
		 */
		smp_rmb();

		ret = update_dev_stat_item(trans, device);
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}
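/*
 * Editor's sketch (added commentary, not part of the original file): the
 * barrier pairing described in the comment above, shown schematically.
 * btrfs_dev_stat_inc() and btrfs_dev_stat_set() are assumed to follow the
 * usual pattern from volumes.h, i.e. update the per-type counter first and
 * only then bump dev_stats_ccnt behind a write-side barrier:
 *
 *	writer (error path)			reader (commit path, above)
 *	-------------------			---------------------------
 *	atomic_inc(&dev->dev_stat_values[i]);	stats_cnt = atomic_read(&dev->dev_stats_ccnt);
 *	smp_mb__before_atomic();		if (stats_cnt == 0) continue;
 *	atomic_inc(&dev->dev_stats_ccnt);	smp_rmb();
 *						... btrfs_dev_stat_read(dev, i) ...
 *
 * The smp_rmb() ensures that once a non-zero dev_stats_ccnt has been
 * observed, the following reads of the individual counters see values at
 * least as new as the ones that caused the counter bump.
 */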
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
		       rcu_str_deref(dev->name),
		       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
		       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
		       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
		       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
		       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_set(dev, i, 0);
		}
		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
			   current->comm, task_pid_nr(current));
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}
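/*
 * Editor's illustration (added commentary, not part of the original file):
 * these counters are what userspace sees through the BTRFS_IOC_GET_DEV_STATS
 * ioctl, i.e. "btrfs device stats".  A minimal sketch of a caller, assuming
 * the uapi layout from <linux/btrfs.h> (devid, nr_items, flags, values[]):
 *
 *	#include <stdio.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/btrfs.h>
 *
 *	int print_dev_stats(const char *mnt, __u64 devid)
 *	{
 *		struct btrfs_ioctl_get_dev_stats args = { .devid = devid };
 *		int fd = open(mnt, O_RDONLY);	// any fd inside the btrfs mount
 *
 *		if (fd < 0)
 *			return -1;
 *		args.nr_items = BTRFS_DEV_STAT_VALUES_MAX;
 *		args.flags = 0;			// BTRFS_DEV_STATS_RESET would zero them
 *		if (ioctl(fd, BTRFS_IOC_GET_DEV_STATS, &args) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		printf("wr %llu rd %llu flush %llu corrupt %llu gen %llu\n",
 *		       (unsigned long long)args.values[BTRFS_DEV_STAT_WRITE_ERRS],
 *		       (unsigned long long)args.values[BTRFS_DEV_STAT_READ_ERRS],
 *		       (unsigned long long)args.values[BTRFS_DEV_STAT_FLUSH_ERRS],
 *		       (unsigned long long)args.values[BTRFS_DEV_STAT_CORRUPTION_ERRS],
 *		       (unsigned long long)args.values[BTRFS_DEV_STAT_GENERATION_ERRS]);
 *		close(fd);
 *		return 0;
 *	}
 *
 * btrfs_get_dev_stats() clamps nr_items to BTRFS_DEV_STAT_VALUES_MAX on
 * return, so a caller may pass a larger value and inspect the clamped result.
 */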
/*
 * Update the size and bytes used for each device where it changed. This is
 * delayed since we would otherwise get errors while writing out the
 * superblocks.
 *
 * Must be invoked during transaction commit.
 */
void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
	struct btrfs_device *curr, *next;

	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);

	if (list_empty(&trans->dev_update_list))
		return;

	/*
	 * We don't need the device_list_mutex here. This list is owned by the
	 * transaction and the transaction must complete before the device is
	 * released.
	 */
	mutex_lock(&trans->fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
				 post_commit_list) {
		list_del_init(&curr->post_commit_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
		curr->commit_bytes_used = curr->bytes_used;
	}
	mutex_unlock(&trans->fs_info->chunk_mutex);
}

/*
 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
 */
int btrfs_bg_type_to_factor(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	return btrfs_raid_array[index].ncopies;
}

static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
				 u64 chunk_offset, u64 devid,
				 u64 physical_offset, u64 physical_len)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	u64 stripe_len;
	bool found = false;
	int ret = 0;
	int i;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
		goto out;
	}

	map = em->map_lookup;
	stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
	if (physical_len != stripe_len) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
			  physical_offset, devid, em->start, physical_len,
			  stripe_len);
		ret = -EUCLEAN;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		if (map->stripes[i].dev->devid == devid &&
		    map->stripes[i].physical == physical_offset) {
			found = true;
			if (map->verified_stripes >= map->num_stripes) {
				btrfs_err(fs_info,
					"too many dev extents for chunk %llu found",
					  em->start);
				ret = -EUCLEAN;
				goto out;
			}
			map->verified_stripes++;
			break;
		}
	}
	if (!found) {
		btrfs_err(fs_info,
	"dev extent physical offset %llu devid %llu has no corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
	}

	/* Make sure no dev extent is beyond device boundary */
	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
	if (!dev) {
		btrfs_err(fs_info, "failed to find devid %llu", devid);
		ret = -EUCLEAN;
		goto out;
	}

	if (physical_offset + physical_len > dev->disk_total_bytes) {
		btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
			  devid, physical_offset, physical_len,
			  dev->disk_total_bytes);
		ret = -EUCLEAN;
		goto out;
	}

	if (dev->zone_info) {
		u64 zone_size = dev->zone_info->zone_size;

		if (!IS_ALIGNED(physical_offset, zone_size) ||
		    !IS_ALIGNED(physical_len, zone_size)) {
			btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
				  devid, physical_offset, physical_len);
			ret = -EUCLEAN;
			goto out;
		}
	}

out:
	free_extent_map(em);
	return ret;
}
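/*
 * Editor's worked example (added commentary, not from the original source):
 * every on-disk dev extent must correspond to exactly one stripe of a chunk,
 * and its length must equal that chunk's per-device stripe length.  Assuming
 * calc_stripe_length() divides the chunk length by the number of data
 * stripes:
 *
 *	- a 1 GiB RAID1 chunk has num_stripes = 2 and ncopies = 2, so each of
 *	  the two devices carries one 1 GiB dev extent;
 *	- a 1 GiB RAID0 chunk across two devices has num_stripes = 2 and
 *	  ncopies = 1, so each device carries one 512 MiB dev extent;
 *	- a DUP chunk places both stripes on the same device, so that device
 *	  carries two dev extents for the one chunk.
 *
 * verify_one_dev_extent() ticks map->verified_stripes for each matching
 * stripe, and verify_chunk_dev_extent_mapping() below then checks that every
 * chunk ends up with verified_stripes == num_stripes.
 */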
static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct rb_node *node;
	int ret = 0;

	read_lock(&em_tree->lock);
	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
		em = rb_entry(node, struct extent_map, rb_node);
		if (em->map_lookup->num_stripes !=
		    em->map_lookup->verified_stripes) {
			btrfs_err(fs_info,
			"chunk %llu has missing dev extent, have %d expect %d",
				  em->start, em->map_lookup->verified_stripes,
				  em->map_lookup->num_stripes);
			ret = -EUCLEAN;
			goto out;
		}
	}
out:
	read_unlock(&em_tree->lock);
	return ret;
}

/*
 * Ensure that all dev extents are mapped to correct chunk, otherwise
 * later chunk allocation/free would cause unexpected behavior.
 *
 * NOTE: This will iterate through the whole device tree, which should be
 * about the same size as the chunk tree. This slightly increases mount time.
 */
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	u64 prev_devid = 0;
	u64 prev_dev_ext_end = 0;
	int ret = 0;

	/*
	 * We don't have a dev_root because we mounted with ignorebadroots and
	 * failed to load the root, so we want to skip the verification in this
	 * case for sure.
	 *
	 * However if the dev root is fine, but the tree itself is corrupted
	 * we'd still fail to mount. This verification is only to make sure
	 * writes can happen safely, so instead just bypass this check
	 * completely in the case of IGNOREBADROOTS.
	 */
	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
		return 0;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	while (1) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_dev_extent *dext;
		int slot = path->slots[0];
		u64 chunk_offset;
		u64 physical_offset;
		u64 physical_len;
		u64 devid;

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type != BTRFS_DEV_EXTENT_KEY)
			break;
		devid = key.objectid;
		physical_offset = key.offset;

		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
		physical_len = btrfs_dev_extent_length(leaf, dext);

		/* Check if this dev extent overlaps with the previous one */
		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
			btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
				  devid, physical_offset, prev_dev_ext_end);
			ret = -EUCLEAN;
			goto out;
		}

		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
					    physical_offset, physical_len);
		if (ret < 0)
			goto out;
		prev_devid = devid;
		prev_dev_ext_end = physical_offset + physical_len;

		ret = btrfs_next_item(root, path);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = 0;
			break;
		}
	}

	/* Ensure all chunks have corresponding dev extents */
	ret = verify_chunk_dev_extent_mapping(fs_info);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Check whether the given block group or device is pinned by any inode being
 * used as a swapfile.
 */
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
	struct btrfs_swapfile_pin *sp;
	struct rb_node *node;

	spin_lock(&fs_info->swapfile_pins_lock);
	node = fs_info->swapfile_pins.rb_node;
	while (node) {
		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (ptr < sp->ptr)
			node = node->rb_left;
		else if (ptr > sp->ptr)
			node = node->rb_right;
		else
			break;
	}
	spin_unlock(&fs_info->swapfile_pins_lock);
	return node != NULL;
}
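/*
 * Editor's note on the repair flow below (added commentary, not part of the
 * original file): btrfs_repair_one_zone() looks up the block group (taking a
 * reference), marks it with cache->relocating_repair so that concurrent
 * callers bail out early, and then hands that reference to
 * relocating_repair_kthread(), which drops it with btrfs_put_block_group()
 * once it has copied cache->start.  The kthread then looks the group up
 * again under fs_info->reclaim_bgs_lock before relocating, since the group
 * may have been removed in the meantime.
 */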
static int relocating_repair_kthread(void *data)
{
	struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 target;
	int ret = 0;

	target = cache->start;
	btrfs_put_block_group(cache);

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		btrfs_info(fs_info,
			   "zoned: skip relocating block group %llu to repair: EBUSY",
			   target);
		return -EBUSY;
	}

	mutex_lock(&fs_info->reclaim_bgs_lock);

	/* Ensure block group still exists */
	cache = btrfs_lookup_block_group(fs_info, target);
	if (!cache)
		goto out;

	if (!cache->relocating_repair)
		goto out;

	ret = btrfs_may_alloc_data_chunk(fs_info, target);
	if (ret < 0)
		goto out;

	btrfs_info(fs_info,
		   "zoned: relocating block group %llu to repair IO failure",
		   target);
	ret = btrfs_relocate_chunk(fs_info, target);

out:
	if (cache)
		btrfs_put_block_group(cache);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_exclop_finish(fs_info);

	return ret;
}

int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
{
	struct btrfs_block_group *cache;

	/* Do not attempt to repair in degraded state */
	if (btrfs_test_opt(fs_info, DEGRADED))
		return 0;

	cache = btrfs_lookup_block_group(fs_info, logical);
	if (!cache)
		return 0;

	spin_lock(&cache->lock);
	if (cache->relocating_repair) {
		spin_unlock(&cache->lock);
		btrfs_put_block_group(cache);
		return 0;
	}
	cache->relocating_repair = 1;
	spin_unlock(&cache->lock);

	kthread_run(relocating_repair_kthread, cache,
		    "btrfs-relocating-repair");

	return 0;
}