// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
#include "zoned.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes = 2,
		.dev_stripes = 1,
		.devs_max = 0,	/* 0 == as many as possible */
		.devs_min = 2,
		.tolerated_failures = 1,
		.devs_increment = 2,
		.ncopies = 2,
		.nparity = 0,
		.raid_name = "raid10",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 2,
		.devs_min = 2,
		.tolerated_failures = 1,
		.devs_increment = 2,
		.ncopies = 2,
		.nparity = 0,
		.raid_name = "raid1",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 3,
		.devs_min = 3,
		.tolerated_failures = 2,
		.devs_increment = 3,
		.ncopies = 3,
		.nparity = 0,
		.raid_name = "raid1c3",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 4,
		.devs_min = 4,
		.tolerated_failures = 3,
		.devs_increment = 4,
		.ncopies = 4,
		.nparity = 0,
		.raid_name = "raid1c4",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes = 1,
		.dev_stripes = 2,
		.devs_max = 1,
		.devs_min = 1,
		.tolerated_failures = 0,
		.devs_increment = 1,
		.ncopies = 2,
		.nparity = 0,
		.raid_name = "dup",
		.bg_flag = BTRFS_BLOCK_GROUP_DUP,
		.mindev_error = 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 0,
		.devs_min = 1,
		.tolerated_failures = 0,
		.devs_increment = 1,
		.ncopies = 1,
		.nparity = 0,
		.raid_name = "raid0",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error = 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 1,
		.devs_min = 1,
		.tolerated_failures = 0,
		.devs_increment = 1,
		.ncopies = 1,
		.nparity = 0,
		.raid_name = "single",
		.bg_flag = 0,
		.mindev_error = 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 0,
		.devs_min = 2,
		.tolerated_failures = 1,
		.devs_increment = 1,
		.ncopies = 1,
		.nparity = 1,
		.raid_name = "raid5",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 0,
		.devs_min = 3,
		.tolerated_failures = 2,
		.devs_increment = 1,
		.ncopies = 1,
		.nparity = 2,
		.raid_name = "raid6",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};

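/*
 * Reading the table above, e.g. for RAID6: stripes span as many devices as
 * are available (devs_max == 0), at least 3 devices are required (devs_min),
 * up to two of them may fail (tolerated_failures), and each stripe set
 * carries a single copy of the data (ncopies == 1) plus two parity stripes
 * (nparity == 2).
 */
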
/*
 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
 * can be used as index to access btrfs_raid_array[].
 */
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
{
	if (flags & BTRFS_BLOCK_GROUP_RAID10)
		return BTRFS_RAID_RAID10;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
		return BTRFS_RAID_RAID1;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
		return BTRFS_RAID_RAID1C3;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
		return BTRFS_RAID_RAID1C4;
	else if (flags & BTRFS_BLOCK_GROUP_DUP)
		return BTRFS_RAID_DUP;
	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
		return BTRFS_RAID_RAID0;
	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
		return BTRFS_RAID_RAID5;
	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
		return BTRFS_RAID_RAID6;

	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}

const char *btrfs_bg_type_to_raid_name(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	if (index >= BTRFS_NR_RAID_TYPES)
		return NULL;

	return btrfs_raid_array[index].raid_name;
}

/*
 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 * bytes including terminating null byte.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
	int i;
	int ret;
	char *bp = buf;
	u64 flags = bg_flags;
	u32 size_bp = size_buf;

	if (!flags) {
		strcpy(bp, "NONE");
		return;
	}

#define DESCRIBE_FLAG(flag, desc) \
	do { \
		if (flags & (flag)) { \
			ret = snprintf(bp, size_bp, "%s|", (desc)); \
			if (ret < 0 || ret >= size_bp) \
				goto out_overflow; \
			size_bp -= ret; \
			bp += ret; \
			flags &= ~(flag); \
		} \
	} while (0)

	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
			      btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

	if (flags) {
		ret = snprintf(bp, size_bp, "0x%llx|", flags);
		size_bp -= ret;
	}

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

	/*
	 * The text is trimmed, it's up to the caller to provide a
	 * sufficiently large buffer.
	 */
out_overflow:;
}

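/*
 * Example (illustrative): for bg_flags == (BTRFS_BLOCK_GROUP_DATA |
 * BTRFS_BLOCK_GROUP_RAID1) the buffer ends up containing "data|raid1",
 * while a zero bg_flags yields "NONE".
 */
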
static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list in general
 * but in mount context it could be used to exclude list modifications by eg.
 * scan ioctl
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * Is not required at mount and close times, because our device list is
 * protected by the uuid_mutex at that point.
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   device_list_mutex
 *     chunk_mutex
 *   balance_mutex
 *
 *
 * Exclusive operations
 * ====================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The status of exclusive operation is set and cleared atomically.
 * During the course of Paused state, fs_info::exclusive_operation remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * The exclusive status is cleared when the device operation is canceled or
 * completed.
 */

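/*
 * Illustrative only (not an actual call path): a read-only walk of the
 * per-fs device list can rely on RCU instead of device_list_mutex:
 *
 *	rcu_read_lock();
 *	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list)
 *		pr_info("devid %llu\n", device->devid);
 *	rcu_read_unlock();
 */
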
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
						 const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	INIT_LIST_HEAD(&fs_devs->seed_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	if (metadata_fsid)
		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
	else if (fsid)
		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

void btrfs_free_device(struct btrfs_device *device)
{
	WARN_ON(!list_empty(&device->post_commit_list));
	rcu_string_free(device->name);
	extent_io_tree_release(&device->alloc_state);
	bio_put(device->flush_bio);
	btrfs_destroy_dev_zone_info(device);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

static noinline struct btrfs_fs_devices *find_fsid(
		const u8 *fsid, const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devices;

	ASSERT(fsid);

	/* Handle non-split brain cases */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (metadata_fsid) {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
				      BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		} else {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		}
	}
	return NULL;
}

static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
				struct btrfs_super_block *disk_super)
{

	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by first scanning
	 * a device which didn't have its fsid/metadata_uuid changed
	 * at all and the CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}
	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by a device that
	 * has an outdated pair of fsid/metadata_uuid and
	 * CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(fs_devices->metadata_uuid,
			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}


static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct btrfs_super_block **disk_super)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*disk_super = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*disk_super)) {
		ret = PTR_ERR(*disk_super);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	return ret;
}

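/*
 * On success btrfs_get_bdev_and_sb() transfers ownership of both the block
 * device and the super block copy to the caller, which is expected to pair
 * it with btrfs_release_disk_super() and blkdev_put() (btrfs_open_one_device()
 * below is an example of such a caller).
 */
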
static bool device_path_matched(const char *path, struct btrfs_device *device)
{
	int found;

	rcu_read_lock();
	found = strcmp(rcu_str_deref(device->name), path);
	rcu_read_unlock();

	return found == 0;
}

/*
 * Search and remove all stale (devices which are not mounted) devices.
 * When both inputs are NULL, it will search and release all stale devices.
 * path:	Optional. When provided, it will release all unmounted devices
 *		matching this path only.
 * skip_dev:	Optional. Will skip this device when searching for the stale
 *		devices.
 * Return:	0 for success or if @path is NULL.
 *		-EBUSY if @path is a mounted device.
 *		-ENOENT if @path does not match any device in the list.
 */
static int btrfs_free_stale_devices(const char *path,
				    struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
	int ret = 0;

	if (path)
		ret = -ENOENT;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {

		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
				continue;
			if (path && !device->name)
				continue;
			if (path && !device_path_matched(path, device))
				continue;
			if (fs_devices->opened) {
				/* for an already deleted device return 0 */
				if (path && ret != 0)
					ret = -EBUSY;
				break;
			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			ret = 0;
		}
		mutex_unlock(&fs_devices->device_list_mutex);

		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}

	return ret;
}

/*
 * This is only used on mount, and we are protected from competing things
 * messing with our fs_devices by the uuid_mutex, thus we do not need the
 * fs_devices->device_list_mutex here.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &disk_super);
	if (ret)
		return ret;

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_free_page;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_free_page;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		if (btrfs_super_incompat_flags(disk_super) &
		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
			pr_err(
		"BTRFS: Invalid seeding and uuid-changed device detected\n");
			goto error_free_page;
		}

		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = true;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = true;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	btrfs_release_disk_super(disk_super);

	return 0;

error_free_page:
	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, flags);

	return -EINVAL;
}

/*
 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
 * being created with a disk that has already completed its fsid change. Such
 * disk can belong to an fs which has its FSID changed or to one which doesn't.
 * Handle both cases here.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, NULL);
}


static struct btrfs_fs_devices *find_fsid_changed(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handles the case where scanned device is part of an fs that had
	 * multiple successful changes of FSID but the current device didn't
	 * observe it. Meaning our fsid will be different than theirs. We need
	 * to handle two subcases:
	 * 1 - The fs still continues to have different METADATA/FSID uuids.
	 * 2 - The fs is switched back to its original FSID (METADATA/FSID
	 *     are equal).
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		/* Changed UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->fsid,
			   BTRFS_FSID_SIZE) != 0)
			return fs_devices;

		/* Unchanged UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}

	return NULL;
}

static struct btrfs_fs_devices *find_fsid_reverted_metadata(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle the case where the scanned device is part of an fs whose last
	 * metadata UUID change reverted it to the original FSID. At the same
	 * time fs_devices was first created by another constituent device
	 * which didn't fully observe the operation. This results in a
	 * btrfs_fs_devices created with metadata/fsid different AND
	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
	 * fs_devices equal to the FSID of the disk.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    fs_devices->fsid_change)
			return fs_devices;
	}

	return NULL;
}

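/*
 * Summary of the fsid lookup helpers (illustrative): when a scanned super
 * block still carries BTRFS_SUPER_FLAG_CHANGING_FSID_V2, device_list_add()
 * below uses find_fsid_inprogress() or find_fsid_changed(); otherwise it
 * uses find_fsid_with_metadata_uuid() or find_fsid_reverted_metadata(),
 * falling back to a plain find_fsid() lookup.
 */
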
/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

	if (fsid_change_in_progress) {
		if (!has_metadata_uuid)
			fs_devices = find_fsid_inprogress(disk_super);
		else
			fs_devices = find_fsid_changed(disk_super);
	} else if (has_metadata_uuid) {
		fs_devices = find_fsid_with_metadata_uuid(disk_super);
	} else {
		fs_devices = find_fsid_reverted_metadata(disk_super);
		if (!fs_devices)
			fs_devices = find_fsid(disk_super->fsid, NULL);
	}


	if (!fs_devices) {
		if (has_metadata_uuid)
			fs_devices = alloc_fs_devices(disk_super->fsid,
						      disk_super->metadata_uuid);
		else
			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		mutex_lock(&fs_devices->device_list_mutex);
		device = btrfs_find_device(fs_devices, devid,
				disk_super->dev_item.uuid, NULL);

		/*
		 * If this disk has been pulled into an fs devices created by
		 * a device which had the CHANGING_FSID_V2 flag then replace the
		 * metadata_uuid/fsid values of the fs_devices.
		 */
		if (fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
					BTRFS_FSID_SIZE);

			if (has_metadata_uuid)
				memcpy(fs_devices->metadata_uuid,
				       disk_super->metadata_uuid,
				       BTRFS_FSID_SIZE);
			else
				memcpy(fs_devices->metadata_uuid,
				       disk_super->fsid, BTRFS_FSID_SIZE);

			fs_devices->fsid_change = false;
		}
	}

	if (!device) {
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info(
	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
		else
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *	a. The same device disappeared and reappeared with
		 *	   different name. or
		 *	b. The missing-disk-which-was-replaced, has
		 *	   reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted. We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid. We keep the one
			 * with larger generation number or the last-in if
			 * generations are equal.
			 */
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EEXIST);
		}

		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted
		 */
		if (device->bdev) {
			int error;
			dev_t path_dev;

			error = lookup_bdev(path, &path_dev);
			if (error) {
				mutex_unlock(&fs_devices->device_list_mutex);
				return ERR_PTR(error);
			}

			if (device->bdev->bd_dev != path_dev) {
				mutex_unlock(&fs_devices->device_list_mutex);
				/*
				 * device->fs_info may not be reliable here, so
				 * pass in a NULL instead. This avoids a
				 * possible use-after-free when the fs_info and
				 * fs_info->sb are already torn down.
				 */
				btrfs_warn_in_rcu(NULL,
	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
						  path, devid, found_transid,
						  current->comm,
						  task_pid_nr(current));
				return ERR_PTR(-EEXIST);
			}
			btrfs_info_in_rcu(device->fs_info,
	"devid %llu device path %s changed to %s scanned by %s (%d)",
					  devid, rcu_str_deref(device->name),
					  path, current->comm,
					  task_pid_nr(current));
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened) {
		device->generation = found_transid;
		fs_devices->latest_generation = max_t(u64, found_transid,
						fs_devices->latest_generation);
	}

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	mutex_unlock(&fs_devices->device_list_mutex);
	return device;
}

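/*
 * Typical flow (illustrative): btrfs_scan_one_device() below reads the
 * primary super block of a device and, still under uuid_mutex, calls
 * device_list_add() so the device ends up on the matching fs_devices, or on
 * a freshly allocated one.
 */
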
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
	int ret = 0;

	fs_devices = alloc_fs_devices(orig->fsid, NULL);
	if (IS_ERR(fs_devices))
		return fs_devices;

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device)) {
			ret = PTR_ERR(device);
			goto error;
		}

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
			if (!name) {
				btrfs_free_device(device);
				ret = -ENOMEM;
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(ret);
}

static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
				      struct btrfs_device **latest_dev)
{
	struct btrfs_device *device, *next;

	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    !test_bit(BTRFS_DEV_STATE_MISSING,
				      &device->dev_state) &&
			    (!*latest_dev ||
			     device->generation > (*latest_dev)->generation)) {
				*latest_dev = device;
			}
			continue;
		}

		/*
		 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
		 * in btrfs_init_dev_replace() so just continue.
		 */
		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
			continue;

		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}

}

/*
 * After we have read the system tree and know devids belonging to this
 * filesystem, remove the device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_fs_devices *seed_dev;

	mutex_lock(&uuid_mutex);
	__btrfs_free_extra_devids(fs_devices, &latest_dev);

	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
		__btrfs_free_extra_devids(seed_dev, &latest_dev);

	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

static void btrfs_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		fs_devices->missing_devices--;

	btrfs_close_bdev(device);
	if (device->bdev) {
		fs_devices->open_devices--;
		device->bdev = NULL;
	}
	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	btrfs_destroy_dev_zone_info(device);

	device->fs_info = NULL;
	atomic_set(&device->dev_stats_ccnt, 0);
	extent_io_tree_release(&device->alloc_state);

	/* Verify the device is back in a pristine state */
	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	ASSERT(list_empty(&device->dev_alloc_list));
	ASSERT(list_empty(&device->post_commit_list));
	ASSERT(atomic_read(&device->reada_in_flight) == 0);
}

static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;

	lockdep_assert_held(&uuid_mutex);

	if (--fs_devices->opened > 0)
		return;

	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
		btrfs_close_one_device(device);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = false;
	fs_devices->fs_info = NULL;
}

void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	LIST_HEAD(list);
	struct btrfs_fs_devices *tmp;

	mutex_lock(&uuid_mutex);
	close_fs_devices(fs_devices);
	if (!fs_devices->opened)
		list_splice_init(&fs_devices->seed_list, &list);

	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
		close_fs_devices(fs_devices);
		list_del(&fs_devices->seed_list);
		free_fs_devices(fs_devices);
	}
	mutex_unlock(&uuid_mutex);
}

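/*
 * fs_devices::opened acts as a plain reference count (illustrative):
 * btrfs_open_devices() either bumps it on an already opened set or opens
 * every member device and sets it to 1, and close_fs_devices() only tears
 * the devices down once the count drops back to zero.
 */
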
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_device *tmp_device;

	flags |= FMODE_EXCL;

	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
				 dev_list) {
		int ret;

		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
		if (ret == 0 &&
		    (!latest_dev || device->generation > latest_dev->generation)) {
			latest_dev = device;
		} else if (ret == -ENODATA) {
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);
		}
	}
	if (fs_devices->open_devices == 0)
		return -EINVAL;

	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
	fs_devices->read_policy = BTRFS_READ_POLICY_PID;

	return 0;
}

static int devid_cmp(void *priv, const struct list_head *a,
		     const struct list_head *b)
{
	const struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	lockdep_assert_held(&uuid_mutex);
	/*
	 * The device_list_mutex cannot be taken here in case opening the
	 * underlying device takes further locks like open_mutex.
	 *
	 * We also don't need the lock here as this is called during mount and
	 * exclusion is provided by uuid_mutex
	 */

	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = open_fs_devices(fs_devices, flags, holder);
	}

	return ret;
}

void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	struct page *page = virt_to_page(super);

	put_page(page);
}

static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
						       u64 bytenr, u64 bytenr_orig)
{
	struct btrfs_super_block *disk_super;
	struct page *page;
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return ERR_PTR(-EINVAL);

	/* make sure our super fits in the page */
	if (sizeof(*disk_super) > PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
		return ERR_PTR(-EINVAL);

	/* pull in the page with our super */
	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);

	if (IS_ERR(page))
		return ERR_CAST(page);

	p = page_address(page);

	/* align our pointer to the offset of the super block */
	disk_super = p + offset_in_page(bytenr);

	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(p);
		return ERR_PTR(-EINVAL);
	}

	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;

	return disk_super;
}

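/*
 * Example (illustrative, 4K pages): the primary super block copy sits at
 * bytenr 65536 on a regular device, so index = 65536 >> PAGE_SHIFT = 16 and
 * offset_in_page(65536) = 0, which trivially satisfies the straddle check
 * above.
 */
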
int btrfs_forget_devices(const char *path)
{
	int ret;

	mutex_lock(&uuid_mutex);
	ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
	mutex_unlock(&uuid_mutex);

	return ret;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount
 * path and we are not allowed to call set_blocksize during the scan. The
 * superblock is read via pagecache.
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
					   void *holder)
{
	struct btrfs_super_block *disk_super;
	bool new_device_added = false;
	struct btrfs_device *device = NULL;
	struct block_device *bdev;
	u64 bytenr, bytenr_orig;
	int ret;

	lockdep_assert_held(&uuid_mutex);

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	flags |= FMODE_EXCL;

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);

	bytenr_orig = btrfs_sb_offset(0);
	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
	if (ret)
		return ERR_PTR(ret);

	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
	if (IS_ERR(disk_super)) {
		device = ERR_CAST(disk_super);
		goto error_bdev_put;
	}

	device = device_list_add(path, disk_super, &new_device_added);
	if (!IS_ERR(device)) {
		if (new_device_added)
			btrfs_free_stale_devices(path, device);
	}

	btrfs_release_disk_super(disk_super);

error_bdev_put:
	blkdev_put(bdev, flags);

	return device;
}

/*
 * Try to find a chunk that intersects [start, start + len] range and when one
 * such is found, record the end of it in *start
 */
static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
				    u64 len)
{
	u64 physical_start, physical_end;

	lockdep_assert_held(&device->fs_info->chunk_mutex);

	if (!find_first_extent_bit(&device->alloc_state, *start,
				   &physical_start, &physical_end,
				   CHUNK_ALLOCATED, NULL)) {

		if (in_range(physical_start, *start, len) ||
		    in_range(*start, physical_start,
			     physical_end - physical_start)) {
			*start = physical_end + 1;
			return true;
		}
	}
	return false;
}

static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
{
	switch (device->fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		/*
		 * We don't want to overwrite the superblock on the drive nor
		 * any area used by the boot loader (grub for example), so we
		 * make sure to start at an offset of at least 1MB.
		 */
		return max_t(u64, start, SZ_1M);
	case BTRFS_CHUNK_ALLOC_ZONED:
		/*
		 * We don't care about the starting region like regular
		 * allocator, because we anyway use/reserve the first two zones
		 * for superblock logging.
		 */
		return ALIGN(start, device->zone_info->zone_size);
	default:
		BUG();
	}
}

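/*
 * Example (illustrative): with the regular chunk allocation policy a search
 * starting at offset 0 is bumped up to SZ_1M, so the first 1MiB of every
 * device (super block at 64KiB, boot loader area) is never handed out as a
 * device extent.
 */
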
static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
					u64 *hole_start, u64 *hole_size,
					u64 num_bytes)
{
	u64 zone_size = device->zone_info->zone_size;
	u64 pos;
	int ret;
	bool changed = false;

	ASSERT(IS_ALIGNED(*hole_start, zone_size));

	while (*hole_size > 0) {
		pos = btrfs_find_allocatable_zones(device, *hole_start,
						   *hole_start + *hole_size,
						   num_bytes);
		if (pos != *hole_start) {
			*hole_size = *hole_start + *hole_size - pos;
			*hole_start = pos;
			changed = true;
			if (*hole_size < num_bytes)
				break;
		}

		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);

		/* Range is ensured to be empty */
		if (!ret)
			return changed;

		/* Given hole range was invalid (outside of device) */
		if (ret == -ERANGE) {
			*hole_start += *hole_size;
			*hole_size = 0;
			return true;
		}

		*hole_start += zone_size;
		*hole_size -= zone_size;
		changed = true;
	}

	return changed;
}

/**
 * dev_extent_hole_check - check if specified hole is suitable for allocation
 * @device:	the device which we have the hole
 * @hole_start: starting position of the hole
 * @hole_size:	the size of the hole
 * @num_bytes:	the size of the free space that we need
 *
 * This function may modify @hole_start and @hole_size to reflect the suitable
 * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
 */
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
				  u64 *hole_size, u64 num_bytes)
{
	bool changed = false;
	u64 hole_end = *hole_start + *hole_size;

	for (;;) {
		/*
		 * Check before we set max_hole_start, otherwise we could end up
		 * sending back this offset anyway.
		 */
		if (contains_pending_extent(device, hole_start, *hole_size)) {
			if (hole_end >= *hole_start)
				*hole_size = hole_end - *hole_start;
			else
				*hole_size = 0;
			changed = true;
		}

		switch (device->fs_devices->chunk_alloc_policy) {
		case BTRFS_CHUNK_ALLOC_REGULAR:
			/* No extra check */
			break;
		case BTRFS_CHUNK_ALLOC_ZONED:
			if (dev_extent_hole_check_zoned(device, hole_start,
							hole_size, num_bytes)) {
				changed = true;
				/*
				 * The changed hole can contain pending extent.
				 * Loop again to check that.
				 */
				continue;
			}
			break;
		default:
			BUG();
		}

		break;
	}

	return changed;
}

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if we
 * don't find suitable free space, it will be used to store the start position
 * of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 *
 * NOTE: This function will search *commit* root of device tree, and does extra
 * check to ensure dev extents are not double allocated.
 * This makes the function safe to allocate dev extents but may not report
 * correct usable device space, as device extent freed in current transaction
 * is not reported as available.
 */
static int find_free_dev_extent_start(struct btrfs_device *device,
				u64 num_bytes, u64 search_start, u64 *start,
				u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	search_start = dev_extent_search_start(device, search_start);

	WARN_ON(device->zone_info &&
		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_backwards(root, &key, path);
	if (ret < 0)
		goto out;

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;
			dev_extent_hole_check(device, &search_start, &hole_size,
					      num_bytes);

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than which we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;
		if (dev_extent_hole_check(device, &search_start, &hole_size,
					  num_bytes)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

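/*
 * Example (illustrative): asking for a 1GiB dev extent on a device whose
 * largest remaining gap is 512MiB yields -ENOSPC, with *start and *len
 * describing that 512MiB hole so the caller can tell how much would fit.
 */
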
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
}

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_device *device,
			  u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret == 0)
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
out:
	btrfs_free_path(path);
	return ret;
}

static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map.rb_root);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}

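/*
 * find_next_chunk() returns the logical offset right past the last mapped
 * chunk, or 0 for an empty mapping tree. E.g. (illustrative) a filesystem
 * whose last chunk maps [1GiB, 2GiB) gets 2GiB as the next chunk start.
 */
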
static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	if (ret == 0) {
		/* Corruption */
		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
		ret = -EUCLEAN;
		goto error;
	}

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
			    struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
				      &key, sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
			    ptr, BTRFS_FSID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Function to update ctime/mtime for a given device path.
 * Mainly used for ctime/mtime based probe like libblkid.
 */
static void update_dev_time(const char *path_name)
{
	struct file *filp;

	filp = filp_open(path_name, O_RDWR, 0);
	if (IS_ERR(filp))
		return;
	file_update_time(filp);
	filp_close(filp, NULL);
}

static int btrfs_rm_dev_item(struct btrfs_device *device)
{
	struct btrfs_root *root = device->fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);
	if (!ret)
		ret = btrfs_commit_transaction(trans);
	return ret;
}

/*
 * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number regarding eg. device
 * replace.
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
		u64 num_devices)
{
	u64 all_avail;
	unsigned seq;
	int i;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		all_avail = fs_info->avail_data_alloc_bits |
			    fs_info->avail_system_alloc_bits |
			    fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
		if (!(all_avail & btrfs_raid_array[i].bg_flag))
			continue;

		if (num_devices < btrfs_raid_array[i].devs_min)
			return btrfs_raid_array[i].mindev_error;
	}

	return 0;
}

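/*
 * Example (illustrative): on a two-device raid1 filesystem btrfs_rm_device()
 * calls this with num_devices - 1 == 1, which is below
 * btrfs_raid_array[BTRFS_RAID_RAID1].devs_min (2), so the removal is
 * rejected with BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET.
 */
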
1974 */ 1975 void __cold btrfs_assign_next_active_device(struct btrfs_device *device, 1976 struct btrfs_device *next_device) 1977 { 1978 struct btrfs_fs_info *fs_info = device->fs_info; 1979 1980 if (!next_device) 1981 next_device = btrfs_find_next_active_device(fs_info->fs_devices, 1982 device); 1983 ASSERT(next_device); 1984 1985 if (fs_info->sb->s_bdev && 1986 (fs_info->sb->s_bdev == device->bdev)) 1987 fs_info->sb->s_bdev = next_device->bdev; 1988 1989 if (fs_info->fs_devices->latest_bdev == device->bdev) 1990 fs_info->fs_devices->latest_bdev = next_device->bdev; 1991 } 1992 1993 /* 1994 * Return btrfs_fs_devices::num_devices excluding the device that's being 1995 * currently replaced. 1996 */ 1997 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) 1998 { 1999 u64 num_devices = fs_info->fs_devices->num_devices; 2000 2001 down_read(&fs_info->dev_replace.rwsem); 2002 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 2003 ASSERT(num_devices > 1); 2004 num_devices--; 2005 } 2006 up_read(&fs_info->dev_replace.rwsem); 2007 2008 return num_devices; 2009 } 2010 2011 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, 2012 struct block_device *bdev, 2013 const char *device_path) 2014 { 2015 struct btrfs_super_block *disk_super; 2016 int copy_num; 2017 2018 if (!bdev) 2019 return; 2020 2021 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { 2022 struct page *page; 2023 int ret; 2024 2025 disk_super = btrfs_read_dev_one_super(bdev, copy_num); 2026 if (IS_ERR(disk_super)) 2027 continue; 2028 2029 if (bdev_is_zoned(bdev)) { 2030 btrfs_reset_sb_log_zones(bdev, copy_num); 2031 continue; 2032 } 2033 2034 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 2035 2036 page = virt_to_page(disk_super); 2037 set_page_dirty(page); 2038 lock_page(page); 2039 /* write_on_page() unlocks the page */ 2040 ret = write_one_page(page); 2041 if (ret) 2042 btrfs_warn(fs_info, 2043 "error clearing superblock number %d (%d)", 2044 copy_num, ret); 2045 btrfs_release_disk_super(disk_super); 2046 2047 } 2048 2049 /* Notify udev that device has changed */ 2050 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 2051 2052 /* Update ctime/mtime for device path for libblkid */ 2053 update_dev_time(device_path); 2054 } 2055 2056 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, 2057 u64 devid) 2058 { 2059 struct btrfs_device *device; 2060 struct btrfs_fs_devices *cur_devices; 2061 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2062 u64 num_devices; 2063 int ret = 0; 2064 2065 mutex_lock(&uuid_mutex); 2066 2067 num_devices = btrfs_num_devices(fs_info); 2068 2069 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 2070 if (ret) 2071 goto out; 2072 2073 device = btrfs_find_device_by_devspec(fs_info, devid, device_path); 2074 2075 if (IS_ERR(device)) { 2076 if (PTR_ERR(device) == -ENOENT && 2077 strcmp(device_path, "missing") == 0) 2078 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2079 else 2080 ret = PTR_ERR(device); 2081 goto out; 2082 } 2083 2084 if (btrfs_pinned_by_swapfile(fs_info, device)) { 2085 btrfs_warn_in_rcu(fs_info, 2086 "cannot remove device %s (devid %llu) due to active swapfile", 2087 rcu_str_deref(device->name), device->devid); 2088 ret = -ETXTBSY; 2089 goto out; 2090 } 2091 2092 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2093 ret = BTRFS_ERROR_DEV_TGT_REPLACE; 2094 goto out; 2095 } 2096 2097 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 2098 fs_info->fs_devices->rw_devices == 1) { 2099 ret = 
BTRFS_ERROR_DEV_ONLY_WRITABLE; 2100 goto out; 2101 } 2102 2103 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2104 mutex_lock(&fs_info->chunk_mutex); 2105 list_del_init(&device->dev_alloc_list); 2106 device->fs_devices->rw_devices--; 2107 mutex_unlock(&fs_info->chunk_mutex); 2108 } 2109 2110 mutex_unlock(&uuid_mutex); 2111 ret = btrfs_shrink_device(device, 0); 2112 if (!ret) 2113 btrfs_reada_remove_dev(device); 2114 mutex_lock(&uuid_mutex); 2115 if (ret) 2116 goto error_undo; 2117 2118 /* 2119 * TODO: the superblock still includes this device in its num_devices 2120 * counter although write_all_supers() is not locked out. This 2121 * could give a filesystem state which requires a degraded mount. 2122 */ 2123 ret = btrfs_rm_dev_item(device); 2124 if (ret) 2125 goto error_undo; 2126 2127 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2128 btrfs_scrub_cancel_dev(device); 2129 2130 /* 2131 * the device list mutex makes sure that we don't change 2132 * the device list while someone else is writing out all 2133 * the device supers. Whoever is writing all supers, should 2134 * lock the device list mutex before getting the number of 2135 * devices in the super block (super_copy). Conversely, 2136 * whoever updates the number of devices in the super block 2137 * (super_copy) should hold the device list mutex. 2138 */ 2139 2140 /* 2141 * In normal cases the cur_devices == fs_devices. But in case 2142 * of deleting a seed device, the cur_devices should point to 2143 * its own fs_devices listed under the fs_devices->seed. 2144 */ 2145 cur_devices = device->fs_devices; 2146 mutex_lock(&fs_devices->device_list_mutex); 2147 list_del_rcu(&device->dev_list); 2148 2149 cur_devices->num_devices--; 2150 cur_devices->total_devices--; 2151 /* Update total_devices of the parent fs_devices if it's seed */ 2152 if (cur_devices != fs_devices) 2153 fs_devices->total_devices--; 2154 2155 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 2156 cur_devices->missing_devices--; 2157 2158 btrfs_assign_next_active_device(device, NULL); 2159 2160 if (device->bdev) { 2161 cur_devices->open_devices--; 2162 /* remove sysfs entry */ 2163 btrfs_sysfs_remove_device(device); 2164 } 2165 2166 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 2167 btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 2168 mutex_unlock(&fs_devices->device_list_mutex); 2169 2170 /* 2171 * at this point, the device is zero sized and detached from 2172 * the devices list. All that's left is to zero out the old 2173 * supers and free the device. 
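 * btrfs_scratch_superblocks() wipes the magic from every superblock copy
 * (or resets the superblock log zones on zoned devices), notifies udev of
 * the change and updates the path's ctime/mtime for libblkid.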
2174 */ 2175 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2176 btrfs_scratch_superblocks(fs_info, device->bdev, 2177 device->name->str); 2178 2179 btrfs_close_bdev(device); 2180 synchronize_rcu(); 2181 btrfs_free_device(device); 2182 2183 if (cur_devices->open_devices == 0) { 2184 list_del_init(&cur_devices->seed_list); 2185 close_fs_devices(cur_devices); 2186 free_fs_devices(cur_devices); 2187 } 2188 2189 out: 2190 mutex_unlock(&uuid_mutex); 2191 return ret; 2192 2193 error_undo: 2194 btrfs_reada_undo_remove_dev(device); 2195 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2196 mutex_lock(&fs_info->chunk_mutex); 2197 list_add(&device->dev_alloc_list, 2198 &fs_devices->alloc_list); 2199 device->fs_devices->rw_devices++; 2200 mutex_unlock(&fs_info->chunk_mutex); 2201 } 2202 goto out; 2203 } 2204 2205 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) 2206 { 2207 struct btrfs_fs_devices *fs_devices; 2208 2209 lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex); 2210 2211 /* 2212 * On a filesystem with no seed device, srcdev->fs_devices points to the 2213 * fs_devices of fs_info. However, when the device being replaced is a 2214 * seed device, it points to the seed's own fs_devices. In short, srcdev 2215 * has the correct fs_devices in both cases. 2216 */ 2217 fs_devices = srcdev->fs_devices; 2218 2219 list_del_rcu(&srcdev->dev_list); 2220 list_del(&srcdev->dev_alloc_list); 2221 fs_devices->num_devices--; 2222 if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) 2223 fs_devices->missing_devices--; 2224 2225 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) 2226 fs_devices->rw_devices--; 2227 2228 if (srcdev->bdev) 2229 fs_devices->open_devices--; 2230 } 2231 2232 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) 2233 { 2234 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 2235 2236 mutex_lock(&uuid_mutex); 2237 2238 btrfs_close_bdev(srcdev); 2239 synchronize_rcu(); 2240 btrfs_free_device(srcdev); 2241 2242 /* If no devices are left, delete the fs_devices as well */ 2243 if (!fs_devices->num_devices) { 2244 /* 2245 * On a mounted FS, num_devices can't be zero unless it's a 2246 * seed. When a seed device is being replaced, the replace 2247 * target is added to the sprout FS, so there will be no more 2248 * devices left under the seed FS. 2249 */ 2250 ASSERT(fs_devices->seeding); 2251 2252 list_del_init(&fs_devices->seed_list); 2253 close_fs_devices(fs_devices); 2254 free_fs_devices(fs_devices); 2255 } 2256 mutex_unlock(&uuid_mutex); 2257 } 2258 2259 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) 2260 { 2261 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; 2262 2263 mutex_lock(&fs_devices->device_list_mutex); 2264 2265 btrfs_sysfs_remove_device(tgtdev); 2266 2267 if (tgtdev->bdev) 2268 fs_devices->open_devices--; 2269 2270 fs_devices->num_devices--; 2271 2272 btrfs_assign_next_active_device(tgtdev, NULL); 2273 2274 list_del_rcu(&tgtdev->dev_list); 2275 2276 mutex_unlock(&fs_devices->device_list_mutex); 2277 2278 /* 2279 * The update_dev_time() within btrfs_scratch_superblocks() 2280 * may lead to a call to btrfs_show_devname() which will try 2281 * to hold device_list_mutex. As this device has already 2282 * been removed from the device list, we don't have to hold 2283 * the device_list_mutex lock.
2284 */ 2285 btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, 2286 tgtdev->name->str); 2287 2288 btrfs_close_bdev(tgtdev); 2289 synchronize_rcu(); 2290 btrfs_free_device(tgtdev); 2291 } 2292 2293 static struct btrfs_device *btrfs_find_device_by_path( 2294 struct btrfs_fs_info *fs_info, const char *device_path) 2295 { 2296 int ret = 0; 2297 struct btrfs_super_block *disk_super; 2298 u64 devid; 2299 u8 *dev_uuid; 2300 struct block_device *bdev; 2301 struct btrfs_device *device; 2302 2303 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 2304 fs_info->bdev_holder, 0, &bdev, &disk_super); 2305 if (ret) 2306 return ERR_PTR(ret); 2307 2308 devid = btrfs_stack_device_id(&disk_super->dev_item); 2309 dev_uuid = disk_super->dev_item.uuid; 2310 if (btrfs_fs_incompat(fs_info, METADATA_UUID)) 2311 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 2312 disk_super->metadata_uuid); 2313 else 2314 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 2315 disk_super->fsid); 2316 2317 btrfs_release_disk_super(disk_super); 2318 if (!device) 2319 device = ERR_PTR(-ENOENT); 2320 blkdev_put(bdev, FMODE_READ); 2321 return device; 2322 } 2323 2324 /* 2325 * Lookup a device given by device id, or the path if the id is 0. 2326 */ 2327 struct btrfs_device *btrfs_find_device_by_devspec( 2328 struct btrfs_fs_info *fs_info, u64 devid, 2329 const char *device_path) 2330 { 2331 struct btrfs_device *device; 2332 2333 if (devid) { 2334 device = btrfs_find_device(fs_info->fs_devices, devid, NULL, 2335 NULL); 2336 if (!device) 2337 return ERR_PTR(-ENOENT); 2338 return device; 2339 } 2340 2341 if (!device_path || !device_path[0]) 2342 return ERR_PTR(-EINVAL); 2343 2344 if (strcmp(device_path, "missing") == 0) { 2345 /* Find first missing device */ 2346 list_for_each_entry(device, &fs_info->fs_devices->devices, 2347 dev_list) { 2348 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 2349 &device->dev_state) && !device->bdev) 2350 return device; 2351 } 2352 return ERR_PTR(-ENOENT); 2353 } 2354 2355 return btrfs_find_device_by_path(fs_info, device_path); 2356 } 2357 2358 /* 2359 * does all the dirty work required for changing file system's UUID. 2360 */ 2361 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) 2362 { 2363 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2364 struct btrfs_fs_devices *old_devices; 2365 struct btrfs_fs_devices *seed_devices; 2366 struct btrfs_super_block *disk_super = fs_info->super_copy; 2367 struct btrfs_device *device; 2368 u64 super_flags; 2369 2370 lockdep_assert_held(&uuid_mutex); 2371 if (!fs_devices->seeding) 2372 return -EINVAL; 2373 2374 /* 2375 * Private copy of the seed devices, anchored at 2376 * fs_info->fs_devices->seed_list 2377 */ 2378 seed_devices = alloc_fs_devices(NULL, NULL); 2379 if (IS_ERR(seed_devices)) 2380 return PTR_ERR(seed_devices); 2381 2382 /* 2383 * It's necessary to retain a copy of the original seed fs_devices in 2384 * fs_uuids so that filesystems which have been seeded can successfully 2385 * reference the seed device from open_seed_devices. This also supports 2386 * multiple fs seed. 
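 * (i.e. the same seed filesystem can be used to sprout more than one
 * filesystem, and each sprout must still be able to find the seed here).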
2387 */ 2388 old_devices = clone_fs_devices(fs_devices); 2389 if (IS_ERR(old_devices)) { 2390 kfree(seed_devices); 2391 return PTR_ERR(old_devices); 2392 } 2393 2394 list_add(&old_devices->fs_list, &fs_uuids); 2395 2396 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2397 seed_devices->opened = 1; 2398 INIT_LIST_HEAD(&seed_devices->devices); 2399 INIT_LIST_HEAD(&seed_devices->alloc_list); 2400 mutex_init(&seed_devices->device_list_mutex); 2401 2402 mutex_lock(&fs_devices->device_list_mutex); 2403 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 2404 synchronize_rcu); 2405 list_for_each_entry(device, &seed_devices->devices, dev_list) 2406 device->fs_devices = seed_devices; 2407 2408 fs_devices->seeding = false; 2409 fs_devices->num_devices = 0; 2410 fs_devices->open_devices = 0; 2411 fs_devices->missing_devices = 0; 2412 fs_devices->rotating = false; 2413 list_add(&seed_devices->seed_list, &fs_devices->seed_list); 2414 2415 generate_random_uuid(fs_devices->fsid); 2416 memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); 2417 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2418 mutex_unlock(&fs_devices->device_list_mutex); 2419 2420 super_flags = btrfs_super_flags(disk_super) & 2421 ~BTRFS_SUPER_FLAG_SEEDING; 2422 btrfs_set_super_flags(disk_super, super_flags); 2423 2424 return 0; 2425 } 2426 2427 /* 2428 * Store the expected generation for seed devices in device items. 2429 */ 2430 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) 2431 { 2432 struct btrfs_fs_info *fs_info = trans->fs_info; 2433 struct btrfs_root *root = fs_info->chunk_root; 2434 struct btrfs_path *path; 2435 struct extent_buffer *leaf; 2436 struct btrfs_dev_item *dev_item; 2437 struct btrfs_device *device; 2438 struct btrfs_key key; 2439 u8 fs_uuid[BTRFS_FSID_SIZE]; 2440 u8 dev_uuid[BTRFS_UUID_SIZE]; 2441 u64 devid; 2442 int ret; 2443 2444 path = btrfs_alloc_path(); 2445 if (!path) 2446 return -ENOMEM; 2447 2448 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2449 key.offset = 0; 2450 key.type = BTRFS_DEV_ITEM_KEY; 2451 2452 while (1) { 2453 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2454 if (ret < 0) 2455 goto error; 2456 2457 leaf = path->nodes[0]; 2458 next_slot: 2459 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2460 ret = btrfs_next_leaf(root, path); 2461 if (ret > 0) 2462 break; 2463 if (ret < 0) 2464 goto error; 2465 leaf = path->nodes[0]; 2466 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2467 btrfs_release_path(path); 2468 continue; 2469 } 2470 2471 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2472 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2473 key.type != BTRFS_DEV_ITEM_KEY) 2474 break; 2475 2476 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2477 struct btrfs_dev_item); 2478 devid = btrfs_device_id(leaf, dev_item); 2479 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2480 BTRFS_UUID_SIZE); 2481 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2482 BTRFS_FSID_SIZE); 2483 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 2484 fs_uuid); 2485 BUG_ON(!device); /* Logic error */ 2486 2487 if (device->fs_devices->seeding) { 2488 btrfs_set_device_generation(leaf, dev_item, 2489 device->generation); 2490 btrfs_mark_buffer_dirty(leaf); 2491 } 2492 2493 path->slots[0]++; 2494 goto next_slot; 2495 } 2496 ret = 0; 2497 error: 2498 btrfs_free_path(path); 2499 return ret; 2500 } 2501 2502 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2503 { 2504 
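/*
 * Overview: open the block device exclusively, allocate and fill a new
 * btrfs_device, start a transaction, link the device into fs_devices and
 * the alloc list, bump the superblock totals, insert its DEV_ITEM in the
 * chunk tree and commit. If the filesystem was a seed, a new fsid is
 * generated first (btrfs_prepare_sprout()) and the seed device generations
 * are recorded afterwards (btrfs_finish_sprout()).
 */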
struct btrfs_root *root = fs_info->dev_root; 2505 struct request_queue *q; 2506 struct btrfs_trans_handle *trans; 2507 struct btrfs_device *device; 2508 struct block_device *bdev; 2509 struct super_block *sb = fs_info->sb; 2510 struct rcu_string *name; 2511 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2512 u64 orig_super_total_bytes; 2513 u64 orig_super_num_devices; 2514 int seeding_dev = 0; 2515 int ret = 0; 2516 bool locked = false; 2517 2518 if (sb_rdonly(sb) && !fs_devices->seeding) 2519 return -EROFS; 2520 2521 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2522 fs_info->bdev_holder); 2523 if (IS_ERR(bdev)) 2524 return PTR_ERR(bdev); 2525 2526 if (!btrfs_check_device_zone_type(fs_info, bdev)) { 2527 ret = -EINVAL; 2528 goto error; 2529 } 2530 2531 if (fs_devices->seeding) { 2532 seeding_dev = 1; 2533 down_write(&sb->s_umount); 2534 mutex_lock(&uuid_mutex); 2535 locked = true; 2536 } 2537 2538 sync_blockdev(bdev); 2539 2540 rcu_read_lock(); 2541 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { 2542 if (device->bdev == bdev) { 2543 ret = -EEXIST; 2544 rcu_read_unlock(); 2545 goto error; 2546 } 2547 } 2548 rcu_read_unlock(); 2549 2550 device = btrfs_alloc_device(fs_info, NULL, NULL); 2551 if (IS_ERR(device)) { 2552 /* we can safely leave the fs_devices entry around */ 2553 ret = PTR_ERR(device); 2554 goto error; 2555 } 2556 2557 name = rcu_string_strdup(device_path, GFP_KERNEL); 2558 if (!name) { 2559 ret = -ENOMEM; 2560 goto error_free_device; 2561 } 2562 rcu_assign_pointer(device->name, name); 2563 2564 device->fs_info = fs_info; 2565 device->bdev = bdev; 2566 2567 ret = btrfs_get_dev_zone_info(device); 2568 if (ret) 2569 goto error_free_device; 2570 2571 trans = btrfs_start_transaction(root, 0); 2572 if (IS_ERR(trans)) { 2573 ret = PTR_ERR(trans); 2574 goto error_free_zone; 2575 } 2576 2577 q = bdev_get_queue(bdev); 2578 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2579 device->generation = trans->transid; 2580 device->io_width = fs_info->sectorsize; 2581 device->io_align = fs_info->sectorsize; 2582 device->sector_size = fs_info->sectorsize; 2583 device->total_bytes = round_down(i_size_read(bdev->bd_inode), 2584 fs_info->sectorsize); 2585 device->disk_total_bytes = device->total_bytes; 2586 device->commit_total_bytes = device->total_bytes; 2587 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2588 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2589 device->mode = FMODE_EXCL; 2590 device->dev_stats_valid = 1; 2591 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2592 2593 if (seeding_dev) { 2594 btrfs_clear_sb_rdonly(sb); 2595 ret = btrfs_prepare_sprout(fs_info); 2596 if (ret) { 2597 btrfs_abort_transaction(trans, ret); 2598 goto error_trans; 2599 } 2600 } 2601 2602 device->fs_devices = fs_devices; 2603 2604 mutex_lock(&fs_devices->device_list_mutex); 2605 mutex_lock(&fs_info->chunk_mutex); 2606 list_add_rcu(&device->dev_list, &fs_devices->devices); 2607 list_add(&device->dev_alloc_list, &fs_devices->alloc_list); 2608 fs_devices->num_devices++; 2609 fs_devices->open_devices++; 2610 fs_devices->rw_devices++; 2611 fs_devices->total_devices++; 2612 fs_devices->total_rw_bytes += device->total_bytes; 2613 2614 atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 2615 2616 if (!blk_queue_nonrot(q)) 2617 fs_devices->rotating = true; 2618 2619 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 2620 btrfs_set_super_total_bytes(fs_info->super_copy, 2621 
round_down(orig_super_total_bytes + device->total_bytes, 2622 fs_info->sectorsize)); 2623 2624 orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); 2625 btrfs_set_super_num_devices(fs_info->super_copy, 2626 orig_super_num_devices + 1); 2627 2628 /* 2629 * we've got more storage, clear any full flags on the space 2630 * infos 2631 */ 2632 btrfs_clear_space_info_full(fs_info); 2633 2634 mutex_unlock(&fs_info->chunk_mutex); 2635 2636 /* Add sysfs device entry */ 2637 btrfs_sysfs_add_device(device); 2638 2639 mutex_unlock(&fs_devices->device_list_mutex); 2640 2641 if (seeding_dev) { 2642 mutex_lock(&fs_info->chunk_mutex); 2643 ret = init_first_rw_device(trans); 2644 mutex_unlock(&fs_info->chunk_mutex); 2645 if (ret) { 2646 btrfs_abort_transaction(trans, ret); 2647 goto error_sysfs; 2648 } 2649 } 2650 2651 ret = btrfs_add_dev_item(trans, device); 2652 if (ret) { 2653 btrfs_abort_transaction(trans, ret); 2654 goto error_sysfs; 2655 } 2656 2657 if (seeding_dev) { 2658 ret = btrfs_finish_sprout(trans); 2659 if (ret) { 2660 btrfs_abort_transaction(trans, ret); 2661 goto error_sysfs; 2662 } 2663 2664 /* 2665 * fs_devices now represents the newly sprouted filesystem and 2666 * its fsid has been changed by btrfs_prepare_sprout 2667 */ 2668 btrfs_sysfs_update_sprout_fsid(fs_devices); 2669 } 2670 2671 ret = btrfs_commit_transaction(trans); 2672 2673 if (seeding_dev) { 2674 mutex_unlock(&uuid_mutex); 2675 up_write(&sb->s_umount); 2676 locked = false; 2677 2678 if (ret) /* transaction commit */ 2679 return ret; 2680 2681 ret = btrfs_relocate_sys_chunks(fs_info); 2682 if (ret < 0) 2683 btrfs_handle_fs_error(fs_info, ret, 2684 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2685 trans = btrfs_attach_transaction(root); 2686 if (IS_ERR(trans)) { 2687 if (PTR_ERR(trans) == -ENOENT) 2688 return 0; 2689 ret = PTR_ERR(trans); 2690 trans = NULL; 2691 goto error_sysfs; 2692 } 2693 ret = btrfs_commit_transaction(trans); 2694 } 2695 2696 /* 2697 * Now that we have written a new super block to this device, check all 2698 * other fs_devices list if device_path alienates any other scanned 2699 * device. 2700 * We can ignore the return value as it typically returns -EINVAL and 2701 * only succeeds if the device was an alien. 
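 * ('alien' here means the path is still recorded under some other scanned
 * fs_devices from before it was added to this filesystem; the
 * btrfs_forget_devices() call below drops that stale record).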
2702 */ 2703 btrfs_forget_devices(device_path); 2704 2705 /* Update ctime/mtime for blkid or udev */ 2706 update_dev_time(device_path); 2707 2708 return ret; 2709 2710 error_sysfs: 2711 btrfs_sysfs_remove_device(device); 2712 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2713 mutex_lock(&fs_info->chunk_mutex); 2714 list_del_rcu(&device->dev_list); 2715 list_del(&device->dev_alloc_list); 2716 fs_info->fs_devices->num_devices--; 2717 fs_info->fs_devices->open_devices--; 2718 fs_info->fs_devices->rw_devices--; 2719 fs_info->fs_devices->total_devices--; 2720 fs_info->fs_devices->total_rw_bytes -= device->total_bytes; 2721 atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); 2722 btrfs_set_super_total_bytes(fs_info->super_copy, 2723 orig_super_total_bytes); 2724 btrfs_set_super_num_devices(fs_info->super_copy, 2725 orig_super_num_devices); 2726 mutex_unlock(&fs_info->chunk_mutex); 2727 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2728 error_trans: 2729 if (seeding_dev) 2730 btrfs_set_sb_rdonly(sb); 2731 if (trans) 2732 btrfs_end_transaction(trans); 2733 error_free_zone: 2734 btrfs_destroy_dev_zone_info(device); 2735 error_free_device: 2736 btrfs_free_device(device); 2737 error: 2738 blkdev_put(bdev, FMODE_EXCL); 2739 if (locked) { 2740 mutex_unlock(&uuid_mutex); 2741 up_write(&sb->s_umount); 2742 } 2743 return ret; 2744 } 2745 2746 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2747 struct btrfs_device *device) 2748 { 2749 int ret; 2750 struct btrfs_path *path; 2751 struct btrfs_root *root = device->fs_info->chunk_root; 2752 struct btrfs_dev_item *dev_item; 2753 struct extent_buffer *leaf; 2754 struct btrfs_key key; 2755 2756 path = btrfs_alloc_path(); 2757 if (!path) 2758 return -ENOMEM; 2759 2760 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2761 key.type = BTRFS_DEV_ITEM_KEY; 2762 key.offset = device->devid; 2763 2764 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2765 if (ret < 0) 2766 goto out; 2767 2768 if (ret > 0) { 2769 ret = -ENOENT; 2770 goto out; 2771 } 2772 2773 leaf = path->nodes[0]; 2774 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2775 2776 btrfs_set_device_id(leaf, dev_item, device->devid); 2777 btrfs_set_device_type(leaf, dev_item, device->type); 2778 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2779 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2780 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2781 btrfs_set_device_total_bytes(leaf, dev_item, 2782 btrfs_device_get_disk_total_bytes(device)); 2783 btrfs_set_device_bytes_used(leaf, dev_item, 2784 btrfs_device_get_bytes_used(device)); 2785 btrfs_mark_buffer_dirty(leaf); 2786 2787 out: 2788 btrfs_free_path(path); 2789 return ret; 2790 } 2791 2792 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2793 struct btrfs_device *device, u64 new_size) 2794 { 2795 struct btrfs_fs_info *fs_info = device->fs_info; 2796 struct btrfs_super_block *super_copy = fs_info->super_copy; 2797 u64 old_total; 2798 u64 diff; 2799 2800 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2801 return -EACCES; 2802 2803 new_size = round_down(new_size, fs_info->sectorsize); 2804 2805 mutex_lock(&fs_info->chunk_mutex); 2806 old_total = btrfs_super_total_bytes(super_copy); 2807 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 2808 2809 if (new_size <= device->total_bytes || 2810 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2811 mutex_unlock(&fs_info->chunk_mutex); 2812 
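/* Only growing is supported here; shrinking goes through btrfs_shrink_device() and a replace target is never resized directly. */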
return -EINVAL; 2813 } 2814 2815 btrfs_set_super_total_bytes(super_copy, 2816 round_down(old_total + diff, fs_info->sectorsize)); 2817 device->fs_devices->total_rw_bytes += diff; 2818 2819 btrfs_device_set_total_bytes(device, new_size); 2820 btrfs_device_set_disk_total_bytes(device, new_size); 2821 btrfs_clear_space_info_full(device->fs_info); 2822 if (list_empty(&device->post_commit_list)) 2823 list_add_tail(&device->post_commit_list, 2824 &trans->transaction->dev_update_list); 2825 mutex_unlock(&fs_info->chunk_mutex); 2826 2827 return btrfs_update_device(trans, device); 2828 } 2829 2830 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2831 { 2832 struct btrfs_fs_info *fs_info = trans->fs_info; 2833 struct btrfs_root *root = fs_info->chunk_root; 2834 int ret; 2835 struct btrfs_path *path; 2836 struct btrfs_key key; 2837 2838 path = btrfs_alloc_path(); 2839 if (!path) 2840 return -ENOMEM; 2841 2842 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2843 key.offset = chunk_offset; 2844 key.type = BTRFS_CHUNK_ITEM_KEY; 2845 2846 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2847 if (ret < 0) 2848 goto out; 2849 else if (ret > 0) { /* Logic error or corruption */ 2850 btrfs_handle_fs_error(fs_info, -ENOENT, 2851 "Failed lookup while freeing chunk."); 2852 ret = -ENOENT; 2853 goto out; 2854 } 2855 2856 ret = btrfs_del_item(trans, root, path); 2857 if (ret < 0) 2858 btrfs_handle_fs_error(fs_info, ret, 2859 "Failed to delete chunk item."); 2860 out: 2861 btrfs_free_path(path); 2862 return ret; 2863 } 2864 2865 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2866 { 2867 struct btrfs_super_block *super_copy = fs_info->super_copy; 2868 struct btrfs_disk_key *disk_key; 2869 struct btrfs_chunk *chunk; 2870 u8 *ptr; 2871 int ret = 0; 2872 u32 num_stripes; 2873 u32 array_size; 2874 u32 len = 0; 2875 u32 cur; 2876 struct btrfs_key key; 2877 2878 lockdep_assert_held(&fs_info->chunk_mutex); 2879 array_size = btrfs_super_sys_array_size(super_copy); 2880 2881 ptr = super_copy->sys_chunk_array; 2882 cur = 0; 2883 2884 while (cur < array_size) { 2885 disk_key = (struct btrfs_disk_key *)ptr; 2886 btrfs_disk_key_to_cpu(&key, disk_key); 2887 2888 len = sizeof(*disk_key); 2889 2890 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 2891 chunk = (struct btrfs_chunk *)(ptr + len); 2892 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 2893 len += btrfs_chunk_item_size(num_stripes); 2894 } else { 2895 ret = -EIO; 2896 break; 2897 } 2898 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 2899 key.offset == chunk_offset) { 2900 memmove(ptr, ptr + len, array_size - (cur + len)); 2901 array_size -= len; 2902 btrfs_set_super_sys_array_size(super_copy, array_size); 2903 } else { 2904 ptr += len; 2905 cur += len; 2906 } 2907 } 2908 return ret; 2909 } 2910 2911 /* 2912 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. 2913 * @logical: Logical block offset in bytes. 2914 * @length: Length of extent in bytes. 2915 * 2916 * Return: Chunk mapping or ERR_PTR. 
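 * The caller is responsible for dropping the reference on the returned
 * mapping with free_extent_map(). Typical use:
 *	em = btrfs_get_chunk_map(fs_info, logical, len);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	...
 *	free_extent_map(em);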
2917 */ 2918 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, 2919 u64 logical, u64 length) 2920 { 2921 struct extent_map_tree *em_tree; 2922 struct extent_map *em; 2923 2924 em_tree = &fs_info->mapping_tree; 2925 read_lock(&em_tree->lock); 2926 em = lookup_extent_mapping(em_tree, logical, length); 2927 read_unlock(&em_tree->lock); 2928 2929 if (!em) { 2930 btrfs_crit(fs_info, "unable to find logical %llu length %llu", 2931 logical, length); 2932 return ERR_PTR(-EINVAL); 2933 } 2934 2935 if (em->start > logical || em->start + em->len < logical) { 2936 btrfs_crit(fs_info, 2937 "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 2938 logical, length, em->start, em->start + em->len); 2939 free_extent_map(em); 2940 return ERR_PTR(-EINVAL); 2941 } 2942 2943 /* callers are responsible for dropping em's ref. */ 2944 return em; 2945 } 2946 2947 static int remove_chunk_item(struct btrfs_trans_handle *trans, 2948 struct map_lookup *map, u64 chunk_offset) 2949 { 2950 int i; 2951 2952 /* 2953 * Removing chunk items and updating the device items in the chunks btree 2954 * requires holding the chunk_mutex. 2955 * See the comment at btrfs_chunk_alloc() for the details. 2956 */ 2957 lockdep_assert_held(&trans->fs_info->chunk_mutex); 2958 2959 for (i = 0; i < map->num_stripes; i++) { 2960 int ret; 2961 2962 ret = btrfs_update_device(trans, map->stripes[i].dev); 2963 if (ret) 2964 return ret; 2965 } 2966 2967 return btrfs_free_chunk(trans, chunk_offset); 2968 } 2969 2970 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2971 { 2972 struct btrfs_fs_info *fs_info = trans->fs_info; 2973 struct extent_map *em; 2974 struct map_lookup *map; 2975 u64 dev_extent_len = 0; 2976 int i, ret = 0; 2977 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2978 2979 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 2980 if (IS_ERR(em)) { 2981 /* 2982 * This is a logic error, but we don't want to just rely on the 2983 * user having built with ASSERT enabled, so if ASSERT doesn't 2984 * do anything we still error out. 2985 */ 2986 ASSERT(0); 2987 return PTR_ERR(em); 2988 } 2989 map = em->map_lookup; 2990 2991 /* 2992 * First delete the device extent items from the devices btree. 2993 * We take the device_list_mutex to avoid racing with the finishing phase 2994 * of a device replace operation. See the comment below before acquiring 2995 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex 2996 * because that can result in a deadlock when deleting the device extent 2997 * items from the devices btree - COWing an extent buffer from the btree 2998 * may result in allocating a new metadata chunk, which would attempt to 2999 * lock again fs_info->chunk_mutex. 
3000 */ 3001 mutex_lock(&fs_devices->device_list_mutex); 3002 for (i = 0; i < map->num_stripes; i++) { 3003 struct btrfs_device *device = map->stripes[i].dev; 3004 ret = btrfs_free_dev_extent(trans, device, 3005 map->stripes[i].physical, 3006 &dev_extent_len); 3007 if (ret) { 3008 mutex_unlock(&fs_devices->device_list_mutex); 3009 btrfs_abort_transaction(trans, ret); 3010 goto out; 3011 } 3012 3013 if (device->bytes_used > 0) { 3014 mutex_lock(&fs_info->chunk_mutex); 3015 btrfs_device_set_bytes_used(device, 3016 device->bytes_used - dev_extent_len); 3017 atomic64_add(dev_extent_len, &fs_info->free_chunk_space); 3018 btrfs_clear_space_info_full(fs_info); 3019 mutex_unlock(&fs_info->chunk_mutex); 3020 } 3021 } 3022 mutex_unlock(&fs_devices->device_list_mutex); 3023 3024 /* 3025 * We acquire fs_info->chunk_mutex for 2 reasons: 3026 * 3027 * 1) Just like with the first phase of the chunk allocation, we must 3028 * reserve system space, do all chunk btree updates and deletions, and 3029 * update the system chunk array in the superblock while holding this 3030 * mutex. This is for reasons similar to those explained in the comment 3031 * at the top of btrfs_chunk_alloc(); 3032 * 3033 * 2) Prevent races with the final phase of a device replace operation 3034 * that replaces the device object associated with the map's stripes, 3035 * because the device object's id can change at any time during that 3036 * final phase of the device replace operation 3037 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 3038 * replaced device and then see it with an ID of 3039 * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating 3040 * the device item, which does not exist on the chunk btree. 3041 * The finishing phase of device replace acquires both the 3042 * device_list_mutex and the chunk_mutex, in that order, so we are 3043 * safe by just acquiring the chunk_mutex. 3044 */ 3045 trans->removing_chunk = true; 3046 mutex_lock(&fs_info->chunk_mutex); 3047 3048 check_system_chunk(trans, map->type); 3049 3050 ret = remove_chunk_item(trans, map, chunk_offset); 3051 /* 3052 * Normally we should not get -ENOSPC since we reserved space before 3053 * through the call to check_system_chunk(). 3054 * 3055 * Despite our system space_info having enough free space, we may not 3056 * be able to allocate extents from its block groups, because they all 3057 * have an incompatible profile, which will force us to allocate a new 3058 * system block group with the right profile, or right after we called 3059 * check_system_chunk() above, a scrub turned the only system block group 3060 * with enough free space into RO mode. 3061 * This is explained with more detail at do_chunk_alloc(). 3062 * 3063 * So if we get -ENOSPC, allocate a new system chunk and retry once.
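 * The retry path allocates the new system block group with
 * btrfs_alloc_chunk() and inserts its chunk item right away with
 * btrfs_chunk_alloc_add_chunk_item() before calling remove_chunk_item()
 * again.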
3064 */ 3065 if (ret == -ENOSPC) { 3066 const u64 sys_flags = btrfs_system_alloc_profile(fs_info); 3067 struct btrfs_block_group *sys_bg; 3068 3069 sys_bg = btrfs_alloc_chunk(trans, sys_flags); 3070 if (IS_ERR(sys_bg)) { 3071 ret = PTR_ERR(sys_bg); 3072 btrfs_abort_transaction(trans, ret); 3073 goto out; 3074 } 3075 3076 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); 3077 if (ret) { 3078 btrfs_abort_transaction(trans, ret); 3079 goto out; 3080 } 3081 3082 ret = remove_chunk_item(trans, map, chunk_offset); 3083 if (ret) { 3084 btrfs_abort_transaction(trans, ret); 3085 goto out; 3086 } 3087 } else if (ret) { 3088 btrfs_abort_transaction(trans, ret); 3089 goto out; 3090 } 3091 3092 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 3093 3094 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3095 ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 3096 if (ret) { 3097 btrfs_abort_transaction(trans, ret); 3098 goto out; 3099 } 3100 } 3101 3102 mutex_unlock(&fs_info->chunk_mutex); 3103 trans->removing_chunk = false; 3104 3105 /* 3106 * We are done with chunk btree updates and deletions, so release the 3107 * system space we previously reserved (with check_system_chunk()). 3108 */ 3109 btrfs_trans_release_chunk_metadata(trans); 3110 3111 ret = btrfs_remove_block_group(trans, chunk_offset, em); 3112 if (ret) { 3113 btrfs_abort_transaction(trans, ret); 3114 goto out; 3115 } 3116 3117 out: 3118 if (trans->removing_chunk) { 3119 mutex_unlock(&fs_info->chunk_mutex); 3120 trans->removing_chunk = false; 3121 } 3122 /* once for us */ 3123 free_extent_map(em); 3124 return ret; 3125 } 3126 3127 int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 3128 { 3129 struct btrfs_root *root = fs_info->chunk_root; 3130 struct btrfs_trans_handle *trans; 3131 struct btrfs_block_group *block_group; 3132 u64 length; 3133 int ret; 3134 3135 /* 3136 * Prevent races with automatic removal of unused block groups. 3137 * After we relocate and before we remove the chunk with offset 3138 * chunk_offset, automatic removal of the block group can kick in, 3139 * resulting in a failure when calling btrfs_remove_chunk() below. 3140 * 3141 * Make sure to acquire this mutex before doing a tree search (dev 3142 * or chunk trees) to find chunks. Otherwise the cleaner kthread might 3143 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 3144 * we release the path used to search the chunk/dev tree and before 3145 * the current task acquires this mutex and calls us. 3146 */ 3147 lockdep_assert_held(&fs_info->reclaim_bgs_lock); 3148 3149 /* step one, relocate all the extents inside this chunk */ 3150 btrfs_scrub_pause(fs_info); 3151 ret = btrfs_relocate_block_group(fs_info, chunk_offset); 3152 btrfs_scrub_continue(fs_info); 3153 if (ret) 3154 return ret; 3155 3156 block_group = btrfs_lookup_block_group(fs_info, chunk_offset); 3157 if (!block_group) 3158 return -ENOENT; 3159 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); 3160 length = block_group->length; 3161 btrfs_put_block_group(block_group); 3162 3163 /* 3164 * On a zoned file system, discard the whole block group, this will 3165 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If 3166 * resetting the zone fails, don't treat it as a fatal problem from the 3167 * filesystem's point of view. 
3168 */ 3169 if (btrfs_is_zoned(fs_info)) { 3170 ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); 3171 if (ret) 3172 btrfs_info(fs_info, 3173 "failed to reset zone %llu after relocation", 3174 chunk_offset); 3175 } 3176 3177 trans = btrfs_start_trans_remove_block_group(root->fs_info, 3178 chunk_offset); 3179 if (IS_ERR(trans)) { 3180 ret = PTR_ERR(trans); 3181 btrfs_handle_fs_error(root->fs_info, ret, NULL); 3182 return ret; 3183 } 3184 3185 /* 3186 * step two, delete the device extents and the 3187 * chunk tree entries 3188 */ 3189 ret = btrfs_remove_chunk(trans, chunk_offset); 3190 btrfs_end_transaction(trans); 3191 return ret; 3192 } 3193 3194 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 3195 { 3196 struct btrfs_root *chunk_root = fs_info->chunk_root; 3197 struct btrfs_path *path; 3198 struct extent_buffer *leaf; 3199 struct btrfs_chunk *chunk; 3200 struct btrfs_key key; 3201 struct btrfs_key found_key; 3202 u64 chunk_type; 3203 bool retried = false; 3204 int failed = 0; 3205 int ret; 3206 3207 path = btrfs_alloc_path(); 3208 if (!path) 3209 return -ENOMEM; 3210 3211 again: 3212 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3213 key.offset = (u64)-1; 3214 key.type = BTRFS_CHUNK_ITEM_KEY; 3215 3216 while (1) { 3217 mutex_lock(&fs_info->reclaim_bgs_lock); 3218 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3219 if (ret < 0) { 3220 mutex_unlock(&fs_info->reclaim_bgs_lock); 3221 goto error; 3222 } 3223 BUG_ON(ret == 0); /* Corruption */ 3224 3225 ret = btrfs_previous_item(chunk_root, path, key.objectid, 3226 key.type); 3227 if (ret) 3228 mutex_unlock(&fs_info->reclaim_bgs_lock); 3229 if (ret < 0) 3230 goto error; 3231 if (ret > 0) 3232 break; 3233 3234 leaf = path->nodes[0]; 3235 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3236 3237 chunk = btrfs_item_ptr(leaf, path->slots[0], 3238 struct btrfs_chunk); 3239 chunk_type = btrfs_chunk_type(leaf, chunk); 3240 btrfs_release_path(path); 3241 3242 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 3243 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3244 if (ret == -ENOSPC) 3245 failed++; 3246 else 3247 BUG_ON(ret); 3248 } 3249 mutex_unlock(&fs_info->reclaim_bgs_lock); 3250 3251 if (found_key.offset == 0) 3252 break; 3253 key.offset = found_key.offset - 1; 3254 } 3255 ret = 0; 3256 if (failed && !retried) { 3257 failed = 0; 3258 retried = true; 3259 goto again; 3260 } else if (WARN_ON(failed && retried)) { 3261 ret = -ENOSPC; 3262 } 3263 error: 3264 btrfs_free_path(path); 3265 return ret; 3266 } 3267 3268 /* 3269 * return 1 : allocate a data chunk successfully, 3270 * return <0: errors during allocating a data chunk, 3271 * return 0 : no need to allocate a data chunk. 
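 * Allocating an empty data chunk in advance avoids ending up with no data
 * block group at all (and thus losing the data raid profile) when the chunk
 * being relocated is the only data chunk left.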
3272 */ 3273 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 3274 u64 chunk_offset) 3275 { 3276 struct btrfs_block_group *cache; 3277 u64 bytes_used; 3278 u64 chunk_type; 3279 3280 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3281 ASSERT(cache); 3282 chunk_type = cache->flags; 3283 btrfs_put_block_group(cache); 3284 3285 if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) 3286 return 0; 3287 3288 spin_lock(&fs_info->data_sinfo->lock); 3289 bytes_used = fs_info->data_sinfo->bytes_used; 3290 spin_unlock(&fs_info->data_sinfo->lock); 3291 3292 if (!bytes_used) { 3293 struct btrfs_trans_handle *trans; 3294 int ret; 3295 3296 trans = btrfs_join_transaction(fs_info->tree_root); 3297 if (IS_ERR(trans)) 3298 return PTR_ERR(trans); 3299 3300 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); 3301 btrfs_end_transaction(trans); 3302 if (ret < 0) 3303 return ret; 3304 return 1; 3305 } 3306 3307 return 0; 3308 } 3309 3310 static int insert_balance_item(struct btrfs_fs_info *fs_info, 3311 struct btrfs_balance_control *bctl) 3312 { 3313 struct btrfs_root *root = fs_info->tree_root; 3314 struct btrfs_trans_handle *trans; 3315 struct btrfs_balance_item *item; 3316 struct btrfs_disk_balance_args disk_bargs; 3317 struct btrfs_path *path; 3318 struct extent_buffer *leaf; 3319 struct btrfs_key key; 3320 int ret, err; 3321 3322 path = btrfs_alloc_path(); 3323 if (!path) 3324 return -ENOMEM; 3325 3326 trans = btrfs_start_transaction(root, 0); 3327 if (IS_ERR(trans)) { 3328 btrfs_free_path(path); 3329 return PTR_ERR(trans); 3330 } 3331 3332 key.objectid = BTRFS_BALANCE_OBJECTID; 3333 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3334 key.offset = 0; 3335 3336 ret = btrfs_insert_empty_item(trans, root, path, &key, 3337 sizeof(*item)); 3338 if (ret) 3339 goto out; 3340 3341 leaf = path->nodes[0]; 3342 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3343 3344 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3345 3346 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 3347 btrfs_set_balance_data(leaf, item, &disk_bargs); 3348 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 3349 btrfs_set_balance_meta(leaf, item, &disk_bargs); 3350 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 3351 btrfs_set_balance_sys(leaf, item, &disk_bargs); 3352 3353 btrfs_set_balance_flags(leaf, item, bctl->flags); 3354 3355 btrfs_mark_buffer_dirty(leaf); 3356 out: 3357 btrfs_free_path(path); 3358 err = btrfs_commit_transaction(trans); 3359 if (err && !ret) 3360 ret = err; 3361 return ret; 3362 } 3363 3364 static int del_balance_item(struct btrfs_fs_info *fs_info) 3365 { 3366 struct btrfs_root *root = fs_info->tree_root; 3367 struct btrfs_trans_handle *trans; 3368 struct btrfs_path *path; 3369 struct btrfs_key key; 3370 int ret, err; 3371 3372 path = btrfs_alloc_path(); 3373 if (!path) 3374 return -ENOMEM; 3375 3376 trans = btrfs_start_transaction_fallback_global_rsv(root, 0); 3377 if (IS_ERR(trans)) { 3378 btrfs_free_path(path); 3379 return PTR_ERR(trans); 3380 } 3381 3382 key.objectid = BTRFS_BALANCE_OBJECTID; 3383 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3384 key.offset = 0; 3385 3386 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3387 if (ret < 0) 3388 goto out; 3389 if (ret > 0) { 3390 ret = -ENOENT; 3391 goto out; 3392 } 3393 3394 ret = btrfs_del_item(trans, root, path); 3395 out: 3396 btrfs_free_path(path); 3397 err = btrfs_commit_transaction(trans); 3398 if (err && !ret) 3399 ret = err; 3400 return ret; 3401 } 3402 3403 /* 3404 * This is a 
heuristic used to reduce the number of chunks balanced on 3405 * resume after balance was interrupted. 3406 */ 3407 static void update_balance_args(struct btrfs_balance_control *bctl) 3408 { 3409 /* 3410 * Turn on soft mode for chunk types that were being converted. 3411 */ 3412 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 3413 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 3414 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 3415 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 3416 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 3417 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 3418 3419 /* 3420 * Turn on usage filter if is not already used. The idea is 3421 * that chunks that we have already balanced should be 3422 * reasonably full. Don't do it for chunks that are being 3423 * converted - that will keep us from relocating unconverted 3424 * (albeit full) chunks. 3425 */ 3426 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 3427 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3428 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3429 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 3430 bctl->data.usage = 90; 3431 } 3432 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 3433 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3434 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3435 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 3436 bctl->sys.usage = 90; 3437 } 3438 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 3439 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3440 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3441 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 3442 bctl->meta.usage = 90; 3443 } 3444 } 3445 3446 /* 3447 * Clear the balance status in fs_info and delete the balance item from disk. 3448 */ 3449 static void reset_balance_state(struct btrfs_fs_info *fs_info) 3450 { 3451 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3452 int ret; 3453 3454 BUG_ON(!fs_info->balance_ctl); 3455 3456 spin_lock(&fs_info->balance_lock); 3457 fs_info->balance_ctl = NULL; 3458 spin_unlock(&fs_info->balance_lock); 3459 3460 kfree(bctl); 3461 ret = del_balance_item(fs_info); 3462 if (ret) 3463 btrfs_handle_fs_error(fs_info, ret, NULL); 3464 } 3465 3466 /* 3467 * Balance filters. Return 1 if chunk should be filtered out 3468 * (should not be balanced). 
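 * Return 0 if the chunk passes the filter and may be relocated.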
3469 */ 3470 static int chunk_profiles_filter(u64 chunk_type, 3471 struct btrfs_balance_args *bargs) 3472 { 3473 chunk_type = chunk_to_extended(chunk_type) & 3474 BTRFS_EXTENDED_PROFILE_MASK; 3475 3476 if (bargs->profiles & chunk_type) 3477 return 0; 3478 3479 return 1; 3480 } 3481 3482 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3483 struct btrfs_balance_args *bargs) 3484 { 3485 struct btrfs_block_group *cache; 3486 u64 chunk_used; 3487 u64 user_thresh_min; 3488 u64 user_thresh_max; 3489 int ret = 1; 3490 3491 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3492 chunk_used = cache->used; 3493 3494 if (bargs->usage_min == 0) 3495 user_thresh_min = 0; 3496 else 3497 user_thresh_min = div_factor_fine(cache->length, 3498 bargs->usage_min); 3499 3500 if (bargs->usage_max == 0) 3501 user_thresh_max = 1; 3502 else if (bargs->usage_max > 100) 3503 user_thresh_max = cache->length; 3504 else 3505 user_thresh_max = div_factor_fine(cache->length, 3506 bargs->usage_max); 3507 3508 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3509 ret = 0; 3510 3511 btrfs_put_block_group(cache); 3512 return ret; 3513 } 3514 3515 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3516 u64 chunk_offset, struct btrfs_balance_args *bargs) 3517 { 3518 struct btrfs_block_group *cache; 3519 u64 chunk_used, user_thresh; 3520 int ret = 1; 3521 3522 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3523 chunk_used = cache->used; 3524 3525 if (bargs->usage_min == 0) 3526 user_thresh = 1; 3527 else if (bargs->usage > 100) 3528 user_thresh = cache->length; 3529 else 3530 user_thresh = div_factor_fine(cache->length, bargs->usage); 3531 3532 if (chunk_used < user_thresh) 3533 ret = 0; 3534 3535 btrfs_put_block_group(cache); 3536 return ret; 3537 } 3538 3539 static int chunk_devid_filter(struct extent_buffer *leaf, 3540 struct btrfs_chunk *chunk, 3541 struct btrfs_balance_args *bargs) 3542 { 3543 struct btrfs_stripe *stripe; 3544 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3545 int i; 3546 3547 for (i = 0; i < num_stripes; i++) { 3548 stripe = btrfs_stripe_nr(chunk, i); 3549 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3550 return 0; 3551 } 3552 3553 return 1; 3554 } 3555 3556 static u64 calc_data_stripes(u64 type, int num_stripes) 3557 { 3558 const int index = btrfs_bg_flags_to_raid_index(type); 3559 const int ncopies = btrfs_raid_array[index].ncopies; 3560 const int nparity = btrfs_raid_array[index].nparity; 3561 3562 return (num_stripes - nparity) / ncopies; 3563 } 3564 3565 /* [pstart, pend) */ 3566 static int chunk_drange_filter(struct extent_buffer *leaf, 3567 struct btrfs_chunk *chunk, 3568 struct btrfs_balance_args *bargs) 3569 { 3570 struct btrfs_stripe *stripe; 3571 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3572 u64 stripe_offset; 3573 u64 stripe_length; 3574 u64 type; 3575 int factor; 3576 int i; 3577 3578 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3579 return 0; 3580 3581 type = btrfs_chunk_type(leaf, chunk); 3582 factor = calc_data_stripes(type, num_stripes); 3583 3584 for (i = 0; i < num_stripes; i++) { 3585 stripe = btrfs_stripe_nr(chunk, i); 3586 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3587 continue; 3588 3589 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3590 stripe_length = btrfs_chunk_length(leaf, chunk); 3591 stripe_length = div_u64(stripe_length, factor); 3592 3593 if (stripe_offset < bargs->pend && 3594 stripe_offset + stripe_length > bargs->pstart) 3595 return 0; 
3596 } 3597 3598 return 1; 3599 } 3600 3601 /* [vstart, vend) */ 3602 static int chunk_vrange_filter(struct extent_buffer *leaf, 3603 struct btrfs_chunk *chunk, 3604 u64 chunk_offset, 3605 struct btrfs_balance_args *bargs) 3606 { 3607 if (chunk_offset < bargs->vend && 3608 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3609 /* at least part of the chunk is inside this vrange */ 3610 return 0; 3611 3612 return 1; 3613 } 3614 3615 static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3616 struct btrfs_chunk *chunk, 3617 struct btrfs_balance_args *bargs) 3618 { 3619 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3620 3621 if (bargs->stripes_min <= num_stripes 3622 && num_stripes <= bargs->stripes_max) 3623 return 0; 3624 3625 return 1; 3626 } 3627 3628 static int chunk_soft_convert_filter(u64 chunk_type, 3629 struct btrfs_balance_args *bargs) 3630 { 3631 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3632 return 0; 3633 3634 chunk_type = chunk_to_extended(chunk_type) & 3635 BTRFS_EXTENDED_PROFILE_MASK; 3636 3637 if (bargs->target == chunk_type) 3638 return 1; 3639 3640 return 0; 3641 } 3642 3643 static int should_balance_chunk(struct extent_buffer *leaf, 3644 struct btrfs_chunk *chunk, u64 chunk_offset) 3645 { 3646 struct btrfs_fs_info *fs_info = leaf->fs_info; 3647 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3648 struct btrfs_balance_args *bargs = NULL; 3649 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3650 3651 /* type filter */ 3652 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3653 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3654 return 0; 3655 } 3656 3657 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3658 bargs = &bctl->data; 3659 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3660 bargs = &bctl->sys; 3661 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3662 bargs = &bctl->meta; 3663 3664 /* profiles filter */ 3665 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3666 chunk_profiles_filter(chunk_type, bargs)) { 3667 return 0; 3668 } 3669 3670 /* usage filter */ 3671 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3672 chunk_usage_filter(fs_info, chunk_offset, bargs)) { 3673 return 0; 3674 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3675 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3676 return 0; 3677 } 3678 3679 /* devid filter */ 3680 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3681 chunk_devid_filter(leaf, chunk, bargs)) { 3682 return 0; 3683 } 3684 3685 /* drange filter, makes sense only with devid filter */ 3686 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3687 chunk_drange_filter(leaf, chunk, bargs)) { 3688 return 0; 3689 } 3690 3691 /* vrange filter */ 3692 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3693 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3694 return 0; 3695 } 3696 3697 /* stripes filter */ 3698 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3699 chunk_stripes_range_filter(leaf, chunk, bargs)) { 3700 return 0; 3701 } 3702 3703 /* soft profile changing mode */ 3704 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3705 chunk_soft_convert_filter(chunk_type, bargs)) { 3706 return 0; 3707 } 3708 3709 /* 3710 * limited by count, must be the last filter 3711 */ 3712 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3713 if (bargs->limit == 0) 3714 return 0; 3715 else 3716 bargs->limit--; 3717 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 3718 /* 3719 * Same logic as the 'limit' filter; the minimum cannot be 3720 * determined here 
because we do not have the global information 3721 * about the count of all chunks that satisfy the filters. 3722 */ 3723 if (bargs->limit_max == 0) 3724 return 0; 3725 else 3726 bargs->limit_max--; 3727 } 3728 3729 return 1; 3730 } 3731 3732 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3733 { 3734 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3735 struct btrfs_root *chunk_root = fs_info->chunk_root; 3736 u64 chunk_type; 3737 struct btrfs_chunk *chunk; 3738 struct btrfs_path *path = NULL; 3739 struct btrfs_key key; 3740 struct btrfs_key found_key; 3741 struct extent_buffer *leaf; 3742 int slot; 3743 int ret; 3744 int enospc_errors = 0; 3745 bool counting = true; 3746 /* The single value limit and the min/max limits use the same bytes in the balance args, save the values before the counting pass modifies them */ 3747 u64 limit_data = bctl->data.limit; 3748 u64 limit_meta = bctl->meta.limit; 3749 u64 limit_sys = bctl->sys.limit; 3750 u32 count_data = 0; 3751 u32 count_meta = 0; 3752 u32 count_sys = 0; 3753 int chunk_reserved = 0; 3754 3755 path = btrfs_alloc_path(); 3756 if (!path) { 3757 ret = -ENOMEM; 3758 goto error; 3759 } 3760 3761 /* zero out stat counters */ 3762 spin_lock(&fs_info->balance_lock); 3763 memset(&bctl->stat, 0, sizeof(bctl->stat)); 3764 spin_unlock(&fs_info->balance_lock); 3765 again: 3766 if (!counting) { 3767 /* 3768 * The single value limit and min/max limits use the same bytes 3769 * in the balance args, so restore the values saved above 3770 */ 3771 bctl->data.limit = limit_data; 3772 bctl->meta.limit = limit_meta; 3773 bctl->sys.limit = limit_sys; 3774 } 3775 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3776 key.offset = (u64)-1; 3777 key.type = BTRFS_CHUNK_ITEM_KEY; 3778 3779 while (1) { 3780 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3781 atomic_read(&fs_info->balance_cancel_req)) { 3782 ret = -ECANCELED; 3783 goto error; 3784 } 3785 3786 mutex_lock(&fs_info->reclaim_bgs_lock); 3787 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3788 if (ret < 0) { 3789 mutex_unlock(&fs_info->reclaim_bgs_lock); 3790 goto error; 3791 } 3792 3793 /* 3794 * This shouldn't happen, it means the last relocation 3795 * failed 3796 */ 3797 if (ret == 0) 3798 BUG(); /* FIXME break ?
*/ 3799 3800 ret = btrfs_previous_item(chunk_root, path, 0, 3801 BTRFS_CHUNK_ITEM_KEY); 3802 if (ret) { 3803 mutex_unlock(&fs_info->reclaim_bgs_lock); 3804 ret = 0; 3805 break; 3806 } 3807 3808 leaf = path->nodes[0]; 3809 slot = path->slots[0]; 3810 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3811 3812 if (found_key.objectid != key.objectid) { 3813 mutex_unlock(&fs_info->reclaim_bgs_lock); 3814 break; 3815 } 3816 3817 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3818 chunk_type = btrfs_chunk_type(leaf, chunk); 3819 3820 if (!counting) { 3821 spin_lock(&fs_info->balance_lock); 3822 bctl->stat.considered++; 3823 spin_unlock(&fs_info->balance_lock); 3824 } 3825 3826 ret = should_balance_chunk(leaf, chunk, found_key.offset); 3827 3828 btrfs_release_path(path); 3829 if (!ret) { 3830 mutex_unlock(&fs_info->reclaim_bgs_lock); 3831 goto loop; 3832 } 3833 3834 if (counting) { 3835 mutex_unlock(&fs_info->reclaim_bgs_lock); 3836 spin_lock(&fs_info->balance_lock); 3837 bctl->stat.expected++; 3838 spin_unlock(&fs_info->balance_lock); 3839 3840 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3841 count_data++; 3842 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3843 count_sys++; 3844 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3845 count_meta++; 3846 3847 goto loop; 3848 } 3849 3850 /* 3851 * Apply limit_min filter, no need to check if the LIMITS 3852 * filter is used, limit_min is 0 by default 3853 */ 3854 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 3855 count_data < bctl->data.limit_min) 3856 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 3857 count_meta < bctl->meta.limit_min) 3858 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 3859 count_sys < bctl->sys.limit_min)) { 3860 mutex_unlock(&fs_info->reclaim_bgs_lock); 3861 goto loop; 3862 } 3863 3864 if (!chunk_reserved) { 3865 /* 3866 * We may be relocating the only data chunk we have, 3867 * which could potentially end up with losing data's 3868 * raid profile, so lets allocate an empty one in 3869 * advance. 3870 */ 3871 ret = btrfs_may_alloc_data_chunk(fs_info, 3872 found_key.offset); 3873 if (ret < 0) { 3874 mutex_unlock(&fs_info->reclaim_bgs_lock); 3875 goto error; 3876 } else if (ret == 1) { 3877 chunk_reserved = 1; 3878 } 3879 } 3880 3881 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3882 mutex_unlock(&fs_info->reclaim_bgs_lock); 3883 if (ret == -ENOSPC) { 3884 enospc_errors++; 3885 } else if (ret == -ETXTBSY) { 3886 btrfs_info(fs_info, 3887 "skipping relocation of block group %llu due to active swapfile", 3888 found_key.offset); 3889 ret = 0; 3890 } else if (ret) { 3891 goto error; 3892 } else { 3893 spin_lock(&fs_info->balance_lock); 3894 bctl->stat.completed++; 3895 spin_unlock(&fs_info->balance_lock); 3896 } 3897 loop: 3898 if (found_key.offset == 0) 3899 break; 3900 key.offset = found_key.offset - 1; 3901 } 3902 3903 if (counting) { 3904 btrfs_release_path(path); 3905 counting = false; 3906 goto again; 3907 } 3908 error: 3909 btrfs_free_path(path); 3910 if (enospc_errors) { 3911 btrfs_info(fs_info, "%d enospc errors during balance", 3912 enospc_errors); 3913 if (!ret) 3914 ret = -ENOSPC; 3915 } 3916 3917 return ret; 3918 } 3919 3920 /** 3921 * alloc_profile_is_valid - see if a given profile is valid and reduced 3922 * @flags: profile to validate 3923 * @extended: if true @flags is treated as an extended profile 3924 */ 3925 static int alloc_profile_is_valid(u64 flags, int extended) 3926 { 3927 u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 3928 BTRFS_BLOCK_GROUP_PROFILE_MASK); 3929 3930 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 3931 3932 /* 1) check that all other bits are zeroed */ 3933 if (flags & ~mask) 3934 return 0; 3935 3936 /* 2) see if profile is reduced */ 3937 if (flags == 0) 3938 return !extended; /* "0" is valid for usual profiles */ 3939 3940 return has_single_bit_set(flags); 3941 } 3942 3943 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3944 { 3945 /* cancel requested || normal exit path */ 3946 return atomic_read(&fs_info->balance_cancel_req) || 3947 (atomic_read(&fs_info->balance_pause_req) == 0 && 3948 atomic_read(&fs_info->balance_cancel_req) == 0); 3949 } 3950 3951 /* 3952 * Validate target profile against allowed profiles and return true if it's OK. 3953 * Otherwise print the error message and return false. 3954 */ 3955 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, 3956 const struct btrfs_balance_args *bargs, 3957 u64 allowed, const char *type) 3958 { 3959 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3960 return true; 3961 3962 if (fs_info->sectorsize < PAGE_SIZE && 3963 bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3964 btrfs_err(fs_info, 3965 "RAID56 is not yet supported for sectorsize %u with page size %lu", 3966 fs_info->sectorsize, PAGE_SIZE); 3967 return false; 3968 } 3969 /* Profile is valid and does not have bits outside of the allowed set */ 3970 if (alloc_profile_is_valid(bargs->target, 1) && 3971 (bargs->target & ~allowed) == 0) 3972 return true; 3973 3974 btrfs_err(fs_info, "balance: invalid convert %s profile %s", 3975 type, btrfs_bg_type_to_raid_name(bargs->target)); 3976 return false; 3977 } 3978 3979 /* 3980 * Fill @buf with textual description of balance filter flags @bargs, up to 3981 * @size_buf including the terminating null. The output may be trimmed if it 3982 * does not fit into the provided buffer. 
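 *
 * For example, args that request a convert to raid1 together with the soft
 * flag are rendered as "convert=raid1,soft" (the trailing comma is trimmed
 * at out_overflow).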
3983 */ 3984 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, 3985 u32 size_buf) 3986 { 3987 int ret; 3988 u32 size_bp = size_buf; 3989 char *bp = buf; 3990 u64 flags = bargs->flags; 3991 char tmp_buf[128] = {'\0'}; 3992 3993 if (!flags) 3994 return; 3995 3996 #define CHECK_APPEND_NOARG(a) \ 3997 do { \ 3998 ret = snprintf(bp, size_bp, (a)); \ 3999 if (ret < 0 || ret >= size_bp) \ 4000 goto out_overflow; \ 4001 size_bp -= ret; \ 4002 bp += ret; \ 4003 } while (0) 4004 4005 #define CHECK_APPEND_1ARG(a, v1) \ 4006 do { \ 4007 ret = snprintf(bp, size_bp, (a), (v1)); \ 4008 if (ret < 0 || ret >= size_bp) \ 4009 goto out_overflow; \ 4010 size_bp -= ret; \ 4011 bp += ret; \ 4012 } while (0) 4013 4014 #define CHECK_APPEND_2ARG(a, v1, v2) \ 4015 do { \ 4016 ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ 4017 if (ret < 0 || ret >= size_bp) \ 4018 goto out_overflow; \ 4019 size_bp -= ret; \ 4020 bp += ret; \ 4021 } while (0) 4022 4023 if (flags & BTRFS_BALANCE_ARGS_CONVERT) 4024 CHECK_APPEND_1ARG("convert=%s,", 4025 btrfs_bg_type_to_raid_name(bargs->target)); 4026 4027 if (flags & BTRFS_BALANCE_ARGS_SOFT) 4028 CHECK_APPEND_NOARG("soft,"); 4029 4030 if (flags & BTRFS_BALANCE_ARGS_PROFILES) { 4031 btrfs_describe_block_groups(bargs->profiles, tmp_buf, 4032 sizeof(tmp_buf)); 4033 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); 4034 } 4035 4036 if (flags & BTRFS_BALANCE_ARGS_USAGE) 4037 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); 4038 4039 if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) 4040 CHECK_APPEND_2ARG("usage=%u..%u,", 4041 bargs->usage_min, bargs->usage_max); 4042 4043 if (flags & BTRFS_BALANCE_ARGS_DEVID) 4044 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); 4045 4046 if (flags & BTRFS_BALANCE_ARGS_DRANGE) 4047 CHECK_APPEND_2ARG("drange=%llu..%llu,", 4048 bargs->pstart, bargs->pend); 4049 4050 if (flags & BTRFS_BALANCE_ARGS_VRANGE) 4051 CHECK_APPEND_2ARG("vrange=%llu..%llu,", 4052 bargs->vstart, bargs->vend); 4053 4054 if (flags & BTRFS_BALANCE_ARGS_LIMIT) 4055 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); 4056 4057 if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) 4058 CHECK_APPEND_2ARG("limit=%u..%u,", 4059 bargs->limit_min, bargs->limit_max); 4060 4061 if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) 4062 CHECK_APPEND_2ARG("stripes=%u..%u,", 4063 bargs->stripes_min, bargs->stripes_max); 4064 4065 #undef CHECK_APPEND_2ARG 4066 #undef CHECK_APPEND_1ARG 4067 #undef CHECK_APPEND_NOARG 4068 4069 out_overflow: 4070 4071 if (size_bp < size_buf) 4072 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ 4073 else 4074 buf[0] = '\0'; 4075 } 4076 4077 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) 4078 { 4079 u32 size_buf = 1024; 4080 char tmp_buf[192] = {'\0'}; 4081 char *buf; 4082 char *bp; 4083 u32 size_bp = size_buf; 4084 int ret; 4085 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4086 4087 buf = kzalloc(size_buf, GFP_KERNEL); 4088 if (!buf) 4089 return; 4090 4091 bp = buf; 4092 4093 #define CHECK_APPEND_1ARG(a, v1) \ 4094 do { \ 4095 ret = snprintf(bp, size_bp, (a), (v1)); \ 4096 if (ret < 0 || ret >= size_bp) \ 4097 goto out_overflow; \ 4098 size_bp -= ret; \ 4099 bp += ret; \ 4100 } while (0) 4101 4102 if (bctl->flags & BTRFS_BALANCE_FORCE) 4103 CHECK_APPEND_1ARG("%s", "-f "); 4104 4105 if (bctl->flags & BTRFS_BALANCE_DATA) { 4106 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); 4107 CHECK_APPEND_1ARG("-d%s ", tmp_buf); 4108 } 4109 4110 if (bctl->flags & BTRFS_BALANCE_METADATA) { 4111 describe_balance_args(&bctl->meta, 
tmp_buf, sizeof(tmp_buf)); 4112 CHECK_APPEND_1ARG("-m%s ", tmp_buf); 4113 } 4114 4115 if (bctl->flags & BTRFS_BALANCE_SYSTEM) { 4116 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); 4117 CHECK_APPEND_1ARG("-s%s ", tmp_buf); 4118 } 4119 4120 #undef CHECK_APPEND_1ARG 4121 4122 out_overflow: 4123 4124 if (size_bp < size_buf) 4125 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ 4126 btrfs_info(fs_info, "balance: %s %s", 4127 (bctl->flags & BTRFS_BALANCE_RESUME) ? 4128 "resume" : "start", buf); 4129 4130 kfree(buf); 4131 } 4132 4133 /* 4134 * Should be called with the balance mutex held 4135 */ 4136 int btrfs_balance(struct btrfs_fs_info *fs_info, 4137 struct btrfs_balance_control *bctl, 4138 struct btrfs_ioctl_balance_args *bargs) 4139 { 4140 u64 meta_target, data_target; 4141 u64 allowed; 4142 int mixed = 0; 4143 int ret; 4144 u64 num_devices; 4145 unsigned seq; 4146 bool reducing_redundancy; 4147 int i; 4148 4149 if (btrfs_fs_closing(fs_info) || 4150 atomic_read(&fs_info->balance_pause_req) || 4151 btrfs_should_cancel_balance(fs_info)) { 4152 ret = -EINVAL; 4153 goto out; 4154 } 4155 4156 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 4157 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 4158 mixed = 1; 4159 4160 /* 4161 * In case of mixed groups both data and meta should be picked, 4162 * and identical options should be given for both of them. 4163 */ 4164 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 4165 if (mixed && (bctl->flags & allowed)) { 4166 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 4167 !(bctl->flags & BTRFS_BALANCE_METADATA) || 4168 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 4169 btrfs_err(fs_info, 4170 "balance: mixed groups data and metadata options must be the same"); 4171 ret = -EINVAL; 4172 goto out; 4173 } 4174 } 4175 4176 /* 4177 * rw_devices will not change at the moment, device add/delete/replace 4178 * are exclusive 4179 */ 4180 num_devices = fs_info->fs_devices->rw_devices; 4181 4182 /* 4183 * SINGLE profile on-disk has no profile bit, but in-memory we have a 4184 * special bit for it, to make it easier to distinguish. Thus we need 4185 * to set it manually, or balance would refuse the profile. 
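 *
 * The loop below then adds to 'allowed' every profile from btrfs_raid_array
 * whose devs_min requirement is met by the current number of rw devices.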
4186 */ 4187 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 4188 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) 4189 if (num_devices >= btrfs_raid_array[i].devs_min) 4190 allowed |= btrfs_raid_array[i].bg_flag; 4191 4192 if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || 4193 !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || 4194 !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { 4195 ret = -EINVAL; 4196 goto out; 4197 } 4198 4199 /* 4200 * Allow to reduce metadata or system integrity only if force set for 4201 * profiles with redundancy (copies, parity) 4202 */ 4203 allowed = 0; 4204 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { 4205 if (btrfs_raid_array[i].ncopies >= 2 || 4206 btrfs_raid_array[i].tolerated_failures >= 1) 4207 allowed |= btrfs_raid_array[i].bg_flag; 4208 } 4209 do { 4210 seq = read_seqbegin(&fs_info->profiles_lock); 4211 4212 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4213 (fs_info->avail_system_alloc_bits & allowed) && 4214 !(bctl->sys.target & allowed)) || 4215 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4216 (fs_info->avail_metadata_alloc_bits & allowed) && 4217 !(bctl->meta.target & allowed))) 4218 reducing_redundancy = true; 4219 else 4220 reducing_redundancy = false; 4221 4222 /* if we're not converting, the target field is uninitialized */ 4223 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4224 bctl->meta.target : fs_info->avail_metadata_alloc_bits; 4225 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4226 bctl->data.target : fs_info->avail_data_alloc_bits; 4227 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4228 4229 if (reducing_redundancy) { 4230 if (bctl->flags & BTRFS_BALANCE_FORCE) { 4231 btrfs_info(fs_info, 4232 "balance: force reducing metadata redundancy"); 4233 } else { 4234 btrfs_err(fs_info, 4235 "balance: reduces metadata redundancy, use --force if you want this"); 4236 ret = -EINVAL; 4237 goto out; 4238 } 4239 } 4240 4241 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 4242 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 4243 btrfs_warn(fs_info, 4244 "balance: metadata profile %s has lower redundancy than data profile %s", 4245 btrfs_bg_type_to_raid_name(meta_target), 4246 btrfs_bg_type_to_raid_name(data_target)); 4247 } 4248 4249 ret = insert_balance_item(fs_info, bctl); 4250 if (ret && ret != -EEXIST) 4251 goto out; 4252 4253 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 4254 BUG_ON(ret == -EEXIST); 4255 BUG_ON(fs_info->balance_ctl); 4256 spin_lock(&fs_info->balance_lock); 4257 fs_info->balance_ctl = bctl; 4258 spin_unlock(&fs_info->balance_lock); 4259 } else { 4260 BUG_ON(ret != -EEXIST); 4261 spin_lock(&fs_info->balance_lock); 4262 update_balance_args(bctl); 4263 spin_unlock(&fs_info->balance_lock); 4264 } 4265 4266 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4267 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4268 describe_balance_start_or_resume(fs_info); 4269 mutex_unlock(&fs_info->balance_mutex); 4270 4271 ret = __btrfs_balance(fs_info); 4272 4273 mutex_lock(&fs_info->balance_mutex); 4274 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) 4275 btrfs_info(fs_info, "balance: paused"); 4276 /* 4277 * Balance can be canceled by: 4278 * 4279 * - Regular cancel request 4280 * Then ret == -ECANCELED and balance_cancel_req > 0 4281 * 4282 * - Fatal signal to "btrfs" process 4283 * Either the signal caught by wait_reserve_ticket() and callers 4284 * got 
-EINTR, or caught by btrfs_should_cancel_balance() and 4285 * got -ECANCELED. 4286 * Either way, in this case balance_cancel_req = 0, and 4287 * ret == -EINTR or ret == -ECANCELED. 4288 * 4289 * So here we only check the return value to catch canceled balance. 4290 */ 4291 else if (ret == -ECANCELED || ret == -EINTR) 4292 btrfs_info(fs_info, "balance: canceled"); 4293 else 4294 btrfs_info(fs_info, "balance: ended with status: %d", ret); 4295 4296 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4297 4298 if (bargs) { 4299 memset(bargs, 0, sizeof(*bargs)); 4300 btrfs_update_ioctl_balance_args(fs_info, bargs); 4301 } 4302 4303 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 4304 balance_need_close(fs_info)) { 4305 reset_balance_state(fs_info); 4306 btrfs_exclop_finish(fs_info); 4307 } 4308 4309 wake_up(&fs_info->balance_wait_q); 4310 4311 return ret; 4312 out: 4313 if (bctl->flags & BTRFS_BALANCE_RESUME) 4314 reset_balance_state(fs_info); 4315 else 4316 kfree(bctl); 4317 btrfs_exclop_finish(fs_info); 4318 4319 return ret; 4320 } 4321 4322 static int balance_kthread(void *data) 4323 { 4324 struct btrfs_fs_info *fs_info = data; 4325 int ret = 0; 4326 4327 mutex_lock(&fs_info->balance_mutex); 4328 if (fs_info->balance_ctl) 4329 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); 4330 mutex_unlock(&fs_info->balance_mutex); 4331 4332 return ret; 4333 } 4334 4335 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 4336 { 4337 struct task_struct *tsk; 4338 4339 mutex_lock(&fs_info->balance_mutex); 4340 if (!fs_info->balance_ctl) { 4341 mutex_unlock(&fs_info->balance_mutex); 4342 return 0; 4343 } 4344 mutex_unlock(&fs_info->balance_mutex); 4345 4346 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 4347 btrfs_info(fs_info, "balance: resume skipped"); 4348 return 0; 4349 } 4350 4351 /* 4352 * A ro->rw remount sequence should continue with the paused balance 4353 * regardless of who pauses it, system or the user as of now, so set 4354 * the resume flag. 
4355 */ 4356 spin_lock(&fs_info->balance_lock); 4357 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; 4358 spin_unlock(&fs_info->balance_lock); 4359 4360 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 4361 return PTR_ERR_OR_ZERO(tsk); 4362 } 4363 4364 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 4365 { 4366 struct btrfs_balance_control *bctl; 4367 struct btrfs_balance_item *item; 4368 struct btrfs_disk_balance_args disk_bargs; 4369 struct btrfs_path *path; 4370 struct extent_buffer *leaf; 4371 struct btrfs_key key; 4372 int ret; 4373 4374 path = btrfs_alloc_path(); 4375 if (!path) 4376 return -ENOMEM; 4377 4378 key.objectid = BTRFS_BALANCE_OBJECTID; 4379 key.type = BTRFS_TEMPORARY_ITEM_KEY; 4380 key.offset = 0; 4381 4382 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4383 if (ret < 0) 4384 goto out; 4385 if (ret > 0) { /* ret = -ENOENT; */ 4386 ret = 0; 4387 goto out; 4388 } 4389 4390 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 4391 if (!bctl) { 4392 ret = -ENOMEM; 4393 goto out; 4394 } 4395 4396 leaf = path->nodes[0]; 4397 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 4398 4399 bctl->flags = btrfs_balance_flags(leaf, item); 4400 bctl->flags |= BTRFS_BALANCE_RESUME; 4401 4402 btrfs_balance_data(leaf, item, &disk_bargs); 4403 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 4404 btrfs_balance_meta(leaf, item, &disk_bargs); 4405 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 4406 btrfs_balance_sys(leaf, item, &disk_bargs); 4407 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 4408 4409 /* 4410 * This should never happen, as the paused balance state is recovered 4411 * during mount without any chance of other exclusive ops to collide. 4412 * 4413 * This gives the exclusive op status to balance and keeps in paused 4414 * state until user intervention (cancel or umount). If the ownership 4415 * cannot be assigned, show a message but do not fail. The balance 4416 * is in a paused state and must have fs_info::balance_ctl properly 4417 * set up. 
4418 */ 4419 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) 4420 btrfs_warn(fs_info, 4421 "balance: cannot set exclusive op status, resume manually"); 4422 4423 btrfs_release_path(path); 4424 4425 mutex_lock(&fs_info->balance_mutex); 4426 BUG_ON(fs_info->balance_ctl); 4427 spin_lock(&fs_info->balance_lock); 4428 fs_info->balance_ctl = bctl; 4429 spin_unlock(&fs_info->balance_lock); 4430 mutex_unlock(&fs_info->balance_mutex); 4431 out: 4432 btrfs_free_path(path); 4433 return ret; 4434 } 4435 4436 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4437 { 4438 int ret = 0; 4439 4440 mutex_lock(&fs_info->balance_mutex); 4441 if (!fs_info->balance_ctl) { 4442 mutex_unlock(&fs_info->balance_mutex); 4443 return -ENOTCONN; 4444 } 4445 4446 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4447 atomic_inc(&fs_info->balance_pause_req); 4448 mutex_unlock(&fs_info->balance_mutex); 4449 4450 wait_event(fs_info->balance_wait_q, 4451 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4452 4453 mutex_lock(&fs_info->balance_mutex); 4454 /* we are good with balance_ctl ripped off from under us */ 4455 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4456 atomic_dec(&fs_info->balance_pause_req); 4457 } else { 4458 ret = -ENOTCONN; 4459 } 4460 4461 mutex_unlock(&fs_info->balance_mutex); 4462 return ret; 4463 } 4464 4465 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4466 { 4467 mutex_lock(&fs_info->balance_mutex); 4468 if (!fs_info->balance_ctl) { 4469 mutex_unlock(&fs_info->balance_mutex); 4470 return -ENOTCONN; 4471 } 4472 4473 /* 4474 * A paused balance with the item stored on disk can be resumed at 4475 * mount time if the mount is read-write. Otherwise it's still paused 4476 * and we must not allow cancelling as it deletes the item. 4477 */ 4478 if (sb_rdonly(fs_info->sb)) { 4479 mutex_unlock(&fs_info->balance_mutex); 4480 return -EROFS; 4481 } 4482 4483 atomic_inc(&fs_info->balance_cancel_req); 4484 /* 4485 * if we are running just wait and return, balance item is 4486 * deleted in btrfs_balance in this case 4487 */ 4488 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4489 mutex_unlock(&fs_info->balance_mutex); 4490 wait_event(fs_info->balance_wait_q, 4491 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4492 mutex_lock(&fs_info->balance_mutex); 4493 } else { 4494 mutex_unlock(&fs_info->balance_mutex); 4495 /* 4496 * Lock released to allow other waiters to continue, we'll 4497 * reexamine the status again. 
4498 */ 4499 mutex_lock(&fs_info->balance_mutex); 4500 4501 if (fs_info->balance_ctl) { 4502 reset_balance_state(fs_info); 4503 btrfs_exclop_finish(fs_info); 4504 btrfs_info(fs_info, "balance: canceled"); 4505 } 4506 } 4507 4508 BUG_ON(fs_info->balance_ctl || 4509 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4510 atomic_dec(&fs_info->balance_cancel_req); 4511 mutex_unlock(&fs_info->balance_mutex); 4512 return 0; 4513 } 4514 4515 int btrfs_uuid_scan_kthread(void *data) 4516 { 4517 struct btrfs_fs_info *fs_info = data; 4518 struct btrfs_root *root = fs_info->tree_root; 4519 struct btrfs_key key; 4520 struct btrfs_path *path = NULL; 4521 int ret = 0; 4522 struct extent_buffer *eb; 4523 int slot; 4524 struct btrfs_root_item root_item; 4525 u32 item_size; 4526 struct btrfs_trans_handle *trans = NULL; 4527 bool closing = false; 4528 4529 path = btrfs_alloc_path(); 4530 if (!path) { 4531 ret = -ENOMEM; 4532 goto out; 4533 } 4534 4535 key.objectid = 0; 4536 key.type = BTRFS_ROOT_ITEM_KEY; 4537 key.offset = 0; 4538 4539 while (1) { 4540 if (btrfs_fs_closing(fs_info)) { 4541 closing = true; 4542 break; 4543 } 4544 ret = btrfs_search_forward(root, &key, path, 4545 BTRFS_OLDEST_GENERATION); 4546 if (ret) { 4547 if (ret > 0) 4548 ret = 0; 4549 break; 4550 } 4551 4552 if (key.type != BTRFS_ROOT_ITEM_KEY || 4553 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4554 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4555 key.objectid > BTRFS_LAST_FREE_OBJECTID) 4556 goto skip; 4557 4558 eb = path->nodes[0]; 4559 slot = path->slots[0]; 4560 item_size = btrfs_item_size_nr(eb, slot); 4561 if (item_size < sizeof(root_item)) 4562 goto skip; 4563 4564 read_extent_buffer(eb, &root_item, 4565 btrfs_item_ptr_offset(eb, slot), 4566 (int)sizeof(root_item)); 4567 if (btrfs_root_refs(&root_item) == 0) 4568 goto skip; 4569 4570 if (!btrfs_is_empty_uuid(root_item.uuid) || 4571 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4572 if (trans) 4573 goto update_tree; 4574 4575 btrfs_release_path(path); 4576 /* 4577 * 1 - subvol uuid item 4578 * 1 - received_subvol uuid item 4579 */ 4580 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4581 if (IS_ERR(trans)) { 4582 ret = PTR_ERR(trans); 4583 break; 4584 } 4585 continue; 4586 } else { 4587 goto skip; 4588 } 4589 update_tree: 4590 btrfs_release_path(path); 4591 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4592 ret = btrfs_uuid_tree_add(trans, root_item.uuid, 4593 BTRFS_UUID_KEY_SUBVOL, 4594 key.objectid); 4595 if (ret < 0) { 4596 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4597 ret); 4598 break; 4599 } 4600 } 4601 4602 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4603 ret = btrfs_uuid_tree_add(trans, 4604 root_item.received_uuid, 4605 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4606 key.objectid); 4607 if (ret < 0) { 4608 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4609 ret); 4610 break; 4611 } 4612 } 4613 4614 skip: 4615 btrfs_release_path(path); 4616 if (trans) { 4617 ret = btrfs_end_transaction(trans); 4618 trans = NULL; 4619 if (ret) 4620 break; 4621 } 4622 4623 if (key.offset < (u64)-1) { 4624 key.offset++; 4625 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4626 key.offset = 0; 4627 key.type = BTRFS_ROOT_ITEM_KEY; 4628 } else if (key.objectid < (u64)-1) { 4629 key.offset = 0; 4630 key.type = BTRFS_ROOT_ITEM_KEY; 4631 key.objectid++; 4632 } else { 4633 break; 4634 } 4635 cond_resched(); 4636 } 4637 4638 out: 4639 btrfs_free_path(path); 4640 if (trans && !IS_ERR(trans)) 4641 btrfs_end_transaction(trans); 4642 if (ret) 4643 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread 
failed %d", ret); 4644 else if (!closing) 4645 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4646 up(&fs_info->uuid_tree_rescan_sem); 4647 return 0; 4648 } 4649 4650 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4651 { 4652 struct btrfs_trans_handle *trans; 4653 struct btrfs_root *tree_root = fs_info->tree_root; 4654 struct btrfs_root *uuid_root; 4655 struct task_struct *task; 4656 int ret; 4657 4658 /* 4659 * 1 - root node 4660 * 1 - root item 4661 */ 4662 trans = btrfs_start_transaction(tree_root, 2); 4663 if (IS_ERR(trans)) 4664 return PTR_ERR(trans); 4665 4666 uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); 4667 if (IS_ERR(uuid_root)) { 4668 ret = PTR_ERR(uuid_root); 4669 btrfs_abort_transaction(trans, ret); 4670 btrfs_end_transaction(trans); 4671 return ret; 4672 } 4673 4674 fs_info->uuid_root = uuid_root; 4675 4676 ret = btrfs_commit_transaction(trans); 4677 if (ret) 4678 return ret; 4679 4680 down(&fs_info->uuid_tree_rescan_sem); 4681 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4682 if (IS_ERR(task)) { 4683 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4684 btrfs_warn(fs_info, "failed to start uuid_scan task"); 4685 up(&fs_info->uuid_tree_rescan_sem); 4686 return PTR_ERR(task); 4687 } 4688 4689 return 0; 4690 } 4691 4692 /* 4693 * shrinking a device means finding all of the device extents past 4694 * the new size, and then following the back refs to the chunks. 4695 * The chunk relocation code actually frees the device extent 4696 */ 4697 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 4698 { 4699 struct btrfs_fs_info *fs_info = device->fs_info; 4700 struct btrfs_root *root = fs_info->dev_root; 4701 struct btrfs_trans_handle *trans; 4702 struct btrfs_dev_extent *dev_extent = NULL; 4703 struct btrfs_path *path; 4704 u64 length; 4705 u64 chunk_offset; 4706 int ret; 4707 int slot; 4708 int failed = 0; 4709 bool retried = false; 4710 struct extent_buffer *l; 4711 struct btrfs_key key; 4712 struct btrfs_super_block *super_copy = fs_info->super_copy; 4713 u64 old_total = btrfs_super_total_bytes(super_copy); 4714 u64 old_size = btrfs_device_get_total_bytes(device); 4715 u64 diff; 4716 u64 start; 4717 4718 new_size = round_down(new_size, fs_info->sectorsize); 4719 start = new_size; 4720 diff = round_down(old_size - new_size, fs_info->sectorsize); 4721 4722 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4723 return -EINVAL; 4724 4725 path = btrfs_alloc_path(); 4726 if (!path) 4727 return -ENOMEM; 4728 4729 path->reada = READA_BACK; 4730 4731 trans = btrfs_start_transaction(root, 0); 4732 if (IS_ERR(trans)) { 4733 btrfs_free_path(path); 4734 return PTR_ERR(trans); 4735 } 4736 4737 mutex_lock(&fs_info->chunk_mutex); 4738 4739 btrfs_device_set_total_bytes(device, new_size); 4740 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4741 device->fs_devices->total_rw_bytes -= diff; 4742 atomic64_sub(diff, &fs_info->free_chunk_space); 4743 } 4744 4745 /* 4746 * Once the device's size has been set to the new size, ensure all 4747 * in-memory chunks are synced to disk so that the loop below sees them 4748 * and relocates them accordingly. 
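 *
 * If a pending chunk allocation overlaps the range being removed, the
 * transaction below is committed before searching for device extents;
 * otherwise it is simply ended.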
4749 */ 4750 if (contains_pending_extent(device, &start, diff)) { 4751 mutex_unlock(&fs_info->chunk_mutex); 4752 ret = btrfs_commit_transaction(trans); 4753 if (ret) 4754 goto done; 4755 } else { 4756 mutex_unlock(&fs_info->chunk_mutex); 4757 btrfs_end_transaction(trans); 4758 } 4759 4760 again: 4761 key.objectid = device->devid; 4762 key.offset = (u64)-1; 4763 key.type = BTRFS_DEV_EXTENT_KEY; 4764 4765 do { 4766 mutex_lock(&fs_info->reclaim_bgs_lock); 4767 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4768 if (ret < 0) { 4769 mutex_unlock(&fs_info->reclaim_bgs_lock); 4770 goto done; 4771 } 4772 4773 ret = btrfs_previous_item(root, path, 0, key.type); 4774 if (ret) { 4775 mutex_unlock(&fs_info->reclaim_bgs_lock); 4776 if (ret < 0) 4777 goto done; 4778 ret = 0; 4779 btrfs_release_path(path); 4780 break; 4781 } 4782 4783 l = path->nodes[0]; 4784 slot = path->slots[0]; 4785 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4786 4787 if (key.objectid != device->devid) { 4788 mutex_unlock(&fs_info->reclaim_bgs_lock); 4789 btrfs_release_path(path); 4790 break; 4791 } 4792 4793 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4794 length = btrfs_dev_extent_length(l, dev_extent); 4795 4796 if (key.offset + length <= new_size) { 4797 mutex_unlock(&fs_info->reclaim_bgs_lock); 4798 btrfs_release_path(path); 4799 break; 4800 } 4801 4802 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4803 btrfs_release_path(path); 4804 4805 /* 4806 * We may be relocating the only data chunk we have, 4807 * which could potentially end up with losing data's 4808 * raid profile, so lets allocate an empty one in 4809 * advance. 4810 */ 4811 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 4812 if (ret < 0) { 4813 mutex_unlock(&fs_info->reclaim_bgs_lock); 4814 goto done; 4815 } 4816 4817 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 4818 mutex_unlock(&fs_info->reclaim_bgs_lock); 4819 if (ret == -ENOSPC) { 4820 failed++; 4821 } else if (ret) { 4822 if (ret == -ETXTBSY) { 4823 btrfs_warn(fs_info, 4824 "could not shrink block group %llu due to active swapfile", 4825 chunk_offset); 4826 } 4827 goto done; 4828 } 4829 } while (key.offset-- > 0); 4830 4831 if (failed && !retried) { 4832 failed = 0; 4833 retried = true; 4834 goto again; 4835 } else if (failed && retried) { 4836 ret = -ENOSPC; 4837 goto done; 4838 } 4839 4840 /* Shrinking succeeded, else we would be at "done". */ 4841 trans = btrfs_start_transaction(root, 0); 4842 if (IS_ERR(trans)) { 4843 ret = PTR_ERR(trans); 4844 goto done; 4845 } 4846 4847 mutex_lock(&fs_info->chunk_mutex); 4848 /* Clear all state bits beyond the shrunk device size */ 4849 clear_extent_bits(&device->alloc_state, new_size, (u64)-1, 4850 CHUNK_STATE_MASK); 4851 4852 btrfs_device_set_disk_total_bytes(device, new_size); 4853 if (list_empty(&device->post_commit_list)) 4854 list_add_tail(&device->post_commit_list, 4855 &trans->transaction->dev_update_list); 4856 4857 WARN_ON(diff > old_total); 4858 btrfs_set_super_total_bytes(super_copy, 4859 round_down(old_total - diff, fs_info->sectorsize)); 4860 mutex_unlock(&fs_info->chunk_mutex); 4861 4862 /* Now btrfs_update_device() will change the on-disk size. 
*/ 4863 ret = btrfs_update_device(trans, device); 4864 if (ret < 0) { 4865 btrfs_abort_transaction(trans, ret); 4866 btrfs_end_transaction(trans); 4867 } else { 4868 ret = btrfs_commit_transaction(trans); 4869 } 4870 done: 4871 btrfs_free_path(path); 4872 if (ret) { 4873 mutex_lock(&fs_info->chunk_mutex); 4874 btrfs_device_set_total_bytes(device, old_size); 4875 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 4876 device->fs_devices->total_rw_bytes += diff; 4877 atomic64_add(diff, &fs_info->free_chunk_space); 4878 mutex_unlock(&fs_info->chunk_mutex); 4879 } 4880 return ret; 4881 } 4882 4883 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 4884 struct btrfs_key *key, 4885 struct btrfs_chunk *chunk, int item_size) 4886 { 4887 struct btrfs_super_block *super_copy = fs_info->super_copy; 4888 struct btrfs_disk_key disk_key; 4889 u32 array_size; 4890 u8 *ptr; 4891 4892 lockdep_assert_held(&fs_info->chunk_mutex); 4893 4894 array_size = btrfs_super_sys_array_size(super_copy); 4895 if (array_size + item_size + sizeof(disk_key) 4896 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 4897 return -EFBIG; 4898 4899 ptr = super_copy->sys_chunk_array + array_size; 4900 btrfs_cpu_key_to_disk(&disk_key, key); 4901 memcpy(ptr, &disk_key, sizeof(disk_key)); 4902 ptr += sizeof(disk_key); 4903 memcpy(ptr, chunk, item_size); 4904 item_size += sizeof(disk_key); 4905 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 4906 4907 return 0; 4908 } 4909 4910 /* 4911 * sort the devices in descending order by max_avail, total_avail 4912 */ 4913 static int btrfs_cmp_device_info(const void *a, const void *b) 4914 { 4915 const struct btrfs_device_info *di_a = a; 4916 const struct btrfs_device_info *di_b = b; 4917 4918 if (di_a->max_avail > di_b->max_avail) 4919 return -1; 4920 if (di_a->max_avail < di_b->max_avail) 4921 return 1; 4922 if (di_a->total_avail > di_b->total_avail) 4923 return -1; 4924 if (di_a->total_avail < di_b->total_avail) 4925 return 1; 4926 return 0; 4927 } 4928 4929 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 4930 { 4931 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 4932 return; 4933 4934 btrfs_set_fs_incompat(info, RAID56); 4935 } 4936 4937 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) 4938 { 4939 if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) 4940 return; 4941 4942 btrfs_set_fs_incompat(info, RAID1C34); 4943 } 4944 4945 /* 4946 * Structure used internally for __btrfs_alloc_chunk() function. 4947 * Wraps needed parameters. 
4948 */ 4949 struct alloc_chunk_ctl { 4950 u64 start; 4951 u64 type; 4952 /* Total number of stripes to allocate */ 4953 int num_stripes; 4954 /* sub_stripes info for map */ 4955 int sub_stripes; 4956 /* Stripes per device */ 4957 int dev_stripes; 4958 /* Maximum number of devices to use */ 4959 int devs_max; 4960 /* Minimum number of devices to use */ 4961 int devs_min; 4962 /* ndevs has to be a multiple of this */ 4963 int devs_increment; 4964 /* Number of copies */ 4965 int ncopies; 4966 /* Number of stripes worth of bytes to store parity information */ 4967 int nparity; 4968 u64 max_stripe_size; 4969 u64 max_chunk_size; 4970 u64 dev_extent_min; 4971 u64 stripe_size; 4972 u64 chunk_size; 4973 int ndevs; 4974 }; 4975 4976 static void init_alloc_chunk_ctl_policy_regular( 4977 struct btrfs_fs_devices *fs_devices, 4978 struct alloc_chunk_ctl *ctl) 4979 { 4980 u64 type = ctl->type; 4981 4982 if (type & BTRFS_BLOCK_GROUP_DATA) { 4983 ctl->max_stripe_size = SZ_1G; 4984 ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; 4985 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 4986 /* For larger filesystems, use larger metadata chunks */ 4987 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) 4988 ctl->max_stripe_size = SZ_1G; 4989 else 4990 ctl->max_stripe_size = SZ_256M; 4991 ctl->max_chunk_size = ctl->max_stripe_size; 4992 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 4993 ctl->max_stripe_size = SZ_32M; 4994 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 4995 ctl->devs_max = min_t(int, ctl->devs_max, 4996 BTRFS_MAX_DEVS_SYS_CHUNK); 4997 } else { 4998 BUG(); 4999 } 5000 5001 /* We don't want a chunk larger than 10% of writable space */ 5002 ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 5003 ctl->max_chunk_size); 5004 ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; 5005 } 5006 5007 static void init_alloc_chunk_ctl_policy_zoned( 5008 struct btrfs_fs_devices *fs_devices, 5009 struct alloc_chunk_ctl *ctl) 5010 { 5011 u64 zone_size = fs_devices->fs_info->zone_size; 5012 u64 limit; 5013 int min_num_stripes = ctl->devs_min * ctl->dev_stripes; 5014 int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; 5015 u64 min_chunk_size = min_data_stripes * zone_size; 5016 u64 type = ctl->type; 5017 5018 ctl->max_stripe_size = zone_size; 5019 if (type & BTRFS_BLOCK_GROUP_DATA) { 5020 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, 5021 zone_size); 5022 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 5023 ctl->max_chunk_size = ctl->max_stripe_size; 5024 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 5025 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 5026 ctl->devs_max = min_t(int, ctl->devs_max, 5027 BTRFS_MAX_DEVS_SYS_CHUNK); 5028 } else { 5029 BUG(); 5030 } 5031 5032 /* We don't want a chunk larger than 10% of writable space */ 5033 limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1), 5034 zone_size), 5035 min_chunk_size); 5036 ctl->max_chunk_size = min(limit, ctl->max_chunk_size); 5037 ctl->dev_extent_min = zone_size * ctl->dev_stripes; 5038 } 5039 5040 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, 5041 struct alloc_chunk_ctl *ctl) 5042 { 5043 int index = btrfs_bg_flags_to_raid_index(ctl->type); 5044 5045 ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; 5046 ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; 5047 ctl->devs_max = btrfs_raid_array[index].devs_max; 5048 if (!ctl->devs_max) 5049 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); 5050 ctl->devs_min = btrfs_raid_array[index].devs_min; 5051 
ctl->devs_increment = btrfs_raid_array[index].devs_increment; 5052 ctl->ncopies = btrfs_raid_array[index].ncopies; 5053 ctl->nparity = btrfs_raid_array[index].nparity; 5054 ctl->ndevs = 0; 5055 5056 switch (fs_devices->chunk_alloc_policy) { 5057 case BTRFS_CHUNK_ALLOC_REGULAR: 5058 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); 5059 break; 5060 case BTRFS_CHUNK_ALLOC_ZONED: 5061 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); 5062 break; 5063 default: 5064 BUG(); 5065 } 5066 } 5067 5068 static int gather_device_info(struct btrfs_fs_devices *fs_devices, 5069 struct alloc_chunk_ctl *ctl, 5070 struct btrfs_device_info *devices_info) 5071 { 5072 struct btrfs_fs_info *info = fs_devices->fs_info; 5073 struct btrfs_device *device; 5074 u64 total_avail; 5075 u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; 5076 int ret; 5077 int ndevs = 0; 5078 u64 max_avail; 5079 u64 dev_offset; 5080 5081 /* 5082 * in the first pass through the devices list, we gather information 5083 * about the available holes on each device. 5084 */ 5085 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 5086 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 5087 WARN(1, KERN_ERR 5088 "BTRFS: read-only device in alloc_list\n"); 5089 continue; 5090 } 5091 5092 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 5093 &device->dev_state) || 5094 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 5095 continue; 5096 5097 if (device->total_bytes > device->bytes_used) 5098 total_avail = device->total_bytes - device->bytes_used; 5099 else 5100 total_avail = 0; 5101 5102 /* If there is no space on this device, skip it. */ 5103 if (total_avail < ctl->dev_extent_min) 5104 continue; 5105 5106 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, 5107 &max_avail); 5108 if (ret && ret != -ENOSPC) 5109 return ret; 5110 5111 if (ret == 0) 5112 max_avail = dev_extent_want; 5113 5114 if (max_avail < ctl->dev_extent_min) { 5115 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5116 btrfs_debug(info, 5117 "%s: devid %llu has no free space, have=%llu want=%llu", 5118 __func__, device->devid, max_avail, 5119 ctl->dev_extent_min); 5120 continue; 5121 } 5122 5123 if (ndevs == fs_devices->rw_devices) { 5124 WARN(1, "%s: found more than %llu devices\n", 5125 __func__, fs_devices->rw_devices); 5126 break; 5127 } 5128 devices_info[ndevs].dev_offset = dev_offset; 5129 devices_info[ndevs].max_avail = max_avail; 5130 devices_info[ndevs].total_avail = total_avail; 5131 devices_info[ndevs].dev = device; 5132 ++ndevs; 5133 } 5134 ctl->ndevs = ndevs; 5135 5136 /* 5137 * now sort the devices by hole size / available space 5138 */ 5139 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 5140 btrfs_cmp_device_info, NULL); 5141 5142 return 0; 5143 } 5144 5145 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, 5146 struct btrfs_device_info *devices_info) 5147 { 5148 /* Number of stripes that count for block group size */ 5149 int data_stripes; 5150 5151 /* 5152 * The primary goal is to maximize the number of stripes, so use as 5153 * many devices as possible, even if the stripes are not maximum sized. 5154 * 5155 * The DUP profile stores more than one stripe per device, the 5156 * max_avail is the total size so we have to adjust. 
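 *
 * E.g. DUP keeps two stripes on one device (dev_stripes == 2), so only part
 * of the smallest hole can back a single stripe, hence the division by
 * dev_stripes below.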
5157 */ 5158 ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, 5159 ctl->dev_stripes); 5160 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5161 5162 /* This will have to be fixed for RAID1 and RAID10 over more drives */ 5163 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5164 5165 /* 5166 * Use the number of data stripes to figure out how big this chunk is 5167 * really going to be in terms of logical address space, and compare 5168 * that answer with the max chunk size. If it's higher, we try to 5169 * reduce stripe_size. 5170 */ 5171 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 5172 /* 5173 * Reduce stripe_size, round it up to a 16MB boundary again and 5174 * then use it, unless it ends up being even bigger than the 5175 * previous value we had already. 5176 */ 5177 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, 5178 data_stripes), SZ_16M), 5179 ctl->stripe_size); 5180 } 5181 5182 /* Align to BTRFS_STRIPE_LEN */ 5183 ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); 5184 ctl->chunk_size = ctl->stripe_size * data_stripes; 5185 5186 return 0; 5187 } 5188 5189 static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, 5190 struct btrfs_device_info *devices_info) 5191 { 5192 u64 zone_size = devices_info[0].dev->zone_info->zone_size; 5193 /* Number of stripes that count for block group size */ 5194 int data_stripes; 5195 5196 /* 5197 * It should hold because: 5198 * dev_extent_min == dev_extent_want == zone_size * dev_stripes 5199 */ 5200 ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); 5201 5202 ctl->stripe_size = zone_size; 5203 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5204 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5205 5206 /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */ 5207 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 5208 ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, 5209 ctl->stripe_size) + ctl->nparity, 5210 ctl->dev_stripes); 5211 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5212 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5213 ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); 5214 } 5215 5216 ctl->chunk_size = ctl->stripe_size * data_stripes; 5217 5218 return 0; 5219 } 5220 5221 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, 5222 struct alloc_chunk_ctl *ctl, 5223 struct btrfs_device_info *devices_info) 5224 { 5225 struct btrfs_fs_info *info = fs_devices->fs_info; 5226 5227 /* 5228 * Round down to number of usable stripes, devs_increment can be any 5229 * number so we can't use round_down() that requires power of 2, while 5230 * rounddown is safe. 
5231 */ 5232 ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); 5233 5234 if (ctl->ndevs < ctl->devs_min) { 5235 if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 5236 btrfs_debug(info, 5237 "%s: not enough devices with free space: have=%d minimum required=%d", 5238 __func__, ctl->ndevs, ctl->devs_min); 5239 } 5240 return -ENOSPC; 5241 } 5242 5243 ctl->ndevs = min(ctl->ndevs, ctl->devs_max); 5244 5245 switch (fs_devices->chunk_alloc_policy) { 5246 case BTRFS_CHUNK_ALLOC_REGULAR: 5247 return decide_stripe_size_regular(ctl, devices_info); 5248 case BTRFS_CHUNK_ALLOC_ZONED: 5249 return decide_stripe_size_zoned(ctl, devices_info); 5250 default: 5251 BUG(); 5252 } 5253 } 5254 5255 static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, 5256 struct alloc_chunk_ctl *ctl, 5257 struct btrfs_device_info *devices_info) 5258 { 5259 struct btrfs_fs_info *info = trans->fs_info; 5260 struct map_lookup *map = NULL; 5261 struct extent_map_tree *em_tree; 5262 struct btrfs_block_group *block_group; 5263 struct extent_map *em; 5264 u64 start = ctl->start; 5265 u64 type = ctl->type; 5266 int ret; 5267 int i; 5268 int j; 5269 5270 map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); 5271 if (!map) 5272 return ERR_PTR(-ENOMEM); 5273 map->num_stripes = ctl->num_stripes; 5274 5275 for (i = 0; i < ctl->ndevs; ++i) { 5276 for (j = 0; j < ctl->dev_stripes; ++j) { 5277 int s = i * ctl->dev_stripes + j; 5278 map->stripes[s].dev = devices_info[i].dev; 5279 map->stripes[s].physical = devices_info[i].dev_offset + 5280 j * ctl->stripe_size; 5281 } 5282 } 5283 map->stripe_len = BTRFS_STRIPE_LEN; 5284 map->io_align = BTRFS_STRIPE_LEN; 5285 map->io_width = BTRFS_STRIPE_LEN; 5286 map->type = type; 5287 map->sub_stripes = ctl->sub_stripes; 5288 5289 trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); 5290 5291 em = alloc_extent_map(); 5292 if (!em) { 5293 kfree(map); 5294 return ERR_PTR(-ENOMEM); 5295 } 5296 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 5297 em->map_lookup = map; 5298 em->start = start; 5299 em->len = ctl->chunk_size; 5300 em->block_start = 0; 5301 em->block_len = em->len; 5302 em->orig_block_len = ctl->stripe_size; 5303 5304 em_tree = &info->mapping_tree; 5305 write_lock(&em_tree->lock); 5306 ret = add_extent_mapping(em_tree, em, 0); 5307 if (ret) { 5308 write_unlock(&em_tree->lock); 5309 free_extent_map(em); 5310 return ERR_PTR(ret); 5311 } 5312 write_unlock(&em_tree->lock); 5313 5314 block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); 5315 if (IS_ERR(block_group)) 5316 goto error_del_extent; 5317 5318 for (i = 0; i < map->num_stripes; i++) { 5319 struct btrfs_device *dev = map->stripes[i].dev; 5320 5321 btrfs_device_set_bytes_used(dev, 5322 dev->bytes_used + ctl->stripe_size); 5323 if (list_empty(&dev->post_commit_list)) 5324 list_add_tail(&dev->post_commit_list, 5325 &trans->transaction->dev_update_list); 5326 } 5327 5328 atomic64_sub(ctl->stripe_size * map->num_stripes, 5329 &info->free_chunk_space); 5330 5331 free_extent_map(em); 5332 check_raid56_incompat_flag(info, type); 5333 check_raid1c34_incompat_flag(info, type); 5334 5335 return block_group; 5336 5337 error_del_extent: 5338 write_lock(&em_tree->lock); 5339 remove_extent_mapping(em_tree, em); 5340 write_unlock(&em_tree->lock); 5341 5342 /* One for our allocation */ 5343 free_extent_map(em); 5344 /* One for the tree reference */ 5345 free_extent_map(em); 5346 5347 return block_group; 5348 } 5349 5350 struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 5351 u64 
type) 5352 { 5353 struct btrfs_fs_info *info = trans->fs_info; 5354 struct btrfs_fs_devices *fs_devices = info->fs_devices; 5355 struct btrfs_device_info *devices_info = NULL; 5356 struct alloc_chunk_ctl ctl; 5357 struct btrfs_block_group *block_group; 5358 int ret; 5359 5360 lockdep_assert_held(&info->chunk_mutex); 5361 5362 if (!alloc_profile_is_valid(type, 0)) { 5363 ASSERT(0); 5364 return ERR_PTR(-EINVAL); 5365 } 5366 5367 if (list_empty(&fs_devices->alloc_list)) { 5368 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5369 btrfs_debug(info, "%s: no writable device", __func__); 5370 return ERR_PTR(-ENOSPC); 5371 } 5372 5373 if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 5374 btrfs_err(info, "invalid chunk type 0x%llx requested", type); 5375 ASSERT(0); 5376 return ERR_PTR(-EINVAL); 5377 } 5378 5379 ctl.start = find_next_chunk(info); 5380 ctl.type = type; 5381 init_alloc_chunk_ctl(fs_devices, &ctl); 5382 5383 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 5384 GFP_NOFS); 5385 if (!devices_info) 5386 return ERR_PTR(-ENOMEM); 5387 5388 ret = gather_device_info(fs_devices, &ctl, devices_info); 5389 if (ret < 0) { 5390 block_group = ERR_PTR(ret); 5391 goto out; 5392 } 5393 5394 ret = decide_stripe_size(fs_devices, &ctl, devices_info); 5395 if (ret < 0) { 5396 block_group = ERR_PTR(ret); 5397 goto out; 5398 } 5399 5400 block_group = create_chunk(trans, &ctl, devices_info); 5401 5402 out: 5403 kfree(devices_info); 5404 return block_group; 5405 } 5406 5407 /* 5408 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the 5409 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system 5410 * chunks. 5411 * 5412 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 5413 * phases. 5414 */ 5415 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, 5416 struct btrfs_block_group *bg) 5417 { 5418 struct btrfs_fs_info *fs_info = trans->fs_info; 5419 struct btrfs_root *extent_root = fs_info->extent_root; 5420 struct btrfs_root *chunk_root = fs_info->chunk_root; 5421 struct btrfs_key key; 5422 struct btrfs_chunk *chunk; 5423 struct btrfs_stripe *stripe; 5424 struct extent_map *em; 5425 struct map_lookup *map; 5426 size_t item_size; 5427 int i; 5428 int ret; 5429 5430 /* 5431 * We take the chunk_mutex for 2 reasons: 5432 * 5433 * 1) Updates and insertions in the chunk btree must be done while holding 5434 * the chunk_mutex, as well as updating the system chunk array in the 5435 * superblock. See the comment on top of btrfs_chunk_alloc() for the 5436 * details; 5437 * 5438 * 2) To prevent races with the final phase of a device replace operation 5439 * that replaces the device object associated with the map's stripes, 5440 * because the device object's id can change at any time during that 5441 * final phase of the device replace operation 5442 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 5443 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, 5444 * which would cause a failure when updating the device item, which does 5445 * not exists, or persisting a stripe of the chunk item with such ID. 5446 * Here we can't use the device_list_mutex because our caller already 5447 * has locked the chunk_mutex, and the final phase of device replace 5448 * acquires both mutexes - first the device_list_mutex and then the 5449 * chunk_mutex. Using any of those two mutexes protects us from a 5450 * concurrent device replace. 
5451 */ 5452 lockdep_assert_held(&fs_info->chunk_mutex); 5453 5454 em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); 5455 if (IS_ERR(em)) { 5456 ret = PTR_ERR(em); 5457 btrfs_abort_transaction(trans, ret); 5458 return ret; 5459 } 5460 5461 map = em->map_lookup; 5462 item_size = btrfs_chunk_item_size(map->num_stripes); 5463 5464 chunk = kzalloc(item_size, GFP_NOFS); 5465 if (!chunk) { 5466 ret = -ENOMEM; 5467 btrfs_abort_transaction(trans, ret); 5468 goto out; 5469 } 5470 5471 for (i = 0; i < map->num_stripes; i++) { 5472 struct btrfs_device *device = map->stripes[i].dev; 5473 5474 ret = btrfs_update_device(trans, device); 5475 if (ret) 5476 goto out; 5477 } 5478 5479 stripe = &chunk->stripe; 5480 for (i = 0; i < map->num_stripes; i++) { 5481 struct btrfs_device *device = map->stripes[i].dev; 5482 const u64 dev_offset = map->stripes[i].physical; 5483 5484 btrfs_set_stack_stripe_devid(stripe, device->devid); 5485 btrfs_set_stack_stripe_offset(stripe, dev_offset); 5486 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 5487 stripe++; 5488 } 5489 5490 btrfs_set_stack_chunk_length(chunk, bg->length); 5491 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 5492 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 5493 btrfs_set_stack_chunk_type(chunk, map->type); 5494 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 5495 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 5496 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 5497 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 5498 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 5499 5500 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 5501 key.type = BTRFS_CHUNK_ITEM_KEY; 5502 key.offset = bg->start; 5503 5504 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 5505 if (ret) 5506 goto out; 5507 5508 bg->chunk_item_inserted = 1; 5509 5510 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 5511 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 5512 if (ret) 5513 goto out; 5514 } 5515 5516 out: 5517 kfree(chunk); 5518 free_extent_map(em); 5519 return ret; 5520 } 5521 5522 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) 5523 { 5524 struct btrfs_fs_info *fs_info = trans->fs_info; 5525 u64 alloc_profile; 5526 struct btrfs_block_group *meta_bg; 5527 struct btrfs_block_group *sys_bg; 5528 5529 /* 5530 * When adding a new device for sprouting, the seed device is read-only 5531 * so we must first allocate a metadata and a system chunk. But before 5532 * adding the block group items to the extent, device and chunk btrees, 5533 * we must first: 5534 * 5535 * 1) Create both chunks without doing any changes to the btrees, as 5536 * otherwise we would get -ENOSPC since the block groups from the 5537 * seed device are read-only; 5538 * 5539 * 2) Add the device item for the new sprout device - finishing the setup 5540 * of a new block group requires updating the device item in the chunk 5541 * btree, so it must exist when we attempt to do it. The previous step 5542 * ensures this does not fail with -ENOSPC. 5543 * 5544 * After that we can add the block group items to their btrees: 5545 * update existing device item in the chunk btree, add a new block group 5546 * item to the extent btree, add a new chunk item to the chunk btree and 5547 * finally add the new device extent items to the devices btree. 
5548 */ 5549 5550 alloc_profile = btrfs_metadata_alloc_profile(fs_info); 5551 meta_bg = btrfs_alloc_chunk(trans, alloc_profile); 5552 if (IS_ERR(meta_bg)) 5553 return PTR_ERR(meta_bg); 5554 5555 alloc_profile = btrfs_system_alloc_profile(fs_info); 5556 sys_bg = btrfs_alloc_chunk(trans, alloc_profile); 5557 if (IS_ERR(sys_bg)) 5558 return PTR_ERR(sys_bg); 5559 5560 return 0; 5561 } 5562 5563 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5564 { 5565 const int index = btrfs_bg_flags_to_raid_index(map->type); 5566 5567 return btrfs_raid_array[index].tolerated_failures; 5568 } 5569 5570 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5571 { 5572 struct extent_map *em; 5573 struct map_lookup *map; 5574 int readonly = 0; 5575 int miss_ndevs = 0; 5576 int i; 5577 5578 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 5579 if (IS_ERR(em)) 5580 return 1; 5581 5582 map = em->map_lookup; 5583 for (i = 0; i < map->num_stripes; i++) { 5584 if (test_bit(BTRFS_DEV_STATE_MISSING, 5585 &map->stripes[i].dev->dev_state)) { 5586 miss_ndevs++; 5587 continue; 5588 } 5589 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5590 &map->stripes[i].dev->dev_state)) { 5591 readonly = 1; 5592 goto end; 5593 } 5594 } 5595 5596 /* 5597 * If the number of missing devices is larger than max errors, 5598 * we can not write the data into that chunk successfully, so 5599 * set it readonly. 5600 */ 5601 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5602 readonly = 1; 5603 end: 5604 free_extent_map(em); 5605 return readonly; 5606 } 5607 5608 void btrfs_mapping_tree_free(struct extent_map_tree *tree) 5609 { 5610 struct extent_map *em; 5611 5612 while (1) { 5613 write_lock(&tree->lock); 5614 em = lookup_extent_mapping(tree, 0, (u64)-1); 5615 if (em) 5616 remove_extent_mapping(tree, em); 5617 write_unlock(&tree->lock); 5618 if (!em) 5619 break; 5620 /* once for us */ 5621 free_extent_map(em); 5622 /* once for the tree */ 5623 free_extent_map(em); 5624 } 5625 } 5626 5627 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5628 { 5629 struct extent_map *em; 5630 struct map_lookup *map; 5631 int ret; 5632 5633 em = btrfs_get_chunk_map(fs_info, logical, len); 5634 if (IS_ERR(em)) 5635 /* 5636 * We could return errors for these cases, but that could get 5637 * ugly and we'd probably do the same thing which is just not do 5638 * anything else and exit, so return 1 so the callers don't try 5639 * to use other copies. 5640 */ 5641 return 1; 5642 5643 map = em->map_lookup; 5644 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) 5645 ret = map->num_stripes; 5646 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5647 ret = map->sub_stripes; 5648 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5649 ret = 2; 5650 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5651 /* 5652 * There could be two corrupted data stripes, we need 5653 * to loop retry in order to rebuild the correct data. 5654 * 5655 * Fail a stripe at a time on every retry except the 5656 * stripe under reconstruction. 
5657 */ 5658 ret = map->num_stripes; 5659 else 5660 ret = 1; 5661 free_extent_map(em); 5662 5663 down_read(&fs_info->dev_replace.rwsem); 5664 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 5665 fs_info->dev_replace.tgtdev) 5666 ret++; 5667 up_read(&fs_info->dev_replace.rwsem); 5668 5669 return ret; 5670 } 5671 5672 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 5673 u64 logical) 5674 { 5675 struct extent_map *em; 5676 struct map_lookup *map; 5677 unsigned long len = fs_info->sectorsize; 5678 5679 em = btrfs_get_chunk_map(fs_info, logical, len); 5680 5681 if (!WARN_ON(IS_ERR(em))) { 5682 map = em->map_lookup; 5683 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5684 len = map->stripe_len * nr_data_stripes(map); 5685 free_extent_map(em); 5686 } 5687 return len; 5688 } 5689 5690 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5691 { 5692 struct extent_map *em; 5693 struct map_lookup *map; 5694 int ret = 0; 5695 5696 em = btrfs_get_chunk_map(fs_info, logical, len); 5697 5698 if(!WARN_ON(IS_ERR(em))) { 5699 map = em->map_lookup; 5700 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5701 ret = 1; 5702 free_extent_map(em); 5703 } 5704 return ret; 5705 } 5706 5707 static int find_live_mirror(struct btrfs_fs_info *fs_info, 5708 struct map_lookup *map, int first, 5709 int dev_replace_is_ongoing) 5710 { 5711 int i; 5712 int num_stripes; 5713 int preferred_mirror; 5714 int tolerance; 5715 struct btrfs_device *srcdev; 5716 5717 ASSERT((map->type & 5718 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); 5719 5720 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5721 num_stripes = map->sub_stripes; 5722 else 5723 num_stripes = map->num_stripes; 5724 5725 switch (fs_info->fs_devices->read_policy) { 5726 default: 5727 /* Shouldn't happen, just warn and use pid instead of failing */ 5728 btrfs_warn_rl(fs_info, 5729 "unknown read_policy type %u, reset to pid", 5730 fs_info->fs_devices->read_policy); 5731 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; 5732 fallthrough; 5733 case BTRFS_READ_POLICY_PID: 5734 preferred_mirror = first + (current->pid % num_stripes); 5735 break; 5736 } 5737 5738 if (dev_replace_is_ongoing && 5739 fs_info->dev_replace.cont_reading_from_srcdev_mode == 5740 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 5741 srcdev = fs_info->dev_replace.srcdev; 5742 else 5743 srcdev = NULL; 5744 5745 /* 5746 * try to avoid the drive that is the source drive for a 5747 * dev-replace procedure, only choose it if no other non-missing 5748 * mirror is available 5749 */ 5750 for (tolerance = 0; tolerance < 2; tolerance++) { 5751 if (map->stripes[preferred_mirror].dev->bdev && 5752 (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 5753 return preferred_mirror; 5754 for (i = first; i < first + num_stripes; i++) { 5755 if (map->stripes[i].dev->bdev && 5756 (tolerance || map->stripes[i].dev != srcdev)) 5757 return i; 5758 } 5759 } 5760 5761 /* we couldn't find one that doesn't fail. 
Just return something 5762 * and the io error handling code will clean up eventually 5763 */ 5764 return preferred_mirror; 5765 } 5766 5767 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 5768 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) 5769 { 5770 int i; 5771 int again = 1; 5772 5773 while (again) { 5774 again = 0; 5775 for (i = 0; i < num_stripes - 1; i++) { 5776 /* Swap if parity is on a smaller index */ 5777 if (bbio->raid_map[i] > bbio->raid_map[i + 1]) { 5778 swap(bbio->stripes[i], bbio->stripes[i + 1]); 5779 swap(bbio->raid_map[i], bbio->raid_map[i + 1]); 5780 again = 1; 5781 } 5782 } 5783 } 5784 } 5785 5786 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) 5787 { 5788 struct btrfs_bio *bbio = kzalloc( 5789 /* the size of the btrfs_bio */ 5790 sizeof(struct btrfs_bio) + 5791 /* plus the variable array for the stripes */ 5792 sizeof(struct btrfs_bio_stripe) * (total_stripes) + 5793 /* plus the variable array for the tgt dev */ 5794 sizeof(int) * (real_stripes) + 5795 /* 5796 * plus the raid_map, which includes both the tgt dev 5797 * and the stripes 5798 */ 5799 sizeof(u64) * (total_stripes), 5800 GFP_NOFS|__GFP_NOFAIL); 5801 5802 atomic_set(&bbio->error, 0); 5803 refcount_set(&bbio->refs, 1); 5804 5805 bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes); 5806 bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes); 5807 5808 return bbio; 5809 } 5810 5811 void btrfs_get_bbio(struct btrfs_bio *bbio) 5812 { 5813 WARN_ON(!refcount_read(&bbio->refs)); 5814 refcount_inc(&bbio->refs); 5815 } 5816 5817 void btrfs_put_bbio(struct btrfs_bio *bbio) 5818 { 5819 if (!bbio) 5820 return; 5821 if (refcount_dec_and_test(&bbio->refs)) 5822 kfree(bbio); 5823 } 5824 5825 /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ 5826 /* 5827 * Please note that, discard won't be sent to target device of device 5828 * replace. 
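 *
 * (__btrfs_map_block_for_discard() below calls alloc_btrfs_bio() with
 * real_stripes == 0, so no target device mapping is set up for the bbio.)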
5829 */ 5830 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, 5831 u64 logical, u64 *length_ret, 5832 struct btrfs_bio **bbio_ret) 5833 { 5834 struct extent_map *em; 5835 struct map_lookup *map; 5836 struct btrfs_bio *bbio; 5837 u64 length = *length_ret; 5838 u64 offset; 5839 u64 stripe_nr; 5840 u64 stripe_nr_end; 5841 u64 stripe_end_offset; 5842 u64 stripe_cnt; 5843 u64 stripe_len; 5844 u64 stripe_offset; 5845 u64 num_stripes; 5846 u32 stripe_index; 5847 u32 factor = 0; 5848 u32 sub_stripes = 0; 5849 u64 stripes_per_dev = 0; 5850 u32 remaining_stripes = 0; 5851 u32 last_stripe = 0; 5852 int ret = 0; 5853 int i; 5854 5855 /* discard always return a bbio */ 5856 ASSERT(bbio_ret); 5857 5858 em = btrfs_get_chunk_map(fs_info, logical, length); 5859 if (IS_ERR(em)) 5860 return PTR_ERR(em); 5861 5862 map = em->map_lookup; 5863 /* we don't discard raid56 yet */ 5864 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5865 ret = -EOPNOTSUPP; 5866 goto out; 5867 } 5868 5869 offset = logical - em->start; 5870 length = min_t(u64, em->start + em->len - logical, length); 5871 *length_ret = length; 5872 5873 stripe_len = map->stripe_len; 5874 /* 5875 * stripe_nr counts the total number of stripes we have to stride 5876 * to get to this block 5877 */ 5878 stripe_nr = div64_u64(offset, stripe_len); 5879 5880 /* stripe_offset is the offset of this block in its stripe */ 5881 stripe_offset = offset - stripe_nr * stripe_len; 5882 5883 stripe_nr_end = round_up(offset + length, map->stripe_len); 5884 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 5885 stripe_cnt = stripe_nr_end - stripe_nr; 5886 stripe_end_offset = stripe_nr_end * map->stripe_len - 5887 (offset + length); 5888 /* 5889 * after this, stripe_nr is the number of stripes on this 5890 * device we have to walk to find the data, and stripe_index is 5891 * the number of our device in the stripe array 5892 */ 5893 num_stripes = 1; 5894 stripe_index = 0; 5895 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5896 BTRFS_BLOCK_GROUP_RAID10)) { 5897 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5898 sub_stripes = 1; 5899 else 5900 sub_stripes = map->sub_stripes; 5901 5902 factor = map->num_stripes / sub_stripes; 5903 num_stripes = min_t(u64, map->num_stripes, 5904 sub_stripes * stripe_cnt); 5905 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5906 stripe_index *= sub_stripes; 5907 stripes_per_dev = div_u64_rem(stripe_cnt, factor, 5908 &remaining_stripes); 5909 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 5910 last_stripe *= sub_stripes; 5911 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | 5912 BTRFS_BLOCK_GROUP_DUP)) { 5913 num_stripes = map->num_stripes; 5914 } else { 5915 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5916 &stripe_index); 5917 } 5918 5919 bbio = alloc_btrfs_bio(num_stripes, 0); 5920 if (!bbio) { 5921 ret = -ENOMEM; 5922 goto out; 5923 } 5924 5925 for (i = 0; i < num_stripes; i++) { 5926 bbio->stripes[i].physical = 5927 map->stripes[stripe_index].physical + 5928 stripe_offset + stripe_nr * map->stripe_len; 5929 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 5930 5931 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5932 BTRFS_BLOCK_GROUP_RAID10)) { 5933 bbio->stripes[i].length = stripes_per_dev * 5934 map->stripe_len; 5935 5936 if (i / sub_stripes < remaining_stripes) 5937 bbio->stripes[i].length += 5938 map->stripe_len; 5939 5940 /* 5941 * Special for the first stripe and 5942 * the last stripe: 5943 * 5944 * |-------|...|-------| 5945 * |----------| 5946 * off end_off 5947 */ 5948 if 
(i < sub_stripes) 5949 bbio->stripes[i].length -= 5950 stripe_offset; 5951 5952 if (stripe_index >= last_stripe && 5953 stripe_index <= (last_stripe + 5954 sub_stripes - 1)) 5955 bbio->stripes[i].length -= 5956 stripe_end_offset; 5957 5958 if (i == sub_stripes - 1) 5959 stripe_offset = 0; 5960 } else { 5961 bbio->stripes[i].length = length; 5962 } 5963 5964 stripe_index++; 5965 if (stripe_index == map->num_stripes) { 5966 stripe_index = 0; 5967 stripe_nr++; 5968 } 5969 } 5970 5971 *bbio_ret = bbio; 5972 bbio->map_type = map->type; 5973 bbio->num_stripes = num_stripes; 5974 out: 5975 free_extent_map(em); 5976 return ret; 5977 } 5978 5979 /* 5980 * In dev-replace case, for repair case (that's the only case where the mirror 5981 * is selected explicitly when calling btrfs_map_block), blocks left of the 5982 * left cursor can also be read from the target drive. 5983 * 5984 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the 5985 * array of stripes. 5986 * For READ, it also needs to be supported using the same mirror number. 5987 * 5988 * If the requested block is not left of the left cursor, EIO is returned. This 5989 * can happen because btrfs_num_copies() returns one more in the dev-replace 5990 * case. 5991 */ 5992 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, 5993 u64 logical, u64 length, 5994 u64 srcdev_devid, int *mirror_num, 5995 u64 *physical) 5996 { 5997 struct btrfs_bio *bbio = NULL; 5998 int num_stripes; 5999 int index_srcdev = 0; 6000 int found = 0; 6001 u64 physical_of_found = 0; 6002 int i; 6003 int ret = 0; 6004 6005 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 6006 logical, &length, &bbio, 0, 0); 6007 if (ret) { 6008 ASSERT(bbio == NULL); 6009 return ret; 6010 } 6011 6012 num_stripes = bbio->num_stripes; 6013 if (*mirror_num > num_stripes) { 6014 /* 6015 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, 6016 * that means that the requested area is not left of the left 6017 * cursor 6018 */ 6019 btrfs_put_bbio(bbio); 6020 return -EIO; 6021 } 6022 6023 /* 6024 * process the rest of the function using the mirror_num of the source 6025 * drive. Therefore look it up first. At the end, patch the device 6026 * pointer to the one of the target drive. 
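 *
 * Rough outline of the loop below: for each stripe returned by
 * BTRFS_MAP_GET_READ_MIRRORS, skip stripes that are not on
 * srcdev_devid; for DUP keep only the copy with the lowest physical
 * address. The winner yields *mirror_num = index + 1 and *physical,
 * and the caller later patches that stripe to point at the replace
 * target device.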
6027 */ 6028 for (i = 0; i < num_stripes; i++) { 6029 if (bbio->stripes[i].dev->devid != srcdev_devid) 6030 continue; 6031 6032 /* 6033 * In case of DUP, in order to keep it simple, only add the 6034 * mirror with the lowest physical address 6035 */ 6036 if (found && 6037 physical_of_found <= bbio->stripes[i].physical) 6038 continue; 6039 6040 index_srcdev = i; 6041 found = 1; 6042 physical_of_found = bbio->stripes[i].physical; 6043 } 6044 6045 btrfs_put_bbio(bbio); 6046 6047 ASSERT(found); 6048 if (!found) 6049 return -EIO; 6050 6051 *mirror_num = index_srcdev + 1; 6052 *physical = physical_of_found; 6053 return ret; 6054 } 6055 6056 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) 6057 { 6058 struct btrfs_block_group *cache; 6059 bool ret; 6060 6061 /* Non zoned filesystem does not use "to_copy" flag */ 6062 if (!btrfs_is_zoned(fs_info)) 6063 return false; 6064 6065 cache = btrfs_lookup_block_group(fs_info, logical); 6066 6067 spin_lock(&cache->lock); 6068 ret = cache->to_copy; 6069 spin_unlock(&cache->lock); 6070 6071 btrfs_put_block_group(cache); 6072 return ret; 6073 } 6074 6075 static void handle_ops_on_dev_replace(enum btrfs_map_op op, 6076 struct btrfs_bio **bbio_ret, 6077 struct btrfs_dev_replace *dev_replace, 6078 u64 logical, 6079 int *num_stripes_ret, int *max_errors_ret) 6080 { 6081 struct btrfs_bio *bbio = *bbio_ret; 6082 u64 srcdev_devid = dev_replace->srcdev->devid; 6083 int tgtdev_indexes = 0; 6084 int num_stripes = *num_stripes_ret; 6085 int max_errors = *max_errors_ret; 6086 int i; 6087 6088 if (op == BTRFS_MAP_WRITE) { 6089 int index_where_to_add; 6090 6091 /* 6092 * A block group which have "to_copy" set will eventually 6093 * copied by dev-replace process. We can avoid cloning IO here. 6094 */ 6095 if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) 6096 return; 6097 6098 /* 6099 * duplicate the write operations while the dev replace 6100 * procedure is running. Since the copying of the old disk to 6101 * the new disk takes place at run time while the filesystem is 6102 * mounted writable, the regular write operations to the old 6103 * disk have to be duplicated to go to the new disk as well. 6104 * 6105 * Note that device->missing is handled by the caller, and that 6106 * the write to the old disk is already set up in the stripes 6107 * array. 6108 */ 6109 index_where_to_add = num_stripes; 6110 for (i = 0; i < num_stripes; i++) { 6111 if (bbio->stripes[i].dev->devid == srcdev_devid) { 6112 /* write to new disk, too */ 6113 struct btrfs_bio_stripe *new = 6114 bbio->stripes + index_where_to_add; 6115 struct btrfs_bio_stripe *old = 6116 bbio->stripes + i; 6117 6118 new->physical = old->physical; 6119 new->length = old->length; 6120 new->dev = dev_replace->tgtdev; 6121 bbio->tgtdev_map[i] = index_where_to_add; 6122 index_where_to_add++; 6123 max_errors++; 6124 tgtdev_indexes++; 6125 } 6126 } 6127 num_stripes = index_where_to_add; 6128 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 6129 int index_srcdev = 0; 6130 int found = 0; 6131 u64 physical_of_found = 0; 6132 6133 /* 6134 * During the dev-replace procedure, the target drive can also 6135 * be used to read data in case it is needed to repair a corrupt 6136 * block elsewhere. This is possible if the requested area is 6137 * left of the left cursor. In this area, the target drive is a 6138 * full copy of the source drive. 
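 *
 * Sketch of what the code below does: locate the stripe that lives on
 * the source device (for DUP, the copy with the lowest physical
 * address), then append one extra stripe with the same physical offset
 * and length but pointing at dev_replace->tgtdev, and record the
 * mapping in bbio->tgtdev_map[].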
6139 */ 6140 for (i = 0; i < num_stripes; i++) { 6141 if (bbio->stripes[i].dev->devid == srcdev_devid) { 6142 /* 6143 * In case of DUP, in order to keep it simple, 6144 * only add the mirror with the lowest physical 6145 * address 6146 */ 6147 if (found && 6148 physical_of_found <= 6149 bbio->stripes[i].physical) 6150 continue; 6151 index_srcdev = i; 6152 found = 1; 6153 physical_of_found = bbio->stripes[i].physical; 6154 } 6155 } 6156 if (found) { 6157 struct btrfs_bio_stripe *tgtdev_stripe = 6158 bbio->stripes + num_stripes; 6159 6160 tgtdev_stripe->physical = physical_of_found; 6161 tgtdev_stripe->length = 6162 bbio->stripes[index_srcdev].length; 6163 tgtdev_stripe->dev = dev_replace->tgtdev; 6164 bbio->tgtdev_map[index_srcdev] = num_stripes; 6165 6166 tgtdev_indexes++; 6167 num_stripes++; 6168 } 6169 } 6170 6171 *num_stripes_ret = num_stripes; 6172 *max_errors_ret = max_errors; 6173 bbio->num_tgtdevs = tgtdev_indexes; 6174 *bbio_ret = bbio; 6175 } 6176 6177 static bool need_full_stripe(enum btrfs_map_op op) 6178 { 6179 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 6180 } 6181 6182 /* 6183 * Calculate the geometry of a particular (address, len) tuple. This 6184 * information is used to calculate how big a particular bio can get before it 6185 * straddles a stripe. 6186 * 6187 * @fs_info: the filesystem 6188 * @em: mapping containing the logical extent 6189 * @op: type of operation - write or read 6190 * @logical: address that we want to figure out the geometry of 6191 * @io_geom: pointer used to return values 6192 * 6193 * Returns < 0 in case a chunk for the given logical address cannot be found, 6194 * usually shouldn't happen unless @logical is corrupted, 0 otherwise. 6195 */ 6196 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, 6197 enum btrfs_map_op op, u64 logical, 6198 struct btrfs_io_geometry *io_geom) 6199 { 6200 struct map_lookup *map; 6201 u64 len; 6202 u64 offset; 6203 u64 stripe_offset; 6204 u64 stripe_nr; 6205 u64 stripe_len; 6206 u64 raid56_full_stripe_start = (u64)-1; 6207 int data_stripes; 6208 6209 ASSERT(op != BTRFS_MAP_DISCARD); 6210 6211 map = em->map_lookup; 6212 /* Offset of this logical address in the chunk */ 6213 offset = logical - em->start; 6214 /* Len of a stripe in a chunk */ 6215 stripe_len = map->stripe_len; 6216 /* Stripe where this block falls in */ 6217 stripe_nr = div64_u64(offset, stripe_len); 6218 /* Offset of stripe in the chunk */ 6219 stripe_offset = stripe_nr * stripe_len; 6220 if (offset < stripe_offset) { 6221 btrfs_crit(fs_info, 6222 "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", 6223 stripe_offset, offset, em->start, logical, stripe_len); 6224 return -EINVAL; 6225 } 6226 6227 /* stripe_offset is the offset of this block in its stripe */ 6228 stripe_offset = offset - stripe_offset; 6229 data_stripes = nr_data_stripes(map); 6230 6231 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 6232 u64 max_len = stripe_len - stripe_offset; 6233 6234 /* 6235 * In case of raid56, we need to know the stripe aligned start 6236 */ 6237 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6238 unsigned long full_stripe_len = stripe_len * data_stripes; 6239 raid56_full_stripe_start = offset; 6240 6241 /* 6242 * Allow a write of a full stripe, but make sure we 6243 * don't allow straddling of stripes 6244 */ 6245 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 6246 full_stripe_len); 6247 raid56_full_stripe_start *= full_stripe_len; 6248 6249 /* 
6250 * For writes to RAID[56], allow a full stripeset across 6251 * all disks. For other RAID types and for RAID[56] 6252 * reads, just allow a single stripe (on a single disk). 6253 */ 6254 if (op == BTRFS_MAP_WRITE) { 6255 max_len = stripe_len * data_stripes - 6256 (offset - raid56_full_stripe_start); 6257 } 6258 } 6259 len = min_t(u64, em->len - offset, max_len); 6260 } else { 6261 len = em->len - offset; 6262 } 6263 6264 io_geom->len = len; 6265 io_geom->offset = offset; 6266 io_geom->stripe_len = stripe_len; 6267 io_geom->stripe_nr = stripe_nr; 6268 io_geom->stripe_offset = stripe_offset; 6269 io_geom->raid56_stripe_offset = raid56_full_stripe_start; 6270 6271 return 0; 6272 } 6273 6274 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 6275 enum btrfs_map_op op, 6276 u64 logical, u64 *length, 6277 struct btrfs_bio **bbio_ret, 6278 int mirror_num, int need_raid_map) 6279 { 6280 struct extent_map *em; 6281 struct map_lookup *map; 6282 u64 stripe_offset; 6283 u64 stripe_nr; 6284 u64 stripe_len; 6285 u32 stripe_index; 6286 int data_stripes; 6287 int i; 6288 int ret = 0; 6289 int num_stripes; 6290 int max_errors = 0; 6291 int tgtdev_indexes = 0; 6292 struct btrfs_bio *bbio = NULL; 6293 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 6294 int dev_replace_is_ongoing = 0; 6295 int num_alloc_stripes; 6296 int patch_the_first_stripe_for_dev_replace = 0; 6297 u64 physical_to_patch_in_first_stripe = 0; 6298 u64 raid56_full_stripe_start = (u64)-1; 6299 struct btrfs_io_geometry geom; 6300 6301 ASSERT(bbio_ret); 6302 ASSERT(op != BTRFS_MAP_DISCARD); 6303 6304 em = btrfs_get_chunk_map(fs_info, logical, *length); 6305 ASSERT(!IS_ERR(em)); 6306 6307 ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); 6308 if (ret < 0) 6309 return ret; 6310 6311 map = em->map_lookup; 6312 6313 *length = geom.len; 6314 stripe_len = geom.stripe_len; 6315 stripe_nr = geom.stripe_nr; 6316 stripe_offset = geom.stripe_offset; 6317 raid56_full_stripe_start = geom.raid56_stripe_offset; 6318 data_stripes = nr_data_stripes(map); 6319 6320 down_read(&dev_replace->rwsem); 6321 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 6322 /* 6323 * Hold the semaphore for read during the whole operation, write is 6324 * requested at commit time but must wait. 
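 *
 * Illustrative outline of the locking below (error paths omitted):
 *   down_read(&dev_replace->rwsem);
 *   ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
 *   if (!ongoing)
 *           up_read(&dev_replace->rwsem);
 *   ... map the block, possibly adding target device stripes ...
 *   if (ongoing)
 *           up_read(&dev_replace->rwsem);
 * where the final unlock happens at the 'out' label at the end of this
 * function.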
6325 */ 6326 if (!dev_replace_is_ongoing) 6327 up_read(&dev_replace->rwsem); 6328 6329 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 6330 !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 6331 ret = get_extra_mirror_from_replace(fs_info, logical, *length, 6332 dev_replace->srcdev->devid, 6333 &mirror_num, 6334 &physical_to_patch_in_first_stripe); 6335 if (ret) 6336 goto out; 6337 else 6338 patch_the_first_stripe_for_dev_replace = 1; 6339 } else if (mirror_num > map->num_stripes) { 6340 mirror_num = 0; 6341 } 6342 6343 num_stripes = 1; 6344 stripe_index = 0; 6345 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 6346 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6347 &stripe_index); 6348 if (!need_full_stripe(op)) 6349 mirror_num = 1; 6350 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { 6351 if (need_full_stripe(op)) 6352 num_stripes = map->num_stripes; 6353 else if (mirror_num) 6354 stripe_index = mirror_num - 1; 6355 else { 6356 stripe_index = find_live_mirror(fs_info, map, 0, 6357 dev_replace_is_ongoing); 6358 mirror_num = stripe_index + 1; 6359 } 6360 6361 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 6362 if (need_full_stripe(op)) { 6363 num_stripes = map->num_stripes; 6364 } else if (mirror_num) { 6365 stripe_index = mirror_num - 1; 6366 } else { 6367 mirror_num = 1; 6368 } 6369 6370 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 6371 u32 factor = map->num_stripes / map->sub_stripes; 6372 6373 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6374 stripe_index *= map->sub_stripes; 6375 6376 if (need_full_stripe(op)) 6377 num_stripes = map->sub_stripes; 6378 else if (mirror_num) 6379 stripe_index += mirror_num - 1; 6380 else { 6381 int old_stripe_index = stripe_index; 6382 stripe_index = find_live_mirror(fs_info, map, 6383 stripe_index, 6384 dev_replace_is_ongoing); 6385 mirror_num = stripe_index - old_stripe_index + 1; 6386 } 6387 6388 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6389 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 6390 /* push stripe_nr back to the start of the full stripe */ 6391 stripe_nr = div64_u64(raid56_full_stripe_start, 6392 stripe_len * data_stripes); 6393 6394 /* RAID[56] write or recovery. Return all stripes */ 6395 num_stripes = map->num_stripes; 6396 max_errors = nr_parity_stripes(map); 6397 6398 *length = map->stripe_len; 6399 stripe_index = 0; 6400 stripe_offset = 0; 6401 } else { 6402 /* 6403 * Mirror #0 or #1 means the original data block. 6404 * Mirror #2 is RAID5 parity block. 6405 * Mirror #3 is RAID6 Q block. 
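 *
 * Worked example (illustrative, assuming a 4-device RAID5 chunk, i.e.
 * data_stripes = 3): for stripe_nr = 7 and mirror_num <= 1,
 *   stripe_nr    = 7 / 3 = 2,  stripe_index = 7 % 3 = 1
 * and after the parity rotation below
 *   stripe_index = (2 + 1) % 4 = 3
 * so the data copy of this block sits on the chunk's fourth stripe.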
6406 */ 6407 stripe_nr = div_u64_rem(stripe_nr, 6408 data_stripes, &stripe_index); 6409 if (mirror_num > 1) 6410 stripe_index = data_stripes + mirror_num - 2; 6411 6412 /* We distribute the parity blocks across stripes */ 6413 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 6414 &stripe_index); 6415 if (!need_full_stripe(op) && mirror_num <= 1) 6416 mirror_num = 1; 6417 } 6418 } else { 6419 /* 6420 * after this, stripe_nr is the number of stripes on this 6421 * device we have to walk to find the data, and stripe_index is 6422 * the number of our device in the stripe array 6423 */ 6424 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6425 &stripe_index); 6426 mirror_num = stripe_index + 1; 6427 } 6428 if (stripe_index >= map->num_stripes) { 6429 btrfs_crit(fs_info, 6430 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 6431 stripe_index, map->num_stripes); 6432 ret = -EINVAL; 6433 goto out; 6434 } 6435 6436 num_alloc_stripes = num_stripes; 6437 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 6438 if (op == BTRFS_MAP_WRITE) 6439 num_alloc_stripes <<= 1; 6440 if (op == BTRFS_MAP_GET_READ_MIRRORS) 6441 num_alloc_stripes++; 6442 tgtdev_indexes = num_stripes; 6443 } 6444 6445 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); 6446 if (!bbio) { 6447 ret = -ENOMEM; 6448 goto out; 6449 } 6450 6451 for (i = 0; i < num_stripes; i++) { 6452 bbio->stripes[i].physical = map->stripes[stripe_index].physical + 6453 stripe_offset + stripe_nr * map->stripe_len; 6454 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 6455 stripe_index++; 6456 } 6457 6458 /* build raid_map */ 6459 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 6460 (need_full_stripe(op) || mirror_num > 1)) { 6461 u64 tmp; 6462 unsigned rot; 6463 6464 /* Work out the disk rotation on this stripe-set */ 6465 div_u64_rem(stripe_nr, num_stripes, &rot); 6466 6467 /* Fill in the logical address of each stripe */ 6468 tmp = stripe_nr * data_stripes; 6469 for (i = 0; i < data_stripes; i++) 6470 bbio->raid_map[(i+rot) % num_stripes] = 6471 em->start + (tmp + i) * map->stripe_len; 6472 6473 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 6474 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 6475 bbio->raid_map[(i+rot+1) % num_stripes] = 6476 RAID6_Q_STRIPE; 6477 6478 sort_parity_stripes(bbio, num_stripes); 6479 } 6480 6481 if (need_full_stripe(op)) 6482 max_errors = btrfs_chunk_max_errors(map); 6483 6484 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 6485 need_full_stripe(op)) { 6486 handle_ops_on_dev_replace(op, &bbio, dev_replace, logical, 6487 &num_stripes, &max_errors); 6488 } 6489 6490 *bbio_ret = bbio; 6491 bbio->map_type = map->type; 6492 bbio->num_stripes = num_stripes; 6493 bbio->max_errors = max_errors; 6494 bbio->mirror_num = mirror_num; 6495 6496 /* 6497 * this is the case that REQ_READ && dev_replace_is_ongoing && 6498 * mirror_num == num_stripes + 1 && dev_replace target drive is 6499 * available as a mirror 6500 */ 6501 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 6502 WARN_ON(num_stripes > 1); 6503 bbio->stripes[0].dev = dev_replace->tgtdev; 6504 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 6505 bbio->mirror_num = map->num_stripes + 1; 6506 } 6507 out: 6508 if (dev_replace_is_ongoing) { 6509 lockdep_assert_held(&dev_replace->rwsem); 6510 /* Unlock and let waiting writers proceed */ 6511 up_read(&dev_replace->rwsem); 6512 } 6513 free_extent_map(em); 6514 return ret; 6515 } 6516 6517 int 
btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6518 u64 logical, u64 *length, 6519 struct btrfs_bio **bbio_ret, int mirror_num) 6520 { 6521 if (op == BTRFS_MAP_DISCARD) 6522 return __btrfs_map_block_for_discard(fs_info, logical, 6523 length, bbio_ret); 6524 6525 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 6526 mirror_num, 0); 6527 } 6528 6529 /* For Scrub/replace */ 6530 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6531 u64 logical, u64 *length, 6532 struct btrfs_bio **bbio_ret) 6533 { 6534 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); 6535 } 6536 6537 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) 6538 { 6539 bio->bi_private = bbio->private; 6540 bio->bi_end_io = bbio->end_io; 6541 bio_endio(bio); 6542 6543 btrfs_put_bbio(bbio); 6544 } 6545 6546 static void btrfs_end_bio(struct bio *bio) 6547 { 6548 struct btrfs_bio *bbio = bio->bi_private; 6549 int is_orig_bio = 0; 6550 6551 if (bio->bi_status) { 6552 atomic_inc(&bbio->error); 6553 if (bio->bi_status == BLK_STS_IOERR || 6554 bio->bi_status == BLK_STS_TARGET) { 6555 struct btrfs_device *dev = btrfs_io_bio(bio)->device; 6556 6557 ASSERT(dev->bdev); 6558 if (btrfs_op(bio) == BTRFS_MAP_WRITE) 6559 btrfs_dev_stat_inc_and_print(dev, 6560 BTRFS_DEV_STAT_WRITE_ERRS); 6561 else if (!(bio->bi_opf & REQ_RAHEAD)) 6562 btrfs_dev_stat_inc_and_print(dev, 6563 BTRFS_DEV_STAT_READ_ERRS); 6564 if (bio->bi_opf & REQ_PREFLUSH) 6565 btrfs_dev_stat_inc_and_print(dev, 6566 BTRFS_DEV_STAT_FLUSH_ERRS); 6567 } 6568 } 6569 6570 if (bio == bbio->orig_bio) 6571 is_orig_bio = 1; 6572 6573 btrfs_bio_counter_dec(bbio->fs_info); 6574 6575 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6576 if (!is_orig_bio) { 6577 bio_put(bio); 6578 bio = bbio->orig_bio; 6579 } 6580 6581 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6582 /* only send an error to the higher layers if it is 6583 * beyond the tolerance of the btrfs bio 6584 */ 6585 if (atomic_read(&bbio->error) > bbio->max_errors) { 6586 bio->bi_status = BLK_STS_IOERR; 6587 } else { 6588 /* 6589 * this bio is actually up to date, we didn't 6590 * go over the max number of errors 6591 */ 6592 bio->bi_status = BLK_STS_OK; 6593 } 6594 6595 btrfs_end_bbio(bbio, bio); 6596 } else if (!is_orig_bio) { 6597 bio_put(bio); 6598 } 6599 } 6600 6601 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, 6602 u64 physical, struct btrfs_device *dev) 6603 { 6604 struct btrfs_fs_info *fs_info = bbio->fs_info; 6605 6606 bio->bi_private = bbio; 6607 btrfs_io_bio(bio)->device = dev; 6608 bio->bi_end_io = btrfs_end_bio; 6609 bio->bi_iter.bi_sector = physical >> 9; 6610 /* 6611 * For zone append writing, bi_sector must point the beginning of the 6612 * zone 6613 */ 6614 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 6615 if (btrfs_dev_is_sequential(dev, physical)) { 6616 u64 zone_start = round_down(physical, fs_info->zone_size); 6617 6618 bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; 6619 } else { 6620 bio->bi_opf &= ~REQ_OP_ZONE_APPEND; 6621 bio->bi_opf |= REQ_OP_WRITE; 6622 } 6623 } 6624 btrfs_debug_in_rcu(fs_info, 6625 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6626 bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, 6627 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), 6628 dev->devid, bio->bi_iter.bi_size); 6629 bio_set_dev(bio, dev->bdev); 6630 6631 btrfs_bio_counter_inc_noblocked(fs_info); 6632 6633 btrfsic_submit_bio(bio); 6634 } 6635 6636 static 
void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 6637 { 6638 atomic_inc(&bbio->error); 6639 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6640 /* Should be the original bio. */ 6641 WARN_ON(bio != bbio->orig_bio); 6642 6643 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6644 bio->bi_iter.bi_sector = logical >> 9; 6645 if (atomic_read(&bbio->error) > bbio->max_errors) 6646 bio->bi_status = BLK_STS_IOERR; 6647 else 6648 bio->bi_status = BLK_STS_OK; 6649 btrfs_end_bbio(bbio, bio); 6650 } 6651 } 6652 6653 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 6654 int mirror_num) 6655 { 6656 struct btrfs_device *dev; 6657 struct bio *first_bio = bio; 6658 u64 logical = bio->bi_iter.bi_sector << 9; 6659 u64 length = 0; 6660 u64 map_length; 6661 int ret; 6662 int dev_nr; 6663 int total_devs; 6664 struct btrfs_bio *bbio = NULL; 6665 6666 length = bio->bi_iter.bi_size; 6667 map_length = length; 6668 6669 btrfs_bio_counter_inc_blocked(fs_info); 6670 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, 6671 &map_length, &bbio, mirror_num, 1); 6672 if (ret) { 6673 btrfs_bio_counter_dec(fs_info); 6674 return errno_to_blk_status(ret); 6675 } 6676 6677 total_devs = bbio->num_stripes; 6678 bbio->orig_bio = first_bio; 6679 bbio->private = first_bio->bi_private; 6680 bbio->end_io = first_bio->bi_end_io; 6681 bbio->fs_info = fs_info; 6682 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 6683 6684 if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 6685 ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { 6686 /* In this case, map_length has been set to the length of 6687 a single stripe; not the whole write */ 6688 if (btrfs_op(bio) == BTRFS_MAP_WRITE) { 6689 ret = raid56_parity_write(fs_info, bio, bbio, 6690 map_length); 6691 } else { 6692 ret = raid56_parity_recover(fs_info, bio, bbio, 6693 map_length, mirror_num, 1); 6694 } 6695 6696 btrfs_bio_counter_dec(fs_info); 6697 return errno_to_blk_status(ret); 6698 } 6699 6700 if (map_length < length) { 6701 btrfs_crit(fs_info, 6702 "mapping failed logical %llu bio len %llu len %llu", 6703 logical, length, map_length); 6704 BUG(); 6705 } 6706 6707 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { 6708 dev = bbio->stripes[dev_nr].dev; 6709 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, 6710 &dev->dev_state) || 6711 (btrfs_op(first_bio) == BTRFS_MAP_WRITE && 6712 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { 6713 bbio_error(bbio, first_bio, logical); 6714 continue; 6715 } 6716 6717 if (dev_nr < total_devs - 1) 6718 bio = btrfs_bio_clone(first_bio); 6719 else 6720 bio = first_bio; 6721 6722 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev); 6723 } 6724 btrfs_bio_counter_dec(fs_info); 6725 return BLK_STS_OK; 6726 } 6727 6728 /* 6729 * Find a device specified by @devid or @uuid in the list of @fs_devices, or 6730 * return NULL. 6731 * 6732 * If devid and uuid are both specified, the match must be exact, otherwise 6733 * only devid is used. 
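 *
 * Typical call (as read_one_dev() below does):
 *   device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
 *                              fs_uuid);
 * Passing NULL for @uuid and/or @fsid relaxes the match accordingly;
 * the seed device lists are searched as well.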
6734 */ 6735 struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, 6736 u64 devid, u8 *uuid, u8 *fsid) 6737 { 6738 struct btrfs_device *device; 6739 struct btrfs_fs_devices *seed_devs; 6740 6741 if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { 6742 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6743 if (device->devid == devid && 6744 (!uuid || memcmp(device->uuid, uuid, 6745 BTRFS_UUID_SIZE) == 0)) 6746 return device; 6747 } 6748 } 6749 6750 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 6751 if (!fsid || 6752 !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { 6753 list_for_each_entry(device, &seed_devs->devices, 6754 dev_list) { 6755 if (device->devid == devid && 6756 (!uuid || memcmp(device->uuid, uuid, 6757 BTRFS_UUID_SIZE) == 0)) 6758 return device; 6759 } 6760 } 6761 } 6762 6763 return NULL; 6764 } 6765 6766 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6767 u64 devid, u8 *dev_uuid) 6768 { 6769 struct btrfs_device *device; 6770 unsigned int nofs_flag; 6771 6772 /* 6773 * We call this under the chunk_mutex, so we want to use NOFS for this 6774 * allocation, however we don't want to change btrfs_alloc_device() to 6775 * always do NOFS because we use it in a lot of other GFP_KERNEL safe 6776 * places. 6777 */ 6778 nofs_flag = memalloc_nofs_save(); 6779 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6780 memalloc_nofs_restore(nofs_flag); 6781 if (IS_ERR(device)) 6782 return device; 6783 6784 list_add(&device->dev_list, &fs_devices->devices); 6785 device->fs_devices = fs_devices; 6786 fs_devices->num_devices++; 6787 6788 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6789 fs_devices->missing_devices++; 6790 6791 return device; 6792 } 6793 6794 /** 6795 * btrfs_alloc_device - allocate struct btrfs_device 6796 * @fs_info: used only for generating a new devid, can be NULL if 6797 * devid is provided (i.e. @devid != NULL). 6798 * @devid: a pointer to devid for this device. If NULL a new devid 6799 * is generated. 6800 * @uuid: a pointer to UUID for this device. If NULL a new UUID 6801 * is generated. 6802 * 6803 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 6804 * on error. Returned struct is not linked onto any lists and must be 6805 * destroyed with btrfs_free_device. 
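 *
 * Typical use (a sketch mirroring add_missing_dev() above):
 *   device = btrfs_alloc_device(NULL, &devid, dev_uuid);
 *   if (IS_ERR(device))
 *           return device;
 * i.e. pass a known devid/uuid, or pass @fs_info and NULL pointers to
 * have a new devid/UUID generated.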
6806 */ 6807 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 6808 const u64 *devid, 6809 const u8 *uuid) 6810 { 6811 struct btrfs_device *dev; 6812 u64 tmp; 6813 6814 if (WARN_ON(!devid && !fs_info)) 6815 return ERR_PTR(-EINVAL); 6816 6817 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 6818 if (!dev) 6819 return ERR_PTR(-ENOMEM); 6820 6821 /* 6822 * Preallocate a bio that's always going to be used for flushing device 6823 * barriers and matches the device lifespan 6824 */ 6825 dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0); 6826 if (!dev->flush_bio) { 6827 kfree(dev); 6828 return ERR_PTR(-ENOMEM); 6829 } 6830 6831 INIT_LIST_HEAD(&dev->dev_list); 6832 INIT_LIST_HEAD(&dev->dev_alloc_list); 6833 INIT_LIST_HEAD(&dev->post_commit_list); 6834 6835 atomic_set(&dev->reada_in_flight, 0); 6836 atomic_set(&dev->dev_stats_ccnt, 0); 6837 btrfs_device_data_ordered_init(dev); 6838 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 6839 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 6840 extent_io_tree_init(fs_info, &dev->alloc_state, 6841 IO_TREE_DEVICE_ALLOC_STATE, NULL); 6842 6843 if (devid) 6844 tmp = *devid; 6845 else { 6846 int ret; 6847 6848 ret = find_next_devid(fs_info, &tmp); 6849 if (ret) { 6850 btrfs_free_device(dev); 6851 return ERR_PTR(ret); 6852 } 6853 } 6854 dev->devid = tmp; 6855 6856 if (uuid) 6857 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 6858 else 6859 generate_random_uuid(dev->uuid); 6860 6861 return dev; 6862 } 6863 6864 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 6865 u64 devid, u8 *uuid, bool error) 6866 { 6867 if (error) 6868 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 6869 devid, uuid); 6870 else 6871 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 6872 devid, uuid); 6873 } 6874 6875 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) 6876 { 6877 const int data_stripes = calc_data_stripes(type, num_stripes); 6878 6879 return div_u64(chunk_len, data_stripes); 6880 } 6881 6882 #if BITS_PER_LONG == 32 6883 /* 6884 * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE 6885 * can't be accessed on 32bit systems. 6886 * 6887 * This function do mount time check to reject the fs if it already has 6888 * metadata chunk beyond that limit. 6889 */ 6890 static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 6891 u64 logical, u64 length, u64 type) 6892 { 6893 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 6894 return 0; 6895 6896 if (logical + length < MAX_LFS_FILESIZE) 6897 return 0; 6898 6899 btrfs_err_32bit_limit(fs_info); 6900 return -EOVERFLOW; 6901 } 6902 6903 /* 6904 * This is to give early warning for any metadata chunk reaching 6905 * BTRFS_32BIT_EARLY_WARN_THRESHOLD. 6906 * Although we can still access the metadata, it's not going to be possible 6907 * once the limit is reached. 
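 *
 * This mirrors check_32bit_meta_chunk() above: a metadata chunk whose
 * end (logical + length) crosses BTRFS_32BIT_EARLY_WARN_THRESHOLD only
 * triggers a warning here, while crossing MAX_LFS_FILESIZE makes the
 * mount fail with -EOVERFLOW.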
6908 */ 6909 static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 6910 u64 logical, u64 length, u64 type) 6911 { 6912 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 6913 return; 6914 6915 if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD) 6916 return; 6917 6918 btrfs_warn_32bit_limit(fs_info); 6919 } 6920 #endif 6921 6922 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, 6923 struct btrfs_chunk *chunk) 6924 { 6925 struct btrfs_fs_info *fs_info = leaf->fs_info; 6926 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 6927 struct map_lookup *map; 6928 struct extent_map *em; 6929 u64 logical; 6930 u64 length; 6931 u64 devid; 6932 u64 type; 6933 u8 uuid[BTRFS_UUID_SIZE]; 6934 int num_stripes; 6935 int ret; 6936 int i; 6937 6938 logical = key->offset; 6939 length = btrfs_chunk_length(leaf, chunk); 6940 type = btrfs_chunk_type(leaf, chunk); 6941 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6942 6943 #if BITS_PER_LONG == 32 6944 ret = check_32bit_meta_chunk(fs_info, logical, length, type); 6945 if (ret < 0) 6946 return ret; 6947 warn_32bit_meta_chunk(fs_info, logical, length, type); 6948 #endif 6949 6950 /* 6951 * Only need to verify chunk item if we're reading from sys chunk array, 6952 * as chunk item in tree block is already verified by tree-checker. 6953 */ 6954 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { 6955 ret = btrfs_check_chunk_valid(leaf, chunk, logical); 6956 if (ret) 6957 return ret; 6958 } 6959 6960 read_lock(&map_tree->lock); 6961 em = lookup_extent_mapping(map_tree, logical, 1); 6962 read_unlock(&map_tree->lock); 6963 6964 /* already mapped? */ 6965 if (em && em->start <= logical && em->start + em->len > logical) { 6966 free_extent_map(em); 6967 return 0; 6968 } else if (em) { 6969 free_extent_map(em); 6970 } 6971 6972 em = alloc_extent_map(); 6973 if (!em) 6974 return -ENOMEM; 6975 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 6976 if (!map) { 6977 free_extent_map(em); 6978 return -ENOMEM; 6979 } 6980 6981 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 6982 em->map_lookup = map; 6983 em->start = logical; 6984 em->len = length; 6985 em->orig_start = 0; 6986 em->block_start = 0; 6987 em->block_len = em->len; 6988 6989 map->num_stripes = num_stripes; 6990 map->io_width = btrfs_chunk_io_width(leaf, chunk); 6991 map->io_align = btrfs_chunk_io_align(leaf, chunk); 6992 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6993 map->type = type; 6994 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6995 map->verified_stripes = 0; 6996 em->orig_block_len = calc_stripe_length(type, em->len, 6997 map->num_stripes); 6998 for (i = 0; i < num_stripes; i++) { 6999 map->stripes[i].physical = 7000 btrfs_stripe_offset_nr(leaf, chunk, i); 7001 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 7002 read_extent_buffer(leaf, uuid, (unsigned long) 7003 btrfs_stripe_dev_uuid_nr(chunk, i), 7004 BTRFS_UUID_SIZE); 7005 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, 7006 devid, uuid, NULL); 7007 if (!map->stripes[i].dev && 7008 !btrfs_test_opt(fs_info, DEGRADED)) { 7009 free_extent_map(em); 7010 btrfs_report_missing_device(fs_info, devid, uuid, true); 7011 return -ENOENT; 7012 } 7013 if (!map->stripes[i].dev) { 7014 map->stripes[i].dev = 7015 add_missing_dev(fs_info->fs_devices, devid, 7016 uuid); 7017 if (IS_ERR(map->stripes[i].dev)) { 7018 free_extent_map(em); 7019 btrfs_err(fs_info, 7020 "failed to init missing dev %llu: %ld", 7021 devid, PTR_ERR(map->stripes[i].dev)); 7022 return PTR_ERR(map->stripes[i].dev); 7023 } 7024 
btrfs_report_missing_device(fs_info, devid, uuid, false); 7025 } 7026 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 7027 &(map->stripes[i].dev->dev_state)); 7028 7029 } 7030 7031 write_lock(&map_tree->lock); 7032 ret = add_extent_mapping(map_tree, em, 0); 7033 write_unlock(&map_tree->lock); 7034 if (ret < 0) { 7035 btrfs_err(fs_info, 7036 "failed to add chunk map, start=%llu len=%llu: %d", 7037 em->start, em->len, ret); 7038 } 7039 free_extent_map(em); 7040 7041 return ret; 7042 } 7043 7044 static void fill_device_from_item(struct extent_buffer *leaf, 7045 struct btrfs_dev_item *dev_item, 7046 struct btrfs_device *device) 7047 { 7048 unsigned long ptr; 7049 7050 device->devid = btrfs_device_id(leaf, dev_item); 7051 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 7052 device->total_bytes = device->disk_total_bytes; 7053 device->commit_total_bytes = device->disk_total_bytes; 7054 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 7055 device->commit_bytes_used = device->bytes_used; 7056 device->type = btrfs_device_type(leaf, dev_item); 7057 device->io_align = btrfs_device_io_align(leaf, dev_item); 7058 device->io_width = btrfs_device_io_width(leaf, dev_item); 7059 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 7060 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 7061 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 7062 7063 ptr = btrfs_device_uuid(dev_item); 7064 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 7065 } 7066 7067 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 7068 u8 *fsid) 7069 { 7070 struct btrfs_fs_devices *fs_devices; 7071 int ret; 7072 7073 lockdep_assert_held(&uuid_mutex); 7074 ASSERT(fsid); 7075 7076 /* This will match only for multi-device seed fs */ 7077 list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) 7078 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 7079 return fs_devices; 7080 7081 7082 fs_devices = find_fsid(fsid, NULL); 7083 if (!fs_devices) { 7084 if (!btrfs_test_opt(fs_info, DEGRADED)) 7085 return ERR_PTR(-ENOENT); 7086 7087 fs_devices = alloc_fs_devices(fsid, NULL); 7088 if (IS_ERR(fs_devices)) 7089 return fs_devices; 7090 7091 fs_devices->seeding = true; 7092 fs_devices->opened = 1; 7093 return fs_devices; 7094 } 7095 7096 /* 7097 * Upon first call for a seed fs fsid, just create a private copy of the 7098 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list 7099 */ 7100 fs_devices = clone_fs_devices(fs_devices); 7101 if (IS_ERR(fs_devices)) 7102 return fs_devices; 7103 7104 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); 7105 if (ret) { 7106 free_fs_devices(fs_devices); 7107 return ERR_PTR(ret); 7108 } 7109 7110 if (!fs_devices->seeding) { 7111 close_fs_devices(fs_devices); 7112 free_fs_devices(fs_devices); 7113 return ERR_PTR(-EINVAL); 7114 } 7115 7116 list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); 7117 7118 return fs_devices; 7119 } 7120 7121 static int read_one_dev(struct extent_buffer *leaf, 7122 struct btrfs_dev_item *dev_item) 7123 { 7124 struct btrfs_fs_info *fs_info = leaf->fs_info; 7125 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7126 struct btrfs_device *device; 7127 u64 devid; 7128 int ret; 7129 u8 fs_uuid[BTRFS_FSID_SIZE]; 7130 u8 dev_uuid[BTRFS_UUID_SIZE]; 7131 7132 devid = btrfs_device_id(leaf, dev_item); 7133 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 7134 BTRFS_UUID_SIZE); 7135 read_extent_buffer(leaf, 
fs_uuid, btrfs_device_fsid(dev_item), 7136 BTRFS_FSID_SIZE); 7137 7138 if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { 7139 fs_devices = open_seed_devices(fs_info, fs_uuid); 7140 if (IS_ERR(fs_devices)) 7141 return PTR_ERR(fs_devices); 7142 } 7143 7144 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 7145 fs_uuid); 7146 if (!device) { 7147 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7148 btrfs_report_missing_device(fs_info, devid, 7149 dev_uuid, true); 7150 return -ENOENT; 7151 } 7152 7153 device = add_missing_dev(fs_devices, devid, dev_uuid); 7154 if (IS_ERR(device)) { 7155 btrfs_err(fs_info, 7156 "failed to add missing dev %llu: %ld", 7157 devid, PTR_ERR(device)); 7158 return PTR_ERR(device); 7159 } 7160 btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 7161 } else { 7162 if (!device->bdev) { 7163 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7164 btrfs_report_missing_device(fs_info, 7165 devid, dev_uuid, true); 7166 return -ENOENT; 7167 } 7168 btrfs_report_missing_device(fs_info, devid, 7169 dev_uuid, false); 7170 } 7171 7172 if (!device->bdev && 7173 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 7174 /* 7175 * this happens when a device that was properly setup 7176 * in the device info lists suddenly goes bad. 7177 * device->bdev is NULL, and so we have to set 7178 * device->missing to one here 7179 */ 7180 device->fs_devices->missing_devices++; 7181 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 7182 } 7183 7184 /* Move the device to its own fs_devices */ 7185 if (device->fs_devices != fs_devices) { 7186 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, 7187 &device->dev_state)); 7188 7189 list_move(&device->dev_list, &fs_devices->devices); 7190 device->fs_devices->num_devices--; 7191 fs_devices->num_devices++; 7192 7193 device->fs_devices->missing_devices--; 7194 fs_devices->missing_devices++; 7195 7196 device->fs_devices = fs_devices; 7197 } 7198 } 7199 7200 if (device->fs_devices != fs_info->fs_devices) { 7201 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); 7202 if (device->generation != 7203 btrfs_device_generation(leaf, dev_item)) 7204 return -EINVAL; 7205 } 7206 7207 fill_device_from_item(leaf, dev_item, device); 7208 if (device->bdev) { 7209 u64 max_total_bytes = i_size_read(device->bdev->bd_inode); 7210 7211 if (device->total_bytes > max_total_bytes) { 7212 btrfs_err(fs_info, 7213 "device total_bytes should be at most %llu but found %llu", 7214 max_total_bytes, device->total_bytes); 7215 return -EINVAL; 7216 } 7217 } 7218 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 7219 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 7220 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 7221 device->fs_devices->total_rw_bytes += device->total_bytes; 7222 atomic64_add(device->total_bytes - device->bytes_used, 7223 &fs_info->free_chunk_space); 7224 } 7225 ret = 0; 7226 return ret; 7227 } 7228 7229 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 7230 { 7231 struct btrfs_root *root = fs_info->tree_root; 7232 struct btrfs_super_block *super_copy = fs_info->super_copy; 7233 struct extent_buffer *sb; 7234 struct btrfs_disk_key *disk_key; 7235 struct btrfs_chunk *chunk; 7236 u8 *array_ptr; 7237 unsigned long sb_array_offset; 7238 int ret = 0; 7239 u32 num_stripes; 7240 u32 array_size; 7241 u32 len = 0; 7242 u32 cur_offset; 7243 u64 type; 7244 struct btrfs_key key; 7245 7246 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 7247 /* 7248 * This will create extent buffer of nodesize, 
superblock size is 7249 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will 7250 * overallocate but we can keep it as-is, only the first page is used. 7251 */ 7252 sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET, 7253 root->root_key.objectid, 0); 7254 if (IS_ERR(sb)) 7255 return PTR_ERR(sb); 7256 set_extent_buffer_uptodate(sb); 7257 /* 7258 * The sb extent buffer is artificial and just used to read the system array. 7259 * set_extent_buffer_uptodate() call does not properly mark all it's 7260 * pages up-to-date when the page is larger: extent does not cover the 7261 * whole page and consequently check_page_uptodate does not find all 7262 * the page's extents up-to-date (the hole beyond sb), 7263 * write_extent_buffer then triggers a WARN_ON. 7264 * 7265 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, 7266 * but sb spans only this function. Add an explicit SetPageUptodate call 7267 * to silence the warning eg. on PowerPC 64. 7268 */ 7269 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 7270 SetPageUptodate(sb->pages[0]); 7271 7272 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 7273 array_size = btrfs_super_sys_array_size(super_copy); 7274 7275 array_ptr = super_copy->sys_chunk_array; 7276 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 7277 cur_offset = 0; 7278 7279 while (cur_offset < array_size) { 7280 disk_key = (struct btrfs_disk_key *)array_ptr; 7281 len = sizeof(*disk_key); 7282 if (cur_offset + len > array_size) 7283 goto out_short_read; 7284 7285 btrfs_disk_key_to_cpu(&key, disk_key); 7286 7287 array_ptr += len; 7288 sb_array_offset += len; 7289 cur_offset += len; 7290 7291 if (key.type != BTRFS_CHUNK_ITEM_KEY) { 7292 btrfs_err(fs_info, 7293 "unexpected item type %u in sys_array at offset %u", 7294 (u32)key.type, cur_offset); 7295 ret = -EIO; 7296 break; 7297 } 7298 7299 chunk = (struct btrfs_chunk *)sb_array_offset; 7300 /* 7301 * At least one btrfs_chunk with one stripe must be present, 7302 * exact stripe count check comes afterwards 7303 */ 7304 len = btrfs_chunk_item_size(1); 7305 if (cur_offset + len > array_size) 7306 goto out_short_read; 7307 7308 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 7309 if (!num_stripes) { 7310 btrfs_err(fs_info, 7311 "invalid number of stripes %u in sys_array at offset %u", 7312 num_stripes, cur_offset); 7313 ret = -EIO; 7314 break; 7315 } 7316 7317 type = btrfs_chunk_type(sb, chunk); 7318 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 7319 btrfs_err(fs_info, 7320 "invalid chunk type %llu in sys_array at offset %u", 7321 type, cur_offset); 7322 ret = -EIO; 7323 break; 7324 } 7325 7326 len = btrfs_chunk_item_size(num_stripes); 7327 if (cur_offset + len > array_size) 7328 goto out_short_read; 7329 7330 ret = read_one_chunk(&key, sb, chunk); 7331 if (ret) 7332 break; 7333 7334 array_ptr += len; 7335 sb_array_offset += len; 7336 cur_offset += len; 7337 } 7338 clear_extent_buffer_uptodate(sb); 7339 free_extent_buffer_stale(sb); 7340 return ret; 7341 7342 out_short_read: 7343 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 7344 len, cur_offset); 7345 clear_extent_buffer_uptodate(sb); 7346 free_extent_buffer_stale(sb); 7347 return -EIO; 7348 } 7349 7350 /* 7351 * Check if all chunks in the fs are OK for read-write degraded mount 7352 * 7353 * If the @failing_dev is specified, it's accounted as missing. 7354 * 7355 * Return true if all chunks meet the minimal RW mount requirements. 
7356 * Return false if any chunk doesn't meet the minimal RW mount requirements. 7357 */ 7358 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 7359 struct btrfs_device *failing_dev) 7360 { 7361 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7362 struct extent_map *em; 7363 u64 next_start = 0; 7364 bool ret = true; 7365 7366 read_lock(&map_tree->lock); 7367 em = lookup_extent_mapping(map_tree, 0, (u64)-1); 7368 read_unlock(&map_tree->lock); 7369 /* No chunk at all? Return false anyway */ 7370 if (!em) { 7371 ret = false; 7372 goto out; 7373 } 7374 while (em) { 7375 struct map_lookup *map; 7376 int missing = 0; 7377 int max_tolerated; 7378 int i; 7379 7380 map = em->map_lookup; 7381 max_tolerated = 7382 btrfs_get_num_tolerated_disk_barrier_failures( 7383 map->type); 7384 for (i = 0; i < map->num_stripes; i++) { 7385 struct btrfs_device *dev = map->stripes[i].dev; 7386 7387 if (!dev || !dev->bdev || 7388 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 7389 dev->last_flush_error) 7390 missing++; 7391 else if (failing_dev && failing_dev == dev) 7392 missing++; 7393 } 7394 if (missing > max_tolerated) { 7395 if (!failing_dev) 7396 btrfs_warn(fs_info, 7397 "chunk %llu missing %d devices, max tolerance is %d for writable mount", 7398 em->start, missing, max_tolerated); 7399 free_extent_map(em); 7400 ret = false; 7401 goto out; 7402 } 7403 next_start = extent_map_end(em); 7404 free_extent_map(em); 7405 7406 read_lock(&map_tree->lock); 7407 em = lookup_extent_mapping(map_tree, next_start, 7408 (u64)(-1) - next_start); 7409 read_unlock(&map_tree->lock); 7410 } 7411 out: 7412 return ret; 7413 } 7414 7415 static void readahead_tree_node_children(struct extent_buffer *node) 7416 { 7417 int i; 7418 const int nr_items = btrfs_header_nritems(node); 7419 7420 for (i = 0; i < nr_items; i++) 7421 btrfs_readahead_node_child(node, i); 7422 } 7423 7424 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 7425 { 7426 struct btrfs_root *root = fs_info->chunk_root; 7427 struct btrfs_path *path; 7428 struct extent_buffer *leaf; 7429 struct btrfs_key key; 7430 struct btrfs_key found_key; 7431 int ret; 7432 int slot; 7433 u64 total_dev = 0; 7434 u64 last_ra_node = 0; 7435 7436 path = btrfs_alloc_path(); 7437 if (!path) 7438 return -ENOMEM; 7439 7440 /* 7441 * uuid_mutex is needed only if we are mounting a sprout FS 7442 * otherwise we don't need it. 7443 */ 7444 mutex_lock(&uuid_mutex); 7445 7446 /* 7447 * It is possible for mount and umount to race in such a way that 7448 * we execute this code path, but open_fs_devices failed to clear 7449 * total_rw_bytes. We certainly want it cleared before reading the 7450 * device items, so clear it here. 7451 */ 7452 fs_info->fs_devices->total_rw_bytes = 0; 7453 7454 /* 7455 * Read all device items, and then all the chunk items. All 7456 * device items are found before any chunk item (their object id 7457 * is smaller than the lowest possible object id for a chunk 7458 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 
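 *
 * Illustrative key layout in the chunk tree (values are examples only):
 *   (BTRFS_DEV_ITEMS_OBJECTID,        BTRFS_DEV_ITEM_KEY,   devid 1)
 *   (BTRFS_DEV_ITEMS_OBJECTID,        BTRFS_DEV_ITEM_KEY,   devid 2)
 *   (BTRFS_FIRST_CHUNK_TREE_OBJECTID, BTRFS_CHUNK_ITEM_KEY, logical X)
 * so the single forward walk started below from key
 * (BTRFS_DEV_ITEMS_OBJECTID, 0, 0) visits every device item before the
 * first chunk item.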
7459 */ 7460 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 7461 key.offset = 0; 7462 key.type = 0; 7463 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7464 if (ret < 0) 7465 goto error; 7466 while (1) { 7467 struct extent_buffer *node; 7468 7469 leaf = path->nodes[0]; 7470 slot = path->slots[0]; 7471 if (slot >= btrfs_header_nritems(leaf)) { 7472 ret = btrfs_next_leaf(root, path); 7473 if (ret == 0) 7474 continue; 7475 if (ret < 0) 7476 goto error; 7477 break; 7478 } 7479 /* 7480 * The nodes on level 1 are not locked but we don't need to do 7481 * that during mount time as nothing else can access the tree 7482 */ 7483 node = path->nodes[1]; 7484 if (node) { 7485 if (last_ra_node != node->start) { 7486 readahead_tree_node_children(node); 7487 last_ra_node = node->start; 7488 } 7489 } 7490 btrfs_item_key_to_cpu(leaf, &found_key, slot); 7491 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 7492 struct btrfs_dev_item *dev_item; 7493 dev_item = btrfs_item_ptr(leaf, slot, 7494 struct btrfs_dev_item); 7495 ret = read_one_dev(leaf, dev_item); 7496 if (ret) 7497 goto error; 7498 total_dev++; 7499 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 7500 struct btrfs_chunk *chunk; 7501 7502 /* 7503 * We are only called at mount time, so no need to take 7504 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings, 7505 * we always lock first fs_info->chunk_mutex before 7506 * acquiring any locks on the chunk tree. This is a 7507 * requirement for chunk allocation, see the comment on 7508 * top of btrfs_chunk_alloc() for details. 7509 */ 7510 ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); 7511 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 7512 ret = read_one_chunk(&found_key, leaf, chunk); 7513 if (ret) 7514 goto error; 7515 } 7516 path->slots[0]++; 7517 } 7518 7519 /* 7520 * After loading chunk tree, we've got all device information, 7521 * do another round of validation checks. 
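 *
 * Namely, the two checks below: the number of device items found must
 * match the device count recorded in the super block, and the super
 * block's total_bytes must not be smaller than the accumulated
 * total_rw_bytes of the devices.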
7522 */ 7523 if (total_dev != fs_info->fs_devices->total_devices) { 7524 btrfs_err(fs_info, 7525 "super_num_devices %llu mismatch with num_devices %llu found here", 7526 btrfs_super_num_devices(fs_info->super_copy), 7527 total_dev); 7528 ret = -EINVAL; 7529 goto error; 7530 } 7531 if (btrfs_super_total_bytes(fs_info->super_copy) < 7532 fs_info->fs_devices->total_rw_bytes) { 7533 btrfs_err(fs_info, 7534 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", 7535 btrfs_super_total_bytes(fs_info->super_copy), 7536 fs_info->fs_devices->total_rw_bytes); 7537 ret = -EINVAL; 7538 goto error; 7539 } 7540 ret = 0; 7541 error: 7542 mutex_unlock(&uuid_mutex); 7543 7544 btrfs_free_path(path); 7545 return ret; 7546 } 7547 7548 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 7549 { 7550 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 7551 struct btrfs_device *device; 7552 7553 fs_devices->fs_info = fs_info; 7554 7555 mutex_lock(&fs_devices->device_list_mutex); 7556 list_for_each_entry(device, &fs_devices->devices, dev_list) 7557 device->fs_info = fs_info; 7558 7559 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 7560 list_for_each_entry(device, &seed_devs->devices, dev_list) 7561 device->fs_info = fs_info; 7562 7563 seed_devs->fs_info = fs_info; 7564 } 7565 mutex_unlock(&fs_devices->device_list_mutex); 7566 } 7567 7568 static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, 7569 const struct btrfs_dev_stats_item *ptr, 7570 int index) 7571 { 7572 u64 val; 7573 7574 read_extent_buffer(eb, &val, 7575 offsetof(struct btrfs_dev_stats_item, values) + 7576 ((unsigned long)ptr) + (index * sizeof(u64)), 7577 sizeof(val)); 7578 return val; 7579 } 7580 7581 static void btrfs_set_dev_stats_value(struct extent_buffer *eb, 7582 struct btrfs_dev_stats_item *ptr, 7583 int index, u64 val) 7584 { 7585 write_extent_buffer(eb, &val, 7586 offsetof(struct btrfs_dev_stats_item, values) + 7587 ((unsigned long)ptr) + (index * sizeof(u64)), 7588 sizeof(val)); 7589 } 7590 7591 static int btrfs_device_init_dev_stats(struct btrfs_device *device, 7592 struct btrfs_path *path) 7593 { 7594 struct btrfs_dev_stats_item *ptr; 7595 struct extent_buffer *eb; 7596 struct btrfs_key key; 7597 int item_size; 7598 int i, ret, slot; 7599 7600 if (!device->fs_info->dev_root) 7601 return 0; 7602 7603 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7604 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7605 key.offset = device->devid; 7606 ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); 7607 if (ret) { 7608 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7609 btrfs_dev_stat_set(device, i, 0); 7610 device->dev_stats_valid = 1; 7611 btrfs_release_path(path); 7612 return ret < 0 ? 
ret : 0; 7613 } 7614 slot = path->slots[0]; 7615 eb = path->nodes[0]; 7616 item_size = btrfs_item_size_nr(eb, slot); 7617 7618 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); 7619 7620 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7621 if (item_size >= (1 + i) * sizeof(__le64)) 7622 btrfs_dev_stat_set(device, i, 7623 btrfs_dev_stats_value(eb, ptr, i)); 7624 else 7625 btrfs_dev_stat_set(device, i, 0); 7626 } 7627 7628 device->dev_stats_valid = 1; 7629 btrfs_dev_stat_print_on_load(device); 7630 btrfs_release_path(path); 7631 7632 return 0; 7633 } 7634 7635 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 7636 { 7637 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 7638 struct btrfs_device *device; 7639 struct btrfs_path *path = NULL; 7640 int ret = 0; 7641 7642 path = btrfs_alloc_path(); 7643 if (!path) 7644 return -ENOMEM; 7645 7646 mutex_lock(&fs_devices->device_list_mutex); 7647 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7648 ret = btrfs_device_init_dev_stats(device, path); 7649 if (ret) 7650 goto out; 7651 } 7652 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 7653 list_for_each_entry(device, &seed_devs->devices, dev_list) { 7654 ret = btrfs_device_init_dev_stats(device, path); 7655 if (ret) 7656 goto out; 7657 } 7658 } 7659 out: 7660 mutex_unlock(&fs_devices->device_list_mutex); 7661 7662 btrfs_free_path(path); 7663 return ret; 7664 } 7665 7666 static int update_dev_stat_item(struct btrfs_trans_handle *trans, 7667 struct btrfs_device *device) 7668 { 7669 struct btrfs_fs_info *fs_info = trans->fs_info; 7670 struct btrfs_root *dev_root = fs_info->dev_root; 7671 struct btrfs_path *path; 7672 struct btrfs_key key; 7673 struct extent_buffer *eb; 7674 struct btrfs_dev_stats_item *ptr; 7675 int ret; 7676 int i; 7677 7678 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7679 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7680 key.offset = device->devid; 7681 7682 path = btrfs_alloc_path(); 7683 if (!path) 7684 return -ENOMEM; 7685 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 7686 if (ret < 0) { 7687 btrfs_warn_in_rcu(fs_info, 7688 "error %d while searching for dev_stats item for device %s", 7689 ret, rcu_str_deref(device->name)); 7690 goto out; 7691 } 7692 7693 if (ret == 0 && 7694 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 7695 /* need to delete old one and insert a new one */ 7696 ret = btrfs_del_item(trans, dev_root, path); 7697 if (ret != 0) { 7698 btrfs_warn_in_rcu(fs_info, 7699 "delete too small dev_stats item for device %s failed %d", 7700 rcu_str_deref(device->name), ret); 7701 goto out; 7702 } 7703 ret = 1; 7704 } 7705 7706 if (ret == 1) { 7707 /* need to insert a new item */ 7708 btrfs_release_path(path); 7709 ret = btrfs_insert_empty_item(trans, dev_root, path, 7710 &key, sizeof(*ptr)); 7711 if (ret < 0) { 7712 btrfs_warn_in_rcu(fs_info, 7713 "insert dev_stats item for device %s failed %d", 7714 rcu_str_deref(device->name), ret); 7715 goto out; 7716 } 7717 } 7718 7719 eb = path->nodes[0]; 7720 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 7721 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7722 btrfs_set_dev_stats_value(eb, ptr, i, 7723 btrfs_dev_stat_read(device, i)); 7724 btrfs_mark_buffer_dirty(eb); 7725 7726 out: 7727 btrfs_free_path(path); 7728 return ret; 7729 } 7730 7731 /* 7732 * called from commit_transaction. Writes all changed device stats to disk. 
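 *
 * Rough outline of the loop below, per device with pending changes:
 *   stats_cnt = atomic_read(&device->dev_stats_ccnt);
 *   smp_rmb();
 *   ret = update_dev_stat_item(trans, device);
 *   if (!ret)
 *           atomic_sub(stats_cnt, &device->dev_stats_ccnt);
 * so counter updates that race in after the snapshot are kept for the
 * next commit.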
/*
 * Called from commit_transaction. Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;

		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values which requires
		 * reading the in-memory counters. Such control dependencies
		 * require explicit read memory barriers.
		 *
		 * This memory barrier pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stats_read_and_reset
		 */
		smp_rmb();

		ret = update_dev_stat_item(trans, device);
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

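/*
 * For reference, a sketch of the writer side that the smp_rmb() above pairs
 * with; the real helpers live in volumes.h and their exact bodies may differ:
 *
 *	static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index)
 *	{
 *		atomic_inc(dev->dev_stat_values + index);
 *		smp_mb__before_atomic();	// order the counter store before the ccnt store
 *		atomic_inc(&dev->dev_stats_ccnt);
 *	}
 *
 * A reader that observes a non-zero dev_stats_ccnt and then issues smp_rmb()
 * is therefore also guaranteed to observe the counter updates made before the
 * matching ccnt increment.
 */
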
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
		       rcu_str_deref(dev->name),
		       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
		       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
		       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
		       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
		       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_set(dev, i, 0);
		}
		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
			   current->comm, task_pid_nr(current));
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}

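/*
 * btrfs_get_dev_stats() backs the BTRFS_IOC_GET_DEV_STATS ioctl. A minimal
 * userspace sketch, assuming the uapi definitions from <linux/btrfs.h>, a
 * btrfs filesystem mounted at /mnt and devid 1, with error handling omitted:
 *
 *	struct btrfs_ioctl_get_dev_stats args = {
 *		.devid = 1,
 *		.nr_items = BTRFS_DEV_STAT_VALUES_MAX,
 *	};
 *	int fd = open("/mnt", O_RDONLY);
 *
 *	ioctl(fd, BTRFS_IOC_GET_DEV_STATS, &args);
 *	printf("write errors: %llu\n",
 *	       (unsigned long long)args.values[BTRFS_DEV_STAT_WRITE_ERRS]);
 *
 * Setting BTRFS_DEV_STATS_RESET in args.flags zeroes the counters after they
 * are copied out, as handled by the reset branch above.
 */
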
/*
 * Update the size and bytes used for each device where it changed. This is
 * delayed since we would otherwise get errors while writing out the
 * superblocks.
 *
 * Must be invoked during transaction commit.
 */
void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
	struct btrfs_device *curr, *next;

	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);

	if (list_empty(&trans->dev_update_list))
		return;

	/*
	 * We don't need the device_list_mutex here. This list is owned by the
	 * transaction and the transaction must complete before the device is
	 * released.
	 */
	mutex_lock(&trans->fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
				 post_commit_list) {
		list_del_init(&curr->post_commit_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
		curr->commit_bytes_used = curr->bytes_used;
	}
	mutex_unlock(&trans->fs_info->chunk_mutex);
}

/*
 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
 */
int btrfs_bg_type_to_factor(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	return btrfs_raid_array[index].ncopies;
}

static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
				 u64 chunk_offset, u64 devid,
				 u64 physical_offset, u64 physical_len)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	u64 stripe_len;
	bool found = false;
	int ret = 0;
	int i;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
		goto out;
	}

	map = em->map_lookup;
	stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
	if (physical_len != stripe_len) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
			  physical_offset, devid, em->start, physical_len,
			  stripe_len);
		ret = -EUCLEAN;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		if (map->stripes[i].dev->devid == devid &&
		    map->stripes[i].physical == physical_offset) {
			found = true;
			if (map->verified_stripes >= map->num_stripes) {
				btrfs_err(fs_info,
					"too many dev extents for chunk %llu found",
					  em->start);
				ret = -EUCLEAN;
				goto out;
			}
			map->verified_stripes++;
			break;
		}
	}
	if (!found) {
		btrfs_err(fs_info,
	"dev extent physical offset %llu devid %llu has no corresponding chunk",
			physical_offset, devid);
		ret = -EUCLEAN;
	}

	/* Make sure no dev extent is beyond device boundary */
	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
	if (!dev) {
		btrfs_err(fs_info, "failed to find devid %llu", devid);
		ret = -EUCLEAN;
		goto out;
	}

	if (physical_offset + physical_len > dev->disk_total_bytes) {
		btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
			  devid, physical_offset, physical_len,
			  dev->disk_total_bytes);
		ret = -EUCLEAN;
		goto out;
	}

	if (dev->zone_info) {
		u64 zone_size = dev->zone_info->zone_size;

		if (!IS_ALIGNED(physical_offset, zone_size) ||
		    !IS_ALIGNED(physical_len, zone_size)) {
			btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
				  devid, physical_offset, physical_len);
			ret = -EUCLEAN;
			goto out;
		}
	}

out:
	free_extent_map(em);
	return ret;
}

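/*
 * The physical_len check above compares the dev extent length against the
 * per-device stripe length derived from the chunk. As a rough example,
 * assuming calc_stripe_length() divides the chunk length by the number of
 * data stripes: a 1GiB RAID0 chunk striped over two devices is expected to be
 * backed by two 512MiB dev extents, while a 1GiB RAID1 chunk is expected to
 * be backed by two 1GiB dev extents, one per mirror.
 */
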
static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct rb_node *node;
	int ret = 0;

	read_lock(&em_tree->lock);
	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
		em = rb_entry(node, struct extent_map, rb_node);
		if (em->map_lookup->num_stripes !=
		    em->map_lookup->verified_stripes) {
			btrfs_err(fs_info,
			"chunk %llu has missing dev extent, have %d expect %d",
				  em->start, em->map_lookup->verified_stripes,
				  em->map_lookup->num_stripes);
			ret = -EUCLEAN;
			goto out;
		}
	}
out:
	read_unlock(&em_tree->lock);
	return ret;
}

/*
 * Ensure that all dev extents are mapped to correct chunk, otherwise
 * later chunk allocation/free would cause unexpected behavior.
 *
 * NOTE: This will iterate through the whole device tree, which should be of
 * the same size level as the chunk tree. This slightly increases mount time.
 */
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	u64 prev_devid = 0;
	u64 prev_dev_ext_end = 0;
	int ret = 0;

	/*
	 * We don't have a dev_root because we mounted with ignorebadroots and
	 * failed to load the root, so we want to skip the verification in this
	 * case for sure.
	 *
	 * However if the dev root is fine, but the tree itself is corrupted
	 * we'd still fail to mount. This verification is only to make sure
	 * writes can happen safely, so instead just bypass this check
	 * completely in the case of IGNOREBADROOTS.
	 */
	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
		return 0;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	while (1) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_dev_extent *dext;
		int slot = path->slots[0];
		u64 chunk_offset;
		u64 physical_offset;
		u64 physical_len;
		u64 devid;

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type != BTRFS_DEV_EXTENT_KEY)
			break;
		devid = key.objectid;
		physical_offset = key.offset;

		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
		physical_len = btrfs_dev_extent_length(leaf, dext);

		/* Check if this dev extent overlaps with the previous one */
		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
			btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
				  devid, physical_offset, prev_dev_ext_end);
			ret = -EUCLEAN;
			goto out;
		}

		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
					    physical_offset, physical_len);
		if (ret < 0)
			goto out;
		prev_devid = devid;
		prev_dev_ext_end = physical_offset + physical_len;

		ret = btrfs_next_item(root, path);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = 0;
			break;
		}
	}

	/* Ensure all chunks have corresponding dev extents */
	ret = verify_chunk_dev_extent_mapping(fs_info);
out:
	btrfs_free_path(path);
	return ret;
}

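/*
 * Note on the overlap check in btrfs_verify_dev_extents(): dev extent items
 * are keyed by (devid, physical offset), so walking the device tree in key
 * order visits each device's extents sorted by physical offset. Tracking only
 * the previous item's end offset is therefore enough to detect any overlap
 * between neighbouring dev extents on the same device.
 */
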
/*
 * Check whether the given block group or device is pinned by any inode being
 * used as a swapfile.
 */
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
	struct btrfs_swapfile_pin *sp;
	struct rb_node *node;

	spin_lock(&fs_info->swapfile_pins_lock);
	node = fs_info->swapfile_pins.rb_node;
	while (node) {
		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (ptr < sp->ptr)
			node = node->rb_left;
		else if (ptr > sp->ptr)
			node = node->rb_right;
		else
			break;
	}
	spin_unlock(&fs_info->swapfile_pins_lock);
	return node != NULL;
}

static int relocating_repair_kthread(void *data)
{
	struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 target;
	int ret = 0;

	target = cache->start;
	btrfs_put_block_group(cache);

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		btrfs_info(fs_info,
			   "zoned: skip relocating block group %llu to repair: EBUSY",
			   target);
		return -EBUSY;
	}

	mutex_lock(&fs_info->reclaim_bgs_lock);

	/* Ensure block group still exists */
	cache = btrfs_lookup_block_group(fs_info, target);
	if (!cache)
		goto out;

	if (!cache->relocating_repair)
		goto out;

	ret = btrfs_may_alloc_data_chunk(fs_info, target);
	if (ret < 0)
		goto out;

	btrfs_info(fs_info,
		   "zoned: relocating block group %llu to repair IO failure",
		   target);
	ret = btrfs_relocate_chunk(fs_info, target);

out:
	if (cache)
		btrfs_put_block_group(cache);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_exclop_finish(fs_info);

	return ret;
}

int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
{
	struct btrfs_block_group *cache;

	/* Do not attempt to repair in degraded state */
	if (btrfs_test_opt(fs_info, DEGRADED))
		return 0;

	cache = btrfs_lookup_block_group(fs_info, logical);
	if (!cache)
		return 0;

	spin_lock(&cache->lock);
	if (cache->relocating_repair) {
		spin_unlock(&cache->lock);
		btrfs_put_block_group(cache);
		return 0;
	}
	cache->relocating_repair = 1;
	spin_unlock(&cache->lock);

	kthread_run(relocating_repair_kthread, cache,
		    "btrfs-relocating-repair");

	return 0;
}