1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/bio.h> 8 #include <linux/slab.h> 9 #include <linux/buffer_head.h> 10 #include <linux/blkdev.h> 11 #include <linux/ratelimit.h> 12 #include <linux/kthread.h> 13 #include <linux/raid/pq.h> 14 #include <linux/semaphore.h> 15 #include <linux/uuid.h> 16 #include <linux/list_sort.h> 17 #include "ctree.h" 18 #include "extent_map.h" 19 #include "disk-io.h" 20 #include "transaction.h" 21 #include "print-tree.h" 22 #include "volumes.h" 23 #include "raid56.h" 24 #include "async-thread.h" 25 #include "check-integrity.h" 26 #include "rcu-string.h" 27 #include "math.h" 28 #include "dev-replace.h" 29 #include "sysfs.h" 30 31 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 32 [BTRFS_RAID_RAID10] = { 33 .sub_stripes = 2, 34 .dev_stripes = 1, 35 .devs_max = 0, /* 0 == as many as possible */ 36 .devs_min = 4, 37 .tolerated_failures = 1, 38 .devs_increment = 2, 39 .ncopies = 2, 40 .nparity = 0, 41 .raid_name = "raid10", 42 .bg_flag = BTRFS_BLOCK_GROUP_RAID10, 43 .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, 44 }, 45 [BTRFS_RAID_RAID1] = { 46 .sub_stripes = 1, 47 .dev_stripes = 1, 48 .devs_max = 2, 49 .devs_min = 2, 50 .tolerated_failures = 1, 51 .devs_increment = 2, 52 .ncopies = 2, 53 .nparity = 0, 54 .raid_name = "raid1", 55 .bg_flag = BTRFS_BLOCK_GROUP_RAID1, 56 .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, 57 }, 58 [BTRFS_RAID_DUP] = { 59 .sub_stripes = 1, 60 .dev_stripes = 2, 61 .devs_max = 1, 62 .devs_min = 1, 63 .tolerated_failures = 0, 64 .devs_increment = 1, 65 .ncopies = 2, 66 .nparity = 0, 67 .raid_name = "dup", 68 .bg_flag = BTRFS_BLOCK_GROUP_DUP, 69 .mindev_error = 0, 70 }, 71 [BTRFS_RAID_RAID0] = { 72 .sub_stripes = 1, 73 .dev_stripes = 1, 74 .devs_max = 0, 75 .devs_min = 2, 76 .tolerated_failures = 0, 77 .devs_increment = 1, 78 .ncopies = 1, 79 .nparity = 0, 80 .raid_name = "raid0", 81 .bg_flag = BTRFS_BLOCK_GROUP_RAID0, 82 .mindev_error = 0, 83 }, 84 [BTRFS_RAID_SINGLE] = { 85 .sub_stripes = 1, 86 .dev_stripes = 1, 87 .devs_max = 1, 88 .devs_min = 1, 89 .tolerated_failures = 0, 90 .devs_increment = 1, 91 .ncopies = 1, 92 .nparity = 0, 93 .raid_name = "single", 94 .bg_flag = 0, 95 .mindev_error = 0, 96 }, 97 [BTRFS_RAID_RAID5] = { 98 .sub_stripes = 1, 99 .dev_stripes = 1, 100 .devs_max = 0, 101 .devs_min = 2, 102 .tolerated_failures = 1, 103 .devs_increment = 1, 104 .ncopies = 1, 105 .nparity = 1, 106 .raid_name = "raid5", 107 .bg_flag = BTRFS_BLOCK_GROUP_RAID5, 108 .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, 109 }, 110 [BTRFS_RAID_RAID6] = { 111 .sub_stripes = 1, 112 .dev_stripes = 1, 113 .devs_max = 0, 114 .devs_min = 3, 115 .tolerated_failures = 2, 116 .devs_increment = 1, 117 .ncopies = 1, 118 .nparity = 2, 119 .raid_name = "raid6", 120 .bg_flag = BTRFS_BLOCK_GROUP_RAID6, 121 .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, 122 }, 123 }; 124 125 const char *get_raid_name(enum btrfs_raid_types type) 126 { 127 if (type >= BTRFS_NR_RAID_TYPES) 128 return NULL; 129 130 return btrfs_raid_array[type].raid_name; 131 } 132 133 /* 134 * Fill @buf with textual description of @bg_flags, no more than @size_buf 135 * bytes including terminating null byte. 
136 */ 137 void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) 138 { 139 int i; 140 int ret; 141 char *bp = buf; 142 u64 flags = bg_flags; 143 u32 size_bp = size_buf; 144 145 if (!flags) { 146 strcpy(bp, "NONE"); 147 return; 148 } 149 150 #define DESCRIBE_FLAG(flag, desc) \ 151 do { \ 152 if (flags & (flag)) { \ 153 ret = snprintf(bp, size_bp, "%s|", (desc)); \ 154 if (ret < 0 || ret >= size_bp) \ 155 goto out_overflow; \ 156 size_bp -= ret; \ 157 bp += ret; \ 158 flags &= ~(flag); \ 159 } \ 160 } while (0) 161 162 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); 163 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); 164 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); 165 166 DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); 167 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 168 DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, 169 btrfs_raid_array[i].raid_name); 170 #undef DESCRIBE_FLAG 171 172 if (flags) { 173 ret = snprintf(bp, size_bp, "0x%llx|", flags); 174 size_bp -= ret; 175 } 176 177 if (size_bp < size_buf) 178 buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ 179 180 /* 181 * The text is trimmed, it's up to the caller to provide sufficiently 182 * large buffer 183 */ 184 out_overflow:; 185 } 186 187 static int init_first_rw_device(struct btrfs_trans_handle *trans, 188 struct btrfs_fs_info *fs_info); 189 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); 190 static void __btrfs_reset_dev_stats(struct btrfs_device *dev); 191 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); 192 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); 193 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 194 enum btrfs_map_op op, 195 u64 logical, u64 *length, 196 struct btrfs_bio **bbio_ret, 197 int mirror_num, int need_raid_map); 198 199 /* 200 * Device locking 201 * ============== 202 * 203 * There are several mutexes that protect manipulation of devices and low-level 204 * structures like chunks but not block groups, extents or files 205 * 206 * uuid_mutex (global lock) 207 * ------------------------ 208 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from 209 * the SCAN_DEV ioctl registration or from mount either implicitly (the first 210 * device) or requested by the device= mount option 211 * 212 * the mutex can be very coarse and can cover long-running operations 213 * 214 * protects: updates to fs_devices counters like missing devices, rw devices, 215 * seeding, structure cloning, opening/closing devices at mount/umount time 216 * 217 * global::fs_devs - add, remove, updates to the global list 218 * 219 * does not protect: manipulation of the fs_devices::devices list! 220 * 221 * btrfs_device::name - renames (write side), read is RCU 222 * 223 * fs_devices::device_list_mutex (per-fs, with RCU) 224 * ------------------------------------------------ 225 * protects updates to fs_devices::devices, ie. 
adding and deleting 226 * 227 * simple list traversal with read-only actions can be done with RCU protection 228 * 229 * may be used to exclude some operations from running concurrently without any 230 * modifications to the list (see write_all_supers) 231 * 232 * balance_mutex 233 * ------------- 234 * protects balance structures (status, state) and context accessed from 235 * several places (internally, ioctl) 236 * 237 * chunk_mutex 238 * ----------- 239 * protects chunks, adding or removing during allocation, trim or when a new 240 * device is added/removed 241 * 242 * cleaner_mutex 243 * ------------- 244 * a big lock that is held by the cleaner thread and prevents running subvolume 245 * cleaning together with relocation or delayed iputs 246 * 247 * 248 * Lock nesting 249 * ============ 250 * 251 * uuid_mutex 252 * volume_mutex 253 * device_list_mutex 254 * chunk_mutex 255 * balance_mutex 256 * 257 * 258 * Exclusive operations, BTRFS_FS_EXCL_OP 259 * ====================================== 260 * 261 * Maintains the exclusivity of the following operations that apply to the 262 * whole filesystem and cannot run in parallel. 263 * 264 * - Balance (*) 265 * - Device add 266 * - Device remove 267 * - Device replace (*) 268 * - Resize 269 * 270 * The device operations (as above) can be in one of the following states: 271 * 272 * - Running state 273 * - Paused state 274 * - Completed state 275 * 276 * Only device operations marked with (*) can go into the Paused state for the 277 * following reasons: 278 * 279 * - ioctl (only Balance can be Paused through ioctl) 280 * - filesystem remounted as read-only 281 * - filesystem unmounted and mounted as read-only 282 * - system power-cycle and filesystem mounted as read-only 283 * - filesystem or device errors leading to forced read-only 284 * 285 * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations. 286 * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set. 287 * A device operation in Paused or Running state can be canceled or resumed 288 * either by ioctl (Balance only) or when remounted as read-write. 289 * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or 290 * completed. 291 */ 292 293 DEFINE_MUTEX(uuid_mutex); 294 static LIST_HEAD(fs_uuids); 295 struct list_head *btrfs_get_fs_uuids(void) 296 { 297 return &fs_uuids; 298 } 299 300 /* 301 * alloc_fs_devices - allocate struct btrfs_fs_devices 302 * @fsid: if not NULL, copy the UUID to fs_devices::fsid 303 * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid 304 * 305 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). 306 * The returned struct is not linked onto any lists and can be destroyed with 307 * kfree() right away. 
308 */ 309 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, 310 const u8 *metadata_fsid) 311 { 312 struct btrfs_fs_devices *fs_devs; 313 314 fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); 315 if (!fs_devs) 316 return ERR_PTR(-ENOMEM); 317 318 mutex_init(&fs_devs->device_list_mutex); 319 320 INIT_LIST_HEAD(&fs_devs->devices); 321 INIT_LIST_HEAD(&fs_devs->resized_devices); 322 INIT_LIST_HEAD(&fs_devs->alloc_list); 323 INIT_LIST_HEAD(&fs_devs->fs_list); 324 if (fsid) 325 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); 326 327 if (metadata_fsid) 328 memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE); 329 else if (fsid) 330 memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); 331 332 return fs_devs; 333 } 334 335 void btrfs_free_device(struct btrfs_device *device) 336 { 337 rcu_string_free(device->name); 338 bio_put(device->flush_bio); 339 kfree(device); 340 } 341 342 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 343 { 344 struct btrfs_device *device; 345 WARN_ON(fs_devices->opened); 346 while (!list_empty(&fs_devices->devices)) { 347 device = list_entry(fs_devices->devices.next, 348 struct btrfs_device, dev_list); 349 list_del(&device->dev_list); 350 btrfs_free_device(device); 351 } 352 kfree(fs_devices); 353 } 354 355 static void btrfs_kobject_uevent(struct block_device *bdev, 356 enum kobject_action action) 357 { 358 int ret; 359 360 ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); 361 if (ret) 362 pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", 363 action, 364 kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), 365 &disk_to_dev(bdev->bd_disk)->kobj); 366 } 367 368 void __exit btrfs_cleanup_fs_uuids(void) 369 { 370 struct btrfs_fs_devices *fs_devices; 371 372 while (!list_empty(&fs_uuids)) { 373 fs_devices = list_entry(fs_uuids.next, 374 struct btrfs_fs_devices, fs_list); 375 list_del(&fs_devices->fs_list); 376 free_fs_devices(fs_devices); 377 } 378 } 379 380 /* 381 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error. 382 * Returned struct is not linked onto any lists and must be destroyed using 383 * btrfs_free_device. 384 */ 385 static struct btrfs_device *__alloc_device(void) 386 { 387 struct btrfs_device *dev; 388 389 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 390 if (!dev) 391 return ERR_PTR(-ENOMEM); 392 393 /* 394 * Preallocate a bio that's always going to be used for flushing device 395 * barriers and matches the device lifespan 396 */ 397 dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL); 398 if (!dev->flush_bio) { 399 kfree(dev); 400 return ERR_PTR(-ENOMEM); 401 } 402 403 INIT_LIST_HEAD(&dev->dev_list); 404 INIT_LIST_HEAD(&dev->dev_alloc_list); 405 INIT_LIST_HEAD(&dev->resized_list); 406 407 spin_lock_init(&dev->io_lock); 408 409 atomic_set(&dev->reada_in_flight, 0); 410 atomic_set(&dev->dev_stats_ccnt, 0); 411 btrfs_device_data_ordered_init(dev); 412 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 413 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 414 415 return dev; 416 } 417 418 /* 419 * Find a device specified by @devid or @uuid in the list of @fs_devices, or 420 * return NULL. 421 * 422 * If devid and uuid are both specified, the match must be exact, otherwise 423 * only devid is used. 
424 */ 425 static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices, 426 u64 devid, const u8 *uuid) 427 { 428 struct btrfs_device *dev; 429 430 list_for_each_entry(dev, &fs_devices->devices, dev_list) { 431 if (dev->devid == devid && 432 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 433 return dev; 434 } 435 } 436 return NULL; 437 } 438 439 static noinline struct btrfs_fs_devices *find_fsid( 440 const u8 *fsid, const u8 *metadata_fsid) 441 { 442 struct btrfs_fs_devices *fs_devices; 443 444 ASSERT(fsid); 445 446 if (metadata_fsid) { 447 /* 448 * Handle scanned device having completed its fsid change but 449 * belonging to a fs_devices that was created by first scanning 450 * a device which didn't have its fsid/metadata_uuid changed 451 * at all and the CHANGING_FSID_V2 flag set. 452 */ 453 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 454 if (fs_devices->fsid_change && 455 memcmp(metadata_fsid, fs_devices->fsid, 456 BTRFS_FSID_SIZE) == 0 && 457 memcmp(fs_devices->fsid, fs_devices->metadata_uuid, 458 BTRFS_FSID_SIZE) == 0) { 459 return fs_devices; 460 } 461 } 462 /* 463 * Handle scanned device having completed its fsid change but 464 * belonging to a fs_devices that was created by a device that 465 * has an outdated pair of fsid/metadata_uuid and 466 * CHANGING_FSID_V2 flag set. 467 */ 468 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 469 if (fs_devices->fsid_change && 470 memcmp(fs_devices->metadata_uuid, 471 fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && 472 memcmp(metadata_fsid, fs_devices->metadata_uuid, 473 BTRFS_FSID_SIZE) == 0) { 474 return fs_devices; 475 } 476 } 477 } 478 479 /* Handle non-split brain cases */ 480 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 481 if (metadata_fsid) { 482 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0 483 && memcmp(metadata_fsid, fs_devices->metadata_uuid, 484 BTRFS_FSID_SIZE) == 0) 485 return fs_devices; 486 } else { 487 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 488 return fs_devices; 489 } 490 } 491 return NULL; 492 } 493 494 static int 495 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, 496 int flush, struct block_device **bdev, 497 struct buffer_head **bh) 498 { 499 int ret; 500 501 *bdev = blkdev_get_by_path(device_path, flags, holder); 502 503 if (IS_ERR(*bdev)) { 504 ret = PTR_ERR(*bdev); 505 goto error; 506 } 507 508 if (flush) 509 filemap_write_and_wait((*bdev)->bd_inode->i_mapping); 510 ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); 511 if (ret) { 512 blkdev_put(*bdev, flags); 513 goto error; 514 } 515 invalidate_bdev(*bdev); 516 *bh = btrfs_read_dev_super(*bdev); 517 if (IS_ERR(*bh)) { 518 ret = PTR_ERR(*bh); 519 blkdev_put(*bdev, flags); 520 goto error; 521 } 522 523 return 0; 524 525 error: 526 *bdev = NULL; 527 *bh = NULL; 528 return ret; 529 } 530 531 static void requeue_list(struct btrfs_pending_bios *pending_bios, 532 struct bio *head, struct bio *tail) 533 { 534 535 struct bio *old_head; 536 537 old_head = pending_bios->head; 538 pending_bios->head = head; 539 if (pending_bios->tail) 540 tail->bi_next = old_head; 541 else 542 pending_bios->tail = tail; 543 } 544 545 /* 546 * we try to collect pending bios for a device so we don't get a large 547 * number of procs sending bios down to the same device. This greatly 548 * improves the schedulers ability to collect and merge the bios. 549 * 550 * But, it also turns into a long list of bios to process and that is sure 551 * to eventually make the worker thread block. 
The solution here is to 552 * make some progress and then put this work struct back at the end of 553 * the list if the block device is congested. This way, multiple devices 554 * can make progress from a single worker thread. 555 */ 556 static noinline void run_scheduled_bios(struct btrfs_device *device) 557 { 558 struct btrfs_fs_info *fs_info = device->fs_info; 559 struct bio *pending; 560 struct backing_dev_info *bdi; 561 struct btrfs_pending_bios *pending_bios; 562 struct bio *tail; 563 struct bio *cur; 564 int again = 0; 565 unsigned long num_run; 566 unsigned long batch_run = 0; 567 unsigned long last_waited = 0; 568 int force_reg = 0; 569 int sync_pending = 0; 570 struct blk_plug plug; 571 572 /* 573 * this function runs all the bios we've collected for 574 * a particular device. We don't want to wander off to 575 * another device without first sending all of these down. 576 * So, setup a plug here and finish it off before we return 577 */ 578 blk_start_plug(&plug); 579 580 bdi = device->bdev->bd_bdi; 581 582 loop: 583 spin_lock(&device->io_lock); 584 585 loop_lock: 586 num_run = 0; 587 588 /* take all the bios off the list at once and process them 589 * later on (without the lock held). But, remember the 590 * tail and other pointers so the bios can be properly reinserted 591 * into the list if we hit congestion 592 */ 593 if (!force_reg && device->pending_sync_bios.head) { 594 pending_bios = &device->pending_sync_bios; 595 force_reg = 1; 596 } else { 597 pending_bios = &device->pending_bios; 598 force_reg = 0; 599 } 600 601 pending = pending_bios->head; 602 tail = pending_bios->tail; 603 WARN_ON(pending && !tail); 604 605 /* 606 * if pending was null this time around, no bios need processing 607 * at all and we can stop. Otherwise it'll loop back up again 608 * and do an additional check so no bios are missed. 609 * 610 * device->running_pending is used to synchronize with the 611 * schedule_bio code. 612 */ 613 if (device->pending_sync_bios.head == NULL && 614 device->pending_bios.head == NULL) { 615 again = 0; 616 device->running_pending = 0; 617 } else { 618 again = 1; 619 device->running_pending = 1; 620 } 621 622 pending_bios->head = NULL; 623 pending_bios->tail = NULL; 624 625 spin_unlock(&device->io_lock); 626 627 while (pending) { 628 629 rmb(); 630 /* we want to work on both lists, but do more bios on the 631 * sync list than the regular list 632 */ 633 if ((num_run > 32 && 634 pending_bios != &device->pending_sync_bios && 635 device->pending_sync_bios.head) || 636 (num_run > 64 && pending_bios == &device->pending_sync_bios && 637 device->pending_bios.head)) { 638 spin_lock(&device->io_lock); 639 requeue_list(pending_bios, pending, tail); 640 goto loop_lock; 641 } 642 643 cur = pending; 644 pending = pending->bi_next; 645 cur->bi_next = NULL; 646 647 BUG_ON(atomic_read(&cur->__bi_cnt) == 0); 648 649 /* 650 * if we're doing the sync list, record that our 651 * plug has some sync requests on it 652 * 653 * If we're doing the regular list and there are 654 * sync requests sitting around, unplug before 655 * we add more 656 */ 657 if (pending_bios == &device->pending_sync_bios) { 658 sync_pending = 1; 659 } else if (sync_pending) { 660 blk_finish_plug(&plug); 661 blk_start_plug(&plug); 662 sync_pending = 0; 663 } 664 665 btrfsic_submit_bio(cur); 666 num_run++; 667 batch_run++; 668 669 cond_resched(); 670 671 /* 672 * we made progress, there is more work to do and the bdi 673 * is now congested. 
		 * Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop. So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				cond_resched();
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_queue_work(fs_info->submit_workers,
					 &device->work);
			goto done;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}

/*
 * Search and remove all stale devices (devices which are not mounted).
 * When both inputs are NULL, it will search and release all stale devices.
 *
 * @path:		Optional. When provided, it will release all unmounted
 *			devices matching this path only.
 * @skip_device:	Optional. Will skip this device when searching for the
 *			stale devices.
744 */ 745 static void btrfs_free_stale_devices(const char *path, 746 struct btrfs_device *skip_device) 747 { 748 struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; 749 struct btrfs_device *device, *tmp_device; 750 751 list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { 752 mutex_lock(&fs_devices->device_list_mutex); 753 if (fs_devices->opened) { 754 mutex_unlock(&fs_devices->device_list_mutex); 755 continue; 756 } 757 758 list_for_each_entry_safe(device, tmp_device, 759 &fs_devices->devices, dev_list) { 760 int not_found = 0; 761 762 if (skip_device && skip_device == device) 763 continue; 764 if (path && !device->name) 765 continue; 766 767 rcu_read_lock(); 768 if (path) 769 not_found = strcmp(rcu_str_deref(device->name), 770 path); 771 rcu_read_unlock(); 772 if (not_found) 773 continue; 774 775 /* delete the stale device */ 776 fs_devices->num_devices--; 777 list_del(&device->dev_list); 778 btrfs_free_device(device); 779 780 if (fs_devices->num_devices == 0) 781 break; 782 } 783 mutex_unlock(&fs_devices->device_list_mutex); 784 if (fs_devices->num_devices == 0) { 785 btrfs_sysfs_remove_fsid(fs_devices); 786 list_del(&fs_devices->fs_list); 787 free_fs_devices(fs_devices); 788 } 789 } 790 } 791 792 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, 793 struct btrfs_device *device, fmode_t flags, 794 void *holder) 795 { 796 struct request_queue *q; 797 struct block_device *bdev; 798 struct buffer_head *bh; 799 struct btrfs_super_block *disk_super; 800 u64 devid; 801 int ret; 802 803 if (device->bdev) 804 return -EINVAL; 805 if (!device->name) 806 return -EINVAL; 807 808 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, 809 &bdev, &bh); 810 if (ret) 811 return ret; 812 813 disk_super = (struct btrfs_super_block *)bh->b_data; 814 devid = btrfs_stack_device_id(&disk_super->dev_item); 815 if (devid != device->devid) 816 goto error_brelse; 817 818 if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) 819 goto error_brelse; 820 821 device->generation = btrfs_super_generation(disk_super); 822 823 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 824 if (btrfs_super_incompat_flags(disk_super) & 825 BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { 826 pr_err( 827 "BTRFS: Invalid seeding and uuid-changed device detected\n"); 828 goto error_brelse; 829 } 830 831 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 832 fs_devices->seeding = 1; 833 } else { 834 if (bdev_read_only(bdev)) 835 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 836 else 837 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 838 } 839 840 q = bdev_get_queue(bdev); 841 if (!blk_queue_nonrot(q)) 842 fs_devices->rotating = 1; 843 844 device->bdev = bdev; 845 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 846 device->mode = flags; 847 848 fs_devices->open_devices++; 849 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 850 device->devid != BTRFS_DEV_REPLACE_DEVID) { 851 fs_devices->rw_devices++; 852 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); 853 } 854 brelse(bh); 855 856 return 0; 857 858 error_brelse: 859 brelse(bh); 860 blkdev_put(bdev, flags); 861 862 return -EINVAL; 863 } 864 865 /* 866 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices 867 * being created with a disk that has already completed its fsid change. 
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
			return fs_devices;
		}
	}

	return NULL;
}

static struct btrfs_fs_devices *find_fsid_changed(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle the case where the scanned device is part of an fs that had
	 * multiple successful changes of FSID but the scanned device did not
	 * observe them, meaning its fsid will be different than theirs.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->fsid,
			   BTRFS_FSID_SIZE) != 0) {
			return fs_devices;
		}
	}

	return NULL;
}
/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

	if (fsid_change_in_progress) {
		if (!has_metadata_uuid) {
			/*
			 * When we have an image which has CHANGING_FSID_V2 set
			 * it might belong to either a filesystem which has
			 * disks with completed fsid change or it might belong
			 * to fs with no UUID changes in effect, handle both.
			 */
			fs_devices = find_fsid_inprogress(disk_super);
			if (!fs_devices)
				fs_devices = find_fsid(disk_super->fsid, NULL);
		} else {
			fs_devices = find_fsid_changed(disk_super);
		}
	} else if (has_metadata_uuid) {
		fs_devices = find_fsid(disk_super->fsid,
				       disk_super->metadata_uuid);
	} else {
		fs_devices = find_fsid(disk_super->fsid, NULL);
	}

	if (!fs_devices) {
		if (has_metadata_uuid)
			fs_devices = alloc_fs_devices(disk_super->fsid,
						      disk_super->metadata_uuid);
		else
			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		mutex_lock(&fs_devices->device_list_mutex);
		device = find_device(fs_devices, devid,
				disk_super->dev_item.uuid);

		/*
		 * If this disk has been pulled into an fs devices created by
		 * a device which had the CHANGING_FSID_V2 flag then replace the
		 * metadata_uuid/fsid values of the fs_devices.
		 */
		if (has_metadata_uuid && fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
					BTRFS_FSID_SIZE);
			memcpy(fs_devices->metadata_uuid,
					disk_super->metadata_uuid, BTRFS_FSID_SIZE);

			fs_devices->fsid_change = false;
		}
	}

	if (!device) {
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
				disk_super->label, devid, found_transid, path);
		else
			pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
				disk_super->fsid, devid, found_transid, path);

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When the FS is already mounted:
		 * 1. If you are here and the device->name is NULL, that
		 *    means this device was missing at the time of FS mount.
		 * 2. If you are here and the device->name is different
		 *    from 'path', that means either
		 *	a. The same device disappeared and reappeared with
		 *	   a different name, or
		 *	b. The missing disk which was replaced has
		 *	   reappeared now.
		 *
		 * We must allow 1 and 2a above, but 2b would be spurious
		 * and unintentional.
		 *
		 * Further, in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transactions while it was away, and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted. We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with the same uuid and devid. We keep the
			 * one with the larger generation number or the
			 * last-in if generations are equal.
1059 */ 1060 mutex_unlock(&fs_devices->device_list_mutex); 1061 return ERR_PTR(-EEXIST); 1062 } 1063 1064 /* 1065 * We are going to replace the device path for a given devid, 1066 * make sure it's the same device if the device is mounted 1067 */ 1068 if (device->bdev) { 1069 struct block_device *path_bdev; 1070 1071 path_bdev = lookup_bdev(path); 1072 if (IS_ERR(path_bdev)) { 1073 mutex_unlock(&fs_devices->device_list_mutex); 1074 return ERR_CAST(path_bdev); 1075 } 1076 1077 if (device->bdev != path_bdev) { 1078 bdput(path_bdev); 1079 mutex_unlock(&fs_devices->device_list_mutex); 1080 btrfs_warn_in_rcu(device->fs_info, 1081 "duplicate device fsid:devid for %pU:%llu old:%s new:%s", 1082 disk_super->fsid, devid, 1083 rcu_str_deref(device->name), path); 1084 return ERR_PTR(-EEXIST); 1085 } 1086 bdput(path_bdev); 1087 btrfs_info_in_rcu(device->fs_info, 1088 "device fsid %pU devid %llu moved old:%s new:%s", 1089 disk_super->fsid, devid, 1090 rcu_str_deref(device->name), path); 1091 } 1092 1093 name = rcu_string_strdup(path, GFP_NOFS); 1094 if (!name) { 1095 mutex_unlock(&fs_devices->device_list_mutex); 1096 return ERR_PTR(-ENOMEM); 1097 } 1098 rcu_string_free(device->name); 1099 rcu_assign_pointer(device->name, name); 1100 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 1101 fs_devices->missing_devices--; 1102 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 1103 } 1104 } 1105 1106 /* 1107 * Unmount does not free the btrfs_device struct but would zero 1108 * generation along with most of the other members. So just update 1109 * it back. We need it to pick the disk with largest generation 1110 * (as above). 1111 */ 1112 if (!fs_devices->opened) { 1113 device->generation = found_transid; 1114 fs_devices->latest_generation = max_t(u64, found_transid, 1115 fs_devices->latest_generation); 1116 } 1117 1118 fs_devices->total_devices = btrfs_super_num_devices(disk_super); 1119 1120 mutex_unlock(&fs_devices->device_list_mutex); 1121 return device; 1122 } 1123 1124 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 1125 { 1126 struct btrfs_fs_devices *fs_devices; 1127 struct btrfs_device *device; 1128 struct btrfs_device *orig_dev; 1129 1130 fs_devices = alloc_fs_devices(orig->fsid, NULL); 1131 if (IS_ERR(fs_devices)) 1132 return fs_devices; 1133 1134 mutex_lock(&orig->device_list_mutex); 1135 fs_devices->total_devices = orig->total_devices; 1136 1137 /* We have held the volume lock, it is safe to get the devices. */ 1138 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 1139 struct rcu_string *name; 1140 1141 device = btrfs_alloc_device(NULL, &orig_dev->devid, 1142 orig_dev->uuid); 1143 if (IS_ERR(device)) 1144 goto error; 1145 1146 /* 1147 * This is ok to do without rcu read locked because we hold the 1148 * uuid mutex so nothing we touch in here is going to disappear. 
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
			if (!name) {
				btrfs_free_device(device);
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

/*
 * After we have read the system tree and know devids belonging to
 * this filesystem, remove the device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
	struct btrfs_device *device, *next;
	struct btrfs_device *latest_dev = NULL;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
			     &device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    (!latest_dev ||
			     device->generation > latest_dev->generation)) {
				latest_dev = device;
			}
			continue;
		}

		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
			 * In the second step, the dev_replace state is
			 * read from the device tree and it is known
			 * whether the procedure is really active or
			 * not, which means whether this device is
			 * used or whether it should be removed.
1206 */ 1207 if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 1208 &device->dev_state)) { 1209 continue; 1210 } 1211 } 1212 if (device->bdev) { 1213 blkdev_put(device->bdev, device->mode); 1214 device->bdev = NULL; 1215 fs_devices->open_devices--; 1216 } 1217 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 1218 list_del_init(&device->dev_alloc_list); 1219 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 1220 if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 1221 &device->dev_state)) 1222 fs_devices->rw_devices--; 1223 } 1224 list_del_init(&device->dev_list); 1225 fs_devices->num_devices--; 1226 btrfs_free_device(device); 1227 } 1228 1229 if (fs_devices->seed) { 1230 fs_devices = fs_devices->seed; 1231 goto again; 1232 } 1233 1234 fs_devices->latest_bdev = latest_dev->bdev; 1235 1236 mutex_unlock(&uuid_mutex); 1237 } 1238 1239 static void free_device_rcu(struct rcu_head *head) 1240 { 1241 struct btrfs_device *device; 1242 1243 device = container_of(head, struct btrfs_device, rcu); 1244 btrfs_free_device(device); 1245 } 1246 1247 static void btrfs_close_bdev(struct btrfs_device *device) 1248 { 1249 if (!device->bdev) 1250 return; 1251 1252 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 1253 sync_blockdev(device->bdev); 1254 invalidate_bdev(device->bdev); 1255 } 1256 1257 blkdev_put(device->bdev, device->mode); 1258 } 1259 1260 static void btrfs_close_one_device(struct btrfs_device *device) 1261 { 1262 struct btrfs_fs_devices *fs_devices = device->fs_devices; 1263 struct btrfs_device *new_device; 1264 struct rcu_string *name; 1265 1266 if (device->bdev) 1267 fs_devices->open_devices--; 1268 1269 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 1270 device->devid != BTRFS_DEV_REPLACE_DEVID) { 1271 list_del_init(&device->dev_alloc_list); 1272 fs_devices->rw_devices--; 1273 } 1274 1275 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 1276 fs_devices->missing_devices--; 1277 1278 btrfs_close_bdev(device); 1279 1280 new_device = btrfs_alloc_device(NULL, &device->devid, 1281 device->uuid); 1282 BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ 1283 1284 /* Safe because we are under uuid_mutex */ 1285 if (device->name) { 1286 name = rcu_string_strdup(device->name->str, GFP_NOFS); 1287 BUG_ON(!name); /* -ENOMEM */ 1288 rcu_assign_pointer(new_device->name, name); 1289 } 1290 1291 list_replace_rcu(&device->dev_list, &new_device->dev_list); 1292 new_device->fs_devices = device->fs_devices; 1293 1294 call_rcu(&device->rcu, free_device_rcu); 1295 } 1296 1297 static int close_fs_devices(struct btrfs_fs_devices *fs_devices) 1298 { 1299 struct btrfs_device *device, *tmp; 1300 1301 if (--fs_devices->opened > 0) 1302 return 0; 1303 1304 mutex_lock(&fs_devices->device_list_mutex); 1305 list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { 1306 btrfs_close_one_device(device); 1307 } 1308 mutex_unlock(&fs_devices->device_list_mutex); 1309 1310 WARN_ON(fs_devices->open_devices); 1311 WARN_ON(fs_devices->rw_devices); 1312 fs_devices->opened = 0; 1313 fs_devices->seeding = 0; 1314 1315 return 0; 1316 } 1317 1318 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 1319 { 1320 struct btrfs_fs_devices *seed_devices = NULL; 1321 int ret; 1322 1323 mutex_lock(&uuid_mutex); 1324 ret = close_fs_devices(fs_devices); 1325 if (!fs_devices->opened) { 1326 seed_devices = fs_devices->seed; 1327 fs_devices->seed = NULL; 1328 } 1329 mutex_unlock(&uuid_mutex); 1330 1331 while (seed_devices) { 1332 fs_devices = seed_devices; 1333 seed_devices = 
fs_devices->seed; 1334 close_fs_devices(fs_devices); 1335 free_fs_devices(fs_devices); 1336 } 1337 return ret; 1338 } 1339 1340 static int open_fs_devices(struct btrfs_fs_devices *fs_devices, 1341 fmode_t flags, void *holder) 1342 { 1343 struct btrfs_device *device; 1344 struct btrfs_device *latest_dev = NULL; 1345 int ret = 0; 1346 1347 flags |= FMODE_EXCL; 1348 1349 list_for_each_entry(device, &fs_devices->devices, dev_list) { 1350 /* Just open everything we can; ignore failures here */ 1351 if (btrfs_open_one_device(fs_devices, device, flags, holder)) 1352 continue; 1353 1354 if (!latest_dev || 1355 device->generation > latest_dev->generation) 1356 latest_dev = device; 1357 } 1358 if (fs_devices->open_devices == 0) { 1359 ret = -EINVAL; 1360 goto out; 1361 } 1362 fs_devices->opened = 1; 1363 fs_devices->latest_bdev = latest_dev->bdev; 1364 fs_devices->total_rw_bytes = 0; 1365 out: 1366 return ret; 1367 } 1368 1369 static int devid_cmp(void *priv, struct list_head *a, struct list_head *b) 1370 { 1371 struct btrfs_device *dev1, *dev2; 1372 1373 dev1 = list_entry(a, struct btrfs_device, dev_list); 1374 dev2 = list_entry(b, struct btrfs_device, dev_list); 1375 1376 if (dev1->devid < dev2->devid) 1377 return -1; 1378 else if (dev1->devid > dev2->devid) 1379 return 1; 1380 return 0; 1381 } 1382 1383 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 1384 fmode_t flags, void *holder) 1385 { 1386 int ret; 1387 1388 lockdep_assert_held(&uuid_mutex); 1389 1390 mutex_lock(&fs_devices->device_list_mutex); 1391 if (fs_devices->opened) { 1392 fs_devices->opened++; 1393 ret = 0; 1394 } else { 1395 list_sort(NULL, &fs_devices->devices, devid_cmp); 1396 ret = open_fs_devices(fs_devices, flags, holder); 1397 } 1398 mutex_unlock(&fs_devices->device_list_mutex); 1399 1400 return ret; 1401 } 1402 1403 static void btrfs_release_disk_super(struct page *page) 1404 { 1405 kunmap(page); 1406 put_page(page); 1407 } 1408 1409 static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, 1410 struct page **page, 1411 struct btrfs_super_block **disk_super) 1412 { 1413 void *p; 1414 pgoff_t index; 1415 1416 /* make sure our super fits in the device */ 1417 if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) 1418 return 1; 1419 1420 /* make sure our super fits in the page */ 1421 if (sizeof(**disk_super) > PAGE_SIZE) 1422 return 1; 1423 1424 /* make sure our super doesn't straddle pages on disk */ 1425 index = bytenr >> PAGE_SHIFT; 1426 if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index) 1427 return 1; 1428 1429 /* pull in the page with our super */ 1430 *page = read_cache_page_gfp(bdev->bd_inode->i_mapping, 1431 index, GFP_KERNEL); 1432 1433 if (IS_ERR_OR_NULL(*page)) 1434 return 1; 1435 1436 p = kmap(*page); 1437 1438 /* align our pointer to the offset of the super block */ 1439 *disk_super = p + offset_in_page(bytenr); 1440 1441 if (btrfs_super_bytenr(*disk_super) != bytenr || 1442 btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { 1443 btrfs_release_disk_super(*page); 1444 return 1; 1445 } 1446 1447 if ((*disk_super)->label[0] && 1448 (*disk_super)->label[BTRFS_LABEL_SIZE - 1]) 1449 (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0'; 1450 1451 return 0; 1452 } 1453 1454 /* 1455 * Look for a btrfs signature on a device. This may be called out of the mount path 1456 * and we are not allowed to call set_blocksize during the scan. 
The superblock 1457 * is read via pagecache 1458 */ 1459 struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, 1460 void *holder) 1461 { 1462 struct btrfs_super_block *disk_super; 1463 bool new_device_added = false; 1464 struct btrfs_device *device = NULL; 1465 struct block_device *bdev; 1466 struct page *page; 1467 u64 bytenr; 1468 1469 lockdep_assert_held(&uuid_mutex); 1470 1471 /* 1472 * we would like to check all the supers, but that would make 1473 * a btrfs mount succeed after a mkfs from a different FS. 1474 * So, we need to add a special mount option to scan for 1475 * later supers, using BTRFS_SUPER_MIRROR_MAX instead 1476 */ 1477 bytenr = btrfs_sb_offset(0); 1478 flags |= FMODE_EXCL; 1479 1480 bdev = blkdev_get_by_path(path, flags, holder); 1481 if (IS_ERR(bdev)) 1482 return ERR_CAST(bdev); 1483 1484 if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) { 1485 device = ERR_PTR(-EINVAL); 1486 goto error_bdev_put; 1487 } 1488 1489 device = device_list_add(path, disk_super, &new_device_added); 1490 if (!IS_ERR(device)) { 1491 if (new_device_added) 1492 btrfs_free_stale_devices(path, device); 1493 } 1494 1495 btrfs_release_disk_super(page); 1496 1497 error_bdev_put: 1498 blkdev_put(bdev, flags); 1499 1500 return device; 1501 } 1502 1503 static int contains_pending_extent(struct btrfs_transaction *transaction, 1504 struct btrfs_device *device, 1505 u64 *start, u64 len) 1506 { 1507 struct btrfs_fs_info *fs_info = device->fs_info; 1508 struct extent_map *em; 1509 struct list_head *search_list = &fs_info->pinned_chunks; 1510 int ret = 0; 1511 u64 physical_start = *start; 1512 1513 if (transaction) 1514 search_list = &transaction->pending_chunks; 1515 again: 1516 list_for_each_entry(em, search_list, list) { 1517 struct map_lookup *map; 1518 int i; 1519 1520 map = em->map_lookup; 1521 for (i = 0; i < map->num_stripes; i++) { 1522 u64 end; 1523 1524 if (map->stripes[i].dev != device) 1525 continue; 1526 if (map->stripes[i].physical >= physical_start + len || 1527 map->stripes[i].physical + em->orig_block_len <= 1528 physical_start) 1529 continue; 1530 /* 1531 * Make sure that while processing the pinned list we do 1532 * not override our *start with a lower value, because 1533 * we can have pinned chunks that fall within this 1534 * device hole and that have lower physical addresses 1535 * than the pending chunks we processed before. If we 1536 * do not take this special care we can end up getting 1537 * 2 pending chunks that start at the same physical 1538 * device offsets because the end offset of a pinned 1539 * chunk can be equal to the start offset of some 1540 * pending chunk. 1541 */ 1542 end = map->stripes[i].physical + em->orig_block_len; 1543 if (end > *start) { 1544 *start = end; 1545 ret = 1; 1546 } 1547 } 1548 } 1549 if (search_list != &fs_info->pinned_chunks) { 1550 search_list = &fs_info->pinned_chunks; 1551 goto again; 1552 } 1553 1554 return ret; 1555 } 1556 1557 1558 /* 1559 * find_free_dev_extent_start - find free space in the specified device 1560 * @device: the device which we search the free space in 1561 * @num_bytes: the size of the free space that we need 1562 * @search_start: the position from which to begin the search 1563 * @start: store the start of the free space. 1564 * @len: the size of the free space. 
 *		that we find, or the size of the max free space if we don't
 *		find suitable free space
 *
 * This uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents.
 *
 * @start is used to store the start of the free space if we find. But if we
 * don't find suitable free space, it will be used to store the start position
 * of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
			       struct btrfs_device *device, u64 num_bytes,
			       u64 search_start, u64 *start, u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	/*
	 * We don't want to overwrite the superblock on the drive nor any area
	 * used by the boot loader (grub for example), so we make sure to start
	 * at an offset of at least 1MB.
	 */
	search_start = max_t(u64, search_start, SZ_1M);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;

			/*
			 * Have to check before we set max_hole_start, otherwise
			 * we could end up sending back this offset anyway.
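			 * Note: contains_pending_extent() below may advance
			 * search_start, in which case hole_size is recomputed
			 * against the new start (or set to 0 if the hole is
			 * fully consumed).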
1664 */ 1665 if (contains_pending_extent(transaction, device, 1666 &search_start, 1667 hole_size)) { 1668 if (key.offset >= search_start) { 1669 hole_size = key.offset - search_start; 1670 } else { 1671 WARN_ON_ONCE(1); 1672 hole_size = 0; 1673 } 1674 } 1675 1676 if (hole_size > max_hole_size) { 1677 max_hole_start = search_start; 1678 max_hole_size = hole_size; 1679 } 1680 1681 /* 1682 * If this free space is greater than which we need, 1683 * it must be the max free space that we have found 1684 * until now, so max_hole_start must point to the start 1685 * of this free space and the length of this free space 1686 * is stored in max_hole_size. Thus, we return 1687 * max_hole_start and max_hole_size and go back to the 1688 * caller. 1689 */ 1690 if (hole_size >= num_bytes) { 1691 ret = 0; 1692 goto out; 1693 } 1694 } 1695 1696 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1697 extent_end = key.offset + btrfs_dev_extent_length(l, 1698 dev_extent); 1699 if (extent_end > search_start) 1700 search_start = extent_end; 1701 next: 1702 path->slots[0]++; 1703 cond_resched(); 1704 } 1705 1706 /* 1707 * At this point, search_start should be the end of 1708 * allocated dev extents, and when shrinking the device, 1709 * search_end may be smaller than search_start. 1710 */ 1711 if (search_end > search_start) { 1712 hole_size = search_end - search_start; 1713 1714 if (contains_pending_extent(transaction, device, &search_start, 1715 hole_size)) { 1716 btrfs_release_path(path); 1717 goto again; 1718 } 1719 1720 if (hole_size > max_hole_size) { 1721 max_hole_start = search_start; 1722 max_hole_size = hole_size; 1723 } 1724 } 1725 1726 /* See above. */ 1727 if (max_hole_size < num_bytes) 1728 ret = -ENOSPC; 1729 else 1730 ret = 0; 1731 1732 out: 1733 btrfs_free_path(path); 1734 *start = max_hole_start; 1735 if (len) 1736 *len = max_hole_size; 1737 return ret; 1738 } 1739 1740 int find_free_dev_extent(struct btrfs_trans_handle *trans, 1741 struct btrfs_device *device, u64 num_bytes, 1742 u64 *start, u64 *len) 1743 { 1744 /* FIXME use last free of some kind */ 1745 return find_free_dev_extent_start(trans->transaction, device, 1746 num_bytes, 0, start, len); 1747 } 1748 1749 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1750 struct btrfs_device *device, 1751 u64 start, u64 *dev_extent_len) 1752 { 1753 struct btrfs_fs_info *fs_info = device->fs_info; 1754 struct btrfs_root *root = fs_info->dev_root; 1755 int ret; 1756 struct btrfs_path *path; 1757 struct btrfs_key key; 1758 struct btrfs_key found_key; 1759 struct extent_buffer *leaf = NULL; 1760 struct btrfs_dev_extent *extent = NULL; 1761 1762 path = btrfs_alloc_path(); 1763 if (!path) 1764 return -ENOMEM; 1765 1766 key.objectid = device->devid; 1767 key.offset = start; 1768 key.type = BTRFS_DEV_EXTENT_KEY; 1769 again: 1770 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1771 if (ret > 0) { 1772 ret = btrfs_previous_item(root, path, key.objectid, 1773 BTRFS_DEV_EXTENT_KEY); 1774 if (ret) 1775 goto out; 1776 leaf = path->nodes[0]; 1777 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1778 extent = btrfs_item_ptr(leaf, path->slots[0], 1779 struct btrfs_dev_extent); 1780 BUG_ON(found_key.offset > start || found_key.offset + 1781 btrfs_dev_extent_length(leaf, extent) < start); 1782 key = found_key; 1783 btrfs_release_path(path); 1784 goto again; 1785 } else if (ret == 0) { 1786 leaf = path->nodes[0]; 1787 extent = btrfs_item_ptr(leaf, path->slots[0], 1788 struct btrfs_dev_extent); 1789 } else { 1790 
btrfs_handle_fs_error(fs_info, ret, "Slot search failed"); 1791 goto out; 1792 } 1793 1794 *dev_extent_len = btrfs_dev_extent_length(leaf, extent); 1795 1796 ret = btrfs_del_item(trans, root, path); 1797 if (ret) { 1798 btrfs_handle_fs_error(fs_info, ret, 1799 "Failed to remove dev extent item"); 1800 } else { 1801 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); 1802 } 1803 out: 1804 btrfs_free_path(path); 1805 return ret; 1806 } 1807 1808 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 1809 struct btrfs_device *device, 1810 u64 chunk_offset, u64 start, u64 num_bytes) 1811 { 1812 int ret; 1813 struct btrfs_path *path; 1814 struct btrfs_fs_info *fs_info = device->fs_info; 1815 struct btrfs_root *root = fs_info->dev_root; 1816 struct btrfs_dev_extent *extent; 1817 struct extent_buffer *leaf; 1818 struct btrfs_key key; 1819 1820 WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); 1821 WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); 1822 path = btrfs_alloc_path(); 1823 if (!path) 1824 return -ENOMEM; 1825 1826 key.objectid = device->devid; 1827 key.offset = start; 1828 key.type = BTRFS_DEV_EXTENT_KEY; 1829 ret = btrfs_insert_empty_item(trans, root, path, &key, 1830 sizeof(*extent)); 1831 if (ret) 1832 goto out; 1833 1834 leaf = path->nodes[0]; 1835 extent = btrfs_item_ptr(leaf, path->slots[0], 1836 struct btrfs_dev_extent); 1837 btrfs_set_dev_extent_chunk_tree(leaf, extent, 1838 BTRFS_CHUNK_TREE_OBJECTID); 1839 btrfs_set_dev_extent_chunk_objectid(leaf, extent, 1840 BTRFS_FIRST_CHUNK_TREE_OBJECTID); 1841 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 1842 1843 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 1844 btrfs_mark_buffer_dirty(leaf); 1845 out: 1846 btrfs_free_path(path); 1847 return ret; 1848 } 1849 1850 static u64 find_next_chunk(struct btrfs_fs_info *fs_info) 1851 { 1852 struct extent_map_tree *em_tree; 1853 struct extent_map *em; 1854 struct rb_node *n; 1855 u64 ret = 0; 1856 1857 em_tree = &fs_info->mapping_tree.map_tree; 1858 read_lock(&em_tree->lock); 1859 n = rb_last(&em_tree->map.rb_root); 1860 if (n) { 1861 em = rb_entry(n, struct extent_map, rb_node); 1862 ret = em->start + em->len; 1863 } 1864 read_unlock(&em_tree->lock); 1865 1866 return ret; 1867 } 1868 1869 static noinline int find_next_devid(struct btrfs_fs_info *fs_info, 1870 u64 *devid_ret) 1871 { 1872 int ret; 1873 struct btrfs_key key; 1874 struct btrfs_key found_key; 1875 struct btrfs_path *path; 1876 1877 path = btrfs_alloc_path(); 1878 if (!path) 1879 return -ENOMEM; 1880 1881 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1882 key.type = BTRFS_DEV_ITEM_KEY; 1883 key.offset = (u64)-1; 1884 1885 ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); 1886 if (ret < 0) 1887 goto error; 1888 1889 BUG_ON(ret == 0); /* Corruption */ 1890 1891 ret = btrfs_previous_item(fs_info->chunk_root, path, 1892 BTRFS_DEV_ITEMS_OBJECTID, 1893 BTRFS_DEV_ITEM_KEY); 1894 if (ret) { 1895 *devid_ret = 1; 1896 } else { 1897 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1898 path->slots[0]); 1899 *devid_ret = found_key.offset + 1; 1900 } 1901 ret = 0; 1902 error: 1903 btrfs_free_path(path); 1904 return ret; 1905 } 1906 1907 /* 1908 * the device information is stored in the chunk root 1909 * the btrfs_device struct should be fully filled in 1910 */ 1911 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, 1912 struct btrfs_device *device) 1913 { 1914 int ret; 1915 struct btrfs_path *path; 1916 struct btrfs_dev_item 
*dev_item; 1917 struct extent_buffer *leaf; 1918 struct btrfs_key key; 1919 unsigned long ptr; 1920 1921 path = btrfs_alloc_path(); 1922 if (!path) 1923 return -ENOMEM; 1924 1925 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1926 key.type = BTRFS_DEV_ITEM_KEY; 1927 key.offset = device->devid; 1928 1929 ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, 1930 &key, sizeof(*dev_item)); 1931 if (ret) 1932 goto out; 1933 1934 leaf = path->nodes[0]; 1935 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1936 1937 btrfs_set_device_id(leaf, dev_item, device->devid); 1938 btrfs_set_device_generation(leaf, dev_item, 0); 1939 btrfs_set_device_type(leaf, dev_item, device->type); 1940 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1941 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1942 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1943 btrfs_set_device_total_bytes(leaf, dev_item, 1944 btrfs_device_get_disk_total_bytes(device)); 1945 btrfs_set_device_bytes_used(leaf, dev_item, 1946 btrfs_device_get_bytes_used(device)); 1947 btrfs_set_device_group(leaf, dev_item, 0); 1948 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1949 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1950 btrfs_set_device_start_offset(leaf, dev_item, 0); 1951 1952 ptr = btrfs_device_uuid(dev_item); 1953 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1954 ptr = btrfs_device_fsid(dev_item); 1955 write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, 1956 ptr, BTRFS_FSID_SIZE); 1957 btrfs_mark_buffer_dirty(leaf); 1958 1959 ret = 0; 1960 out: 1961 btrfs_free_path(path); 1962 return ret; 1963 } 1964 1965 /* 1966 * Function to update ctime/mtime for a given device path. 1967 * Mainly used for ctime/mtime based probe like libblkid. 1968 */ 1969 static void update_dev_time(const char *path_name) 1970 { 1971 struct file *filp; 1972 1973 filp = filp_open(path_name, O_RDWR, 0); 1974 if (IS_ERR(filp)) 1975 return; 1976 file_update_time(filp); 1977 filp_close(filp, NULL); 1978 } 1979 1980 static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info, 1981 struct btrfs_device *device) 1982 { 1983 struct btrfs_root *root = fs_info->chunk_root; 1984 int ret; 1985 struct btrfs_path *path; 1986 struct btrfs_key key; 1987 struct btrfs_trans_handle *trans; 1988 1989 path = btrfs_alloc_path(); 1990 if (!path) 1991 return -ENOMEM; 1992 1993 trans = btrfs_start_transaction(root, 0); 1994 if (IS_ERR(trans)) { 1995 btrfs_free_path(path); 1996 return PTR_ERR(trans); 1997 } 1998 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1999 key.type = BTRFS_DEV_ITEM_KEY; 2000 key.offset = device->devid; 2001 2002 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2003 if (ret) { 2004 if (ret > 0) 2005 ret = -ENOENT; 2006 btrfs_abort_transaction(trans, ret); 2007 btrfs_end_transaction(trans); 2008 goto out; 2009 } 2010 2011 ret = btrfs_del_item(trans, root, path); 2012 if (ret) { 2013 btrfs_abort_transaction(trans, ret); 2014 btrfs_end_transaction(trans); 2015 } 2016 2017 out: 2018 btrfs_free_path(path); 2019 if (!ret) 2020 ret = btrfs_commit_transaction(trans); 2021 return ret; 2022 } 2023 2024 /* 2025 * Verify that @num_devices satisfies the RAID profile constraints in the whole 2026 * filesystem. It's up to the caller to adjust that number regarding eg. device 2027 * replace. 
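 *
 * For example, btrfs_rm_device() below passes btrfs_num_devices() - 1 so the
 * constraint check reflects the state after the device has been removed.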
2028 */ 2029 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, 2030 u64 num_devices) 2031 { 2032 u64 all_avail; 2033 unsigned seq; 2034 int i; 2035 2036 do { 2037 seq = read_seqbegin(&fs_info->profiles_lock); 2038 2039 all_avail = fs_info->avail_data_alloc_bits | 2040 fs_info->avail_system_alloc_bits | 2041 fs_info->avail_metadata_alloc_bits; 2042 } while (read_seqretry(&fs_info->profiles_lock, seq)); 2043 2044 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 2045 if (!(all_avail & btrfs_raid_array[i].bg_flag)) 2046 continue; 2047 2048 if (num_devices < btrfs_raid_array[i].devs_min) { 2049 int ret = btrfs_raid_array[i].mindev_error; 2050 2051 if (ret) 2052 return ret; 2053 } 2054 } 2055 2056 return 0; 2057 } 2058 2059 static struct btrfs_device * btrfs_find_next_active_device( 2060 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) 2061 { 2062 struct btrfs_device *next_device; 2063 2064 list_for_each_entry(next_device, &fs_devs->devices, dev_list) { 2065 if (next_device != device && 2066 !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) 2067 && next_device->bdev) 2068 return next_device; 2069 } 2070 2071 return NULL; 2072 } 2073 2074 /* 2075 * Helper function to check if the given device is part of s_bdev / latest_bdev 2076 * and replace it with the provided or the next active device. In the context 2077 * where this function is called, there should always be another device (or 2078 * this_dev) which is active. 2079 */ 2080 void btrfs_assign_next_active_device(struct btrfs_device *device, 2081 struct btrfs_device *this_dev) 2082 { 2083 struct btrfs_fs_info *fs_info = device->fs_info; 2084 struct btrfs_device *next_device; 2085 2086 if (this_dev) 2087 next_device = this_dev; 2088 else 2089 next_device = btrfs_find_next_active_device(fs_info->fs_devices, 2090 device); 2091 ASSERT(next_device); 2092 2093 if (fs_info->sb->s_bdev && 2094 (fs_info->sb->s_bdev == device->bdev)) 2095 fs_info->sb->s_bdev = next_device->bdev; 2096 2097 if (fs_info->fs_devices->latest_bdev == device->bdev) 2098 fs_info->fs_devices->latest_bdev = next_device->bdev; 2099 } 2100 2101 /* 2102 * Return btrfs_fs_devices::num_devices excluding the device that's being 2103 * currently replaced.
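 *
 * For example, with three devices on the list and a replace in progress this
 * returns 2, so callers such as btrfs_rm_device() and btrfs_balance() do not
 * count the temporary replace target towards the profile minimums.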
2104 */ 2105 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) 2106 { 2107 u64 num_devices = fs_info->fs_devices->num_devices; 2108 2109 down_read(&fs_info->dev_replace.rwsem); 2110 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 2111 ASSERT(num_devices > 1); 2112 num_devices--; 2113 } 2114 up_read(&fs_info->dev_replace.rwsem); 2115 2116 return num_devices; 2117 } 2118 2119 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, 2120 u64 devid) 2121 { 2122 struct btrfs_device *device; 2123 struct btrfs_fs_devices *cur_devices; 2124 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2125 u64 num_devices; 2126 int ret = 0; 2127 2128 mutex_lock(&uuid_mutex); 2129 2130 num_devices = btrfs_num_devices(fs_info); 2131 2132 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 2133 if (ret) 2134 goto out; 2135 2136 device = btrfs_find_device_by_devspec(fs_info, devid, device_path); 2137 2138 if (IS_ERR(device)) { 2139 if (PTR_ERR(device) == -ENOENT && 2140 strcmp(device_path, "missing") == 0) 2141 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2142 else 2143 ret = PTR_ERR(device); 2144 goto out; 2145 } 2146 2147 if (btrfs_pinned_by_swapfile(fs_info, device)) { 2148 btrfs_warn_in_rcu(fs_info, 2149 "cannot remove device %s (devid %llu) due to active swapfile", 2150 rcu_str_deref(device->name), device->devid); 2151 ret = -ETXTBSY; 2152 goto out; 2153 } 2154 2155 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2156 ret = BTRFS_ERROR_DEV_TGT_REPLACE; 2157 goto out; 2158 } 2159 2160 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 2161 fs_info->fs_devices->rw_devices == 1) { 2162 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; 2163 goto out; 2164 } 2165 2166 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2167 mutex_lock(&fs_info->chunk_mutex); 2168 list_del_init(&device->dev_alloc_list); 2169 device->fs_devices->rw_devices--; 2170 mutex_unlock(&fs_info->chunk_mutex); 2171 } 2172 2173 mutex_unlock(&uuid_mutex); 2174 ret = btrfs_shrink_device(device, 0); 2175 mutex_lock(&uuid_mutex); 2176 if (ret) 2177 goto error_undo; 2178 2179 /* 2180 * TODO: the superblock still includes this device in its num_devices 2181 * counter although write_all_supers() is not locked out. This 2182 * could give a filesystem state which requires a degraded mount. 2183 */ 2184 ret = btrfs_rm_dev_item(fs_info, device); 2185 if (ret) 2186 goto error_undo; 2187 2188 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2189 btrfs_scrub_cancel_dev(fs_info, device); 2190 2191 /* 2192 * the device list mutex makes sure that we don't change 2193 * the device list while someone else is writing out all 2194 * the device supers. Whoever is writing all supers, should 2195 * lock the device list mutex before getting the number of 2196 * devices in the super block (super_copy). Conversely, 2197 * whoever updates the number of devices in the super block 2198 * (super_copy) should hold the device list mutex. 2199 */ 2200 2201 /* 2202 * In normal cases the cur_devices == fs_devices. But in case 2203 * of deleting a seed device, the cur_devices should point to 2204 * its own fs_devices listed under the fs_devices->seed. 
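 * The counters decremented below therefore belong to cur_devices; only
 * total_devices is additionally adjusted on the parent fs_devices when the
 * two differ.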
2205 */ 2206 cur_devices = device->fs_devices; 2207 mutex_lock(&fs_devices->device_list_mutex); 2208 list_del_rcu(&device->dev_list); 2209 2210 cur_devices->num_devices--; 2211 cur_devices->total_devices--; 2212 /* Update total_devices of the parent fs_devices if it's seed */ 2213 if (cur_devices != fs_devices) 2214 fs_devices->total_devices--; 2215 2216 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 2217 cur_devices->missing_devices--; 2218 2219 btrfs_assign_next_active_device(device, NULL); 2220 2221 if (device->bdev) { 2222 cur_devices->open_devices--; 2223 /* remove sysfs entry */ 2224 btrfs_sysfs_rm_device_link(fs_devices, device); 2225 } 2226 2227 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 2228 btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 2229 mutex_unlock(&fs_devices->device_list_mutex); 2230 2231 /* 2232 * at this point, the device is zero sized and detached from 2233 * the devices list. All that's left is to zero out the old 2234 * supers and free the device. 2235 */ 2236 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2237 btrfs_scratch_superblocks(device->bdev, device->name->str); 2238 2239 btrfs_close_bdev(device); 2240 call_rcu(&device->rcu, free_device_rcu); 2241 2242 if (cur_devices->open_devices == 0) { 2243 while (fs_devices) { 2244 if (fs_devices->seed == cur_devices) { 2245 fs_devices->seed = cur_devices->seed; 2246 break; 2247 } 2248 fs_devices = fs_devices->seed; 2249 } 2250 cur_devices->seed = NULL; 2251 close_fs_devices(cur_devices); 2252 free_fs_devices(cur_devices); 2253 } 2254 2255 out: 2256 mutex_unlock(&uuid_mutex); 2257 return ret; 2258 2259 error_undo: 2260 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2261 mutex_lock(&fs_info->chunk_mutex); 2262 list_add(&device->dev_alloc_list, 2263 &fs_devices->alloc_list); 2264 device->fs_devices->rw_devices++; 2265 mutex_unlock(&fs_info->chunk_mutex); 2266 } 2267 goto out; 2268 } 2269 2270 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) 2271 { 2272 struct btrfs_fs_devices *fs_devices; 2273 2274 lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex); 2275 2276 /* 2277 * in case of fs with no seed, srcdev->fs_devices will point 2278 * to fs_devices of fs_info. However when the dev being replaced is 2279 * a seed dev it will point to the seed's local fs_devices. In short 2280 * srcdev will have its correct fs_devices in both the cases. 
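 * Either way, the counter updates below land on the fs_devices that actually
 * owns srcdev.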
2281 */ 2282 fs_devices = srcdev->fs_devices; 2283 2284 list_del_rcu(&srcdev->dev_list); 2285 list_del(&srcdev->dev_alloc_list); 2286 fs_devices->num_devices--; 2287 if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) 2288 fs_devices->missing_devices--; 2289 2290 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) 2291 fs_devices->rw_devices--; 2292 2293 if (srcdev->bdev) 2294 fs_devices->open_devices--; 2295 } 2296 2297 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, 2298 struct btrfs_device *srcdev) 2299 { 2300 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 2301 2302 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { 2303 /* zero out the old super if it is writable */ 2304 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); 2305 } 2306 2307 btrfs_close_bdev(srcdev); 2308 call_rcu(&srcdev->rcu, free_device_rcu); 2309 2310 /* if there are no devices left, delete the fs_devices */ 2311 if (!fs_devices->num_devices) { 2312 struct btrfs_fs_devices *tmp_fs_devices; 2313 2314 /* 2315 * On a mounted FS, num_devices can't be zero unless it's a 2316 * seed. In case of a seed device being replaced, the replace 2317 * target is added to the sprout FS, so there will be no more 2318 * devices left under the seed FS. 2319 */ 2320 ASSERT(fs_devices->seeding); 2321 2322 tmp_fs_devices = fs_info->fs_devices; 2323 while (tmp_fs_devices) { 2324 if (tmp_fs_devices->seed == fs_devices) { 2325 tmp_fs_devices->seed = fs_devices->seed; 2326 break; 2327 } 2328 tmp_fs_devices = tmp_fs_devices->seed; 2329 } 2330 fs_devices->seed = NULL; 2331 close_fs_devices(fs_devices); 2332 free_fs_devices(fs_devices); 2333 } 2334 } 2335 2336 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) 2337 { 2338 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; 2339 2340 WARN_ON(!tgtdev); 2341 mutex_lock(&fs_devices->device_list_mutex); 2342 2343 btrfs_sysfs_rm_device_link(fs_devices, tgtdev); 2344 2345 if (tgtdev->bdev) 2346 fs_devices->open_devices--; 2347 2348 fs_devices->num_devices--; 2349 2350 btrfs_assign_next_active_device(tgtdev, NULL); 2351 2352 list_del_rcu(&tgtdev->dev_list); 2353 2354 mutex_unlock(&fs_devices->device_list_mutex); 2355 2356 /* 2357 * The update_dev_time() within btrfs_scratch_superblocks() 2358 * may lead to a call to btrfs_show_devname() which will try 2359 * to hold device_list_mutex. Here, this device 2360 * is already out of the device list, so we don't have to hold 2361 * the device_list_mutex lock.
2362 */ 2363 btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); 2364 2365 btrfs_close_bdev(tgtdev); 2366 call_rcu(&tgtdev->rcu, free_device_rcu); 2367 } 2368 2369 static struct btrfs_device *btrfs_find_device_by_path( 2370 struct btrfs_fs_info *fs_info, const char *device_path) 2371 { 2372 int ret = 0; 2373 struct btrfs_super_block *disk_super; 2374 u64 devid; 2375 u8 *dev_uuid; 2376 struct block_device *bdev; 2377 struct buffer_head *bh; 2378 struct btrfs_device *device; 2379 2380 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 2381 fs_info->bdev_holder, 0, &bdev, &bh); 2382 if (ret) 2383 return ERR_PTR(ret); 2384 disk_super = (struct btrfs_super_block *)bh->b_data; 2385 devid = btrfs_stack_device_id(&disk_super->dev_item); 2386 dev_uuid = disk_super->dev_item.uuid; 2387 if (btrfs_fs_incompat(fs_info, METADATA_UUID)) 2388 device = btrfs_find_device(fs_info, devid, dev_uuid, 2389 disk_super->metadata_uuid); 2390 else 2391 device = btrfs_find_device(fs_info, devid, 2392 dev_uuid, disk_super->fsid); 2393 2394 brelse(bh); 2395 if (!device) 2396 device = ERR_PTR(-ENOENT); 2397 blkdev_put(bdev, FMODE_READ); 2398 return device; 2399 } 2400 2401 static struct btrfs_device *btrfs_find_device_missing_or_by_path( 2402 struct btrfs_fs_info *fs_info, const char *device_path) 2403 { 2404 struct btrfs_device *device = NULL; 2405 if (strcmp(device_path, "missing") == 0) { 2406 struct list_head *devices; 2407 struct btrfs_device *tmp; 2408 2409 devices = &fs_info->fs_devices->devices; 2410 list_for_each_entry(tmp, devices, dev_list) { 2411 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 2412 &tmp->dev_state) && !tmp->bdev) { 2413 device = tmp; 2414 break; 2415 } 2416 } 2417 2418 if (!device) 2419 return ERR_PTR(-ENOENT); 2420 } else { 2421 device = btrfs_find_device_by_path(fs_info, device_path); 2422 } 2423 2424 return device; 2425 } 2426 2427 /* 2428 * Lookup a device given by device id, or the path if the id is 0. 2429 */ 2430 struct btrfs_device *btrfs_find_device_by_devspec( 2431 struct btrfs_fs_info *fs_info, u64 devid, const char *devpath) 2432 { 2433 struct btrfs_device *device; 2434 2435 if (devid) { 2436 device = btrfs_find_device(fs_info, devid, NULL, NULL); 2437 if (!device) 2438 return ERR_PTR(-ENOENT); 2439 } else { 2440 if (!devpath || !devpath[0]) 2441 return ERR_PTR(-EINVAL); 2442 device = btrfs_find_device_missing_or_by_path(fs_info, devpath); 2443 } 2444 return device; 2445 } 2446 2447 /* 2448 * does all the dirty work required for changing file system's UUID. 
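 *
 * Sprouting moves the existing (seeding) devices onto a private fs_devices
 * that is chained via fs_devices->seed, empties the mounted fs_devices and
 * gives it a freshly generated fsid, and clears the SEEDING flag from the
 * superblock, so the writable device being added by the caller becomes the
 * first member of the sprout.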
2449 */ 2450 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) 2451 { 2452 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2453 struct btrfs_fs_devices *old_devices; 2454 struct btrfs_fs_devices *seed_devices; 2455 struct btrfs_super_block *disk_super = fs_info->super_copy; 2456 struct btrfs_device *device; 2457 u64 super_flags; 2458 2459 lockdep_assert_held(&uuid_mutex); 2460 if (!fs_devices->seeding) 2461 return -EINVAL; 2462 2463 seed_devices = alloc_fs_devices(NULL, NULL); 2464 if (IS_ERR(seed_devices)) 2465 return PTR_ERR(seed_devices); 2466 2467 old_devices = clone_fs_devices(fs_devices); 2468 if (IS_ERR(old_devices)) { 2469 kfree(seed_devices); 2470 return PTR_ERR(old_devices); 2471 } 2472 2473 list_add(&old_devices->fs_list, &fs_uuids); 2474 2475 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2476 seed_devices->opened = 1; 2477 INIT_LIST_HEAD(&seed_devices->devices); 2478 INIT_LIST_HEAD(&seed_devices->alloc_list); 2479 mutex_init(&seed_devices->device_list_mutex); 2480 2481 mutex_lock(&fs_devices->device_list_mutex); 2482 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 2483 synchronize_rcu); 2484 list_for_each_entry(device, &seed_devices->devices, dev_list) 2485 device->fs_devices = seed_devices; 2486 2487 mutex_lock(&fs_info->chunk_mutex); 2488 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 2489 mutex_unlock(&fs_info->chunk_mutex); 2490 2491 fs_devices->seeding = 0; 2492 fs_devices->num_devices = 0; 2493 fs_devices->open_devices = 0; 2494 fs_devices->missing_devices = 0; 2495 fs_devices->rotating = 0; 2496 fs_devices->seed = seed_devices; 2497 2498 generate_random_uuid(fs_devices->fsid); 2499 memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); 2500 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2501 mutex_unlock(&fs_devices->device_list_mutex); 2502 2503 super_flags = btrfs_super_flags(disk_super) & 2504 ~BTRFS_SUPER_FLAG_SEEDING; 2505 btrfs_set_super_flags(disk_super, super_flags); 2506 2507 return 0; 2508 } 2509 2510 /* 2511 * Store the expected generation for seed devices in device items. 
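 *
 * Walks every DEV_ITEM in the chunk tree and, for devices that still belong
 * to a seeding fs_devices, rewrites the generation field with
 * device->generation, i.e. the generation the seed device is expected to
 * carry.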
2512 */ 2513 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 2514 struct btrfs_fs_info *fs_info) 2515 { 2516 struct btrfs_root *root = fs_info->chunk_root; 2517 struct btrfs_path *path; 2518 struct extent_buffer *leaf; 2519 struct btrfs_dev_item *dev_item; 2520 struct btrfs_device *device; 2521 struct btrfs_key key; 2522 u8 fs_uuid[BTRFS_FSID_SIZE]; 2523 u8 dev_uuid[BTRFS_UUID_SIZE]; 2524 u64 devid; 2525 int ret; 2526 2527 path = btrfs_alloc_path(); 2528 if (!path) 2529 return -ENOMEM; 2530 2531 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2532 key.offset = 0; 2533 key.type = BTRFS_DEV_ITEM_KEY; 2534 2535 while (1) { 2536 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2537 if (ret < 0) 2538 goto error; 2539 2540 leaf = path->nodes[0]; 2541 next_slot: 2542 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2543 ret = btrfs_next_leaf(root, path); 2544 if (ret > 0) 2545 break; 2546 if (ret < 0) 2547 goto error; 2548 leaf = path->nodes[0]; 2549 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2550 btrfs_release_path(path); 2551 continue; 2552 } 2553 2554 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2555 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2556 key.type != BTRFS_DEV_ITEM_KEY) 2557 break; 2558 2559 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2560 struct btrfs_dev_item); 2561 devid = btrfs_device_id(leaf, dev_item); 2562 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2563 BTRFS_UUID_SIZE); 2564 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2565 BTRFS_FSID_SIZE); 2566 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 2567 BUG_ON(!device); /* Logic error */ 2568 2569 if (device->fs_devices->seeding) { 2570 btrfs_set_device_generation(leaf, dev_item, 2571 device->generation); 2572 btrfs_mark_buffer_dirty(leaf); 2573 } 2574 2575 path->slots[0]++; 2576 goto next_slot; 2577 } 2578 ret = 0; 2579 error: 2580 btrfs_free_path(path); 2581 return ret; 2582 } 2583 2584 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2585 { 2586 struct btrfs_root *root = fs_info->dev_root; 2587 struct request_queue *q; 2588 struct btrfs_trans_handle *trans; 2589 struct btrfs_device *device; 2590 struct block_device *bdev; 2591 struct super_block *sb = fs_info->sb; 2592 struct rcu_string *name; 2593 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2594 u64 orig_super_total_bytes; 2595 u64 orig_super_num_devices; 2596 int seeding_dev = 0; 2597 int ret = 0; 2598 bool unlocked = false; 2599 2600 if (sb_rdonly(sb) && !fs_devices->seeding) 2601 return -EROFS; 2602 2603 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2604 fs_info->bdev_holder); 2605 if (IS_ERR(bdev)) 2606 return PTR_ERR(bdev); 2607 2608 if (fs_devices->seeding) { 2609 seeding_dev = 1; 2610 down_write(&sb->s_umount); 2611 mutex_lock(&uuid_mutex); 2612 } 2613 2614 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2615 2616 mutex_lock(&fs_devices->device_list_mutex); 2617 list_for_each_entry(device, &fs_devices->devices, dev_list) { 2618 if (device->bdev == bdev) { 2619 ret = -EEXIST; 2620 mutex_unlock( 2621 &fs_devices->device_list_mutex); 2622 goto error; 2623 } 2624 } 2625 mutex_unlock(&fs_devices->device_list_mutex); 2626 2627 device = btrfs_alloc_device(fs_info, NULL, NULL); 2628 if (IS_ERR(device)) { 2629 /* we can safely leave the fs_devices entry around */ 2630 ret = PTR_ERR(device); 2631 goto error; 2632 } 2633 2634 name = rcu_string_strdup(device_path, GFP_KERNEL); 2635 if (!name) { 2636 ret = -ENOMEM; 2637 
goto error_free_device; 2638 } 2639 rcu_assign_pointer(device->name, name); 2640 2641 trans = btrfs_start_transaction(root, 0); 2642 if (IS_ERR(trans)) { 2643 ret = PTR_ERR(trans); 2644 goto error_free_device; 2645 } 2646 2647 q = bdev_get_queue(bdev); 2648 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2649 device->generation = trans->transid; 2650 device->io_width = fs_info->sectorsize; 2651 device->io_align = fs_info->sectorsize; 2652 device->sector_size = fs_info->sectorsize; 2653 device->total_bytes = round_down(i_size_read(bdev->bd_inode), 2654 fs_info->sectorsize); 2655 device->disk_total_bytes = device->total_bytes; 2656 device->commit_total_bytes = device->total_bytes; 2657 device->fs_info = fs_info; 2658 device->bdev = bdev; 2659 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2660 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2661 device->mode = FMODE_EXCL; 2662 device->dev_stats_valid = 1; 2663 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2664 2665 if (seeding_dev) { 2666 sb->s_flags &= ~SB_RDONLY; 2667 ret = btrfs_prepare_sprout(fs_info); 2668 if (ret) { 2669 btrfs_abort_transaction(trans, ret); 2670 goto error_trans; 2671 } 2672 } 2673 2674 device->fs_devices = fs_devices; 2675 2676 mutex_lock(&fs_devices->device_list_mutex); 2677 mutex_lock(&fs_info->chunk_mutex); 2678 list_add_rcu(&device->dev_list, &fs_devices->devices); 2679 list_add(&device->dev_alloc_list, &fs_devices->alloc_list); 2680 fs_devices->num_devices++; 2681 fs_devices->open_devices++; 2682 fs_devices->rw_devices++; 2683 fs_devices->total_devices++; 2684 fs_devices->total_rw_bytes += device->total_bytes; 2685 2686 atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 2687 2688 if (!blk_queue_nonrot(q)) 2689 fs_devices->rotating = 1; 2690 2691 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 2692 btrfs_set_super_total_bytes(fs_info->super_copy, 2693 round_down(orig_super_total_bytes + device->total_bytes, 2694 fs_info->sectorsize)); 2695 2696 orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); 2697 btrfs_set_super_num_devices(fs_info->super_copy, 2698 orig_super_num_devices + 1); 2699 2700 /* add sysfs device entry */ 2701 btrfs_sysfs_add_device_link(fs_devices, device); 2702 2703 /* 2704 * we've got more storage, clear any full flags on the space 2705 * infos 2706 */ 2707 btrfs_clear_space_info_full(fs_info); 2708 2709 mutex_unlock(&fs_info->chunk_mutex); 2710 mutex_unlock(&fs_devices->device_list_mutex); 2711 2712 if (seeding_dev) { 2713 mutex_lock(&fs_info->chunk_mutex); 2714 ret = init_first_rw_device(trans, fs_info); 2715 mutex_unlock(&fs_info->chunk_mutex); 2716 if (ret) { 2717 btrfs_abort_transaction(trans, ret); 2718 goto error_sysfs; 2719 } 2720 } 2721 2722 ret = btrfs_add_dev_item(trans, device); 2723 if (ret) { 2724 btrfs_abort_transaction(trans, ret); 2725 goto error_sysfs; 2726 } 2727 2728 if (seeding_dev) { 2729 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; 2730 2731 ret = btrfs_finish_sprout(trans, fs_info); 2732 if (ret) { 2733 btrfs_abort_transaction(trans, ret); 2734 goto error_sysfs; 2735 } 2736 2737 /* Sprouting would change fsid of the mounted root, 2738 * so rename the fsid on the sysfs 2739 */ 2740 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", 2741 fs_info->fs_devices->fsid); 2742 if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) 2743 btrfs_warn(fs_info, 2744 "sysfs: failed to create fsid for sprout"); 2745 } 2746 2747 ret = btrfs_commit_transaction(trans); 2748 2749 if (seeding_dev) { 2750 
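/*
 * The seeding path took uuid_mutex and s_umount before starting the
 * transaction; drop them now that the sprout has been committed, then
 * relocate the system chunks (which would otherwise stay on the read-only
 * seed device) via btrfs_relocate_sys_chunks() below.
 */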
mutex_unlock(&uuid_mutex); 2751 up_write(&sb->s_umount); 2752 unlocked = true; 2753 2754 if (ret) /* transaction commit */ 2755 return ret; 2756 2757 ret = btrfs_relocate_sys_chunks(fs_info); 2758 if (ret < 0) 2759 btrfs_handle_fs_error(fs_info, ret, 2760 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2761 trans = btrfs_attach_transaction(root); 2762 if (IS_ERR(trans)) { 2763 if (PTR_ERR(trans) == -ENOENT) 2764 return 0; 2765 ret = PTR_ERR(trans); 2766 trans = NULL; 2767 goto error_sysfs; 2768 } 2769 ret = btrfs_commit_transaction(trans); 2770 } 2771 2772 /* Update ctime/mtime for libblkid */ 2773 update_dev_time(device_path); 2774 return ret; 2775 2776 error_sysfs: 2777 btrfs_sysfs_rm_device_link(fs_devices, device); 2778 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2779 mutex_lock(&fs_info->chunk_mutex); 2780 list_del_rcu(&device->dev_list); 2781 list_del(&device->dev_alloc_list); 2782 fs_info->fs_devices->num_devices--; 2783 fs_info->fs_devices->open_devices--; 2784 fs_info->fs_devices->rw_devices--; 2785 fs_info->fs_devices->total_devices--; 2786 fs_info->fs_devices->total_rw_bytes -= device->total_bytes; 2787 atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); 2788 btrfs_set_super_total_bytes(fs_info->super_copy, 2789 orig_super_total_bytes); 2790 btrfs_set_super_num_devices(fs_info->super_copy, 2791 orig_super_num_devices); 2792 mutex_unlock(&fs_info->chunk_mutex); 2793 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2794 error_trans: 2795 if (seeding_dev) 2796 sb->s_flags |= SB_RDONLY; 2797 if (trans) 2798 btrfs_end_transaction(trans); 2799 error_free_device: 2800 btrfs_free_device(device); 2801 error: 2802 blkdev_put(bdev, FMODE_EXCL); 2803 if (seeding_dev && !unlocked) { 2804 mutex_unlock(&uuid_mutex); 2805 up_write(&sb->s_umount); 2806 } 2807 return ret; 2808 } 2809 2810 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2811 struct btrfs_device *device) 2812 { 2813 int ret; 2814 struct btrfs_path *path; 2815 struct btrfs_root *root = device->fs_info->chunk_root; 2816 struct btrfs_dev_item *dev_item; 2817 struct extent_buffer *leaf; 2818 struct btrfs_key key; 2819 2820 path = btrfs_alloc_path(); 2821 if (!path) 2822 return -ENOMEM; 2823 2824 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2825 key.type = BTRFS_DEV_ITEM_KEY; 2826 key.offset = device->devid; 2827 2828 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2829 if (ret < 0) 2830 goto out; 2831 2832 if (ret > 0) { 2833 ret = -ENOENT; 2834 goto out; 2835 } 2836 2837 leaf = path->nodes[0]; 2838 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2839 2840 btrfs_set_device_id(leaf, dev_item, device->devid); 2841 btrfs_set_device_type(leaf, dev_item, device->type); 2842 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2843 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2844 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2845 btrfs_set_device_total_bytes(leaf, dev_item, 2846 btrfs_device_get_disk_total_bytes(device)); 2847 btrfs_set_device_bytes_used(leaf, dev_item, 2848 btrfs_device_get_bytes_used(device)); 2849 btrfs_mark_buffer_dirty(leaf); 2850 2851 out: 2852 btrfs_free_path(path); 2853 return ret; 2854 } 2855 2856 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2857 struct btrfs_device *device, u64 new_size) 2858 { 2859 struct btrfs_fs_info *fs_info = device->fs_info; 2860 struct btrfs_super_block *super_copy = fs_info->super_copy; 
2861 struct btrfs_fs_devices *fs_devices; 2862 u64 old_total; 2863 u64 diff; 2864 2865 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2866 return -EACCES; 2867 2868 new_size = round_down(new_size, fs_info->sectorsize); 2869 2870 mutex_lock(&fs_info->chunk_mutex); 2871 old_total = btrfs_super_total_bytes(super_copy); 2872 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 2873 2874 if (new_size <= device->total_bytes || 2875 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2876 mutex_unlock(&fs_info->chunk_mutex); 2877 return -EINVAL; 2878 } 2879 2880 fs_devices = fs_info->fs_devices; 2881 2882 btrfs_set_super_total_bytes(super_copy, 2883 round_down(old_total + diff, fs_info->sectorsize)); 2884 device->fs_devices->total_rw_bytes += diff; 2885 2886 btrfs_device_set_total_bytes(device, new_size); 2887 btrfs_device_set_disk_total_bytes(device, new_size); 2888 btrfs_clear_space_info_full(device->fs_info); 2889 if (list_empty(&device->resized_list)) 2890 list_add_tail(&device->resized_list, 2891 &fs_devices->resized_devices); 2892 mutex_unlock(&fs_info->chunk_mutex); 2893 2894 return btrfs_update_device(trans, device); 2895 } 2896 2897 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2898 { 2899 struct btrfs_fs_info *fs_info = trans->fs_info; 2900 struct btrfs_root *root = fs_info->chunk_root; 2901 int ret; 2902 struct btrfs_path *path; 2903 struct btrfs_key key; 2904 2905 path = btrfs_alloc_path(); 2906 if (!path) 2907 return -ENOMEM; 2908 2909 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2910 key.offset = chunk_offset; 2911 key.type = BTRFS_CHUNK_ITEM_KEY; 2912 2913 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2914 if (ret < 0) 2915 goto out; 2916 else if (ret > 0) { /* Logic error or corruption */ 2917 btrfs_handle_fs_error(fs_info, -ENOENT, 2918 "Failed lookup while freeing chunk."); 2919 ret = -ENOENT; 2920 goto out; 2921 } 2922 2923 ret = btrfs_del_item(trans, root, path); 2924 if (ret < 0) 2925 btrfs_handle_fs_error(fs_info, ret, 2926 "Failed to delete chunk item."); 2927 out: 2928 btrfs_free_path(path); 2929 return ret; 2930 } 2931 2932 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2933 { 2934 struct btrfs_super_block *super_copy = fs_info->super_copy; 2935 struct btrfs_disk_key *disk_key; 2936 struct btrfs_chunk *chunk; 2937 u8 *ptr; 2938 int ret = 0; 2939 u32 num_stripes; 2940 u32 array_size; 2941 u32 len = 0; 2942 u32 cur; 2943 struct btrfs_key key; 2944 2945 mutex_lock(&fs_info->chunk_mutex); 2946 array_size = btrfs_super_sys_array_size(super_copy); 2947 2948 ptr = super_copy->sys_chunk_array; 2949 cur = 0; 2950 2951 while (cur < array_size) { 2952 disk_key = (struct btrfs_disk_key *)ptr; 2953 btrfs_disk_key_to_cpu(&key, disk_key); 2954 2955 len = sizeof(*disk_key); 2956 2957 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 2958 chunk = (struct btrfs_chunk *)(ptr + len); 2959 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 2960 len += btrfs_chunk_item_size(num_stripes); 2961 } else { 2962 ret = -EIO; 2963 break; 2964 } 2965 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 2966 key.offset == chunk_offset) { 2967 memmove(ptr, ptr + len, array_size - (cur + len)); 2968 array_size -= len; 2969 btrfs_set_super_sys_array_size(super_copy, array_size); 2970 } else { 2971 ptr += len; 2972 cur += len; 2973 } 2974 } 2975 mutex_unlock(&fs_info->chunk_mutex); 2976 return ret; 2977 } 2978 2979 /* 2980 * btrfs_get_chunk_map() - Find the mapping containing the given 
logical extent. 2981 * @logical: Logical block offset in bytes. 2982 * @length: Length of extent in bytes. 2983 * 2984 * Return: Chunk mapping or ERR_PTR. 2985 */ 2986 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, 2987 u64 logical, u64 length) 2988 { 2989 struct extent_map_tree *em_tree; 2990 struct extent_map *em; 2991 2992 em_tree = &fs_info->mapping_tree.map_tree; 2993 read_lock(&em_tree->lock); 2994 em = lookup_extent_mapping(em_tree, logical, length); 2995 read_unlock(&em_tree->lock); 2996 2997 if (!em) { 2998 btrfs_crit(fs_info, "unable to find logical %llu length %llu", 2999 logical, length); 3000 return ERR_PTR(-EINVAL); 3001 } 3002 3003 if (em->start > logical || em->start + em->len < logical) { 3004 btrfs_crit(fs_info, 3005 "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 3006 logical, length, em->start, em->start + em->len); 3007 free_extent_map(em); 3008 return ERR_PTR(-EINVAL); 3009 } 3010 3011 /* callers are responsible for dropping em's ref. */ 3012 return em; 3013 } 3014 3015 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 3016 { 3017 struct btrfs_fs_info *fs_info = trans->fs_info; 3018 struct extent_map *em; 3019 struct map_lookup *map; 3020 u64 dev_extent_len = 0; 3021 int i, ret = 0; 3022 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 3023 3024 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 3025 if (IS_ERR(em)) { 3026 /* 3027 * This is a logic error, but we don't want to just rely on the 3028 * user having built with ASSERT enabled, so if ASSERT doesn't 3029 * do anything we still error out. 3030 */ 3031 ASSERT(0); 3032 return PTR_ERR(em); 3033 } 3034 map = em->map_lookup; 3035 mutex_lock(&fs_info->chunk_mutex); 3036 check_system_chunk(trans, map->type); 3037 mutex_unlock(&fs_info->chunk_mutex); 3038 3039 /* 3040 * Take the device list mutex to prevent races with the final phase of 3041 * a device replace operation that replaces the device object associated 3042 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). 
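 * Without it, the btrfs_device read from map->stripes[i].dev below could be
 * swapped out under us by the finishing replace and the device item updates
 * would be done against a stale device.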
3043 */ 3044 mutex_lock(&fs_devices->device_list_mutex); 3045 for (i = 0; i < map->num_stripes; i++) { 3046 struct btrfs_device *device = map->stripes[i].dev; 3047 ret = btrfs_free_dev_extent(trans, device, 3048 map->stripes[i].physical, 3049 &dev_extent_len); 3050 if (ret) { 3051 mutex_unlock(&fs_devices->device_list_mutex); 3052 btrfs_abort_transaction(trans, ret); 3053 goto out; 3054 } 3055 3056 if (device->bytes_used > 0) { 3057 mutex_lock(&fs_info->chunk_mutex); 3058 btrfs_device_set_bytes_used(device, 3059 device->bytes_used - dev_extent_len); 3060 atomic64_add(dev_extent_len, &fs_info->free_chunk_space); 3061 btrfs_clear_space_info_full(fs_info); 3062 mutex_unlock(&fs_info->chunk_mutex); 3063 } 3064 3065 ret = btrfs_update_device(trans, device); 3066 if (ret) { 3067 mutex_unlock(&fs_devices->device_list_mutex); 3068 btrfs_abort_transaction(trans, ret); 3069 goto out; 3070 } 3071 } 3072 mutex_unlock(&fs_devices->device_list_mutex); 3073 3074 ret = btrfs_free_chunk(trans, chunk_offset); 3075 if (ret) { 3076 btrfs_abort_transaction(trans, ret); 3077 goto out; 3078 } 3079 3080 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 3081 3082 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3083 ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 3084 if (ret) { 3085 btrfs_abort_transaction(trans, ret); 3086 goto out; 3087 } 3088 } 3089 3090 ret = btrfs_remove_block_group(trans, chunk_offset, em); 3091 if (ret) { 3092 btrfs_abort_transaction(trans, ret); 3093 goto out; 3094 } 3095 3096 out: 3097 /* once for us */ 3098 free_extent_map(em); 3099 return ret; 3100 } 3101 3102 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 3103 { 3104 struct btrfs_root *root = fs_info->chunk_root; 3105 struct btrfs_trans_handle *trans; 3106 int ret; 3107 3108 /* 3109 * Prevent races with automatic removal of unused block groups. 3110 * After we relocate and before we remove the chunk with offset 3111 * chunk_offset, automatic removal of the block group can kick in, 3112 * resulting in a failure when calling btrfs_remove_chunk() below. 3113 * 3114 * Make sure to acquire this mutex before doing a tree search (dev 3115 * or chunk trees) to find chunks. Otherwise the cleaner kthread might 3116 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 3117 * we release the path used to search the chunk/dev tree and before 3118 * the current task acquires this mutex and calls us. 3119 */ 3120 lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); 3121 3122 ret = btrfs_can_relocate(fs_info, chunk_offset); 3123 if (ret) 3124 return -ENOSPC; 3125 3126 /* step one, relocate all the extents inside this chunk */ 3127 btrfs_scrub_pause(fs_info); 3128 ret = btrfs_relocate_block_group(fs_info, chunk_offset); 3129 btrfs_scrub_continue(fs_info); 3130 if (ret) 3131 return ret; 3132 3133 /* 3134 * We add the kobjects here (and after forcing data chunk creation) 3135 * since relocation is the only place we'll create chunks of a new 3136 * type at runtime. The only place where we'll remove the last 3137 * chunk of a type is the call immediately below this one. Even 3138 * so, we're protected against races with the cleaner thread since 3139 * we're covered by the delete_unused_bgs_mutex. 
3140 */ 3141 btrfs_add_raid_kobjects(fs_info); 3142 3143 trans = btrfs_start_trans_remove_block_group(root->fs_info, 3144 chunk_offset); 3145 if (IS_ERR(trans)) { 3146 ret = PTR_ERR(trans); 3147 btrfs_handle_fs_error(root->fs_info, ret, NULL); 3148 return ret; 3149 } 3150 3151 /* 3152 * step two, delete the device extents and the 3153 * chunk tree entries 3154 */ 3155 ret = btrfs_remove_chunk(trans, chunk_offset); 3156 btrfs_end_transaction(trans); 3157 return ret; 3158 } 3159 3160 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 3161 { 3162 struct btrfs_root *chunk_root = fs_info->chunk_root; 3163 struct btrfs_path *path; 3164 struct extent_buffer *leaf; 3165 struct btrfs_chunk *chunk; 3166 struct btrfs_key key; 3167 struct btrfs_key found_key; 3168 u64 chunk_type; 3169 bool retried = false; 3170 int failed = 0; 3171 int ret; 3172 3173 path = btrfs_alloc_path(); 3174 if (!path) 3175 return -ENOMEM; 3176 3177 again: 3178 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3179 key.offset = (u64)-1; 3180 key.type = BTRFS_CHUNK_ITEM_KEY; 3181 3182 while (1) { 3183 mutex_lock(&fs_info->delete_unused_bgs_mutex); 3184 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3185 if (ret < 0) { 3186 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3187 goto error; 3188 } 3189 BUG_ON(ret == 0); /* Corruption */ 3190 3191 ret = btrfs_previous_item(chunk_root, path, key.objectid, 3192 key.type); 3193 if (ret) 3194 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3195 if (ret < 0) 3196 goto error; 3197 if (ret > 0) 3198 break; 3199 3200 leaf = path->nodes[0]; 3201 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3202 3203 chunk = btrfs_item_ptr(leaf, path->slots[0], 3204 struct btrfs_chunk); 3205 chunk_type = btrfs_chunk_type(leaf, chunk); 3206 btrfs_release_path(path); 3207 3208 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 3209 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3210 if (ret == -ENOSPC) 3211 failed++; 3212 else 3213 BUG_ON(ret); 3214 } 3215 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3216 3217 if (found_key.offset == 0) 3218 break; 3219 key.offset = found_key.offset - 1; 3220 } 3221 ret = 0; 3222 if (failed && !retried) { 3223 failed = 0; 3224 retried = true; 3225 goto again; 3226 } else if (WARN_ON(failed && retried)) { 3227 ret = -ENOSPC; 3228 } 3229 error: 3230 btrfs_free_path(path); 3231 return ret; 3232 } 3233 3234 /* 3235 * return 1 : allocate a data chunk successfully, 3236 * return <0: errors during allocating a data chunk, 3237 * return 0 : no need to allocate a data chunk. 
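 *
 * A return value of 1 tells the caller (see __btrfs_balance() below) that an
 * empty data chunk was force-allocated, so the data raid profile is preserved
 * even if the chunk about to be relocated is the last data chunk.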
3238 */ 3239 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 3240 u64 chunk_offset) 3241 { 3242 struct btrfs_block_group_cache *cache; 3243 u64 bytes_used; 3244 u64 chunk_type; 3245 3246 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3247 ASSERT(cache); 3248 chunk_type = cache->flags; 3249 btrfs_put_block_group(cache); 3250 3251 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) { 3252 spin_lock(&fs_info->data_sinfo->lock); 3253 bytes_used = fs_info->data_sinfo->bytes_used; 3254 spin_unlock(&fs_info->data_sinfo->lock); 3255 3256 if (!bytes_used) { 3257 struct btrfs_trans_handle *trans; 3258 int ret; 3259 3260 trans = btrfs_join_transaction(fs_info->tree_root); 3261 if (IS_ERR(trans)) 3262 return PTR_ERR(trans); 3263 3264 ret = btrfs_force_chunk_alloc(trans, 3265 BTRFS_BLOCK_GROUP_DATA); 3266 btrfs_end_transaction(trans); 3267 if (ret < 0) 3268 return ret; 3269 3270 btrfs_add_raid_kobjects(fs_info); 3271 3272 return 1; 3273 } 3274 } 3275 return 0; 3276 } 3277 3278 static int insert_balance_item(struct btrfs_fs_info *fs_info, 3279 struct btrfs_balance_control *bctl) 3280 { 3281 struct btrfs_root *root = fs_info->tree_root; 3282 struct btrfs_trans_handle *trans; 3283 struct btrfs_balance_item *item; 3284 struct btrfs_disk_balance_args disk_bargs; 3285 struct btrfs_path *path; 3286 struct extent_buffer *leaf; 3287 struct btrfs_key key; 3288 int ret, err; 3289 3290 path = btrfs_alloc_path(); 3291 if (!path) 3292 return -ENOMEM; 3293 3294 trans = btrfs_start_transaction(root, 0); 3295 if (IS_ERR(trans)) { 3296 btrfs_free_path(path); 3297 return PTR_ERR(trans); 3298 } 3299 3300 key.objectid = BTRFS_BALANCE_OBJECTID; 3301 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3302 key.offset = 0; 3303 3304 ret = btrfs_insert_empty_item(trans, root, path, &key, 3305 sizeof(*item)); 3306 if (ret) 3307 goto out; 3308 3309 leaf = path->nodes[0]; 3310 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3311 3312 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3313 3314 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 3315 btrfs_set_balance_data(leaf, item, &disk_bargs); 3316 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 3317 btrfs_set_balance_meta(leaf, item, &disk_bargs); 3318 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 3319 btrfs_set_balance_sys(leaf, item, &disk_bargs); 3320 3321 btrfs_set_balance_flags(leaf, item, bctl->flags); 3322 3323 btrfs_mark_buffer_dirty(leaf); 3324 out: 3325 btrfs_free_path(path); 3326 err = btrfs_commit_transaction(trans); 3327 if (err && !ret) 3328 ret = err; 3329 return ret; 3330 } 3331 3332 static int del_balance_item(struct btrfs_fs_info *fs_info) 3333 { 3334 struct btrfs_root *root = fs_info->tree_root; 3335 struct btrfs_trans_handle *trans; 3336 struct btrfs_path *path; 3337 struct btrfs_key key; 3338 int ret, err; 3339 3340 path = btrfs_alloc_path(); 3341 if (!path) 3342 return -ENOMEM; 3343 3344 trans = btrfs_start_transaction(root, 0); 3345 if (IS_ERR(trans)) { 3346 btrfs_free_path(path); 3347 return PTR_ERR(trans); 3348 } 3349 3350 key.objectid = BTRFS_BALANCE_OBJECTID; 3351 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3352 key.offset = 0; 3353 3354 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3355 if (ret < 0) 3356 goto out; 3357 if (ret > 0) { 3358 ret = -ENOENT; 3359 goto out; 3360 } 3361 3362 ret = btrfs_del_item(trans, root, path); 3363 out: 3364 btrfs_free_path(path); 3365 err = btrfs_commit_transaction(trans); 3366 if (err && !ret) 3367 ret = err; 3368 return ret; 3369 } 3370 
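/*
 * insert_balance_item() and del_balance_item() persist the in-memory balance
 * control under the (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY) key in
 * the tree root, which is what allows an interrupted balance to be resumed
 * later; reset_balance_state() below removes the item again once the balance
 * finishes or is canceled.
 */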
3371 /* 3372 * This is a heuristic used to reduce the number of chunks balanced on 3373 * resume after balance was interrupted. 3374 */ 3375 static void update_balance_args(struct btrfs_balance_control *bctl) 3376 { 3377 /* 3378 * Turn on soft mode for chunk types that were being converted. 3379 */ 3380 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 3381 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 3382 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 3383 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 3384 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 3385 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 3386 3387 /* 3388 * Turn on usage filter if is not already used. The idea is 3389 * that chunks that we have already balanced should be 3390 * reasonably full. Don't do it for chunks that are being 3391 * converted - that will keep us from relocating unconverted 3392 * (albeit full) chunks. 3393 */ 3394 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 3395 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3396 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3397 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 3398 bctl->data.usage = 90; 3399 } 3400 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 3401 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3402 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3403 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 3404 bctl->sys.usage = 90; 3405 } 3406 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 3407 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3408 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3409 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 3410 bctl->meta.usage = 90; 3411 } 3412 } 3413 3414 /* 3415 * Clear the balance status in fs_info and delete the balance item from disk. 3416 */ 3417 static void reset_balance_state(struct btrfs_fs_info *fs_info) 3418 { 3419 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3420 int ret; 3421 3422 BUG_ON(!fs_info->balance_ctl); 3423 3424 spin_lock(&fs_info->balance_lock); 3425 fs_info->balance_ctl = NULL; 3426 spin_unlock(&fs_info->balance_lock); 3427 3428 kfree(bctl); 3429 ret = del_balance_item(fs_info); 3430 if (ret) 3431 btrfs_handle_fs_error(fs_info, ret, NULL); 3432 } 3433 3434 /* 3435 * Balance filters. Return 1 if chunk should be filtered out 3436 * (should not be balanced). 
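 *
 * Each filter checks a single aspect of a chunk (profile, usage, devid,
 * physical or logical range, stripe count, ...); should_balance_chunk()
 * walks the filters enabled in the balance args and skips the chunk as soon
 * as one of them returns 1.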
3437 */ 3438 static int chunk_profiles_filter(u64 chunk_type, 3439 struct btrfs_balance_args *bargs) 3440 { 3441 chunk_type = chunk_to_extended(chunk_type) & 3442 BTRFS_EXTENDED_PROFILE_MASK; 3443 3444 if (bargs->profiles & chunk_type) 3445 return 0; 3446 3447 return 1; 3448 } 3449 3450 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3451 struct btrfs_balance_args *bargs) 3452 { 3453 struct btrfs_block_group_cache *cache; 3454 u64 chunk_used; 3455 u64 user_thresh_min; 3456 u64 user_thresh_max; 3457 int ret = 1; 3458 3459 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3460 chunk_used = btrfs_block_group_used(&cache->item); 3461 3462 if (bargs->usage_min == 0) 3463 user_thresh_min = 0; 3464 else 3465 user_thresh_min = div_factor_fine(cache->key.offset, 3466 bargs->usage_min); 3467 3468 if (bargs->usage_max == 0) 3469 user_thresh_max = 1; 3470 else if (bargs->usage_max > 100) 3471 user_thresh_max = cache->key.offset; 3472 else 3473 user_thresh_max = div_factor_fine(cache->key.offset, 3474 bargs->usage_max); 3475 3476 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3477 ret = 0; 3478 3479 btrfs_put_block_group(cache); 3480 return ret; 3481 } 3482 3483 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3484 u64 chunk_offset, struct btrfs_balance_args *bargs) 3485 { 3486 struct btrfs_block_group_cache *cache; 3487 u64 chunk_used, user_thresh; 3488 int ret = 1; 3489 3490 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3491 chunk_used = btrfs_block_group_used(&cache->item); 3492 3493 if (bargs->usage_min == 0) 3494 user_thresh = 1; 3495 else if (bargs->usage > 100) 3496 user_thresh = cache->key.offset; 3497 else 3498 user_thresh = div_factor_fine(cache->key.offset, 3499 bargs->usage); 3500 3501 if (chunk_used < user_thresh) 3502 ret = 0; 3503 3504 btrfs_put_block_group(cache); 3505 return ret; 3506 } 3507 3508 static int chunk_devid_filter(struct extent_buffer *leaf, 3509 struct btrfs_chunk *chunk, 3510 struct btrfs_balance_args *bargs) 3511 { 3512 struct btrfs_stripe *stripe; 3513 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3514 int i; 3515 3516 for (i = 0; i < num_stripes; i++) { 3517 stripe = btrfs_stripe_nr(chunk, i); 3518 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3519 return 0; 3520 } 3521 3522 return 1; 3523 } 3524 3525 /* [pstart, pend) */ 3526 static int chunk_drange_filter(struct extent_buffer *leaf, 3527 struct btrfs_chunk *chunk, 3528 struct btrfs_balance_args *bargs) 3529 { 3530 struct btrfs_stripe *stripe; 3531 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3532 u64 stripe_offset; 3533 u64 stripe_length; 3534 int factor; 3535 int i; 3536 3537 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3538 return 0; 3539 3540 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 3541 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { 3542 factor = num_stripes / 2; 3543 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { 3544 factor = num_stripes - 1; 3545 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { 3546 factor = num_stripes - 2; 3547 } else { 3548 factor = num_stripes; 3549 } 3550 3551 for (i = 0; i < num_stripes; i++) { 3552 stripe = btrfs_stripe_nr(chunk, i); 3553 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3554 continue; 3555 3556 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3557 stripe_length = btrfs_chunk_length(leaf, chunk); 3558 stripe_length = div_u64(stripe_length, factor); 3559 3560 if (stripe_offset 
< bargs->pend && 3561 stripe_offset + stripe_length > bargs->pstart) 3562 return 0; 3563 } 3564 3565 return 1; 3566 } 3567 3568 /* [vstart, vend) */ 3569 static int chunk_vrange_filter(struct extent_buffer *leaf, 3570 struct btrfs_chunk *chunk, 3571 u64 chunk_offset, 3572 struct btrfs_balance_args *bargs) 3573 { 3574 if (chunk_offset < bargs->vend && 3575 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3576 /* at least part of the chunk is inside this vrange */ 3577 return 0; 3578 3579 return 1; 3580 } 3581 3582 static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3583 struct btrfs_chunk *chunk, 3584 struct btrfs_balance_args *bargs) 3585 { 3586 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3587 3588 if (bargs->stripes_min <= num_stripes 3589 && num_stripes <= bargs->stripes_max) 3590 return 0; 3591 3592 return 1; 3593 } 3594 3595 static int chunk_soft_convert_filter(u64 chunk_type, 3596 struct btrfs_balance_args *bargs) 3597 { 3598 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3599 return 0; 3600 3601 chunk_type = chunk_to_extended(chunk_type) & 3602 BTRFS_EXTENDED_PROFILE_MASK; 3603 3604 if (bargs->target == chunk_type) 3605 return 1; 3606 3607 return 0; 3608 } 3609 3610 static int should_balance_chunk(struct btrfs_fs_info *fs_info, 3611 struct extent_buffer *leaf, 3612 struct btrfs_chunk *chunk, u64 chunk_offset) 3613 { 3614 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3615 struct btrfs_balance_args *bargs = NULL; 3616 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3617 3618 /* type filter */ 3619 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3620 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3621 return 0; 3622 } 3623 3624 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3625 bargs = &bctl->data; 3626 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3627 bargs = &bctl->sys; 3628 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3629 bargs = &bctl->meta; 3630 3631 /* profiles filter */ 3632 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3633 chunk_profiles_filter(chunk_type, bargs)) { 3634 return 0; 3635 } 3636 3637 /* usage filter */ 3638 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3639 chunk_usage_filter(fs_info, chunk_offset, bargs)) { 3640 return 0; 3641 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3642 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3643 return 0; 3644 } 3645 3646 /* devid filter */ 3647 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3648 chunk_devid_filter(leaf, chunk, bargs)) { 3649 return 0; 3650 } 3651 3652 /* drange filter, makes sense only with devid filter */ 3653 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3654 chunk_drange_filter(leaf, chunk, bargs)) { 3655 return 0; 3656 } 3657 3658 /* vrange filter */ 3659 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3660 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3661 return 0; 3662 } 3663 3664 /* stripes filter */ 3665 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3666 chunk_stripes_range_filter(leaf, chunk, bargs)) { 3667 return 0; 3668 } 3669 3670 /* soft profile changing mode */ 3671 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3672 chunk_soft_convert_filter(chunk_type, bargs)) { 3673 return 0; 3674 } 3675 3676 /* 3677 * limited by count, must be the last filter 3678 */ 3679 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3680 if (bargs->limit == 0) 3681 return 0; 3682 else 3683 bargs->limit--; 3684 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 3685 /* 3686 * Same logic 
as the 'limit' filter; the minimum cannot be 3687 * determined here because we do not have the global information 3688 * about the count of all chunks that satisfy the filters. 3689 */ 3690 if (bargs->limit_max == 0) 3691 return 0; 3692 else 3693 bargs->limit_max--; 3694 } 3695 3696 return 1; 3697 } 3698 3699 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3700 { 3701 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3702 struct btrfs_root *chunk_root = fs_info->chunk_root; 3703 u64 chunk_type; 3704 struct btrfs_chunk *chunk; 3705 struct btrfs_path *path = NULL; 3706 struct btrfs_key key; 3707 struct btrfs_key found_key; 3708 struct extent_buffer *leaf; 3709 int slot; 3710 int ret; 3711 int enospc_errors = 0; 3712 bool counting = true; 3713 /* The single value limit and min/max limits use the same bytes in the */ 3714 u64 limit_data = bctl->data.limit; 3715 u64 limit_meta = bctl->meta.limit; 3716 u64 limit_sys = bctl->sys.limit; 3717 u32 count_data = 0; 3718 u32 count_meta = 0; 3719 u32 count_sys = 0; 3720 int chunk_reserved = 0; 3721 3722 path = btrfs_alloc_path(); 3723 if (!path) { 3724 ret = -ENOMEM; 3725 goto error; 3726 } 3727 3728 /* zero out stat counters */ 3729 spin_lock(&fs_info->balance_lock); 3730 memset(&bctl->stat, 0, sizeof(bctl->stat)); 3731 spin_unlock(&fs_info->balance_lock); 3732 again: 3733 if (!counting) { 3734 /* 3735 * The single value limit and min/max limits use the same bytes 3736 * in the 3737 */ 3738 bctl->data.limit = limit_data; 3739 bctl->meta.limit = limit_meta; 3740 bctl->sys.limit = limit_sys; 3741 } 3742 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3743 key.offset = (u64)-1; 3744 key.type = BTRFS_CHUNK_ITEM_KEY; 3745 3746 while (1) { 3747 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3748 atomic_read(&fs_info->balance_cancel_req)) { 3749 ret = -ECANCELED; 3750 goto error; 3751 } 3752 3753 mutex_lock(&fs_info->delete_unused_bgs_mutex); 3754 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3755 if (ret < 0) { 3756 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3757 goto error; 3758 } 3759 3760 /* 3761 * this shouldn't happen, it means the last relocate 3762 * failed 3763 */ 3764 if (ret == 0) 3765 BUG(); /* FIXME break ? 
*/ 3766 3767 ret = btrfs_previous_item(chunk_root, path, 0, 3768 BTRFS_CHUNK_ITEM_KEY); 3769 if (ret) { 3770 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3771 ret = 0; 3772 break; 3773 } 3774 3775 leaf = path->nodes[0]; 3776 slot = path->slots[0]; 3777 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3778 3779 if (found_key.objectid != key.objectid) { 3780 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3781 break; 3782 } 3783 3784 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3785 chunk_type = btrfs_chunk_type(leaf, chunk); 3786 3787 if (!counting) { 3788 spin_lock(&fs_info->balance_lock); 3789 bctl->stat.considered++; 3790 spin_unlock(&fs_info->balance_lock); 3791 } 3792 3793 ret = should_balance_chunk(fs_info, leaf, chunk, 3794 found_key.offset); 3795 3796 btrfs_release_path(path); 3797 if (!ret) { 3798 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3799 goto loop; 3800 } 3801 3802 if (counting) { 3803 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3804 spin_lock(&fs_info->balance_lock); 3805 bctl->stat.expected++; 3806 spin_unlock(&fs_info->balance_lock); 3807 3808 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3809 count_data++; 3810 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3811 count_sys++; 3812 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3813 count_meta++; 3814 3815 goto loop; 3816 } 3817 3818 /* 3819 * Apply limit_min filter, no need to check if the LIMITS 3820 * filter is used, limit_min is 0 by default 3821 */ 3822 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 3823 count_data < bctl->data.limit_min) 3824 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 3825 count_meta < bctl->meta.limit_min) 3826 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 3827 count_sys < bctl->sys.limit_min)) { 3828 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3829 goto loop; 3830 } 3831 3832 if (!chunk_reserved) { 3833 /* 3834 * We may be relocating the only data chunk we have, 3835 * which could potentially end up with losing data's 3836 * raid profile, so lets allocate an empty one in 3837 * advance. 3838 */ 3839 ret = btrfs_may_alloc_data_chunk(fs_info, 3840 found_key.offset); 3841 if (ret < 0) { 3842 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3843 goto error; 3844 } else if (ret == 1) { 3845 chunk_reserved = 1; 3846 } 3847 } 3848 3849 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3850 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3851 if (ret == -ENOSPC) { 3852 enospc_errors++; 3853 } else if (ret == -ETXTBSY) { 3854 btrfs_info(fs_info, 3855 "skipping relocation of block group %llu due to active swapfile", 3856 found_key.offset); 3857 ret = 0; 3858 } else if (ret) { 3859 goto error; 3860 } else { 3861 spin_lock(&fs_info->balance_lock); 3862 bctl->stat.completed++; 3863 spin_unlock(&fs_info->balance_lock); 3864 } 3865 loop: 3866 if (found_key.offset == 0) 3867 break; 3868 key.offset = found_key.offset - 1; 3869 } 3870 3871 if (counting) { 3872 btrfs_release_path(path); 3873 counting = false; 3874 goto again; 3875 } 3876 error: 3877 btrfs_free_path(path); 3878 if (enospc_errors) { 3879 btrfs_info(fs_info, "%d enospc errors during balance", 3880 enospc_errors); 3881 if (!ret) 3882 ret = -ENOSPC; 3883 } 3884 3885 return ret; 3886 } 3887 3888 /** 3889 * alloc_profile_is_valid - see if a given profile is valid and reduced 3890 * @flags: profile to validate 3891 * @extended: if true @flags is treated as an extended profile 3892 */ 3893 static int alloc_profile_is_valid(u64 flags, int extended) 3894 { 3895 u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 3896 BTRFS_BLOCK_GROUP_PROFILE_MASK); 3897 3898 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 3899 3900 /* 1) check that all other bits are zeroed */ 3901 if (flags & ~mask) 3902 return 0; 3903 3904 /* 2) see if profile is reduced */ 3905 if (flags == 0) 3906 return !extended; /* "0" is valid for usual profiles */ 3907 3908 /* true if exactly one bit set */ 3909 return is_power_of_2(flags); 3910 } 3911 3912 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3913 { 3914 /* cancel requested || normal exit path */ 3915 return atomic_read(&fs_info->balance_cancel_req) || 3916 (atomic_read(&fs_info->balance_pause_req) == 0 && 3917 atomic_read(&fs_info->balance_cancel_req) == 0); 3918 } 3919 3920 /* Non-zero return value signifies invalidity */ 3921 static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, 3922 u64 allowed) 3923 { 3924 return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && 3925 (!alloc_profile_is_valid(bctl_arg->target, 1) || 3926 (bctl_arg->target & ~allowed))); 3927 } 3928 3929 /* 3930 * Fill @buf with textual description of balance filter flags @bargs, up to 3931 * @size_buf including the terminating null. The output may be trimmed if it 3932 * does not fit into the provided buffer. 3933 */ 3934 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, 3935 u32 size_buf) 3936 { 3937 int ret; 3938 u32 size_bp = size_buf; 3939 char *bp = buf; 3940 u64 flags = bargs->flags; 3941 char tmp_buf[128] = {'\0'}; 3942 3943 if (!flags) 3944 return; 3945 3946 #define CHECK_APPEND_NOARG(a) \ 3947 do { \ 3948 ret = snprintf(bp, size_bp, (a)); \ 3949 if (ret < 0 || ret >= size_bp) \ 3950 goto out_overflow; \ 3951 size_bp -= ret; \ 3952 bp += ret; \ 3953 } while (0) 3954 3955 #define CHECK_APPEND_1ARG(a, v1) \ 3956 do { \ 3957 ret = snprintf(bp, size_bp, (a), (v1)); \ 3958 if (ret < 0 || ret >= size_bp) \ 3959 goto out_overflow; \ 3960 size_bp -= ret; \ 3961 bp += ret; \ 3962 } while (0) 3963 3964 #define CHECK_APPEND_2ARG(a, v1, v2) \ 3965 do { \ 3966 ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ 3967 if (ret < 0 || ret >= size_bp) \ 3968 goto out_overflow; \ 3969 size_bp -= ret; \ 3970 bp += ret; \ 3971 } while (0) 3972 3973 if (flags & BTRFS_BALANCE_ARGS_CONVERT) { 3974 int index = btrfs_bg_flags_to_raid_index(bargs->target); 3975 3976 CHECK_APPEND_1ARG("convert=%s,", get_raid_name(index)); 3977 } 3978 3979 if (flags & BTRFS_BALANCE_ARGS_SOFT) 3980 CHECK_APPEND_NOARG("soft,"); 3981 3982 if (flags & BTRFS_BALANCE_ARGS_PROFILES) { 3983 btrfs_describe_block_groups(bargs->profiles, tmp_buf, 3984 sizeof(tmp_buf)); 3985 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); 3986 } 3987 3988 if (flags & BTRFS_BALANCE_ARGS_USAGE) 3989 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); 3990 3991 if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) 3992 CHECK_APPEND_2ARG("usage=%u..%u,", 3993 bargs->usage_min, bargs->usage_max); 3994 3995 if (flags & BTRFS_BALANCE_ARGS_DEVID) 3996 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); 3997 3998 if (flags & BTRFS_BALANCE_ARGS_DRANGE) 3999 CHECK_APPEND_2ARG("drange=%llu..%llu,", 4000 bargs->pstart, bargs->pend); 4001 4002 if (flags & BTRFS_BALANCE_ARGS_VRANGE) 4003 CHECK_APPEND_2ARG("vrange=%llu..%llu,", 4004 bargs->vstart, bargs->vend); 4005 4006 if (flags & BTRFS_BALANCE_ARGS_LIMIT) 4007 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); 4008 4009 if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) 4010 CHECK_APPEND_2ARG("limit=%u..%u,", 4011 bargs->limit_min, bargs->limit_max); 4012 4013 if (flags & 
BTRFS_BALANCE_ARGS_STRIPES_RANGE) 4014 CHECK_APPEND_2ARG("stripes=%u..%u,", 4015 bargs->stripes_min, bargs->stripes_max); 4016 4017 #undef CHECK_APPEND_2ARG 4018 #undef CHECK_APPEND_1ARG 4019 #undef CHECK_APPEND_NOARG 4020 4021 out_overflow: 4022 4023 if (size_bp < size_buf) 4024 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ 4025 else 4026 buf[0] = '\0'; 4027 } 4028 4029 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) 4030 { 4031 u32 size_buf = 1024; 4032 char tmp_buf[192] = {'\0'}; 4033 char *buf; 4034 char *bp; 4035 u32 size_bp = size_buf; 4036 int ret; 4037 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4038 4039 buf = kzalloc(size_buf, GFP_KERNEL); 4040 if (!buf) 4041 return; 4042 4043 bp = buf; 4044 4045 #define CHECK_APPEND_1ARG(a, v1) \ 4046 do { \ 4047 ret = snprintf(bp, size_bp, (a), (v1)); \ 4048 if (ret < 0 || ret >= size_bp) \ 4049 goto out_overflow; \ 4050 size_bp -= ret; \ 4051 bp += ret; \ 4052 } while (0) 4053 4054 if (bctl->flags & BTRFS_BALANCE_FORCE) 4055 CHECK_APPEND_1ARG("%s", "-f "); 4056 4057 if (bctl->flags & BTRFS_BALANCE_DATA) { 4058 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); 4059 CHECK_APPEND_1ARG("-d%s ", tmp_buf); 4060 } 4061 4062 if (bctl->flags & BTRFS_BALANCE_METADATA) { 4063 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf)); 4064 CHECK_APPEND_1ARG("-m%s ", tmp_buf); 4065 } 4066 4067 if (bctl->flags & BTRFS_BALANCE_SYSTEM) { 4068 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); 4069 CHECK_APPEND_1ARG("-s%s ", tmp_buf); 4070 } 4071 4072 #undef CHECK_APPEND_1ARG 4073 4074 out_overflow: 4075 4076 if (size_bp < size_buf) 4077 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ 4078 btrfs_info(fs_info, "balance: %s %s", 4079 (bctl->flags & BTRFS_BALANCE_RESUME) ? 4080 "resume" : "start", buf); 4081 4082 kfree(buf); 4083 } 4084 4085 /* 4086 * Should be called with balance mutex held 4087 */ 4088 int btrfs_balance(struct btrfs_fs_info *fs_info, 4089 struct btrfs_balance_control *bctl, 4090 struct btrfs_ioctl_balance_args *bargs) 4091 { 4092 u64 meta_target, data_target; 4093 u64 allowed; 4094 int mixed = 0; 4095 int ret; 4096 u64 num_devices; 4097 unsigned seq; 4098 bool reducing_integrity; 4099 4100 if (btrfs_fs_closing(fs_info) || 4101 atomic_read(&fs_info->balance_pause_req) || 4102 atomic_read(&fs_info->balance_cancel_req)) { 4103 ret = -EINVAL; 4104 goto out; 4105 } 4106 4107 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 4108 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 4109 mixed = 1; 4110 4111 /* 4112 * In case of mixed groups both data and meta should be picked, 4113 * and identical options should be given for both of them.
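 *
 * For example, on a filesystem with mixed block groups a conversion has
 * to be requested as "-dconvert=single -mconvert=single"; passing only
 * "-dconvert=single", or different data and metadata arguments, fails
 * the check below with -EINVAL.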
4114 */ 4115 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 4116 if (mixed && (bctl->flags & allowed)) { 4117 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 4118 !(bctl->flags & BTRFS_BALANCE_METADATA) || 4119 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 4120 btrfs_err(fs_info, 4121 "balance: mixed groups data and metadata options must be the same"); 4122 ret = -EINVAL; 4123 goto out; 4124 } 4125 } 4126 4127 num_devices = btrfs_num_devices(fs_info); 4128 4129 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; 4130 if (num_devices > 1) 4131 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 4132 if (num_devices > 2) 4133 allowed |= BTRFS_BLOCK_GROUP_RAID5; 4134 if (num_devices > 3) 4135 allowed |= (BTRFS_BLOCK_GROUP_RAID10 | 4136 BTRFS_BLOCK_GROUP_RAID6); 4137 if (validate_convert_profile(&bctl->data, allowed)) { 4138 int index = btrfs_bg_flags_to_raid_index(bctl->data.target); 4139 4140 btrfs_err(fs_info, 4141 "balance: invalid convert data profile %s", 4142 get_raid_name(index)); 4143 ret = -EINVAL; 4144 goto out; 4145 } 4146 if (validate_convert_profile(&bctl->meta, allowed)) { 4147 int index = btrfs_bg_flags_to_raid_index(bctl->meta.target); 4148 4149 btrfs_err(fs_info, 4150 "balance: invalid convert metadata profile %s", 4151 get_raid_name(index)); 4152 ret = -EINVAL; 4153 goto out; 4154 } 4155 if (validate_convert_profile(&bctl->sys, allowed)) { 4156 int index = btrfs_bg_flags_to_raid_index(bctl->sys.target); 4157 4158 btrfs_err(fs_info, 4159 "balance: invalid convert system profile %s", 4160 get_raid_name(index)); 4161 ret = -EINVAL; 4162 goto out; 4163 } 4164 4165 /* allow to reduce meta or sys integrity only if force set */ 4166 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 4167 BTRFS_BLOCK_GROUP_RAID10 | 4168 BTRFS_BLOCK_GROUP_RAID5 | 4169 BTRFS_BLOCK_GROUP_RAID6; 4170 do { 4171 seq = read_seqbegin(&fs_info->profiles_lock); 4172 4173 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4174 (fs_info->avail_system_alloc_bits & allowed) && 4175 !(bctl->sys.target & allowed)) || 4176 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4177 (fs_info->avail_metadata_alloc_bits & allowed) && 4178 !(bctl->meta.target & allowed))) 4179 reducing_integrity = true; 4180 else 4181 reducing_integrity = false; 4182 4183 /* if we're not converting, the target field is uninitialized */ 4184 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4185 bctl->meta.target : fs_info->avail_metadata_alloc_bits; 4186 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 
4187 bctl->data.target : fs_info->avail_data_alloc_bits; 4188 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4189 4190 if (reducing_integrity) { 4191 if (bctl->flags & BTRFS_BALANCE_FORCE) { 4192 btrfs_info(fs_info, 4193 "balance: force reducing metadata integrity"); 4194 } else { 4195 btrfs_err(fs_info, 4196 "balance: reduces metadata integrity, use --force if you want this"); 4197 ret = -EINVAL; 4198 goto out; 4199 } 4200 } 4201 4202 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 4203 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 4204 int meta_index = btrfs_bg_flags_to_raid_index(meta_target); 4205 int data_index = btrfs_bg_flags_to_raid_index(data_target); 4206 4207 btrfs_warn(fs_info, 4208 "balance: metadata profile %s has lower redundancy than data profile %s", 4209 get_raid_name(meta_index), get_raid_name(data_index)); 4210 } 4211 4212 ret = insert_balance_item(fs_info, bctl); 4213 if (ret && ret != -EEXIST) 4214 goto out; 4215 4216 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 4217 BUG_ON(ret == -EEXIST); 4218 BUG_ON(fs_info->balance_ctl); 4219 spin_lock(&fs_info->balance_lock); 4220 fs_info->balance_ctl = bctl; 4221 spin_unlock(&fs_info->balance_lock); 4222 } else { 4223 BUG_ON(ret != -EEXIST); 4224 spin_lock(&fs_info->balance_lock); 4225 update_balance_args(bctl); 4226 spin_unlock(&fs_info->balance_lock); 4227 } 4228 4229 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4230 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4231 describe_balance_start_or_resume(fs_info); 4232 mutex_unlock(&fs_info->balance_mutex); 4233 4234 ret = __btrfs_balance(fs_info); 4235 4236 mutex_lock(&fs_info->balance_mutex); 4237 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) 4238 btrfs_info(fs_info, "balance: paused"); 4239 else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req)) 4240 btrfs_info(fs_info, "balance: canceled"); 4241 else 4242 btrfs_info(fs_info, "balance: ended with status: %d", ret); 4243 4244 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4245 4246 if (bargs) { 4247 memset(bargs, 0, sizeof(*bargs)); 4248 btrfs_update_ioctl_balance_args(fs_info, bargs); 4249 } 4250 4251 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 4252 balance_need_close(fs_info)) { 4253 reset_balance_state(fs_info); 4254 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 4255 } 4256 4257 wake_up(&fs_info->balance_wait_q); 4258 4259 return ret; 4260 out: 4261 if (bctl->flags & BTRFS_BALANCE_RESUME) 4262 reset_balance_state(fs_info); 4263 else 4264 kfree(bctl); 4265 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 4266 4267 return ret; 4268 } 4269 4270 static int balance_kthread(void *data) 4271 { 4272 struct btrfs_fs_info *fs_info = data; 4273 int ret = 0; 4274 4275 mutex_lock(&fs_info->balance_mutex); 4276 if (fs_info->balance_ctl) 4277 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); 4278 mutex_unlock(&fs_info->balance_mutex); 4279 4280 return ret; 4281 } 4282 4283 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 4284 { 4285 struct task_struct *tsk; 4286 4287 mutex_lock(&fs_info->balance_mutex); 4288 if (!fs_info->balance_ctl) { 4289 mutex_unlock(&fs_info->balance_mutex); 4290 return 0; 4291 } 4292 mutex_unlock(&fs_info->balance_mutex); 4293 4294 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 4295 btrfs_info(fs_info, "balance: resume skipped"); 4296 return 0; 4297 } 4298 4299 /* 4300 * A ro->rw remount sequence should continue with the paused balance 4301 * regardless of who pauses it, system or the 
user as of now, so set 4302 * the resume flag. 4303 */ 4304 spin_lock(&fs_info->balance_lock); 4305 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; 4306 spin_unlock(&fs_info->balance_lock); 4307 4308 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 4309 return PTR_ERR_OR_ZERO(tsk); 4310 } 4311 4312 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 4313 { 4314 struct btrfs_balance_control *bctl; 4315 struct btrfs_balance_item *item; 4316 struct btrfs_disk_balance_args disk_bargs; 4317 struct btrfs_path *path; 4318 struct extent_buffer *leaf; 4319 struct btrfs_key key; 4320 int ret; 4321 4322 path = btrfs_alloc_path(); 4323 if (!path) 4324 return -ENOMEM; 4325 4326 key.objectid = BTRFS_BALANCE_OBJECTID; 4327 key.type = BTRFS_TEMPORARY_ITEM_KEY; 4328 key.offset = 0; 4329 4330 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4331 if (ret < 0) 4332 goto out; 4333 if (ret > 0) { /* ret = -ENOENT; */ 4334 ret = 0; 4335 goto out; 4336 } 4337 4338 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 4339 if (!bctl) { 4340 ret = -ENOMEM; 4341 goto out; 4342 } 4343 4344 leaf = path->nodes[0]; 4345 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 4346 4347 bctl->flags = btrfs_balance_flags(leaf, item); 4348 bctl->flags |= BTRFS_BALANCE_RESUME; 4349 4350 btrfs_balance_data(leaf, item, &disk_bargs); 4351 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 4352 btrfs_balance_meta(leaf, item, &disk_bargs); 4353 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 4354 btrfs_balance_sys(leaf, item, &disk_bargs); 4355 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 4356 4357 /* 4358 * This should never happen, as the paused balance state is recovered 4359 * during mount without any chance of other exclusive ops to collide. 4360 * 4361 * This gives the exclusive op status to balance and keeps in paused 4362 * state until user intervention (cancel or umount). If the ownership 4363 * cannot be assigned, show a message but do not fail. The balance 4364 * is in a paused state and must have fs_info::balance_ctl properly 4365 * set up. 
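 * ("Ownership" here means the BTRFS_FS_EXCL_OP bit that is
 * test_and_set right below.)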
4366 */ 4367 if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) 4368 btrfs_warn(fs_info, 4369 "balance: cannot set exclusive op status, resume manually"); 4370 4371 mutex_lock(&fs_info->balance_mutex); 4372 BUG_ON(fs_info->balance_ctl); 4373 spin_lock(&fs_info->balance_lock); 4374 fs_info->balance_ctl = bctl; 4375 spin_unlock(&fs_info->balance_lock); 4376 mutex_unlock(&fs_info->balance_mutex); 4377 out: 4378 btrfs_free_path(path); 4379 return ret; 4380 } 4381 4382 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4383 { 4384 int ret = 0; 4385 4386 mutex_lock(&fs_info->balance_mutex); 4387 if (!fs_info->balance_ctl) { 4388 mutex_unlock(&fs_info->balance_mutex); 4389 return -ENOTCONN; 4390 } 4391 4392 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4393 atomic_inc(&fs_info->balance_pause_req); 4394 mutex_unlock(&fs_info->balance_mutex); 4395 4396 wait_event(fs_info->balance_wait_q, 4397 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4398 4399 mutex_lock(&fs_info->balance_mutex); 4400 /* we are good with balance_ctl ripped off from under us */ 4401 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4402 atomic_dec(&fs_info->balance_pause_req); 4403 } else { 4404 ret = -ENOTCONN; 4405 } 4406 4407 mutex_unlock(&fs_info->balance_mutex); 4408 return ret; 4409 } 4410 4411 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4412 { 4413 mutex_lock(&fs_info->balance_mutex); 4414 if (!fs_info->balance_ctl) { 4415 mutex_unlock(&fs_info->balance_mutex); 4416 return -ENOTCONN; 4417 } 4418 4419 /* 4420 * A paused balance with the item stored on disk can be resumed at 4421 * mount time if the mount is read-write. Otherwise it's still paused 4422 * and we must not allow cancelling as it deletes the item. 4423 */ 4424 if (sb_rdonly(fs_info->sb)) { 4425 mutex_unlock(&fs_info->balance_mutex); 4426 return -EROFS; 4427 } 4428 4429 atomic_inc(&fs_info->balance_cancel_req); 4430 /* 4431 * if we are running just wait and return, balance item is 4432 * deleted in btrfs_balance in this case 4433 */ 4434 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4435 mutex_unlock(&fs_info->balance_mutex); 4436 wait_event(fs_info->balance_wait_q, 4437 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4438 mutex_lock(&fs_info->balance_mutex); 4439 } else { 4440 mutex_unlock(&fs_info->balance_mutex); 4441 /* 4442 * Lock released to allow other waiters to continue, we'll 4443 * reexamine the status again. 
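 * For instance another cancel request may have won the race and already
 * freed fs_info::balance_ctl, which is why it is checked again below.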
4444 */ 4445 mutex_lock(&fs_info->balance_mutex); 4446 4447 if (fs_info->balance_ctl) { 4448 reset_balance_state(fs_info); 4449 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 4450 btrfs_info(fs_info, "balance: canceled"); 4451 } 4452 } 4453 4454 BUG_ON(fs_info->balance_ctl || 4455 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4456 atomic_dec(&fs_info->balance_cancel_req); 4457 mutex_unlock(&fs_info->balance_mutex); 4458 return 0; 4459 } 4460 4461 static int btrfs_uuid_scan_kthread(void *data) 4462 { 4463 struct btrfs_fs_info *fs_info = data; 4464 struct btrfs_root *root = fs_info->tree_root; 4465 struct btrfs_key key; 4466 struct btrfs_path *path = NULL; 4467 int ret = 0; 4468 struct extent_buffer *eb; 4469 int slot; 4470 struct btrfs_root_item root_item; 4471 u32 item_size; 4472 struct btrfs_trans_handle *trans = NULL; 4473 4474 path = btrfs_alloc_path(); 4475 if (!path) { 4476 ret = -ENOMEM; 4477 goto out; 4478 } 4479 4480 key.objectid = 0; 4481 key.type = BTRFS_ROOT_ITEM_KEY; 4482 key.offset = 0; 4483 4484 while (1) { 4485 ret = btrfs_search_forward(root, &key, path, 4486 BTRFS_OLDEST_GENERATION); 4487 if (ret) { 4488 if (ret > 0) 4489 ret = 0; 4490 break; 4491 } 4492 4493 if (key.type != BTRFS_ROOT_ITEM_KEY || 4494 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4495 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4496 key.objectid > BTRFS_LAST_FREE_OBJECTID) 4497 goto skip; 4498 4499 eb = path->nodes[0]; 4500 slot = path->slots[0]; 4501 item_size = btrfs_item_size_nr(eb, slot); 4502 if (item_size < sizeof(root_item)) 4503 goto skip; 4504 4505 read_extent_buffer(eb, &root_item, 4506 btrfs_item_ptr_offset(eb, slot), 4507 (int)sizeof(root_item)); 4508 if (btrfs_root_refs(&root_item) == 0) 4509 goto skip; 4510 4511 if (!btrfs_is_empty_uuid(root_item.uuid) || 4512 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4513 if (trans) 4514 goto update_tree; 4515 4516 btrfs_release_path(path); 4517 /* 4518 * 1 - subvol uuid item 4519 * 1 - received_subvol uuid item 4520 */ 4521 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4522 if (IS_ERR(trans)) { 4523 ret = PTR_ERR(trans); 4524 break; 4525 } 4526 continue; 4527 } else { 4528 goto skip; 4529 } 4530 update_tree: 4531 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4532 ret = btrfs_uuid_tree_add(trans, root_item.uuid, 4533 BTRFS_UUID_KEY_SUBVOL, 4534 key.objectid); 4535 if (ret < 0) { 4536 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4537 ret); 4538 break; 4539 } 4540 } 4541 4542 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4543 ret = btrfs_uuid_tree_add(trans, 4544 root_item.received_uuid, 4545 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4546 key.objectid); 4547 if (ret < 0) { 4548 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4549 ret); 4550 break; 4551 } 4552 } 4553 4554 skip: 4555 if (trans) { 4556 ret = btrfs_end_transaction(trans); 4557 trans = NULL; 4558 if (ret) 4559 break; 4560 } 4561 4562 btrfs_release_path(path); 4563 if (key.offset < (u64)-1) { 4564 key.offset++; 4565 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4566 key.offset = 0; 4567 key.type = BTRFS_ROOT_ITEM_KEY; 4568 } else if (key.objectid < (u64)-1) { 4569 key.offset = 0; 4570 key.type = BTRFS_ROOT_ITEM_KEY; 4571 key.objectid++; 4572 } else { 4573 break; 4574 } 4575 cond_resched(); 4576 } 4577 4578 out: 4579 btrfs_free_path(path); 4580 if (trans && !IS_ERR(trans)) 4581 btrfs_end_transaction(trans); 4582 if (ret) 4583 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); 4584 else 4585 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4586 
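/* Release the semaphore taken by btrfs_create_uuid_tree() or btrfs_check_uuid_tree() before this kthread was started. */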
up(&fs_info->uuid_tree_rescan_sem); 4587 return 0; 4588 } 4589 4590 /* 4591 * Callback for btrfs_uuid_tree_iterate(). 4592 * returns: 4593 * 0 check succeeded, the entry is not outdated. 4594 * < 0 if an error occurred. 4595 * > 0 if the check failed, which means the caller shall remove the entry. 4596 */ 4597 static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, 4598 u8 *uuid, u8 type, u64 subid) 4599 { 4600 struct btrfs_key key; 4601 int ret = 0; 4602 struct btrfs_root *subvol_root; 4603 4604 if (type != BTRFS_UUID_KEY_SUBVOL && 4605 type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) 4606 goto out; 4607 4608 key.objectid = subid; 4609 key.type = BTRFS_ROOT_ITEM_KEY; 4610 key.offset = (u64)-1; 4611 subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); 4612 if (IS_ERR(subvol_root)) { 4613 ret = PTR_ERR(subvol_root); 4614 if (ret == -ENOENT) 4615 ret = 1; 4616 goto out; 4617 } 4618 4619 switch (type) { 4620 case BTRFS_UUID_KEY_SUBVOL: 4621 if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) 4622 ret = 1; 4623 break; 4624 case BTRFS_UUID_KEY_RECEIVED_SUBVOL: 4625 if (memcmp(uuid, subvol_root->root_item.received_uuid, 4626 BTRFS_UUID_SIZE)) 4627 ret = 1; 4628 break; 4629 } 4630 4631 out: 4632 return ret; 4633 } 4634 4635 static int btrfs_uuid_rescan_kthread(void *data) 4636 { 4637 struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; 4638 int ret; 4639 4640 /* 4641 * 1st step is to iterate through the existing UUID tree and 4642 * to delete all entries that contain outdated data. 4643 * 2nd step is to add all missing entries to the UUID tree. 4644 */ 4645 ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); 4646 if (ret < 0) { 4647 btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); 4648 up(&fs_info->uuid_tree_rescan_sem); 4649 return ret; 4650 } 4651 return btrfs_uuid_scan_kthread(data); 4652 } 4653 4654 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4655 { 4656 struct btrfs_trans_handle *trans; 4657 struct btrfs_root *tree_root = fs_info->tree_root; 4658 struct btrfs_root *uuid_root; 4659 struct task_struct *task; 4660 int ret; 4661 4662 /* 4663 * 1 - root node 4664 * 1 - root item 4665 */ 4666 trans = btrfs_start_transaction(tree_root, 2); 4667 if (IS_ERR(trans)) 4668 return PTR_ERR(trans); 4669 4670 uuid_root = btrfs_create_tree(trans, fs_info, 4671 BTRFS_UUID_TREE_OBJECTID); 4672 if (IS_ERR(uuid_root)) { 4673 ret = PTR_ERR(uuid_root); 4674 btrfs_abort_transaction(trans, ret); 4675 btrfs_end_transaction(trans); 4676 return ret; 4677 } 4678 4679 fs_info->uuid_root = uuid_root; 4680 4681 ret = btrfs_commit_transaction(trans); 4682 if (ret) 4683 return ret; 4684 4685 down(&fs_info->uuid_tree_rescan_sem); 4686 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4687 if (IS_ERR(task)) { 4688 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4689 btrfs_warn(fs_info, "failed to start uuid_scan task"); 4690 up(&fs_info->uuid_tree_rescan_sem); 4691 return PTR_ERR(task); 4692 } 4693 4694 return 0; 4695 } 4696 4697 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) 4698 { 4699 struct task_struct *task; 4700 4701 down(&fs_info->uuid_tree_rescan_sem); 4702 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); 4703 if (IS_ERR(task)) { 4704 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4705 btrfs_warn(fs_info, "failed to start uuid_rescan task"); 4706 up(&fs_info->uuid_tree_rescan_sem); 4707 return PTR_ERR(task); 4708 } 4709 4710 return 0; 4711 } 4712 4713 /* 4714 * 
shrinking a device means finding all of the device extents past 4715 * the new size, and then following the back refs to the chunks. 4716 * The chunk relocation code actually frees the device extent 4717 */ 4718 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 4719 { 4720 struct btrfs_fs_info *fs_info = device->fs_info; 4721 struct btrfs_root *root = fs_info->dev_root; 4722 struct btrfs_trans_handle *trans; 4723 struct btrfs_dev_extent *dev_extent = NULL; 4724 struct btrfs_path *path; 4725 u64 length; 4726 u64 chunk_offset; 4727 int ret; 4728 int slot; 4729 int failed = 0; 4730 bool retried = false; 4731 bool checked_pending_chunks = false; 4732 struct extent_buffer *l; 4733 struct btrfs_key key; 4734 struct btrfs_super_block *super_copy = fs_info->super_copy; 4735 u64 old_total = btrfs_super_total_bytes(super_copy); 4736 u64 old_size = btrfs_device_get_total_bytes(device); 4737 u64 diff; 4738 4739 new_size = round_down(new_size, fs_info->sectorsize); 4740 diff = round_down(old_size - new_size, fs_info->sectorsize); 4741 4742 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4743 return -EINVAL; 4744 4745 path = btrfs_alloc_path(); 4746 if (!path) 4747 return -ENOMEM; 4748 4749 path->reada = READA_BACK; 4750 4751 mutex_lock(&fs_info->chunk_mutex); 4752 4753 btrfs_device_set_total_bytes(device, new_size); 4754 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4755 device->fs_devices->total_rw_bytes -= diff; 4756 atomic64_sub(diff, &fs_info->free_chunk_space); 4757 } 4758 mutex_unlock(&fs_info->chunk_mutex); 4759 4760 again: 4761 key.objectid = device->devid; 4762 key.offset = (u64)-1; 4763 key.type = BTRFS_DEV_EXTENT_KEY; 4764 4765 do { 4766 mutex_lock(&fs_info->delete_unused_bgs_mutex); 4767 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4768 if (ret < 0) { 4769 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4770 goto done; 4771 } 4772 4773 ret = btrfs_previous_item(root, path, 0, key.type); 4774 if (ret) 4775 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4776 if (ret < 0) 4777 goto done; 4778 if (ret) { 4779 ret = 0; 4780 btrfs_release_path(path); 4781 break; 4782 } 4783 4784 l = path->nodes[0]; 4785 slot = path->slots[0]; 4786 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4787 4788 if (key.objectid != device->devid) { 4789 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4790 btrfs_release_path(path); 4791 break; 4792 } 4793 4794 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4795 length = btrfs_dev_extent_length(l, dev_extent); 4796 4797 if (key.offset + length <= new_size) { 4798 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4799 btrfs_release_path(path); 4800 break; 4801 } 4802 4803 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4804 btrfs_release_path(path); 4805 4806 /* 4807 * We may be relocating the only data chunk we have, 4808 * which could potentially end up with losing data's 4809 * raid profile, so lets allocate an empty one in 4810 * advance. 
4811 */ 4812 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 4813 if (ret < 0) { 4814 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4815 goto done; 4816 } 4817 4818 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 4819 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4820 if (ret == -ENOSPC) { 4821 failed++; 4822 } else if (ret) { 4823 if (ret == -ETXTBSY) { 4824 btrfs_warn(fs_info, 4825 "could not shrink block group %llu due to active swapfile", 4826 chunk_offset); 4827 } 4828 goto done; 4829 } 4830 } while (key.offset-- > 0); 4831 4832 if (failed && !retried) { 4833 failed = 0; 4834 retried = true; 4835 goto again; 4836 } else if (failed && retried) { 4837 ret = -ENOSPC; 4838 goto done; 4839 } 4840 4841 /* Shrinking succeeded, else we would be at "done". */ 4842 trans = btrfs_start_transaction(root, 0); 4843 if (IS_ERR(trans)) { 4844 ret = PTR_ERR(trans); 4845 goto done; 4846 } 4847 4848 mutex_lock(&fs_info->chunk_mutex); 4849 4850 /* 4851 * We checked in the above loop all device extents that were already in 4852 * the device tree. However before we have updated the device's 4853 * total_bytes to the new size, we might have had chunk allocations that 4854 * have not complete yet (new block groups attached to transaction 4855 * handles), and therefore their device extents were not yet in the 4856 * device tree and we missed them in the loop above. So if we have any 4857 * pending chunk using a device extent that overlaps the device range 4858 * that we can not use anymore, commit the current transaction and 4859 * repeat the search on the device tree - this way we guarantee we will 4860 * not have chunks using device extents that end beyond 'new_size'. 4861 */ 4862 if (!checked_pending_chunks) { 4863 u64 start = new_size; 4864 u64 len = old_size - new_size; 4865 4866 if (contains_pending_extent(trans->transaction, device, 4867 &start, len)) { 4868 mutex_unlock(&fs_info->chunk_mutex); 4869 checked_pending_chunks = true; 4870 failed = 0; 4871 retried = false; 4872 ret = btrfs_commit_transaction(trans); 4873 if (ret) 4874 goto done; 4875 goto again; 4876 } 4877 } 4878 4879 btrfs_device_set_disk_total_bytes(device, new_size); 4880 if (list_empty(&device->resized_list)) 4881 list_add_tail(&device->resized_list, 4882 &fs_info->fs_devices->resized_devices); 4883 4884 WARN_ON(diff > old_total); 4885 btrfs_set_super_total_bytes(super_copy, 4886 round_down(old_total - diff, fs_info->sectorsize)); 4887 mutex_unlock(&fs_info->chunk_mutex); 4888 4889 /* Now btrfs_update_device() will change the on-disk size. 
*/ 4890 ret = btrfs_update_device(trans, device); 4891 if (ret < 0) { 4892 btrfs_abort_transaction(trans, ret); 4893 btrfs_end_transaction(trans); 4894 } else { 4895 ret = btrfs_commit_transaction(trans); 4896 } 4897 done: 4898 btrfs_free_path(path); 4899 if (ret) { 4900 mutex_lock(&fs_info->chunk_mutex); 4901 btrfs_device_set_total_bytes(device, old_size); 4902 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 4903 device->fs_devices->total_rw_bytes += diff; 4904 atomic64_add(diff, &fs_info->free_chunk_space); 4905 mutex_unlock(&fs_info->chunk_mutex); 4906 } 4907 return ret; 4908 } 4909 4910 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 4911 struct btrfs_key *key, 4912 struct btrfs_chunk *chunk, int item_size) 4913 { 4914 struct btrfs_super_block *super_copy = fs_info->super_copy; 4915 struct btrfs_disk_key disk_key; 4916 u32 array_size; 4917 u8 *ptr; 4918 4919 mutex_lock(&fs_info->chunk_mutex); 4920 array_size = btrfs_super_sys_array_size(super_copy); 4921 if (array_size + item_size + sizeof(disk_key) 4922 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { 4923 mutex_unlock(&fs_info->chunk_mutex); 4924 return -EFBIG; 4925 } 4926 4927 ptr = super_copy->sys_chunk_array + array_size; 4928 btrfs_cpu_key_to_disk(&disk_key, key); 4929 memcpy(ptr, &disk_key, sizeof(disk_key)); 4930 ptr += sizeof(disk_key); 4931 memcpy(ptr, chunk, item_size); 4932 item_size += sizeof(disk_key); 4933 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 4934 mutex_unlock(&fs_info->chunk_mutex); 4935 4936 return 0; 4937 } 4938 4939 /* 4940 * sort the devices in descending order by max_avail, total_avail 4941 */ 4942 static int btrfs_cmp_device_info(const void *a, const void *b) 4943 { 4944 const struct btrfs_device_info *di_a = a; 4945 const struct btrfs_device_info *di_b = b; 4946 4947 if (di_a->max_avail > di_b->max_avail) 4948 return -1; 4949 if (di_a->max_avail < di_b->max_avail) 4950 return 1; 4951 if (di_a->total_avail > di_b->total_avail) 4952 return -1; 4953 if (di_a->total_avail < di_b->total_avail) 4954 return 1; 4955 return 0; 4956 } 4957 4958 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 4959 { 4960 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 4961 return; 4962 4963 btrfs_set_fs_incompat(info, RAID56); 4964 } 4965 4966 #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ 4967 - sizeof(struct btrfs_chunk)) \ 4968 / sizeof(struct btrfs_stripe) + 1) 4969 4970 #define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ 4971 - 2 * sizeof(struct btrfs_disk_key) \ 4972 - 2 * sizeof(struct btrfs_chunk)) \ 4973 / sizeof(struct btrfs_stripe) + 1) 4974 4975 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 4976 u64 start, u64 type) 4977 { 4978 struct btrfs_fs_info *info = trans->fs_info; 4979 struct btrfs_fs_devices *fs_devices = info->fs_devices; 4980 struct btrfs_device *device; 4981 struct map_lookup *map = NULL; 4982 struct extent_map_tree *em_tree; 4983 struct extent_map *em; 4984 struct btrfs_device_info *devices_info = NULL; 4985 u64 total_avail; 4986 int num_stripes; /* total number of stripes to allocate */ 4987 int data_stripes; /* number of stripes that count for 4988 block group size */ 4989 int sub_stripes; /* sub_stripes info for map */ 4990 int dev_stripes; /* stripes per dev */ 4991 int devs_max; /* max devs to use */ 4992 int devs_min; /* min devs needed */ 4993 int devs_increment; /* ndevs has to be a multiple of this */ 4994 int ncopies; /* how many copies to data has */ 4995 int nparity; /* number of stripes 
worth of bytes to 4996 store parity information */ 4997 int ret; 4998 u64 max_stripe_size; 4999 u64 max_chunk_size; 5000 u64 stripe_size; 5001 u64 chunk_size; 5002 int ndevs; 5003 int i; 5004 int j; 5005 int index; 5006 5007 BUG_ON(!alloc_profile_is_valid(type, 0)); 5008 5009 if (list_empty(&fs_devices->alloc_list)) { 5010 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5011 btrfs_debug(info, "%s: no writable device", __func__); 5012 return -ENOSPC; 5013 } 5014 5015 index = btrfs_bg_flags_to_raid_index(type); 5016 5017 sub_stripes = btrfs_raid_array[index].sub_stripes; 5018 dev_stripes = btrfs_raid_array[index].dev_stripes; 5019 devs_max = btrfs_raid_array[index].devs_max; 5020 devs_min = btrfs_raid_array[index].devs_min; 5021 devs_increment = btrfs_raid_array[index].devs_increment; 5022 ncopies = btrfs_raid_array[index].ncopies; 5023 nparity = btrfs_raid_array[index].nparity; 5024 5025 if (type & BTRFS_BLOCK_GROUP_DATA) { 5026 max_stripe_size = SZ_1G; 5027 max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; 5028 if (!devs_max) 5029 devs_max = BTRFS_MAX_DEVS(info); 5030 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 5031 /* for larger filesystems, use larger metadata chunks */ 5032 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) 5033 max_stripe_size = SZ_1G; 5034 else 5035 max_stripe_size = SZ_256M; 5036 max_chunk_size = max_stripe_size; 5037 if (!devs_max) 5038 devs_max = BTRFS_MAX_DEVS(info); 5039 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 5040 max_stripe_size = SZ_32M; 5041 max_chunk_size = 2 * max_stripe_size; 5042 if (!devs_max) 5043 devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; 5044 } else { 5045 btrfs_err(info, "invalid chunk type 0x%llx requested", 5046 type); 5047 BUG_ON(1); 5048 } 5049 5050 /* We don't want a chunk larger than 10% of writable space */ 5051 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 5052 max_chunk_size); 5053 5054 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 5055 GFP_NOFS); 5056 if (!devices_info) 5057 return -ENOMEM; 5058 5059 /* 5060 * in the first pass through the devices list, we gather information 5061 * about the available holes on each device. 5062 */ 5063 ndevs = 0; 5064 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 5065 u64 max_avail; 5066 u64 dev_offset; 5067 5068 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 5069 WARN(1, KERN_ERR 5070 "BTRFS: read-only device in alloc_list\n"); 5071 continue; 5072 } 5073 5074 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 5075 &device->dev_state) || 5076 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 5077 continue; 5078 5079 if (device->total_bytes > device->bytes_used) 5080 total_avail = device->total_bytes - device->bytes_used; 5081 else 5082 total_avail = 0; 5083 5084 /* If there is no space on this device, skip it. 
*/ 5085 if (total_avail == 0) 5086 continue; 5087 5088 ret = find_free_dev_extent(trans, device, 5089 max_stripe_size * dev_stripes, 5090 &dev_offset, &max_avail); 5091 if (ret && ret != -ENOSPC) 5092 goto error; 5093 5094 if (ret == 0) 5095 max_avail = max_stripe_size * dev_stripes; 5096 5097 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) { 5098 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5099 btrfs_debug(info, 5100 "%s: devid %llu has no free space, have=%llu want=%u", 5101 __func__, device->devid, max_avail, 5102 BTRFS_STRIPE_LEN * dev_stripes); 5103 continue; 5104 } 5105 5106 if (ndevs == fs_devices->rw_devices) { 5107 WARN(1, "%s: found more than %llu devices\n", 5108 __func__, fs_devices->rw_devices); 5109 break; 5110 } 5111 devices_info[ndevs].dev_offset = dev_offset; 5112 devices_info[ndevs].max_avail = max_avail; 5113 devices_info[ndevs].total_avail = total_avail; 5114 devices_info[ndevs].dev = device; 5115 ++ndevs; 5116 } 5117 5118 /* 5119 * now sort the devices by hole size / available space 5120 */ 5121 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 5122 btrfs_cmp_device_info, NULL); 5123 5124 /* round down to number of usable stripes */ 5125 ndevs = round_down(ndevs, devs_increment); 5126 5127 if (ndevs < devs_min) { 5128 ret = -ENOSPC; 5129 if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 5130 btrfs_debug(info, 5131 "%s: not enough devices with free space: have=%d minimum required=%d", 5132 __func__, ndevs, devs_min); 5133 } 5134 goto error; 5135 } 5136 5137 ndevs = min(ndevs, devs_max); 5138 5139 /* 5140 * The primary goal is to maximize the number of stripes, so use as 5141 * many devices as possible, even if the stripes are not maximum sized. 5142 * 5143 * The DUP profile stores more than one stripe per device, the 5144 * max_avail is the total size so we have to adjust. 5145 */ 5146 stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); 5147 num_stripes = ndevs * dev_stripes; 5148 5149 /* 5150 * this will have to be fixed for RAID1 and RAID10 over 5151 * more drives 5152 */ 5153 data_stripes = (num_stripes - nparity) / ncopies; 5154 5155 /* 5156 * Use the number of data stripes to figure out how big this chunk 5157 * is really going to be in terms of logical address space, 5158 * and compare that answer with the max chunk size. If it's higher, 5159 * we try to reduce stripe_size. 5160 */ 5161 if (stripe_size * data_stripes > max_chunk_size) { 5162 /* 5163 * Reduce stripe_size, round it up to a 16MB boundary again and 5164 * then use it, unless it ends up being even bigger than the 5165 * previous value we had already. 
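 *
 * For example, with stripe_size = 1GiB, data_stripes = 4 and
 * max_chunk_size = 3GiB the chunk would span 4GiB, so stripe_size is
 * reduced to round_up(3GiB / 4, 16MiB) = 768MiB and the resulting
 * chunk covers 3GiB of logical address space.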
5166 */ 5167 stripe_size = min(round_up(div_u64(max_chunk_size, 5168 data_stripes), SZ_16M), 5169 stripe_size); 5170 } 5171 5172 /* align to BTRFS_STRIPE_LEN */ 5173 stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); 5174 5175 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 5176 if (!map) { 5177 ret = -ENOMEM; 5178 goto error; 5179 } 5180 map->num_stripes = num_stripes; 5181 5182 for (i = 0; i < ndevs; ++i) { 5183 for (j = 0; j < dev_stripes; ++j) { 5184 int s = i * dev_stripes + j; 5185 map->stripes[s].dev = devices_info[i].dev; 5186 map->stripes[s].physical = devices_info[i].dev_offset + 5187 j * stripe_size; 5188 } 5189 } 5190 map->stripe_len = BTRFS_STRIPE_LEN; 5191 map->io_align = BTRFS_STRIPE_LEN; 5192 map->io_width = BTRFS_STRIPE_LEN; 5193 map->type = type; 5194 map->sub_stripes = sub_stripes; 5195 5196 chunk_size = stripe_size * data_stripes; 5197 5198 trace_btrfs_chunk_alloc(info, map, start, chunk_size); 5199 5200 em = alloc_extent_map(); 5201 if (!em) { 5202 kfree(map); 5203 ret = -ENOMEM; 5204 goto error; 5205 } 5206 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 5207 em->map_lookup = map; 5208 em->start = start; 5209 em->len = chunk_size; 5210 em->block_start = 0; 5211 em->block_len = em->len; 5212 em->orig_block_len = stripe_size; 5213 5214 em_tree = &info->mapping_tree.map_tree; 5215 write_lock(&em_tree->lock); 5216 ret = add_extent_mapping(em_tree, em, 0); 5217 if (ret) { 5218 write_unlock(&em_tree->lock); 5219 free_extent_map(em); 5220 goto error; 5221 } 5222 5223 list_add_tail(&em->list, &trans->transaction->pending_chunks); 5224 refcount_inc(&em->refs); 5225 write_unlock(&em_tree->lock); 5226 5227 ret = btrfs_make_block_group(trans, 0, type, start, chunk_size); 5228 if (ret) 5229 goto error_del_extent; 5230 5231 for (i = 0; i < map->num_stripes; i++) 5232 btrfs_device_set_bytes_used(map->stripes[i].dev, 5233 map->stripes[i].dev->bytes_used + stripe_size); 5234 5235 atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); 5236 5237 free_extent_map(em); 5238 check_raid56_incompat_flag(info, type); 5239 5240 kfree(devices_info); 5241 return 0; 5242 5243 error_del_extent: 5244 write_lock(&em_tree->lock); 5245 remove_extent_mapping(em_tree, em); 5246 write_unlock(&em_tree->lock); 5247 5248 /* One for our allocation */ 5249 free_extent_map(em); 5250 /* One for the tree reference */ 5251 free_extent_map(em); 5252 /* One for the pending_chunks list reference */ 5253 free_extent_map(em); 5254 error: 5255 kfree(devices_info); 5256 return ret; 5257 } 5258 5259 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, 5260 u64 chunk_offset, u64 chunk_size) 5261 { 5262 struct btrfs_fs_info *fs_info = trans->fs_info; 5263 struct btrfs_root *extent_root = fs_info->extent_root; 5264 struct btrfs_root *chunk_root = fs_info->chunk_root; 5265 struct btrfs_key key; 5266 struct btrfs_device *device; 5267 struct btrfs_chunk *chunk; 5268 struct btrfs_stripe *stripe; 5269 struct extent_map *em; 5270 struct map_lookup *map; 5271 size_t item_size; 5272 u64 dev_offset; 5273 u64 stripe_size; 5274 int i = 0; 5275 int ret = 0; 5276 5277 em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); 5278 if (IS_ERR(em)) 5279 return PTR_ERR(em); 5280 5281 map = em->map_lookup; 5282 item_size = btrfs_chunk_item_size(map->num_stripes); 5283 stripe_size = em->orig_block_len; 5284 5285 chunk = kzalloc(item_size, GFP_NOFS); 5286 if (!chunk) { 5287 ret = -ENOMEM; 5288 goto out; 5289 } 5290 5291 /* 5292 * Take the device list mutex to prevent races with the final phase of 5293 * a 
device replace operation that replaces the device object associated 5294 * with the map's stripes, because the device object's id can change 5295 * at any time during that final phase of the device replace operation 5296 * (dev-replace.c:btrfs_dev_replace_finishing()). 5297 */ 5298 mutex_lock(&fs_info->fs_devices->device_list_mutex); 5299 for (i = 0; i < map->num_stripes; i++) { 5300 device = map->stripes[i].dev; 5301 dev_offset = map->stripes[i].physical; 5302 5303 ret = btrfs_update_device(trans, device); 5304 if (ret) 5305 break; 5306 ret = btrfs_alloc_dev_extent(trans, device, chunk_offset, 5307 dev_offset, stripe_size); 5308 if (ret) 5309 break; 5310 } 5311 if (ret) { 5312 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 5313 goto out; 5314 } 5315 5316 stripe = &chunk->stripe; 5317 for (i = 0; i < map->num_stripes; i++) { 5318 device = map->stripes[i].dev; 5319 dev_offset = map->stripes[i].physical; 5320 5321 btrfs_set_stack_stripe_devid(stripe, device->devid); 5322 btrfs_set_stack_stripe_offset(stripe, dev_offset); 5323 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 5324 stripe++; 5325 } 5326 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 5327 5328 btrfs_set_stack_chunk_length(chunk, chunk_size); 5329 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 5330 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 5331 btrfs_set_stack_chunk_type(chunk, map->type); 5332 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 5333 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 5334 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 5335 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 5336 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 5337 5338 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 5339 key.type = BTRFS_CHUNK_ITEM_KEY; 5340 key.offset = chunk_offset; 5341 5342 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 5343 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 5344 /* 5345 * TODO: Cleanup of inserted chunk root in case of 5346 * failure. 5347 */ 5348 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 5349 } 5350 5351 out: 5352 kfree(chunk); 5353 free_extent_map(em); 5354 return ret; 5355 } 5356 5357 /* 5358 * Chunk allocation falls into two parts. The first part does work 5359 * that makes the new allocated chunk usable, but does not do any operation 5360 * that modifies the chunk tree. The second part does the work that 5361 * requires modifying the chunk tree. This division is important for the 5362 * bootstrap process of adding storage to a seed btrfs. 
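 *
 * The first part is done by __btrfs_alloc_chunk() (via btrfs_alloc_chunk()
 * below), the second by btrfs_finish_chunk_alloc() above, which updates
 * the device items and inserts the device extents and the chunk item on
 * disk.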
5363 */ 5364 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) 5365 { 5366 u64 chunk_offset; 5367 5368 lockdep_assert_held(&trans->fs_info->chunk_mutex); 5369 chunk_offset = find_next_chunk(trans->fs_info); 5370 return __btrfs_alloc_chunk(trans, chunk_offset, type); 5371 } 5372 5373 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 5374 struct btrfs_fs_info *fs_info) 5375 { 5376 u64 chunk_offset; 5377 u64 sys_chunk_offset; 5378 u64 alloc_profile; 5379 int ret; 5380 5381 chunk_offset = find_next_chunk(fs_info); 5382 alloc_profile = btrfs_metadata_alloc_profile(fs_info); 5383 ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); 5384 if (ret) 5385 return ret; 5386 5387 sys_chunk_offset = find_next_chunk(fs_info); 5388 alloc_profile = btrfs_system_alloc_profile(fs_info); 5389 ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); 5390 return ret; 5391 } 5392 5393 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5394 { 5395 int max_errors; 5396 5397 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5398 BTRFS_BLOCK_GROUP_RAID10 | 5399 BTRFS_BLOCK_GROUP_RAID5 | 5400 BTRFS_BLOCK_GROUP_DUP)) { 5401 max_errors = 1; 5402 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { 5403 max_errors = 2; 5404 } else { 5405 max_errors = 0; 5406 } 5407 5408 return max_errors; 5409 } 5410 5411 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5412 { 5413 struct extent_map *em; 5414 struct map_lookup *map; 5415 int readonly = 0; 5416 int miss_ndevs = 0; 5417 int i; 5418 5419 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 5420 if (IS_ERR(em)) 5421 return 1; 5422 5423 map = em->map_lookup; 5424 for (i = 0; i < map->num_stripes; i++) { 5425 if (test_bit(BTRFS_DEV_STATE_MISSING, 5426 &map->stripes[i].dev->dev_state)) { 5427 miss_ndevs++; 5428 continue; 5429 } 5430 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5431 &map->stripes[i].dev->dev_state)) { 5432 readonly = 1; 5433 goto end; 5434 } 5435 } 5436 5437 /* 5438 * If the number of missing devices is larger than max errors, 5439 * we can not write the data into that chunk successfully, so 5440 * set it readonly. 5441 */ 5442 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5443 readonly = 1; 5444 end: 5445 free_extent_map(em); 5446 return readonly; 5447 } 5448 5449 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 5450 { 5451 extent_map_tree_init(&tree->map_tree); 5452 } 5453 5454 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 5455 { 5456 struct extent_map *em; 5457 5458 while (1) { 5459 write_lock(&tree->map_tree.lock); 5460 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 5461 if (em) 5462 remove_extent_mapping(&tree->map_tree, em); 5463 write_unlock(&tree->map_tree.lock); 5464 if (!em) 5465 break; 5466 /* once for us */ 5467 free_extent_map(em); 5468 /* once for the tree */ 5469 free_extent_map(em); 5470 } 5471 } 5472 5473 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5474 { 5475 struct extent_map *em; 5476 struct map_lookup *map; 5477 int ret; 5478 5479 em = btrfs_get_chunk_map(fs_info, logical, len); 5480 if (IS_ERR(em)) 5481 /* 5482 * We could return errors for these cases, but that could get 5483 * ugly and we'd probably do the same thing which is just not do 5484 * anything else and exit, so return 1 so the callers don't try 5485 * to use other copies. 
5486 */ 5487 return 1; 5488 5489 map = em->map_lookup; 5490 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 5491 ret = map->num_stripes; 5492 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5493 ret = map->sub_stripes; 5494 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5495 ret = 2; 5496 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5497 /* 5498 * There could be two corrupted data stripes, we need 5499 * to loop retry in order to rebuild the correct data. 5500 * 5501 * Fail a stripe at a time on every retry except the 5502 * stripe under reconstruction. 5503 */ 5504 ret = map->num_stripes; 5505 else 5506 ret = 1; 5507 free_extent_map(em); 5508 5509 down_read(&fs_info->dev_replace.rwsem); 5510 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 5511 fs_info->dev_replace.tgtdev) 5512 ret++; 5513 up_read(&fs_info->dev_replace.rwsem); 5514 5515 return ret; 5516 } 5517 5518 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 5519 u64 logical) 5520 { 5521 struct extent_map *em; 5522 struct map_lookup *map; 5523 unsigned long len = fs_info->sectorsize; 5524 5525 em = btrfs_get_chunk_map(fs_info, logical, len); 5526 5527 if (!WARN_ON(IS_ERR(em))) { 5528 map = em->map_lookup; 5529 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5530 len = map->stripe_len * nr_data_stripes(map); 5531 free_extent_map(em); 5532 } 5533 return len; 5534 } 5535 5536 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5537 { 5538 struct extent_map *em; 5539 struct map_lookup *map; 5540 int ret = 0; 5541 5542 em = btrfs_get_chunk_map(fs_info, logical, len); 5543 5544 if(!WARN_ON(IS_ERR(em))) { 5545 map = em->map_lookup; 5546 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5547 ret = 1; 5548 free_extent_map(em); 5549 } 5550 return ret; 5551 } 5552 5553 static int find_live_mirror(struct btrfs_fs_info *fs_info, 5554 struct map_lookup *map, int first, 5555 int dev_replace_is_ongoing) 5556 { 5557 int i; 5558 int num_stripes; 5559 int preferred_mirror; 5560 int tolerance; 5561 struct btrfs_device *srcdev; 5562 5563 ASSERT((map->type & 5564 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); 5565 5566 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5567 num_stripes = map->sub_stripes; 5568 else 5569 num_stripes = map->num_stripes; 5570 5571 preferred_mirror = first + current->pid % num_stripes; 5572 5573 if (dev_replace_is_ongoing && 5574 fs_info->dev_replace.cont_reading_from_srcdev_mode == 5575 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 5576 srcdev = fs_info->dev_replace.srcdev; 5577 else 5578 srcdev = NULL; 5579 5580 /* 5581 * try to avoid the drive that is the source drive for a 5582 * dev-replace procedure, only choose it if no other non-missing 5583 * mirror is available 5584 */ 5585 for (tolerance = 0; tolerance < 2; tolerance++) { 5586 if (map->stripes[preferred_mirror].dev->bdev && 5587 (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 5588 return preferred_mirror; 5589 for (i = first; i < first + num_stripes; i++) { 5590 if (map->stripes[i].dev->bdev && 5591 (tolerance || map->stripes[i].dev != srcdev)) 5592 return i; 5593 } 5594 } 5595 5596 /* we couldn't find one that doesn't fail. 
Just return something 5597 * and the io error handling code will clean up eventually 5598 */ 5599 return preferred_mirror; 5600 } 5601 5602 static inline int parity_smaller(u64 a, u64 b) 5603 { 5604 return a > b; 5605 } 5606 5607 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 5608 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) 5609 { 5610 struct btrfs_bio_stripe s; 5611 int i; 5612 u64 l; 5613 int again = 1; 5614 5615 while (again) { 5616 again = 0; 5617 for (i = 0; i < num_stripes - 1; i++) { 5618 if (parity_smaller(bbio->raid_map[i], 5619 bbio->raid_map[i+1])) { 5620 s = bbio->stripes[i]; 5621 l = bbio->raid_map[i]; 5622 bbio->stripes[i] = bbio->stripes[i+1]; 5623 bbio->raid_map[i] = bbio->raid_map[i+1]; 5624 bbio->stripes[i+1] = s; 5625 bbio->raid_map[i+1] = l; 5626 5627 again = 1; 5628 } 5629 } 5630 } 5631 } 5632 5633 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) 5634 { 5635 struct btrfs_bio *bbio = kzalloc( 5636 /* the size of the btrfs_bio */ 5637 sizeof(struct btrfs_bio) + 5638 /* plus the variable array for the stripes */ 5639 sizeof(struct btrfs_bio_stripe) * (total_stripes) + 5640 /* plus the variable array for the tgt dev */ 5641 sizeof(int) * (real_stripes) + 5642 /* 5643 * plus the raid_map, which includes both the tgt dev 5644 * and the stripes 5645 */ 5646 sizeof(u64) * (total_stripes), 5647 GFP_NOFS|__GFP_NOFAIL); 5648 5649 atomic_set(&bbio->error, 0); 5650 refcount_set(&bbio->refs, 1); 5651 5652 return bbio; 5653 } 5654 5655 void btrfs_get_bbio(struct btrfs_bio *bbio) 5656 { 5657 WARN_ON(!refcount_read(&bbio->refs)); 5658 refcount_inc(&bbio->refs); 5659 } 5660 5661 void btrfs_put_bbio(struct btrfs_bio *bbio) 5662 { 5663 if (!bbio) 5664 return; 5665 if (refcount_dec_and_test(&bbio->refs)) 5666 kfree(bbio); 5667 } 5668 5669 /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ 5670 /* 5671 * Please note that, discard won't be sent to target device of device 5672 * replace. 
5673 */ 5674 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, 5675 u64 logical, u64 length, 5676 struct btrfs_bio **bbio_ret) 5677 { 5678 struct extent_map *em; 5679 struct map_lookup *map; 5680 struct btrfs_bio *bbio; 5681 u64 offset; 5682 u64 stripe_nr; 5683 u64 stripe_nr_end; 5684 u64 stripe_end_offset; 5685 u64 stripe_cnt; 5686 u64 stripe_len; 5687 u64 stripe_offset; 5688 u64 num_stripes; 5689 u32 stripe_index; 5690 u32 factor = 0; 5691 u32 sub_stripes = 0; 5692 u64 stripes_per_dev = 0; 5693 u32 remaining_stripes = 0; 5694 u32 last_stripe = 0; 5695 int ret = 0; 5696 int i; 5697 5698 /* discard always return a bbio */ 5699 ASSERT(bbio_ret); 5700 5701 em = btrfs_get_chunk_map(fs_info, logical, length); 5702 if (IS_ERR(em)) 5703 return PTR_ERR(em); 5704 5705 map = em->map_lookup; 5706 /* we don't discard raid56 yet */ 5707 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5708 ret = -EOPNOTSUPP; 5709 goto out; 5710 } 5711 5712 offset = logical - em->start; 5713 length = min_t(u64, em->len - offset, length); 5714 5715 stripe_len = map->stripe_len; 5716 /* 5717 * stripe_nr counts the total number of stripes we have to stride 5718 * to get to this block 5719 */ 5720 stripe_nr = div64_u64(offset, stripe_len); 5721 5722 /* stripe_offset is the offset of this block in its stripe */ 5723 stripe_offset = offset - stripe_nr * stripe_len; 5724 5725 stripe_nr_end = round_up(offset + length, map->stripe_len); 5726 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 5727 stripe_cnt = stripe_nr_end - stripe_nr; 5728 stripe_end_offset = stripe_nr_end * map->stripe_len - 5729 (offset + length); 5730 /* 5731 * after this, stripe_nr is the number of stripes on this 5732 * device we have to walk to find the data, and stripe_index is 5733 * the number of our device in the stripe array 5734 */ 5735 num_stripes = 1; 5736 stripe_index = 0; 5737 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5738 BTRFS_BLOCK_GROUP_RAID10)) { 5739 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5740 sub_stripes = 1; 5741 else 5742 sub_stripes = map->sub_stripes; 5743 5744 factor = map->num_stripes / sub_stripes; 5745 num_stripes = min_t(u64, map->num_stripes, 5746 sub_stripes * stripe_cnt); 5747 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5748 stripe_index *= sub_stripes; 5749 stripes_per_dev = div_u64_rem(stripe_cnt, factor, 5750 &remaining_stripes); 5751 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 5752 last_stripe *= sub_stripes; 5753 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5754 BTRFS_BLOCK_GROUP_DUP)) { 5755 num_stripes = map->num_stripes; 5756 } else { 5757 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5758 &stripe_index); 5759 } 5760 5761 bbio = alloc_btrfs_bio(num_stripes, 0); 5762 if (!bbio) { 5763 ret = -ENOMEM; 5764 goto out; 5765 } 5766 5767 for (i = 0; i < num_stripes; i++) { 5768 bbio->stripes[i].physical = 5769 map->stripes[stripe_index].physical + 5770 stripe_offset + stripe_nr * map->stripe_len; 5771 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 5772 5773 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5774 BTRFS_BLOCK_GROUP_RAID10)) { 5775 bbio->stripes[i].length = stripes_per_dev * 5776 map->stripe_len; 5777 5778 if (i / sub_stripes < remaining_stripes) 5779 bbio->stripes[i].length += 5780 map->stripe_len; 5781 5782 /* 5783 * Special for the first stripe and 5784 * the last stripe: 5785 * 5786 * |-------|...|-------| 5787 * |----------| 5788 * off end_off 5789 */ 5790 if (i < sub_stripes) 5791 bbio->stripes[i].length -= 5792 stripe_offset; 5793 5794 
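/* Likewise trim the tail of the stripes that cover the end of the range (end_off in the sketch above). */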
if (stripe_index >= last_stripe && 5795 stripe_index <= (last_stripe + 5796 sub_stripes - 1)) 5797 bbio->stripes[i].length -= 5798 stripe_end_offset; 5799 5800 if (i == sub_stripes - 1) 5801 stripe_offset = 0; 5802 } else { 5803 bbio->stripes[i].length = length; 5804 } 5805 5806 stripe_index++; 5807 if (stripe_index == map->num_stripes) { 5808 stripe_index = 0; 5809 stripe_nr++; 5810 } 5811 } 5812 5813 *bbio_ret = bbio; 5814 bbio->map_type = map->type; 5815 bbio->num_stripes = num_stripes; 5816 out: 5817 free_extent_map(em); 5818 return ret; 5819 } 5820 5821 /* 5822 * In dev-replace case, for repair case (that's the only case where the mirror 5823 * is selected explicitly when calling btrfs_map_block), blocks left of the 5824 * left cursor can also be read from the target drive. 5825 * 5826 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the 5827 * array of stripes. 5828 * For READ, it also needs to be supported using the same mirror number. 5829 * 5830 * If the requested block is not left of the left cursor, EIO is returned. This 5831 * can happen because btrfs_num_copies() returns one more in the dev-replace 5832 * case. 5833 */ 5834 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, 5835 u64 logical, u64 length, 5836 u64 srcdev_devid, int *mirror_num, 5837 u64 *physical) 5838 { 5839 struct btrfs_bio *bbio = NULL; 5840 int num_stripes; 5841 int index_srcdev = 0; 5842 int found = 0; 5843 u64 physical_of_found = 0; 5844 int i; 5845 int ret = 0; 5846 5847 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 5848 logical, &length, &bbio, 0, 0); 5849 if (ret) { 5850 ASSERT(bbio == NULL); 5851 return ret; 5852 } 5853 5854 num_stripes = bbio->num_stripes; 5855 if (*mirror_num > num_stripes) { 5856 /* 5857 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, 5858 * that means that the requested area is not left of the left 5859 * cursor 5860 */ 5861 btrfs_put_bbio(bbio); 5862 return -EIO; 5863 } 5864 5865 /* 5866 * process the rest of the function using the mirror_num of the source 5867 * drive. Therefore look it up first. At the end, patch the device 5868 * pointer to the one of the target drive. 5869 */ 5870 for (i = 0; i < num_stripes; i++) { 5871 if (bbio->stripes[i].dev->devid != srcdev_devid) 5872 continue; 5873 5874 /* 5875 * In case of DUP, in order to keep it simple, only add the 5876 * mirror with the lowest physical address 5877 */ 5878 if (found && 5879 physical_of_found <= bbio->stripes[i].physical) 5880 continue; 5881 5882 index_srcdev = i; 5883 found = 1; 5884 physical_of_found = bbio->stripes[i].physical; 5885 } 5886 5887 btrfs_put_bbio(bbio); 5888 5889 ASSERT(found); 5890 if (!found) 5891 return -EIO; 5892 5893 *mirror_num = index_srcdev + 1; 5894 *physical = physical_of_found; 5895 return ret; 5896 } 5897 5898 static void handle_ops_on_dev_replace(enum btrfs_map_op op, 5899 struct btrfs_bio **bbio_ret, 5900 struct btrfs_dev_replace *dev_replace, 5901 int *num_stripes_ret, int *max_errors_ret) 5902 { 5903 struct btrfs_bio *bbio = *bbio_ret; 5904 u64 srcdev_devid = dev_replace->srcdev->devid; 5905 int tgtdev_indexes = 0; 5906 int num_stripes = *num_stripes_ret; 5907 int max_errors = *max_errors_ret; 5908 int i; 5909 5910 if (op == BTRFS_MAP_WRITE) { 5911 int index_where_to_add; 5912 5913 /* 5914 * duplicate the write operations while the dev replace 5915 * procedure is running. 
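 * (For every stripe that targets the source device, a second stripe
 * pointing at the same physical offset on the target device is appended
 * below, and bbio->tgtdev_map[] records the mapping between the two.)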
Since the copying of the old disk to 5916 * the new disk takes place at run time while the filesystem is 5917 * mounted writable, the regular write operations to the old 5918 * disk have to be duplicated to go to the new disk as well. 5919 * 5920 * Note that device->missing is handled by the caller, and that 5921 * the write to the old disk is already set up in the stripes 5922 * array. 5923 */ 5924 index_where_to_add = num_stripes; 5925 for (i = 0; i < num_stripes; i++) { 5926 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5927 /* write to new disk, too */ 5928 struct btrfs_bio_stripe *new = 5929 bbio->stripes + index_where_to_add; 5930 struct btrfs_bio_stripe *old = 5931 bbio->stripes + i; 5932 5933 new->physical = old->physical; 5934 new->length = old->length; 5935 new->dev = dev_replace->tgtdev; 5936 bbio->tgtdev_map[i] = index_where_to_add; 5937 index_where_to_add++; 5938 max_errors++; 5939 tgtdev_indexes++; 5940 } 5941 } 5942 num_stripes = index_where_to_add; 5943 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 5944 int index_srcdev = 0; 5945 int found = 0; 5946 u64 physical_of_found = 0; 5947 5948 /* 5949 * During the dev-replace procedure, the target drive can also 5950 * be used to read data in case it is needed to repair a corrupt 5951 * block elsewhere. This is possible if the requested area is 5952 * left of the left cursor. In this area, the target drive is a 5953 * full copy of the source drive. 5954 */ 5955 for (i = 0; i < num_stripes; i++) { 5956 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5957 /* 5958 * In case of DUP, in order to keep it simple, 5959 * only add the mirror with the lowest physical 5960 * address 5961 */ 5962 if (found && 5963 physical_of_found <= 5964 bbio->stripes[i].physical) 5965 continue; 5966 index_srcdev = i; 5967 found = 1; 5968 physical_of_found = bbio->stripes[i].physical; 5969 } 5970 } 5971 if (found) { 5972 struct btrfs_bio_stripe *tgtdev_stripe = 5973 bbio->stripes + num_stripes; 5974 5975 tgtdev_stripe->physical = physical_of_found; 5976 tgtdev_stripe->length = 5977 bbio->stripes[index_srcdev].length; 5978 tgtdev_stripe->dev = dev_replace->tgtdev; 5979 bbio->tgtdev_map[index_srcdev] = num_stripes; 5980 5981 tgtdev_indexes++; 5982 num_stripes++; 5983 } 5984 } 5985 5986 *num_stripes_ret = num_stripes; 5987 *max_errors_ret = max_errors; 5988 bbio->num_tgtdevs = tgtdev_indexes; 5989 *bbio_ret = bbio; 5990 } 5991 5992 static bool need_full_stripe(enum btrfs_map_op op) 5993 { 5994 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 5995 } 5996 5997 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 5998 enum btrfs_map_op op, 5999 u64 logical, u64 *length, 6000 struct btrfs_bio **bbio_ret, 6001 int mirror_num, int need_raid_map) 6002 { 6003 struct extent_map *em; 6004 struct map_lookup *map; 6005 u64 offset; 6006 u64 stripe_offset; 6007 u64 stripe_nr; 6008 u64 stripe_len; 6009 u32 stripe_index; 6010 int i; 6011 int ret = 0; 6012 int num_stripes; 6013 int max_errors = 0; 6014 int tgtdev_indexes = 0; 6015 struct btrfs_bio *bbio = NULL; 6016 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 6017 int dev_replace_is_ongoing = 0; 6018 int num_alloc_stripes; 6019 int patch_the_first_stripe_for_dev_replace = 0; 6020 u64 physical_to_patch_in_first_stripe = 0; 6021 u64 raid56_full_stripe_start = (u64)-1; 6022 6023 if (op == BTRFS_MAP_DISCARD) 6024 return __btrfs_map_block_for_discard(fs_info, logical, 6025 *length, bbio_ret); 6026 6027 em = btrfs_get_chunk_map(fs_info, logical, *length); 6028 if (IS_ERR(em)) 6029 
return PTR_ERR(em); 6030 6031 map = em->map_lookup; 6032 offset = logical - em->start; 6033 6034 stripe_len = map->stripe_len; 6035 stripe_nr = offset; 6036 /* 6037 * stripe_nr counts the total number of stripes we have to stride 6038 * to get to this block 6039 */ 6040 stripe_nr = div64_u64(stripe_nr, stripe_len); 6041 6042 stripe_offset = stripe_nr * stripe_len; 6043 if (offset < stripe_offset) { 6044 btrfs_crit(fs_info, 6045 "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu", 6046 stripe_offset, offset, em->start, logical, 6047 stripe_len); 6048 free_extent_map(em); 6049 return -EINVAL; 6050 } 6051 6052 /* stripe_offset is the offset of this block in its stripe*/ 6053 stripe_offset = offset - stripe_offset; 6054 6055 /* if we're here for raid56, we need to know the stripe aligned start */ 6056 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6057 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 6058 raid56_full_stripe_start = offset; 6059 6060 /* allow a write of a full stripe, but make sure we don't 6061 * allow straddling of stripes 6062 */ 6063 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 6064 full_stripe_len); 6065 raid56_full_stripe_start *= full_stripe_len; 6066 } 6067 6068 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 6069 u64 max_len; 6070 /* For writes to RAID[56], allow a full stripeset across all disks. 6071 For other RAID types and for RAID[56] reads, just allow a single 6072 stripe (on a single disk). */ 6073 if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 6074 (op == BTRFS_MAP_WRITE)) { 6075 max_len = stripe_len * nr_data_stripes(map) - 6076 (offset - raid56_full_stripe_start); 6077 } else { 6078 /* we limit the length of each bio to what fits in a stripe */ 6079 max_len = stripe_len - stripe_offset; 6080 } 6081 *length = min_t(u64, em->len - offset, max_len); 6082 } else { 6083 *length = em->len - offset; 6084 } 6085 6086 /* 6087 * This is for when we're called from btrfs_bio_fits_in_stripe and all 6088 * it cares about is the length 6089 */ 6090 if (!bbio_ret) 6091 goto out; 6092 6093 down_read(&dev_replace->rwsem); 6094 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 6095 /* 6096 * Hold the semaphore for read during the whole operation, write is 6097 * requested at commit time but must wait. 
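 *
 * A rough sketch of the discipline assumed here (an illustration, not a
 * verbatim trace of the replace code): this mapping path stays a reader
 * for as long as it dereferences dev_replace->srcdev / ->tgtdev, while
 * the replace thread takes the rwsem for writing only for the short
 * window in which it switches the target device into place:
 *
 *	down_read(&dev_replace->rwsem);
 *	if (btrfs_dev_replace_is_ongoing(dev_replace))
 *		... srcdev/tgtdev may be used until the matching up_read() ...
 *	up_read(&dev_replace->rwsem);
 *
 * When no replace is running there is nothing left to protect, hence the
 * early up_read() right below.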
6098 */ 6099 if (!dev_replace_is_ongoing) 6100 up_read(&dev_replace->rwsem); 6101 6102 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 6103 !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 6104 ret = get_extra_mirror_from_replace(fs_info, logical, *length, 6105 dev_replace->srcdev->devid, 6106 &mirror_num, 6107 &physical_to_patch_in_first_stripe); 6108 if (ret) 6109 goto out; 6110 else 6111 patch_the_first_stripe_for_dev_replace = 1; 6112 } else if (mirror_num > map->num_stripes) { 6113 mirror_num = 0; 6114 } 6115 6116 num_stripes = 1; 6117 stripe_index = 0; 6118 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 6119 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6120 &stripe_index); 6121 if (!need_full_stripe(op)) 6122 mirror_num = 1; 6123 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 6124 if (need_full_stripe(op)) 6125 num_stripes = map->num_stripes; 6126 else if (mirror_num) 6127 stripe_index = mirror_num - 1; 6128 else { 6129 stripe_index = find_live_mirror(fs_info, map, 0, 6130 dev_replace_is_ongoing); 6131 mirror_num = stripe_index + 1; 6132 } 6133 6134 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 6135 if (need_full_stripe(op)) { 6136 num_stripes = map->num_stripes; 6137 } else if (mirror_num) { 6138 stripe_index = mirror_num - 1; 6139 } else { 6140 mirror_num = 1; 6141 } 6142 6143 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 6144 u32 factor = map->num_stripes / map->sub_stripes; 6145 6146 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6147 stripe_index *= map->sub_stripes; 6148 6149 if (need_full_stripe(op)) 6150 num_stripes = map->sub_stripes; 6151 else if (mirror_num) 6152 stripe_index += mirror_num - 1; 6153 else { 6154 int old_stripe_index = stripe_index; 6155 stripe_index = find_live_mirror(fs_info, map, 6156 stripe_index, 6157 dev_replace_is_ongoing); 6158 mirror_num = stripe_index - old_stripe_index + 1; 6159 } 6160 6161 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6162 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 6163 /* push stripe_nr back to the start of the full stripe */ 6164 stripe_nr = div64_u64(raid56_full_stripe_start, 6165 stripe_len * nr_data_stripes(map)); 6166 6167 /* RAID[56] write or recovery. Return all stripes */ 6168 num_stripes = map->num_stripes; 6169 max_errors = nr_parity_stripes(map); 6170 6171 *length = map->stripe_len; 6172 stripe_index = 0; 6173 stripe_offset = 0; 6174 } else { 6175 /* 6176 * Mirror #0 or #1 means the original data block. 6177 * Mirror #2 is RAID5 parity block. 6178 * Mirror #3 is RAID6 Q block. 
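 *
 * A worked example (illustration only, not an extra code path): on a
 * four device RAID6 chunk, nr_data_stripes() is 2, so a repair read
 * with mirror_num == 2 picks stripe_index = 2 + 2 - 2 = 2 (the P
 * stripe) and mirror_num == 3 picks stripe_index = 2 + 3 - 2 = 3 (the
 * Q stripe), before the parity rotation below is applied.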
6179 */ 6180 stripe_nr = div_u64_rem(stripe_nr, 6181 nr_data_stripes(map), &stripe_index); 6182 if (mirror_num > 1) 6183 stripe_index = nr_data_stripes(map) + 6184 mirror_num - 2; 6185 6186 /* We distribute the parity blocks across stripes */ 6187 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 6188 &stripe_index); 6189 if (!need_full_stripe(op) && mirror_num <= 1) 6190 mirror_num = 1; 6191 } 6192 } else { 6193 /* 6194 * after this, stripe_nr is the number of stripes on this 6195 * device we have to walk to find the data, and stripe_index is 6196 * the number of our device in the stripe array 6197 */ 6198 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6199 &stripe_index); 6200 mirror_num = stripe_index + 1; 6201 } 6202 if (stripe_index >= map->num_stripes) { 6203 btrfs_crit(fs_info, 6204 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 6205 stripe_index, map->num_stripes); 6206 ret = -EINVAL; 6207 goto out; 6208 } 6209 6210 num_alloc_stripes = num_stripes; 6211 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 6212 if (op == BTRFS_MAP_WRITE) 6213 num_alloc_stripes <<= 1; 6214 if (op == BTRFS_MAP_GET_READ_MIRRORS) 6215 num_alloc_stripes++; 6216 tgtdev_indexes = num_stripes; 6217 } 6218 6219 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); 6220 if (!bbio) { 6221 ret = -ENOMEM; 6222 goto out; 6223 } 6224 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) 6225 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); 6226 6227 /* build raid_map */ 6228 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 6229 (need_full_stripe(op) || mirror_num > 1)) { 6230 u64 tmp; 6231 unsigned rot; 6232 6233 bbio->raid_map = (u64 *)((void *)bbio->stripes + 6234 sizeof(struct btrfs_bio_stripe) * 6235 num_alloc_stripes + 6236 sizeof(int) * tgtdev_indexes); 6237 6238 /* Work out the disk rotation on this stripe-set */ 6239 div_u64_rem(stripe_nr, num_stripes, &rot); 6240 6241 /* Fill in the logical address of each stripe */ 6242 tmp = stripe_nr * nr_data_stripes(map); 6243 for (i = 0; i < nr_data_stripes(map); i++) 6244 bbio->raid_map[(i+rot) % num_stripes] = 6245 em->start + (tmp + i) * map->stripe_len; 6246 6247 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 6248 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 6249 bbio->raid_map[(i+rot+1) % num_stripes] = 6250 RAID6_Q_STRIPE; 6251 } 6252 6253 6254 for (i = 0; i < num_stripes; i++) { 6255 bbio->stripes[i].physical = 6256 map->stripes[stripe_index].physical + 6257 stripe_offset + 6258 stripe_nr * map->stripe_len; 6259 bbio->stripes[i].dev = 6260 map->stripes[stripe_index].dev; 6261 stripe_index++; 6262 } 6263 6264 if (need_full_stripe(op)) 6265 max_errors = btrfs_chunk_max_errors(map); 6266 6267 if (bbio->raid_map) 6268 sort_parity_stripes(bbio, num_stripes); 6269 6270 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 6271 need_full_stripe(op)) { 6272 handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, 6273 &max_errors); 6274 } 6275 6276 *bbio_ret = bbio; 6277 bbio->map_type = map->type; 6278 bbio->num_stripes = num_stripes; 6279 bbio->max_errors = max_errors; 6280 bbio->mirror_num = mirror_num; 6281 6282 /* 6283 * this is the case that REQ_READ && dev_replace_is_ongoing && 6284 * mirror_num == num_stripes + 1 && dev_replace target drive is 6285 * available as a mirror 6286 */ 6287 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 6288 WARN_ON(num_stripes > 1); 6289 bbio->stripes[0].dev = dev_replace->tgtdev; 6290 
bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 6291 bbio->mirror_num = map->num_stripes + 1; 6292 } 6293 out: 6294 if (dev_replace_is_ongoing) { 6295 lockdep_assert_held(&dev_replace->rwsem); 6296 /* Unlock and let waiting writers proceed */ 6297 up_read(&dev_replace->rwsem); 6298 } 6299 free_extent_map(em); 6300 return ret; 6301 } 6302 6303 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6304 u64 logical, u64 *length, 6305 struct btrfs_bio **bbio_ret, int mirror_num) 6306 { 6307 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 6308 mirror_num, 0); 6309 } 6310 6311 /* For Scrub/replace */ 6312 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6313 u64 logical, u64 *length, 6314 struct btrfs_bio **bbio_ret) 6315 { 6316 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); 6317 } 6318 6319 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, 6320 u64 physical, u64 **logical, int *naddrs, int *stripe_len) 6321 { 6322 struct extent_map *em; 6323 struct map_lookup *map; 6324 u64 *buf; 6325 u64 bytenr; 6326 u64 length; 6327 u64 stripe_nr; 6328 u64 rmap_len; 6329 int i, j, nr = 0; 6330 6331 em = btrfs_get_chunk_map(fs_info, chunk_start, 1); 6332 if (IS_ERR(em)) 6333 return -EIO; 6334 6335 map = em->map_lookup; 6336 length = em->len; 6337 rmap_len = map->stripe_len; 6338 6339 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 6340 length = div_u64(length, map->num_stripes / map->sub_stripes); 6341 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 6342 length = div_u64(length, map->num_stripes); 6343 else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6344 length = div_u64(length, nr_data_stripes(map)); 6345 rmap_len = map->stripe_len * nr_data_stripes(map); 6346 } 6347 6348 buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); 6349 BUG_ON(!buf); /* -ENOMEM */ 6350 6351 for (i = 0; i < map->num_stripes; i++) { 6352 if (map->stripes[i].physical > physical || 6353 map->stripes[i].physical + length <= physical) 6354 continue; 6355 6356 stripe_nr = physical - map->stripes[i].physical; 6357 stripe_nr = div64_u64(stripe_nr, map->stripe_len); 6358 6359 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 6360 stripe_nr = stripe_nr * map->num_stripes + i; 6361 stripe_nr = div_u64(stripe_nr, map->sub_stripes); 6362 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 6363 stripe_nr = stripe_nr * map->num_stripes + i; 6364 } /* else if RAID[56], multiply by nr_data_stripes(). 
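 * (A worked example for that hypothetical RAID[56] branch: with four
 * devices and a 64K stripe_len, RAID5 has nr_data_stripes() == 3, so an
 * on-disk stripe number N would reverse-map to the full-stripe logical
 * address chunk_start + N * 3 * 64K, which is exactly N * rmap_len as
 * computed above.)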
6365 * Alternatively, just use rmap_len below instead of 6366 * map->stripe_len */ 6367 6368 bytenr = chunk_start + stripe_nr * rmap_len; 6369 WARN_ON(nr >= map->num_stripes); 6370 for (j = 0; j < nr; j++) { 6371 if (buf[j] == bytenr) 6372 break; 6373 } 6374 if (j == nr) { 6375 WARN_ON(nr >= map->num_stripes); 6376 buf[nr++] = bytenr; 6377 } 6378 } 6379 6380 *logical = buf; 6381 *naddrs = nr; 6382 *stripe_len = rmap_len; 6383 6384 free_extent_map(em); 6385 return 0; 6386 } 6387 6388 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) 6389 { 6390 bio->bi_private = bbio->private; 6391 bio->bi_end_io = bbio->end_io; 6392 bio_endio(bio); 6393 6394 btrfs_put_bbio(bbio); 6395 } 6396 6397 static void btrfs_end_bio(struct bio *bio) 6398 { 6399 struct btrfs_bio *bbio = bio->bi_private; 6400 int is_orig_bio = 0; 6401 6402 if (bio->bi_status) { 6403 atomic_inc(&bbio->error); 6404 if (bio->bi_status == BLK_STS_IOERR || 6405 bio->bi_status == BLK_STS_TARGET) { 6406 unsigned int stripe_index = 6407 btrfs_io_bio(bio)->stripe_index; 6408 struct btrfs_device *dev; 6409 6410 BUG_ON(stripe_index >= bbio->num_stripes); 6411 dev = bbio->stripes[stripe_index].dev; 6412 if (dev->bdev) { 6413 if (bio_op(bio) == REQ_OP_WRITE) 6414 btrfs_dev_stat_inc_and_print(dev, 6415 BTRFS_DEV_STAT_WRITE_ERRS); 6416 else 6417 btrfs_dev_stat_inc_and_print(dev, 6418 BTRFS_DEV_STAT_READ_ERRS); 6419 if (bio->bi_opf & REQ_PREFLUSH) 6420 btrfs_dev_stat_inc_and_print(dev, 6421 BTRFS_DEV_STAT_FLUSH_ERRS); 6422 } 6423 } 6424 } 6425 6426 if (bio == bbio->orig_bio) 6427 is_orig_bio = 1; 6428 6429 btrfs_bio_counter_dec(bbio->fs_info); 6430 6431 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6432 if (!is_orig_bio) { 6433 bio_put(bio); 6434 bio = bbio->orig_bio; 6435 } 6436 6437 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6438 /* only send an error to the higher layers if it is 6439 * beyond the tolerance of the btrfs bio 6440 */ 6441 if (atomic_read(&bbio->error) > bbio->max_errors) { 6442 bio->bi_status = BLK_STS_IOERR; 6443 } else { 6444 /* 6445 * this bio is actually up to date, we didn't 6446 * go over the max number of errors 6447 */ 6448 bio->bi_status = BLK_STS_OK; 6449 } 6450 6451 btrfs_end_bbio(bbio, bio); 6452 } else if (!is_orig_bio) { 6453 bio_put(bio); 6454 } 6455 } 6456 6457 /* 6458 * see run_scheduled_bios for a description of why bios are collected for 6459 * async submit. 6460 * 6461 * This will add one bio to the pending list for a device and make sure 6462 * the work struct is scheduled. 
6463 */ 6464 static noinline void btrfs_schedule_bio(struct btrfs_device *device, 6465 struct bio *bio) 6466 { 6467 struct btrfs_fs_info *fs_info = device->fs_info; 6468 int should_queue = 1; 6469 struct btrfs_pending_bios *pending_bios; 6470 6471 /* don't bother with additional async steps for reads, right now */ 6472 if (bio_op(bio) == REQ_OP_READ) { 6473 btrfsic_submit_bio(bio); 6474 return; 6475 } 6476 6477 WARN_ON(bio->bi_next); 6478 bio->bi_next = NULL; 6479 6480 spin_lock(&device->io_lock); 6481 if (op_is_sync(bio->bi_opf)) 6482 pending_bios = &device->pending_sync_bios; 6483 else 6484 pending_bios = &device->pending_bios; 6485 6486 if (pending_bios->tail) 6487 pending_bios->tail->bi_next = bio; 6488 6489 pending_bios->tail = bio; 6490 if (!pending_bios->head) 6491 pending_bios->head = bio; 6492 if (device->running_pending) 6493 should_queue = 0; 6494 6495 spin_unlock(&device->io_lock); 6496 6497 if (should_queue) 6498 btrfs_queue_work(fs_info->submit_workers, &device->work); 6499 } 6500 6501 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, 6502 u64 physical, int dev_nr, int async) 6503 { 6504 struct btrfs_device *dev = bbio->stripes[dev_nr].dev; 6505 struct btrfs_fs_info *fs_info = bbio->fs_info; 6506 6507 bio->bi_private = bbio; 6508 btrfs_io_bio(bio)->stripe_index = dev_nr; 6509 bio->bi_end_io = btrfs_end_bio; 6510 bio->bi_iter.bi_sector = physical >> 9; 6511 btrfs_debug_in_rcu(fs_info, 6512 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6513 bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector, 6514 (u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, 6515 bio->bi_iter.bi_size); 6516 bio_set_dev(bio, dev->bdev); 6517 6518 btrfs_bio_counter_inc_noblocked(fs_info); 6519 6520 if (async) 6521 btrfs_schedule_bio(dev, bio); 6522 else 6523 btrfsic_submit_bio(bio); 6524 } 6525 6526 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 6527 { 6528 atomic_inc(&bbio->error); 6529 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6530 /* Should be the original bio. 
*/ 6531 WARN_ON(bio != bbio->orig_bio); 6532 6533 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6534 bio->bi_iter.bi_sector = logical >> 9; 6535 if (atomic_read(&bbio->error) > bbio->max_errors) 6536 bio->bi_status = BLK_STS_IOERR; 6537 else 6538 bio->bi_status = BLK_STS_OK; 6539 btrfs_end_bbio(bbio, bio); 6540 } 6541 } 6542 6543 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 6544 int mirror_num, int async_submit) 6545 { 6546 struct btrfs_device *dev; 6547 struct bio *first_bio = bio; 6548 u64 logical = (u64)bio->bi_iter.bi_sector << 9; 6549 u64 length = 0; 6550 u64 map_length; 6551 int ret; 6552 int dev_nr; 6553 int total_devs; 6554 struct btrfs_bio *bbio = NULL; 6555 6556 length = bio->bi_iter.bi_size; 6557 map_length = length; 6558 6559 btrfs_bio_counter_inc_blocked(fs_info); 6560 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, 6561 &map_length, &bbio, mirror_num, 1); 6562 if (ret) { 6563 btrfs_bio_counter_dec(fs_info); 6564 return errno_to_blk_status(ret); 6565 } 6566 6567 total_devs = bbio->num_stripes; 6568 bbio->orig_bio = first_bio; 6569 bbio->private = first_bio->bi_private; 6570 bbio->end_io = first_bio->bi_end_io; 6571 bbio->fs_info = fs_info; 6572 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 6573 6574 if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 6575 ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { 6576 /* In this case, map_length has been set to the length of 6577 a single stripe; not the whole write */ 6578 if (bio_op(bio) == REQ_OP_WRITE) { 6579 ret = raid56_parity_write(fs_info, bio, bbio, 6580 map_length); 6581 } else { 6582 ret = raid56_parity_recover(fs_info, bio, bbio, 6583 map_length, mirror_num, 1); 6584 } 6585 6586 btrfs_bio_counter_dec(fs_info); 6587 return errno_to_blk_status(ret); 6588 } 6589 6590 if (map_length < length) { 6591 btrfs_crit(fs_info, 6592 "mapping failed logical %llu bio len %llu len %llu", 6593 logical, length, map_length); 6594 BUG(); 6595 } 6596 6597 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { 6598 dev = bbio->stripes[dev_nr].dev; 6599 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, 6600 &dev->dev_state) || 6601 (bio_op(first_bio) == REQ_OP_WRITE && 6602 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { 6603 bbio_error(bbio, first_bio, logical); 6604 continue; 6605 } 6606 6607 if (dev_nr < total_devs - 1) 6608 bio = btrfs_bio_clone(first_bio); 6609 else 6610 bio = first_bio; 6611 6612 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, 6613 dev_nr, async_submit); 6614 } 6615 btrfs_bio_counter_dec(fs_info); 6616 return BLK_STS_OK; 6617 } 6618 6619 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 6620 u8 *uuid, u8 *fsid) 6621 { 6622 struct btrfs_device *device; 6623 struct btrfs_fs_devices *cur_devices; 6624 6625 cur_devices = fs_info->fs_devices; 6626 while (cur_devices) { 6627 if (!fsid || 6628 !memcmp(cur_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { 6629 device = find_device(cur_devices, devid, uuid); 6630 if (device) 6631 return device; 6632 } 6633 cur_devices = cur_devices->seed; 6634 } 6635 return NULL; 6636 } 6637 6638 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6639 u64 devid, u8 *dev_uuid) 6640 { 6641 struct btrfs_device *device; 6642 6643 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6644 if (IS_ERR(device)) 6645 return device; 6646 6647 list_add(&device->dev_list, &fs_devices->devices); 6648 device->fs_devices = fs_devices; 6649 fs_devices->num_devices++; 
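	/*
	 * The metadata referenced this devid but no matching device was found
	 * at scan time, so account the stand-in as missing: num_devices above
	 * counts every device tracked by this fs_devices, missing_devices
	 * below counts the subset that has no opened block device behind it.
	 */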
6650 6651 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6652 fs_devices->missing_devices++; 6653 6654 return device; 6655 } 6656 6657 /** 6658 * btrfs_alloc_device - allocate struct btrfs_device 6659 * @fs_info: used only for generating a new devid, can be NULL if 6660 * devid is provided (i.e. @devid != NULL). 6661 * @devid: a pointer to devid for this device. If NULL a new devid 6662 * is generated. 6663 * @uuid: a pointer to UUID for this device. If NULL a new UUID 6664 * is generated. 6665 * 6666 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 6667 * on error. Returned struct is not linked onto any lists and must be 6668 * destroyed with btrfs_free_device. 6669 */ 6670 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 6671 const u64 *devid, 6672 const u8 *uuid) 6673 { 6674 struct btrfs_device *dev; 6675 u64 tmp; 6676 6677 if (WARN_ON(!devid && !fs_info)) 6678 return ERR_PTR(-EINVAL); 6679 6680 dev = __alloc_device(); 6681 if (IS_ERR(dev)) 6682 return dev; 6683 6684 if (devid) 6685 tmp = *devid; 6686 else { 6687 int ret; 6688 6689 ret = find_next_devid(fs_info, &tmp); 6690 if (ret) { 6691 btrfs_free_device(dev); 6692 return ERR_PTR(ret); 6693 } 6694 } 6695 dev->devid = tmp; 6696 6697 if (uuid) 6698 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 6699 else 6700 generate_random_uuid(dev->uuid); 6701 6702 btrfs_init_work(&dev->work, btrfs_submit_helper, 6703 pending_bios_fn, NULL, NULL); 6704 6705 return dev; 6706 } 6707 6708 /* Return -EIO if any error, otherwise return 0. */ 6709 static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, 6710 struct extent_buffer *leaf, 6711 struct btrfs_chunk *chunk, u64 logical) 6712 { 6713 u64 length; 6714 u64 stripe_len; 6715 u16 num_stripes; 6716 u16 sub_stripes; 6717 u64 type; 6718 u64 features; 6719 bool mixed = false; 6720 6721 length = btrfs_chunk_length(leaf, chunk); 6722 stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6723 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6724 sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6725 type = btrfs_chunk_type(leaf, chunk); 6726 6727 if (!num_stripes) { 6728 btrfs_err(fs_info, "invalid chunk num_stripes: %u", 6729 num_stripes); 6730 return -EIO; 6731 } 6732 if (!IS_ALIGNED(logical, fs_info->sectorsize)) { 6733 btrfs_err(fs_info, "invalid chunk logical %llu", logical); 6734 return -EIO; 6735 } 6736 if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { 6737 btrfs_err(fs_info, "invalid chunk sectorsize %u", 6738 btrfs_chunk_sector_size(leaf, chunk)); 6739 return -EIO; 6740 } 6741 if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { 6742 btrfs_err(fs_info, "invalid chunk length %llu", length); 6743 return -EIO; 6744 } 6745 if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { 6746 btrfs_err(fs_info, "invalid chunk stripe length: %llu", 6747 stripe_len); 6748 return -EIO; 6749 } 6750 if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & 6751 type) { 6752 btrfs_err(fs_info, "unrecognized chunk type: %llu", 6753 ~(BTRFS_BLOCK_GROUP_TYPE_MASK | 6754 BTRFS_BLOCK_GROUP_PROFILE_MASK) & 6755 btrfs_chunk_type(leaf, chunk)); 6756 return -EIO; 6757 } 6758 6759 if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { 6760 btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type); 6761 return -EIO; 6762 } 6763 6764 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && 6765 (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { 6766 btrfs_err(fs_info, 6767 "system chunk with data or metadata type: 0x%llx", type); 6768 
return -EIO; 6769 } 6770 6771 features = btrfs_super_incompat_flags(fs_info->super_copy); 6772 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 6773 mixed = true; 6774 6775 if (!mixed) { 6776 if ((type & BTRFS_BLOCK_GROUP_METADATA) && 6777 (type & BTRFS_BLOCK_GROUP_DATA)) { 6778 btrfs_err(fs_info, 6779 "mixed chunk type in non-mixed mode: 0x%llx", type); 6780 return -EIO; 6781 } 6782 } 6783 6784 if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || 6785 (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || 6786 (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || 6787 (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || 6788 (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || 6789 ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && 6790 num_stripes != 1)) { 6791 btrfs_err(fs_info, 6792 "invalid num_stripes:sub_stripes %u:%u for profile %llu", 6793 num_stripes, sub_stripes, 6794 type & BTRFS_BLOCK_GROUP_PROFILE_MASK); 6795 return -EIO; 6796 } 6797 6798 return 0; 6799 } 6800 6801 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 6802 u64 devid, u8 *uuid, bool error) 6803 { 6804 if (error) 6805 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 6806 devid, uuid); 6807 else 6808 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 6809 devid, uuid); 6810 } 6811 6812 static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, 6813 struct extent_buffer *leaf, 6814 struct btrfs_chunk *chunk) 6815 { 6816 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 6817 struct map_lookup *map; 6818 struct extent_map *em; 6819 u64 logical; 6820 u64 length; 6821 u64 devid; 6822 u8 uuid[BTRFS_UUID_SIZE]; 6823 int num_stripes; 6824 int ret; 6825 int i; 6826 6827 logical = key->offset; 6828 length = btrfs_chunk_length(leaf, chunk); 6829 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6830 6831 ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical); 6832 if (ret) 6833 return ret; 6834 6835 read_lock(&map_tree->map_tree.lock); 6836 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 6837 read_unlock(&map_tree->map_tree.lock); 6838 6839 /* already mapped? 
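 * This is expected rather than an error: read_one_chunk() runs once for
 * the chunks packed into the superblock's sys_chunk_array and again when
 * the chunk tree itself is read, so a lookup that finds an extent map
 * already covering @logical can simply be skipped.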
*/ 6840 if (em && em->start <= logical && em->start + em->len > logical) { 6841 free_extent_map(em); 6842 return 0; 6843 } else if (em) { 6844 free_extent_map(em); 6845 } 6846 6847 em = alloc_extent_map(); 6848 if (!em) 6849 return -ENOMEM; 6850 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 6851 if (!map) { 6852 free_extent_map(em); 6853 return -ENOMEM; 6854 } 6855 6856 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 6857 em->map_lookup = map; 6858 em->start = logical; 6859 em->len = length; 6860 em->orig_start = 0; 6861 em->block_start = 0; 6862 em->block_len = em->len; 6863 6864 map->num_stripes = num_stripes; 6865 map->io_width = btrfs_chunk_io_width(leaf, chunk); 6866 map->io_align = btrfs_chunk_io_align(leaf, chunk); 6867 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6868 map->type = btrfs_chunk_type(leaf, chunk); 6869 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6870 map->verified_stripes = 0; 6871 for (i = 0; i < num_stripes; i++) { 6872 map->stripes[i].physical = 6873 btrfs_stripe_offset_nr(leaf, chunk, i); 6874 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 6875 read_extent_buffer(leaf, uuid, (unsigned long) 6876 btrfs_stripe_dev_uuid_nr(chunk, i), 6877 BTRFS_UUID_SIZE); 6878 map->stripes[i].dev = btrfs_find_device(fs_info, devid, 6879 uuid, NULL); 6880 if (!map->stripes[i].dev && 6881 !btrfs_test_opt(fs_info, DEGRADED)) { 6882 free_extent_map(em); 6883 btrfs_report_missing_device(fs_info, devid, uuid, true); 6884 return -ENOENT; 6885 } 6886 if (!map->stripes[i].dev) { 6887 map->stripes[i].dev = 6888 add_missing_dev(fs_info->fs_devices, devid, 6889 uuid); 6890 if (IS_ERR(map->stripes[i].dev)) { 6891 free_extent_map(em); 6892 btrfs_err(fs_info, 6893 "failed to init missing dev %llu: %ld", 6894 devid, PTR_ERR(map->stripes[i].dev)); 6895 return PTR_ERR(map->stripes[i].dev); 6896 } 6897 btrfs_report_missing_device(fs_info, devid, uuid, false); 6898 } 6899 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 6900 &(map->stripes[i].dev->dev_state)); 6901 6902 } 6903 6904 write_lock(&map_tree->map_tree.lock); 6905 ret = add_extent_mapping(&map_tree->map_tree, em, 0); 6906 write_unlock(&map_tree->map_tree.lock); 6907 if (ret < 0) { 6908 btrfs_err(fs_info, 6909 "failed to add chunk map, start=%llu len=%llu: %d", 6910 em->start, em->len, ret); 6911 } 6912 free_extent_map(em); 6913 6914 return ret; 6915 } 6916 6917 static void fill_device_from_item(struct extent_buffer *leaf, 6918 struct btrfs_dev_item *dev_item, 6919 struct btrfs_device *device) 6920 { 6921 unsigned long ptr; 6922 6923 device->devid = btrfs_device_id(leaf, dev_item); 6924 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 6925 device->total_bytes = device->disk_total_bytes; 6926 device->commit_total_bytes = device->disk_total_bytes; 6927 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 6928 device->commit_bytes_used = device->bytes_used; 6929 device->type = btrfs_device_type(leaf, dev_item); 6930 device->io_align = btrfs_device_io_align(leaf, dev_item); 6931 device->io_width = btrfs_device_io_width(leaf, dev_item); 6932 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 6933 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 6934 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 6935 6936 ptr = btrfs_device_uuid(dev_item); 6937 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 6938 } 6939 6940 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 6941 u8 *fsid) 6942 { 6943 struct btrfs_fs_devices *fs_devices; 6944 int 
ret; 6945 6946 lockdep_assert_held(&uuid_mutex); 6947 ASSERT(fsid); 6948 6949 fs_devices = fs_info->fs_devices->seed; 6950 while (fs_devices) { 6951 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 6952 return fs_devices; 6953 6954 fs_devices = fs_devices->seed; 6955 } 6956 6957 fs_devices = find_fsid(fsid, NULL); 6958 if (!fs_devices) { 6959 if (!btrfs_test_opt(fs_info, DEGRADED)) 6960 return ERR_PTR(-ENOENT); 6961 6962 fs_devices = alloc_fs_devices(fsid, NULL); 6963 if (IS_ERR(fs_devices)) 6964 return fs_devices; 6965 6966 fs_devices->seeding = 1; 6967 fs_devices->opened = 1; 6968 return fs_devices; 6969 } 6970 6971 fs_devices = clone_fs_devices(fs_devices); 6972 if (IS_ERR(fs_devices)) 6973 return fs_devices; 6974 6975 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); 6976 if (ret) { 6977 free_fs_devices(fs_devices); 6978 fs_devices = ERR_PTR(ret); 6979 goto out; 6980 } 6981 6982 if (!fs_devices->seeding) { 6983 close_fs_devices(fs_devices); 6984 free_fs_devices(fs_devices); 6985 fs_devices = ERR_PTR(-EINVAL); 6986 goto out; 6987 } 6988 6989 fs_devices->seed = fs_info->fs_devices->seed; 6990 fs_info->fs_devices->seed = fs_devices; 6991 out: 6992 return fs_devices; 6993 } 6994 6995 static int read_one_dev(struct btrfs_fs_info *fs_info, 6996 struct extent_buffer *leaf, 6997 struct btrfs_dev_item *dev_item) 6998 { 6999 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7000 struct btrfs_device *device; 7001 u64 devid; 7002 int ret; 7003 u8 fs_uuid[BTRFS_FSID_SIZE]; 7004 u8 dev_uuid[BTRFS_UUID_SIZE]; 7005 7006 devid = btrfs_device_id(leaf, dev_item); 7007 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 7008 BTRFS_UUID_SIZE); 7009 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 7010 BTRFS_FSID_SIZE); 7011 7012 if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { 7013 fs_devices = open_seed_devices(fs_info, fs_uuid); 7014 if (IS_ERR(fs_devices)) 7015 return PTR_ERR(fs_devices); 7016 } 7017 7018 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 7019 if (!device) { 7020 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7021 btrfs_report_missing_device(fs_info, devid, 7022 dev_uuid, true); 7023 return -ENOENT; 7024 } 7025 7026 device = add_missing_dev(fs_devices, devid, dev_uuid); 7027 if (IS_ERR(device)) { 7028 btrfs_err(fs_info, 7029 "failed to add missing dev %llu: %ld", 7030 devid, PTR_ERR(device)); 7031 return PTR_ERR(device); 7032 } 7033 btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 7034 } else { 7035 if (!device->bdev) { 7036 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7037 btrfs_report_missing_device(fs_info, 7038 devid, dev_uuid, true); 7039 return -ENOENT; 7040 } 7041 btrfs_report_missing_device(fs_info, devid, 7042 dev_uuid, false); 7043 } 7044 7045 if (!device->bdev && 7046 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 7047 /* 7048 * this happens when a device that was properly setup 7049 * in the device info lists suddenly goes bad. 
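 * (One plausible way to get here, as an illustration rather than an
 * exhaustive list: the disk was present when it was scanned and
 * registered, but it failed or was pulled before open_fs_devices()
 * could open it under a degraded mount.)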
7050 * device->bdev is NULL, and so we have to set
7051 * the BTRFS_DEV_STATE_MISSING bit here
7052 */
7053 device->fs_devices->missing_devices++;
7054 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
7055 }
7056
7057 /* Move the device to its own fs_devices */
7058 if (device->fs_devices != fs_devices) {
7059 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
7060 &device->dev_state));
7061
7062 list_move(&device->dev_list, &fs_devices->devices);
7063 device->fs_devices->num_devices--;
7064 fs_devices->num_devices++;
7065
7066 device->fs_devices->missing_devices--;
7067 fs_devices->missing_devices++;
7068
7069 device->fs_devices = fs_devices;
7070 }
7071 }
7072
7073 if (device->fs_devices != fs_info->fs_devices) {
7074 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
7075 if (device->generation !=
7076 btrfs_device_generation(leaf, dev_item))
7077 return -EINVAL;
7078 }
7079
7080 fill_device_from_item(leaf, dev_item, device);
7081 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
7082 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
7083 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
7084 device->fs_devices->total_rw_bytes += device->total_bytes;
7085 atomic64_add(device->total_bytes - device->bytes_used,
7086 &fs_info->free_chunk_space);
7087 }
7088 ret = 0;
7089 return ret;
7090 }
7091
7092 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
7093 {
7094 struct btrfs_root *root = fs_info->tree_root;
7095 struct btrfs_super_block *super_copy = fs_info->super_copy;
7096 struct extent_buffer *sb;
7097 struct btrfs_disk_key *disk_key;
7098 struct btrfs_chunk *chunk;
7099 u8 *array_ptr;
7100 unsigned long sb_array_offset;
7101 int ret = 0;
7102 u32 num_stripes;
7103 u32 array_size;
7104 u32 len = 0;
7105 u32 cur_offset;
7106 u64 type;
7107 struct btrfs_key key;
7108
7109 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
7110 /*
7111 * This will create an extent buffer of nodesize, superblock size is
7112 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
7113 * overallocate but we can keep it as-is, only the first page is used.
7114 */
7115 sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
7116 if (IS_ERR(sb))
7117 return PTR_ERR(sb);
7118 set_extent_buffer_uptodate(sb);
7119 btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
7120 /*
7121 * The sb extent buffer is artificial and just used to read the system array.
7122 * set_extent_buffer_uptodate() call does not properly mark all its
7123 * pages up-to-date when the page is larger: extent does not cover the
7124 * whole page and consequently check_page_uptodate does not find all
7125 * the page's extents up-to-date (the hole beyond sb),
7126 * write_extent_buffer then triggers a WARN_ON.
7127 *
7128 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
7129 * but sb spans only this function. Add an explicit SetPageUptodate call
7130 * to silence the warning e.g. on PowerPC 64.
7131 */ 7132 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 7133 SetPageUptodate(sb->pages[0]); 7134 7135 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 7136 array_size = btrfs_super_sys_array_size(super_copy); 7137 7138 array_ptr = super_copy->sys_chunk_array; 7139 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 7140 cur_offset = 0; 7141 7142 while (cur_offset < array_size) { 7143 disk_key = (struct btrfs_disk_key *)array_ptr; 7144 len = sizeof(*disk_key); 7145 if (cur_offset + len > array_size) 7146 goto out_short_read; 7147 7148 btrfs_disk_key_to_cpu(&key, disk_key); 7149 7150 array_ptr += len; 7151 sb_array_offset += len; 7152 cur_offset += len; 7153 7154 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 7155 chunk = (struct btrfs_chunk *)sb_array_offset; 7156 /* 7157 * At least one btrfs_chunk with one stripe must be 7158 * present, exact stripe count check comes afterwards 7159 */ 7160 len = btrfs_chunk_item_size(1); 7161 if (cur_offset + len > array_size) 7162 goto out_short_read; 7163 7164 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 7165 if (!num_stripes) { 7166 btrfs_err(fs_info, 7167 "invalid number of stripes %u in sys_array at offset %u", 7168 num_stripes, cur_offset); 7169 ret = -EIO; 7170 break; 7171 } 7172 7173 type = btrfs_chunk_type(sb, chunk); 7174 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 7175 btrfs_err(fs_info, 7176 "invalid chunk type %llu in sys_array at offset %u", 7177 type, cur_offset); 7178 ret = -EIO; 7179 break; 7180 } 7181 7182 len = btrfs_chunk_item_size(num_stripes); 7183 if (cur_offset + len > array_size) 7184 goto out_short_read; 7185 7186 ret = read_one_chunk(fs_info, &key, sb, chunk); 7187 if (ret) 7188 break; 7189 } else { 7190 btrfs_err(fs_info, 7191 "unexpected item type %u in sys_array at offset %u", 7192 (u32)key.type, cur_offset); 7193 ret = -EIO; 7194 break; 7195 } 7196 array_ptr += len; 7197 sb_array_offset += len; 7198 cur_offset += len; 7199 } 7200 clear_extent_buffer_uptodate(sb); 7201 free_extent_buffer_stale(sb); 7202 return ret; 7203 7204 out_short_read: 7205 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 7206 len, cur_offset); 7207 clear_extent_buffer_uptodate(sb); 7208 free_extent_buffer_stale(sb); 7209 return -EIO; 7210 } 7211 7212 /* 7213 * Check if all chunks in the fs are OK for read-write degraded mount 7214 * 7215 * If the @failing_dev is specified, it's accounted as missing. 7216 * 7217 * Return true if all chunks meet the minimal RW mount requirements. 7218 * Return false if any chunk doesn't meet the minimal RW mount requirements. 7219 */ 7220 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 7221 struct btrfs_device *failing_dev) 7222 { 7223 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 7224 struct extent_map *em; 7225 u64 next_start = 0; 7226 bool ret = true; 7227 7228 read_lock(&map_tree->map_tree.lock); 7229 em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1); 7230 read_unlock(&map_tree->map_tree.lock); 7231 /* No chunk at all? 
Return false anyway */ 7232 if (!em) { 7233 ret = false; 7234 goto out; 7235 } 7236 while (em) { 7237 struct map_lookup *map; 7238 int missing = 0; 7239 int max_tolerated; 7240 int i; 7241 7242 map = em->map_lookup; 7243 max_tolerated = 7244 btrfs_get_num_tolerated_disk_barrier_failures( 7245 map->type); 7246 for (i = 0; i < map->num_stripes; i++) { 7247 struct btrfs_device *dev = map->stripes[i].dev; 7248 7249 if (!dev || !dev->bdev || 7250 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 7251 dev->last_flush_error) 7252 missing++; 7253 else if (failing_dev && failing_dev == dev) 7254 missing++; 7255 } 7256 if (missing > max_tolerated) { 7257 if (!failing_dev) 7258 btrfs_warn(fs_info, 7259 "chunk %llu missing %d devices, max tolerance is %d for writable mount", 7260 em->start, missing, max_tolerated); 7261 free_extent_map(em); 7262 ret = false; 7263 goto out; 7264 } 7265 next_start = extent_map_end(em); 7266 free_extent_map(em); 7267 7268 read_lock(&map_tree->map_tree.lock); 7269 em = lookup_extent_mapping(&map_tree->map_tree, next_start, 7270 (u64)(-1) - next_start); 7271 read_unlock(&map_tree->map_tree.lock); 7272 } 7273 out: 7274 return ret; 7275 } 7276 7277 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 7278 { 7279 struct btrfs_root *root = fs_info->chunk_root; 7280 struct btrfs_path *path; 7281 struct extent_buffer *leaf; 7282 struct btrfs_key key; 7283 struct btrfs_key found_key; 7284 int ret; 7285 int slot; 7286 u64 total_dev = 0; 7287 7288 path = btrfs_alloc_path(); 7289 if (!path) 7290 return -ENOMEM; 7291 7292 /* 7293 * uuid_mutex is needed only if we are mounting a sprout FS 7294 * otherwise we don't need it. 7295 */ 7296 mutex_lock(&uuid_mutex); 7297 mutex_lock(&fs_info->chunk_mutex); 7298 7299 /* 7300 * Read all device items, and then all the chunk items. All 7301 * device items are found before any chunk item (their object id 7302 * is smaller than the lowest possible object id for a chunk 7303 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 7304 */ 7305 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 7306 key.offset = 0; 7307 key.type = 0; 7308 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7309 if (ret < 0) 7310 goto error; 7311 while (1) { 7312 leaf = path->nodes[0]; 7313 slot = path->slots[0]; 7314 if (slot >= btrfs_header_nritems(leaf)) { 7315 ret = btrfs_next_leaf(root, path); 7316 if (ret == 0) 7317 continue; 7318 if (ret < 0) 7319 goto error; 7320 break; 7321 } 7322 btrfs_item_key_to_cpu(leaf, &found_key, slot); 7323 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 7324 struct btrfs_dev_item *dev_item; 7325 dev_item = btrfs_item_ptr(leaf, slot, 7326 struct btrfs_dev_item); 7327 ret = read_one_dev(fs_info, leaf, dev_item); 7328 if (ret) 7329 goto error; 7330 total_dev++; 7331 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 7332 struct btrfs_chunk *chunk; 7333 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 7334 ret = read_one_chunk(fs_info, &found_key, leaf, chunk); 7335 if (ret) 7336 goto error; 7337 } 7338 path->slots[0]++; 7339 } 7340 7341 /* 7342 * After loading chunk tree, we've got all device information, 7343 * do another round of validation checks. 
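 * Two cheap cross-checks follow: the number of device items walked above
 * must match the device count recorded for the filesystem, and the super
 * block's total_bytes must be at least as large as the writable bytes
 * summed from the individual devices. Either mismatch points at corrupt
 * or inconsistent metadata, so the mount is refused with -EINVAL instead
 * of continuing on bad numbers.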
7344 */ 7345 if (total_dev != fs_info->fs_devices->total_devices) { 7346 btrfs_err(fs_info, 7347 "super_num_devices %llu mismatch with num_devices %llu found here", 7348 btrfs_super_num_devices(fs_info->super_copy), 7349 total_dev); 7350 ret = -EINVAL; 7351 goto error; 7352 } 7353 if (btrfs_super_total_bytes(fs_info->super_copy) < 7354 fs_info->fs_devices->total_rw_bytes) { 7355 btrfs_err(fs_info, 7356 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", 7357 btrfs_super_total_bytes(fs_info->super_copy), 7358 fs_info->fs_devices->total_rw_bytes); 7359 ret = -EINVAL; 7360 goto error; 7361 } 7362 ret = 0; 7363 error: 7364 mutex_unlock(&fs_info->chunk_mutex); 7365 mutex_unlock(&uuid_mutex); 7366 7367 btrfs_free_path(path); 7368 return ret; 7369 } 7370 7371 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 7372 { 7373 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7374 struct btrfs_device *device; 7375 7376 while (fs_devices) { 7377 mutex_lock(&fs_devices->device_list_mutex); 7378 list_for_each_entry(device, &fs_devices->devices, dev_list) 7379 device->fs_info = fs_info; 7380 mutex_unlock(&fs_devices->device_list_mutex); 7381 7382 fs_devices = fs_devices->seed; 7383 } 7384 } 7385 7386 static void __btrfs_reset_dev_stats(struct btrfs_device *dev) 7387 { 7388 int i; 7389 7390 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7391 btrfs_dev_stat_reset(dev, i); 7392 } 7393 7394 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 7395 { 7396 struct btrfs_key key; 7397 struct btrfs_key found_key; 7398 struct btrfs_root *dev_root = fs_info->dev_root; 7399 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7400 struct extent_buffer *eb; 7401 int slot; 7402 int ret = 0; 7403 struct btrfs_device *device; 7404 struct btrfs_path *path = NULL; 7405 int i; 7406 7407 path = btrfs_alloc_path(); 7408 if (!path) { 7409 ret = -ENOMEM; 7410 goto out; 7411 } 7412 7413 mutex_lock(&fs_devices->device_list_mutex); 7414 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7415 int item_size; 7416 struct btrfs_dev_stats_item *ptr; 7417 7418 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7419 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7420 key.offset = device->devid; 7421 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 7422 if (ret) { 7423 __btrfs_reset_dev_stats(device); 7424 device->dev_stats_valid = 1; 7425 btrfs_release_path(path); 7426 continue; 7427 } 7428 slot = path->slots[0]; 7429 eb = path->nodes[0]; 7430 btrfs_item_key_to_cpu(eb, &found_key, slot); 7431 item_size = btrfs_item_size_nr(eb, slot); 7432 7433 ptr = btrfs_item_ptr(eb, slot, 7434 struct btrfs_dev_stats_item); 7435 7436 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7437 if (item_size >= (1 + i) * sizeof(__le64)) 7438 btrfs_dev_stat_set(device, i, 7439 btrfs_dev_stats_value(eb, ptr, i)); 7440 else 7441 btrfs_dev_stat_reset(device, i); 7442 } 7443 7444 device->dev_stats_valid = 1; 7445 btrfs_dev_stat_print_on_load(device); 7446 btrfs_release_path(path); 7447 } 7448 mutex_unlock(&fs_devices->device_list_mutex); 7449 7450 out: 7451 btrfs_free_path(path); 7452 return ret < 0 ? 
ret : 0; 7453 } 7454 7455 static int update_dev_stat_item(struct btrfs_trans_handle *trans, 7456 struct btrfs_device *device) 7457 { 7458 struct btrfs_fs_info *fs_info = trans->fs_info; 7459 struct btrfs_root *dev_root = fs_info->dev_root; 7460 struct btrfs_path *path; 7461 struct btrfs_key key; 7462 struct extent_buffer *eb; 7463 struct btrfs_dev_stats_item *ptr; 7464 int ret; 7465 int i; 7466 7467 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7468 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7469 key.offset = device->devid; 7470 7471 path = btrfs_alloc_path(); 7472 if (!path) 7473 return -ENOMEM; 7474 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 7475 if (ret < 0) { 7476 btrfs_warn_in_rcu(fs_info, 7477 "error %d while searching for dev_stats item for device %s", 7478 ret, rcu_str_deref(device->name)); 7479 goto out; 7480 } 7481 7482 if (ret == 0 && 7483 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 7484 /* need to delete old one and insert a new one */ 7485 ret = btrfs_del_item(trans, dev_root, path); 7486 if (ret != 0) { 7487 btrfs_warn_in_rcu(fs_info, 7488 "delete too small dev_stats item for device %s failed %d", 7489 rcu_str_deref(device->name), ret); 7490 goto out; 7491 } 7492 ret = 1; 7493 } 7494 7495 if (ret == 1) { 7496 /* need to insert a new item */ 7497 btrfs_release_path(path); 7498 ret = btrfs_insert_empty_item(trans, dev_root, path, 7499 &key, sizeof(*ptr)); 7500 if (ret < 0) { 7501 btrfs_warn_in_rcu(fs_info, 7502 "insert dev_stats item for device %s failed %d", 7503 rcu_str_deref(device->name), ret); 7504 goto out; 7505 } 7506 } 7507 7508 eb = path->nodes[0]; 7509 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 7510 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7511 btrfs_set_dev_stats_value(eb, ptr, i, 7512 btrfs_dev_stat_read(device, i)); 7513 btrfs_mark_buffer_dirty(eb); 7514 7515 out: 7516 btrfs_free_path(path); 7517 return ret; 7518 } 7519 7520 /* 7521 * called from commit_transaction. Writes all changed device stats to disk. 7522 */ 7523 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 7524 struct btrfs_fs_info *fs_info) 7525 { 7526 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7527 struct btrfs_device *device; 7528 int stats_cnt; 7529 int ret = 0; 7530 7531 mutex_lock(&fs_devices->device_list_mutex); 7532 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7533 stats_cnt = atomic_read(&device->dev_stats_ccnt); 7534 if (!device->dev_stats_valid || stats_cnt == 0) 7535 continue; 7536 7537 7538 /* 7539 * There is a LOAD-LOAD control dependency between the value of 7540 * dev_stats_ccnt and updating the on-disk values which requires 7541 * reading the in-memory counters. Such control dependencies 7542 * require explicit read memory barriers. 
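 *
 * A sketch of the ordering relied upon here (writer side is
 * btrfs_dev_stat_inc()/btrfs_dev_stat_set(), reader side is this loop):
 *
 *	writer				reader
 *	------				------
 *	update dev_stat_values[i]	stats_cnt = atomic_read(ccnt)
 *	smp_mb__before_atomic()		smp_rmb()
 *	atomic_inc(ccnt)		read the dev_stat values
 *
 * so a reader that sees the bumped counter is also guaranteed to see the
 * stat values written before it.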
7543 * 7544 * This memory barriers pairs with smp_mb__before_atomic in 7545 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full 7546 * barrier implied by atomic_xchg in 7547 * btrfs_dev_stats_read_and_reset 7548 */ 7549 smp_rmb(); 7550 7551 ret = update_dev_stat_item(trans, device); 7552 if (!ret) 7553 atomic_sub(stats_cnt, &device->dev_stats_ccnt); 7554 } 7555 mutex_unlock(&fs_devices->device_list_mutex); 7556 7557 return ret; 7558 } 7559 7560 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) 7561 { 7562 btrfs_dev_stat_inc(dev, index); 7563 btrfs_dev_stat_print_on_error(dev); 7564 } 7565 7566 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) 7567 { 7568 if (!dev->dev_stats_valid) 7569 return; 7570 btrfs_err_rl_in_rcu(dev->fs_info, 7571 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7572 rcu_str_deref(dev->name), 7573 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7574 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7575 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7576 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7577 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7578 } 7579 7580 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 7581 { 7582 int i; 7583 7584 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7585 if (btrfs_dev_stat_read(dev, i) != 0) 7586 break; 7587 if (i == BTRFS_DEV_STAT_VALUES_MAX) 7588 return; /* all values == 0, suppress message */ 7589 7590 btrfs_info_in_rcu(dev->fs_info, 7591 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7592 rcu_str_deref(dev->name), 7593 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7594 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7595 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7596 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7597 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7598 } 7599 7600 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, 7601 struct btrfs_ioctl_get_dev_stats *stats) 7602 { 7603 struct btrfs_device *dev; 7604 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7605 int i; 7606 7607 mutex_lock(&fs_devices->device_list_mutex); 7608 dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL); 7609 mutex_unlock(&fs_devices->device_list_mutex); 7610 7611 if (!dev) { 7612 btrfs_warn(fs_info, "get dev_stats failed, device not found"); 7613 return -ENODEV; 7614 } else if (!dev->dev_stats_valid) { 7615 btrfs_warn(fs_info, "get dev_stats failed, not yet valid"); 7616 return -ENODEV; 7617 } else if (stats->flags & BTRFS_DEV_STATS_RESET) { 7618 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7619 if (stats->nr_items > i) 7620 stats->values[i] = 7621 btrfs_dev_stat_read_and_reset(dev, i); 7622 else 7623 btrfs_dev_stat_reset(dev, i); 7624 } 7625 } else { 7626 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7627 if (stats->nr_items > i) 7628 stats->values[i] = btrfs_dev_stat_read(dev, i); 7629 } 7630 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) 7631 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 7632 return 0; 7633 } 7634 7635 void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path) 7636 { 7637 struct buffer_head *bh; 7638 struct btrfs_super_block *disk_super; 7639 int copy_num; 7640 7641 if (!bdev) 7642 return; 7643 7644 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; 7645 copy_num++) { 7646 7647 if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) 7648 continue; 7649 7650 disk_super = (struct btrfs_super_block 
*)bh->b_data; 7651 7652 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 7653 set_buffer_dirty(bh); 7654 sync_dirty_buffer(bh); 7655 brelse(bh); 7656 } 7657 7658 /* Notify udev that device has changed */ 7659 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 7660 7661 /* Update ctime/mtime for device path for libblkid */ 7662 update_dev_time(device_path); 7663 } 7664 7665 /* 7666 * Update the size of all devices, which is used for writing out the 7667 * super blocks. 7668 */ 7669 void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) 7670 { 7671 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7672 struct btrfs_device *curr, *next; 7673 7674 if (list_empty(&fs_devices->resized_devices)) 7675 return; 7676 7677 mutex_lock(&fs_devices->device_list_mutex); 7678 mutex_lock(&fs_info->chunk_mutex); 7679 list_for_each_entry_safe(curr, next, &fs_devices->resized_devices, 7680 resized_list) { 7681 list_del_init(&curr->resized_list); 7682 curr->commit_total_bytes = curr->disk_total_bytes; 7683 } 7684 mutex_unlock(&fs_info->chunk_mutex); 7685 mutex_unlock(&fs_devices->device_list_mutex); 7686 } 7687 7688 /* Must be invoked during the transaction commit */ 7689 void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans) 7690 { 7691 struct btrfs_fs_info *fs_info = trans->fs_info; 7692 struct extent_map *em; 7693 struct map_lookup *map; 7694 struct btrfs_device *dev; 7695 int i; 7696 7697 if (list_empty(&trans->pending_chunks)) 7698 return; 7699 7700 /* In order to kick the device replace finish process */ 7701 mutex_lock(&fs_info->chunk_mutex); 7702 list_for_each_entry(em, &trans->pending_chunks, list) { 7703 map = em->map_lookup; 7704 7705 for (i = 0; i < map->num_stripes; i++) { 7706 dev = map->stripes[i].dev; 7707 dev->commit_bytes_used = dev->bytes_used; 7708 } 7709 } 7710 mutex_unlock(&fs_info->chunk_mutex); 7711 } 7712 7713 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) 7714 { 7715 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7716 while (fs_devices) { 7717 fs_devices->fs_info = fs_info; 7718 fs_devices = fs_devices->seed; 7719 } 7720 } 7721 7722 void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) 7723 { 7724 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7725 while (fs_devices) { 7726 fs_devices->fs_info = NULL; 7727 fs_devices = fs_devices->seed; 7728 } 7729 } 7730 7731 /* 7732 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. 
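 *
 * For example, 1GiB of logical space in a RAID1 or DUP block group takes
 * roughly 2GiB of raw disk space, so callers scale by the factor returned
 * below. single and RAID0 map 1:1, and the parity overhead of RAID5/6 is
 * not a simple per-byte multiplier, so all of those report a factor of 1.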
7733 */ 7734 int btrfs_bg_type_to_factor(u64 flags) 7735 { 7736 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 7737 BTRFS_BLOCK_GROUP_RAID10)) 7738 return 2; 7739 return 1; 7740 } 7741 7742 7743 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) 7744 { 7745 int index = btrfs_bg_flags_to_raid_index(type); 7746 int ncopies = btrfs_raid_array[index].ncopies; 7747 int data_stripes; 7748 7749 switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 7750 case BTRFS_BLOCK_GROUP_RAID5: 7751 data_stripes = num_stripes - 1; 7752 break; 7753 case BTRFS_BLOCK_GROUP_RAID6: 7754 data_stripes = num_stripes - 2; 7755 break; 7756 default: 7757 data_stripes = num_stripes / ncopies; 7758 break; 7759 } 7760 return div_u64(chunk_len, data_stripes); 7761 } 7762 7763 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, 7764 u64 chunk_offset, u64 devid, 7765 u64 physical_offset, u64 physical_len) 7766 { 7767 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 7768 struct extent_map *em; 7769 struct map_lookup *map; 7770 struct btrfs_device *dev; 7771 u64 stripe_len; 7772 bool found = false; 7773 int ret = 0; 7774 int i; 7775 7776 read_lock(&em_tree->lock); 7777 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 7778 read_unlock(&em_tree->lock); 7779 7780 if (!em) { 7781 btrfs_err(fs_info, 7782 "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", 7783 physical_offset, devid); 7784 ret = -EUCLEAN; 7785 goto out; 7786 } 7787 7788 map = em->map_lookup; 7789 stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes); 7790 if (physical_len != stripe_len) { 7791 btrfs_err(fs_info, 7792 "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", 7793 physical_offset, devid, em->start, physical_len, 7794 stripe_len); 7795 ret = -EUCLEAN; 7796 goto out; 7797 } 7798 7799 for (i = 0; i < map->num_stripes; i++) { 7800 if (map->stripes[i].dev->devid == devid && 7801 map->stripes[i].physical == physical_offset) { 7802 found = true; 7803 if (map->verified_stripes >= map->num_stripes) { 7804 btrfs_err(fs_info, 7805 "too many dev extents for chunk %llu found", 7806 em->start); 7807 ret = -EUCLEAN; 7808 goto out; 7809 } 7810 map->verified_stripes++; 7811 break; 7812 } 7813 } 7814 if (!found) { 7815 btrfs_err(fs_info, 7816 "dev extent physical offset %llu devid %llu has no corresponding chunk", 7817 physical_offset, devid); 7818 ret = -EUCLEAN; 7819 } 7820 7821 /* Make sure no dev extent is beyond device bondary */ 7822 dev = btrfs_find_device(fs_info, devid, NULL, NULL); 7823 if (!dev) { 7824 btrfs_err(fs_info, "failed to find devid %llu", devid); 7825 ret = -EUCLEAN; 7826 goto out; 7827 } 7828 7829 /* It's possible this device is a dummy for seed device */ 7830 if (dev->disk_total_bytes == 0) { 7831 dev = find_device(fs_info->fs_devices->seed, devid, NULL); 7832 if (!dev) { 7833 btrfs_err(fs_info, "failed to find seed devid %llu", 7834 devid); 7835 ret = -EUCLEAN; 7836 goto out; 7837 } 7838 } 7839 7840 if (physical_offset + physical_len > dev->disk_total_bytes) { 7841 btrfs_err(fs_info, 7842 "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", 7843 devid, physical_offset, physical_len, 7844 dev->disk_total_bytes); 7845 ret = -EUCLEAN; 7846 goto out; 7847 } 7848 out: 7849 free_extent_map(em); 7850 return ret; 7851 } 7852 7853 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) 7854 { 7855 struct extent_map_tree *em_tree = 
&fs_info->mapping_tree.map_tree; 7856 struct extent_map *em; 7857 struct rb_node *node; 7858 int ret = 0; 7859 7860 read_lock(&em_tree->lock); 7861 for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { 7862 em = rb_entry(node, struct extent_map, rb_node); 7863 if (em->map_lookup->num_stripes != 7864 em->map_lookup->verified_stripes) { 7865 btrfs_err(fs_info, 7866 "chunk %llu has missing dev extent, have %d expect %d", 7867 em->start, em->map_lookup->verified_stripes, 7868 em->map_lookup->num_stripes); 7869 ret = -EUCLEAN; 7870 goto out; 7871 } 7872 } 7873 out: 7874 read_unlock(&em_tree->lock); 7875 return ret; 7876 } 7877 7878 /* 7879 * Ensure that all dev extents are mapped to correct chunk, otherwise 7880 * later chunk allocation/free would cause unexpected behavior. 7881 * 7882 * NOTE: This will iterate through the whole device tree, which should be of 7883 * the same size level as the chunk tree. This slightly increases mount time. 7884 */ 7885 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) 7886 { 7887 struct btrfs_path *path; 7888 struct btrfs_root *root = fs_info->dev_root; 7889 struct btrfs_key key; 7890 u64 prev_devid = 0; 7891 u64 prev_dev_ext_end = 0; 7892 int ret = 0; 7893 7894 key.objectid = 1; 7895 key.type = BTRFS_DEV_EXTENT_KEY; 7896 key.offset = 0; 7897 7898 path = btrfs_alloc_path(); 7899 if (!path) 7900 return -ENOMEM; 7901 7902 path->reada = READA_FORWARD; 7903 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7904 if (ret < 0) 7905 goto out; 7906 7907 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 7908 ret = btrfs_next_item(root, path); 7909 if (ret < 0) 7910 goto out; 7911 /* No dev extents at all? Not good */ 7912 if (ret > 0) { 7913 ret = -EUCLEAN; 7914 goto out; 7915 } 7916 } 7917 while (1) { 7918 struct extent_buffer *leaf = path->nodes[0]; 7919 struct btrfs_dev_extent *dext; 7920 int slot = path->slots[0]; 7921 u64 chunk_offset; 7922 u64 physical_offset; 7923 u64 physical_len; 7924 u64 devid; 7925 7926 btrfs_item_key_to_cpu(leaf, &key, slot); 7927 if (key.type != BTRFS_DEV_EXTENT_KEY) 7928 break; 7929 devid = key.objectid; 7930 physical_offset = key.offset; 7931 7932 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); 7933 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); 7934 physical_len = btrfs_dev_extent_length(leaf, dext); 7935 7936 /* Check if this dev extent overlaps with the previous one */ 7937 if (devid == prev_devid && physical_offset < prev_dev_ext_end) { 7938 btrfs_err(fs_info, 7939 "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", 7940 devid, physical_offset, prev_dev_ext_end); 7941 ret = -EUCLEAN; 7942 goto out; 7943 } 7944 7945 ret = verify_one_dev_extent(fs_info, chunk_offset, devid, 7946 physical_offset, physical_len); 7947 if (ret < 0) 7948 goto out; 7949 prev_devid = devid; 7950 prev_dev_ext_end = physical_offset + physical_len; 7951 7952 ret = btrfs_next_item(root, path); 7953 if (ret < 0) 7954 goto out; 7955 if (ret > 0) { 7956 ret = 0; 7957 break; 7958 } 7959 } 7960 7961 /* Ensure all chunks have corresponding dev extents */ 7962 ret = verify_chunk_dev_extent_mapping(fs_info); 7963 out: 7964 btrfs_free_path(path); 7965 return ret; 7966 } 7967 7968 /* 7969 * Check whether the given block group or device is pinned by any inode being 7970 * used as a swapfile. 
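 *
 * The swapfile_pins rb-tree is keyed by the bare pointer value, so the
 * same lookup below serves both kinds of caller (roughly: "may this block
 * group be relocated?" and "may this device be shrunk?"); a hit means an
 * active swapfile still depends on that object.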
7971 */ 7972 bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) 7973 { 7974 struct btrfs_swapfile_pin *sp; 7975 struct rb_node *node; 7976 7977 spin_lock(&fs_info->swapfile_pins_lock); 7978 node = fs_info->swapfile_pins.rb_node; 7979 while (node) { 7980 sp = rb_entry(node, struct btrfs_swapfile_pin, node); 7981 if (ptr < sp->ptr) 7982 node = node->rb_left; 7983 else if (ptr > sp->ptr) 7984 node = node->rb_right; 7985 else 7986 break; 7987 } 7988 spin_unlock(&fs_info->swapfile_pins_lock); 7989 return node != NULL; 7990 } 7991