1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/bio.h> 8 #include <linux/slab.h> 9 #include <linux/buffer_head.h> 10 #include <linux/blkdev.h> 11 #include <linux/ratelimit.h> 12 #include <linux/kthread.h> 13 #include <linux/raid/pq.h> 14 #include <linux/semaphore.h> 15 #include <linux/uuid.h> 16 #include <linux/list_sort.h> 17 #include "ctree.h" 18 #include "extent_map.h" 19 #include "disk-io.h" 20 #include "transaction.h" 21 #include "print-tree.h" 22 #include "volumes.h" 23 #include "raid56.h" 24 #include "async-thread.h" 25 #include "check-integrity.h" 26 #include "rcu-string.h" 27 #include "math.h" 28 #include "dev-replace.h" 29 #include "sysfs.h" 30 31 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 32 [BTRFS_RAID_RAID10] = { 33 .sub_stripes = 2, 34 .dev_stripes = 1, 35 .devs_max = 0, /* 0 == as many as possible */ 36 .devs_min = 4, 37 .tolerated_failures = 1, 38 .devs_increment = 2, 39 .ncopies = 2, 40 .raid_name = "raid10", 41 .bg_flag = BTRFS_BLOCK_GROUP_RAID10, 42 .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, 43 }, 44 [BTRFS_RAID_RAID1] = { 45 .sub_stripes = 1, 46 .dev_stripes = 1, 47 .devs_max = 2, 48 .devs_min = 2, 49 .tolerated_failures = 1, 50 .devs_increment = 2, 51 .ncopies = 2, 52 .raid_name = "raid1", 53 .bg_flag = BTRFS_BLOCK_GROUP_RAID1, 54 .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, 55 }, 56 [BTRFS_RAID_DUP] = { 57 .sub_stripes = 1, 58 .dev_stripes = 2, 59 .devs_max = 1, 60 .devs_min = 1, 61 .tolerated_failures = 0, 62 .devs_increment = 1, 63 .ncopies = 2, 64 .raid_name = "dup", 65 .bg_flag = BTRFS_BLOCK_GROUP_DUP, 66 .mindev_error = 0, 67 }, 68 [BTRFS_RAID_RAID0] = { 69 .sub_stripes = 1, 70 .dev_stripes = 1, 71 .devs_max = 0, 72 .devs_min = 2, 73 .tolerated_failures = 0, 74 .devs_increment = 1, 75 .ncopies = 1, 76 .raid_name = "raid0", 77 .bg_flag = BTRFS_BLOCK_GROUP_RAID0, 78 .mindev_error = 0, 79 }, 80 [BTRFS_RAID_SINGLE] = { 81 .sub_stripes = 1, 82 .dev_stripes = 1, 83 .devs_max = 1, 84 .devs_min = 1, 85 .tolerated_failures = 0, 86 .devs_increment = 1, 87 .ncopies = 1, 88 .raid_name = "single", 89 .bg_flag = 0, 90 .mindev_error = 0, 91 }, 92 [BTRFS_RAID_RAID5] = { 93 .sub_stripes = 1, 94 .dev_stripes = 1, 95 .devs_max = 0, 96 .devs_min = 2, 97 .tolerated_failures = 1, 98 .devs_increment = 1, 99 .ncopies = 2, 100 .raid_name = "raid5", 101 .bg_flag = BTRFS_BLOCK_GROUP_RAID5, 102 .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, 103 }, 104 [BTRFS_RAID_RAID6] = { 105 .sub_stripes = 1, 106 .dev_stripes = 1, 107 .devs_max = 0, 108 .devs_min = 3, 109 .tolerated_failures = 2, 110 .devs_increment = 1, 111 .ncopies = 3, 112 .raid_name = "raid6", 113 .bg_flag = BTRFS_BLOCK_GROUP_RAID6, 114 .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, 115 }, 116 }; 117 118 const char *get_raid_name(enum btrfs_raid_types type) 119 { 120 if (type >= BTRFS_NR_RAID_TYPES) 121 return NULL; 122 123 return btrfs_raid_array[type].raid_name; 124 } 125 126 static int init_first_rw_device(struct btrfs_trans_handle *trans, 127 struct btrfs_fs_info *fs_info); 128 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); 129 static void __btrfs_reset_dev_stats(struct btrfs_device *dev); 130 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); 131 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); 132 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 133 enum btrfs_map_op op, 134 u64 logical, u64 *length, 135 struct btrfs_bio **bbio_ret, 136 int mirror_num, int need_raid_map); 137 138 /* 139 * Device locking 140 * ============== 141 * 142 * There are several mutexes that protect manipulation of devices and low-level 143 * structures like chunks but not block groups, extents or files 144 * 145 * uuid_mutex (global lock) 146 * ------------------------ 147 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from 148 * the SCAN_DEV ioctl registration or from mount either implicitly (the first 149 * device) or requested by the device= mount option 150 * 151 * the mutex can be very coarse and can cover long-running operations 152 * 153 * protects: updates to fs_devices counters like missing devices, rw devices, 154 * seeding, structure cloning, openning/closing devices at mount/umount time 155 * 156 * global::fs_devs - add, remove, updates to the global list 157 * 158 * does not protect: manipulation of the fs_devices::devices list! 159 * 160 * btrfs_device::name - renames (write side), read is RCU 161 * 162 * fs_devices::device_list_mutex (per-fs, with RCU) 163 * ------------------------------------------------ 164 * protects updates to fs_devices::devices, ie. adding and deleting 165 * 166 * simple list traversal with read-only actions can be done with RCU protection 167 * 168 * may be used to exclude some operations from running concurrently without any 169 * modifications to the list (see write_all_supers) 170 * 171 * balance_mutex 172 * ------------- 173 * protects balance structures (status, state) and context accessed from 174 * several places (internally, ioctl) 175 * 176 * chunk_mutex 177 * ----------- 178 * protects chunks, adding or removing during allocation, trim or when a new 179 * device is added/removed 180 * 181 * cleaner_mutex 182 * ------------- 183 * a big lock that is held by the cleaner thread and prevents running subvolume 184 * cleaning together with relocation or delayed iputs 185 * 186 * 187 * Lock nesting 188 * ============ 189 * 190 * uuid_mutex 191 * volume_mutex 192 * device_list_mutex 193 * chunk_mutex 194 * balance_mutex 195 * 196 * 197 * Exclusive operations, BTRFS_FS_EXCL_OP 198 * ====================================== 199 * 200 * Maintains the exclusivity of the following operations that apply to the 201 * whole filesystem and cannot run in parallel. 202 * 203 * - Balance (*) 204 * - Device add 205 * - Device remove 206 * - Device replace (*) 207 * - Resize 208 * 209 * The device operations (as above) can be in one of the following states: 210 * 211 * - Running state 212 * - Paused state 213 * - Completed state 214 * 215 * Only device operations marked with (*) can go into the Paused state for the 216 * following reasons: 217 * 218 * - ioctl (only Balance can be Paused through ioctl) 219 * - filesystem remounted as read-only 220 * - filesystem unmounted and mounted as read-only 221 * - system power-cycle and filesystem mounted as read-only 222 * - filesystem or device errors leading to forced read-only 223 * 224 * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations. 225 * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set. 226 * A device operation in Paused or Running state can be canceled or resumed 227 * either by ioctl (Balance only) or when remounted as read-write. 228 * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or 229 * completed. 230 */ 231 232 DEFINE_MUTEX(uuid_mutex); 233 static LIST_HEAD(fs_uuids); 234 struct list_head *btrfs_get_fs_uuids(void) 235 { 236 return &fs_uuids; 237 } 238 239 /* 240 * alloc_fs_devices - allocate struct btrfs_fs_devices 241 * @fsid: if not NULL, copy the uuid to fs_devices::fsid 242 * 243 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). 244 * The returned struct is not linked onto any lists and can be destroyed with 245 * kfree() right away. 246 */ 247 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) 248 { 249 struct btrfs_fs_devices *fs_devs; 250 251 fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); 252 if (!fs_devs) 253 return ERR_PTR(-ENOMEM); 254 255 mutex_init(&fs_devs->device_list_mutex); 256 257 INIT_LIST_HEAD(&fs_devs->devices); 258 INIT_LIST_HEAD(&fs_devs->resized_devices); 259 INIT_LIST_HEAD(&fs_devs->alloc_list); 260 INIT_LIST_HEAD(&fs_devs->fs_list); 261 if (fsid) 262 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); 263 264 return fs_devs; 265 } 266 267 void btrfs_free_device(struct btrfs_device *device) 268 { 269 rcu_string_free(device->name); 270 bio_put(device->flush_bio); 271 kfree(device); 272 } 273 274 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 275 { 276 struct btrfs_device *device; 277 WARN_ON(fs_devices->opened); 278 while (!list_empty(&fs_devices->devices)) { 279 device = list_entry(fs_devices->devices.next, 280 struct btrfs_device, dev_list); 281 list_del(&device->dev_list); 282 btrfs_free_device(device); 283 } 284 kfree(fs_devices); 285 } 286 287 static void btrfs_kobject_uevent(struct block_device *bdev, 288 enum kobject_action action) 289 { 290 int ret; 291 292 ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); 293 if (ret) 294 pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", 295 action, 296 kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), 297 &disk_to_dev(bdev->bd_disk)->kobj); 298 } 299 300 void __exit btrfs_cleanup_fs_uuids(void) 301 { 302 struct btrfs_fs_devices *fs_devices; 303 304 while (!list_empty(&fs_uuids)) { 305 fs_devices = list_entry(fs_uuids.next, 306 struct btrfs_fs_devices, fs_list); 307 list_del(&fs_devices->fs_list); 308 free_fs_devices(fs_devices); 309 } 310 } 311 312 /* 313 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error. 314 * Returned struct is not linked onto any lists and must be destroyed using 315 * btrfs_free_device. 316 */ 317 static struct btrfs_device *__alloc_device(void) 318 { 319 struct btrfs_device *dev; 320 321 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 322 if (!dev) 323 return ERR_PTR(-ENOMEM); 324 325 /* 326 * Preallocate a bio that's always going to be used for flushing device 327 * barriers and matches the device lifespan 328 */ 329 dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL); 330 if (!dev->flush_bio) { 331 kfree(dev); 332 return ERR_PTR(-ENOMEM); 333 } 334 335 INIT_LIST_HEAD(&dev->dev_list); 336 INIT_LIST_HEAD(&dev->dev_alloc_list); 337 INIT_LIST_HEAD(&dev->resized_list); 338 339 spin_lock_init(&dev->io_lock); 340 341 atomic_set(&dev->reada_in_flight, 0); 342 atomic_set(&dev->dev_stats_ccnt, 0); 343 btrfs_device_data_ordered_init(dev); 344 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 345 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 346 347 return dev; 348 } 349 350 /* 351 * Find a device specified by @devid or @uuid in the list of @fs_devices, or 352 * return NULL. 353 * 354 * If devid and uuid are both specified, the match must be exact, otherwise 355 * only devid is used. 356 */ 357 static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices, 358 u64 devid, const u8 *uuid) 359 { 360 struct btrfs_device *dev; 361 362 list_for_each_entry(dev, &fs_devices->devices, dev_list) { 363 if (dev->devid == devid && 364 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 365 return dev; 366 } 367 } 368 return NULL; 369 } 370 371 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 372 { 373 struct btrfs_fs_devices *fs_devices; 374 375 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 376 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 377 return fs_devices; 378 } 379 return NULL; 380 } 381 382 static int 383 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, 384 int flush, struct block_device **bdev, 385 struct buffer_head **bh) 386 { 387 int ret; 388 389 *bdev = blkdev_get_by_path(device_path, flags, holder); 390 391 if (IS_ERR(*bdev)) { 392 ret = PTR_ERR(*bdev); 393 goto error; 394 } 395 396 if (flush) 397 filemap_write_and_wait((*bdev)->bd_inode->i_mapping); 398 ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); 399 if (ret) { 400 blkdev_put(*bdev, flags); 401 goto error; 402 } 403 invalidate_bdev(*bdev); 404 *bh = btrfs_read_dev_super(*bdev); 405 if (IS_ERR(*bh)) { 406 ret = PTR_ERR(*bh); 407 blkdev_put(*bdev, flags); 408 goto error; 409 } 410 411 return 0; 412 413 error: 414 *bdev = NULL; 415 *bh = NULL; 416 return ret; 417 } 418 419 static void requeue_list(struct btrfs_pending_bios *pending_bios, 420 struct bio *head, struct bio *tail) 421 { 422 423 struct bio *old_head; 424 425 old_head = pending_bios->head; 426 pending_bios->head = head; 427 if (pending_bios->tail) 428 tail->bi_next = old_head; 429 else 430 pending_bios->tail = tail; 431 } 432 433 /* 434 * we try to collect pending bios for a device so we don't get a large 435 * number of procs sending bios down to the same device. This greatly 436 * improves the schedulers ability to collect and merge the bios. 437 * 438 * But, it also turns into a long list of bios to process and that is sure 439 * to eventually make the worker thread block. The solution here is to 440 * make some progress and then put this work struct back at the end of 441 * the list if the block device is congested. This way, multiple devices 442 * can make progress from a single worker thread. 443 */ 444 static noinline void run_scheduled_bios(struct btrfs_device *device) 445 { 446 struct btrfs_fs_info *fs_info = device->fs_info; 447 struct bio *pending; 448 struct backing_dev_info *bdi; 449 struct btrfs_pending_bios *pending_bios; 450 struct bio *tail; 451 struct bio *cur; 452 int again = 0; 453 unsigned long num_run; 454 unsigned long batch_run = 0; 455 unsigned long last_waited = 0; 456 int force_reg = 0; 457 int sync_pending = 0; 458 struct blk_plug plug; 459 460 /* 461 * this function runs all the bios we've collected for 462 * a particular device. We don't want to wander off to 463 * another device without first sending all of these down. 464 * So, setup a plug here and finish it off before we return 465 */ 466 blk_start_plug(&plug); 467 468 bdi = device->bdev->bd_bdi; 469 470 loop: 471 spin_lock(&device->io_lock); 472 473 loop_lock: 474 num_run = 0; 475 476 /* take all the bios off the list at once and process them 477 * later on (without the lock held). But, remember the 478 * tail and other pointers so the bios can be properly reinserted 479 * into the list if we hit congestion 480 */ 481 if (!force_reg && device->pending_sync_bios.head) { 482 pending_bios = &device->pending_sync_bios; 483 force_reg = 1; 484 } else { 485 pending_bios = &device->pending_bios; 486 force_reg = 0; 487 } 488 489 pending = pending_bios->head; 490 tail = pending_bios->tail; 491 WARN_ON(pending && !tail); 492 493 /* 494 * if pending was null this time around, no bios need processing 495 * at all and we can stop. Otherwise it'll loop back up again 496 * and do an additional check so no bios are missed. 497 * 498 * device->running_pending is used to synchronize with the 499 * schedule_bio code. 500 */ 501 if (device->pending_sync_bios.head == NULL && 502 device->pending_bios.head == NULL) { 503 again = 0; 504 device->running_pending = 0; 505 } else { 506 again = 1; 507 device->running_pending = 1; 508 } 509 510 pending_bios->head = NULL; 511 pending_bios->tail = NULL; 512 513 spin_unlock(&device->io_lock); 514 515 while (pending) { 516 517 rmb(); 518 /* we want to work on both lists, but do more bios on the 519 * sync list than the regular list 520 */ 521 if ((num_run > 32 && 522 pending_bios != &device->pending_sync_bios && 523 device->pending_sync_bios.head) || 524 (num_run > 64 && pending_bios == &device->pending_sync_bios && 525 device->pending_bios.head)) { 526 spin_lock(&device->io_lock); 527 requeue_list(pending_bios, pending, tail); 528 goto loop_lock; 529 } 530 531 cur = pending; 532 pending = pending->bi_next; 533 cur->bi_next = NULL; 534 535 BUG_ON(atomic_read(&cur->__bi_cnt) == 0); 536 537 /* 538 * if we're doing the sync list, record that our 539 * plug has some sync requests on it 540 * 541 * If we're doing the regular list and there are 542 * sync requests sitting around, unplug before 543 * we add more 544 */ 545 if (pending_bios == &device->pending_sync_bios) { 546 sync_pending = 1; 547 } else if (sync_pending) { 548 blk_finish_plug(&plug); 549 blk_start_plug(&plug); 550 sync_pending = 0; 551 } 552 553 btrfsic_submit_bio(cur); 554 num_run++; 555 batch_run++; 556 557 cond_resched(); 558 559 /* 560 * we made progress, there is more work to do and the bdi 561 * is now congested. Back off and let other work structs 562 * run instead 563 */ 564 if (pending && bdi_write_congested(bdi) && batch_run > 8 && 565 fs_info->fs_devices->open_devices > 1) { 566 struct io_context *ioc; 567 568 ioc = current->io_context; 569 570 /* 571 * the main goal here is that we don't want to 572 * block if we're going to be able to submit 573 * more requests without blocking. 574 * 575 * This code does two great things, it pokes into 576 * the elevator code from a filesystem _and_ 577 * it makes assumptions about how batching works. 578 */ 579 if (ioc && ioc->nr_batch_requests > 0 && 580 time_before(jiffies, ioc->last_waited + HZ/50UL) && 581 (last_waited == 0 || 582 ioc->last_waited == last_waited)) { 583 /* 584 * we want to go through our batch of 585 * requests and stop. So, we copy out 586 * the ioc->last_waited time and test 587 * against it before looping 588 */ 589 last_waited = ioc->last_waited; 590 cond_resched(); 591 continue; 592 } 593 spin_lock(&device->io_lock); 594 requeue_list(pending_bios, pending, tail); 595 device->running_pending = 1; 596 597 spin_unlock(&device->io_lock); 598 btrfs_queue_work(fs_info->submit_workers, 599 &device->work); 600 goto done; 601 } 602 } 603 604 cond_resched(); 605 if (again) 606 goto loop; 607 608 spin_lock(&device->io_lock); 609 if (device->pending_bios.head || device->pending_sync_bios.head) 610 goto loop_lock; 611 spin_unlock(&device->io_lock); 612 613 done: 614 blk_finish_plug(&plug); 615 } 616 617 static void pending_bios_fn(struct btrfs_work *work) 618 { 619 struct btrfs_device *device; 620 621 device = container_of(work, struct btrfs_device, work); 622 run_scheduled_bios(device); 623 } 624 625 /* 626 * Search and remove all stale (devices which are not mounted) devices. 627 * When both inputs are NULL, it will search and release all stale devices. 628 * path: Optional. When provided will it release all unmounted devices 629 * matching this path only. 630 * skip_dev: Optional. Will skip this device when searching for the stale 631 * devices. 632 */ 633 static void btrfs_free_stale_devices(const char *path, 634 struct btrfs_device *skip_device) 635 { 636 struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; 637 struct btrfs_device *device, *tmp_device; 638 639 list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { 640 mutex_lock(&fs_devices->device_list_mutex); 641 if (fs_devices->opened) { 642 mutex_unlock(&fs_devices->device_list_mutex); 643 continue; 644 } 645 646 list_for_each_entry_safe(device, tmp_device, 647 &fs_devices->devices, dev_list) { 648 int not_found = 0; 649 650 if (skip_device && skip_device == device) 651 continue; 652 if (path && !device->name) 653 continue; 654 655 rcu_read_lock(); 656 if (path) 657 not_found = strcmp(rcu_str_deref(device->name), 658 path); 659 rcu_read_unlock(); 660 if (not_found) 661 continue; 662 663 /* delete the stale device */ 664 fs_devices->num_devices--; 665 list_del(&device->dev_list); 666 btrfs_free_device(device); 667 668 if (fs_devices->num_devices == 0) 669 break; 670 } 671 mutex_unlock(&fs_devices->device_list_mutex); 672 if (fs_devices->num_devices == 0) { 673 btrfs_sysfs_remove_fsid(fs_devices); 674 list_del(&fs_devices->fs_list); 675 free_fs_devices(fs_devices); 676 } 677 } 678 } 679 680 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, 681 struct btrfs_device *device, fmode_t flags, 682 void *holder) 683 { 684 struct request_queue *q; 685 struct block_device *bdev; 686 struct buffer_head *bh; 687 struct btrfs_super_block *disk_super; 688 u64 devid; 689 int ret; 690 691 if (device->bdev) 692 return -EINVAL; 693 if (!device->name) 694 return -EINVAL; 695 696 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, 697 &bdev, &bh); 698 if (ret) 699 return ret; 700 701 disk_super = (struct btrfs_super_block *)bh->b_data; 702 devid = btrfs_stack_device_id(&disk_super->dev_item); 703 if (devid != device->devid) 704 goto error_brelse; 705 706 if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) 707 goto error_brelse; 708 709 device->generation = btrfs_super_generation(disk_super); 710 711 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 712 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 713 fs_devices->seeding = 1; 714 } else { 715 if (bdev_read_only(bdev)) 716 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 717 else 718 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 719 } 720 721 q = bdev_get_queue(bdev); 722 if (!blk_queue_nonrot(q)) 723 fs_devices->rotating = 1; 724 725 device->bdev = bdev; 726 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 727 device->mode = flags; 728 729 fs_devices->open_devices++; 730 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 731 device->devid != BTRFS_DEV_REPLACE_DEVID) { 732 fs_devices->rw_devices++; 733 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); 734 } 735 brelse(bh); 736 737 return 0; 738 739 error_brelse: 740 brelse(bh); 741 blkdev_put(bdev, flags); 742 743 return -EINVAL; 744 } 745 746 /* 747 * Add new device to list of registered devices 748 * 749 * Returns: 750 * device pointer which was just added or updated when successful 751 * error pointer when failed 752 */ 753 static noinline struct btrfs_device *device_list_add(const char *path, 754 struct btrfs_super_block *disk_super, 755 bool *new_device_added) 756 { 757 struct btrfs_device *device; 758 struct btrfs_fs_devices *fs_devices; 759 struct rcu_string *name; 760 u64 found_transid = btrfs_super_generation(disk_super); 761 u64 devid = btrfs_stack_device_id(&disk_super->dev_item); 762 763 fs_devices = find_fsid(disk_super->fsid); 764 if (!fs_devices) { 765 fs_devices = alloc_fs_devices(disk_super->fsid); 766 if (IS_ERR(fs_devices)) 767 return ERR_CAST(fs_devices); 768 769 mutex_lock(&fs_devices->device_list_mutex); 770 list_add(&fs_devices->fs_list, &fs_uuids); 771 772 device = NULL; 773 } else { 774 mutex_lock(&fs_devices->device_list_mutex); 775 device = find_device(fs_devices, devid, 776 disk_super->dev_item.uuid); 777 } 778 779 if (!device) { 780 if (fs_devices->opened) { 781 mutex_unlock(&fs_devices->device_list_mutex); 782 return ERR_PTR(-EBUSY); 783 } 784 785 device = btrfs_alloc_device(NULL, &devid, 786 disk_super->dev_item.uuid); 787 if (IS_ERR(device)) { 788 mutex_unlock(&fs_devices->device_list_mutex); 789 /* we can safely leave the fs_devices entry around */ 790 return device; 791 } 792 793 name = rcu_string_strdup(path, GFP_NOFS); 794 if (!name) { 795 btrfs_free_device(device); 796 mutex_unlock(&fs_devices->device_list_mutex); 797 return ERR_PTR(-ENOMEM); 798 } 799 rcu_assign_pointer(device->name, name); 800 801 list_add_rcu(&device->dev_list, &fs_devices->devices); 802 fs_devices->num_devices++; 803 804 device->fs_devices = fs_devices; 805 *new_device_added = true; 806 807 if (disk_super->label[0]) 808 pr_info("BTRFS: device label %s devid %llu transid %llu %s\n", 809 disk_super->label, devid, found_transid, path); 810 else 811 pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n", 812 disk_super->fsid, devid, found_transid, path); 813 814 } else if (!device->name || strcmp(device->name->str, path)) { 815 /* 816 * When FS is already mounted. 817 * 1. If you are here and if the device->name is NULL that 818 * means this device was missing at time of FS mount. 819 * 2. If you are here and if the device->name is different 820 * from 'path' that means either 821 * a. The same device disappeared and reappeared with 822 * different name. or 823 * b. The missing-disk-which-was-replaced, has 824 * reappeared now. 825 * 826 * We must allow 1 and 2a above. But 2b would be a spurious 827 * and unintentional. 828 * 829 * Further in case of 1 and 2a above, the disk at 'path' 830 * would have missed some transaction when it was away and 831 * in case of 2a the stale bdev has to be updated as well. 832 * 2b must not be allowed at all time. 833 */ 834 835 /* 836 * For now, we do allow update to btrfs_fs_device through the 837 * btrfs dev scan cli after FS has been mounted. We're still 838 * tracking a problem where systems fail mount by subvolume id 839 * when we reject replacement on a mounted FS. 840 */ 841 if (!fs_devices->opened && found_transid < device->generation) { 842 /* 843 * That is if the FS is _not_ mounted and if you 844 * are here, that means there is more than one 845 * disk with same uuid and devid.We keep the one 846 * with larger generation number or the last-in if 847 * generation are equal. 848 */ 849 mutex_unlock(&fs_devices->device_list_mutex); 850 return ERR_PTR(-EEXIST); 851 } 852 853 name = rcu_string_strdup(path, GFP_NOFS); 854 if (!name) { 855 mutex_unlock(&fs_devices->device_list_mutex); 856 return ERR_PTR(-ENOMEM); 857 } 858 rcu_string_free(device->name); 859 rcu_assign_pointer(device->name, name); 860 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 861 fs_devices->missing_devices--; 862 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 863 } 864 } 865 866 /* 867 * Unmount does not free the btrfs_device struct but would zero 868 * generation along with most of the other members. So just update 869 * it back. We need it to pick the disk with largest generation 870 * (as above). 871 */ 872 if (!fs_devices->opened) 873 device->generation = found_transid; 874 875 fs_devices->total_devices = btrfs_super_num_devices(disk_super); 876 877 mutex_unlock(&fs_devices->device_list_mutex); 878 return device; 879 } 880 881 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 882 { 883 struct btrfs_fs_devices *fs_devices; 884 struct btrfs_device *device; 885 struct btrfs_device *orig_dev; 886 887 fs_devices = alloc_fs_devices(orig->fsid); 888 if (IS_ERR(fs_devices)) 889 return fs_devices; 890 891 mutex_lock(&orig->device_list_mutex); 892 fs_devices->total_devices = orig->total_devices; 893 894 /* We have held the volume lock, it is safe to get the devices. */ 895 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 896 struct rcu_string *name; 897 898 device = btrfs_alloc_device(NULL, &orig_dev->devid, 899 orig_dev->uuid); 900 if (IS_ERR(device)) 901 goto error; 902 903 /* 904 * This is ok to do without rcu read locked because we hold the 905 * uuid mutex so nothing we touch in here is going to disappear. 906 */ 907 if (orig_dev->name) { 908 name = rcu_string_strdup(orig_dev->name->str, 909 GFP_KERNEL); 910 if (!name) { 911 btrfs_free_device(device); 912 goto error; 913 } 914 rcu_assign_pointer(device->name, name); 915 } 916 917 list_add(&device->dev_list, &fs_devices->devices); 918 device->fs_devices = fs_devices; 919 fs_devices->num_devices++; 920 } 921 mutex_unlock(&orig->device_list_mutex); 922 return fs_devices; 923 error: 924 mutex_unlock(&orig->device_list_mutex); 925 free_fs_devices(fs_devices); 926 return ERR_PTR(-ENOMEM); 927 } 928 929 /* 930 * After we have read the system tree and know devids belonging to 931 * this filesystem, remove the device which does not belong there. 932 */ 933 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) 934 { 935 struct btrfs_device *device, *next; 936 struct btrfs_device *latest_dev = NULL; 937 938 mutex_lock(&uuid_mutex); 939 again: 940 /* This is the initialized path, it is safe to release the devices. */ 941 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 942 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 943 &device->dev_state)) { 944 if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 945 &device->dev_state) && 946 (!latest_dev || 947 device->generation > latest_dev->generation)) { 948 latest_dev = device; 949 } 950 continue; 951 } 952 953 if (device->devid == BTRFS_DEV_REPLACE_DEVID) { 954 /* 955 * In the first step, keep the device which has 956 * the correct fsid and the devid that is used 957 * for the dev_replace procedure. 958 * In the second step, the dev_replace state is 959 * read from the device tree and it is known 960 * whether the procedure is really active or 961 * not, which means whether this device is 962 * used or whether it should be removed. 963 */ 964 if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 965 &device->dev_state)) { 966 continue; 967 } 968 } 969 if (device->bdev) { 970 blkdev_put(device->bdev, device->mode); 971 device->bdev = NULL; 972 fs_devices->open_devices--; 973 } 974 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 975 list_del_init(&device->dev_alloc_list); 976 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 977 if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 978 &device->dev_state)) 979 fs_devices->rw_devices--; 980 } 981 list_del_init(&device->dev_list); 982 fs_devices->num_devices--; 983 btrfs_free_device(device); 984 } 985 986 if (fs_devices->seed) { 987 fs_devices = fs_devices->seed; 988 goto again; 989 } 990 991 fs_devices->latest_bdev = latest_dev->bdev; 992 993 mutex_unlock(&uuid_mutex); 994 } 995 996 static void free_device_rcu(struct rcu_head *head) 997 { 998 struct btrfs_device *device; 999 1000 device = container_of(head, struct btrfs_device, rcu); 1001 btrfs_free_device(device); 1002 } 1003 1004 static void btrfs_close_bdev(struct btrfs_device *device) 1005 { 1006 if (!device->bdev) 1007 return; 1008 1009 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 1010 sync_blockdev(device->bdev); 1011 invalidate_bdev(device->bdev); 1012 } 1013 1014 blkdev_put(device->bdev, device->mode); 1015 } 1016 1017 static void btrfs_close_one_device(struct btrfs_device *device) 1018 { 1019 struct btrfs_fs_devices *fs_devices = device->fs_devices; 1020 struct btrfs_device *new_device; 1021 struct rcu_string *name; 1022 1023 if (device->bdev) 1024 fs_devices->open_devices--; 1025 1026 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 1027 device->devid != BTRFS_DEV_REPLACE_DEVID) { 1028 list_del_init(&device->dev_alloc_list); 1029 fs_devices->rw_devices--; 1030 } 1031 1032 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 1033 fs_devices->missing_devices--; 1034 1035 btrfs_close_bdev(device); 1036 1037 new_device = btrfs_alloc_device(NULL, &device->devid, 1038 device->uuid); 1039 BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ 1040 1041 /* Safe because we are under uuid_mutex */ 1042 if (device->name) { 1043 name = rcu_string_strdup(device->name->str, GFP_NOFS); 1044 BUG_ON(!name); /* -ENOMEM */ 1045 rcu_assign_pointer(new_device->name, name); 1046 } 1047 1048 list_replace_rcu(&device->dev_list, &new_device->dev_list); 1049 new_device->fs_devices = device->fs_devices; 1050 1051 call_rcu(&device->rcu, free_device_rcu); 1052 } 1053 1054 static int close_fs_devices(struct btrfs_fs_devices *fs_devices) 1055 { 1056 struct btrfs_device *device, *tmp; 1057 1058 if (--fs_devices->opened > 0) 1059 return 0; 1060 1061 mutex_lock(&fs_devices->device_list_mutex); 1062 list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { 1063 btrfs_close_one_device(device); 1064 } 1065 mutex_unlock(&fs_devices->device_list_mutex); 1066 1067 WARN_ON(fs_devices->open_devices); 1068 WARN_ON(fs_devices->rw_devices); 1069 fs_devices->opened = 0; 1070 fs_devices->seeding = 0; 1071 1072 return 0; 1073 } 1074 1075 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 1076 { 1077 struct btrfs_fs_devices *seed_devices = NULL; 1078 int ret; 1079 1080 mutex_lock(&uuid_mutex); 1081 ret = close_fs_devices(fs_devices); 1082 if (!fs_devices->opened) { 1083 seed_devices = fs_devices->seed; 1084 fs_devices->seed = NULL; 1085 } 1086 mutex_unlock(&uuid_mutex); 1087 1088 while (seed_devices) { 1089 fs_devices = seed_devices; 1090 seed_devices = fs_devices->seed; 1091 close_fs_devices(fs_devices); 1092 free_fs_devices(fs_devices); 1093 } 1094 return ret; 1095 } 1096 1097 static int open_fs_devices(struct btrfs_fs_devices *fs_devices, 1098 fmode_t flags, void *holder) 1099 { 1100 struct btrfs_device *device; 1101 struct btrfs_device *latest_dev = NULL; 1102 int ret = 0; 1103 1104 flags |= FMODE_EXCL; 1105 1106 list_for_each_entry(device, &fs_devices->devices, dev_list) { 1107 /* Just open everything we can; ignore failures here */ 1108 if (btrfs_open_one_device(fs_devices, device, flags, holder)) 1109 continue; 1110 1111 if (!latest_dev || 1112 device->generation > latest_dev->generation) 1113 latest_dev = device; 1114 } 1115 if (fs_devices->open_devices == 0) { 1116 ret = -EINVAL; 1117 goto out; 1118 } 1119 fs_devices->opened = 1; 1120 fs_devices->latest_bdev = latest_dev->bdev; 1121 fs_devices->total_rw_bytes = 0; 1122 out: 1123 return ret; 1124 } 1125 1126 static int devid_cmp(void *priv, struct list_head *a, struct list_head *b) 1127 { 1128 struct btrfs_device *dev1, *dev2; 1129 1130 dev1 = list_entry(a, struct btrfs_device, dev_list); 1131 dev2 = list_entry(b, struct btrfs_device, dev_list); 1132 1133 if (dev1->devid < dev2->devid) 1134 return -1; 1135 else if (dev1->devid > dev2->devid) 1136 return 1; 1137 return 0; 1138 } 1139 1140 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 1141 fmode_t flags, void *holder) 1142 { 1143 int ret; 1144 1145 lockdep_assert_held(&uuid_mutex); 1146 1147 mutex_lock(&fs_devices->device_list_mutex); 1148 if (fs_devices->opened) { 1149 fs_devices->opened++; 1150 ret = 0; 1151 } else { 1152 list_sort(NULL, &fs_devices->devices, devid_cmp); 1153 ret = open_fs_devices(fs_devices, flags, holder); 1154 } 1155 mutex_unlock(&fs_devices->device_list_mutex); 1156 1157 return ret; 1158 } 1159 1160 static void btrfs_release_disk_super(struct page *page) 1161 { 1162 kunmap(page); 1163 put_page(page); 1164 } 1165 1166 static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, 1167 struct page **page, 1168 struct btrfs_super_block **disk_super) 1169 { 1170 void *p; 1171 pgoff_t index; 1172 1173 /* make sure our super fits in the device */ 1174 if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) 1175 return 1; 1176 1177 /* make sure our super fits in the page */ 1178 if (sizeof(**disk_super) > PAGE_SIZE) 1179 return 1; 1180 1181 /* make sure our super doesn't straddle pages on disk */ 1182 index = bytenr >> PAGE_SHIFT; 1183 if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index) 1184 return 1; 1185 1186 /* pull in the page with our super */ 1187 *page = read_cache_page_gfp(bdev->bd_inode->i_mapping, 1188 index, GFP_KERNEL); 1189 1190 if (IS_ERR_OR_NULL(*page)) 1191 return 1; 1192 1193 p = kmap(*page); 1194 1195 /* align our pointer to the offset of the super block */ 1196 *disk_super = p + (bytenr & ~PAGE_MASK); 1197 1198 if (btrfs_super_bytenr(*disk_super) != bytenr || 1199 btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { 1200 btrfs_release_disk_super(*page); 1201 return 1; 1202 } 1203 1204 if ((*disk_super)->label[0] && 1205 (*disk_super)->label[BTRFS_LABEL_SIZE - 1]) 1206 (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0'; 1207 1208 return 0; 1209 } 1210 1211 /* 1212 * Look for a btrfs signature on a device. This may be called out of the mount path 1213 * and we are not allowed to call set_blocksize during the scan. The superblock 1214 * is read via pagecache 1215 */ 1216 struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, 1217 void *holder) 1218 { 1219 struct btrfs_super_block *disk_super; 1220 bool new_device_added = false; 1221 struct btrfs_device *device = NULL; 1222 struct block_device *bdev; 1223 struct page *page; 1224 u64 bytenr; 1225 1226 lockdep_assert_held(&uuid_mutex); 1227 1228 /* 1229 * we would like to check all the supers, but that would make 1230 * a btrfs mount succeed after a mkfs from a different FS. 1231 * So, we need to add a special mount option to scan for 1232 * later supers, using BTRFS_SUPER_MIRROR_MAX instead 1233 */ 1234 bytenr = btrfs_sb_offset(0); 1235 flags |= FMODE_EXCL; 1236 1237 bdev = blkdev_get_by_path(path, flags, holder); 1238 if (IS_ERR(bdev)) 1239 return ERR_CAST(bdev); 1240 1241 if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) { 1242 device = ERR_PTR(-EINVAL); 1243 goto error_bdev_put; 1244 } 1245 1246 device = device_list_add(path, disk_super, &new_device_added); 1247 if (!IS_ERR(device)) { 1248 if (new_device_added) 1249 btrfs_free_stale_devices(path, device); 1250 } 1251 1252 btrfs_release_disk_super(page); 1253 1254 error_bdev_put: 1255 blkdev_put(bdev, flags); 1256 1257 return device; 1258 } 1259 1260 static int contains_pending_extent(struct btrfs_transaction *transaction, 1261 struct btrfs_device *device, 1262 u64 *start, u64 len) 1263 { 1264 struct btrfs_fs_info *fs_info = device->fs_info; 1265 struct extent_map *em; 1266 struct list_head *search_list = &fs_info->pinned_chunks; 1267 int ret = 0; 1268 u64 physical_start = *start; 1269 1270 if (transaction) 1271 search_list = &transaction->pending_chunks; 1272 again: 1273 list_for_each_entry(em, search_list, list) { 1274 struct map_lookup *map; 1275 int i; 1276 1277 map = em->map_lookup; 1278 for (i = 0; i < map->num_stripes; i++) { 1279 u64 end; 1280 1281 if (map->stripes[i].dev != device) 1282 continue; 1283 if (map->stripes[i].physical >= physical_start + len || 1284 map->stripes[i].physical + em->orig_block_len <= 1285 physical_start) 1286 continue; 1287 /* 1288 * Make sure that while processing the pinned list we do 1289 * not override our *start with a lower value, because 1290 * we can have pinned chunks that fall within this 1291 * device hole and that have lower physical addresses 1292 * than the pending chunks we processed before. If we 1293 * do not take this special care we can end up getting 1294 * 2 pending chunks that start at the same physical 1295 * device offsets because the end offset of a pinned 1296 * chunk can be equal to the start offset of some 1297 * pending chunk. 1298 */ 1299 end = map->stripes[i].physical + em->orig_block_len; 1300 if (end > *start) { 1301 *start = end; 1302 ret = 1; 1303 } 1304 } 1305 } 1306 if (search_list != &fs_info->pinned_chunks) { 1307 search_list = &fs_info->pinned_chunks; 1308 goto again; 1309 } 1310 1311 return ret; 1312 } 1313 1314 1315 /* 1316 * find_free_dev_extent_start - find free space in the specified device 1317 * @device: the device which we search the free space in 1318 * @num_bytes: the size of the free space that we need 1319 * @search_start: the position from which to begin the search 1320 * @start: store the start of the free space. 1321 * @len: the size of the free space. that we find, or the size 1322 * of the max free space if we don't find suitable free space 1323 * 1324 * this uses a pretty simple search, the expectation is that it is 1325 * called very infrequently and that a given device has a small number 1326 * of extents 1327 * 1328 * @start is used to store the start of the free space if we find. But if we 1329 * don't find suitable free space, it will be used to store the start position 1330 * of the max free space. 1331 * 1332 * @len is used to store the size of the free space that we find. 1333 * But if we don't find suitable free space, it is used to store the size of 1334 * the max free space. 1335 */ 1336 int find_free_dev_extent_start(struct btrfs_transaction *transaction, 1337 struct btrfs_device *device, u64 num_bytes, 1338 u64 search_start, u64 *start, u64 *len) 1339 { 1340 struct btrfs_fs_info *fs_info = device->fs_info; 1341 struct btrfs_root *root = fs_info->dev_root; 1342 struct btrfs_key key; 1343 struct btrfs_dev_extent *dev_extent; 1344 struct btrfs_path *path; 1345 u64 hole_size; 1346 u64 max_hole_start; 1347 u64 max_hole_size; 1348 u64 extent_end; 1349 u64 search_end = device->total_bytes; 1350 int ret; 1351 int slot; 1352 struct extent_buffer *l; 1353 1354 /* 1355 * We don't want to overwrite the superblock on the drive nor any area 1356 * used by the boot loader (grub for example), so we make sure to start 1357 * at an offset of at least 1MB. 1358 */ 1359 search_start = max_t(u64, search_start, SZ_1M); 1360 1361 path = btrfs_alloc_path(); 1362 if (!path) 1363 return -ENOMEM; 1364 1365 max_hole_start = search_start; 1366 max_hole_size = 0; 1367 1368 again: 1369 if (search_start >= search_end || 1370 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 1371 ret = -ENOSPC; 1372 goto out; 1373 } 1374 1375 path->reada = READA_FORWARD; 1376 path->search_commit_root = 1; 1377 path->skip_locking = 1; 1378 1379 key.objectid = device->devid; 1380 key.offset = search_start; 1381 key.type = BTRFS_DEV_EXTENT_KEY; 1382 1383 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1384 if (ret < 0) 1385 goto out; 1386 if (ret > 0) { 1387 ret = btrfs_previous_item(root, path, key.objectid, key.type); 1388 if (ret < 0) 1389 goto out; 1390 } 1391 1392 while (1) { 1393 l = path->nodes[0]; 1394 slot = path->slots[0]; 1395 if (slot >= btrfs_header_nritems(l)) { 1396 ret = btrfs_next_leaf(root, path); 1397 if (ret == 0) 1398 continue; 1399 if (ret < 0) 1400 goto out; 1401 1402 break; 1403 } 1404 btrfs_item_key_to_cpu(l, &key, slot); 1405 1406 if (key.objectid < device->devid) 1407 goto next; 1408 1409 if (key.objectid > device->devid) 1410 break; 1411 1412 if (key.type != BTRFS_DEV_EXTENT_KEY) 1413 goto next; 1414 1415 if (key.offset > search_start) { 1416 hole_size = key.offset - search_start; 1417 1418 /* 1419 * Have to check before we set max_hole_start, otherwise 1420 * we could end up sending back this offset anyway. 1421 */ 1422 if (contains_pending_extent(transaction, device, 1423 &search_start, 1424 hole_size)) { 1425 if (key.offset >= search_start) { 1426 hole_size = key.offset - search_start; 1427 } else { 1428 WARN_ON_ONCE(1); 1429 hole_size = 0; 1430 } 1431 } 1432 1433 if (hole_size > max_hole_size) { 1434 max_hole_start = search_start; 1435 max_hole_size = hole_size; 1436 } 1437 1438 /* 1439 * If this free space is greater than which we need, 1440 * it must be the max free space that we have found 1441 * until now, so max_hole_start must point to the start 1442 * of this free space and the length of this free space 1443 * is stored in max_hole_size. Thus, we return 1444 * max_hole_start and max_hole_size and go back to the 1445 * caller. 1446 */ 1447 if (hole_size >= num_bytes) { 1448 ret = 0; 1449 goto out; 1450 } 1451 } 1452 1453 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1454 extent_end = key.offset + btrfs_dev_extent_length(l, 1455 dev_extent); 1456 if (extent_end > search_start) 1457 search_start = extent_end; 1458 next: 1459 path->slots[0]++; 1460 cond_resched(); 1461 } 1462 1463 /* 1464 * At this point, search_start should be the end of 1465 * allocated dev extents, and when shrinking the device, 1466 * search_end may be smaller than search_start. 1467 */ 1468 if (search_end > search_start) { 1469 hole_size = search_end - search_start; 1470 1471 if (contains_pending_extent(transaction, device, &search_start, 1472 hole_size)) { 1473 btrfs_release_path(path); 1474 goto again; 1475 } 1476 1477 if (hole_size > max_hole_size) { 1478 max_hole_start = search_start; 1479 max_hole_size = hole_size; 1480 } 1481 } 1482 1483 /* See above. */ 1484 if (max_hole_size < num_bytes) 1485 ret = -ENOSPC; 1486 else 1487 ret = 0; 1488 1489 out: 1490 btrfs_free_path(path); 1491 *start = max_hole_start; 1492 if (len) 1493 *len = max_hole_size; 1494 return ret; 1495 } 1496 1497 int find_free_dev_extent(struct btrfs_trans_handle *trans, 1498 struct btrfs_device *device, u64 num_bytes, 1499 u64 *start, u64 *len) 1500 { 1501 /* FIXME use last free of some kind */ 1502 return find_free_dev_extent_start(trans->transaction, device, 1503 num_bytes, 0, start, len); 1504 } 1505 1506 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1507 struct btrfs_device *device, 1508 u64 start, u64 *dev_extent_len) 1509 { 1510 struct btrfs_fs_info *fs_info = device->fs_info; 1511 struct btrfs_root *root = fs_info->dev_root; 1512 int ret; 1513 struct btrfs_path *path; 1514 struct btrfs_key key; 1515 struct btrfs_key found_key; 1516 struct extent_buffer *leaf = NULL; 1517 struct btrfs_dev_extent *extent = NULL; 1518 1519 path = btrfs_alloc_path(); 1520 if (!path) 1521 return -ENOMEM; 1522 1523 key.objectid = device->devid; 1524 key.offset = start; 1525 key.type = BTRFS_DEV_EXTENT_KEY; 1526 again: 1527 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1528 if (ret > 0) { 1529 ret = btrfs_previous_item(root, path, key.objectid, 1530 BTRFS_DEV_EXTENT_KEY); 1531 if (ret) 1532 goto out; 1533 leaf = path->nodes[0]; 1534 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1535 extent = btrfs_item_ptr(leaf, path->slots[0], 1536 struct btrfs_dev_extent); 1537 BUG_ON(found_key.offset > start || found_key.offset + 1538 btrfs_dev_extent_length(leaf, extent) < start); 1539 key = found_key; 1540 btrfs_release_path(path); 1541 goto again; 1542 } else if (ret == 0) { 1543 leaf = path->nodes[0]; 1544 extent = btrfs_item_ptr(leaf, path->slots[0], 1545 struct btrfs_dev_extent); 1546 } else { 1547 btrfs_handle_fs_error(fs_info, ret, "Slot search failed"); 1548 goto out; 1549 } 1550 1551 *dev_extent_len = btrfs_dev_extent_length(leaf, extent); 1552 1553 ret = btrfs_del_item(trans, root, path); 1554 if (ret) { 1555 btrfs_handle_fs_error(fs_info, ret, 1556 "Failed to remove dev extent item"); 1557 } else { 1558 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); 1559 } 1560 out: 1561 btrfs_free_path(path); 1562 return ret; 1563 } 1564 1565 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 1566 struct btrfs_device *device, 1567 u64 chunk_offset, u64 start, u64 num_bytes) 1568 { 1569 int ret; 1570 struct btrfs_path *path; 1571 struct btrfs_fs_info *fs_info = device->fs_info; 1572 struct btrfs_root *root = fs_info->dev_root; 1573 struct btrfs_dev_extent *extent; 1574 struct extent_buffer *leaf; 1575 struct btrfs_key key; 1576 1577 WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); 1578 WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); 1579 path = btrfs_alloc_path(); 1580 if (!path) 1581 return -ENOMEM; 1582 1583 key.objectid = device->devid; 1584 key.offset = start; 1585 key.type = BTRFS_DEV_EXTENT_KEY; 1586 ret = btrfs_insert_empty_item(trans, root, path, &key, 1587 sizeof(*extent)); 1588 if (ret) 1589 goto out; 1590 1591 leaf = path->nodes[0]; 1592 extent = btrfs_item_ptr(leaf, path->slots[0], 1593 struct btrfs_dev_extent); 1594 btrfs_set_dev_extent_chunk_tree(leaf, extent, 1595 BTRFS_CHUNK_TREE_OBJECTID); 1596 btrfs_set_dev_extent_chunk_objectid(leaf, extent, 1597 BTRFS_FIRST_CHUNK_TREE_OBJECTID); 1598 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 1599 1600 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 1601 btrfs_mark_buffer_dirty(leaf); 1602 out: 1603 btrfs_free_path(path); 1604 return ret; 1605 } 1606 1607 static u64 find_next_chunk(struct btrfs_fs_info *fs_info) 1608 { 1609 struct extent_map_tree *em_tree; 1610 struct extent_map *em; 1611 struct rb_node *n; 1612 u64 ret = 0; 1613 1614 em_tree = &fs_info->mapping_tree.map_tree; 1615 read_lock(&em_tree->lock); 1616 n = rb_last(&em_tree->map); 1617 if (n) { 1618 em = rb_entry(n, struct extent_map, rb_node); 1619 ret = em->start + em->len; 1620 } 1621 read_unlock(&em_tree->lock); 1622 1623 return ret; 1624 } 1625 1626 static noinline int find_next_devid(struct btrfs_fs_info *fs_info, 1627 u64 *devid_ret) 1628 { 1629 int ret; 1630 struct btrfs_key key; 1631 struct btrfs_key found_key; 1632 struct btrfs_path *path; 1633 1634 path = btrfs_alloc_path(); 1635 if (!path) 1636 return -ENOMEM; 1637 1638 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1639 key.type = BTRFS_DEV_ITEM_KEY; 1640 key.offset = (u64)-1; 1641 1642 ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); 1643 if (ret < 0) 1644 goto error; 1645 1646 BUG_ON(ret == 0); /* Corruption */ 1647 1648 ret = btrfs_previous_item(fs_info->chunk_root, path, 1649 BTRFS_DEV_ITEMS_OBJECTID, 1650 BTRFS_DEV_ITEM_KEY); 1651 if (ret) { 1652 *devid_ret = 1; 1653 } else { 1654 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1655 path->slots[0]); 1656 *devid_ret = found_key.offset + 1; 1657 } 1658 ret = 0; 1659 error: 1660 btrfs_free_path(path); 1661 return ret; 1662 } 1663 1664 /* 1665 * the device information is stored in the chunk root 1666 * the btrfs_device struct should be fully filled in 1667 */ 1668 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, 1669 struct btrfs_device *device) 1670 { 1671 int ret; 1672 struct btrfs_path *path; 1673 struct btrfs_dev_item *dev_item; 1674 struct extent_buffer *leaf; 1675 struct btrfs_key key; 1676 unsigned long ptr; 1677 1678 path = btrfs_alloc_path(); 1679 if (!path) 1680 return -ENOMEM; 1681 1682 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1683 key.type = BTRFS_DEV_ITEM_KEY; 1684 key.offset = device->devid; 1685 1686 ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, 1687 &key, sizeof(*dev_item)); 1688 if (ret) 1689 goto out; 1690 1691 leaf = path->nodes[0]; 1692 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1693 1694 btrfs_set_device_id(leaf, dev_item, device->devid); 1695 btrfs_set_device_generation(leaf, dev_item, 0); 1696 btrfs_set_device_type(leaf, dev_item, device->type); 1697 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1698 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1699 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1700 btrfs_set_device_total_bytes(leaf, dev_item, 1701 btrfs_device_get_disk_total_bytes(device)); 1702 btrfs_set_device_bytes_used(leaf, dev_item, 1703 btrfs_device_get_bytes_used(device)); 1704 btrfs_set_device_group(leaf, dev_item, 0); 1705 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1706 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1707 btrfs_set_device_start_offset(leaf, dev_item, 0); 1708 1709 ptr = btrfs_device_uuid(dev_item); 1710 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1711 ptr = btrfs_device_fsid(dev_item); 1712 write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE); 1713 btrfs_mark_buffer_dirty(leaf); 1714 1715 ret = 0; 1716 out: 1717 btrfs_free_path(path); 1718 return ret; 1719 } 1720 1721 /* 1722 * Function to update ctime/mtime for a given device path. 1723 * Mainly used for ctime/mtime based probe like libblkid. 1724 */ 1725 static void update_dev_time(const char *path_name) 1726 { 1727 struct file *filp; 1728 1729 filp = filp_open(path_name, O_RDWR, 0); 1730 if (IS_ERR(filp)) 1731 return; 1732 file_update_time(filp); 1733 filp_close(filp, NULL); 1734 } 1735 1736 static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info, 1737 struct btrfs_device *device) 1738 { 1739 struct btrfs_root *root = fs_info->chunk_root; 1740 int ret; 1741 struct btrfs_path *path; 1742 struct btrfs_key key; 1743 struct btrfs_trans_handle *trans; 1744 1745 path = btrfs_alloc_path(); 1746 if (!path) 1747 return -ENOMEM; 1748 1749 trans = btrfs_start_transaction(root, 0); 1750 if (IS_ERR(trans)) { 1751 btrfs_free_path(path); 1752 return PTR_ERR(trans); 1753 } 1754 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1755 key.type = BTRFS_DEV_ITEM_KEY; 1756 key.offset = device->devid; 1757 1758 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1759 if (ret) { 1760 if (ret > 0) 1761 ret = -ENOENT; 1762 btrfs_abort_transaction(trans, ret); 1763 btrfs_end_transaction(trans); 1764 goto out; 1765 } 1766 1767 ret = btrfs_del_item(trans, root, path); 1768 if (ret) { 1769 btrfs_abort_transaction(trans, ret); 1770 btrfs_end_transaction(trans); 1771 } 1772 1773 out: 1774 btrfs_free_path(path); 1775 if (!ret) 1776 ret = btrfs_commit_transaction(trans); 1777 return ret; 1778 } 1779 1780 /* 1781 * Verify that @num_devices satisfies the RAID profile constraints in the whole 1782 * filesystem. It's up to the caller to adjust that number regarding eg. device 1783 * replace. 1784 */ 1785 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, 1786 u64 num_devices) 1787 { 1788 u64 all_avail; 1789 unsigned seq; 1790 int i; 1791 1792 do { 1793 seq = read_seqbegin(&fs_info->profiles_lock); 1794 1795 all_avail = fs_info->avail_data_alloc_bits | 1796 fs_info->avail_system_alloc_bits | 1797 fs_info->avail_metadata_alloc_bits; 1798 } while (read_seqretry(&fs_info->profiles_lock, seq)); 1799 1800 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 1801 if (!(all_avail & btrfs_raid_array[i].bg_flag)) 1802 continue; 1803 1804 if (num_devices < btrfs_raid_array[i].devs_min) { 1805 int ret = btrfs_raid_array[i].mindev_error; 1806 1807 if (ret) 1808 return ret; 1809 } 1810 } 1811 1812 return 0; 1813 } 1814 1815 static struct btrfs_device * btrfs_find_next_active_device( 1816 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) 1817 { 1818 struct btrfs_device *next_device; 1819 1820 list_for_each_entry(next_device, &fs_devs->devices, dev_list) { 1821 if (next_device != device && 1822 !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) 1823 && next_device->bdev) 1824 return next_device; 1825 } 1826 1827 return NULL; 1828 } 1829 1830 /* 1831 * Helper function to check if the given device is part of s_bdev / latest_bdev 1832 * and replace it with the provided or the next active device, in the context 1833 * where this function called, there should be always be another device (or 1834 * this_dev) which is active. 1835 */ 1836 void btrfs_assign_next_active_device(struct btrfs_device *device, 1837 struct btrfs_device *this_dev) 1838 { 1839 struct btrfs_fs_info *fs_info = device->fs_info; 1840 struct btrfs_device *next_device; 1841 1842 if (this_dev) 1843 next_device = this_dev; 1844 else 1845 next_device = btrfs_find_next_active_device(fs_info->fs_devices, 1846 device); 1847 ASSERT(next_device); 1848 1849 if (fs_info->sb->s_bdev && 1850 (fs_info->sb->s_bdev == device->bdev)) 1851 fs_info->sb->s_bdev = next_device->bdev; 1852 1853 if (fs_info->fs_devices->latest_bdev == device->bdev) 1854 fs_info->fs_devices->latest_bdev = next_device->bdev; 1855 } 1856 1857 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, 1858 u64 devid) 1859 { 1860 struct btrfs_device *device; 1861 struct btrfs_fs_devices *cur_devices; 1862 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 1863 u64 num_devices; 1864 int ret = 0; 1865 1866 mutex_lock(&uuid_mutex); 1867 1868 num_devices = fs_devices->num_devices; 1869 btrfs_dev_replace_read_lock(&fs_info->dev_replace); 1870 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 1871 WARN_ON(num_devices < 1); 1872 num_devices--; 1873 } 1874 btrfs_dev_replace_read_unlock(&fs_info->dev_replace); 1875 1876 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 1877 if (ret) 1878 goto out; 1879 1880 ret = btrfs_find_device_by_devspec(fs_info, devid, device_path, 1881 &device); 1882 if (ret) 1883 goto out; 1884 1885 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 1886 ret = BTRFS_ERROR_DEV_TGT_REPLACE; 1887 goto out; 1888 } 1889 1890 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 1891 fs_info->fs_devices->rw_devices == 1) { 1892 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; 1893 goto out; 1894 } 1895 1896 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 1897 mutex_lock(&fs_info->chunk_mutex); 1898 list_del_init(&device->dev_alloc_list); 1899 device->fs_devices->rw_devices--; 1900 mutex_unlock(&fs_info->chunk_mutex); 1901 } 1902 1903 mutex_unlock(&uuid_mutex); 1904 ret = btrfs_shrink_device(device, 0); 1905 mutex_lock(&uuid_mutex); 1906 if (ret) 1907 goto error_undo; 1908 1909 /* 1910 * TODO: the superblock still includes this device in its num_devices 1911 * counter although write_all_supers() is not locked out. This 1912 * could give a filesystem state which requires a degraded mount. 1913 */ 1914 ret = btrfs_rm_dev_item(fs_info, device); 1915 if (ret) 1916 goto error_undo; 1917 1918 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 1919 btrfs_scrub_cancel_dev(fs_info, device); 1920 1921 /* 1922 * the device list mutex makes sure that we don't change 1923 * the device list while someone else is writing out all 1924 * the device supers. Whoever is writing all supers, should 1925 * lock the device list mutex before getting the number of 1926 * devices in the super block (super_copy). Conversely, 1927 * whoever updates the number of devices in the super block 1928 * (super_copy) should hold the device list mutex. 1929 */ 1930 1931 /* 1932 * In normal cases the cur_devices == fs_devices. But in case 1933 * of deleting a seed device, the cur_devices should point to 1934 * its own fs_devices listed under the fs_devices->seed. 1935 */ 1936 cur_devices = device->fs_devices; 1937 mutex_lock(&fs_devices->device_list_mutex); 1938 list_del_rcu(&device->dev_list); 1939 1940 cur_devices->num_devices--; 1941 cur_devices->total_devices--; 1942 /* Update total_devices of the parent fs_devices if it's seed */ 1943 if (cur_devices != fs_devices) 1944 fs_devices->total_devices--; 1945 1946 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 1947 cur_devices->missing_devices--; 1948 1949 btrfs_assign_next_active_device(device, NULL); 1950 1951 if (device->bdev) { 1952 cur_devices->open_devices--; 1953 /* remove sysfs entry */ 1954 btrfs_sysfs_rm_device_link(fs_devices, device); 1955 } 1956 1957 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 1958 btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 1959 mutex_unlock(&fs_devices->device_list_mutex); 1960 1961 /* 1962 * at this point, the device is zero sized and detached from 1963 * the devices list. All that's left is to zero out the old 1964 * supers and free the device. 1965 */ 1966 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 1967 btrfs_scratch_superblocks(device->bdev, device->name->str); 1968 1969 btrfs_close_bdev(device); 1970 call_rcu(&device->rcu, free_device_rcu); 1971 1972 if (cur_devices->open_devices == 0) { 1973 while (fs_devices) { 1974 if (fs_devices->seed == cur_devices) { 1975 fs_devices->seed = cur_devices->seed; 1976 break; 1977 } 1978 fs_devices = fs_devices->seed; 1979 } 1980 cur_devices->seed = NULL; 1981 close_fs_devices(cur_devices); 1982 free_fs_devices(cur_devices); 1983 } 1984 1985 out: 1986 mutex_unlock(&uuid_mutex); 1987 return ret; 1988 1989 error_undo: 1990 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 1991 mutex_lock(&fs_info->chunk_mutex); 1992 list_add(&device->dev_alloc_list, 1993 &fs_devices->alloc_list); 1994 device->fs_devices->rw_devices++; 1995 mutex_unlock(&fs_info->chunk_mutex); 1996 } 1997 goto out; 1998 } 1999 2000 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) 2001 { 2002 struct btrfs_fs_devices *fs_devices; 2003 2004 lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex); 2005 2006 /* 2007 * in case of fs with no seed, srcdev->fs_devices will point 2008 * to fs_devices of fs_info. However when the dev being replaced is 2009 * a seed dev it will point to the seed's local fs_devices. In short 2010 * srcdev will have its correct fs_devices in both the cases. 2011 */ 2012 fs_devices = srcdev->fs_devices; 2013 2014 list_del_rcu(&srcdev->dev_list); 2015 list_del(&srcdev->dev_alloc_list); 2016 fs_devices->num_devices--; 2017 if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) 2018 fs_devices->missing_devices--; 2019 2020 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) 2021 fs_devices->rw_devices--; 2022 2023 if (srcdev->bdev) 2024 fs_devices->open_devices--; 2025 } 2026 2027 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, 2028 struct btrfs_device *srcdev) 2029 { 2030 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 2031 2032 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { 2033 /* zero out the old super if it is writable */ 2034 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); 2035 } 2036 2037 btrfs_close_bdev(srcdev); 2038 call_rcu(&srcdev->rcu, free_device_rcu); 2039 2040 /* if this is no devs we rather delete the fs_devices */ 2041 if (!fs_devices->num_devices) { 2042 struct btrfs_fs_devices *tmp_fs_devices; 2043 2044 /* 2045 * On a mounted FS, num_devices can't be zero unless it's a 2046 * seed. In case of a seed device being replaced, the replace 2047 * target added to the sprout FS, so there will be no more 2048 * device left under the seed FS. 2049 */ 2050 ASSERT(fs_devices->seeding); 2051 2052 tmp_fs_devices = fs_info->fs_devices; 2053 while (tmp_fs_devices) { 2054 if (tmp_fs_devices->seed == fs_devices) { 2055 tmp_fs_devices->seed = fs_devices->seed; 2056 break; 2057 } 2058 tmp_fs_devices = tmp_fs_devices->seed; 2059 } 2060 fs_devices->seed = NULL; 2061 close_fs_devices(fs_devices); 2062 free_fs_devices(fs_devices); 2063 } 2064 } 2065 2066 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) 2067 { 2068 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; 2069 2070 WARN_ON(!tgtdev); 2071 mutex_lock(&fs_devices->device_list_mutex); 2072 2073 btrfs_sysfs_rm_device_link(fs_devices, tgtdev); 2074 2075 if (tgtdev->bdev) 2076 fs_devices->open_devices--; 2077 2078 fs_devices->num_devices--; 2079 2080 btrfs_assign_next_active_device(tgtdev, NULL); 2081 2082 list_del_rcu(&tgtdev->dev_list); 2083 2084 mutex_unlock(&fs_devices->device_list_mutex); 2085 2086 /* 2087 * The update_dev_time() with in btrfs_scratch_superblocks() 2088 * may lead to a call to btrfs_show_devname() which will try 2089 * to hold device_list_mutex. And here this device 2090 * is already out of device list, so we don't have to hold 2091 * the device_list_mutex lock. 2092 */ 2093 btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); 2094 2095 btrfs_close_bdev(tgtdev); 2096 call_rcu(&tgtdev->rcu, free_device_rcu); 2097 } 2098 2099 static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, 2100 const char *device_path, 2101 struct btrfs_device **device) 2102 { 2103 int ret = 0; 2104 struct btrfs_super_block *disk_super; 2105 u64 devid; 2106 u8 *dev_uuid; 2107 struct block_device *bdev; 2108 struct buffer_head *bh; 2109 2110 *device = NULL; 2111 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 2112 fs_info->bdev_holder, 0, &bdev, &bh); 2113 if (ret) 2114 return ret; 2115 disk_super = (struct btrfs_super_block *)bh->b_data; 2116 devid = btrfs_stack_device_id(&disk_super->dev_item); 2117 dev_uuid = disk_super->dev_item.uuid; 2118 *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); 2119 brelse(bh); 2120 if (!*device) 2121 ret = -ENOENT; 2122 blkdev_put(bdev, FMODE_READ); 2123 return ret; 2124 } 2125 2126 int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, 2127 const char *device_path, 2128 struct btrfs_device **device) 2129 { 2130 *device = NULL; 2131 if (strcmp(device_path, "missing") == 0) { 2132 struct list_head *devices; 2133 struct btrfs_device *tmp; 2134 2135 devices = &fs_info->fs_devices->devices; 2136 list_for_each_entry(tmp, devices, dev_list) { 2137 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 2138 &tmp->dev_state) && !tmp->bdev) { 2139 *device = tmp; 2140 break; 2141 } 2142 } 2143 2144 if (!*device) 2145 return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2146 2147 return 0; 2148 } else { 2149 return btrfs_find_device_by_path(fs_info, device_path, device); 2150 } 2151 } 2152 2153 /* 2154 * Lookup a device given by device id, or the path if the id is 0. 2155 */ 2156 int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, 2157 const char *devpath, 2158 struct btrfs_device **device) 2159 { 2160 int ret; 2161 2162 if (devid) { 2163 ret = 0; 2164 *device = btrfs_find_device(fs_info, devid, NULL, NULL); 2165 if (!*device) 2166 ret = -ENOENT; 2167 } else { 2168 if (!devpath || !devpath[0]) 2169 return -EINVAL; 2170 2171 ret = btrfs_find_device_missing_or_by_path(fs_info, devpath, 2172 device); 2173 } 2174 return ret; 2175 } 2176 2177 /* 2178 * does all the dirty work required for changing file system's UUID. 2179 */ 2180 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) 2181 { 2182 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2183 struct btrfs_fs_devices *old_devices; 2184 struct btrfs_fs_devices *seed_devices; 2185 struct btrfs_super_block *disk_super = fs_info->super_copy; 2186 struct btrfs_device *device; 2187 u64 super_flags; 2188 2189 lockdep_assert_held(&uuid_mutex); 2190 if (!fs_devices->seeding) 2191 return -EINVAL; 2192 2193 seed_devices = alloc_fs_devices(NULL); 2194 if (IS_ERR(seed_devices)) 2195 return PTR_ERR(seed_devices); 2196 2197 old_devices = clone_fs_devices(fs_devices); 2198 if (IS_ERR(old_devices)) { 2199 kfree(seed_devices); 2200 return PTR_ERR(old_devices); 2201 } 2202 2203 list_add(&old_devices->fs_list, &fs_uuids); 2204 2205 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2206 seed_devices->opened = 1; 2207 INIT_LIST_HEAD(&seed_devices->devices); 2208 INIT_LIST_HEAD(&seed_devices->alloc_list); 2209 mutex_init(&seed_devices->device_list_mutex); 2210 2211 mutex_lock(&fs_devices->device_list_mutex); 2212 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 2213 synchronize_rcu); 2214 list_for_each_entry(device, &seed_devices->devices, dev_list) 2215 device->fs_devices = seed_devices; 2216 2217 mutex_lock(&fs_info->chunk_mutex); 2218 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 2219 mutex_unlock(&fs_info->chunk_mutex); 2220 2221 fs_devices->seeding = 0; 2222 fs_devices->num_devices = 0; 2223 fs_devices->open_devices = 0; 2224 fs_devices->missing_devices = 0; 2225 fs_devices->rotating = 0; 2226 fs_devices->seed = seed_devices; 2227 2228 generate_random_uuid(fs_devices->fsid); 2229 memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2230 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2231 mutex_unlock(&fs_devices->device_list_mutex); 2232 2233 super_flags = btrfs_super_flags(disk_super) & 2234 ~BTRFS_SUPER_FLAG_SEEDING; 2235 btrfs_set_super_flags(disk_super, super_flags); 2236 2237 return 0; 2238 } 2239 2240 /* 2241 * Store the expected generation for seed devices in device items. 2242 */ 2243 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 2244 struct btrfs_fs_info *fs_info) 2245 { 2246 struct btrfs_root *root = fs_info->chunk_root; 2247 struct btrfs_path *path; 2248 struct extent_buffer *leaf; 2249 struct btrfs_dev_item *dev_item; 2250 struct btrfs_device *device; 2251 struct btrfs_key key; 2252 u8 fs_uuid[BTRFS_FSID_SIZE]; 2253 u8 dev_uuid[BTRFS_UUID_SIZE]; 2254 u64 devid; 2255 int ret; 2256 2257 path = btrfs_alloc_path(); 2258 if (!path) 2259 return -ENOMEM; 2260 2261 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2262 key.offset = 0; 2263 key.type = BTRFS_DEV_ITEM_KEY; 2264 2265 while (1) { 2266 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2267 if (ret < 0) 2268 goto error; 2269 2270 leaf = path->nodes[0]; 2271 next_slot: 2272 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2273 ret = btrfs_next_leaf(root, path); 2274 if (ret > 0) 2275 break; 2276 if (ret < 0) 2277 goto error; 2278 leaf = path->nodes[0]; 2279 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2280 btrfs_release_path(path); 2281 continue; 2282 } 2283 2284 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2285 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2286 key.type != BTRFS_DEV_ITEM_KEY) 2287 break; 2288 2289 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2290 struct btrfs_dev_item); 2291 devid = btrfs_device_id(leaf, dev_item); 2292 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2293 BTRFS_UUID_SIZE); 2294 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2295 BTRFS_FSID_SIZE); 2296 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 2297 BUG_ON(!device); /* Logic error */ 2298 2299 if (device->fs_devices->seeding) { 2300 btrfs_set_device_generation(leaf, dev_item, 2301 device->generation); 2302 btrfs_mark_buffer_dirty(leaf); 2303 } 2304 2305 path->slots[0]++; 2306 goto next_slot; 2307 } 2308 ret = 0; 2309 error: 2310 btrfs_free_path(path); 2311 return ret; 2312 } 2313 2314 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2315 { 2316 struct btrfs_root *root = fs_info->dev_root; 2317 struct request_queue *q; 2318 struct btrfs_trans_handle *trans; 2319 struct btrfs_device *device; 2320 struct block_device *bdev; 2321 struct super_block *sb = fs_info->sb; 2322 struct rcu_string *name; 2323 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2324 u64 orig_super_total_bytes; 2325 u64 orig_super_num_devices; 2326 int seeding_dev = 0; 2327 int ret = 0; 2328 bool unlocked = false; 2329 2330 if (sb_rdonly(sb) && !fs_devices->seeding) 2331 return -EROFS; 2332 2333 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2334 fs_info->bdev_holder); 2335 if (IS_ERR(bdev)) 2336 return PTR_ERR(bdev); 2337 2338 if (fs_devices->seeding) { 2339 seeding_dev = 1; 2340 down_write(&sb->s_umount); 2341 mutex_lock(&uuid_mutex); 2342 } 2343 2344 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2345 2346 mutex_lock(&fs_devices->device_list_mutex); 2347 list_for_each_entry(device, &fs_devices->devices, dev_list) { 2348 if (device->bdev == bdev) { 2349 ret = -EEXIST; 2350 mutex_unlock( 2351 &fs_devices->device_list_mutex); 2352 goto error; 2353 } 2354 } 2355 mutex_unlock(&fs_devices->device_list_mutex); 2356 2357 device = btrfs_alloc_device(fs_info, NULL, NULL); 2358 if (IS_ERR(device)) { 2359 /* we can safely leave the fs_devices entry around */ 2360 ret = PTR_ERR(device); 2361 goto error; 2362 } 2363 2364 name = rcu_string_strdup(device_path, GFP_KERNEL); 2365 if (!name) { 2366 ret = -ENOMEM; 2367 goto error_free_device; 2368 } 2369 rcu_assign_pointer(device->name, name); 2370 2371 trans = btrfs_start_transaction(root, 0); 2372 if (IS_ERR(trans)) { 2373 ret = PTR_ERR(trans); 2374 goto error_free_device; 2375 } 2376 2377 q = bdev_get_queue(bdev); 2378 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2379 device->generation = trans->transid; 2380 device->io_width = fs_info->sectorsize; 2381 device->io_align = fs_info->sectorsize; 2382 device->sector_size = fs_info->sectorsize; 2383 device->total_bytes = round_down(i_size_read(bdev->bd_inode), 2384 fs_info->sectorsize); 2385 device->disk_total_bytes = device->total_bytes; 2386 device->commit_total_bytes = device->total_bytes; 2387 device->fs_info = fs_info; 2388 device->bdev = bdev; 2389 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2390 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2391 device->mode = FMODE_EXCL; 2392 device->dev_stats_valid = 1; 2393 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2394 2395 if (seeding_dev) { 2396 sb->s_flags &= ~SB_RDONLY; 2397 ret = btrfs_prepare_sprout(fs_info); 2398 if (ret) { 2399 btrfs_abort_transaction(trans, ret); 2400 goto error_trans; 2401 } 2402 } 2403 2404 device->fs_devices = fs_devices; 2405 2406 mutex_lock(&fs_devices->device_list_mutex); 2407 mutex_lock(&fs_info->chunk_mutex); 2408 list_add_rcu(&device->dev_list, &fs_devices->devices); 2409 list_add(&device->dev_alloc_list, &fs_devices->alloc_list); 2410 fs_devices->num_devices++; 2411 fs_devices->open_devices++; 2412 fs_devices->rw_devices++; 2413 fs_devices->total_devices++; 2414 fs_devices->total_rw_bytes += device->total_bytes; 2415 2416 atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 2417 2418 if (!blk_queue_nonrot(q)) 2419 fs_devices->rotating = 1; 2420 2421 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 2422 btrfs_set_super_total_bytes(fs_info->super_copy, 2423 round_down(orig_super_total_bytes + device->total_bytes, 2424 fs_info->sectorsize)); 2425 2426 orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); 2427 btrfs_set_super_num_devices(fs_info->super_copy, 2428 orig_super_num_devices + 1); 2429 2430 /* add sysfs device entry */ 2431 btrfs_sysfs_add_device_link(fs_devices, device); 2432 2433 /* 2434 * we've got more storage, clear any full flags on the space 2435 * infos 2436 */ 2437 btrfs_clear_space_info_full(fs_info); 2438 2439 mutex_unlock(&fs_info->chunk_mutex); 2440 mutex_unlock(&fs_devices->device_list_mutex); 2441 2442 if (seeding_dev) { 2443 mutex_lock(&fs_info->chunk_mutex); 2444 ret = init_first_rw_device(trans, fs_info); 2445 mutex_unlock(&fs_info->chunk_mutex); 2446 if (ret) { 2447 btrfs_abort_transaction(trans, ret); 2448 goto error_sysfs; 2449 } 2450 } 2451 2452 ret = btrfs_add_dev_item(trans, device); 2453 if (ret) { 2454 btrfs_abort_transaction(trans, ret); 2455 goto error_sysfs; 2456 } 2457 2458 if (seeding_dev) { 2459 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; 2460 2461 ret = btrfs_finish_sprout(trans, fs_info); 2462 if (ret) { 2463 btrfs_abort_transaction(trans, ret); 2464 goto error_sysfs; 2465 } 2466 2467 /* Sprouting would change fsid of the mounted root, 2468 * so rename the fsid on the sysfs 2469 */ 2470 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", 2471 fs_info->fsid); 2472 if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) 2473 btrfs_warn(fs_info, 2474 "sysfs: failed to create fsid for sprout"); 2475 } 2476 2477 ret = btrfs_commit_transaction(trans); 2478 2479 if (seeding_dev) { 2480 mutex_unlock(&uuid_mutex); 2481 up_write(&sb->s_umount); 2482 unlocked = true; 2483 2484 if (ret) /* transaction commit */ 2485 return ret; 2486 2487 ret = btrfs_relocate_sys_chunks(fs_info); 2488 if (ret < 0) 2489 btrfs_handle_fs_error(fs_info, ret, 2490 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2491 trans = btrfs_attach_transaction(root); 2492 if (IS_ERR(trans)) { 2493 if (PTR_ERR(trans) == -ENOENT) 2494 return 0; 2495 ret = PTR_ERR(trans); 2496 trans = NULL; 2497 goto error_sysfs; 2498 } 2499 ret = btrfs_commit_transaction(trans); 2500 } 2501 2502 /* Update ctime/mtime for libblkid */ 2503 update_dev_time(device_path); 2504 return ret; 2505 2506 error_sysfs: 2507 btrfs_sysfs_rm_device_link(fs_devices, device); 2508 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2509 mutex_lock(&fs_info->chunk_mutex); 2510 list_del_rcu(&device->dev_list); 2511 list_del(&device->dev_alloc_list); 2512 fs_info->fs_devices->num_devices--; 2513 fs_info->fs_devices->open_devices--; 2514 fs_info->fs_devices->rw_devices--; 2515 fs_info->fs_devices->total_devices--; 2516 fs_info->fs_devices->total_rw_bytes -= device->total_bytes; 2517 atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); 2518 btrfs_set_super_total_bytes(fs_info->super_copy, 2519 orig_super_total_bytes); 2520 btrfs_set_super_num_devices(fs_info->super_copy, 2521 orig_super_num_devices); 2522 mutex_unlock(&fs_info->chunk_mutex); 2523 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2524 error_trans: 2525 if (seeding_dev) 2526 sb->s_flags |= SB_RDONLY; 2527 if (trans) 2528 btrfs_end_transaction(trans); 2529 error_free_device: 2530 btrfs_free_device(device); 2531 error: 2532 blkdev_put(bdev, FMODE_EXCL); 2533 if (seeding_dev && !unlocked) { 2534 mutex_unlock(&uuid_mutex); 2535 up_write(&sb->s_umount); 2536 } 2537 return ret; 2538 } 2539 2540 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2541 struct btrfs_device *device) 2542 { 2543 int ret; 2544 struct btrfs_path *path; 2545 struct btrfs_root *root = device->fs_info->chunk_root; 2546 struct btrfs_dev_item *dev_item; 2547 struct extent_buffer *leaf; 2548 struct btrfs_key key; 2549 2550 path = btrfs_alloc_path(); 2551 if (!path) 2552 return -ENOMEM; 2553 2554 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2555 key.type = BTRFS_DEV_ITEM_KEY; 2556 key.offset = device->devid; 2557 2558 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2559 if (ret < 0) 2560 goto out; 2561 2562 if (ret > 0) { 2563 ret = -ENOENT; 2564 goto out; 2565 } 2566 2567 leaf = path->nodes[0]; 2568 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2569 2570 btrfs_set_device_id(leaf, dev_item, device->devid); 2571 btrfs_set_device_type(leaf, dev_item, device->type); 2572 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2573 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2574 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2575 btrfs_set_device_total_bytes(leaf, dev_item, 2576 btrfs_device_get_disk_total_bytes(device)); 2577 btrfs_set_device_bytes_used(leaf, dev_item, 2578 btrfs_device_get_bytes_used(device)); 2579 btrfs_mark_buffer_dirty(leaf); 2580 2581 out: 2582 btrfs_free_path(path); 2583 return ret; 2584 } 2585 2586 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2587 struct btrfs_device *device, u64 new_size) 2588 { 2589 struct btrfs_fs_info *fs_info = device->fs_info; 2590 struct btrfs_super_block *super_copy = fs_info->super_copy; 2591 struct btrfs_fs_devices *fs_devices; 2592 u64 old_total; 2593 u64 diff; 2594 2595 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2596 return -EACCES; 2597 2598 new_size = round_down(new_size, fs_info->sectorsize); 2599 2600 mutex_lock(&fs_info->chunk_mutex); 2601 old_total = btrfs_super_total_bytes(super_copy); 2602 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 2603 2604 if (new_size <= device->total_bytes || 2605 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2606 mutex_unlock(&fs_info->chunk_mutex); 2607 return -EINVAL; 2608 } 2609 2610 fs_devices = fs_info->fs_devices; 2611 2612 btrfs_set_super_total_bytes(super_copy, 2613 round_down(old_total + diff, fs_info->sectorsize)); 2614 device->fs_devices->total_rw_bytes += diff; 2615 2616 btrfs_device_set_total_bytes(device, new_size); 2617 btrfs_device_set_disk_total_bytes(device, new_size); 2618 btrfs_clear_space_info_full(device->fs_info); 2619 if (list_empty(&device->resized_list)) 2620 list_add_tail(&device->resized_list, 2621 &fs_devices->resized_devices); 2622 mutex_unlock(&fs_info->chunk_mutex); 2623 2624 return btrfs_update_device(trans, device); 2625 } 2626 2627 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2628 { 2629 struct btrfs_fs_info *fs_info = trans->fs_info; 2630 struct btrfs_root *root = fs_info->chunk_root; 2631 int ret; 2632 struct btrfs_path *path; 2633 struct btrfs_key key; 2634 2635 path = btrfs_alloc_path(); 2636 if (!path) 2637 return -ENOMEM; 2638 2639 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2640 key.offset = chunk_offset; 2641 key.type = BTRFS_CHUNK_ITEM_KEY; 2642 2643 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2644 if (ret < 0) 2645 goto out; 2646 else if (ret > 0) { /* Logic error or corruption */ 2647 btrfs_handle_fs_error(fs_info, -ENOENT, 2648 "Failed lookup while freeing chunk."); 2649 ret = -ENOENT; 2650 goto out; 2651 } 2652 2653 ret = btrfs_del_item(trans, root, path); 2654 if (ret < 0) 2655 btrfs_handle_fs_error(fs_info, ret, 2656 "Failed to delete chunk item."); 2657 out: 2658 btrfs_free_path(path); 2659 return ret; 2660 } 2661 2662 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2663 { 2664 struct btrfs_super_block *super_copy = fs_info->super_copy; 2665 struct btrfs_disk_key *disk_key; 2666 struct btrfs_chunk *chunk; 2667 u8 *ptr; 2668 int ret = 0; 2669 u32 num_stripes; 2670 u32 array_size; 2671 u32 len = 0; 2672 u32 cur; 2673 struct btrfs_key key; 2674 2675 mutex_lock(&fs_info->chunk_mutex); 2676 array_size = btrfs_super_sys_array_size(super_copy); 2677 2678 ptr = super_copy->sys_chunk_array; 2679 cur = 0; 2680 2681 while (cur < array_size) { 2682 disk_key = (struct btrfs_disk_key *)ptr; 2683 btrfs_disk_key_to_cpu(&key, disk_key); 2684 2685 len = sizeof(*disk_key); 2686 2687 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 2688 chunk = (struct btrfs_chunk *)(ptr + len); 2689 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 2690 len += btrfs_chunk_item_size(num_stripes); 2691 } else { 2692 ret = -EIO; 2693 break; 2694 } 2695 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 2696 key.offset == chunk_offset) { 2697 memmove(ptr, ptr + len, array_size - (cur + len)); 2698 array_size -= len; 2699 btrfs_set_super_sys_array_size(super_copy, array_size); 2700 } else { 2701 ptr += len; 2702 cur += len; 2703 } 2704 } 2705 mutex_unlock(&fs_info->chunk_mutex); 2706 return ret; 2707 } 2708 2709 static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, 2710 u64 logical, u64 length) 2711 { 2712 struct extent_map_tree *em_tree; 2713 struct extent_map *em; 2714 2715 em_tree = &fs_info->mapping_tree.map_tree; 2716 read_lock(&em_tree->lock); 2717 em = lookup_extent_mapping(em_tree, logical, length); 2718 read_unlock(&em_tree->lock); 2719 2720 if (!em) { 2721 btrfs_crit(fs_info, "unable to find logical %llu length %llu", 2722 logical, length); 2723 return ERR_PTR(-EINVAL); 2724 } 2725 2726 if (em->start > logical || em->start + em->len < logical) { 2727 btrfs_crit(fs_info, 2728 "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 2729 logical, length, em->start, em->start + em->len); 2730 free_extent_map(em); 2731 return ERR_PTR(-EINVAL); 2732 } 2733 2734 /* callers are responsible for dropping em's ref. */ 2735 return em; 2736 } 2737 2738 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2739 { 2740 struct btrfs_fs_info *fs_info = trans->fs_info; 2741 struct extent_map *em; 2742 struct map_lookup *map; 2743 u64 dev_extent_len = 0; 2744 int i, ret = 0; 2745 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2746 2747 em = get_chunk_map(fs_info, chunk_offset, 1); 2748 if (IS_ERR(em)) { 2749 /* 2750 * This is a logic error, but we don't want to just rely on the 2751 * user having built with ASSERT enabled, so if ASSERT doesn't 2752 * do anything we still error out. 2753 */ 2754 ASSERT(0); 2755 return PTR_ERR(em); 2756 } 2757 map = em->map_lookup; 2758 mutex_lock(&fs_info->chunk_mutex); 2759 check_system_chunk(trans, map->type); 2760 mutex_unlock(&fs_info->chunk_mutex); 2761 2762 /* 2763 * Take the device list mutex to prevent races with the final phase of 2764 * a device replace operation that replaces the device object associated 2765 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). 2766 */ 2767 mutex_lock(&fs_devices->device_list_mutex); 2768 for (i = 0; i < map->num_stripes; i++) { 2769 struct btrfs_device *device = map->stripes[i].dev; 2770 ret = btrfs_free_dev_extent(trans, device, 2771 map->stripes[i].physical, 2772 &dev_extent_len); 2773 if (ret) { 2774 mutex_unlock(&fs_devices->device_list_mutex); 2775 btrfs_abort_transaction(trans, ret); 2776 goto out; 2777 } 2778 2779 if (device->bytes_used > 0) { 2780 mutex_lock(&fs_info->chunk_mutex); 2781 btrfs_device_set_bytes_used(device, 2782 device->bytes_used - dev_extent_len); 2783 atomic64_add(dev_extent_len, &fs_info->free_chunk_space); 2784 btrfs_clear_space_info_full(fs_info); 2785 mutex_unlock(&fs_info->chunk_mutex); 2786 } 2787 2788 if (map->stripes[i].dev) { 2789 ret = btrfs_update_device(trans, map->stripes[i].dev); 2790 if (ret) { 2791 mutex_unlock(&fs_devices->device_list_mutex); 2792 btrfs_abort_transaction(trans, ret); 2793 goto out; 2794 } 2795 } 2796 } 2797 mutex_unlock(&fs_devices->device_list_mutex); 2798 2799 ret = btrfs_free_chunk(trans, chunk_offset); 2800 if (ret) { 2801 btrfs_abort_transaction(trans, ret); 2802 goto out; 2803 } 2804 2805 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 2806 2807 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2808 ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 2809 if (ret) { 2810 btrfs_abort_transaction(trans, ret); 2811 goto out; 2812 } 2813 } 2814 2815 ret = btrfs_remove_block_group(trans, chunk_offset, em); 2816 if (ret) { 2817 btrfs_abort_transaction(trans, ret); 2818 goto out; 2819 } 2820 2821 out: 2822 /* once for us */ 2823 free_extent_map(em); 2824 return ret; 2825 } 2826 2827 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2828 { 2829 struct btrfs_root *root = fs_info->chunk_root; 2830 struct btrfs_trans_handle *trans; 2831 int ret; 2832 2833 /* 2834 * Prevent races with automatic removal of unused block groups. 2835 * After we relocate and before we remove the chunk with offset 2836 * chunk_offset, automatic removal of the block group can kick in, 2837 * resulting in a failure when calling btrfs_remove_chunk() below. 2838 * 2839 * Make sure to acquire this mutex before doing a tree search (dev 2840 * or chunk trees) to find chunks. Otherwise the cleaner kthread might 2841 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 2842 * we release the path used to search the chunk/dev tree and before 2843 * the current task acquires this mutex and calls us. 2844 */ 2845 lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); 2846 2847 ret = btrfs_can_relocate(fs_info, chunk_offset); 2848 if (ret) 2849 return -ENOSPC; 2850 2851 /* step one, relocate all the extents inside this chunk */ 2852 btrfs_scrub_pause(fs_info); 2853 ret = btrfs_relocate_block_group(fs_info, chunk_offset); 2854 btrfs_scrub_continue(fs_info); 2855 if (ret) 2856 return ret; 2857 2858 /* 2859 * We add the kobjects here (and after forcing data chunk creation) 2860 * since relocation is the only place we'll create chunks of a new 2861 * type at runtime. The only place where we'll remove the last 2862 * chunk of a type is the call immediately below this one. Even 2863 * so, we're protected against races with the cleaner thread since 2864 * we're covered by the delete_unused_bgs_mutex. 2865 */ 2866 btrfs_add_raid_kobjects(fs_info); 2867 2868 trans = btrfs_start_trans_remove_block_group(root->fs_info, 2869 chunk_offset); 2870 if (IS_ERR(trans)) { 2871 ret = PTR_ERR(trans); 2872 btrfs_handle_fs_error(root->fs_info, ret, NULL); 2873 return ret; 2874 } 2875 2876 /* 2877 * step two, delete the device extents and the 2878 * chunk tree entries 2879 */ 2880 ret = btrfs_remove_chunk(trans, chunk_offset); 2881 btrfs_end_transaction(trans); 2882 return ret; 2883 } 2884 2885 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 2886 { 2887 struct btrfs_root *chunk_root = fs_info->chunk_root; 2888 struct btrfs_path *path; 2889 struct extent_buffer *leaf; 2890 struct btrfs_chunk *chunk; 2891 struct btrfs_key key; 2892 struct btrfs_key found_key; 2893 u64 chunk_type; 2894 bool retried = false; 2895 int failed = 0; 2896 int ret; 2897 2898 path = btrfs_alloc_path(); 2899 if (!path) 2900 return -ENOMEM; 2901 2902 again: 2903 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2904 key.offset = (u64)-1; 2905 key.type = BTRFS_CHUNK_ITEM_KEY; 2906 2907 while (1) { 2908 mutex_lock(&fs_info->delete_unused_bgs_mutex); 2909 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2910 if (ret < 0) { 2911 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 2912 goto error; 2913 } 2914 BUG_ON(ret == 0); /* Corruption */ 2915 2916 ret = btrfs_previous_item(chunk_root, path, key.objectid, 2917 key.type); 2918 if (ret) 2919 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 2920 if (ret < 0) 2921 goto error; 2922 if (ret > 0) 2923 break; 2924 2925 leaf = path->nodes[0]; 2926 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2927 2928 chunk = btrfs_item_ptr(leaf, path->slots[0], 2929 struct btrfs_chunk); 2930 chunk_type = btrfs_chunk_type(leaf, chunk); 2931 btrfs_release_path(path); 2932 2933 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 2934 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 2935 if (ret == -ENOSPC) 2936 failed++; 2937 else 2938 BUG_ON(ret); 2939 } 2940 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 2941 2942 if (found_key.offset == 0) 2943 break; 2944 key.offset = found_key.offset - 1; 2945 } 2946 ret = 0; 2947 if (failed && !retried) { 2948 failed = 0; 2949 retried = true; 2950 goto again; 2951 } else if (WARN_ON(failed && retried)) { 2952 ret = -ENOSPC; 2953 } 2954 error: 2955 btrfs_free_path(path); 2956 return ret; 2957 } 2958 2959 /* 2960 * return 1 : allocate a data chunk successfully, 2961 * return <0: errors during allocating a data chunk, 2962 * return 0 : no need to allocate a data chunk. 2963 */ 2964 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 2965 u64 chunk_offset) 2966 { 2967 struct btrfs_block_group_cache *cache; 2968 u64 bytes_used; 2969 u64 chunk_type; 2970 2971 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2972 ASSERT(cache); 2973 chunk_type = cache->flags; 2974 btrfs_put_block_group(cache); 2975 2976 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) { 2977 spin_lock(&fs_info->data_sinfo->lock); 2978 bytes_used = fs_info->data_sinfo->bytes_used; 2979 spin_unlock(&fs_info->data_sinfo->lock); 2980 2981 if (!bytes_used) { 2982 struct btrfs_trans_handle *trans; 2983 int ret; 2984 2985 trans = btrfs_join_transaction(fs_info->tree_root); 2986 if (IS_ERR(trans)) 2987 return PTR_ERR(trans); 2988 2989 ret = btrfs_force_chunk_alloc(trans, 2990 BTRFS_BLOCK_GROUP_DATA); 2991 btrfs_end_transaction(trans); 2992 if (ret < 0) 2993 return ret; 2994 2995 btrfs_add_raid_kobjects(fs_info); 2996 2997 return 1; 2998 } 2999 } 3000 return 0; 3001 } 3002 3003 static int insert_balance_item(struct btrfs_fs_info *fs_info, 3004 struct btrfs_balance_control *bctl) 3005 { 3006 struct btrfs_root *root = fs_info->tree_root; 3007 struct btrfs_trans_handle *trans; 3008 struct btrfs_balance_item *item; 3009 struct btrfs_disk_balance_args disk_bargs; 3010 struct btrfs_path *path; 3011 struct extent_buffer *leaf; 3012 struct btrfs_key key; 3013 int ret, err; 3014 3015 path = btrfs_alloc_path(); 3016 if (!path) 3017 return -ENOMEM; 3018 3019 trans = btrfs_start_transaction(root, 0); 3020 if (IS_ERR(trans)) { 3021 btrfs_free_path(path); 3022 return PTR_ERR(trans); 3023 } 3024 3025 key.objectid = BTRFS_BALANCE_OBJECTID; 3026 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3027 key.offset = 0; 3028 3029 ret = btrfs_insert_empty_item(trans, root, path, &key, 3030 sizeof(*item)); 3031 if (ret) 3032 goto out; 3033 3034 leaf = path->nodes[0]; 3035 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3036 3037 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3038 3039 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 3040 btrfs_set_balance_data(leaf, item, &disk_bargs); 3041 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 3042 btrfs_set_balance_meta(leaf, item, &disk_bargs); 3043 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 3044 btrfs_set_balance_sys(leaf, item, &disk_bargs); 3045 3046 btrfs_set_balance_flags(leaf, item, bctl->flags); 3047 3048 btrfs_mark_buffer_dirty(leaf); 3049 out: 3050 btrfs_free_path(path); 3051 err = btrfs_commit_transaction(trans); 3052 if (err && !ret) 3053 ret = err; 3054 return ret; 3055 } 3056 3057 static int del_balance_item(struct btrfs_fs_info *fs_info) 3058 { 3059 struct btrfs_root *root = fs_info->tree_root; 3060 struct btrfs_trans_handle *trans; 3061 struct btrfs_path *path; 3062 struct btrfs_key key; 3063 int ret, err; 3064 3065 path = btrfs_alloc_path(); 3066 if (!path) 3067 return -ENOMEM; 3068 3069 trans = btrfs_start_transaction(root, 0); 3070 if (IS_ERR(trans)) { 3071 btrfs_free_path(path); 3072 return PTR_ERR(trans); 3073 } 3074 3075 key.objectid = BTRFS_BALANCE_OBJECTID; 3076 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3077 key.offset = 0; 3078 3079 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3080 if (ret < 0) 3081 goto out; 3082 if (ret > 0) { 3083 ret = -ENOENT; 3084 goto out; 3085 } 3086 3087 ret = btrfs_del_item(trans, root, path); 3088 out: 3089 btrfs_free_path(path); 3090 err = btrfs_commit_transaction(trans); 3091 if (err && !ret) 3092 ret = err; 3093 return ret; 3094 } 3095 3096 /* 3097 * This is a heuristic used to reduce the number of chunks balanced on 3098 * resume after balance was interrupted. 3099 */ 3100 static void update_balance_args(struct btrfs_balance_control *bctl) 3101 { 3102 /* 3103 * Turn on soft mode for chunk types that were being converted. 3104 */ 3105 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 3106 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 3107 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 3108 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 3109 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 3110 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 3111 3112 /* 3113 * Turn on usage filter if is not already used. The idea is 3114 * that chunks that we have already balanced should be 3115 * reasonably full. Don't do it for chunks that are being 3116 * converted - that will keep us from relocating unconverted 3117 * (albeit full) chunks. 3118 */ 3119 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 3120 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3121 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3122 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 3123 bctl->data.usage = 90; 3124 } 3125 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 3126 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3127 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3128 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 3129 bctl->sys.usage = 90; 3130 } 3131 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 3132 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3133 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3134 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 3135 bctl->meta.usage = 90; 3136 } 3137 } 3138 3139 /* 3140 * Clear the balance status in fs_info and delete the balance item from disk. 3141 */ 3142 static void reset_balance_state(struct btrfs_fs_info *fs_info) 3143 { 3144 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3145 int ret; 3146 3147 BUG_ON(!fs_info->balance_ctl); 3148 3149 spin_lock(&fs_info->balance_lock); 3150 fs_info->balance_ctl = NULL; 3151 spin_unlock(&fs_info->balance_lock); 3152 3153 kfree(bctl); 3154 ret = del_balance_item(fs_info); 3155 if (ret) 3156 btrfs_handle_fs_error(fs_info, ret, NULL); 3157 } 3158 3159 /* 3160 * Balance filters. Return 1 if chunk should be filtered out 3161 * (should not be balanced). 3162 */ 3163 static int chunk_profiles_filter(u64 chunk_type, 3164 struct btrfs_balance_args *bargs) 3165 { 3166 chunk_type = chunk_to_extended(chunk_type) & 3167 BTRFS_EXTENDED_PROFILE_MASK; 3168 3169 if (bargs->profiles & chunk_type) 3170 return 0; 3171 3172 return 1; 3173 } 3174 3175 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3176 struct btrfs_balance_args *bargs) 3177 { 3178 struct btrfs_block_group_cache *cache; 3179 u64 chunk_used; 3180 u64 user_thresh_min; 3181 u64 user_thresh_max; 3182 int ret = 1; 3183 3184 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3185 chunk_used = btrfs_block_group_used(&cache->item); 3186 3187 if (bargs->usage_min == 0) 3188 user_thresh_min = 0; 3189 else 3190 user_thresh_min = div_factor_fine(cache->key.offset, 3191 bargs->usage_min); 3192 3193 if (bargs->usage_max == 0) 3194 user_thresh_max = 1; 3195 else if (bargs->usage_max > 100) 3196 user_thresh_max = cache->key.offset; 3197 else 3198 user_thresh_max = div_factor_fine(cache->key.offset, 3199 bargs->usage_max); 3200 3201 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3202 ret = 0; 3203 3204 btrfs_put_block_group(cache); 3205 return ret; 3206 } 3207 3208 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3209 u64 chunk_offset, struct btrfs_balance_args *bargs) 3210 { 3211 struct btrfs_block_group_cache *cache; 3212 u64 chunk_used, user_thresh; 3213 int ret = 1; 3214 3215 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3216 chunk_used = btrfs_block_group_used(&cache->item); 3217 3218 if (bargs->usage_min == 0) 3219 user_thresh = 1; 3220 else if (bargs->usage > 100) 3221 user_thresh = cache->key.offset; 3222 else 3223 user_thresh = div_factor_fine(cache->key.offset, 3224 bargs->usage); 3225 3226 if (chunk_used < user_thresh) 3227 ret = 0; 3228 3229 btrfs_put_block_group(cache); 3230 return ret; 3231 } 3232 3233 static int chunk_devid_filter(struct extent_buffer *leaf, 3234 struct btrfs_chunk *chunk, 3235 struct btrfs_balance_args *bargs) 3236 { 3237 struct btrfs_stripe *stripe; 3238 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3239 int i; 3240 3241 for (i = 0; i < num_stripes; i++) { 3242 stripe = btrfs_stripe_nr(chunk, i); 3243 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3244 return 0; 3245 } 3246 3247 return 1; 3248 } 3249 3250 /* [pstart, pend) */ 3251 static int chunk_drange_filter(struct extent_buffer *leaf, 3252 struct btrfs_chunk *chunk, 3253 struct btrfs_balance_args *bargs) 3254 { 3255 struct btrfs_stripe *stripe; 3256 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3257 u64 stripe_offset; 3258 u64 stripe_length; 3259 int factor; 3260 int i; 3261 3262 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3263 return 0; 3264 3265 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 3266 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { 3267 factor = num_stripes / 2; 3268 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { 3269 factor = num_stripes - 1; 3270 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { 3271 factor = num_stripes - 2; 3272 } else { 3273 factor = num_stripes; 3274 } 3275 3276 for (i = 0; i < num_stripes; i++) { 3277 stripe = btrfs_stripe_nr(chunk, i); 3278 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3279 continue; 3280 3281 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3282 stripe_length = btrfs_chunk_length(leaf, chunk); 3283 stripe_length = div_u64(stripe_length, factor); 3284 3285 if (stripe_offset < bargs->pend && 3286 stripe_offset + stripe_length > bargs->pstart) 3287 return 0; 3288 } 3289 3290 return 1; 3291 } 3292 3293 /* [vstart, vend) */ 3294 static int chunk_vrange_filter(struct extent_buffer *leaf, 3295 struct btrfs_chunk *chunk, 3296 u64 chunk_offset, 3297 struct btrfs_balance_args *bargs) 3298 { 3299 if (chunk_offset < bargs->vend && 3300 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3301 /* at least part of the chunk is inside this vrange */ 3302 return 0; 3303 3304 return 1; 3305 } 3306 3307 static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3308 struct btrfs_chunk *chunk, 3309 struct btrfs_balance_args *bargs) 3310 { 3311 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3312 3313 if (bargs->stripes_min <= num_stripes 3314 && num_stripes <= bargs->stripes_max) 3315 return 0; 3316 3317 return 1; 3318 } 3319 3320 static int chunk_soft_convert_filter(u64 chunk_type, 3321 struct btrfs_balance_args *bargs) 3322 { 3323 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3324 return 0; 3325 3326 chunk_type = chunk_to_extended(chunk_type) & 3327 BTRFS_EXTENDED_PROFILE_MASK; 3328 3329 if (bargs->target == chunk_type) 3330 return 1; 3331 3332 return 0; 3333 } 3334 3335 static int should_balance_chunk(struct btrfs_fs_info *fs_info, 3336 struct extent_buffer *leaf, 3337 struct btrfs_chunk *chunk, u64 chunk_offset) 3338 { 3339 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3340 struct btrfs_balance_args *bargs = NULL; 3341 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3342 3343 /* type filter */ 3344 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3345 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3346 return 0; 3347 } 3348 3349 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3350 bargs = &bctl->data; 3351 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3352 bargs = &bctl->sys; 3353 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3354 bargs = &bctl->meta; 3355 3356 /* profiles filter */ 3357 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3358 chunk_profiles_filter(chunk_type, bargs)) { 3359 return 0; 3360 } 3361 3362 /* usage filter */ 3363 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3364 chunk_usage_filter(fs_info, chunk_offset, bargs)) { 3365 return 0; 3366 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3367 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3368 return 0; 3369 } 3370 3371 /* devid filter */ 3372 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3373 chunk_devid_filter(leaf, chunk, bargs)) { 3374 return 0; 3375 } 3376 3377 /* drange filter, makes sense only with devid filter */ 3378 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3379 chunk_drange_filter(leaf, chunk, bargs)) { 3380 return 0; 3381 } 3382 3383 /* vrange filter */ 3384 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3385 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3386 return 0; 3387 } 3388 3389 /* stripes filter */ 3390 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3391 chunk_stripes_range_filter(leaf, chunk, bargs)) { 3392 return 0; 3393 } 3394 3395 /* soft profile changing mode */ 3396 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3397 chunk_soft_convert_filter(chunk_type, bargs)) { 3398 return 0; 3399 } 3400 3401 /* 3402 * limited by count, must be the last filter 3403 */ 3404 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3405 if (bargs->limit == 0) 3406 return 0; 3407 else 3408 bargs->limit--; 3409 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 3410 /* 3411 * Same logic as the 'limit' filter; the minimum cannot be 3412 * determined here because we do not have the global information 3413 * about the count of all chunks that satisfy the filters. 3414 */ 3415 if (bargs->limit_max == 0) 3416 return 0; 3417 else 3418 bargs->limit_max--; 3419 } 3420 3421 return 1; 3422 } 3423 3424 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3425 { 3426 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3427 struct btrfs_root *chunk_root = fs_info->chunk_root; 3428 struct btrfs_root *dev_root = fs_info->dev_root; 3429 struct list_head *devices; 3430 struct btrfs_device *device; 3431 u64 old_size; 3432 u64 size_to_free; 3433 u64 chunk_type; 3434 struct btrfs_chunk *chunk; 3435 struct btrfs_path *path = NULL; 3436 struct btrfs_key key; 3437 struct btrfs_key found_key; 3438 struct btrfs_trans_handle *trans; 3439 struct extent_buffer *leaf; 3440 int slot; 3441 int ret; 3442 int enospc_errors = 0; 3443 bool counting = true; 3444 /* The single value limit and min/max limits use the same bytes in the */ 3445 u64 limit_data = bctl->data.limit; 3446 u64 limit_meta = bctl->meta.limit; 3447 u64 limit_sys = bctl->sys.limit; 3448 u32 count_data = 0; 3449 u32 count_meta = 0; 3450 u32 count_sys = 0; 3451 int chunk_reserved = 0; 3452 3453 /* step one make some room on all the devices */ 3454 devices = &fs_info->fs_devices->devices; 3455 list_for_each_entry(device, devices, dev_list) { 3456 old_size = btrfs_device_get_total_bytes(device); 3457 size_to_free = div_factor(old_size, 1); 3458 size_to_free = min_t(u64, size_to_free, SZ_1M); 3459 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) || 3460 btrfs_device_get_total_bytes(device) - 3461 btrfs_device_get_bytes_used(device) > size_to_free || 3462 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 3463 continue; 3464 3465 ret = btrfs_shrink_device(device, old_size - size_to_free); 3466 if (ret == -ENOSPC) 3467 break; 3468 if (ret) { 3469 /* btrfs_shrink_device never returns ret > 0 */ 3470 WARN_ON(ret > 0); 3471 goto error; 3472 } 3473 3474 trans = btrfs_start_transaction(dev_root, 0); 3475 if (IS_ERR(trans)) { 3476 ret = PTR_ERR(trans); 3477 btrfs_info_in_rcu(fs_info, 3478 "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", 3479 rcu_str_deref(device->name), ret, 3480 old_size, old_size - size_to_free); 3481 goto error; 3482 } 3483 3484 ret = btrfs_grow_device(trans, device, old_size); 3485 if (ret) { 3486 btrfs_end_transaction(trans); 3487 /* btrfs_grow_device never returns ret > 0 */ 3488 WARN_ON(ret > 0); 3489 btrfs_info_in_rcu(fs_info, 3490 "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", 3491 rcu_str_deref(device->name), ret, 3492 old_size, old_size - size_to_free); 3493 goto error; 3494 } 3495 3496 btrfs_end_transaction(trans); 3497 } 3498 3499 /* step two, relocate all the chunks */ 3500 path = btrfs_alloc_path(); 3501 if (!path) { 3502 ret = -ENOMEM; 3503 goto error; 3504 } 3505 3506 /* zero out stat counters */ 3507 spin_lock(&fs_info->balance_lock); 3508 memset(&bctl->stat, 0, sizeof(bctl->stat)); 3509 spin_unlock(&fs_info->balance_lock); 3510 again: 3511 if (!counting) { 3512 /* 3513 * The single value limit and min/max limits use the same bytes 3514 * in the 3515 */ 3516 bctl->data.limit = limit_data; 3517 bctl->meta.limit = limit_meta; 3518 bctl->sys.limit = limit_sys; 3519 } 3520 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3521 key.offset = (u64)-1; 3522 key.type = BTRFS_CHUNK_ITEM_KEY; 3523 3524 while (1) { 3525 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3526 atomic_read(&fs_info->balance_cancel_req)) { 3527 ret = -ECANCELED; 3528 goto error; 3529 } 3530 3531 mutex_lock(&fs_info->delete_unused_bgs_mutex); 3532 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3533 if (ret < 0) { 3534 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3535 goto error; 3536 } 3537 3538 /* 3539 * this shouldn't happen, it means the last relocate 3540 * failed 3541 */ 3542 if (ret == 0) 3543 BUG(); /* FIXME break ? */ 3544 3545 ret = btrfs_previous_item(chunk_root, path, 0, 3546 BTRFS_CHUNK_ITEM_KEY); 3547 if (ret) { 3548 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3549 ret = 0; 3550 break; 3551 } 3552 3553 leaf = path->nodes[0]; 3554 slot = path->slots[0]; 3555 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3556 3557 if (found_key.objectid != key.objectid) { 3558 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3559 break; 3560 } 3561 3562 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3563 chunk_type = btrfs_chunk_type(leaf, chunk); 3564 3565 if (!counting) { 3566 spin_lock(&fs_info->balance_lock); 3567 bctl->stat.considered++; 3568 spin_unlock(&fs_info->balance_lock); 3569 } 3570 3571 ret = should_balance_chunk(fs_info, leaf, chunk, 3572 found_key.offset); 3573 3574 btrfs_release_path(path); 3575 if (!ret) { 3576 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3577 goto loop; 3578 } 3579 3580 if (counting) { 3581 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3582 spin_lock(&fs_info->balance_lock); 3583 bctl->stat.expected++; 3584 spin_unlock(&fs_info->balance_lock); 3585 3586 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3587 count_data++; 3588 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3589 count_sys++; 3590 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3591 count_meta++; 3592 3593 goto loop; 3594 } 3595 3596 /* 3597 * Apply limit_min filter, no need to check if the LIMITS 3598 * filter is used, limit_min is 0 by default 3599 */ 3600 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 3601 count_data < bctl->data.limit_min) 3602 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 3603 count_meta < bctl->meta.limit_min) 3604 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 3605 count_sys < bctl->sys.limit_min)) { 3606 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3607 goto loop; 3608 } 3609 3610 if (!chunk_reserved) { 3611 /* 3612 * We may be relocating the only data chunk we have, 3613 * which could potentially end up with losing data's 3614 * raid profile, so lets allocate an empty one in 3615 * advance. 3616 */ 3617 ret = btrfs_may_alloc_data_chunk(fs_info, 3618 found_key.offset); 3619 if (ret < 0) { 3620 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3621 goto error; 3622 } else if (ret == 1) { 3623 chunk_reserved = 1; 3624 } 3625 } 3626 3627 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3628 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3629 if (ret && ret != -ENOSPC) 3630 goto error; 3631 if (ret == -ENOSPC) { 3632 enospc_errors++; 3633 } else { 3634 spin_lock(&fs_info->balance_lock); 3635 bctl->stat.completed++; 3636 spin_unlock(&fs_info->balance_lock); 3637 } 3638 loop: 3639 if (found_key.offset == 0) 3640 break; 3641 key.offset = found_key.offset - 1; 3642 } 3643 3644 if (counting) { 3645 btrfs_release_path(path); 3646 counting = false; 3647 goto again; 3648 } 3649 error: 3650 btrfs_free_path(path); 3651 if (enospc_errors) { 3652 btrfs_info(fs_info, "%d enospc errors during balance", 3653 enospc_errors); 3654 if (!ret) 3655 ret = -ENOSPC; 3656 } 3657 3658 return ret; 3659 } 3660 3661 /** 3662 * alloc_profile_is_valid - see if a given profile is valid and reduced 3663 * @flags: profile to validate 3664 * @extended: if true @flags is treated as an extended profile 3665 */ 3666 static int alloc_profile_is_valid(u64 flags, int extended) 3667 { 3668 u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : 3669 BTRFS_BLOCK_GROUP_PROFILE_MASK); 3670 3671 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 3672 3673 /* 1) check that all other bits are zeroed */ 3674 if (flags & ~mask) 3675 return 0; 3676 3677 /* 2) see if profile is reduced */ 3678 if (flags == 0) 3679 return !extended; /* "0" is valid for usual profiles */ 3680 3681 /* true if exactly one bit set */ 3682 return (flags & (flags - 1)) == 0; 3683 } 3684 3685 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3686 { 3687 /* cancel requested || normal exit path */ 3688 return atomic_read(&fs_info->balance_cancel_req) || 3689 (atomic_read(&fs_info->balance_pause_req) == 0 && 3690 atomic_read(&fs_info->balance_cancel_req) == 0); 3691 } 3692 3693 /* Non-zero return value signifies invalidity */ 3694 static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, 3695 u64 allowed) 3696 { 3697 return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && 3698 (!alloc_profile_is_valid(bctl_arg->target, 1) || 3699 (bctl_arg->target & ~allowed))); 3700 } 3701 3702 /* 3703 * Should be called with balance mutexe held 3704 */ 3705 int btrfs_balance(struct btrfs_fs_info *fs_info, 3706 struct btrfs_balance_control *bctl, 3707 struct btrfs_ioctl_balance_args *bargs) 3708 { 3709 u64 meta_target, data_target; 3710 u64 allowed; 3711 int mixed = 0; 3712 int ret; 3713 u64 num_devices; 3714 unsigned seq; 3715 3716 if (btrfs_fs_closing(fs_info) || 3717 atomic_read(&fs_info->balance_pause_req) || 3718 atomic_read(&fs_info->balance_cancel_req)) { 3719 ret = -EINVAL; 3720 goto out; 3721 } 3722 3723 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 3724 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 3725 mixed = 1; 3726 3727 /* 3728 * In case of mixed groups both data and meta should be picked, 3729 * and identical options should be given for both of them. 3730 */ 3731 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 3732 if (mixed && (bctl->flags & allowed)) { 3733 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 3734 !(bctl->flags & BTRFS_BALANCE_METADATA) || 3735 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 3736 btrfs_err(fs_info, 3737 "balance: mixed groups data and metadata options must be the same"); 3738 ret = -EINVAL; 3739 goto out; 3740 } 3741 } 3742 3743 num_devices = fs_info->fs_devices->num_devices; 3744 btrfs_dev_replace_read_lock(&fs_info->dev_replace); 3745 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 3746 BUG_ON(num_devices < 1); 3747 num_devices--; 3748 } 3749 btrfs_dev_replace_read_unlock(&fs_info->dev_replace); 3750 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; 3751 if (num_devices > 1) 3752 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3753 if (num_devices > 2) 3754 allowed |= BTRFS_BLOCK_GROUP_RAID5; 3755 if (num_devices > 3) 3756 allowed |= (BTRFS_BLOCK_GROUP_RAID10 | 3757 BTRFS_BLOCK_GROUP_RAID6); 3758 if (validate_convert_profile(&bctl->data, allowed)) { 3759 int index = btrfs_bg_flags_to_raid_index(bctl->data.target); 3760 3761 btrfs_err(fs_info, 3762 "balance: invalid convert data profile %s", 3763 get_raid_name(index)); 3764 ret = -EINVAL; 3765 goto out; 3766 } 3767 if (validate_convert_profile(&bctl->meta, allowed)) { 3768 int index = btrfs_bg_flags_to_raid_index(bctl->meta.target); 3769 3770 btrfs_err(fs_info, 3771 "balance: invalid convert metadata profile %s", 3772 get_raid_name(index)); 3773 ret = -EINVAL; 3774 goto out; 3775 } 3776 if (validate_convert_profile(&bctl->sys, allowed)) { 3777 int index = btrfs_bg_flags_to_raid_index(bctl->sys.target); 3778 3779 btrfs_err(fs_info, 3780 "balance: invalid convert system profile %s", 3781 get_raid_name(index)); 3782 ret = -EINVAL; 3783 goto out; 3784 } 3785 3786 /* allow to reduce meta or sys integrity only if force set */ 3787 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3788 BTRFS_BLOCK_GROUP_RAID10 | 3789 BTRFS_BLOCK_GROUP_RAID5 | 3790 BTRFS_BLOCK_GROUP_RAID6; 3791 do { 3792 seq = read_seqbegin(&fs_info->profiles_lock); 3793 3794 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3795 (fs_info->avail_system_alloc_bits & allowed) && 3796 !(bctl->sys.target & allowed)) || 3797 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3798 (fs_info->avail_metadata_alloc_bits & allowed) && 3799 !(bctl->meta.target & allowed))) { 3800 if (bctl->flags & BTRFS_BALANCE_FORCE) { 3801 btrfs_info(fs_info, 3802 "balance: force reducing metadata integrity"); 3803 } else { 3804 btrfs_err(fs_info, 3805 "balance: reduces metadata integrity, use --force if you want this"); 3806 ret = -EINVAL; 3807 goto out; 3808 } 3809 } 3810 } while (read_seqretry(&fs_info->profiles_lock, seq)); 3811 3812 /* if we're not converting, the target field is uninitialized */ 3813 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 3814 bctl->meta.target : fs_info->avail_metadata_alloc_bits; 3815 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 3816 bctl->data.target : fs_info->avail_data_alloc_bits; 3817 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 3818 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 3819 int meta_index = btrfs_bg_flags_to_raid_index(meta_target); 3820 int data_index = btrfs_bg_flags_to_raid_index(data_target); 3821 3822 btrfs_warn(fs_info, 3823 "balance: metadata profile %s has lower redundancy than data profile %s", 3824 get_raid_name(meta_index), get_raid_name(data_index)); 3825 } 3826 3827 ret = insert_balance_item(fs_info, bctl); 3828 if (ret && ret != -EEXIST) 3829 goto out; 3830 3831 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 3832 BUG_ON(ret == -EEXIST); 3833 BUG_ON(fs_info->balance_ctl); 3834 spin_lock(&fs_info->balance_lock); 3835 fs_info->balance_ctl = bctl; 3836 spin_unlock(&fs_info->balance_lock); 3837 } else { 3838 BUG_ON(ret != -EEXIST); 3839 spin_lock(&fs_info->balance_lock); 3840 update_balance_args(bctl); 3841 spin_unlock(&fs_info->balance_lock); 3842 } 3843 3844 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 3845 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 3846 mutex_unlock(&fs_info->balance_mutex); 3847 3848 ret = __btrfs_balance(fs_info); 3849 3850 mutex_lock(&fs_info->balance_mutex); 3851 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 3852 3853 if (bargs) { 3854 memset(bargs, 0, sizeof(*bargs)); 3855 btrfs_update_ioctl_balance_args(fs_info, bargs); 3856 } 3857 3858 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 3859 balance_need_close(fs_info)) { 3860 reset_balance_state(fs_info); 3861 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 3862 } 3863 3864 wake_up(&fs_info->balance_wait_q); 3865 3866 return ret; 3867 out: 3868 if (bctl->flags & BTRFS_BALANCE_RESUME) 3869 reset_balance_state(fs_info); 3870 else 3871 kfree(bctl); 3872 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 3873 3874 return ret; 3875 } 3876 3877 static int balance_kthread(void *data) 3878 { 3879 struct btrfs_fs_info *fs_info = data; 3880 int ret = 0; 3881 3882 mutex_lock(&fs_info->balance_mutex); 3883 if (fs_info->balance_ctl) { 3884 btrfs_info(fs_info, "balance: resuming"); 3885 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); 3886 } 3887 mutex_unlock(&fs_info->balance_mutex); 3888 3889 return ret; 3890 } 3891 3892 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 3893 { 3894 struct task_struct *tsk; 3895 3896 mutex_lock(&fs_info->balance_mutex); 3897 if (!fs_info->balance_ctl) { 3898 mutex_unlock(&fs_info->balance_mutex); 3899 return 0; 3900 } 3901 mutex_unlock(&fs_info->balance_mutex); 3902 3903 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 3904 btrfs_info(fs_info, "balance: resume skipped"); 3905 return 0; 3906 } 3907 3908 /* 3909 * A ro->rw remount sequence should continue with the paused balance 3910 * regardless of who pauses it, system or the user as of now, so set 3911 * the resume flag. 3912 */ 3913 spin_lock(&fs_info->balance_lock); 3914 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; 3915 spin_unlock(&fs_info->balance_lock); 3916 3917 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3918 return PTR_ERR_OR_ZERO(tsk); 3919 } 3920 3921 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 3922 { 3923 struct btrfs_balance_control *bctl; 3924 struct btrfs_balance_item *item; 3925 struct btrfs_disk_balance_args disk_bargs; 3926 struct btrfs_path *path; 3927 struct extent_buffer *leaf; 3928 struct btrfs_key key; 3929 int ret; 3930 3931 path = btrfs_alloc_path(); 3932 if (!path) 3933 return -ENOMEM; 3934 3935 key.objectid = BTRFS_BALANCE_OBJECTID; 3936 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3937 key.offset = 0; 3938 3939 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 3940 if (ret < 0) 3941 goto out; 3942 if (ret > 0) { /* ret = -ENOENT; */ 3943 ret = 0; 3944 goto out; 3945 } 3946 3947 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 3948 if (!bctl) { 3949 ret = -ENOMEM; 3950 goto out; 3951 } 3952 3953 leaf = path->nodes[0]; 3954 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3955 3956 bctl->flags = btrfs_balance_flags(leaf, item); 3957 bctl->flags |= BTRFS_BALANCE_RESUME; 3958 3959 btrfs_balance_data(leaf, item, &disk_bargs); 3960 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 3961 btrfs_balance_meta(leaf, item, &disk_bargs); 3962 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 3963 btrfs_balance_sys(leaf, item, &disk_bargs); 3964 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 3965 3966 /* 3967 * This should never happen, as the paused balance state is recovered 3968 * during mount without any chance of other exclusive ops to collide. 3969 * 3970 * This gives the exclusive op status to balance and keeps in paused 3971 * state until user intervention (cancel or umount). If the ownership 3972 * cannot be assigned, show a message but do not fail. The balance 3973 * is in a paused state and must have fs_info::balance_ctl properly 3974 * set up. 3975 */ 3976 if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) 3977 btrfs_warn(fs_info, 3978 "balance: cannot set exclusive op status, resume manually"); 3979 3980 mutex_lock(&fs_info->balance_mutex); 3981 BUG_ON(fs_info->balance_ctl); 3982 spin_lock(&fs_info->balance_lock); 3983 fs_info->balance_ctl = bctl; 3984 spin_unlock(&fs_info->balance_lock); 3985 mutex_unlock(&fs_info->balance_mutex); 3986 out: 3987 btrfs_free_path(path); 3988 return ret; 3989 } 3990 3991 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 3992 { 3993 int ret = 0; 3994 3995 mutex_lock(&fs_info->balance_mutex); 3996 if (!fs_info->balance_ctl) { 3997 mutex_unlock(&fs_info->balance_mutex); 3998 return -ENOTCONN; 3999 } 4000 4001 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4002 atomic_inc(&fs_info->balance_pause_req); 4003 mutex_unlock(&fs_info->balance_mutex); 4004 4005 wait_event(fs_info->balance_wait_q, 4006 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4007 4008 mutex_lock(&fs_info->balance_mutex); 4009 /* we are good with balance_ctl ripped off from under us */ 4010 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4011 atomic_dec(&fs_info->balance_pause_req); 4012 } else { 4013 ret = -ENOTCONN; 4014 } 4015 4016 mutex_unlock(&fs_info->balance_mutex); 4017 return ret; 4018 } 4019 4020 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4021 { 4022 mutex_lock(&fs_info->balance_mutex); 4023 if (!fs_info->balance_ctl) { 4024 mutex_unlock(&fs_info->balance_mutex); 4025 return -ENOTCONN; 4026 } 4027 4028 /* 4029 * A paused balance with the item stored on disk can be resumed at 4030 * mount time if the mount is read-write. Otherwise it's still paused 4031 * and we must not allow cancelling as it deletes the item. 4032 */ 4033 if (sb_rdonly(fs_info->sb)) { 4034 mutex_unlock(&fs_info->balance_mutex); 4035 return -EROFS; 4036 } 4037 4038 atomic_inc(&fs_info->balance_cancel_req); 4039 /* 4040 * if we are running just wait and return, balance item is 4041 * deleted in btrfs_balance in this case 4042 */ 4043 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4044 mutex_unlock(&fs_info->balance_mutex); 4045 wait_event(fs_info->balance_wait_q, 4046 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4047 mutex_lock(&fs_info->balance_mutex); 4048 } else { 4049 mutex_unlock(&fs_info->balance_mutex); 4050 /* 4051 * Lock released to allow other waiters to continue, we'll 4052 * reexamine the status again. 4053 */ 4054 mutex_lock(&fs_info->balance_mutex); 4055 4056 if (fs_info->balance_ctl) { 4057 reset_balance_state(fs_info); 4058 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 4059 btrfs_info(fs_info, "balance: canceled"); 4060 } 4061 } 4062 4063 BUG_ON(fs_info->balance_ctl || 4064 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4065 atomic_dec(&fs_info->balance_cancel_req); 4066 mutex_unlock(&fs_info->balance_mutex); 4067 return 0; 4068 } 4069 4070 static int btrfs_uuid_scan_kthread(void *data) 4071 { 4072 struct btrfs_fs_info *fs_info = data; 4073 struct btrfs_root *root = fs_info->tree_root; 4074 struct btrfs_key key; 4075 struct btrfs_path *path = NULL; 4076 int ret = 0; 4077 struct extent_buffer *eb; 4078 int slot; 4079 struct btrfs_root_item root_item; 4080 u32 item_size; 4081 struct btrfs_trans_handle *trans = NULL; 4082 4083 path = btrfs_alloc_path(); 4084 if (!path) { 4085 ret = -ENOMEM; 4086 goto out; 4087 } 4088 4089 key.objectid = 0; 4090 key.type = BTRFS_ROOT_ITEM_KEY; 4091 key.offset = 0; 4092 4093 while (1) { 4094 ret = btrfs_search_forward(root, &key, path, 4095 BTRFS_OLDEST_GENERATION); 4096 if (ret) { 4097 if (ret > 0) 4098 ret = 0; 4099 break; 4100 } 4101 4102 if (key.type != BTRFS_ROOT_ITEM_KEY || 4103 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4104 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4105 key.objectid > BTRFS_LAST_FREE_OBJECTID) 4106 goto skip; 4107 4108 eb = path->nodes[0]; 4109 slot = path->slots[0]; 4110 item_size = btrfs_item_size_nr(eb, slot); 4111 if (item_size < sizeof(root_item)) 4112 goto skip; 4113 4114 read_extent_buffer(eb, &root_item, 4115 btrfs_item_ptr_offset(eb, slot), 4116 (int)sizeof(root_item)); 4117 if (btrfs_root_refs(&root_item) == 0) 4118 goto skip; 4119 4120 if (!btrfs_is_empty_uuid(root_item.uuid) || 4121 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4122 if (trans) 4123 goto update_tree; 4124 4125 btrfs_release_path(path); 4126 /* 4127 * 1 - subvol uuid item 4128 * 1 - received_subvol uuid item 4129 */ 4130 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4131 if (IS_ERR(trans)) { 4132 ret = PTR_ERR(trans); 4133 break; 4134 } 4135 continue; 4136 } else { 4137 goto skip; 4138 } 4139 update_tree: 4140 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4141 ret = btrfs_uuid_tree_add(trans, root_item.uuid, 4142 BTRFS_UUID_KEY_SUBVOL, 4143 key.objectid); 4144 if (ret < 0) { 4145 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4146 ret); 4147 break; 4148 } 4149 } 4150 4151 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4152 ret = btrfs_uuid_tree_add(trans, 4153 root_item.received_uuid, 4154 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4155 key.objectid); 4156 if (ret < 0) { 4157 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4158 ret); 4159 break; 4160 } 4161 } 4162 4163 skip: 4164 if (trans) { 4165 ret = btrfs_end_transaction(trans); 4166 trans = NULL; 4167 if (ret) 4168 break; 4169 } 4170 4171 btrfs_release_path(path); 4172 if (key.offset < (u64)-1) { 4173 key.offset++; 4174 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4175 key.offset = 0; 4176 key.type = BTRFS_ROOT_ITEM_KEY; 4177 } else if (key.objectid < (u64)-1) { 4178 key.offset = 0; 4179 key.type = BTRFS_ROOT_ITEM_KEY; 4180 key.objectid++; 4181 } else { 4182 break; 4183 } 4184 cond_resched(); 4185 } 4186 4187 out: 4188 btrfs_free_path(path); 4189 if (trans && !IS_ERR(trans)) 4190 btrfs_end_transaction(trans); 4191 if (ret) 4192 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); 4193 else 4194 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4195 up(&fs_info->uuid_tree_rescan_sem); 4196 return 0; 4197 } 4198 4199 /* 4200 * Callback for btrfs_uuid_tree_iterate(). 4201 * returns: 4202 * 0 check succeeded, the entry is not outdated. 4203 * < 0 if an error occurred. 4204 * > 0 if the check failed, which means the caller shall remove the entry. 4205 */ 4206 static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, 4207 u8 *uuid, u8 type, u64 subid) 4208 { 4209 struct btrfs_key key; 4210 int ret = 0; 4211 struct btrfs_root *subvol_root; 4212 4213 if (type != BTRFS_UUID_KEY_SUBVOL && 4214 type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) 4215 goto out; 4216 4217 key.objectid = subid; 4218 key.type = BTRFS_ROOT_ITEM_KEY; 4219 key.offset = (u64)-1; 4220 subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); 4221 if (IS_ERR(subvol_root)) { 4222 ret = PTR_ERR(subvol_root); 4223 if (ret == -ENOENT) 4224 ret = 1; 4225 goto out; 4226 } 4227 4228 switch (type) { 4229 case BTRFS_UUID_KEY_SUBVOL: 4230 if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) 4231 ret = 1; 4232 break; 4233 case BTRFS_UUID_KEY_RECEIVED_SUBVOL: 4234 if (memcmp(uuid, subvol_root->root_item.received_uuid, 4235 BTRFS_UUID_SIZE)) 4236 ret = 1; 4237 break; 4238 } 4239 4240 out: 4241 return ret; 4242 } 4243 4244 static int btrfs_uuid_rescan_kthread(void *data) 4245 { 4246 struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; 4247 int ret; 4248 4249 /* 4250 * 1st step is to iterate through the existing UUID tree and 4251 * to delete all entries that contain outdated data. 4252 * 2nd step is to add all missing entries to the UUID tree. 4253 */ 4254 ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); 4255 if (ret < 0) { 4256 btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); 4257 up(&fs_info->uuid_tree_rescan_sem); 4258 return ret; 4259 } 4260 return btrfs_uuid_scan_kthread(data); 4261 } 4262 4263 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4264 { 4265 struct btrfs_trans_handle *trans; 4266 struct btrfs_root *tree_root = fs_info->tree_root; 4267 struct btrfs_root *uuid_root; 4268 struct task_struct *task; 4269 int ret; 4270 4271 /* 4272 * 1 - root node 4273 * 1 - root item 4274 */ 4275 trans = btrfs_start_transaction(tree_root, 2); 4276 if (IS_ERR(trans)) 4277 return PTR_ERR(trans); 4278 4279 uuid_root = btrfs_create_tree(trans, fs_info, 4280 BTRFS_UUID_TREE_OBJECTID); 4281 if (IS_ERR(uuid_root)) { 4282 ret = PTR_ERR(uuid_root); 4283 btrfs_abort_transaction(trans, ret); 4284 btrfs_end_transaction(trans); 4285 return ret; 4286 } 4287 4288 fs_info->uuid_root = uuid_root; 4289 4290 ret = btrfs_commit_transaction(trans); 4291 if (ret) 4292 return ret; 4293 4294 down(&fs_info->uuid_tree_rescan_sem); 4295 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4296 if (IS_ERR(task)) { 4297 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4298 btrfs_warn(fs_info, "failed to start uuid_scan task"); 4299 up(&fs_info->uuid_tree_rescan_sem); 4300 return PTR_ERR(task); 4301 } 4302 4303 return 0; 4304 } 4305 4306 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) 4307 { 4308 struct task_struct *task; 4309 4310 down(&fs_info->uuid_tree_rescan_sem); 4311 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); 4312 if (IS_ERR(task)) { 4313 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4314 btrfs_warn(fs_info, "failed to start uuid_rescan task"); 4315 up(&fs_info->uuid_tree_rescan_sem); 4316 return PTR_ERR(task); 4317 } 4318 4319 return 0; 4320 } 4321 4322 /* 4323 * shrinking a device means finding all of the device extents past 4324 * the new size, and then following the back refs to the chunks. 4325 * The chunk relocation code actually frees the device extent 4326 */ 4327 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 4328 { 4329 struct btrfs_fs_info *fs_info = device->fs_info; 4330 struct btrfs_root *root = fs_info->dev_root; 4331 struct btrfs_trans_handle *trans; 4332 struct btrfs_dev_extent *dev_extent = NULL; 4333 struct btrfs_path *path; 4334 u64 length; 4335 u64 chunk_offset; 4336 int ret; 4337 int slot; 4338 int failed = 0; 4339 bool retried = false; 4340 bool checked_pending_chunks = false; 4341 struct extent_buffer *l; 4342 struct btrfs_key key; 4343 struct btrfs_super_block *super_copy = fs_info->super_copy; 4344 u64 old_total = btrfs_super_total_bytes(super_copy); 4345 u64 old_size = btrfs_device_get_total_bytes(device); 4346 u64 diff; 4347 4348 new_size = round_down(new_size, fs_info->sectorsize); 4349 diff = round_down(old_size - new_size, fs_info->sectorsize); 4350 4351 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4352 return -EINVAL; 4353 4354 path = btrfs_alloc_path(); 4355 if (!path) 4356 return -ENOMEM; 4357 4358 path->reada = READA_BACK; 4359 4360 mutex_lock(&fs_info->chunk_mutex); 4361 4362 btrfs_device_set_total_bytes(device, new_size); 4363 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4364 device->fs_devices->total_rw_bytes -= diff; 4365 atomic64_sub(diff, &fs_info->free_chunk_space); 4366 } 4367 mutex_unlock(&fs_info->chunk_mutex); 4368 4369 again: 4370 key.objectid = device->devid; 4371 key.offset = (u64)-1; 4372 key.type = BTRFS_DEV_EXTENT_KEY; 4373 4374 do { 4375 mutex_lock(&fs_info->delete_unused_bgs_mutex); 4376 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4377 if (ret < 0) { 4378 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4379 goto done; 4380 } 4381 4382 ret = btrfs_previous_item(root, path, 0, key.type); 4383 if (ret) 4384 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4385 if (ret < 0) 4386 goto done; 4387 if (ret) { 4388 ret = 0; 4389 btrfs_release_path(path); 4390 break; 4391 } 4392 4393 l = path->nodes[0]; 4394 slot = path->slots[0]; 4395 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4396 4397 if (key.objectid != device->devid) { 4398 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4399 btrfs_release_path(path); 4400 break; 4401 } 4402 4403 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4404 length = btrfs_dev_extent_length(l, dev_extent); 4405 4406 if (key.offset + length <= new_size) { 4407 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4408 btrfs_release_path(path); 4409 break; 4410 } 4411 4412 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4413 btrfs_release_path(path); 4414 4415 /* 4416 * We may be relocating the only data chunk we have, 4417 * which could potentially end up with losing data's 4418 * raid profile, so lets allocate an empty one in 4419 * advance. 4420 */ 4421 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 4422 if (ret < 0) { 4423 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4424 goto done; 4425 } 4426 4427 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 4428 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4429 if (ret && ret != -ENOSPC) 4430 goto done; 4431 if (ret == -ENOSPC) 4432 failed++; 4433 } while (key.offset-- > 0); 4434 4435 if (failed && !retried) { 4436 failed = 0; 4437 retried = true; 4438 goto again; 4439 } else if (failed && retried) { 4440 ret = -ENOSPC; 4441 goto done; 4442 } 4443 4444 /* Shrinking succeeded, else we would be at "done". */ 4445 trans = btrfs_start_transaction(root, 0); 4446 if (IS_ERR(trans)) { 4447 ret = PTR_ERR(trans); 4448 goto done; 4449 } 4450 4451 mutex_lock(&fs_info->chunk_mutex); 4452 4453 /* 4454 * We checked in the above loop all device extents that were already in 4455 * the device tree. However before we have updated the device's 4456 * total_bytes to the new size, we might have had chunk allocations that 4457 * have not complete yet (new block groups attached to transaction 4458 * handles), and therefore their device extents were not yet in the 4459 * device tree and we missed them in the loop above. So if we have any 4460 * pending chunk using a device extent that overlaps the device range 4461 * that we can not use anymore, commit the current transaction and 4462 * repeat the search on the device tree - this way we guarantee we will 4463 * not have chunks using device extents that end beyond 'new_size'. 4464 */ 4465 if (!checked_pending_chunks) { 4466 u64 start = new_size; 4467 u64 len = old_size - new_size; 4468 4469 if (contains_pending_extent(trans->transaction, device, 4470 &start, len)) { 4471 mutex_unlock(&fs_info->chunk_mutex); 4472 checked_pending_chunks = true; 4473 failed = 0; 4474 retried = false; 4475 ret = btrfs_commit_transaction(trans); 4476 if (ret) 4477 goto done; 4478 goto again; 4479 } 4480 } 4481 4482 btrfs_device_set_disk_total_bytes(device, new_size); 4483 if (list_empty(&device->resized_list)) 4484 list_add_tail(&device->resized_list, 4485 &fs_info->fs_devices->resized_devices); 4486 4487 WARN_ON(diff > old_total); 4488 btrfs_set_super_total_bytes(super_copy, 4489 round_down(old_total - diff, fs_info->sectorsize)); 4490 mutex_unlock(&fs_info->chunk_mutex); 4491 4492 /* Now btrfs_update_device() will change the on-disk size. */ 4493 ret = btrfs_update_device(trans, device); 4494 if (ret < 0) { 4495 btrfs_abort_transaction(trans, ret); 4496 btrfs_end_transaction(trans); 4497 } else { 4498 ret = btrfs_commit_transaction(trans); 4499 } 4500 done: 4501 btrfs_free_path(path); 4502 if (ret) { 4503 mutex_lock(&fs_info->chunk_mutex); 4504 btrfs_device_set_total_bytes(device, old_size); 4505 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 4506 device->fs_devices->total_rw_bytes += diff; 4507 atomic64_add(diff, &fs_info->free_chunk_space); 4508 mutex_unlock(&fs_info->chunk_mutex); 4509 } 4510 return ret; 4511 } 4512 4513 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 4514 struct btrfs_key *key, 4515 struct btrfs_chunk *chunk, int item_size) 4516 { 4517 struct btrfs_super_block *super_copy = fs_info->super_copy; 4518 struct btrfs_disk_key disk_key; 4519 u32 array_size; 4520 u8 *ptr; 4521 4522 mutex_lock(&fs_info->chunk_mutex); 4523 array_size = btrfs_super_sys_array_size(super_copy); 4524 if (array_size + item_size + sizeof(disk_key) 4525 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { 4526 mutex_unlock(&fs_info->chunk_mutex); 4527 return -EFBIG; 4528 } 4529 4530 ptr = super_copy->sys_chunk_array + array_size; 4531 btrfs_cpu_key_to_disk(&disk_key, key); 4532 memcpy(ptr, &disk_key, sizeof(disk_key)); 4533 ptr += sizeof(disk_key); 4534 memcpy(ptr, chunk, item_size); 4535 item_size += sizeof(disk_key); 4536 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 4537 mutex_unlock(&fs_info->chunk_mutex); 4538 4539 return 0; 4540 } 4541 4542 /* 4543 * sort the devices in descending order by max_avail, total_avail 4544 */ 4545 static int btrfs_cmp_device_info(const void *a, const void *b) 4546 { 4547 const struct btrfs_device_info *di_a = a; 4548 const struct btrfs_device_info *di_b = b; 4549 4550 if (di_a->max_avail > di_b->max_avail) 4551 return -1; 4552 if (di_a->max_avail < di_b->max_avail) 4553 return 1; 4554 if (di_a->total_avail > di_b->total_avail) 4555 return -1; 4556 if (di_a->total_avail < di_b->total_avail) 4557 return 1; 4558 return 0; 4559 } 4560 4561 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 4562 { 4563 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 4564 return; 4565 4566 btrfs_set_fs_incompat(info, RAID56); 4567 } 4568 4569 #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ 4570 - sizeof(struct btrfs_chunk)) \ 4571 / sizeof(struct btrfs_stripe) + 1) 4572 4573 #define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ 4574 - 2 * sizeof(struct btrfs_disk_key) \ 4575 - 2 * sizeof(struct btrfs_chunk)) \ 4576 / sizeof(struct btrfs_stripe) + 1) 4577 4578 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 4579 u64 start, u64 type) 4580 { 4581 struct btrfs_fs_info *info = trans->fs_info; 4582 struct btrfs_fs_devices *fs_devices = info->fs_devices; 4583 struct btrfs_device *device; 4584 struct map_lookup *map = NULL; 4585 struct extent_map_tree *em_tree; 4586 struct extent_map *em; 4587 struct btrfs_device_info *devices_info = NULL; 4588 u64 total_avail; 4589 int num_stripes; /* total number of stripes to allocate */ 4590 int data_stripes; /* number of stripes that count for 4591 block group size */ 4592 int sub_stripes; /* sub_stripes info for map */ 4593 int dev_stripes; /* stripes per dev */ 4594 int devs_max; /* max devs to use */ 4595 int devs_min; /* min devs needed */ 4596 int devs_increment; /* ndevs has to be a multiple of this */ 4597 int ncopies; /* how many copies to data has */ 4598 int ret; 4599 u64 max_stripe_size; 4600 u64 max_chunk_size; 4601 u64 stripe_size; 4602 u64 num_bytes; 4603 int ndevs; 4604 int i; 4605 int j; 4606 int index; 4607 4608 BUG_ON(!alloc_profile_is_valid(type, 0)); 4609 4610 if (list_empty(&fs_devices->alloc_list)) { 4611 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 4612 btrfs_debug(info, "%s: no writable device", __func__); 4613 return -ENOSPC; 4614 } 4615 4616 index = btrfs_bg_flags_to_raid_index(type); 4617 4618 sub_stripes = btrfs_raid_array[index].sub_stripes; 4619 dev_stripes = btrfs_raid_array[index].dev_stripes; 4620 devs_max = btrfs_raid_array[index].devs_max; 4621 devs_min = btrfs_raid_array[index].devs_min; 4622 devs_increment = btrfs_raid_array[index].devs_increment; 4623 ncopies = btrfs_raid_array[index].ncopies; 4624 4625 if (type & BTRFS_BLOCK_GROUP_DATA) { 4626 max_stripe_size = SZ_1G; 4627 max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; 4628 if (!devs_max) 4629 devs_max = BTRFS_MAX_DEVS(info); 4630 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 4631 /* for larger filesystems, use larger metadata chunks */ 4632 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) 4633 max_stripe_size = SZ_1G; 4634 else 4635 max_stripe_size = SZ_256M; 4636 max_chunk_size = max_stripe_size; 4637 if (!devs_max) 4638 devs_max = BTRFS_MAX_DEVS(info); 4639 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 4640 max_stripe_size = SZ_32M; 4641 max_chunk_size = 2 * max_stripe_size; 4642 if (!devs_max) 4643 devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; 4644 } else { 4645 btrfs_err(info, "invalid chunk type 0x%llx requested", 4646 type); 4647 BUG_ON(1); 4648 } 4649 4650 /* we don't want a chunk larger than 10% of writeable space */ 4651 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 4652 max_chunk_size); 4653 4654 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 4655 GFP_NOFS); 4656 if (!devices_info) 4657 return -ENOMEM; 4658 4659 /* 4660 * in the first pass through the devices list, we gather information 4661 * about the available holes on each device. 4662 */ 4663 ndevs = 0; 4664 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 4665 u64 max_avail; 4666 u64 dev_offset; 4667 4668 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4669 WARN(1, KERN_ERR 4670 "BTRFS: read-only device in alloc_list\n"); 4671 continue; 4672 } 4673 4674 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 4675 &device->dev_state) || 4676 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4677 continue; 4678 4679 if (device->total_bytes > device->bytes_used) 4680 total_avail = device->total_bytes - device->bytes_used; 4681 else 4682 total_avail = 0; 4683 4684 /* If there is no space on this device, skip it. */ 4685 if (total_avail == 0) 4686 continue; 4687 4688 ret = find_free_dev_extent(trans, device, 4689 max_stripe_size * dev_stripes, 4690 &dev_offset, &max_avail); 4691 if (ret && ret != -ENOSPC) 4692 goto error; 4693 4694 if (ret == 0) 4695 max_avail = max_stripe_size * dev_stripes; 4696 4697 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) { 4698 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 4699 btrfs_debug(info, 4700 "%s: devid %llu has no free space, have=%llu want=%u", 4701 __func__, device->devid, max_avail, 4702 BTRFS_STRIPE_LEN * dev_stripes); 4703 continue; 4704 } 4705 4706 if (ndevs == fs_devices->rw_devices) { 4707 WARN(1, "%s: found more than %llu devices\n", 4708 __func__, fs_devices->rw_devices); 4709 break; 4710 } 4711 devices_info[ndevs].dev_offset = dev_offset; 4712 devices_info[ndevs].max_avail = max_avail; 4713 devices_info[ndevs].total_avail = total_avail; 4714 devices_info[ndevs].dev = device; 4715 ++ndevs; 4716 } 4717 4718 /* 4719 * now sort the devices by hole size / available space 4720 */ 4721 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 4722 btrfs_cmp_device_info, NULL); 4723 4724 /* round down to number of usable stripes */ 4725 ndevs = round_down(ndevs, devs_increment); 4726 4727 if (ndevs < devs_min) { 4728 ret = -ENOSPC; 4729 if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 4730 btrfs_debug(info, 4731 "%s: not enough devices with free space: have=%d minimum required=%d", 4732 __func__, ndevs, devs_min); 4733 } 4734 goto error; 4735 } 4736 4737 ndevs = min(ndevs, devs_max); 4738 4739 /* 4740 * The primary goal is to maximize the number of stripes, so use as 4741 * many devices as possible, even if the stripes are not maximum sized. 4742 * 4743 * The DUP profile stores more than one stripe per device, the 4744 * max_avail is the total size so we have to adjust. 4745 */ 4746 stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); 4747 num_stripes = ndevs * dev_stripes; 4748 4749 /* 4750 * this will have to be fixed for RAID1 and RAID10 over 4751 * more drives 4752 */ 4753 data_stripes = num_stripes / ncopies; 4754 4755 if (type & BTRFS_BLOCK_GROUP_RAID5) 4756 data_stripes = num_stripes - 1; 4757 4758 if (type & BTRFS_BLOCK_GROUP_RAID6) 4759 data_stripes = num_stripes - 2; 4760 4761 /* 4762 * Use the number of data stripes to figure out how big this chunk 4763 * is really going to be in terms of logical address space, 4764 * and compare that answer with the max chunk size 4765 */ 4766 if (stripe_size * data_stripes > max_chunk_size) { 4767 stripe_size = div_u64(max_chunk_size, data_stripes); 4768 4769 /* bump the answer up to a 16MB boundary */ 4770 stripe_size = round_up(stripe_size, SZ_16M); 4771 4772 /* 4773 * But don't go higher than the limits we found while searching 4774 * for free extents 4775 */ 4776 stripe_size = min(devices_info[ndevs - 1].max_avail, 4777 stripe_size); 4778 } 4779 4780 /* align to BTRFS_STRIPE_LEN */ 4781 stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); 4782 4783 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 4784 if (!map) { 4785 ret = -ENOMEM; 4786 goto error; 4787 } 4788 map->num_stripes = num_stripes; 4789 4790 for (i = 0; i < ndevs; ++i) { 4791 for (j = 0; j < dev_stripes; ++j) { 4792 int s = i * dev_stripes + j; 4793 map->stripes[s].dev = devices_info[i].dev; 4794 map->stripes[s].physical = devices_info[i].dev_offset + 4795 j * stripe_size; 4796 } 4797 } 4798 map->stripe_len = BTRFS_STRIPE_LEN; 4799 map->io_align = BTRFS_STRIPE_LEN; 4800 map->io_width = BTRFS_STRIPE_LEN; 4801 map->type = type; 4802 map->sub_stripes = sub_stripes; 4803 4804 num_bytes = stripe_size * data_stripes; 4805 4806 trace_btrfs_chunk_alloc(info, map, start, num_bytes); 4807 4808 em = alloc_extent_map(); 4809 if (!em) { 4810 kfree(map); 4811 ret = -ENOMEM; 4812 goto error; 4813 } 4814 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 4815 em->map_lookup = map; 4816 em->start = start; 4817 em->len = num_bytes; 4818 em->block_start = 0; 4819 em->block_len = em->len; 4820 em->orig_block_len = stripe_size; 4821 4822 em_tree = &info->mapping_tree.map_tree; 4823 write_lock(&em_tree->lock); 4824 ret = add_extent_mapping(em_tree, em, 0); 4825 if (ret) { 4826 write_unlock(&em_tree->lock); 4827 free_extent_map(em); 4828 goto error; 4829 } 4830 4831 list_add_tail(&em->list, &trans->transaction->pending_chunks); 4832 refcount_inc(&em->refs); 4833 write_unlock(&em_tree->lock); 4834 4835 ret = btrfs_make_block_group(trans, 0, type, start, num_bytes); 4836 if (ret) 4837 goto error_del_extent; 4838 4839 for (i = 0; i < map->num_stripes; i++) { 4840 num_bytes = map->stripes[i].dev->bytes_used + stripe_size; 4841 btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); 4842 } 4843 4844 atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); 4845 4846 free_extent_map(em); 4847 check_raid56_incompat_flag(info, type); 4848 4849 kfree(devices_info); 4850 return 0; 4851 4852 error_del_extent: 4853 write_lock(&em_tree->lock); 4854 remove_extent_mapping(em_tree, em); 4855 write_unlock(&em_tree->lock); 4856 4857 /* One for our allocation */ 4858 free_extent_map(em); 4859 /* One for the tree reference */ 4860 free_extent_map(em); 4861 /* One for the pending_chunks list reference */ 4862 free_extent_map(em); 4863 error: 4864 kfree(devices_info); 4865 return ret; 4866 } 4867 4868 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, 4869 u64 chunk_offset, u64 chunk_size) 4870 { 4871 struct btrfs_fs_info *fs_info = trans->fs_info; 4872 struct btrfs_root *extent_root = fs_info->extent_root; 4873 struct btrfs_root *chunk_root = fs_info->chunk_root; 4874 struct btrfs_key key; 4875 struct btrfs_device *device; 4876 struct btrfs_chunk *chunk; 4877 struct btrfs_stripe *stripe; 4878 struct extent_map *em; 4879 struct map_lookup *map; 4880 size_t item_size; 4881 u64 dev_offset; 4882 u64 stripe_size; 4883 int i = 0; 4884 int ret = 0; 4885 4886 em = get_chunk_map(fs_info, chunk_offset, chunk_size); 4887 if (IS_ERR(em)) 4888 return PTR_ERR(em); 4889 4890 map = em->map_lookup; 4891 item_size = btrfs_chunk_item_size(map->num_stripes); 4892 stripe_size = em->orig_block_len; 4893 4894 chunk = kzalloc(item_size, GFP_NOFS); 4895 if (!chunk) { 4896 ret = -ENOMEM; 4897 goto out; 4898 } 4899 4900 /* 4901 * Take the device list mutex to prevent races with the final phase of 4902 * a device replace operation that replaces the device object associated 4903 * with the map's stripes, because the device object's id can change 4904 * at any time during that final phase of the device replace operation 4905 * (dev-replace.c:btrfs_dev_replace_finishing()). 4906 */ 4907 mutex_lock(&fs_info->fs_devices->device_list_mutex); 4908 for (i = 0; i < map->num_stripes; i++) { 4909 device = map->stripes[i].dev; 4910 dev_offset = map->stripes[i].physical; 4911 4912 ret = btrfs_update_device(trans, device); 4913 if (ret) 4914 break; 4915 ret = btrfs_alloc_dev_extent(trans, device, chunk_offset, 4916 dev_offset, stripe_size); 4917 if (ret) 4918 break; 4919 } 4920 if (ret) { 4921 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4922 goto out; 4923 } 4924 4925 stripe = &chunk->stripe; 4926 for (i = 0; i < map->num_stripes; i++) { 4927 device = map->stripes[i].dev; 4928 dev_offset = map->stripes[i].physical; 4929 4930 btrfs_set_stack_stripe_devid(stripe, device->devid); 4931 btrfs_set_stack_stripe_offset(stripe, dev_offset); 4932 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 4933 stripe++; 4934 } 4935 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4936 4937 btrfs_set_stack_chunk_length(chunk, chunk_size); 4938 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 4939 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 4940 btrfs_set_stack_chunk_type(chunk, map->type); 4941 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 4942 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 4943 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 4944 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 4945 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 4946 4947 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 4948 key.type = BTRFS_CHUNK_ITEM_KEY; 4949 key.offset = chunk_offset; 4950 4951 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 4952 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 4953 /* 4954 * TODO: Cleanup of inserted chunk root in case of 4955 * failure. 4956 */ 4957 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 4958 } 4959 4960 out: 4961 kfree(chunk); 4962 free_extent_map(em); 4963 return ret; 4964 } 4965 4966 /* 4967 * Chunk allocation falls into two parts. The first part does works 4968 * that make the new allocated chunk useable, but not do any operation 4969 * that modifies the chunk tree. The second part does the works that 4970 * require modifying the chunk tree. This division is important for the 4971 * bootstrap process of adding storage to a seed btrfs. 4972 */ 4973 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) 4974 { 4975 u64 chunk_offset; 4976 4977 lockdep_assert_held(&trans->fs_info->chunk_mutex); 4978 chunk_offset = find_next_chunk(trans->fs_info); 4979 return __btrfs_alloc_chunk(trans, chunk_offset, type); 4980 } 4981 4982 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 4983 struct btrfs_fs_info *fs_info) 4984 { 4985 u64 chunk_offset; 4986 u64 sys_chunk_offset; 4987 u64 alloc_profile; 4988 int ret; 4989 4990 chunk_offset = find_next_chunk(fs_info); 4991 alloc_profile = btrfs_metadata_alloc_profile(fs_info); 4992 ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); 4993 if (ret) 4994 return ret; 4995 4996 sys_chunk_offset = find_next_chunk(fs_info); 4997 alloc_profile = btrfs_system_alloc_profile(fs_info); 4998 ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); 4999 return ret; 5000 } 5001 5002 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5003 { 5004 int max_errors; 5005 5006 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5007 BTRFS_BLOCK_GROUP_RAID10 | 5008 BTRFS_BLOCK_GROUP_RAID5 | 5009 BTRFS_BLOCK_GROUP_DUP)) { 5010 max_errors = 1; 5011 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { 5012 max_errors = 2; 5013 } else { 5014 max_errors = 0; 5015 } 5016 5017 return max_errors; 5018 } 5019 5020 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5021 { 5022 struct extent_map *em; 5023 struct map_lookup *map; 5024 int readonly = 0; 5025 int miss_ndevs = 0; 5026 int i; 5027 5028 em = get_chunk_map(fs_info, chunk_offset, 1); 5029 if (IS_ERR(em)) 5030 return 1; 5031 5032 map = em->map_lookup; 5033 for (i = 0; i < map->num_stripes; i++) { 5034 if (test_bit(BTRFS_DEV_STATE_MISSING, 5035 &map->stripes[i].dev->dev_state)) { 5036 miss_ndevs++; 5037 continue; 5038 } 5039 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5040 &map->stripes[i].dev->dev_state)) { 5041 readonly = 1; 5042 goto end; 5043 } 5044 } 5045 5046 /* 5047 * If the number of missing devices is larger than max errors, 5048 * we can not write the data into that chunk successfully, so 5049 * set it readonly. 5050 */ 5051 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5052 readonly = 1; 5053 end: 5054 free_extent_map(em); 5055 return readonly; 5056 } 5057 5058 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 5059 { 5060 extent_map_tree_init(&tree->map_tree); 5061 } 5062 5063 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 5064 { 5065 struct extent_map *em; 5066 5067 while (1) { 5068 write_lock(&tree->map_tree.lock); 5069 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 5070 if (em) 5071 remove_extent_mapping(&tree->map_tree, em); 5072 write_unlock(&tree->map_tree.lock); 5073 if (!em) 5074 break; 5075 /* once for us */ 5076 free_extent_map(em); 5077 /* once for the tree */ 5078 free_extent_map(em); 5079 } 5080 } 5081 5082 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5083 { 5084 struct extent_map *em; 5085 struct map_lookup *map; 5086 int ret; 5087 5088 em = get_chunk_map(fs_info, logical, len); 5089 if (IS_ERR(em)) 5090 /* 5091 * We could return errors for these cases, but that could get 5092 * ugly and we'd probably do the same thing which is just not do 5093 * anything else and exit, so return 1 so the callers don't try 5094 * to use other copies. 5095 */ 5096 return 1; 5097 5098 map = em->map_lookup; 5099 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 5100 ret = map->num_stripes; 5101 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5102 ret = map->sub_stripes; 5103 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5104 ret = 2; 5105 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5106 /* 5107 * There could be two corrupted data stripes, we need 5108 * to loop retry in order to rebuild the correct data. 5109 * 5110 * Fail a stripe at a time on every retry except the 5111 * stripe under reconstruction. 5112 */ 5113 ret = map->num_stripes; 5114 else 5115 ret = 1; 5116 free_extent_map(em); 5117 5118 btrfs_dev_replace_read_lock(&fs_info->dev_replace); 5119 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 5120 fs_info->dev_replace.tgtdev) 5121 ret++; 5122 btrfs_dev_replace_read_unlock(&fs_info->dev_replace); 5123 5124 return ret; 5125 } 5126 5127 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 5128 u64 logical) 5129 { 5130 struct extent_map *em; 5131 struct map_lookup *map; 5132 unsigned long len = fs_info->sectorsize; 5133 5134 em = get_chunk_map(fs_info, logical, len); 5135 5136 if (!WARN_ON(IS_ERR(em))) { 5137 map = em->map_lookup; 5138 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5139 len = map->stripe_len * nr_data_stripes(map); 5140 free_extent_map(em); 5141 } 5142 return len; 5143 } 5144 5145 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5146 { 5147 struct extent_map *em; 5148 struct map_lookup *map; 5149 int ret = 0; 5150 5151 em = get_chunk_map(fs_info, logical, len); 5152 5153 if(!WARN_ON(IS_ERR(em))) { 5154 map = em->map_lookup; 5155 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5156 ret = 1; 5157 free_extent_map(em); 5158 } 5159 return ret; 5160 } 5161 5162 static int find_live_mirror(struct btrfs_fs_info *fs_info, 5163 struct map_lookup *map, int first, 5164 int dev_replace_is_ongoing) 5165 { 5166 int i; 5167 int num_stripes; 5168 int preferred_mirror; 5169 int tolerance; 5170 struct btrfs_device *srcdev; 5171 5172 ASSERT((map->type & 5173 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); 5174 5175 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5176 num_stripes = map->sub_stripes; 5177 else 5178 num_stripes = map->num_stripes; 5179 5180 preferred_mirror = first + current->pid % num_stripes; 5181 5182 if (dev_replace_is_ongoing && 5183 fs_info->dev_replace.cont_reading_from_srcdev_mode == 5184 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 5185 srcdev = fs_info->dev_replace.srcdev; 5186 else 5187 srcdev = NULL; 5188 5189 /* 5190 * try to avoid the drive that is the source drive for a 5191 * dev-replace procedure, only choose it if no other non-missing 5192 * mirror is available 5193 */ 5194 for (tolerance = 0; tolerance < 2; tolerance++) { 5195 if (map->stripes[preferred_mirror].dev->bdev && 5196 (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 5197 return preferred_mirror; 5198 for (i = first; i < first + num_stripes; i++) { 5199 if (map->stripes[i].dev->bdev && 5200 (tolerance || map->stripes[i].dev != srcdev)) 5201 return i; 5202 } 5203 } 5204 5205 /* we couldn't find one that doesn't fail. Just return something 5206 * and the io error handling code will clean up eventually 5207 */ 5208 return preferred_mirror; 5209 } 5210 5211 static inline int parity_smaller(u64 a, u64 b) 5212 { 5213 return a > b; 5214 } 5215 5216 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 5217 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) 5218 { 5219 struct btrfs_bio_stripe s; 5220 int i; 5221 u64 l; 5222 int again = 1; 5223 5224 while (again) { 5225 again = 0; 5226 for (i = 0; i < num_stripes - 1; i++) { 5227 if (parity_smaller(bbio->raid_map[i], 5228 bbio->raid_map[i+1])) { 5229 s = bbio->stripes[i]; 5230 l = bbio->raid_map[i]; 5231 bbio->stripes[i] = bbio->stripes[i+1]; 5232 bbio->raid_map[i] = bbio->raid_map[i+1]; 5233 bbio->stripes[i+1] = s; 5234 bbio->raid_map[i+1] = l; 5235 5236 again = 1; 5237 } 5238 } 5239 } 5240 } 5241 5242 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) 5243 { 5244 struct btrfs_bio *bbio = kzalloc( 5245 /* the size of the btrfs_bio */ 5246 sizeof(struct btrfs_bio) + 5247 /* plus the variable array for the stripes */ 5248 sizeof(struct btrfs_bio_stripe) * (total_stripes) + 5249 /* plus the variable array for the tgt dev */ 5250 sizeof(int) * (real_stripes) + 5251 /* 5252 * plus the raid_map, which includes both the tgt dev 5253 * and the stripes 5254 */ 5255 sizeof(u64) * (total_stripes), 5256 GFP_NOFS|__GFP_NOFAIL); 5257 5258 atomic_set(&bbio->error, 0); 5259 refcount_set(&bbio->refs, 1); 5260 5261 return bbio; 5262 } 5263 5264 void btrfs_get_bbio(struct btrfs_bio *bbio) 5265 { 5266 WARN_ON(!refcount_read(&bbio->refs)); 5267 refcount_inc(&bbio->refs); 5268 } 5269 5270 void btrfs_put_bbio(struct btrfs_bio *bbio) 5271 { 5272 if (!bbio) 5273 return; 5274 if (refcount_dec_and_test(&bbio->refs)) 5275 kfree(bbio); 5276 } 5277 5278 /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ 5279 /* 5280 * Please note that, discard won't be sent to target device of device 5281 * replace. 5282 */ 5283 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, 5284 u64 logical, u64 length, 5285 struct btrfs_bio **bbio_ret) 5286 { 5287 struct extent_map *em; 5288 struct map_lookup *map; 5289 struct btrfs_bio *bbio; 5290 u64 offset; 5291 u64 stripe_nr; 5292 u64 stripe_nr_end; 5293 u64 stripe_end_offset; 5294 u64 stripe_cnt; 5295 u64 stripe_len; 5296 u64 stripe_offset; 5297 u64 num_stripes; 5298 u32 stripe_index; 5299 u32 factor = 0; 5300 u32 sub_stripes = 0; 5301 u64 stripes_per_dev = 0; 5302 u32 remaining_stripes = 0; 5303 u32 last_stripe = 0; 5304 int ret = 0; 5305 int i; 5306 5307 /* discard always return a bbio */ 5308 ASSERT(bbio_ret); 5309 5310 em = get_chunk_map(fs_info, logical, length); 5311 if (IS_ERR(em)) 5312 return PTR_ERR(em); 5313 5314 map = em->map_lookup; 5315 /* we don't discard raid56 yet */ 5316 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5317 ret = -EOPNOTSUPP; 5318 goto out; 5319 } 5320 5321 offset = logical - em->start; 5322 length = min_t(u64, em->len - offset, length); 5323 5324 stripe_len = map->stripe_len; 5325 /* 5326 * stripe_nr counts the total number of stripes we have to stride 5327 * to get to this block 5328 */ 5329 stripe_nr = div64_u64(offset, stripe_len); 5330 5331 /* stripe_offset is the offset of this block in its stripe */ 5332 stripe_offset = offset - stripe_nr * stripe_len; 5333 5334 stripe_nr_end = round_up(offset + length, map->stripe_len); 5335 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 5336 stripe_cnt = stripe_nr_end - stripe_nr; 5337 stripe_end_offset = stripe_nr_end * map->stripe_len - 5338 (offset + length); 5339 /* 5340 * after this, stripe_nr is the number of stripes on this 5341 * device we have to walk to find the data, and stripe_index is 5342 * the number of our device in the stripe array 5343 */ 5344 num_stripes = 1; 5345 stripe_index = 0; 5346 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5347 BTRFS_BLOCK_GROUP_RAID10)) { 5348 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5349 sub_stripes = 1; 5350 else 5351 sub_stripes = map->sub_stripes; 5352 5353 factor = map->num_stripes / sub_stripes; 5354 num_stripes = min_t(u64, map->num_stripes, 5355 sub_stripes * stripe_cnt); 5356 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5357 stripe_index *= sub_stripes; 5358 stripes_per_dev = div_u64_rem(stripe_cnt, factor, 5359 &remaining_stripes); 5360 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 5361 last_stripe *= sub_stripes; 5362 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5363 BTRFS_BLOCK_GROUP_DUP)) { 5364 num_stripes = map->num_stripes; 5365 } else { 5366 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5367 &stripe_index); 5368 } 5369 5370 bbio = alloc_btrfs_bio(num_stripes, 0); 5371 if (!bbio) { 5372 ret = -ENOMEM; 5373 goto out; 5374 } 5375 5376 for (i = 0; i < num_stripes; i++) { 5377 bbio->stripes[i].physical = 5378 map->stripes[stripe_index].physical + 5379 stripe_offset + stripe_nr * map->stripe_len; 5380 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 5381 5382 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5383 BTRFS_BLOCK_GROUP_RAID10)) { 5384 bbio->stripes[i].length = stripes_per_dev * 5385 map->stripe_len; 5386 5387 if (i / sub_stripes < remaining_stripes) 5388 bbio->stripes[i].length += 5389 map->stripe_len; 5390 5391 /* 5392 * Special for the first stripe and 5393 * the last stripe: 5394 * 5395 * |-------|...|-------| 5396 * |----------| 5397 * off end_off 5398 */ 5399 if (i < sub_stripes) 5400 bbio->stripes[i].length -= 5401 stripe_offset; 5402 5403 if (stripe_index >= last_stripe && 5404 stripe_index <= (last_stripe + 5405 sub_stripes - 1)) 5406 bbio->stripes[i].length -= 5407 stripe_end_offset; 5408 5409 if (i == sub_stripes - 1) 5410 stripe_offset = 0; 5411 } else { 5412 bbio->stripes[i].length = length; 5413 } 5414 5415 stripe_index++; 5416 if (stripe_index == map->num_stripes) { 5417 stripe_index = 0; 5418 stripe_nr++; 5419 } 5420 } 5421 5422 *bbio_ret = bbio; 5423 bbio->map_type = map->type; 5424 bbio->num_stripes = num_stripes; 5425 out: 5426 free_extent_map(em); 5427 return ret; 5428 } 5429 5430 /* 5431 * In dev-replace case, for repair case (that's the only case where the mirror 5432 * is selected explicitly when calling btrfs_map_block), blocks left of the 5433 * left cursor can also be read from the target drive. 5434 * 5435 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the 5436 * array of stripes. 5437 * For READ, it also needs to be supported using the same mirror number. 5438 * 5439 * If the requested block is not left of the left cursor, EIO is returned. This 5440 * can happen because btrfs_num_copies() returns one more in the dev-replace 5441 * case. 5442 */ 5443 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, 5444 u64 logical, u64 length, 5445 u64 srcdev_devid, int *mirror_num, 5446 u64 *physical) 5447 { 5448 struct btrfs_bio *bbio = NULL; 5449 int num_stripes; 5450 int index_srcdev = 0; 5451 int found = 0; 5452 u64 physical_of_found = 0; 5453 int i; 5454 int ret = 0; 5455 5456 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 5457 logical, &length, &bbio, 0, 0); 5458 if (ret) { 5459 ASSERT(bbio == NULL); 5460 return ret; 5461 } 5462 5463 num_stripes = bbio->num_stripes; 5464 if (*mirror_num > num_stripes) { 5465 /* 5466 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, 5467 * that means that the requested area is not left of the left 5468 * cursor 5469 */ 5470 btrfs_put_bbio(bbio); 5471 return -EIO; 5472 } 5473 5474 /* 5475 * process the rest of the function using the mirror_num of the source 5476 * drive. Therefore look it up first. At the end, patch the device 5477 * pointer to the one of the target drive. 5478 */ 5479 for (i = 0; i < num_stripes; i++) { 5480 if (bbio->stripes[i].dev->devid != srcdev_devid) 5481 continue; 5482 5483 /* 5484 * In case of DUP, in order to keep it simple, only add the 5485 * mirror with the lowest physical address 5486 */ 5487 if (found && 5488 physical_of_found <= bbio->stripes[i].physical) 5489 continue; 5490 5491 index_srcdev = i; 5492 found = 1; 5493 physical_of_found = bbio->stripes[i].physical; 5494 } 5495 5496 btrfs_put_bbio(bbio); 5497 5498 ASSERT(found); 5499 if (!found) 5500 return -EIO; 5501 5502 *mirror_num = index_srcdev + 1; 5503 *physical = physical_of_found; 5504 return ret; 5505 } 5506 5507 static void handle_ops_on_dev_replace(enum btrfs_map_op op, 5508 struct btrfs_bio **bbio_ret, 5509 struct btrfs_dev_replace *dev_replace, 5510 int *num_stripes_ret, int *max_errors_ret) 5511 { 5512 struct btrfs_bio *bbio = *bbio_ret; 5513 u64 srcdev_devid = dev_replace->srcdev->devid; 5514 int tgtdev_indexes = 0; 5515 int num_stripes = *num_stripes_ret; 5516 int max_errors = *max_errors_ret; 5517 int i; 5518 5519 if (op == BTRFS_MAP_WRITE) { 5520 int index_where_to_add; 5521 5522 /* 5523 * duplicate the write operations while the dev replace 5524 * procedure is running. Since the copying of the old disk to 5525 * the new disk takes place at run time while the filesystem is 5526 * mounted writable, the regular write operations to the old 5527 * disk have to be duplicated to go to the new disk as well. 5528 * 5529 * Note that device->missing is handled by the caller, and that 5530 * the write to the old disk is already set up in the stripes 5531 * array. 5532 */ 5533 index_where_to_add = num_stripes; 5534 for (i = 0; i < num_stripes; i++) { 5535 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5536 /* write to new disk, too */ 5537 struct btrfs_bio_stripe *new = 5538 bbio->stripes + index_where_to_add; 5539 struct btrfs_bio_stripe *old = 5540 bbio->stripes + i; 5541 5542 new->physical = old->physical; 5543 new->length = old->length; 5544 new->dev = dev_replace->tgtdev; 5545 bbio->tgtdev_map[i] = index_where_to_add; 5546 index_where_to_add++; 5547 max_errors++; 5548 tgtdev_indexes++; 5549 } 5550 } 5551 num_stripes = index_where_to_add; 5552 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 5553 int index_srcdev = 0; 5554 int found = 0; 5555 u64 physical_of_found = 0; 5556 5557 /* 5558 * During the dev-replace procedure, the target drive can also 5559 * be used to read data in case it is needed to repair a corrupt 5560 * block elsewhere. This is possible if the requested area is 5561 * left of the left cursor. In this area, the target drive is a 5562 * full copy of the source drive. 5563 */ 5564 for (i = 0; i < num_stripes; i++) { 5565 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5566 /* 5567 * In case of DUP, in order to keep it simple, 5568 * only add the mirror with the lowest physical 5569 * address 5570 */ 5571 if (found && 5572 physical_of_found <= 5573 bbio->stripes[i].physical) 5574 continue; 5575 index_srcdev = i; 5576 found = 1; 5577 physical_of_found = bbio->stripes[i].physical; 5578 } 5579 } 5580 if (found) { 5581 struct btrfs_bio_stripe *tgtdev_stripe = 5582 bbio->stripes + num_stripes; 5583 5584 tgtdev_stripe->physical = physical_of_found; 5585 tgtdev_stripe->length = 5586 bbio->stripes[index_srcdev].length; 5587 tgtdev_stripe->dev = dev_replace->tgtdev; 5588 bbio->tgtdev_map[index_srcdev] = num_stripes; 5589 5590 tgtdev_indexes++; 5591 num_stripes++; 5592 } 5593 } 5594 5595 *num_stripes_ret = num_stripes; 5596 *max_errors_ret = max_errors; 5597 bbio->num_tgtdevs = tgtdev_indexes; 5598 *bbio_ret = bbio; 5599 } 5600 5601 static bool need_full_stripe(enum btrfs_map_op op) 5602 { 5603 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 5604 } 5605 5606 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 5607 enum btrfs_map_op op, 5608 u64 logical, u64 *length, 5609 struct btrfs_bio **bbio_ret, 5610 int mirror_num, int need_raid_map) 5611 { 5612 struct extent_map *em; 5613 struct map_lookup *map; 5614 u64 offset; 5615 u64 stripe_offset; 5616 u64 stripe_nr; 5617 u64 stripe_len; 5618 u32 stripe_index; 5619 int i; 5620 int ret = 0; 5621 int num_stripes; 5622 int max_errors = 0; 5623 int tgtdev_indexes = 0; 5624 struct btrfs_bio *bbio = NULL; 5625 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 5626 int dev_replace_is_ongoing = 0; 5627 int num_alloc_stripes; 5628 int patch_the_first_stripe_for_dev_replace = 0; 5629 u64 physical_to_patch_in_first_stripe = 0; 5630 u64 raid56_full_stripe_start = (u64)-1; 5631 5632 if (op == BTRFS_MAP_DISCARD) 5633 return __btrfs_map_block_for_discard(fs_info, logical, 5634 *length, bbio_ret); 5635 5636 em = get_chunk_map(fs_info, logical, *length); 5637 if (IS_ERR(em)) 5638 return PTR_ERR(em); 5639 5640 map = em->map_lookup; 5641 offset = logical - em->start; 5642 5643 stripe_len = map->stripe_len; 5644 stripe_nr = offset; 5645 /* 5646 * stripe_nr counts the total number of stripes we have to stride 5647 * to get to this block 5648 */ 5649 stripe_nr = div64_u64(stripe_nr, stripe_len); 5650 5651 stripe_offset = stripe_nr * stripe_len; 5652 if (offset < stripe_offset) { 5653 btrfs_crit(fs_info, 5654 "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu", 5655 stripe_offset, offset, em->start, logical, 5656 stripe_len); 5657 free_extent_map(em); 5658 return -EINVAL; 5659 } 5660 5661 /* stripe_offset is the offset of this block in its stripe*/ 5662 stripe_offset = offset - stripe_offset; 5663 5664 /* if we're here for raid56, we need to know the stripe aligned start */ 5665 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5666 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 5667 raid56_full_stripe_start = offset; 5668 5669 /* allow a write of a full stripe, but make sure we don't 5670 * allow straddling of stripes 5671 */ 5672 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 5673 full_stripe_len); 5674 raid56_full_stripe_start *= full_stripe_len; 5675 } 5676 5677 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 5678 u64 max_len; 5679 /* For writes to RAID[56], allow a full stripeset across all disks. 5680 For other RAID types and for RAID[56] reads, just allow a single 5681 stripe (on a single disk). */ 5682 if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 5683 (op == BTRFS_MAP_WRITE)) { 5684 max_len = stripe_len * nr_data_stripes(map) - 5685 (offset - raid56_full_stripe_start); 5686 } else { 5687 /* we limit the length of each bio to what fits in a stripe */ 5688 max_len = stripe_len - stripe_offset; 5689 } 5690 *length = min_t(u64, em->len - offset, max_len); 5691 } else { 5692 *length = em->len - offset; 5693 } 5694 5695 /* This is for when we're called from btrfs_merge_bio_hook() and all 5696 it cares about is the length */ 5697 if (!bbio_ret) 5698 goto out; 5699 5700 btrfs_dev_replace_read_lock(dev_replace); 5701 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 5702 if (!dev_replace_is_ongoing) 5703 btrfs_dev_replace_read_unlock(dev_replace); 5704 else 5705 btrfs_dev_replace_set_lock_blocking(dev_replace); 5706 5707 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 5708 !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 5709 ret = get_extra_mirror_from_replace(fs_info, logical, *length, 5710 dev_replace->srcdev->devid, 5711 &mirror_num, 5712 &physical_to_patch_in_first_stripe); 5713 if (ret) 5714 goto out; 5715 else 5716 patch_the_first_stripe_for_dev_replace = 1; 5717 } else if (mirror_num > map->num_stripes) { 5718 mirror_num = 0; 5719 } 5720 5721 num_stripes = 1; 5722 stripe_index = 0; 5723 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5724 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5725 &stripe_index); 5726 if (!need_full_stripe(op)) 5727 mirror_num = 1; 5728 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 5729 if (need_full_stripe(op)) 5730 num_stripes = map->num_stripes; 5731 else if (mirror_num) 5732 stripe_index = mirror_num - 1; 5733 else { 5734 stripe_index = find_live_mirror(fs_info, map, 0, 5735 dev_replace_is_ongoing); 5736 mirror_num = stripe_index + 1; 5737 } 5738 5739 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 5740 if (need_full_stripe(op)) { 5741 num_stripes = map->num_stripes; 5742 } else if (mirror_num) { 5743 stripe_index = mirror_num - 1; 5744 } else { 5745 mirror_num = 1; 5746 } 5747 5748 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5749 u32 factor = map->num_stripes / map->sub_stripes; 5750 5751 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5752 stripe_index *= map->sub_stripes; 5753 5754 if (need_full_stripe(op)) 5755 num_stripes = map->sub_stripes; 5756 else if (mirror_num) 5757 stripe_index += mirror_num - 1; 5758 else { 5759 int old_stripe_index = stripe_index; 5760 stripe_index = find_live_mirror(fs_info, map, 5761 stripe_index, 5762 dev_replace_is_ongoing); 5763 mirror_num = stripe_index - old_stripe_index + 1; 5764 } 5765 5766 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5767 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 5768 /* push stripe_nr back to the start of the full stripe */ 5769 stripe_nr = div64_u64(raid56_full_stripe_start, 5770 stripe_len * nr_data_stripes(map)); 5771 5772 /* RAID[56] write or recovery. Return all stripes */ 5773 num_stripes = map->num_stripes; 5774 max_errors = nr_parity_stripes(map); 5775 5776 *length = map->stripe_len; 5777 stripe_index = 0; 5778 stripe_offset = 0; 5779 } else { 5780 /* 5781 * Mirror #0 or #1 means the original data block. 5782 * Mirror #2 is RAID5 parity block. 5783 * Mirror #3 is RAID6 Q block. 5784 */ 5785 stripe_nr = div_u64_rem(stripe_nr, 5786 nr_data_stripes(map), &stripe_index); 5787 if (mirror_num > 1) 5788 stripe_index = nr_data_stripes(map) + 5789 mirror_num - 2; 5790 5791 /* We distribute the parity blocks across stripes */ 5792 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 5793 &stripe_index); 5794 if (!need_full_stripe(op) && mirror_num <= 1) 5795 mirror_num = 1; 5796 } 5797 } else { 5798 /* 5799 * after this, stripe_nr is the number of stripes on this 5800 * device we have to walk to find the data, and stripe_index is 5801 * the number of our device in the stripe array 5802 */ 5803 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5804 &stripe_index); 5805 mirror_num = stripe_index + 1; 5806 } 5807 if (stripe_index >= map->num_stripes) { 5808 btrfs_crit(fs_info, 5809 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 5810 stripe_index, map->num_stripes); 5811 ret = -EINVAL; 5812 goto out; 5813 } 5814 5815 num_alloc_stripes = num_stripes; 5816 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 5817 if (op == BTRFS_MAP_WRITE) 5818 num_alloc_stripes <<= 1; 5819 if (op == BTRFS_MAP_GET_READ_MIRRORS) 5820 num_alloc_stripes++; 5821 tgtdev_indexes = num_stripes; 5822 } 5823 5824 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); 5825 if (!bbio) { 5826 ret = -ENOMEM; 5827 goto out; 5828 } 5829 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) 5830 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); 5831 5832 /* build raid_map */ 5833 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 5834 (need_full_stripe(op) || mirror_num > 1)) { 5835 u64 tmp; 5836 unsigned rot; 5837 5838 bbio->raid_map = (u64 *)((void *)bbio->stripes + 5839 sizeof(struct btrfs_bio_stripe) * 5840 num_alloc_stripes + 5841 sizeof(int) * tgtdev_indexes); 5842 5843 /* Work out the disk rotation on this stripe-set */ 5844 div_u64_rem(stripe_nr, num_stripes, &rot); 5845 5846 /* Fill in the logical address of each stripe */ 5847 tmp = stripe_nr * nr_data_stripes(map); 5848 for (i = 0; i < nr_data_stripes(map); i++) 5849 bbio->raid_map[(i+rot) % num_stripes] = 5850 em->start + (tmp + i) * map->stripe_len; 5851 5852 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 5853 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5854 bbio->raid_map[(i+rot+1) % num_stripes] = 5855 RAID6_Q_STRIPE; 5856 } 5857 5858 5859 for (i = 0; i < num_stripes; i++) { 5860 bbio->stripes[i].physical = 5861 map->stripes[stripe_index].physical + 5862 stripe_offset + 5863 stripe_nr * map->stripe_len; 5864 bbio->stripes[i].dev = 5865 map->stripes[stripe_index].dev; 5866 stripe_index++; 5867 } 5868 5869 if (need_full_stripe(op)) 5870 max_errors = btrfs_chunk_max_errors(map); 5871 5872 if (bbio->raid_map) 5873 sort_parity_stripes(bbio, num_stripes); 5874 5875 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 5876 need_full_stripe(op)) { 5877 handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, 5878 &max_errors); 5879 } 5880 5881 *bbio_ret = bbio; 5882 bbio->map_type = map->type; 5883 bbio->num_stripes = num_stripes; 5884 bbio->max_errors = max_errors; 5885 bbio->mirror_num = mirror_num; 5886 5887 /* 5888 * this is the case that REQ_READ && dev_replace_is_ongoing && 5889 * mirror_num == num_stripes + 1 && dev_replace target drive is 5890 * available as a mirror 5891 */ 5892 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 5893 WARN_ON(num_stripes > 1); 5894 bbio->stripes[0].dev = dev_replace->tgtdev; 5895 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 5896 bbio->mirror_num = map->num_stripes + 1; 5897 } 5898 out: 5899 if (dev_replace_is_ongoing) { 5900 btrfs_dev_replace_clear_lock_blocking(dev_replace); 5901 btrfs_dev_replace_read_unlock(dev_replace); 5902 } 5903 free_extent_map(em); 5904 return ret; 5905 } 5906 5907 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 5908 u64 logical, u64 *length, 5909 struct btrfs_bio **bbio_ret, int mirror_num) 5910 { 5911 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 5912 mirror_num, 0); 5913 } 5914 5915 /* For Scrub/replace */ 5916 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 5917 u64 logical, u64 *length, 5918 struct btrfs_bio **bbio_ret) 5919 { 5920 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); 5921 } 5922 5923 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, 5924 u64 physical, u64 **logical, int *naddrs, int *stripe_len) 5925 { 5926 struct extent_map *em; 5927 struct map_lookup *map; 5928 u64 *buf; 5929 u64 bytenr; 5930 u64 length; 5931 u64 stripe_nr; 5932 u64 rmap_len; 5933 int i, j, nr = 0; 5934 5935 em = get_chunk_map(fs_info, chunk_start, 1); 5936 if (IS_ERR(em)) 5937 return -EIO; 5938 5939 map = em->map_lookup; 5940 length = em->len; 5941 rmap_len = map->stripe_len; 5942 5943 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5944 length = div_u64(length, map->num_stripes / map->sub_stripes); 5945 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5946 length = div_u64(length, map->num_stripes); 5947 else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5948 length = div_u64(length, nr_data_stripes(map)); 5949 rmap_len = map->stripe_len * nr_data_stripes(map); 5950 } 5951 5952 buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); 5953 BUG_ON(!buf); /* -ENOMEM */ 5954 5955 for (i = 0; i < map->num_stripes; i++) { 5956 if (map->stripes[i].physical > physical || 5957 map->stripes[i].physical + length <= physical) 5958 continue; 5959 5960 stripe_nr = physical - map->stripes[i].physical; 5961 stripe_nr = div64_u64(stripe_nr, map->stripe_len); 5962 5963 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5964 stripe_nr = stripe_nr * map->num_stripes + i; 5965 stripe_nr = div_u64(stripe_nr, map->sub_stripes); 5966 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5967 stripe_nr = stripe_nr * map->num_stripes + i; 5968 } /* else if RAID[56], multiply by nr_data_stripes(). 5969 * Alternatively, just use rmap_len below instead of 5970 * map->stripe_len */ 5971 5972 bytenr = chunk_start + stripe_nr * rmap_len; 5973 WARN_ON(nr >= map->num_stripes); 5974 for (j = 0; j < nr; j++) { 5975 if (buf[j] == bytenr) 5976 break; 5977 } 5978 if (j == nr) { 5979 WARN_ON(nr >= map->num_stripes); 5980 buf[nr++] = bytenr; 5981 } 5982 } 5983 5984 *logical = buf; 5985 *naddrs = nr; 5986 *stripe_len = rmap_len; 5987 5988 free_extent_map(em); 5989 return 0; 5990 } 5991 5992 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) 5993 { 5994 bio->bi_private = bbio->private; 5995 bio->bi_end_io = bbio->end_io; 5996 bio_endio(bio); 5997 5998 btrfs_put_bbio(bbio); 5999 } 6000 6001 static void btrfs_end_bio(struct bio *bio) 6002 { 6003 struct btrfs_bio *bbio = bio->bi_private; 6004 int is_orig_bio = 0; 6005 6006 if (bio->bi_status) { 6007 atomic_inc(&bbio->error); 6008 if (bio->bi_status == BLK_STS_IOERR || 6009 bio->bi_status == BLK_STS_TARGET) { 6010 unsigned int stripe_index = 6011 btrfs_io_bio(bio)->stripe_index; 6012 struct btrfs_device *dev; 6013 6014 BUG_ON(stripe_index >= bbio->num_stripes); 6015 dev = bbio->stripes[stripe_index].dev; 6016 if (dev->bdev) { 6017 if (bio_op(bio) == REQ_OP_WRITE) 6018 btrfs_dev_stat_inc_and_print(dev, 6019 BTRFS_DEV_STAT_WRITE_ERRS); 6020 else 6021 btrfs_dev_stat_inc_and_print(dev, 6022 BTRFS_DEV_STAT_READ_ERRS); 6023 if (bio->bi_opf & REQ_PREFLUSH) 6024 btrfs_dev_stat_inc_and_print(dev, 6025 BTRFS_DEV_STAT_FLUSH_ERRS); 6026 } 6027 } 6028 } 6029 6030 if (bio == bbio->orig_bio) 6031 is_orig_bio = 1; 6032 6033 btrfs_bio_counter_dec(bbio->fs_info); 6034 6035 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6036 if (!is_orig_bio) { 6037 bio_put(bio); 6038 bio = bbio->orig_bio; 6039 } 6040 6041 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6042 /* only send an error to the higher layers if it is 6043 * beyond the tolerance of the btrfs bio 6044 */ 6045 if (atomic_read(&bbio->error) > bbio->max_errors) { 6046 bio->bi_status = BLK_STS_IOERR; 6047 } else { 6048 /* 6049 * this bio is actually up to date, we didn't 6050 * go over the max number of errors 6051 */ 6052 bio->bi_status = BLK_STS_OK; 6053 } 6054 6055 btrfs_end_bbio(bbio, bio); 6056 } else if (!is_orig_bio) { 6057 bio_put(bio); 6058 } 6059 } 6060 6061 /* 6062 * see run_scheduled_bios for a description of why bios are collected for 6063 * async submit. 6064 * 6065 * This will add one bio to the pending list for a device and make sure 6066 * the work struct is scheduled. 6067 */ 6068 static noinline void btrfs_schedule_bio(struct btrfs_device *device, 6069 struct bio *bio) 6070 { 6071 struct btrfs_fs_info *fs_info = device->fs_info; 6072 int should_queue = 1; 6073 struct btrfs_pending_bios *pending_bios; 6074 6075 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || 6076 !device->bdev) { 6077 bio_io_error(bio); 6078 return; 6079 } 6080 6081 /* don't bother with additional async steps for reads, right now */ 6082 if (bio_op(bio) == REQ_OP_READ) { 6083 btrfsic_submit_bio(bio); 6084 return; 6085 } 6086 6087 WARN_ON(bio->bi_next); 6088 bio->bi_next = NULL; 6089 6090 spin_lock(&device->io_lock); 6091 if (op_is_sync(bio->bi_opf)) 6092 pending_bios = &device->pending_sync_bios; 6093 else 6094 pending_bios = &device->pending_bios; 6095 6096 if (pending_bios->tail) 6097 pending_bios->tail->bi_next = bio; 6098 6099 pending_bios->tail = bio; 6100 if (!pending_bios->head) 6101 pending_bios->head = bio; 6102 if (device->running_pending) 6103 should_queue = 0; 6104 6105 spin_unlock(&device->io_lock); 6106 6107 if (should_queue) 6108 btrfs_queue_work(fs_info->submit_workers, &device->work); 6109 } 6110 6111 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, 6112 u64 physical, int dev_nr, int async) 6113 { 6114 struct btrfs_device *dev = bbio->stripes[dev_nr].dev; 6115 struct btrfs_fs_info *fs_info = bbio->fs_info; 6116 6117 bio->bi_private = bbio; 6118 btrfs_io_bio(bio)->stripe_index = dev_nr; 6119 bio->bi_end_io = btrfs_end_bio; 6120 bio->bi_iter.bi_sector = physical >> 9; 6121 btrfs_debug_in_rcu(fs_info, 6122 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6123 bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector, 6124 (u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, 6125 bio->bi_iter.bi_size); 6126 bio_set_dev(bio, dev->bdev); 6127 6128 btrfs_bio_counter_inc_noblocked(fs_info); 6129 6130 if (async) 6131 btrfs_schedule_bio(dev, bio); 6132 else 6133 btrfsic_submit_bio(bio); 6134 } 6135 6136 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 6137 { 6138 atomic_inc(&bbio->error); 6139 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6140 /* Should be the original bio. */ 6141 WARN_ON(bio != bbio->orig_bio); 6142 6143 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6144 bio->bi_iter.bi_sector = logical >> 9; 6145 if (atomic_read(&bbio->error) > bbio->max_errors) 6146 bio->bi_status = BLK_STS_IOERR; 6147 else 6148 bio->bi_status = BLK_STS_OK; 6149 btrfs_end_bbio(bbio, bio); 6150 } 6151 } 6152 6153 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 6154 int mirror_num, int async_submit) 6155 { 6156 struct btrfs_device *dev; 6157 struct bio *first_bio = bio; 6158 u64 logical = (u64)bio->bi_iter.bi_sector << 9; 6159 u64 length = 0; 6160 u64 map_length; 6161 int ret; 6162 int dev_nr; 6163 int total_devs; 6164 struct btrfs_bio *bbio = NULL; 6165 6166 length = bio->bi_iter.bi_size; 6167 map_length = length; 6168 6169 btrfs_bio_counter_inc_blocked(fs_info); 6170 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, 6171 &map_length, &bbio, mirror_num, 1); 6172 if (ret) { 6173 btrfs_bio_counter_dec(fs_info); 6174 return errno_to_blk_status(ret); 6175 } 6176 6177 total_devs = bbio->num_stripes; 6178 bbio->orig_bio = first_bio; 6179 bbio->private = first_bio->bi_private; 6180 bbio->end_io = first_bio->bi_end_io; 6181 bbio->fs_info = fs_info; 6182 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 6183 6184 if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 6185 ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { 6186 /* In this case, map_length has been set to the length of 6187 a single stripe; not the whole write */ 6188 if (bio_op(bio) == REQ_OP_WRITE) { 6189 ret = raid56_parity_write(fs_info, bio, bbio, 6190 map_length); 6191 } else { 6192 ret = raid56_parity_recover(fs_info, bio, bbio, 6193 map_length, mirror_num, 1); 6194 } 6195 6196 btrfs_bio_counter_dec(fs_info); 6197 return errno_to_blk_status(ret); 6198 } 6199 6200 if (map_length < length) { 6201 btrfs_crit(fs_info, 6202 "mapping failed logical %llu bio len %llu len %llu", 6203 logical, length, map_length); 6204 BUG(); 6205 } 6206 6207 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { 6208 dev = bbio->stripes[dev_nr].dev; 6209 if (!dev || !dev->bdev || 6210 (bio_op(first_bio) == REQ_OP_WRITE && 6211 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { 6212 bbio_error(bbio, first_bio, logical); 6213 continue; 6214 } 6215 6216 if (dev_nr < total_devs - 1) 6217 bio = btrfs_bio_clone(first_bio); 6218 else 6219 bio = first_bio; 6220 6221 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, 6222 dev_nr, async_submit); 6223 } 6224 btrfs_bio_counter_dec(fs_info); 6225 return BLK_STS_OK; 6226 } 6227 6228 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 6229 u8 *uuid, u8 *fsid) 6230 { 6231 struct btrfs_device *device; 6232 struct btrfs_fs_devices *cur_devices; 6233 6234 cur_devices = fs_info->fs_devices; 6235 while (cur_devices) { 6236 if (!fsid || 6237 !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) { 6238 device = find_device(cur_devices, devid, uuid); 6239 if (device) 6240 return device; 6241 } 6242 cur_devices = cur_devices->seed; 6243 } 6244 return NULL; 6245 } 6246 6247 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6248 u64 devid, u8 *dev_uuid) 6249 { 6250 struct btrfs_device *device; 6251 6252 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6253 if (IS_ERR(device)) 6254 return device; 6255 6256 list_add(&device->dev_list, &fs_devices->devices); 6257 device->fs_devices = fs_devices; 6258 fs_devices->num_devices++; 6259 6260 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6261 fs_devices->missing_devices++; 6262 6263 return device; 6264 } 6265 6266 /** 6267 * btrfs_alloc_device - allocate struct btrfs_device 6268 * @fs_info: used only for generating a new devid, can be NULL if 6269 * devid is provided (i.e. @devid != NULL). 6270 * @devid: a pointer to devid for this device. If NULL a new devid 6271 * is generated. 6272 * @uuid: a pointer to UUID for this device. If NULL a new UUID 6273 * is generated. 6274 * 6275 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 6276 * on error. Returned struct is not linked onto any lists and must be 6277 * destroyed with btrfs_free_device. 6278 */ 6279 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 6280 const u64 *devid, 6281 const u8 *uuid) 6282 { 6283 struct btrfs_device *dev; 6284 u64 tmp; 6285 6286 if (WARN_ON(!devid && !fs_info)) 6287 return ERR_PTR(-EINVAL); 6288 6289 dev = __alloc_device(); 6290 if (IS_ERR(dev)) 6291 return dev; 6292 6293 if (devid) 6294 tmp = *devid; 6295 else { 6296 int ret; 6297 6298 ret = find_next_devid(fs_info, &tmp); 6299 if (ret) { 6300 btrfs_free_device(dev); 6301 return ERR_PTR(ret); 6302 } 6303 } 6304 dev->devid = tmp; 6305 6306 if (uuid) 6307 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 6308 else 6309 generate_random_uuid(dev->uuid); 6310 6311 btrfs_init_work(&dev->work, btrfs_submit_helper, 6312 pending_bios_fn, NULL, NULL); 6313 6314 return dev; 6315 } 6316 6317 /* Return -EIO if any error, otherwise return 0. */ 6318 static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, 6319 struct extent_buffer *leaf, 6320 struct btrfs_chunk *chunk, u64 logical) 6321 { 6322 u64 length; 6323 u64 stripe_len; 6324 u16 num_stripes; 6325 u16 sub_stripes; 6326 u64 type; 6327 u64 features; 6328 bool mixed = false; 6329 6330 length = btrfs_chunk_length(leaf, chunk); 6331 stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6332 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6333 sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6334 type = btrfs_chunk_type(leaf, chunk); 6335 6336 if (!num_stripes) { 6337 btrfs_err(fs_info, "invalid chunk num_stripes: %u", 6338 num_stripes); 6339 return -EIO; 6340 } 6341 if (!IS_ALIGNED(logical, fs_info->sectorsize)) { 6342 btrfs_err(fs_info, "invalid chunk logical %llu", logical); 6343 return -EIO; 6344 } 6345 if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { 6346 btrfs_err(fs_info, "invalid chunk sectorsize %u", 6347 btrfs_chunk_sector_size(leaf, chunk)); 6348 return -EIO; 6349 } 6350 if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { 6351 btrfs_err(fs_info, "invalid chunk length %llu", length); 6352 return -EIO; 6353 } 6354 if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { 6355 btrfs_err(fs_info, "invalid chunk stripe length: %llu", 6356 stripe_len); 6357 return -EIO; 6358 } 6359 if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & 6360 type) { 6361 btrfs_err(fs_info, "unrecognized chunk type: %llu", 6362 ~(BTRFS_BLOCK_GROUP_TYPE_MASK | 6363 BTRFS_BLOCK_GROUP_PROFILE_MASK) & 6364 btrfs_chunk_type(leaf, chunk)); 6365 return -EIO; 6366 } 6367 6368 if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { 6369 btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type); 6370 return -EIO; 6371 } 6372 6373 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && 6374 (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { 6375 btrfs_err(fs_info, 6376 "system chunk with data or metadata type: 0x%llx", type); 6377 return -EIO; 6378 } 6379 6380 features = btrfs_super_incompat_flags(fs_info->super_copy); 6381 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 6382 mixed = true; 6383 6384 if (!mixed) { 6385 if ((type & BTRFS_BLOCK_GROUP_METADATA) && 6386 (type & BTRFS_BLOCK_GROUP_DATA)) { 6387 btrfs_err(fs_info, 6388 "mixed chunk type in non-mixed mode: 0x%llx", type); 6389 return -EIO; 6390 } 6391 } 6392 6393 if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || 6394 (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || 6395 (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || 6396 (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || 6397 (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || 6398 ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && 6399 num_stripes != 1)) { 6400 btrfs_err(fs_info, 6401 "invalid num_stripes:sub_stripes %u:%u for profile %llu", 6402 num_stripes, sub_stripes, 6403 type & BTRFS_BLOCK_GROUP_PROFILE_MASK); 6404 return -EIO; 6405 } 6406 6407 return 0; 6408 } 6409 6410 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 6411 u64 devid, u8 *uuid, bool error) 6412 { 6413 if (error) 6414 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 6415 devid, uuid); 6416 else 6417 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 6418 devid, uuid); 6419 } 6420 6421 static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, 6422 struct extent_buffer *leaf, 6423 struct btrfs_chunk *chunk) 6424 { 6425 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 6426 struct map_lookup *map; 6427 struct extent_map *em; 6428 u64 logical; 6429 u64 length; 6430 u64 devid; 6431 u8 uuid[BTRFS_UUID_SIZE]; 6432 int num_stripes; 6433 int ret; 6434 int i; 6435 6436 logical = key->offset; 6437 length = btrfs_chunk_length(leaf, chunk); 6438 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6439 6440 ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical); 6441 if (ret) 6442 return ret; 6443 6444 read_lock(&map_tree->map_tree.lock); 6445 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 6446 read_unlock(&map_tree->map_tree.lock); 6447 6448 /* already mapped? */ 6449 if (em && em->start <= logical && em->start + em->len > logical) { 6450 free_extent_map(em); 6451 return 0; 6452 } else if (em) { 6453 free_extent_map(em); 6454 } 6455 6456 em = alloc_extent_map(); 6457 if (!em) 6458 return -ENOMEM; 6459 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 6460 if (!map) { 6461 free_extent_map(em); 6462 return -ENOMEM; 6463 } 6464 6465 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 6466 em->map_lookup = map; 6467 em->start = logical; 6468 em->len = length; 6469 em->orig_start = 0; 6470 em->block_start = 0; 6471 em->block_len = em->len; 6472 6473 map->num_stripes = num_stripes; 6474 map->io_width = btrfs_chunk_io_width(leaf, chunk); 6475 map->io_align = btrfs_chunk_io_align(leaf, chunk); 6476 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6477 map->type = btrfs_chunk_type(leaf, chunk); 6478 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6479 map->verified_stripes = 0; 6480 for (i = 0; i < num_stripes; i++) { 6481 map->stripes[i].physical = 6482 btrfs_stripe_offset_nr(leaf, chunk, i); 6483 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 6484 read_extent_buffer(leaf, uuid, (unsigned long) 6485 btrfs_stripe_dev_uuid_nr(chunk, i), 6486 BTRFS_UUID_SIZE); 6487 map->stripes[i].dev = btrfs_find_device(fs_info, devid, 6488 uuid, NULL); 6489 if (!map->stripes[i].dev && 6490 !btrfs_test_opt(fs_info, DEGRADED)) { 6491 free_extent_map(em); 6492 btrfs_report_missing_device(fs_info, devid, uuid, true); 6493 return -ENOENT; 6494 } 6495 if (!map->stripes[i].dev) { 6496 map->stripes[i].dev = 6497 add_missing_dev(fs_info->fs_devices, devid, 6498 uuid); 6499 if (IS_ERR(map->stripes[i].dev)) { 6500 free_extent_map(em); 6501 btrfs_err(fs_info, 6502 "failed to init missing dev %llu: %ld", 6503 devid, PTR_ERR(map->stripes[i].dev)); 6504 return PTR_ERR(map->stripes[i].dev); 6505 } 6506 btrfs_report_missing_device(fs_info, devid, uuid, false); 6507 } 6508 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 6509 &(map->stripes[i].dev->dev_state)); 6510 6511 } 6512 6513 write_lock(&map_tree->map_tree.lock); 6514 ret = add_extent_mapping(&map_tree->map_tree, em, 0); 6515 write_unlock(&map_tree->map_tree.lock); 6516 if (ret < 0) { 6517 btrfs_err(fs_info, 6518 "failed to add chunk map, start=%llu len=%llu: %d", 6519 em->start, em->len, ret); 6520 } 6521 free_extent_map(em); 6522 6523 return ret; 6524 } 6525 6526 static void fill_device_from_item(struct extent_buffer *leaf, 6527 struct btrfs_dev_item *dev_item, 6528 struct btrfs_device *device) 6529 { 6530 unsigned long ptr; 6531 6532 device->devid = btrfs_device_id(leaf, dev_item); 6533 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 6534 device->total_bytes = device->disk_total_bytes; 6535 device->commit_total_bytes = device->disk_total_bytes; 6536 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 6537 device->commit_bytes_used = device->bytes_used; 6538 device->type = btrfs_device_type(leaf, dev_item); 6539 device->io_align = btrfs_device_io_align(leaf, dev_item); 6540 device->io_width = btrfs_device_io_width(leaf, dev_item); 6541 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 6542 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 6543 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 6544 6545 ptr = btrfs_device_uuid(dev_item); 6546 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 6547 } 6548 6549 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 6550 u8 *fsid) 6551 { 6552 struct btrfs_fs_devices *fs_devices; 6553 int ret; 6554 6555 lockdep_assert_held(&uuid_mutex); 6556 ASSERT(fsid); 6557 6558 fs_devices = fs_info->fs_devices->seed; 6559 while (fs_devices) { 6560 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 6561 return fs_devices; 6562 6563 fs_devices = fs_devices->seed; 6564 } 6565 6566 fs_devices = find_fsid(fsid); 6567 if (!fs_devices) { 6568 if (!btrfs_test_opt(fs_info, DEGRADED)) 6569 return ERR_PTR(-ENOENT); 6570 6571 fs_devices = alloc_fs_devices(fsid); 6572 if (IS_ERR(fs_devices)) 6573 return fs_devices; 6574 6575 fs_devices->seeding = 1; 6576 fs_devices->opened = 1; 6577 return fs_devices; 6578 } 6579 6580 fs_devices = clone_fs_devices(fs_devices); 6581 if (IS_ERR(fs_devices)) 6582 return fs_devices; 6583 6584 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); 6585 if (ret) { 6586 free_fs_devices(fs_devices); 6587 fs_devices = ERR_PTR(ret); 6588 goto out; 6589 } 6590 6591 if (!fs_devices->seeding) { 6592 close_fs_devices(fs_devices); 6593 free_fs_devices(fs_devices); 6594 fs_devices = ERR_PTR(-EINVAL); 6595 goto out; 6596 } 6597 6598 fs_devices->seed = fs_info->fs_devices->seed; 6599 fs_info->fs_devices->seed = fs_devices; 6600 out: 6601 return fs_devices; 6602 } 6603 6604 static int read_one_dev(struct btrfs_fs_info *fs_info, 6605 struct extent_buffer *leaf, 6606 struct btrfs_dev_item *dev_item) 6607 { 6608 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6609 struct btrfs_device *device; 6610 u64 devid; 6611 int ret; 6612 u8 fs_uuid[BTRFS_FSID_SIZE]; 6613 u8 dev_uuid[BTRFS_UUID_SIZE]; 6614 6615 devid = btrfs_device_id(leaf, dev_item); 6616 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 6617 BTRFS_UUID_SIZE); 6618 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 6619 BTRFS_FSID_SIZE); 6620 6621 if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) { 6622 fs_devices = open_seed_devices(fs_info, fs_uuid); 6623 if (IS_ERR(fs_devices)) 6624 return PTR_ERR(fs_devices); 6625 } 6626 6627 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 6628 if (!device) { 6629 if (!btrfs_test_opt(fs_info, DEGRADED)) { 6630 btrfs_report_missing_device(fs_info, devid, 6631 dev_uuid, true); 6632 return -ENOENT; 6633 } 6634 6635 device = add_missing_dev(fs_devices, devid, dev_uuid); 6636 if (IS_ERR(device)) { 6637 btrfs_err(fs_info, 6638 "failed to add missing dev %llu: %ld", 6639 devid, PTR_ERR(device)); 6640 return PTR_ERR(device); 6641 } 6642 btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 6643 } else { 6644 if (!device->bdev) { 6645 if (!btrfs_test_opt(fs_info, DEGRADED)) { 6646 btrfs_report_missing_device(fs_info, 6647 devid, dev_uuid, true); 6648 return -ENOENT; 6649 } 6650 btrfs_report_missing_device(fs_info, devid, 6651 dev_uuid, false); 6652 } 6653 6654 if (!device->bdev && 6655 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 6656 /* 6657 * this happens when a device that was properly setup 6658 * in the device info lists suddenly goes bad. 6659 * device->bdev is NULL, and so we have to set 6660 * device->missing to one here 6661 */ 6662 device->fs_devices->missing_devices++; 6663 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6664 } 6665 6666 /* Move the device to its own fs_devices */ 6667 if (device->fs_devices != fs_devices) { 6668 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, 6669 &device->dev_state)); 6670 6671 list_move(&device->dev_list, &fs_devices->devices); 6672 device->fs_devices->num_devices--; 6673 fs_devices->num_devices++; 6674 6675 device->fs_devices->missing_devices--; 6676 fs_devices->missing_devices++; 6677 6678 device->fs_devices = fs_devices; 6679 } 6680 } 6681 6682 if (device->fs_devices != fs_info->fs_devices) { 6683 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); 6684 if (device->generation != 6685 btrfs_device_generation(leaf, dev_item)) 6686 return -EINVAL; 6687 } 6688 6689 fill_device_from_item(leaf, dev_item, device); 6690 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 6691 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 6692 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 6693 device->fs_devices->total_rw_bytes += device->total_bytes; 6694 atomic64_add(device->total_bytes - device->bytes_used, 6695 &fs_info->free_chunk_space); 6696 } 6697 ret = 0; 6698 return ret; 6699 } 6700 6701 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 6702 { 6703 struct btrfs_root *root = fs_info->tree_root; 6704 struct btrfs_super_block *super_copy = fs_info->super_copy; 6705 struct extent_buffer *sb; 6706 struct btrfs_disk_key *disk_key; 6707 struct btrfs_chunk *chunk; 6708 u8 *array_ptr; 6709 unsigned long sb_array_offset; 6710 int ret = 0; 6711 u32 num_stripes; 6712 u32 array_size; 6713 u32 len = 0; 6714 u32 cur_offset; 6715 u64 type; 6716 struct btrfs_key key; 6717 6718 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 6719 /* 6720 * This will create extent buffer of nodesize, superblock size is 6721 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will 6722 * overallocate but we can keep it as-is, only the first page is used. 6723 */ 6724 sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET); 6725 if (IS_ERR(sb)) 6726 return PTR_ERR(sb); 6727 set_extent_buffer_uptodate(sb); 6728 btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); 6729 /* 6730 * The sb extent buffer is artificial and just used to read the system array. 6731 * set_extent_buffer_uptodate() call does not properly mark all it's 6732 * pages up-to-date when the page is larger: extent does not cover the 6733 * whole page and consequently check_page_uptodate does not find all 6734 * the page's extents up-to-date (the hole beyond sb), 6735 * write_extent_buffer then triggers a WARN_ON. 6736 * 6737 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, 6738 * but sb spans only this function. Add an explicit SetPageUptodate call 6739 * to silence the warning eg. on PowerPC 64. 6740 */ 6741 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 6742 SetPageUptodate(sb->pages[0]); 6743 6744 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 6745 array_size = btrfs_super_sys_array_size(super_copy); 6746 6747 array_ptr = super_copy->sys_chunk_array; 6748 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 6749 cur_offset = 0; 6750 6751 while (cur_offset < array_size) { 6752 disk_key = (struct btrfs_disk_key *)array_ptr; 6753 len = sizeof(*disk_key); 6754 if (cur_offset + len > array_size) 6755 goto out_short_read; 6756 6757 btrfs_disk_key_to_cpu(&key, disk_key); 6758 6759 array_ptr += len; 6760 sb_array_offset += len; 6761 cur_offset += len; 6762 6763 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 6764 chunk = (struct btrfs_chunk *)sb_array_offset; 6765 /* 6766 * At least one btrfs_chunk with one stripe must be 6767 * present, exact stripe count check comes afterwards 6768 */ 6769 len = btrfs_chunk_item_size(1); 6770 if (cur_offset + len > array_size) 6771 goto out_short_read; 6772 6773 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 6774 if (!num_stripes) { 6775 btrfs_err(fs_info, 6776 "invalid number of stripes %u in sys_array at offset %u", 6777 num_stripes, cur_offset); 6778 ret = -EIO; 6779 break; 6780 } 6781 6782 type = btrfs_chunk_type(sb, chunk); 6783 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 6784 btrfs_err(fs_info, 6785 "invalid chunk type %llu in sys_array at offset %u", 6786 type, cur_offset); 6787 ret = -EIO; 6788 break; 6789 } 6790 6791 len = btrfs_chunk_item_size(num_stripes); 6792 if (cur_offset + len > array_size) 6793 goto out_short_read; 6794 6795 ret = read_one_chunk(fs_info, &key, sb, chunk); 6796 if (ret) 6797 break; 6798 } else { 6799 btrfs_err(fs_info, 6800 "unexpected item type %u in sys_array at offset %u", 6801 (u32)key.type, cur_offset); 6802 ret = -EIO; 6803 break; 6804 } 6805 array_ptr += len; 6806 sb_array_offset += len; 6807 cur_offset += len; 6808 } 6809 clear_extent_buffer_uptodate(sb); 6810 free_extent_buffer_stale(sb); 6811 return ret; 6812 6813 out_short_read: 6814 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 6815 len, cur_offset); 6816 clear_extent_buffer_uptodate(sb); 6817 free_extent_buffer_stale(sb); 6818 return -EIO; 6819 } 6820 6821 /* 6822 * Check if all chunks in the fs are OK for read-write degraded mount 6823 * 6824 * If the @failing_dev is specified, it's accounted as missing. 6825 * 6826 * Return true if all chunks meet the minimal RW mount requirements. 6827 * Return false if any chunk doesn't meet the minimal RW mount requirements. 6828 */ 6829 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 6830 struct btrfs_device *failing_dev) 6831 { 6832 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 6833 struct extent_map *em; 6834 u64 next_start = 0; 6835 bool ret = true; 6836 6837 read_lock(&map_tree->map_tree.lock); 6838 em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1); 6839 read_unlock(&map_tree->map_tree.lock); 6840 /* No chunk at all? Return false anyway */ 6841 if (!em) { 6842 ret = false; 6843 goto out; 6844 } 6845 while (em) { 6846 struct map_lookup *map; 6847 int missing = 0; 6848 int max_tolerated; 6849 int i; 6850 6851 map = em->map_lookup; 6852 max_tolerated = 6853 btrfs_get_num_tolerated_disk_barrier_failures( 6854 map->type); 6855 for (i = 0; i < map->num_stripes; i++) { 6856 struct btrfs_device *dev = map->stripes[i].dev; 6857 6858 if (!dev || !dev->bdev || 6859 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 6860 dev->last_flush_error) 6861 missing++; 6862 else if (failing_dev && failing_dev == dev) 6863 missing++; 6864 } 6865 if (missing > max_tolerated) { 6866 if (!failing_dev) 6867 btrfs_warn(fs_info, 6868 "chunk %llu missing %d devices, max tolerance is %d for writeable mount", 6869 em->start, missing, max_tolerated); 6870 free_extent_map(em); 6871 ret = false; 6872 goto out; 6873 } 6874 next_start = extent_map_end(em); 6875 free_extent_map(em); 6876 6877 read_lock(&map_tree->map_tree.lock); 6878 em = lookup_extent_mapping(&map_tree->map_tree, next_start, 6879 (u64)(-1) - next_start); 6880 read_unlock(&map_tree->map_tree.lock); 6881 } 6882 out: 6883 return ret; 6884 } 6885 6886 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 6887 { 6888 struct btrfs_root *root = fs_info->chunk_root; 6889 struct btrfs_path *path; 6890 struct extent_buffer *leaf; 6891 struct btrfs_key key; 6892 struct btrfs_key found_key; 6893 int ret; 6894 int slot; 6895 u64 total_dev = 0; 6896 6897 path = btrfs_alloc_path(); 6898 if (!path) 6899 return -ENOMEM; 6900 6901 /* 6902 * uuid_mutex is needed only if we are mounting a sprout FS 6903 * otherwise we don't need it. 6904 */ 6905 mutex_lock(&uuid_mutex); 6906 mutex_lock(&fs_info->chunk_mutex); 6907 6908 /* 6909 * Read all device items, and then all the chunk items. All 6910 * device items are found before any chunk item (their object id 6911 * is smaller than the lowest possible object id for a chunk 6912 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 6913 */ 6914 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 6915 key.offset = 0; 6916 key.type = 0; 6917 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6918 if (ret < 0) 6919 goto error; 6920 while (1) { 6921 leaf = path->nodes[0]; 6922 slot = path->slots[0]; 6923 if (slot >= btrfs_header_nritems(leaf)) { 6924 ret = btrfs_next_leaf(root, path); 6925 if (ret == 0) 6926 continue; 6927 if (ret < 0) 6928 goto error; 6929 break; 6930 } 6931 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6932 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 6933 struct btrfs_dev_item *dev_item; 6934 dev_item = btrfs_item_ptr(leaf, slot, 6935 struct btrfs_dev_item); 6936 ret = read_one_dev(fs_info, leaf, dev_item); 6937 if (ret) 6938 goto error; 6939 total_dev++; 6940 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 6941 struct btrfs_chunk *chunk; 6942 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 6943 ret = read_one_chunk(fs_info, &found_key, leaf, chunk); 6944 if (ret) 6945 goto error; 6946 } 6947 path->slots[0]++; 6948 } 6949 6950 /* 6951 * After loading chunk tree, we've got all device information, 6952 * do another round of validation checks. 6953 */ 6954 if (total_dev != fs_info->fs_devices->total_devices) { 6955 btrfs_err(fs_info, 6956 "super_num_devices %llu mismatch with num_devices %llu found here", 6957 btrfs_super_num_devices(fs_info->super_copy), 6958 total_dev); 6959 ret = -EINVAL; 6960 goto error; 6961 } 6962 if (btrfs_super_total_bytes(fs_info->super_copy) < 6963 fs_info->fs_devices->total_rw_bytes) { 6964 btrfs_err(fs_info, 6965 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", 6966 btrfs_super_total_bytes(fs_info->super_copy), 6967 fs_info->fs_devices->total_rw_bytes); 6968 ret = -EINVAL; 6969 goto error; 6970 } 6971 ret = 0; 6972 error: 6973 mutex_unlock(&fs_info->chunk_mutex); 6974 mutex_unlock(&uuid_mutex); 6975 6976 btrfs_free_path(path); 6977 return ret; 6978 } 6979 6980 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 6981 { 6982 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6983 struct btrfs_device *device; 6984 6985 while (fs_devices) { 6986 mutex_lock(&fs_devices->device_list_mutex); 6987 list_for_each_entry(device, &fs_devices->devices, dev_list) 6988 device->fs_info = fs_info; 6989 mutex_unlock(&fs_devices->device_list_mutex); 6990 6991 fs_devices = fs_devices->seed; 6992 } 6993 } 6994 6995 static void __btrfs_reset_dev_stats(struct btrfs_device *dev) 6996 { 6997 int i; 6998 6999 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7000 btrfs_dev_stat_reset(dev, i); 7001 } 7002 7003 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 7004 { 7005 struct btrfs_key key; 7006 struct btrfs_key found_key; 7007 struct btrfs_root *dev_root = fs_info->dev_root; 7008 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7009 struct extent_buffer *eb; 7010 int slot; 7011 int ret = 0; 7012 struct btrfs_device *device; 7013 struct btrfs_path *path = NULL; 7014 int i; 7015 7016 path = btrfs_alloc_path(); 7017 if (!path) { 7018 ret = -ENOMEM; 7019 goto out; 7020 } 7021 7022 mutex_lock(&fs_devices->device_list_mutex); 7023 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7024 int item_size; 7025 struct btrfs_dev_stats_item *ptr; 7026 7027 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7028 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7029 key.offset = device->devid; 7030 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 7031 if (ret) { 7032 __btrfs_reset_dev_stats(device); 7033 device->dev_stats_valid = 1; 7034 btrfs_release_path(path); 7035 continue; 7036 } 7037 slot = path->slots[0]; 7038 eb = path->nodes[0]; 7039 btrfs_item_key_to_cpu(eb, &found_key, slot); 7040 item_size = btrfs_item_size_nr(eb, slot); 7041 7042 ptr = btrfs_item_ptr(eb, slot, 7043 struct btrfs_dev_stats_item); 7044 7045 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7046 if (item_size >= (1 + i) * sizeof(__le64)) 7047 btrfs_dev_stat_set(device, i, 7048 btrfs_dev_stats_value(eb, ptr, i)); 7049 else 7050 btrfs_dev_stat_reset(device, i); 7051 } 7052 7053 device->dev_stats_valid = 1; 7054 btrfs_dev_stat_print_on_load(device); 7055 btrfs_release_path(path); 7056 } 7057 mutex_unlock(&fs_devices->device_list_mutex); 7058 7059 out: 7060 btrfs_free_path(path); 7061 return ret < 0 ? ret : 0; 7062 } 7063 7064 static int update_dev_stat_item(struct btrfs_trans_handle *trans, 7065 struct btrfs_device *device) 7066 { 7067 struct btrfs_fs_info *fs_info = trans->fs_info; 7068 struct btrfs_root *dev_root = fs_info->dev_root; 7069 struct btrfs_path *path; 7070 struct btrfs_key key; 7071 struct extent_buffer *eb; 7072 struct btrfs_dev_stats_item *ptr; 7073 int ret; 7074 int i; 7075 7076 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7077 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7078 key.offset = device->devid; 7079 7080 path = btrfs_alloc_path(); 7081 if (!path) 7082 return -ENOMEM; 7083 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 7084 if (ret < 0) { 7085 btrfs_warn_in_rcu(fs_info, 7086 "error %d while searching for dev_stats item for device %s", 7087 ret, rcu_str_deref(device->name)); 7088 goto out; 7089 } 7090 7091 if (ret == 0 && 7092 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 7093 /* need to delete old one and insert a new one */ 7094 ret = btrfs_del_item(trans, dev_root, path); 7095 if (ret != 0) { 7096 btrfs_warn_in_rcu(fs_info, 7097 "delete too small dev_stats item for device %s failed %d", 7098 rcu_str_deref(device->name), ret); 7099 goto out; 7100 } 7101 ret = 1; 7102 } 7103 7104 if (ret == 1) { 7105 /* need to insert a new item */ 7106 btrfs_release_path(path); 7107 ret = btrfs_insert_empty_item(trans, dev_root, path, 7108 &key, sizeof(*ptr)); 7109 if (ret < 0) { 7110 btrfs_warn_in_rcu(fs_info, 7111 "insert dev_stats item for device %s failed %d", 7112 rcu_str_deref(device->name), ret); 7113 goto out; 7114 } 7115 } 7116 7117 eb = path->nodes[0]; 7118 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 7119 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7120 btrfs_set_dev_stats_value(eb, ptr, i, 7121 btrfs_dev_stat_read(device, i)); 7122 btrfs_mark_buffer_dirty(eb); 7123 7124 out: 7125 btrfs_free_path(path); 7126 return ret; 7127 } 7128 7129 /* 7130 * called from commit_transaction. Writes all changed device stats to disk. 7131 */ 7132 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 7133 struct btrfs_fs_info *fs_info) 7134 { 7135 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7136 struct btrfs_device *device; 7137 int stats_cnt; 7138 int ret = 0; 7139 7140 mutex_lock(&fs_devices->device_list_mutex); 7141 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7142 stats_cnt = atomic_read(&device->dev_stats_ccnt); 7143 if (!device->dev_stats_valid || stats_cnt == 0) 7144 continue; 7145 7146 7147 /* 7148 * There is a LOAD-LOAD control dependency between the value of 7149 * dev_stats_ccnt and updating the on-disk values which requires 7150 * reading the in-memory counters. Such control dependencies 7151 * require explicit read memory barriers. 7152 * 7153 * This memory barriers pairs with smp_mb__before_atomic in 7154 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full 7155 * barrier implied by atomic_xchg in 7156 * btrfs_dev_stats_read_and_reset 7157 */ 7158 smp_rmb(); 7159 7160 ret = update_dev_stat_item(trans, device); 7161 if (!ret) 7162 atomic_sub(stats_cnt, &device->dev_stats_ccnt); 7163 } 7164 mutex_unlock(&fs_devices->device_list_mutex); 7165 7166 return ret; 7167 } 7168 7169 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) 7170 { 7171 btrfs_dev_stat_inc(dev, index); 7172 btrfs_dev_stat_print_on_error(dev); 7173 } 7174 7175 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) 7176 { 7177 if (!dev->dev_stats_valid) 7178 return; 7179 btrfs_err_rl_in_rcu(dev->fs_info, 7180 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7181 rcu_str_deref(dev->name), 7182 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7183 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7184 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7185 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7186 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7187 } 7188 7189 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 7190 { 7191 int i; 7192 7193 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7194 if (btrfs_dev_stat_read(dev, i) != 0) 7195 break; 7196 if (i == BTRFS_DEV_STAT_VALUES_MAX) 7197 return; /* all values == 0, suppress message */ 7198 7199 btrfs_info_in_rcu(dev->fs_info, 7200 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7201 rcu_str_deref(dev->name), 7202 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7203 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7204 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7205 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7206 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7207 } 7208 7209 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, 7210 struct btrfs_ioctl_get_dev_stats *stats) 7211 { 7212 struct btrfs_device *dev; 7213 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7214 int i; 7215 7216 mutex_lock(&fs_devices->device_list_mutex); 7217 dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL); 7218 mutex_unlock(&fs_devices->device_list_mutex); 7219 7220 if (!dev) { 7221 btrfs_warn(fs_info, "get dev_stats failed, device not found"); 7222 return -ENODEV; 7223 } else if (!dev->dev_stats_valid) { 7224 btrfs_warn(fs_info, "get dev_stats failed, not yet valid"); 7225 return -ENODEV; 7226 } else if (stats->flags & BTRFS_DEV_STATS_RESET) { 7227 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7228 if (stats->nr_items > i) 7229 stats->values[i] = 7230 btrfs_dev_stat_read_and_reset(dev, i); 7231 else 7232 btrfs_dev_stat_reset(dev, i); 7233 } 7234 } else { 7235 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7236 if (stats->nr_items > i) 7237 stats->values[i] = btrfs_dev_stat_read(dev, i); 7238 } 7239 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) 7240 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 7241 return 0; 7242 } 7243 7244 void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path) 7245 { 7246 struct buffer_head *bh; 7247 struct btrfs_super_block *disk_super; 7248 int copy_num; 7249 7250 if (!bdev) 7251 return; 7252 7253 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; 7254 copy_num++) { 7255 7256 if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) 7257 continue; 7258 7259 disk_super = (struct btrfs_super_block *)bh->b_data; 7260 7261 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 7262 set_buffer_dirty(bh); 7263 sync_dirty_buffer(bh); 7264 brelse(bh); 7265 } 7266 7267 /* Notify udev that device has changed */ 7268 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 7269 7270 /* Update ctime/mtime for device path for libblkid */ 7271 update_dev_time(device_path); 7272 } 7273 7274 /* 7275 * Update the size of all devices, which is used for writing out the 7276 * super blocks. 7277 */ 7278 void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) 7279 { 7280 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7281 struct btrfs_device *curr, *next; 7282 7283 if (list_empty(&fs_devices->resized_devices)) 7284 return; 7285 7286 mutex_lock(&fs_devices->device_list_mutex); 7287 mutex_lock(&fs_info->chunk_mutex); 7288 list_for_each_entry_safe(curr, next, &fs_devices->resized_devices, 7289 resized_list) { 7290 list_del_init(&curr->resized_list); 7291 curr->commit_total_bytes = curr->disk_total_bytes; 7292 } 7293 mutex_unlock(&fs_info->chunk_mutex); 7294 mutex_unlock(&fs_devices->device_list_mutex); 7295 } 7296 7297 /* Must be invoked during the transaction commit */ 7298 void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans) 7299 { 7300 struct btrfs_fs_info *fs_info = trans->fs_info; 7301 struct extent_map *em; 7302 struct map_lookup *map; 7303 struct btrfs_device *dev; 7304 int i; 7305 7306 if (list_empty(&trans->pending_chunks)) 7307 return; 7308 7309 /* In order to kick the device replace finish process */ 7310 mutex_lock(&fs_info->chunk_mutex); 7311 list_for_each_entry(em, &trans->pending_chunks, list) { 7312 map = em->map_lookup; 7313 7314 for (i = 0; i < map->num_stripes; i++) { 7315 dev = map->stripes[i].dev; 7316 dev->commit_bytes_used = dev->bytes_used; 7317 } 7318 } 7319 mutex_unlock(&fs_info->chunk_mutex); 7320 } 7321 7322 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) 7323 { 7324 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7325 while (fs_devices) { 7326 fs_devices->fs_info = fs_info; 7327 fs_devices = fs_devices->seed; 7328 } 7329 } 7330 7331 void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) 7332 { 7333 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7334 while (fs_devices) { 7335 fs_devices->fs_info = NULL; 7336 fs_devices = fs_devices->seed; 7337 } 7338 } 7339 7340 /* 7341 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. 7342 */ 7343 int btrfs_bg_type_to_factor(u64 flags) 7344 { 7345 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 7346 BTRFS_BLOCK_GROUP_RAID10)) 7347 return 2; 7348 return 1; 7349 } 7350 7351 7352 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) 7353 { 7354 int index = btrfs_bg_flags_to_raid_index(type); 7355 int ncopies = btrfs_raid_array[index].ncopies; 7356 int data_stripes; 7357 7358 switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 7359 case BTRFS_BLOCK_GROUP_RAID5: 7360 data_stripes = num_stripes - 1; 7361 break; 7362 case BTRFS_BLOCK_GROUP_RAID6: 7363 data_stripes = num_stripes - 2; 7364 break; 7365 default: 7366 data_stripes = num_stripes / ncopies; 7367 break; 7368 } 7369 return div_u64(chunk_len, data_stripes); 7370 } 7371 7372 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, 7373 u64 chunk_offset, u64 devid, 7374 u64 physical_offset, u64 physical_len) 7375 { 7376 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 7377 struct extent_map *em; 7378 struct map_lookup *map; 7379 u64 stripe_len; 7380 bool found = false; 7381 int ret = 0; 7382 int i; 7383 7384 read_lock(&em_tree->lock); 7385 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 7386 read_unlock(&em_tree->lock); 7387 7388 if (!em) { 7389 btrfs_err(fs_info, 7390 "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", 7391 physical_offset, devid); 7392 ret = -EUCLEAN; 7393 goto out; 7394 } 7395 7396 map = em->map_lookup; 7397 stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes); 7398 if (physical_len != stripe_len) { 7399 btrfs_err(fs_info, 7400 "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", 7401 physical_offset, devid, em->start, physical_len, 7402 stripe_len); 7403 ret = -EUCLEAN; 7404 goto out; 7405 } 7406 7407 for (i = 0; i < map->num_stripes; i++) { 7408 if (map->stripes[i].dev->devid == devid && 7409 map->stripes[i].physical == physical_offset) { 7410 found = true; 7411 if (map->verified_stripes >= map->num_stripes) { 7412 btrfs_err(fs_info, 7413 "too many dev extents for chunk %llu found", 7414 em->start); 7415 ret = -EUCLEAN; 7416 goto out; 7417 } 7418 map->verified_stripes++; 7419 break; 7420 } 7421 } 7422 if (!found) { 7423 btrfs_err(fs_info, 7424 "dev extent physical offset %llu devid %llu has no corresponding chunk", 7425 physical_offset, devid); 7426 ret = -EUCLEAN; 7427 } 7428 out: 7429 free_extent_map(em); 7430 return ret; 7431 } 7432 7433 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) 7434 { 7435 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 7436 struct extent_map *em; 7437 struct rb_node *node; 7438 int ret = 0; 7439 7440 read_lock(&em_tree->lock); 7441 for (node = rb_first(&em_tree->map); node; node = rb_next(node)) { 7442 em = rb_entry(node, struct extent_map, rb_node); 7443 if (em->map_lookup->num_stripes != 7444 em->map_lookup->verified_stripes) { 7445 btrfs_err(fs_info, 7446 "chunk %llu has missing dev extent, have %d expect %d", 7447 em->start, em->map_lookup->verified_stripes, 7448 em->map_lookup->num_stripes); 7449 ret = -EUCLEAN; 7450 goto out; 7451 } 7452 } 7453 out: 7454 read_unlock(&em_tree->lock); 7455 return ret; 7456 } 7457 7458 /* 7459 * Ensure that all dev extents are mapped to correct chunk, otherwise 7460 * later chunk allocation/free would cause unexpected behavior. 7461 * 7462 * NOTE: This will iterate through the whole device tree, which should be of 7463 * the same size level as the chunk tree. This slightly increases mount time. 7464 */ 7465 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) 7466 { 7467 struct btrfs_path *path; 7468 struct btrfs_root *root = fs_info->dev_root; 7469 struct btrfs_key key; 7470 int ret = 0; 7471 7472 key.objectid = 1; 7473 key.type = BTRFS_DEV_EXTENT_KEY; 7474 key.offset = 0; 7475 7476 path = btrfs_alloc_path(); 7477 if (!path) 7478 return -ENOMEM; 7479 7480 path->reada = READA_FORWARD; 7481 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7482 if (ret < 0) 7483 goto out; 7484 7485 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 7486 ret = btrfs_next_item(root, path); 7487 if (ret < 0) 7488 goto out; 7489 /* No dev extents at all? Not good */ 7490 if (ret > 0) { 7491 ret = -EUCLEAN; 7492 goto out; 7493 } 7494 } 7495 while (1) { 7496 struct extent_buffer *leaf = path->nodes[0]; 7497 struct btrfs_dev_extent *dext; 7498 int slot = path->slots[0]; 7499 u64 chunk_offset; 7500 u64 physical_offset; 7501 u64 physical_len; 7502 u64 devid; 7503 7504 btrfs_item_key_to_cpu(leaf, &key, slot); 7505 if (key.type != BTRFS_DEV_EXTENT_KEY) 7506 break; 7507 devid = key.objectid; 7508 physical_offset = key.offset; 7509 7510 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); 7511 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); 7512 physical_len = btrfs_dev_extent_length(leaf, dext); 7513 7514 ret = verify_one_dev_extent(fs_info, chunk_offset, devid, 7515 physical_offset, physical_len); 7516 if (ret < 0) 7517 goto out; 7518 ret = btrfs_next_item(root, path); 7519 if (ret < 0) 7520 goto out; 7521 if (ret > 0) { 7522 ret = 0; 7523 break; 7524 } 7525 } 7526 7527 /* Ensure all chunks have corresponding dev extents */ 7528 ret = verify_chunk_dev_extent_mapping(fs_info); 7529 out: 7530 btrfs_free_path(path); 7531 return ret; 7532 } 7533