// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes = 2,
		.dev_stripes = 1,
		.devs_max = 0,	/* 0 == as many as possible */
		.devs_min = 4,
		.tolerated_failures = 1,
		.devs_increment = 2,
		.ncopies = 2,
		.raid_name = "raid10",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 2,
		.devs_min = 2,
		.tolerated_failures = 1,
		.devs_increment = 2,
		.ncopies = 2,
		.raid_name = "raid1",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes = 1,
		.dev_stripes = 2,
		.devs_max = 1,
		.devs_min = 1,
		.tolerated_failures = 0,
		.devs_increment = 1,
		.ncopies = 2,
		.raid_name = "dup",
		.bg_flag = BTRFS_BLOCK_GROUP_DUP,
		.mindev_error = 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 0,
		.devs_min = 2,
		.tolerated_failures = 0,
		.devs_increment = 1,
		.ncopies = 1,
		.raid_name = "raid0",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error = 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 1,
		.devs_min = 1,
		.tolerated_failures = 0,
		.devs_increment = 1,
		.ncopies = 1,
		.raid_name = "single",
		.bg_flag = 0,
		.mindev_error = 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 0,
		.devs_min = 2,
		.tolerated_failures = 1,
		.devs_increment = 1,
		.ncopies = 2,
		.raid_name = "raid5",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 0,
		.devs_min = 3,
		.tolerated_failures = 2,
		.devs_increment = 1,
		.ncopies = 3,
		.raid_name = "raid6",
		.bg_flag = BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};

const char *get_raid_name(enum btrfs_raid_types type)
{
	if (type >= BTRFS_NR_RAID_TYPES)
		return NULL;

	return btrfs_raid_array[type].raid_name;
}

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
 *
 *
 * Exclusive operations, BTRFS_FS_EXCL_OP
 * ======================================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
 * During the course of the Paused state, the BTRFS_FS_EXCL_OP flag remains
 * set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
 * completed.
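 *
 * As a rough sketch only (the actual call sites live outside this file), an
 * exclusive operation is typically claimed and released like:
 *
 *   if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
 *           return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
 *   ... do the device add/remove/replace/resize/balance work ...
 *   clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);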
 */

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid: if not NULL, copy the uuid to fs_devices::fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->resized_devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

void btrfs_free_device(struct btrfs_device *device)
{
	rcu_string_free(device->name);
	bio_put(device->flush_bio);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
				 enum kobject_action action)
{
	int ret;

	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
	if (ret)
		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
			action,
			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
			&disk_to_dev(bdev->bd_disk)->kobj);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
	struct btrfs_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate a bio that's always going to be used for flushing device
	 * barriers and matches the device lifespan
	 */
	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
	if (!dev->flush_bio) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->resized_list);

	spin_lock_init(&dev->io_lock);

	atomic_set(&dev->reada_in_flight, 0);
	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

	return dev;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
					u64 devid, const u8 *uuid)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, &fs_devices->devices, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*bh = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*bh)) {
		ret = PTR_ERR(*bh);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	*bh = NULL;
	return ret;
}

static void requeue_list(struct btrfs_pending_bios *pending_bios,
			 struct bio *head, struct bio *tail)
{

	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device. This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block. The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested. This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long last_waited = 0;
	int force_reg = 0;
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device. We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = device->bdev->bd_bdi;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).
	 * But, remember the tail and other pointers so the bios can be
	 * properly reinserted into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop. Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		     pending_bios != &device->pending_sync_bios &&
		     device->pending_sync_bios.head) ||
		    (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		     device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;

		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		btrfsic_submit_bio(cur);
		num_run++;
		batch_run++;

		cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested. Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop.
				 * So, we copy out the ioc->last_waited time
				 * and test against it before looping
				 */
				last_waited = ioc->last_waited;
				cond_resched();
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_queue_work(fs_info->submit_workers,
					 &device->work);
			goto done;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}

/*
 * Search for and remove all stale devices (devices which are not mounted).
 * When both inputs are NULL, it will search for and release all stale devices.
 * path:	Optional. When provided, it will release all unmounted devices
 *		matching this path only.
 * skip_dev:	Optional. Will skip this device when searching for the stale
 *		devices.
 */
static void btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
		mutex_lock(&fs_devices->device_list_mutex);
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			continue;
		}

		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			int not_found = 0;

			if (skip_device && skip_device == device)
				continue;
			if (path && !device->name)
				continue;

			rcu_read_lock();
			if (path)
				not_found = strcmp(rcu_str_deref(device->name),
						   path);
			rcu_read_unlock();
			if (not_found)
				continue;

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			if (fs_devices->num_devices == 0)
				break;
		}
		mutex_unlock(&fs_devices->device_list_mutex);
		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}
}

static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
				 struct btrfs_device *device, fmode_t flags,
				 void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &bh);
	if (ret)
		return ret;

	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_brelse;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_brelse;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = 1;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = 1;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	brelse(bh);

	return 0;

error_brelse:
	brelse(bh);
	blkdev_put(bdev, flags);

	return -EINVAL;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = alloc_fs_devices(disk_super->fsid);
		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		mutex_lock(&fs_devices->device_list_mutex);
		device = find_device(fs_devices, devid,
				     disk_super->dev_item.uuid);
	}

	if (!device) {
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
				disk_super->label, devid, found_transid, path);
		else
			pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
				disk_super->fsid, devid, found_transid, path);

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *    a. The same device disappeared and reappeared with
		 *       different name. or
		 *    b. The missing-disk-which-was-replaced, has
		 *       reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further, in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transactions when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted. We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is, if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with the same uuid and devid. We keep the
			 * one with the larger generation number or the
			 * last-in if the generations are equal.
			 */
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EEXIST);
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with the largest generation
	 * (as above).
	 */
	if (!fs_devices->opened)
		device->generation = found_transid;

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	mutex_unlock(&fs_devices->device_list_mutex);
	return device;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = alloc_fs_devices(orig->fsid);
	if (IS_ERR(fs_devices))
		return fs_devices;

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

	/* We have held the volume lock, it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device))
			goto error;

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
						 GFP_KERNEL);
			if (!name) {
				btrfs_free_device(device);
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

/*
 * After we have read the system tree and know devids belonging to
 * this filesystem, remove any device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
	struct btrfs_device *device, *next;
	struct btrfs_device *latest_dev = NULL;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
			     &device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    (!latest_dev ||
			     device->generation > latest_dev->generation)) {
				latest_dev = device;
			}
			continue;
		}

		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
			 * In the second step, the dev_replace state is
			 * read from the device tree and it is known
			 * whether the procedure is really active or
			 * not, which means whether this device is
			 * used or whether it should be removed.
			 */
			if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
						  &device->dev_state)) {
				continue;
			}
		}
		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state))
				fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
}

static void free_device_rcu(struct rcu_head *head)
{
	struct btrfs_device *device;

	device = container_of(head, struct btrfs_device, rcu);
	btrfs_free_device(device);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

static void btrfs_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;
	struct btrfs_device *new_device;
	struct rcu_string *name;

	if (device->bdev)
		fs_devices->open_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		fs_devices->missing_devices--;

	btrfs_close_bdev(device);

	new_device = btrfs_alloc_device(NULL, &device->devid,
					device->uuid);
	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

	/* Safe because we are under uuid_mutex */
	if (device->name) {
		name = rcu_string_strdup(device->name->str, GFP_NOFS);
		BUG_ON(!name); /* -ENOMEM */
		rcu_assign_pointer(new_device->name, name);
	}

	list_replace_rcu(&device->dev_list, &new_device->dev_list);
	new_device->fs_devices = device->fs_devices;

	call_rcu(&device->rcu, free_device_rcu);
}

static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;

	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
		btrfs_close_one_device(device);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = close_fs_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	return ret;
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
			   fmode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	int ret = 0;

	flags |= FMODE_EXCL;

	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		/* Just open everything we can; ignore failures here */
		if (btrfs_open_one_device(fs_devices, device, flags, holder))
			continue;

		if (!latest_dev ||
		    device->generation > latest_dev->generation)
			latest_dev = device;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EINVAL;
		goto out;
	}
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	lockdep_assert_held(&uuid_mutex);

	mutex_lock(&fs_devices->device_list_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = open_fs_devices(fs_devices, flags, holder);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

static void btrfs_release_disk_super(struct page *page)
{
	kunmap(page);
	put_page(page);
}

static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
				 struct page **page,
				 struct btrfs_super_block **disk_super)
{
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return 1;

	/* make sure our super fits in the page */
	if (sizeof(**disk_super) > PAGE_SIZE)
		return 1;

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
		return 1;

	/* pull in the page with our super */
	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
				    index, GFP_KERNEL);

	if (IS_ERR_OR_NULL(*page))
		return 1;

	p = kmap(*page);

	/* align our pointer to the offset of the super block */
	*disk_super = p + (bytenr & ~PAGE_MASK);

	if (btrfs_super_bytenr(*disk_super) != bytenr ||
	    btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(*page);
		return 1;
	}

	if ((*disk_super)->label[0] &&
	    (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

	return 0;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount path
 * and we are not allowed to call set_blocksize during the scan. The superblock
 * is read via pagecache
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
					   void *holder)
{
	struct btrfs_super_block *disk_super;
	bool new_device_added = false;
	struct btrfs_device *device = NULL;
	struct block_device *bdev;
	struct page *page;
	u64 bytenr;

	lockdep_assert_held(&uuid_mutex);

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	bytenr = btrfs_sb_offset(0);
	flags |= FMODE_EXCL;

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);

	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
		device = ERR_PTR(-EINVAL);
		goto error_bdev_put;
	}

	device = device_list_add(path, disk_super, &new_device_added);
	if (!IS_ERR(device)) {
		if (new_device_added)
			btrfs_free_stale_devices(path, device);
	}

	btrfs_release_disk_super(page);

error_bdev_put:
	blkdev_put(bdev, flags);

	return device;
}

static int contains_pending_extent(struct btrfs_transaction *transaction,
				   struct btrfs_device *device,
				   u64 *start, u64 len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct extent_map *em;
	struct list_head *search_list = &fs_info->pinned_chunks;
	int ret = 0;
	u64 physical_start = *start;

	if (transaction)
		search_list = &transaction->pending_chunks;
again:
	list_for_each_entry(em, search_list, list) {
		struct map_lookup *map;
		int i;

		map = em->map_lookup;
		for (i = 0; i < map->num_stripes; i++) {
			u64 end;

			if (map->stripes[i].dev != device)
				continue;
			if (map->stripes[i].physical >= physical_start + len ||
			    map->stripes[i].physical + em->orig_block_len <=
			    physical_start)
				continue;
			/*
			 * Make sure that while processing the pinned list we do
			 * not override our *start with a lower value, because
			 * we can have pinned chunks that fall within this
			 * device hole and that have lower physical addresses
			 * than the pending chunks we processed before. If we
			 * do not take this special care we can end up getting
			 * 2 pending chunks that start at the same physical
			 * device offsets because the end offset of a pinned
			 * chunk can be equal to the start offset of some
			 * pending chunk.
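			 *
			 * Illustrative numbers only: if a pending chunk has
			 * already pushed *start up to 16M and a pinned chunk
			 * on this device ends at 12M, blindly taking the
			 * pinned end would move *start backwards to 12M; the
			 * check below only ever moves *start forward.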
			 */
			end = map->stripes[i].physical + em->orig_block_len;
			if (end > *start) {
				*start = end;
				ret = 1;
			}
		}
	}
	if (search_list != &fs_info->pinned_chunks) {
		search_list = &fs_info->pinned_chunks;
		goto again;
	}

	return ret;
}


/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find one. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
			       struct btrfs_device *device, u64 num_bytes,
			       u64 search_start, u64 *start, u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	/*
	 * We don't want to overwrite the superblock on the drive nor any area
	 * used by the boot loader (grub for example), so we make sure to start
	 * at an offset of at least 1MB.
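	 * For example, a caller passing search_start == 0 will effectively
	 * begin the search at SZ_1M (1MiB) because of the max_t() below.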
	 */
	search_start = max_t(u64, search_start, SZ_1M);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;

			/*
			 * Have to check before we set max_hole_start, otherwise
			 * we could end up sending back this offset anyway.
			 */
			if (contains_pending_extent(transaction, device,
						    &search_start,
						    hole_size)) {
				if (key.offset >= search_start) {
					hole_size = key.offset - search_start;
				} else {
					WARN_ON_ONCE(1);
					hole_size = 0;
				}
			}

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;

		if (contains_pending_extent(transaction, device, &search_start,
					    hole_size)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

int find_free_dev_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(trans->transaction, device,
					  num_bytes, 0, start, len);
}

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				 struct btrfs_device *device,
				 u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to remove dev extent item");
	} else {
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
	}
out:
	btrfs_free_path(path);
	return ret;
}

static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
				  struct btrfs_device *device,
				  u64 chunk_offset, u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);
	btrfs_set_dev_extent_chunk_tree(leaf, extent,
					BTRFS_CHUNK_TREE_OBJECTID);
	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

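	/* Record the length of the new dev extent and write the leaf out */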
	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return ret;
}

static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree.map_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}

static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0); /* Corruption */

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * The device information is stored in the chunk root.
 * The btrfs_device struct should be fully filled in.
 */
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
			      struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
				      &key, sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Function to update ctime/mtime for a given device path.
 * Mainly used for ctime/mtime based probes like libblkid.
 */
static void update_dev_time(const char *path_name)
{
	struct file *filp;

	filp = filp_open(path_name, O_RDWR, 0);
	if (IS_ERR(filp))
		return;
	file_update_time(filp);
	filp_close(filp, NULL);
}

static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
			     struct btrfs_device *device)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);
	if (!ret)
		ret = btrfs_commit_transaction(trans);
	return ret;
}

/*
 * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number regarding eg. device
 * replace.
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
					u64 num_devices)
{
	u64 all_avail;
	unsigned seq;
	int i;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		all_avail = fs_info->avail_data_alloc_bits |
			    fs_info->avail_system_alloc_bits |
			    fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
		if (!(all_avail & btrfs_raid_array[i].bg_flag))
			continue;

		if (num_devices < btrfs_raid_array[i].devs_min) {
			int ret = btrfs_raid_array[i].mindev_error;

			if (ret)
				return ret;
		}
	}

	return 0;
}

static struct btrfs_device * btrfs_find_next_active_device(
		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
	struct btrfs_device *next_device;

	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
		if (next_device != device &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
		    && next_device->bdev)
			return next_device;
	}

	return NULL;
}

/*
 * Helper function to check if the given device is part of s_bdev / latest_bdev
 * and replace it with the provided or the next active device. In the context
 * where this function is called, there should always be another device (or
 * this_dev) which is active.
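 *
 * Within this file it is called with this_dev == NULL from the device remove
 * and replace-target teardown paths, so the replacement is picked by
 * btrfs_find_next_active_device().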
 */
void btrfs_assign_next_active_device(struct btrfs_device *device,
				     struct btrfs_device *this_dev)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_device *next_device;

	if (this_dev)
		next_device = this_dev;
	else
		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
							    device);
	ASSERT(next_device);

	if (fs_info->sb->s_bdev &&
	    (fs_info->sb->s_bdev == device->bdev))
		fs_info->sb->s_bdev = next_device->bdev;

	if (fs_info->fs_devices->latest_bdev == device->bdev)
		fs_info->fs_devices->latest_bdev = next_device->bdev;
}

int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
		    u64 devid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 num_devices;
	int ret = 0;

	mutex_lock(&uuid_mutex);

	num_devices = fs_devices->num_devices;
	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		WARN_ON(num_devices < 1);
		num_devices--;
	}
	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);

	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
	if (ret)
		goto out;

	ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
					   &device);
	if (ret)
		goto out;

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    fs_info->fs_devices->rw_devices == 1) {
		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_del_init(&device->dev_alloc_list);
		device->fs_devices->rw_devices--;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	mutex_unlock(&uuid_mutex);
	ret = btrfs_shrink_device(device, 0);
	mutex_lock(&uuid_mutex);
	if (ret)
		goto error_undo;

	/*
	 * TODO: the superblock still includes this device in its num_devices
	 * counter although write_all_supers() is not locked out. This
	 * could give a filesystem state which requires a degraded mount.
	 */
	ret = btrfs_rm_dev_item(fs_info, device);
	if (ret)
		goto error_undo;

	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	btrfs_scrub_cancel_dev(fs_info, device);

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
	 * the device supers. Whoever is writing all supers, should
	 * lock the device list mutex before getting the number of
	 * devices in the super block (super_copy). Conversely,
	 * whoever updates the number of devices in the super block
	 * (super_copy) should hold the device list mutex.
	 */

	/*
	 * In normal cases cur_devices == fs_devices. But when deleting a seed
	 * device, cur_devices should point to the seed's own fs_devices,
	 * listed under fs_devices->seed.
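	 *
	 * (fs_devices still refers to the mounted filesystem's devices here,
	 * which is why total_devices is decremented on both structures below
	 * when they differ.)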
	 */
	cur_devices = device->fs_devices;
	mutex_lock(&fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	cur_devices->num_devices--;
	cur_devices->total_devices--;
	/* Update total_devices of the parent fs_devices if it's seed */
	if (cur_devices != fs_devices)
		fs_devices->total_devices--;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		cur_devices->missing_devices--;

	btrfs_assign_next_active_device(device, NULL);

	if (device->bdev) {
		cur_devices->open_devices--;
		/* remove sysfs entry */
		btrfs_sysfs_rm_device_link(fs_devices, device);
	}

	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * at this point, the device is zero sized and detached from
	 * the devices list. All that's left is to zero out the old
	 * supers and free the device.
	 */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		btrfs_scratch_superblocks(device->bdev, device->name->str);

	btrfs_close_bdev(device);
	call_rcu(&device->rcu, free_device_rcu);

	if (cur_devices->open_devices == 0) {
		while (fs_devices) {
			if (fs_devices->seed == cur_devices) {
				fs_devices->seed = cur_devices->seed;
				break;
			}
			fs_devices = fs_devices->seed;
		}
		cur_devices->seed = NULL;
		close_fs_devices(cur_devices);
		free_fs_devices(cur_devices);
	}

out:
	mutex_unlock(&uuid_mutex);
	return ret;

error_undo:
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_add(&device->dev_alloc_list,
			 &fs_devices->alloc_list);
		device->fs_devices->rw_devices++;
		mutex_unlock(&fs_info->chunk_mutex);
	}
	goto out;
}

void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices;

	lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);

	/*
	 * in case of fs with no seed, srcdev->fs_devices will point
	 * to the fs_devices of fs_info. However when the dev being replaced
	 * is a seed dev it will point to the seed's local fs_devices. In
	 * short, srcdev will have its correct fs_devices in both cases.
2011 */
2012 fs_devices = srcdev->fs_devices;
2013
2014 list_del_rcu(&srcdev->dev_list);
2015 list_del(&srcdev->dev_alloc_list);
2016 fs_devices->num_devices--;
2017 if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2018 fs_devices->missing_devices--;
2019
2020 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2021 fs_devices->rw_devices--;
2022
2023 if (srcdev->bdev)
2024 fs_devices->open_devices--;
2025 }
2026
2027 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2028 struct btrfs_device *srcdev)
2029 {
2030 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2031
2032 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
2033 /* zero out the old super if it is writable */
2034 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2035 }
2036
2037 btrfs_close_bdev(srcdev);
2038 call_rcu(&srcdev->rcu, free_device_rcu);
2039
2040 /* if there are no devs left we'd rather delete the fs_devices */
2041 if (!fs_devices->num_devices) {
2042 struct btrfs_fs_devices *tmp_fs_devices;
2043
2044 /*
2045 * On a mounted FS, num_devices can't be zero unless it's a
2046 * seed. In case of a seed device being replaced, the replace
2047 * target is added to the sprout FS, so there will be no more
2048 * device left under the seed FS.
2049 */
2050 ASSERT(fs_devices->seeding);
2051
2052 tmp_fs_devices = fs_info->fs_devices;
2053 while (tmp_fs_devices) {
2054 if (tmp_fs_devices->seed == fs_devices) {
2055 tmp_fs_devices->seed = fs_devices->seed;
2056 break;
2057 }
2058 tmp_fs_devices = tmp_fs_devices->seed;
2059 }
2060 fs_devices->seed = NULL;
2061 close_fs_devices(fs_devices);
2062 free_fs_devices(fs_devices);
2063 }
2064 }
2065
2066 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2067 {
2068 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2069
2070 WARN_ON(!tgtdev);
2071 mutex_lock(&fs_devices->device_list_mutex);
2072
2073 btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
2074
2075 if (tgtdev->bdev)
2076 fs_devices->open_devices--;
2077
2078 fs_devices->num_devices--;
2079
2080 btrfs_assign_next_active_device(tgtdev, NULL);
2081
2082 list_del_rcu(&tgtdev->dev_list);
2083
2084 mutex_unlock(&fs_devices->device_list_mutex);
2085
2086 /*
2087 * The update_dev_time() within btrfs_scratch_superblocks()
2088 * may lead to a call to btrfs_show_devname() which will try
2089 * to hold device_list_mutex. And here this device
2090 * is already out of the device list, so we don't have to hold
2091 * the device_list_mutex lock.
2092 */ 2093 btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); 2094 2095 btrfs_close_bdev(tgtdev); 2096 call_rcu(&tgtdev->rcu, free_device_rcu); 2097 } 2098 2099 static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, 2100 const char *device_path, 2101 struct btrfs_device **device) 2102 { 2103 int ret = 0; 2104 struct btrfs_super_block *disk_super; 2105 u64 devid; 2106 u8 *dev_uuid; 2107 struct block_device *bdev; 2108 struct buffer_head *bh; 2109 2110 *device = NULL; 2111 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 2112 fs_info->bdev_holder, 0, &bdev, &bh); 2113 if (ret) 2114 return ret; 2115 disk_super = (struct btrfs_super_block *)bh->b_data; 2116 devid = btrfs_stack_device_id(&disk_super->dev_item); 2117 dev_uuid = disk_super->dev_item.uuid; 2118 *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); 2119 brelse(bh); 2120 if (!*device) 2121 ret = -ENOENT; 2122 blkdev_put(bdev, FMODE_READ); 2123 return ret; 2124 } 2125 2126 int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, 2127 const char *device_path, 2128 struct btrfs_device **device) 2129 { 2130 *device = NULL; 2131 if (strcmp(device_path, "missing") == 0) { 2132 struct list_head *devices; 2133 struct btrfs_device *tmp; 2134 2135 devices = &fs_info->fs_devices->devices; 2136 list_for_each_entry(tmp, devices, dev_list) { 2137 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 2138 &tmp->dev_state) && !tmp->bdev) { 2139 *device = tmp; 2140 break; 2141 } 2142 } 2143 2144 if (!*device) 2145 return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2146 2147 return 0; 2148 } else { 2149 return btrfs_find_device_by_path(fs_info, device_path, device); 2150 } 2151 } 2152 2153 /* 2154 * Lookup a device given by device id, or the path if the id is 0. 2155 */ 2156 int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, 2157 const char *devpath, 2158 struct btrfs_device **device) 2159 { 2160 int ret; 2161 2162 if (devid) { 2163 ret = 0; 2164 *device = btrfs_find_device(fs_info, devid, NULL, NULL); 2165 if (!*device) 2166 ret = -ENOENT; 2167 } else { 2168 if (!devpath || !devpath[0]) 2169 return -EINVAL; 2170 2171 ret = btrfs_find_device_missing_or_by_path(fs_info, devpath, 2172 device); 2173 } 2174 return ret; 2175 } 2176 2177 /* 2178 * does all the dirty work required for changing file system's UUID. 
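 *
 * All currently attached devices are moved onto a newly allocated
 * fs_devices that becomes the seed (linked through fs_devices->seed),
 * the counters of the mounted fs_devices are reset, a fresh fsid is
 * generated for the sprout and the SEEDING flag is cleared from the
 * superblock copy.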
2179 */ 2180 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) 2181 { 2182 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2183 struct btrfs_fs_devices *old_devices; 2184 struct btrfs_fs_devices *seed_devices; 2185 struct btrfs_super_block *disk_super = fs_info->super_copy; 2186 struct btrfs_device *device; 2187 u64 super_flags; 2188 2189 lockdep_assert_held(&uuid_mutex); 2190 if (!fs_devices->seeding) 2191 return -EINVAL; 2192 2193 seed_devices = alloc_fs_devices(NULL); 2194 if (IS_ERR(seed_devices)) 2195 return PTR_ERR(seed_devices); 2196 2197 old_devices = clone_fs_devices(fs_devices); 2198 if (IS_ERR(old_devices)) { 2199 kfree(seed_devices); 2200 return PTR_ERR(old_devices); 2201 } 2202 2203 list_add(&old_devices->fs_list, &fs_uuids); 2204 2205 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2206 seed_devices->opened = 1; 2207 INIT_LIST_HEAD(&seed_devices->devices); 2208 INIT_LIST_HEAD(&seed_devices->alloc_list); 2209 mutex_init(&seed_devices->device_list_mutex); 2210 2211 mutex_lock(&fs_devices->device_list_mutex); 2212 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 2213 synchronize_rcu); 2214 list_for_each_entry(device, &seed_devices->devices, dev_list) 2215 device->fs_devices = seed_devices; 2216 2217 mutex_lock(&fs_info->chunk_mutex); 2218 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 2219 mutex_unlock(&fs_info->chunk_mutex); 2220 2221 fs_devices->seeding = 0; 2222 fs_devices->num_devices = 0; 2223 fs_devices->open_devices = 0; 2224 fs_devices->missing_devices = 0; 2225 fs_devices->rotating = 0; 2226 fs_devices->seed = seed_devices; 2227 2228 generate_random_uuid(fs_devices->fsid); 2229 memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2230 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2231 mutex_unlock(&fs_devices->device_list_mutex); 2232 2233 super_flags = btrfs_super_flags(disk_super) & 2234 ~BTRFS_SUPER_FLAG_SEEDING; 2235 btrfs_set_super_flags(disk_super, super_flags); 2236 2237 return 0; 2238 } 2239 2240 /* 2241 * Store the expected generation for seed devices in device items. 
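 *
 * Walks the DEV_ITEMs in the chunk tree and, for every device that
 * still belongs to a seeding fs_devices, writes the in-memory
 * generation into the on-disk item.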
2242 */ 2243 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 2244 struct btrfs_fs_info *fs_info) 2245 { 2246 struct btrfs_root *root = fs_info->chunk_root; 2247 struct btrfs_path *path; 2248 struct extent_buffer *leaf; 2249 struct btrfs_dev_item *dev_item; 2250 struct btrfs_device *device; 2251 struct btrfs_key key; 2252 u8 fs_uuid[BTRFS_FSID_SIZE]; 2253 u8 dev_uuid[BTRFS_UUID_SIZE]; 2254 u64 devid; 2255 int ret; 2256 2257 path = btrfs_alloc_path(); 2258 if (!path) 2259 return -ENOMEM; 2260 2261 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2262 key.offset = 0; 2263 key.type = BTRFS_DEV_ITEM_KEY; 2264 2265 while (1) { 2266 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2267 if (ret < 0) 2268 goto error; 2269 2270 leaf = path->nodes[0]; 2271 next_slot: 2272 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2273 ret = btrfs_next_leaf(root, path); 2274 if (ret > 0) 2275 break; 2276 if (ret < 0) 2277 goto error; 2278 leaf = path->nodes[0]; 2279 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2280 btrfs_release_path(path); 2281 continue; 2282 } 2283 2284 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2285 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2286 key.type != BTRFS_DEV_ITEM_KEY) 2287 break; 2288 2289 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2290 struct btrfs_dev_item); 2291 devid = btrfs_device_id(leaf, dev_item); 2292 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2293 BTRFS_UUID_SIZE); 2294 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2295 BTRFS_FSID_SIZE); 2296 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 2297 BUG_ON(!device); /* Logic error */ 2298 2299 if (device->fs_devices->seeding) { 2300 btrfs_set_device_generation(leaf, dev_item, 2301 device->generation); 2302 btrfs_mark_buffer_dirty(leaf); 2303 } 2304 2305 path->slots[0]++; 2306 goto next_slot; 2307 } 2308 ret = 0; 2309 error: 2310 btrfs_free_path(path); 2311 return ret; 2312 } 2313 2314 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2315 { 2316 struct btrfs_root *root = fs_info->dev_root; 2317 struct request_queue *q; 2318 struct btrfs_trans_handle *trans; 2319 struct btrfs_device *device; 2320 struct block_device *bdev; 2321 struct super_block *sb = fs_info->sb; 2322 struct rcu_string *name; 2323 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2324 u64 orig_super_total_bytes; 2325 u64 orig_super_num_devices; 2326 int seeding_dev = 0; 2327 int ret = 0; 2328 bool unlocked = false; 2329 2330 if (sb_rdonly(sb) && !fs_devices->seeding) 2331 return -EROFS; 2332 2333 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2334 fs_info->bdev_holder); 2335 if (IS_ERR(bdev)) 2336 return PTR_ERR(bdev); 2337 2338 if (fs_devices->seeding) { 2339 seeding_dev = 1; 2340 down_write(&sb->s_umount); 2341 mutex_lock(&uuid_mutex); 2342 } 2343 2344 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2345 2346 mutex_lock(&fs_devices->device_list_mutex); 2347 list_for_each_entry(device, &fs_devices->devices, dev_list) { 2348 if (device->bdev == bdev) { 2349 ret = -EEXIST; 2350 mutex_unlock( 2351 &fs_devices->device_list_mutex); 2352 goto error; 2353 } 2354 } 2355 mutex_unlock(&fs_devices->device_list_mutex); 2356 2357 device = btrfs_alloc_device(fs_info, NULL, NULL); 2358 if (IS_ERR(device)) { 2359 /* we can safely leave the fs_devices entry around */ 2360 ret = PTR_ERR(device); 2361 goto error; 2362 } 2363 2364 name = rcu_string_strdup(device_path, GFP_KERNEL); 2365 if (!name) { 2366 ret = -ENOMEM; 2367 
goto error_free_device; 2368 } 2369 rcu_assign_pointer(device->name, name); 2370 2371 trans = btrfs_start_transaction(root, 0); 2372 if (IS_ERR(trans)) { 2373 ret = PTR_ERR(trans); 2374 goto error_free_device; 2375 } 2376 2377 q = bdev_get_queue(bdev); 2378 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2379 device->generation = trans->transid; 2380 device->io_width = fs_info->sectorsize; 2381 device->io_align = fs_info->sectorsize; 2382 device->sector_size = fs_info->sectorsize; 2383 device->total_bytes = round_down(i_size_read(bdev->bd_inode), 2384 fs_info->sectorsize); 2385 device->disk_total_bytes = device->total_bytes; 2386 device->commit_total_bytes = device->total_bytes; 2387 device->fs_info = fs_info; 2388 device->bdev = bdev; 2389 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2390 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2391 device->mode = FMODE_EXCL; 2392 device->dev_stats_valid = 1; 2393 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2394 2395 if (seeding_dev) { 2396 sb->s_flags &= ~SB_RDONLY; 2397 ret = btrfs_prepare_sprout(fs_info); 2398 if (ret) { 2399 btrfs_abort_transaction(trans, ret); 2400 goto error_trans; 2401 } 2402 } 2403 2404 device->fs_devices = fs_devices; 2405 2406 mutex_lock(&fs_devices->device_list_mutex); 2407 mutex_lock(&fs_info->chunk_mutex); 2408 list_add_rcu(&device->dev_list, &fs_devices->devices); 2409 list_add(&device->dev_alloc_list, &fs_devices->alloc_list); 2410 fs_devices->num_devices++; 2411 fs_devices->open_devices++; 2412 fs_devices->rw_devices++; 2413 fs_devices->total_devices++; 2414 fs_devices->total_rw_bytes += device->total_bytes; 2415 2416 atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 2417 2418 if (!blk_queue_nonrot(q)) 2419 fs_devices->rotating = 1; 2420 2421 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 2422 btrfs_set_super_total_bytes(fs_info->super_copy, 2423 round_down(orig_super_total_bytes + device->total_bytes, 2424 fs_info->sectorsize)); 2425 2426 orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); 2427 btrfs_set_super_num_devices(fs_info->super_copy, 2428 orig_super_num_devices + 1); 2429 2430 /* add sysfs device entry */ 2431 btrfs_sysfs_add_device_link(fs_devices, device); 2432 2433 /* 2434 * we've got more storage, clear any full flags on the space 2435 * infos 2436 */ 2437 btrfs_clear_space_info_full(fs_info); 2438 2439 mutex_unlock(&fs_info->chunk_mutex); 2440 mutex_unlock(&fs_devices->device_list_mutex); 2441 2442 if (seeding_dev) { 2443 mutex_lock(&fs_info->chunk_mutex); 2444 ret = init_first_rw_device(trans, fs_info); 2445 mutex_unlock(&fs_info->chunk_mutex); 2446 if (ret) { 2447 btrfs_abort_transaction(trans, ret); 2448 goto error_sysfs; 2449 } 2450 } 2451 2452 ret = btrfs_add_dev_item(trans, device); 2453 if (ret) { 2454 btrfs_abort_transaction(trans, ret); 2455 goto error_sysfs; 2456 } 2457 2458 if (seeding_dev) { 2459 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; 2460 2461 ret = btrfs_finish_sprout(trans, fs_info); 2462 if (ret) { 2463 btrfs_abort_transaction(trans, ret); 2464 goto error_sysfs; 2465 } 2466 2467 /* Sprouting would change fsid of the mounted root, 2468 * so rename the fsid on the sysfs 2469 */ 2470 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", 2471 fs_info->fsid); 2472 if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) 2473 btrfs_warn(fs_info, 2474 "sysfs: failed to create fsid for sprout"); 2475 } 2476 2477 ret = btrfs_commit_transaction(trans); 2478 2479 if (seeding_dev) { 2480 
mutex_unlock(&uuid_mutex); 2481 up_write(&sb->s_umount); 2482 unlocked = true; 2483 2484 if (ret) /* transaction commit */ 2485 return ret; 2486 2487 ret = btrfs_relocate_sys_chunks(fs_info); 2488 if (ret < 0) 2489 btrfs_handle_fs_error(fs_info, ret, 2490 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2491 trans = btrfs_attach_transaction(root); 2492 if (IS_ERR(trans)) { 2493 if (PTR_ERR(trans) == -ENOENT) 2494 return 0; 2495 ret = PTR_ERR(trans); 2496 trans = NULL; 2497 goto error_sysfs; 2498 } 2499 ret = btrfs_commit_transaction(trans); 2500 } 2501 2502 /* Update ctime/mtime for libblkid */ 2503 update_dev_time(device_path); 2504 return ret; 2505 2506 error_sysfs: 2507 btrfs_sysfs_rm_device_link(fs_devices, device); 2508 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2509 mutex_lock(&fs_info->chunk_mutex); 2510 list_del_rcu(&device->dev_list); 2511 list_del(&device->dev_alloc_list); 2512 fs_info->fs_devices->num_devices--; 2513 fs_info->fs_devices->open_devices--; 2514 fs_info->fs_devices->rw_devices--; 2515 fs_info->fs_devices->total_devices--; 2516 fs_info->fs_devices->total_rw_bytes -= device->total_bytes; 2517 atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); 2518 btrfs_set_super_total_bytes(fs_info->super_copy, 2519 orig_super_total_bytes); 2520 btrfs_set_super_num_devices(fs_info->super_copy, 2521 orig_super_num_devices); 2522 mutex_unlock(&fs_info->chunk_mutex); 2523 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2524 error_trans: 2525 if (seeding_dev) 2526 sb->s_flags |= SB_RDONLY; 2527 if (trans) 2528 btrfs_end_transaction(trans); 2529 error_free_device: 2530 btrfs_free_device(device); 2531 error: 2532 blkdev_put(bdev, FMODE_EXCL); 2533 if (seeding_dev && !unlocked) { 2534 mutex_unlock(&uuid_mutex); 2535 up_write(&sb->s_umount); 2536 } 2537 return ret; 2538 } 2539 2540 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2541 struct btrfs_device *device) 2542 { 2543 int ret; 2544 struct btrfs_path *path; 2545 struct btrfs_root *root = device->fs_info->chunk_root; 2546 struct btrfs_dev_item *dev_item; 2547 struct extent_buffer *leaf; 2548 struct btrfs_key key; 2549 2550 path = btrfs_alloc_path(); 2551 if (!path) 2552 return -ENOMEM; 2553 2554 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2555 key.type = BTRFS_DEV_ITEM_KEY; 2556 key.offset = device->devid; 2557 2558 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2559 if (ret < 0) 2560 goto out; 2561 2562 if (ret > 0) { 2563 ret = -ENOENT; 2564 goto out; 2565 } 2566 2567 leaf = path->nodes[0]; 2568 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2569 2570 btrfs_set_device_id(leaf, dev_item, device->devid); 2571 btrfs_set_device_type(leaf, dev_item, device->type); 2572 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2573 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2574 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2575 btrfs_set_device_total_bytes(leaf, dev_item, 2576 btrfs_device_get_disk_total_bytes(device)); 2577 btrfs_set_device_bytes_used(leaf, dev_item, 2578 btrfs_device_get_bytes_used(device)); 2579 btrfs_mark_buffer_dirty(leaf); 2580 2581 out: 2582 btrfs_free_path(path); 2583 return ret; 2584 } 2585 2586 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2587 struct btrfs_device *device, u64 new_size) 2588 { 2589 struct btrfs_fs_info *fs_info = device->fs_info; 2590 struct btrfs_super_block *super_copy = fs_info->super_copy; 
2591 struct btrfs_fs_devices *fs_devices; 2592 u64 old_total; 2593 u64 diff; 2594 2595 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2596 return -EACCES; 2597 2598 new_size = round_down(new_size, fs_info->sectorsize); 2599 2600 mutex_lock(&fs_info->chunk_mutex); 2601 old_total = btrfs_super_total_bytes(super_copy); 2602 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 2603 2604 if (new_size <= device->total_bytes || 2605 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2606 mutex_unlock(&fs_info->chunk_mutex); 2607 return -EINVAL; 2608 } 2609 2610 fs_devices = fs_info->fs_devices; 2611 2612 btrfs_set_super_total_bytes(super_copy, 2613 round_down(old_total + diff, fs_info->sectorsize)); 2614 device->fs_devices->total_rw_bytes += diff; 2615 2616 btrfs_device_set_total_bytes(device, new_size); 2617 btrfs_device_set_disk_total_bytes(device, new_size); 2618 btrfs_clear_space_info_full(device->fs_info); 2619 if (list_empty(&device->resized_list)) 2620 list_add_tail(&device->resized_list, 2621 &fs_devices->resized_devices); 2622 mutex_unlock(&fs_info->chunk_mutex); 2623 2624 return btrfs_update_device(trans, device); 2625 } 2626 2627 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2628 { 2629 struct btrfs_fs_info *fs_info = trans->fs_info; 2630 struct btrfs_root *root = fs_info->chunk_root; 2631 int ret; 2632 struct btrfs_path *path; 2633 struct btrfs_key key; 2634 2635 path = btrfs_alloc_path(); 2636 if (!path) 2637 return -ENOMEM; 2638 2639 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2640 key.offset = chunk_offset; 2641 key.type = BTRFS_CHUNK_ITEM_KEY; 2642 2643 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2644 if (ret < 0) 2645 goto out; 2646 else if (ret > 0) { /* Logic error or corruption */ 2647 btrfs_handle_fs_error(fs_info, -ENOENT, 2648 "Failed lookup while freeing chunk."); 2649 ret = -ENOENT; 2650 goto out; 2651 } 2652 2653 ret = btrfs_del_item(trans, root, path); 2654 if (ret < 0) 2655 btrfs_handle_fs_error(fs_info, ret, 2656 "Failed to delete chunk item."); 2657 out: 2658 btrfs_free_path(path); 2659 return ret; 2660 } 2661 2662 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2663 { 2664 struct btrfs_super_block *super_copy = fs_info->super_copy; 2665 struct btrfs_disk_key *disk_key; 2666 struct btrfs_chunk *chunk; 2667 u8 *ptr; 2668 int ret = 0; 2669 u32 num_stripes; 2670 u32 array_size; 2671 u32 len = 0; 2672 u32 cur; 2673 struct btrfs_key key; 2674 2675 mutex_lock(&fs_info->chunk_mutex); 2676 array_size = btrfs_super_sys_array_size(super_copy); 2677 2678 ptr = super_copy->sys_chunk_array; 2679 cur = 0; 2680 2681 while (cur < array_size) { 2682 disk_key = (struct btrfs_disk_key *)ptr; 2683 btrfs_disk_key_to_cpu(&key, disk_key); 2684 2685 len = sizeof(*disk_key); 2686 2687 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 2688 chunk = (struct btrfs_chunk *)(ptr + len); 2689 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 2690 len += btrfs_chunk_item_size(num_stripes); 2691 } else { 2692 ret = -EIO; 2693 break; 2694 } 2695 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 2696 key.offset == chunk_offset) { 2697 memmove(ptr, ptr + len, array_size - (cur + len)); 2698 array_size -= len; 2699 btrfs_set_super_sys_array_size(super_copy, array_size); 2700 } else { 2701 ptr += len; 2702 cur += len; 2703 } 2704 } 2705 mutex_unlock(&fs_info->chunk_mutex); 2706 return ret; 2707 } 2708 2709 static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, 2710 
u64 logical, u64 length) 2711 { 2712 struct extent_map_tree *em_tree; 2713 struct extent_map *em; 2714 2715 em_tree = &fs_info->mapping_tree.map_tree; 2716 read_lock(&em_tree->lock); 2717 em = lookup_extent_mapping(em_tree, logical, length); 2718 read_unlock(&em_tree->lock); 2719 2720 if (!em) { 2721 btrfs_crit(fs_info, "unable to find logical %llu length %llu", 2722 logical, length); 2723 return ERR_PTR(-EINVAL); 2724 } 2725 2726 if (em->start > logical || em->start + em->len < logical) { 2727 btrfs_crit(fs_info, 2728 "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 2729 logical, length, em->start, em->start + em->len); 2730 free_extent_map(em); 2731 return ERR_PTR(-EINVAL); 2732 } 2733 2734 /* callers are responsible for dropping em's ref. */ 2735 return em; 2736 } 2737 2738 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2739 { 2740 struct btrfs_fs_info *fs_info = trans->fs_info; 2741 struct extent_map *em; 2742 struct map_lookup *map; 2743 u64 dev_extent_len = 0; 2744 int i, ret = 0; 2745 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2746 2747 em = get_chunk_map(fs_info, chunk_offset, 1); 2748 if (IS_ERR(em)) { 2749 /* 2750 * This is a logic error, but we don't want to just rely on the 2751 * user having built with ASSERT enabled, so if ASSERT doesn't 2752 * do anything we still error out. 2753 */ 2754 ASSERT(0); 2755 return PTR_ERR(em); 2756 } 2757 map = em->map_lookup; 2758 mutex_lock(&fs_info->chunk_mutex); 2759 check_system_chunk(trans, map->type); 2760 mutex_unlock(&fs_info->chunk_mutex); 2761 2762 /* 2763 * Take the device list mutex to prevent races with the final phase of 2764 * a device replace operation that replaces the device object associated 2765 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). 
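 *
 * Holding it across the loop below also keeps the stripes' device
 * pointers stable while we free the per-stripe dev extents and update
 * the device items.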
2766 */ 2767 mutex_lock(&fs_devices->device_list_mutex); 2768 for (i = 0; i < map->num_stripes; i++) { 2769 struct btrfs_device *device = map->stripes[i].dev; 2770 ret = btrfs_free_dev_extent(trans, device, 2771 map->stripes[i].physical, 2772 &dev_extent_len); 2773 if (ret) { 2774 mutex_unlock(&fs_devices->device_list_mutex); 2775 btrfs_abort_transaction(trans, ret); 2776 goto out; 2777 } 2778 2779 if (device->bytes_used > 0) { 2780 mutex_lock(&fs_info->chunk_mutex); 2781 btrfs_device_set_bytes_used(device, 2782 device->bytes_used - dev_extent_len); 2783 atomic64_add(dev_extent_len, &fs_info->free_chunk_space); 2784 btrfs_clear_space_info_full(fs_info); 2785 mutex_unlock(&fs_info->chunk_mutex); 2786 } 2787 2788 if (map->stripes[i].dev) { 2789 ret = btrfs_update_device(trans, map->stripes[i].dev); 2790 if (ret) { 2791 mutex_unlock(&fs_devices->device_list_mutex); 2792 btrfs_abort_transaction(trans, ret); 2793 goto out; 2794 } 2795 } 2796 } 2797 mutex_unlock(&fs_devices->device_list_mutex); 2798 2799 ret = btrfs_free_chunk(trans, chunk_offset); 2800 if (ret) { 2801 btrfs_abort_transaction(trans, ret); 2802 goto out; 2803 } 2804 2805 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 2806 2807 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2808 ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 2809 if (ret) { 2810 btrfs_abort_transaction(trans, ret); 2811 goto out; 2812 } 2813 } 2814 2815 ret = btrfs_remove_block_group(trans, chunk_offset, em); 2816 if (ret) { 2817 btrfs_abort_transaction(trans, ret); 2818 goto out; 2819 } 2820 2821 out: 2822 /* once for us */ 2823 free_extent_map(em); 2824 return ret; 2825 } 2826 2827 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2828 { 2829 struct btrfs_root *root = fs_info->chunk_root; 2830 struct btrfs_trans_handle *trans; 2831 int ret; 2832 2833 /* 2834 * Prevent races with automatic removal of unused block groups. 2835 * After we relocate and before we remove the chunk with offset 2836 * chunk_offset, automatic removal of the block group can kick in, 2837 * resulting in a failure when calling btrfs_remove_chunk() below. 2838 * 2839 * Make sure to acquire this mutex before doing a tree search (dev 2840 * or chunk trees) to find chunks. Otherwise the cleaner kthread might 2841 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 2842 * we release the path used to search the chunk/dev tree and before 2843 * the current task acquires this mutex and calls us. 2844 */ 2845 lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); 2846 2847 ret = btrfs_can_relocate(fs_info, chunk_offset); 2848 if (ret) 2849 return -ENOSPC; 2850 2851 /* step one, relocate all the extents inside this chunk */ 2852 btrfs_scrub_pause(fs_info); 2853 ret = btrfs_relocate_block_group(fs_info, chunk_offset); 2854 btrfs_scrub_continue(fs_info); 2855 if (ret) 2856 return ret; 2857 2858 /* 2859 * We add the kobjects here (and after forcing data chunk creation) 2860 * since relocation is the only place we'll create chunks of a new 2861 * type at runtime. The only place where we'll remove the last 2862 * chunk of a type is the call immediately below this one. Even 2863 * so, we're protected against races with the cleaner thread since 2864 * we're covered by the delete_unused_bgs_mutex. 
2865 */ 2866 btrfs_add_raid_kobjects(fs_info); 2867 2868 trans = btrfs_start_trans_remove_block_group(root->fs_info, 2869 chunk_offset); 2870 if (IS_ERR(trans)) { 2871 ret = PTR_ERR(trans); 2872 btrfs_handle_fs_error(root->fs_info, ret, NULL); 2873 return ret; 2874 } 2875 2876 /* 2877 * step two, delete the device extents and the 2878 * chunk tree entries 2879 */ 2880 ret = btrfs_remove_chunk(trans, chunk_offset); 2881 btrfs_end_transaction(trans); 2882 return ret; 2883 } 2884 2885 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 2886 { 2887 struct btrfs_root *chunk_root = fs_info->chunk_root; 2888 struct btrfs_path *path; 2889 struct extent_buffer *leaf; 2890 struct btrfs_chunk *chunk; 2891 struct btrfs_key key; 2892 struct btrfs_key found_key; 2893 u64 chunk_type; 2894 bool retried = false; 2895 int failed = 0; 2896 int ret; 2897 2898 path = btrfs_alloc_path(); 2899 if (!path) 2900 return -ENOMEM; 2901 2902 again: 2903 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2904 key.offset = (u64)-1; 2905 key.type = BTRFS_CHUNK_ITEM_KEY; 2906 2907 while (1) { 2908 mutex_lock(&fs_info->delete_unused_bgs_mutex); 2909 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2910 if (ret < 0) { 2911 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 2912 goto error; 2913 } 2914 BUG_ON(ret == 0); /* Corruption */ 2915 2916 ret = btrfs_previous_item(chunk_root, path, key.objectid, 2917 key.type); 2918 if (ret) 2919 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 2920 if (ret < 0) 2921 goto error; 2922 if (ret > 0) 2923 break; 2924 2925 leaf = path->nodes[0]; 2926 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2927 2928 chunk = btrfs_item_ptr(leaf, path->slots[0], 2929 struct btrfs_chunk); 2930 chunk_type = btrfs_chunk_type(leaf, chunk); 2931 btrfs_release_path(path); 2932 2933 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 2934 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 2935 if (ret == -ENOSPC) 2936 failed++; 2937 else 2938 BUG_ON(ret); 2939 } 2940 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 2941 2942 if (found_key.offset == 0) 2943 break; 2944 key.offset = found_key.offset - 1; 2945 } 2946 ret = 0; 2947 if (failed && !retried) { 2948 failed = 0; 2949 retried = true; 2950 goto again; 2951 } else if (WARN_ON(failed && retried)) { 2952 ret = -ENOSPC; 2953 } 2954 error: 2955 btrfs_free_path(path); 2956 return ret; 2957 } 2958 2959 /* 2960 * return 1 : allocate a data chunk successfully, 2961 * return <0: errors during allocating a data chunk, 2962 * return 0 : no need to allocate a data chunk. 
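 *
 * Called before relocating a chunk: if it is a data chunk and no data
 * bytes are in use at all, an empty data chunk is allocated up front so
 * that removing the last one does not lose the data raid profile.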
2963 */ 2964 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 2965 u64 chunk_offset) 2966 { 2967 struct btrfs_block_group_cache *cache; 2968 u64 bytes_used; 2969 u64 chunk_type; 2970 2971 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2972 ASSERT(cache); 2973 chunk_type = cache->flags; 2974 btrfs_put_block_group(cache); 2975 2976 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) { 2977 spin_lock(&fs_info->data_sinfo->lock); 2978 bytes_used = fs_info->data_sinfo->bytes_used; 2979 spin_unlock(&fs_info->data_sinfo->lock); 2980 2981 if (!bytes_used) { 2982 struct btrfs_trans_handle *trans; 2983 int ret; 2984 2985 trans = btrfs_join_transaction(fs_info->tree_root); 2986 if (IS_ERR(trans)) 2987 return PTR_ERR(trans); 2988 2989 ret = btrfs_force_chunk_alloc(trans, 2990 BTRFS_BLOCK_GROUP_DATA); 2991 btrfs_end_transaction(trans); 2992 if (ret < 0) 2993 return ret; 2994 2995 btrfs_add_raid_kobjects(fs_info); 2996 2997 return 1; 2998 } 2999 } 3000 return 0; 3001 } 3002 3003 static int insert_balance_item(struct btrfs_fs_info *fs_info, 3004 struct btrfs_balance_control *bctl) 3005 { 3006 struct btrfs_root *root = fs_info->tree_root; 3007 struct btrfs_trans_handle *trans; 3008 struct btrfs_balance_item *item; 3009 struct btrfs_disk_balance_args disk_bargs; 3010 struct btrfs_path *path; 3011 struct extent_buffer *leaf; 3012 struct btrfs_key key; 3013 int ret, err; 3014 3015 path = btrfs_alloc_path(); 3016 if (!path) 3017 return -ENOMEM; 3018 3019 trans = btrfs_start_transaction(root, 0); 3020 if (IS_ERR(trans)) { 3021 btrfs_free_path(path); 3022 return PTR_ERR(trans); 3023 } 3024 3025 key.objectid = BTRFS_BALANCE_OBJECTID; 3026 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3027 key.offset = 0; 3028 3029 ret = btrfs_insert_empty_item(trans, root, path, &key, 3030 sizeof(*item)); 3031 if (ret) 3032 goto out; 3033 3034 leaf = path->nodes[0]; 3035 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3036 3037 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3038 3039 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 3040 btrfs_set_balance_data(leaf, item, &disk_bargs); 3041 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 3042 btrfs_set_balance_meta(leaf, item, &disk_bargs); 3043 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 3044 btrfs_set_balance_sys(leaf, item, &disk_bargs); 3045 3046 btrfs_set_balance_flags(leaf, item, bctl->flags); 3047 3048 btrfs_mark_buffer_dirty(leaf); 3049 out: 3050 btrfs_free_path(path); 3051 err = btrfs_commit_transaction(trans); 3052 if (err && !ret) 3053 ret = err; 3054 return ret; 3055 } 3056 3057 static int del_balance_item(struct btrfs_fs_info *fs_info) 3058 { 3059 struct btrfs_root *root = fs_info->tree_root; 3060 struct btrfs_trans_handle *trans; 3061 struct btrfs_path *path; 3062 struct btrfs_key key; 3063 int ret, err; 3064 3065 path = btrfs_alloc_path(); 3066 if (!path) 3067 return -ENOMEM; 3068 3069 trans = btrfs_start_transaction(root, 0); 3070 if (IS_ERR(trans)) { 3071 btrfs_free_path(path); 3072 return PTR_ERR(trans); 3073 } 3074 3075 key.objectid = BTRFS_BALANCE_OBJECTID; 3076 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3077 key.offset = 0; 3078 3079 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3080 if (ret < 0) 3081 goto out; 3082 if (ret > 0) { 3083 ret = -ENOENT; 3084 goto out; 3085 } 3086 3087 ret = btrfs_del_item(trans, root, path); 3088 out: 3089 btrfs_free_path(path); 3090 err = btrfs_commit_transaction(trans); 3091 if (err && !ret) 3092 ret = err; 3093 return ret; 3094 } 3095 
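/*
 * The balance item lives at (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0)
 * in the tree root. insert_balance_item() and del_balance_item() above keep the
 * on-disk copy in sync with fs_info->balance_ctl so that an interrupted balance
 * can be found again by btrfs_recover_balance() at mount time.
 */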
3096 /*
3097 * This is a heuristic used to reduce the number of chunks balanced on
3098 * resume after balance was interrupted.
3099 */
3100 static void update_balance_args(struct btrfs_balance_control *bctl)
3101 {
3102 /*
3103 * Turn on soft mode for chunk types that were being converted.
3104 */
3105 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3106 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3107 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3108 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3109 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3110 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3111
3112 /*
3113 * Turn on the usage filter if it is not already used. The idea is
3114 * that chunks that we have already balanced should be
3115 * reasonably full. Don't do it for chunks that are being
3116 * converted - that will keep us from relocating unconverted
3117 * (albeit full) chunks.
3118 */
3119 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3120 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3121 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3122 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3123 bctl->data.usage = 90;
3124 }
3125 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3126 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3127 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3128 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3129 bctl->sys.usage = 90;
3130 }
3131 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3132 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3133 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3134 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3135 bctl->meta.usage = 90;
3136 }
3137 }
3138
3139 /*
3140 * Clear the balance status in fs_info and delete the balance item from disk.
3141 */
3142 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3143 {
3144 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3145 int ret;
3146
3147 BUG_ON(!fs_info->balance_ctl);
3148
3149 spin_lock(&fs_info->balance_lock);
3150 fs_info->balance_ctl = NULL;
3151 spin_unlock(&fs_info->balance_lock);
3152
3153 kfree(bctl);
3154 ret = del_balance_item(fs_info);
3155 if (ret)
3156 btrfs_handle_fs_error(fs_info, ret, NULL);
3157 }
3158
3159 /*
3160 * Balance filters. Return 1 if chunk should be filtered out
3161 * (should not be balanced).
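 *
 * Each filter gets the chunk type, offset or on-disk chunk item
 * together with the user supplied btrfs_balance_args;
 * should_balance_chunk() chains them all.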
3162 */ 3163 static int chunk_profiles_filter(u64 chunk_type, 3164 struct btrfs_balance_args *bargs) 3165 { 3166 chunk_type = chunk_to_extended(chunk_type) & 3167 BTRFS_EXTENDED_PROFILE_MASK; 3168 3169 if (bargs->profiles & chunk_type) 3170 return 0; 3171 3172 return 1; 3173 } 3174 3175 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3176 struct btrfs_balance_args *bargs) 3177 { 3178 struct btrfs_block_group_cache *cache; 3179 u64 chunk_used; 3180 u64 user_thresh_min; 3181 u64 user_thresh_max; 3182 int ret = 1; 3183 3184 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3185 chunk_used = btrfs_block_group_used(&cache->item); 3186 3187 if (bargs->usage_min == 0) 3188 user_thresh_min = 0; 3189 else 3190 user_thresh_min = div_factor_fine(cache->key.offset, 3191 bargs->usage_min); 3192 3193 if (bargs->usage_max == 0) 3194 user_thresh_max = 1; 3195 else if (bargs->usage_max > 100) 3196 user_thresh_max = cache->key.offset; 3197 else 3198 user_thresh_max = div_factor_fine(cache->key.offset, 3199 bargs->usage_max); 3200 3201 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3202 ret = 0; 3203 3204 btrfs_put_block_group(cache); 3205 return ret; 3206 } 3207 3208 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3209 u64 chunk_offset, struct btrfs_balance_args *bargs) 3210 { 3211 struct btrfs_block_group_cache *cache; 3212 u64 chunk_used, user_thresh; 3213 int ret = 1; 3214 3215 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3216 chunk_used = btrfs_block_group_used(&cache->item); 3217 3218 if (bargs->usage_min == 0) 3219 user_thresh = 1; 3220 else if (bargs->usage > 100) 3221 user_thresh = cache->key.offset; 3222 else 3223 user_thresh = div_factor_fine(cache->key.offset, 3224 bargs->usage); 3225 3226 if (chunk_used < user_thresh) 3227 ret = 0; 3228 3229 btrfs_put_block_group(cache); 3230 return ret; 3231 } 3232 3233 static int chunk_devid_filter(struct extent_buffer *leaf, 3234 struct btrfs_chunk *chunk, 3235 struct btrfs_balance_args *bargs) 3236 { 3237 struct btrfs_stripe *stripe; 3238 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3239 int i; 3240 3241 for (i = 0; i < num_stripes; i++) { 3242 stripe = btrfs_stripe_nr(chunk, i); 3243 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3244 return 0; 3245 } 3246 3247 return 1; 3248 } 3249 3250 /* [pstart, pend) */ 3251 static int chunk_drange_filter(struct extent_buffer *leaf, 3252 struct btrfs_chunk *chunk, 3253 struct btrfs_balance_args *bargs) 3254 { 3255 struct btrfs_stripe *stripe; 3256 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3257 u64 stripe_offset; 3258 u64 stripe_length; 3259 int factor; 3260 int i; 3261 3262 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3263 return 0; 3264 3265 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 3266 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { 3267 factor = num_stripes / 2; 3268 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { 3269 factor = num_stripes - 1; 3270 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { 3271 factor = num_stripes - 2; 3272 } else { 3273 factor = num_stripes; 3274 } 3275 3276 for (i = 0; i < num_stripes; i++) { 3277 stripe = btrfs_stripe_nr(chunk, i); 3278 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3279 continue; 3280 3281 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3282 stripe_length = btrfs_chunk_length(leaf, chunk); 3283 stripe_length = div_u64(stripe_length, factor); 3284 3285 if (stripe_offset 
< bargs->pend && 3286 stripe_offset + stripe_length > bargs->pstart) 3287 return 0; 3288 } 3289 3290 return 1; 3291 } 3292 3293 /* [vstart, vend) */ 3294 static int chunk_vrange_filter(struct extent_buffer *leaf, 3295 struct btrfs_chunk *chunk, 3296 u64 chunk_offset, 3297 struct btrfs_balance_args *bargs) 3298 { 3299 if (chunk_offset < bargs->vend && 3300 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3301 /* at least part of the chunk is inside this vrange */ 3302 return 0; 3303 3304 return 1; 3305 } 3306 3307 static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3308 struct btrfs_chunk *chunk, 3309 struct btrfs_balance_args *bargs) 3310 { 3311 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3312 3313 if (bargs->stripes_min <= num_stripes 3314 && num_stripes <= bargs->stripes_max) 3315 return 0; 3316 3317 return 1; 3318 } 3319 3320 static int chunk_soft_convert_filter(u64 chunk_type, 3321 struct btrfs_balance_args *bargs) 3322 { 3323 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3324 return 0; 3325 3326 chunk_type = chunk_to_extended(chunk_type) & 3327 BTRFS_EXTENDED_PROFILE_MASK; 3328 3329 if (bargs->target == chunk_type) 3330 return 1; 3331 3332 return 0; 3333 } 3334 3335 static int should_balance_chunk(struct btrfs_fs_info *fs_info, 3336 struct extent_buffer *leaf, 3337 struct btrfs_chunk *chunk, u64 chunk_offset) 3338 { 3339 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3340 struct btrfs_balance_args *bargs = NULL; 3341 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3342 3343 /* type filter */ 3344 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3345 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3346 return 0; 3347 } 3348 3349 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3350 bargs = &bctl->data; 3351 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3352 bargs = &bctl->sys; 3353 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3354 bargs = &bctl->meta; 3355 3356 /* profiles filter */ 3357 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3358 chunk_profiles_filter(chunk_type, bargs)) { 3359 return 0; 3360 } 3361 3362 /* usage filter */ 3363 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3364 chunk_usage_filter(fs_info, chunk_offset, bargs)) { 3365 return 0; 3366 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3367 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3368 return 0; 3369 } 3370 3371 /* devid filter */ 3372 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3373 chunk_devid_filter(leaf, chunk, bargs)) { 3374 return 0; 3375 } 3376 3377 /* drange filter, makes sense only with devid filter */ 3378 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3379 chunk_drange_filter(leaf, chunk, bargs)) { 3380 return 0; 3381 } 3382 3383 /* vrange filter */ 3384 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3385 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3386 return 0; 3387 } 3388 3389 /* stripes filter */ 3390 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3391 chunk_stripes_range_filter(leaf, chunk, bargs)) { 3392 return 0; 3393 } 3394 3395 /* soft profile changing mode */ 3396 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3397 chunk_soft_convert_filter(chunk_type, bargs)) { 3398 return 0; 3399 } 3400 3401 /* 3402 * limited by count, must be the last filter 3403 */ 3404 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3405 if (bargs->limit == 0) 3406 return 0; 3407 else 3408 bargs->limit--; 3409 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 3410 /* 3411 * Same logic 
as the 'limit' filter; the minimum cannot be 3412 * determined here because we do not have the global information 3413 * about the count of all chunks that satisfy the filters. 3414 */ 3415 if (bargs->limit_max == 0) 3416 return 0; 3417 else 3418 bargs->limit_max--; 3419 } 3420 3421 return 1; 3422 } 3423 3424 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3425 { 3426 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3427 struct btrfs_root *chunk_root = fs_info->chunk_root; 3428 struct btrfs_root *dev_root = fs_info->dev_root; 3429 struct list_head *devices; 3430 struct btrfs_device *device; 3431 u64 old_size; 3432 u64 size_to_free; 3433 u64 chunk_type; 3434 struct btrfs_chunk *chunk; 3435 struct btrfs_path *path = NULL; 3436 struct btrfs_key key; 3437 struct btrfs_key found_key; 3438 struct btrfs_trans_handle *trans; 3439 struct extent_buffer *leaf; 3440 int slot; 3441 int ret; 3442 int enospc_errors = 0; 3443 bool counting = true; 3444 /* The single value limit and min/max limits use the same bytes in the */ 3445 u64 limit_data = bctl->data.limit; 3446 u64 limit_meta = bctl->meta.limit; 3447 u64 limit_sys = bctl->sys.limit; 3448 u32 count_data = 0; 3449 u32 count_meta = 0; 3450 u32 count_sys = 0; 3451 int chunk_reserved = 0; 3452 3453 /* step one make some room on all the devices */ 3454 devices = &fs_info->fs_devices->devices; 3455 list_for_each_entry(device, devices, dev_list) { 3456 old_size = btrfs_device_get_total_bytes(device); 3457 size_to_free = div_factor(old_size, 1); 3458 size_to_free = min_t(u64, size_to_free, SZ_1M); 3459 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) || 3460 btrfs_device_get_total_bytes(device) - 3461 btrfs_device_get_bytes_used(device) > size_to_free || 3462 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 3463 continue; 3464 3465 ret = btrfs_shrink_device(device, old_size - size_to_free); 3466 if (ret == -ENOSPC) 3467 break; 3468 if (ret) { 3469 /* btrfs_shrink_device never returns ret > 0 */ 3470 WARN_ON(ret > 0); 3471 goto error; 3472 } 3473 3474 trans = btrfs_start_transaction(dev_root, 0); 3475 if (IS_ERR(trans)) { 3476 ret = PTR_ERR(trans); 3477 btrfs_info_in_rcu(fs_info, 3478 "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", 3479 rcu_str_deref(device->name), ret, 3480 old_size, old_size - size_to_free); 3481 goto error; 3482 } 3483 3484 ret = btrfs_grow_device(trans, device, old_size); 3485 if (ret) { 3486 btrfs_end_transaction(trans); 3487 /* btrfs_grow_device never returns ret > 0 */ 3488 WARN_ON(ret > 0); 3489 btrfs_info_in_rcu(fs_info, 3490 "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", 3491 rcu_str_deref(device->name), ret, 3492 old_size, old_size - size_to_free); 3493 goto error; 3494 } 3495 3496 btrfs_end_transaction(trans); 3497 } 3498 3499 /* step two, relocate all the chunks */ 3500 path = btrfs_alloc_path(); 3501 if (!path) { 3502 ret = -ENOMEM; 3503 goto error; 3504 } 3505 3506 /* zero out stat counters */ 3507 spin_lock(&fs_info->balance_lock); 3508 memset(&bctl->stat, 0, sizeof(bctl->stat)); 3509 spin_unlock(&fs_info->balance_lock); 3510 again: 3511 if (!counting) { 3512 /* 3513 * The single value limit and min/max limits use the same bytes 3514 * in the 3515 */ 3516 bctl->data.limit = limit_data; 3517 bctl->meta.limit = limit_meta; 3518 bctl->sys.limit = limit_sys; 3519 } 3520 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3521 key.offset = (u64)-1; 3522 key.type = 
BTRFS_CHUNK_ITEM_KEY; 3523 3524 while (1) { 3525 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3526 atomic_read(&fs_info->balance_cancel_req)) { 3527 ret = -ECANCELED; 3528 goto error; 3529 } 3530 3531 mutex_lock(&fs_info->delete_unused_bgs_mutex); 3532 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3533 if (ret < 0) { 3534 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3535 goto error; 3536 } 3537 3538 /* 3539 * this shouldn't happen, it means the last relocate 3540 * failed 3541 */ 3542 if (ret == 0) 3543 BUG(); /* FIXME break ? */ 3544 3545 ret = btrfs_previous_item(chunk_root, path, 0, 3546 BTRFS_CHUNK_ITEM_KEY); 3547 if (ret) { 3548 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3549 ret = 0; 3550 break; 3551 } 3552 3553 leaf = path->nodes[0]; 3554 slot = path->slots[0]; 3555 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3556 3557 if (found_key.objectid != key.objectid) { 3558 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3559 break; 3560 } 3561 3562 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3563 chunk_type = btrfs_chunk_type(leaf, chunk); 3564 3565 if (!counting) { 3566 spin_lock(&fs_info->balance_lock); 3567 bctl->stat.considered++; 3568 spin_unlock(&fs_info->balance_lock); 3569 } 3570 3571 ret = should_balance_chunk(fs_info, leaf, chunk, 3572 found_key.offset); 3573 3574 btrfs_release_path(path); 3575 if (!ret) { 3576 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3577 goto loop; 3578 } 3579 3580 if (counting) { 3581 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3582 spin_lock(&fs_info->balance_lock); 3583 bctl->stat.expected++; 3584 spin_unlock(&fs_info->balance_lock); 3585 3586 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3587 count_data++; 3588 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3589 count_sys++; 3590 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3591 count_meta++; 3592 3593 goto loop; 3594 } 3595 3596 /* 3597 * Apply limit_min filter, no need to check if the LIMITS 3598 * filter is used, limit_min is 0 by default 3599 */ 3600 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 3601 count_data < bctl->data.limit_min) 3602 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 3603 count_meta < bctl->meta.limit_min) 3604 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 3605 count_sys < bctl->sys.limit_min)) { 3606 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3607 goto loop; 3608 } 3609 3610 if (!chunk_reserved) { 3611 /* 3612 * We may be relocating the only data chunk we have, 3613 * which could potentially end up with losing data's 3614 * raid profile, so lets allocate an empty one in 3615 * advance. 
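 *
 * btrfs_may_alloc_data_chunk() does the actual check; a return value
 * of 1 means a new data chunk was allocated, which is remembered in
 * chunk_reserved so the allocation is not repeated for every chunk.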
3616 */ 3617 ret = btrfs_may_alloc_data_chunk(fs_info, 3618 found_key.offset); 3619 if (ret < 0) { 3620 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3621 goto error; 3622 } else if (ret == 1) { 3623 chunk_reserved = 1; 3624 } 3625 } 3626 3627 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3628 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3629 if (ret && ret != -ENOSPC) 3630 goto error; 3631 if (ret == -ENOSPC) { 3632 enospc_errors++; 3633 } else { 3634 spin_lock(&fs_info->balance_lock); 3635 bctl->stat.completed++; 3636 spin_unlock(&fs_info->balance_lock); 3637 } 3638 loop: 3639 if (found_key.offset == 0) 3640 break; 3641 key.offset = found_key.offset - 1; 3642 } 3643 3644 if (counting) { 3645 btrfs_release_path(path); 3646 counting = false; 3647 goto again; 3648 } 3649 error: 3650 btrfs_free_path(path); 3651 if (enospc_errors) { 3652 btrfs_info(fs_info, "%d enospc errors during balance", 3653 enospc_errors); 3654 if (!ret) 3655 ret = -ENOSPC; 3656 } 3657 3658 return ret; 3659 } 3660 3661 /** 3662 * alloc_profile_is_valid - see if a given profile is valid and reduced 3663 * @flags: profile to validate 3664 * @extended: if true @flags is treated as an extended profile 3665 */ 3666 static int alloc_profile_is_valid(u64 flags, int extended) 3667 { 3668 u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : 3669 BTRFS_BLOCK_GROUP_PROFILE_MASK); 3670 3671 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 3672 3673 /* 1) check that all other bits are zeroed */ 3674 if (flags & ~mask) 3675 return 0; 3676 3677 /* 2) see if profile is reduced */ 3678 if (flags == 0) 3679 return !extended; /* "0" is valid for usual profiles */ 3680 3681 /* true if exactly one bit set */ 3682 return (flags & (flags - 1)) == 0; 3683 } 3684 3685 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3686 { 3687 /* cancel requested || normal exit path */ 3688 return atomic_read(&fs_info->balance_cancel_req) || 3689 (atomic_read(&fs_info->balance_pause_req) == 0 && 3690 atomic_read(&fs_info->balance_cancel_req) == 0); 3691 } 3692 3693 /* Non-zero return value signifies invalidity */ 3694 static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, 3695 u64 allowed) 3696 { 3697 return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && 3698 (!alloc_profile_is_valid(bctl_arg->target, 1) || 3699 (bctl_arg->target & ~allowed))); 3700 } 3701 3702 /* 3703 * Should be called with balance mutexe held 3704 */ 3705 int btrfs_balance(struct btrfs_fs_info *fs_info, 3706 struct btrfs_balance_control *bctl, 3707 struct btrfs_ioctl_balance_args *bargs) 3708 { 3709 u64 meta_target, data_target; 3710 u64 allowed; 3711 int mixed = 0; 3712 int ret; 3713 u64 num_devices; 3714 unsigned seq; 3715 3716 if (btrfs_fs_closing(fs_info) || 3717 atomic_read(&fs_info->balance_pause_req) || 3718 atomic_read(&fs_info->balance_cancel_req)) { 3719 ret = -EINVAL; 3720 goto out; 3721 } 3722 3723 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 3724 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 3725 mixed = 1; 3726 3727 /* 3728 * In case of mixed groups both data and meta should be picked, 3729 * and identical options should be given for both of them. 
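 *
 * (With BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS data and metadata live in
 * the same block groups, so they cannot be converted or filtered
 * independently.)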
3730 */ 3731 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 3732 if (mixed && (bctl->flags & allowed)) { 3733 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 3734 !(bctl->flags & BTRFS_BALANCE_METADATA) || 3735 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 3736 btrfs_err(fs_info, 3737 "balance: mixed groups data and metadata options must be the same"); 3738 ret = -EINVAL; 3739 goto out; 3740 } 3741 } 3742 3743 num_devices = fs_info->fs_devices->num_devices; 3744 btrfs_dev_replace_read_lock(&fs_info->dev_replace); 3745 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 3746 BUG_ON(num_devices < 1); 3747 num_devices--; 3748 } 3749 btrfs_dev_replace_read_unlock(&fs_info->dev_replace); 3750 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; 3751 if (num_devices > 1) 3752 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3753 if (num_devices > 2) 3754 allowed |= BTRFS_BLOCK_GROUP_RAID5; 3755 if (num_devices > 3) 3756 allowed |= (BTRFS_BLOCK_GROUP_RAID10 | 3757 BTRFS_BLOCK_GROUP_RAID6); 3758 if (validate_convert_profile(&bctl->data, allowed)) { 3759 int index = btrfs_bg_flags_to_raid_index(bctl->data.target); 3760 3761 btrfs_err(fs_info, 3762 "balance: invalid convert data profile %s", 3763 get_raid_name(index)); 3764 ret = -EINVAL; 3765 goto out; 3766 } 3767 if (validate_convert_profile(&bctl->meta, allowed)) { 3768 int index = btrfs_bg_flags_to_raid_index(bctl->meta.target); 3769 3770 btrfs_err(fs_info, 3771 "balance: invalid convert metadata profile %s", 3772 get_raid_name(index)); 3773 ret = -EINVAL; 3774 goto out; 3775 } 3776 if (validate_convert_profile(&bctl->sys, allowed)) { 3777 int index = btrfs_bg_flags_to_raid_index(bctl->sys.target); 3778 3779 btrfs_err(fs_info, 3780 "balance: invalid convert system profile %s", 3781 get_raid_name(index)); 3782 ret = -EINVAL; 3783 goto out; 3784 } 3785 3786 /* allow to reduce meta or sys integrity only if force set */ 3787 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3788 BTRFS_BLOCK_GROUP_RAID10 | 3789 BTRFS_BLOCK_GROUP_RAID5 | 3790 BTRFS_BLOCK_GROUP_RAID6; 3791 do { 3792 seq = read_seqbegin(&fs_info->profiles_lock); 3793 3794 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3795 (fs_info->avail_system_alloc_bits & allowed) && 3796 !(bctl->sys.target & allowed)) || 3797 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3798 (fs_info->avail_metadata_alloc_bits & allowed) && 3799 !(bctl->meta.target & allowed))) { 3800 if (bctl->flags & BTRFS_BALANCE_FORCE) { 3801 btrfs_info(fs_info, 3802 "balance: force reducing metadata integrity"); 3803 } else { 3804 btrfs_err(fs_info, 3805 "balance: reduces metadata integrity, use --force if you want this"); 3806 ret = -EINVAL; 3807 goto out; 3808 } 3809 } 3810 } while (read_seqretry(&fs_info->profiles_lock, seq)); 3811 3812 /* if we're not converting, the target field is uninitialized */ 3813 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 3814 bctl->meta.target : fs_info->avail_metadata_alloc_bits; 3815 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 
3816 bctl->data.target : fs_info->avail_data_alloc_bits; 3817 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 3818 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 3819 int meta_index = btrfs_bg_flags_to_raid_index(meta_target); 3820 int data_index = btrfs_bg_flags_to_raid_index(data_target); 3821 3822 btrfs_warn(fs_info, 3823 "balance: metadata profile %s has lower redundancy than data profile %s", 3824 get_raid_name(meta_index), get_raid_name(data_index)); 3825 } 3826 3827 ret = insert_balance_item(fs_info, bctl); 3828 if (ret && ret != -EEXIST) 3829 goto out; 3830 3831 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 3832 BUG_ON(ret == -EEXIST); 3833 BUG_ON(fs_info->balance_ctl); 3834 spin_lock(&fs_info->balance_lock); 3835 fs_info->balance_ctl = bctl; 3836 spin_unlock(&fs_info->balance_lock); 3837 } else { 3838 BUG_ON(ret != -EEXIST); 3839 spin_lock(&fs_info->balance_lock); 3840 update_balance_args(bctl); 3841 spin_unlock(&fs_info->balance_lock); 3842 } 3843 3844 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 3845 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 3846 mutex_unlock(&fs_info->balance_mutex); 3847 3848 ret = __btrfs_balance(fs_info); 3849 3850 mutex_lock(&fs_info->balance_mutex); 3851 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 3852 3853 if (bargs) { 3854 memset(bargs, 0, sizeof(*bargs)); 3855 btrfs_update_ioctl_balance_args(fs_info, bargs); 3856 } 3857 3858 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 3859 balance_need_close(fs_info)) { 3860 reset_balance_state(fs_info); 3861 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 3862 } 3863 3864 wake_up(&fs_info->balance_wait_q); 3865 3866 return ret; 3867 out: 3868 if (bctl->flags & BTRFS_BALANCE_RESUME) 3869 reset_balance_state(fs_info); 3870 else 3871 kfree(bctl); 3872 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 3873 3874 return ret; 3875 } 3876 3877 static int balance_kthread(void *data) 3878 { 3879 struct btrfs_fs_info *fs_info = data; 3880 int ret = 0; 3881 3882 mutex_lock(&fs_info->balance_mutex); 3883 if (fs_info->balance_ctl) { 3884 btrfs_info(fs_info, "balance: resuming"); 3885 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); 3886 } 3887 mutex_unlock(&fs_info->balance_mutex); 3888 3889 return ret; 3890 } 3891 3892 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 3893 { 3894 struct task_struct *tsk; 3895 3896 mutex_lock(&fs_info->balance_mutex); 3897 if (!fs_info->balance_ctl) { 3898 mutex_unlock(&fs_info->balance_mutex); 3899 return 0; 3900 } 3901 mutex_unlock(&fs_info->balance_mutex); 3902 3903 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 3904 btrfs_info(fs_info, "balance: resume skipped"); 3905 return 0; 3906 } 3907 3908 /* 3909 * A ro->rw remount sequence should continue with the paused balance 3910 * regardless of who pauses it, system or the user as of now, so set 3911 * the resume flag. 
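 *
 * The actual work is done by balance_kthread() started below, which
 * re-enters btrfs_balance() with the recovered balance_ctl.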
3912 */ 3913 spin_lock(&fs_info->balance_lock); 3914 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; 3915 spin_unlock(&fs_info->balance_lock); 3916 3917 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3918 return PTR_ERR_OR_ZERO(tsk); 3919 } 3920 3921 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 3922 { 3923 struct btrfs_balance_control *bctl; 3924 struct btrfs_balance_item *item; 3925 struct btrfs_disk_balance_args disk_bargs; 3926 struct btrfs_path *path; 3927 struct extent_buffer *leaf; 3928 struct btrfs_key key; 3929 int ret; 3930 3931 path = btrfs_alloc_path(); 3932 if (!path) 3933 return -ENOMEM; 3934 3935 key.objectid = BTRFS_BALANCE_OBJECTID; 3936 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3937 key.offset = 0; 3938 3939 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 3940 if (ret < 0) 3941 goto out; 3942 if (ret > 0) { /* ret = -ENOENT; */ 3943 ret = 0; 3944 goto out; 3945 } 3946 3947 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 3948 if (!bctl) { 3949 ret = -ENOMEM; 3950 goto out; 3951 } 3952 3953 leaf = path->nodes[0]; 3954 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3955 3956 bctl->flags = btrfs_balance_flags(leaf, item); 3957 bctl->flags |= BTRFS_BALANCE_RESUME; 3958 3959 btrfs_balance_data(leaf, item, &disk_bargs); 3960 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 3961 btrfs_balance_meta(leaf, item, &disk_bargs); 3962 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 3963 btrfs_balance_sys(leaf, item, &disk_bargs); 3964 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 3965 3966 /* 3967 * This should never happen, as the paused balance state is recovered 3968 * during mount without any chance of other exclusive ops to collide. 3969 * 3970 * This gives the exclusive op status to balance and keeps in paused 3971 * state until user intervention (cancel or umount). If the ownership 3972 * cannot be assigned, show a message but do not fail. The balance 3973 * is in a paused state and must have fs_info::balance_ctl properly 3974 * set up. 
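	 *
	 * Concretely: btrfs_resume_balance_async() above starts the balance
	 * kthread once the filesystem is (re)mounted read-write, and both
	 * btrfs_pause_balance() and btrfs_cancel_balance() rely on
	 * fs_info->balance_ctl being non-NULL to find this recovered state.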
3975 */ 3976 if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) 3977 btrfs_warn(fs_info, 3978 "balance: cannot set exclusive op status, resume manually"); 3979 3980 mutex_lock(&fs_info->balance_mutex); 3981 BUG_ON(fs_info->balance_ctl); 3982 spin_lock(&fs_info->balance_lock); 3983 fs_info->balance_ctl = bctl; 3984 spin_unlock(&fs_info->balance_lock); 3985 mutex_unlock(&fs_info->balance_mutex); 3986 out: 3987 btrfs_free_path(path); 3988 return ret; 3989 } 3990 3991 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 3992 { 3993 int ret = 0; 3994 3995 mutex_lock(&fs_info->balance_mutex); 3996 if (!fs_info->balance_ctl) { 3997 mutex_unlock(&fs_info->balance_mutex); 3998 return -ENOTCONN; 3999 } 4000 4001 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4002 atomic_inc(&fs_info->balance_pause_req); 4003 mutex_unlock(&fs_info->balance_mutex); 4004 4005 wait_event(fs_info->balance_wait_q, 4006 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4007 4008 mutex_lock(&fs_info->balance_mutex); 4009 /* we are good with balance_ctl ripped off from under us */ 4010 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4011 atomic_dec(&fs_info->balance_pause_req); 4012 } else { 4013 ret = -ENOTCONN; 4014 } 4015 4016 mutex_unlock(&fs_info->balance_mutex); 4017 return ret; 4018 } 4019 4020 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4021 { 4022 mutex_lock(&fs_info->balance_mutex); 4023 if (!fs_info->balance_ctl) { 4024 mutex_unlock(&fs_info->balance_mutex); 4025 return -ENOTCONN; 4026 } 4027 4028 /* 4029 * A paused balance with the item stored on disk can be resumed at 4030 * mount time if the mount is read-write. Otherwise it's still paused 4031 * and we must not allow cancelling as it deletes the item. 4032 */ 4033 if (sb_rdonly(fs_info->sb)) { 4034 mutex_unlock(&fs_info->balance_mutex); 4035 return -EROFS; 4036 } 4037 4038 atomic_inc(&fs_info->balance_cancel_req); 4039 /* 4040 * if we are running just wait and return, balance item is 4041 * deleted in btrfs_balance in this case 4042 */ 4043 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4044 mutex_unlock(&fs_info->balance_mutex); 4045 wait_event(fs_info->balance_wait_q, 4046 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4047 mutex_lock(&fs_info->balance_mutex); 4048 } else { 4049 mutex_unlock(&fs_info->balance_mutex); 4050 /* 4051 * Lock released to allow other waiters to continue, we'll 4052 * reexamine the status again. 
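		 *
		 * For example, a paused balance being resumed right now takes
		 * balance_mutex in balance_kthread()/btrfs_balance(); once we
		 * reacquire the mutex we recheck balance_ctl before touching
		 * it.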
4053 */ 4054 mutex_lock(&fs_info->balance_mutex); 4055 4056 if (fs_info->balance_ctl) { 4057 reset_balance_state(fs_info); 4058 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 4059 btrfs_info(fs_info, "balance: canceled"); 4060 } 4061 } 4062 4063 BUG_ON(fs_info->balance_ctl || 4064 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4065 atomic_dec(&fs_info->balance_cancel_req); 4066 mutex_unlock(&fs_info->balance_mutex); 4067 return 0; 4068 } 4069 4070 static int btrfs_uuid_scan_kthread(void *data) 4071 { 4072 struct btrfs_fs_info *fs_info = data; 4073 struct btrfs_root *root = fs_info->tree_root; 4074 struct btrfs_key key; 4075 struct btrfs_path *path = NULL; 4076 int ret = 0; 4077 struct extent_buffer *eb; 4078 int slot; 4079 struct btrfs_root_item root_item; 4080 u32 item_size; 4081 struct btrfs_trans_handle *trans = NULL; 4082 4083 path = btrfs_alloc_path(); 4084 if (!path) { 4085 ret = -ENOMEM; 4086 goto out; 4087 } 4088 4089 key.objectid = 0; 4090 key.type = BTRFS_ROOT_ITEM_KEY; 4091 key.offset = 0; 4092 4093 while (1) { 4094 ret = btrfs_search_forward(root, &key, path, 4095 BTRFS_OLDEST_GENERATION); 4096 if (ret) { 4097 if (ret > 0) 4098 ret = 0; 4099 break; 4100 } 4101 4102 if (key.type != BTRFS_ROOT_ITEM_KEY || 4103 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4104 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4105 key.objectid > BTRFS_LAST_FREE_OBJECTID) 4106 goto skip; 4107 4108 eb = path->nodes[0]; 4109 slot = path->slots[0]; 4110 item_size = btrfs_item_size_nr(eb, slot); 4111 if (item_size < sizeof(root_item)) 4112 goto skip; 4113 4114 read_extent_buffer(eb, &root_item, 4115 btrfs_item_ptr_offset(eb, slot), 4116 (int)sizeof(root_item)); 4117 if (btrfs_root_refs(&root_item) == 0) 4118 goto skip; 4119 4120 if (!btrfs_is_empty_uuid(root_item.uuid) || 4121 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4122 if (trans) 4123 goto update_tree; 4124 4125 btrfs_release_path(path); 4126 /* 4127 * 1 - subvol uuid item 4128 * 1 - received_subvol uuid item 4129 */ 4130 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4131 if (IS_ERR(trans)) { 4132 ret = PTR_ERR(trans); 4133 break; 4134 } 4135 continue; 4136 } else { 4137 goto skip; 4138 } 4139 update_tree: 4140 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4141 ret = btrfs_uuid_tree_add(trans, root_item.uuid, 4142 BTRFS_UUID_KEY_SUBVOL, 4143 key.objectid); 4144 if (ret < 0) { 4145 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4146 ret); 4147 break; 4148 } 4149 } 4150 4151 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4152 ret = btrfs_uuid_tree_add(trans, 4153 root_item.received_uuid, 4154 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4155 key.objectid); 4156 if (ret < 0) { 4157 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4158 ret); 4159 break; 4160 } 4161 } 4162 4163 skip: 4164 if (trans) { 4165 ret = btrfs_end_transaction(trans); 4166 trans = NULL; 4167 if (ret) 4168 break; 4169 } 4170 4171 btrfs_release_path(path); 4172 if (key.offset < (u64)-1) { 4173 key.offset++; 4174 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4175 key.offset = 0; 4176 key.type = BTRFS_ROOT_ITEM_KEY; 4177 } else if (key.objectid < (u64)-1) { 4178 key.offset = 0; 4179 key.type = BTRFS_ROOT_ITEM_KEY; 4180 key.objectid++; 4181 } else { 4182 break; 4183 } 4184 cond_resched(); 4185 } 4186 4187 out: 4188 btrfs_free_path(path); 4189 if (trans && !IS_ERR(trans)) 4190 btrfs_end_transaction(trans); 4191 if (ret) 4192 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); 4193 else 4194 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4195 
up(&fs_info->uuid_tree_rescan_sem); 4196 return 0; 4197 } 4198 4199 /* 4200 * Callback for btrfs_uuid_tree_iterate(). 4201 * returns: 4202 * 0 check succeeded, the entry is not outdated. 4203 * < 0 if an error occurred. 4204 * > 0 if the check failed, which means the caller shall remove the entry. 4205 */ 4206 static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, 4207 u8 *uuid, u8 type, u64 subid) 4208 { 4209 struct btrfs_key key; 4210 int ret = 0; 4211 struct btrfs_root *subvol_root; 4212 4213 if (type != BTRFS_UUID_KEY_SUBVOL && 4214 type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) 4215 goto out; 4216 4217 key.objectid = subid; 4218 key.type = BTRFS_ROOT_ITEM_KEY; 4219 key.offset = (u64)-1; 4220 subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); 4221 if (IS_ERR(subvol_root)) { 4222 ret = PTR_ERR(subvol_root); 4223 if (ret == -ENOENT) 4224 ret = 1; 4225 goto out; 4226 } 4227 4228 switch (type) { 4229 case BTRFS_UUID_KEY_SUBVOL: 4230 if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) 4231 ret = 1; 4232 break; 4233 case BTRFS_UUID_KEY_RECEIVED_SUBVOL: 4234 if (memcmp(uuid, subvol_root->root_item.received_uuid, 4235 BTRFS_UUID_SIZE)) 4236 ret = 1; 4237 break; 4238 } 4239 4240 out: 4241 return ret; 4242 } 4243 4244 static int btrfs_uuid_rescan_kthread(void *data) 4245 { 4246 struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; 4247 int ret; 4248 4249 /* 4250 * 1st step is to iterate through the existing UUID tree and 4251 * to delete all entries that contain outdated data. 4252 * 2nd step is to add all missing entries to the UUID tree. 4253 */ 4254 ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); 4255 if (ret < 0) { 4256 btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); 4257 up(&fs_info->uuid_tree_rescan_sem); 4258 return ret; 4259 } 4260 return btrfs_uuid_scan_kthread(data); 4261 } 4262 4263 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4264 { 4265 struct btrfs_trans_handle *trans; 4266 struct btrfs_root *tree_root = fs_info->tree_root; 4267 struct btrfs_root *uuid_root; 4268 struct task_struct *task; 4269 int ret; 4270 4271 /* 4272 * 1 - root node 4273 * 1 - root item 4274 */ 4275 trans = btrfs_start_transaction(tree_root, 2); 4276 if (IS_ERR(trans)) 4277 return PTR_ERR(trans); 4278 4279 uuid_root = btrfs_create_tree(trans, fs_info, 4280 BTRFS_UUID_TREE_OBJECTID); 4281 if (IS_ERR(uuid_root)) { 4282 ret = PTR_ERR(uuid_root); 4283 btrfs_abort_transaction(trans, ret); 4284 btrfs_end_transaction(trans); 4285 return ret; 4286 } 4287 4288 fs_info->uuid_root = uuid_root; 4289 4290 ret = btrfs_commit_transaction(trans); 4291 if (ret) 4292 return ret; 4293 4294 down(&fs_info->uuid_tree_rescan_sem); 4295 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4296 if (IS_ERR(task)) { 4297 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4298 btrfs_warn(fs_info, "failed to start uuid_scan task"); 4299 up(&fs_info->uuid_tree_rescan_sem); 4300 return PTR_ERR(task); 4301 } 4302 4303 return 0; 4304 } 4305 4306 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) 4307 { 4308 struct task_struct *task; 4309 4310 down(&fs_info->uuid_tree_rescan_sem); 4311 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); 4312 if (IS_ERR(task)) { 4313 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4314 btrfs_warn(fs_info, "failed to start uuid_rescan task"); 4315 up(&fs_info->uuid_tree_rescan_sem); 4316 return PTR_ERR(task); 4317 } 4318 4319 return 0; 4320 } 4321 4322 /* 4323 * 
shrinking a device means finding all of the device extents past 4324 * the new size, and then following the back refs to the chunks. 4325 * The chunk relocation code actually frees the device extent 4326 */ 4327 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 4328 { 4329 struct btrfs_fs_info *fs_info = device->fs_info; 4330 struct btrfs_root *root = fs_info->dev_root; 4331 struct btrfs_trans_handle *trans; 4332 struct btrfs_dev_extent *dev_extent = NULL; 4333 struct btrfs_path *path; 4334 u64 length; 4335 u64 chunk_offset; 4336 int ret; 4337 int slot; 4338 int failed = 0; 4339 bool retried = false; 4340 bool checked_pending_chunks = false; 4341 struct extent_buffer *l; 4342 struct btrfs_key key; 4343 struct btrfs_super_block *super_copy = fs_info->super_copy; 4344 u64 old_total = btrfs_super_total_bytes(super_copy); 4345 u64 old_size = btrfs_device_get_total_bytes(device); 4346 u64 diff; 4347 4348 new_size = round_down(new_size, fs_info->sectorsize); 4349 diff = round_down(old_size - new_size, fs_info->sectorsize); 4350 4351 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4352 return -EINVAL; 4353 4354 path = btrfs_alloc_path(); 4355 if (!path) 4356 return -ENOMEM; 4357 4358 path->reada = READA_BACK; 4359 4360 mutex_lock(&fs_info->chunk_mutex); 4361 4362 btrfs_device_set_total_bytes(device, new_size); 4363 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4364 device->fs_devices->total_rw_bytes -= diff; 4365 atomic64_sub(diff, &fs_info->free_chunk_space); 4366 } 4367 mutex_unlock(&fs_info->chunk_mutex); 4368 4369 again: 4370 key.objectid = device->devid; 4371 key.offset = (u64)-1; 4372 key.type = BTRFS_DEV_EXTENT_KEY; 4373 4374 do { 4375 mutex_lock(&fs_info->delete_unused_bgs_mutex); 4376 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4377 if (ret < 0) { 4378 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4379 goto done; 4380 } 4381 4382 ret = btrfs_previous_item(root, path, 0, key.type); 4383 if (ret) 4384 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4385 if (ret < 0) 4386 goto done; 4387 if (ret) { 4388 ret = 0; 4389 btrfs_release_path(path); 4390 break; 4391 } 4392 4393 l = path->nodes[0]; 4394 slot = path->slots[0]; 4395 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4396 4397 if (key.objectid != device->devid) { 4398 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4399 btrfs_release_path(path); 4400 break; 4401 } 4402 4403 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4404 length = btrfs_dev_extent_length(l, dev_extent); 4405 4406 if (key.offset + length <= new_size) { 4407 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4408 btrfs_release_path(path); 4409 break; 4410 } 4411 4412 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4413 btrfs_release_path(path); 4414 4415 /* 4416 * We may be relocating the only data chunk we have, 4417 * which could potentially end up with losing data's 4418 * raid profile, so lets allocate an empty one in 4419 * advance. 
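		 *
		 * Chunks that cannot be relocated right now (-ENOSPC from
		 * btrfs_relocate_chunk() below) are only counted in 'failed';
		 * the whole device-extent scan is retried once before the
		 * shrink gives up with -ENOSPC.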
4420 */ 4421 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 4422 if (ret < 0) { 4423 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4424 goto done; 4425 } 4426 4427 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 4428 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4429 if (ret && ret != -ENOSPC) 4430 goto done; 4431 if (ret == -ENOSPC) 4432 failed++; 4433 } while (key.offset-- > 0); 4434 4435 if (failed && !retried) { 4436 failed = 0; 4437 retried = true; 4438 goto again; 4439 } else if (failed && retried) { 4440 ret = -ENOSPC; 4441 goto done; 4442 } 4443 4444 /* Shrinking succeeded, else we would be at "done". */ 4445 trans = btrfs_start_transaction(root, 0); 4446 if (IS_ERR(trans)) { 4447 ret = PTR_ERR(trans); 4448 goto done; 4449 } 4450 4451 mutex_lock(&fs_info->chunk_mutex); 4452 4453 /* 4454 * We checked in the above loop all device extents that were already in 4455 * the device tree. However before we have updated the device's 4456 * total_bytes to the new size, we might have had chunk allocations that 4457 * have not complete yet (new block groups attached to transaction 4458 * handles), and therefore their device extents were not yet in the 4459 * device tree and we missed them in the loop above. So if we have any 4460 * pending chunk using a device extent that overlaps the device range 4461 * that we can not use anymore, commit the current transaction and 4462 * repeat the search on the device tree - this way we guarantee we will 4463 * not have chunks using device extents that end beyond 'new_size'. 4464 */ 4465 if (!checked_pending_chunks) { 4466 u64 start = new_size; 4467 u64 len = old_size - new_size; 4468 4469 if (contains_pending_extent(trans->transaction, device, 4470 &start, len)) { 4471 mutex_unlock(&fs_info->chunk_mutex); 4472 checked_pending_chunks = true; 4473 failed = 0; 4474 retried = false; 4475 ret = btrfs_commit_transaction(trans); 4476 if (ret) 4477 goto done; 4478 goto again; 4479 } 4480 } 4481 4482 btrfs_device_set_disk_total_bytes(device, new_size); 4483 if (list_empty(&device->resized_list)) 4484 list_add_tail(&device->resized_list, 4485 &fs_info->fs_devices->resized_devices); 4486 4487 WARN_ON(diff > old_total); 4488 btrfs_set_super_total_bytes(super_copy, 4489 round_down(old_total - diff, fs_info->sectorsize)); 4490 mutex_unlock(&fs_info->chunk_mutex); 4491 4492 /* Now btrfs_update_device() will change the on-disk size. 
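	 * The in-memory total_bytes and the superblock total were already
	 * lowered above under chunk_mutex; the error path at 'done:' restores
	 * device->total_bytes if this update fails.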
*/ 4493 ret = btrfs_update_device(trans, device); 4494 btrfs_end_transaction(trans); 4495 done: 4496 btrfs_free_path(path); 4497 if (ret) { 4498 mutex_lock(&fs_info->chunk_mutex); 4499 btrfs_device_set_total_bytes(device, old_size); 4500 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 4501 device->fs_devices->total_rw_bytes += diff; 4502 atomic64_add(diff, &fs_info->free_chunk_space); 4503 mutex_unlock(&fs_info->chunk_mutex); 4504 } 4505 return ret; 4506 } 4507 4508 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 4509 struct btrfs_key *key, 4510 struct btrfs_chunk *chunk, int item_size) 4511 { 4512 struct btrfs_super_block *super_copy = fs_info->super_copy; 4513 struct btrfs_disk_key disk_key; 4514 u32 array_size; 4515 u8 *ptr; 4516 4517 mutex_lock(&fs_info->chunk_mutex); 4518 array_size = btrfs_super_sys_array_size(super_copy); 4519 if (array_size + item_size + sizeof(disk_key) 4520 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { 4521 mutex_unlock(&fs_info->chunk_mutex); 4522 return -EFBIG; 4523 } 4524 4525 ptr = super_copy->sys_chunk_array + array_size; 4526 btrfs_cpu_key_to_disk(&disk_key, key); 4527 memcpy(ptr, &disk_key, sizeof(disk_key)); 4528 ptr += sizeof(disk_key); 4529 memcpy(ptr, chunk, item_size); 4530 item_size += sizeof(disk_key); 4531 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 4532 mutex_unlock(&fs_info->chunk_mutex); 4533 4534 return 0; 4535 } 4536 4537 /* 4538 * sort the devices in descending order by max_avail, total_avail 4539 */ 4540 static int btrfs_cmp_device_info(const void *a, const void *b) 4541 { 4542 const struct btrfs_device_info *di_a = a; 4543 const struct btrfs_device_info *di_b = b; 4544 4545 if (di_a->max_avail > di_b->max_avail) 4546 return -1; 4547 if (di_a->max_avail < di_b->max_avail) 4548 return 1; 4549 if (di_a->total_avail > di_b->total_avail) 4550 return -1; 4551 if (di_a->total_avail < di_b->total_avail) 4552 return 1; 4553 return 0; 4554 } 4555 4556 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 4557 { 4558 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 4559 return; 4560 4561 btrfs_set_fs_incompat(info, RAID56); 4562 } 4563 4564 #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ 4565 - sizeof(struct btrfs_chunk)) \ 4566 / sizeof(struct btrfs_stripe) + 1) 4567 4568 #define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ 4569 - 2 * sizeof(struct btrfs_disk_key) \ 4570 - 2 * sizeof(struct btrfs_chunk)) \ 4571 / sizeof(struct btrfs_stripe) + 1) 4572 4573 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 4574 u64 start, u64 type) 4575 { 4576 struct btrfs_fs_info *info = trans->fs_info; 4577 struct btrfs_fs_devices *fs_devices = info->fs_devices; 4578 struct btrfs_device *device; 4579 struct map_lookup *map = NULL; 4580 struct extent_map_tree *em_tree; 4581 struct extent_map *em; 4582 struct btrfs_device_info *devices_info = NULL; 4583 u64 total_avail; 4584 int num_stripes; /* total number of stripes to allocate */ 4585 int data_stripes; /* number of stripes that count for 4586 block group size */ 4587 int sub_stripes; /* sub_stripes info for map */ 4588 int dev_stripes; /* stripes per dev */ 4589 int devs_max; /* max devs to use */ 4590 int devs_min; /* min devs needed */ 4591 int devs_increment; /* ndevs has to be a multiple of this */ 4592 int ncopies; /* how many copies to data has */ 4593 int ret; 4594 u64 max_stripe_size; 4595 u64 max_chunk_size; 4596 u64 stripe_size; 4597 u64 num_bytes; 4598 int ndevs; 4599 int i; 4600 int j; 4601 int index; 
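	/*
	 * Overview of the allocation steps below:
	 *  1) pick per-type size limits (max_stripe_size / max_chunk_size)
	 *  2) collect the largest free hole on every writable device
	 *  3) sort the candidates (btrfs_cmp_device_info) and round the
	 *     usable device count down to devs_increment, honoring
	 *     devs_min/devs_max
	 *  4) derive stripe_size from the smallest selected hole and the
	 *     chunk size limit, then build the map_lookup and extent_map
	 */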
4602 4603 BUG_ON(!alloc_profile_is_valid(type, 0)); 4604 4605 if (list_empty(&fs_devices->alloc_list)) { 4606 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 4607 btrfs_debug(info, "%s: no writable device", __func__); 4608 return -ENOSPC; 4609 } 4610 4611 index = btrfs_bg_flags_to_raid_index(type); 4612 4613 sub_stripes = btrfs_raid_array[index].sub_stripes; 4614 dev_stripes = btrfs_raid_array[index].dev_stripes; 4615 devs_max = btrfs_raid_array[index].devs_max; 4616 devs_min = btrfs_raid_array[index].devs_min; 4617 devs_increment = btrfs_raid_array[index].devs_increment; 4618 ncopies = btrfs_raid_array[index].ncopies; 4619 4620 if (type & BTRFS_BLOCK_GROUP_DATA) { 4621 max_stripe_size = SZ_1G; 4622 max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; 4623 if (!devs_max) 4624 devs_max = BTRFS_MAX_DEVS(info); 4625 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 4626 /* for larger filesystems, use larger metadata chunks */ 4627 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) 4628 max_stripe_size = SZ_1G; 4629 else 4630 max_stripe_size = SZ_256M; 4631 max_chunk_size = max_stripe_size; 4632 if (!devs_max) 4633 devs_max = BTRFS_MAX_DEVS(info); 4634 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 4635 max_stripe_size = SZ_32M; 4636 max_chunk_size = 2 * max_stripe_size; 4637 if (!devs_max) 4638 devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; 4639 } else { 4640 btrfs_err(info, "invalid chunk type 0x%llx requested", 4641 type); 4642 BUG_ON(1); 4643 } 4644 4645 /* we don't want a chunk larger than 10% of writeable space */ 4646 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 4647 max_chunk_size); 4648 4649 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 4650 GFP_NOFS); 4651 if (!devices_info) 4652 return -ENOMEM; 4653 4654 /* 4655 * in the first pass through the devices list, we gather information 4656 * about the available holes on each device. 4657 */ 4658 ndevs = 0; 4659 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 4660 u64 max_avail; 4661 u64 dev_offset; 4662 4663 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4664 WARN(1, KERN_ERR 4665 "BTRFS: read-only device in alloc_list\n"); 4666 continue; 4667 } 4668 4669 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 4670 &device->dev_state) || 4671 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4672 continue; 4673 4674 if (device->total_bytes > device->bytes_used) 4675 total_avail = device->total_bytes - device->bytes_used; 4676 else 4677 total_avail = 0; 4678 4679 /* If there is no space on this device, skip it. 
*/ 4680 if (total_avail == 0) 4681 continue; 4682 4683 ret = find_free_dev_extent(trans, device, 4684 max_stripe_size * dev_stripes, 4685 &dev_offset, &max_avail); 4686 if (ret && ret != -ENOSPC) 4687 goto error; 4688 4689 if (ret == 0) 4690 max_avail = max_stripe_size * dev_stripes; 4691 4692 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) { 4693 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 4694 btrfs_debug(info, 4695 "%s: devid %llu has no free space, have=%llu want=%u", 4696 __func__, device->devid, max_avail, 4697 BTRFS_STRIPE_LEN * dev_stripes); 4698 continue; 4699 } 4700 4701 if (ndevs == fs_devices->rw_devices) { 4702 WARN(1, "%s: found more than %llu devices\n", 4703 __func__, fs_devices->rw_devices); 4704 break; 4705 } 4706 devices_info[ndevs].dev_offset = dev_offset; 4707 devices_info[ndevs].max_avail = max_avail; 4708 devices_info[ndevs].total_avail = total_avail; 4709 devices_info[ndevs].dev = device; 4710 ++ndevs; 4711 } 4712 4713 /* 4714 * now sort the devices by hole size / available space 4715 */ 4716 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 4717 btrfs_cmp_device_info, NULL); 4718 4719 /* round down to number of usable stripes */ 4720 ndevs = round_down(ndevs, devs_increment); 4721 4722 if (ndevs < devs_min) { 4723 ret = -ENOSPC; 4724 if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 4725 btrfs_debug(info, 4726 "%s: not enough devices with free space: have=%d minimum required=%d", 4727 __func__, ndevs, devs_min); 4728 } 4729 goto error; 4730 } 4731 4732 ndevs = min(ndevs, devs_max); 4733 4734 /* 4735 * The primary goal is to maximize the number of stripes, so use as 4736 * many devices as possible, even if the stripes are not maximum sized. 4737 * 4738 * The DUP profile stores more than one stripe per device, the 4739 * max_avail is the total size so we have to adjust. 
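	 *
	 * Illustrative: the smallest selected hole is divided by dev_stripes
	 * just below, so e.g. a 2 GiB hole yields at most a 1 GiB stripe_size
	 * when dev_stripes == 2.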
4740 */ 4741 stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); 4742 num_stripes = ndevs * dev_stripes; 4743 4744 /* 4745 * this will have to be fixed for RAID1 and RAID10 over 4746 * more drives 4747 */ 4748 data_stripes = num_stripes / ncopies; 4749 4750 if (type & BTRFS_BLOCK_GROUP_RAID5) 4751 data_stripes = num_stripes - 1; 4752 4753 if (type & BTRFS_BLOCK_GROUP_RAID6) 4754 data_stripes = num_stripes - 2; 4755 4756 /* 4757 * Use the number of data stripes to figure out how big this chunk 4758 * is really going to be in terms of logical address space, 4759 * and compare that answer with the max chunk size 4760 */ 4761 if (stripe_size * data_stripes > max_chunk_size) { 4762 stripe_size = div_u64(max_chunk_size, data_stripes); 4763 4764 /* bump the answer up to a 16MB boundary */ 4765 stripe_size = round_up(stripe_size, SZ_16M); 4766 4767 /* 4768 * But don't go higher than the limits we found while searching 4769 * for free extents 4770 */ 4771 stripe_size = min(devices_info[ndevs - 1].max_avail, 4772 stripe_size); 4773 } 4774 4775 /* align to BTRFS_STRIPE_LEN */ 4776 stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); 4777 4778 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 4779 if (!map) { 4780 ret = -ENOMEM; 4781 goto error; 4782 } 4783 map->num_stripes = num_stripes; 4784 4785 for (i = 0; i < ndevs; ++i) { 4786 for (j = 0; j < dev_stripes; ++j) { 4787 int s = i * dev_stripes + j; 4788 map->stripes[s].dev = devices_info[i].dev; 4789 map->stripes[s].physical = devices_info[i].dev_offset + 4790 j * stripe_size; 4791 } 4792 } 4793 map->stripe_len = BTRFS_STRIPE_LEN; 4794 map->io_align = BTRFS_STRIPE_LEN; 4795 map->io_width = BTRFS_STRIPE_LEN; 4796 map->type = type; 4797 map->sub_stripes = sub_stripes; 4798 4799 num_bytes = stripe_size * data_stripes; 4800 4801 trace_btrfs_chunk_alloc(info, map, start, num_bytes); 4802 4803 em = alloc_extent_map(); 4804 if (!em) { 4805 kfree(map); 4806 ret = -ENOMEM; 4807 goto error; 4808 } 4809 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 4810 em->map_lookup = map; 4811 em->start = start; 4812 em->len = num_bytes; 4813 em->block_start = 0; 4814 em->block_len = em->len; 4815 em->orig_block_len = stripe_size; 4816 4817 em_tree = &info->mapping_tree.map_tree; 4818 write_lock(&em_tree->lock); 4819 ret = add_extent_mapping(em_tree, em, 0); 4820 if (ret) { 4821 write_unlock(&em_tree->lock); 4822 free_extent_map(em); 4823 goto error; 4824 } 4825 4826 list_add_tail(&em->list, &trans->transaction->pending_chunks); 4827 refcount_inc(&em->refs); 4828 write_unlock(&em_tree->lock); 4829 4830 ret = btrfs_make_block_group(trans, 0, type, start, num_bytes); 4831 if (ret) 4832 goto error_del_extent; 4833 4834 for (i = 0; i < map->num_stripes; i++) { 4835 num_bytes = map->stripes[i].dev->bytes_used + stripe_size; 4836 btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); 4837 } 4838 4839 atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); 4840 4841 free_extent_map(em); 4842 check_raid56_incompat_flag(info, type); 4843 4844 kfree(devices_info); 4845 return 0; 4846 4847 error_del_extent: 4848 write_lock(&em_tree->lock); 4849 remove_extent_mapping(em_tree, em); 4850 write_unlock(&em_tree->lock); 4851 4852 /* One for our allocation */ 4853 free_extent_map(em); 4854 /* One for the tree reference */ 4855 free_extent_map(em); 4856 /* One for the pending_chunks list reference */ 4857 free_extent_map(em); 4858 error: 4859 kfree(devices_info); 4860 return ret; 4861 } 4862 4863 int btrfs_finish_chunk_alloc(struct 
btrfs_trans_handle *trans, 4864 u64 chunk_offset, u64 chunk_size) 4865 { 4866 struct btrfs_fs_info *fs_info = trans->fs_info; 4867 struct btrfs_root *extent_root = fs_info->extent_root; 4868 struct btrfs_root *chunk_root = fs_info->chunk_root; 4869 struct btrfs_key key; 4870 struct btrfs_device *device; 4871 struct btrfs_chunk *chunk; 4872 struct btrfs_stripe *stripe; 4873 struct extent_map *em; 4874 struct map_lookup *map; 4875 size_t item_size; 4876 u64 dev_offset; 4877 u64 stripe_size; 4878 int i = 0; 4879 int ret = 0; 4880 4881 em = get_chunk_map(fs_info, chunk_offset, chunk_size); 4882 if (IS_ERR(em)) 4883 return PTR_ERR(em); 4884 4885 map = em->map_lookup; 4886 item_size = btrfs_chunk_item_size(map->num_stripes); 4887 stripe_size = em->orig_block_len; 4888 4889 chunk = kzalloc(item_size, GFP_NOFS); 4890 if (!chunk) { 4891 ret = -ENOMEM; 4892 goto out; 4893 } 4894 4895 /* 4896 * Take the device list mutex to prevent races with the final phase of 4897 * a device replace operation that replaces the device object associated 4898 * with the map's stripes, because the device object's id can change 4899 * at any time during that final phase of the device replace operation 4900 * (dev-replace.c:btrfs_dev_replace_finishing()). 4901 */ 4902 mutex_lock(&fs_info->fs_devices->device_list_mutex); 4903 for (i = 0; i < map->num_stripes; i++) { 4904 device = map->stripes[i].dev; 4905 dev_offset = map->stripes[i].physical; 4906 4907 ret = btrfs_update_device(trans, device); 4908 if (ret) 4909 break; 4910 ret = btrfs_alloc_dev_extent(trans, device, chunk_offset, 4911 dev_offset, stripe_size); 4912 if (ret) 4913 break; 4914 } 4915 if (ret) { 4916 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4917 goto out; 4918 } 4919 4920 stripe = &chunk->stripe; 4921 for (i = 0; i < map->num_stripes; i++) { 4922 device = map->stripes[i].dev; 4923 dev_offset = map->stripes[i].physical; 4924 4925 btrfs_set_stack_stripe_devid(stripe, device->devid); 4926 btrfs_set_stack_stripe_offset(stripe, dev_offset); 4927 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 4928 stripe++; 4929 } 4930 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4931 4932 btrfs_set_stack_chunk_length(chunk, chunk_size); 4933 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 4934 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 4935 btrfs_set_stack_chunk_type(chunk, map->type); 4936 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 4937 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 4938 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 4939 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 4940 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 4941 4942 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 4943 key.type = BTRFS_CHUNK_ITEM_KEY; 4944 key.offset = chunk_offset; 4945 4946 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 4947 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 4948 /* 4949 * TODO: Cleanup of inserted chunk root in case of 4950 * failure. 4951 */ 4952 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 4953 } 4954 4955 out: 4956 kfree(chunk); 4957 free_extent_map(em); 4958 return ret; 4959 } 4960 4961 /* 4962 * Chunk allocation falls into two parts. The first part does works 4963 * that make the new allocated chunk useable, but not do any operation 4964 * that modifies the chunk tree. The second part does the works that 4965 * require modifying the chunk tree. 
This division is important for the 4966 * bootstrap process of adding storage to a seed btrfs. 4967 */ 4968 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) 4969 { 4970 u64 chunk_offset; 4971 4972 lockdep_assert_held(&trans->fs_info->chunk_mutex); 4973 chunk_offset = find_next_chunk(trans->fs_info); 4974 return __btrfs_alloc_chunk(trans, chunk_offset, type); 4975 } 4976 4977 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 4978 struct btrfs_fs_info *fs_info) 4979 { 4980 u64 chunk_offset; 4981 u64 sys_chunk_offset; 4982 u64 alloc_profile; 4983 int ret; 4984 4985 chunk_offset = find_next_chunk(fs_info); 4986 alloc_profile = btrfs_metadata_alloc_profile(fs_info); 4987 ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); 4988 if (ret) 4989 return ret; 4990 4991 sys_chunk_offset = find_next_chunk(fs_info); 4992 alloc_profile = btrfs_system_alloc_profile(fs_info); 4993 ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); 4994 return ret; 4995 } 4996 4997 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 4998 { 4999 int max_errors; 5000 5001 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5002 BTRFS_BLOCK_GROUP_RAID10 | 5003 BTRFS_BLOCK_GROUP_RAID5 | 5004 BTRFS_BLOCK_GROUP_DUP)) { 5005 max_errors = 1; 5006 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { 5007 max_errors = 2; 5008 } else { 5009 max_errors = 0; 5010 } 5011 5012 return max_errors; 5013 } 5014 5015 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5016 { 5017 struct extent_map *em; 5018 struct map_lookup *map; 5019 int readonly = 0; 5020 int miss_ndevs = 0; 5021 int i; 5022 5023 em = get_chunk_map(fs_info, chunk_offset, 1); 5024 if (IS_ERR(em)) 5025 return 1; 5026 5027 map = em->map_lookup; 5028 for (i = 0; i < map->num_stripes; i++) { 5029 if (test_bit(BTRFS_DEV_STATE_MISSING, 5030 &map->stripes[i].dev->dev_state)) { 5031 miss_ndevs++; 5032 continue; 5033 } 5034 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5035 &map->stripes[i].dev->dev_state)) { 5036 readonly = 1; 5037 goto end; 5038 } 5039 } 5040 5041 /* 5042 * If the number of missing devices is larger than max errors, 5043 * we can not write the data into that chunk successfully, so 5044 * set it readonly. 5045 */ 5046 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5047 readonly = 1; 5048 end: 5049 free_extent_map(em); 5050 return readonly; 5051 } 5052 5053 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 5054 { 5055 extent_map_tree_init(&tree->map_tree); 5056 } 5057 5058 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 5059 { 5060 struct extent_map *em; 5061 5062 while (1) { 5063 write_lock(&tree->map_tree.lock); 5064 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 5065 if (em) 5066 remove_extent_mapping(&tree->map_tree, em); 5067 write_unlock(&tree->map_tree.lock); 5068 if (!em) 5069 break; 5070 /* once for us */ 5071 free_extent_map(em); 5072 /* once for the tree */ 5073 free_extent_map(em); 5074 } 5075 } 5076 5077 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5078 { 5079 struct extent_map *em; 5080 struct map_lookup *map; 5081 int ret; 5082 5083 em = get_chunk_map(fs_info, logical, len); 5084 if (IS_ERR(em)) 5085 /* 5086 * We could return errors for these cases, but that could get 5087 * ugly and we'd probably do the same thing which is just not do 5088 * anything else and exit, so return 1 so the callers don't try 5089 * to use other copies. 
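		 *
		 * Callers typically retry reads with mirror numbers
		 * 1..btrfs_num_copies(); returning 1 simply keeps them from
		 * probing further copies.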
5090 */ 5091 return 1; 5092 5093 map = em->map_lookup; 5094 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 5095 ret = map->num_stripes; 5096 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5097 ret = map->sub_stripes; 5098 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5099 ret = 2; 5100 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5101 /* 5102 * There could be two corrupted data stripes, we need 5103 * to loop retry in order to rebuild the correct data. 5104 * 5105 * Fail a stripe at a time on every retry except the 5106 * stripe under reconstruction. 5107 */ 5108 ret = map->num_stripes; 5109 else 5110 ret = 1; 5111 free_extent_map(em); 5112 5113 btrfs_dev_replace_read_lock(&fs_info->dev_replace); 5114 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 5115 fs_info->dev_replace.tgtdev) 5116 ret++; 5117 btrfs_dev_replace_read_unlock(&fs_info->dev_replace); 5118 5119 return ret; 5120 } 5121 5122 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 5123 u64 logical) 5124 { 5125 struct extent_map *em; 5126 struct map_lookup *map; 5127 unsigned long len = fs_info->sectorsize; 5128 5129 em = get_chunk_map(fs_info, logical, len); 5130 5131 if (!WARN_ON(IS_ERR(em))) { 5132 map = em->map_lookup; 5133 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5134 len = map->stripe_len * nr_data_stripes(map); 5135 free_extent_map(em); 5136 } 5137 return len; 5138 } 5139 5140 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5141 { 5142 struct extent_map *em; 5143 struct map_lookup *map; 5144 int ret = 0; 5145 5146 em = get_chunk_map(fs_info, logical, len); 5147 5148 if(!WARN_ON(IS_ERR(em))) { 5149 map = em->map_lookup; 5150 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5151 ret = 1; 5152 free_extent_map(em); 5153 } 5154 return ret; 5155 } 5156 5157 static int find_live_mirror(struct btrfs_fs_info *fs_info, 5158 struct map_lookup *map, int first, 5159 int dev_replace_is_ongoing) 5160 { 5161 int i; 5162 int num_stripes; 5163 int preferred_mirror; 5164 int tolerance; 5165 struct btrfs_device *srcdev; 5166 5167 ASSERT((map->type & 5168 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); 5169 5170 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5171 num_stripes = map->sub_stripes; 5172 else 5173 num_stripes = map->num_stripes; 5174 5175 preferred_mirror = first + current->pid % num_stripes; 5176 5177 if (dev_replace_is_ongoing && 5178 fs_info->dev_replace.cont_reading_from_srcdev_mode == 5179 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 5180 srcdev = fs_info->dev_replace.srcdev; 5181 else 5182 srcdev = NULL; 5183 5184 /* 5185 * try to avoid the drive that is the source drive for a 5186 * dev-replace procedure, only choose it if no other non-missing 5187 * mirror is available 5188 */ 5189 for (tolerance = 0; tolerance < 2; tolerance++) { 5190 if (map->stripes[preferred_mirror].dev->bdev && 5191 (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 5192 return preferred_mirror; 5193 for (i = first; i < first + num_stripes; i++) { 5194 if (map->stripes[i].dev->bdev && 5195 (tolerance || map->stripes[i].dev != srcdev)) 5196 return i; 5197 } 5198 } 5199 5200 /* we couldn't find one that doesn't fail. 
Just return something 5201 * and the io error handling code will clean up eventually 5202 */ 5203 return preferred_mirror; 5204 } 5205 5206 static inline int parity_smaller(u64 a, u64 b) 5207 { 5208 return a > b; 5209 } 5210 5211 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 5212 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) 5213 { 5214 struct btrfs_bio_stripe s; 5215 int i; 5216 u64 l; 5217 int again = 1; 5218 5219 while (again) { 5220 again = 0; 5221 for (i = 0; i < num_stripes - 1; i++) { 5222 if (parity_smaller(bbio->raid_map[i], 5223 bbio->raid_map[i+1])) { 5224 s = bbio->stripes[i]; 5225 l = bbio->raid_map[i]; 5226 bbio->stripes[i] = bbio->stripes[i+1]; 5227 bbio->raid_map[i] = bbio->raid_map[i+1]; 5228 bbio->stripes[i+1] = s; 5229 bbio->raid_map[i+1] = l; 5230 5231 again = 1; 5232 } 5233 } 5234 } 5235 } 5236 5237 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) 5238 { 5239 struct btrfs_bio *bbio = kzalloc( 5240 /* the size of the btrfs_bio */ 5241 sizeof(struct btrfs_bio) + 5242 /* plus the variable array for the stripes */ 5243 sizeof(struct btrfs_bio_stripe) * (total_stripes) + 5244 /* plus the variable array for the tgt dev */ 5245 sizeof(int) * (real_stripes) + 5246 /* 5247 * plus the raid_map, which includes both the tgt dev 5248 * and the stripes 5249 */ 5250 sizeof(u64) * (total_stripes), 5251 GFP_NOFS|__GFP_NOFAIL); 5252 5253 atomic_set(&bbio->error, 0); 5254 refcount_set(&bbio->refs, 1); 5255 5256 return bbio; 5257 } 5258 5259 void btrfs_get_bbio(struct btrfs_bio *bbio) 5260 { 5261 WARN_ON(!refcount_read(&bbio->refs)); 5262 refcount_inc(&bbio->refs); 5263 } 5264 5265 void btrfs_put_bbio(struct btrfs_bio *bbio) 5266 { 5267 if (!bbio) 5268 return; 5269 if (refcount_dec_and_test(&bbio->refs)) 5270 kfree(bbio); 5271 } 5272 5273 /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ 5274 /* 5275 * Please note that, discard won't be sent to target device of device 5276 * replace. 
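 *
 * Regular writes, in contrast, are duplicated to the replace target device
 * in handle_ops_on_dev_replace() further below.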
5277 */ 5278 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, 5279 u64 logical, u64 length, 5280 struct btrfs_bio **bbio_ret) 5281 { 5282 struct extent_map *em; 5283 struct map_lookup *map; 5284 struct btrfs_bio *bbio; 5285 u64 offset; 5286 u64 stripe_nr; 5287 u64 stripe_nr_end; 5288 u64 stripe_end_offset; 5289 u64 stripe_cnt; 5290 u64 stripe_len; 5291 u64 stripe_offset; 5292 u64 num_stripes; 5293 u32 stripe_index; 5294 u32 factor = 0; 5295 u32 sub_stripes = 0; 5296 u64 stripes_per_dev = 0; 5297 u32 remaining_stripes = 0; 5298 u32 last_stripe = 0; 5299 int ret = 0; 5300 int i; 5301 5302 /* discard always return a bbio */ 5303 ASSERT(bbio_ret); 5304 5305 em = get_chunk_map(fs_info, logical, length); 5306 if (IS_ERR(em)) 5307 return PTR_ERR(em); 5308 5309 map = em->map_lookup; 5310 /* we don't discard raid56 yet */ 5311 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5312 ret = -EOPNOTSUPP; 5313 goto out; 5314 } 5315 5316 offset = logical - em->start; 5317 length = min_t(u64, em->len - offset, length); 5318 5319 stripe_len = map->stripe_len; 5320 /* 5321 * stripe_nr counts the total number of stripes we have to stride 5322 * to get to this block 5323 */ 5324 stripe_nr = div64_u64(offset, stripe_len); 5325 5326 /* stripe_offset is the offset of this block in its stripe */ 5327 stripe_offset = offset - stripe_nr * stripe_len; 5328 5329 stripe_nr_end = round_up(offset + length, map->stripe_len); 5330 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 5331 stripe_cnt = stripe_nr_end - stripe_nr; 5332 stripe_end_offset = stripe_nr_end * map->stripe_len - 5333 (offset + length); 5334 /* 5335 * after this, stripe_nr is the number of stripes on this 5336 * device we have to walk to find the data, and stripe_index is 5337 * the number of our device in the stripe array 5338 */ 5339 num_stripes = 1; 5340 stripe_index = 0; 5341 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5342 BTRFS_BLOCK_GROUP_RAID10)) { 5343 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5344 sub_stripes = 1; 5345 else 5346 sub_stripes = map->sub_stripes; 5347 5348 factor = map->num_stripes / sub_stripes; 5349 num_stripes = min_t(u64, map->num_stripes, 5350 sub_stripes * stripe_cnt); 5351 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5352 stripe_index *= sub_stripes; 5353 stripes_per_dev = div_u64_rem(stripe_cnt, factor, 5354 &remaining_stripes); 5355 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 5356 last_stripe *= sub_stripes; 5357 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5358 BTRFS_BLOCK_GROUP_DUP)) { 5359 num_stripes = map->num_stripes; 5360 } else { 5361 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5362 &stripe_index); 5363 } 5364 5365 bbio = alloc_btrfs_bio(num_stripes, 0); 5366 if (!bbio) { 5367 ret = -ENOMEM; 5368 goto out; 5369 } 5370 5371 for (i = 0; i < num_stripes; i++) { 5372 bbio->stripes[i].physical = 5373 map->stripes[stripe_index].physical + 5374 stripe_offset + stripe_nr * map->stripe_len; 5375 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 5376 5377 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5378 BTRFS_BLOCK_GROUP_RAID10)) { 5379 bbio->stripes[i].length = stripes_per_dev * 5380 map->stripe_len; 5381 5382 if (i / sub_stripes < remaining_stripes) 5383 bbio->stripes[i].length += 5384 map->stripe_len; 5385 5386 /* 5387 * Special for the first stripe and 5388 * the last stripe: 5389 * 5390 * |-------|...|-------| 5391 * |----------| 5392 * off end_off 5393 */ 5394 if (i < sub_stripes) 5395 bbio->stripes[i].length -= 5396 stripe_offset; 5397 5398 if 
(stripe_index >= last_stripe && 5399 stripe_index <= (last_stripe + 5400 sub_stripes - 1)) 5401 bbio->stripes[i].length -= 5402 stripe_end_offset; 5403 5404 if (i == sub_stripes - 1) 5405 stripe_offset = 0; 5406 } else { 5407 bbio->stripes[i].length = length; 5408 } 5409 5410 stripe_index++; 5411 if (stripe_index == map->num_stripes) { 5412 stripe_index = 0; 5413 stripe_nr++; 5414 } 5415 } 5416 5417 *bbio_ret = bbio; 5418 bbio->map_type = map->type; 5419 bbio->num_stripes = num_stripes; 5420 out: 5421 free_extent_map(em); 5422 return ret; 5423 } 5424 5425 /* 5426 * In dev-replace case, for repair case (that's the only case where the mirror 5427 * is selected explicitly when calling btrfs_map_block), blocks left of the 5428 * left cursor can also be read from the target drive. 5429 * 5430 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the 5431 * array of stripes. 5432 * For READ, it also needs to be supported using the same mirror number. 5433 * 5434 * If the requested block is not left of the left cursor, EIO is returned. This 5435 * can happen because btrfs_num_copies() returns one more in the dev-replace 5436 * case. 5437 */ 5438 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, 5439 u64 logical, u64 length, 5440 u64 srcdev_devid, int *mirror_num, 5441 u64 *physical) 5442 { 5443 struct btrfs_bio *bbio = NULL; 5444 int num_stripes; 5445 int index_srcdev = 0; 5446 int found = 0; 5447 u64 physical_of_found = 0; 5448 int i; 5449 int ret = 0; 5450 5451 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 5452 logical, &length, &bbio, 0, 0); 5453 if (ret) { 5454 ASSERT(bbio == NULL); 5455 return ret; 5456 } 5457 5458 num_stripes = bbio->num_stripes; 5459 if (*mirror_num > num_stripes) { 5460 /* 5461 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, 5462 * that means that the requested area is not left of the left 5463 * cursor 5464 */ 5465 btrfs_put_bbio(bbio); 5466 return -EIO; 5467 } 5468 5469 /* 5470 * process the rest of the function using the mirror_num of the source 5471 * drive. Therefore look it up first. At the end, patch the device 5472 * pointer to the one of the target drive. 5473 */ 5474 for (i = 0; i < num_stripes; i++) { 5475 if (bbio->stripes[i].dev->devid != srcdev_devid) 5476 continue; 5477 5478 /* 5479 * In case of DUP, in order to keep it simple, only add the 5480 * mirror with the lowest physical address 5481 */ 5482 if (found && 5483 physical_of_found <= bbio->stripes[i].physical) 5484 continue; 5485 5486 index_srcdev = i; 5487 found = 1; 5488 physical_of_found = bbio->stripes[i].physical; 5489 } 5490 5491 btrfs_put_bbio(bbio); 5492 5493 ASSERT(found); 5494 if (!found) 5495 return -EIO; 5496 5497 *mirror_num = index_srcdev + 1; 5498 *physical = physical_of_found; 5499 return ret; 5500 } 5501 5502 static void handle_ops_on_dev_replace(enum btrfs_map_op op, 5503 struct btrfs_bio **bbio_ret, 5504 struct btrfs_dev_replace *dev_replace, 5505 int *num_stripes_ret, int *max_errors_ret) 5506 { 5507 struct btrfs_bio *bbio = *bbio_ret; 5508 u64 srcdev_devid = dev_replace->srcdev->devid; 5509 int tgtdev_indexes = 0; 5510 int num_stripes = *num_stripes_ret; 5511 int max_errors = *max_errors_ret; 5512 int i; 5513 5514 if (op == BTRFS_MAP_WRITE) { 5515 int index_where_to_add; 5516 5517 /* 5518 * duplicate the write operations while the dev replace 5519 * procedure is running. 
Since the copying of the old disk to 5520 * the new disk takes place at run time while the filesystem is 5521 * mounted writable, the regular write operations to the old 5522 * disk have to be duplicated to go to the new disk as well. 5523 * 5524 * Note that device->missing is handled by the caller, and that 5525 * the write to the old disk is already set up in the stripes 5526 * array. 5527 */ 5528 index_where_to_add = num_stripes; 5529 for (i = 0; i < num_stripes; i++) { 5530 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5531 /* write to new disk, too */ 5532 struct btrfs_bio_stripe *new = 5533 bbio->stripes + index_where_to_add; 5534 struct btrfs_bio_stripe *old = 5535 bbio->stripes + i; 5536 5537 new->physical = old->physical; 5538 new->length = old->length; 5539 new->dev = dev_replace->tgtdev; 5540 bbio->tgtdev_map[i] = index_where_to_add; 5541 index_where_to_add++; 5542 max_errors++; 5543 tgtdev_indexes++; 5544 } 5545 } 5546 num_stripes = index_where_to_add; 5547 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 5548 int index_srcdev = 0; 5549 int found = 0; 5550 u64 physical_of_found = 0; 5551 5552 /* 5553 * During the dev-replace procedure, the target drive can also 5554 * be used to read data in case it is needed to repair a corrupt 5555 * block elsewhere. This is possible if the requested area is 5556 * left of the left cursor. In this area, the target drive is a 5557 * full copy of the source drive. 5558 */ 5559 for (i = 0; i < num_stripes; i++) { 5560 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5561 /* 5562 * In case of DUP, in order to keep it simple, 5563 * only add the mirror with the lowest physical 5564 * address 5565 */ 5566 if (found && 5567 physical_of_found <= 5568 bbio->stripes[i].physical) 5569 continue; 5570 index_srcdev = i; 5571 found = 1; 5572 physical_of_found = bbio->stripes[i].physical; 5573 } 5574 } 5575 if (found) { 5576 struct btrfs_bio_stripe *tgtdev_stripe = 5577 bbio->stripes + num_stripes; 5578 5579 tgtdev_stripe->physical = physical_of_found; 5580 tgtdev_stripe->length = 5581 bbio->stripes[index_srcdev].length; 5582 tgtdev_stripe->dev = dev_replace->tgtdev; 5583 bbio->tgtdev_map[index_srcdev] = num_stripes; 5584 5585 tgtdev_indexes++; 5586 num_stripes++; 5587 } 5588 } 5589 5590 *num_stripes_ret = num_stripes; 5591 *max_errors_ret = max_errors; 5592 bbio->num_tgtdevs = tgtdev_indexes; 5593 *bbio_ret = bbio; 5594 } 5595 5596 static bool need_full_stripe(enum btrfs_map_op op) 5597 { 5598 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 5599 } 5600 5601 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 5602 enum btrfs_map_op op, 5603 u64 logical, u64 *length, 5604 struct btrfs_bio **bbio_ret, 5605 int mirror_num, int need_raid_map) 5606 { 5607 struct extent_map *em; 5608 struct map_lookup *map; 5609 u64 offset; 5610 u64 stripe_offset; 5611 u64 stripe_nr; 5612 u64 stripe_len; 5613 u32 stripe_index; 5614 int i; 5615 int ret = 0; 5616 int num_stripes; 5617 int max_errors = 0; 5618 int tgtdev_indexes = 0; 5619 struct btrfs_bio *bbio = NULL; 5620 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 5621 int dev_replace_is_ongoing = 0; 5622 int num_alloc_stripes; 5623 int patch_the_first_stripe_for_dev_replace = 0; 5624 u64 physical_to_patch_in_first_stripe = 0; 5625 u64 raid56_full_stripe_start = (u64)-1; 5626 5627 if (op == BTRFS_MAP_DISCARD) 5628 return __btrfs_map_block_for_discard(fs_info, logical, 5629 *length, bbio_ret); 5630 5631 em = get_chunk_map(fs_info, logical, *length); 5632 if (IS_ERR(em)) 5633 
return PTR_ERR(em); 5634 5635 map = em->map_lookup; 5636 offset = logical - em->start; 5637 5638 stripe_len = map->stripe_len; 5639 stripe_nr = offset; 5640 /* 5641 * stripe_nr counts the total number of stripes we have to stride 5642 * to get to this block 5643 */ 5644 stripe_nr = div64_u64(stripe_nr, stripe_len); 5645 5646 stripe_offset = stripe_nr * stripe_len; 5647 if (offset < stripe_offset) { 5648 btrfs_crit(fs_info, 5649 "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu", 5650 stripe_offset, offset, em->start, logical, 5651 stripe_len); 5652 free_extent_map(em); 5653 return -EINVAL; 5654 } 5655 5656 /* stripe_offset is the offset of this block in its stripe*/ 5657 stripe_offset = offset - stripe_offset; 5658 5659 /* if we're here for raid56, we need to know the stripe aligned start */ 5660 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5661 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 5662 raid56_full_stripe_start = offset; 5663 5664 /* allow a write of a full stripe, but make sure we don't 5665 * allow straddling of stripes 5666 */ 5667 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 5668 full_stripe_len); 5669 raid56_full_stripe_start *= full_stripe_len; 5670 } 5671 5672 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 5673 u64 max_len; 5674 /* For writes to RAID[56], allow a full stripeset across all disks. 5675 For other RAID types and for RAID[56] reads, just allow a single 5676 stripe (on a single disk). */ 5677 if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 5678 (op == BTRFS_MAP_WRITE)) { 5679 max_len = stripe_len * nr_data_stripes(map) - 5680 (offset - raid56_full_stripe_start); 5681 } else { 5682 /* we limit the length of each bio to what fits in a stripe */ 5683 max_len = stripe_len - stripe_offset; 5684 } 5685 *length = min_t(u64, em->len - offset, max_len); 5686 } else { 5687 *length = em->len - offset; 5688 } 5689 5690 /* This is for when we're called from btrfs_merge_bio_hook() and all 5691 it cares about is the length */ 5692 if (!bbio_ret) 5693 goto out; 5694 5695 btrfs_dev_replace_read_lock(dev_replace); 5696 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 5697 if (!dev_replace_is_ongoing) 5698 btrfs_dev_replace_read_unlock(dev_replace); 5699 else 5700 btrfs_dev_replace_set_lock_blocking(dev_replace); 5701 5702 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 5703 !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 5704 ret = get_extra_mirror_from_replace(fs_info, logical, *length, 5705 dev_replace->srcdev->devid, 5706 &mirror_num, 5707 &physical_to_patch_in_first_stripe); 5708 if (ret) 5709 goto out; 5710 else 5711 patch_the_first_stripe_for_dev_replace = 1; 5712 } else if (mirror_num > map->num_stripes) { 5713 mirror_num = 0; 5714 } 5715 5716 num_stripes = 1; 5717 stripe_index = 0; 5718 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5719 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5720 &stripe_index); 5721 if (!need_full_stripe(op)) 5722 mirror_num = 1; 5723 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 5724 if (need_full_stripe(op)) 5725 num_stripes = map->num_stripes; 5726 else if (mirror_num) 5727 stripe_index = mirror_num - 1; 5728 else { 5729 stripe_index = find_live_mirror(fs_info, map, 0, 5730 dev_replace_is_ongoing); 5731 mirror_num = stripe_index + 1; 5732 } 5733 5734 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 5735 if (need_full_stripe(op)) { 5736 num_stripes = map->num_stripes; 5737 } else if 
(mirror_num) { 5738 stripe_index = mirror_num - 1; 5739 } else { 5740 mirror_num = 1; 5741 } 5742 5743 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5744 u32 factor = map->num_stripes / map->sub_stripes; 5745 5746 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5747 stripe_index *= map->sub_stripes; 5748 5749 if (need_full_stripe(op)) 5750 num_stripes = map->sub_stripes; 5751 else if (mirror_num) 5752 stripe_index += mirror_num - 1; 5753 else { 5754 int old_stripe_index = stripe_index; 5755 stripe_index = find_live_mirror(fs_info, map, 5756 stripe_index, 5757 dev_replace_is_ongoing); 5758 mirror_num = stripe_index - old_stripe_index + 1; 5759 } 5760 5761 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5762 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 5763 /* push stripe_nr back to the start of the full stripe */ 5764 stripe_nr = div64_u64(raid56_full_stripe_start, 5765 stripe_len * nr_data_stripes(map)); 5766 5767 /* RAID[56] write or recovery. Return all stripes */ 5768 num_stripes = map->num_stripes; 5769 max_errors = nr_parity_stripes(map); 5770 5771 *length = map->stripe_len; 5772 stripe_index = 0; 5773 stripe_offset = 0; 5774 } else { 5775 /* 5776 * Mirror #0 or #1 means the original data block. 5777 * Mirror #2 is RAID5 parity block. 5778 * Mirror #3 is RAID6 Q block. 5779 */ 5780 stripe_nr = div_u64_rem(stripe_nr, 5781 nr_data_stripes(map), &stripe_index); 5782 if (mirror_num > 1) 5783 stripe_index = nr_data_stripes(map) + 5784 mirror_num - 2; 5785 5786 /* We distribute the parity blocks across stripes */ 5787 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 5788 &stripe_index); 5789 if (!need_full_stripe(op) && mirror_num <= 1) 5790 mirror_num = 1; 5791 } 5792 } else { 5793 /* 5794 * after this, stripe_nr is the number of stripes on this 5795 * device we have to walk to find the data, and stripe_index is 5796 * the number of our device in the stripe array 5797 */ 5798 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5799 &stripe_index); 5800 mirror_num = stripe_index + 1; 5801 } 5802 if (stripe_index >= map->num_stripes) { 5803 btrfs_crit(fs_info, 5804 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 5805 stripe_index, map->num_stripes); 5806 ret = -EINVAL; 5807 goto out; 5808 } 5809 5810 num_alloc_stripes = num_stripes; 5811 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 5812 if (op == BTRFS_MAP_WRITE) 5813 num_alloc_stripes <<= 1; 5814 if (op == BTRFS_MAP_GET_READ_MIRRORS) 5815 num_alloc_stripes++; 5816 tgtdev_indexes = num_stripes; 5817 } 5818 5819 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); 5820 if (!bbio) { 5821 ret = -ENOMEM; 5822 goto out; 5823 } 5824 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) 5825 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); 5826 5827 /* build raid_map */ 5828 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 5829 (need_full_stripe(op) || mirror_num > 1)) { 5830 u64 tmp; 5831 unsigned rot; 5832 5833 bbio->raid_map = (u64 *)((void *)bbio->stripes + 5834 sizeof(struct btrfs_bio_stripe) * 5835 num_alloc_stripes + 5836 sizeof(int) * tgtdev_indexes); 5837 5838 /* Work out the disk rotation on this stripe-set */ 5839 div_u64_rem(stripe_nr, num_stripes, &rot); 5840 5841 /* Fill in the logical address of each stripe */ 5842 tmp = stripe_nr * nr_data_stripes(map); 5843 for (i = 0; i < nr_data_stripes(map); i++) 5844 bbio->raid_map[(i+rot) % num_stripes] = 5845 em->start + (tmp + i) * map->stripe_len; 5846 
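		/*
		 * Illustrative example: a 4-device RAID5 chunk with
		 * stripe_nr == 5 gives rot = 5 % 4 = 1, so slots 1..3 of
		 * raid_map receive the data stripe logical addresses and the
		 * P marker below lands in slot 0; sort_parity_stripes() later
		 * moves the parity/syndrome markers to the end of the array.
		 */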
5847 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 5848 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5849 bbio->raid_map[(i+rot+1) % num_stripes] = 5850 RAID6_Q_STRIPE; 5851 } 5852 5853 5854 for (i = 0; i < num_stripes; i++) { 5855 bbio->stripes[i].physical = 5856 map->stripes[stripe_index].physical + 5857 stripe_offset + 5858 stripe_nr * map->stripe_len; 5859 bbio->stripes[i].dev = 5860 map->stripes[stripe_index].dev; 5861 stripe_index++; 5862 } 5863 5864 if (need_full_stripe(op)) 5865 max_errors = btrfs_chunk_max_errors(map); 5866 5867 if (bbio->raid_map) 5868 sort_parity_stripes(bbio, num_stripes); 5869 5870 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 5871 need_full_stripe(op)) { 5872 handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, 5873 &max_errors); 5874 } 5875 5876 *bbio_ret = bbio; 5877 bbio->map_type = map->type; 5878 bbio->num_stripes = num_stripes; 5879 bbio->max_errors = max_errors; 5880 bbio->mirror_num = mirror_num; 5881 5882 /* 5883 * this is the case that REQ_READ && dev_replace_is_ongoing && 5884 * mirror_num == num_stripes + 1 && dev_replace target drive is 5885 * available as a mirror 5886 */ 5887 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 5888 WARN_ON(num_stripes > 1); 5889 bbio->stripes[0].dev = dev_replace->tgtdev; 5890 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 5891 bbio->mirror_num = map->num_stripes + 1; 5892 } 5893 out: 5894 if (dev_replace_is_ongoing) { 5895 btrfs_dev_replace_clear_lock_blocking(dev_replace); 5896 btrfs_dev_replace_read_unlock(dev_replace); 5897 } 5898 free_extent_map(em); 5899 return ret; 5900 } 5901 5902 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 5903 u64 logical, u64 *length, 5904 struct btrfs_bio **bbio_ret, int mirror_num) 5905 { 5906 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 5907 mirror_num, 0); 5908 } 5909 5910 /* For Scrub/replace */ 5911 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 5912 u64 logical, u64 *length, 5913 struct btrfs_bio **bbio_ret) 5914 { 5915 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); 5916 } 5917 5918 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, 5919 u64 physical, u64 **logical, int *naddrs, int *stripe_len) 5920 { 5921 struct extent_map *em; 5922 struct map_lookup *map; 5923 u64 *buf; 5924 u64 bytenr; 5925 u64 length; 5926 u64 stripe_nr; 5927 u64 rmap_len; 5928 int i, j, nr = 0; 5929 5930 em = get_chunk_map(fs_info, chunk_start, 1); 5931 if (IS_ERR(em)) 5932 return -EIO; 5933 5934 map = em->map_lookup; 5935 length = em->len; 5936 rmap_len = map->stripe_len; 5937 5938 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5939 length = div_u64(length, map->num_stripes / map->sub_stripes); 5940 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5941 length = div_u64(length, map->num_stripes); 5942 else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5943 length = div_u64(length, nr_data_stripes(map)); 5944 rmap_len = map->stripe_len * nr_data_stripes(map); 5945 } 5946 5947 buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); 5948 BUG_ON(!buf); /* -ENOMEM */ 5949 5950 for (i = 0; i < map->num_stripes; i++) { 5951 if (map->stripes[i].physical > physical || 5952 map->stripes[i].physical + length <= physical) 5953 continue; 5954 5955 stripe_nr = physical - map->stripes[i].physical; 5956 stripe_nr = div64_u64(stripe_nr, map->stripe_len); 5957 5958 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5959 stripe_nr = stripe_nr * 
map->num_stripes + i; 5960 stripe_nr = div_u64(stripe_nr, map->sub_stripes); 5961 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5962 stripe_nr = stripe_nr * map->num_stripes + i; 5963 } /* else if RAID[56], multiply by nr_data_stripes(). 5964 * Alternatively, just use rmap_len below instead of 5965 * map->stripe_len */ 5966 5967 bytenr = chunk_start + stripe_nr * rmap_len; 5968 WARN_ON(nr >= map->num_stripes); 5969 for (j = 0; j < nr; j++) { 5970 if (buf[j] == bytenr) 5971 break; 5972 } 5973 if (j == nr) { 5974 WARN_ON(nr >= map->num_stripes); 5975 buf[nr++] = bytenr; 5976 } 5977 } 5978 5979 *logical = buf; 5980 *naddrs = nr; 5981 *stripe_len = rmap_len; 5982 5983 free_extent_map(em); 5984 return 0; 5985 } 5986 5987 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) 5988 { 5989 bio->bi_private = bbio->private; 5990 bio->bi_end_io = bbio->end_io; 5991 bio_endio(bio); 5992 5993 btrfs_put_bbio(bbio); 5994 } 5995 5996 static void btrfs_end_bio(struct bio *bio) 5997 { 5998 struct btrfs_bio *bbio = bio->bi_private; 5999 int is_orig_bio = 0; 6000 6001 if (bio->bi_status) { 6002 atomic_inc(&bbio->error); 6003 if (bio->bi_status == BLK_STS_IOERR || 6004 bio->bi_status == BLK_STS_TARGET) { 6005 unsigned int stripe_index = 6006 btrfs_io_bio(bio)->stripe_index; 6007 struct btrfs_device *dev; 6008 6009 BUG_ON(stripe_index >= bbio->num_stripes); 6010 dev = bbio->stripes[stripe_index].dev; 6011 if (dev->bdev) { 6012 if (bio_op(bio) == REQ_OP_WRITE) 6013 btrfs_dev_stat_inc_and_print(dev, 6014 BTRFS_DEV_STAT_WRITE_ERRS); 6015 else 6016 btrfs_dev_stat_inc_and_print(dev, 6017 BTRFS_DEV_STAT_READ_ERRS); 6018 if (bio->bi_opf & REQ_PREFLUSH) 6019 btrfs_dev_stat_inc_and_print(dev, 6020 BTRFS_DEV_STAT_FLUSH_ERRS); 6021 } 6022 } 6023 } 6024 6025 if (bio == bbio->orig_bio) 6026 is_orig_bio = 1; 6027 6028 btrfs_bio_counter_dec(bbio->fs_info); 6029 6030 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6031 if (!is_orig_bio) { 6032 bio_put(bio); 6033 bio = bbio->orig_bio; 6034 } 6035 6036 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6037 /* only send an error to the higher layers if it is 6038 * beyond the tolerance of the btrfs bio 6039 */ 6040 if (atomic_read(&bbio->error) > bbio->max_errors) { 6041 bio->bi_status = BLK_STS_IOERR; 6042 } else { 6043 /* 6044 * this bio is actually up to date, we didn't 6045 * go over the max number of errors 6046 */ 6047 bio->bi_status = BLK_STS_OK; 6048 } 6049 6050 btrfs_end_bbio(bbio, bio); 6051 } else if (!is_orig_bio) { 6052 bio_put(bio); 6053 } 6054 } 6055 6056 /* 6057 * see run_scheduled_bios for a description of why bios are collected for 6058 * async submit. 6059 * 6060 * This will add one bio to the pending list for a device and make sure 6061 * the work struct is scheduled. 
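 *
 * Reads skip the queue and are submitted directly; for writes, bios that
 * are marked sync (op_is_sync()) go to the separate pending_sync_bios
 * list so that run_scheduled_bios can service them ahead of the regular
 * pending_bios list.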
6062 */ 6063 static noinline void btrfs_schedule_bio(struct btrfs_device *device, 6064 struct bio *bio) 6065 { 6066 struct btrfs_fs_info *fs_info = device->fs_info; 6067 int should_queue = 1; 6068 struct btrfs_pending_bios *pending_bios; 6069 6070 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || 6071 !device->bdev) { 6072 bio_io_error(bio); 6073 return; 6074 } 6075 6076 /* don't bother with additional async steps for reads, right now */ 6077 if (bio_op(bio) == REQ_OP_READ) { 6078 btrfsic_submit_bio(bio); 6079 return; 6080 } 6081 6082 WARN_ON(bio->bi_next); 6083 bio->bi_next = NULL; 6084 6085 spin_lock(&device->io_lock); 6086 if (op_is_sync(bio->bi_opf)) 6087 pending_bios = &device->pending_sync_bios; 6088 else 6089 pending_bios = &device->pending_bios; 6090 6091 if (pending_bios->tail) 6092 pending_bios->tail->bi_next = bio; 6093 6094 pending_bios->tail = bio; 6095 if (!pending_bios->head) 6096 pending_bios->head = bio; 6097 if (device->running_pending) 6098 should_queue = 0; 6099 6100 spin_unlock(&device->io_lock); 6101 6102 if (should_queue) 6103 btrfs_queue_work(fs_info->submit_workers, &device->work); 6104 } 6105 6106 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, 6107 u64 physical, int dev_nr, int async) 6108 { 6109 struct btrfs_device *dev = bbio->stripes[dev_nr].dev; 6110 struct btrfs_fs_info *fs_info = bbio->fs_info; 6111 6112 bio->bi_private = bbio; 6113 btrfs_io_bio(bio)->stripe_index = dev_nr; 6114 bio->bi_end_io = btrfs_end_bio; 6115 bio->bi_iter.bi_sector = physical >> 9; 6116 btrfs_debug_in_rcu(fs_info, 6117 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6118 bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector, 6119 (u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, 6120 bio->bi_iter.bi_size); 6121 bio_set_dev(bio, dev->bdev); 6122 6123 btrfs_bio_counter_inc_noblocked(fs_info); 6124 6125 if (async) 6126 btrfs_schedule_bio(dev, bio); 6127 else 6128 btrfsic_submit_bio(bio); 6129 } 6130 6131 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 6132 { 6133 atomic_inc(&bbio->error); 6134 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6135 /* Should be the original bio. 
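		 * bbio_error() is only ever passed the un-cloned first_bio
		 * from btrfs_map_bio(), which is also bbio->orig_bio.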
*/ 6136 WARN_ON(bio != bbio->orig_bio); 6137 6138 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6139 bio->bi_iter.bi_sector = logical >> 9; 6140 if (atomic_read(&bbio->error) > bbio->max_errors) 6141 bio->bi_status = BLK_STS_IOERR; 6142 else 6143 bio->bi_status = BLK_STS_OK; 6144 btrfs_end_bbio(bbio, bio); 6145 } 6146 } 6147 6148 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 6149 int mirror_num, int async_submit) 6150 { 6151 struct btrfs_device *dev; 6152 struct bio *first_bio = bio; 6153 u64 logical = (u64)bio->bi_iter.bi_sector << 9; 6154 u64 length = 0; 6155 u64 map_length; 6156 int ret; 6157 int dev_nr; 6158 int total_devs; 6159 struct btrfs_bio *bbio = NULL; 6160 6161 length = bio->bi_iter.bi_size; 6162 map_length = length; 6163 6164 btrfs_bio_counter_inc_blocked(fs_info); 6165 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, 6166 &map_length, &bbio, mirror_num, 1); 6167 if (ret) { 6168 btrfs_bio_counter_dec(fs_info); 6169 return errno_to_blk_status(ret); 6170 } 6171 6172 total_devs = bbio->num_stripes; 6173 bbio->orig_bio = first_bio; 6174 bbio->private = first_bio->bi_private; 6175 bbio->end_io = first_bio->bi_end_io; 6176 bbio->fs_info = fs_info; 6177 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 6178 6179 if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 6180 ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { 6181 /* In this case, map_length has been set to the length of 6182 a single stripe; not the whole write */ 6183 if (bio_op(bio) == REQ_OP_WRITE) { 6184 ret = raid56_parity_write(fs_info, bio, bbio, 6185 map_length); 6186 } else { 6187 ret = raid56_parity_recover(fs_info, bio, bbio, 6188 map_length, mirror_num, 1); 6189 } 6190 6191 btrfs_bio_counter_dec(fs_info); 6192 return errno_to_blk_status(ret); 6193 } 6194 6195 if (map_length < length) { 6196 btrfs_crit(fs_info, 6197 "mapping failed logical %llu bio len %llu len %llu", 6198 logical, length, map_length); 6199 BUG(); 6200 } 6201 6202 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { 6203 dev = bbio->stripes[dev_nr].dev; 6204 if (!dev || !dev->bdev || 6205 (bio_op(first_bio) == REQ_OP_WRITE && 6206 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { 6207 bbio_error(bbio, first_bio, logical); 6208 continue; 6209 } 6210 6211 if (dev_nr < total_devs - 1) 6212 bio = btrfs_bio_clone(first_bio); 6213 else 6214 bio = first_bio; 6215 6216 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, 6217 dev_nr, async_submit); 6218 } 6219 btrfs_bio_counter_dec(fs_info); 6220 return BLK_STS_OK; 6221 } 6222 6223 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 6224 u8 *uuid, u8 *fsid) 6225 { 6226 struct btrfs_device *device; 6227 struct btrfs_fs_devices *cur_devices; 6228 6229 cur_devices = fs_info->fs_devices; 6230 while (cur_devices) { 6231 if (!fsid || 6232 !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) { 6233 device = find_device(cur_devices, devid, uuid); 6234 if (device) 6235 return device; 6236 } 6237 cur_devices = cur_devices->seed; 6238 } 6239 return NULL; 6240 } 6241 6242 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6243 u64 devid, u8 *dev_uuid) 6244 { 6245 struct btrfs_device *device; 6246 6247 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6248 if (IS_ERR(device)) 6249 return device; 6250 6251 list_add(&device->dev_list, &fs_devices->devices); 6252 device->fs_devices = fs_devices; 6253 fs_devices->num_devices++; 6254 6255 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6256 
fs_devices->missing_devices++; 6257 6258 return device; 6259 } 6260 6261 /** 6262 * btrfs_alloc_device - allocate struct btrfs_device 6263 * @fs_info: used only for generating a new devid, can be NULL if 6264 * devid is provided (i.e. @devid != NULL). 6265 * @devid: a pointer to devid for this device. If NULL a new devid 6266 * is generated. 6267 * @uuid: a pointer to UUID for this device. If NULL a new UUID 6268 * is generated. 6269 * 6270 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 6271 * on error. Returned struct is not linked onto any lists and must be 6272 * destroyed with btrfs_free_device. 6273 */ 6274 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 6275 const u64 *devid, 6276 const u8 *uuid) 6277 { 6278 struct btrfs_device *dev; 6279 u64 tmp; 6280 6281 if (WARN_ON(!devid && !fs_info)) 6282 return ERR_PTR(-EINVAL); 6283 6284 dev = __alloc_device(); 6285 if (IS_ERR(dev)) 6286 return dev; 6287 6288 if (devid) 6289 tmp = *devid; 6290 else { 6291 int ret; 6292 6293 ret = find_next_devid(fs_info, &tmp); 6294 if (ret) { 6295 btrfs_free_device(dev); 6296 return ERR_PTR(ret); 6297 } 6298 } 6299 dev->devid = tmp; 6300 6301 if (uuid) 6302 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 6303 else 6304 generate_random_uuid(dev->uuid); 6305 6306 btrfs_init_work(&dev->work, btrfs_submit_helper, 6307 pending_bios_fn, NULL, NULL); 6308 6309 return dev; 6310 } 6311 6312 /* Return -EIO if any error, otherwise return 0. */ 6313 static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, 6314 struct extent_buffer *leaf, 6315 struct btrfs_chunk *chunk, u64 logical) 6316 { 6317 u64 length; 6318 u64 stripe_len; 6319 u16 num_stripes; 6320 u16 sub_stripes; 6321 u64 type; 6322 u64 features; 6323 bool mixed = false; 6324 6325 length = btrfs_chunk_length(leaf, chunk); 6326 stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6327 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6328 sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6329 type = btrfs_chunk_type(leaf, chunk); 6330 6331 if (!num_stripes) { 6332 btrfs_err(fs_info, "invalid chunk num_stripes: %u", 6333 num_stripes); 6334 return -EIO; 6335 } 6336 if (!IS_ALIGNED(logical, fs_info->sectorsize)) { 6337 btrfs_err(fs_info, "invalid chunk logical %llu", logical); 6338 return -EIO; 6339 } 6340 if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { 6341 btrfs_err(fs_info, "invalid chunk sectorsize %u", 6342 btrfs_chunk_sector_size(leaf, chunk)); 6343 return -EIO; 6344 } 6345 if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { 6346 btrfs_err(fs_info, "invalid chunk length %llu", length); 6347 return -EIO; 6348 } 6349 if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { 6350 btrfs_err(fs_info, "invalid chunk stripe length: %llu", 6351 stripe_len); 6352 return -EIO; 6353 } 6354 if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & 6355 type) { 6356 btrfs_err(fs_info, "unrecognized chunk type: %llu", 6357 ~(BTRFS_BLOCK_GROUP_TYPE_MASK | 6358 BTRFS_BLOCK_GROUP_PROFILE_MASK) & 6359 btrfs_chunk_type(leaf, chunk)); 6360 return -EIO; 6361 } 6362 6363 if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { 6364 btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type); 6365 return -EIO; 6366 } 6367 6368 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && 6369 (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { 6370 btrfs_err(fs_info, 6371 "system chunk with data or metadata type: 0x%llx", type); 6372 return -EIO; 6373 } 6374 6375 features = 
btrfs_super_incompat_flags(fs_info->super_copy); 6376 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 6377 mixed = true; 6378 6379 if (!mixed) { 6380 if ((type & BTRFS_BLOCK_GROUP_METADATA) && 6381 (type & BTRFS_BLOCK_GROUP_DATA)) { 6382 btrfs_err(fs_info, 6383 "mixed chunk type in non-mixed mode: 0x%llx", type); 6384 return -EIO; 6385 } 6386 } 6387 6388 if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || 6389 (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || 6390 (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || 6391 (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || 6392 (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || 6393 ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && 6394 num_stripes != 1)) { 6395 btrfs_err(fs_info, 6396 "invalid num_stripes:sub_stripes %u:%u for profile %llu", 6397 num_stripes, sub_stripes, 6398 type & BTRFS_BLOCK_GROUP_PROFILE_MASK); 6399 return -EIO; 6400 } 6401 6402 return 0; 6403 } 6404 6405 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 6406 u64 devid, u8 *uuid, bool error) 6407 { 6408 if (error) 6409 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 6410 devid, uuid); 6411 else 6412 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 6413 devid, uuid); 6414 } 6415 6416 static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, 6417 struct extent_buffer *leaf, 6418 struct btrfs_chunk *chunk) 6419 { 6420 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 6421 struct map_lookup *map; 6422 struct extent_map *em; 6423 u64 logical; 6424 u64 length; 6425 u64 devid; 6426 u8 uuid[BTRFS_UUID_SIZE]; 6427 int num_stripes; 6428 int ret; 6429 int i; 6430 6431 logical = key->offset; 6432 length = btrfs_chunk_length(leaf, chunk); 6433 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6434 6435 ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical); 6436 if (ret) 6437 return ret; 6438 6439 read_lock(&map_tree->map_tree.lock); 6440 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 6441 read_unlock(&map_tree->map_tree.lock); 6442 6443 /* already mapped? 
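	 * read_one_chunk() is called both for the sys_chunk_array and for the
	 * chunk tree, so system chunks can legitimately show up twice; a
	 * lookup hit simply means this chunk was added earlier.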
*/ 6444 if (em && em->start <= logical && em->start + em->len > logical) { 6445 free_extent_map(em); 6446 return 0; 6447 } else if (em) { 6448 free_extent_map(em); 6449 } 6450 6451 em = alloc_extent_map(); 6452 if (!em) 6453 return -ENOMEM; 6454 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 6455 if (!map) { 6456 free_extent_map(em); 6457 return -ENOMEM; 6458 } 6459 6460 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 6461 em->map_lookup = map; 6462 em->start = logical; 6463 em->len = length; 6464 em->orig_start = 0; 6465 em->block_start = 0; 6466 em->block_len = em->len; 6467 6468 map->num_stripes = num_stripes; 6469 map->io_width = btrfs_chunk_io_width(leaf, chunk); 6470 map->io_align = btrfs_chunk_io_align(leaf, chunk); 6471 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6472 map->type = btrfs_chunk_type(leaf, chunk); 6473 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6474 map->verified_stripes = 0; 6475 for (i = 0; i < num_stripes; i++) { 6476 map->stripes[i].physical = 6477 btrfs_stripe_offset_nr(leaf, chunk, i); 6478 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 6479 read_extent_buffer(leaf, uuid, (unsigned long) 6480 btrfs_stripe_dev_uuid_nr(chunk, i), 6481 BTRFS_UUID_SIZE); 6482 map->stripes[i].dev = btrfs_find_device(fs_info, devid, 6483 uuid, NULL); 6484 if (!map->stripes[i].dev && 6485 !btrfs_test_opt(fs_info, DEGRADED)) { 6486 free_extent_map(em); 6487 btrfs_report_missing_device(fs_info, devid, uuid, true); 6488 return -ENOENT; 6489 } 6490 if (!map->stripes[i].dev) { 6491 map->stripes[i].dev = 6492 add_missing_dev(fs_info->fs_devices, devid, 6493 uuid); 6494 if (IS_ERR(map->stripes[i].dev)) { 6495 free_extent_map(em); 6496 btrfs_err(fs_info, 6497 "failed to init missing dev %llu: %ld", 6498 devid, PTR_ERR(map->stripes[i].dev)); 6499 return PTR_ERR(map->stripes[i].dev); 6500 } 6501 btrfs_report_missing_device(fs_info, devid, uuid, false); 6502 } 6503 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 6504 &(map->stripes[i].dev->dev_state)); 6505 6506 } 6507 6508 write_lock(&map_tree->map_tree.lock); 6509 ret = add_extent_mapping(&map_tree->map_tree, em, 0); 6510 write_unlock(&map_tree->map_tree.lock); 6511 if (ret < 0) { 6512 btrfs_err(fs_info, 6513 "failed to add chunk map, start=%llu len=%llu: %d", 6514 em->start, em->len, ret); 6515 } 6516 free_extent_map(em); 6517 6518 return ret; 6519 } 6520 6521 static void fill_device_from_item(struct extent_buffer *leaf, 6522 struct btrfs_dev_item *dev_item, 6523 struct btrfs_device *device) 6524 { 6525 unsigned long ptr; 6526 6527 device->devid = btrfs_device_id(leaf, dev_item); 6528 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 6529 device->total_bytes = device->disk_total_bytes; 6530 device->commit_total_bytes = device->disk_total_bytes; 6531 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 6532 device->commit_bytes_used = device->bytes_used; 6533 device->type = btrfs_device_type(leaf, dev_item); 6534 device->io_align = btrfs_device_io_align(leaf, dev_item); 6535 device->io_width = btrfs_device_io_width(leaf, dev_item); 6536 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 6537 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 6538 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 6539 6540 ptr = btrfs_device_uuid(dev_item); 6541 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 6542 } 6543 6544 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 6545 u8 *fsid) 6546 { 6547 struct btrfs_fs_devices *fs_devices; 6548 int 
ret; 6549 6550 lockdep_assert_held(&uuid_mutex); 6551 ASSERT(fsid); 6552 6553 fs_devices = fs_info->fs_devices->seed; 6554 while (fs_devices) { 6555 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 6556 return fs_devices; 6557 6558 fs_devices = fs_devices->seed; 6559 } 6560 6561 fs_devices = find_fsid(fsid); 6562 if (!fs_devices) { 6563 if (!btrfs_test_opt(fs_info, DEGRADED)) 6564 return ERR_PTR(-ENOENT); 6565 6566 fs_devices = alloc_fs_devices(fsid); 6567 if (IS_ERR(fs_devices)) 6568 return fs_devices; 6569 6570 fs_devices->seeding = 1; 6571 fs_devices->opened = 1; 6572 return fs_devices; 6573 } 6574 6575 fs_devices = clone_fs_devices(fs_devices); 6576 if (IS_ERR(fs_devices)) 6577 return fs_devices; 6578 6579 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); 6580 if (ret) { 6581 free_fs_devices(fs_devices); 6582 fs_devices = ERR_PTR(ret); 6583 goto out; 6584 } 6585 6586 if (!fs_devices->seeding) { 6587 close_fs_devices(fs_devices); 6588 free_fs_devices(fs_devices); 6589 fs_devices = ERR_PTR(-EINVAL); 6590 goto out; 6591 } 6592 6593 fs_devices->seed = fs_info->fs_devices->seed; 6594 fs_info->fs_devices->seed = fs_devices; 6595 out: 6596 return fs_devices; 6597 } 6598 6599 static int read_one_dev(struct btrfs_fs_info *fs_info, 6600 struct extent_buffer *leaf, 6601 struct btrfs_dev_item *dev_item) 6602 { 6603 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6604 struct btrfs_device *device; 6605 u64 devid; 6606 int ret; 6607 u8 fs_uuid[BTRFS_FSID_SIZE]; 6608 u8 dev_uuid[BTRFS_UUID_SIZE]; 6609 6610 devid = btrfs_device_id(leaf, dev_item); 6611 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 6612 BTRFS_UUID_SIZE); 6613 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 6614 BTRFS_FSID_SIZE); 6615 6616 if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) { 6617 fs_devices = open_seed_devices(fs_info, fs_uuid); 6618 if (IS_ERR(fs_devices)) 6619 return PTR_ERR(fs_devices); 6620 } 6621 6622 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 6623 if (!device) { 6624 if (!btrfs_test_opt(fs_info, DEGRADED)) { 6625 btrfs_report_missing_device(fs_info, devid, 6626 dev_uuid, true); 6627 return -ENOENT; 6628 } 6629 6630 device = add_missing_dev(fs_devices, devid, dev_uuid); 6631 if (IS_ERR(device)) { 6632 btrfs_err(fs_info, 6633 "failed to add missing dev %llu: %ld", 6634 devid, PTR_ERR(device)); 6635 return PTR_ERR(device); 6636 } 6637 btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 6638 } else { 6639 if (!device->bdev) { 6640 if (!btrfs_test_opt(fs_info, DEGRADED)) { 6641 btrfs_report_missing_device(fs_info, 6642 devid, dev_uuid, true); 6643 return -ENOENT; 6644 } 6645 btrfs_report_missing_device(fs_info, devid, 6646 dev_uuid, false); 6647 } 6648 6649 if (!device->bdev && 6650 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 6651 /* 6652 * this happens when a device that was properly setup 6653 * in the device info lists suddenly goes bad. 
6654 * device->bdev is NULL, and so we have to set 6655 * device->missing to one here 6656 */ 6657 device->fs_devices->missing_devices++; 6658 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6659 } 6660 6661 /* Move the device to its own fs_devices */ 6662 if (device->fs_devices != fs_devices) { 6663 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, 6664 &device->dev_state)); 6665 6666 list_move(&device->dev_list, &fs_devices->devices); 6667 device->fs_devices->num_devices--; 6668 fs_devices->num_devices++; 6669 6670 device->fs_devices->missing_devices--; 6671 fs_devices->missing_devices++; 6672 6673 device->fs_devices = fs_devices; 6674 } 6675 } 6676 6677 if (device->fs_devices != fs_info->fs_devices) { 6678 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); 6679 if (device->generation != 6680 btrfs_device_generation(leaf, dev_item)) 6681 return -EINVAL; 6682 } 6683 6684 fill_device_from_item(leaf, dev_item, device); 6685 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 6686 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 6687 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 6688 device->fs_devices->total_rw_bytes += device->total_bytes; 6689 atomic64_add(device->total_bytes - device->bytes_used, 6690 &fs_info->free_chunk_space); 6691 } 6692 ret = 0; 6693 return ret; 6694 } 6695 6696 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 6697 { 6698 struct btrfs_root *root = fs_info->tree_root; 6699 struct btrfs_super_block *super_copy = fs_info->super_copy; 6700 struct extent_buffer *sb; 6701 struct btrfs_disk_key *disk_key; 6702 struct btrfs_chunk *chunk; 6703 u8 *array_ptr; 6704 unsigned long sb_array_offset; 6705 int ret = 0; 6706 u32 num_stripes; 6707 u32 array_size; 6708 u32 len = 0; 6709 u32 cur_offset; 6710 u64 type; 6711 struct btrfs_key key; 6712 6713 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 6714 /* 6715 * This will create extent buffer of nodesize, superblock size is 6716 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will 6717 * overallocate but we can keep it as-is, only the first page is used. 6718 */ 6719 sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET); 6720 if (IS_ERR(sb)) 6721 return PTR_ERR(sb); 6722 set_extent_buffer_uptodate(sb); 6723 btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); 6724 /* 6725 * The sb extent buffer is artificial and just used to read the system array. 6726 * set_extent_buffer_uptodate() call does not properly mark all it's 6727 * pages up-to-date when the page is larger: extent does not cover the 6728 * whole page and consequently check_page_uptodate does not find all 6729 * the page's extents up-to-date (the hole beyond sb), 6730 * write_extent_buffer then triggers a WARN_ON. 6731 * 6732 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, 6733 * but sb spans only this function. Add an explicit SetPageUptodate call 6734 * to silence the warning eg. on PowerPC 64. 
6735 */ 6736 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 6737 SetPageUptodate(sb->pages[0]); 6738 6739 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 6740 array_size = btrfs_super_sys_array_size(super_copy); 6741 6742 array_ptr = super_copy->sys_chunk_array; 6743 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 6744 cur_offset = 0; 6745 6746 while (cur_offset < array_size) { 6747 disk_key = (struct btrfs_disk_key *)array_ptr; 6748 len = sizeof(*disk_key); 6749 if (cur_offset + len > array_size) 6750 goto out_short_read; 6751 6752 btrfs_disk_key_to_cpu(&key, disk_key); 6753 6754 array_ptr += len; 6755 sb_array_offset += len; 6756 cur_offset += len; 6757 6758 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 6759 chunk = (struct btrfs_chunk *)sb_array_offset; 6760 /* 6761 * At least one btrfs_chunk with one stripe must be 6762 * present, exact stripe count check comes afterwards 6763 */ 6764 len = btrfs_chunk_item_size(1); 6765 if (cur_offset + len > array_size) 6766 goto out_short_read; 6767 6768 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 6769 if (!num_stripes) { 6770 btrfs_err(fs_info, 6771 "invalid number of stripes %u in sys_array at offset %u", 6772 num_stripes, cur_offset); 6773 ret = -EIO; 6774 break; 6775 } 6776 6777 type = btrfs_chunk_type(sb, chunk); 6778 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 6779 btrfs_err(fs_info, 6780 "invalid chunk type %llu in sys_array at offset %u", 6781 type, cur_offset); 6782 ret = -EIO; 6783 break; 6784 } 6785 6786 len = btrfs_chunk_item_size(num_stripes); 6787 if (cur_offset + len > array_size) 6788 goto out_short_read; 6789 6790 ret = read_one_chunk(fs_info, &key, sb, chunk); 6791 if (ret) 6792 break; 6793 } else { 6794 btrfs_err(fs_info, 6795 "unexpected item type %u in sys_array at offset %u", 6796 (u32)key.type, cur_offset); 6797 ret = -EIO; 6798 break; 6799 } 6800 array_ptr += len; 6801 sb_array_offset += len; 6802 cur_offset += len; 6803 } 6804 clear_extent_buffer_uptodate(sb); 6805 free_extent_buffer_stale(sb); 6806 return ret; 6807 6808 out_short_read: 6809 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 6810 len, cur_offset); 6811 clear_extent_buffer_uptodate(sb); 6812 free_extent_buffer_stale(sb); 6813 return -EIO; 6814 } 6815 6816 /* 6817 * Check if all chunks in the fs are OK for read-write degraded mount 6818 * 6819 * If the @failing_dev is specified, it's accounted as missing. 6820 * 6821 * Return true if all chunks meet the minimal RW mount requirements. 6822 * Return false if any chunk doesn't meet the minimal RW mount requirements. 6823 */ 6824 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 6825 struct btrfs_device *failing_dev) 6826 { 6827 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 6828 struct extent_map *em; 6829 u64 next_start = 0; 6830 bool ret = true; 6831 6832 read_lock(&map_tree->map_tree.lock); 6833 em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1); 6834 read_unlock(&map_tree->map_tree.lock); 6835 /* No chunk at all? 
Return false anyway */ 6836 if (!em) { 6837 ret = false; 6838 goto out; 6839 } 6840 while (em) { 6841 struct map_lookup *map; 6842 int missing = 0; 6843 int max_tolerated; 6844 int i; 6845 6846 map = em->map_lookup; 6847 max_tolerated = 6848 btrfs_get_num_tolerated_disk_barrier_failures( 6849 map->type); 6850 for (i = 0; i < map->num_stripes; i++) { 6851 struct btrfs_device *dev = map->stripes[i].dev; 6852 6853 if (!dev || !dev->bdev || 6854 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 6855 dev->last_flush_error) 6856 missing++; 6857 else if (failing_dev && failing_dev == dev) 6858 missing++; 6859 } 6860 if (missing > max_tolerated) { 6861 if (!failing_dev) 6862 btrfs_warn(fs_info, 6863 "chunk %llu missing %d devices, max tolerance is %d for writeable mount", 6864 em->start, missing, max_tolerated); 6865 free_extent_map(em); 6866 ret = false; 6867 goto out; 6868 } 6869 next_start = extent_map_end(em); 6870 free_extent_map(em); 6871 6872 read_lock(&map_tree->map_tree.lock); 6873 em = lookup_extent_mapping(&map_tree->map_tree, next_start, 6874 (u64)(-1) - next_start); 6875 read_unlock(&map_tree->map_tree.lock); 6876 } 6877 out: 6878 return ret; 6879 } 6880 6881 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 6882 { 6883 struct btrfs_root *root = fs_info->chunk_root; 6884 struct btrfs_path *path; 6885 struct extent_buffer *leaf; 6886 struct btrfs_key key; 6887 struct btrfs_key found_key; 6888 int ret; 6889 int slot; 6890 u64 total_dev = 0; 6891 6892 path = btrfs_alloc_path(); 6893 if (!path) 6894 return -ENOMEM; 6895 6896 /* 6897 * uuid_mutex is needed only if we are mounting a sprout FS 6898 * otherwise we don't need it. 6899 */ 6900 mutex_lock(&uuid_mutex); 6901 mutex_lock(&fs_info->chunk_mutex); 6902 6903 /* 6904 * Read all device items, and then all the chunk items. All 6905 * device items are found before any chunk item (their object id 6906 * is smaller than the lowest possible object id for a chunk 6907 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 6908 */ 6909 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 6910 key.offset = 0; 6911 key.type = 0; 6912 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6913 if (ret < 0) 6914 goto error; 6915 while (1) { 6916 leaf = path->nodes[0]; 6917 slot = path->slots[0]; 6918 if (slot >= btrfs_header_nritems(leaf)) { 6919 ret = btrfs_next_leaf(root, path); 6920 if (ret == 0) 6921 continue; 6922 if (ret < 0) 6923 goto error; 6924 break; 6925 } 6926 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6927 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 6928 struct btrfs_dev_item *dev_item; 6929 dev_item = btrfs_item_ptr(leaf, slot, 6930 struct btrfs_dev_item); 6931 ret = read_one_dev(fs_info, leaf, dev_item); 6932 if (ret) 6933 goto error; 6934 total_dev++; 6935 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 6936 struct btrfs_chunk *chunk; 6937 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 6938 ret = read_one_chunk(fs_info, &found_key, leaf, chunk); 6939 if (ret) 6940 goto error; 6941 } 6942 path->slots[0]++; 6943 } 6944 6945 /* 6946 * After loading chunk tree, we've got all device information, 6947 * do another round of validation checks. 
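	 * Checked below:
	 * - the number of device items read (total_dev) must match the
	 *   device count tracked in fs_devices / the superblock
	 * - total_rw_bytes summed from the device items must not exceed the
	 *   superblock's total_bytes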
6948 */ 6949 if (total_dev != fs_info->fs_devices->total_devices) { 6950 btrfs_err(fs_info, 6951 "super_num_devices %llu mismatch with num_devices %llu found here", 6952 btrfs_super_num_devices(fs_info->super_copy), 6953 total_dev); 6954 ret = -EINVAL; 6955 goto error; 6956 } 6957 if (btrfs_super_total_bytes(fs_info->super_copy) < 6958 fs_info->fs_devices->total_rw_bytes) { 6959 btrfs_err(fs_info, 6960 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", 6961 btrfs_super_total_bytes(fs_info->super_copy), 6962 fs_info->fs_devices->total_rw_bytes); 6963 ret = -EINVAL; 6964 goto error; 6965 } 6966 ret = 0; 6967 error: 6968 mutex_unlock(&fs_info->chunk_mutex); 6969 mutex_unlock(&uuid_mutex); 6970 6971 btrfs_free_path(path); 6972 return ret; 6973 } 6974 6975 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 6976 { 6977 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6978 struct btrfs_device *device; 6979 6980 while (fs_devices) { 6981 mutex_lock(&fs_devices->device_list_mutex); 6982 list_for_each_entry(device, &fs_devices->devices, dev_list) 6983 device->fs_info = fs_info; 6984 mutex_unlock(&fs_devices->device_list_mutex); 6985 6986 fs_devices = fs_devices->seed; 6987 } 6988 } 6989 6990 static void __btrfs_reset_dev_stats(struct btrfs_device *dev) 6991 { 6992 int i; 6993 6994 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 6995 btrfs_dev_stat_reset(dev, i); 6996 } 6997 6998 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 6999 { 7000 struct btrfs_key key; 7001 struct btrfs_key found_key; 7002 struct btrfs_root *dev_root = fs_info->dev_root; 7003 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7004 struct extent_buffer *eb; 7005 int slot; 7006 int ret = 0; 7007 struct btrfs_device *device; 7008 struct btrfs_path *path = NULL; 7009 int i; 7010 7011 path = btrfs_alloc_path(); 7012 if (!path) { 7013 ret = -ENOMEM; 7014 goto out; 7015 } 7016 7017 mutex_lock(&fs_devices->device_list_mutex); 7018 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7019 int item_size; 7020 struct btrfs_dev_stats_item *ptr; 7021 7022 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7023 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7024 key.offset = device->devid; 7025 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 7026 if (ret) { 7027 __btrfs_reset_dev_stats(device); 7028 device->dev_stats_valid = 1; 7029 btrfs_release_path(path); 7030 continue; 7031 } 7032 slot = path->slots[0]; 7033 eb = path->nodes[0]; 7034 btrfs_item_key_to_cpu(eb, &found_key, slot); 7035 item_size = btrfs_item_size_nr(eb, slot); 7036 7037 ptr = btrfs_item_ptr(eb, slot, 7038 struct btrfs_dev_stats_item); 7039 7040 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7041 if (item_size >= (1 + i) * sizeof(__le64)) 7042 btrfs_dev_stat_set(device, i, 7043 btrfs_dev_stats_value(eb, ptr, i)); 7044 else 7045 btrfs_dev_stat_reset(device, i); 7046 } 7047 7048 device->dev_stats_valid = 1; 7049 btrfs_dev_stat_print_on_load(device); 7050 btrfs_release_path(path); 7051 } 7052 mutex_unlock(&fs_devices->device_list_mutex); 7053 7054 out: 7055 btrfs_free_path(path); 7056 return ret < 0 ? 
ret : 0; 7057 } 7058 7059 static int update_dev_stat_item(struct btrfs_trans_handle *trans, 7060 struct btrfs_device *device) 7061 { 7062 struct btrfs_fs_info *fs_info = trans->fs_info; 7063 struct btrfs_root *dev_root = fs_info->dev_root; 7064 struct btrfs_path *path; 7065 struct btrfs_key key; 7066 struct extent_buffer *eb; 7067 struct btrfs_dev_stats_item *ptr; 7068 int ret; 7069 int i; 7070 7071 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7072 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7073 key.offset = device->devid; 7074 7075 path = btrfs_alloc_path(); 7076 if (!path) 7077 return -ENOMEM; 7078 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 7079 if (ret < 0) { 7080 btrfs_warn_in_rcu(fs_info, 7081 "error %d while searching for dev_stats item for device %s", 7082 ret, rcu_str_deref(device->name)); 7083 goto out; 7084 } 7085 7086 if (ret == 0 && 7087 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 7088 /* need to delete old one and insert a new one */ 7089 ret = btrfs_del_item(trans, dev_root, path); 7090 if (ret != 0) { 7091 btrfs_warn_in_rcu(fs_info, 7092 "delete too small dev_stats item for device %s failed %d", 7093 rcu_str_deref(device->name), ret); 7094 goto out; 7095 } 7096 ret = 1; 7097 } 7098 7099 if (ret == 1) { 7100 /* need to insert a new item */ 7101 btrfs_release_path(path); 7102 ret = btrfs_insert_empty_item(trans, dev_root, path, 7103 &key, sizeof(*ptr)); 7104 if (ret < 0) { 7105 btrfs_warn_in_rcu(fs_info, 7106 "insert dev_stats item for device %s failed %d", 7107 rcu_str_deref(device->name), ret); 7108 goto out; 7109 } 7110 } 7111 7112 eb = path->nodes[0]; 7113 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 7114 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7115 btrfs_set_dev_stats_value(eb, ptr, i, 7116 btrfs_dev_stat_read(device, i)); 7117 btrfs_mark_buffer_dirty(eb); 7118 7119 out: 7120 btrfs_free_path(path); 7121 return ret; 7122 } 7123 7124 /* 7125 * called from commit_transaction. Writes all changed device stats to disk. 7126 */ 7127 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 7128 struct btrfs_fs_info *fs_info) 7129 { 7130 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7131 struct btrfs_device *device; 7132 int stats_cnt; 7133 int ret = 0; 7134 7135 mutex_lock(&fs_devices->device_list_mutex); 7136 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7137 stats_cnt = atomic_read(&device->dev_stats_ccnt); 7138 if (!device->dev_stats_valid || stats_cnt == 0) 7139 continue; 7140 7141 7142 /* 7143 * There is a LOAD-LOAD control dependency between the value of 7144 * dev_stats_ccnt and updating the on-disk values which requires 7145 * reading the in-memory counters. Such control dependencies 7146 * require explicit read memory barriers. 
7147 * 7148 * This memory barriers pairs with smp_mb__before_atomic in 7149 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full 7150 * barrier implied by atomic_xchg in 7151 * btrfs_dev_stats_read_and_reset 7152 */ 7153 smp_rmb(); 7154 7155 ret = update_dev_stat_item(trans, device); 7156 if (!ret) 7157 atomic_sub(stats_cnt, &device->dev_stats_ccnt); 7158 } 7159 mutex_unlock(&fs_devices->device_list_mutex); 7160 7161 return ret; 7162 } 7163 7164 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) 7165 { 7166 btrfs_dev_stat_inc(dev, index); 7167 btrfs_dev_stat_print_on_error(dev); 7168 } 7169 7170 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) 7171 { 7172 if (!dev->dev_stats_valid) 7173 return; 7174 btrfs_err_rl_in_rcu(dev->fs_info, 7175 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7176 rcu_str_deref(dev->name), 7177 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7178 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7179 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7180 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7181 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7182 } 7183 7184 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 7185 { 7186 int i; 7187 7188 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7189 if (btrfs_dev_stat_read(dev, i) != 0) 7190 break; 7191 if (i == BTRFS_DEV_STAT_VALUES_MAX) 7192 return; /* all values == 0, suppress message */ 7193 7194 btrfs_info_in_rcu(dev->fs_info, 7195 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7196 rcu_str_deref(dev->name), 7197 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7198 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7199 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7200 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7201 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7202 } 7203 7204 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, 7205 struct btrfs_ioctl_get_dev_stats *stats) 7206 { 7207 struct btrfs_device *dev; 7208 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7209 int i; 7210 7211 mutex_lock(&fs_devices->device_list_mutex); 7212 dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL); 7213 mutex_unlock(&fs_devices->device_list_mutex); 7214 7215 if (!dev) { 7216 btrfs_warn(fs_info, "get dev_stats failed, device not found"); 7217 return -ENODEV; 7218 } else if (!dev->dev_stats_valid) { 7219 btrfs_warn(fs_info, "get dev_stats failed, not yet valid"); 7220 return -ENODEV; 7221 } else if (stats->flags & BTRFS_DEV_STATS_RESET) { 7222 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7223 if (stats->nr_items > i) 7224 stats->values[i] = 7225 btrfs_dev_stat_read_and_reset(dev, i); 7226 else 7227 btrfs_dev_stat_reset(dev, i); 7228 } 7229 } else { 7230 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7231 if (stats->nr_items > i) 7232 stats->values[i] = btrfs_dev_stat_read(dev, i); 7233 } 7234 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) 7235 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 7236 return 0; 7237 } 7238 7239 void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path) 7240 { 7241 struct buffer_head *bh; 7242 struct btrfs_super_block *disk_super; 7243 int copy_num; 7244 7245 if (!bdev) 7246 return; 7247 7248 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; 7249 copy_num++) { 7250 7251 if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) 7252 continue; 7253 7254 disk_super = (struct btrfs_super_block 
*)bh->b_data; 7255 7256 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 7257 set_buffer_dirty(bh); 7258 sync_dirty_buffer(bh); 7259 brelse(bh); 7260 } 7261 7262 /* Notify udev that device has changed */ 7263 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 7264 7265 /* Update ctime/mtime for device path for libblkid */ 7266 update_dev_time(device_path); 7267 } 7268 7269 /* 7270 * Update the size of all devices, which is used for writing out the 7271 * super blocks. 7272 */ 7273 void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) 7274 { 7275 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7276 struct btrfs_device *curr, *next; 7277 7278 if (list_empty(&fs_devices->resized_devices)) 7279 return; 7280 7281 mutex_lock(&fs_devices->device_list_mutex); 7282 mutex_lock(&fs_info->chunk_mutex); 7283 list_for_each_entry_safe(curr, next, &fs_devices->resized_devices, 7284 resized_list) { 7285 list_del_init(&curr->resized_list); 7286 curr->commit_total_bytes = curr->disk_total_bytes; 7287 } 7288 mutex_unlock(&fs_info->chunk_mutex); 7289 mutex_unlock(&fs_devices->device_list_mutex); 7290 } 7291 7292 /* Must be invoked during the transaction commit */ 7293 void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans) 7294 { 7295 struct btrfs_fs_info *fs_info = trans->fs_info; 7296 struct extent_map *em; 7297 struct map_lookup *map; 7298 struct btrfs_device *dev; 7299 int i; 7300 7301 if (list_empty(&trans->pending_chunks)) 7302 return; 7303 7304 /* In order to kick the device replace finish process */ 7305 mutex_lock(&fs_info->chunk_mutex); 7306 list_for_each_entry(em, &trans->pending_chunks, list) { 7307 map = em->map_lookup; 7308 7309 for (i = 0; i < map->num_stripes; i++) { 7310 dev = map->stripes[i].dev; 7311 dev->commit_bytes_used = dev->bytes_used; 7312 } 7313 } 7314 mutex_unlock(&fs_info->chunk_mutex); 7315 } 7316 7317 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) 7318 { 7319 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7320 while (fs_devices) { 7321 fs_devices->fs_info = fs_info; 7322 fs_devices = fs_devices->seed; 7323 } 7324 } 7325 7326 void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) 7327 { 7328 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7329 while (fs_devices) { 7330 fs_devices->fs_info = NULL; 7331 fs_devices = fs_devices->seed; 7332 } 7333 } 7334 7335 /* 7336 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. 
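 *
 * These profiles store two copies of every byte, hence factor 2; all other
 * profiles (single, RAID0, RAID5/6) get a factor of 1.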
7337 */ 7338 int btrfs_bg_type_to_factor(u64 flags) 7339 { 7340 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 7341 BTRFS_BLOCK_GROUP_RAID10)) 7342 return 2; 7343 return 1; 7344 } 7345 7346 7347 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) 7348 { 7349 int index = btrfs_bg_flags_to_raid_index(type); 7350 int ncopies = btrfs_raid_array[index].ncopies; 7351 int data_stripes; 7352 7353 switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 7354 case BTRFS_BLOCK_GROUP_RAID5: 7355 data_stripes = num_stripes - 1; 7356 break; 7357 case BTRFS_BLOCK_GROUP_RAID6: 7358 data_stripes = num_stripes - 2; 7359 break; 7360 default: 7361 data_stripes = num_stripes / ncopies; 7362 break; 7363 } 7364 return div_u64(chunk_len, data_stripes); 7365 } 7366 7367 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, 7368 u64 chunk_offset, u64 devid, 7369 u64 physical_offset, u64 physical_len) 7370 { 7371 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 7372 struct extent_map *em; 7373 struct map_lookup *map; 7374 u64 stripe_len; 7375 bool found = false; 7376 int ret = 0; 7377 int i; 7378 7379 read_lock(&em_tree->lock); 7380 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 7381 read_unlock(&em_tree->lock); 7382 7383 if (!em) { 7384 btrfs_err(fs_info, 7385 "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", 7386 physical_offset, devid); 7387 ret = -EUCLEAN; 7388 goto out; 7389 } 7390 7391 map = em->map_lookup; 7392 stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes); 7393 if (physical_len != stripe_len) { 7394 btrfs_err(fs_info, 7395 "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", 7396 physical_offset, devid, em->start, physical_len, 7397 stripe_len); 7398 ret = -EUCLEAN; 7399 goto out; 7400 } 7401 7402 for (i = 0; i < map->num_stripes; i++) { 7403 if (map->stripes[i].dev->devid == devid && 7404 map->stripes[i].physical == physical_offset) { 7405 found = true; 7406 if (map->verified_stripes >= map->num_stripes) { 7407 btrfs_err(fs_info, 7408 "too many dev extents for chunk %llu found", 7409 em->start); 7410 ret = -EUCLEAN; 7411 goto out; 7412 } 7413 map->verified_stripes++; 7414 break; 7415 } 7416 } 7417 if (!found) { 7418 btrfs_err(fs_info, 7419 "dev extent physical offset %llu devid %llu has no corresponding chunk", 7420 physical_offset, devid); 7421 ret = -EUCLEAN; 7422 } 7423 out: 7424 free_extent_map(em); 7425 return ret; 7426 } 7427 7428 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) 7429 { 7430 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 7431 struct extent_map *em; 7432 struct rb_node *node; 7433 int ret = 0; 7434 7435 read_lock(&em_tree->lock); 7436 for (node = rb_first(&em_tree->map); node; node = rb_next(node)) { 7437 em = rb_entry(node, struct extent_map, rb_node); 7438 if (em->map_lookup->num_stripes != 7439 em->map_lookup->verified_stripes) { 7440 btrfs_err(fs_info, 7441 "chunk %llu has missing dev extent, have %d expect %d", 7442 em->start, em->map_lookup->verified_stripes, 7443 em->map_lookup->num_stripes); 7444 ret = -EUCLEAN; 7445 goto out; 7446 } 7447 } 7448 out: 7449 read_unlock(&em_tree->lock); 7450 return ret; 7451 } 7452 7453 /* 7454 * Ensure that all dev extents are mapped to correct chunk, otherwise 7455 * later chunk allocation/free would cause unexpected behavior. 
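 * Every dev extent must point back to an existing chunk stripe (same devid
 * and physical offset, with a length matching the per-device stripe length),
 * and the number of dev extents found for a chunk must match its stripe
 * count.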
7456 * 7457 * NOTE: This will iterate through the whole device tree, which should be of 7458 * the same size level as the chunk tree. This slightly increases mount time. 7459 */ 7460 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) 7461 { 7462 struct btrfs_path *path; 7463 struct btrfs_root *root = fs_info->dev_root; 7464 struct btrfs_key key; 7465 int ret = 0; 7466 7467 key.objectid = 1; 7468 key.type = BTRFS_DEV_EXTENT_KEY; 7469 key.offset = 0; 7470 7471 path = btrfs_alloc_path(); 7472 if (!path) 7473 return -ENOMEM; 7474 7475 path->reada = READA_FORWARD; 7476 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7477 if (ret < 0) 7478 goto out; 7479 7480 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 7481 ret = btrfs_next_item(root, path); 7482 if (ret < 0) 7483 goto out; 7484 /* No dev extents at all? Not good */ 7485 if (ret > 0) { 7486 ret = -EUCLEAN; 7487 goto out; 7488 } 7489 } 7490 while (1) { 7491 struct extent_buffer *leaf = path->nodes[0]; 7492 struct btrfs_dev_extent *dext; 7493 int slot = path->slots[0]; 7494 u64 chunk_offset; 7495 u64 physical_offset; 7496 u64 physical_len; 7497 u64 devid; 7498 7499 btrfs_item_key_to_cpu(leaf, &key, slot); 7500 if (key.type != BTRFS_DEV_EXTENT_KEY) 7501 break; 7502 devid = key.objectid; 7503 physical_offset = key.offset; 7504 7505 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); 7506 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); 7507 physical_len = btrfs_dev_extent_length(leaf, dext); 7508 7509 ret = verify_one_dev_extent(fs_info, chunk_offset, devid, 7510 physical_offset, physical_len); 7511 if (ret < 0) 7512 goto out; 7513 ret = btrfs_next_item(root, path); 7514 if (ret < 0) 7515 goto out; 7516 if (ret > 0) { 7517 ret = 0; 7518 break; 7519 } 7520 } 7521 7522 /* Ensure all chunks have corresponding dev extents */ 7523 ret = verify_chunk_dev_extent_mapping(fs_info); 7524 out: 7525 btrfs_free_path(path); 7526 return ret; 7527 } 7528