// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 3,
	},
};

const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
	[BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
	[BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
	[BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
	[BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
};

/*
 * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
 * condition is not met. Zero means there's no corresponding
 * BTRFS_ERROR_DEV_*_NOT_MET value.
 */
const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	[BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	[BTRFS_RAID_DUP]    = 0,
	[BTRFS_RAID_RAID0]  = 0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	[BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
};

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks, but not block groups, extents or files.
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount, either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, i.e.
 * adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * volume_mutex
 * ------------
 * coarse lock owned by a mounted filesystem; used to exclude some operations
 * that cannot run in parallel and affect the higher-level properties of the
 * filesystem like: device add/delete/resize/replace, or balance
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
 */

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:	if not NULL, copy the uuid to fs_devices::fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->resized_devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

static void free_device(struct btrfs_device *device)
{
	rcu_string_free(device->name);
	bio_put(device->flush_bio);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		free_device(device);
	}
	kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
				 enum kobject_action action)
{
	int ret;

	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
	if (ret)
		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
			action,
			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
			&disk_to_dev(bdev->bd_disk)->kobj);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, list);
		list_del(&fs_devices->list);
		free_fs_devices(fs_devices);
	}
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
	struct btrfs_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate a bio that's always going to be used for flushing device
	 * barriers and matches the device lifespan
	 */
	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
	if (!dev->flush_bio) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->resized_list);

	spin_lock_init(&dev->io_lock);

	atomic_set(&dev->reada_in_flight, 0);
	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

	return dev;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
					u64 devid, const u8 *uuid)
{
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*bh = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*bh)) {
		ret = PTR_ERR(*bh);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	*bh = NULL;
	return ret;
}

static void requeue_list(struct btrfs_pending_bios *pending_bios,
			 struct bio *head, struct bio *tail)
{
	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device. This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block. The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.
 * This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long last_waited = 0;
	int force_reg = 0;
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device. We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = device->bdev->bd_bdi;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held). But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop. Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		    pending_bios != &device->pending_sync_bios &&
		    device->pending_sync_bios.head) ||
		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		    device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;

		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		btrfsic_submit_bio(cur);
		num_run++;
		batch_run++;

		cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.
		 * Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop. So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				cond_resched();
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_queue_work(fs_info->submit_workers,
					 &device->work);
			goto done;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}

/*
 * Search and remove all stale devices (devices which are not mounted).
 * When both inputs are NULL, it will search for and release all stale devices.
 *
 * @path:	Optional. When provided, only unmounted devices matching this
 *		path are released.
 * @skip_dev:	Optional. This device is skipped when searching for stale
 *		devices.
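 *
 * For example, btrfs_free_stale_devices(NULL, NULL) drops every registered
 * device that does not belong to a currently mounted filesystem.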
 */
static void btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_dev)
{
	struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
	struct btrfs_device *dev, *tmp_dev;

	list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) {

		if (fs_devs->opened)
			continue;

		list_for_each_entry_safe(dev, tmp_dev,
					 &fs_devs->devices, dev_list) {
			int not_found = 0;

			if (skip_dev && skip_dev == dev)
				continue;
			if (path && !dev->name)
				continue;

			rcu_read_lock();
			if (path)
				not_found = strcmp(rcu_str_deref(dev->name),
						   path);
			rcu_read_unlock();
			if (not_found)
				continue;

			/* delete the stale device */
			if (fs_devs->num_devices == 1) {
				btrfs_sysfs_remove_fsid(fs_devs);
				list_del(&fs_devs->list);
				free_fs_devices(fs_devs);
				break;
			} else {
				fs_devs->num_devices--;
				list_del(&dev->dev_list);
				free_device(dev);
			}
		}
	}
}

static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
				 struct btrfs_device *device, fmode_t flags,
				 void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &bh);
	if (ret)
		return ret;

	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_brelse;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_brelse;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = 1;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = 1;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	brelse(bh);

	return 0;

error_brelse:
	brelse(bh);
	blkdev_put(bdev, flags);

	return -EINVAL;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = alloc_fs_devices(disk_super->fsid);
		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		list_add(&fs_devices->list, &fs_uuids);

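		/*
		 * Brand new fs_devices: there is no existing device entry
		 * to match against, so fall through to creating one below.
		 */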
		device = NULL;
	} else {
		device = find_device(fs_devices, devid,
				     disk_super->dev_item.uuid);
	}

	if (!device) {
		if (fs_devices->opened)
			return ERR_PTR(-EBUSY);

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			free_device(device);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;
		mutex_unlock(&fs_devices->device_list_mutex);

		device->fs_devices = fs_devices;
		btrfs_free_stale_devices(path, device);

		if (disk_super->label[0])
			pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
				disk_super->label, devid, found_transid, path);
		else
			pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
				disk_super->fsid, devid, found_transid, path);

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When the FS is already mounted:
		 * 1. If you are here and if the device->name is NULL, that
		 *    means this device was missing at the time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path', that means either
		 *    a. The same device disappeared and reappeared with a
		 *       different name, or
		 *    b. The missing-disk-which-was-replaced has
		 *       reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further, in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transactions when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted. We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with the same uuid and devid. We keep the one
			 * with the larger generation number or the last-in if
			 * the generations are equal.
			 */
			return ERR_PTR(-EEXIST);
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name)
			return ERR_PTR(-ENOMEM);
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened)
		device->generation = found_transid;

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	return device;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = alloc_fs_devices(orig->fsid);
	if (IS_ERR(fs_devices))
		return fs_devices;

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

	/* We have held the volume lock, it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device))
			goto error;

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
			if (!name) {
				free_device(device);
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

/*
 * After we have read the system tree and know devids belonging to
 * this filesystem, remove the device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
	struct btrfs_device *device, *next;
	struct btrfs_device *latest_dev = NULL;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
							&device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    (!latest_dev ||
			     device->generation > latest_dev->generation)) {
				latest_dev = device;
			}
			continue;
		}

		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
			 * In the second step, the dev_replace state is
			 * read from the device tree and it is known
			 * whether the procedure is really active or
			 * not, which means whether this device is
			 * used or whether it should be removed.
			 */
			if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
						  &device->dev_state)) {
				continue;
			}
		}
		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state))
				fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		free_device(device);
	}

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
}

static void free_device_rcu(struct rcu_head *head)
{
	struct btrfs_device *device;

	device = container_of(head, struct btrfs_device, rcu);
	free_device(device);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

static void btrfs_prepare_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;
	struct btrfs_device *new_device;
	struct rcu_string *name;

	if (device->bdev)
		fs_devices->open_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		fs_devices->missing_devices--;

	new_device = btrfs_alloc_device(NULL, &device->devid,
					device->uuid);
	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

	/* Safe because we are under uuid_mutex */
	if (device->name) {
		name = rcu_string_strdup(device->name->str, GFP_NOFS);
		BUG_ON(!name); /* -ENOMEM */
		rcu_assign_pointer(new_device->name, name);
	}

	list_replace_rcu(&device->dev_list, &new_device->dev_list);
	new_device->fs_devices = device->fs_devices;
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;
	struct list_head pending_put;

	INIT_LIST_HEAD(&pending_put);

	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
		btrfs_prepare_close_one_device(device);
		list_add(&device->dev_list, &pending_put);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * btrfs_show_devname() uses the device_list_mutex, and sometimes a
	 * call to blkdev_put() leads to the VFS calling back into this
	 * function. So do the put outside of device_list_mutex, as of now.
	 */
	while (!list_empty(&pending_put)) {
		device = list_first_entry(&pending_put,
				struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_close_bdev(device);
		call_rcu(&device->rcu, free_device_rcu);
	}

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = __btrfs_close_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	return ret;
}

static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	int ret = 0;

	flags |= FMODE_EXCL;

	list_for_each_entry(device, head, dev_list) {
		/* Just open everything we can; ignore failures here */
		if (btrfs_open_one_device(fs_devices, device, flags, holder))
			continue;

		if (!latest_dev ||
		    device->generation > latest_dev->generation)
			latest_dev = device;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EINVAL;
		goto out;
	}
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	mutex_lock(&uuid_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = __btrfs_open_devices(fs_devices, flags, holder);
	}
	mutex_unlock(&uuid_mutex);
	return ret;
}

static void btrfs_release_disk_super(struct page *page)
{
	kunmap(page);
	put_page(page);
}

static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
				 struct page **page,
				 struct btrfs_super_block **disk_super)
{
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return 1;

	/* make sure our super fits in the page */
	if (sizeof(**disk_super) > PAGE_SIZE)
		return 1;

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
		return 1;

	/* pull in the page with our super */
	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
				    index, GFP_KERNEL);

	if (IS_ERR_OR_NULL(*page))
		return 1;

	p = kmap(*page);

	/* align our pointer to the offset of the super block */
	*disk_super = p + (bytenr & ~PAGE_MASK);

	if (btrfs_super_bytenr(*disk_super) != bytenr ||
	    btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(*page);
		return 1;
	}

	if ((*disk_super)->label[0] &&
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1])
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

	return 0;
}

/*
 * Look for a btrfs signature on a device. This may be called outside of the
 * mount path and we are not allowed to call set_blocksize during the scan.
 * The superblock is read via the pagecache.
 */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *disk_super;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct page *page;
	int ret = 0;
	u64 bytenr;

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	bytenr = btrfs_sb_offset(0);
	flags |= FMODE_EXCL;
	mutex_lock(&uuid_mutex);

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev)) {
		ret = PTR_ERR(bdev);
		goto error;
	}

	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
		ret = -EINVAL;
		goto error_bdev_put;
	}

	device = device_list_add(path, disk_super);
	if (IS_ERR(device))
		ret = PTR_ERR(device);
	else
		*fs_devices_ret = device->fs_devices;

	btrfs_release_disk_super(page);

error_bdev_put:
	blkdev_put(bdev, flags);
error:
	mutex_unlock(&uuid_mutex);
	return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
				   u64 end, u64 *length)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->fs_info->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 extent_end;
	int ret;
	int slot;
	struct extent_buffer *l;

	*length = 0;

	if (start >= device->total_bytes ||
		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset +
			btrfs_dev_extent_length(l, dev_extent);
		if (key.offset <= start && extent_end > end) {
			*length = end - start + 1;
			break;
		} else if (key.offset <= start && extent_end > start)
			*length += extent_end - start;
		else if (key.offset > start && extent_end <= end)
			*length += extent_end - key.offset;
		else if (key.offset > start && key.offset <= end) {
			*length += end - key.offset + 1;
			break;
		} else if (key.offset > end)
			break;

next:
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

static int contains_pending_extent(struct btrfs_transaction *transaction,
				   struct btrfs_device *device,
				   u64 *start, u64 len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct extent_map *em;
	struct list_head *search_list = &fs_info->pinned_chunks;
	int ret = 0;
	u64 physical_start = *start;

	if (transaction)
		search_list = &transaction->pending_chunks;
again:
	list_for_each_entry(em, search_list, list) {
		struct map_lookup *map;
		int i;

		map = em->map_lookup;
		for (i = 0; i < map->num_stripes; i++) {
			u64 end;

			if (map->stripes[i].dev != device)
				continue;
			if (map->stripes[i].physical >= physical_start + len ||
			    map->stripes[i].physical + em->orig_block_len <=
			    physical_start)
				continue;
			/*
			 * Make sure that while processing the pinned list we do
			 * not override our *start with a lower value, because
			 * we can have pinned chunks that fall within this
			 * device hole and that have lower physical addresses
			 * than the pending chunks we processed before. If we
			 * do not take this special care we can end up getting
			 * 2 pending chunks that start at the same physical
			 * device offsets because the end offset of a pinned
			 * chunk can be equal to the start offset of some
			 * pending chunk.
			 */
			end = map->stripes[i].physical + em->orig_block_len;
			if (end > *start) {
				*start = end;
				ret = 1;
			}
		}
	}
	if (search_list != &fs_info->pinned_chunks) {
		search_list = &fs_info->pinned_chunks;
		goto again;
	}

	return ret;
}


/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
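 *
 * Return: 0 when a hole of at least @num_bytes was found, -ENOSPC when the
 * largest hole is smaller than @num_bytes, or a negative error from the tree
 * search.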
 */
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
			       struct btrfs_device *device, u64 num_bytes,
			       u64 search_start, u64 *start, u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	/*
	 * We don't want to overwrite the superblock on the drive nor any area
	 * used by the boot loader (grub for example), so we make sure to start
	 * at an offset of at least 1MB.
	 */
	search_start = max_t(u64, search_start, SZ_1M);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end ||
		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;

			/*
			 * Have to check before we set max_hole_start, otherwise
			 * we could end up sending back this offset anyway.
			 */
			if (contains_pending_extent(transaction, device,
						    &search_start,
						    hole_size)) {
				if (key.offset >= search_start) {
					hole_size = key.offset - search_start;
				} else {
					WARN_ON_ONCE(1);
					hole_size = 0;
				}
			}

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;

		if (contains_pending_extent(transaction, device, &search_start,
					    hole_size)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

int find_free_dev_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(trans->transaction, device,
					  num_bytes, 0, start, len);
}

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				 struct btrfs_device *device,
				 u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to remove dev extent item");
	} else {
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
	}
out:
	btrfs_free_path(path);
	return ret;
}

static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
				  struct btrfs_device *device,
				  u64 chunk_offset, u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root =
			fs_info->dev_root;
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);
	btrfs_set_dev_extent_chunk_tree(leaf, extent,
					BTRFS_CHUNK_TREE_OBJECTID);
	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return ret;
}

static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree.map_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}

static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0); /* Corruption */

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * the device information is stored in the chunk root;
 * the btrfs_device struct should be fully filled in
 */
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info,
			      struct btrfs_device *device)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf,
			      dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Function to update ctime/mtime for a given device path.
 * Mainly used for ctime/mtime based probes like libblkid.
 */
static void update_dev_time(const char *path_name)
{
	struct file *filp;

	filp = filp_open(path_name, O_RDWR, 0);
	if (IS_ERR(filp))
		return;
	file_update_time(filp);
	filp_close(filp, NULL);
}

static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
			     struct btrfs_device *device)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);
	if (!ret)
		ret = btrfs_commit_transaction(trans);
	return ret;
}

/*
 * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number regarding e.g.
 * device replace.
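 *
 * For example, with the limits in btrfs_raid_array above, a RAID1 profile
 * requires at least two devices, so passing num_devices == 1 on a filesystem
 * using RAID1 returns BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET.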
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
					u64 num_devices)
{
	u64 all_avail;
	unsigned seq;
	int i;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		all_avail = fs_info->avail_data_alloc_bits |
			    fs_info->avail_system_alloc_bits |
			    fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
		if (!(all_avail & btrfs_raid_group[i]))
			continue;

		if (num_devices < btrfs_raid_array[i].devs_min) {
			int ret = btrfs_raid_mindev_error[i];

			if (ret)
				return ret;
		}
	}

	return 0;
}

static struct btrfs_device *btrfs_find_next_active_device(
		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
	struct btrfs_device *next_device;

	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
		if (next_device != device &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
		    && next_device->bdev)
			return next_device;
	}

	return NULL;
}

/*
 * Helper function to check if the given device is part of s_bdev / latest_bdev
 * and replace it with the provided or the next active device. In the context
 * where this function is called, there should always be another device (or
 * this_dev) which is active.
 */
void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
		struct btrfs_device *device, struct btrfs_device *this_dev)
{
	struct btrfs_device *next_device;

	if (this_dev)
		next_device = this_dev;
	else
		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
								device);
	ASSERT(next_device);

	if (fs_info->sb->s_bdev &&
			(fs_info->sb->s_bdev == device->bdev))
		fs_info->sb->s_bdev = next_device->bdev;

	if (fs_info->fs_devices->latest_bdev == device->bdev)
		fs_info->fs_devices->latest_bdev = next_device->bdev;
}

int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
		u64 devid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;
	u64 num_devices;
	int ret = 0;

	mutex_lock(&fs_info->volume_mutex);
	mutex_lock(&uuid_mutex);

	num_devices = fs_info->fs_devices->num_devices;
	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		WARN_ON(num_devices < 1);
		num_devices--;
	}
	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);

	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
	if (ret)
		goto out;

	ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
					   &device);
	if (ret)
		goto out;

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    fs_info->fs_devices->rw_devices == 1) {
		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_del_init(&device->dev_alloc_list);
		device->fs_devices->rw_devices--;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	mutex_unlock(&uuid_mutex);
	ret = btrfs_shrink_device(device, 0);
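	/*
	 * uuid_mutex was dropped around the shrink above; take it again
	 * before we touch the device lists.
	 */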
	mutex_lock(&uuid_mutex);
	if (ret)
		goto error_undo;

	/*
	 * TODO: the superblock still includes this device in its num_devices
	 * counter although write_all_supers() is not locked out. This
	 * could give a filesystem state which requires a degraded mount.
	 */
	ret = btrfs_rm_dev_item(fs_info, device);
	if (ret)
		goto error_undo;

	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	btrfs_scrub_cancel_dev(fs_info, device);

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
	 * the device supers. Whoever is writing all supers, should
	 * lock the device list mutex before getting the number of
	 * devices in the super block (super_copy). Conversely,
	 * whoever updates the number of devices in the super block
	 * (super_copy) should hold the device list mutex.
	 */

	cur_devices = device->fs_devices;
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	device->fs_devices->num_devices--;
	device->fs_devices->total_devices--;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		device->fs_devices->missing_devices--;

	btrfs_assign_next_active_device(fs_info, device, NULL);

	if (device->bdev) {
		device->fs_devices->open_devices--;
		/* remove sysfs entry */
		btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
	}

	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	/*
	 * at this point, the device is zero sized and detached from
	 * the devices list. All that's left is to zero out the old
	 * supers and free the device.
	 */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		btrfs_scratch_superblocks(device->bdev, device->name->str);

	btrfs_close_bdev(device);
	call_rcu(&device->rcu, free_device_rcu);

	if (cur_devices->open_devices == 0) {
		struct btrfs_fs_devices *fs_devices;
		fs_devices = fs_info->fs_devices;
		while (fs_devices) {
			if (fs_devices->seed == cur_devices) {
				fs_devices->seed = cur_devices->seed;
				break;
			}
			fs_devices = fs_devices->seed;
		}
		cur_devices->seed = NULL;
		__btrfs_close_devices(cur_devices);
		free_fs_devices(cur_devices);
	}

out:
	mutex_unlock(&uuid_mutex);
	mutex_unlock(&fs_info->volume_mutex);
	return ret;

error_undo:
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_add(&device->dev_alloc_list,
			 &fs_info->fs_devices->alloc_list);
		device->fs_devices->rw_devices++;
		mutex_unlock(&fs_info->chunk_mutex);
	}
	goto out;
}

void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
					struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices;

	lockdep_assert_held(&fs_info->fs_devices->device_list_mutex);

	/*
	 * in case of a fs with no seed, srcdev->fs_devices will point
	 * to the fs_devices of fs_info. However when the dev being replaced is
	 * a seed dev it will point to the seed's local fs_devices. In short,
	 * srcdev will have its correct fs_devices in both cases.
2065 */ 2066 fs_devices = srcdev->fs_devices; 2067 2068 list_del_rcu(&srcdev->dev_list); 2069 list_del(&srcdev->dev_alloc_list); 2070 fs_devices->num_devices--; 2071 if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) 2072 fs_devices->missing_devices--; 2073 2074 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) 2075 fs_devices->rw_devices--; 2076 2077 if (srcdev->bdev) 2078 fs_devices->open_devices--; 2079 } 2080 2081 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, 2082 struct btrfs_device *srcdev) 2083 { 2084 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 2085 2086 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { 2087 /* zero out the old super if it is writable */ 2088 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); 2089 } 2090 2091 btrfs_close_bdev(srcdev); 2092 call_rcu(&srcdev->rcu, free_device_rcu); 2093 2094 /* if no devices are left we'd rather delete the fs_devices */ 2095 if (!fs_devices->num_devices) { 2096 struct btrfs_fs_devices *tmp_fs_devices; 2097 2098 /* 2099 * On a mounted FS, num_devices can't be zero unless it's a 2100 * seed. In case of a seed device being replaced, the replace 2101 * target is added to the sprout FS, so there will be no more 2102 * devices left under the seed FS. 2103 */ 2104 ASSERT(fs_devices->seeding); 2105 2106 tmp_fs_devices = fs_info->fs_devices; 2107 while (tmp_fs_devices) { 2108 if (tmp_fs_devices->seed == fs_devices) { 2109 tmp_fs_devices->seed = fs_devices->seed; 2110 break; 2111 } 2112 tmp_fs_devices = tmp_fs_devices->seed; 2113 } 2114 fs_devices->seed = NULL; 2115 __btrfs_close_devices(fs_devices); 2116 free_fs_devices(fs_devices); 2117 } 2118 } 2119 2120 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 2121 struct btrfs_device *tgtdev) 2122 { 2123 mutex_lock(&uuid_mutex); 2124 WARN_ON(!tgtdev); 2125 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2126 2127 btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); 2128 2129 if (tgtdev->bdev) 2130 fs_info->fs_devices->open_devices--; 2131 2132 fs_info->fs_devices->num_devices--; 2133 2134 btrfs_assign_next_active_device(fs_info, tgtdev, NULL); 2135 2136 list_del_rcu(&tgtdev->dev_list); 2137 2138 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2139 mutex_unlock(&uuid_mutex); 2140 2141 /* 2142 * The update_dev_time() within btrfs_scratch_superblocks() 2143 * may lead to a call to btrfs_show_devname() which will try 2144 * to hold device_list_mutex. And here this device 2145 * is already out of the device list, so we don't have to hold 2146 * the device_list_mutex lock.
2147 */ 2148 btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); 2149 2150 btrfs_close_bdev(tgtdev); 2151 call_rcu(&tgtdev->rcu, free_device_rcu); 2152 } 2153 2154 static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, 2155 const char *device_path, 2156 struct btrfs_device **device) 2157 { 2158 int ret = 0; 2159 struct btrfs_super_block *disk_super; 2160 u64 devid; 2161 u8 *dev_uuid; 2162 struct block_device *bdev; 2163 struct buffer_head *bh; 2164 2165 *device = NULL; 2166 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 2167 fs_info->bdev_holder, 0, &bdev, &bh); 2168 if (ret) 2169 return ret; 2170 disk_super = (struct btrfs_super_block *)bh->b_data; 2171 devid = btrfs_stack_device_id(&disk_super->dev_item); 2172 dev_uuid = disk_super->dev_item.uuid; 2173 *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); 2174 brelse(bh); 2175 if (!*device) 2176 ret = -ENOENT; 2177 blkdev_put(bdev, FMODE_READ); 2178 return ret; 2179 } 2180 2181 int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, 2182 const char *device_path, 2183 struct btrfs_device **device) 2184 { 2185 *device = NULL; 2186 if (strcmp(device_path, "missing") == 0) { 2187 struct list_head *devices; 2188 struct btrfs_device *tmp; 2189 2190 devices = &fs_info->fs_devices->devices; 2191 /* 2192 * It is safe to read the devices since the volume_mutex 2193 * is held by the caller. 2194 */ 2195 list_for_each_entry(tmp, devices, dev_list) { 2196 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 2197 &tmp->dev_state) && !tmp->bdev) { 2198 *device = tmp; 2199 break; 2200 } 2201 } 2202 2203 if (!*device) 2204 return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2205 2206 return 0; 2207 } else { 2208 return btrfs_find_device_by_path(fs_info, device_path, device); 2209 } 2210 } 2211 2212 /* 2213 * Lookup a device given by device id, or the path if the id is 0. 2214 */ 2215 int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, 2216 const char *devpath, 2217 struct btrfs_device **device) 2218 { 2219 int ret; 2220 2221 if (devid) { 2222 ret = 0; 2223 *device = btrfs_find_device(fs_info, devid, NULL, NULL); 2224 if (!*device) 2225 ret = -ENOENT; 2226 } else { 2227 if (!devpath || !devpath[0]) 2228 return -EINVAL; 2229 2230 ret = btrfs_find_device_missing_or_by_path(fs_info, devpath, 2231 device); 2232 } 2233 return ret; 2234 } 2235 2236 /* 2237 * does all the dirty work required for changing file system's UUID. 
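 *
 * Roughly, based on the code below: the existing devices are moved onto a
 * newly allocated "seed" fs_devices that keeps the old fsid, the mounted
 * fs_devices gets a freshly generated fsid and chains to the seed via
 * fs_devices->seed, and the SEEDING flag is cleared from the superblock so
 * the sprouted filesystem becomes writable.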
2238 */ 2239 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) 2240 { 2241 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2242 struct btrfs_fs_devices *old_devices; 2243 struct btrfs_fs_devices *seed_devices; 2244 struct btrfs_super_block *disk_super = fs_info->super_copy; 2245 struct btrfs_device *device; 2246 u64 super_flags; 2247 2248 lockdep_assert_held(&uuid_mutex); 2249 if (!fs_devices->seeding) 2250 return -EINVAL; 2251 2252 seed_devices = alloc_fs_devices(NULL); 2253 if (IS_ERR(seed_devices)) 2254 return PTR_ERR(seed_devices); 2255 2256 old_devices = clone_fs_devices(fs_devices); 2257 if (IS_ERR(old_devices)) { 2258 kfree(seed_devices); 2259 return PTR_ERR(old_devices); 2260 } 2261 2262 list_add(&old_devices->list, &fs_uuids); 2263 2264 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2265 seed_devices->opened = 1; 2266 INIT_LIST_HEAD(&seed_devices->devices); 2267 INIT_LIST_HEAD(&seed_devices->alloc_list); 2268 mutex_init(&seed_devices->device_list_mutex); 2269 2270 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2271 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 2272 synchronize_rcu); 2273 list_for_each_entry(device, &seed_devices->devices, dev_list) 2274 device->fs_devices = seed_devices; 2275 2276 mutex_lock(&fs_info->chunk_mutex); 2277 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 2278 mutex_unlock(&fs_info->chunk_mutex); 2279 2280 fs_devices->seeding = 0; 2281 fs_devices->num_devices = 0; 2282 fs_devices->open_devices = 0; 2283 fs_devices->missing_devices = 0; 2284 fs_devices->rotating = 0; 2285 fs_devices->seed = seed_devices; 2286 2287 generate_random_uuid(fs_devices->fsid); 2288 memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2289 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2290 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2291 2292 super_flags = btrfs_super_flags(disk_super) & 2293 ~BTRFS_SUPER_FLAG_SEEDING; 2294 btrfs_set_super_flags(disk_super, super_flags); 2295 2296 return 0; 2297 } 2298 2299 /* 2300 * Store the expected generation for seed devices in device items. 
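 *
 * Concretely: walk every DEV_ITEM in the chunk tree and, for each device
 * that still belongs to a seeding fs_devices, write the device's in-memory
 * generation into the on-disk item.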
2301 */ 2302 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 2303 struct btrfs_fs_info *fs_info) 2304 { 2305 struct btrfs_root *root = fs_info->chunk_root; 2306 struct btrfs_path *path; 2307 struct extent_buffer *leaf; 2308 struct btrfs_dev_item *dev_item; 2309 struct btrfs_device *device; 2310 struct btrfs_key key; 2311 u8 fs_uuid[BTRFS_FSID_SIZE]; 2312 u8 dev_uuid[BTRFS_UUID_SIZE]; 2313 u64 devid; 2314 int ret; 2315 2316 path = btrfs_alloc_path(); 2317 if (!path) 2318 return -ENOMEM; 2319 2320 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2321 key.offset = 0; 2322 key.type = BTRFS_DEV_ITEM_KEY; 2323 2324 while (1) { 2325 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2326 if (ret < 0) 2327 goto error; 2328 2329 leaf = path->nodes[0]; 2330 next_slot: 2331 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2332 ret = btrfs_next_leaf(root, path); 2333 if (ret > 0) 2334 break; 2335 if (ret < 0) 2336 goto error; 2337 leaf = path->nodes[0]; 2338 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2339 btrfs_release_path(path); 2340 continue; 2341 } 2342 2343 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2344 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2345 key.type != BTRFS_DEV_ITEM_KEY) 2346 break; 2347 2348 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2349 struct btrfs_dev_item); 2350 devid = btrfs_device_id(leaf, dev_item); 2351 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2352 BTRFS_UUID_SIZE); 2353 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2354 BTRFS_FSID_SIZE); 2355 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 2356 BUG_ON(!device); /* Logic error */ 2357 2358 if (device->fs_devices->seeding) { 2359 btrfs_set_device_generation(leaf, dev_item, 2360 device->generation); 2361 btrfs_mark_buffer_dirty(leaf); 2362 } 2363 2364 path->slots[0]++; 2365 goto next_slot; 2366 } 2367 ret = 0; 2368 error: 2369 btrfs_free_path(path); 2370 return ret; 2371 } 2372 2373 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2374 { 2375 struct btrfs_root *root = fs_info->dev_root; 2376 struct request_queue *q; 2377 struct btrfs_trans_handle *trans; 2378 struct btrfs_device *device; 2379 struct block_device *bdev; 2380 struct list_head *devices; 2381 struct super_block *sb = fs_info->sb; 2382 struct rcu_string *name; 2383 u64 tmp; 2384 int seeding_dev = 0; 2385 int ret = 0; 2386 bool unlocked = false; 2387 2388 if (sb_rdonly(sb) && !fs_info->fs_devices->seeding) 2389 return -EROFS; 2390 2391 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2392 fs_info->bdev_holder); 2393 if (IS_ERR(bdev)) 2394 return PTR_ERR(bdev); 2395 2396 if (fs_info->fs_devices->seeding) { 2397 seeding_dev = 1; 2398 down_write(&sb->s_umount); 2399 mutex_lock(&uuid_mutex); 2400 } 2401 2402 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2403 2404 devices = &fs_info->fs_devices->devices; 2405 2406 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2407 list_for_each_entry(device, devices, dev_list) { 2408 if (device->bdev == bdev) { 2409 ret = -EEXIST; 2410 mutex_unlock( 2411 &fs_info->fs_devices->device_list_mutex); 2412 goto error; 2413 } 2414 } 2415 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2416 2417 device = btrfs_alloc_device(fs_info, NULL, NULL); 2418 if (IS_ERR(device)) { 2419 /* we can safely leave the fs_devices entry around */ 2420 ret = PTR_ERR(device); 2421 goto error; 2422 } 2423 2424 name = rcu_string_strdup(device_path, GFP_KERNEL); 2425 if (!name) { 2426 ret = -ENOMEM; 2427 
goto error_free_device; 2428 } 2429 rcu_assign_pointer(device->name, name); 2430 2431 trans = btrfs_start_transaction(root, 0); 2432 if (IS_ERR(trans)) { 2433 ret = PTR_ERR(trans); 2434 goto error_free_device; 2435 } 2436 2437 q = bdev_get_queue(bdev); 2438 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2439 device->generation = trans->transid; 2440 device->io_width = fs_info->sectorsize; 2441 device->io_align = fs_info->sectorsize; 2442 device->sector_size = fs_info->sectorsize; 2443 device->total_bytes = round_down(i_size_read(bdev->bd_inode), 2444 fs_info->sectorsize); 2445 device->disk_total_bytes = device->total_bytes; 2446 device->commit_total_bytes = device->total_bytes; 2447 device->fs_info = fs_info; 2448 device->bdev = bdev; 2449 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2450 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2451 device->mode = FMODE_EXCL; 2452 device->dev_stats_valid = 1; 2453 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2454 2455 if (seeding_dev) { 2456 sb->s_flags &= ~SB_RDONLY; 2457 ret = btrfs_prepare_sprout(fs_info); 2458 if (ret) { 2459 btrfs_abort_transaction(trans, ret); 2460 goto error_trans; 2461 } 2462 } 2463 2464 device->fs_devices = fs_info->fs_devices; 2465 2466 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2467 mutex_lock(&fs_info->chunk_mutex); 2468 list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices); 2469 list_add(&device->dev_alloc_list, 2470 &fs_info->fs_devices->alloc_list); 2471 fs_info->fs_devices->num_devices++; 2472 fs_info->fs_devices->open_devices++; 2473 fs_info->fs_devices->rw_devices++; 2474 fs_info->fs_devices->total_devices++; 2475 fs_info->fs_devices->total_rw_bytes += device->total_bytes; 2476 2477 atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 2478 2479 if (!blk_queue_nonrot(q)) 2480 fs_info->fs_devices->rotating = 1; 2481 2482 tmp = btrfs_super_total_bytes(fs_info->super_copy); 2483 btrfs_set_super_total_bytes(fs_info->super_copy, 2484 round_down(tmp + device->total_bytes, fs_info->sectorsize)); 2485 2486 tmp = btrfs_super_num_devices(fs_info->super_copy); 2487 btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1); 2488 2489 /* add sysfs device entry */ 2490 btrfs_sysfs_add_device_link(fs_info->fs_devices, device); 2491 2492 /* 2493 * we've got more storage, clear any full flags on the space 2494 * infos 2495 */ 2496 btrfs_clear_space_info_full(fs_info); 2497 2498 mutex_unlock(&fs_info->chunk_mutex); 2499 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2500 2501 if (seeding_dev) { 2502 mutex_lock(&fs_info->chunk_mutex); 2503 ret = init_first_rw_device(trans, fs_info); 2504 mutex_unlock(&fs_info->chunk_mutex); 2505 if (ret) { 2506 btrfs_abort_transaction(trans, ret); 2507 goto error_sysfs; 2508 } 2509 } 2510 2511 ret = btrfs_add_dev_item(trans, fs_info, device); 2512 if (ret) { 2513 btrfs_abort_transaction(trans, ret); 2514 goto error_sysfs; 2515 } 2516 2517 if (seeding_dev) { 2518 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; 2519 2520 ret = btrfs_finish_sprout(trans, fs_info); 2521 if (ret) { 2522 btrfs_abort_transaction(trans, ret); 2523 goto error_sysfs; 2524 } 2525 2526 /* Sprouting would change fsid of the mounted root, 2527 * so rename the fsid on the sysfs 2528 */ 2529 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", 2530 fs_info->fsid); 2531 if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf)) 2532 btrfs_warn(fs_info, 2533 "sysfs: failed to create fsid for sprout"); 2534 } 2535 2536 ret = btrfs_commit_transaction(trans); 
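/*
 * For a sprouted filesystem the seeding locks are released before the
 * remaining work below: relocating the system chunks (potentially
 * long-running) and committing a second transaction.
 */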
2537 2538 if (seeding_dev) { 2539 mutex_unlock(&uuid_mutex); 2540 up_write(&sb->s_umount); 2541 unlocked = true; 2542 2543 if (ret) /* transaction commit */ 2544 return ret; 2545 2546 ret = btrfs_relocate_sys_chunks(fs_info); 2547 if (ret < 0) 2548 btrfs_handle_fs_error(fs_info, ret, 2549 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2550 trans = btrfs_attach_transaction(root); 2551 if (IS_ERR(trans)) { 2552 if (PTR_ERR(trans) == -ENOENT) 2553 return 0; 2554 ret = PTR_ERR(trans); 2555 trans = NULL; 2556 goto error_sysfs; 2557 } 2558 ret = btrfs_commit_transaction(trans); 2559 } 2560 2561 /* Update ctime/mtime for libblkid */ 2562 update_dev_time(device_path); 2563 return ret; 2564 2565 error_sysfs: 2566 btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); 2567 error_trans: 2568 if (seeding_dev) 2569 sb->s_flags |= SB_RDONLY; 2570 if (trans) 2571 btrfs_end_transaction(trans); 2572 error_free_device: 2573 free_device(device); 2574 error: 2575 blkdev_put(bdev, FMODE_EXCL); 2576 if (seeding_dev && !unlocked) { 2577 mutex_unlock(&uuid_mutex); 2578 up_write(&sb->s_umount); 2579 } 2580 return ret; 2581 } 2582 2583 int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 2584 const char *device_path, 2585 struct btrfs_device *srcdev, 2586 struct btrfs_device **device_out) 2587 { 2588 struct btrfs_device *device; 2589 struct block_device *bdev; 2590 struct list_head *devices; 2591 struct rcu_string *name; 2592 u64 devid = BTRFS_DEV_REPLACE_DEVID; 2593 int ret = 0; 2594 2595 *device_out = NULL; 2596 if (fs_info->fs_devices->seeding) { 2597 btrfs_err(fs_info, "the filesystem is a seed filesystem!"); 2598 return -EINVAL; 2599 } 2600 2601 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2602 fs_info->bdev_holder); 2603 if (IS_ERR(bdev)) { 2604 btrfs_err(fs_info, "target device %s is invalid!", device_path); 2605 return PTR_ERR(bdev); 2606 } 2607 2608 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2609 2610 devices = &fs_info->fs_devices->devices; 2611 list_for_each_entry(device, devices, dev_list) { 2612 if (device->bdev == bdev) { 2613 btrfs_err(fs_info, 2614 "target device is in the filesystem!"); 2615 ret = -EEXIST; 2616 goto error; 2617 } 2618 } 2619 2620 2621 if (i_size_read(bdev->bd_inode) < 2622 btrfs_device_get_total_bytes(srcdev)) { 2623 btrfs_err(fs_info, 2624 "target device is smaller than source device!"); 2625 ret = -EINVAL; 2626 goto error; 2627 } 2628 2629 2630 device = btrfs_alloc_device(NULL, &devid, NULL); 2631 if (IS_ERR(device)) { 2632 ret = PTR_ERR(device); 2633 goto error; 2634 } 2635 2636 name = rcu_string_strdup(device_path, GFP_KERNEL); 2637 if (!name) { 2638 free_device(device); 2639 ret = -ENOMEM; 2640 goto error; 2641 } 2642 rcu_assign_pointer(device->name, name); 2643 2644 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2645 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2646 device->generation = 0; 2647 device->io_width = fs_info->sectorsize; 2648 device->io_align = fs_info->sectorsize; 2649 device->sector_size = fs_info->sectorsize; 2650 device->total_bytes = btrfs_device_get_total_bytes(srcdev); 2651 device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); 2652 device->bytes_used = btrfs_device_get_bytes_used(srcdev); 2653 device->commit_total_bytes = srcdev->commit_total_bytes; 2654 device->commit_bytes_used = device->bytes_used; 2655 device->fs_info = fs_info; 2656 device->bdev = bdev; 2657 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 
&device->dev_state); 2658 set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2659 device->mode = FMODE_EXCL; 2660 device->dev_stats_valid = 1; 2661 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2662 device->fs_devices = fs_info->fs_devices; 2663 list_add(&device->dev_list, &fs_info->fs_devices->devices); 2664 fs_info->fs_devices->num_devices++; 2665 fs_info->fs_devices->open_devices++; 2666 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2667 2668 *device_out = device; 2669 return ret; 2670 2671 error: 2672 blkdev_put(bdev, FMODE_EXCL); 2673 return ret; 2674 } 2675 2676 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2677 struct btrfs_device *device) 2678 { 2679 int ret; 2680 struct btrfs_path *path; 2681 struct btrfs_root *root = device->fs_info->chunk_root; 2682 struct btrfs_dev_item *dev_item; 2683 struct extent_buffer *leaf; 2684 struct btrfs_key key; 2685 2686 path = btrfs_alloc_path(); 2687 if (!path) 2688 return -ENOMEM; 2689 2690 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2691 key.type = BTRFS_DEV_ITEM_KEY; 2692 key.offset = device->devid; 2693 2694 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2695 if (ret < 0) 2696 goto out; 2697 2698 if (ret > 0) { 2699 ret = -ENOENT; 2700 goto out; 2701 } 2702 2703 leaf = path->nodes[0]; 2704 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2705 2706 btrfs_set_device_id(leaf, dev_item, device->devid); 2707 btrfs_set_device_type(leaf, dev_item, device->type); 2708 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2709 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2710 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2711 btrfs_set_device_total_bytes(leaf, dev_item, 2712 btrfs_device_get_disk_total_bytes(device)); 2713 btrfs_set_device_bytes_used(leaf, dev_item, 2714 btrfs_device_get_bytes_used(device)); 2715 btrfs_mark_buffer_dirty(leaf); 2716 2717 out: 2718 btrfs_free_path(path); 2719 return ret; 2720 } 2721 2722 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2723 struct btrfs_device *device, u64 new_size) 2724 { 2725 struct btrfs_fs_info *fs_info = device->fs_info; 2726 struct btrfs_super_block *super_copy = fs_info->super_copy; 2727 struct btrfs_fs_devices *fs_devices; 2728 u64 old_total; 2729 u64 diff; 2730 2731 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2732 return -EACCES; 2733 2734 new_size = round_down(new_size, fs_info->sectorsize); 2735 2736 mutex_lock(&fs_info->chunk_mutex); 2737 old_total = btrfs_super_total_bytes(super_copy); 2738 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 2739 2740 if (new_size <= device->total_bytes || 2741 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2742 mutex_unlock(&fs_info->chunk_mutex); 2743 return -EINVAL; 2744 } 2745 2746 fs_devices = fs_info->fs_devices; 2747 2748 btrfs_set_super_total_bytes(super_copy, 2749 round_down(old_total + diff, fs_info->sectorsize)); 2750 device->fs_devices->total_rw_bytes += diff; 2751 2752 btrfs_device_set_total_bytes(device, new_size); 2753 btrfs_device_set_disk_total_bytes(device, new_size); 2754 btrfs_clear_space_info_full(device->fs_info); 2755 if (list_empty(&device->resized_list)) 2756 list_add_tail(&device->resized_list, 2757 &fs_devices->resized_devices); 2758 mutex_unlock(&fs_info->chunk_mutex); 2759 2760 return btrfs_update_device(trans, device); 2761 } 2762 2763 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 2764 struct btrfs_fs_info *fs_info, u64 chunk_offset) 
2765 { 2766 struct btrfs_root *root = fs_info->chunk_root; 2767 int ret; 2768 struct btrfs_path *path; 2769 struct btrfs_key key; 2770 2771 path = btrfs_alloc_path(); 2772 if (!path) 2773 return -ENOMEM; 2774 2775 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2776 key.offset = chunk_offset; 2777 key.type = BTRFS_CHUNK_ITEM_KEY; 2778 2779 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2780 if (ret < 0) 2781 goto out; 2782 else if (ret > 0) { /* Logic error or corruption */ 2783 btrfs_handle_fs_error(fs_info, -ENOENT, 2784 "Failed lookup while freeing chunk."); 2785 ret = -ENOENT; 2786 goto out; 2787 } 2788 2789 ret = btrfs_del_item(trans, root, path); 2790 if (ret < 0) 2791 btrfs_handle_fs_error(fs_info, ret, 2792 "Failed to delete chunk item."); 2793 out: 2794 btrfs_free_path(path); 2795 return ret; 2796 } 2797 2798 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2799 { 2800 struct btrfs_super_block *super_copy = fs_info->super_copy; 2801 struct btrfs_disk_key *disk_key; 2802 struct btrfs_chunk *chunk; 2803 u8 *ptr; 2804 int ret = 0; 2805 u32 num_stripes; 2806 u32 array_size; 2807 u32 len = 0; 2808 u32 cur; 2809 struct btrfs_key key; 2810 2811 mutex_lock(&fs_info->chunk_mutex); 2812 array_size = btrfs_super_sys_array_size(super_copy); 2813 2814 ptr = super_copy->sys_chunk_array; 2815 cur = 0; 2816 2817 while (cur < array_size) { 2818 disk_key = (struct btrfs_disk_key *)ptr; 2819 btrfs_disk_key_to_cpu(&key, disk_key); 2820 2821 len = sizeof(*disk_key); 2822 2823 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 2824 chunk = (struct btrfs_chunk *)(ptr + len); 2825 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 2826 len += btrfs_chunk_item_size(num_stripes); 2827 } else { 2828 ret = -EIO; 2829 break; 2830 } 2831 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 2832 key.offset == chunk_offset) { 2833 memmove(ptr, ptr + len, array_size - (cur + len)); 2834 array_size -= len; 2835 btrfs_set_super_sys_array_size(super_copy, array_size); 2836 } else { 2837 ptr += len; 2838 cur += len; 2839 } 2840 } 2841 mutex_unlock(&fs_info->chunk_mutex); 2842 return ret; 2843 } 2844 2845 static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, 2846 u64 logical, u64 length) 2847 { 2848 struct extent_map_tree *em_tree; 2849 struct extent_map *em; 2850 2851 em_tree = &fs_info->mapping_tree.map_tree; 2852 read_lock(&em_tree->lock); 2853 em = lookup_extent_mapping(em_tree, logical, length); 2854 read_unlock(&em_tree->lock); 2855 2856 if (!em) { 2857 btrfs_crit(fs_info, "unable to find logical %llu length %llu", 2858 logical, length); 2859 return ERR_PTR(-EINVAL); 2860 } 2861 2862 if (em->start > logical || em->start + em->len < logical) { 2863 btrfs_crit(fs_info, 2864 "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 2865 logical, length, em->start, em->start + em->len); 2866 free_extent_map(em); 2867 return ERR_PTR(-EINVAL); 2868 } 2869 2870 /* callers are responsible for dropping em's ref. 
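 *
 * A typical pattern, as in btrfs_remove_chunk() below:
 *
 *	em = get_chunk_map(fs_info, chunk_offset, 1);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	map = em->map_lookup;
 *	...
 *	free_extent_map(em);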
*/ 2871 return em; 2872 } 2873 2874 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, 2875 struct btrfs_fs_info *fs_info, u64 chunk_offset) 2876 { 2877 struct extent_map *em; 2878 struct map_lookup *map; 2879 u64 dev_extent_len = 0; 2880 int i, ret = 0; 2881 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2882 2883 em = get_chunk_map(fs_info, chunk_offset, 1); 2884 if (IS_ERR(em)) { 2885 /* 2886 * This is a logic error, but we don't want to just rely on the 2887 * user having built with ASSERT enabled, so if ASSERT doesn't 2888 * do anything we still error out. 2889 */ 2890 ASSERT(0); 2891 return PTR_ERR(em); 2892 } 2893 map = em->map_lookup; 2894 mutex_lock(&fs_info->chunk_mutex); 2895 check_system_chunk(trans, fs_info, map->type); 2896 mutex_unlock(&fs_info->chunk_mutex); 2897 2898 /* 2899 * Take the device list mutex to prevent races with the final phase of 2900 * a device replace operation that replaces the device object associated 2901 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). 2902 */ 2903 mutex_lock(&fs_devices->device_list_mutex); 2904 for (i = 0; i < map->num_stripes; i++) { 2905 struct btrfs_device *device = map->stripes[i].dev; 2906 ret = btrfs_free_dev_extent(trans, device, 2907 map->stripes[i].physical, 2908 &dev_extent_len); 2909 if (ret) { 2910 mutex_unlock(&fs_devices->device_list_mutex); 2911 btrfs_abort_transaction(trans, ret); 2912 goto out; 2913 } 2914 2915 if (device->bytes_used > 0) { 2916 mutex_lock(&fs_info->chunk_mutex); 2917 btrfs_device_set_bytes_used(device, 2918 device->bytes_used - dev_extent_len); 2919 atomic64_add(dev_extent_len, &fs_info->free_chunk_space); 2920 btrfs_clear_space_info_full(fs_info); 2921 mutex_unlock(&fs_info->chunk_mutex); 2922 } 2923 2924 if (map->stripes[i].dev) { 2925 ret = btrfs_update_device(trans, map->stripes[i].dev); 2926 if (ret) { 2927 mutex_unlock(&fs_devices->device_list_mutex); 2928 btrfs_abort_transaction(trans, ret); 2929 goto out; 2930 } 2931 } 2932 } 2933 mutex_unlock(&fs_devices->device_list_mutex); 2934 2935 ret = btrfs_free_chunk(trans, fs_info, chunk_offset); 2936 if (ret) { 2937 btrfs_abort_transaction(trans, ret); 2938 goto out; 2939 } 2940 2941 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 2942 2943 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2944 ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 2945 if (ret) { 2946 btrfs_abort_transaction(trans, ret); 2947 goto out; 2948 } 2949 } 2950 2951 ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em); 2952 if (ret) { 2953 btrfs_abort_transaction(trans, ret); 2954 goto out; 2955 } 2956 2957 out: 2958 /* once for us */ 2959 free_extent_map(em); 2960 return ret; 2961 } 2962 2963 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2964 { 2965 struct btrfs_root *root = fs_info->chunk_root; 2966 struct btrfs_trans_handle *trans; 2967 int ret; 2968 2969 /* 2970 * Prevent races with automatic removal of unused block groups. 2971 * After we relocate and before we remove the chunk with offset 2972 * chunk_offset, automatic removal of the block group can kick in, 2973 * resulting in a failure when calling btrfs_remove_chunk() below. 2974 * 2975 * Make sure to acquire this mutex before doing a tree search (dev 2976 * or chunk trees) to find chunks. Otherwise the cleaner kthread might 2977 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 2978 * we release the path used to search the chunk/dev tree and before 2979 * the current task acquires this mutex and calls us. 
2980 */ 2981 lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); 2982 2983 ret = btrfs_can_relocate(fs_info, chunk_offset); 2984 if (ret) 2985 return -ENOSPC; 2986 2987 /* step one, relocate all the extents inside this chunk */ 2988 btrfs_scrub_pause(fs_info); 2989 ret = btrfs_relocate_block_group(fs_info, chunk_offset); 2990 btrfs_scrub_continue(fs_info); 2991 if (ret) 2992 return ret; 2993 2994 /* 2995 * We add the kobjects here (and after forcing data chunk creation) 2996 * since relocation is the only place we'll create chunks of a new 2997 * type at runtime. The only place where we'll remove the last 2998 * chunk of a type is the call immediately below this one. Even 2999 * so, we're protected against races with the cleaner thread since 3000 * we're covered by the delete_unused_bgs_mutex. 3001 */ 3002 btrfs_add_raid_kobjects(fs_info); 3003 3004 trans = btrfs_start_trans_remove_block_group(root->fs_info, 3005 chunk_offset); 3006 if (IS_ERR(trans)) { 3007 ret = PTR_ERR(trans); 3008 btrfs_handle_fs_error(root->fs_info, ret, NULL); 3009 return ret; 3010 } 3011 3012 /* 3013 * step two, delete the device extents and the 3014 * chunk tree entries 3015 */ 3016 ret = btrfs_remove_chunk(trans, fs_info, chunk_offset); 3017 btrfs_end_transaction(trans); 3018 return ret; 3019 } 3020 3021 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 3022 { 3023 struct btrfs_root *chunk_root = fs_info->chunk_root; 3024 struct btrfs_path *path; 3025 struct extent_buffer *leaf; 3026 struct btrfs_chunk *chunk; 3027 struct btrfs_key key; 3028 struct btrfs_key found_key; 3029 u64 chunk_type; 3030 bool retried = false; 3031 int failed = 0; 3032 int ret; 3033 3034 path = btrfs_alloc_path(); 3035 if (!path) 3036 return -ENOMEM; 3037 3038 again: 3039 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3040 key.offset = (u64)-1; 3041 key.type = BTRFS_CHUNK_ITEM_KEY; 3042 3043 while (1) { 3044 mutex_lock(&fs_info->delete_unused_bgs_mutex); 3045 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3046 if (ret < 0) { 3047 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3048 goto error; 3049 } 3050 BUG_ON(ret == 0); /* Corruption */ 3051 3052 ret = btrfs_previous_item(chunk_root, path, key.objectid, 3053 key.type); 3054 if (ret) 3055 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3056 if (ret < 0) 3057 goto error; 3058 if (ret > 0) 3059 break; 3060 3061 leaf = path->nodes[0]; 3062 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3063 3064 chunk = btrfs_item_ptr(leaf, path->slots[0], 3065 struct btrfs_chunk); 3066 chunk_type = btrfs_chunk_type(leaf, chunk); 3067 btrfs_release_path(path); 3068 3069 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 3070 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3071 if (ret == -ENOSPC) 3072 failed++; 3073 else 3074 BUG_ON(ret); 3075 } 3076 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3077 3078 if (found_key.offset == 0) 3079 break; 3080 key.offset = found_key.offset - 1; 3081 } 3082 ret = 0; 3083 if (failed && !retried) { 3084 failed = 0; 3085 retried = true; 3086 goto again; 3087 } else if (WARN_ON(failed && retried)) { 3088 ret = -ENOSPC; 3089 } 3090 error: 3091 btrfs_free_path(path); 3092 return ret; 3093 } 3094 3095 /* 3096 * return 1 : allocate a data chunk successfully, 3097 * return <0: errors during allocating a data chunk, 3098 * return 0 : no need to allocate a data chunk. 
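 *
 * Called before relocating a chunk (see the caller in __btrfs_balance())
 * so that removing the only data chunk does not lose the data raid
 * profile.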
3099 */ 3100 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 3101 u64 chunk_offset) 3102 { 3103 struct btrfs_block_group_cache *cache; 3104 u64 bytes_used; 3105 u64 chunk_type; 3106 3107 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3108 ASSERT(cache); 3109 chunk_type = cache->flags; 3110 btrfs_put_block_group(cache); 3111 3112 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) { 3113 spin_lock(&fs_info->data_sinfo->lock); 3114 bytes_used = fs_info->data_sinfo->bytes_used; 3115 spin_unlock(&fs_info->data_sinfo->lock); 3116 3117 if (!bytes_used) { 3118 struct btrfs_trans_handle *trans; 3119 int ret; 3120 3121 trans = btrfs_join_transaction(fs_info->tree_root); 3122 if (IS_ERR(trans)) 3123 return PTR_ERR(trans); 3124 3125 ret = btrfs_force_chunk_alloc(trans, fs_info, 3126 BTRFS_BLOCK_GROUP_DATA); 3127 btrfs_end_transaction(trans); 3128 if (ret < 0) 3129 return ret; 3130 3131 btrfs_add_raid_kobjects(fs_info); 3132 3133 return 1; 3134 } 3135 } 3136 return 0; 3137 } 3138 3139 static int insert_balance_item(struct btrfs_fs_info *fs_info, 3140 struct btrfs_balance_control *bctl) 3141 { 3142 struct btrfs_root *root = fs_info->tree_root; 3143 struct btrfs_trans_handle *trans; 3144 struct btrfs_balance_item *item; 3145 struct btrfs_disk_balance_args disk_bargs; 3146 struct btrfs_path *path; 3147 struct extent_buffer *leaf; 3148 struct btrfs_key key; 3149 int ret, err; 3150 3151 path = btrfs_alloc_path(); 3152 if (!path) 3153 return -ENOMEM; 3154 3155 trans = btrfs_start_transaction(root, 0); 3156 if (IS_ERR(trans)) { 3157 btrfs_free_path(path); 3158 return PTR_ERR(trans); 3159 } 3160 3161 key.objectid = BTRFS_BALANCE_OBJECTID; 3162 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3163 key.offset = 0; 3164 3165 ret = btrfs_insert_empty_item(trans, root, path, &key, 3166 sizeof(*item)); 3167 if (ret) 3168 goto out; 3169 3170 leaf = path->nodes[0]; 3171 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3172 3173 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3174 3175 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 3176 btrfs_set_balance_data(leaf, item, &disk_bargs); 3177 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 3178 btrfs_set_balance_meta(leaf, item, &disk_bargs); 3179 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 3180 btrfs_set_balance_sys(leaf, item, &disk_bargs); 3181 3182 btrfs_set_balance_flags(leaf, item, bctl->flags); 3183 3184 btrfs_mark_buffer_dirty(leaf); 3185 out: 3186 btrfs_free_path(path); 3187 err = btrfs_commit_transaction(trans); 3188 if (err && !ret) 3189 ret = err; 3190 return ret; 3191 } 3192 3193 static int del_balance_item(struct btrfs_fs_info *fs_info) 3194 { 3195 struct btrfs_root *root = fs_info->tree_root; 3196 struct btrfs_trans_handle *trans; 3197 struct btrfs_path *path; 3198 struct btrfs_key key; 3199 int ret, err; 3200 3201 path = btrfs_alloc_path(); 3202 if (!path) 3203 return -ENOMEM; 3204 3205 trans = btrfs_start_transaction(root, 0); 3206 if (IS_ERR(trans)) { 3207 btrfs_free_path(path); 3208 return PTR_ERR(trans); 3209 } 3210 3211 key.objectid = BTRFS_BALANCE_OBJECTID; 3212 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3213 key.offset = 0; 3214 3215 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3216 if (ret < 0) 3217 goto out; 3218 if (ret > 0) { 3219 ret = -ENOENT; 3220 goto out; 3221 } 3222 3223 ret = btrfs_del_item(trans, root, path); 3224 out: 3225 btrfs_free_path(path); 3226 err = btrfs_commit_transaction(trans); 3227 if (err && !ret) 3228 ret = err; 3229 return ret; 3230 } 
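/*
 * The balance item written by insert_balance_item() lives in the tree root
 * at key (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0).  It is
 * deleted by del_balance_item() once the balance finishes or is canceled,
 * and read back by btrfs_recover_balance() on mount so that an interrupted
 * balance can be resumed.
 */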
3231 3232 /* 3233 * This is a heuristic used to reduce the number of chunks balanced on 3234 * resume after balance was interrupted. 3235 */ 3236 static void update_balance_args(struct btrfs_balance_control *bctl) 3237 { 3238 /* 3239 * Turn on soft mode for chunk types that were being converted. 3240 */ 3241 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 3242 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 3243 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 3244 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 3245 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 3246 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 3247 3248 /* 3249 * Turn on the usage filter if it is not already in use. The idea is 3250 * that chunks that we have already balanced should be 3251 * reasonably full. Don't do it for chunks that are being 3252 * converted - that will keep us from relocating unconverted 3253 * (albeit full) chunks. 3254 */ 3255 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 3256 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3257 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3258 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 3259 bctl->data.usage = 90; 3260 } 3261 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 3262 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3263 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3264 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 3265 bctl->sys.usage = 90; 3266 } 3267 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 3268 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3269 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3270 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 3271 bctl->meta.usage = 90; 3272 } 3273 } 3274 3275 /* 3276 * Should be called with both balance and volume mutexes held to 3277 * serialize other volume operations (add_dev/rm_dev/resize) with 3278 * the restriper. Same goes for unset_balance_control. 3279 */ 3280 static void set_balance_control(struct btrfs_balance_control *bctl) 3281 { 3282 struct btrfs_fs_info *fs_info = bctl->fs_info; 3283 3284 BUG_ON(fs_info->balance_ctl); 3285 3286 spin_lock(&fs_info->balance_lock); 3287 fs_info->balance_ctl = bctl; 3288 spin_unlock(&fs_info->balance_lock); 3289 } 3290 3291 static void unset_balance_control(struct btrfs_fs_info *fs_info) 3292 { 3293 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3294 3295 BUG_ON(!fs_info->balance_ctl); 3296 3297 spin_lock(&fs_info->balance_lock); 3298 fs_info->balance_ctl = NULL; 3299 spin_unlock(&fs_info->balance_lock); 3300 3301 kfree(bctl); 3302 } 3303 3304 /* 3305 * Balance filters. Return 1 if chunk should be filtered out 3306 * (should not be balanced).
3307 */ 3308 static int chunk_profiles_filter(u64 chunk_type, 3309 struct btrfs_balance_args *bargs) 3310 { 3311 chunk_type = chunk_to_extended(chunk_type) & 3312 BTRFS_EXTENDED_PROFILE_MASK; 3313 3314 if (bargs->profiles & chunk_type) 3315 return 0; 3316 3317 return 1; 3318 } 3319 3320 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3321 struct btrfs_balance_args *bargs) 3322 { 3323 struct btrfs_block_group_cache *cache; 3324 u64 chunk_used; 3325 u64 user_thresh_min; 3326 u64 user_thresh_max; 3327 int ret = 1; 3328 3329 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3330 chunk_used = btrfs_block_group_used(&cache->item); 3331 3332 if (bargs->usage_min == 0) 3333 user_thresh_min = 0; 3334 else 3335 user_thresh_min = div_factor_fine(cache->key.offset, 3336 bargs->usage_min); 3337 3338 if (bargs->usage_max == 0) 3339 user_thresh_max = 1; 3340 else if (bargs->usage_max > 100) 3341 user_thresh_max = cache->key.offset; 3342 else 3343 user_thresh_max = div_factor_fine(cache->key.offset, 3344 bargs->usage_max); 3345 3346 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3347 ret = 0; 3348 3349 btrfs_put_block_group(cache); 3350 return ret; 3351 } 3352 3353 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3354 u64 chunk_offset, struct btrfs_balance_args *bargs) 3355 { 3356 struct btrfs_block_group_cache *cache; 3357 u64 chunk_used, user_thresh; 3358 int ret = 1; 3359 3360 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3361 chunk_used = btrfs_block_group_used(&cache->item); 3362 3363 if (bargs->usage_min == 0) 3364 user_thresh = 1; 3365 else if (bargs->usage > 100) 3366 user_thresh = cache->key.offset; 3367 else 3368 user_thresh = div_factor_fine(cache->key.offset, 3369 bargs->usage); 3370 3371 if (chunk_used < user_thresh) 3372 ret = 0; 3373 3374 btrfs_put_block_group(cache); 3375 return ret; 3376 } 3377 3378 static int chunk_devid_filter(struct extent_buffer *leaf, 3379 struct btrfs_chunk *chunk, 3380 struct btrfs_balance_args *bargs) 3381 { 3382 struct btrfs_stripe *stripe; 3383 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3384 int i; 3385 3386 for (i = 0; i < num_stripes; i++) { 3387 stripe = btrfs_stripe_nr(chunk, i); 3388 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3389 return 0; 3390 } 3391 3392 return 1; 3393 } 3394 3395 /* [pstart, pend) */ 3396 static int chunk_drange_filter(struct extent_buffer *leaf, 3397 struct btrfs_chunk *chunk, 3398 struct btrfs_balance_args *bargs) 3399 { 3400 struct btrfs_stripe *stripe; 3401 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3402 u64 stripe_offset; 3403 u64 stripe_length; 3404 int factor; 3405 int i; 3406 3407 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3408 return 0; 3409 3410 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 3411 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { 3412 factor = num_stripes / 2; 3413 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { 3414 factor = num_stripes - 1; 3415 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { 3416 factor = num_stripes - 2; 3417 } else { 3418 factor = num_stripes; 3419 } 3420 3421 for (i = 0; i < num_stripes; i++) { 3422 stripe = btrfs_stripe_nr(chunk, i); 3423 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3424 continue; 3425 3426 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3427 stripe_length = btrfs_chunk_length(leaf, chunk); 3428 stripe_length = div_u64(stripe_length, factor); 3429 3430 if (stripe_offset 
< bargs->pend && 3431 stripe_offset + stripe_length > bargs->pstart) 3432 return 0; 3433 } 3434 3435 return 1; 3436 } 3437 3438 /* [vstart, vend) */ 3439 static int chunk_vrange_filter(struct extent_buffer *leaf, 3440 struct btrfs_chunk *chunk, 3441 u64 chunk_offset, 3442 struct btrfs_balance_args *bargs) 3443 { 3444 if (chunk_offset < bargs->vend && 3445 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3446 /* at least part of the chunk is inside this vrange */ 3447 return 0; 3448 3449 return 1; 3450 } 3451 3452 static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3453 struct btrfs_chunk *chunk, 3454 struct btrfs_balance_args *bargs) 3455 { 3456 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3457 3458 if (bargs->stripes_min <= num_stripes 3459 && num_stripes <= bargs->stripes_max) 3460 return 0; 3461 3462 return 1; 3463 } 3464 3465 static int chunk_soft_convert_filter(u64 chunk_type, 3466 struct btrfs_balance_args *bargs) 3467 { 3468 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3469 return 0; 3470 3471 chunk_type = chunk_to_extended(chunk_type) & 3472 BTRFS_EXTENDED_PROFILE_MASK; 3473 3474 if (bargs->target == chunk_type) 3475 return 1; 3476 3477 return 0; 3478 } 3479 3480 static int should_balance_chunk(struct btrfs_fs_info *fs_info, 3481 struct extent_buffer *leaf, 3482 struct btrfs_chunk *chunk, u64 chunk_offset) 3483 { 3484 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3485 struct btrfs_balance_args *bargs = NULL; 3486 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3487 3488 /* type filter */ 3489 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3490 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3491 return 0; 3492 } 3493 3494 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3495 bargs = &bctl->data; 3496 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3497 bargs = &bctl->sys; 3498 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3499 bargs = &bctl->meta; 3500 3501 /* profiles filter */ 3502 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3503 chunk_profiles_filter(chunk_type, bargs)) { 3504 return 0; 3505 } 3506 3507 /* usage filter */ 3508 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3509 chunk_usage_filter(fs_info, chunk_offset, bargs)) { 3510 return 0; 3511 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3512 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3513 return 0; 3514 } 3515 3516 /* devid filter */ 3517 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3518 chunk_devid_filter(leaf, chunk, bargs)) { 3519 return 0; 3520 } 3521 3522 /* drange filter, makes sense only with devid filter */ 3523 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3524 chunk_drange_filter(leaf, chunk, bargs)) { 3525 return 0; 3526 } 3527 3528 /* vrange filter */ 3529 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3530 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3531 return 0; 3532 } 3533 3534 /* stripes filter */ 3535 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3536 chunk_stripes_range_filter(leaf, chunk, bargs)) { 3537 return 0; 3538 } 3539 3540 /* soft profile changing mode */ 3541 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3542 chunk_soft_convert_filter(chunk_type, bargs)) { 3543 return 0; 3544 } 3545 3546 /* 3547 * limited by count, must be the last filter 3548 */ 3549 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3550 if (bargs->limit == 0) 3551 return 0; 3552 else 3553 bargs->limit--; 3554 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 3555 /* 3556 * Same logic 
as the 'limit' filter; the minimum cannot be 3557 * determined here because we do not have the global information 3558 * about the count of all chunks that satisfy the filters. 3559 */ 3560 if (bargs->limit_max == 0) 3561 return 0; 3562 else 3563 bargs->limit_max--; 3564 } 3565 3566 return 1; 3567 } 3568 3569 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3570 { 3571 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3572 struct btrfs_root *chunk_root = fs_info->chunk_root; 3573 struct btrfs_root *dev_root = fs_info->dev_root; 3574 struct list_head *devices; 3575 struct btrfs_device *device; 3576 u64 old_size; 3577 u64 size_to_free; 3578 u64 chunk_type; 3579 struct btrfs_chunk *chunk; 3580 struct btrfs_path *path = NULL; 3581 struct btrfs_key key; 3582 struct btrfs_key found_key; 3583 struct btrfs_trans_handle *trans; 3584 struct extent_buffer *leaf; 3585 int slot; 3586 int ret; 3587 int enospc_errors = 0; 3588 bool counting = true; 3589 /* The single value limit and min/max limits use the same bytes in the */ 3590 u64 limit_data = bctl->data.limit; 3591 u64 limit_meta = bctl->meta.limit; 3592 u64 limit_sys = bctl->sys.limit; 3593 u32 count_data = 0; 3594 u32 count_meta = 0; 3595 u32 count_sys = 0; 3596 int chunk_reserved = 0; 3597 3598 /* step one make some room on all the devices */ 3599 devices = &fs_info->fs_devices->devices; 3600 list_for_each_entry(device, devices, dev_list) { 3601 old_size = btrfs_device_get_total_bytes(device); 3602 size_to_free = div_factor(old_size, 1); 3603 size_to_free = min_t(u64, size_to_free, SZ_1M); 3604 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) || 3605 btrfs_device_get_total_bytes(device) - 3606 btrfs_device_get_bytes_used(device) > size_to_free || 3607 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 3608 continue; 3609 3610 ret = btrfs_shrink_device(device, old_size - size_to_free); 3611 if (ret == -ENOSPC) 3612 break; 3613 if (ret) { 3614 /* btrfs_shrink_device never returns ret > 0 */ 3615 WARN_ON(ret > 0); 3616 goto error; 3617 } 3618 3619 trans = btrfs_start_transaction(dev_root, 0); 3620 if (IS_ERR(trans)) { 3621 ret = PTR_ERR(trans); 3622 btrfs_info_in_rcu(fs_info, 3623 "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", 3624 rcu_str_deref(device->name), ret, 3625 old_size, old_size - size_to_free); 3626 goto error; 3627 } 3628 3629 ret = btrfs_grow_device(trans, device, old_size); 3630 if (ret) { 3631 btrfs_end_transaction(trans); 3632 /* btrfs_grow_device never returns ret > 0 */ 3633 WARN_ON(ret > 0); 3634 btrfs_info_in_rcu(fs_info, 3635 "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", 3636 rcu_str_deref(device->name), ret, 3637 old_size, old_size - size_to_free); 3638 goto error; 3639 } 3640 3641 btrfs_end_transaction(trans); 3642 } 3643 3644 /* step two, relocate all the chunks */ 3645 path = btrfs_alloc_path(); 3646 if (!path) { 3647 ret = -ENOMEM; 3648 goto error; 3649 } 3650 3651 /* zero out stat counters */ 3652 spin_lock(&fs_info->balance_lock); 3653 memset(&bctl->stat, 0, sizeof(bctl->stat)); 3654 spin_unlock(&fs_info->balance_lock); 3655 again: 3656 if (!counting) { 3657 /* 3658 * The single value limit and min/max limits use the same bytes 3659 * in the 3660 */ 3661 bctl->data.limit = limit_data; 3662 bctl->meta.limit = limit_meta; 3663 bctl->sys.limit = limit_sys; 3664 } 3665 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3666 key.offset = (u64)-1; 3667 key.type = 
BTRFS_CHUNK_ITEM_KEY; 3668 3669 while (1) { 3670 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3671 atomic_read(&fs_info->balance_cancel_req)) { 3672 ret = -ECANCELED; 3673 goto error; 3674 } 3675 3676 mutex_lock(&fs_info->delete_unused_bgs_mutex); 3677 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3678 if (ret < 0) { 3679 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3680 goto error; 3681 } 3682 3683 /* 3684 * this shouldn't happen, it means the last relocate 3685 * failed 3686 */ 3687 if (ret == 0) 3688 BUG(); /* FIXME break ? */ 3689 3690 ret = btrfs_previous_item(chunk_root, path, 0, 3691 BTRFS_CHUNK_ITEM_KEY); 3692 if (ret) { 3693 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3694 ret = 0; 3695 break; 3696 } 3697 3698 leaf = path->nodes[0]; 3699 slot = path->slots[0]; 3700 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3701 3702 if (found_key.objectid != key.objectid) { 3703 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3704 break; 3705 } 3706 3707 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3708 chunk_type = btrfs_chunk_type(leaf, chunk); 3709 3710 if (!counting) { 3711 spin_lock(&fs_info->balance_lock); 3712 bctl->stat.considered++; 3713 spin_unlock(&fs_info->balance_lock); 3714 } 3715 3716 ret = should_balance_chunk(fs_info, leaf, chunk, 3717 found_key.offset); 3718 3719 btrfs_release_path(path); 3720 if (!ret) { 3721 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3722 goto loop; 3723 } 3724 3725 if (counting) { 3726 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3727 spin_lock(&fs_info->balance_lock); 3728 bctl->stat.expected++; 3729 spin_unlock(&fs_info->balance_lock); 3730 3731 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3732 count_data++; 3733 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3734 count_sys++; 3735 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3736 count_meta++; 3737 3738 goto loop; 3739 } 3740 3741 /* 3742 * Apply limit_min filter, no need to check if the LIMITS 3743 * filter is used, limit_min is 0 by default 3744 */ 3745 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 3746 count_data < bctl->data.limit_min) 3747 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 3748 count_meta < bctl->meta.limit_min) 3749 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 3750 count_sys < bctl->sys.limit_min)) { 3751 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3752 goto loop; 3753 } 3754 3755 if (!chunk_reserved) { 3756 /* 3757 * We may be relocating the only data chunk we have, 3758 * which could potentially end up with losing data's 3759 * raid profile, so lets allocate an empty one in 3760 * advance. 
3761 */ 3762 ret = btrfs_may_alloc_data_chunk(fs_info, 3763 found_key.offset); 3764 if (ret < 0) { 3765 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3766 goto error; 3767 } else if (ret == 1) { 3768 chunk_reserved = 1; 3769 } 3770 } 3771 3772 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3773 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3774 if (ret && ret != -ENOSPC) 3775 goto error; 3776 if (ret == -ENOSPC) { 3777 enospc_errors++; 3778 } else { 3779 spin_lock(&fs_info->balance_lock); 3780 bctl->stat.completed++; 3781 spin_unlock(&fs_info->balance_lock); 3782 } 3783 loop: 3784 if (found_key.offset == 0) 3785 break; 3786 key.offset = found_key.offset - 1; 3787 } 3788 3789 if (counting) { 3790 btrfs_release_path(path); 3791 counting = false; 3792 goto again; 3793 } 3794 error: 3795 btrfs_free_path(path); 3796 if (enospc_errors) { 3797 btrfs_info(fs_info, "%d enospc errors during balance", 3798 enospc_errors); 3799 if (!ret) 3800 ret = -ENOSPC; 3801 } 3802 3803 return ret; 3804 } 3805 3806 /** 3807 * alloc_profile_is_valid - see if a given profile is valid and reduced 3808 * @flags: profile to validate 3809 * @extended: if true @flags is treated as an extended profile 3810 */ 3811 static int alloc_profile_is_valid(u64 flags, int extended) 3812 { 3813 u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : 3814 BTRFS_BLOCK_GROUP_PROFILE_MASK); 3815 3816 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 3817 3818 /* 1) check that all other bits are zeroed */ 3819 if (flags & ~mask) 3820 return 0; 3821 3822 /* 2) see if profile is reduced */ 3823 if (flags == 0) 3824 return !extended; /* "0" is valid for usual profiles */ 3825 3826 /* true if exactly one bit set */ 3827 return (flags & (flags - 1)) == 0; 3828 } 3829 3830 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3831 { 3832 /* cancel requested || normal exit path */ 3833 return atomic_read(&fs_info->balance_cancel_req) || 3834 (atomic_read(&fs_info->balance_pause_req) == 0 && 3835 atomic_read(&fs_info->balance_cancel_req) == 0); 3836 } 3837 3838 static void __cancel_balance(struct btrfs_fs_info *fs_info) 3839 { 3840 int ret; 3841 3842 unset_balance_control(fs_info); 3843 ret = del_balance_item(fs_info); 3844 if (ret) 3845 btrfs_handle_fs_error(fs_info, ret, NULL); 3846 3847 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 3848 } 3849 3850 /* Non-zero return value signifies invalidity */ 3851 static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, 3852 u64 allowed) 3853 { 3854 return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && 3855 (!alloc_profile_is_valid(bctl_arg->target, 1) || 3856 (bctl_arg->target & ~allowed))); 3857 } 3858 3859 /* 3860 * Should be called with both balance and volume mutexes held 3861 */ 3862 int btrfs_balance(struct btrfs_balance_control *bctl, 3863 struct btrfs_ioctl_balance_args *bargs) 3864 { 3865 struct btrfs_fs_info *fs_info = bctl->fs_info; 3866 u64 meta_target, data_target; 3867 u64 allowed; 3868 int mixed = 0; 3869 int ret; 3870 u64 num_devices; 3871 unsigned seq; 3872 3873 if (btrfs_fs_closing(fs_info) || 3874 atomic_read(&fs_info->balance_pause_req) || 3875 atomic_read(&fs_info->balance_cancel_req)) { 3876 ret = -EINVAL; 3877 goto out; 3878 } 3879 3880 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 3881 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 3882 mixed = 1; 3883 3884 /* 3885 * In case of mixed groups both data and meta should be picked, 3886 * and identical options should be given for both of them. 
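 * ("identical" means the data and metadata btrfs_balance_args must match
 * byte for byte; see the memcmp() below.)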
3887 */ 3888 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 3889 if (mixed && (bctl->flags & allowed)) { 3890 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 3891 !(bctl->flags & BTRFS_BALANCE_METADATA) || 3892 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 3893 btrfs_err(fs_info, 3894 "with mixed groups data and metadata balance options must be the same"); 3895 ret = -EINVAL; 3896 goto out; 3897 } 3898 } 3899 3900 num_devices = fs_info->fs_devices->num_devices; 3901 btrfs_dev_replace_read_lock(&fs_info->dev_replace); 3902 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 3903 BUG_ON(num_devices < 1); 3904 num_devices--; 3905 } 3906 btrfs_dev_replace_read_unlock(&fs_info->dev_replace); 3907 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; 3908 if (num_devices > 1) 3909 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3910 if (num_devices > 2) 3911 allowed |= BTRFS_BLOCK_GROUP_RAID5; 3912 if (num_devices > 3) 3913 allowed |= (BTRFS_BLOCK_GROUP_RAID10 | 3914 BTRFS_BLOCK_GROUP_RAID6); 3915 if (validate_convert_profile(&bctl->data, allowed)) { 3916 btrfs_err(fs_info, 3917 "unable to start balance with target data profile %llu", 3918 bctl->data.target); 3919 ret = -EINVAL; 3920 goto out; 3921 } 3922 if (validate_convert_profile(&bctl->meta, allowed)) { 3923 btrfs_err(fs_info, 3924 "unable to start balance with target metadata profile %llu", 3925 bctl->meta.target); 3926 ret = -EINVAL; 3927 goto out; 3928 } 3929 if (validate_convert_profile(&bctl->sys, allowed)) { 3930 btrfs_err(fs_info, 3931 "unable to start balance with target system profile %llu", 3932 bctl->sys.target); 3933 ret = -EINVAL; 3934 goto out; 3935 } 3936 3937 /* allow to reduce meta or sys integrity only if force set */ 3938 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3939 BTRFS_BLOCK_GROUP_RAID10 | 3940 BTRFS_BLOCK_GROUP_RAID5 | 3941 BTRFS_BLOCK_GROUP_RAID6; 3942 do { 3943 seq = read_seqbegin(&fs_info->profiles_lock); 3944 3945 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3946 (fs_info->avail_system_alloc_bits & allowed) && 3947 !(bctl->sys.target & allowed)) || 3948 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3949 (fs_info->avail_metadata_alloc_bits & allowed) && 3950 !(bctl->meta.target & allowed))) { 3951 if (bctl->flags & BTRFS_BALANCE_FORCE) { 3952 btrfs_info(fs_info, 3953 "force reducing metadata integrity"); 3954 } else { 3955 btrfs_err(fs_info, 3956 "balance will reduce metadata integrity, use force if you want this"); 3957 ret = -EINVAL; 3958 goto out; 3959 } 3960 } 3961 } while (read_seqretry(&fs_info->profiles_lock, seq)); 3962 3963 /* if we're not converting, the target field is uninitialized */ 3964 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 3965 bctl->meta.target : fs_info->avail_metadata_alloc_bits; 3966 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 
3967 bctl->data.target : fs_info->avail_data_alloc_bits; 3968 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 3969 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 3970 btrfs_warn(fs_info, 3971 "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", 3972 meta_target, data_target); 3973 } 3974 3975 ret = insert_balance_item(fs_info, bctl); 3976 if (ret && ret != -EEXIST) 3977 goto out; 3978 3979 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 3980 BUG_ON(ret == -EEXIST); 3981 set_balance_control(bctl); 3982 } else { 3983 BUG_ON(ret != -EEXIST); 3984 spin_lock(&fs_info->balance_lock); 3985 update_balance_args(bctl); 3986 spin_unlock(&fs_info->balance_lock); 3987 } 3988 3989 atomic_inc(&fs_info->balance_running); 3990 mutex_unlock(&fs_info->balance_mutex); 3991 3992 ret = __btrfs_balance(fs_info); 3993 3994 mutex_lock(&fs_info->balance_mutex); 3995 atomic_dec(&fs_info->balance_running); 3996 3997 if (bargs) { 3998 memset(bargs, 0, sizeof(*bargs)); 3999 update_ioctl_balance_args(fs_info, 0, bargs); 4000 } 4001 4002 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 4003 balance_need_close(fs_info)) { 4004 __cancel_balance(fs_info); 4005 } 4006 4007 wake_up(&fs_info->balance_wait_q); 4008 4009 return ret; 4010 out: 4011 if (bctl->flags & BTRFS_BALANCE_RESUME) 4012 __cancel_balance(fs_info); 4013 else { 4014 kfree(bctl); 4015 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 4016 } 4017 return ret; 4018 } 4019 4020 static int balance_kthread(void *data) 4021 { 4022 struct btrfs_fs_info *fs_info = data; 4023 int ret = 0; 4024 4025 mutex_lock(&fs_info->volume_mutex); 4026 mutex_lock(&fs_info->balance_mutex); 4027 4028 if (fs_info->balance_ctl) { 4029 btrfs_info(fs_info, "continuing balance"); 4030 ret = btrfs_balance(fs_info->balance_ctl, NULL); 4031 } 4032 4033 mutex_unlock(&fs_info->balance_mutex); 4034 mutex_unlock(&fs_info->volume_mutex); 4035 4036 return ret; 4037 } 4038 4039 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 4040 { 4041 struct task_struct *tsk; 4042 4043 spin_lock(&fs_info->balance_lock); 4044 if (!fs_info->balance_ctl) { 4045 spin_unlock(&fs_info->balance_lock); 4046 return 0; 4047 } 4048 spin_unlock(&fs_info->balance_lock); 4049 4050 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 4051 btrfs_info(fs_info, "force skipping balance"); 4052 return 0; 4053 } 4054 4055 /* 4056 * A ro->rw remount sequence should continue with the paused balance 4057 * regardless of who pauses it, system or the user as of now, so set 4058 * the resume flag. 
4059 */ 4060 spin_lock(&fs_info->balance_lock); 4061 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; 4062 spin_unlock(&fs_info->balance_lock); 4063 4064 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 4065 return PTR_ERR_OR_ZERO(tsk); 4066 } 4067 4068 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 4069 { 4070 struct btrfs_balance_control *bctl; 4071 struct btrfs_balance_item *item; 4072 struct btrfs_disk_balance_args disk_bargs; 4073 struct btrfs_path *path; 4074 struct extent_buffer *leaf; 4075 struct btrfs_key key; 4076 int ret; 4077 4078 path = btrfs_alloc_path(); 4079 if (!path) 4080 return -ENOMEM; 4081 4082 key.objectid = BTRFS_BALANCE_OBJECTID; 4083 key.type = BTRFS_TEMPORARY_ITEM_KEY; 4084 key.offset = 0; 4085 4086 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4087 if (ret < 0) 4088 goto out; 4089 if (ret > 0) { /* ret = -ENOENT; */ 4090 ret = 0; 4091 goto out; 4092 } 4093 4094 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 4095 if (!bctl) { 4096 ret = -ENOMEM; 4097 goto out; 4098 } 4099 4100 leaf = path->nodes[0]; 4101 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 4102 4103 bctl->fs_info = fs_info; 4104 bctl->flags = btrfs_balance_flags(leaf, item); 4105 bctl->flags |= BTRFS_BALANCE_RESUME; 4106 4107 btrfs_balance_data(leaf, item, &disk_bargs); 4108 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 4109 btrfs_balance_meta(leaf, item, &disk_bargs); 4110 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 4111 btrfs_balance_sys(leaf, item, &disk_bargs); 4112 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 4113 4114 WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); 4115 4116 mutex_lock(&fs_info->volume_mutex); 4117 mutex_lock(&fs_info->balance_mutex); 4118 4119 set_balance_control(bctl); 4120 4121 mutex_unlock(&fs_info->balance_mutex); 4122 mutex_unlock(&fs_info->volume_mutex); 4123 out: 4124 btrfs_free_path(path); 4125 return ret; 4126 } 4127 4128 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4129 { 4130 int ret = 0; 4131 4132 mutex_lock(&fs_info->balance_mutex); 4133 if (!fs_info->balance_ctl) { 4134 mutex_unlock(&fs_info->balance_mutex); 4135 return -ENOTCONN; 4136 } 4137 4138 if (atomic_read(&fs_info->balance_running)) { 4139 atomic_inc(&fs_info->balance_pause_req); 4140 mutex_unlock(&fs_info->balance_mutex); 4141 4142 wait_event(fs_info->balance_wait_q, 4143 atomic_read(&fs_info->balance_running) == 0); 4144 4145 mutex_lock(&fs_info->balance_mutex); 4146 /* we are good with balance_ctl ripped off from under us */ 4147 BUG_ON(atomic_read(&fs_info->balance_running)); 4148 atomic_dec(&fs_info->balance_pause_req); 4149 } else { 4150 ret = -ENOTCONN; 4151 } 4152 4153 mutex_unlock(&fs_info->balance_mutex); 4154 return ret; 4155 } 4156 4157 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4158 { 4159 if (sb_rdonly(fs_info->sb)) 4160 return -EROFS; 4161 4162 mutex_lock(&fs_info->balance_mutex); 4163 if (!fs_info->balance_ctl) { 4164 mutex_unlock(&fs_info->balance_mutex); 4165 return -ENOTCONN; 4166 } 4167 4168 atomic_inc(&fs_info->balance_cancel_req); 4169 /* 4170 * if we are running just wait and return, balance item is 4171 * deleted in btrfs_balance in this case 4172 */ 4173 if (atomic_read(&fs_info->balance_running)) { 4174 mutex_unlock(&fs_info->balance_mutex); 4175 wait_event(fs_info->balance_wait_q, 4176 atomic_read(&fs_info->balance_running) == 0); 4177 mutex_lock(&fs_info->balance_mutex); 4178 } else { 4179 /* __cancel_balance needs volume_mutex */ 4180 
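/*
 * Lock order is volume_mutex -> balance_mutex, so balance_mutex has to be
 * dropped before volume_mutex can be taken.  Re-check balance_ctl once
 * both locks are held again, as the balance state may have changed while
 * neither was held.
 */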
mutex_unlock(&fs_info->balance_mutex); 4181 mutex_lock(&fs_info->volume_mutex); 4182 mutex_lock(&fs_info->balance_mutex); 4183 4184 if (fs_info->balance_ctl) 4185 __cancel_balance(fs_info); 4186 4187 mutex_unlock(&fs_info->volume_mutex); 4188 } 4189 4190 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); 4191 atomic_dec(&fs_info->balance_cancel_req); 4192 mutex_unlock(&fs_info->balance_mutex); 4193 return 0; 4194 } 4195 4196 static int btrfs_uuid_scan_kthread(void *data) 4197 { 4198 struct btrfs_fs_info *fs_info = data; 4199 struct btrfs_root *root = fs_info->tree_root; 4200 struct btrfs_key key; 4201 struct btrfs_path *path = NULL; 4202 int ret = 0; 4203 struct extent_buffer *eb; 4204 int slot; 4205 struct btrfs_root_item root_item; 4206 u32 item_size; 4207 struct btrfs_trans_handle *trans = NULL; 4208 4209 path = btrfs_alloc_path(); 4210 if (!path) { 4211 ret = -ENOMEM; 4212 goto out; 4213 } 4214 4215 key.objectid = 0; 4216 key.type = BTRFS_ROOT_ITEM_KEY; 4217 key.offset = 0; 4218 4219 while (1) { 4220 ret = btrfs_search_forward(root, &key, path, 4221 BTRFS_OLDEST_GENERATION); 4222 if (ret) { 4223 if (ret > 0) 4224 ret = 0; 4225 break; 4226 } 4227 4228 if (key.type != BTRFS_ROOT_ITEM_KEY || 4229 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4230 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4231 key.objectid > BTRFS_LAST_FREE_OBJECTID) 4232 goto skip; 4233 4234 eb = path->nodes[0]; 4235 slot = path->slots[0]; 4236 item_size = btrfs_item_size_nr(eb, slot); 4237 if (item_size < sizeof(root_item)) 4238 goto skip; 4239 4240 read_extent_buffer(eb, &root_item, 4241 btrfs_item_ptr_offset(eb, slot), 4242 (int)sizeof(root_item)); 4243 if (btrfs_root_refs(&root_item) == 0) 4244 goto skip; 4245 4246 if (!btrfs_is_empty_uuid(root_item.uuid) || 4247 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4248 if (trans) 4249 goto update_tree; 4250 4251 btrfs_release_path(path); 4252 /* 4253 * 1 - subvol uuid item 4254 * 1 - received_subvol uuid item 4255 */ 4256 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4257 if (IS_ERR(trans)) { 4258 ret = PTR_ERR(trans); 4259 break; 4260 } 4261 continue; 4262 } else { 4263 goto skip; 4264 } 4265 update_tree: 4266 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4267 ret = btrfs_uuid_tree_add(trans, fs_info, 4268 root_item.uuid, 4269 BTRFS_UUID_KEY_SUBVOL, 4270 key.objectid); 4271 if (ret < 0) { 4272 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4273 ret); 4274 break; 4275 } 4276 } 4277 4278 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4279 ret = btrfs_uuid_tree_add(trans, fs_info, 4280 root_item.received_uuid, 4281 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4282 key.objectid); 4283 if (ret < 0) { 4284 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4285 ret); 4286 break; 4287 } 4288 } 4289 4290 skip: 4291 if (trans) { 4292 ret = btrfs_end_transaction(trans); 4293 trans = NULL; 4294 if (ret) 4295 break; 4296 } 4297 4298 btrfs_release_path(path); 4299 if (key.offset < (u64)-1) { 4300 key.offset++; 4301 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4302 key.offset = 0; 4303 key.type = BTRFS_ROOT_ITEM_KEY; 4304 } else if (key.objectid < (u64)-1) { 4305 key.offset = 0; 4306 key.type = BTRFS_ROOT_ITEM_KEY; 4307 key.objectid++; 4308 } else { 4309 break; 4310 } 4311 cond_resched(); 4312 } 4313 4314 out: 4315 btrfs_free_path(path); 4316 if (trans && !IS_ERR(trans)) 4317 btrfs_end_transaction(trans); 4318 if (ret) 4319 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); 4320 else 4321 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4322 
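/*
 * Release the semaphore taken by btrfs_create_uuid_tree() or
 * btrfs_check_uuid_tree() before this task was started, so that anyone
 * waiting for the scan to finish can proceed.
 */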
up(&fs_info->uuid_tree_rescan_sem); 4323 return 0; 4324 } 4325 4326 /* 4327 * Callback for btrfs_uuid_tree_iterate(). 4328 * returns: 4329 * 0 check succeeded, the entry is not outdated. 4330 * < 0 if an error occurred. 4331 * > 0 if the check failed, which means the caller shall remove the entry. 4332 */ 4333 static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, 4334 u8 *uuid, u8 type, u64 subid) 4335 { 4336 struct btrfs_key key; 4337 int ret = 0; 4338 struct btrfs_root *subvol_root; 4339 4340 if (type != BTRFS_UUID_KEY_SUBVOL && 4341 type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) 4342 goto out; 4343 4344 key.objectid = subid; 4345 key.type = BTRFS_ROOT_ITEM_KEY; 4346 key.offset = (u64)-1; 4347 subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); 4348 if (IS_ERR(subvol_root)) { 4349 ret = PTR_ERR(subvol_root); 4350 if (ret == -ENOENT) 4351 ret = 1; 4352 goto out; 4353 } 4354 4355 switch (type) { 4356 case BTRFS_UUID_KEY_SUBVOL: 4357 if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) 4358 ret = 1; 4359 break; 4360 case BTRFS_UUID_KEY_RECEIVED_SUBVOL: 4361 if (memcmp(uuid, subvol_root->root_item.received_uuid, 4362 BTRFS_UUID_SIZE)) 4363 ret = 1; 4364 break; 4365 } 4366 4367 out: 4368 return ret; 4369 } 4370 4371 static int btrfs_uuid_rescan_kthread(void *data) 4372 { 4373 struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; 4374 int ret; 4375 4376 /* 4377 * 1st step is to iterate through the existing UUID tree and 4378 * to delete all entries that contain outdated data. 4379 * 2nd step is to add all missing entries to the UUID tree. 4380 */ 4381 ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); 4382 if (ret < 0) { 4383 btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); 4384 up(&fs_info->uuid_tree_rescan_sem); 4385 return ret; 4386 } 4387 return btrfs_uuid_scan_kthread(data); 4388 } 4389 4390 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4391 { 4392 struct btrfs_trans_handle *trans; 4393 struct btrfs_root *tree_root = fs_info->tree_root; 4394 struct btrfs_root *uuid_root; 4395 struct task_struct *task; 4396 int ret; 4397 4398 /* 4399 * 1 - root node 4400 * 1 - root item 4401 */ 4402 trans = btrfs_start_transaction(tree_root, 2); 4403 if (IS_ERR(trans)) 4404 return PTR_ERR(trans); 4405 4406 uuid_root = btrfs_create_tree(trans, fs_info, 4407 BTRFS_UUID_TREE_OBJECTID); 4408 if (IS_ERR(uuid_root)) { 4409 ret = PTR_ERR(uuid_root); 4410 btrfs_abort_transaction(trans, ret); 4411 btrfs_end_transaction(trans); 4412 return ret; 4413 } 4414 4415 fs_info->uuid_root = uuid_root; 4416 4417 ret = btrfs_commit_transaction(trans); 4418 if (ret) 4419 return ret; 4420 4421 down(&fs_info->uuid_tree_rescan_sem); 4422 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4423 if (IS_ERR(task)) { 4424 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4425 btrfs_warn(fs_info, "failed to start uuid_scan task"); 4426 up(&fs_info->uuid_tree_rescan_sem); 4427 return PTR_ERR(task); 4428 } 4429 4430 return 0; 4431 } 4432 4433 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) 4434 { 4435 struct task_struct *task; 4436 4437 down(&fs_info->uuid_tree_rescan_sem); 4438 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); 4439 if (IS_ERR(task)) { 4440 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4441 btrfs_warn(fs_info, "failed to start uuid_rescan task"); 4442 up(&fs_info->uuid_tree_rescan_sem); 4443 return PTR_ERR(task); 4444 } 4445 4446 return 0; 4447 } 4448 4449 /* 4450 * 
shrinking a device means finding all of the device extents past 4451 * the new size, and then following the back refs to the chunks. 4452 * The chunk relocation code actually frees the device extent 4453 */ 4454 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 4455 { 4456 struct btrfs_fs_info *fs_info = device->fs_info; 4457 struct btrfs_root *root = fs_info->dev_root; 4458 struct btrfs_trans_handle *trans; 4459 struct btrfs_dev_extent *dev_extent = NULL; 4460 struct btrfs_path *path; 4461 u64 length; 4462 u64 chunk_offset; 4463 int ret; 4464 int slot; 4465 int failed = 0; 4466 bool retried = false; 4467 bool checked_pending_chunks = false; 4468 struct extent_buffer *l; 4469 struct btrfs_key key; 4470 struct btrfs_super_block *super_copy = fs_info->super_copy; 4471 u64 old_total = btrfs_super_total_bytes(super_copy); 4472 u64 old_size = btrfs_device_get_total_bytes(device); 4473 u64 diff; 4474 4475 new_size = round_down(new_size, fs_info->sectorsize); 4476 diff = round_down(old_size - new_size, fs_info->sectorsize); 4477 4478 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4479 return -EINVAL; 4480 4481 path = btrfs_alloc_path(); 4482 if (!path) 4483 return -ENOMEM; 4484 4485 path->reada = READA_FORWARD; 4486 4487 mutex_lock(&fs_info->chunk_mutex); 4488 4489 btrfs_device_set_total_bytes(device, new_size); 4490 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4491 device->fs_devices->total_rw_bytes -= diff; 4492 atomic64_sub(diff, &fs_info->free_chunk_space); 4493 } 4494 mutex_unlock(&fs_info->chunk_mutex); 4495 4496 again: 4497 key.objectid = device->devid; 4498 key.offset = (u64)-1; 4499 key.type = BTRFS_DEV_EXTENT_KEY; 4500 4501 do { 4502 mutex_lock(&fs_info->delete_unused_bgs_mutex); 4503 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4504 if (ret < 0) { 4505 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4506 goto done; 4507 } 4508 4509 ret = btrfs_previous_item(root, path, 0, key.type); 4510 if (ret) 4511 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4512 if (ret < 0) 4513 goto done; 4514 if (ret) { 4515 ret = 0; 4516 btrfs_release_path(path); 4517 break; 4518 } 4519 4520 l = path->nodes[0]; 4521 slot = path->slots[0]; 4522 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4523 4524 if (key.objectid != device->devid) { 4525 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4526 btrfs_release_path(path); 4527 break; 4528 } 4529 4530 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4531 length = btrfs_dev_extent_length(l, dev_extent); 4532 4533 if (key.offset + length <= new_size) { 4534 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4535 btrfs_release_path(path); 4536 break; 4537 } 4538 4539 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4540 btrfs_release_path(path); 4541 4542 /* 4543 * We may be relocating the only data chunk we have, 4544 * which could potentially end up with losing data's 4545 * raid profile, so lets allocate an empty one in 4546 * advance. 
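 * The balance loop does the same before relocating a data chunk; here
 * the "chunk reserved" return value (1) is simply ignored and only
 * errors are acted upon.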
4547 */ 4548 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 4549 if (ret < 0) { 4550 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4551 goto done; 4552 } 4553 4554 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 4555 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4556 if (ret && ret != -ENOSPC) 4557 goto done; 4558 if (ret == -ENOSPC) 4559 failed++; 4560 } while (key.offset-- > 0); 4561 4562 if (failed && !retried) { 4563 failed = 0; 4564 retried = true; 4565 goto again; 4566 } else if (failed && retried) { 4567 ret = -ENOSPC; 4568 goto done; 4569 } 4570 4571 /* Shrinking succeeded, else we would be at "done". */ 4572 trans = btrfs_start_transaction(root, 0); 4573 if (IS_ERR(trans)) { 4574 ret = PTR_ERR(trans); 4575 goto done; 4576 } 4577 4578 mutex_lock(&fs_info->chunk_mutex); 4579 4580 /* 4581 * We checked in the above loop all device extents that were already in 4582 * the device tree. However before we have updated the device's 4583 * total_bytes to the new size, we might have had chunk allocations that 4584 * have not complete yet (new block groups attached to transaction 4585 * handles), and therefore their device extents were not yet in the 4586 * device tree and we missed them in the loop above. So if we have any 4587 * pending chunk using a device extent that overlaps the device range 4588 * that we can not use anymore, commit the current transaction and 4589 * repeat the search on the device tree - this way we guarantee we will 4590 * not have chunks using device extents that end beyond 'new_size'. 4591 */ 4592 if (!checked_pending_chunks) { 4593 u64 start = new_size; 4594 u64 len = old_size - new_size; 4595 4596 if (contains_pending_extent(trans->transaction, device, 4597 &start, len)) { 4598 mutex_unlock(&fs_info->chunk_mutex); 4599 checked_pending_chunks = true; 4600 failed = 0; 4601 retried = false; 4602 ret = btrfs_commit_transaction(trans); 4603 if (ret) 4604 goto done; 4605 goto again; 4606 } 4607 } 4608 4609 btrfs_device_set_disk_total_bytes(device, new_size); 4610 if (list_empty(&device->resized_list)) 4611 list_add_tail(&device->resized_list, 4612 &fs_info->fs_devices->resized_devices); 4613 4614 WARN_ON(diff > old_total); 4615 btrfs_set_super_total_bytes(super_copy, 4616 round_down(old_total - diff, fs_info->sectorsize)); 4617 mutex_unlock(&fs_info->chunk_mutex); 4618 4619 /* Now btrfs_update_device() will change the on-disk size. 
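 * The device item in the chunk tree gets the new size; the in-memory
 * counters and the superblock copy were already adjusted above under
 * chunk_mutex.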
*/ 4620 ret = btrfs_update_device(trans, device); 4621 btrfs_end_transaction(trans); 4622 done: 4623 btrfs_free_path(path); 4624 if (ret) { 4625 mutex_lock(&fs_info->chunk_mutex); 4626 btrfs_device_set_total_bytes(device, old_size); 4627 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 4628 device->fs_devices->total_rw_bytes += diff; 4629 atomic64_add(diff, &fs_info->free_chunk_space); 4630 mutex_unlock(&fs_info->chunk_mutex); 4631 } 4632 return ret; 4633 } 4634 4635 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 4636 struct btrfs_key *key, 4637 struct btrfs_chunk *chunk, int item_size) 4638 { 4639 struct btrfs_super_block *super_copy = fs_info->super_copy; 4640 struct btrfs_disk_key disk_key; 4641 u32 array_size; 4642 u8 *ptr; 4643 4644 mutex_lock(&fs_info->chunk_mutex); 4645 array_size = btrfs_super_sys_array_size(super_copy); 4646 if (array_size + item_size + sizeof(disk_key) 4647 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { 4648 mutex_unlock(&fs_info->chunk_mutex); 4649 return -EFBIG; 4650 } 4651 4652 ptr = super_copy->sys_chunk_array + array_size; 4653 btrfs_cpu_key_to_disk(&disk_key, key); 4654 memcpy(ptr, &disk_key, sizeof(disk_key)); 4655 ptr += sizeof(disk_key); 4656 memcpy(ptr, chunk, item_size); 4657 item_size += sizeof(disk_key); 4658 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 4659 mutex_unlock(&fs_info->chunk_mutex); 4660 4661 return 0; 4662 } 4663 4664 /* 4665 * sort the devices in descending order by max_avail, total_avail 4666 */ 4667 static int btrfs_cmp_device_info(const void *a, const void *b) 4668 { 4669 const struct btrfs_device_info *di_a = a; 4670 const struct btrfs_device_info *di_b = b; 4671 4672 if (di_a->max_avail > di_b->max_avail) 4673 return -1; 4674 if (di_a->max_avail < di_b->max_avail) 4675 return 1; 4676 if (di_a->total_avail > di_b->total_avail) 4677 return -1; 4678 if (di_a->total_avail < di_b->total_avail) 4679 return 1; 4680 return 0; 4681 } 4682 4683 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 4684 { 4685 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 4686 return; 4687 4688 btrfs_set_fs_incompat(info, RAID56); 4689 } 4690 4691 #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ 4692 - sizeof(struct btrfs_chunk)) \ 4693 / sizeof(struct btrfs_stripe) + 1) 4694 4695 #define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ 4696 - 2 * sizeof(struct btrfs_disk_key) \ 4697 - 2 * sizeof(struct btrfs_chunk)) \ 4698 / sizeof(struct btrfs_stripe) + 1) 4699 4700 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 4701 u64 start, u64 type) 4702 { 4703 struct btrfs_fs_info *info = trans->fs_info; 4704 struct btrfs_fs_devices *fs_devices = info->fs_devices; 4705 struct btrfs_device *device; 4706 struct map_lookup *map = NULL; 4707 struct extent_map_tree *em_tree; 4708 struct extent_map *em; 4709 struct btrfs_device_info *devices_info = NULL; 4710 u64 total_avail; 4711 int num_stripes; /* total number of stripes to allocate */ 4712 int data_stripes; /* number of stripes that count for 4713 block group size */ 4714 int sub_stripes; /* sub_stripes info for map */ 4715 int dev_stripes; /* stripes per dev */ 4716 int devs_max; /* max devs to use */ 4717 int devs_min; /* min devs needed */ 4718 int devs_increment; /* ndevs has to be a multiple of this */ 4719 int ncopies; /* how many copies to data has */ 4720 int ret; 4721 u64 max_stripe_size; 4722 u64 max_chunk_size; 4723 u64 stripe_size; 4724 u64 num_bytes; 4725 int ndevs; 4726 int i; 4727 int j; 4728 int index; 
4729 4730 BUG_ON(!alloc_profile_is_valid(type, 0)); 4731 4732 if (list_empty(&fs_devices->alloc_list)) { 4733 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 4734 btrfs_debug(info, "%s: no writable device", __func__); 4735 return -ENOSPC; 4736 } 4737 4738 index = btrfs_bg_flags_to_raid_index(type); 4739 4740 sub_stripes = btrfs_raid_array[index].sub_stripes; 4741 dev_stripes = btrfs_raid_array[index].dev_stripes; 4742 devs_max = btrfs_raid_array[index].devs_max; 4743 devs_min = btrfs_raid_array[index].devs_min; 4744 devs_increment = btrfs_raid_array[index].devs_increment; 4745 ncopies = btrfs_raid_array[index].ncopies; 4746 4747 if (type & BTRFS_BLOCK_GROUP_DATA) { 4748 max_stripe_size = SZ_1G; 4749 max_chunk_size = 10 * max_stripe_size; 4750 if (!devs_max) 4751 devs_max = BTRFS_MAX_DEVS(info); 4752 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 4753 /* for larger filesystems, use larger metadata chunks */ 4754 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) 4755 max_stripe_size = SZ_1G; 4756 else 4757 max_stripe_size = SZ_256M; 4758 max_chunk_size = max_stripe_size; 4759 if (!devs_max) 4760 devs_max = BTRFS_MAX_DEVS(info); 4761 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 4762 max_stripe_size = SZ_32M; 4763 max_chunk_size = 2 * max_stripe_size; 4764 if (!devs_max) 4765 devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; 4766 } else { 4767 btrfs_err(info, "invalid chunk type 0x%llx requested", 4768 type); 4769 BUG_ON(1); 4770 } 4771 4772 /* we don't want a chunk larger than 10% of writeable space */ 4773 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 4774 max_chunk_size); 4775 4776 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 4777 GFP_NOFS); 4778 if (!devices_info) 4779 return -ENOMEM; 4780 4781 /* 4782 * in the first pass through the devices list, we gather information 4783 * about the available holes on each device. 4784 */ 4785 ndevs = 0; 4786 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 4787 u64 max_avail; 4788 u64 dev_offset; 4789 4790 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4791 WARN(1, KERN_ERR 4792 "BTRFS: read-only device in alloc_list\n"); 4793 continue; 4794 } 4795 4796 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 4797 &device->dev_state) || 4798 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4799 continue; 4800 4801 if (device->total_bytes > device->bytes_used) 4802 total_avail = device->total_bytes - device->bytes_used; 4803 else 4804 total_avail = 0; 4805 4806 /* If there is no space on this device, skip it. 
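 * bytes_used may legitimately exceed total_bytes here, e.g. while
 * btrfs_shrink_device() has already lowered total_bytes but has not yet
 * relocated the extents beyond the new size.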
*/ 4807 if (total_avail == 0) 4808 continue; 4809 4810 ret = find_free_dev_extent(trans, device, 4811 max_stripe_size * dev_stripes, 4812 &dev_offset, &max_avail); 4813 if (ret && ret != -ENOSPC) 4814 goto error; 4815 4816 if (ret == 0) 4817 max_avail = max_stripe_size * dev_stripes; 4818 4819 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) { 4820 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 4821 btrfs_debug(info, 4822 "%s: devid %llu has no free space, have=%llu want=%u", 4823 __func__, device->devid, max_avail, 4824 BTRFS_STRIPE_LEN * dev_stripes); 4825 continue; 4826 } 4827 4828 if (ndevs == fs_devices->rw_devices) { 4829 WARN(1, "%s: found more than %llu devices\n", 4830 __func__, fs_devices->rw_devices); 4831 break; 4832 } 4833 devices_info[ndevs].dev_offset = dev_offset; 4834 devices_info[ndevs].max_avail = max_avail; 4835 devices_info[ndevs].total_avail = total_avail; 4836 devices_info[ndevs].dev = device; 4837 ++ndevs; 4838 } 4839 4840 /* 4841 * now sort the devices by hole size / available space 4842 */ 4843 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 4844 btrfs_cmp_device_info, NULL); 4845 4846 /* round down to number of usable stripes */ 4847 ndevs = round_down(ndevs, devs_increment); 4848 4849 if (ndevs < devs_min) { 4850 ret = -ENOSPC; 4851 if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 4852 btrfs_debug(info, 4853 "%s: not enough devices with free space: have=%d minimum required=%d", 4854 __func__, ndevs, devs_min); 4855 } 4856 goto error; 4857 } 4858 4859 ndevs = min(ndevs, devs_max); 4860 4861 /* 4862 * The primary goal is to maximize the number of stripes, so use as 4863 * many devices as possible, even if the stripes are not maximum sized. 4864 * 4865 * The DUP profile stores more than one stripe per device, the 4866 * max_avail is the total size so we have to adjust. 
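 * e.g. with DUP both copies of a stripe live on the same device, so the
 * usable stripe size is max_avail / dev_stripes, computed below from the
 * smallest of the chosen devices (the list is sorted by max_avail).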
4867 */ 4868 stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); 4869 num_stripes = ndevs * dev_stripes; 4870 4871 /* 4872 * this will have to be fixed for RAID1 and RAID10 over 4873 * more drives 4874 */ 4875 data_stripes = num_stripes / ncopies; 4876 4877 if (type & BTRFS_BLOCK_GROUP_RAID5) 4878 data_stripes = num_stripes - 1; 4879 4880 if (type & BTRFS_BLOCK_GROUP_RAID6) 4881 data_stripes = num_stripes - 2; 4882 4883 /* 4884 * Use the number of data stripes to figure out how big this chunk 4885 * is really going to be in terms of logical address space, 4886 * and compare that answer with the max chunk size 4887 */ 4888 if (stripe_size * data_stripes > max_chunk_size) { 4889 stripe_size = div_u64(max_chunk_size, data_stripes); 4890 4891 /* bump the answer up to a 16MB boundary */ 4892 stripe_size = round_up(stripe_size, SZ_16M); 4893 4894 /* 4895 * But don't go higher than the limits we found while searching 4896 * for free extents 4897 */ 4898 stripe_size = min(devices_info[ndevs - 1].max_avail, 4899 stripe_size); 4900 } 4901 4902 /* align to BTRFS_STRIPE_LEN */ 4903 stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); 4904 4905 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 4906 if (!map) { 4907 ret = -ENOMEM; 4908 goto error; 4909 } 4910 map->num_stripes = num_stripes; 4911 4912 for (i = 0; i < ndevs; ++i) { 4913 for (j = 0; j < dev_stripes; ++j) { 4914 int s = i * dev_stripes + j; 4915 map->stripes[s].dev = devices_info[i].dev; 4916 map->stripes[s].physical = devices_info[i].dev_offset + 4917 j * stripe_size; 4918 } 4919 } 4920 map->stripe_len = BTRFS_STRIPE_LEN; 4921 map->io_align = BTRFS_STRIPE_LEN; 4922 map->io_width = BTRFS_STRIPE_LEN; 4923 map->type = type; 4924 map->sub_stripes = sub_stripes; 4925 4926 num_bytes = stripe_size * data_stripes; 4927 4928 trace_btrfs_chunk_alloc(info, map, start, num_bytes); 4929 4930 em = alloc_extent_map(); 4931 if (!em) { 4932 kfree(map); 4933 ret = -ENOMEM; 4934 goto error; 4935 } 4936 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 4937 em->map_lookup = map; 4938 em->start = start; 4939 em->len = num_bytes; 4940 em->block_start = 0; 4941 em->block_len = em->len; 4942 em->orig_block_len = stripe_size; 4943 4944 em_tree = &info->mapping_tree.map_tree; 4945 write_lock(&em_tree->lock); 4946 ret = add_extent_mapping(em_tree, em, 0); 4947 if (ret) { 4948 write_unlock(&em_tree->lock); 4949 free_extent_map(em); 4950 goto error; 4951 } 4952 4953 list_add_tail(&em->list, &trans->transaction->pending_chunks); 4954 refcount_inc(&em->refs); 4955 write_unlock(&em_tree->lock); 4956 4957 ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes); 4958 if (ret) 4959 goto error_del_extent; 4960 4961 for (i = 0; i < map->num_stripes; i++) { 4962 num_bytes = map->stripes[i].dev->bytes_used + stripe_size; 4963 btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); 4964 } 4965 4966 atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); 4967 4968 free_extent_map(em); 4969 check_raid56_incompat_flag(info, type); 4970 4971 kfree(devices_info); 4972 return 0; 4973 4974 error_del_extent: 4975 write_lock(&em_tree->lock); 4976 remove_extent_mapping(em_tree, em); 4977 write_unlock(&em_tree->lock); 4978 4979 /* One for our allocation */ 4980 free_extent_map(em); 4981 /* One for the tree reference */ 4982 free_extent_map(em); 4983 /* One for the pending_chunks list reference */ 4984 free_extent_map(em); 4985 error: 4986 kfree(devices_info); 4987 return ret; 4988 } 4989 4990 int btrfs_finish_chunk_alloc(struct 
btrfs_trans_handle *trans, 4991 struct btrfs_fs_info *fs_info, 4992 u64 chunk_offset, u64 chunk_size) 4993 { 4994 struct btrfs_root *extent_root = fs_info->extent_root; 4995 struct btrfs_root *chunk_root = fs_info->chunk_root; 4996 struct btrfs_key key; 4997 struct btrfs_device *device; 4998 struct btrfs_chunk *chunk; 4999 struct btrfs_stripe *stripe; 5000 struct extent_map *em; 5001 struct map_lookup *map; 5002 size_t item_size; 5003 u64 dev_offset; 5004 u64 stripe_size; 5005 int i = 0; 5006 int ret = 0; 5007 5008 em = get_chunk_map(fs_info, chunk_offset, chunk_size); 5009 if (IS_ERR(em)) 5010 return PTR_ERR(em); 5011 5012 map = em->map_lookup; 5013 item_size = btrfs_chunk_item_size(map->num_stripes); 5014 stripe_size = em->orig_block_len; 5015 5016 chunk = kzalloc(item_size, GFP_NOFS); 5017 if (!chunk) { 5018 ret = -ENOMEM; 5019 goto out; 5020 } 5021 5022 /* 5023 * Take the device list mutex to prevent races with the final phase of 5024 * a device replace operation that replaces the device object associated 5025 * with the map's stripes, because the device object's id can change 5026 * at any time during that final phase of the device replace operation 5027 * (dev-replace.c:btrfs_dev_replace_finishing()). 5028 */ 5029 mutex_lock(&fs_info->fs_devices->device_list_mutex); 5030 for (i = 0; i < map->num_stripes; i++) { 5031 device = map->stripes[i].dev; 5032 dev_offset = map->stripes[i].physical; 5033 5034 ret = btrfs_update_device(trans, device); 5035 if (ret) 5036 break; 5037 ret = btrfs_alloc_dev_extent(trans, device, chunk_offset, 5038 dev_offset, stripe_size); 5039 if (ret) 5040 break; 5041 } 5042 if (ret) { 5043 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 5044 goto out; 5045 } 5046 5047 stripe = &chunk->stripe; 5048 for (i = 0; i < map->num_stripes; i++) { 5049 device = map->stripes[i].dev; 5050 dev_offset = map->stripes[i].physical; 5051 5052 btrfs_set_stack_stripe_devid(stripe, device->devid); 5053 btrfs_set_stack_stripe_offset(stripe, dev_offset); 5054 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 5055 stripe++; 5056 } 5057 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 5058 5059 btrfs_set_stack_chunk_length(chunk, chunk_size); 5060 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 5061 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 5062 btrfs_set_stack_chunk_type(chunk, map->type); 5063 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 5064 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 5065 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 5066 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 5067 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 5068 5069 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 5070 key.type = BTRFS_CHUNK_ITEM_KEY; 5071 key.offset = chunk_offset; 5072 5073 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 5074 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 5075 /* 5076 * TODO: Cleanup of inserted chunk root in case of 5077 * failure. 5078 */ 5079 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 5080 } 5081 5082 out: 5083 kfree(chunk); 5084 free_extent_map(em); 5085 return ret; 5086 } 5087 5088 /* 5089 * Chunk allocation falls into two parts. The first part does works 5090 * that make the new allocated chunk useable, but not do any operation 5091 * that modifies the chunk tree. The second part does the works that 5092 * require modifying the chunk tree. 
This division is important for the 5093 * bootstrap process of adding storage to a seed btrfs. 5094 */ 5095 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 5096 struct btrfs_fs_info *fs_info, u64 type) 5097 { 5098 u64 chunk_offset; 5099 5100 lockdep_assert_held(&fs_info->chunk_mutex); 5101 chunk_offset = find_next_chunk(fs_info); 5102 return __btrfs_alloc_chunk(trans, chunk_offset, type); 5103 } 5104 5105 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 5106 struct btrfs_fs_info *fs_info) 5107 { 5108 u64 chunk_offset; 5109 u64 sys_chunk_offset; 5110 u64 alloc_profile; 5111 int ret; 5112 5113 chunk_offset = find_next_chunk(fs_info); 5114 alloc_profile = btrfs_metadata_alloc_profile(fs_info); 5115 ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); 5116 if (ret) 5117 return ret; 5118 5119 sys_chunk_offset = find_next_chunk(fs_info); 5120 alloc_profile = btrfs_system_alloc_profile(fs_info); 5121 ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); 5122 return ret; 5123 } 5124 5125 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5126 { 5127 int max_errors; 5128 5129 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5130 BTRFS_BLOCK_GROUP_RAID10 | 5131 BTRFS_BLOCK_GROUP_RAID5 | 5132 BTRFS_BLOCK_GROUP_DUP)) { 5133 max_errors = 1; 5134 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { 5135 max_errors = 2; 5136 } else { 5137 max_errors = 0; 5138 } 5139 5140 return max_errors; 5141 } 5142 5143 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5144 { 5145 struct extent_map *em; 5146 struct map_lookup *map; 5147 int readonly = 0; 5148 int miss_ndevs = 0; 5149 int i; 5150 5151 em = get_chunk_map(fs_info, chunk_offset, 1); 5152 if (IS_ERR(em)) 5153 return 1; 5154 5155 map = em->map_lookup; 5156 for (i = 0; i < map->num_stripes; i++) { 5157 if (test_bit(BTRFS_DEV_STATE_MISSING, 5158 &map->stripes[i].dev->dev_state)) { 5159 miss_ndevs++; 5160 continue; 5161 } 5162 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5163 &map->stripes[i].dev->dev_state)) { 5164 readonly = 1; 5165 goto end; 5166 } 5167 } 5168 5169 /* 5170 * If the number of missing devices is larger than max errors, 5171 * we can not write the data into that chunk successfully, so 5172 * set it readonly. 5173 */ 5174 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5175 readonly = 1; 5176 end: 5177 free_extent_map(em); 5178 return readonly; 5179 } 5180 5181 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 5182 { 5183 extent_map_tree_init(&tree->map_tree); 5184 } 5185 5186 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 5187 { 5188 struct extent_map *em; 5189 5190 while (1) { 5191 write_lock(&tree->map_tree.lock); 5192 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 5193 if (em) 5194 remove_extent_mapping(&tree->map_tree, em); 5195 write_unlock(&tree->map_tree.lock); 5196 if (!em) 5197 break; 5198 /* once for us */ 5199 free_extent_map(em); 5200 /* once for the tree */ 5201 free_extent_map(em); 5202 } 5203 } 5204 5205 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5206 { 5207 struct extent_map *em; 5208 struct map_lookup *map; 5209 int ret; 5210 5211 em = get_chunk_map(fs_info, logical, len); 5212 if (IS_ERR(em)) 5213 /* 5214 * We could return errors for these cases, but that could get 5215 * ugly and we'd probably do the same thing which is just not do 5216 * anything else and exit, so return 1 so the callers don't try 5217 * to use other copies. 
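 * A copy count of 1 is always safe, since every profile provides at
 * least one copy of the data.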
5218 */ 5219 return 1; 5220 5221 map = em->map_lookup; 5222 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 5223 ret = map->num_stripes; 5224 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5225 ret = map->sub_stripes; 5226 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5227 ret = 2; 5228 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5229 /* 5230 * There could be two corrupted data stripes, we need 5231 * to loop retry in order to rebuild the correct data. 5232 * 5233 * Fail a stripe at a time on every retry except the 5234 * stripe under reconstruction. 5235 */ 5236 ret = map->num_stripes; 5237 else 5238 ret = 1; 5239 free_extent_map(em); 5240 5241 btrfs_dev_replace_read_lock(&fs_info->dev_replace); 5242 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 5243 fs_info->dev_replace.tgtdev) 5244 ret++; 5245 btrfs_dev_replace_read_unlock(&fs_info->dev_replace); 5246 5247 return ret; 5248 } 5249 5250 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 5251 u64 logical) 5252 { 5253 struct extent_map *em; 5254 struct map_lookup *map; 5255 unsigned long len = fs_info->sectorsize; 5256 5257 em = get_chunk_map(fs_info, logical, len); 5258 5259 if (!WARN_ON(IS_ERR(em))) { 5260 map = em->map_lookup; 5261 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5262 len = map->stripe_len * nr_data_stripes(map); 5263 free_extent_map(em); 5264 } 5265 return len; 5266 } 5267 5268 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5269 { 5270 struct extent_map *em; 5271 struct map_lookup *map; 5272 int ret = 0; 5273 5274 em = get_chunk_map(fs_info, logical, len); 5275 5276 if(!WARN_ON(IS_ERR(em))) { 5277 map = em->map_lookup; 5278 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5279 ret = 1; 5280 free_extent_map(em); 5281 } 5282 return ret; 5283 } 5284 5285 static int find_live_mirror(struct btrfs_fs_info *fs_info, 5286 struct map_lookup *map, int first, 5287 int dev_replace_is_ongoing) 5288 { 5289 int i; 5290 int num_stripes; 5291 int preferred_mirror; 5292 int tolerance; 5293 struct btrfs_device *srcdev; 5294 5295 ASSERT((map->type & 5296 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); 5297 5298 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5299 num_stripes = map->sub_stripes; 5300 else 5301 num_stripes = map->num_stripes; 5302 5303 preferred_mirror = first + current->pid % num_stripes; 5304 5305 if (dev_replace_is_ongoing && 5306 fs_info->dev_replace.cont_reading_from_srcdev_mode == 5307 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 5308 srcdev = fs_info->dev_replace.srcdev; 5309 else 5310 srcdev = NULL; 5311 5312 /* 5313 * try to avoid the drive that is the source drive for a 5314 * dev-replace procedure, only choose it if no other non-missing 5315 * mirror is available 5316 */ 5317 for (tolerance = 0; tolerance < 2; tolerance++) { 5318 if (map->stripes[preferred_mirror].dev->bdev && 5319 (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 5320 return preferred_mirror; 5321 for (i = first; i < first + num_stripes; i++) { 5322 if (map->stripes[i].dev->bdev && 5323 (tolerance || map->stripes[i].dev != srcdev)) 5324 return i; 5325 } 5326 } 5327 5328 /* we couldn't find one that doesn't fail. 
Just return something 5329 * and the io error handling code will clean up eventually 5330 */ 5331 return preferred_mirror; 5332 } 5333 5334 static inline int parity_smaller(u64 a, u64 b) 5335 { 5336 return a > b; 5337 } 5338 5339 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 5340 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) 5341 { 5342 struct btrfs_bio_stripe s; 5343 int i; 5344 u64 l; 5345 int again = 1; 5346 5347 while (again) { 5348 again = 0; 5349 for (i = 0; i < num_stripes - 1; i++) { 5350 if (parity_smaller(bbio->raid_map[i], 5351 bbio->raid_map[i+1])) { 5352 s = bbio->stripes[i]; 5353 l = bbio->raid_map[i]; 5354 bbio->stripes[i] = bbio->stripes[i+1]; 5355 bbio->raid_map[i] = bbio->raid_map[i+1]; 5356 bbio->stripes[i+1] = s; 5357 bbio->raid_map[i+1] = l; 5358 5359 again = 1; 5360 } 5361 } 5362 } 5363 } 5364 5365 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) 5366 { 5367 struct btrfs_bio *bbio = kzalloc( 5368 /* the size of the btrfs_bio */ 5369 sizeof(struct btrfs_bio) + 5370 /* plus the variable array for the stripes */ 5371 sizeof(struct btrfs_bio_stripe) * (total_stripes) + 5372 /* plus the variable array for the tgt dev */ 5373 sizeof(int) * (real_stripes) + 5374 /* 5375 * plus the raid_map, which includes both the tgt dev 5376 * and the stripes 5377 */ 5378 sizeof(u64) * (total_stripes), 5379 GFP_NOFS|__GFP_NOFAIL); 5380 5381 atomic_set(&bbio->error, 0); 5382 refcount_set(&bbio->refs, 1); 5383 5384 return bbio; 5385 } 5386 5387 void btrfs_get_bbio(struct btrfs_bio *bbio) 5388 { 5389 WARN_ON(!refcount_read(&bbio->refs)); 5390 refcount_inc(&bbio->refs); 5391 } 5392 5393 void btrfs_put_bbio(struct btrfs_bio *bbio) 5394 { 5395 if (!bbio) 5396 return; 5397 if (refcount_dec_and_test(&bbio->refs)) 5398 kfree(bbio); 5399 } 5400 5401 /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ 5402 /* 5403 * Please note that, discard won't be sent to target device of device 5404 * replace. 
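 * Consequently no tgtdev entries are set up here: alloc_btrfs_bio() is
 * called with real_stripes == 0.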
5405 */ 5406 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, 5407 u64 logical, u64 length, 5408 struct btrfs_bio **bbio_ret) 5409 { 5410 struct extent_map *em; 5411 struct map_lookup *map; 5412 struct btrfs_bio *bbio; 5413 u64 offset; 5414 u64 stripe_nr; 5415 u64 stripe_nr_end; 5416 u64 stripe_end_offset; 5417 u64 stripe_cnt; 5418 u64 stripe_len; 5419 u64 stripe_offset; 5420 u64 num_stripes; 5421 u32 stripe_index; 5422 u32 factor = 0; 5423 u32 sub_stripes = 0; 5424 u64 stripes_per_dev = 0; 5425 u32 remaining_stripes = 0; 5426 u32 last_stripe = 0; 5427 int ret = 0; 5428 int i; 5429 5430 /* discard always return a bbio */ 5431 ASSERT(bbio_ret); 5432 5433 em = get_chunk_map(fs_info, logical, length); 5434 if (IS_ERR(em)) 5435 return PTR_ERR(em); 5436 5437 map = em->map_lookup; 5438 /* we don't discard raid56 yet */ 5439 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5440 ret = -EOPNOTSUPP; 5441 goto out; 5442 } 5443 5444 offset = logical - em->start; 5445 length = min_t(u64, em->len - offset, length); 5446 5447 stripe_len = map->stripe_len; 5448 /* 5449 * stripe_nr counts the total number of stripes we have to stride 5450 * to get to this block 5451 */ 5452 stripe_nr = div64_u64(offset, stripe_len); 5453 5454 /* stripe_offset is the offset of this block in its stripe */ 5455 stripe_offset = offset - stripe_nr * stripe_len; 5456 5457 stripe_nr_end = round_up(offset + length, map->stripe_len); 5458 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 5459 stripe_cnt = stripe_nr_end - stripe_nr; 5460 stripe_end_offset = stripe_nr_end * map->stripe_len - 5461 (offset + length); 5462 /* 5463 * after this, stripe_nr is the number of stripes on this 5464 * device we have to walk to find the data, and stripe_index is 5465 * the number of our device in the stripe array 5466 */ 5467 num_stripes = 1; 5468 stripe_index = 0; 5469 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5470 BTRFS_BLOCK_GROUP_RAID10)) { 5471 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5472 sub_stripes = 1; 5473 else 5474 sub_stripes = map->sub_stripes; 5475 5476 factor = map->num_stripes / sub_stripes; 5477 num_stripes = min_t(u64, map->num_stripes, 5478 sub_stripes * stripe_cnt); 5479 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5480 stripe_index *= sub_stripes; 5481 stripes_per_dev = div_u64_rem(stripe_cnt, factor, 5482 &remaining_stripes); 5483 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 5484 last_stripe *= sub_stripes; 5485 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5486 BTRFS_BLOCK_GROUP_DUP)) { 5487 num_stripes = map->num_stripes; 5488 } else { 5489 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5490 &stripe_index); 5491 } 5492 5493 bbio = alloc_btrfs_bio(num_stripes, 0); 5494 if (!bbio) { 5495 ret = -ENOMEM; 5496 goto out; 5497 } 5498 5499 for (i = 0; i < num_stripes; i++) { 5500 bbio->stripes[i].physical = 5501 map->stripes[stripe_index].physical + 5502 stripe_offset + stripe_nr * map->stripe_len; 5503 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 5504 5505 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5506 BTRFS_BLOCK_GROUP_RAID10)) { 5507 bbio->stripes[i].length = stripes_per_dev * 5508 map->stripe_len; 5509 5510 if (i / sub_stripes < remaining_stripes) 5511 bbio->stripes[i].length += 5512 map->stripe_len; 5513 5514 /* 5515 * Special for the first stripe and 5516 * the last stripe: 5517 * 5518 * |-------|...|-------| 5519 * |----------| 5520 * off end_off 5521 */ 5522 if (i < sub_stripes) 5523 bbio->stripes[i].length -= 5524 stripe_offset; 5525 5526 if 
(stripe_index >= last_stripe && 5527 stripe_index <= (last_stripe + 5528 sub_stripes - 1)) 5529 bbio->stripes[i].length -= 5530 stripe_end_offset; 5531 5532 if (i == sub_stripes - 1) 5533 stripe_offset = 0; 5534 } else { 5535 bbio->stripes[i].length = length; 5536 } 5537 5538 stripe_index++; 5539 if (stripe_index == map->num_stripes) { 5540 stripe_index = 0; 5541 stripe_nr++; 5542 } 5543 } 5544 5545 *bbio_ret = bbio; 5546 bbio->map_type = map->type; 5547 bbio->num_stripes = num_stripes; 5548 out: 5549 free_extent_map(em); 5550 return ret; 5551 } 5552 5553 /* 5554 * In dev-replace case, for repair case (that's the only case where the mirror 5555 * is selected explicitly when calling btrfs_map_block), blocks left of the 5556 * left cursor can also be read from the target drive. 5557 * 5558 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the 5559 * array of stripes. 5560 * For READ, it also needs to be supported using the same mirror number. 5561 * 5562 * If the requested block is not left of the left cursor, EIO is returned. This 5563 * can happen because btrfs_num_copies() returns one more in the dev-replace 5564 * case. 5565 */ 5566 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, 5567 u64 logical, u64 length, 5568 u64 srcdev_devid, int *mirror_num, 5569 u64 *physical) 5570 { 5571 struct btrfs_bio *bbio = NULL; 5572 int num_stripes; 5573 int index_srcdev = 0; 5574 int found = 0; 5575 u64 physical_of_found = 0; 5576 int i; 5577 int ret = 0; 5578 5579 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 5580 logical, &length, &bbio, 0, 0); 5581 if (ret) { 5582 ASSERT(bbio == NULL); 5583 return ret; 5584 } 5585 5586 num_stripes = bbio->num_stripes; 5587 if (*mirror_num > num_stripes) { 5588 /* 5589 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, 5590 * that means that the requested area is not left of the left 5591 * cursor 5592 */ 5593 btrfs_put_bbio(bbio); 5594 return -EIO; 5595 } 5596 5597 /* 5598 * process the rest of the function using the mirror_num of the source 5599 * drive. Therefore look it up first. At the end, patch the device 5600 * pointer to the one of the target drive. 5601 */ 5602 for (i = 0; i < num_stripes; i++) { 5603 if (bbio->stripes[i].dev->devid != srcdev_devid) 5604 continue; 5605 5606 /* 5607 * In case of DUP, in order to keep it simple, only add the 5608 * mirror with the lowest physical address 5609 */ 5610 if (found && 5611 physical_of_found <= bbio->stripes[i].physical) 5612 continue; 5613 5614 index_srcdev = i; 5615 found = 1; 5616 physical_of_found = bbio->stripes[i].physical; 5617 } 5618 5619 btrfs_put_bbio(bbio); 5620 5621 ASSERT(found); 5622 if (!found) 5623 return -EIO; 5624 5625 *mirror_num = index_srcdev + 1; 5626 *physical = physical_of_found; 5627 return ret; 5628 } 5629 5630 static void handle_ops_on_dev_replace(enum btrfs_map_op op, 5631 struct btrfs_bio **bbio_ret, 5632 struct btrfs_dev_replace *dev_replace, 5633 int *num_stripes_ret, int *max_errors_ret) 5634 { 5635 struct btrfs_bio *bbio = *bbio_ret; 5636 u64 srcdev_devid = dev_replace->srcdev->devid; 5637 int tgtdev_indexes = 0; 5638 int num_stripes = *num_stripes_ret; 5639 int max_errors = *max_errors_ret; 5640 int i; 5641 5642 if (op == BTRFS_MAP_WRITE) { 5643 int index_where_to_add; 5644 5645 /* 5646 * duplicate the write operations while the dev replace 5647 * procedure is running. 
Since the copying of the old disk to 5648 * the new disk takes place at run time while the filesystem is 5649 * mounted writable, the regular write operations to the old 5650 * disk have to be duplicated to go to the new disk as well. 5651 * 5652 * Note that device->missing is handled by the caller, and that 5653 * the write to the old disk is already set up in the stripes 5654 * array. 5655 */ 5656 index_where_to_add = num_stripes; 5657 for (i = 0; i < num_stripes; i++) { 5658 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5659 /* write to new disk, too */ 5660 struct btrfs_bio_stripe *new = 5661 bbio->stripes + index_where_to_add; 5662 struct btrfs_bio_stripe *old = 5663 bbio->stripes + i; 5664 5665 new->physical = old->physical; 5666 new->length = old->length; 5667 new->dev = dev_replace->tgtdev; 5668 bbio->tgtdev_map[i] = index_where_to_add; 5669 index_where_to_add++; 5670 max_errors++; 5671 tgtdev_indexes++; 5672 } 5673 } 5674 num_stripes = index_where_to_add; 5675 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 5676 int index_srcdev = 0; 5677 int found = 0; 5678 u64 physical_of_found = 0; 5679 5680 /* 5681 * During the dev-replace procedure, the target drive can also 5682 * be used to read data in case it is needed to repair a corrupt 5683 * block elsewhere. This is possible if the requested area is 5684 * left of the left cursor. In this area, the target drive is a 5685 * full copy of the source drive. 5686 */ 5687 for (i = 0; i < num_stripes; i++) { 5688 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5689 /* 5690 * In case of DUP, in order to keep it simple, 5691 * only add the mirror with the lowest physical 5692 * address 5693 */ 5694 if (found && 5695 physical_of_found <= 5696 bbio->stripes[i].physical) 5697 continue; 5698 index_srcdev = i; 5699 found = 1; 5700 physical_of_found = bbio->stripes[i].physical; 5701 } 5702 } 5703 if (found) { 5704 struct btrfs_bio_stripe *tgtdev_stripe = 5705 bbio->stripes + num_stripes; 5706 5707 tgtdev_stripe->physical = physical_of_found; 5708 tgtdev_stripe->length = 5709 bbio->stripes[index_srcdev].length; 5710 tgtdev_stripe->dev = dev_replace->tgtdev; 5711 bbio->tgtdev_map[index_srcdev] = num_stripes; 5712 5713 tgtdev_indexes++; 5714 num_stripes++; 5715 } 5716 } 5717 5718 *num_stripes_ret = num_stripes; 5719 *max_errors_ret = max_errors; 5720 bbio->num_tgtdevs = tgtdev_indexes; 5721 *bbio_ret = bbio; 5722 } 5723 5724 static bool need_full_stripe(enum btrfs_map_op op) 5725 { 5726 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 5727 } 5728 5729 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 5730 enum btrfs_map_op op, 5731 u64 logical, u64 *length, 5732 struct btrfs_bio **bbio_ret, 5733 int mirror_num, int need_raid_map) 5734 { 5735 struct extent_map *em; 5736 struct map_lookup *map; 5737 u64 offset; 5738 u64 stripe_offset; 5739 u64 stripe_nr; 5740 u64 stripe_len; 5741 u32 stripe_index; 5742 int i; 5743 int ret = 0; 5744 int num_stripes; 5745 int max_errors = 0; 5746 int tgtdev_indexes = 0; 5747 struct btrfs_bio *bbio = NULL; 5748 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 5749 int dev_replace_is_ongoing = 0; 5750 int num_alloc_stripes; 5751 int patch_the_first_stripe_for_dev_replace = 0; 5752 u64 physical_to_patch_in_first_stripe = 0; 5753 u64 raid56_full_stripe_start = (u64)-1; 5754 5755 if (op == BTRFS_MAP_DISCARD) 5756 return __btrfs_map_block_for_discard(fs_info, logical, 5757 *length, bbio_ret); 5758 5759 em = get_chunk_map(fs_info, logical, *length); 5760 if (IS_ERR(em)) 5761 
return PTR_ERR(em); 5762 5763 map = em->map_lookup; 5764 offset = logical - em->start; 5765 5766 stripe_len = map->stripe_len; 5767 stripe_nr = offset; 5768 /* 5769 * stripe_nr counts the total number of stripes we have to stride 5770 * to get to this block 5771 */ 5772 stripe_nr = div64_u64(stripe_nr, stripe_len); 5773 5774 stripe_offset = stripe_nr * stripe_len; 5775 if (offset < stripe_offset) { 5776 btrfs_crit(fs_info, 5777 "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu", 5778 stripe_offset, offset, em->start, logical, 5779 stripe_len); 5780 free_extent_map(em); 5781 return -EINVAL; 5782 } 5783 5784 /* stripe_offset is the offset of this block in its stripe*/ 5785 stripe_offset = offset - stripe_offset; 5786 5787 /* if we're here for raid56, we need to know the stripe aligned start */ 5788 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5789 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 5790 raid56_full_stripe_start = offset; 5791 5792 /* allow a write of a full stripe, but make sure we don't 5793 * allow straddling of stripes 5794 */ 5795 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 5796 full_stripe_len); 5797 raid56_full_stripe_start *= full_stripe_len; 5798 } 5799 5800 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 5801 u64 max_len; 5802 /* For writes to RAID[56], allow a full stripeset across all disks. 5803 For other RAID types and for RAID[56] reads, just allow a single 5804 stripe (on a single disk). */ 5805 if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 5806 (op == BTRFS_MAP_WRITE)) { 5807 max_len = stripe_len * nr_data_stripes(map) - 5808 (offset - raid56_full_stripe_start); 5809 } else { 5810 /* we limit the length of each bio to what fits in a stripe */ 5811 max_len = stripe_len - stripe_offset; 5812 } 5813 *length = min_t(u64, em->len - offset, max_len); 5814 } else { 5815 *length = em->len - offset; 5816 } 5817 5818 /* This is for when we're called from btrfs_merge_bio_hook() and all 5819 it cares about is the length */ 5820 if (!bbio_ret) 5821 goto out; 5822 5823 btrfs_dev_replace_read_lock(dev_replace); 5824 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 5825 if (!dev_replace_is_ongoing) 5826 btrfs_dev_replace_read_unlock(dev_replace); 5827 else 5828 btrfs_dev_replace_set_lock_blocking(dev_replace); 5829 5830 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 5831 !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 5832 ret = get_extra_mirror_from_replace(fs_info, logical, *length, 5833 dev_replace->srcdev->devid, 5834 &mirror_num, 5835 &physical_to_patch_in_first_stripe); 5836 if (ret) 5837 goto out; 5838 else 5839 patch_the_first_stripe_for_dev_replace = 1; 5840 } else if (mirror_num > map->num_stripes) { 5841 mirror_num = 0; 5842 } 5843 5844 num_stripes = 1; 5845 stripe_index = 0; 5846 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5847 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5848 &stripe_index); 5849 if (!need_full_stripe(op)) 5850 mirror_num = 1; 5851 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 5852 if (need_full_stripe(op)) 5853 num_stripes = map->num_stripes; 5854 else if (mirror_num) 5855 stripe_index = mirror_num - 1; 5856 else { 5857 stripe_index = find_live_mirror(fs_info, map, 0, 5858 dev_replace_is_ongoing); 5859 mirror_num = stripe_index + 1; 5860 } 5861 5862 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 5863 if (need_full_stripe(op)) { 5864 num_stripes = map->num_stripes; 5865 } else if 
(mirror_num) { 5866 stripe_index = mirror_num - 1; 5867 } else { 5868 mirror_num = 1; 5869 } 5870 5871 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5872 u32 factor = map->num_stripes / map->sub_stripes; 5873 5874 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5875 stripe_index *= map->sub_stripes; 5876 5877 if (need_full_stripe(op)) 5878 num_stripes = map->sub_stripes; 5879 else if (mirror_num) 5880 stripe_index += mirror_num - 1; 5881 else { 5882 int old_stripe_index = stripe_index; 5883 stripe_index = find_live_mirror(fs_info, map, 5884 stripe_index, 5885 dev_replace_is_ongoing); 5886 mirror_num = stripe_index - old_stripe_index + 1; 5887 } 5888 5889 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5890 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 5891 /* push stripe_nr back to the start of the full stripe */ 5892 stripe_nr = div64_u64(raid56_full_stripe_start, 5893 stripe_len * nr_data_stripes(map)); 5894 5895 /* RAID[56] write or recovery. Return all stripes */ 5896 num_stripes = map->num_stripes; 5897 max_errors = nr_parity_stripes(map); 5898 5899 *length = map->stripe_len; 5900 stripe_index = 0; 5901 stripe_offset = 0; 5902 } else { 5903 /* 5904 * Mirror #0 or #1 means the original data block. 5905 * Mirror #2 is RAID5 parity block. 5906 * Mirror #3 is RAID6 Q block. 5907 */ 5908 stripe_nr = div_u64_rem(stripe_nr, 5909 nr_data_stripes(map), &stripe_index); 5910 if (mirror_num > 1) 5911 stripe_index = nr_data_stripes(map) + 5912 mirror_num - 2; 5913 5914 /* We distribute the parity blocks across stripes */ 5915 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 5916 &stripe_index); 5917 if (!need_full_stripe(op) && mirror_num <= 1) 5918 mirror_num = 1; 5919 } 5920 } else { 5921 /* 5922 * after this, stripe_nr is the number of stripes on this 5923 * device we have to walk to find the data, and stripe_index is 5924 * the number of our device in the stripe array 5925 */ 5926 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5927 &stripe_index); 5928 mirror_num = stripe_index + 1; 5929 } 5930 if (stripe_index >= map->num_stripes) { 5931 btrfs_crit(fs_info, 5932 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 5933 stripe_index, map->num_stripes); 5934 ret = -EINVAL; 5935 goto out; 5936 } 5937 5938 num_alloc_stripes = num_stripes; 5939 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 5940 if (op == BTRFS_MAP_WRITE) 5941 num_alloc_stripes <<= 1; 5942 if (op == BTRFS_MAP_GET_READ_MIRRORS) 5943 num_alloc_stripes++; 5944 tgtdev_indexes = num_stripes; 5945 } 5946 5947 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); 5948 if (!bbio) { 5949 ret = -ENOMEM; 5950 goto out; 5951 } 5952 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) 5953 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); 5954 5955 /* build raid_map */ 5956 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 5957 (need_full_stripe(op) || mirror_num > 1)) { 5958 u64 tmp; 5959 unsigned rot; 5960 5961 bbio->raid_map = (u64 *)((void *)bbio->stripes + 5962 sizeof(struct btrfs_bio_stripe) * 5963 num_alloc_stripes + 5964 sizeof(int) * tgtdev_indexes); 5965 5966 /* Work out the disk rotation on this stripe-set */ 5967 div_u64_rem(stripe_nr, num_stripes, &rot); 5968 5969 /* Fill in the logical address of each stripe */ 5970 tmp = stripe_nr * nr_data_stripes(map); 5971 for (i = 0; i < nr_data_stripes(map); i++) 5972 bbio->raid_map[(i+rot) % num_stripes] = 5973 em->start + (tmp + i) * map->stripe_len; 5974 
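/*
 * The loop leaves i == nr_data_stripes(map), so the next rotated slot is
 * tagged as the P stripe and, for RAID6, the one after it as the Q
 * stripe.
 */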
5975 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 5976 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5977 bbio->raid_map[(i+rot+1) % num_stripes] = 5978 RAID6_Q_STRIPE; 5979 } 5980 5981 5982 for (i = 0; i < num_stripes; i++) { 5983 bbio->stripes[i].physical = 5984 map->stripes[stripe_index].physical + 5985 stripe_offset + 5986 stripe_nr * map->stripe_len; 5987 bbio->stripes[i].dev = 5988 map->stripes[stripe_index].dev; 5989 stripe_index++; 5990 } 5991 5992 if (need_full_stripe(op)) 5993 max_errors = btrfs_chunk_max_errors(map); 5994 5995 if (bbio->raid_map) 5996 sort_parity_stripes(bbio, num_stripes); 5997 5998 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 5999 need_full_stripe(op)) { 6000 handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, 6001 &max_errors); 6002 } 6003 6004 *bbio_ret = bbio; 6005 bbio->map_type = map->type; 6006 bbio->num_stripes = num_stripes; 6007 bbio->max_errors = max_errors; 6008 bbio->mirror_num = mirror_num; 6009 6010 /* 6011 * this is the case that REQ_READ && dev_replace_is_ongoing && 6012 * mirror_num == num_stripes + 1 && dev_replace target drive is 6013 * available as a mirror 6014 */ 6015 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 6016 WARN_ON(num_stripes > 1); 6017 bbio->stripes[0].dev = dev_replace->tgtdev; 6018 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 6019 bbio->mirror_num = map->num_stripes + 1; 6020 } 6021 out: 6022 if (dev_replace_is_ongoing) { 6023 btrfs_dev_replace_clear_lock_blocking(dev_replace); 6024 btrfs_dev_replace_read_unlock(dev_replace); 6025 } 6026 free_extent_map(em); 6027 return ret; 6028 } 6029 6030 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6031 u64 logical, u64 *length, 6032 struct btrfs_bio **bbio_ret, int mirror_num) 6033 { 6034 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 6035 mirror_num, 0); 6036 } 6037 6038 /* For Scrub/replace */ 6039 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6040 u64 logical, u64 *length, 6041 struct btrfs_bio **bbio_ret) 6042 { 6043 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); 6044 } 6045 6046 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, 6047 u64 chunk_start, u64 physical, u64 devid, 6048 u64 **logical, int *naddrs, int *stripe_len) 6049 { 6050 struct extent_map *em; 6051 struct map_lookup *map; 6052 u64 *buf; 6053 u64 bytenr; 6054 u64 length; 6055 u64 stripe_nr; 6056 u64 rmap_len; 6057 int i, j, nr = 0; 6058 6059 em = get_chunk_map(fs_info, chunk_start, 1); 6060 if (IS_ERR(em)) 6061 return -EIO; 6062 6063 map = em->map_lookup; 6064 length = em->len; 6065 rmap_len = map->stripe_len; 6066 6067 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 6068 length = div_u64(length, map->num_stripes / map->sub_stripes); 6069 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 6070 length = div_u64(length, map->num_stripes); 6071 else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6072 length = div_u64(length, nr_data_stripes(map)); 6073 rmap_len = map->stripe_len * nr_data_stripes(map); 6074 } 6075 6076 buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); 6077 BUG_ON(!buf); /* -ENOMEM */ 6078 6079 for (i = 0; i < map->num_stripes; i++) { 6080 if (devid && map->stripes[i].dev->devid != devid) 6081 continue; 6082 if (map->stripes[i].physical > physical || 6083 map->stripes[i].physical + length <= physical) 6084 continue; 6085 6086 stripe_nr = physical - map->stripes[i].physical; 6087 stripe_nr = div64_u64(stripe_nr, map->stripe_len); 6088 
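		/*
		 * stripe_nr is now the stripe number within this device's
		 * part of the chunk.  The per-profile math below converts it
		 * back into a chunk-relative stripe number so that
		 * bytenr = chunk_start + stripe_nr * rmap_len is the logical
		 * address that maps to the given physical offset on this
		 * device.
		 */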
6089 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 6090 stripe_nr = stripe_nr * map->num_stripes + i; 6091 stripe_nr = div_u64(stripe_nr, map->sub_stripes); 6092 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 6093 stripe_nr = stripe_nr * map->num_stripes + i; 6094 } /* else if RAID[56], multiply by nr_data_stripes(). 6095 * Alternatively, just use rmap_len below instead of 6096 * map->stripe_len */ 6097 6098 bytenr = chunk_start + stripe_nr * rmap_len; 6099 WARN_ON(nr >= map->num_stripes); 6100 for (j = 0; j < nr; j++) { 6101 if (buf[j] == bytenr) 6102 break; 6103 } 6104 if (j == nr) { 6105 WARN_ON(nr >= map->num_stripes); 6106 buf[nr++] = bytenr; 6107 } 6108 } 6109 6110 *logical = buf; 6111 *naddrs = nr; 6112 *stripe_len = rmap_len; 6113 6114 free_extent_map(em); 6115 return 0; 6116 } 6117 6118 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) 6119 { 6120 bio->bi_private = bbio->private; 6121 bio->bi_end_io = bbio->end_io; 6122 bio_endio(bio); 6123 6124 btrfs_put_bbio(bbio); 6125 } 6126 6127 static void btrfs_end_bio(struct bio *bio) 6128 { 6129 struct btrfs_bio *bbio = bio->bi_private; 6130 int is_orig_bio = 0; 6131 6132 if (bio->bi_status) { 6133 atomic_inc(&bbio->error); 6134 if (bio->bi_status == BLK_STS_IOERR || 6135 bio->bi_status == BLK_STS_TARGET) { 6136 unsigned int stripe_index = 6137 btrfs_io_bio(bio)->stripe_index; 6138 struct btrfs_device *dev; 6139 6140 BUG_ON(stripe_index >= bbio->num_stripes); 6141 dev = bbio->stripes[stripe_index].dev; 6142 if (dev->bdev) { 6143 if (bio_op(bio) == REQ_OP_WRITE) 6144 btrfs_dev_stat_inc_and_print(dev, 6145 BTRFS_DEV_STAT_WRITE_ERRS); 6146 else 6147 btrfs_dev_stat_inc_and_print(dev, 6148 BTRFS_DEV_STAT_READ_ERRS); 6149 if (bio->bi_opf & REQ_PREFLUSH) 6150 btrfs_dev_stat_inc_and_print(dev, 6151 BTRFS_DEV_STAT_FLUSH_ERRS); 6152 } 6153 } 6154 } 6155 6156 if (bio == bbio->orig_bio) 6157 is_orig_bio = 1; 6158 6159 btrfs_bio_counter_dec(bbio->fs_info); 6160 6161 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6162 if (!is_orig_bio) { 6163 bio_put(bio); 6164 bio = bbio->orig_bio; 6165 } 6166 6167 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6168 /* only send an error to the higher layers if it is 6169 * beyond the tolerance of the btrfs bio 6170 */ 6171 if (atomic_read(&bbio->error) > bbio->max_errors) { 6172 bio->bi_status = BLK_STS_IOERR; 6173 } else { 6174 /* 6175 * this bio is actually up to date, we didn't 6176 * go over the max number of errors 6177 */ 6178 bio->bi_status = BLK_STS_OK; 6179 } 6180 6181 btrfs_end_bbio(bbio, bio); 6182 } else if (!is_orig_bio) { 6183 bio_put(bio); 6184 } 6185 } 6186 6187 /* 6188 * see run_scheduled_bios for a description of why bios are collected for 6189 * async submit. 6190 * 6191 * This will add one bio to the pending list for a device and make sure 6192 * the work struct is scheduled. 
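 *
 * Reads bypass the queue and are submitted directly; writes are
 * appended to either pending_sync_bios or pending_bios depending on
 * op_is_sync(), and the device's worker is only queued if it is not
 * already processing its pending list.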
6193 */ 6194 static noinline void btrfs_schedule_bio(struct btrfs_device *device, 6195 struct bio *bio) 6196 { 6197 struct btrfs_fs_info *fs_info = device->fs_info; 6198 int should_queue = 1; 6199 struct btrfs_pending_bios *pending_bios; 6200 6201 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || 6202 !device->bdev) { 6203 bio_io_error(bio); 6204 return; 6205 } 6206 6207 /* don't bother with additional async steps for reads, right now */ 6208 if (bio_op(bio) == REQ_OP_READ) { 6209 btrfsic_submit_bio(bio); 6210 return; 6211 } 6212 6213 WARN_ON(bio->bi_next); 6214 bio->bi_next = NULL; 6215 6216 spin_lock(&device->io_lock); 6217 if (op_is_sync(bio->bi_opf)) 6218 pending_bios = &device->pending_sync_bios; 6219 else 6220 pending_bios = &device->pending_bios; 6221 6222 if (pending_bios->tail) 6223 pending_bios->tail->bi_next = bio; 6224 6225 pending_bios->tail = bio; 6226 if (!pending_bios->head) 6227 pending_bios->head = bio; 6228 if (device->running_pending) 6229 should_queue = 0; 6230 6231 spin_unlock(&device->io_lock); 6232 6233 if (should_queue) 6234 btrfs_queue_work(fs_info->submit_workers, &device->work); 6235 } 6236 6237 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, 6238 u64 physical, int dev_nr, int async) 6239 { 6240 struct btrfs_device *dev = bbio->stripes[dev_nr].dev; 6241 struct btrfs_fs_info *fs_info = bbio->fs_info; 6242 6243 bio->bi_private = bbio; 6244 btrfs_io_bio(bio)->stripe_index = dev_nr; 6245 bio->bi_end_io = btrfs_end_bio; 6246 bio->bi_iter.bi_sector = physical >> 9; 6247 #ifdef DEBUG 6248 { 6249 struct rcu_string *name; 6250 6251 rcu_read_lock(); 6252 name = rcu_dereference(dev->name); 6253 btrfs_debug(fs_info, 6254 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6255 bio_op(bio), bio->bi_opf, 6256 (u64)bio->bi_iter.bi_sector, 6257 (u_long)dev->bdev->bd_dev, name->str, dev->devid, 6258 bio->bi_iter.bi_size); 6259 rcu_read_unlock(); 6260 } 6261 #endif 6262 bio_set_dev(bio, dev->bdev); 6263 6264 btrfs_bio_counter_inc_noblocked(fs_info); 6265 6266 if (async) 6267 btrfs_schedule_bio(dev, bio); 6268 else 6269 btrfsic_submit_bio(bio); 6270 } 6271 6272 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 6273 { 6274 atomic_inc(&bbio->error); 6275 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6276 /* Should be the original bio. 
*/ 6277 WARN_ON(bio != bbio->orig_bio); 6278 6279 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6280 bio->bi_iter.bi_sector = logical >> 9; 6281 if (atomic_read(&bbio->error) > bbio->max_errors) 6282 bio->bi_status = BLK_STS_IOERR; 6283 else 6284 bio->bi_status = BLK_STS_OK; 6285 btrfs_end_bbio(bbio, bio); 6286 } 6287 } 6288 6289 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 6290 int mirror_num, int async_submit) 6291 { 6292 struct btrfs_device *dev; 6293 struct bio *first_bio = bio; 6294 u64 logical = (u64)bio->bi_iter.bi_sector << 9; 6295 u64 length = 0; 6296 u64 map_length; 6297 int ret; 6298 int dev_nr; 6299 int total_devs; 6300 struct btrfs_bio *bbio = NULL; 6301 6302 length = bio->bi_iter.bi_size; 6303 map_length = length; 6304 6305 btrfs_bio_counter_inc_blocked(fs_info); 6306 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, 6307 &map_length, &bbio, mirror_num, 1); 6308 if (ret) { 6309 btrfs_bio_counter_dec(fs_info); 6310 return errno_to_blk_status(ret); 6311 } 6312 6313 total_devs = bbio->num_stripes; 6314 bbio->orig_bio = first_bio; 6315 bbio->private = first_bio->bi_private; 6316 bbio->end_io = first_bio->bi_end_io; 6317 bbio->fs_info = fs_info; 6318 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 6319 6320 if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 6321 ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { 6322 /* In this case, map_length has been set to the length of 6323 a single stripe; not the whole write */ 6324 if (bio_op(bio) == REQ_OP_WRITE) { 6325 ret = raid56_parity_write(fs_info, bio, bbio, 6326 map_length); 6327 } else { 6328 ret = raid56_parity_recover(fs_info, bio, bbio, 6329 map_length, mirror_num, 1); 6330 } 6331 6332 btrfs_bio_counter_dec(fs_info); 6333 return errno_to_blk_status(ret); 6334 } 6335 6336 if (map_length < length) { 6337 btrfs_crit(fs_info, 6338 "mapping failed logical %llu bio len %llu len %llu", 6339 logical, length, map_length); 6340 BUG(); 6341 } 6342 6343 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { 6344 dev = bbio->stripes[dev_nr].dev; 6345 if (!dev || !dev->bdev || 6346 (bio_op(first_bio) == REQ_OP_WRITE && 6347 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { 6348 bbio_error(bbio, first_bio, logical); 6349 continue; 6350 } 6351 6352 if (dev_nr < total_devs - 1) 6353 bio = btrfs_bio_clone(first_bio); 6354 else 6355 bio = first_bio; 6356 6357 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, 6358 dev_nr, async_submit); 6359 } 6360 btrfs_bio_counter_dec(fs_info); 6361 return BLK_STS_OK; 6362 } 6363 6364 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 6365 u8 *uuid, u8 *fsid) 6366 { 6367 struct btrfs_device *device; 6368 struct btrfs_fs_devices *cur_devices; 6369 6370 cur_devices = fs_info->fs_devices; 6371 while (cur_devices) { 6372 if (!fsid || 6373 !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) { 6374 device = find_device(cur_devices, devid, uuid); 6375 if (device) 6376 return device; 6377 } 6378 cur_devices = cur_devices->seed; 6379 } 6380 return NULL; 6381 } 6382 6383 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6384 u64 devid, u8 *dev_uuid) 6385 { 6386 struct btrfs_device *device; 6387 6388 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6389 if (IS_ERR(device)) 6390 return device; 6391 6392 list_add(&device->dev_list, &fs_devices->devices); 6393 device->fs_devices = fs_devices; 6394 fs_devices->num_devices++; 6395 6396 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6397 
fs_devices->missing_devices++; 6398 6399 return device; 6400 } 6401 6402 /** 6403 * btrfs_alloc_device - allocate struct btrfs_device 6404 * @fs_info: used only for generating a new devid, can be NULL if 6405 * devid is provided (i.e. @devid != NULL). 6406 * @devid: a pointer to devid for this device. If NULL a new devid 6407 * is generated. 6408 * @uuid: a pointer to UUID for this device. If NULL a new UUID 6409 * is generated. 6410 * 6411 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 6412 * on error. Returned struct is not linked onto any lists and must be 6413 * destroyed with free_device. 6414 */ 6415 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 6416 const u64 *devid, 6417 const u8 *uuid) 6418 { 6419 struct btrfs_device *dev; 6420 u64 tmp; 6421 6422 if (WARN_ON(!devid && !fs_info)) 6423 return ERR_PTR(-EINVAL); 6424 6425 dev = __alloc_device(); 6426 if (IS_ERR(dev)) 6427 return dev; 6428 6429 if (devid) 6430 tmp = *devid; 6431 else { 6432 int ret; 6433 6434 ret = find_next_devid(fs_info, &tmp); 6435 if (ret) { 6436 free_device(dev); 6437 return ERR_PTR(ret); 6438 } 6439 } 6440 dev->devid = tmp; 6441 6442 if (uuid) 6443 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 6444 else 6445 generate_random_uuid(dev->uuid); 6446 6447 btrfs_init_work(&dev->work, btrfs_submit_helper, 6448 pending_bios_fn, NULL, NULL); 6449 6450 return dev; 6451 } 6452 6453 /* Return -EIO if any error, otherwise return 0. */ 6454 static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, 6455 struct extent_buffer *leaf, 6456 struct btrfs_chunk *chunk, u64 logical) 6457 { 6458 u64 length; 6459 u64 stripe_len; 6460 u16 num_stripes; 6461 u16 sub_stripes; 6462 u64 type; 6463 6464 length = btrfs_chunk_length(leaf, chunk); 6465 stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6466 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6467 sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6468 type = btrfs_chunk_type(leaf, chunk); 6469 6470 if (!num_stripes) { 6471 btrfs_err(fs_info, "invalid chunk num_stripes: %u", 6472 num_stripes); 6473 return -EIO; 6474 } 6475 if (!IS_ALIGNED(logical, fs_info->sectorsize)) { 6476 btrfs_err(fs_info, "invalid chunk logical %llu", logical); 6477 return -EIO; 6478 } 6479 if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { 6480 btrfs_err(fs_info, "invalid chunk sectorsize %u", 6481 btrfs_chunk_sector_size(leaf, chunk)); 6482 return -EIO; 6483 } 6484 if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { 6485 btrfs_err(fs_info, "invalid chunk length %llu", length); 6486 return -EIO; 6487 } 6488 if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { 6489 btrfs_err(fs_info, "invalid chunk stripe length: %llu", 6490 stripe_len); 6491 return -EIO; 6492 } 6493 if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & 6494 type) { 6495 btrfs_err(fs_info, "unrecognized chunk type: %llu", 6496 ~(BTRFS_BLOCK_GROUP_TYPE_MASK | 6497 BTRFS_BLOCK_GROUP_PROFILE_MASK) & 6498 btrfs_chunk_type(leaf, chunk)); 6499 return -EIO; 6500 } 6501 if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || 6502 (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || 6503 (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || 6504 (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || 6505 (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || 6506 ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && 6507 num_stripes != 1)) { 6508 btrfs_err(fs_info, 6509 "invalid num_stripes:sub_stripes %u:%u for profile %llu", 6510 num_stripes, 
sub_stripes, 6511 type & BTRFS_BLOCK_GROUP_PROFILE_MASK); 6512 return -EIO; 6513 } 6514 6515 return 0; 6516 } 6517 6518 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 6519 u64 devid, u8 *uuid, bool error) 6520 { 6521 if (error) 6522 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 6523 devid, uuid); 6524 else 6525 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 6526 devid, uuid); 6527 } 6528 6529 static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, 6530 struct extent_buffer *leaf, 6531 struct btrfs_chunk *chunk) 6532 { 6533 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 6534 struct map_lookup *map; 6535 struct extent_map *em; 6536 u64 logical; 6537 u64 length; 6538 u64 devid; 6539 u8 uuid[BTRFS_UUID_SIZE]; 6540 int num_stripes; 6541 int ret; 6542 int i; 6543 6544 logical = key->offset; 6545 length = btrfs_chunk_length(leaf, chunk); 6546 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6547 6548 ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical); 6549 if (ret) 6550 return ret; 6551 6552 read_lock(&map_tree->map_tree.lock); 6553 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 6554 read_unlock(&map_tree->map_tree.lock); 6555 6556 /* already mapped? */ 6557 if (em && em->start <= logical && em->start + em->len > logical) { 6558 free_extent_map(em); 6559 return 0; 6560 } else if (em) { 6561 free_extent_map(em); 6562 } 6563 6564 em = alloc_extent_map(); 6565 if (!em) 6566 return -ENOMEM; 6567 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 6568 if (!map) { 6569 free_extent_map(em); 6570 return -ENOMEM; 6571 } 6572 6573 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 6574 em->map_lookup = map; 6575 em->start = logical; 6576 em->len = length; 6577 em->orig_start = 0; 6578 em->block_start = 0; 6579 em->block_len = em->len; 6580 6581 map->num_stripes = num_stripes; 6582 map->io_width = btrfs_chunk_io_width(leaf, chunk); 6583 map->io_align = btrfs_chunk_io_align(leaf, chunk); 6584 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6585 map->type = btrfs_chunk_type(leaf, chunk); 6586 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6587 for (i = 0; i < num_stripes; i++) { 6588 map->stripes[i].physical = 6589 btrfs_stripe_offset_nr(leaf, chunk, i); 6590 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 6591 read_extent_buffer(leaf, uuid, (unsigned long) 6592 btrfs_stripe_dev_uuid_nr(chunk, i), 6593 BTRFS_UUID_SIZE); 6594 map->stripes[i].dev = btrfs_find_device(fs_info, devid, 6595 uuid, NULL); 6596 if (!map->stripes[i].dev && 6597 !btrfs_test_opt(fs_info, DEGRADED)) { 6598 free_extent_map(em); 6599 btrfs_report_missing_device(fs_info, devid, uuid, true); 6600 return -ENOENT; 6601 } 6602 if (!map->stripes[i].dev) { 6603 map->stripes[i].dev = 6604 add_missing_dev(fs_info->fs_devices, devid, 6605 uuid); 6606 if (IS_ERR(map->stripes[i].dev)) { 6607 free_extent_map(em); 6608 btrfs_err(fs_info, 6609 "failed to init missing dev %llu: %ld", 6610 devid, PTR_ERR(map->stripes[i].dev)); 6611 return PTR_ERR(map->stripes[i].dev); 6612 } 6613 btrfs_report_missing_device(fs_info, devid, uuid, false); 6614 } 6615 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 6616 &(map->stripes[i].dev->dev_state)); 6617 6618 } 6619 6620 write_lock(&map_tree->map_tree.lock); 6621 ret = add_extent_mapping(&map_tree->map_tree, em, 0); 6622 write_unlock(&map_tree->map_tree.lock); 6623 BUG_ON(ret); /* Tree corruption */ 6624 free_extent_map(em); 6625 6626 return 0; 6627 } 6628 6629 static void fill_device_from_item(struct 
extent_buffer *leaf, 6630 struct btrfs_dev_item *dev_item, 6631 struct btrfs_device *device) 6632 { 6633 unsigned long ptr; 6634 6635 device->devid = btrfs_device_id(leaf, dev_item); 6636 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 6637 device->total_bytes = device->disk_total_bytes; 6638 device->commit_total_bytes = device->disk_total_bytes; 6639 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 6640 device->commit_bytes_used = device->bytes_used; 6641 device->type = btrfs_device_type(leaf, dev_item); 6642 device->io_align = btrfs_device_io_align(leaf, dev_item); 6643 device->io_width = btrfs_device_io_width(leaf, dev_item); 6644 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 6645 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 6646 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 6647 6648 ptr = btrfs_device_uuid(dev_item); 6649 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 6650 } 6651 6652 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 6653 u8 *fsid) 6654 { 6655 struct btrfs_fs_devices *fs_devices; 6656 int ret; 6657 6658 lockdep_assert_held(&uuid_mutex); 6659 ASSERT(fsid); 6660 6661 fs_devices = fs_info->fs_devices->seed; 6662 while (fs_devices) { 6663 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 6664 return fs_devices; 6665 6666 fs_devices = fs_devices->seed; 6667 } 6668 6669 fs_devices = find_fsid(fsid); 6670 if (!fs_devices) { 6671 if (!btrfs_test_opt(fs_info, DEGRADED)) 6672 return ERR_PTR(-ENOENT); 6673 6674 fs_devices = alloc_fs_devices(fsid); 6675 if (IS_ERR(fs_devices)) 6676 return fs_devices; 6677 6678 fs_devices->seeding = 1; 6679 fs_devices->opened = 1; 6680 return fs_devices; 6681 } 6682 6683 fs_devices = clone_fs_devices(fs_devices); 6684 if (IS_ERR(fs_devices)) 6685 return fs_devices; 6686 6687 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 6688 fs_info->bdev_holder); 6689 if (ret) { 6690 free_fs_devices(fs_devices); 6691 fs_devices = ERR_PTR(ret); 6692 goto out; 6693 } 6694 6695 if (!fs_devices->seeding) { 6696 __btrfs_close_devices(fs_devices); 6697 free_fs_devices(fs_devices); 6698 fs_devices = ERR_PTR(-EINVAL); 6699 goto out; 6700 } 6701 6702 fs_devices->seed = fs_info->fs_devices->seed; 6703 fs_info->fs_devices->seed = fs_devices; 6704 out: 6705 return fs_devices; 6706 } 6707 6708 static int read_one_dev(struct btrfs_fs_info *fs_info, 6709 struct extent_buffer *leaf, 6710 struct btrfs_dev_item *dev_item) 6711 { 6712 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6713 struct btrfs_device *device; 6714 u64 devid; 6715 int ret; 6716 u8 fs_uuid[BTRFS_FSID_SIZE]; 6717 u8 dev_uuid[BTRFS_UUID_SIZE]; 6718 6719 devid = btrfs_device_id(leaf, dev_item); 6720 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 6721 BTRFS_UUID_SIZE); 6722 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 6723 BTRFS_FSID_SIZE); 6724 6725 if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) { 6726 fs_devices = open_seed_devices(fs_info, fs_uuid); 6727 if (IS_ERR(fs_devices)) 6728 return PTR_ERR(fs_devices); 6729 } 6730 6731 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 6732 if (!device) { 6733 if (!btrfs_test_opt(fs_info, DEGRADED)) { 6734 btrfs_report_missing_device(fs_info, devid, 6735 dev_uuid, true); 6736 return -ENOENT; 6737 } 6738 6739 device = add_missing_dev(fs_devices, devid, dev_uuid); 6740 if (IS_ERR(device)) { 6741 btrfs_err(fs_info, 6742 "failed to add missing dev %llu: %ld", 6743 devid, 
PTR_ERR(device)); 6744 return PTR_ERR(device); 6745 } 6746 btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 6747 } else { 6748 if (!device->bdev) { 6749 if (!btrfs_test_opt(fs_info, DEGRADED)) { 6750 btrfs_report_missing_device(fs_info, 6751 devid, dev_uuid, true); 6752 return -ENOENT; 6753 } 6754 btrfs_report_missing_device(fs_info, devid, 6755 dev_uuid, false); 6756 } 6757 6758 if (!device->bdev && 6759 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 6760 /* 6761 * this happens when a device that was properly setup 6762 * in the device info lists suddenly goes bad. 6763 * device->bdev is NULL, and so we have to set 6764 * device->missing to one here 6765 */ 6766 device->fs_devices->missing_devices++; 6767 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6768 } 6769 6770 /* Move the device to its own fs_devices */ 6771 if (device->fs_devices != fs_devices) { 6772 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, 6773 &device->dev_state)); 6774 6775 list_move(&device->dev_list, &fs_devices->devices); 6776 device->fs_devices->num_devices--; 6777 fs_devices->num_devices++; 6778 6779 device->fs_devices->missing_devices--; 6780 fs_devices->missing_devices++; 6781 6782 device->fs_devices = fs_devices; 6783 } 6784 } 6785 6786 if (device->fs_devices != fs_info->fs_devices) { 6787 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); 6788 if (device->generation != 6789 btrfs_device_generation(leaf, dev_item)) 6790 return -EINVAL; 6791 } 6792 6793 fill_device_from_item(leaf, dev_item, device); 6794 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 6795 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 6796 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 6797 device->fs_devices->total_rw_bytes += device->total_bytes; 6798 atomic64_add(device->total_bytes - device->bytes_used, 6799 &fs_info->free_chunk_space); 6800 } 6801 ret = 0; 6802 return ret; 6803 } 6804 6805 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 6806 { 6807 struct btrfs_root *root = fs_info->tree_root; 6808 struct btrfs_super_block *super_copy = fs_info->super_copy; 6809 struct extent_buffer *sb; 6810 struct btrfs_disk_key *disk_key; 6811 struct btrfs_chunk *chunk; 6812 u8 *array_ptr; 6813 unsigned long sb_array_offset; 6814 int ret = 0; 6815 u32 num_stripes; 6816 u32 array_size; 6817 u32 len = 0; 6818 u32 cur_offset; 6819 u64 type; 6820 struct btrfs_key key; 6821 6822 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 6823 /* 6824 * This will create extent buffer of nodesize, superblock size is 6825 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will 6826 * overallocate but we can keep it as-is, only the first page is used. 6827 */ 6828 sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET); 6829 if (IS_ERR(sb)) 6830 return PTR_ERR(sb); 6831 set_extent_buffer_uptodate(sb); 6832 btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); 6833 /* 6834 * The sb extent buffer is artificial and just used to read the system array. 6835 * set_extent_buffer_uptodate() call does not properly mark all it's 6836 * pages up-to-date when the page is larger: extent does not cover the 6837 * whole page and consequently check_page_uptodate does not find all 6838 * the page's extents up-to-date (the hole beyond sb), 6839 * write_extent_buffer then triggers a WARN_ON. 6840 * 6841 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, 6842 * but sb spans only this function. 
Add an explicit SetPageUptodate call 6843 * to silence the warning eg. on PowerPC 64. 6844 */ 6845 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 6846 SetPageUptodate(sb->pages[0]); 6847 6848 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 6849 array_size = btrfs_super_sys_array_size(super_copy); 6850 6851 array_ptr = super_copy->sys_chunk_array; 6852 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 6853 cur_offset = 0; 6854 6855 while (cur_offset < array_size) { 6856 disk_key = (struct btrfs_disk_key *)array_ptr; 6857 len = sizeof(*disk_key); 6858 if (cur_offset + len > array_size) 6859 goto out_short_read; 6860 6861 btrfs_disk_key_to_cpu(&key, disk_key); 6862 6863 array_ptr += len; 6864 sb_array_offset += len; 6865 cur_offset += len; 6866 6867 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 6868 chunk = (struct btrfs_chunk *)sb_array_offset; 6869 /* 6870 * At least one btrfs_chunk with one stripe must be 6871 * present, exact stripe count check comes afterwards 6872 */ 6873 len = btrfs_chunk_item_size(1); 6874 if (cur_offset + len > array_size) 6875 goto out_short_read; 6876 6877 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 6878 if (!num_stripes) { 6879 btrfs_err(fs_info, 6880 "invalid number of stripes %u in sys_array at offset %u", 6881 num_stripes, cur_offset); 6882 ret = -EIO; 6883 break; 6884 } 6885 6886 type = btrfs_chunk_type(sb, chunk); 6887 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 6888 btrfs_err(fs_info, 6889 "invalid chunk type %llu in sys_array at offset %u", 6890 type, cur_offset); 6891 ret = -EIO; 6892 break; 6893 } 6894 6895 len = btrfs_chunk_item_size(num_stripes); 6896 if (cur_offset + len > array_size) 6897 goto out_short_read; 6898 6899 ret = read_one_chunk(fs_info, &key, sb, chunk); 6900 if (ret) 6901 break; 6902 } else { 6903 btrfs_err(fs_info, 6904 "unexpected item type %u in sys_array at offset %u", 6905 (u32)key.type, cur_offset); 6906 ret = -EIO; 6907 break; 6908 } 6909 array_ptr += len; 6910 sb_array_offset += len; 6911 cur_offset += len; 6912 } 6913 clear_extent_buffer_uptodate(sb); 6914 free_extent_buffer_stale(sb); 6915 return ret; 6916 6917 out_short_read: 6918 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 6919 len, cur_offset); 6920 clear_extent_buffer_uptodate(sb); 6921 free_extent_buffer_stale(sb); 6922 return -EIO; 6923 } 6924 6925 /* 6926 * Check if all chunks in the fs are OK for read-write degraded mount 6927 * 6928 * If the @failing_dev is specified, it's accounted as missing. 6929 * 6930 * Return true if all chunks meet the minimal RW mount requirements. 6931 * Return false if any chunk doesn't meet the minimal RW mount requirements. 6932 */ 6933 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 6934 struct btrfs_device *failing_dev) 6935 { 6936 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 6937 struct extent_map *em; 6938 u64 next_start = 0; 6939 bool ret = true; 6940 6941 read_lock(&map_tree->map_tree.lock); 6942 em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1); 6943 read_unlock(&map_tree->map_tree.lock); 6944 /* No chunk at all? 
Return false anyway */ 6945 if (!em) { 6946 ret = false; 6947 goto out; 6948 } 6949 while (em) { 6950 struct map_lookup *map; 6951 int missing = 0; 6952 int max_tolerated; 6953 int i; 6954 6955 map = em->map_lookup; 6956 max_tolerated = 6957 btrfs_get_num_tolerated_disk_barrier_failures( 6958 map->type); 6959 for (i = 0; i < map->num_stripes; i++) { 6960 struct btrfs_device *dev = map->stripes[i].dev; 6961 6962 if (!dev || !dev->bdev || 6963 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 6964 dev->last_flush_error) 6965 missing++; 6966 else if (failing_dev && failing_dev == dev) 6967 missing++; 6968 } 6969 if (missing > max_tolerated) { 6970 if (!failing_dev) 6971 btrfs_warn(fs_info, 6972 "chunk %llu missing %d devices, max tolerance is %d for writeable mount", 6973 em->start, missing, max_tolerated); 6974 free_extent_map(em); 6975 ret = false; 6976 goto out; 6977 } 6978 next_start = extent_map_end(em); 6979 free_extent_map(em); 6980 6981 read_lock(&map_tree->map_tree.lock); 6982 em = lookup_extent_mapping(&map_tree->map_tree, next_start, 6983 (u64)(-1) - next_start); 6984 read_unlock(&map_tree->map_tree.lock); 6985 } 6986 out: 6987 return ret; 6988 } 6989 6990 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 6991 { 6992 struct btrfs_root *root = fs_info->chunk_root; 6993 struct btrfs_path *path; 6994 struct extent_buffer *leaf; 6995 struct btrfs_key key; 6996 struct btrfs_key found_key; 6997 int ret; 6998 int slot; 6999 u64 total_dev = 0; 7000 7001 path = btrfs_alloc_path(); 7002 if (!path) 7003 return -ENOMEM; 7004 7005 mutex_lock(&uuid_mutex); 7006 mutex_lock(&fs_info->chunk_mutex); 7007 7008 /* 7009 * Read all device items, and then all the chunk items. All 7010 * device items are found before any chunk item (their object id 7011 * is smaller than the lowest possible object id for a chunk 7012 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 7013 */ 7014 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 7015 key.offset = 0; 7016 key.type = 0; 7017 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7018 if (ret < 0) 7019 goto error; 7020 while (1) { 7021 leaf = path->nodes[0]; 7022 slot = path->slots[0]; 7023 if (slot >= btrfs_header_nritems(leaf)) { 7024 ret = btrfs_next_leaf(root, path); 7025 if (ret == 0) 7026 continue; 7027 if (ret < 0) 7028 goto error; 7029 break; 7030 } 7031 btrfs_item_key_to_cpu(leaf, &found_key, slot); 7032 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 7033 struct btrfs_dev_item *dev_item; 7034 dev_item = btrfs_item_ptr(leaf, slot, 7035 struct btrfs_dev_item); 7036 ret = read_one_dev(fs_info, leaf, dev_item); 7037 if (ret) 7038 goto error; 7039 total_dev++; 7040 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 7041 struct btrfs_chunk *chunk; 7042 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 7043 ret = read_one_chunk(fs_info, &found_key, leaf, chunk); 7044 if (ret) 7045 goto error; 7046 } 7047 path->slots[0]++; 7048 } 7049 7050 /* 7051 * After loading chunk tree, we've got all device information, 7052 * do another round of validation checks. 
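	 * The checks below verify that the number of device items found
	 * matches the device count tracked in fs_devices and that the
	 * super block's total_bytes is not smaller than total_rw_bytes.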
7053 */ 7054 if (total_dev != fs_info->fs_devices->total_devices) { 7055 btrfs_err(fs_info, 7056 "super_num_devices %llu mismatch with num_devices %llu found here", 7057 btrfs_super_num_devices(fs_info->super_copy), 7058 total_dev); 7059 ret = -EINVAL; 7060 goto error; 7061 } 7062 if (btrfs_super_total_bytes(fs_info->super_copy) < 7063 fs_info->fs_devices->total_rw_bytes) { 7064 btrfs_err(fs_info, 7065 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", 7066 btrfs_super_total_bytes(fs_info->super_copy), 7067 fs_info->fs_devices->total_rw_bytes); 7068 ret = -EINVAL; 7069 goto error; 7070 } 7071 ret = 0; 7072 error: 7073 mutex_unlock(&fs_info->chunk_mutex); 7074 mutex_unlock(&uuid_mutex); 7075 7076 btrfs_free_path(path); 7077 return ret; 7078 } 7079 7080 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 7081 { 7082 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7083 struct btrfs_device *device; 7084 7085 while (fs_devices) { 7086 mutex_lock(&fs_devices->device_list_mutex); 7087 list_for_each_entry(device, &fs_devices->devices, dev_list) 7088 device->fs_info = fs_info; 7089 mutex_unlock(&fs_devices->device_list_mutex); 7090 7091 fs_devices = fs_devices->seed; 7092 } 7093 } 7094 7095 static void __btrfs_reset_dev_stats(struct btrfs_device *dev) 7096 { 7097 int i; 7098 7099 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7100 btrfs_dev_stat_reset(dev, i); 7101 } 7102 7103 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 7104 { 7105 struct btrfs_key key; 7106 struct btrfs_key found_key; 7107 struct btrfs_root *dev_root = fs_info->dev_root; 7108 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7109 struct extent_buffer *eb; 7110 int slot; 7111 int ret = 0; 7112 struct btrfs_device *device; 7113 struct btrfs_path *path = NULL; 7114 int i; 7115 7116 path = btrfs_alloc_path(); 7117 if (!path) { 7118 ret = -ENOMEM; 7119 goto out; 7120 } 7121 7122 mutex_lock(&fs_devices->device_list_mutex); 7123 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7124 int item_size; 7125 struct btrfs_dev_stats_item *ptr; 7126 7127 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7128 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7129 key.offset = device->devid; 7130 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 7131 if (ret) { 7132 __btrfs_reset_dev_stats(device); 7133 device->dev_stats_valid = 1; 7134 btrfs_release_path(path); 7135 continue; 7136 } 7137 slot = path->slots[0]; 7138 eb = path->nodes[0]; 7139 btrfs_item_key_to_cpu(eb, &found_key, slot); 7140 item_size = btrfs_item_size_nr(eb, slot); 7141 7142 ptr = btrfs_item_ptr(eb, slot, 7143 struct btrfs_dev_stats_item); 7144 7145 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7146 if (item_size >= (1 + i) * sizeof(__le64)) 7147 btrfs_dev_stat_set(device, i, 7148 btrfs_dev_stats_value(eb, ptr, i)); 7149 else 7150 btrfs_dev_stat_reset(device, i); 7151 } 7152 7153 device->dev_stats_valid = 1; 7154 btrfs_dev_stat_print_on_load(device); 7155 btrfs_release_path(path); 7156 } 7157 mutex_unlock(&fs_devices->device_list_mutex); 7158 7159 out: 7160 btrfs_free_path(path); 7161 return ret < 0 ? 
ret : 0; 7162 } 7163 7164 static int update_dev_stat_item(struct btrfs_trans_handle *trans, 7165 struct btrfs_fs_info *fs_info, 7166 struct btrfs_device *device) 7167 { 7168 struct btrfs_root *dev_root = fs_info->dev_root; 7169 struct btrfs_path *path; 7170 struct btrfs_key key; 7171 struct extent_buffer *eb; 7172 struct btrfs_dev_stats_item *ptr; 7173 int ret; 7174 int i; 7175 7176 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7177 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7178 key.offset = device->devid; 7179 7180 path = btrfs_alloc_path(); 7181 if (!path) 7182 return -ENOMEM; 7183 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 7184 if (ret < 0) { 7185 btrfs_warn_in_rcu(fs_info, 7186 "error %d while searching for dev_stats item for device %s", 7187 ret, rcu_str_deref(device->name)); 7188 goto out; 7189 } 7190 7191 if (ret == 0 && 7192 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 7193 /* need to delete old one and insert a new one */ 7194 ret = btrfs_del_item(trans, dev_root, path); 7195 if (ret != 0) { 7196 btrfs_warn_in_rcu(fs_info, 7197 "delete too small dev_stats item for device %s failed %d", 7198 rcu_str_deref(device->name), ret); 7199 goto out; 7200 } 7201 ret = 1; 7202 } 7203 7204 if (ret == 1) { 7205 /* need to insert a new item */ 7206 btrfs_release_path(path); 7207 ret = btrfs_insert_empty_item(trans, dev_root, path, 7208 &key, sizeof(*ptr)); 7209 if (ret < 0) { 7210 btrfs_warn_in_rcu(fs_info, 7211 "insert dev_stats item for device %s failed %d", 7212 rcu_str_deref(device->name), ret); 7213 goto out; 7214 } 7215 } 7216 7217 eb = path->nodes[0]; 7218 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 7219 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7220 btrfs_set_dev_stats_value(eb, ptr, i, 7221 btrfs_dev_stat_read(device, i)); 7222 btrfs_mark_buffer_dirty(eb); 7223 7224 out: 7225 btrfs_free_path(path); 7226 return ret; 7227 } 7228 7229 /* 7230 * called from commit_transaction. Writes all changed device stats to disk. 7231 */ 7232 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 7233 struct btrfs_fs_info *fs_info) 7234 { 7235 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7236 struct btrfs_device *device; 7237 int stats_cnt; 7238 int ret = 0; 7239 7240 mutex_lock(&fs_devices->device_list_mutex); 7241 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7242 stats_cnt = atomic_read(&device->dev_stats_ccnt); 7243 if (!device->dev_stats_valid || stats_cnt == 0) 7244 continue; 7245 7246 7247 /* 7248 * There is a LOAD-LOAD control dependency between the value of 7249 * dev_stats_ccnt and updating the on-disk values which requires 7250 * reading the in-memory counters. Such control dependencies 7251 * require explicit read memory barriers. 
7252 * 7253 * This memory barriers pairs with smp_mb__before_atomic in 7254 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full 7255 * barrier implied by atomic_xchg in 7256 * btrfs_dev_stats_read_and_reset 7257 */ 7258 smp_rmb(); 7259 7260 ret = update_dev_stat_item(trans, fs_info, device); 7261 if (!ret) 7262 atomic_sub(stats_cnt, &device->dev_stats_ccnt); 7263 } 7264 mutex_unlock(&fs_devices->device_list_mutex); 7265 7266 return ret; 7267 } 7268 7269 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) 7270 { 7271 btrfs_dev_stat_inc(dev, index); 7272 btrfs_dev_stat_print_on_error(dev); 7273 } 7274 7275 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) 7276 { 7277 if (!dev->dev_stats_valid) 7278 return; 7279 btrfs_err_rl_in_rcu(dev->fs_info, 7280 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7281 rcu_str_deref(dev->name), 7282 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7283 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7284 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7285 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7286 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7287 } 7288 7289 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 7290 { 7291 int i; 7292 7293 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7294 if (btrfs_dev_stat_read(dev, i) != 0) 7295 break; 7296 if (i == BTRFS_DEV_STAT_VALUES_MAX) 7297 return; /* all values == 0, suppress message */ 7298 7299 btrfs_info_in_rcu(dev->fs_info, 7300 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7301 rcu_str_deref(dev->name), 7302 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7303 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7304 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7305 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7306 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7307 } 7308 7309 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, 7310 struct btrfs_ioctl_get_dev_stats *stats) 7311 { 7312 struct btrfs_device *dev; 7313 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7314 int i; 7315 7316 mutex_lock(&fs_devices->device_list_mutex); 7317 dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL); 7318 mutex_unlock(&fs_devices->device_list_mutex); 7319 7320 if (!dev) { 7321 btrfs_warn(fs_info, "get dev_stats failed, device not found"); 7322 return -ENODEV; 7323 } else if (!dev->dev_stats_valid) { 7324 btrfs_warn(fs_info, "get dev_stats failed, not yet valid"); 7325 return -ENODEV; 7326 } else if (stats->flags & BTRFS_DEV_STATS_RESET) { 7327 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7328 if (stats->nr_items > i) 7329 stats->values[i] = 7330 btrfs_dev_stat_read_and_reset(dev, i); 7331 else 7332 btrfs_dev_stat_reset(dev, i); 7333 } 7334 } else { 7335 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7336 if (stats->nr_items > i) 7337 stats->values[i] = btrfs_dev_stat_read(dev, i); 7338 } 7339 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) 7340 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 7341 return 0; 7342 } 7343 7344 void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path) 7345 { 7346 struct buffer_head *bh; 7347 struct btrfs_super_block *disk_super; 7348 int copy_num; 7349 7350 if (!bdev) 7351 return; 7352 7353 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; 7354 copy_num++) { 7355 7356 if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) 7357 continue; 7358 7359 disk_super = (struct 
btrfs_super_block *)bh->b_data; 7360 7361 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 7362 set_buffer_dirty(bh); 7363 sync_dirty_buffer(bh); 7364 brelse(bh); 7365 } 7366 7367 /* Notify udev that device has changed */ 7368 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 7369 7370 /* Update ctime/mtime for device path for libblkid */ 7371 update_dev_time(device_path); 7372 } 7373 7374 /* 7375 * Update the size of all devices, which is used for writing out the 7376 * super blocks. 7377 */ 7378 void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) 7379 { 7380 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7381 struct btrfs_device *curr, *next; 7382 7383 if (list_empty(&fs_devices->resized_devices)) 7384 return; 7385 7386 mutex_lock(&fs_devices->device_list_mutex); 7387 mutex_lock(&fs_info->chunk_mutex); 7388 list_for_each_entry_safe(curr, next, &fs_devices->resized_devices, 7389 resized_list) { 7390 list_del_init(&curr->resized_list); 7391 curr->commit_total_bytes = curr->disk_total_bytes; 7392 } 7393 mutex_unlock(&fs_info->chunk_mutex); 7394 mutex_unlock(&fs_devices->device_list_mutex); 7395 } 7396 7397 /* Must be invoked during the transaction commit */ 7398 void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans) 7399 { 7400 struct btrfs_fs_info *fs_info = trans->fs_info; 7401 struct extent_map *em; 7402 struct map_lookup *map; 7403 struct btrfs_device *dev; 7404 int i; 7405 7406 if (list_empty(&trans->pending_chunks)) 7407 return; 7408 7409 /* In order to kick the device replace finish process */ 7410 mutex_lock(&fs_info->chunk_mutex); 7411 list_for_each_entry(em, &trans->pending_chunks, list) { 7412 map = em->map_lookup; 7413 7414 for (i = 0; i < map->num_stripes; i++) { 7415 dev = map->stripes[i].dev; 7416 dev->commit_bytes_used = dev->bytes_used; 7417 } 7418 } 7419 mutex_unlock(&fs_info->chunk_mutex); 7420 } 7421 7422 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) 7423 { 7424 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7425 while (fs_devices) { 7426 fs_devices->fs_info = fs_info; 7427 fs_devices = fs_devices->seed; 7428 } 7429 } 7430 7431 void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) 7432 { 7433 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7434 while (fs_devices) { 7435 fs_devices->fs_info = NULL; 7436 fs_devices = fs_devices->seed; 7437 } 7438 } 7439