1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 #include <linux/sched.h> 19 #include <linux/bio.h> 20 #include <linux/slab.h> 21 #include <linux/buffer_head.h> 22 #include <linux/blkdev.h> 23 #include <linux/iocontext.h> 24 #include <linux/capability.h> 25 #include <linux/ratelimit.h> 26 #include <linux/kthread.h> 27 #include <linux/raid/pq.h> 28 #include <linux/semaphore.h> 29 #include <linux/uuid.h> 30 #include <linux/list_sort.h> 31 #include <asm/div64.h> 32 #include "ctree.h" 33 #include "extent_map.h" 34 #include "disk-io.h" 35 #include "transaction.h" 36 #include "print-tree.h" 37 #include "volumes.h" 38 #include "raid56.h" 39 #include "async-thread.h" 40 #include "check-integrity.h" 41 #include "rcu-string.h" 42 #include "math.h" 43 #include "dev-replace.h" 44 #include "sysfs.h" 45 46 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 47 [BTRFS_RAID_RAID10] = { 48 .sub_stripes = 2, 49 .dev_stripes = 1, 50 .devs_max = 0, /* 0 == as many as possible */ 51 .devs_min = 4, 52 .tolerated_failures = 1, 53 .devs_increment = 2, 54 .ncopies = 2, 55 }, 56 [BTRFS_RAID_RAID1] = { 57 .sub_stripes = 1, 58 .dev_stripes = 1, 59 .devs_max = 2, 60 .devs_min = 2, 61 .tolerated_failures = 1, 62 .devs_increment = 2, 63 .ncopies = 2, 64 }, 65 [BTRFS_RAID_DUP] = { 66 .sub_stripes = 1, 67 .dev_stripes = 2, 68 .devs_max = 1, 69 .devs_min = 1, 70 .tolerated_failures = 0, 71 .devs_increment = 1, 72 .ncopies = 2, 73 }, 74 [BTRFS_RAID_RAID0] = { 75 .sub_stripes = 1, 76 .dev_stripes = 1, 77 .devs_max = 0, 78 .devs_min = 2, 79 .tolerated_failures = 0, 80 .devs_increment = 1, 81 .ncopies = 1, 82 }, 83 [BTRFS_RAID_SINGLE] = { 84 .sub_stripes = 1, 85 .dev_stripes = 1, 86 .devs_max = 1, 87 .devs_min = 1, 88 .tolerated_failures = 0, 89 .devs_increment = 1, 90 .ncopies = 1, 91 }, 92 [BTRFS_RAID_RAID5] = { 93 .sub_stripes = 1, 94 .dev_stripes = 1, 95 .devs_max = 0, 96 .devs_min = 2, 97 .tolerated_failures = 1, 98 .devs_increment = 1, 99 .ncopies = 2, 100 }, 101 [BTRFS_RAID_RAID6] = { 102 .sub_stripes = 1, 103 .dev_stripes = 1, 104 .devs_max = 0, 105 .devs_min = 3, 106 .tolerated_failures = 2, 107 .devs_increment = 1, 108 .ncopies = 3, 109 }, 110 }; 111 112 const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { 113 [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, 114 [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, 115 [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, 116 [BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0, 117 [BTRFS_RAID_SINGLE] = 0, 118 [BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5, 119 [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6, 120 }; 121 122 /* 123 * Table to convert BTRFS_RAID_* to the error code if minimum number of devices 124 * condition is not met. Zero means there's no corresponding 125 * BTRFS_ERROR_DEV_*_NOT_MET value. 
126 */ 127 const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = { 128 [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, 129 [BTRFS_RAID_RAID1] = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, 130 [BTRFS_RAID_DUP] = 0, 131 [BTRFS_RAID_RAID0] = 0, 132 [BTRFS_RAID_SINGLE] = 0, 133 [BTRFS_RAID_RAID5] = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, 134 [BTRFS_RAID_RAID6] = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, 135 }; 136 137 static int init_first_rw_device(struct btrfs_trans_handle *trans, 138 struct btrfs_fs_info *fs_info); 139 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); 140 static void __btrfs_reset_dev_stats(struct btrfs_device *dev); 141 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); 142 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); 143 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 144 enum btrfs_map_op op, 145 u64 logical, u64 *length, 146 struct btrfs_bio **bbio_ret, 147 int mirror_num, int need_raid_map); 148 149 /* 150 * Device locking 151 * ============== 152 * 153 * There are several mutexes that protect manipulation of devices and low-level 154 * structures like chunks but not block groups, extents or files 155 * 156 * uuid_mutex (global lock) 157 * ------------------------ 158 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from 159 * the SCAN_DEV ioctl registration or from mount either implicitly (the first 160 * device) or requested by the device= mount option 161 * 162 * the mutex can be very coarse and can cover long-running operations 163 * 164 * protects: updates to fs_devices counters like missing devices, rw devices, 165 * seeding, structure cloning, openning/closing devices at mount/umount time 166 * 167 * global::fs_devs - add, remove, updates to the global list 168 * 169 * does not protect: manipulation of the fs_devices::devices list! 170 * 171 * btrfs_device::name - renames (write side), read is RCU 172 * 173 * fs_devices::device_list_mutex (per-fs, with RCU) 174 * ------------------------------------------------ 175 * protects updates to fs_devices::devices, ie. 
adding and deleting 176 * 177 * simple list traversal with read-only actions can be done with RCU protection 178 * 179 * may be used to exclude some operations from running concurrently without any 180 * modifications to the list (see write_all_supers) 181 * 182 * volume_mutex 183 * ------------ 184 * coarse lock owned by a mounted filesystem; used to exclude some operations 185 * that cannot run in parallel and affect the higher-level properties of the 186 * filesystem like: device add/deleting/resize/replace, or balance 187 * 188 * balance_mutex 189 * ------------- 190 * protects balance structures (status, state) and context accessed from 191 * several places (internally, ioctl) 192 * 193 * chunk_mutex 194 * ----------- 195 * protects chunks, adding or removing during allocation, trim or when a new 196 * device is added/removed 197 * 198 * cleaner_mutex 199 * ------------- 200 * a big lock that is held by the cleaner thread and prevents running subvolume 201 * cleaning together with relocation or delayed iputs 202 * 203 * 204 * Lock nesting 205 * ============ 206 * 207 * uuid_mutex 208 * volume_mutex 209 * device_list_mutex 210 * chunk_mutex 211 * balance_mutex 212 */ 213 214 DEFINE_MUTEX(uuid_mutex); 215 static LIST_HEAD(fs_uuids); 216 struct list_head *btrfs_get_fs_uuids(void) 217 { 218 return &fs_uuids; 219 } 220 221 /* 222 * alloc_fs_devices - allocate struct btrfs_fs_devices 223 * @fsid: if not NULL, copy the uuid to fs_devices::fsid 224 * 225 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). 226 * The returned struct is not linked onto any lists and can be destroyed with 227 * kfree() right away. 228 */ 229 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) 230 { 231 struct btrfs_fs_devices *fs_devs; 232 233 fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); 234 if (!fs_devs) 235 return ERR_PTR(-ENOMEM); 236 237 mutex_init(&fs_devs->device_list_mutex); 238 239 INIT_LIST_HEAD(&fs_devs->devices); 240 INIT_LIST_HEAD(&fs_devs->resized_devices); 241 INIT_LIST_HEAD(&fs_devs->alloc_list); 242 INIT_LIST_HEAD(&fs_devs->list); 243 if (fsid) 244 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); 245 246 return fs_devs; 247 } 248 249 static void free_device(struct btrfs_device *device) 250 { 251 rcu_string_free(device->name); 252 bio_put(device->flush_bio); 253 kfree(device); 254 } 255 256 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 257 { 258 struct btrfs_device *device; 259 WARN_ON(fs_devices->opened); 260 while (!list_empty(&fs_devices->devices)) { 261 device = list_entry(fs_devices->devices.next, 262 struct btrfs_device, dev_list); 263 list_del(&device->dev_list); 264 free_device(device); 265 } 266 kfree(fs_devices); 267 } 268 269 static void btrfs_kobject_uevent(struct block_device *bdev, 270 enum kobject_action action) 271 { 272 int ret; 273 274 ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); 275 if (ret) 276 pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", 277 action, 278 kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), 279 &disk_to_dev(bdev->bd_disk)->kobj); 280 } 281 282 void __exit btrfs_cleanup_fs_uuids(void) 283 { 284 struct btrfs_fs_devices *fs_devices; 285 286 while (!list_empty(&fs_uuids)) { 287 fs_devices = list_entry(fs_uuids.next, 288 struct btrfs_fs_devices, list); 289 list_del(&fs_devices->list); 290 free_fs_devices(fs_devices); 291 } 292 } 293 294 /* 295 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error. 
296 * Returned struct is not linked onto any lists and must be destroyed using 297 * free_device. 298 */ 299 static struct btrfs_device *__alloc_device(void) 300 { 301 struct btrfs_device *dev; 302 303 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 304 if (!dev) 305 return ERR_PTR(-ENOMEM); 306 307 /* 308 * Preallocate a bio that's always going to be used for flushing device 309 * barriers and matches the device lifespan 310 */ 311 dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL); 312 if (!dev->flush_bio) { 313 kfree(dev); 314 return ERR_PTR(-ENOMEM); 315 } 316 317 INIT_LIST_HEAD(&dev->dev_list); 318 INIT_LIST_HEAD(&dev->dev_alloc_list); 319 INIT_LIST_HEAD(&dev->resized_list); 320 321 spin_lock_init(&dev->io_lock); 322 323 atomic_set(&dev->reada_in_flight, 0); 324 atomic_set(&dev->dev_stats_ccnt, 0); 325 btrfs_device_data_ordered_init(dev); 326 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 327 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 328 329 return dev; 330 } 331 332 /* 333 * Find a device specified by @devid or @uuid in the list of @fs_devices, or 334 * return NULL. 335 * 336 * If devid and uuid are both specified, the match must be exact, otherwise 337 * only devid is used. 338 */ 339 static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices, 340 u64 devid, const u8 *uuid) 341 { 342 struct list_head *head = &fs_devices->devices; 343 struct btrfs_device *dev; 344 345 list_for_each_entry(dev, head, dev_list) { 346 if (dev->devid == devid && 347 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 348 return dev; 349 } 350 } 351 return NULL; 352 } 353 354 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 355 { 356 struct btrfs_fs_devices *fs_devices; 357 358 list_for_each_entry(fs_devices, &fs_uuids, list) { 359 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 360 return fs_devices; 361 } 362 return NULL; 363 } 364 365 static int 366 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, 367 int flush, struct block_device **bdev, 368 struct buffer_head **bh) 369 { 370 int ret; 371 372 *bdev = blkdev_get_by_path(device_path, flags, holder); 373 374 if (IS_ERR(*bdev)) { 375 ret = PTR_ERR(*bdev); 376 goto error; 377 } 378 379 if (flush) 380 filemap_write_and_wait((*bdev)->bd_inode->i_mapping); 381 ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); 382 if (ret) { 383 blkdev_put(*bdev, flags); 384 goto error; 385 } 386 invalidate_bdev(*bdev); 387 *bh = btrfs_read_dev_super(*bdev); 388 if (IS_ERR(*bh)) { 389 ret = PTR_ERR(*bh); 390 blkdev_put(*bdev, flags); 391 goto error; 392 } 393 394 return 0; 395 396 error: 397 *bdev = NULL; 398 *bh = NULL; 399 return ret; 400 } 401 402 static void requeue_list(struct btrfs_pending_bios *pending_bios, 403 struct bio *head, struct bio *tail) 404 { 405 406 struct bio *old_head; 407 408 old_head = pending_bios->head; 409 pending_bios->head = head; 410 if (pending_bios->tail) 411 tail->bi_next = old_head; 412 else 413 pending_bios->tail = tail; 414 } 415 416 /* 417 * we try to collect pending bios for a device so we don't get a large 418 * number of procs sending bios down to the same device. This greatly 419 * improves the schedulers ability to collect and merge the bios. 420 * 421 * But, it also turns into a long list of bios to process and that is sure 422 * to eventually make the worker thread block. The solution here is to 423 * make some progress and then put this work struct back at the end of 424 * the list if the block device is congested. 
This way, multiple devices 425 * can make progress from a single worker thread. 426 */ 427 static noinline void run_scheduled_bios(struct btrfs_device *device) 428 { 429 struct btrfs_fs_info *fs_info = device->fs_info; 430 struct bio *pending; 431 struct backing_dev_info *bdi; 432 struct btrfs_pending_bios *pending_bios; 433 struct bio *tail; 434 struct bio *cur; 435 int again = 0; 436 unsigned long num_run; 437 unsigned long batch_run = 0; 438 unsigned long last_waited = 0; 439 int force_reg = 0; 440 int sync_pending = 0; 441 struct blk_plug plug; 442 443 /* 444 * this function runs all the bios we've collected for 445 * a particular device. We don't want to wander off to 446 * another device without first sending all of these down. 447 * So, setup a plug here and finish it off before we return 448 */ 449 blk_start_plug(&plug); 450 451 bdi = device->bdev->bd_bdi; 452 453 loop: 454 spin_lock(&device->io_lock); 455 456 loop_lock: 457 num_run = 0; 458 459 /* take all the bios off the list at once and process them 460 * later on (without the lock held). But, remember the 461 * tail and other pointers so the bios can be properly reinserted 462 * into the list if we hit congestion 463 */ 464 if (!force_reg && device->pending_sync_bios.head) { 465 pending_bios = &device->pending_sync_bios; 466 force_reg = 1; 467 } else { 468 pending_bios = &device->pending_bios; 469 force_reg = 0; 470 } 471 472 pending = pending_bios->head; 473 tail = pending_bios->tail; 474 WARN_ON(pending && !tail); 475 476 /* 477 * if pending was null this time around, no bios need processing 478 * at all and we can stop. Otherwise it'll loop back up again 479 * and do an additional check so no bios are missed. 480 * 481 * device->running_pending is used to synchronize with the 482 * schedule_bio code. 483 */ 484 if (device->pending_sync_bios.head == NULL && 485 device->pending_bios.head == NULL) { 486 again = 0; 487 device->running_pending = 0; 488 } else { 489 again = 1; 490 device->running_pending = 1; 491 } 492 493 pending_bios->head = NULL; 494 pending_bios->tail = NULL; 495 496 spin_unlock(&device->io_lock); 497 498 while (pending) { 499 500 rmb(); 501 /* we want to work on both lists, but do more bios on the 502 * sync list than the regular list 503 */ 504 if ((num_run > 32 && 505 pending_bios != &device->pending_sync_bios && 506 device->pending_sync_bios.head) || 507 (num_run > 64 && pending_bios == &device->pending_sync_bios && 508 device->pending_bios.head)) { 509 spin_lock(&device->io_lock); 510 requeue_list(pending_bios, pending, tail); 511 goto loop_lock; 512 } 513 514 cur = pending; 515 pending = pending->bi_next; 516 cur->bi_next = NULL; 517 518 BUG_ON(atomic_read(&cur->__bi_cnt) == 0); 519 520 /* 521 * if we're doing the sync list, record that our 522 * plug has some sync requests on it 523 * 524 * If we're doing the regular list and there are 525 * sync requests sitting around, unplug before 526 * we add more 527 */ 528 if (pending_bios == &device->pending_sync_bios) { 529 sync_pending = 1; 530 } else if (sync_pending) { 531 blk_finish_plug(&plug); 532 blk_start_plug(&plug); 533 sync_pending = 0; 534 } 535 536 btrfsic_submit_bio(cur); 537 num_run++; 538 batch_run++; 539 540 cond_resched(); 541 542 /* 543 * we made progress, there is more work to do and the bdi 544 * is now congested. 
Back off and let other work structs 545 * run instead 546 */ 547 if (pending && bdi_write_congested(bdi) && batch_run > 8 && 548 fs_info->fs_devices->open_devices > 1) { 549 struct io_context *ioc; 550 551 ioc = current->io_context; 552 553 /* 554 * the main goal here is that we don't want to 555 * block if we're going to be able to submit 556 * more requests without blocking. 557 * 558 * This code does two great things, it pokes into 559 * the elevator code from a filesystem _and_ 560 * it makes assumptions about how batching works. 561 */ 562 if (ioc && ioc->nr_batch_requests > 0 && 563 time_before(jiffies, ioc->last_waited + HZ/50UL) && 564 (last_waited == 0 || 565 ioc->last_waited == last_waited)) { 566 /* 567 * we want to go through our batch of 568 * requests and stop. So, we copy out 569 * the ioc->last_waited time and test 570 * against it before looping 571 */ 572 last_waited = ioc->last_waited; 573 cond_resched(); 574 continue; 575 } 576 spin_lock(&device->io_lock); 577 requeue_list(pending_bios, pending, tail); 578 device->running_pending = 1; 579 580 spin_unlock(&device->io_lock); 581 btrfs_queue_work(fs_info->submit_workers, 582 &device->work); 583 goto done; 584 } 585 } 586 587 cond_resched(); 588 if (again) 589 goto loop; 590 591 spin_lock(&device->io_lock); 592 if (device->pending_bios.head || device->pending_sync_bios.head) 593 goto loop_lock; 594 spin_unlock(&device->io_lock); 595 596 done: 597 blk_finish_plug(&plug); 598 } 599 600 static void pending_bios_fn(struct btrfs_work *work) 601 { 602 struct btrfs_device *device; 603 604 device = container_of(work, struct btrfs_device, work); 605 run_scheduled_bios(device); 606 } 607 608 /* 609 * Search and remove all stale (devices which are not mounted) devices. 610 * When both inputs are NULL, it will search and release all stale devices. 611 * path: Optional. When provided will it release all unmounted devices 612 * matching this path only. 613 * skip_dev: Optional. Will skip this device when searching for the stale 614 * devices. 
615 */ 616 static void btrfs_free_stale_devices(const char *path, 617 struct btrfs_device *skip_dev) 618 { 619 struct btrfs_fs_devices *fs_devs, *tmp_fs_devs; 620 struct btrfs_device *dev, *tmp_dev; 621 622 list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) { 623 624 if (fs_devs->opened) 625 continue; 626 627 list_for_each_entry_safe(dev, tmp_dev, 628 &fs_devs->devices, dev_list) { 629 int not_found = 0; 630 631 if (skip_dev && skip_dev == dev) 632 continue; 633 if (path && !dev->name) 634 continue; 635 636 rcu_read_lock(); 637 if (path) 638 not_found = strcmp(rcu_str_deref(dev->name), 639 path); 640 rcu_read_unlock(); 641 if (not_found) 642 continue; 643 644 /* delete the stale device */ 645 if (fs_devs->num_devices == 1) { 646 btrfs_sysfs_remove_fsid(fs_devs); 647 list_del(&fs_devs->list); 648 free_fs_devices(fs_devs); 649 break; 650 } else { 651 fs_devs->num_devices--; 652 list_del(&dev->dev_list); 653 free_device(dev); 654 } 655 } 656 } 657 } 658 659 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, 660 struct btrfs_device *device, fmode_t flags, 661 void *holder) 662 { 663 struct request_queue *q; 664 struct block_device *bdev; 665 struct buffer_head *bh; 666 struct btrfs_super_block *disk_super; 667 u64 devid; 668 int ret; 669 670 if (device->bdev) 671 return -EINVAL; 672 if (!device->name) 673 return -EINVAL; 674 675 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, 676 &bdev, &bh); 677 if (ret) 678 return ret; 679 680 disk_super = (struct btrfs_super_block *)bh->b_data; 681 devid = btrfs_stack_device_id(&disk_super->dev_item); 682 if (devid != device->devid) 683 goto error_brelse; 684 685 if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) 686 goto error_brelse; 687 688 device->generation = btrfs_super_generation(disk_super); 689 690 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 691 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 692 fs_devices->seeding = 1; 693 } else { 694 if (bdev_read_only(bdev)) 695 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 696 else 697 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 698 } 699 700 q = bdev_get_queue(bdev); 701 if (!blk_queue_nonrot(q)) 702 fs_devices->rotating = 1; 703 704 device->bdev = bdev; 705 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 706 device->mode = flags; 707 708 fs_devices->open_devices++; 709 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 710 device->devid != BTRFS_DEV_REPLACE_DEVID) { 711 fs_devices->rw_devices++; 712 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); 713 } 714 brelse(bh); 715 716 return 0; 717 718 error_brelse: 719 brelse(bh); 720 blkdev_put(bdev, flags); 721 722 return -EINVAL; 723 } 724 725 /* 726 * Add new device to list of registered devices 727 * 728 * Returns: 729 * device pointer which was just added or updated when successful 730 * error pointer when failed 731 */ 732 static noinline struct btrfs_device *device_list_add(const char *path, 733 struct btrfs_super_block *disk_super) 734 { 735 struct btrfs_device *device; 736 struct btrfs_fs_devices *fs_devices; 737 struct rcu_string *name; 738 u64 found_transid = btrfs_super_generation(disk_super); 739 u64 devid = btrfs_stack_device_id(&disk_super->dev_item); 740 741 fs_devices = find_fsid(disk_super->fsid); 742 if (!fs_devices) { 743 fs_devices = alloc_fs_devices(disk_super->fsid); 744 if (IS_ERR(fs_devices)) 745 return ERR_CAST(fs_devices); 746 747 list_add(&fs_devices->list, &fs_uuids); 748 
749 device = NULL; 750 } else { 751 device = find_device(fs_devices, devid, 752 disk_super->dev_item.uuid); 753 } 754 755 if (!device) { 756 if (fs_devices->opened) 757 return ERR_PTR(-EBUSY); 758 759 device = btrfs_alloc_device(NULL, &devid, 760 disk_super->dev_item.uuid); 761 if (IS_ERR(device)) { 762 /* we can safely leave the fs_devices entry around */ 763 return device; 764 } 765 766 name = rcu_string_strdup(path, GFP_NOFS); 767 if (!name) { 768 free_device(device); 769 return ERR_PTR(-ENOMEM); 770 } 771 rcu_assign_pointer(device->name, name); 772 773 mutex_lock(&fs_devices->device_list_mutex); 774 list_add_rcu(&device->dev_list, &fs_devices->devices); 775 fs_devices->num_devices++; 776 mutex_unlock(&fs_devices->device_list_mutex); 777 778 device->fs_devices = fs_devices; 779 btrfs_free_stale_devices(path, device); 780 781 if (disk_super->label[0]) 782 pr_info("BTRFS: device label %s devid %llu transid %llu %s\n", 783 disk_super->label, devid, found_transid, path); 784 else 785 pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n", 786 disk_super->fsid, devid, found_transid, path); 787 788 } else if (!device->name || strcmp(device->name->str, path)) { 789 /* 790 * When FS is already mounted. 791 * 1. If you are here and if the device->name is NULL that 792 * means this device was missing at time of FS mount. 793 * 2. If you are here and if the device->name is different 794 * from 'path' that means either 795 * a. The same device disappeared and reappeared with 796 * different name. or 797 * b. The missing-disk-which-was-replaced, has 798 * reappeared now. 799 * 800 * We must allow 1 and 2a above. But 2b would be a spurious 801 * and unintentional. 802 * 803 * Further in case of 1 and 2a above, the disk at 'path' 804 * would have missed some transaction when it was away and 805 * in case of 2a the stale bdev has to be updated as well. 806 * 2b must not be allowed at all time. 807 */ 808 809 /* 810 * For now, we do allow update to btrfs_fs_device through the 811 * btrfs dev scan cli after FS has been mounted. We're still 812 * tracking a problem where systems fail mount by subvolume id 813 * when we reject replacement on a mounted FS. 814 */ 815 if (!fs_devices->opened && found_transid < device->generation) { 816 /* 817 * That is if the FS is _not_ mounted and if you 818 * are here, that means there is more than one 819 * disk with same uuid and devid.We keep the one 820 * with larger generation number or the last-in if 821 * generation are equal. 822 */ 823 return ERR_PTR(-EEXIST); 824 } 825 826 name = rcu_string_strdup(path, GFP_NOFS); 827 if (!name) 828 return ERR_PTR(-ENOMEM); 829 rcu_string_free(device->name); 830 rcu_assign_pointer(device->name, name); 831 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 832 fs_devices->missing_devices--; 833 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 834 } 835 } 836 837 /* 838 * Unmount does not free the btrfs_device struct but would zero 839 * generation along with most of the other members. So just update 840 * it back. We need it to pick the disk with largest generation 841 * (as above). 
842 */ 843 if (!fs_devices->opened) 844 device->generation = found_transid; 845 846 fs_devices->total_devices = btrfs_super_num_devices(disk_super); 847 848 return device; 849 } 850 851 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 852 { 853 struct btrfs_fs_devices *fs_devices; 854 struct btrfs_device *device; 855 struct btrfs_device *orig_dev; 856 857 fs_devices = alloc_fs_devices(orig->fsid); 858 if (IS_ERR(fs_devices)) 859 return fs_devices; 860 861 mutex_lock(&orig->device_list_mutex); 862 fs_devices->total_devices = orig->total_devices; 863 864 /* We have held the volume lock, it is safe to get the devices. */ 865 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 866 struct rcu_string *name; 867 868 device = btrfs_alloc_device(NULL, &orig_dev->devid, 869 orig_dev->uuid); 870 if (IS_ERR(device)) 871 goto error; 872 873 /* 874 * This is ok to do without rcu read locked because we hold the 875 * uuid mutex so nothing we touch in here is going to disappear. 876 */ 877 if (orig_dev->name) { 878 name = rcu_string_strdup(orig_dev->name->str, 879 GFP_KERNEL); 880 if (!name) { 881 free_device(device); 882 goto error; 883 } 884 rcu_assign_pointer(device->name, name); 885 } 886 887 list_add(&device->dev_list, &fs_devices->devices); 888 device->fs_devices = fs_devices; 889 fs_devices->num_devices++; 890 } 891 mutex_unlock(&orig->device_list_mutex); 892 return fs_devices; 893 error: 894 mutex_unlock(&orig->device_list_mutex); 895 free_fs_devices(fs_devices); 896 return ERR_PTR(-ENOMEM); 897 } 898 899 /* 900 * After we have read the system tree and know devids belonging to 901 * this filesystem, remove the device which does not belong there. 902 */ 903 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) 904 { 905 struct btrfs_device *device, *next; 906 struct btrfs_device *latest_dev = NULL; 907 908 mutex_lock(&uuid_mutex); 909 again: 910 /* This is the initialized path, it is safe to release the devices. */ 911 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 912 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 913 &device->dev_state)) { 914 if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 915 &device->dev_state) && 916 (!latest_dev || 917 device->generation > latest_dev->generation)) { 918 latest_dev = device; 919 } 920 continue; 921 } 922 923 if (device->devid == BTRFS_DEV_REPLACE_DEVID) { 924 /* 925 * In the first step, keep the device which has 926 * the correct fsid and the devid that is used 927 * for the dev_replace procedure. 928 * In the second step, the dev_replace state is 929 * read from the device tree and it is known 930 * whether the procedure is really active or 931 * not, which means whether this device is 932 * used or whether it should be removed. 
933 */ 934 if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 935 &device->dev_state)) { 936 continue; 937 } 938 } 939 if (device->bdev) { 940 blkdev_put(device->bdev, device->mode); 941 device->bdev = NULL; 942 fs_devices->open_devices--; 943 } 944 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 945 list_del_init(&device->dev_alloc_list); 946 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 947 if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 948 &device->dev_state)) 949 fs_devices->rw_devices--; 950 } 951 list_del_init(&device->dev_list); 952 fs_devices->num_devices--; 953 free_device(device); 954 } 955 956 if (fs_devices->seed) { 957 fs_devices = fs_devices->seed; 958 goto again; 959 } 960 961 fs_devices->latest_bdev = latest_dev->bdev; 962 963 mutex_unlock(&uuid_mutex); 964 } 965 966 static void free_device_rcu(struct rcu_head *head) 967 { 968 struct btrfs_device *device; 969 970 device = container_of(head, struct btrfs_device, rcu); 971 free_device(device); 972 } 973 974 static void btrfs_close_bdev(struct btrfs_device *device) 975 { 976 if (!device->bdev) 977 return; 978 979 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 980 sync_blockdev(device->bdev); 981 invalidate_bdev(device->bdev); 982 } 983 984 blkdev_put(device->bdev, device->mode); 985 } 986 987 static void btrfs_prepare_close_one_device(struct btrfs_device *device) 988 { 989 struct btrfs_fs_devices *fs_devices = device->fs_devices; 990 struct btrfs_device *new_device; 991 struct rcu_string *name; 992 993 if (device->bdev) 994 fs_devices->open_devices--; 995 996 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 997 device->devid != BTRFS_DEV_REPLACE_DEVID) { 998 list_del_init(&device->dev_alloc_list); 999 fs_devices->rw_devices--; 1000 } 1001 1002 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 1003 fs_devices->missing_devices--; 1004 1005 new_device = btrfs_alloc_device(NULL, &device->devid, 1006 device->uuid); 1007 BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ 1008 1009 /* Safe because we are under uuid_mutex */ 1010 if (device->name) { 1011 name = rcu_string_strdup(device->name->str, GFP_NOFS); 1012 BUG_ON(!name); /* -ENOMEM */ 1013 rcu_assign_pointer(new_device->name, name); 1014 } 1015 1016 list_replace_rcu(&device->dev_list, &new_device->dev_list); 1017 new_device->fs_devices = device->fs_devices; 1018 } 1019 1020 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 1021 { 1022 struct btrfs_device *device, *tmp; 1023 struct list_head pending_put; 1024 1025 INIT_LIST_HEAD(&pending_put); 1026 1027 if (--fs_devices->opened > 0) 1028 return 0; 1029 1030 mutex_lock(&fs_devices->device_list_mutex); 1031 list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { 1032 btrfs_prepare_close_one_device(device); 1033 list_add(&device->dev_list, &pending_put); 1034 } 1035 mutex_unlock(&fs_devices->device_list_mutex); 1036 1037 /* 1038 * btrfs_show_devname() is using the device_list_mutex, 1039 * sometimes call to blkdev_put() leads vfs calling 1040 * into this func. So do put outside of device_list_mutex, 1041 * as of now. 
1042 */ 1043 while (!list_empty(&pending_put)) { 1044 device = list_first_entry(&pending_put, 1045 struct btrfs_device, dev_list); 1046 list_del(&device->dev_list); 1047 btrfs_close_bdev(device); 1048 call_rcu(&device->rcu, free_device_rcu); 1049 } 1050 1051 WARN_ON(fs_devices->open_devices); 1052 WARN_ON(fs_devices->rw_devices); 1053 fs_devices->opened = 0; 1054 fs_devices->seeding = 0; 1055 1056 return 0; 1057 } 1058 1059 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 1060 { 1061 struct btrfs_fs_devices *seed_devices = NULL; 1062 int ret; 1063 1064 mutex_lock(&uuid_mutex); 1065 ret = __btrfs_close_devices(fs_devices); 1066 if (!fs_devices->opened) { 1067 seed_devices = fs_devices->seed; 1068 fs_devices->seed = NULL; 1069 } 1070 mutex_unlock(&uuid_mutex); 1071 1072 while (seed_devices) { 1073 fs_devices = seed_devices; 1074 seed_devices = fs_devices->seed; 1075 __btrfs_close_devices(fs_devices); 1076 free_fs_devices(fs_devices); 1077 } 1078 return ret; 1079 } 1080 1081 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 1082 fmode_t flags, void *holder) 1083 { 1084 struct list_head *head = &fs_devices->devices; 1085 struct btrfs_device *device; 1086 struct btrfs_device *latest_dev = NULL; 1087 int ret = 0; 1088 1089 flags |= FMODE_EXCL; 1090 1091 list_for_each_entry(device, head, dev_list) { 1092 /* Just open everything we can; ignore failures here */ 1093 if (btrfs_open_one_device(fs_devices, device, flags, holder)) 1094 continue; 1095 1096 if (!latest_dev || 1097 device->generation > latest_dev->generation) 1098 latest_dev = device; 1099 } 1100 if (fs_devices->open_devices == 0) { 1101 ret = -EINVAL; 1102 goto out; 1103 } 1104 fs_devices->opened = 1; 1105 fs_devices->latest_bdev = latest_dev->bdev; 1106 fs_devices->total_rw_bytes = 0; 1107 out: 1108 return ret; 1109 } 1110 1111 static int devid_cmp(void *priv, struct list_head *a, struct list_head *b) 1112 { 1113 struct btrfs_device *dev1, *dev2; 1114 1115 dev1 = list_entry(a, struct btrfs_device, dev_list); 1116 dev2 = list_entry(b, struct btrfs_device, dev_list); 1117 1118 if (dev1->devid < dev2->devid) 1119 return -1; 1120 else if (dev1->devid > dev2->devid) 1121 return 1; 1122 return 0; 1123 } 1124 1125 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 1126 fmode_t flags, void *holder) 1127 { 1128 int ret; 1129 1130 mutex_lock(&uuid_mutex); 1131 if (fs_devices->opened) { 1132 fs_devices->opened++; 1133 ret = 0; 1134 } else { 1135 list_sort(NULL, &fs_devices->devices, devid_cmp); 1136 ret = __btrfs_open_devices(fs_devices, flags, holder); 1137 } 1138 mutex_unlock(&uuid_mutex); 1139 return ret; 1140 } 1141 1142 static void btrfs_release_disk_super(struct page *page) 1143 { 1144 kunmap(page); 1145 put_page(page); 1146 } 1147 1148 static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, 1149 struct page **page, 1150 struct btrfs_super_block **disk_super) 1151 { 1152 void *p; 1153 pgoff_t index; 1154 1155 /* make sure our super fits in the device */ 1156 if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) 1157 return 1; 1158 1159 /* make sure our super fits in the page */ 1160 if (sizeof(**disk_super) > PAGE_SIZE) 1161 return 1; 1162 1163 /* make sure our super doesn't straddle pages on disk */ 1164 index = bytenr >> PAGE_SHIFT; 1165 if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index) 1166 return 1; 1167 1168 /* pull in the page with our super */ 1169 *page = read_cache_page_gfp(bdev->bd_inode->i_mapping, 1170 index, GFP_KERNEL); 1171 1172 if (IS_ERR_OR_NULL(*page)) 
1173 return 1; 1174 1175 p = kmap(*page); 1176 1177 /* align our pointer to the offset of the super block */ 1178 *disk_super = p + (bytenr & ~PAGE_MASK); 1179 1180 if (btrfs_super_bytenr(*disk_super) != bytenr || 1181 btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { 1182 btrfs_release_disk_super(*page); 1183 return 1; 1184 } 1185 1186 if ((*disk_super)->label[0] && 1187 (*disk_super)->label[BTRFS_LABEL_SIZE - 1]) 1188 (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0'; 1189 1190 return 0; 1191 } 1192 1193 /* 1194 * Look for a btrfs signature on a device. This may be called out of the mount path 1195 * and we are not allowed to call set_blocksize during the scan. The superblock 1196 * is read via pagecache 1197 */ 1198 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 1199 struct btrfs_fs_devices **fs_devices_ret) 1200 { 1201 struct btrfs_super_block *disk_super; 1202 struct btrfs_device *device; 1203 struct block_device *bdev; 1204 struct page *page; 1205 int ret = 0; 1206 u64 bytenr; 1207 1208 /* 1209 * we would like to check all the supers, but that would make 1210 * a btrfs mount succeed after a mkfs from a different FS. 1211 * So, we need to add a special mount option to scan for 1212 * later supers, using BTRFS_SUPER_MIRROR_MAX instead 1213 */ 1214 bytenr = btrfs_sb_offset(0); 1215 flags |= FMODE_EXCL; 1216 mutex_lock(&uuid_mutex); 1217 1218 bdev = blkdev_get_by_path(path, flags, holder); 1219 if (IS_ERR(bdev)) { 1220 ret = PTR_ERR(bdev); 1221 goto error; 1222 } 1223 1224 if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) { 1225 ret = -EINVAL; 1226 goto error_bdev_put; 1227 } 1228 1229 device = device_list_add(path, disk_super); 1230 if (IS_ERR(device)) 1231 ret = PTR_ERR(device); 1232 else 1233 *fs_devices_ret = device->fs_devices; 1234 1235 btrfs_release_disk_super(page); 1236 1237 error_bdev_put: 1238 blkdev_put(bdev, flags); 1239 error: 1240 mutex_unlock(&uuid_mutex); 1241 return ret; 1242 } 1243 1244 /* helper to account the used device space in the range */ 1245 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 1246 u64 end, u64 *length) 1247 { 1248 struct btrfs_key key; 1249 struct btrfs_root *root = device->fs_info->dev_root; 1250 struct btrfs_dev_extent *dev_extent; 1251 struct btrfs_path *path; 1252 u64 extent_end; 1253 int ret; 1254 int slot; 1255 struct extent_buffer *l; 1256 1257 *length = 0; 1258 1259 if (start >= device->total_bytes || 1260 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 1261 return 0; 1262 1263 path = btrfs_alloc_path(); 1264 if (!path) 1265 return -ENOMEM; 1266 path->reada = READA_FORWARD; 1267 1268 key.objectid = device->devid; 1269 key.offset = start; 1270 key.type = BTRFS_DEV_EXTENT_KEY; 1271 1272 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1273 if (ret < 0) 1274 goto out; 1275 if (ret > 0) { 1276 ret = btrfs_previous_item(root, path, key.objectid, key.type); 1277 if (ret < 0) 1278 goto out; 1279 } 1280 1281 while (1) { 1282 l = path->nodes[0]; 1283 slot = path->slots[0]; 1284 if (slot >= btrfs_header_nritems(l)) { 1285 ret = btrfs_next_leaf(root, path); 1286 if (ret == 0) 1287 continue; 1288 if (ret < 0) 1289 goto out; 1290 1291 break; 1292 } 1293 btrfs_item_key_to_cpu(l, &key, slot); 1294 1295 if (key.objectid < device->devid) 1296 goto next; 1297 1298 if (key.objectid > device->devid) 1299 break; 1300 1301 if (key.type != BTRFS_DEV_EXTENT_KEY) 1302 goto next; 1303 1304 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1305 extent_end = key.offset + 
btrfs_dev_extent_length(l, 1306 dev_extent); 1307 if (key.offset <= start && extent_end > end) { 1308 *length = end - start + 1; 1309 break; 1310 } else if (key.offset <= start && extent_end > start) 1311 *length += extent_end - start; 1312 else if (key.offset > start && extent_end <= end) 1313 *length += extent_end - key.offset; 1314 else if (key.offset > start && key.offset <= end) { 1315 *length += end - key.offset + 1; 1316 break; 1317 } else if (key.offset > end) 1318 break; 1319 1320 next: 1321 path->slots[0]++; 1322 } 1323 ret = 0; 1324 out: 1325 btrfs_free_path(path); 1326 return ret; 1327 } 1328 1329 static int contains_pending_extent(struct btrfs_transaction *transaction, 1330 struct btrfs_device *device, 1331 u64 *start, u64 len) 1332 { 1333 struct btrfs_fs_info *fs_info = device->fs_info; 1334 struct extent_map *em; 1335 struct list_head *search_list = &fs_info->pinned_chunks; 1336 int ret = 0; 1337 u64 physical_start = *start; 1338 1339 if (transaction) 1340 search_list = &transaction->pending_chunks; 1341 again: 1342 list_for_each_entry(em, search_list, list) { 1343 struct map_lookup *map; 1344 int i; 1345 1346 map = em->map_lookup; 1347 for (i = 0; i < map->num_stripes; i++) { 1348 u64 end; 1349 1350 if (map->stripes[i].dev != device) 1351 continue; 1352 if (map->stripes[i].physical >= physical_start + len || 1353 map->stripes[i].physical + em->orig_block_len <= 1354 physical_start) 1355 continue; 1356 /* 1357 * Make sure that while processing the pinned list we do 1358 * not override our *start with a lower value, because 1359 * we can have pinned chunks that fall within this 1360 * device hole and that have lower physical addresses 1361 * than the pending chunks we processed before. If we 1362 * do not take this special care we can end up getting 1363 * 2 pending chunks that start at the same physical 1364 * device offsets because the end offset of a pinned 1365 * chunk can be equal to the start offset of some 1366 * pending chunk. 1367 */ 1368 end = map->stripes[i].physical + em->orig_block_len; 1369 if (end > *start) { 1370 *start = end; 1371 ret = 1; 1372 } 1373 } 1374 } 1375 if (search_list != &fs_info->pinned_chunks) { 1376 search_list = &fs_info->pinned_chunks; 1377 goto again; 1378 } 1379 1380 return ret; 1381 } 1382 1383 1384 /* 1385 * find_free_dev_extent_start - find free space in the specified device 1386 * @device: the device which we search the free space in 1387 * @num_bytes: the size of the free space that we need 1388 * @search_start: the position from which to begin the search 1389 * @start: store the start of the free space. 1390 * @len: the size of the free space. that we find, or the size 1391 * of the max free space if we don't find suitable free space 1392 * 1393 * this uses a pretty simple search, the expectation is that it is 1394 * called very infrequently and that a given device has a small number 1395 * of extents 1396 * 1397 * @start is used to store the start of the free space if we find. But if we 1398 * don't find suitable free space, it will be used to store the start position 1399 * of the max free space. 1400 * 1401 * @len is used to store the size of the free space that we find. 1402 * But if we don't find suitable free space, it is used to store the size of 1403 * the max free space. 
1404 */ 1405 int find_free_dev_extent_start(struct btrfs_transaction *transaction, 1406 struct btrfs_device *device, u64 num_bytes, 1407 u64 search_start, u64 *start, u64 *len) 1408 { 1409 struct btrfs_fs_info *fs_info = device->fs_info; 1410 struct btrfs_root *root = fs_info->dev_root; 1411 struct btrfs_key key; 1412 struct btrfs_dev_extent *dev_extent; 1413 struct btrfs_path *path; 1414 u64 hole_size; 1415 u64 max_hole_start; 1416 u64 max_hole_size; 1417 u64 extent_end; 1418 u64 search_end = device->total_bytes; 1419 int ret; 1420 int slot; 1421 struct extent_buffer *l; 1422 1423 /* 1424 * We don't want to overwrite the superblock on the drive nor any area 1425 * used by the boot loader (grub for example), so we make sure to start 1426 * at an offset of at least 1MB. 1427 */ 1428 search_start = max_t(u64, search_start, SZ_1M); 1429 1430 path = btrfs_alloc_path(); 1431 if (!path) 1432 return -ENOMEM; 1433 1434 max_hole_start = search_start; 1435 max_hole_size = 0; 1436 1437 again: 1438 if (search_start >= search_end || 1439 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 1440 ret = -ENOSPC; 1441 goto out; 1442 } 1443 1444 path->reada = READA_FORWARD; 1445 path->search_commit_root = 1; 1446 path->skip_locking = 1; 1447 1448 key.objectid = device->devid; 1449 key.offset = search_start; 1450 key.type = BTRFS_DEV_EXTENT_KEY; 1451 1452 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1453 if (ret < 0) 1454 goto out; 1455 if (ret > 0) { 1456 ret = btrfs_previous_item(root, path, key.objectid, key.type); 1457 if (ret < 0) 1458 goto out; 1459 } 1460 1461 while (1) { 1462 l = path->nodes[0]; 1463 slot = path->slots[0]; 1464 if (slot >= btrfs_header_nritems(l)) { 1465 ret = btrfs_next_leaf(root, path); 1466 if (ret == 0) 1467 continue; 1468 if (ret < 0) 1469 goto out; 1470 1471 break; 1472 } 1473 btrfs_item_key_to_cpu(l, &key, slot); 1474 1475 if (key.objectid < device->devid) 1476 goto next; 1477 1478 if (key.objectid > device->devid) 1479 break; 1480 1481 if (key.type != BTRFS_DEV_EXTENT_KEY) 1482 goto next; 1483 1484 if (key.offset > search_start) { 1485 hole_size = key.offset - search_start; 1486 1487 /* 1488 * Have to check before we set max_hole_start, otherwise 1489 * we could end up sending back this offset anyway. 1490 */ 1491 if (contains_pending_extent(transaction, device, 1492 &search_start, 1493 hole_size)) { 1494 if (key.offset >= search_start) { 1495 hole_size = key.offset - search_start; 1496 } else { 1497 WARN_ON_ONCE(1); 1498 hole_size = 0; 1499 } 1500 } 1501 1502 if (hole_size > max_hole_size) { 1503 max_hole_start = search_start; 1504 max_hole_size = hole_size; 1505 } 1506 1507 /* 1508 * If this free space is greater than which we need, 1509 * it must be the max free space that we have found 1510 * until now, so max_hole_start must point to the start 1511 * of this free space and the length of this free space 1512 * is stored in max_hole_size. Thus, we return 1513 * max_hole_start and max_hole_size and go back to the 1514 * caller. 
1515 */ 1516 if (hole_size >= num_bytes) { 1517 ret = 0; 1518 goto out; 1519 } 1520 } 1521 1522 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1523 extent_end = key.offset + btrfs_dev_extent_length(l, 1524 dev_extent); 1525 if (extent_end > search_start) 1526 search_start = extent_end; 1527 next: 1528 path->slots[0]++; 1529 cond_resched(); 1530 } 1531 1532 /* 1533 * At this point, search_start should be the end of 1534 * allocated dev extents, and when shrinking the device, 1535 * search_end may be smaller than search_start. 1536 */ 1537 if (search_end > search_start) { 1538 hole_size = search_end - search_start; 1539 1540 if (contains_pending_extent(transaction, device, &search_start, 1541 hole_size)) { 1542 btrfs_release_path(path); 1543 goto again; 1544 } 1545 1546 if (hole_size > max_hole_size) { 1547 max_hole_start = search_start; 1548 max_hole_size = hole_size; 1549 } 1550 } 1551 1552 /* See above. */ 1553 if (max_hole_size < num_bytes) 1554 ret = -ENOSPC; 1555 else 1556 ret = 0; 1557 1558 out: 1559 btrfs_free_path(path); 1560 *start = max_hole_start; 1561 if (len) 1562 *len = max_hole_size; 1563 return ret; 1564 } 1565 1566 int find_free_dev_extent(struct btrfs_trans_handle *trans, 1567 struct btrfs_device *device, u64 num_bytes, 1568 u64 *start, u64 *len) 1569 { 1570 /* FIXME use last free of some kind */ 1571 return find_free_dev_extent_start(trans->transaction, device, 1572 num_bytes, 0, start, len); 1573 } 1574 1575 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1576 struct btrfs_device *device, 1577 u64 start, u64 *dev_extent_len) 1578 { 1579 struct btrfs_fs_info *fs_info = device->fs_info; 1580 struct btrfs_root *root = fs_info->dev_root; 1581 int ret; 1582 struct btrfs_path *path; 1583 struct btrfs_key key; 1584 struct btrfs_key found_key; 1585 struct extent_buffer *leaf = NULL; 1586 struct btrfs_dev_extent *extent = NULL; 1587 1588 path = btrfs_alloc_path(); 1589 if (!path) 1590 return -ENOMEM; 1591 1592 key.objectid = device->devid; 1593 key.offset = start; 1594 key.type = BTRFS_DEV_EXTENT_KEY; 1595 again: 1596 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1597 if (ret > 0) { 1598 ret = btrfs_previous_item(root, path, key.objectid, 1599 BTRFS_DEV_EXTENT_KEY); 1600 if (ret) 1601 goto out; 1602 leaf = path->nodes[0]; 1603 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1604 extent = btrfs_item_ptr(leaf, path->slots[0], 1605 struct btrfs_dev_extent); 1606 BUG_ON(found_key.offset > start || found_key.offset + 1607 btrfs_dev_extent_length(leaf, extent) < start); 1608 key = found_key; 1609 btrfs_release_path(path); 1610 goto again; 1611 } else if (ret == 0) { 1612 leaf = path->nodes[0]; 1613 extent = btrfs_item_ptr(leaf, path->slots[0], 1614 struct btrfs_dev_extent); 1615 } else { 1616 btrfs_handle_fs_error(fs_info, ret, "Slot search failed"); 1617 goto out; 1618 } 1619 1620 *dev_extent_len = btrfs_dev_extent_length(leaf, extent); 1621 1622 ret = btrfs_del_item(trans, root, path); 1623 if (ret) { 1624 btrfs_handle_fs_error(fs_info, ret, 1625 "Failed to remove dev extent item"); 1626 } else { 1627 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); 1628 } 1629 out: 1630 btrfs_free_path(path); 1631 return ret; 1632 } 1633 1634 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 1635 struct btrfs_device *device, 1636 u64 chunk_offset, u64 start, u64 num_bytes) 1637 { 1638 int ret; 1639 struct btrfs_path *path; 1640 struct btrfs_fs_info *fs_info = device->fs_info; 1641 struct btrfs_root *root = 
fs_info->dev_root; 1642 struct btrfs_dev_extent *extent; 1643 struct extent_buffer *leaf; 1644 struct btrfs_key key; 1645 1646 WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); 1647 WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); 1648 path = btrfs_alloc_path(); 1649 if (!path) 1650 return -ENOMEM; 1651 1652 key.objectid = device->devid; 1653 key.offset = start; 1654 key.type = BTRFS_DEV_EXTENT_KEY; 1655 ret = btrfs_insert_empty_item(trans, root, path, &key, 1656 sizeof(*extent)); 1657 if (ret) 1658 goto out; 1659 1660 leaf = path->nodes[0]; 1661 extent = btrfs_item_ptr(leaf, path->slots[0], 1662 struct btrfs_dev_extent); 1663 btrfs_set_dev_extent_chunk_tree(leaf, extent, 1664 BTRFS_CHUNK_TREE_OBJECTID); 1665 btrfs_set_dev_extent_chunk_objectid(leaf, extent, 1666 BTRFS_FIRST_CHUNK_TREE_OBJECTID); 1667 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 1668 1669 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 1670 btrfs_mark_buffer_dirty(leaf); 1671 out: 1672 btrfs_free_path(path); 1673 return ret; 1674 } 1675 1676 static u64 find_next_chunk(struct btrfs_fs_info *fs_info) 1677 { 1678 struct extent_map_tree *em_tree; 1679 struct extent_map *em; 1680 struct rb_node *n; 1681 u64 ret = 0; 1682 1683 em_tree = &fs_info->mapping_tree.map_tree; 1684 read_lock(&em_tree->lock); 1685 n = rb_last(&em_tree->map); 1686 if (n) { 1687 em = rb_entry(n, struct extent_map, rb_node); 1688 ret = em->start + em->len; 1689 } 1690 read_unlock(&em_tree->lock); 1691 1692 return ret; 1693 } 1694 1695 static noinline int find_next_devid(struct btrfs_fs_info *fs_info, 1696 u64 *devid_ret) 1697 { 1698 int ret; 1699 struct btrfs_key key; 1700 struct btrfs_key found_key; 1701 struct btrfs_path *path; 1702 1703 path = btrfs_alloc_path(); 1704 if (!path) 1705 return -ENOMEM; 1706 1707 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1708 key.type = BTRFS_DEV_ITEM_KEY; 1709 key.offset = (u64)-1; 1710 1711 ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); 1712 if (ret < 0) 1713 goto error; 1714 1715 BUG_ON(ret == 0); /* Corruption */ 1716 1717 ret = btrfs_previous_item(fs_info->chunk_root, path, 1718 BTRFS_DEV_ITEMS_OBJECTID, 1719 BTRFS_DEV_ITEM_KEY); 1720 if (ret) { 1721 *devid_ret = 1; 1722 } else { 1723 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1724 path->slots[0]); 1725 *devid_ret = found_key.offset + 1; 1726 } 1727 ret = 0; 1728 error: 1729 btrfs_free_path(path); 1730 return ret; 1731 } 1732 1733 /* 1734 * the device information is stored in the chunk root 1735 * the btrfs_device struct should be fully filled in 1736 */ 1737 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, 1738 struct btrfs_fs_info *fs_info, 1739 struct btrfs_device *device) 1740 { 1741 struct btrfs_root *root = fs_info->chunk_root; 1742 int ret; 1743 struct btrfs_path *path; 1744 struct btrfs_dev_item *dev_item; 1745 struct extent_buffer *leaf; 1746 struct btrfs_key key; 1747 unsigned long ptr; 1748 1749 path = btrfs_alloc_path(); 1750 if (!path) 1751 return -ENOMEM; 1752 1753 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1754 key.type = BTRFS_DEV_ITEM_KEY; 1755 key.offset = device->devid; 1756 1757 ret = btrfs_insert_empty_item(trans, root, path, &key, 1758 sizeof(*dev_item)); 1759 if (ret) 1760 goto out; 1761 1762 leaf = path->nodes[0]; 1763 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1764 1765 btrfs_set_device_id(leaf, dev_item, device->devid); 1766 btrfs_set_device_generation(leaf, dev_item, 0); 1767 btrfs_set_device_type(leaf, 
dev_item, device->type); 1768 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1769 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1770 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1771 btrfs_set_device_total_bytes(leaf, dev_item, 1772 btrfs_device_get_disk_total_bytes(device)); 1773 btrfs_set_device_bytes_used(leaf, dev_item, 1774 btrfs_device_get_bytes_used(device)); 1775 btrfs_set_device_group(leaf, dev_item, 0); 1776 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1777 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1778 btrfs_set_device_start_offset(leaf, dev_item, 0); 1779 1780 ptr = btrfs_device_uuid(dev_item); 1781 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1782 ptr = btrfs_device_fsid(dev_item); 1783 write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE); 1784 btrfs_mark_buffer_dirty(leaf); 1785 1786 ret = 0; 1787 out: 1788 btrfs_free_path(path); 1789 return ret; 1790 } 1791 1792 /* 1793 * Function to update ctime/mtime for a given device path. 1794 * Mainly used for ctime/mtime based probe like libblkid. 1795 */ 1796 static void update_dev_time(const char *path_name) 1797 { 1798 struct file *filp; 1799 1800 filp = filp_open(path_name, O_RDWR, 0); 1801 if (IS_ERR(filp)) 1802 return; 1803 file_update_time(filp); 1804 filp_close(filp, NULL); 1805 } 1806 1807 static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info, 1808 struct btrfs_device *device) 1809 { 1810 struct btrfs_root *root = fs_info->chunk_root; 1811 int ret; 1812 struct btrfs_path *path; 1813 struct btrfs_key key; 1814 struct btrfs_trans_handle *trans; 1815 1816 path = btrfs_alloc_path(); 1817 if (!path) 1818 return -ENOMEM; 1819 1820 trans = btrfs_start_transaction(root, 0); 1821 if (IS_ERR(trans)) { 1822 btrfs_free_path(path); 1823 return PTR_ERR(trans); 1824 } 1825 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1826 key.type = BTRFS_DEV_ITEM_KEY; 1827 key.offset = device->devid; 1828 1829 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1830 if (ret) { 1831 if (ret > 0) 1832 ret = -ENOENT; 1833 btrfs_abort_transaction(trans, ret); 1834 btrfs_end_transaction(trans); 1835 goto out; 1836 } 1837 1838 ret = btrfs_del_item(trans, root, path); 1839 if (ret) { 1840 btrfs_abort_transaction(trans, ret); 1841 btrfs_end_transaction(trans); 1842 } 1843 1844 out: 1845 btrfs_free_path(path); 1846 if (!ret) 1847 ret = btrfs_commit_transaction(trans); 1848 return ret; 1849 } 1850 1851 /* 1852 * Verify that @num_devices satisfies the RAID profile constraints in the whole 1853 * filesystem. It's up to the caller to adjust that number regarding eg. device 1854 * replace. 
1855 */ 1856 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, 1857 u64 num_devices) 1858 { 1859 u64 all_avail; 1860 unsigned seq; 1861 int i; 1862 1863 do { 1864 seq = read_seqbegin(&fs_info->profiles_lock); 1865 1866 all_avail = fs_info->avail_data_alloc_bits | 1867 fs_info->avail_system_alloc_bits | 1868 fs_info->avail_metadata_alloc_bits; 1869 } while (read_seqretry(&fs_info->profiles_lock, seq)); 1870 1871 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 1872 if (!(all_avail & btrfs_raid_group[i])) 1873 continue; 1874 1875 if (num_devices < btrfs_raid_array[i].devs_min) { 1876 int ret = btrfs_raid_mindev_error[i]; 1877 1878 if (ret) 1879 return ret; 1880 } 1881 } 1882 1883 return 0; 1884 } 1885 1886 static struct btrfs_device * btrfs_find_next_active_device( 1887 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) 1888 { 1889 struct btrfs_device *next_device; 1890 1891 list_for_each_entry(next_device, &fs_devs->devices, dev_list) { 1892 if (next_device != device && 1893 !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) 1894 && next_device->bdev) 1895 return next_device; 1896 } 1897 1898 return NULL; 1899 } 1900 1901 /* 1902 * Helper function to check if the given device is part of s_bdev / latest_bdev 1903 * and replace it with the provided or the next active device, in the context 1904 * where this function called, there should be always be another device (or 1905 * this_dev) which is active. 1906 */ 1907 void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, 1908 struct btrfs_device *device, struct btrfs_device *this_dev) 1909 { 1910 struct btrfs_device *next_device; 1911 1912 if (this_dev) 1913 next_device = this_dev; 1914 else 1915 next_device = btrfs_find_next_active_device(fs_info->fs_devices, 1916 device); 1917 ASSERT(next_device); 1918 1919 if (fs_info->sb->s_bdev && 1920 (fs_info->sb->s_bdev == device->bdev)) 1921 fs_info->sb->s_bdev = next_device->bdev; 1922 1923 if (fs_info->fs_devices->latest_bdev == device->bdev) 1924 fs_info->fs_devices->latest_bdev = next_device->bdev; 1925 } 1926 1927 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, 1928 u64 devid) 1929 { 1930 struct btrfs_device *device; 1931 struct btrfs_fs_devices *cur_devices; 1932 u64 num_devices; 1933 int ret = 0; 1934 1935 mutex_lock(&fs_info->volume_mutex); 1936 mutex_lock(&uuid_mutex); 1937 1938 num_devices = fs_info->fs_devices->num_devices; 1939 btrfs_dev_replace_read_lock(&fs_info->dev_replace); 1940 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 1941 WARN_ON(num_devices < 1); 1942 num_devices--; 1943 } 1944 btrfs_dev_replace_read_unlock(&fs_info->dev_replace); 1945 1946 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 1947 if (ret) 1948 goto out; 1949 1950 ret = btrfs_find_device_by_devspec(fs_info, devid, device_path, 1951 &device); 1952 if (ret) 1953 goto out; 1954 1955 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 1956 ret = BTRFS_ERROR_DEV_TGT_REPLACE; 1957 goto out; 1958 } 1959 1960 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 1961 fs_info->fs_devices->rw_devices == 1) { 1962 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; 1963 goto out; 1964 } 1965 1966 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 1967 mutex_lock(&fs_info->chunk_mutex); 1968 list_del_init(&device->dev_alloc_list); 1969 device->fs_devices->rw_devices--; 1970 mutex_unlock(&fs_info->chunk_mutex); 1971 } 1972 1973 mutex_unlock(&uuid_mutex); 1974 ret = btrfs_shrink_device(device, 0); 1975 
mutex_lock(&uuid_mutex); 1976 if (ret) 1977 goto error_undo; 1978 1979 /* 1980 * TODO: the superblock still includes this device in its num_devices 1981 * counter although write_all_supers() is not locked out. This 1982 * could give a filesystem state which requires a degraded mount. 1983 */ 1984 ret = btrfs_rm_dev_item(fs_info, device); 1985 if (ret) 1986 goto error_undo; 1987 1988 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 1989 btrfs_scrub_cancel_dev(fs_info, device); 1990 1991 /* 1992 * the device list mutex makes sure that we don't change 1993 * the device list while someone else is writing out all 1994 * the device supers. Whoever is writing all supers, should 1995 * lock the device list mutex before getting the number of 1996 * devices in the super block (super_copy). Conversely, 1997 * whoever updates the number of devices in the super block 1998 * (super_copy) should hold the device list mutex. 1999 */ 2000 2001 cur_devices = device->fs_devices; 2002 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2003 list_del_rcu(&device->dev_list); 2004 2005 device->fs_devices->num_devices--; 2006 device->fs_devices->total_devices--; 2007 2008 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 2009 device->fs_devices->missing_devices--; 2010 2011 btrfs_assign_next_active_device(fs_info, device, NULL); 2012 2013 if (device->bdev) { 2014 device->fs_devices->open_devices--; 2015 /* remove sysfs entry */ 2016 btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); 2017 } 2018 2019 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 2020 btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 2021 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2022 2023 /* 2024 * at this point, the device is zero sized and detached from 2025 * the devices list. All that's left is to zero out the old 2026 * supers and free the device. 2027 */ 2028 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2029 btrfs_scratch_superblocks(device->bdev, device->name->str); 2030 2031 btrfs_close_bdev(device); 2032 call_rcu(&device->rcu, free_device_rcu); 2033 2034 if (cur_devices->open_devices == 0) { 2035 struct btrfs_fs_devices *fs_devices; 2036 fs_devices = fs_info->fs_devices; 2037 while (fs_devices) { 2038 if (fs_devices->seed == cur_devices) { 2039 fs_devices->seed = cur_devices->seed; 2040 break; 2041 } 2042 fs_devices = fs_devices->seed; 2043 } 2044 cur_devices->seed = NULL; 2045 __btrfs_close_devices(cur_devices); 2046 free_fs_devices(cur_devices); 2047 } 2048 2049 out: 2050 mutex_unlock(&uuid_mutex); 2051 mutex_unlock(&fs_info->volume_mutex); 2052 return ret; 2053 2054 error_undo: 2055 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2056 mutex_lock(&fs_info->chunk_mutex); 2057 list_add(&device->dev_alloc_list, 2058 &fs_info->fs_devices->alloc_list); 2059 device->fs_devices->rw_devices++; 2060 mutex_unlock(&fs_info->chunk_mutex); 2061 } 2062 goto out; 2063 } 2064 2065 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, 2066 struct btrfs_device *srcdev) 2067 { 2068 struct btrfs_fs_devices *fs_devices; 2069 2070 lockdep_assert_held(&fs_info->fs_devices->device_list_mutex); 2071 2072 /* 2073 * in case of fs with no seed, srcdev->fs_devices will point 2074 * to fs_devices of fs_info. However when the dev being replaced is 2075 * a seed dev it will point to the seed's local fs_devices. In short 2076 * srcdev will have its correct fs_devices in both the cases. 
 */
	fs_devices = srcdev->fs_devices;

	list_del_rcu(&srcdev->dev_list);
	list_del(&srcdev->dev_alloc_list);
	fs_devices->num_devices--;
	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
		fs_devices->missing_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
		fs_devices->rw_devices--;

	if (srcdev->bdev)
		fs_devices->open_devices--;
}

void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
				      struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
		/* zero out the old super if it is writable */
		btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
	}

	btrfs_close_bdev(srcdev);
	call_rcu(&srcdev->rcu, free_device_rcu);

	/* if there are no devices left, delete the fs_devices as well */
	if (!fs_devices->num_devices) {
		struct btrfs_fs_devices *tmp_fs_devices;

		/*
		 * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target is added to the sprout FS, so there will be no more
		 * devices left under the seed FS.
		 */
		ASSERT(fs_devices->seeding);

		tmp_fs_devices = fs_info->fs_devices;
		while (tmp_fs_devices) {
			if (tmp_fs_devices->seed == fs_devices) {
				tmp_fs_devices->seed = fs_devices->seed;
				break;
			}
			tmp_fs_devices = tmp_fs_devices->seed;
		}
		fs_devices->seed = NULL;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
}

void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
				      struct btrfs_device *tgtdev)
{
	mutex_lock(&uuid_mutex);
	WARN_ON(!tgtdev);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);

	btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);

	if (tgtdev->bdev)
		fs_info->fs_devices->open_devices--;

	fs_info->fs_devices->num_devices--;

	btrfs_assign_next_active_device(fs_info, tgtdev, NULL);

	list_del_rcu(&tgtdev->dev_list);

	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
	mutex_unlock(&uuid_mutex);

	/*
	 * The update_dev_time() within btrfs_scratch_superblocks() may lead
	 * to a call to btrfs_show_devname() which will try to hold
	 * device_list_mutex. At this point the device is already out of the
	 * device list, so we don't have to hold the device_list_mutex lock.
2159 */ 2160 btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); 2161 2162 btrfs_close_bdev(tgtdev); 2163 call_rcu(&tgtdev->rcu, free_device_rcu); 2164 } 2165 2166 static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, 2167 const char *device_path, 2168 struct btrfs_device **device) 2169 { 2170 int ret = 0; 2171 struct btrfs_super_block *disk_super; 2172 u64 devid; 2173 u8 *dev_uuid; 2174 struct block_device *bdev; 2175 struct buffer_head *bh; 2176 2177 *device = NULL; 2178 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 2179 fs_info->bdev_holder, 0, &bdev, &bh); 2180 if (ret) 2181 return ret; 2182 disk_super = (struct btrfs_super_block *)bh->b_data; 2183 devid = btrfs_stack_device_id(&disk_super->dev_item); 2184 dev_uuid = disk_super->dev_item.uuid; 2185 *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); 2186 brelse(bh); 2187 if (!*device) 2188 ret = -ENOENT; 2189 blkdev_put(bdev, FMODE_READ); 2190 return ret; 2191 } 2192 2193 int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, 2194 const char *device_path, 2195 struct btrfs_device **device) 2196 { 2197 *device = NULL; 2198 if (strcmp(device_path, "missing") == 0) { 2199 struct list_head *devices; 2200 struct btrfs_device *tmp; 2201 2202 devices = &fs_info->fs_devices->devices; 2203 /* 2204 * It is safe to read the devices since the volume_mutex 2205 * is held by the caller. 2206 */ 2207 list_for_each_entry(tmp, devices, dev_list) { 2208 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 2209 &tmp->dev_state) && !tmp->bdev) { 2210 *device = tmp; 2211 break; 2212 } 2213 } 2214 2215 if (!*device) 2216 return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2217 2218 return 0; 2219 } else { 2220 return btrfs_find_device_by_path(fs_info, device_path, device); 2221 } 2222 } 2223 2224 /* 2225 * Lookup a device given by device id, or the path if the id is 0. 2226 */ 2227 int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, 2228 const char *devpath, 2229 struct btrfs_device **device) 2230 { 2231 int ret; 2232 2233 if (devid) { 2234 ret = 0; 2235 *device = btrfs_find_device(fs_info, devid, NULL, NULL); 2236 if (!*device) 2237 ret = -ENOENT; 2238 } else { 2239 if (!devpath || !devpath[0]) 2240 return -EINVAL; 2241 2242 ret = btrfs_find_device_missing_or_by_path(fs_info, devpath, 2243 device); 2244 } 2245 return ret; 2246 } 2247 2248 /* 2249 * does all the dirty work required for changing file system's UUID. 
2250 */ 2251 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) 2252 { 2253 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2254 struct btrfs_fs_devices *old_devices; 2255 struct btrfs_fs_devices *seed_devices; 2256 struct btrfs_super_block *disk_super = fs_info->super_copy; 2257 struct btrfs_device *device; 2258 u64 super_flags; 2259 2260 lockdep_assert_held(&uuid_mutex); 2261 if (!fs_devices->seeding) 2262 return -EINVAL; 2263 2264 seed_devices = alloc_fs_devices(NULL); 2265 if (IS_ERR(seed_devices)) 2266 return PTR_ERR(seed_devices); 2267 2268 old_devices = clone_fs_devices(fs_devices); 2269 if (IS_ERR(old_devices)) { 2270 kfree(seed_devices); 2271 return PTR_ERR(old_devices); 2272 } 2273 2274 list_add(&old_devices->list, &fs_uuids); 2275 2276 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2277 seed_devices->opened = 1; 2278 INIT_LIST_HEAD(&seed_devices->devices); 2279 INIT_LIST_HEAD(&seed_devices->alloc_list); 2280 mutex_init(&seed_devices->device_list_mutex); 2281 2282 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2283 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 2284 synchronize_rcu); 2285 list_for_each_entry(device, &seed_devices->devices, dev_list) 2286 device->fs_devices = seed_devices; 2287 2288 mutex_lock(&fs_info->chunk_mutex); 2289 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 2290 mutex_unlock(&fs_info->chunk_mutex); 2291 2292 fs_devices->seeding = 0; 2293 fs_devices->num_devices = 0; 2294 fs_devices->open_devices = 0; 2295 fs_devices->missing_devices = 0; 2296 fs_devices->rotating = 0; 2297 fs_devices->seed = seed_devices; 2298 2299 generate_random_uuid(fs_devices->fsid); 2300 memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2301 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2302 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2303 2304 super_flags = btrfs_super_flags(disk_super) & 2305 ~BTRFS_SUPER_FLAG_SEEDING; 2306 btrfs_set_super_flags(disk_super, super_flags); 2307 2308 return 0; 2309 } 2310 2311 /* 2312 * Store the expected generation for seed devices in device items. 
2313 */ 2314 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 2315 struct btrfs_fs_info *fs_info) 2316 { 2317 struct btrfs_root *root = fs_info->chunk_root; 2318 struct btrfs_path *path; 2319 struct extent_buffer *leaf; 2320 struct btrfs_dev_item *dev_item; 2321 struct btrfs_device *device; 2322 struct btrfs_key key; 2323 u8 fs_uuid[BTRFS_FSID_SIZE]; 2324 u8 dev_uuid[BTRFS_UUID_SIZE]; 2325 u64 devid; 2326 int ret; 2327 2328 path = btrfs_alloc_path(); 2329 if (!path) 2330 return -ENOMEM; 2331 2332 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2333 key.offset = 0; 2334 key.type = BTRFS_DEV_ITEM_KEY; 2335 2336 while (1) { 2337 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2338 if (ret < 0) 2339 goto error; 2340 2341 leaf = path->nodes[0]; 2342 next_slot: 2343 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2344 ret = btrfs_next_leaf(root, path); 2345 if (ret > 0) 2346 break; 2347 if (ret < 0) 2348 goto error; 2349 leaf = path->nodes[0]; 2350 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2351 btrfs_release_path(path); 2352 continue; 2353 } 2354 2355 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2356 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2357 key.type != BTRFS_DEV_ITEM_KEY) 2358 break; 2359 2360 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2361 struct btrfs_dev_item); 2362 devid = btrfs_device_id(leaf, dev_item); 2363 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2364 BTRFS_UUID_SIZE); 2365 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2366 BTRFS_FSID_SIZE); 2367 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 2368 BUG_ON(!device); /* Logic error */ 2369 2370 if (device->fs_devices->seeding) { 2371 btrfs_set_device_generation(leaf, dev_item, 2372 device->generation); 2373 btrfs_mark_buffer_dirty(leaf); 2374 } 2375 2376 path->slots[0]++; 2377 goto next_slot; 2378 } 2379 ret = 0; 2380 error: 2381 btrfs_free_path(path); 2382 return ret; 2383 } 2384 2385 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2386 { 2387 struct btrfs_root *root = fs_info->dev_root; 2388 struct request_queue *q; 2389 struct btrfs_trans_handle *trans; 2390 struct btrfs_device *device; 2391 struct block_device *bdev; 2392 struct list_head *devices; 2393 struct super_block *sb = fs_info->sb; 2394 struct rcu_string *name; 2395 u64 tmp; 2396 int seeding_dev = 0; 2397 int ret = 0; 2398 bool unlocked = false; 2399 2400 if (sb_rdonly(sb) && !fs_info->fs_devices->seeding) 2401 return -EROFS; 2402 2403 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2404 fs_info->bdev_holder); 2405 if (IS_ERR(bdev)) 2406 return PTR_ERR(bdev); 2407 2408 if (fs_info->fs_devices->seeding) { 2409 seeding_dev = 1; 2410 down_write(&sb->s_umount); 2411 mutex_lock(&uuid_mutex); 2412 } 2413 2414 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2415 2416 devices = &fs_info->fs_devices->devices; 2417 2418 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2419 list_for_each_entry(device, devices, dev_list) { 2420 if (device->bdev == bdev) { 2421 ret = -EEXIST; 2422 mutex_unlock( 2423 &fs_info->fs_devices->device_list_mutex); 2424 goto error; 2425 } 2426 } 2427 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2428 2429 device = btrfs_alloc_device(fs_info, NULL, NULL); 2430 if (IS_ERR(device)) { 2431 /* we can safely leave the fs_devices entry around */ 2432 ret = PTR_ERR(device); 2433 goto error; 2434 } 2435 2436 name = rcu_string_strdup(device_path, GFP_KERNEL); 2437 if (!name) { 2438 ret = -ENOMEM; 2439 
goto error_free_device; 2440 } 2441 rcu_assign_pointer(device->name, name); 2442 2443 trans = btrfs_start_transaction(root, 0); 2444 if (IS_ERR(trans)) { 2445 ret = PTR_ERR(trans); 2446 goto error_free_device; 2447 } 2448 2449 q = bdev_get_queue(bdev); 2450 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2451 device->generation = trans->transid; 2452 device->io_width = fs_info->sectorsize; 2453 device->io_align = fs_info->sectorsize; 2454 device->sector_size = fs_info->sectorsize; 2455 device->total_bytes = round_down(i_size_read(bdev->bd_inode), 2456 fs_info->sectorsize); 2457 device->disk_total_bytes = device->total_bytes; 2458 device->commit_total_bytes = device->total_bytes; 2459 device->fs_info = fs_info; 2460 device->bdev = bdev; 2461 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2462 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2463 device->mode = FMODE_EXCL; 2464 device->dev_stats_valid = 1; 2465 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2466 2467 if (seeding_dev) { 2468 sb->s_flags &= ~SB_RDONLY; 2469 ret = btrfs_prepare_sprout(fs_info); 2470 if (ret) { 2471 btrfs_abort_transaction(trans, ret); 2472 goto error_trans; 2473 } 2474 } 2475 2476 device->fs_devices = fs_info->fs_devices; 2477 2478 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2479 mutex_lock(&fs_info->chunk_mutex); 2480 list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices); 2481 list_add(&device->dev_alloc_list, 2482 &fs_info->fs_devices->alloc_list); 2483 fs_info->fs_devices->num_devices++; 2484 fs_info->fs_devices->open_devices++; 2485 fs_info->fs_devices->rw_devices++; 2486 fs_info->fs_devices->total_devices++; 2487 fs_info->fs_devices->total_rw_bytes += device->total_bytes; 2488 2489 atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 2490 2491 if (!blk_queue_nonrot(q)) 2492 fs_info->fs_devices->rotating = 1; 2493 2494 tmp = btrfs_super_total_bytes(fs_info->super_copy); 2495 btrfs_set_super_total_bytes(fs_info->super_copy, 2496 round_down(tmp + device->total_bytes, fs_info->sectorsize)); 2497 2498 tmp = btrfs_super_num_devices(fs_info->super_copy); 2499 btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1); 2500 2501 /* add sysfs device entry */ 2502 btrfs_sysfs_add_device_link(fs_info->fs_devices, device); 2503 2504 /* 2505 * we've got more storage, clear any full flags on the space 2506 * infos 2507 */ 2508 btrfs_clear_space_info_full(fs_info); 2509 2510 mutex_unlock(&fs_info->chunk_mutex); 2511 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2512 2513 if (seeding_dev) { 2514 mutex_lock(&fs_info->chunk_mutex); 2515 ret = init_first_rw_device(trans, fs_info); 2516 mutex_unlock(&fs_info->chunk_mutex); 2517 if (ret) { 2518 btrfs_abort_transaction(trans, ret); 2519 goto error_sysfs; 2520 } 2521 } 2522 2523 ret = btrfs_add_dev_item(trans, fs_info, device); 2524 if (ret) { 2525 btrfs_abort_transaction(trans, ret); 2526 goto error_sysfs; 2527 } 2528 2529 if (seeding_dev) { 2530 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; 2531 2532 ret = btrfs_finish_sprout(trans, fs_info); 2533 if (ret) { 2534 btrfs_abort_transaction(trans, ret); 2535 goto error_sysfs; 2536 } 2537 2538 /* Sprouting would change fsid of the mounted root, 2539 * so rename the fsid on the sysfs 2540 */ 2541 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", 2542 fs_info->fsid); 2543 if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf)) 2544 btrfs_warn(fs_info, 2545 "sysfs: failed to create fsid for sprout"); 2546 } 2547 2548 ret = btrfs_commit_transaction(trans); 
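	/*
	 * For a seeding conversion, the locks taken above (uuid_mutex and
	 * s_umount) are released first and only then are the system chunks
	 * relocated; if that relocation fails, the error message below points
	 * the admin at "btrfs balance" to finish the job.
	 */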
2549 2550 if (seeding_dev) { 2551 mutex_unlock(&uuid_mutex); 2552 up_write(&sb->s_umount); 2553 unlocked = true; 2554 2555 if (ret) /* transaction commit */ 2556 return ret; 2557 2558 ret = btrfs_relocate_sys_chunks(fs_info); 2559 if (ret < 0) 2560 btrfs_handle_fs_error(fs_info, ret, 2561 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2562 trans = btrfs_attach_transaction(root); 2563 if (IS_ERR(trans)) { 2564 if (PTR_ERR(trans) == -ENOENT) 2565 return 0; 2566 ret = PTR_ERR(trans); 2567 trans = NULL; 2568 goto error_sysfs; 2569 } 2570 ret = btrfs_commit_transaction(trans); 2571 } 2572 2573 /* Update ctime/mtime for libblkid */ 2574 update_dev_time(device_path); 2575 return ret; 2576 2577 error_sysfs: 2578 btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); 2579 error_trans: 2580 if (seeding_dev) 2581 sb->s_flags |= SB_RDONLY; 2582 if (trans) 2583 btrfs_end_transaction(trans); 2584 error_free_device: 2585 free_device(device); 2586 error: 2587 blkdev_put(bdev, FMODE_EXCL); 2588 if (seeding_dev && !unlocked) { 2589 mutex_unlock(&uuid_mutex); 2590 up_write(&sb->s_umount); 2591 } 2592 return ret; 2593 } 2594 2595 int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 2596 const char *device_path, 2597 struct btrfs_device *srcdev, 2598 struct btrfs_device **device_out) 2599 { 2600 struct btrfs_device *device; 2601 struct block_device *bdev; 2602 struct list_head *devices; 2603 struct rcu_string *name; 2604 u64 devid = BTRFS_DEV_REPLACE_DEVID; 2605 int ret = 0; 2606 2607 *device_out = NULL; 2608 if (fs_info->fs_devices->seeding) { 2609 btrfs_err(fs_info, "the filesystem is a seed filesystem!"); 2610 return -EINVAL; 2611 } 2612 2613 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2614 fs_info->bdev_holder); 2615 if (IS_ERR(bdev)) { 2616 btrfs_err(fs_info, "target device %s is invalid!", device_path); 2617 return PTR_ERR(bdev); 2618 } 2619 2620 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2621 2622 devices = &fs_info->fs_devices->devices; 2623 list_for_each_entry(device, devices, dev_list) { 2624 if (device->bdev == bdev) { 2625 btrfs_err(fs_info, 2626 "target device is in the filesystem!"); 2627 ret = -EEXIST; 2628 goto error; 2629 } 2630 } 2631 2632 2633 if (i_size_read(bdev->bd_inode) < 2634 btrfs_device_get_total_bytes(srcdev)) { 2635 btrfs_err(fs_info, 2636 "target device is smaller than source device!"); 2637 ret = -EINVAL; 2638 goto error; 2639 } 2640 2641 2642 device = btrfs_alloc_device(NULL, &devid, NULL); 2643 if (IS_ERR(device)) { 2644 ret = PTR_ERR(device); 2645 goto error; 2646 } 2647 2648 name = rcu_string_strdup(device_path, GFP_KERNEL); 2649 if (!name) { 2650 free_device(device); 2651 ret = -ENOMEM; 2652 goto error; 2653 } 2654 rcu_assign_pointer(device->name, name); 2655 2656 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2657 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2658 device->generation = 0; 2659 device->io_width = fs_info->sectorsize; 2660 device->io_align = fs_info->sectorsize; 2661 device->sector_size = fs_info->sectorsize; 2662 device->total_bytes = btrfs_device_get_total_bytes(srcdev); 2663 device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); 2664 device->bytes_used = btrfs_device_get_bytes_used(srcdev); 2665 device->commit_total_bytes = srcdev->commit_total_bytes; 2666 device->commit_bytes_used = device->bytes_used; 2667 device->fs_info = fs_info; 2668 device->bdev = bdev; 2669 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 
&device->dev_state); 2670 set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2671 device->mode = FMODE_EXCL; 2672 device->dev_stats_valid = 1; 2673 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2674 device->fs_devices = fs_info->fs_devices; 2675 list_add(&device->dev_list, &fs_info->fs_devices->devices); 2676 fs_info->fs_devices->num_devices++; 2677 fs_info->fs_devices->open_devices++; 2678 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2679 2680 *device_out = device; 2681 return ret; 2682 2683 error: 2684 blkdev_put(bdev, FMODE_EXCL); 2685 return ret; 2686 } 2687 2688 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2689 struct btrfs_device *device) 2690 { 2691 int ret; 2692 struct btrfs_path *path; 2693 struct btrfs_root *root = device->fs_info->chunk_root; 2694 struct btrfs_dev_item *dev_item; 2695 struct extent_buffer *leaf; 2696 struct btrfs_key key; 2697 2698 path = btrfs_alloc_path(); 2699 if (!path) 2700 return -ENOMEM; 2701 2702 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2703 key.type = BTRFS_DEV_ITEM_KEY; 2704 key.offset = device->devid; 2705 2706 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2707 if (ret < 0) 2708 goto out; 2709 2710 if (ret > 0) { 2711 ret = -ENOENT; 2712 goto out; 2713 } 2714 2715 leaf = path->nodes[0]; 2716 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2717 2718 btrfs_set_device_id(leaf, dev_item, device->devid); 2719 btrfs_set_device_type(leaf, dev_item, device->type); 2720 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2721 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2722 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2723 btrfs_set_device_total_bytes(leaf, dev_item, 2724 btrfs_device_get_disk_total_bytes(device)); 2725 btrfs_set_device_bytes_used(leaf, dev_item, 2726 btrfs_device_get_bytes_used(device)); 2727 btrfs_mark_buffer_dirty(leaf); 2728 2729 out: 2730 btrfs_free_path(path); 2731 return ret; 2732 } 2733 2734 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2735 struct btrfs_device *device, u64 new_size) 2736 { 2737 struct btrfs_fs_info *fs_info = device->fs_info; 2738 struct btrfs_super_block *super_copy = fs_info->super_copy; 2739 struct btrfs_fs_devices *fs_devices; 2740 u64 old_total; 2741 u64 diff; 2742 2743 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2744 return -EACCES; 2745 2746 new_size = round_down(new_size, fs_info->sectorsize); 2747 2748 mutex_lock(&fs_info->chunk_mutex); 2749 old_total = btrfs_super_total_bytes(super_copy); 2750 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 2751 2752 if (new_size <= device->total_bytes || 2753 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2754 mutex_unlock(&fs_info->chunk_mutex); 2755 return -EINVAL; 2756 } 2757 2758 fs_devices = fs_info->fs_devices; 2759 2760 btrfs_set_super_total_bytes(super_copy, 2761 round_down(old_total + diff, fs_info->sectorsize)); 2762 device->fs_devices->total_rw_bytes += diff; 2763 2764 btrfs_device_set_total_bytes(device, new_size); 2765 btrfs_device_set_disk_total_bytes(device, new_size); 2766 btrfs_clear_space_info_full(device->fs_info); 2767 if (list_empty(&device->resized_list)) 2768 list_add_tail(&device->resized_list, 2769 &fs_devices->resized_devices); 2770 mutex_unlock(&fs_info->chunk_mutex); 2771 2772 return btrfs_update_device(trans, device); 2773 } 2774 2775 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 2776 struct btrfs_fs_info *fs_info, u64 chunk_offset) 
2777 { 2778 struct btrfs_root *root = fs_info->chunk_root; 2779 int ret; 2780 struct btrfs_path *path; 2781 struct btrfs_key key; 2782 2783 path = btrfs_alloc_path(); 2784 if (!path) 2785 return -ENOMEM; 2786 2787 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2788 key.offset = chunk_offset; 2789 key.type = BTRFS_CHUNK_ITEM_KEY; 2790 2791 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2792 if (ret < 0) 2793 goto out; 2794 else if (ret > 0) { /* Logic error or corruption */ 2795 btrfs_handle_fs_error(fs_info, -ENOENT, 2796 "Failed lookup while freeing chunk."); 2797 ret = -ENOENT; 2798 goto out; 2799 } 2800 2801 ret = btrfs_del_item(trans, root, path); 2802 if (ret < 0) 2803 btrfs_handle_fs_error(fs_info, ret, 2804 "Failed to delete chunk item."); 2805 out: 2806 btrfs_free_path(path); 2807 return ret; 2808 } 2809 2810 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2811 { 2812 struct btrfs_super_block *super_copy = fs_info->super_copy; 2813 struct btrfs_disk_key *disk_key; 2814 struct btrfs_chunk *chunk; 2815 u8 *ptr; 2816 int ret = 0; 2817 u32 num_stripes; 2818 u32 array_size; 2819 u32 len = 0; 2820 u32 cur; 2821 struct btrfs_key key; 2822 2823 mutex_lock(&fs_info->chunk_mutex); 2824 array_size = btrfs_super_sys_array_size(super_copy); 2825 2826 ptr = super_copy->sys_chunk_array; 2827 cur = 0; 2828 2829 while (cur < array_size) { 2830 disk_key = (struct btrfs_disk_key *)ptr; 2831 btrfs_disk_key_to_cpu(&key, disk_key); 2832 2833 len = sizeof(*disk_key); 2834 2835 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 2836 chunk = (struct btrfs_chunk *)(ptr + len); 2837 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 2838 len += btrfs_chunk_item_size(num_stripes); 2839 } else { 2840 ret = -EIO; 2841 break; 2842 } 2843 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 2844 key.offset == chunk_offset) { 2845 memmove(ptr, ptr + len, array_size - (cur + len)); 2846 array_size -= len; 2847 btrfs_set_super_sys_array_size(super_copy, array_size); 2848 } else { 2849 ptr += len; 2850 cur += len; 2851 } 2852 } 2853 mutex_unlock(&fs_info->chunk_mutex); 2854 return ret; 2855 } 2856 2857 static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, 2858 u64 logical, u64 length) 2859 { 2860 struct extent_map_tree *em_tree; 2861 struct extent_map *em; 2862 2863 em_tree = &fs_info->mapping_tree.map_tree; 2864 read_lock(&em_tree->lock); 2865 em = lookup_extent_mapping(em_tree, logical, length); 2866 read_unlock(&em_tree->lock); 2867 2868 if (!em) { 2869 btrfs_crit(fs_info, "unable to find logical %llu length %llu", 2870 logical, length); 2871 return ERR_PTR(-EINVAL); 2872 } 2873 2874 if (em->start > logical || em->start + em->len < logical) { 2875 btrfs_crit(fs_info, 2876 "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 2877 logical, length, em->start, em->start + em->len); 2878 free_extent_map(em); 2879 return ERR_PTR(-EINVAL); 2880 } 2881 2882 /* callers are responsible for dropping em's ref. 
*/ 2883 return em; 2884 } 2885 2886 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, 2887 struct btrfs_fs_info *fs_info, u64 chunk_offset) 2888 { 2889 struct extent_map *em; 2890 struct map_lookup *map; 2891 u64 dev_extent_len = 0; 2892 int i, ret = 0; 2893 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2894 2895 em = get_chunk_map(fs_info, chunk_offset, 1); 2896 if (IS_ERR(em)) { 2897 /* 2898 * This is a logic error, but we don't want to just rely on the 2899 * user having built with ASSERT enabled, so if ASSERT doesn't 2900 * do anything we still error out. 2901 */ 2902 ASSERT(0); 2903 return PTR_ERR(em); 2904 } 2905 map = em->map_lookup; 2906 mutex_lock(&fs_info->chunk_mutex); 2907 check_system_chunk(trans, fs_info, map->type); 2908 mutex_unlock(&fs_info->chunk_mutex); 2909 2910 /* 2911 * Take the device list mutex to prevent races with the final phase of 2912 * a device replace operation that replaces the device object associated 2913 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). 2914 */ 2915 mutex_lock(&fs_devices->device_list_mutex); 2916 for (i = 0; i < map->num_stripes; i++) { 2917 struct btrfs_device *device = map->stripes[i].dev; 2918 ret = btrfs_free_dev_extent(trans, device, 2919 map->stripes[i].physical, 2920 &dev_extent_len); 2921 if (ret) { 2922 mutex_unlock(&fs_devices->device_list_mutex); 2923 btrfs_abort_transaction(trans, ret); 2924 goto out; 2925 } 2926 2927 if (device->bytes_used > 0) { 2928 mutex_lock(&fs_info->chunk_mutex); 2929 btrfs_device_set_bytes_used(device, 2930 device->bytes_used - dev_extent_len); 2931 atomic64_add(dev_extent_len, &fs_info->free_chunk_space); 2932 btrfs_clear_space_info_full(fs_info); 2933 mutex_unlock(&fs_info->chunk_mutex); 2934 } 2935 2936 if (map->stripes[i].dev) { 2937 ret = btrfs_update_device(trans, map->stripes[i].dev); 2938 if (ret) { 2939 mutex_unlock(&fs_devices->device_list_mutex); 2940 btrfs_abort_transaction(trans, ret); 2941 goto out; 2942 } 2943 } 2944 } 2945 mutex_unlock(&fs_devices->device_list_mutex); 2946 2947 ret = btrfs_free_chunk(trans, fs_info, chunk_offset); 2948 if (ret) { 2949 btrfs_abort_transaction(trans, ret); 2950 goto out; 2951 } 2952 2953 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 2954 2955 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2956 ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 2957 if (ret) { 2958 btrfs_abort_transaction(trans, ret); 2959 goto out; 2960 } 2961 } 2962 2963 ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em); 2964 if (ret) { 2965 btrfs_abort_transaction(trans, ret); 2966 goto out; 2967 } 2968 2969 out: 2970 /* once for us */ 2971 free_extent_map(em); 2972 return ret; 2973 } 2974 2975 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2976 { 2977 struct btrfs_root *root = fs_info->chunk_root; 2978 struct btrfs_trans_handle *trans; 2979 int ret; 2980 2981 /* 2982 * Prevent races with automatic removal of unused block groups. 2983 * After we relocate and before we remove the chunk with offset 2984 * chunk_offset, automatic removal of the block group can kick in, 2985 * resulting in a failure when calling btrfs_remove_chunk() below. 2986 * 2987 * Make sure to acquire this mutex before doing a tree search (dev 2988 * or chunk trees) to find chunks. Otherwise the cleaner kthread might 2989 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 2990 * we release the path used to search the chunk/dev tree and before 2991 * the current task acquires this mutex and calls us. 
2992 */ 2993 lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); 2994 2995 ret = btrfs_can_relocate(fs_info, chunk_offset); 2996 if (ret) 2997 return -ENOSPC; 2998 2999 /* step one, relocate all the extents inside this chunk */ 3000 btrfs_scrub_pause(fs_info); 3001 ret = btrfs_relocate_block_group(fs_info, chunk_offset); 3002 btrfs_scrub_continue(fs_info); 3003 if (ret) 3004 return ret; 3005 3006 /* 3007 * We add the kobjects here (and after forcing data chunk creation) 3008 * since relocation is the only place we'll create chunks of a new 3009 * type at runtime. The only place where we'll remove the last 3010 * chunk of a type is the call immediately below this one. Even 3011 * so, we're protected against races with the cleaner thread since 3012 * we're covered by the delete_unused_bgs_mutex. 3013 */ 3014 btrfs_add_raid_kobjects(fs_info); 3015 3016 trans = btrfs_start_trans_remove_block_group(root->fs_info, 3017 chunk_offset); 3018 if (IS_ERR(trans)) { 3019 ret = PTR_ERR(trans); 3020 btrfs_handle_fs_error(root->fs_info, ret, NULL); 3021 return ret; 3022 } 3023 3024 /* 3025 * step two, delete the device extents and the 3026 * chunk tree entries 3027 */ 3028 ret = btrfs_remove_chunk(trans, fs_info, chunk_offset); 3029 btrfs_end_transaction(trans); 3030 return ret; 3031 } 3032 3033 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 3034 { 3035 struct btrfs_root *chunk_root = fs_info->chunk_root; 3036 struct btrfs_path *path; 3037 struct extent_buffer *leaf; 3038 struct btrfs_chunk *chunk; 3039 struct btrfs_key key; 3040 struct btrfs_key found_key; 3041 u64 chunk_type; 3042 bool retried = false; 3043 int failed = 0; 3044 int ret; 3045 3046 path = btrfs_alloc_path(); 3047 if (!path) 3048 return -ENOMEM; 3049 3050 again: 3051 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3052 key.offset = (u64)-1; 3053 key.type = BTRFS_CHUNK_ITEM_KEY; 3054 3055 while (1) { 3056 mutex_lock(&fs_info->delete_unused_bgs_mutex); 3057 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3058 if (ret < 0) { 3059 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3060 goto error; 3061 } 3062 BUG_ON(ret == 0); /* Corruption */ 3063 3064 ret = btrfs_previous_item(chunk_root, path, key.objectid, 3065 key.type); 3066 if (ret) 3067 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3068 if (ret < 0) 3069 goto error; 3070 if (ret > 0) 3071 break; 3072 3073 leaf = path->nodes[0]; 3074 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3075 3076 chunk = btrfs_item_ptr(leaf, path->slots[0], 3077 struct btrfs_chunk); 3078 chunk_type = btrfs_chunk_type(leaf, chunk); 3079 btrfs_release_path(path); 3080 3081 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 3082 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3083 if (ret == -ENOSPC) 3084 failed++; 3085 else 3086 BUG_ON(ret); 3087 } 3088 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3089 3090 if (found_key.offset == 0) 3091 break; 3092 key.offset = found_key.offset - 1; 3093 } 3094 ret = 0; 3095 if (failed && !retried) { 3096 failed = 0; 3097 retried = true; 3098 goto again; 3099 } else if (WARN_ON(failed && retried)) { 3100 ret = -ENOSPC; 3101 } 3102 error: 3103 btrfs_free_path(path); 3104 return ret; 3105 } 3106 3107 /* 3108 * return 1 : allocate a data chunk successfully, 3109 * return <0: errors during allocating a data chunk, 3110 * return 0 : no need to allocate a data chunk. 
3111 */ 3112 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 3113 u64 chunk_offset) 3114 { 3115 struct btrfs_block_group_cache *cache; 3116 u64 bytes_used; 3117 u64 chunk_type; 3118 3119 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3120 ASSERT(cache); 3121 chunk_type = cache->flags; 3122 btrfs_put_block_group(cache); 3123 3124 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) { 3125 spin_lock(&fs_info->data_sinfo->lock); 3126 bytes_used = fs_info->data_sinfo->bytes_used; 3127 spin_unlock(&fs_info->data_sinfo->lock); 3128 3129 if (!bytes_used) { 3130 struct btrfs_trans_handle *trans; 3131 int ret; 3132 3133 trans = btrfs_join_transaction(fs_info->tree_root); 3134 if (IS_ERR(trans)) 3135 return PTR_ERR(trans); 3136 3137 ret = btrfs_force_chunk_alloc(trans, fs_info, 3138 BTRFS_BLOCK_GROUP_DATA); 3139 btrfs_end_transaction(trans); 3140 if (ret < 0) 3141 return ret; 3142 3143 btrfs_add_raid_kobjects(fs_info); 3144 3145 return 1; 3146 } 3147 } 3148 return 0; 3149 } 3150 3151 static int insert_balance_item(struct btrfs_fs_info *fs_info, 3152 struct btrfs_balance_control *bctl) 3153 { 3154 struct btrfs_root *root = fs_info->tree_root; 3155 struct btrfs_trans_handle *trans; 3156 struct btrfs_balance_item *item; 3157 struct btrfs_disk_balance_args disk_bargs; 3158 struct btrfs_path *path; 3159 struct extent_buffer *leaf; 3160 struct btrfs_key key; 3161 int ret, err; 3162 3163 path = btrfs_alloc_path(); 3164 if (!path) 3165 return -ENOMEM; 3166 3167 trans = btrfs_start_transaction(root, 0); 3168 if (IS_ERR(trans)) { 3169 btrfs_free_path(path); 3170 return PTR_ERR(trans); 3171 } 3172 3173 key.objectid = BTRFS_BALANCE_OBJECTID; 3174 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3175 key.offset = 0; 3176 3177 ret = btrfs_insert_empty_item(trans, root, path, &key, 3178 sizeof(*item)); 3179 if (ret) 3180 goto out; 3181 3182 leaf = path->nodes[0]; 3183 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3184 3185 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3186 3187 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 3188 btrfs_set_balance_data(leaf, item, &disk_bargs); 3189 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 3190 btrfs_set_balance_meta(leaf, item, &disk_bargs); 3191 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 3192 btrfs_set_balance_sys(leaf, item, &disk_bargs); 3193 3194 btrfs_set_balance_flags(leaf, item, bctl->flags); 3195 3196 btrfs_mark_buffer_dirty(leaf); 3197 out: 3198 btrfs_free_path(path); 3199 err = btrfs_commit_transaction(trans); 3200 if (err && !ret) 3201 ret = err; 3202 return ret; 3203 } 3204 3205 static int del_balance_item(struct btrfs_fs_info *fs_info) 3206 { 3207 struct btrfs_root *root = fs_info->tree_root; 3208 struct btrfs_trans_handle *trans; 3209 struct btrfs_path *path; 3210 struct btrfs_key key; 3211 int ret, err; 3212 3213 path = btrfs_alloc_path(); 3214 if (!path) 3215 return -ENOMEM; 3216 3217 trans = btrfs_start_transaction(root, 0); 3218 if (IS_ERR(trans)) { 3219 btrfs_free_path(path); 3220 return PTR_ERR(trans); 3221 } 3222 3223 key.objectid = BTRFS_BALANCE_OBJECTID; 3224 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3225 key.offset = 0; 3226 3227 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3228 if (ret < 0) 3229 goto out; 3230 if (ret > 0) { 3231 ret = -ENOENT; 3232 goto out; 3233 } 3234 3235 ret = btrfs_del_item(trans, root, path); 3236 out: 3237 btrfs_free_path(path); 3238 err = btrfs_commit_transaction(trans); 3239 if (err && !ret) 3240 ret = err; 3241 return ret; 3242 } 
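/*
 * Balance state is persisted in the tree root as a single item with key
 * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0): insert_balance_item()
 * writes it when a balance is started and del_balance_item() removes it when
 * the balance finishes or is cancelled, which is what lets an interrupted
 * balance be resumed by btrfs_recover_balance() at mount time.
 */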

/*
 * This is a heuristic used to reduce the number of chunks balanced on
 * resume after balance was interrupted.
 */
static void update_balance_args(struct btrfs_balance_control *bctl)
{
	/*
	 * Turn on soft mode for chunk types that were being converted.
	 */
	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;

	/*
	 * Turn on the usage filter if it is not already in use. The idea is
	 * that chunks that we have already balanced should be
	 * reasonably full. Don't do it for chunks that are being
	 * converted - that will keep us from relocating unconverted
	 * (albeit full) chunks.
	 */
	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->data.usage = 90;
	}
	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->sys.usage = 90;
	}
	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->meta.usage = 90;
	}
}

/*
 * Should be called with both balance and volume mutexes held to
 * serialize other volume operations (add_dev/rm_dev/resize) with
 * restriper. Same goes for unset_balance_control.
 */
static void set_balance_control(struct btrfs_balance_control *bctl)
{
	struct btrfs_fs_info *fs_info = bctl->fs_info;

	BUG_ON(fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = bctl;
	spin_unlock(&fs_info->balance_lock);
}

static void unset_balance_control(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;

	BUG_ON(!fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = NULL;
	spin_unlock(&fs_info->balance_lock);

	kfree(bctl);
}

/*
 * Balance filters. Return 1 if chunk should be filtered out
 * (should not be balanced).
3319 */ 3320 static int chunk_profiles_filter(u64 chunk_type, 3321 struct btrfs_balance_args *bargs) 3322 { 3323 chunk_type = chunk_to_extended(chunk_type) & 3324 BTRFS_EXTENDED_PROFILE_MASK; 3325 3326 if (bargs->profiles & chunk_type) 3327 return 0; 3328 3329 return 1; 3330 } 3331 3332 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3333 struct btrfs_balance_args *bargs) 3334 { 3335 struct btrfs_block_group_cache *cache; 3336 u64 chunk_used; 3337 u64 user_thresh_min; 3338 u64 user_thresh_max; 3339 int ret = 1; 3340 3341 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3342 chunk_used = btrfs_block_group_used(&cache->item); 3343 3344 if (bargs->usage_min == 0) 3345 user_thresh_min = 0; 3346 else 3347 user_thresh_min = div_factor_fine(cache->key.offset, 3348 bargs->usage_min); 3349 3350 if (bargs->usage_max == 0) 3351 user_thresh_max = 1; 3352 else if (bargs->usage_max > 100) 3353 user_thresh_max = cache->key.offset; 3354 else 3355 user_thresh_max = div_factor_fine(cache->key.offset, 3356 bargs->usage_max); 3357 3358 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3359 ret = 0; 3360 3361 btrfs_put_block_group(cache); 3362 return ret; 3363 } 3364 3365 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3366 u64 chunk_offset, struct btrfs_balance_args *bargs) 3367 { 3368 struct btrfs_block_group_cache *cache; 3369 u64 chunk_used, user_thresh; 3370 int ret = 1; 3371 3372 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3373 chunk_used = btrfs_block_group_used(&cache->item); 3374 3375 if (bargs->usage_min == 0) 3376 user_thresh = 1; 3377 else if (bargs->usage > 100) 3378 user_thresh = cache->key.offset; 3379 else 3380 user_thresh = div_factor_fine(cache->key.offset, 3381 bargs->usage); 3382 3383 if (chunk_used < user_thresh) 3384 ret = 0; 3385 3386 btrfs_put_block_group(cache); 3387 return ret; 3388 } 3389 3390 static int chunk_devid_filter(struct extent_buffer *leaf, 3391 struct btrfs_chunk *chunk, 3392 struct btrfs_balance_args *bargs) 3393 { 3394 struct btrfs_stripe *stripe; 3395 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3396 int i; 3397 3398 for (i = 0; i < num_stripes; i++) { 3399 stripe = btrfs_stripe_nr(chunk, i); 3400 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3401 return 0; 3402 } 3403 3404 return 1; 3405 } 3406 3407 /* [pstart, pend) */ 3408 static int chunk_drange_filter(struct extent_buffer *leaf, 3409 struct btrfs_chunk *chunk, 3410 struct btrfs_balance_args *bargs) 3411 { 3412 struct btrfs_stripe *stripe; 3413 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3414 u64 stripe_offset; 3415 u64 stripe_length; 3416 int factor; 3417 int i; 3418 3419 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3420 return 0; 3421 3422 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 3423 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { 3424 factor = num_stripes / 2; 3425 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { 3426 factor = num_stripes - 1; 3427 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { 3428 factor = num_stripes - 2; 3429 } else { 3430 factor = num_stripes; 3431 } 3432 3433 for (i = 0; i < num_stripes; i++) { 3434 stripe = btrfs_stripe_nr(chunk, i); 3435 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3436 continue; 3437 3438 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3439 stripe_length = btrfs_chunk_length(leaf, chunk); 3440 stripe_length = div_u64(stripe_length, factor); 3441 3442 if (stripe_offset 
< bargs->pend && 3443 stripe_offset + stripe_length > bargs->pstart) 3444 return 0; 3445 } 3446 3447 return 1; 3448 } 3449 3450 /* [vstart, vend) */ 3451 static int chunk_vrange_filter(struct extent_buffer *leaf, 3452 struct btrfs_chunk *chunk, 3453 u64 chunk_offset, 3454 struct btrfs_balance_args *bargs) 3455 { 3456 if (chunk_offset < bargs->vend && 3457 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3458 /* at least part of the chunk is inside this vrange */ 3459 return 0; 3460 3461 return 1; 3462 } 3463 3464 static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3465 struct btrfs_chunk *chunk, 3466 struct btrfs_balance_args *bargs) 3467 { 3468 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3469 3470 if (bargs->stripes_min <= num_stripes 3471 && num_stripes <= bargs->stripes_max) 3472 return 0; 3473 3474 return 1; 3475 } 3476 3477 static int chunk_soft_convert_filter(u64 chunk_type, 3478 struct btrfs_balance_args *bargs) 3479 { 3480 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3481 return 0; 3482 3483 chunk_type = chunk_to_extended(chunk_type) & 3484 BTRFS_EXTENDED_PROFILE_MASK; 3485 3486 if (bargs->target == chunk_type) 3487 return 1; 3488 3489 return 0; 3490 } 3491 3492 static int should_balance_chunk(struct btrfs_fs_info *fs_info, 3493 struct extent_buffer *leaf, 3494 struct btrfs_chunk *chunk, u64 chunk_offset) 3495 { 3496 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3497 struct btrfs_balance_args *bargs = NULL; 3498 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3499 3500 /* type filter */ 3501 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3502 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3503 return 0; 3504 } 3505 3506 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3507 bargs = &bctl->data; 3508 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3509 bargs = &bctl->sys; 3510 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3511 bargs = &bctl->meta; 3512 3513 /* profiles filter */ 3514 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3515 chunk_profiles_filter(chunk_type, bargs)) { 3516 return 0; 3517 } 3518 3519 /* usage filter */ 3520 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3521 chunk_usage_filter(fs_info, chunk_offset, bargs)) { 3522 return 0; 3523 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3524 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3525 return 0; 3526 } 3527 3528 /* devid filter */ 3529 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3530 chunk_devid_filter(leaf, chunk, bargs)) { 3531 return 0; 3532 } 3533 3534 /* drange filter, makes sense only with devid filter */ 3535 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3536 chunk_drange_filter(leaf, chunk, bargs)) { 3537 return 0; 3538 } 3539 3540 /* vrange filter */ 3541 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3542 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3543 return 0; 3544 } 3545 3546 /* stripes filter */ 3547 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3548 chunk_stripes_range_filter(leaf, chunk, bargs)) { 3549 return 0; 3550 } 3551 3552 /* soft profile changing mode */ 3553 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3554 chunk_soft_convert_filter(chunk_type, bargs)) { 3555 return 0; 3556 } 3557 3558 /* 3559 * limited by count, must be the last filter 3560 */ 3561 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3562 if (bargs->limit == 0) 3563 return 0; 3564 else 3565 bargs->limit--; 3566 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 3567 /* 3568 * Same logic 
as the 'limit' filter; the minimum cannot be 3569 * determined here because we do not have the global information 3570 * about the count of all chunks that satisfy the filters. 3571 */ 3572 if (bargs->limit_max == 0) 3573 return 0; 3574 else 3575 bargs->limit_max--; 3576 } 3577 3578 return 1; 3579 } 3580 3581 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3582 { 3583 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3584 struct btrfs_root *chunk_root = fs_info->chunk_root; 3585 struct btrfs_root *dev_root = fs_info->dev_root; 3586 struct list_head *devices; 3587 struct btrfs_device *device; 3588 u64 old_size; 3589 u64 size_to_free; 3590 u64 chunk_type; 3591 struct btrfs_chunk *chunk; 3592 struct btrfs_path *path = NULL; 3593 struct btrfs_key key; 3594 struct btrfs_key found_key; 3595 struct btrfs_trans_handle *trans; 3596 struct extent_buffer *leaf; 3597 int slot; 3598 int ret; 3599 int enospc_errors = 0; 3600 bool counting = true; 3601 /* The single value limit and min/max limits use the same bytes in the */ 3602 u64 limit_data = bctl->data.limit; 3603 u64 limit_meta = bctl->meta.limit; 3604 u64 limit_sys = bctl->sys.limit; 3605 u32 count_data = 0; 3606 u32 count_meta = 0; 3607 u32 count_sys = 0; 3608 int chunk_reserved = 0; 3609 3610 /* step one make some room on all the devices */ 3611 devices = &fs_info->fs_devices->devices; 3612 list_for_each_entry(device, devices, dev_list) { 3613 old_size = btrfs_device_get_total_bytes(device); 3614 size_to_free = div_factor(old_size, 1); 3615 size_to_free = min_t(u64, size_to_free, SZ_1M); 3616 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) || 3617 btrfs_device_get_total_bytes(device) - 3618 btrfs_device_get_bytes_used(device) > size_to_free || 3619 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 3620 continue; 3621 3622 ret = btrfs_shrink_device(device, old_size - size_to_free); 3623 if (ret == -ENOSPC) 3624 break; 3625 if (ret) { 3626 /* btrfs_shrink_device never returns ret > 0 */ 3627 WARN_ON(ret > 0); 3628 goto error; 3629 } 3630 3631 trans = btrfs_start_transaction(dev_root, 0); 3632 if (IS_ERR(trans)) { 3633 ret = PTR_ERR(trans); 3634 btrfs_info_in_rcu(fs_info, 3635 "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", 3636 rcu_str_deref(device->name), ret, 3637 old_size, old_size - size_to_free); 3638 goto error; 3639 } 3640 3641 ret = btrfs_grow_device(trans, device, old_size); 3642 if (ret) { 3643 btrfs_end_transaction(trans); 3644 /* btrfs_grow_device never returns ret > 0 */ 3645 WARN_ON(ret > 0); 3646 btrfs_info_in_rcu(fs_info, 3647 "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", 3648 rcu_str_deref(device->name), ret, 3649 old_size, old_size - size_to_free); 3650 goto error; 3651 } 3652 3653 btrfs_end_transaction(trans); 3654 } 3655 3656 /* step two, relocate all the chunks */ 3657 path = btrfs_alloc_path(); 3658 if (!path) { 3659 ret = -ENOMEM; 3660 goto error; 3661 } 3662 3663 /* zero out stat counters */ 3664 spin_lock(&fs_info->balance_lock); 3665 memset(&bctl->stat, 0, sizeof(bctl->stat)); 3666 spin_unlock(&fs_info->balance_lock); 3667 again: 3668 if (!counting) { 3669 /* 3670 * The single value limit and min/max limits use the same bytes 3671 * in the 3672 */ 3673 bctl->data.limit = limit_data; 3674 bctl->meta.limit = limit_meta; 3675 bctl->sys.limit = limit_sys; 3676 } 3677 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3678 key.offset = (u64)-1; 3679 key.type = 
BTRFS_CHUNK_ITEM_KEY; 3680 3681 while (1) { 3682 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3683 atomic_read(&fs_info->balance_cancel_req)) { 3684 ret = -ECANCELED; 3685 goto error; 3686 } 3687 3688 mutex_lock(&fs_info->delete_unused_bgs_mutex); 3689 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3690 if (ret < 0) { 3691 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3692 goto error; 3693 } 3694 3695 /* 3696 * this shouldn't happen, it means the last relocate 3697 * failed 3698 */ 3699 if (ret == 0) 3700 BUG(); /* FIXME break ? */ 3701 3702 ret = btrfs_previous_item(chunk_root, path, 0, 3703 BTRFS_CHUNK_ITEM_KEY); 3704 if (ret) { 3705 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3706 ret = 0; 3707 break; 3708 } 3709 3710 leaf = path->nodes[0]; 3711 slot = path->slots[0]; 3712 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3713 3714 if (found_key.objectid != key.objectid) { 3715 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3716 break; 3717 } 3718 3719 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3720 chunk_type = btrfs_chunk_type(leaf, chunk); 3721 3722 if (!counting) { 3723 spin_lock(&fs_info->balance_lock); 3724 bctl->stat.considered++; 3725 spin_unlock(&fs_info->balance_lock); 3726 } 3727 3728 ret = should_balance_chunk(fs_info, leaf, chunk, 3729 found_key.offset); 3730 3731 btrfs_release_path(path); 3732 if (!ret) { 3733 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3734 goto loop; 3735 } 3736 3737 if (counting) { 3738 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3739 spin_lock(&fs_info->balance_lock); 3740 bctl->stat.expected++; 3741 spin_unlock(&fs_info->balance_lock); 3742 3743 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3744 count_data++; 3745 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3746 count_sys++; 3747 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3748 count_meta++; 3749 3750 goto loop; 3751 } 3752 3753 /* 3754 * Apply limit_min filter, no need to check if the LIMITS 3755 * filter is used, limit_min is 0 by default 3756 */ 3757 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 3758 count_data < bctl->data.limit_min) 3759 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 3760 count_meta < bctl->meta.limit_min) 3761 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 3762 count_sys < bctl->sys.limit_min)) { 3763 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3764 goto loop; 3765 } 3766 3767 if (!chunk_reserved) { 3768 /* 3769 * We may be relocating the only data chunk we have, 3770 * which could potentially end up with losing data's 3771 * raid profile, so lets allocate an empty one in 3772 * advance. 
3773 */ 3774 ret = btrfs_may_alloc_data_chunk(fs_info, 3775 found_key.offset); 3776 if (ret < 0) { 3777 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3778 goto error; 3779 } else if (ret == 1) { 3780 chunk_reserved = 1; 3781 } 3782 } 3783 3784 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3785 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3786 if (ret && ret != -ENOSPC) 3787 goto error; 3788 if (ret == -ENOSPC) { 3789 enospc_errors++; 3790 } else { 3791 spin_lock(&fs_info->balance_lock); 3792 bctl->stat.completed++; 3793 spin_unlock(&fs_info->balance_lock); 3794 } 3795 loop: 3796 if (found_key.offset == 0) 3797 break; 3798 key.offset = found_key.offset - 1; 3799 } 3800 3801 if (counting) { 3802 btrfs_release_path(path); 3803 counting = false; 3804 goto again; 3805 } 3806 error: 3807 btrfs_free_path(path); 3808 if (enospc_errors) { 3809 btrfs_info(fs_info, "%d enospc errors during balance", 3810 enospc_errors); 3811 if (!ret) 3812 ret = -ENOSPC; 3813 } 3814 3815 return ret; 3816 } 3817 3818 /** 3819 * alloc_profile_is_valid - see if a given profile is valid and reduced 3820 * @flags: profile to validate 3821 * @extended: if true @flags is treated as an extended profile 3822 */ 3823 static int alloc_profile_is_valid(u64 flags, int extended) 3824 { 3825 u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : 3826 BTRFS_BLOCK_GROUP_PROFILE_MASK); 3827 3828 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 3829 3830 /* 1) check that all other bits are zeroed */ 3831 if (flags & ~mask) 3832 return 0; 3833 3834 /* 2) see if profile is reduced */ 3835 if (flags == 0) 3836 return !extended; /* "0" is valid for usual profiles */ 3837 3838 /* true if exactly one bit set */ 3839 return (flags & (flags - 1)) == 0; 3840 } 3841 3842 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3843 { 3844 /* cancel requested || normal exit path */ 3845 return atomic_read(&fs_info->balance_cancel_req) || 3846 (atomic_read(&fs_info->balance_pause_req) == 0 && 3847 atomic_read(&fs_info->balance_cancel_req) == 0); 3848 } 3849 3850 static void __cancel_balance(struct btrfs_fs_info *fs_info) 3851 { 3852 int ret; 3853 3854 unset_balance_control(fs_info); 3855 ret = del_balance_item(fs_info); 3856 if (ret) 3857 btrfs_handle_fs_error(fs_info, ret, NULL); 3858 3859 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 3860 } 3861 3862 /* Non-zero return value signifies invalidity */ 3863 static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, 3864 u64 allowed) 3865 { 3866 return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && 3867 (!alloc_profile_is_valid(bctl_arg->target, 1) || 3868 (bctl_arg->target & ~allowed))); 3869 } 3870 3871 /* 3872 * Should be called with both balance and volume mutexes held 3873 */ 3874 int btrfs_balance(struct btrfs_balance_control *bctl, 3875 struct btrfs_ioctl_balance_args *bargs) 3876 { 3877 struct btrfs_fs_info *fs_info = bctl->fs_info; 3878 u64 meta_target, data_target; 3879 u64 allowed; 3880 int mixed = 0; 3881 int ret; 3882 u64 num_devices; 3883 unsigned seq; 3884 3885 if (btrfs_fs_closing(fs_info) || 3886 atomic_read(&fs_info->balance_pause_req) || 3887 atomic_read(&fs_info->balance_cancel_req)) { 3888 ret = -EINVAL; 3889 goto out; 3890 } 3891 3892 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 3893 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 3894 mixed = 1; 3895 3896 /* 3897 * In case of mixed groups both data and meta should be picked, 3898 * and identical options should be given for both of them. 
3899 */ 3900 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 3901 if (mixed && (bctl->flags & allowed)) { 3902 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 3903 !(bctl->flags & BTRFS_BALANCE_METADATA) || 3904 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 3905 btrfs_err(fs_info, 3906 "with mixed groups data and metadata balance options must be the same"); 3907 ret = -EINVAL; 3908 goto out; 3909 } 3910 } 3911 3912 num_devices = fs_info->fs_devices->num_devices; 3913 btrfs_dev_replace_read_lock(&fs_info->dev_replace); 3914 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 3915 BUG_ON(num_devices < 1); 3916 num_devices--; 3917 } 3918 btrfs_dev_replace_read_unlock(&fs_info->dev_replace); 3919 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; 3920 if (num_devices > 1) 3921 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3922 if (num_devices > 2) 3923 allowed |= BTRFS_BLOCK_GROUP_RAID5; 3924 if (num_devices > 3) 3925 allowed |= (BTRFS_BLOCK_GROUP_RAID10 | 3926 BTRFS_BLOCK_GROUP_RAID6); 3927 if (validate_convert_profile(&bctl->data, allowed)) { 3928 btrfs_err(fs_info, 3929 "unable to start balance with target data profile %llu", 3930 bctl->data.target); 3931 ret = -EINVAL; 3932 goto out; 3933 } 3934 if (validate_convert_profile(&bctl->meta, allowed)) { 3935 btrfs_err(fs_info, 3936 "unable to start balance with target metadata profile %llu", 3937 bctl->meta.target); 3938 ret = -EINVAL; 3939 goto out; 3940 } 3941 if (validate_convert_profile(&bctl->sys, allowed)) { 3942 btrfs_err(fs_info, 3943 "unable to start balance with target system profile %llu", 3944 bctl->sys.target); 3945 ret = -EINVAL; 3946 goto out; 3947 } 3948 3949 /* allow to reduce meta or sys integrity only if force set */ 3950 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3951 BTRFS_BLOCK_GROUP_RAID10 | 3952 BTRFS_BLOCK_GROUP_RAID5 | 3953 BTRFS_BLOCK_GROUP_RAID6; 3954 do { 3955 seq = read_seqbegin(&fs_info->profiles_lock); 3956 3957 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3958 (fs_info->avail_system_alloc_bits & allowed) && 3959 !(bctl->sys.target & allowed)) || 3960 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3961 (fs_info->avail_metadata_alloc_bits & allowed) && 3962 !(bctl->meta.target & allowed))) { 3963 if (bctl->flags & BTRFS_BALANCE_FORCE) { 3964 btrfs_info(fs_info, 3965 "force reducing metadata integrity"); 3966 } else { 3967 btrfs_err(fs_info, 3968 "balance will reduce metadata integrity, use force if you want this"); 3969 ret = -EINVAL; 3970 goto out; 3971 } 3972 } 3973 } while (read_seqretry(&fs_info->profiles_lock, seq)); 3974 3975 /* if we're not converting, the target field is uninitialized */ 3976 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 3977 bctl->meta.target : fs_info->avail_metadata_alloc_bits; 3978 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 
3979 bctl->data.target : fs_info->avail_data_alloc_bits; 3980 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 3981 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 3982 btrfs_warn(fs_info, 3983 "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", 3984 meta_target, data_target); 3985 } 3986 3987 ret = insert_balance_item(fs_info, bctl); 3988 if (ret && ret != -EEXIST) 3989 goto out; 3990 3991 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 3992 BUG_ON(ret == -EEXIST); 3993 set_balance_control(bctl); 3994 } else { 3995 BUG_ON(ret != -EEXIST); 3996 spin_lock(&fs_info->balance_lock); 3997 update_balance_args(bctl); 3998 spin_unlock(&fs_info->balance_lock); 3999 } 4000 4001 atomic_inc(&fs_info->balance_running); 4002 mutex_unlock(&fs_info->balance_mutex); 4003 4004 ret = __btrfs_balance(fs_info); 4005 4006 mutex_lock(&fs_info->balance_mutex); 4007 atomic_dec(&fs_info->balance_running); 4008 4009 if (bargs) { 4010 memset(bargs, 0, sizeof(*bargs)); 4011 update_ioctl_balance_args(fs_info, 0, bargs); 4012 } 4013 4014 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 4015 balance_need_close(fs_info)) { 4016 __cancel_balance(fs_info); 4017 } 4018 4019 wake_up(&fs_info->balance_wait_q); 4020 4021 return ret; 4022 out: 4023 if (bctl->flags & BTRFS_BALANCE_RESUME) 4024 __cancel_balance(fs_info); 4025 else { 4026 kfree(bctl); 4027 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 4028 } 4029 return ret; 4030 } 4031 4032 static int balance_kthread(void *data) 4033 { 4034 struct btrfs_fs_info *fs_info = data; 4035 int ret = 0; 4036 4037 mutex_lock(&fs_info->volume_mutex); 4038 mutex_lock(&fs_info->balance_mutex); 4039 4040 if (fs_info->balance_ctl) { 4041 btrfs_info(fs_info, "continuing balance"); 4042 ret = btrfs_balance(fs_info->balance_ctl, NULL); 4043 } 4044 4045 mutex_unlock(&fs_info->balance_mutex); 4046 mutex_unlock(&fs_info->volume_mutex); 4047 4048 return ret; 4049 } 4050 4051 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 4052 { 4053 struct task_struct *tsk; 4054 4055 spin_lock(&fs_info->balance_lock); 4056 if (!fs_info->balance_ctl) { 4057 spin_unlock(&fs_info->balance_lock); 4058 return 0; 4059 } 4060 spin_unlock(&fs_info->balance_lock); 4061 4062 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 4063 btrfs_info(fs_info, "force skipping balance"); 4064 return 0; 4065 } 4066 4067 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 4068 return PTR_ERR_OR_ZERO(tsk); 4069 } 4070 4071 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 4072 { 4073 struct btrfs_balance_control *bctl; 4074 struct btrfs_balance_item *item; 4075 struct btrfs_disk_balance_args disk_bargs; 4076 struct btrfs_path *path; 4077 struct extent_buffer *leaf; 4078 struct btrfs_key key; 4079 int ret; 4080 4081 path = btrfs_alloc_path(); 4082 if (!path) 4083 return -ENOMEM; 4084 4085 key.objectid = BTRFS_BALANCE_OBJECTID; 4086 key.type = BTRFS_TEMPORARY_ITEM_KEY; 4087 key.offset = 0; 4088 4089 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4090 if (ret < 0) 4091 goto out; 4092 if (ret > 0) { /* ret = -ENOENT; */ 4093 ret = 0; 4094 goto out; 4095 } 4096 4097 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 4098 if (!bctl) { 4099 ret = -ENOMEM; 4100 goto out; 4101 } 4102 4103 leaf = path->nodes[0]; 4104 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 4105 4106 bctl->fs_info = fs_info; 4107 bctl->flags = btrfs_balance_flags(leaf, item); 4108 bctl->flags |= BTRFS_BALANCE_RESUME; 4109 4110 btrfs_balance_data(leaf, item, 
&disk_bargs); 4111 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 4112 btrfs_balance_meta(leaf, item, &disk_bargs); 4113 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 4114 btrfs_balance_sys(leaf, item, &disk_bargs); 4115 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 4116 4117 WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); 4118 4119 mutex_lock(&fs_info->volume_mutex); 4120 mutex_lock(&fs_info->balance_mutex); 4121 4122 set_balance_control(bctl); 4123 4124 mutex_unlock(&fs_info->balance_mutex); 4125 mutex_unlock(&fs_info->volume_mutex); 4126 out: 4127 btrfs_free_path(path); 4128 return ret; 4129 } 4130 4131 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4132 { 4133 int ret = 0; 4134 4135 mutex_lock(&fs_info->balance_mutex); 4136 if (!fs_info->balance_ctl) { 4137 mutex_unlock(&fs_info->balance_mutex); 4138 return -ENOTCONN; 4139 } 4140 4141 if (atomic_read(&fs_info->balance_running)) { 4142 atomic_inc(&fs_info->balance_pause_req); 4143 mutex_unlock(&fs_info->balance_mutex); 4144 4145 wait_event(fs_info->balance_wait_q, 4146 atomic_read(&fs_info->balance_running) == 0); 4147 4148 mutex_lock(&fs_info->balance_mutex); 4149 /* we are good with balance_ctl ripped off from under us */ 4150 BUG_ON(atomic_read(&fs_info->balance_running)); 4151 atomic_dec(&fs_info->balance_pause_req); 4152 } else { 4153 ret = -ENOTCONN; 4154 } 4155 4156 mutex_unlock(&fs_info->balance_mutex); 4157 return ret; 4158 } 4159 4160 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4161 { 4162 if (sb_rdonly(fs_info->sb)) 4163 return -EROFS; 4164 4165 mutex_lock(&fs_info->balance_mutex); 4166 if (!fs_info->balance_ctl) { 4167 mutex_unlock(&fs_info->balance_mutex); 4168 return -ENOTCONN; 4169 } 4170 4171 atomic_inc(&fs_info->balance_cancel_req); 4172 /* 4173 * if we are running just wait and return, balance item is 4174 * deleted in btrfs_balance in this case 4175 */ 4176 if (atomic_read(&fs_info->balance_running)) { 4177 mutex_unlock(&fs_info->balance_mutex); 4178 wait_event(fs_info->balance_wait_q, 4179 atomic_read(&fs_info->balance_running) == 0); 4180 mutex_lock(&fs_info->balance_mutex); 4181 } else { 4182 /* __cancel_balance needs volume_mutex */ 4183 mutex_unlock(&fs_info->balance_mutex); 4184 mutex_lock(&fs_info->volume_mutex); 4185 mutex_lock(&fs_info->balance_mutex); 4186 4187 if (fs_info->balance_ctl) 4188 __cancel_balance(fs_info); 4189 4190 mutex_unlock(&fs_info->volume_mutex); 4191 } 4192 4193 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); 4194 atomic_dec(&fs_info->balance_cancel_req); 4195 mutex_unlock(&fs_info->balance_mutex); 4196 return 0; 4197 } 4198 4199 static int btrfs_uuid_scan_kthread(void *data) 4200 { 4201 struct btrfs_fs_info *fs_info = data; 4202 struct btrfs_root *root = fs_info->tree_root; 4203 struct btrfs_key key; 4204 struct btrfs_path *path = NULL; 4205 int ret = 0; 4206 struct extent_buffer *eb; 4207 int slot; 4208 struct btrfs_root_item root_item; 4209 u32 item_size; 4210 struct btrfs_trans_handle *trans = NULL; 4211 4212 path = btrfs_alloc_path(); 4213 if (!path) { 4214 ret = -ENOMEM; 4215 goto out; 4216 } 4217 4218 key.objectid = 0; 4219 key.type = BTRFS_ROOT_ITEM_KEY; 4220 key.offset = 0; 4221 4222 while (1) { 4223 ret = btrfs_search_forward(root, &key, path, 4224 BTRFS_OLDEST_GENERATION); 4225 if (ret) { 4226 if (ret > 0) 4227 ret = 0; 4228 break; 4229 } 4230 4231 if (key.type != BTRFS_ROOT_ITEM_KEY || 4232 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4233 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4234 
key.objectid > BTRFS_LAST_FREE_OBJECTID) 4235 goto skip; 4236 4237 eb = path->nodes[0]; 4238 slot = path->slots[0]; 4239 item_size = btrfs_item_size_nr(eb, slot); 4240 if (item_size < sizeof(root_item)) 4241 goto skip; 4242 4243 read_extent_buffer(eb, &root_item, 4244 btrfs_item_ptr_offset(eb, slot), 4245 (int)sizeof(root_item)); 4246 if (btrfs_root_refs(&root_item) == 0) 4247 goto skip; 4248 4249 if (!btrfs_is_empty_uuid(root_item.uuid) || 4250 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4251 if (trans) 4252 goto update_tree; 4253 4254 btrfs_release_path(path); 4255 /* 4256 * 1 - subvol uuid item 4257 * 1 - received_subvol uuid item 4258 */ 4259 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4260 if (IS_ERR(trans)) { 4261 ret = PTR_ERR(trans); 4262 break; 4263 } 4264 continue; 4265 } else { 4266 goto skip; 4267 } 4268 update_tree: 4269 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4270 ret = btrfs_uuid_tree_add(trans, fs_info, 4271 root_item.uuid, 4272 BTRFS_UUID_KEY_SUBVOL, 4273 key.objectid); 4274 if (ret < 0) { 4275 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4276 ret); 4277 break; 4278 } 4279 } 4280 4281 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4282 ret = btrfs_uuid_tree_add(trans, fs_info, 4283 root_item.received_uuid, 4284 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4285 key.objectid); 4286 if (ret < 0) { 4287 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4288 ret); 4289 break; 4290 } 4291 } 4292 4293 skip: 4294 if (trans) { 4295 ret = btrfs_end_transaction(trans); 4296 trans = NULL; 4297 if (ret) 4298 break; 4299 } 4300 4301 btrfs_release_path(path); 4302 if (key.offset < (u64)-1) { 4303 key.offset++; 4304 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4305 key.offset = 0; 4306 key.type = BTRFS_ROOT_ITEM_KEY; 4307 } else if (key.objectid < (u64)-1) { 4308 key.offset = 0; 4309 key.type = BTRFS_ROOT_ITEM_KEY; 4310 key.objectid++; 4311 } else { 4312 break; 4313 } 4314 cond_resched(); 4315 } 4316 4317 out: 4318 btrfs_free_path(path); 4319 if (trans && !IS_ERR(trans)) 4320 btrfs_end_transaction(trans); 4321 if (ret) 4322 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); 4323 else 4324 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4325 up(&fs_info->uuid_tree_rescan_sem); 4326 return 0; 4327 } 4328 4329 /* 4330 * Callback for btrfs_uuid_tree_iterate(). 4331 * returns: 4332 * 0 check succeeded, the entry is not outdated. 4333 * < 0 if an error occurred. 4334 * > 0 if the check failed, which means the caller shall remove the entry. 
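* (Illustrative) An entry is treated as outdated when the referenced
* subvolume no longer exists (the root lookup returns -ENOENT) or when
* its current uuid/received_uuid no longer matches the recorded one;
* both cases return 1 so the caller removes the stale item.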
4335 */ 4336 static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, 4337 u8 *uuid, u8 type, u64 subid) 4338 { 4339 struct btrfs_key key; 4340 int ret = 0; 4341 struct btrfs_root *subvol_root; 4342 4343 if (type != BTRFS_UUID_KEY_SUBVOL && 4344 type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) 4345 goto out; 4346 4347 key.objectid = subid; 4348 key.type = BTRFS_ROOT_ITEM_KEY; 4349 key.offset = (u64)-1; 4350 subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); 4351 if (IS_ERR(subvol_root)) { 4352 ret = PTR_ERR(subvol_root); 4353 if (ret == -ENOENT) 4354 ret = 1; 4355 goto out; 4356 } 4357 4358 switch (type) { 4359 case BTRFS_UUID_KEY_SUBVOL: 4360 if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) 4361 ret = 1; 4362 break; 4363 case BTRFS_UUID_KEY_RECEIVED_SUBVOL: 4364 if (memcmp(uuid, subvol_root->root_item.received_uuid, 4365 BTRFS_UUID_SIZE)) 4366 ret = 1; 4367 break; 4368 } 4369 4370 out: 4371 return ret; 4372 } 4373 4374 static int btrfs_uuid_rescan_kthread(void *data) 4375 { 4376 struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; 4377 int ret; 4378 4379 /* 4380 * 1st step is to iterate through the existing UUID tree and 4381 * to delete all entries that contain outdated data. 4382 * 2nd step is to add all missing entries to the UUID tree. 4383 */ 4384 ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); 4385 if (ret < 0) { 4386 btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); 4387 up(&fs_info->uuid_tree_rescan_sem); 4388 return ret; 4389 } 4390 return btrfs_uuid_scan_kthread(data); 4391 } 4392 4393 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4394 { 4395 struct btrfs_trans_handle *trans; 4396 struct btrfs_root *tree_root = fs_info->tree_root; 4397 struct btrfs_root *uuid_root; 4398 struct task_struct *task; 4399 int ret; 4400 4401 /* 4402 * 1 - root node 4403 * 1 - root item 4404 */ 4405 trans = btrfs_start_transaction(tree_root, 2); 4406 if (IS_ERR(trans)) 4407 return PTR_ERR(trans); 4408 4409 uuid_root = btrfs_create_tree(trans, fs_info, 4410 BTRFS_UUID_TREE_OBJECTID); 4411 if (IS_ERR(uuid_root)) { 4412 ret = PTR_ERR(uuid_root); 4413 btrfs_abort_transaction(trans, ret); 4414 btrfs_end_transaction(trans); 4415 return ret; 4416 } 4417 4418 fs_info->uuid_root = uuid_root; 4419 4420 ret = btrfs_commit_transaction(trans); 4421 if (ret) 4422 return ret; 4423 4424 down(&fs_info->uuid_tree_rescan_sem); 4425 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4426 if (IS_ERR(task)) { 4427 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4428 btrfs_warn(fs_info, "failed to start uuid_scan task"); 4429 up(&fs_info->uuid_tree_rescan_sem); 4430 return PTR_ERR(task); 4431 } 4432 4433 return 0; 4434 } 4435 4436 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) 4437 { 4438 struct task_struct *task; 4439 4440 down(&fs_info->uuid_tree_rescan_sem); 4441 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); 4442 if (IS_ERR(task)) { 4443 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4444 btrfs_warn(fs_info, "failed to start uuid_rescan task"); 4445 up(&fs_info->uuid_tree_rescan_sem); 4446 return PTR_ERR(task); 4447 } 4448 4449 return 0; 4450 } 4451 4452 /* 4453 * shrinking a device means finding all of the device extents past 4454 * the new size, and then following the back refs to the chunks. 
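* (Each such chunk is then relocated with btrfs_relocate_chunk(), the
* same helper the balance loop above uses.)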
4455 * The chunk relocation code actually frees the device extent 4456 */ 4457 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 4458 { 4459 struct btrfs_fs_info *fs_info = device->fs_info; 4460 struct btrfs_root *root = fs_info->dev_root; 4461 struct btrfs_trans_handle *trans; 4462 struct btrfs_dev_extent *dev_extent = NULL; 4463 struct btrfs_path *path; 4464 u64 length; 4465 u64 chunk_offset; 4466 int ret; 4467 int slot; 4468 int failed = 0; 4469 bool retried = false; 4470 bool checked_pending_chunks = false; 4471 struct extent_buffer *l; 4472 struct btrfs_key key; 4473 struct btrfs_super_block *super_copy = fs_info->super_copy; 4474 u64 old_total = btrfs_super_total_bytes(super_copy); 4475 u64 old_size = btrfs_device_get_total_bytes(device); 4476 u64 diff; 4477 4478 new_size = round_down(new_size, fs_info->sectorsize); 4479 diff = round_down(old_size - new_size, fs_info->sectorsize); 4480 4481 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4482 return -EINVAL; 4483 4484 path = btrfs_alloc_path(); 4485 if (!path) 4486 return -ENOMEM; 4487 4488 path->reada = READA_FORWARD; 4489 4490 mutex_lock(&fs_info->chunk_mutex); 4491 4492 btrfs_device_set_total_bytes(device, new_size); 4493 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4494 device->fs_devices->total_rw_bytes -= diff; 4495 atomic64_sub(diff, &fs_info->free_chunk_space); 4496 } 4497 mutex_unlock(&fs_info->chunk_mutex); 4498 4499 again: 4500 key.objectid = device->devid; 4501 key.offset = (u64)-1; 4502 key.type = BTRFS_DEV_EXTENT_KEY; 4503 4504 do { 4505 mutex_lock(&fs_info->delete_unused_bgs_mutex); 4506 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4507 if (ret < 0) { 4508 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4509 goto done; 4510 } 4511 4512 ret = btrfs_previous_item(root, path, 0, key.type); 4513 if (ret) 4514 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4515 if (ret < 0) 4516 goto done; 4517 if (ret) { 4518 ret = 0; 4519 btrfs_release_path(path); 4520 break; 4521 } 4522 4523 l = path->nodes[0]; 4524 slot = path->slots[0]; 4525 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4526 4527 if (key.objectid != device->devid) { 4528 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4529 btrfs_release_path(path); 4530 break; 4531 } 4532 4533 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4534 length = btrfs_dev_extent_length(l, dev_extent); 4535 4536 if (key.offset + length <= new_size) { 4537 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4538 btrfs_release_path(path); 4539 break; 4540 } 4541 4542 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4543 btrfs_release_path(path); 4544 4545 /* 4546 * We may be relocating the only data chunk we have, 4547 * which could potentially end up with losing data's 4548 * raid profile, so lets allocate an empty one in 4549 * advance. 4550 */ 4551 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 4552 if (ret < 0) { 4553 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4554 goto done; 4555 } 4556 4557 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 4558 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4559 if (ret && ret != -ENOSPC) 4560 goto done; 4561 if (ret == -ENOSPC) 4562 failed++; 4563 } while (key.offset-- > 0); 4564 4565 if (failed && !retried) { 4566 failed = 0; 4567 retried = true; 4568 goto again; 4569 } else if (failed && retried) { 4570 ret = -ENOSPC; 4571 goto done; 4572 } 4573 4574 /* Shrinking succeeded, else we would be at "done". 
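* What remains is to persist the new sizes: re-check for pending chunk
* allocations that still overlap the trimmed range, then update the
* device item and the superblock total_bytes within a transaction.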
*/ 4575 trans = btrfs_start_transaction(root, 0); 4576 if (IS_ERR(trans)) { 4577 ret = PTR_ERR(trans); 4578 goto done; 4579 } 4580 4581 mutex_lock(&fs_info->chunk_mutex); 4582 4583 /* 4584 * We checked in the above loop all device extents that were already in 4585 * the device tree. However before we have updated the device's 4586 * total_bytes to the new size, we might have had chunk allocations that 4587 * have not completed yet (new block groups attached to transaction 4588 * handles), and therefore their device extents were not yet in the 4589 * device tree and we missed them in the loop above. So if we have any 4590 * pending chunk using a device extent that overlaps the device range 4591 * that we cannot use anymore, commit the current transaction and 4592 * repeat the search on the device tree - this way we guarantee we will 4593 * not have chunks using device extents that end beyond 'new_size'. 4594 */ 4595 if (!checked_pending_chunks) { 4596 u64 start = new_size; 4597 u64 len = old_size - new_size; 4598 4599 if (contains_pending_extent(trans->transaction, device, 4600 &start, len)) { 4601 mutex_unlock(&fs_info->chunk_mutex); 4602 checked_pending_chunks = true; 4603 failed = 0; 4604 retried = false; 4605 ret = btrfs_commit_transaction(trans); 4606 if (ret) 4607 goto done; 4608 goto again; 4609 } 4610 } 4611 4612 btrfs_device_set_disk_total_bytes(device, new_size); 4613 if (list_empty(&device->resized_list)) 4614 list_add_tail(&device->resized_list, 4615 &fs_info->fs_devices->resized_devices); 4616 4617 WARN_ON(diff > old_total); 4618 btrfs_set_super_total_bytes(super_copy, 4619 round_down(old_total - diff, fs_info->sectorsize)); 4620 mutex_unlock(&fs_info->chunk_mutex); 4621 4622 /* Now btrfs_update_device() will change the on-disk size. */ 4623 ret = btrfs_update_device(trans, device); 4624 btrfs_end_transaction(trans); 4625 done: 4626 btrfs_free_path(path); 4627 if (ret) { 4628 mutex_lock(&fs_info->chunk_mutex); 4629 btrfs_device_set_total_bytes(device, old_size); 4630 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 4631 device->fs_devices->total_rw_bytes += diff; 4632 atomic64_add(diff, &fs_info->free_chunk_space); 4633 mutex_unlock(&fs_info->chunk_mutex); 4634 } 4635 return ret; 4636 } 4637 4638 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 4639 struct btrfs_key *key, 4640 struct btrfs_chunk *chunk, int item_size) 4641 { 4642 struct btrfs_super_block *super_copy = fs_info->super_copy; 4643 struct btrfs_disk_key disk_key; 4644 u32 array_size; 4645 u8 *ptr; 4646 4647 mutex_lock(&fs_info->chunk_mutex); 4648 array_size = btrfs_super_sys_array_size(super_copy); 4649 if (array_size + item_size + sizeof(disk_key) 4650 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { 4651 mutex_unlock(&fs_info->chunk_mutex); 4652 return -EFBIG; 4653 } 4654 4655 ptr = super_copy->sys_chunk_array + array_size; 4656 btrfs_cpu_key_to_disk(&disk_key, key); 4657 memcpy(ptr, &disk_key, sizeof(disk_key)); 4658 ptr += sizeof(disk_key); 4659 memcpy(ptr, chunk, item_size); 4660 item_size += sizeof(disk_key); 4661 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 4662 mutex_unlock(&fs_info->chunk_mutex); 4663 4664 return 0; 4665 } 4666 4667 /* 4668 * sort the devices in descending order by max_avail, total_avail 4669 */ 4670 static int btrfs_cmp_device_info(const void *a, const void *b) 4671 { 4672 const struct btrfs_device_info *di_a = a; 4673 const struct btrfs_device_info *di_b = b; 4674 4675 if (di_a->max_avail > di_b->max_avail) 4676 return -1; 4677 if (di_a->max_avail <
di_b->max_avail) 4678 return 1; 4679 if (di_a->total_avail > di_b->total_avail) 4680 return -1; 4681 if (di_a->total_avail < di_b->total_avail) 4682 return 1; 4683 return 0; 4684 } 4685 4686 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 4687 { 4688 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 4689 return; 4690 4691 btrfs_set_fs_incompat(info, RAID56); 4692 } 4693 4694 #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ 4695 - sizeof(struct btrfs_chunk)) \ 4696 / sizeof(struct btrfs_stripe) + 1) 4697 4698 #define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ 4699 - 2 * sizeof(struct btrfs_disk_key) \ 4700 - 2 * sizeof(struct btrfs_chunk)) \ 4701 / sizeof(struct btrfs_stripe) + 1) 4702 4703 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 4704 u64 start, u64 type) 4705 { 4706 struct btrfs_fs_info *info = trans->fs_info; 4707 struct btrfs_fs_devices *fs_devices = info->fs_devices; 4708 struct btrfs_device *device; 4709 struct map_lookup *map = NULL; 4710 struct extent_map_tree *em_tree; 4711 struct extent_map *em; 4712 struct btrfs_device_info *devices_info = NULL; 4713 u64 total_avail; 4714 int num_stripes; /* total number of stripes to allocate */ 4715 int data_stripes; /* number of stripes that count for 4716 block group size */ 4717 int sub_stripes; /* sub_stripes info for map */ 4718 int dev_stripes; /* stripes per dev */ 4719 int devs_max; /* max devs to use */ 4720 int devs_min; /* min devs needed */ 4721 int devs_increment; /* ndevs has to be a multiple of this */ 4722 int ncopies; /* how many copies the data has */ 4723 int ret; 4724 u64 max_stripe_size; 4725 u64 max_chunk_size; 4726 u64 stripe_size; 4727 u64 num_bytes; 4728 int ndevs; 4729 int i; 4730 int j; 4731 int index; 4732 4733 BUG_ON(!alloc_profile_is_valid(type, 0)); 4734 4735 if (list_empty(&fs_devices->alloc_list)) { 4736 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 4737 btrfs_debug(info, "%s: no writable device", __func__); 4738 return -ENOSPC; 4739 } 4740 4741 index = btrfs_bg_flags_to_raid_index(type); 4742 4743 sub_stripes = btrfs_raid_array[index].sub_stripes; 4744 dev_stripes = btrfs_raid_array[index].dev_stripes; 4745 devs_max = btrfs_raid_array[index].devs_max; 4746 devs_min = btrfs_raid_array[index].devs_min; 4747 devs_increment = btrfs_raid_array[index].devs_increment; 4748 ncopies = btrfs_raid_array[index].ncopies; 4749 4750 if (type & BTRFS_BLOCK_GROUP_DATA) { 4751 max_stripe_size = SZ_1G; 4752 max_chunk_size = 10 * max_stripe_size; 4753 if (!devs_max) 4754 devs_max = BTRFS_MAX_DEVS(info); 4755 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 4756 /* for larger filesystems, use larger metadata chunks */ 4757 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) 4758 max_stripe_size = SZ_1G; 4759 else 4760 max_stripe_size = SZ_256M; 4761 max_chunk_size = max_stripe_size; 4762 if (!devs_max) 4763 devs_max = BTRFS_MAX_DEVS(info); 4764 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 4765 max_stripe_size = SZ_32M; 4766 max_chunk_size = 2 * max_stripe_size; 4767 if (!devs_max) 4768 devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; 4769 } else { 4770 btrfs_err(info, "invalid chunk type 0x%llx requested", 4771 type); 4772 BUG_ON(1); 4773 } 4774 4775 /* we don't want a chunk larger than 10% of writeable space */ 4776 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 4777 max_chunk_size); 4778 4779 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 4780 GFP_NOFS); 4781 if (!devices_info) 4782 return -ENOMEM; 4783 4784 /* 4785 * in the first pass
through the devices list, we gather information 4786 * about the available holes on each device. 4787 */ 4788 ndevs = 0; 4789 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 4790 u64 max_avail; 4791 u64 dev_offset; 4792 4793 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4794 WARN(1, KERN_ERR 4795 "BTRFS: read-only device in alloc_list\n"); 4796 continue; 4797 } 4798 4799 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 4800 &device->dev_state) || 4801 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4802 continue; 4803 4804 if (device->total_bytes > device->bytes_used) 4805 total_avail = device->total_bytes - device->bytes_used; 4806 else 4807 total_avail = 0; 4808 4809 /* If there is no space on this device, skip it. */ 4810 if (total_avail == 0) 4811 continue; 4812 4813 ret = find_free_dev_extent(trans, device, 4814 max_stripe_size * dev_stripes, 4815 &dev_offset, &max_avail); 4816 if (ret && ret != -ENOSPC) 4817 goto error; 4818 4819 if (ret == 0) 4820 max_avail = max_stripe_size * dev_stripes; 4821 4822 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) { 4823 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 4824 btrfs_debug(info, 4825 "%s: devid %llu has no free space, have=%llu want=%u", 4826 __func__, device->devid, max_avail, 4827 BTRFS_STRIPE_LEN * dev_stripes); 4828 continue; 4829 } 4830 4831 if (ndevs == fs_devices->rw_devices) { 4832 WARN(1, "%s: found more than %llu devices\n", 4833 __func__, fs_devices->rw_devices); 4834 break; 4835 } 4836 devices_info[ndevs].dev_offset = dev_offset; 4837 devices_info[ndevs].max_avail = max_avail; 4838 devices_info[ndevs].total_avail = total_avail; 4839 devices_info[ndevs].dev = device; 4840 ++ndevs; 4841 } 4842 4843 /* 4844 * now sort the devices by hole size / available space 4845 */ 4846 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 4847 btrfs_cmp_device_info, NULL); 4848 4849 /* round down to number of usable stripes */ 4850 ndevs = round_down(ndevs, devs_increment); 4851 4852 if (ndevs < devs_min) { 4853 ret = -ENOSPC; 4854 if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 4855 btrfs_debug(info, 4856 "%s: not enough devices with free space: have=%d minimum required=%d", 4857 __func__, ndevs, devs_min); 4858 } 4859 goto error; 4860 } 4861 4862 ndevs = min(ndevs, devs_max); 4863 4864 /* 4865 * The primary goal is to maximize the number of stripes, so use as 4866 * many devices as possible, even if the stripes are not maximum sized. 4867 * 4868 * The DUP profile stores more than one stripe per device, the 4869 * max_avail is the total size so we have to adjust. 
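* For example (illustrative): DUP has dev_stripes == 2, so a device
* whose largest hole (max_avail) is 2GiB yields stripe_size == 1GiB
* from the division below, leaving room for both copies.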
4870 */ 4871 stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); 4872 num_stripes = ndevs * dev_stripes; 4873 4874 /* 4875 * this will have to be fixed for RAID1 and RAID10 over 4876 * more drives 4877 */ 4878 data_stripes = num_stripes / ncopies; 4879 4880 if (type & BTRFS_BLOCK_GROUP_RAID5) 4881 data_stripes = num_stripes - 1; 4882 4883 if (type & BTRFS_BLOCK_GROUP_RAID6) 4884 data_stripes = num_stripes - 2; 4885 4886 /* 4887 * Use the number of data stripes to figure out how big this chunk 4888 * is really going to be in terms of logical address space, 4889 * and compare that answer with the max chunk size 4890 */ 4891 if (stripe_size * data_stripes > max_chunk_size) { 4892 stripe_size = div_u64(max_chunk_size, data_stripes); 4893 4894 /* bump the answer up to a 16MB boundary */ 4895 stripe_size = round_up(stripe_size, SZ_16M); 4896 4897 /* 4898 * But don't go higher than the limits we found while searching 4899 * for free extents 4900 */ 4901 stripe_size = min(devices_info[ndevs - 1].max_avail, 4902 stripe_size); 4903 } 4904 4905 /* align to BTRFS_STRIPE_LEN */ 4906 stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); 4907 4908 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 4909 if (!map) { 4910 ret = -ENOMEM; 4911 goto error; 4912 } 4913 map->num_stripes = num_stripes; 4914 4915 for (i = 0; i < ndevs; ++i) { 4916 for (j = 0; j < dev_stripes; ++j) { 4917 int s = i * dev_stripes + j; 4918 map->stripes[s].dev = devices_info[i].dev; 4919 map->stripes[s].physical = devices_info[i].dev_offset + 4920 j * stripe_size; 4921 } 4922 } 4923 map->stripe_len = BTRFS_STRIPE_LEN; 4924 map->io_align = BTRFS_STRIPE_LEN; 4925 map->io_width = BTRFS_STRIPE_LEN; 4926 map->type = type; 4927 map->sub_stripes = sub_stripes; 4928 4929 num_bytes = stripe_size * data_stripes; 4930 4931 trace_btrfs_chunk_alloc(info, map, start, num_bytes); 4932 4933 em = alloc_extent_map(); 4934 if (!em) { 4935 kfree(map); 4936 ret = -ENOMEM; 4937 goto error; 4938 } 4939 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 4940 em->map_lookup = map; 4941 em->start = start; 4942 em->len = num_bytes; 4943 em->block_start = 0; 4944 em->block_len = em->len; 4945 em->orig_block_len = stripe_size; 4946 4947 em_tree = &info->mapping_tree.map_tree; 4948 write_lock(&em_tree->lock); 4949 ret = add_extent_mapping(em_tree, em, 0); 4950 if (ret) { 4951 write_unlock(&em_tree->lock); 4952 free_extent_map(em); 4953 goto error; 4954 } 4955 4956 list_add_tail(&em->list, &trans->transaction->pending_chunks); 4957 refcount_inc(&em->refs); 4958 write_unlock(&em_tree->lock); 4959 4960 ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes); 4961 if (ret) 4962 goto error_del_extent; 4963 4964 for (i = 0; i < map->num_stripes; i++) { 4965 num_bytes = map->stripes[i].dev->bytes_used + stripe_size; 4966 btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); 4967 } 4968 4969 atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); 4970 4971 free_extent_map(em); 4972 check_raid56_incompat_flag(info, type); 4973 4974 kfree(devices_info); 4975 return 0; 4976 4977 error_del_extent: 4978 write_lock(&em_tree->lock); 4979 remove_extent_mapping(em_tree, em); 4980 write_unlock(&em_tree->lock); 4981 4982 /* One for our allocation */ 4983 free_extent_map(em); 4984 /* One for the tree reference */ 4985 free_extent_map(em); 4986 /* One for the pending_chunks list reference */ 4987 free_extent_map(em); 4988 error: 4989 kfree(devices_info); 4990 return ret; 4991 } 4992 4993 int btrfs_finish_chunk_alloc(struct 
btrfs_trans_handle *trans, 4994 struct btrfs_fs_info *fs_info, 4995 u64 chunk_offset, u64 chunk_size) 4996 { 4997 struct btrfs_root *extent_root = fs_info->extent_root; 4998 struct btrfs_root *chunk_root = fs_info->chunk_root; 4999 struct btrfs_key key; 5000 struct btrfs_device *device; 5001 struct btrfs_chunk *chunk; 5002 struct btrfs_stripe *stripe; 5003 struct extent_map *em; 5004 struct map_lookup *map; 5005 size_t item_size; 5006 u64 dev_offset; 5007 u64 stripe_size; 5008 int i = 0; 5009 int ret = 0; 5010 5011 em = get_chunk_map(fs_info, chunk_offset, chunk_size); 5012 if (IS_ERR(em)) 5013 return PTR_ERR(em); 5014 5015 map = em->map_lookup; 5016 item_size = btrfs_chunk_item_size(map->num_stripes); 5017 stripe_size = em->orig_block_len; 5018 5019 chunk = kzalloc(item_size, GFP_NOFS); 5020 if (!chunk) { 5021 ret = -ENOMEM; 5022 goto out; 5023 } 5024 5025 /* 5026 * Take the device list mutex to prevent races with the final phase of 5027 * a device replace operation that replaces the device object associated 5028 * with the map's stripes, because the device object's id can change 5029 * at any time during that final phase of the device replace operation 5030 * (dev-replace.c:btrfs_dev_replace_finishing()). 5031 */ 5032 mutex_lock(&fs_info->fs_devices->device_list_mutex); 5033 for (i = 0; i < map->num_stripes; i++) { 5034 device = map->stripes[i].dev; 5035 dev_offset = map->stripes[i].physical; 5036 5037 ret = btrfs_update_device(trans, device); 5038 if (ret) 5039 break; 5040 ret = btrfs_alloc_dev_extent(trans, device, chunk_offset, 5041 dev_offset, stripe_size); 5042 if (ret) 5043 break; 5044 } 5045 if (ret) { 5046 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 5047 goto out; 5048 } 5049 5050 stripe = &chunk->stripe; 5051 for (i = 0; i < map->num_stripes; i++) { 5052 device = map->stripes[i].dev; 5053 dev_offset = map->stripes[i].physical; 5054 5055 btrfs_set_stack_stripe_devid(stripe, device->devid); 5056 btrfs_set_stack_stripe_offset(stripe, dev_offset); 5057 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 5058 stripe++; 5059 } 5060 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 5061 5062 btrfs_set_stack_chunk_length(chunk, chunk_size); 5063 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 5064 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 5065 btrfs_set_stack_chunk_type(chunk, map->type); 5066 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 5067 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 5068 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 5069 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 5070 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 5071 5072 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 5073 key.type = BTRFS_CHUNK_ITEM_KEY; 5074 key.offset = chunk_offset; 5075 5076 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 5077 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 5078 /* 5079 * TODO: Cleanup of inserted chunk root in case of 5080 * failure. 5081 */ 5082 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 5083 } 5084 5085 out: 5086 kfree(chunk); 5087 free_extent_map(em); 5088 return ret; 5089 } 5090 5091 /* 5092 * Chunk allocation falls into two parts. The first part does works 5093 * that make the new allocated chunk useable, but not do any operation 5094 * that modifies the chunk tree. The second part does the works that 5095 * require modifying the chunk tree. 
This division is important for the 5096 * bootstrap process of adding storage to a seed btrfs. 5097 */ 5098 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 5099 struct btrfs_fs_info *fs_info, u64 type) 5100 { 5101 u64 chunk_offset; 5102 5103 lockdep_assert_held(&fs_info->chunk_mutex); 5104 chunk_offset = find_next_chunk(fs_info); 5105 return __btrfs_alloc_chunk(trans, chunk_offset, type); 5106 } 5107 5108 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 5109 struct btrfs_fs_info *fs_info) 5110 { 5111 u64 chunk_offset; 5112 u64 sys_chunk_offset; 5113 u64 alloc_profile; 5114 int ret; 5115 5116 chunk_offset = find_next_chunk(fs_info); 5117 alloc_profile = btrfs_metadata_alloc_profile(fs_info); 5118 ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); 5119 if (ret) 5120 return ret; 5121 5122 sys_chunk_offset = find_next_chunk(fs_info); 5123 alloc_profile = btrfs_system_alloc_profile(fs_info); 5124 ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); 5125 return ret; 5126 } 5127 5128 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5129 { 5130 int max_errors; 5131 5132 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5133 BTRFS_BLOCK_GROUP_RAID10 | 5134 BTRFS_BLOCK_GROUP_RAID5 | 5135 BTRFS_BLOCK_GROUP_DUP)) { 5136 max_errors = 1; 5137 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { 5138 max_errors = 2; 5139 } else { 5140 max_errors = 0; 5141 } 5142 5143 return max_errors; 5144 } 5145 5146 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5147 { 5148 struct extent_map *em; 5149 struct map_lookup *map; 5150 int readonly = 0; 5151 int miss_ndevs = 0; 5152 int i; 5153 5154 em = get_chunk_map(fs_info, chunk_offset, 1); 5155 if (IS_ERR(em)) 5156 return 1; 5157 5158 map = em->map_lookup; 5159 for (i = 0; i < map->num_stripes; i++) { 5160 if (test_bit(BTRFS_DEV_STATE_MISSING, 5161 &map->stripes[i].dev->dev_state)) { 5162 miss_ndevs++; 5163 continue; 5164 } 5165 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5166 &map->stripes[i].dev->dev_state)) { 5167 readonly = 1; 5168 goto end; 5169 } 5170 } 5171 5172 /* 5173 * If the number of missing devices is larger than max errors, 5174 * we can not write the data into that chunk successfully, so 5175 * set it readonly. 5176 */ 5177 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5178 readonly = 1; 5179 end: 5180 free_extent_map(em); 5181 return readonly; 5182 } 5183 5184 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 5185 { 5186 extent_map_tree_init(&tree->map_tree); 5187 } 5188 5189 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 5190 { 5191 struct extent_map *em; 5192 5193 while (1) { 5194 write_lock(&tree->map_tree.lock); 5195 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 5196 if (em) 5197 remove_extent_mapping(&tree->map_tree, em); 5198 write_unlock(&tree->map_tree.lock); 5199 if (!em) 5200 break; 5201 /* once for us */ 5202 free_extent_map(em); 5203 /* once for the tree */ 5204 free_extent_map(em); 5205 } 5206 } 5207 5208 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5209 { 5210 struct extent_map *em; 5211 struct map_lookup *map; 5212 int ret; 5213 5214 em = get_chunk_map(fs_info, logical, len); 5215 if (IS_ERR(em)) 5216 /* 5217 * We could return errors for these cases, but that could get 5218 * ugly and we'd probably do the same thing which is just not do 5219 * anything else and exit, so return 1 so the callers don't try 5220 * to use other copies. 
5221 */ 5222 return 1; 5223 5224 map = em->map_lookup; 5225 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 5226 ret = map->num_stripes; 5227 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5228 ret = map->sub_stripes; 5229 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5230 ret = 2; 5231 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5232 /* 5233 * There could be two corrupted data stripes, we need 5234 * to loop retry in order to rebuild the correct data. 5235 * 5236 * Fail a stripe at a time on every retry except the 5237 * stripe under reconstruction. 5238 */ 5239 ret = map->num_stripes; 5240 else 5241 ret = 1; 5242 free_extent_map(em); 5243 5244 btrfs_dev_replace_read_lock(&fs_info->dev_replace); 5245 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 5246 fs_info->dev_replace.tgtdev) 5247 ret++; 5248 btrfs_dev_replace_read_unlock(&fs_info->dev_replace); 5249 5250 return ret; 5251 } 5252 5253 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 5254 u64 logical) 5255 { 5256 struct extent_map *em; 5257 struct map_lookup *map; 5258 unsigned long len = fs_info->sectorsize; 5259 5260 em = get_chunk_map(fs_info, logical, len); 5261 5262 if (!WARN_ON(IS_ERR(em))) { 5263 map = em->map_lookup; 5264 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5265 len = map->stripe_len * nr_data_stripes(map); 5266 free_extent_map(em); 5267 } 5268 return len; 5269 } 5270 5271 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5272 { 5273 struct extent_map *em; 5274 struct map_lookup *map; 5275 int ret = 0; 5276 5277 em = get_chunk_map(fs_info, logical, len); 5278 5279 if(!WARN_ON(IS_ERR(em))) { 5280 map = em->map_lookup; 5281 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5282 ret = 1; 5283 free_extent_map(em); 5284 } 5285 return ret; 5286 } 5287 5288 static int find_live_mirror(struct btrfs_fs_info *fs_info, 5289 struct map_lookup *map, int first, 5290 int dev_replace_is_ongoing) 5291 { 5292 int i; 5293 int num_stripes; 5294 int preferred_mirror; 5295 int tolerance; 5296 struct btrfs_device *srcdev; 5297 5298 ASSERT((map->type & 5299 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); 5300 5301 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5302 num_stripes = map->sub_stripes; 5303 else 5304 num_stripes = map->num_stripes; 5305 5306 preferred_mirror = first + current->pid % num_stripes; 5307 5308 if (dev_replace_is_ongoing && 5309 fs_info->dev_replace.cont_reading_from_srcdev_mode == 5310 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 5311 srcdev = fs_info->dev_replace.srcdev; 5312 else 5313 srcdev = NULL; 5314 5315 /* 5316 * try to avoid the drive that is the source drive for a 5317 * dev-replace procedure, only choose it if no other non-missing 5318 * mirror is available 5319 */ 5320 for (tolerance = 0; tolerance < 2; tolerance++) { 5321 if (map->stripes[preferred_mirror].dev->bdev && 5322 (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 5323 return preferred_mirror; 5324 for (i = first; i < first + num_stripes; i++) { 5325 if (map->stripes[i].dev->bdev && 5326 (tolerance || map->stripes[i].dev != srcdev)) 5327 return i; 5328 } 5329 } 5330 5331 /* we couldn't find one that doesn't fail. 
Just return something 5332 * and the io error handling code will clean up eventually 5333 */ 5334 return preferred_mirror; 5335 } 5336 5337 static inline int parity_smaller(u64 a, u64 b) 5338 { 5339 return a > b; 5340 } 5341 5342 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 5343 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) 5344 { 5345 struct btrfs_bio_stripe s; 5346 int i; 5347 u64 l; 5348 int again = 1; 5349 5350 while (again) { 5351 again = 0; 5352 for (i = 0; i < num_stripes - 1; i++) { 5353 if (parity_smaller(bbio->raid_map[i], 5354 bbio->raid_map[i+1])) { 5355 s = bbio->stripes[i]; 5356 l = bbio->raid_map[i]; 5357 bbio->stripes[i] = bbio->stripes[i+1]; 5358 bbio->raid_map[i] = bbio->raid_map[i+1]; 5359 bbio->stripes[i+1] = s; 5360 bbio->raid_map[i+1] = l; 5361 5362 again = 1; 5363 } 5364 } 5365 } 5366 } 5367 5368 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) 5369 { 5370 struct btrfs_bio *bbio = kzalloc( 5371 /* the size of the btrfs_bio */ 5372 sizeof(struct btrfs_bio) + 5373 /* plus the variable array for the stripes */ 5374 sizeof(struct btrfs_bio_stripe) * (total_stripes) + 5375 /* plus the variable array for the tgt dev */ 5376 sizeof(int) * (real_stripes) + 5377 /* 5378 * plus the raid_map, which includes both the tgt dev 5379 * and the stripes 5380 */ 5381 sizeof(u64) * (total_stripes), 5382 GFP_NOFS|__GFP_NOFAIL); 5383 5384 atomic_set(&bbio->error, 0); 5385 refcount_set(&bbio->refs, 1); 5386 5387 return bbio; 5388 } 5389 5390 void btrfs_get_bbio(struct btrfs_bio *bbio) 5391 { 5392 WARN_ON(!refcount_read(&bbio->refs)); 5393 refcount_inc(&bbio->refs); 5394 } 5395 5396 void btrfs_put_bbio(struct btrfs_bio *bbio) 5397 { 5398 if (!bbio) 5399 return; 5400 if (refcount_dec_and_test(&bbio->refs)) 5401 kfree(bbio); 5402 } 5403 5404 /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ 5405 /* 5406 * Please note that, discard won't be sent to target device of device 5407 * replace. 
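* (Illustrative) The bbio returned below lists, for each stripe, the
* device and physical byte range to discard; RAID5/6 chunks are
* rejected with -EOPNOTSUPP since raid56 discard is not handled yet.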
5408 */ 5409 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, 5410 u64 logical, u64 length, 5411 struct btrfs_bio **bbio_ret) 5412 { 5413 struct extent_map *em; 5414 struct map_lookup *map; 5415 struct btrfs_bio *bbio; 5416 u64 offset; 5417 u64 stripe_nr; 5418 u64 stripe_nr_end; 5419 u64 stripe_end_offset; 5420 u64 stripe_cnt; 5421 u64 stripe_len; 5422 u64 stripe_offset; 5423 u64 num_stripes; 5424 u32 stripe_index; 5425 u32 factor = 0; 5426 u32 sub_stripes = 0; 5427 u64 stripes_per_dev = 0; 5428 u32 remaining_stripes = 0; 5429 u32 last_stripe = 0; 5430 int ret = 0; 5431 int i; 5432 5433 /* discard always return a bbio */ 5434 ASSERT(bbio_ret); 5435 5436 em = get_chunk_map(fs_info, logical, length); 5437 if (IS_ERR(em)) 5438 return PTR_ERR(em); 5439 5440 map = em->map_lookup; 5441 /* we don't discard raid56 yet */ 5442 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5443 ret = -EOPNOTSUPP; 5444 goto out; 5445 } 5446 5447 offset = logical - em->start; 5448 length = min_t(u64, em->len - offset, length); 5449 5450 stripe_len = map->stripe_len; 5451 /* 5452 * stripe_nr counts the total number of stripes we have to stride 5453 * to get to this block 5454 */ 5455 stripe_nr = div64_u64(offset, stripe_len); 5456 5457 /* stripe_offset is the offset of this block in its stripe */ 5458 stripe_offset = offset - stripe_nr * stripe_len; 5459 5460 stripe_nr_end = round_up(offset + length, map->stripe_len); 5461 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 5462 stripe_cnt = stripe_nr_end - stripe_nr; 5463 stripe_end_offset = stripe_nr_end * map->stripe_len - 5464 (offset + length); 5465 /* 5466 * after this, stripe_nr is the number of stripes on this 5467 * device we have to walk to find the data, and stripe_index is 5468 * the number of our device in the stripe array 5469 */ 5470 num_stripes = 1; 5471 stripe_index = 0; 5472 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5473 BTRFS_BLOCK_GROUP_RAID10)) { 5474 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5475 sub_stripes = 1; 5476 else 5477 sub_stripes = map->sub_stripes; 5478 5479 factor = map->num_stripes / sub_stripes; 5480 num_stripes = min_t(u64, map->num_stripes, 5481 sub_stripes * stripe_cnt); 5482 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5483 stripe_index *= sub_stripes; 5484 stripes_per_dev = div_u64_rem(stripe_cnt, factor, 5485 &remaining_stripes); 5486 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 5487 last_stripe *= sub_stripes; 5488 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5489 BTRFS_BLOCK_GROUP_DUP)) { 5490 num_stripes = map->num_stripes; 5491 } else { 5492 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5493 &stripe_index); 5494 } 5495 5496 bbio = alloc_btrfs_bio(num_stripes, 0); 5497 if (!bbio) { 5498 ret = -ENOMEM; 5499 goto out; 5500 } 5501 5502 for (i = 0; i < num_stripes; i++) { 5503 bbio->stripes[i].physical = 5504 map->stripes[stripe_index].physical + 5505 stripe_offset + stripe_nr * map->stripe_len; 5506 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 5507 5508 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5509 BTRFS_BLOCK_GROUP_RAID10)) { 5510 bbio->stripes[i].length = stripes_per_dev * 5511 map->stripe_len; 5512 5513 if (i / sub_stripes < remaining_stripes) 5514 bbio->stripes[i].length += 5515 map->stripe_len; 5516 5517 /* 5518 * Special for the first stripe and 5519 * the last stripe: 5520 * 5521 * |-------|...|-------| 5522 * |----------| 5523 * off end_off 5524 */ 5525 if (i < sub_stripes) 5526 bbio->stripes[i].length -= 5527 stripe_offset; 5528 5529 if 
(stripe_index >= last_stripe && 5530 stripe_index <= (last_stripe + 5531 sub_stripes - 1)) 5532 bbio->stripes[i].length -= 5533 stripe_end_offset; 5534 5535 if (i == sub_stripes - 1) 5536 stripe_offset = 0; 5537 } else { 5538 bbio->stripes[i].length = length; 5539 } 5540 5541 stripe_index++; 5542 if (stripe_index == map->num_stripes) { 5543 stripe_index = 0; 5544 stripe_nr++; 5545 } 5546 } 5547 5548 *bbio_ret = bbio; 5549 bbio->map_type = map->type; 5550 bbio->num_stripes = num_stripes; 5551 out: 5552 free_extent_map(em); 5553 return ret; 5554 } 5555 5556 /* 5557 * In dev-replace case, for repair case (that's the only case where the mirror 5558 * is selected explicitly when calling btrfs_map_block), blocks left of the 5559 * left cursor can also be read from the target drive. 5560 * 5561 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the 5562 * array of stripes. 5563 * For READ, it also needs to be supported using the same mirror number. 5564 * 5565 * If the requested block is not left of the left cursor, EIO is returned. This 5566 * can happen because btrfs_num_copies() returns one more in the dev-replace 5567 * case. 5568 */ 5569 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, 5570 u64 logical, u64 length, 5571 u64 srcdev_devid, int *mirror_num, 5572 u64 *physical) 5573 { 5574 struct btrfs_bio *bbio = NULL; 5575 int num_stripes; 5576 int index_srcdev = 0; 5577 int found = 0; 5578 u64 physical_of_found = 0; 5579 int i; 5580 int ret = 0; 5581 5582 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 5583 logical, &length, &bbio, 0, 0); 5584 if (ret) { 5585 ASSERT(bbio == NULL); 5586 return ret; 5587 } 5588 5589 num_stripes = bbio->num_stripes; 5590 if (*mirror_num > num_stripes) { 5591 /* 5592 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, 5593 * that means that the requested area is not left of the left 5594 * cursor 5595 */ 5596 btrfs_put_bbio(bbio); 5597 return -EIO; 5598 } 5599 5600 /* 5601 * process the rest of the function using the mirror_num of the source 5602 * drive. Therefore look it up first. At the end, patch the device 5603 * pointer to the one of the target drive. 5604 */ 5605 for (i = 0; i < num_stripes; i++) { 5606 if (bbio->stripes[i].dev->devid != srcdev_devid) 5607 continue; 5608 5609 /* 5610 * In case of DUP, in order to keep it simple, only add the 5611 * mirror with the lowest physical address 5612 */ 5613 if (found && 5614 physical_of_found <= bbio->stripes[i].physical) 5615 continue; 5616 5617 index_srcdev = i; 5618 found = 1; 5619 physical_of_found = bbio->stripes[i].physical; 5620 } 5621 5622 btrfs_put_bbio(bbio); 5623 5624 ASSERT(found); 5625 if (!found) 5626 return -EIO; 5627 5628 *mirror_num = index_srcdev + 1; 5629 *physical = physical_of_found; 5630 return ret; 5631 } 5632 5633 static void handle_ops_on_dev_replace(enum btrfs_map_op op, 5634 struct btrfs_bio **bbio_ret, 5635 struct btrfs_dev_replace *dev_replace, 5636 int *num_stripes_ret, int *max_errors_ret) 5637 { 5638 struct btrfs_bio *bbio = *bbio_ret; 5639 u64 srcdev_devid = dev_replace->srcdev->devid; 5640 int tgtdev_indexes = 0; 5641 int num_stripes = *num_stripes_ret; 5642 int max_errors = *max_errors_ret; 5643 int i; 5644 5645 if (op == BTRFS_MAP_WRITE) { 5646 int index_where_to_add; 5647 5648 /* 5649 * duplicate the write operations while the dev replace 5650 * procedure is running. 
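* (For example, illustratively: a write whose stripe maps to the source
* device gets an extra stripe appended that points at the same physical
* offset on the target device, and max_errors is bumped accordingly.)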
Since the copying of the old disk to 5651 * the new disk takes place at run time while the filesystem is 5652 * mounted writable, the regular write operations to the old 5653 * disk have to be duplicated to go to the new disk as well. 5654 * 5655 * Note that device->missing is handled by the caller, and that 5656 * the write to the old disk is already set up in the stripes 5657 * array. 5658 */ 5659 index_where_to_add = num_stripes; 5660 for (i = 0; i < num_stripes; i++) { 5661 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5662 /* write to new disk, too */ 5663 struct btrfs_bio_stripe *new = 5664 bbio->stripes + index_where_to_add; 5665 struct btrfs_bio_stripe *old = 5666 bbio->stripes + i; 5667 5668 new->physical = old->physical; 5669 new->length = old->length; 5670 new->dev = dev_replace->tgtdev; 5671 bbio->tgtdev_map[i] = index_where_to_add; 5672 index_where_to_add++; 5673 max_errors++; 5674 tgtdev_indexes++; 5675 } 5676 } 5677 num_stripes = index_where_to_add; 5678 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 5679 int index_srcdev = 0; 5680 int found = 0; 5681 u64 physical_of_found = 0; 5682 5683 /* 5684 * During the dev-replace procedure, the target drive can also 5685 * be used to read data in case it is needed to repair a corrupt 5686 * block elsewhere. This is possible if the requested area is 5687 * left of the left cursor. In this area, the target drive is a 5688 * full copy of the source drive. 5689 */ 5690 for (i = 0; i < num_stripes; i++) { 5691 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5692 /* 5693 * In case of DUP, in order to keep it simple, 5694 * only add the mirror with the lowest physical 5695 * address 5696 */ 5697 if (found && 5698 physical_of_found <= 5699 bbio->stripes[i].physical) 5700 continue; 5701 index_srcdev = i; 5702 found = 1; 5703 physical_of_found = bbio->stripes[i].physical; 5704 } 5705 } 5706 if (found) { 5707 struct btrfs_bio_stripe *tgtdev_stripe = 5708 bbio->stripes + num_stripes; 5709 5710 tgtdev_stripe->physical = physical_of_found; 5711 tgtdev_stripe->length = 5712 bbio->stripes[index_srcdev].length; 5713 tgtdev_stripe->dev = dev_replace->tgtdev; 5714 bbio->tgtdev_map[index_srcdev] = num_stripes; 5715 5716 tgtdev_indexes++; 5717 num_stripes++; 5718 } 5719 } 5720 5721 *num_stripes_ret = num_stripes; 5722 *max_errors_ret = max_errors; 5723 bbio->num_tgtdevs = tgtdev_indexes; 5724 *bbio_ret = bbio; 5725 } 5726 5727 static bool need_full_stripe(enum btrfs_map_op op) 5728 { 5729 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 5730 } 5731 5732 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 5733 enum btrfs_map_op op, 5734 u64 logical, u64 *length, 5735 struct btrfs_bio **bbio_ret, 5736 int mirror_num, int need_raid_map) 5737 { 5738 struct extent_map *em; 5739 struct map_lookup *map; 5740 u64 offset; 5741 u64 stripe_offset; 5742 u64 stripe_nr; 5743 u64 stripe_len; 5744 u32 stripe_index; 5745 int i; 5746 int ret = 0; 5747 int num_stripes; 5748 int max_errors = 0; 5749 int tgtdev_indexes = 0; 5750 struct btrfs_bio *bbio = NULL; 5751 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 5752 int dev_replace_is_ongoing = 0; 5753 int num_alloc_stripes; 5754 int patch_the_first_stripe_for_dev_replace = 0; 5755 u64 physical_to_patch_in_first_stripe = 0; 5756 u64 raid56_full_stripe_start = (u64)-1; 5757 5758 if (op == BTRFS_MAP_DISCARD) 5759 return __btrfs_map_block_for_discard(fs_info, logical, 5760 *length, bbio_ret); 5761 5762 em = get_chunk_map(fs_info, logical, *length); 5763 if (IS_ERR(em)) 5764 
return PTR_ERR(em); 5765 5766 map = em->map_lookup; 5767 offset = logical - em->start; 5768 5769 stripe_len = map->stripe_len; 5770 stripe_nr = offset; 5771 /* 5772 * stripe_nr counts the total number of stripes we have to stride 5773 * to get to this block 5774 */ 5775 stripe_nr = div64_u64(stripe_nr, stripe_len); 5776 5777 stripe_offset = stripe_nr * stripe_len; 5778 if (offset < stripe_offset) { 5779 btrfs_crit(fs_info, 5780 "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu", 5781 stripe_offset, offset, em->start, logical, 5782 stripe_len); 5783 free_extent_map(em); 5784 return -EINVAL; 5785 } 5786 5787 /* stripe_offset is the offset of this block in its stripe*/ 5788 stripe_offset = offset - stripe_offset; 5789 5790 /* if we're here for raid56, we need to know the stripe aligned start */ 5791 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5792 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 5793 raid56_full_stripe_start = offset; 5794 5795 /* allow a write of a full stripe, but make sure we don't 5796 * allow straddling of stripes 5797 */ 5798 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 5799 full_stripe_len); 5800 raid56_full_stripe_start *= full_stripe_len; 5801 } 5802 5803 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 5804 u64 max_len; 5805 /* For writes to RAID[56], allow a full stripeset across all disks. 5806 For other RAID types and for RAID[56] reads, just allow a single 5807 stripe (on a single disk). */ 5808 if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 5809 (op == BTRFS_MAP_WRITE)) { 5810 max_len = stripe_len * nr_data_stripes(map) - 5811 (offset - raid56_full_stripe_start); 5812 } else { 5813 /* we limit the length of each bio to what fits in a stripe */ 5814 max_len = stripe_len - stripe_offset; 5815 } 5816 *length = min_t(u64, em->len - offset, max_len); 5817 } else { 5818 *length = em->len - offset; 5819 } 5820 5821 /* This is for when we're called from btrfs_merge_bio_hook() and all 5822 it cares about is the length */ 5823 if (!bbio_ret) 5824 goto out; 5825 5826 btrfs_dev_replace_read_lock(dev_replace); 5827 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 5828 if (!dev_replace_is_ongoing) 5829 btrfs_dev_replace_read_unlock(dev_replace); 5830 else 5831 btrfs_dev_replace_set_lock_blocking(dev_replace); 5832 5833 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 5834 !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 5835 ret = get_extra_mirror_from_replace(fs_info, logical, *length, 5836 dev_replace->srcdev->devid, 5837 &mirror_num, 5838 &physical_to_patch_in_first_stripe); 5839 if (ret) 5840 goto out; 5841 else 5842 patch_the_first_stripe_for_dev_replace = 1; 5843 } else if (mirror_num > map->num_stripes) { 5844 mirror_num = 0; 5845 } 5846 5847 num_stripes = 1; 5848 stripe_index = 0; 5849 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5850 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5851 &stripe_index); 5852 if (!need_full_stripe(op)) 5853 mirror_num = 1; 5854 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 5855 if (need_full_stripe(op)) 5856 num_stripes = map->num_stripes; 5857 else if (mirror_num) 5858 stripe_index = mirror_num - 1; 5859 else { 5860 stripe_index = find_live_mirror(fs_info, map, 0, 5861 dev_replace_is_ongoing); 5862 mirror_num = stripe_index + 1; 5863 } 5864 5865 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 5866 if (need_full_stripe(op)) { 5867 num_stripes = map->num_stripes; 5868 } else if 
(mirror_num) { 5869 stripe_index = mirror_num - 1; 5870 } else { 5871 mirror_num = 1; 5872 } 5873 5874 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5875 u32 factor = map->num_stripes / map->sub_stripes; 5876 5877 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5878 stripe_index *= map->sub_stripes; 5879 5880 if (need_full_stripe(op)) 5881 num_stripes = map->sub_stripes; 5882 else if (mirror_num) 5883 stripe_index += mirror_num - 1; 5884 else { 5885 int old_stripe_index = stripe_index; 5886 stripe_index = find_live_mirror(fs_info, map, 5887 stripe_index, 5888 dev_replace_is_ongoing); 5889 mirror_num = stripe_index - old_stripe_index + 1; 5890 } 5891 5892 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5893 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 5894 /* push stripe_nr back to the start of the full stripe */ 5895 stripe_nr = div64_u64(raid56_full_stripe_start, 5896 stripe_len * nr_data_stripes(map)); 5897 5898 /* RAID[56] write or recovery. Return all stripes */ 5899 num_stripes = map->num_stripes; 5900 max_errors = nr_parity_stripes(map); 5901 5902 *length = map->stripe_len; 5903 stripe_index = 0; 5904 stripe_offset = 0; 5905 } else { 5906 /* 5907 * Mirror #0 or #1 means the original data block. 5908 * Mirror #2 is RAID5 parity block. 5909 * Mirror #3 is RAID6 Q block. 5910 */ 5911 stripe_nr = div_u64_rem(stripe_nr, 5912 nr_data_stripes(map), &stripe_index); 5913 if (mirror_num > 1) 5914 stripe_index = nr_data_stripes(map) + 5915 mirror_num - 2; 5916 5917 /* We distribute the parity blocks across stripes */ 5918 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 5919 &stripe_index); 5920 if (!need_full_stripe(op) && mirror_num <= 1) 5921 mirror_num = 1; 5922 } 5923 } else { 5924 /* 5925 * after this, stripe_nr is the number of stripes on this 5926 * device we have to walk to find the data, and stripe_index is 5927 * the number of our device in the stripe array 5928 */ 5929 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5930 &stripe_index); 5931 mirror_num = stripe_index + 1; 5932 } 5933 if (stripe_index >= map->num_stripes) { 5934 btrfs_crit(fs_info, 5935 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 5936 stripe_index, map->num_stripes); 5937 ret = -EINVAL; 5938 goto out; 5939 } 5940 5941 num_alloc_stripes = num_stripes; 5942 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 5943 if (op == BTRFS_MAP_WRITE) 5944 num_alloc_stripes <<= 1; 5945 if (op == BTRFS_MAP_GET_READ_MIRRORS) 5946 num_alloc_stripes++; 5947 tgtdev_indexes = num_stripes; 5948 } 5949 5950 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); 5951 if (!bbio) { 5952 ret = -ENOMEM; 5953 goto out; 5954 } 5955 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) 5956 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); 5957 5958 /* build raid_map */ 5959 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 5960 (need_full_stripe(op) || mirror_num > 1)) { 5961 u64 tmp; 5962 unsigned rot; 5963 5964 bbio->raid_map = (u64 *)((void *)bbio->stripes + 5965 sizeof(struct btrfs_bio_stripe) * 5966 num_alloc_stripes + 5967 sizeof(int) * tgtdev_indexes); 5968 5969 /* Work out the disk rotation on this stripe-set */ 5970 div_u64_rem(stripe_nr, num_stripes, &rot); 5971 5972 /* Fill in the logical address of each stripe */ 5973 tmp = stripe_nr * nr_data_stripes(map); 5974 for (i = 0; i < nr_data_stripes(map); i++) 5975 bbio->raid_map[(i+rot) % num_stripes] = 5976 em->start + (tmp + i) * map->stripe_len; 5977 
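		/*
		 * Illustrative example (values assumed, not taken from this
		 * code): for a four-device RAID5 chunk, nr_data_stripes() is
		 * 3 and num_stripes is 4. With rot == 1 the loop above puts
		 * the data stripes into raid_map slots 1, 2 and 3, and the
		 * statement below marks slot (3 + 1) % 4 == 0 as the P
		 * (parity) stripe, matching the rotation worked out for this
		 * stripe-set.
		 */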
5978 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 5979 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5980 bbio->raid_map[(i+rot+1) % num_stripes] = 5981 RAID6_Q_STRIPE; 5982 } 5983 5984 5985 for (i = 0; i < num_stripes; i++) { 5986 bbio->stripes[i].physical = 5987 map->stripes[stripe_index].physical + 5988 stripe_offset + 5989 stripe_nr * map->stripe_len; 5990 bbio->stripes[i].dev = 5991 map->stripes[stripe_index].dev; 5992 stripe_index++; 5993 } 5994 5995 if (need_full_stripe(op)) 5996 max_errors = btrfs_chunk_max_errors(map); 5997 5998 if (bbio->raid_map) 5999 sort_parity_stripes(bbio, num_stripes); 6000 6001 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 6002 need_full_stripe(op)) { 6003 handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, 6004 &max_errors); 6005 } 6006 6007 *bbio_ret = bbio; 6008 bbio->map_type = map->type; 6009 bbio->num_stripes = num_stripes; 6010 bbio->max_errors = max_errors; 6011 bbio->mirror_num = mirror_num; 6012 6013 /* 6014 * this is the case that REQ_READ && dev_replace_is_ongoing && 6015 * mirror_num == num_stripes + 1 && dev_replace target drive is 6016 * available as a mirror 6017 */ 6018 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 6019 WARN_ON(num_stripes > 1); 6020 bbio->stripes[0].dev = dev_replace->tgtdev; 6021 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 6022 bbio->mirror_num = map->num_stripes + 1; 6023 } 6024 out: 6025 if (dev_replace_is_ongoing) { 6026 btrfs_dev_replace_clear_lock_blocking(dev_replace); 6027 btrfs_dev_replace_read_unlock(dev_replace); 6028 } 6029 free_extent_map(em); 6030 return ret; 6031 } 6032 6033 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6034 u64 logical, u64 *length, 6035 struct btrfs_bio **bbio_ret, int mirror_num) 6036 { 6037 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 6038 mirror_num, 0); 6039 } 6040 6041 /* For Scrub/replace */ 6042 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6043 u64 logical, u64 *length, 6044 struct btrfs_bio **bbio_ret) 6045 { 6046 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); 6047 } 6048 6049 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, 6050 u64 chunk_start, u64 physical, u64 devid, 6051 u64 **logical, int *naddrs, int *stripe_len) 6052 { 6053 struct extent_map *em; 6054 struct map_lookup *map; 6055 u64 *buf; 6056 u64 bytenr; 6057 u64 length; 6058 u64 stripe_nr; 6059 u64 rmap_len; 6060 int i, j, nr = 0; 6061 6062 em = get_chunk_map(fs_info, chunk_start, 1); 6063 if (IS_ERR(em)) 6064 return -EIO; 6065 6066 map = em->map_lookup; 6067 length = em->len; 6068 rmap_len = map->stripe_len; 6069 6070 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 6071 length = div_u64(length, map->num_stripes / map->sub_stripes); 6072 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 6073 length = div_u64(length, map->num_stripes); 6074 else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6075 length = div_u64(length, nr_data_stripes(map)); 6076 rmap_len = map->stripe_len * nr_data_stripes(map); 6077 } 6078 6079 buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); 6080 BUG_ON(!buf); /* -ENOMEM */ 6081 6082 for (i = 0; i < map->num_stripes; i++) { 6083 if (devid && map->stripes[i].dev->devid != devid) 6084 continue; 6085 if (map->stripes[i].physical > physical || 6086 map->stripes[i].physical + length <= physical) 6087 continue; 6088 6089 stripe_nr = physical - map->stripes[i].physical; 6090 stripe_nr = div64_u64(stripe_nr, map->stripe_len); 6091 
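		/*
		 * stripe_nr is now the stripe number of @physical within this
		 * device's part of the chunk; the per-profile branches below
		 * turn it into a chunk-relative stripe number before it is
		 * scaled by rmap_len. As an assumed example: for RAID0 over
		 * two devices, physical stripe 3 on device i == 1 becomes
		 * chunk stripe 3 * 2 + 1 == 7, i.e. logical address
		 * chunk_start + 7 * rmap_len.
		 */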
6092 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 6093 stripe_nr = stripe_nr * map->num_stripes + i; 6094 stripe_nr = div_u64(stripe_nr, map->sub_stripes); 6095 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 6096 stripe_nr = stripe_nr * map->num_stripes + i; 6097 } /* else if RAID[56], multiply by nr_data_stripes(). 6098 * Alternatively, just use rmap_len below instead of 6099 * map->stripe_len */ 6100 6101 bytenr = chunk_start + stripe_nr * rmap_len; 6102 WARN_ON(nr >= map->num_stripes); 6103 for (j = 0; j < nr; j++) { 6104 if (buf[j] == bytenr) 6105 break; 6106 } 6107 if (j == nr) { 6108 WARN_ON(nr >= map->num_stripes); 6109 buf[nr++] = bytenr; 6110 } 6111 } 6112 6113 *logical = buf; 6114 *naddrs = nr; 6115 *stripe_len = rmap_len; 6116 6117 free_extent_map(em); 6118 return 0; 6119 } 6120 6121 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) 6122 { 6123 bio->bi_private = bbio->private; 6124 bio->bi_end_io = bbio->end_io; 6125 bio_endio(bio); 6126 6127 btrfs_put_bbio(bbio); 6128 } 6129 6130 static void btrfs_end_bio(struct bio *bio) 6131 { 6132 struct btrfs_bio *bbio = bio->bi_private; 6133 int is_orig_bio = 0; 6134 6135 if (bio->bi_status) { 6136 atomic_inc(&bbio->error); 6137 if (bio->bi_status == BLK_STS_IOERR || 6138 bio->bi_status == BLK_STS_TARGET) { 6139 unsigned int stripe_index = 6140 btrfs_io_bio(bio)->stripe_index; 6141 struct btrfs_device *dev; 6142 6143 BUG_ON(stripe_index >= bbio->num_stripes); 6144 dev = bbio->stripes[stripe_index].dev; 6145 if (dev->bdev) { 6146 if (bio_op(bio) == REQ_OP_WRITE) 6147 btrfs_dev_stat_inc_and_print(dev, 6148 BTRFS_DEV_STAT_WRITE_ERRS); 6149 else 6150 btrfs_dev_stat_inc_and_print(dev, 6151 BTRFS_DEV_STAT_READ_ERRS); 6152 if (bio->bi_opf & REQ_PREFLUSH) 6153 btrfs_dev_stat_inc_and_print(dev, 6154 BTRFS_DEV_STAT_FLUSH_ERRS); 6155 } 6156 } 6157 } 6158 6159 if (bio == bbio->orig_bio) 6160 is_orig_bio = 1; 6161 6162 btrfs_bio_counter_dec(bbio->fs_info); 6163 6164 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6165 if (!is_orig_bio) { 6166 bio_put(bio); 6167 bio = bbio->orig_bio; 6168 } 6169 6170 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6171 /* only send an error to the higher layers if it is 6172 * beyond the tolerance of the btrfs bio 6173 */ 6174 if (atomic_read(&bbio->error) > bbio->max_errors) { 6175 bio->bi_status = BLK_STS_IOERR; 6176 } else { 6177 /* 6178 * this bio is actually up to date, we didn't 6179 * go over the max number of errors 6180 */ 6181 bio->bi_status = BLK_STS_OK; 6182 } 6183 6184 btrfs_end_bbio(bbio, bio); 6185 } else if (!is_orig_bio) { 6186 bio_put(bio); 6187 } 6188 } 6189 6190 /* 6191 * see run_scheduled_bios for a description of why bios are collected for 6192 * async submit. 6193 * 6194 * This will add one bio to the pending list for a device and make sure 6195 * the work struct is scheduled. 
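 *
 * Writes are appended to either the sync or the regular pending list,
 * based on op_is_sync(); reads skip the queues and are submitted
 * directly. The worker is only queued when the device is not already
 * running its pending bios.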
6196 */ 6197 static noinline void btrfs_schedule_bio(struct btrfs_device *device, 6198 struct bio *bio) 6199 { 6200 struct btrfs_fs_info *fs_info = device->fs_info; 6201 int should_queue = 1; 6202 struct btrfs_pending_bios *pending_bios; 6203 6204 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || 6205 !device->bdev) { 6206 bio_io_error(bio); 6207 return; 6208 } 6209 6210 /* don't bother with additional async steps for reads, right now */ 6211 if (bio_op(bio) == REQ_OP_READ) { 6212 btrfsic_submit_bio(bio); 6213 return; 6214 } 6215 6216 WARN_ON(bio->bi_next); 6217 bio->bi_next = NULL; 6218 6219 spin_lock(&device->io_lock); 6220 if (op_is_sync(bio->bi_opf)) 6221 pending_bios = &device->pending_sync_bios; 6222 else 6223 pending_bios = &device->pending_bios; 6224 6225 if (pending_bios->tail) 6226 pending_bios->tail->bi_next = bio; 6227 6228 pending_bios->tail = bio; 6229 if (!pending_bios->head) 6230 pending_bios->head = bio; 6231 if (device->running_pending) 6232 should_queue = 0; 6233 6234 spin_unlock(&device->io_lock); 6235 6236 if (should_queue) 6237 btrfs_queue_work(fs_info->submit_workers, &device->work); 6238 } 6239 6240 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, 6241 u64 physical, int dev_nr, int async) 6242 { 6243 struct btrfs_device *dev = bbio->stripes[dev_nr].dev; 6244 struct btrfs_fs_info *fs_info = bbio->fs_info; 6245 6246 bio->bi_private = bbio; 6247 btrfs_io_bio(bio)->stripe_index = dev_nr; 6248 bio->bi_end_io = btrfs_end_bio; 6249 bio->bi_iter.bi_sector = physical >> 9; 6250 #ifdef DEBUG 6251 { 6252 struct rcu_string *name; 6253 6254 rcu_read_lock(); 6255 name = rcu_dereference(dev->name); 6256 btrfs_debug(fs_info, 6257 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6258 bio_op(bio), bio->bi_opf, 6259 (u64)bio->bi_iter.bi_sector, 6260 (u_long)dev->bdev->bd_dev, name->str, dev->devid, 6261 bio->bi_iter.bi_size); 6262 rcu_read_unlock(); 6263 } 6264 #endif 6265 bio_set_dev(bio, dev->bdev); 6266 6267 btrfs_bio_counter_inc_noblocked(fs_info); 6268 6269 if (async) 6270 btrfs_schedule_bio(dev, bio); 6271 else 6272 btrfsic_submit_bio(bio); 6273 } 6274 6275 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 6276 { 6277 atomic_inc(&bbio->error); 6278 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6279 /* Should be the original bio. 
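 * bbio_error() is only called from btrfs_map_bio() with the original
 * (first) bio, never with one of the cloned per-stripe bios, so this
 * check should always hold.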
*/ 6280 WARN_ON(bio != bbio->orig_bio); 6281 6282 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6283 bio->bi_iter.bi_sector = logical >> 9; 6284 if (atomic_read(&bbio->error) > bbio->max_errors) 6285 bio->bi_status = BLK_STS_IOERR; 6286 else 6287 bio->bi_status = BLK_STS_OK; 6288 btrfs_end_bbio(bbio, bio); 6289 } 6290 } 6291 6292 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 6293 int mirror_num, int async_submit) 6294 { 6295 struct btrfs_device *dev; 6296 struct bio *first_bio = bio; 6297 u64 logical = (u64)bio->bi_iter.bi_sector << 9; 6298 u64 length = 0; 6299 u64 map_length; 6300 int ret; 6301 int dev_nr; 6302 int total_devs; 6303 struct btrfs_bio *bbio = NULL; 6304 6305 length = bio->bi_iter.bi_size; 6306 map_length = length; 6307 6308 btrfs_bio_counter_inc_blocked(fs_info); 6309 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, 6310 &map_length, &bbio, mirror_num, 1); 6311 if (ret) { 6312 btrfs_bio_counter_dec(fs_info); 6313 return errno_to_blk_status(ret); 6314 } 6315 6316 total_devs = bbio->num_stripes; 6317 bbio->orig_bio = first_bio; 6318 bbio->private = first_bio->bi_private; 6319 bbio->end_io = first_bio->bi_end_io; 6320 bbio->fs_info = fs_info; 6321 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 6322 6323 if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 6324 ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { 6325 /* In this case, map_length has been set to the length of 6326 a single stripe; not the whole write */ 6327 if (bio_op(bio) == REQ_OP_WRITE) { 6328 ret = raid56_parity_write(fs_info, bio, bbio, 6329 map_length); 6330 } else { 6331 ret = raid56_parity_recover(fs_info, bio, bbio, 6332 map_length, mirror_num, 1); 6333 } 6334 6335 btrfs_bio_counter_dec(fs_info); 6336 return errno_to_blk_status(ret); 6337 } 6338 6339 if (map_length < length) { 6340 btrfs_crit(fs_info, 6341 "mapping failed logical %llu bio len %llu len %llu", 6342 logical, length, map_length); 6343 BUG(); 6344 } 6345 6346 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { 6347 dev = bbio->stripes[dev_nr].dev; 6348 if (!dev || !dev->bdev || 6349 (bio_op(first_bio) == REQ_OP_WRITE && 6350 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { 6351 bbio_error(bbio, first_bio, logical); 6352 continue; 6353 } 6354 6355 if (dev_nr < total_devs - 1) 6356 bio = btrfs_bio_clone(first_bio); 6357 else 6358 bio = first_bio; 6359 6360 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, 6361 dev_nr, async_submit); 6362 } 6363 btrfs_bio_counter_dec(fs_info); 6364 return BLK_STS_OK; 6365 } 6366 6367 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 6368 u8 *uuid, u8 *fsid) 6369 { 6370 struct btrfs_device *device; 6371 struct btrfs_fs_devices *cur_devices; 6372 6373 cur_devices = fs_info->fs_devices; 6374 while (cur_devices) { 6375 if (!fsid || 6376 !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) { 6377 device = find_device(cur_devices, devid, uuid); 6378 if (device) 6379 return device; 6380 } 6381 cur_devices = cur_devices->seed; 6382 } 6383 return NULL; 6384 } 6385 6386 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6387 u64 devid, u8 *dev_uuid) 6388 { 6389 struct btrfs_device *device; 6390 6391 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6392 if (IS_ERR(device)) 6393 return device; 6394 6395 list_add(&device->dev_list, &fs_devices->devices); 6396 device->fs_devices = fs_devices; 6397 fs_devices->num_devices++; 6398 6399 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6400 
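	/*
	 * The device is referenced by the metadata but is not present, so
	 * account it as missing in this fs_devices set.
	 */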
fs_devices->missing_devices++; 6401 6402 return device; 6403 } 6404 6405 /** 6406 * btrfs_alloc_device - allocate struct btrfs_device 6407 * @fs_info: used only for generating a new devid, can be NULL if 6408 * devid is provided (i.e. @devid != NULL). 6409 * @devid: a pointer to devid for this device. If NULL a new devid 6410 * is generated. 6411 * @uuid: a pointer to UUID for this device. If NULL a new UUID 6412 * is generated. 6413 * 6414 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 6415 * on error. Returned struct is not linked onto any lists and must be 6416 * destroyed with free_device. 6417 */ 6418 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 6419 const u64 *devid, 6420 const u8 *uuid) 6421 { 6422 struct btrfs_device *dev; 6423 u64 tmp; 6424 6425 if (WARN_ON(!devid && !fs_info)) 6426 return ERR_PTR(-EINVAL); 6427 6428 dev = __alloc_device(); 6429 if (IS_ERR(dev)) 6430 return dev; 6431 6432 if (devid) 6433 tmp = *devid; 6434 else { 6435 int ret; 6436 6437 ret = find_next_devid(fs_info, &tmp); 6438 if (ret) { 6439 free_device(dev); 6440 return ERR_PTR(ret); 6441 } 6442 } 6443 dev->devid = tmp; 6444 6445 if (uuid) 6446 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 6447 else 6448 generate_random_uuid(dev->uuid); 6449 6450 btrfs_init_work(&dev->work, btrfs_submit_helper, 6451 pending_bios_fn, NULL, NULL); 6452 6453 return dev; 6454 } 6455 6456 /* Return -EIO if any error, otherwise return 0. */ 6457 static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, 6458 struct extent_buffer *leaf, 6459 struct btrfs_chunk *chunk, u64 logical) 6460 { 6461 u64 length; 6462 u64 stripe_len; 6463 u16 num_stripes; 6464 u16 sub_stripes; 6465 u64 type; 6466 6467 length = btrfs_chunk_length(leaf, chunk); 6468 stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6469 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6470 sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6471 type = btrfs_chunk_type(leaf, chunk); 6472 6473 if (!num_stripes) { 6474 btrfs_err(fs_info, "invalid chunk num_stripes: %u", 6475 num_stripes); 6476 return -EIO; 6477 } 6478 if (!IS_ALIGNED(logical, fs_info->sectorsize)) { 6479 btrfs_err(fs_info, "invalid chunk logical %llu", logical); 6480 return -EIO; 6481 } 6482 if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { 6483 btrfs_err(fs_info, "invalid chunk sectorsize %u", 6484 btrfs_chunk_sector_size(leaf, chunk)); 6485 return -EIO; 6486 } 6487 if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { 6488 btrfs_err(fs_info, "invalid chunk length %llu", length); 6489 return -EIO; 6490 } 6491 if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { 6492 btrfs_err(fs_info, "invalid chunk stripe length: %llu", 6493 stripe_len); 6494 return -EIO; 6495 } 6496 if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & 6497 type) { 6498 btrfs_err(fs_info, "unrecognized chunk type: %llu", 6499 ~(BTRFS_BLOCK_GROUP_TYPE_MASK | 6500 BTRFS_BLOCK_GROUP_PROFILE_MASK) & 6501 btrfs_chunk_type(leaf, chunk)); 6502 return -EIO; 6503 } 6504 if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || 6505 (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || 6506 (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || 6507 (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || 6508 (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || 6509 ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && 6510 num_stripes != 1)) { 6511 btrfs_err(fs_info, 6512 "invalid num_stripes:sub_stripes %u:%u for profile %llu", 6513 num_stripes, 
sub_stripes, 6514 type & BTRFS_BLOCK_GROUP_PROFILE_MASK); 6515 return -EIO; 6516 } 6517 6518 return 0; 6519 } 6520 6521 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 6522 u64 devid, u8 *uuid, bool error) 6523 { 6524 if (error) 6525 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 6526 devid, uuid); 6527 else 6528 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 6529 devid, uuid); 6530 } 6531 6532 static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, 6533 struct extent_buffer *leaf, 6534 struct btrfs_chunk *chunk) 6535 { 6536 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 6537 struct map_lookup *map; 6538 struct extent_map *em; 6539 u64 logical; 6540 u64 length; 6541 u64 devid; 6542 u8 uuid[BTRFS_UUID_SIZE]; 6543 int num_stripes; 6544 int ret; 6545 int i; 6546 6547 logical = key->offset; 6548 length = btrfs_chunk_length(leaf, chunk); 6549 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6550 6551 ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical); 6552 if (ret) 6553 return ret; 6554 6555 read_lock(&map_tree->map_tree.lock); 6556 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 6557 read_unlock(&map_tree->map_tree.lock); 6558 6559 /* already mapped? */ 6560 if (em && em->start <= logical && em->start + em->len > logical) { 6561 free_extent_map(em); 6562 return 0; 6563 } else if (em) { 6564 free_extent_map(em); 6565 } 6566 6567 em = alloc_extent_map(); 6568 if (!em) 6569 return -ENOMEM; 6570 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 6571 if (!map) { 6572 free_extent_map(em); 6573 return -ENOMEM; 6574 } 6575 6576 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 6577 em->map_lookup = map; 6578 em->start = logical; 6579 em->len = length; 6580 em->orig_start = 0; 6581 em->block_start = 0; 6582 em->block_len = em->len; 6583 6584 map->num_stripes = num_stripes; 6585 map->io_width = btrfs_chunk_io_width(leaf, chunk); 6586 map->io_align = btrfs_chunk_io_align(leaf, chunk); 6587 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6588 map->type = btrfs_chunk_type(leaf, chunk); 6589 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6590 for (i = 0; i < num_stripes; i++) { 6591 map->stripes[i].physical = 6592 btrfs_stripe_offset_nr(leaf, chunk, i); 6593 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 6594 read_extent_buffer(leaf, uuid, (unsigned long) 6595 btrfs_stripe_dev_uuid_nr(chunk, i), 6596 BTRFS_UUID_SIZE); 6597 map->stripes[i].dev = btrfs_find_device(fs_info, devid, 6598 uuid, NULL); 6599 if (!map->stripes[i].dev && 6600 !btrfs_test_opt(fs_info, DEGRADED)) { 6601 free_extent_map(em); 6602 btrfs_report_missing_device(fs_info, devid, uuid, true); 6603 return -ENOENT; 6604 } 6605 if (!map->stripes[i].dev) { 6606 map->stripes[i].dev = 6607 add_missing_dev(fs_info->fs_devices, devid, 6608 uuid); 6609 if (IS_ERR(map->stripes[i].dev)) { 6610 free_extent_map(em); 6611 btrfs_err(fs_info, 6612 "failed to init missing dev %llu: %ld", 6613 devid, PTR_ERR(map->stripes[i].dev)); 6614 return PTR_ERR(map->stripes[i].dev); 6615 } 6616 btrfs_report_missing_device(fs_info, devid, uuid, false); 6617 } 6618 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 6619 &(map->stripes[i].dev->dev_state)); 6620 6621 } 6622 6623 write_lock(&map_tree->map_tree.lock); 6624 ret = add_extent_mapping(&map_tree->map_tree, em, 0); 6625 write_unlock(&map_tree->map_tree.lock); 6626 BUG_ON(ret); /* Tree corruption */ 6627 free_extent_map(em); 6628 6629 return 0; 6630 } 6631 6632 static void fill_device_from_item(struct 
extent_buffer *leaf, 6633 struct btrfs_dev_item *dev_item, 6634 struct btrfs_device *device) 6635 { 6636 unsigned long ptr; 6637 6638 device->devid = btrfs_device_id(leaf, dev_item); 6639 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 6640 device->total_bytes = device->disk_total_bytes; 6641 device->commit_total_bytes = device->disk_total_bytes; 6642 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 6643 device->commit_bytes_used = device->bytes_used; 6644 device->type = btrfs_device_type(leaf, dev_item); 6645 device->io_align = btrfs_device_io_align(leaf, dev_item); 6646 device->io_width = btrfs_device_io_width(leaf, dev_item); 6647 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 6648 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 6649 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 6650 6651 ptr = btrfs_device_uuid(dev_item); 6652 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 6653 } 6654 6655 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 6656 u8 *fsid) 6657 { 6658 struct btrfs_fs_devices *fs_devices; 6659 int ret; 6660 6661 lockdep_assert_held(&uuid_mutex); 6662 ASSERT(fsid); 6663 6664 fs_devices = fs_info->fs_devices->seed; 6665 while (fs_devices) { 6666 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 6667 return fs_devices; 6668 6669 fs_devices = fs_devices->seed; 6670 } 6671 6672 fs_devices = find_fsid(fsid); 6673 if (!fs_devices) { 6674 if (!btrfs_test_opt(fs_info, DEGRADED)) 6675 return ERR_PTR(-ENOENT); 6676 6677 fs_devices = alloc_fs_devices(fsid); 6678 if (IS_ERR(fs_devices)) 6679 return fs_devices; 6680 6681 fs_devices->seeding = 1; 6682 fs_devices->opened = 1; 6683 return fs_devices; 6684 } 6685 6686 fs_devices = clone_fs_devices(fs_devices); 6687 if (IS_ERR(fs_devices)) 6688 return fs_devices; 6689 6690 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 6691 fs_info->bdev_holder); 6692 if (ret) { 6693 free_fs_devices(fs_devices); 6694 fs_devices = ERR_PTR(ret); 6695 goto out; 6696 } 6697 6698 if (!fs_devices->seeding) { 6699 __btrfs_close_devices(fs_devices); 6700 free_fs_devices(fs_devices); 6701 fs_devices = ERR_PTR(-EINVAL); 6702 goto out; 6703 } 6704 6705 fs_devices->seed = fs_info->fs_devices->seed; 6706 fs_info->fs_devices->seed = fs_devices; 6707 out: 6708 return fs_devices; 6709 } 6710 6711 static int read_one_dev(struct btrfs_fs_info *fs_info, 6712 struct extent_buffer *leaf, 6713 struct btrfs_dev_item *dev_item) 6714 { 6715 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6716 struct btrfs_device *device; 6717 u64 devid; 6718 int ret; 6719 u8 fs_uuid[BTRFS_FSID_SIZE]; 6720 u8 dev_uuid[BTRFS_UUID_SIZE]; 6721 6722 devid = btrfs_device_id(leaf, dev_item); 6723 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 6724 BTRFS_UUID_SIZE); 6725 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 6726 BTRFS_FSID_SIZE); 6727 6728 if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) { 6729 fs_devices = open_seed_devices(fs_info, fs_uuid); 6730 if (IS_ERR(fs_devices)) 6731 return PTR_ERR(fs_devices); 6732 } 6733 6734 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 6735 if (!device) { 6736 if (!btrfs_test_opt(fs_info, DEGRADED)) { 6737 btrfs_report_missing_device(fs_info, devid, 6738 dev_uuid, true); 6739 return -ENOENT; 6740 } 6741 6742 device = add_missing_dev(fs_devices, devid, dev_uuid); 6743 if (IS_ERR(device)) { 6744 btrfs_err(fs_info, 6745 "failed to add missing dev %llu: %ld", 6746 devid, 
PTR_ERR(device)); 6747 return PTR_ERR(device); 6748 } 6749 btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 6750 } else { 6751 if (!device->bdev) { 6752 if (!btrfs_test_opt(fs_info, DEGRADED)) { 6753 btrfs_report_missing_device(fs_info, 6754 devid, dev_uuid, true); 6755 return -ENOENT; 6756 } 6757 btrfs_report_missing_device(fs_info, devid, 6758 dev_uuid, false); 6759 } 6760 6761 if (!device->bdev && 6762 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 6763 /* 6764 * this happens when a device that was properly setup 6765 * in the device info lists suddenly goes bad. 6766 * device->bdev is NULL, and so we have to set 6767 * device->missing to one here 6768 */ 6769 device->fs_devices->missing_devices++; 6770 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6771 } 6772 6773 /* Move the device to its own fs_devices */ 6774 if (device->fs_devices != fs_devices) { 6775 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, 6776 &device->dev_state)); 6777 6778 list_move(&device->dev_list, &fs_devices->devices); 6779 device->fs_devices->num_devices--; 6780 fs_devices->num_devices++; 6781 6782 device->fs_devices->missing_devices--; 6783 fs_devices->missing_devices++; 6784 6785 device->fs_devices = fs_devices; 6786 } 6787 } 6788 6789 if (device->fs_devices != fs_info->fs_devices) { 6790 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); 6791 if (device->generation != 6792 btrfs_device_generation(leaf, dev_item)) 6793 return -EINVAL; 6794 } 6795 6796 fill_device_from_item(leaf, dev_item, device); 6797 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 6798 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 6799 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 6800 device->fs_devices->total_rw_bytes += device->total_bytes; 6801 atomic64_add(device->total_bytes - device->bytes_used, 6802 &fs_info->free_chunk_space); 6803 } 6804 ret = 0; 6805 return ret; 6806 } 6807 6808 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 6809 { 6810 struct btrfs_root *root = fs_info->tree_root; 6811 struct btrfs_super_block *super_copy = fs_info->super_copy; 6812 struct extent_buffer *sb; 6813 struct btrfs_disk_key *disk_key; 6814 struct btrfs_chunk *chunk; 6815 u8 *array_ptr; 6816 unsigned long sb_array_offset; 6817 int ret = 0; 6818 u32 num_stripes; 6819 u32 array_size; 6820 u32 len = 0; 6821 u32 cur_offset; 6822 u64 type; 6823 struct btrfs_key key; 6824 6825 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 6826 /* 6827 * This will create extent buffer of nodesize, superblock size is 6828 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will 6829 * overallocate but we can keep it as-is, only the first page is used. 6830 */ 6831 sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET); 6832 if (IS_ERR(sb)) 6833 return PTR_ERR(sb); 6834 set_extent_buffer_uptodate(sb); 6835 btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); 6836 /* 6837 * The sb extent buffer is artificial and just used to read the system array. 6838 * set_extent_buffer_uptodate() call does not properly mark all it's 6839 * pages up-to-date when the page is larger: extent does not cover the 6840 * whole page and consequently check_page_uptodate does not find all 6841 * the page's extents up-to-date (the hole beyond sb), 6842 * write_extent_buffer then triggers a WARN_ON. 6843 * 6844 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, 6845 * but sb spans only this function. 
Add an explicit SetPageUptodate call 6846 * to silence the warning eg. on PowerPC 64. 6847 */ 6848 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 6849 SetPageUptodate(sb->pages[0]); 6850 6851 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 6852 array_size = btrfs_super_sys_array_size(super_copy); 6853 6854 array_ptr = super_copy->sys_chunk_array; 6855 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 6856 cur_offset = 0; 6857 6858 while (cur_offset < array_size) { 6859 disk_key = (struct btrfs_disk_key *)array_ptr; 6860 len = sizeof(*disk_key); 6861 if (cur_offset + len > array_size) 6862 goto out_short_read; 6863 6864 btrfs_disk_key_to_cpu(&key, disk_key); 6865 6866 array_ptr += len; 6867 sb_array_offset += len; 6868 cur_offset += len; 6869 6870 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 6871 chunk = (struct btrfs_chunk *)sb_array_offset; 6872 /* 6873 * At least one btrfs_chunk with one stripe must be 6874 * present, exact stripe count check comes afterwards 6875 */ 6876 len = btrfs_chunk_item_size(1); 6877 if (cur_offset + len > array_size) 6878 goto out_short_read; 6879 6880 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 6881 if (!num_stripes) { 6882 btrfs_err(fs_info, 6883 "invalid number of stripes %u in sys_array at offset %u", 6884 num_stripes, cur_offset); 6885 ret = -EIO; 6886 break; 6887 } 6888 6889 type = btrfs_chunk_type(sb, chunk); 6890 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 6891 btrfs_err(fs_info, 6892 "invalid chunk type %llu in sys_array at offset %u", 6893 type, cur_offset); 6894 ret = -EIO; 6895 break; 6896 } 6897 6898 len = btrfs_chunk_item_size(num_stripes); 6899 if (cur_offset + len > array_size) 6900 goto out_short_read; 6901 6902 ret = read_one_chunk(fs_info, &key, sb, chunk); 6903 if (ret) 6904 break; 6905 } else { 6906 btrfs_err(fs_info, 6907 "unexpected item type %u in sys_array at offset %u", 6908 (u32)key.type, cur_offset); 6909 ret = -EIO; 6910 break; 6911 } 6912 array_ptr += len; 6913 sb_array_offset += len; 6914 cur_offset += len; 6915 } 6916 clear_extent_buffer_uptodate(sb); 6917 free_extent_buffer_stale(sb); 6918 return ret; 6919 6920 out_short_read: 6921 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 6922 len, cur_offset); 6923 clear_extent_buffer_uptodate(sb); 6924 free_extent_buffer_stale(sb); 6925 return -EIO; 6926 } 6927 6928 /* 6929 * Check if all chunks in the fs are OK for read-write degraded mount 6930 * 6931 * If the @failing_dev is specified, it's accounted as missing. 6932 * 6933 * Return true if all chunks meet the minimal RW mount requirements. 6934 * Return false if any chunk doesn't meet the minimal RW mount requirements. 6935 */ 6936 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 6937 struct btrfs_device *failing_dev) 6938 { 6939 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 6940 struct extent_map *em; 6941 u64 next_start = 0; 6942 bool ret = true; 6943 6944 read_lock(&map_tree->map_tree.lock); 6945 em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1); 6946 read_unlock(&map_tree->map_tree.lock); 6947 /* No chunk at all? 
Return false anyway */ 6948 if (!em) { 6949 ret = false; 6950 goto out; 6951 } 6952 while (em) { 6953 struct map_lookup *map; 6954 int missing = 0; 6955 int max_tolerated; 6956 int i; 6957 6958 map = em->map_lookup; 6959 max_tolerated = 6960 btrfs_get_num_tolerated_disk_barrier_failures( 6961 map->type); 6962 for (i = 0; i < map->num_stripes; i++) { 6963 struct btrfs_device *dev = map->stripes[i].dev; 6964 6965 if (!dev || !dev->bdev || 6966 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 6967 dev->last_flush_error) 6968 missing++; 6969 else if (failing_dev && failing_dev == dev) 6970 missing++; 6971 } 6972 if (missing > max_tolerated) { 6973 if (!failing_dev) 6974 btrfs_warn(fs_info, 6975 "chunk %llu missing %d devices, max tolerance is %d for writeable mount", 6976 em->start, missing, max_tolerated); 6977 free_extent_map(em); 6978 ret = false; 6979 goto out; 6980 } 6981 next_start = extent_map_end(em); 6982 free_extent_map(em); 6983 6984 read_lock(&map_tree->map_tree.lock); 6985 em = lookup_extent_mapping(&map_tree->map_tree, next_start, 6986 (u64)(-1) - next_start); 6987 read_unlock(&map_tree->map_tree.lock); 6988 } 6989 out: 6990 return ret; 6991 } 6992 6993 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 6994 { 6995 struct btrfs_root *root = fs_info->chunk_root; 6996 struct btrfs_path *path; 6997 struct extent_buffer *leaf; 6998 struct btrfs_key key; 6999 struct btrfs_key found_key; 7000 int ret; 7001 int slot; 7002 u64 total_dev = 0; 7003 7004 path = btrfs_alloc_path(); 7005 if (!path) 7006 return -ENOMEM; 7007 7008 mutex_lock(&uuid_mutex); 7009 mutex_lock(&fs_info->chunk_mutex); 7010 7011 /* 7012 * Read all device items, and then all the chunk items. All 7013 * device items are found before any chunk item (their object id 7014 * is smaller than the lowest possible object id for a chunk 7015 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 7016 */ 7017 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 7018 key.offset = 0; 7019 key.type = 0; 7020 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7021 if (ret < 0) 7022 goto error; 7023 while (1) { 7024 leaf = path->nodes[0]; 7025 slot = path->slots[0]; 7026 if (slot >= btrfs_header_nritems(leaf)) { 7027 ret = btrfs_next_leaf(root, path); 7028 if (ret == 0) 7029 continue; 7030 if (ret < 0) 7031 goto error; 7032 break; 7033 } 7034 btrfs_item_key_to_cpu(leaf, &found_key, slot); 7035 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 7036 struct btrfs_dev_item *dev_item; 7037 dev_item = btrfs_item_ptr(leaf, slot, 7038 struct btrfs_dev_item); 7039 ret = read_one_dev(fs_info, leaf, dev_item); 7040 if (ret) 7041 goto error; 7042 total_dev++; 7043 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 7044 struct btrfs_chunk *chunk; 7045 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 7046 ret = read_one_chunk(fs_info, &found_key, leaf, chunk); 7047 if (ret) 7048 goto error; 7049 } 7050 path->slots[0]++; 7051 } 7052 7053 /* 7054 * After loading chunk tree, we've got all device information, 7055 * do another round of validation checks. 
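 *
 * The checks below make sure that the number of device items found in
 * the tree matches the device count tracked in fs_devices, and that
 * the total_rw_bytes of all devices does not exceed super_total_bytes.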
7056 */ 7057 if (total_dev != fs_info->fs_devices->total_devices) { 7058 btrfs_err(fs_info, 7059 "super_num_devices %llu mismatch with num_devices %llu found here", 7060 btrfs_super_num_devices(fs_info->super_copy), 7061 total_dev); 7062 ret = -EINVAL; 7063 goto error; 7064 } 7065 if (btrfs_super_total_bytes(fs_info->super_copy) < 7066 fs_info->fs_devices->total_rw_bytes) { 7067 btrfs_err(fs_info, 7068 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", 7069 btrfs_super_total_bytes(fs_info->super_copy), 7070 fs_info->fs_devices->total_rw_bytes); 7071 ret = -EINVAL; 7072 goto error; 7073 } 7074 ret = 0; 7075 error: 7076 mutex_unlock(&fs_info->chunk_mutex); 7077 mutex_unlock(&uuid_mutex); 7078 7079 btrfs_free_path(path); 7080 return ret; 7081 } 7082 7083 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 7084 { 7085 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7086 struct btrfs_device *device; 7087 7088 while (fs_devices) { 7089 mutex_lock(&fs_devices->device_list_mutex); 7090 list_for_each_entry(device, &fs_devices->devices, dev_list) 7091 device->fs_info = fs_info; 7092 mutex_unlock(&fs_devices->device_list_mutex); 7093 7094 fs_devices = fs_devices->seed; 7095 } 7096 } 7097 7098 static void __btrfs_reset_dev_stats(struct btrfs_device *dev) 7099 { 7100 int i; 7101 7102 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7103 btrfs_dev_stat_reset(dev, i); 7104 } 7105 7106 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 7107 { 7108 struct btrfs_key key; 7109 struct btrfs_key found_key; 7110 struct btrfs_root *dev_root = fs_info->dev_root; 7111 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7112 struct extent_buffer *eb; 7113 int slot; 7114 int ret = 0; 7115 struct btrfs_device *device; 7116 struct btrfs_path *path = NULL; 7117 int i; 7118 7119 path = btrfs_alloc_path(); 7120 if (!path) { 7121 ret = -ENOMEM; 7122 goto out; 7123 } 7124 7125 mutex_lock(&fs_devices->device_list_mutex); 7126 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7127 int item_size; 7128 struct btrfs_dev_stats_item *ptr; 7129 7130 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7131 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7132 key.offset = device->devid; 7133 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 7134 if (ret) { 7135 __btrfs_reset_dev_stats(device); 7136 device->dev_stats_valid = 1; 7137 btrfs_release_path(path); 7138 continue; 7139 } 7140 slot = path->slots[0]; 7141 eb = path->nodes[0]; 7142 btrfs_item_key_to_cpu(eb, &found_key, slot); 7143 item_size = btrfs_item_size_nr(eb, slot); 7144 7145 ptr = btrfs_item_ptr(eb, slot, 7146 struct btrfs_dev_stats_item); 7147 7148 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7149 if (item_size >= (1 + i) * sizeof(__le64)) 7150 btrfs_dev_stat_set(device, i, 7151 btrfs_dev_stats_value(eb, ptr, i)); 7152 else 7153 btrfs_dev_stat_reset(device, i); 7154 } 7155 7156 device->dev_stats_valid = 1; 7157 btrfs_dev_stat_print_on_load(device); 7158 btrfs_release_path(path); 7159 } 7160 mutex_unlock(&fs_devices->device_list_mutex); 7161 7162 out: 7163 btrfs_free_path(path); 7164 return ret < 0 ? 
ret : 0; 7165 } 7166 7167 static int update_dev_stat_item(struct btrfs_trans_handle *trans, 7168 struct btrfs_fs_info *fs_info, 7169 struct btrfs_device *device) 7170 { 7171 struct btrfs_root *dev_root = fs_info->dev_root; 7172 struct btrfs_path *path; 7173 struct btrfs_key key; 7174 struct extent_buffer *eb; 7175 struct btrfs_dev_stats_item *ptr; 7176 int ret; 7177 int i; 7178 7179 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7180 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7181 key.offset = device->devid; 7182 7183 path = btrfs_alloc_path(); 7184 if (!path) 7185 return -ENOMEM; 7186 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 7187 if (ret < 0) { 7188 btrfs_warn_in_rcu(fs_info, 7189 "error %d while searching for dev_stats item for device %s", 7190 ret, rcu_str_deref(device->name)); 7191 goto out; 7192 } 7193 7194 if (ret == 0 && 7195 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 7196 /* need to delete old one and insert a new one */ 7197 ret = btrfs_del_item(trans, dev_root, path); 7198 if (ret != 0) { 7199 btrfs_warn_in_rcu(fs_info, 7200 "delete too small dev_stats item for device %s failed %d", 7201 rcu_str_deref(device->name), ret); 7202 goto out; 7203 } 7204 ret = 1; 7205 } 7206 7207 if (ret == 1) { 7208 /* need to insert a new item */ 7209 btrfs_release_path(path); 7210 ret = btrfs_insert_empty_item(trans, dev_root, path, 7211 &key, sizeof(*ptr)); 7212 if (ret < 0) { 7213 btrfs_warn_in_rcu(fs_info, 7214 "insert dev_stats item for device %s failed %d", 7215 rcu_str_deref(device->name), ret); 7216 goto out; 7217 } 7218 } 7219 7220 eb = path->nodes[0]; 7221 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 7222 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7223 btrfs_set_dev_stats_value(eb, ptr, i, 7224 btrfs_dev_stat_read(device, i)); 7225 btrfs_mark_buffer_dirty(eb); 7226 7227 out: 7228 btrfs_free_path(path); 7229 return ret; 7230 } 7231 7232 /* 7233 * called from commit_transaction. Writes all changed device stats to disk. 7234 */ 7235 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 7236 struct btrfs_fs_info *fs_info) 7237 { 7238 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7239 struct btrfs_device *device; 7240 int stats_cnt; 7241 int ret = 0; 7242 7243 mutex_lock(&fs_devices->device_list_mutex); 7244 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7245 stats_cnt = atomic_read(&device->dev_stats_ccnt); 7246 if (!device->dev_stats_valid || stats_cnt == 0) 7247 continue; 7248 7249 7250 /* 7251 * There is a LOAD-LOAD control dependency between the value of 7252 * dev_stats_ccnt and updating the on-disk values which requires 7253 * reading the in-memory counters. Such control dependencies 7254 * require explicit read memory barriers. 
7255 * 7256 * This memory barriers pairs with smp_mb__before_atomic in 7257 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full 7258 * barrier implied by atomic_xchg in 7259 * btrfs_dev_stats_read_and_reset 7260 */ 7261 smp_rmb(); 7262 7263 ret = update_dev_stat_item(trans, fs_info, device); 7264 if (!ret) 7265 atomic_sub(stats_cnt, &device->dev_stats_ccnt); 7266 } 7267 mutex_unlock(&fs_devices->device_list_mutex); 7268 7269 return ret; 7270 } 7271 7272 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) 7273 { 7274 btrfs_dev_stat_inc(dev, index); 7275 btrfs_dev_stat_print_on_error(dev); 7276 } 7277 7278 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) 7279 { 7280 if (!dev->dev_stats_valid) 7281 return; 7282 btrfs_err_rl_in_rcu(dev->fs_info, 7283 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7284 rcu_str_deref(dev->name), 7285 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7286 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7287 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7288 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7289 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7290 } 7291 7292 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 7293 { 7294 int i; 7295 7296 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7297 if (btrfs_dev_stat_read(dev, i) != 0) 7298 break; 7299 if (i == BTRFS_DEV_STAT_VALUES_MAX) 7300 return; /* all values == 0, suppress message */ 7301 7302 btrfs_info_in_rcu(dev->fs_info, 7303 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7304 rcu_str_deref(dev->name), 7305 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7306 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7307 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7308 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7309 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7310 } 7311 7312 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, 7313 struct btrfs_ioctl_get_dev_stats *stats) 7314 { 7315 struct btrfs_device *dev; 7316 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7317 int i; 7318 7319 mutex_lock(&fs_devices->device_list_mutex); 7320 dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL); 7321 mutex_unlock(&fs_devices->device_list_mutex); 7322 7323 if (!dev) { 7324 btrfs_warn(fs_info, "get dev_stats failed, device not found"); 7325 return -ENODEV; 7326 } else if (!dev->dev_stats_valid) { 7327 btrfs_warn(fs_info, "get dev_stats failed, not yet valid"); 7328 return -ENODEV; 7329 } else if (stats->flags & BTRFS_DEV_STATS_RESET) { 7330 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7331 if (stats->nr_items > i) 7332 stats->values[i] = 7333 btrfs_dev_stat_read_and_reset(dev, i); 7334 else 7335 btrfs_dev_stat_reset(dev, i); 7336 } 7337 } else { 7338 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7339 if (stats->nr_items > i) 7340 stats->values[i] = btrfs_dev_stat_read(dev, i); 7341 } 7342 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) 7343 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 7344 return 0; 7345 } 7346 7347 void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path) 7348 { 7349 struct buffer_head *bh; 7350 struct btrfs_super_block *disk_super; 7351 int copy_num; 7352 7353 if (!bdev) 7354 return; 7355 7356 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; 7357 copy_num++) { 7358 7359 if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) 7360 continue; 7361 7362 disk_super = (struct 
btrfs_super_block *)bh->b_data; 7363 7364 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 7365 set_buffer_dirty(bh); 7366 sync_dirty_buffer(bh); 7367 brelse(bh); 7368 } 7369 7370 /* Notify udev that device has changed */ 7371 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 7372 7373 /* Update ctime/mtime for device path for libblkid */ 7374 update_dev_time(device_path); 7375 } 7376 7377 /* 7378 * Update the size of all devices, which is used for writing out the 7379 * super blocks. 7380 */ 7381 void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) 7382 { 7383 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7384 struct btrfs_device *curr, *next; 7385 7386 if (list_empty(&fs_devices->resized_devices)) 7387 return; 7388 7389 mutex_lock(&fs_devices->device_list_mutex); 7390 mutex_lock(&fs_info->chunk_mutex); 7391 list_for_each_entry_safe(curr, next, &fs_devices->resized_devices, 7392 resized_list) { 7393 list_del_init(&curr->resized_list); 7394 curr->commit_total_bytes = curr->disk_total_bytes; 7395 } 7396 mutex_unlock(&fs_info->chunk_mutex); 7397 mutex_unlock(&fs_devices->device_list_mutex); 7398 } 7399 7400 /* Must be invoked during the transaction commit */ 7401 void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans) 7402 { 7403 struct btrfs_fs_info *fs_info = trans->fs_info; 7404 struct extent_map *em; 7405 struct map_lookup *map; 7406 struct btrfs_device *dev; 7407 int i; 7408 7409 if (list_empty(&trans->pending_chunks)) 7410 return; 7411 7412 /* In order to kick the device replace finish process */ 7413 mutex_lock(&fs_info->chunk_mutex); 7414 list_for_each_entry(em, &trans->pending_chunks, list) { 7415 map = em->map_lookup; 7416 7417 for (i = 0; i < map->num_stripes; i++) { 7418 dev = map->stripes[i].dev; 7419 dev->commit_bytes_used = dev->bytes_used; 7420 } 7421 } 7422 mutex_unlock(&fs_info->chunk_mutex); 7423 } 7424 7425 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) 7426 { 7427 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7428 while (fs_devices) { 7429 fs_devices->fs_info = fs_info; 7430 fs_devices = fs_devices->seed; 7431 } 7432 } 7433 7434 void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) 7435 { 7436 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7437 while (fs_devices) { 7438 fs_devices->fs_info = NULL; 7439 fs_devices = fs_devices->seed; 7440 } 7441 } 7442