1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 #include <linux/sched.h> 19 #include <linux/bio.h> 20 #include <linux/slab.h> 21 #include <linux/buffer_head.h> 22 #include <linux/blkdev.h> 23 #include <linux/random.h> 24 #include <linux/iocontext.h> 25 #include <linux/capability.h> 26 #include <linux/ratelimit.h> 27 #include <linux/kthread.h> 28 #include <linux/raid/pq.h> 29 #include <linux/semaphore.h> 30 #include <asm/div64.h> 31 #include "ctree.h" 32 #include "extent_map.h" 33 #include "disk-io.h" 34 #include "transaction.h" 35 #include "print-tree.h" 36 #include "volumes.h" 37 #include "raid56.h" 38 #include "async-thread.h" 39 #include "check-integrity.h" 40 #include "rcu-string.h" 41 #include "math.h" 42 #include "dev-replace.h" 43 #include "sysfs.h" 44 45 static int init_first_rw_device(struct btrfs_trans_handle *trans, 46 struct btrfs_root *root, 47 struct btrfs_device *device); 48 static int btrfs_relocate_sys_chunks(struct btrfs_root *root); 49 static void __btrfs_reset_dev_stats(struct btrfs_device *dev); 50 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); 51 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); 52 53 static DEFINE_MUTEX(uuid_mutex); 54 static LIST_HEAD(fs_uuids); 55 56 static void lock_chunks(struct btrfs_root *root) 57 { 58 mutex_lock(&root->fs_info->chunk_mutex); 59 } 60 61 static void unlock_chunks(struct btrfs_root *root) 62 { 63 mutex_unlock(&root->fs_info->chunk_mutex); 64 } 65 66 static struct btrfs_fs_devices *__alloc_fs_devices(void) 67 { 68 struct btrfs_fs_devices *fs_devs; 69 70 fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS); 71 if (!fs_devs) 72 return ERR_PTR(-ENOMEM); 73 74 mutex_init(&fs_devs->device_list_mutex); 75 76 INIT_LIST_HEAD(&fs_devs->devices); 77 INIT_LIST_HEAD(&fs_devs->alloc_list); 78 INIT_LIST_HEAD(&fs_devs->list); 79 80 return fs_devs; 81 } 82 83 /** 84 * alloc_fs_devices - allocate struct btrfs_fs_devices 85 * @fsid: a pointer to UUID for this FS. If NULL a new UUID is 86 * generated. 87 * 88 * Return: a pointer to a new &struct btrfs_fs_devices on success; 89 * ERR_PTR() on error. Returned struct is not linked onto any lists and 90 * can be destroyed with kfree() right away. 
91 */ 92 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) 93 { 94 struct btrfs_fs_devices *fs_devs; 95 96 fs_devs = __alloc_fs_devices(); 97 if (IS_ERR(fs_devs)) 98 return fs_devs; 99 100 if (fsid) 101 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); 102 else 103 generate_random_uuid(fs_devs->fsid); 104 105 return fs_devs; 106 } 107 108 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 109 { 110 struct btrfs_device *device; 111 WARN_ON(fs_devices->opened); 112 while (!list_empty(&fs_devices->devices)) { 113 device = list_entry(fs_devices->devices.next, 114 struct btrfs_device, dev_list); 115 list_del(&device->dev_list); 116 rcu_string_free(device->name); 117 kfree(device); 118 } 119 kfree(fs_devices); 120 } 121 122 static void btrfs_kobject_uevent(struct block_device *bdev, 123 enum kobject_action action) 124 { 125 int ret; 126 127 ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); 128 if (ret) 129 pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", 130 action, 131 kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), 132 &disk_to_dev(bdev->bd_disk)->kobj); 133 } 134 135 void btrfs_cleanup_fs_uuids(void) 136 { 137 struct btrfs_fs_devices *fs_devices; 138 139 while (!list_empty(&fs_uuids)) { 140 fs_devices = list_entry(fs_uuids.next, 141 struct btrfs_fs_devices, list); 142 list_del(&fs_devices->list); 143 free_fs_devices(fs_devices); 144 } 145 } 146 147 static struct btrfs_device *__alloc_device(void) 148 { 149 struct btrfs_device *dev; 150 151 dev = kzalloc(sizeof(*dev), GFP_NOFS); 152 if (!dev) 153 return ERR_PTR(-ENOMEM); 154 155 INIT_LIST_HEAD(&dev->dev_list); 156 INIT_LIST_HEAD(&dev->dev_alloc_list); 157 158 spin_lock_init(&dev->io_lock); 159 160 spin_lock_init(&dev->reada_lock); 161 atomic_set(&dev->reada_in_flight, 0); 162 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT); 163 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT); 164 165 return dev; 166 } 167 168 static noinline struct btrfs_device *__find_device(struct list_head *head, 169 u64 devid, u8 *uuid) 170 { 171 struct btrfs_device *dev; 172 173 list_for_each_entry(dev, head, dev_list) { 174 if (dev->devid == devid && 175 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 176 return dev; 177 } 178 } 179 return NULL; 180 } 181 182 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 183 { 184 struct btrfs_fs_devices *fs_devices; 185 186 list_for_each_entry(fs_devices, &fs_uuids, list) { 187 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 188 return fs_devices; 189 } 190 return NULL; 191 } 192 193 static int 194 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, 195 int flush, struct block_device **bdev, 196 struct buffer_head **bh) 197 { 198 int ret; 199 200 *bdev = blkdev_get_by_path(device_path, flags, holder); 201 202 if (IS_ERR(*bdev)) { 203 ret = PTR_ERR(*bdev); 204 printk(KERN_INFO "BTRFS: open %s failed\n", device_path); 205 goto error; 206 } 207 208 if (flush) 209 filemap_write_and_wait((*bdev)->bd_inode->i_mapping); 210 ret = set_blocksize(*bdev, 4096); 211 if (ret) { 212 blkdev_put(*bdev, flags); 213 goto error; 214 } 215 invalidate_bdev(*bdev); 216 *bh = btrfs_read_dev_super(*bdev); 217 if (!*bh) { 218 ret = -EINVAL; 219 blkdev_put(*bdev, flags); 220 goto error; 221 } 222 223 return 0; 224 225 error: 226 *bdev = NULL; 227 *bh = NULL; 228 return ret; 229 } 230 231 static void requeue_list(struct btrfs_pending_bios *pending_bios, 232 struct bio *head, struct bio *tail) 233 { 234 235 struct bio 
*old_head; 236 237 old_head = pending_bios->head; 238 pending_bios->head = head; 239 if (pending_bios->tail) 240 tail->bi_next = old_head; 241 else 242 pending_bios->tail = tail; 243 } 244 245 /* 246 * we try to collect pending bios for a device so we don't get a large 247 * number of procs sending bios down to the same device. This greatly 248 * improves the schedulers ability to collect and merge the bios. 249 * 250 * But, it also turns into a long list of bios to process and that is sure 251 * to eventually make the worker thread block. The solution here is to 252 * make some progress and then put this work struct back at the end of 253 * the list if the block device is congested. This way, multiple devices 254 * can make progress from a single worker thread. 255 */ 256 static noinline void run_scheduled_bios(struct btrfs_device *device) 257 { 258 struct bio *pending; 259 struct backing_dev_info *bdi; 260 struct btrfs_fs_info *fs_info; 261 struct btrfs_pending_bios *pending_bios; 262 struct bio *tail; 263 struct bio *cur; 264 int again = 0; 265 unsigned long num_run; 266 unsigned long batch_run = 0; 267 unsigned long limit; 268 unsigned long last_waited = 0; 269 int force_reg = 0; 270 int sync_pending = 0; 271 struct blk_plug plug; 272 273 /* 274 * this function runs all the bios we've collected for 275 * a particular device. We don't want to wander off to 276 * another device without first sending all of these down. 277 * So, setup a plug here and finish it off before we return 278 */ 279 blk_start_plug(&plug); 280 281 bdi = blk_get_backing_dev_info(device->bdev); 282 fs_info = device->dev_root->fs_info; 283 limit = btrfs_async_submit_limit(fs_info); 284 limit = limit * 2 / 3; 285 286 loop: 287 spin_lock(&device->io_lock); 288 289 loop_lock: 290 num_run = 0; 291 292 /* take all the bios off the list at once and process them 293 * later on (without the lock held). But, remember the 294 * tail and other pointers so the bios can be properly reinserted 295 * into the list if we hit congestion 296 */ 297 if (!force_reg && device->pending_sync_bios.head) { 298 pending_bios = &device->pending_sync_bios; 299 force_reg = 1; 300 } else { 301 pending_bios = &device->pending_bios; 302 force_reg = 0; 303 } 304 305 pending = pending_bios->head; 306 tail = pending_bios->tail; 307 WARN_ON(pending && !tail); 308 309 /* 310 * if pending was null this time around, no bios need processing 311 * at all and we can stop. Otherwise it'll loop back up again 312 * and do an additional check so no bios are missed. 313 * 314 * device->running_pending is used to synchronize with the 315 * schedule_bio code. 
316 */ 317 if (device->pending_sync_bios.head == NULL && 318 device->pending_bios.head == NULL) { 319 again = 0; 320 device->running_pending = 0; 321 } else { 322 again = 1; 323 device->running_pending = 1; 324 } 325 326 pending_bios->head = NULL; 327 pending_bios->tail = NULL; 328 329 spin_unlock(&device->io_lock); 330 331 while (pending) { 332 333 rmb(); 334 /* we want to work on both lists, but do more bios on the 335 * sync list than the regular list 336 */ 337 if ((num_run > 32 && 338 pending_bios != &device->pending_sync_bios && 339 device->pending_sync_bios.head) || 340 (num_run > 64 && pending_bios == &device->pending_sync_bios && 341 device->pending_bios.head)) { 342 spin_lock(&device->io_lock); 343 requeue_list(pending_bios, pending, tail); 344 goto loop_lock; 345 } 346 347 cur = pending; 348 pending = pending->bi_next; 349 cur->bi_next = NULL; 350 351 if (atomic_dec_return(&fs_info->nr_async_bios) < limit && 352 waitqueue_active(&fs_info->async_submit_wait)) 353 wake_up(&fs_info->async_submit_wait); 354 355 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 356 357 /* 358 * if we're doing the sync list, record that our 359 * plug has some sync requests on it 360 * 361 * If we're doing the regular list and there are 362 * sync requests sitting around, unplug before 363 * we add more 364 */ 365 if (pending_bios == &device->pending_sync_bios) { 366 sync_pending = 1; 367 } else if (sync_pending) { 368 blk_finish_plug(&plug); 369 blk_start_plug(&plug); 370 sync_pending = 0; 371 } 372 373 btrfsic_submit_bio(cur->bi_rw, cur); 374 num_run++; 375 batch_run++; 376 if (need_resched()) 377 cond_resched(); 378 379 /* 380 * we made progress, there is more work to do and the bdi 381 * is now congested. Back off and let other work structs 382 * run instead 383 */ 384 if (pending && bdi_write_congested(bdi) && batch_run > 8 && 385 fs_info->fs_devices->open_devices > 1) { 386 struct io_context *ioc; 387 388 ioc = current->io_context; 389 390 /* 391 * the main goal here is that we don't want to 392 * block if we're going to be able to submit 393 * more requests without blocking. 394 * 395 * This code does two great things, it pokes into 396 * the elevator code from a filesystem _and_ 397 * it makes assumptions about how batching works. 398 */ 399 if (ioc && ioc->nr_batch_requests > 0 && 400 time_before(jiffies, ioc->last_waited + HZ/50UL) && 401 (last_waited == 0 || 402 ioc->last_waited == last_waited)) { 403 /* 404 * we want to go through our batch of 405 * requests and stop. 
So, we copy out 406 * the ioc->last_waited time and test 407 * against it before looping 408 */ 409 last_waited = ioc->last_waited; 410 if (need_resched()) 411 cond_resched(); 412 continue; 413 } 414 spin_lock(&device->io_lock); 415 requeue_list(pending_bios, pending, tail); 416 device->running_pending = 1; 417 418 spin_unlock(&device->io_lock); 419 btrfs_queue_work(fs_info->submit_workers, 420 &device->work); 421 goto done; 422 } 423 /* unplug every 64 requests just for good measure */ 424 if (batch_run % 64 == 0) { 425 blk_finish_plug(&plug); 426 blk_start_plug(&plug); 427 sync_pending = 0; 428 } 429 } 430 431 cond_resched(); 432 if (again) 433 goto loop; 434 435 spin_lock(&device->io_lock); 436 if (device->pending_bios.head || device->pending_sync_bios.head) 437 goto loop_lock; 438 spin_unlock(&device->io_lock); 439 440 done: 441 blk_finish_plug(&plug); 442 } 443 444 static void pending_bios_fn(struct btrfs_work *work) 445 { 446 struct btrfs_device *device; 447 448 device = container_of(work, struct btrfs_device, work); 449 run_scheduled_bios(device); 450 } 451 452 /* 453 * Add new device to list of registered devices 454 * 455 * Returns: 456 * 1 - first time device is seen 457 * 0 - device already known 458 * < 0 - error 459 */ 460 static noinline int device_list_add(const char *path, 461 struct btrfs_super_block *disk_super, 462 u64 devid, struct btrfs_fs_devices **fs_devices_ret) 463 { 464 struct btrfs_device *device; 465 struct btrfs_fs_devices *fs_devices; 466 struct rcu_string *name; 467 int ret = 0; 468 u64 found_transid = btrfs_super_generation(disk_super); 469 470 fs_devices = find_fsid(disk_super->fsid); 471 if (!fs_devices) { 472 fs_devices = alloc_fs_devices(disk_super->fsid); 473 if (IS_ERR(fs_devices)) 474 return PTR_ERR(fs_devices); 475 476 list_add(&fs_devices->list, &fs_uuids); 477 fs_devices->latest_devid = devid; 478 fs_devices->latest_trans = found_transid; 479 480 device = NULL; 481 } else { 482 device = __find_device(&fs_devices->devices, devid, 483 disk_super->dev_item.uuid); 484 } 485 if (!device) { 486 if (fs_devices->opened) 487 return -EBUSY; 488 489 device = btrfs_alloc_device(NULL, &devid, 490 disk_super->dev_item.uuid); 491 if (IS_ERR(device)) { 492 /* we can safely leave the fs_devices entry around */ 493 return PTR_ERR(device); 494 } 495 496 name = rcu_string_strdup(path, GFP_NOFS); 497 if (!name) { 498 kfree(device); 499 return -ENOMEM; 500 } 501 rcu_assign_pointer(device->name, name); 502 503 mutex_lock(&fs_devices->device_list_mutex); 504 list_add_rcu(&device->dev_list, &fs_devices->devices); 505 fs_devices->num_devices++; 506 mutex_unlock(&fs_devices->device_list_mutex); 507 508 ret = 1; 509 device->fs_devices = fs_devices; 510 } else if (!device->name || strcmp(device->name->str, path)) { 511 name = rcu_string_strdup(path, GFP_NOFS); 512 if (!name) 513 return -ENOMEM; 514 rcu_string_free(device->name); 515 rcu_assign_pointer(device->name, name); 516 if (device->missing) { 517 fs_devices->missing_devices--; 518 device->missing = 0; 519 } 520 } 521 522 if (found_transid > fs_devices->latest_trans) { 523 fs_devices->latest_devid = devid; 524 fs_devices->latest_trans = found_transid; 525 } 526 *fs_devices_ret = fs_devices; 527 528 return ret; 529 } 530 531 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 532 { 533 struct btrfs_fs_devices *fs_devices; 534 struct btrfs_device *device; 535 struct btrfs_device *orig_dev; 536 537 fs_devices = alloc_fs_devices(orig->fsid); 538 if (IS_ERR(fs_devices)) 539 return fs_devices; 540 
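	/* Descriptive note: the clone starts out with the same "latest device" bookkeeping and total device count as the original set; the individual devices are duplicated in the loop below. */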
541 fs_devices->latest_devid = orig->latest_devid; 542 fs_devices->latest_trans = orig->latest_trans; 543 fs_devices->total_devices = orig->total_devices; 544 545 /* We have held the volume lock, it is safe to get the devices. */ 546 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 547 struct rcu_string *name; 548 549 device = btrfs_alloc_device(NULL, &orig_dev->devid, 550 orig_dev->uuid); 551 if (IS_ERR(device)) 552 goto error; 553 554 /* 555 * This is ok to do without rcu read locked because we hold the 556 * uuid mutex so nothing we touch in here is going to disappear. 557 */ 558 if (orig_dev->name) { 559 name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); 560 if (!name) { 561 kfree(device); 562 goto error; 563 } 564 rcu_assign_pointer(device->name, name); 565 } 566 567 list_add(&device->dev_list, &fs_devices->devices); 568 device->fs_devices = fs_devices; 569 fs_devices->num_devices++; 570 } 571 return fs_devices; 572 error: 573 free_fs_devices(fs_devices); 574 return ERR_PTR(-ENOMEM); 575 } 576 577 void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, 578 struct btrfs_fs_devices *fs_devices, int step) 579 { 580 struct btrfs_device *device, *next; 581 582 struct block_device *latest_bdev = NULL; 583 u64 latest_devid = 0; 584 u64 latest_transid = 0; 585 586 mutex_lock(&uuid_mutex); 587 again: 588 /* This is the initialized path, it is safe to release the devices. */ 589 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 590 if (device->in_fs_metadata) { 591 if (!device->is_tgtdev_for_dev_replace && 592 (!latest_transid || 593 device->generation > latest_transid)) { 594 latest_devid = device->devid; 595 latest_transid = device->generation; 596 latest_bdev = device->bdev; 597 } 598 continue; 599 } 600 601 if (device->devid == BTRFS_DEV_REPLACE_DEVID) { 602 /* 603 * In the first step, keep the device which has 604 * the correct fsid and the devid that is used 605 * for the dev_replace procedure. 606 * In the second step, the dev_replace state is 607 * read from the device tree and it is known 608 * whether the procedure is really active or 609 * not, which means whether this device is 610 * used or whether it should be removed. 
611 */ 612 if (step == 0 || device->is_tgtdev_for_dev_replace) { 613 continue; 614 } 615 } 616 if (device->bdev) { 617 blkdev_put(device->bdev, device->mode); 618 device->bdev = NULL; 619 fs_devices->open_devices--; 620 } 621 if (device->writeable) { 622 list_del_init(&device->dev_alloc_list); 623 device->writeable = 0; 624 if (!device->is_tgtdev_for_dev_replace) 625 fs_devices->rw_devices--; 626 } 627 list_del_init(&device->dev_list); 628 fs_devices->num_devices--; 629 rcu_string_free(device->name); 630 kfree(device); 631 } 632 633 if (fs_devices->seed) { 634 fs_devices = fs_devices->seed; 635 goto again; 636 } 637 638 fs_devices->latest_bdev = latest_bdev; 639 fs_devices->latest_devid = latest_devid; 640 fs_devices->latest_trans = latest_transid; 641 642 mutex_unlock(&uuid_mutex); 643 } 644 645 static void __free_device(struct work_struct *work) 646 { 647 struct btrfs_device *device; 648 649 device = container_of(work, struct btrfs_device, rcu_work); 650 651 if (device->bdev) 652 blkdev_put(device->bdev, device->mode); 653 654 rcu_string_free(device->name); 655 kfree(device); 656 } 657 658 static void free_device(struct rcu_head *head) 659 { 660 struct btrfs_device *device; 661 662 device = container_of(head, struct btrfs_device, rcu); 663 664 INIT_WORK(&device->rcu_work, __free_device); 665 schedule_work(&device->rcu_work); 666 } 667 668 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 669 { 670 struct btrfs_device *device; 671 672 if (--fs_devices->opened > 0) 673 return 0; 674 675 mutex_lock(&fs_devices->device_list_mutex); 676 list_for_each_entry(device, &fs_devices->devices, dev_list) { 677 struct btrfs_device *new_device; 678 struct rcu_string *name; 679 680 if (device->bdev) 681 fs_devices->open_devices--; 682 683 if (device->writeable && 684 device->devid != BTRFS_DEV_REPLACE_DEVID) { 685 list_del_init(&device->dev_alloc_list); 686 fs_devices->rw_devices--; 687 } 688 689 if (device->can_discard) 690 fs_devices->num_can_discard--; 691 if (device->missing) 692 fs_devices->missing_devices--; 693 694 new_device = btrfs_alloc_device(NULL, &device->devid, 695 device->uuid); 696 BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ 697 698 /* Safe because we are under uuid_mutex */ 699 if (device->name) { 700 name = rcu_string_strdup(device->name->str, GFP_NOFS); 701 BUG_ON(!name); /* -ENOMEM */ 702 rcu_assign_pointer(new_device->name, name); 703 } 704 705 list_replace_rcu(&device->dev_list, &new_device->dev_list); 706 new_device->fs_devices = device->fs_devices; 707 708 call_rcu(&device->rcu, free_device); 709 } 710 mutex_unlock(&fs_devices->device_list_mutex); 711 712 WARN_ON(fs_devices->open_devices); 713 WARN_ON(fs_devices->rw_devices); 714 fs_devices->opened = 0; 715 fs_devices->seeding = 0; 716 717 return 0; 718 } 719 720 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 721 { 722 struct btrfs_fs_devices *seed_devices = NULL; 723 int ret; 724 725 mutex_lock(&uuid_mutex); 726 ret = __btrfs_close_devices(fs_devices); 727 if (!fs_devices->opened) { 728 seed_devices = fs_devices->seed; 729 fs_devices->seed = NULL; 730 } 731 mutex_unlock(&uuid_mutex); 732 733 while (seed_devices) { 734 fs_devices = seed_devices; 735 seed_devices = fs_devices->seed; 736 __btrfs_close_devices(fs_devices); 737 free_fs_devices(fs_devices); 738 } 739 /* 740 * Wait for rcu kworkers under __btrfs_close_devices 741 * to finish all blkdev_puts so device is really 742 * free when umount is done. 
743 */ 744 rcu_barrier(); 745 return ret; 746 } 747 748 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 749 fmode_t flags, void *holder) 750 { 751 struct request_queue *q; 752 struct block_device *bdev; 753 struct list_head *head = &fs_devices->devices; 754 struct btrfs_device *device; 755 struct block_device *latest_bdev = NULL; 756 struct buffer_head *bh; 757 struct btrfs_super_block *disk_super; 758 u64 latest_devid = 0; 759 u64 latest_transid = 0; 760 u64 devid; 761 int seeding = 1; 762 int ret = 0; 763 764 flags |= FMODE_EXCL; 765 766 list_for_each_entry(device, head, dev_list) { 767 if (device->bdev) 768 continue; 769 if (!device->name) 770 continue; 771 772 /* Just open everything we can; ignore failures here */ 773 if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, 774 &bdev, &bh)) 775 continue; 776 777 disk_super = (struct btrfs_super_block *)bh->b_data; 778 devid = btrfs_stack_device_id(&disk_super->dev_item); 779 if (devid != device->devid) 780 goto error_brelse; 781 782 if (memcmp(device->uuid, disk_super->dev_item.uuid, 783 BTRFS_UUID_SIZE)) 784 goto error_brelse; 785 786 device->generation = btrfs_super_generation(disk_super); 787 if (!latest_transid || device->generation > latest_transid) { 788 latest_devid = devid; 789 latest_transid = device->generation; 790 latest_bdev = bdev; 791 } 792 793 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 794 device->writeable = 0; 795 } else { 796 device->writeable = !bdev_read_only(bdev); 797 seeding = 0; 798 } 799 800 q = bdev_get_queue(bdev); 801 if (blk_queue_discard(q)) { 802 device->can_discard = 1; 803 fs_devices->num_can_discard++; 804 } 805 806 device->bdev = bdev; 807 device->in_fs_metadata = 0; 808 device->mode = flags; 809 810 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 811 fs_devices->rotating = 1; 812 813 fs_devices->open_devices++; 814 if (device->writeable && 815 device->devid != BTRFS_DEV_REPLACE_DEVID) { 816 fs_devices->rw_devices++; 817 list_add(&device->dev_alloc_list, 818 &fs_devices->alloc_list); 819 } 820 brelse(bh); 821 continue; 822 823 error_brelse: 824 brelse(bh); 825 blkdev_put(bdev, flags); 826 continue; 827 } 828 if (fs_devices->open_devices == 0) { 829 ret = -EINVAL; 830 goto out; 831 } 832 fs_devices->seeding = seeding; 833 fs_devices->opened = 1; 834 fs_devices->latest_bdev = latest_bdev; 835 fs_devices->latest_devid = latest_devid; 836 fs_devices->latest_trans = latest_transid; 837 fs_devices->total_rw_bytes = 0; 838 out: 839 return ret; 840 } 841 842 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 843 fmode_t flags, void *holder) 844 { 845 int ret; 846 847 mutex_lock(&uuid_mutex); 848 if (fs_devices->opened) { 849 fs_devices->opened++; 850 ret = 0; 851 } else { 852 ret = __btrfs_open_devices(fs_devices, flags, holder); 853 } 854 mutex_unlock(&uuid_mutex); 855 return ret; 856 } 857 858 /* 859 * Look for a btrfs signature on a device. This may be called out of the mount path 860 * and we are not allowed to call set_blocksize during the scan. 
The superblock 861 * is read via pagecache 862 */ 863 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 864 struct btrfs_fs_devices **fs_devices_ret) 865 { 866 struct btrfs_super_block *disk_super; 867 struct block_device *bdev; 868 struct page *page; 869 void *p; 870 int ret = -EINVAL; 871 u64 devid; 872 u64 transid; 873 u64 total_devices; 874 u64 bytenr; 875 pgoff_t index; 876 877 /* 878 * we would like to check all the supers, but that would make 879 * a btrfs mount succeed after a mkfs from a different FS. 880 * So, we need to add a special mount option to scan for 881 * later supers, using BTRFS_SUPER_MIRROR_MAX instead 882 */ 883 bytenr = btrfs_sb_offset(0); 884 flags |= FMODE_EXCL; 885 mutex_lock(&uuid_mutex); 886 887 bdev = blkdev_get_by_path(path, flags, holder); 888 889 if (IS_ERR(bdev)) { 890 ret = PTR_ERR(bdev); 891 goto error; 892 } 893 894 /* make sure our super fits in the device */ 895 if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode)) 896 goto error_bdev_put; 897 898 /* make sure our super fits in the page */ 899 if (sizeof(*disk_super) > PAGE_CACHE_SIZE) 900 goto error_bdev_put; 901 902 /* make sure our super doesn't straddle pages on disk */ 903 index = bytenr >> PAGE_CACHE_SHIFT; 904 if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index) 905 goto error_bdev_put; 906 907 /* pull in the page with our super */ 908 page = read_cache_page_gfp(bdev->bd_inode->i_mapping, 909 index, GFP_NOFS); 910 911 if (IS_ERR_OR_NULL(page)) 912 goto error_bdev_put; 913 914 p = kmap(page); 915 916 /* align our pointer to the offset of the super block */ 917 disk_super = p + (bytenr & ~PAGE_CACHE_MASK); 918 919 if (btrfs_super_bytenr(disk_super) != bytenr || 920 btrfs_super_magic(disk_super) != BTRFS_MAGIC) 921 goto error_unmap; 922 923 devid = btrfs_stack_device_id(&disk_super->dev_item); 924 transid = btrfs_super_generation(disk_super); 925 total_devices = btrfs_super_num_devices(disk_super); 926 927 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 928 if (ret > 0) { 929 if (disk_super->label[0]) { 930 if (disk_super->label[BTRFS_LABEL_SIZE - 1]) 931 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; 932 printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); 933 } else { 934 printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); 935 } 936 937 printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); 938 ret = 0; 939 } 940 if (!ret && fs_devices_ret) 941 (*fs_devices_ret)->total_devices = total_devices; 942 943 error_unmap: 944 kunmap(page); 945 page_cache_release(page); 946 947 error_bdev_put: 948 blkdev_put(bdev, flags); 949 error: 950 mutex_unlock(&uuid_mutex); 951 return ret; 952 } 953 954 /* helper to account the used device space in the range */ 955 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 956 u64 end, u64 *length) 957 { 958 struct btrfs_key key; 959 struct btrfs_root *root = device->dev_root; 960 struct btrfs_dev_extent *dev_extent; 961 struct btrfs_path *path; 962 u64 extent_end; 963 int ret; 964 int slot; 965 struct extent_buffer *l; 966 967 *length = 0; 968 969 if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace) 970 return 0; 971 972 path = btrfs_alloc_path(); 973 if (!path) 974 return -ENOMEM; 975 path->reada = 2; 976 977 key.objectid = device->devid; 978 key.offset = start; 979 key.type = BTRFS_DEV_EXTENT_KEY; 980 981 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 982 if (ret < 0) 983 goto out; 984 if (ret > 0) { 985 ret = 
btrfs_previous_item(root, path, key.objectid, key.type); 986 if (ret < 0) 987 goto out; 988 } 989 990 while (1) { 991 l = path->nodes[0]; 992 slot = path->slots[0]; 993 if (slot >= btrfs_header_nritems(l)) { 994 ret = btrfs_next_leaf(root, path); 995 if (ret == 0) 996 continue; 997 if (ret < 0) 998 goto out; 999 1000 break; 1001 } 1002 btrfs_item_key_to_cpu(l, &key, slot); 1003 1004 if (key.objectid < device->devid) 1005 goto next; 1006 1007 if (key.objectid > device->devid) 1008 break; 1009 1010 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 1011 goto next; 1012 1013 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1014 extent_end = key.offset + btrfs_dev_extent_length(l, 1015 dev_extent); 1016 if (key.offset <= start && extent_end > end) { 1017 *length = end - start + 1; 1018 break; 1019 } else if (key.offset <= start && extent_end > start) 1020 *length += extent_end - start; 1021 else if (key.offset > start && extent_end <= end) 1022 *length += extent_end - key.offset; 1023 else if (key.offset > start && key.offset <= end) { 1024 *length += end - key.offset + 1; 1025 break; 1026 } else if (key.offset > end) 1027 break; 1028 1029 next: 1030 path->slots[0]++; 1031 } 1032 ret = 0; 1033 out: 1034 btrfs_free_path(path); 1035 return ret; 1036 } 1037 1038 static int contains_pending_extent(struct btrfs_trans_handle *trans, 1039 struct btrfs_device *device, 1040 u64 *start, u64 len) 1041 { 1042 struct extent_map *em; 1043 int ret = 0; 1044 1045 list_for_each_entry(em, &trans->transaction->pending_chunks, list) { 1046 struct map_lookup *map; 1047 int i; 1048 1049 map = (struct map_lookup *)em->bdev; 1050 for (i = 0; i < map->num_stripes; i++) { 1051 if (map->stripes[i].dev != device) 1052 continue; 1053 if (map->stripes[i].physical >= *start + len || 1054 map->stripes[i].physical + em->orig_block_len <= 1055 *start) 1056 continue; 1057 *start = map->stripes[i].physical + 1058 em->orig_block_len; 1059 ret = 1; 1060 } 1061 } 1062 1063 return ret; 1064 } 1065 1066 1067 /* 1068 * find_free_dev_extent - find free space in the specified device 1069 * @device: the device which we search the free space in 1070 * @num_bytes: the size of the free space that we need 1071 * @start: store the start of the free space. 1072 * @len: the size of the free space. that we find, or the size of the max 1073 * free space if we don't find suitable free space 1074 * 1075 * this uses a pretty simple search, the expectation is that it is 1076 * called very infrequently and that a given device has a small number 1077 * of extents 1078 * 1079 * @start is used to store the start of the free space if we find. But if we 1080 * don't find suitable free space, it will be used to store the start position 1081 * of the max free space. 1082 * 1083 * @len is used to store the size of the free space that we find. 1084 * But if we don't find suitable free space, it is used to store the size of 1085 * the max free space. 
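 * For example (illustrative numbers, not from the original source): if the largest hole on the device is 4GiB starting at offset 1GiB, asking for 1GiB returns 0 with @start set to the start of that hole, while asking for 8GiB returns -ENOSPC with @start set to 1GiB and @len set to 4GiB, i.e. the best hole that was found.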
1086 */ 1087 int find_free_dev_extent(struct btrfs_trans_handle *trans, 1088 struct btrfs_device *device, u64 num_bytes, 1089 u64 *start, u64 *len) 1090 { 1091 struct btrfs_key key; 1092 struct btrfs_root *root = device->dev_root; 1093 struct btrfs_dev_extent *dev_extent; 1094 struct btrfs_path *path; 1095 u64 hole_size; 1096 u64 max_hole_start; 1097 u64 max_hole_size; 1098 u64 extent_end; 1099 u64 search_start; 1100 u64 search_end = device->total_bytes; 1101 int ret; 1102 int slot; 1103 struct extent_buffer *l; 1104 1105 /* FIXME use last free of some kind */ 1106 1107 /* we don't want to overwrite the superblock on the drive, 1108 * so we make sure to start at an offset of at least 1MB 1109 */ 1110 search_start = max(root->fs_info->alloc_start, 1024ull * 1024); 1111 1112 path = btrfs_alloc_path(); 1113 if (!path) 1114 return -ENOMEM; 1115 again: 1116 max_hole_start = search_start; 1117 max_hole_size = 0; 1118 hole_size = 0; 1119 1120 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { 1121 ret = -ENOSPC; 1122 goto out; 1123 } 1124 1125 path->reada = 2; 1126 path->search_commit_root = 1; 1127 path->skip_locking = 1; 1128 1129 key.objectid = device->devid; 1130 key.offset = search_start; 1131 key.type = BTRFS_DEV_EXTENT_KEY; 1132 1133 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1134 if (ret < 0) 1135 goto out; 1136 if (ret > 0) { 1137 ret = btrfs_previous_item(root, path, key.objectid, key.type); 1138 if (ret < 0) 1139 goto out; 1140 } 1141 1142 while (1) { 1143 l = path->nodes[0]; 1144 slot = path->slots[0]; 1145 if (slot >= btrfs_header_nritems(l)) { 1146 ret = btrfs_next_leaf(root, path); 1147 if (ret == 0) 1148 continue; 1149 if (ret < 0) 1150 goto out; 1151 1152 break; 1153 } 1154 btrfs_item_key_to_cpu(l, &key, slot); 1155 1156 if (key.objectid < device->devid) 1157 goto next; 1158 1159 if (key.objectid > device->devid) 1160 break; 1161 1162 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 1163 goto next; 1164 1165 if (key.offset > search_start) { 1166 hole_size = key.offset - search_start; 1167 1168 /* 1169 * Have to check before we set max_hole_start, otherwise 1170 * we could end up sending back this offset anyway. 1171 */ 1172 if (contains_pending_extent(trans, device, 1173 &search_start, 1174 hole_size)) 1175 hole_size = 0; 1176 1177 if (hole_size > max_hole_size) { 1178 max_hole_start = search_start; 1179 max_hole_size = hole_size; 1180 } 1181 1182 /* 1183 * If this free space is greater than which we need, 1184 * it must be the max free space that we have found 1185 * until now, so max_hole_start must point to the start 1186 * of this free space and the length of this free space 1187 * is stored in max_hole_size. Thus, we return 1188 * max_hole_start and max_hole_size and go back to the 1189 * caller. 1190 */ 1191 if (hole_size >= num_bytes) { 1192 ret = 0; 1193 goto out; 1194 } 1195 } 1196 1197 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1198 extent_end = key.offset + btrfs_dev_extent_length(l, 1199 dev_extent); 1200 if (extent_end > search_start) 1201 search_start = extent_end; 1202 next: 1203 path->slots[0]++; 1204 cond_resched(); 1205 } 1206 1207 /* 1208 * At this point, search_start should be the end of 1209 * allocated dev extents, and when shrinking the device, 1210 * search_end may be smaller than search_start. 
1211 */ 1212 if (search_end > search_start) 1213 hole_size = search_end - search_start; 1214 1215 if (hole_size > max_hole_size) { 1216 max_hole_start = search_start; 1217 max_hole_size = hole_size; 1218 } 1219 1220 if (contains_pending_extent(trans, device, &search_start, hole_size)) { 1221 btrfs_release_path(path); 1222 goto again; 1223 } 1224 1225 /* See above. */ 1226 if (hole_size < num_bytes) 1227 ret = -ENOSPC; 1228 else 1229 ret = 0; 1230 1231 out: 1232 btrfs_free_path(path); 1233 *start = max_hole_start; 1234 if (len) 1235 *len = max_hole_size; 1236 return ret; 1237 } 1238 1239 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1240 struct btrfs_device *device, 1241 u64 start) 1242 { 1243 int ret; 1244 struct btrfs_path *path; 1245 struct btrfs_root *root = device->dev_root; 1246 struct btrfs_key key; 1247 struct btrfs_key found_key; 1248 struct extent_buffer *leaf = NULL; 1249 struct btrfs_dev_extent *extent = NULL; 1250 1251 path = btrfs_alloc_path(); 1252 if (!path) 1253 return -ENOMEM; 1254 1255 key.objectid = device->devid; 1256 key.offset = start; 1257 key.type = BTRFS_DEV_EXTENT_KEY; 1258 again: 1259 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1260 if (ret > 0) { 1261 ret = btrfs_previous_item(root, path, key.objectid, 1262 BTRFS_DEV_EXTENT_KEY); 1263 if (ret) 1264 goto out; 1265 leaf = path->nodes[0]; 1266 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1267 extent = btrfs_item_ptr(leaf, path->slots[0], 1268 struct btrfs_dev_extent); 1269 BUG_ON(found_key.offset > start || found_key.offset + 1270 btrfs_dev_extent_length(leaf, extent) < start); 1271 key = found_key; 1272 btrfs_release_path(path); 1273 goto again; 1274 } else if (ret == 0) { 1275 leaf = path->nodes[0]; 1276 extent = btrfs_item_ptr(leaf, path->slots[0], 1277 struct btrfs_dev_extent); 1278 } else { 1279 btrfs_error(root->fs_info, ret, "Slot search failed"); 1280 goto out; 1281 } 1282 1283 if (device->bytes_used > 0) { 1284 u64 len = btrfs_dev_extent_length(leaf, extent); 1285 device->bytes_used -= len; 1286 spin_lock(&root->fs_info->free_chunk_lock); 1287 root->fs_info->free_chunk_space += len; 1288 spin_unlock(&root->fs_info->free_chunk_lock); 1289 } 1290 ret = btrfs_del_item(trans, root, path); 1291 if (ret) { 1292 btrfs_error(root->fs_info, ret, 1293 "Failed to remove dev extent item"); 1294 } 1295 out: 1296 btrfs_free_path(path); 1297 return ret; 1298 } 1299 1300 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 1301 struct btrfs_device *device, 1302 u64 chunk_tree, u64 chunk_objectid, 1303 u64 chunk_offset, u64 start, u64 num_bytes) 1304 { 1305 int ret; 1306 struct btrfs_path *path; 1307 struct btrfs_root *root = device->dev_root; 1308 struct btrfs_dev_extent *extent; 1309 struct extent_buffer *leaf; 1310 struct btrfs_key key; 1311 1312 WARN_ON(!device->in_fs_metadata); 1313 WARN_ON(device->is_tgtdev_for_dev_replace); 1314 path = btrfs_alloc_path(); 1315 if (!path) 1316 return -ENOMEM; 1317 1318 key.objectid = device->devid; 1319 key.offset = start; 1320 key.type = BTRFS_DEV_EXTENT_KEY; 1321 ret = btrfs_insert_empty_item(trans, root, path, &key, 1322 sizeof(*extent)); 1323 if (ret) 1324 goto out; 1325 1326 leaf = path->nodes[0]; 1327 extent = btrfs_item_ptr(leaf, path->slots[0], 1328 struct btrfs_dev_extent); 1329 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); 1330 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); 1331 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 1332 1333 write_extent_buffer(leaf, 
root->fs_info->chunk_tree_uuid, 1334 btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE); 1335 1336 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 1337 btrfs_mark_buffer_dirty(leaf); 1338 out: 1339 btrfs_free_path(path); 1340 return ret; 1341 } 1342 1343 static u64 find_next_chunk(struct btrfs_fs_info *fs_info) 1344 { 1345 struct extent_map_tree *em_tree; 1346 struct extent_map *em; 1347 struct rb_node *n; 1348 u64 ret = 0; 1349 1350 em_tree = &fs_info->mapping_tree.map_tree; 1351 read_lock(&em_tree->lock); 1352 n = rb_last(&em_tree->map); 1353 if (n) { 1354 em = rb_entry(n, struct extent_map, rb_node); 1355 ret = em->start + em->len; 1356 } 1357 read_unlock(&em_tree->lock); 1358 1359 return ret; 1360 } 1361 1362 static noinline int find_next_devid(struct btrfs_fs_info *fs_info, 1363 u64 *devid_ret) 1364 { 1365 int ret; 1366 struct btrfs_key key; 1367 struct btrfs_key found_key; 1368 struct btrfs_path *path; 1369 1370 path = btrfs_alloc_path(); 1371 if (!path) 1372 return -ENOMEM; 1373 1374 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1375 key.type = BTRFS_DEV_ITEM_KEY; 1376 key.offset = (u64)-1; 1377 1378 ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); 1379 if (ret < 0) 1380 goto error; 1381 1382 BUG_ON(ret == 0); /* Corruption */ 1383 1384 ret = btrfs_previous_item(fs_info->chunk_root, path, 1385 BTRFS_DEV_ITEMS_OBJECTID, 1386 BTRFS_DEV_ITEM_KEY); 1387 if (ret) { 1388 *devid_ret = 1; 1389 } else { 1390 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1391 path->slots[0]); 1392 *devid_ret = found_key.offset + 1; 1393 } 1394 ret = 0; 1395 error: 1396 btrfs_free_path(path); 1397 return ret; 1398 } 1399 1400 /* 1401 * the device information is stored in the chunk root 1402 * the btrfs_device struct should be fully filled in 1403 */ 1404 static int btrfs_add_device(struct btrfs_trans_handle *trans, 1405 struct btrfs_root *root, 1406 struct btrfs_device *device) 1407 { 1408 int ret; 1409 struct btrfs_path *path; 1410 struct btrfs_dev_item *dev_item; 1411 struct extent_buffer *leaf; 1412 struct btrfs_key key; 1413 unsigned long ptr; 1414 1415 root = root->fs_info->chunk_root; 1416 1417 path = btrfs_alloc_path(); 1418 if (!path) 1419 return -ENOMEM; 1420 1421 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1422 key.type = BTRFS_DEV_ITEM_KEY; 1423 key.offset = device->devid; 1424 1425 ret = btrfs_insert_empty_item(trans, root, path, &key, 1426 sizeof(*dev_item)); 1427 if (ret) 1428 goto out; 1429 1430 leaf = path->nodes[0]; 1431 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1432 1433 btrfs_set_device_id(leaf, dev_item, device->devid); 1434 btrfs_set_device_generation(leaf, dev_item, 0); 1435 btrfs_set_device_type(leaf, dev_item, device->type); 1436 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1437 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1438 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1439 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1440 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1441 btrfs_set_device_group(leaf, dev_item, 0); 1442 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1443 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1444 btrfs_set_device_start_offset(leaf, dev_item, 0); 1445 1446 ptr = btrfs_device_uuid(dev_item); 1447 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1448 ptr = btrfs_device_fsid(dev_item); 1449 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); 1450 btrfs_mark_buffer_dirty(leaf); 
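	/* The new item now mirrors the in-memory btrfs_device, including the device UUID and the filesystem UUID written just above. */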
1451 1452 ret = 0; 1453 out: 1454 btrfs_free_path(path); 1455 return ret; 1456 } 1457 1458 /* 1459 * Function to update ctime/mtime for a given device path. 1460 * Mainly used for ctime/mtime based probe like libblkid. 1461 */ 1462 static void update_dev_time(char *path_name) 1463 { 1464 struct file *filp; 1465 1466 filp = filp_open(path_name, O_RDWR, 0); 1467 if (IS_ERR(filp)) 1468 return; 1469 file_update_time(filp); 1470 filp_close(filp, NULL); 1471 return; 1472 } 1473 1474 static int btrfs_rm_dev_item(struct btrfs_root *root, 1475 struct btrfs_device *device) 1476 { 1477 int ret; 1478 struct btrfs_path *path; 1479 struct btrfs_key key; 1480 struct btrfs_trans_handle *trans; 1481 1482 root = root->fs_info->chunk_root; 1483 1484 path = btrfs_alloc_path(); 1485 if (!path) 1486 return -ENOMEM; 1487 1488 trans = btrfs_start_transaction(root, 0); 1489 if (IS_ERR(trans)) { 1490 btrfs_free_path(path); 1491 return PTR_ERR(trans); 1492 } 1493 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1494 key.type = BTRFS_DEV_ITEM_KEY; 1495 key.offset = device->devid; 1496 lock_chunks(root); 1497 1498 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1499 if (ret < 0) 1500 goto out; 1501 1502 if (ret > 0) { 1503 ret = -ENOENT; 1504 goto out; 1505 } 1506 1507 ret = btrfs_del_item(trans, root, path); 1508 if (ret) 1509 goto out; 1510 out: 1511 btrfs_free_path(path); 1512 unlock_chunks(root); 1513 btrfs_commit_transaction(trans, root); 1514 return ret; 1515 } 1516 1517 int btrfs_rm_device(struct btrfs_root *root, char *device_path) 1518 { 1519 struct btrfs_device *device; 1520 struct btrfs_device *next_device; 1521 struct block_device *bdev; 1522 struct buffer_head *bh = NULL; 1523 struct btrfs_super_block *disk_super; 1524 struct btrfs_fs_devices *cur_devices; 1525 u64 all_avail; 1526 u64 devid; 1527 u64 num_devices; 1528 u8 *dev_uuid; 1529 unsigned seq; 1530 int ret = 0; 1531 bool clear_super = false; 1532 1533 mutex_lock(&uuid_mutex); 1534 1535 do { 1536 seq = read_seqbegin(&root->fs_info->profiles_lock); 1537 1538 all_avail = root->fs_info->avail_data_alloc_bits | 1539 root->fs_info->avail_system_alloc_bits | 1540 root->fs_info->avail_metadata_alloc_bits; 1541 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 1542 1543 num_devices = root->fs_info->fs_devices->num_devices; 1544 btrfs_dev_replace_lock(&root->fs_info->dev_replace); 1545 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { 1546 WARN_ON(num_devices < 1); 1547 num_devices--; 1548 } 1549 btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1550 1551 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1552 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; 1553 goto out; 1554 } 1555 1556 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { 1557 ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; 1558 goto out; 1559 } 1560 1561 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && 1562 root->fs_info->fs_devices->rw_devices <= 2) { 1563 ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; 1564 goto out; 1565 } 1566 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && 1567 root->fs_info->fs_devices->rw_devices <= 3) { 1568 ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; 1569 goto out; 1570 } 1571 1572 if (strcmp(device_path, "missing") == 0) { 1573 struct list_head *devices; 1574 struct btrfs_device *tmp; 1575 1576 device = NULL; 1577 devices = &root->fs_info->fs_devices->devices; 1578 /* 1579 * It is safe to read the devices since the volume_mutex 1580 * is held.
1581 */ 1582 list_for_each_entry(tmp, devices, dev_list) { 1583 if (tmp->in_fs_metadata && 1584 !tmp->is_tgtdev_for_dev_replace && 1585 !tmp->bdev) { 1586 device = tmp; 1587 break; 1588 } 1589 } 1590 bdev = NULL; 1591 bh = NULL; 1592 disk_super = NULL; 1593 if (!device) { 1594 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 1595 goto out; 1596 } 1597 } else { 1598 ret = btrfs_get_bdev_and_sb(device_path, 1599 FMODE_WRITE | FMODE_EXCL, 1600 root->fs_info->bdev_holder, 0, 1601 &bdev, &bh); 1602 if (ret) 1603 goto out; 1604 disk_super = (struct btrfs_super_block *)bh->b_data; 1605 devid = btrfs_stack_device_id(&disk_super->dev_item); 1606 dev_uuid = disk_super->dev_item.uuid; 1607 device = btrfs_find_device(root->fs_info, devid, dev_uuid, 1608 disk_super->fsid); 1609 if (!device) { 1610 ret = -ENOENT; 1611 goto error_brelse; 1612 } 1613 } 1614 1615 if (device->is_tgtdev_for_dev_replace) { 1616 ret = BTRFS_ERROR_DEV_TGT_REPLACE; 1617 goto error_brelse; 1618 } 1619 1620 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1621 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; 1622 goto error_brelse; 1623 } 1624 1625 if (device->writeable) { 1626 lock_chunks(root); 1627 list_del_init(&device->dev_alloc_list); 1628 unlock_chunks(root); 1629 root->fs_info->fs_devices->rw_devices--; 1630 clear_super = true; 1631 } 1632 1633 mutex_unlock(&uuid_mutex); 1634 ret = btrfs_shrink_device(device, 0); 1635 mutex_lock(&uuid_mutex); 1636 if (ret) 1637 goto error_undo; 1638 1639 /* 1640 * TODO: the superblock still includes this device in its num_devices 1641 * counter although write_all_supers() is not locked out. This 1642 * could give a filesystem state which requires a degraded mount. 1643 */ 1644 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1645 if (ret) 1646 goto error_undo; 1647 1648 spin_lock(&root->fs_info->free_chunk_lock); 1649 root->fs_info->free_chunk_space = device->total_bytes - 1650 device->bytes_used; 1651 spin_unlock(&root->fs_info->free_chunk_lock); 1652 1653 device->in_fs_metadata = 0; 1654 btrfs_scrub_cancel_dev(root->fs_info, device); 1655 1656 /* 1657 * the device list mutex makes sure that we don't change 1658 * the device list while someone else is writing out all 1659 * the device supers. Whoever is writing all supers, should 1660 * lock the device list mutex before getting the number of 1661 * devices in the super block (super_copy). Conversely, 1662 * whoever updates the number of devices in the super block 1663 * (super_copy) should hold the device list mutex. 
1664 */ 1665 1666 cur_devices = device->fs_devices; 1667 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1668 list_del_rcu(&device->dev_list); 1669 1670 device->fs_devices->num_devices--; 1671 device->fs_devices->total_devices--; 1672 1673 if (device->missing) 1674 root->fs_info->fs_devices->missing_devices--; 1675 1676 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1677 struct btrfs_device, dev_list); 1678 if (device->bdev == root->fs_info->sb->s_bdev) 1679 root->fs_info->sb->s_bdev = next_device->bdev; 1680 if (device->bdev == root->fs_info->fs_devices->latest_bdev) 1681 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1682 1683 if (device->bdev) { 1684 device->fs_devices->open_devices--; 1685 /* remove sysfs entry */ 1686 btrfs_kobj_rm_device(root->fs_info, device); 1687 } 1688 1689 call_rcu(&device->rcu, free_device); 1690 1691 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; 1692 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); 1693 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1694 1695 if (cur_devices->open_devices == 0) { 1696 struct btrfs_fs_devices *fs_devices; 1697 fs_devices = root->fs_info->fs_devices; 1698 while (fs_devices) { 1699 if (fs_devices->seed == cur_devices) { 1700 fs_devices->seed = cur_devices->seed; 1701 break; 1702 } 1703 fs_devices = fs_devices->seed; 1704 } 1705 cur_devices->seed = NULL; 1706 lock_chunks(root); 1707 __btrfs_close_devices(cur_devices); 1708 unlock_chunks(root); 1709 free_fs_devices(cur_devices); 1710 } 1711 1712 root->fs_info->num_tolerated_disk_barrier_failures = 1713 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); 1714 1715 /* 1716 * at this point, the device is zero sized. We want to 1717 * remove it from the devices list and zero out the old super 1718 */ 1719 if (clear_super && disk_super) { 1720 u64 bytenr; 1721 int i; 1722 1723 /* make sure this device isn't detected as part of 1724 * the FS anymore 1725 */ 1726 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 1727 set_buffer_dirty(bh); 1728 sync_dirty_buffer(bh); 1729 1730 /* clear the mirror copies of super block on the disk 1731 * being removed, 0th copy is been taken care above and 1732 * the below would take of the rest 1733 */ 1734 for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) { 1735 bytenr = btrfs_sb_offset(i); 1736 if (bytenr + BTRFS_SUPER_INFO_SIZE >= 1737 i_size_read(bdev->bd_inode)) 1738 break; 1739 1740 brelse(bh); 1741 bh = __bread(bdev, bytenr / 4096, 1742 BTRFS_SUPER_INFO_SIZE); 1743 if (!bh) 1744 continue; 1745 1746 disk_super = (struct btrfs_super_block *)bh->b_data; 1747 1748 if (btrfs_super_bytenr(disk_super) != bytenr || 1749 btrfs_super_magic(disk_super) != BTRFS_MAGIC) { 1750 continue; 1751 } 1752 memset(&disk_super->magic, 0, 1753 sizeof(disk_super->magic)); 1754 set_buffer_dirty(bh); 1755 sync_dirty_buffer(bh); 1756 } 1757 } 1758 1759 ret = 0; 1760 1761 if (bdev) { 1762 /* Notify udev that device has changed */ 1763 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 1764 1765 /* Update ctime/mtime for device path for libblkid */ 1766 update_dev_time(device_path); 1767 } 1768 1769 error_brelse: 1770 brelse(bh); 1771 if (bdev) 1772 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1773 out: 1774 mutex_unlock(&uuid_mutex); 1775 return ret; 1776 error_undo: 1777 if (device->writeable) { 1778 lock_chunks(root); 1779 list_add(&device->dev_alloc_list, 1780 &root->fs_info->fs_devices->alloc_list); 1781 unlock_chunks(root); 1782 root->fs_info->fs_devices->rw_devices++; 
1783 } 1784 goto error_brelse; 1785 } 1786 1787 void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, 1788 struct btrfs_device *srcdev) 1789 { 1790 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); 1791 1792 list_del_rcu(&srcdev->dev_list); 1793 list_del_rcu(&srcdev->dev_alloc_list); 1794 fs_info->fs_devices->num_devices--; 1795 if (srcdev->missing) { 1796 fs_info->fs_devices->missing_devices--; 1797 fs_info->fs_devices->rw_devices++; 1798 } 1799 if (srcdev->can_discard) 1800 fs_info->fs_devices->num_can_discard--; 1801 if (srcdev->bdev) { 1802 fs_info->fs_devices->open_devices--; 1803 1804 /* zero out the old super */ 1805 btrfs_scratch_superblock(srcdev); 1806 } 1807 1808 call_rcu(&srcdev->rcu, free_device); 1809 } 1810 1811 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 1812 struct btrfs_device *tgtdev) 1813 { 1814 struct btrfs_device *next_device; 1815 1816 WARN_ON(!tgtdev); 1817 mutex_lock(&fs_info->fs_devices->device_list_mutex); 1818 if (tgtdev->bdev) { 1819 btrfs_scratch_superblock(tgtdev); 1820 fs_info->fs_devices->open_devices--; 1821 } 1822 fs_info->fs_devices->num_devices--; 1823 if (tgtdev->can_discard) 1824 fs_info->fs_devices->num_can_discard++; 1825 1826 next_device = list_entry(fs_info->fs_devices->devices.next, 1827 struct btrfs_device, dev_list); 1828 if (tgtdev->bdev == fs_info->sb->s_bdev) 1829 fs_info->sb->s_bdev = next_device->bdev; 1830 if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) 1831 fs_info->fs_devices->latest_bdev = next_device->bdev; 1832 list_del_rcu(&tgtdev->dev_list); 1833 1834 call_rcu(&tgtdev->rcu, free_device); 1835 1836 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 1837 } 1838 1839 static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, 1840 struct btrfs_device **device) 1841 { 1842 int ret = 0; 1843 struct btrfs_super_block *disk_super; 1844 u64 devid; 1845 u8 *dev_uuid; 1846 struct block_device *bdev; 1847 struct buffer_head *bh; 1848 1849 *device = NULL; 1850 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 1851 root->fs_info->bdev_holder, 0, &bdev, &bh); 1852 if (ret) 1853 return ret; 1854 disk_super = (struct btrfs_super_block *)bh->b_data; 1855 devid = btrfs_stack_device_id(&disk_super->dev_item); 1856 dev_uuid = disk_super->dev_item.uuid; 1857 *device = btrfs_find_device(root->fs_info, devid, dev_uuid, 1858 disk_super->fsid); 1859 brelse(bh); 1860 if (!*device) 1861 ret = -ENOENT; 1862 blkdev_put(bdev, FMODE_READ); 1863 return ret; 1864 } 1865 1866 int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, 1867 char *device_path, 1868 struct btrfs_device **device) 1869 { 1870 *device = NULL; 1871 if (strcmp(device_path, "missing") == 0) { 1872 struct list_head *devices; 1873 struct btrfs_device *tmp; 1874 1875 devices = &root->fs_info->fs_devices->devices; 1876 /* 1877 * It is safe to read the devices since the volume_mutex 1878 * is held by the caller. 1879 */ 1880 list_for_each_entry(tmp, devices, dev_list) { 1881 if (tmp->in_fs_metadata && !tmp->bdev) { 1882 *device = tmp; 1883 break; 1884 } 1885 } 1886 1887 if (!*device) { 1888 btrfs_err(root->fs_info, "no missing device found"); 1889 return -ENOENT; 1890 } 1891 1892 return 0; 1893 } else { 1894 return btrfs_find_device_by_path(root, device_path, device); 1895 } 1896 } 1897 1898 /* 1899 * does all the dirty work required for changing file system's UUID. 
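 * Concretely, the currently opened seed devices are moved onto a fresh fs_devices structure chained in via fs_devices->seed, the mounted filesystem gets a newly generated fsid, and the SEEDING flag is cleared from its in-memory superblock copy.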
1900 */ 1901 static int btrfs_prepare_sprout(struct btrfs_root *root) 1902 { 1903 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1904 struct btrfs_fs_devices *old_devices; 1905 struct btrfs_fs_devices *seed_devices; 1906 struct btrfs_super_block *disk_super = root->fs_info->super_copy; 1907 struct btrfs_device *device; 1908 u64 super_flags; 1909 1910 BUG_ON(!mutex_is_locked(&uuid_mutex)); 1911 if (!fs_devices->seeding) 1912 return -EINVAL; 1913 1914 seed_devices = __alloc_fs_devices(); 1915 if (IS_ERR(seed_devices)) 1916 return PTR_ERR(seed_devices); 1917 1918 old_devices = clone_fs_devices(fs_devices); 1919 if (IS_ERR(old_devices)) { 1920 kfree(seed_devices); 1921 return PTR_ERR(old_devices); 1922 } 1923 1924 list_add(&old_devices->list, &fs_uuids); 1925 1926 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 1927 seed_devices->opened = 1; 1928 INIT_LIST_HEAD(&seed_devices->devices); 1929 INIT_LIST_HEAD(&seed_devices->alloc_list); 1930 mutex_init(&seed_devices->device_list_mutex); 1931 1932 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1933 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 1934 synchronize_rcu); 1935 1936 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 1937 list_for_each_entry(device, &seed_devices->devices, dev_list) { 1938 device->fs_devices = seed_devices; 1939 } 1940 1941 fs_devices->seeding = 0; 1942 fs_devices->num_devices = 0; 1943 fs_devices->open_devices = 0; 1944 fs_devices->seed = seed_devices; 1945 1946 generate_random_uuid(fs_devices->fsid); 1947 memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 1948 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 1949 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1950 1951 super_flags = btrfs_super_flags(disk_super) & 1952 ~BTRFS_SUPER_FLAG_SEEDING; 1953 btrfs_set_super_flags(disk_super, super_flags); 1954 1955 return 0; 1956 } 1957 1958 /* 1959 * store the expected generation for seed devices in device items.
1960 */ 1961 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 1962 struct btrfs_root *root) 1963 { 1964 struct btrfs_path *path; 1965 struct extent_buffer *leaf; 1966 struct btrfs_dev_item *dev_item; 1967 struct btrfs_device *device; 1968 struct btrfs_key key; 1969 u8 fs_uuid[BTRFS_UUID_SIZE]; 1970 u8 dev_uuid[BTRFS_UUID_SIZE]; 1971 u64 devid; 1972 int ret; 1973 1974 path = btrfs_alloc_path(); 1975 if (!path) 1976 return -ENOMEM; 1977 1978 root = root->fs_info->chunk_root; 1979 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1980 key.offset = 0; 1981 key.type = BTRFS_DEV_ITEM_KEY; 1982 1983 while (1) { 1984 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1985 if (ret < 0) 1986 goto error; 1987 1988 leaf = path->nodes[0]; 1989 next_slot: 1990 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1991 ret = btrfs_next_leaf(root, path); 1992 if (ret > 0) 1993 break; 1994 if (ret < 0) 1995 goto error; 1996 leaf = path->nodes[0]; 1997 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1998 btrfs_release_path(path); 1999 continue; 2000 } 2001 2002 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2003 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2004 key.type != BTRFS_DEV_ITEM_KEY) 2005 break; 2006 2007 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2008 struct btrfs_dev_item); 2009 devid = btrfs_device_id(leaf, dev_item); 2010 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2011 BTRFS_UUID_SIZE); 2012 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2013 BTRFS_UUID_SIZE); 2014 device = btrfs_find_device(root->fs_info, devid, dev_uuid, 2015 fs_uuid); 2016 BUG_ON(!device); /* Logic error */ 2017 2018 if (device->fs_devices->seeding) { 2019 btrfs_set_device_generation(leaf, dev_item, 2020 device->generation); 2021 btrfs_mark_buffer_dirty(leaf); 2022 } 2023 2024 path->slots[0]++; 2025 goto next_slot; 2026 } 2027 ret = 0; 2028 error: 2029 btrfs_free_path(path); 2030 return ret; 2031 } 2032 2033 int btrfs_init_new_device(struct btrfs_root *root, char *device_path) 2034 { 2035 struct request_queue *q; 2036 struct btrfs_trans_handle *trans; 2037 struct btrfs_device *device; 2038 struct block_device *bdev; 2039 struct list_head *devices; 2040 struct super_block *sb = root->fs_info->sb; 2041 struct rcu_string *name; 2042 u64 total_bytes; 2043 int seeding_dev = 0; 2044 int ret = 0; 2045 2046 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 2047 return -EROFS; 2048 2049 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2050 root->fs_info->bdev_holder); 2051 if (IS_ERR(bdev)) 2052 return PTR_ERR(bdev); 2053 2054 if (root->fs_info->fs_devices->seeding) { 2055 seeding_dev = 1; 2056 down_write(&sb->s_umount); 2057 mutex_lock(&uuid_mutex); 2058 } 2059 2060 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2061 2062 devices = &root->fs_info->fs_devices->devices; 2063 2064 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2065 list_for_each_entry(device, devices, dev_list) { 2066 if (device->bdev == bdev) { 2067 ret = -EEXIST; 2068 mutex_unlock( 2069 &root->fs_info->fs_devices->device_list_mutex); 2070 goto error; 2071 } 2072 } 2073 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2074 2075 device = btrfs_alloc_device(root->fs_info, NULL, NULL); 2076 if (IS_ERR(device)) { 2077 /* we can safely leave the fs_devices entry around */ 2078 ret = PTR_ERR(device); 2079 goto error; 2080 } 2081 2082 name = rcu_string_strdup(device_path, GFP_NOFS); 2083 if (!name) { 2084 kfree(device); 2085 ret = -ENOMEM; 2086 goto 
error; 2087 } 2088 rcu_assign_pointer(device->name, name); 2089 2090 trans = btrfs_start_transaction(root, 0); 2091 if (IS_ERR(trans)) { 2092 rcu_string_free(device->name); 2093 kfree(device); 2094 ret = PTR_ERR(trans); 2095 goto error; 2096 } 2097 2098 lock_chunks(root); 2099 2100 q = bdev_get_queue(bdev); 2101 if (blk_queue_discard(q)) 2102 device->can_discard = 1; 2103 device->writeable = 1; 2104 device->generation = trans->transid; 2105 device->io_width = root->sectorsize; 2106 device->io_align = root->sectorsize; 2107 device->sector_size = root->sectorsize; 2108 device->total_bytes = i_size_read(bdev->bd_inode); 2109 device->disk_total_bytes = device->total_bytes; 2110 device->dev_root = root->fs_info->dev_root; 2111 device->bdev = bdev; 2112 device->in_fs_metadata = 1; 2113 device->is_tgtdev_for_dev_replace = 0; 2114 device->mode = FMODE_EXCL; 2115 device->dev_stats_valid = 1; 2116 set_blocksize(device->bdev, 4096); 2117 2118 if (seeding_dev) { 2119 sb->s_flags &= ~MS_RDONLY; 2120 ret = btrfs_prepare_sprout(root); 2121 BUG_ON(ret); /* -ENOMEM */ 2122 } 2123 2124 device->fs_devices = root->fs_info->fs_devices; 2125 2126 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2127 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); 2128 list_add(&device->dev_alloc_list, 2129 &root->fs_info->fs_devices->alloc_list); 2130 root->fs_info->fs_devices->num_devices++; 2131 root->fs_info->fs_devices->open_devices++; 2132 root->fs_info->fs_devices->rw_devices++; 2133 root->fs_info->fs_devices->total_devices++; 2134 if (device->can_discard) 2135 root->fs_info->fs_devices->num_can_discard++; 2136 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 2137 2138 spin_lock(&root->fs_info->free_chunk_lock); 2139 root->fs_info->free_chunk_space += device->total_bytes; 2140 spin_unlock(&root->fs_info->free_chunk_lock); 2141 2142 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 2143 root->fs_info->fs_devices->rotating = 1; 2144 2145 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); 2146 btrfs_set_super_total_bytes(root->fs_info->super_copy, 2147 total_bytes + device->total_bytes); 2148 2149 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); 2150 btrfs_set_super_num_devices(root->fs_info->super_copy, 2151 total_bytes + 1); 2152 2153 /* add sysfs device entry */ 2154 btrfs_kobj_add_device(root->fs_info, device); 2155 2156 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2157 2158 if (seeding_dev) { 2159 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; 2160 ret = init_first_rw_device(trans, root, device); 2161 if (ret) { 2162 btrfs_abort_transaction(trans, root, ret); 2163 goto error_trans; 2164 } 2165 ret = btrfs_finish_sprout(trans, root); 2166 if (ret) { 2167 btrfs_abort_transaction(trans, root, ret); 2168 goto error_trans; 2169 } 2170 2171 /* Sprouting would change fsid of the mounted root, 2172 * so rename the fsid on the sysfs 2173 */ 2174 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", 2175 root->fs_info->fsid); 2176 if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) 2177 goto error_trans; 2178 } else { 2179 ret = btrfs_add_device(trans, root, device); 2180 if (ret) { 2181 btrfs_abort_transaction(trans, root, ret); 2182 goto error_trans; 2183 } 2184 } 2185 2186 /* 2187 * we've got more storage, clear any full flags on the space 2188 * infos 2189 */ 2190 btrfs_clear_space_info_full(root->fs_info); 2191 2192 unlock_chunks(root); 2193 root->fs_info->num_tolerated_disk_barrier_failures = 2194 
btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); 2195 ret = btrfs_commit_transaction(trans, root); 2196 2197 if (seeding_dev) { 2198 mutex_unlock(&uuid_mutex); 2199 up_write(&sb->s_umount); 2200 2201 if (ret) /* transaction commit */ 2202 return ret; 2203 2204 ret = btrfs_relocate_sys_chunks(root); 2205 if (ret < 0) 2206 btrfs_error(root->fs_info, ret, 2207 "Failed to relocate sys chunks after " 2208 "device initialization. This can be fixed " 2209 "using the \"btrfs balance\" command."); 2210 trans = btrfs_attach_transaction(root); 2211 if (IS_ERR(trans)) { 2212 if (PTR_ERR(trans) == -ENOENT) 2213 return 0; 2214 return PTR_ERR(trans); 2215 } 2216 ret = btrfs_commit_transaction(trans, root); 2217 } 2218 2219 /* Update ctime/mtime for libblkid */ 2220 update_dev_time(device_path); 2221 return ret; 2222 2223 error_trans: 2224 unlock_chunks(root); 2225 btrfs_end_transaction(trans, root); 2226 rcu_string_free(device->name); 2227 btrfs_kobj_rm_device(root->fs_info, device); 2228 kfree(device); 2229 error: 2230 blkdev_put(bdev, FMODE_EXCL); 2231 if (seeding_dev) { 2232 mutex_unlock(&uuid_mutex); 2233 up_write(&sb->s_umount); 2234 } 2235 return ret; 2236 } 2237 2238 int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, 2239 struct btrfs_device **device_out) 2240 { 2241 struct request_queue *q; 2242 struct btrfs_device *device; 2243 struct block_device *bdev; 2244 struct btrfs_fs_info *fs_info = root->fs_info; 2245 struct list_head *devices; 2246 struct rcu_string *name; 2247 u64 devid = BTRFS_DEV_REPLACE_DEVID; 2248 int ret = 0; 2249 2250 *device_out = NULL; 2251 if (fs_info->fs_devices->seeding) 2252 return -EINVAL; 2253 2254 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2255 fs_info->bdev_holder); 2256 if (IS_ERR(bdev)) 2257 return PTR_ERR(bdev); 2258 2259 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2260 2261 devices = &fs_info->fs_devices->devices; 2262 list_for_each_entry(device, devices, dev_list) { 2263 if (device->bdev == bdev) { 2264 ret = -EEXIST; 2265 goto error; 2266 } 2267 } 2268 2269 device = btrfs_alloc_device(NULL, &devid, NULL); 2270 if (IS_ERR(device)) { 2271 ret = PTR_ERR(device); 2272 goto error; 2273 } 2274 2275 name = rcu_string_strdup(device_path, GFP_NOFS); 2276 if (!name) { 2277 kfree(device); 2278 ret = -ENOMEM; 2279 goto error; 2280 } 2281 rcu_assign_pointer(device->name, name); 2282 2283 q = bdev_get_queue(bdev); 2284 if (blk_queue_discard(q)) 2285 device->can_discard = 1; 2286 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2287 device->writeable = 1; 2288 device->generation = 0; 2289 device->io_width = root->sectorsize; 2290 device->io_align = root->sectorsize; 2291 device->sector_size = root->sectorsize; 2292 device->total_bytes = i_size_read(bdev->bd_inode); 2293 device->disk_total_bytes = device->total_bytes; 2294 device->dev_root = fs_info->dev_root; 2295 device->bdev = bdev; 2296 device->in_fs_metadata = 1; 2297 device->is_tgtdev_for_dev_replace = 1; 2298 device->mode = FMODE_EXCL; 2299 device->dev_stats_valid = 1; 2300 set_blocksize(device->bdev, 4096); 2301 device->fs_devices = fs_info->fs_devices; 2302 list_add(&device->dev_list, &fs_info->fs_devices->devices); 2303 fs_info->fs_devices->num_devices++; 2304 fs_info->fs_devices->open_devices++; 2305 if (device->can_discard) 2306 fs_info->fs_devices->num_can_discard++; 2307 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2308 2309 *device_out = device; 2310 return ret; 2311 2312 error: 2313 blkdev_put(bdev, FMODE_EXCL); 2314 
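/* On this error path the replacement device was never linked into fs_devices, so dropping the exclusive reference on the block device is the only cleanup needed before returning. */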
return ret; 2315 } 2316 2317 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 2318 struct btrfs_device *tgtdev) 2319 { 2320 WARN_ON(fs_info->fs_devices->rw_devices == 0); 2321 tgtdev->io_width = fs_info->dev_root->sectorsize; 2322 tgtdev->io_align = fs_info->dev_root->sectorsize; 2323 tgtdev->sector_size = fs_info->dev_root->sectorsize; 2324 tgtdev->dev_root = fs_info->dev_root; 2325 tgtdev->in_fs_metadata = 1; 2326 } 2327 2328 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2329 struct btrfs_device *device) 2330 { 2331 int ret; 2332 struct btrfs_path *path; 2333 struct btrfs_root *root; 2334 struct btrfs_dev_item *dev_item; 2335 struct extent_buffer *leaf; 2336 struct btrfs_key key; 2337 2338 root = device->dev_root->fs_info->chunk_root; 2339 2340 path = btrfs_alloc_path(); 2341 if (!path) 2342 return -ENOMEM; 2343 2344 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2345 key.type = BTRFS_DEV_ITEM_KEY; 2346 key.offset = device->devid; 2347 2348 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2349 if (ret < 0) 2350 goto out; 2351 2352 if (ret > 0) { 2353 ret = -ENOENT; 2354 goto out; 2355 } 2356 2357 leaf = path->nodes[0]; 2358 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2359 2360 btrfs_set_device_id(leaf, dev_item, device->devid); 2361 btrfs_set_device_type(leaf, dev_item, device->type); 2362 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2363 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2364 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2365 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); 2366 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 2367 btrfs_mark_buffer_dirty(leaf); 2368 2369 out: 2370 btrfs_free_path(path); 2371 return ret; 2372 } 2373 2374 static int __btrfs_grow_device(struct btrfs_trans_handle *trans, 2375 struct btrfs_device *device, u64 new_size) 2376 { 2377 struct btrfs_super_block *super_copy = 2378 device->dev_root->fs_info->super_copy; 2379 u64 old_total = btrfs_super_total_bytes(super_copy); 2380 u64 diff = new_size - device->total_bytes; 2381 2382 if (!device->writeable) 2383 return -EACCES; 2384 if (new_size <= device->total_bytes || 2385 device->is_tgtdev_for_dev_replace) 2386 return -EINVAL; 2387 2388 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2389 device->fs_devices->total_rw_bytes += diff; 2390 2391 device->total_bytes = new_size; 2392 device->disk_total_bytes = new_size; 2393 btrfs_clear_space_info_full(device->dev_root->fs_info); 2394 2395 return btrfs_update_device(trans, device); 2396 } 2397 2398 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2399 struct btrfs_device *device, u64 new_size) 2400 { 2401 int ret; 2402 lock_chunks(device->dev_root); 2403 ret = __btrfs_grow_device(trans, device, new_size); 2404 unlock_chunks(device->dev_root); 2405 return ret; 2406 } 2407 2408 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 2409 struct btrfs_root *root, 2410 u64 chunk_tree, u64 chunk_objectid, 2411 u64 chunk_offset) 2412 { 2413 int ret; 2414 struct btrfs_path *path; 2415 struct btrfs_key key; 2416 2417 root = root->fs_info->chunk_root; 2418 path = btrfs_alloc_path(); 2419 if (!path) 2420 return -ENOMEM; 2421 2422 key.objectid = chunk_objectid; 2423 key.offset = chunk_offset; 2424 key.type = BTRFS_CHUNK_ITEM_KEY; 2425 2426 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2427 if (ret < 0) 2428 goto out; 2429 else if (ret > 0) { /* Logic error or 
corruption */ 2430 btrfs_error(root->fs_info, -ENOENT, 2431 "Failed lookup while freeing chunk."); 2432 ret = -ENOENT; 2433 goto out; 2434 } 2435 2436 ret = btrfs_del_item(trans, root, path); 2437 if (ret < 0) 2438 btrfs_error(root->fs_info, ret, 2439 "Failed to delete chunk item."); 2440 out: 2441 btrfs_free_path(path); 2442 return ret; 2443 } 2444 2445 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 2446 chunk_offset) 2447 { 2448 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 2449 struct btrfs_disk_key *disk_key; 2450 struct btrfs_chunk *chunk; 2451 u8 *ptr; 2452 int ret = 0; 2453 u32 num_stripes; 2454 u32 array_size; 2455 u32 len = 0; 2456 u32 cur; 2457 struct btrfs_key key; 2458 2459 array_size = btrfs_super_sys_array_size(super_copy); 2460 2461 ptr = super_copy->sys_chunk_array; 2462 cur = 0; 2463 2464 while (cur < array_size) { 2465 disk_key = (struct btrfs_disk_key *)ptr; 2466 btrfs_disk_key_to_cpu(&key, disk_key); 2467 2468 len = sizeof(*disk_key); 2469 2470 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 2471 chunk = (struct btrfs_chunk *)(ptr + len); 2472 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 2473 len += btrfs_chunk_item_size(num_stripes); 2474 } else { 2475 ret = -EIO; 2476 break; 2477 } 2478 if (key.objectid == chunk_objectid && 2479 key.offset == chunk_offset) { 2480 memmove(ptr, ptr + len, array_size - (cur + len)); 2481 array_size -= len; 2482 btrfs_set_super_sys_array_size(super_copy, array_size); 2483 } else { 2484 ptr += len; 2485 cur += len; 2486 } 2487 } 2488 return ret; 2489 } 2490 2491 static int btrfs_relocate_chunk(struct btrfs_root *root, 2492 u64 chunk_tree, u64 chunk_objectid, 2493 u64 chunk_offset) 2494 { 2495 struct extent_map_tree *em_tree; 2496 struct btrfs_root *extent_root; 2497 struct btrfs_trans_handle *trans; 2498 struct extent_map *em; 2499 struct map_lookup *map; 2500 int ret; 2501 int i; 2502 2503 root = root->fs_info->chunk_root; 2504 extent_root = root->fs_info->extent_root; 2505 em_tree = &root->fs_info->mapping_tree.map_tree; 2506 2507 ret = btrfs_can_relocate(extent_root, chunk_offset); 2508 if (ret) 2509 return -ENOSPC; 2510 2511 /* step one, relocate all the extents inside this chunk */ 2512 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 2513 if (ret) 2514 return ret; 2515 2516 trans = btrfs_start_transaction(root, 0); 2517 if (IS_ERR(trans)) { 2518 ret = PTR_ERR(trans); 2519 btrfs_std_error(root->fs_info, ret); 2520 return ret; 2521 } 2522 2523 lock_chunks(root); 2524 2525 /* 2526 * step two, delete the device extents and the 2527 * chunk tree entries 2528 */ 2529 read_lock(&em_tree->lock); 2530 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 2531 read_unlock(&em_tree->lock); 2532 2533 BUG_ON(!em || em->start > chunk_offset || 2534 em->start + em->len < chunk_offset); 2535 map = (struct map_lookup *)em->bdev; 2536 2537 for (i = 0; i < map->num_stripes; i++) { 2538 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, 2539 map->stripes[i].physical); 2540 BUG_ON(ret); 2541 2542 if (map->stripes[i].dev) { 2543 ret = btrfs_update_device(trans, map->stripes[i].dev); 2544 BUG_ON(ret); 2545 } 2546 } 2547 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, 2548 chunk_offset); 2549 2550 BUG_ON(ret); 2551 2552 trace_btrfs_chunk_free(root, map, chunk_offset, em->len); 2553 2554 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2555 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 2556 BUG_ON(ret); 2557 } 2558 2559 ret = btrfs_remove_block_group(trans, 
extent_root, chunk_offset); 2560 BUG_ON(ret); 2561 2562 write_lock(&em_tree->lock); 2563 remove_extent_mapping(em_tree, em); 2564 write_unlock(&em_tree->lock); 2565 2566 /* once for the tree */ 2567 free_extent_map(em); 2568 /* once for us */ 2569 free_extent_map(em); 2570 2571 unlock_chunks(root); 2572 btrfs_end_transaction(trans, root); 2573 return 0; 2574 } 2575 2576 static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 2577 { 2578 struct btrfs_root *chunk_root = root->fs_info->chunk_root; 2579 struct btrfs_path *path; 2580 struct extent_buffer *leaf; 2581 struct btrfs_chunk *chunk; 2582 struct btrfs_key key; 2583 struct btrfs_key found_key; 2584 u64 chunk_tree = chunk_root->root_key.objectid; 2585 u64 chunk_type; 2586 bool retried = false; 2587 int failed = 0; 2588 int ret; 2589 2590 path = btrfs_alloc_path(); 2591 if (!path) 2592 return -ENOMEM; 2593 2594 again: 2595 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2596 key.offset = (u64)-1; 2597 key.type = BTRFS_CHUNK_ITEM_KEY; 2598 2599 while (1) { 2600 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2601 if (ret < 0) 2602 goto error; 2603 BUG_ON(ret == 0); /* Corruption */ 2604 2605 ret = btrfs_previous_item(chunk_root, path, key.objectid, 2606 key.type); 2607 if (ret < 0) 2608 goto error; 2609 if (ret > 0) 2610 break; 2611 2612 leaf = path->nodes[0]; 2613 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2614 2615 chunk = btrfs_item_ptr(leaf, path->slots[0], 2616 struct btrfs_chunk); 2617 chunk_type = btrfs_chunk_type(leaf, chunk); 2618 btrfs_release_path(path); 2619 2620 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 2621 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 2622 found_key.objectid, 2623 found_key.offset); 2624 if (ret == -ENOSPC) 2625 failed++; 2626 else if (ret) 2627 BUG(); 2628 } 2629 2630 if (found_key.offset == 0) 2631 break; 2632 key.offset = found_key.offset - 1; 2633 } 2634 ret = 0; 2635 if (failed && !retried) { 2636 failed = 0; 2637 retried = true; 2638 goto again; 2639 } else if (WARN_ON(failed && retried)) { 2640 ret = -ENOSPC; 2641 } 2642 error: 2643 btrfs_free_path(path); 2644 return ret; 2645 } 2646 2647 static int insert_balance_item(struct btrfs_root *root, 2648 struct btrfs_balance_control *bctl) 2649 { 2650 struct btrfs_trans_handle *trans; 2651 struct btrfs_balance_item *item; 2652 struct btrfs_disk_balance_args disk_bargs; 2653 struct btrfs_path *path; 2654 struct extent_buffer *leaf; 2655 struct btrfs_key key; 2656 int ret, err; 2657 2658 path = btrfs_alloc_path(); 2659 if (!path) 2660 return -ENOMEM; 2661 2662 trans = btrfs_start_transaction(root, 0); 2663 if (IS_ERR(trans)) { 2664 btrfs_free_path(path); 2665 return PTR_ERR(trans); 2666 } 2667 2668 key.objectid = BTRFS_BALANCE_OBJECTID; 2669 key.type = BTRFS_BALANCE_ITEM_KEY; 2670 key.offset = 0; 2671 2672 ret = btrfs_insert_empty_item(trans, root, path, &key, 2673 sizeof(*item)); 2674 if (ret) 2675 goto out; 2676 2677 leaf = path->nodes[0]; 2678 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 2679 2680 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); 2681 2682 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 2683 btrfs_set_balance_data(leaf, item, &disk_bargs); 2684 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 2685 btrfs_set_balance_meta(leaf, item, &disk_bargs); 2686 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 2687 btrfs_set_balance_sys(leaf, item, &disk_bargs); 2688 2689 btrfs_set_balance_flags(leaf, item, bctl->flags); 2690 2691 
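/* The balance item records the filters and convert targets of the balance that is about to start; btrfs_recover_balance() reads it back after a crash or remount so the operation can be resumed. */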
btrfs_mark_buffer_dirty(leaf); 2692 out: 2693 btrfs_free_path(path); 2694 err = btrfs_commit_transaction(trans, root); 2695 if (err && !ret) 2696 ret = err; 2697 return ret; 2698 } 2699 2700 static int del_balance_item(struct btrfs_root *root) 2701 { 2702 struct btrfs_trans_handle *trans; 2703 struct btrfs_path *path; 2704 struct btrfs_key key; 2705 int ret, err; 2706 2707 path = btrfs_alloc_path(); 2708 if (!path) 2709 return -ENOMEM; 2710 2711 trans = btrfs_start_transaction(root, 0); 2712 if (IS_ERR(trans)) { 2713 btrfs_free_path(path); 2714 return PTR_ERR(trans); 2715 } 2716 2717 key.objectid = BTRFS_BALANCE_OBJECTID; 2718 key.type = BTRFS_BALANCE_ITEM_KEY; 2719 key.offset = 0; 2720 2721 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2722 if (ret < 0) 2723 goto out; 2724 if (ret > 0) { 2725 ret = -ENOENT; 2726 goto out; 2727 } 2728 2729 ret = btrfs_del_item(trans, root, path); 2730 out: 2731 btrfs_free_path(path); 2732 err = btrfs_commit_transaction(trans, root); 2733 if (err && !ret) 2734 ret = err; 2735 return ret; 2736 } 2737 2738 /* 2739 * This is a heuristic used to reduce the number of chunks balanced on 2740 * resume after balance was interrupted. 2741 */ 2742 static void update_balance_args(struct btrfs_balance_control *bctl) 2743 { 2744 /* 2745 * Turn on soft mode for chunk types that were being converted. 2746 */ 2747 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 2748 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 2749 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 2750 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 2751 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 2752 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 2753 2754 /* 2755 * Turn on usage filter if is not already used. The idea is 2756 * that chunks that we have already balanced should be 2757 * reasonably full. Don't do it for chunks that are being 2758 * converted - that will keep us from relocating unconverted 2759 * (albeit full) chunks. 2760 */ 2761 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 2762 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 2763 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 2764 bctl->data.usage = 90; 2765 } 2766 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 2767 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 2768 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 2769 bctl->sys.usage = 90; 2770 } 2771 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 2772 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 2773 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 2774 bctl->meta.usage = 90; 2775 } 2776 } 2777 2778 /* 2779 * Should be called with both balance and volume mutexes held to 2780 * serialize other volume operations (add_dev/rm_dev/resize) with 2781 * restriper. Same goes for unset_balance_control. 2782 */ 2783 static void set_balance_control(struct btrfs_balance_control *bctl) 2784 { 2785 struct btrfs_fs_info *fs_info = bctl->fs_info; 2786 2787 BUG_ON(fs_info->balance_ctl); 2788 2789 spin_lock(&fs_info->balance_lock); 2790 fs_info->balance_ctl = bctl; 2791 spin_unlock(&fs_info->balance_lock); 2792 } 2793 2794 static void unset_balance_control(struct btrfs_fs_info *fs_info) 2795 { 2796 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2797 2798 BUG_ON(!fs_info->balance_ctl); 2799 2800 spin_lock(&fs_info->balance_lock); 2801 fs_info->balance_ctl = NULL; 2802 spin_unlock(&fs_info->balance_lock); 2803 2804 kfree(bctl); 2805 } 2806 2807 /* 2808 * Balance filters. Return 1 if chunk should be filtered out 2809 * (should not be balanced). 
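 * Each filter inspects one property of a chunk (profile, usage, devid,
 * physical or logical byte range, ...); should_balance_chunk() relocates a
 * chunk only if none of the filters enabled in the balance args reject it.
 * For example, setting BTRFS_BALANCE_ARGS_USAGE with usage=90 on the data
 * args makes chunk_usage_filter() reject every data chunk that is at least
 * 90% full, so only the emptier chunks get relocated.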
2810 */ 2811 static int chunk_profiles_filter(u64 chunk_type, 2812 struct btrfs_balance_args *bargs) 2813 { 2814 chunk_type = chunk_to_extended(chunk_type) & 2815 BTRFS_EXTENDED_PROFILE_MASK; 2816 2817 if (bargs->profiles & chunk_type) 2818 return 0; 2819 2820 return 1; 2821 } 2822 2823 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 2824 struct btrfs_balance_args *bargs) 2825 { 2826 struct btrfs_block_group_cache *cache; 2827 u64 chunk_used, user_thresh; 2828 int ret = 1; 2829 2830 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2831 chunk_used = btrfs_block_group_used(&cache->item); 2832 2833 if (bargs->usage == 0) 2834 user_thresh = 1; 2835 else if (bargs->usage > 100) 2836 user_thresh = cache->key.offset; 2837 else 2838 user_thresh = div_factor_fine(cache->key.offset, 2839 bargs->usage); 2840 2841 if (chunk_used < user_thresh) 2842 ret = 0; 2843 2844 btrfs_put_block_group(cache); 2845 return ret; 2846 } 2847 2848 static int chunk_devid_filter(struct extent_buffer *leaf, 2849 struct btrfs_chunk *chunk, 2850 struct btrfs_balance_args *bargs) 2851 { 2852 struct btrfs_stripe *stripe; 2853 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 2854 int i; 2855 2856 for (i = 0; i < num_stripes; i++) { 2857 stripe = btrfs_stripe_nr(chunk, i); 2858 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 2859 return 0; 2860 } 2861 2862 return 1; 2863 } 2864 2865 /* [pstart, pend) */ 2866 static int chunk_drange_filter(struct extent_buffer *leaf, 2867 struct btrfs_chunk *chunk, 2868 u64 chunk_offset, 2869 struct btrfs_balance_args *bargs) 2870 { 2871 struct btrfs_stripe *stripe; 2872 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 2873 u64 stripe_offset; 2874 u64 stripe_length; 2875 int factor; 2876 int i; 2877 2878 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 2879 return 0; 2880 2881 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 2882 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { 2883 factor = num_stripes / 2; 2884 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { 2885 factor = num_stripes - 1; 2886 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { 2887 factor = num_stripes - 2; 2888 } else { 2889 factor = num_stripes; 2890 } 2891 2892 for (i = 0; i < num_stripes; i++) { 2893 stripe = btrfs_stripe_nr(chunk, i); 2894 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 2895 continue; 2896 2897 stripe_offset = btrfs_stripe_offset(leaf, stripe); 2898 stripe_length = btrfs_chunk_length(leaf, chunk); 2899 do_div(stripe_length, factor); 2900 2901 if (stripe_offset < bargs->pend && 2902 stripe_offset + stripe_length > bargs->pstart) 2903 return 0; 2904 } 2905 2906 return 1; 2907 } 2908 2909 /* [vstart, vend) */ 2910 static int chunk_vrange_filter(struct extent_buffer *leaf, 2911 struct btrfs_chunk *chunk, 2912 u64 chunk_offset, 2913 struct btrfs_balance_args *bargs) 2914 { 2915 if (chunk_offset < bargs->vend && 2916 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 2917 /* at least part of the chunk is inside this vrange */ 2918 return 0; 2919 2920 return 1; 2921 } 2922 2923 static int chunk_soft_convert_filter(u64 chunk_type, 2924 struct btrfs_balance_args *bargs) 2925 { 2926 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 2927 return 0; 2928 2929 chunk_type = chunk_to_extended(chunk_type) & 2930 BTRFS_EXTENDED_PROFILE_MASK; 2931 2932 if (bargs->target == chunk_type) 2933 return 1; 2934 2935 return 0; 2936 } 2937 2938 static int should_balance_chunk(struct btrfs_root 
*root, 2939 struct extent_buffer *leaf, 2940 struct btrfs_chunk *chunk, u64 chunk_offset) 2941 { 2942 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; 2943 struct btrfs_balance_args *bargs = NULL; 2944 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 2945 2946 /* type filter */ 2947 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 2948 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 2949 return 0; 2950 } 2951 2952 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 2953 bargs = &bctl->data; 2954 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 2955 bargs = &bctl->sys; 2956 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 2957 bargs = &bctl->meta; 2958 2959 /* profiles filter */ 2960 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 2961 chunk_profiles_filter(chunk_type, bargs)) { 2962 return 0; 2963 } 2964 2965 /* usage filter */ 2966 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 2967 chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { 2968 return 0; 2969 } 2970 2971 /* devid filter */ 2972 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 2973 chunk_devid_filter(leaf, chunk, bargs)) { 2974 return 0; 2975 } 2976 2977 /* drange filter, makes sense only with devid filter */ 2978 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 2979 chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { 2980 return 0; 2981 } 2982 2983 /* vrange filter */ 2984 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 2985 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 2986 return 0; 2987 } 2988 2989 /* soft profile changing mode */ 2990 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 2991 chunk_soft_convert_filter(chunk_type, bargs)) { 2992 return 0; 2993 } 2994 2995 /* 2996 * limited by count, must be the last filter 2997 */ 2998 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 2999 if (bargs->limit == 0) 3000 return 0; 3001 else 3002 bargs->limit--; 3003 } 3004 3005 return 1; 3006 } 3007 3008 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3009 { 3010 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3011 struct btrfs_root *chunk_root = fs_info->chunk_root; 3012 struct btrfs_root *dev_root = fs_info->dev_root; 3013 struct list_head *devices; 3014 struct btrfs_device *device; 3015 u64 old_size; 3016 u64 size_to_free; 3017 struct btrfs_chunk *chunk; 3018 struct btrfs_path *path; 3019 struct btrfs_key key; 3020 struct btrfs_key found_key; 3021 struct btrfs_trans_handle *trans; 3022 struct extent_buffer *leaf; 3023 int slot; 3024 int ret; 3025 int enospc_errors = 0; 3026 bool counting = true; 3027 u64 limit_data = bctl->data.limit; 3028 u64 limit_meta = bctl->meta.limit; 3029 u64 limit_sys = bctl->sys.limit; 3030 3031 /* step one make some room on all the devices */ 3032 devices = &fs_info->fs_devices->devices; 3033 list_for_each_entry(device, devices, dev_list) { 3034 old_size = device->total_bytes; 3035 size_to_free = div_factor(old_size, 1); 3036 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 3037 if (!device->writeable || 3038 device->total_bytes - device->bytes_used > size_to_free || 3039 device->is_tgtdev_for_dev_replace) 3040 continue; 3041 3042 ret = btrfs_shrink_device(device, old_size - size_to_free); 3043 if (ret == -ENOSPC) 3044 break; 3045 BUG_ON(ret); 3046 3047 trans = btrfs_start_transaction(dev_root, 0); 3048 BUG_ON(IS_ERR(trans)); 3049 3050 ret = btrfs_grow_device(trans, device, old_size); 3051 BUG_ON(ret); 3052 3053 btrfs_end_transaction(trans, dev_root); 3054 } 3055 3056 /* step two, relocate all the chunks */ 3057 path = btrfs_alloc_path(); 3058 if 
(!path) { 3059 ret = -ENOMEM; 3060 goto error; 3061 } 3062 3063 /* zero out stat counters */ 3064 spin_lock(&fs_info->balance_lock); 3065 memset(&bctl->stat, 0, sizeof(bctl->stat)); 3066 spin_unlock(&fs_info->balance_lock); 3067 again: 3068 if (!counting) { 3069 bctl->data.limit = limit_data; 3070 bctl->meta.limit = limit_meta; 3071 bctl->sys.limit = limit_sys; 3072 } 3073 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3074 key.offset = (u64)-1; 3075 key.type = BTRFS_CHUNK_ITEM_KEY; 3076 3077 while (1) { 3078 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3079 atomic_read(&fs_info->balance_cancel_req)) { 3080 ret = -ECANCELED; 3081 goto error; 3082 } 3083 3084 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3085 if (ret < 0) 3086 goto error; 3087 3088 /* 3089 * this shouldn't happen, it means the last relocate 3090 * failed 3091 */ 3092 if (ret == 0) 3093 BUG(); /* FIXME break ? */ 3094 3095 ret = btrfs_previous_item(chunk_root, path, 0, 3096 BTRFS_CHUNK_ITEM_KEY); 3097 if (ret) { 3098 ret = 0; 3099 break; 3100 } 3101 3102 leaf = path->nodes[0]; 3103 slot = path->slots[0]; 3104 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3105 3106 if (found_key.objectid != key.objectid) 3107 break; 3108 3109 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3110 3111 if (!counting) { 3112 spin_lock(&fs_info->balance_lock); 3113 bctl->stat.considered++; 3114 spin_unlock(&fs_info->balance_lock); 3115 } 3116 3117 ret = should_balance_chunk(chunk_root, leaf, chunk, 3118 found_key.offset); 3119 btrfs_release_path(path); 3120 if (!ret) 3121 goto loop; 3122 3123 if (counting) { 3124 spin_lock(&fs_info->balance_lock); 3125 bctl->stat.expected++; 3126 spin_unlock(&fs_info->balance_lock); 3127 goto loop; 3128 } 3129 3130 ret = btrfs_relocate_chunk(chunk_root, 3131 chunk_root->root_key.objectid, 3132 found_key.objectid, 3133 found_key.offset); 3134 if (ret && ret != -ENOSPC) 3135 goto error; 3136 if (ret == -ENOSPC) { 3137 enospc_errors++; 3138 } else { 3139 spin_lock(&fs_info->balance_lock); 3140 bctl->stat.completed++; 3141 spin_unlock(&fs_info->balance_lock); 3142 } 3143 loop: 3144 if (found_key.offset == 0) 3145 break; 3146 key.offset = found_key.offset - 1; 3147 } 3148 3149 if (counting) { 3150 btrfs_release_path(path); 3151 counting = false; 3152 goto again; 3153 } 3154 error: 3155 btrfs_free_path(path); 3156 if (enospc_errors) { 3157 btrfs_info(fs_info, "%d enospc errors during balance", 3158 enospc_errors); 3159 if (!ret) 3160 ret = -ENOSPC; 3161 } 3162 3163 return ret; 3164 } 3165 3166 /** 3167 * alloc_profile_is_valid - see if a given profile is valid and reduced 3168 * @flags: profile to validate 3169 * @extended: if true @flags is treated as an extended profile 3170 */ 3171 static int alloc_profile_is_valid(u64 flags, int extended) 3172 { 3173 u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 3174 BTRFS_BLOCK_GROUP_PROFILE_MASK); 3175 3176 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 3177 3178 /* 1) check that all other bits are zeroed */ 3179 if (flags & ~mask) 3180 return 0; 3181 3182 /* 2) see if profile is reduced */ 3183 if (flags == 0) 3184 return !extended; /* "0" is valid for usual profiles */ 3185 3186 /* true if exactly one bit set */ 3187 return (flags & (flags - 1)) == 0; 3188 } 3189 3190 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3191 { 3192 /* cancel requested || normal exit path */ 3193 return atomic_read(&fs_info->balance_cancel_req) || 3194 (atomic_read(&fs_info->balance_pause_req) == 0 && 3195 atomic_read(&fs_info->balance_cancel_req) == 0); 3196 } 3197 3198 static void __cancel_balance(struct btrfs_fs_info *fs_info) 3199 { 3200 int ret; 3201 3202 unset_balance_control(fs_info); 3203 ret = del_balance_item(fs_info->tree_root); 3204 if (ret) 3205 btrfs_std_error(fs_info, ret); 3206 3207 atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 3208 } 3209 3210 /* 3211 * Should be called with both balance and volume mutexes held 3212 */ 3213 int btrfs_balance(struct btrfs_balance_control *bctl, 3214 struct btrfs_ioctl_balance_args *bargs) 3215 { 3216 struct btrfs_fs_info *fs_info = bctl->fs_info; 3217 u64 allowed; 3218 int mixed = 0; 3219 int ret; 3220 u64 num_devices; 3221 unsigned seq; 3222 3223 if (btrfs_fs_closing(fs_info) || 3224 atomic_read(&fs_info->balance_pause_req) || 3225 atomic_read(&fs_info->balance_cancel_req)) { 3226 ret = -EINVAL; 3227 goto out; 3228 } 3229 3230 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 3231 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 3232 mixed = 1; 3233 3234 /* 3235 * In case of mixed groups both data and meta should be picked, 3236 * and identical options should be given for both of them. 
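 * (A mixed-bg filesystem keeps data and metadata in the same block groups,
 * so the two sets of balance args cannot be applied independently; the check
 * below therefore insists on both BTRFS_BALANCE_DATA and
 * BTRFS_BALANCE_METADATA being set with identical args.)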
3237 */ 3238 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 3239 if (mixed && (bctl->flags & allowed)) { 3240 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 3241 !(bctl->flags & BTRFS_BALANCE_METADATA) || 3242 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 3243 btrfs_err(fs_info, "with mixed groups data and " 3244 "metadata balance options must be the same"); 3245 ret = -EINVAL; 3246 goto out; 3247 } 3248 } 3249 3250 num_devices = fs_info->fs_devices->num_devices; 3251 btrfs_dev_replace_lock(&fs_info->dev_replace); 3252 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 3253 BUG_ON(num_devices < 1); 3254 num_devices--; 3255 } 3256 btrfs_dev_replace_unlock(&fs_info->dev_replace); 3257 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3258 if (num_devices == 1) 3259 allowed |= BTRFS_BLOCK_GROUP_DUP; 3260 else if (num_devices > 1) 3261 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3262 if (num_devices > 2) 3263 allowed |= BTRFS_BLOCK_GROUP_RAID5; 3264 if (num_devices > 3) 3265 allowed |= (BTRFS_BLOCK_GROUP_RAID10 | 3266 BTRFS_BLOCK_GROUP_RAID6); 3267 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3268 (!alloc_profile_is_valid(bctl->data.target, 1) || 3269 (bctl->data.target & ~allowed))) { 3270 btrfs_err(fs_info, "unable to start balance with target " 3271 "data profile %llu", 3272 bctl->data.target); 3273 ret = -EINVAL; 3274 goto out; 3275 } 3276 if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3277 (!alloc_profile_is_valid(bctl->meta.target, 1) || 3278 (bctl->meta.target & ~allowed))) { 3279 btrfs_err(fs_info, 3280 "unable to start balance with target metadata profile %llu", 3281 bctl->meta.target); 3282 ret = -EINVAL; 3283 goto out; 3284 } 3285 if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3286 (!alloc_profile_is_valid(bctl->sys.target, 1) || 3287 (bctl->sys.target & ~allowed))) { 3288 btrfs_err(fs_info, 3289 "unable to start balance with target system profile %llu", 3290 bctl->sys.target); 3291 ret = -EINVAL; 3292 goto out; 3293 } 3294 3295 /* allow dup'ed data chunks only in mixed mode */ 3296 if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3297 (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { 3298 btrfs_err(fs_info, "dup for data is not allowed"); 3299 ret = -EINVAL; 3300 goto out; 3301 } 3302 3303 /* allow to reduce meta or sys integrity only if force set */ 3304 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3305 BTRFS_BLOCK_GROUP_RAID10 | 3306 BTRFS_BLOCK_GROUP_RAID5 | 3307 BTRFS_BLOCK_GROUP_RAID6; 3308 do { 3309 seq = read_seqbegin(&fs_info->profiles_lock); 3310 3311 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3312 (fs_info->avail_system_alloc_bits & allowed) && 3313 !(bctl->sys.target & allowed)) || 3314 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3315 (fs_info->avail_metadata_alloc_bits & allowed) && 3316 !(bctl->meta.target & allowed))) { 3317 if (bctl->flags & BTRFS_BALANCE_FORCE) { 3318 btrfs_info(fs_info, "force reducing metadata integrity"); 3319 } else { 3320 btrfs_err(fs_info, "balance will reduce metadata " 3321 "integrity, use force if you want this"); 3322 ret = -EINVAL; 3323 goto out; 3324 } 3325 } 3326 } while (read_seqretry(&fs_info->profiles_lock, seq)); 3327 3328 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3329 int num_tolerated_disk_barrier_failures; 3330 u64 target = bctl->sys.target; 3331 3332 num_tolerated_disk_barrier_failures = 3333 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 3334 if (num_tolerated_disk_barrier_failures > 0 && 3335 (target & 
3336 (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | 3337 BTRFS_AVAIL_ALLOC_BIT_SINGLE))) 3338 num_tolerated_disk_barrier_failures = 0; 3339 else if (num_tolerated_disk_barrier_failures > 1 && 3340 (target & 3341 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))) 3342 num_tolerated_disk_barrier_failures = 1; 3343 3344 fs_info->num_tolerated_disk_barrier_failures = 3345 num_tolerated_disk_barrier_failures; 3346 } 3347 3348 ret = insert_balance_item(fs_info->tree_root, bctl); 3349 if (ret && ret != -EEXIST) 3350 goto out; 3351 3352 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 3353 BUG_ON(ret == -EEXIST); 3354 set_balance_control(bctl); 3355 } else { 3356 BUG_ON(ret != -EEXIST); 3357 spin_lock(&fs_info->balance_lock); 3358 update_balance_args(bctl); 3359 spin_unlock(&fs_info->balance_lock); 3360 } 3361 3362 atomic_inc(&fs_info->balance_running); 3363 mutex_unlock(&fs_info->balance_mutex); 3364 3365 ret = __btrfs_balance(fs_info); 3366 3367 mutex_lock(&fs_info->balance_mutex); 3368 atomic_dec(&fs_info->balance_running); 3369 3370 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3371 fs_info->num_tolerated_disk_barrier_failures = 3372 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 3373 } 3374 3375 if (bargs) { 3376 memset(bargs, 0, sizeof(*bargs)); 3377 update_ioctl_balance_args(fs_info, 0, bargs); 3378 } 3379 3380 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 3381 balance_need_close(fs_info)) { 3382 __cancel_balance(fs_info); 3383 } 3384 3385 wake_up(&fs_info->balance_wait_q); 3386 3387 return ret; 3388 out: 3389 if (bctl->flags & BTRFS_BALANCE_RESUME) 3390 __cancel_balance(fs_info); 3391 else { 3392 kfree(bctl); 3393 atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 3394 } 3395 return ret; 3396 } 3397 3398 static int balance_kthread(void *data) 3399 { 3400 struct btrfs_fs_info *fs_info = data; 3401 int ret = 0; 3402 3403 mutex_lock(&fs_info->volume_mutex); 3404 mutex_lock(&fs_info->balance_mutex); 3405 3406 if (fs_info->balance_ctl) { 3407 btrfs_info(fs_info, "continuing balance"); 3408 ret = btrfs_balance(fs_info->balance_ctl, NULL); 3409 } 3410 3411 mutex_unlock(&fs_info->balance_mutex); 3412 mutex_unlock(&fs_info->volume_mutex); 3413 3414 return ret; 3415 } 3416 3417 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 3418 { 3419 struct task_struct *tsk; 3420 3421 spin_lock(&fs_info->balance_lock); 3422 if (!fs_info->balance_ctl) { 3423 spin_unlock(&fs_info->balance_lock); 3424 return 0; 3425 } 3426 spin_unlock(&fs_info->balance_lock); 3427 3428 if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { 3429 btrfs_info(fs_info, "force skipping balance"); 3430 return 0; 3431 } 3432 3433 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3434 return PTR_ERR_OR_ZERO(tsk); 3435 } 3436 3437 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 3438 { 3439 struct btrfs_balance_control *bctl; 3440 struct btrfs_balance_item *item; 3441 struct btrfs_disk_balance_args disk_bargs; 3442 struct btrfs_path *path; 3443 struct extent_buffer *leaf; 3444 struct btrfs_key key; 3445 int ret; 3446 3447 path = btrfs_alloc_path(); 3448 if (!path) 3449 return -ENOMEM; 3450 3451 key.objectid = BTRFS_BALANCE_OBJECTID; 3452 key.type = BTRFS_BALANCE_ITEM_KEY; 3453 key.offset = 0; 3454 3455 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 3456 if (ret < 0) 3457 goto out; 3458 if (ret > 0) { /* ret = -ENOENT; */ 3459 ret = 0; 3460 goto out; 3461 } 3462 3463 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 3464 if (!bctl) { 3465 ret = -ENOMEM; 3466 
goto out; 3467 } 3468 3469 leaf = path->nodes[0]; 3470 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3471 3472 bctl->fs_info = fs_info; 3473 bctl->flags = btrfs_balance_flags(leaf, item); 3474 bctl->flags |= BTRFS_BALANCE_RESUME; 3475 3476 btrfs_balance_data(leaf, item, &disk_bargs); 3477 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 3478 btrfs_balance_meta(leaf, item, &disk_bargs); 3479 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 3480 btrfs_balance_sys(leaf, item, &disk_bargs); 3481 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 3482 3483 WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); 3484 3485 mutex_lock(&fs_info->volume_mutex); 3486 mutex_lock(&fs_info->balance_mutex); 3487 3488 set_balance_control(bctl); 3489 3490 mutex_unlock(&fs_info->balance_mutex); 3491 mutex_unlock(&fs_info->volume_mutex); 3492 out: 3493 btrfs_free_path(path); 3494 return ret; 3495 } 3496 3497 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 3498 { 3499 int ret = 0; 3500 3501 mutex_lock(&fs_info->balance_mutex); 3502 if (!fs_info->balance_ctl) { 3503 mutex_unlock(&fs_info->balance_mutex); 3504 return -ENOTCONN; 3505 } 3506 3507 if (atomic_read(&fs_info->balance_running)) { 3508 atomic_inc(&fs_info->balance_pause_req); 3509 mutex_unlock(&fs_info->balance_mutex); 3510 3511 wait_event(fs_info->balance_wait_q, 3512 atomic_read(&fs_info->balance_running) == 0); 3513 3514 mutex_lock(&fs_info->balance_mutex); 3515 /* we are good with balance_ctl ripped off from under us */ 3516 BUG_ON(atomic_read(&fs_info->balance_running)); 3517 atomic_dec(&fs_info->balance_pause_req); 3518 } else { 3519 ret = -ENOTCONN; 3520 } 3521 3522 mutex_unlock(&fs_info->balance_mutex); 3523 return ret; 3524 } 3525 3526 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 3527 { 3528 if (fs_info->sb->s_flags & MS_RDONLY) 3529 return -EROFS; 3530 3531 mutex_lock(&fs_info->balance_mutex); 3532 if (!fs_info->balance_ctl) { 3533 mutex_unlock(&fs_info->balance_mutex); 3534 return -ENOTCONN; 3535 } 3536 3537 atomic_inc(&fs_info->balance_cancel_req); 3538 /* 3539 * if we are running just wait and return, balance item is 3540 * deleted in btrfs_balance in this case 3541 */ 3542 if (atomic_read(&fs_info->balance_running)) { 3543 mutex_unlock(&fs_info->balance_mutex); 3544 wait_event(fs_info->balance_wait_q, 3545 atomic_read(&fs_info->balance_running) == 0); 3546 mutex_lock(&fs_info->balance_mutex); 3547 } else { 3548 /* __cancel_balance needs volume_mutex */ 3549 mutex_unlock(&fs_info->balance_mutex); 3550 mutex_lock(&fs_info->volume_mutex); 3551 mutex_lock(&fs_info->balance_mutex); 3552 3553 if (fs_info->balance_ctl) 3554 __cancel_balance(fs_info); 3555 3556 mutex_unlock(&fs_info->volume_mutex); 3557 } 3558 3559 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); 3560 atomic_dec(&fs_info->balance_cancel_req); 3561 mutex_unlock(&fs_info->balance_mutex); 3562 return 0; 3563 } 3564 3565 static int btrfs_uuid_scan_kthread(void *data) 3566 { 3567 struct btrfs_fs_info *fs_info = data; 3568 struct btrfs_root *root = fs_info->tree_root; 3569 struct btrfs_key key; 3570 struct btrfs_key max_key; 3571 struct btrfs_path *path = NULL; 3572 int ret = 0; 3573 struct extent_buffer *eb; 3574 int slot; 3575 struct btrfs_root_item root_item; 3576 u32 item_size; 3577 struct btrfs_trans_handle *trans = NULL; 3578 3579 path = btrfs_alloc_path(); 3580 if (!path) { 3581 ret = -ENOMEM; 3582 goto out; 3583 } 3584 3585 key.objectid = 0; 3586 key.type = 
BTRFS_ROOT_ITEM_KEY; 3587 key.offset = 0; 3588 3589 max_key.objectid = (u64)-1; 3590 max_key.type = BTRFS_ROOT_ITEM_KEY; 3591 max_key.offset = (u64)-1; 3592 3593 path->keep_locks = 1; 3594 3595 while (1) { 3596 ret = btrfs_search_forward(root, &key, path, 0); 3597 if (ret) { 3598 if (ret > 0) 3599 ret = 0; 3600 break; 3601 } 3602 3603 if (key.type != BTRFS_ROOT_ITEM_KEY || 3604 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 3605 key.objectid != BTRFS_FS_TREE_OBJECTID) || 3606 key.objectid > BTRFS_LAST_FREE_OBJECTID) 3607 goto skip; 3608 3609 eb = path->nodes[0]; 3610 slot = path->slots[0]; 3611 item_size = btrfs_item_size_nr(eb, slot); 3612 if (item_size < sizeof(root_item)) 3613 goto skip; 3614 3615 read_extent_buffer(eb, &root_item, 3616 btrfs_item_ptr_offset(eb, slot), 3617 (int)sizeof(root_item)); 3618 if (btrfs_root_refs(&root_item) == 0) 3619 goto skip; 3620 3621 if (!btrfs_is_empty_uuid(root_item.uuid) || 3622 !btrfs_is_empty_uuid(root_item.received_uuid)) { 3623 if (trans) 3624 goto update_tree; 3625 3626 btrfs_release_path(path); 3627 /* 3628 * 1 - subvol uuid item 3629 * 1 - received_subvol uuid item 3630 */ 3631 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 3632 if (IS_ERR(trans)) { 3633 ret = PTR_ERR(trans); 3634 break; 3635 } 3636 continue; 3637 } else { 3638 goto skip; 3639 } 3640 update_tree: 3641 if (!btrfs_is_empty_uuid(root_item.uuid)) { 3642 ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, 3643 root_item.uuid, 3644 BTRFS_UUID_KEY_SUBVOL, 3645 key.objectid); 3646 if (ret < 0) { 3647 btrfs_warn(fs_info, "uuid_tree_add failed %d", 3648 ret); 3649 break; 3650 } 3651 } 3652 3653 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 3654 ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, 3655 root_item.received_uuid, 3656 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 3657 key.objectid); 3658 if (ret < 0) { 3659 btrfs_warn(fs_info, "uuid_tree_add failed %d", 3660 ret); 3661 break; 3662 } 3663 } 3664 3665 skip: 3666 if (trans) { 3667 ret = btrfs_end_transaction(trans, fs_info->uuid_root); 3668 trans = NULL; 3669 if (ret) 3670 break; 3671 } 3672 3673 btrfs_release_path(path); 3674 if (key.offset < (u64)-1) { 3675 key.offset++; 3676 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 3677 key.offset = 0; 3678 key.type = BTRFS_ROOT_ITEM_KEY; 3679 } else if (key.objectid < (u64)-1) { 3680 key.offset = 0; 3681 key.type = BTRFS_ROOT_ITEM_KEY; 3682 key.objectid++; 3683 } else { 3684 break; 3685 } 3686 cond_resched(); 3687 } 3688 3689 out: 3690 btrfs_free_path(path); 3691 if (trans && !IS_ERR(trans)) 3692 btrfs_end_transaction(trans, fs_info->uuid_root); 3693 if (ret) 3694 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); 3695 else 3696 fs_info->update_uuid_tree_gen = 1; 3697 up(&fs_info->uuid_tree_rescan_sem); 3698 return 0; 3699 } 3700 3701 /* 3702 * Callback for btrfs_uuid_tree_iterate(). 3703 * returns: 3704 * 0 check succeeded, the entry is not outdated. 3705 * < 0 if an error occurred. 3706 * > 0 if the check failed, which means the caller shall remove the entry.
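 * The check looks up the subvolume root named by @subid and compares @uuid
 * against root_item.uuid or root_item.received_uuid depending on @type; a
 * subvolume that no longer exists (-ENOENT) is likewise reported as outdated.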
3707 */ 3708 static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, 3709 u8 *uuid, u8 type, u64 subid) 3710 { 3711 struct btrfs_key key; 3712 int ret = 0; 3713 struct btrfs_root *subvol_root; 3714 3715 if (type != BTRFS_UUID_KEY_SUBVOL && 3716 type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) 3717 goto out; 3718 3719 key.objectid = subid; 3720 key.type = BTRFS_ROOT_ITEM_KEY; 3721 key.offset = (u64)-1; 3722 subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); 3723 if (IS_ERR(subvol_root)) { 3724 ret = PTR_ERR(subvol_root); 3725 if (ret == -ENOENT) 3726 ret = 1; 3727 goto out; 3728 } 3729 3730 switch (type) { 3731 case BTRFS_UUID_KEY_SUBVOL: 3732 if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) 3733 ret = 1; 3734 break; 3735 case BTRFS_UUID_KEY_RECEIVED_SUBVOL: 3736 if (memcmp(uuid, subvol_root->root_item.received_uuid, 3737 BTRFS_UUID_SIZE)) 3738 ret = 1; 3739 break; 3740 } 3741 3742 out: 3743 return ret; 3744 } 3745 3746 static int btrfs_uuid_rescan_kthread(void *data) 3747 { 3748 struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; 3749 int ret; 3750 3751 /* 3752 * 1st step is to iterate through the existing UUID tree and 3753 * to delete all entries that contain outdated data. 3754 * 2nd step is to add all missing entries to the UUID tree. 3755 */ 3756 ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); 3757 if (ret < 0) { 3758 btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); 3759 up(&fs_info->uuid_tree_rescan_sem); 3760 return ret; 3761 } 3762 return btrfs_uuid_scan_kthread(data); 3763 } 3764 3765 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 3766 { 3767 struct btrfs_trans_handle *trans; 3768 struct btrfs_root *tree_root = fs_info->tree_root; 3769 struct btrfs_root *uuid_root; 3770 struct task_struct *task; 3771 int ret; 3772 3773 /* 3774 * 1 - root node 3775 * 1 - root item 3776 */ 3777 trans = btrfs_start_transaction(tree_root, 2); 3778 if (IS_ERR(trans)) 3779 return PTR_ERR(trans); 3780 3781 uuid_root = btrfs_create_tree(trans, fs_info, 3782 BTRFS_UUID_TREE_OBJECTID); 3783 if (IS_ERR(uuid_root)) { 3784 btrfs_abort_transaction(trans, tree_root, 3785 PTR_ERR(uuid_root)); 3786 return PTR_ERR(uuid_root); 3787 } 3788 3789 fs_info->uuid_root = uuid_root; 3790 3791 ret = btrfs_commit_transaction(trans, tree_root); 3792 if (ret) 3793 return ret; 3794 3795 down(&fs_info->uuid_tree_rescan_sem); 3796 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 3797 if (IS_ERR(task)) { 3798 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 3799 btrfs_warn(fs_info, "failed to start uuid_scan task"); 3800 up(&fs_info->uuid_tree_rescan_sem); 3801 return PTR_ERR(task); 3802 } 3803 3804 return 0; 3805 } 3806 3807 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) 3808 { 3809 struct task_struct *task; 3810 3811 down(&fs_info->uuid_tree_rescan_sem); 3812 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); 3813 if (IS_ERR(task)) { 3814 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 3815 btrfs_warn(fs_info, "failed to start uuid_rescan task"); 3816 up(&fs_info->uuid_tree_rescan_sem); 3817 return PTR_ERR(task); 3818 } 3819 3820 return 0; 3821 } 3822 3823 /* 3824 * shrinking a device means finding all of the device extents past 3825 * the new size, and then following the back refs to the chunks. 
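 * Chunks that cannot be relocated right away (-ENOSPC) are counted and the
 * whole scan is retried once; if they still fail, the in-memory size change
 * is rolled back and -ENOSPC is returned.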
3826 * The chunk relocation code actually frees the device extent 3827 */ 3828 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 3829 { 3830 struct btrfs_trans_handle *trans; 3831 struct btrfs_root *root = device->dev_root; 3832 struct btrfs_dev_extent *dev_extent = NULL; 3833 struct btrfs_path *path; 3834 u64 length; 3835 u64 chunk_tree; 3836 u64 chunk_objectid; 3837 u64 chunk_offset; 3838 int ret; 3839 int slot; 3840 int failed = 0; 3841 bool retried = false; 3842 struct extent_buffer *l; 3843 struct btrfs_key key; 3844 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 3845 u64 old_total = btrfs_super_total_bytes(super_copy); 3846 u64 old_size = device->total_bytes; 3847 u64 diff = device->total_bytes - new_size; 3848 3849 if (device->is_tgtdev_for_dev_replace) 3850 return -EINVAL; 3851 3852 path = btrfs_alloc_path(); 3853 if (!path) 3854 return -ENOMEM; 3855 3856 path->reada = 2; 3857 3858 lock_chunks(root); 3859 3860 device->total_bytes = new_size; 3861 if (device->writeable) { 3862 device->fs_devices->total_rw_bytes -= diff; 3863 spin_lock(&root->fs_info->free_chunk_lock); 3864 root->fs_info->free_chunk_space -= diff; 3865 spin_unlock(&root->fs_info->free_chunk_lock); 3866 } 3867 unlock_chunks(root); 3868 3869 again: 3870 key.objectid = device->devid; 3871 key.offset = (u64)-1; 3872 key.type = BTRFS_DEV_EXTENT_KEY; 3873 3874 do { 3875 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3876 if (ret < 0) 3877 goto done; 3878 3879 ret = btrfs_previous_item(root, path, 0, key.type); 3880 if (ret < 0) 3881 goto done; 3882 if (ret) { 3883 ret = 0; 3884 btrfs_release_path(path); 3885 break; 3886 } 3887 3888 l = path->nodes[0]; 3889 slot = path->slots[0]; 3890 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 3891 3892 if (key.objectid != device->devid) { 3893 btrfs_release_path(path); 3894 break; 3895 } 3896 3897 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 3898 length = btrfs_dev_extent_length(l, dev_extent); 3899 3900 if (key.offset + length <= new_size) { 3901 btrfs_release_path(path); 3902 break; 3903 } 3904 3905 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 3906 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 3907 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 3908 btrfs_release_path(path); 3909 3910 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 3911 chunk_offset); 3912 if (ret && ret != -ENOSPC) 3913 goto done; 3914 if (ret == -ENOSPC) 3915 failed++; 3916 } while (key.offset-- > 0); 3917 3918 if (failed && !retried) { 3919 failed = 0; 3920 retried = true; 3921 goto again; 3922 } else if (failed && retried) { 3923 ret = -ENOSPC; 3924 lock_chunks(root); 3925 3926 device->total_bytes = old_size; 3927 if (device->writeable) 3928 device->fs_devices->total_rw_bytes += diff; 3929 spin_lock(&root->fs_info->free_chunk_lock); 3930 root->fs_info->free_chunk_space += diff; 3931 spin_unlock(&root->fs_info->free_chunk_lock); 3932 unlock_chunks(root); 3933 goto done; 3934 } 3935 3936 /* Shrinking succeeded, else we would be at "done". */ 3937 trans = btrfs_start_transaction(root, 0); 3938 if (IS_ERR(trans)) { 3939 ret = PTR_ERR(trans); 3940 goto done; 3941 } 3942 3943 lock_chunks(root); 3944 3945 device->disk_total_bytes = new_size; 3946 /* Now btrfs_update_device() will change the on-disk size. 
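 * The in-memory total_bytes was reduced before any chunks were relocated;
 * disk_total_bytes and the superblock total are only lowered here, once the
 * shrink is known to have succeeded.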
*/ 3947 ret = btrfs_update_device(trans, device); 3948 if (ret) { 3949 unlock_chunks(root); 3950 btrfs_end_transaction(trans, root); 3951 goto done; 3952 } 3953 WARN_ON(diff > old_total); 3954 btrfs_set_super_total_bytes(super_copy, old_total - diff); 3955 unlock_chunks(root); 3956 btrfs_end_transaction(trans, root); 3957 done: 3958 btrfs_free_path(path); 3959 return ret; 3960 } 3961 3962 static int btrfs_add_system_chunk(struct btrfs_root *root, 3963 struct btrfs_key *key, 3964 struct btrfs_chunk *chunk, int item_size) 3965 { 3966 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 3967 struct btrfs_disk_key disk_key; 3968 u32 array_size; 3969 u8 *ptr; 3970 3971 array_size = btrfs_super_sys_array_size(super_copy); 3972 if (array_size + item_size + sizeof(disk_key) 3973 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 3974 return -EFBIG; 3975 3976 ptr = super_copy->sys_chunk_array + array_size; 3977 btrfs_cpu_key_to_disk(&disk_key, key); 3978 memcpy(ptr, &disk_key, sizeof(disk_key)); 3979 ptr += sizeof(disk_key); 3980 memcpy(ptr, chunk, item_size); 3981 item_size += sizeof(disk_key); 3982 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 3983 return 0; 3984 } 3985 3986 /* 3987 * sort the devices in descending order by max_avail, total_avail 3988 */ 3989 static int btrfs_cmp_device_info(const void *a, const void *b) 3990 { 3991 const struct btrfs_device_info *di_a = a; 3992 const struct btrfs_device_info *di_b = b; 3993 3994 if (di_a->max_avail > di_b->max_avail) 3995 return -1; 3996 if (di_a->max_avail < di_b->max_avail) 3997 return 1; 3998 if (di_a->total_avail > di_b->total_avail) 3999 return -1; 4000 if (di_a->total_avail < di_b->total_avail) 4001 return 1; 4002 return 0; 4003 } 4004 4005 static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 4006 [BTRFS_RAID_RAID10] = { 4007 .sub_stripes = 2, 4008 .dev_stripes = 1, 4009 .devs_max = 0, /* 0 == as many as possible */ 4010 .devs_min = 4, 4011 .devs_increment = 2, 4012 .ncopies = 2, 4013 }, 4014 [BTRFS_RAID_RAID1] = { 4015 .sub_stripes = 1, 4016 .dev_stripes = 1, 4017 .devs_max = 2, 4018 .devs_min = 2, 4019 .devs_increment = 2, 4020 .ncopies = 2, 4021 }, 4022 [BTRFS_RAID_DUP] = { 4023 .sub_stripes = 1, 4024 .dev_stripes = 2, 4025 .devs_max = 1, 4026 .devs_min = 1, 4027 .devs_increment = 1, 4028 .ncopies = 2, 4029 }, 4030 [BTRFS_RAID_RAID0] = { 4031 .sub_stripes = 1, 4032 .dev_stripes = 1, 4033 .devs_max = 0, 4034 .devs_min = 2, 4035 .devs_increment = 1, 4036 .ncopies = 1, 4037 }, 4038 [BTRFS_RAID_SINGLE] = { 4039 .sub_stripes = 1, 4040 .dev_stripes = 1, 4041 .devs_max = 1, 4042 .devs_min = 1, 4043 .devs_increment = 1, 4044 .ncopies = 1, 4045 }, 4046 [BTRFS_RAID_RAID5] = { 4047 .sub_stripes = 1, 4048 .dev_stripes = 1, 4049 .devs_max = 0, 4050 .devs_min = 2, 4051 .devs_increment = 1, 4052 .ncopies = 2, 4053 }, 4054 [BTRFS_RAID_RAID6] = { 4055 .sub_stripes = 1, 4056 .dev_stripes = 1, 4057 .devs_max = 0, 4058 .devs_min = 3, 4059 .devs_increment = 1, 4060 .ncopies = 3, 4061 }, 4062 }; 4063 4064 static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) 4065 { 4066 /* TODO allow them to set a preferred stripe size */ 4067 return 64 * 1024; 4068 } 4069 4070 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 4071 { 4072 if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) 4073 return; 4074 4075 btrfs_set_fs_incompat(info, RAID56); 4076 } 4077 4078 #define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \ 4079 - sizeof(struct btrfs_item) \ 4080 - sizeof(struct 
btrfs_chunk)) \ 4081 / sizeof(struct btrfs_stripe) + 1) 4082 4083 #define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ 4084 - 2 * sizeof(struct btrfs_disk_key) \ 4085 - 2 * sizeof(struct btrfs_chunk)) \ 4086 / sizeof(struct btrfs_stripe) + 1) 4087 4088 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 4089 struct btrfs_root *extent_root, u64 start, 4090 u64 type) 4091 { 4092 struct btrfs_fs_info *info = extent_root->fs_info; 4093 struct btrfs_fs_devices *fs_devices = info->fs_devices; 4094 struct list_head *cur; 4095 struct map_lookup *map = NULL; 4096 struct extent_map_tree *em_tree; 4097 struct extent_map *em; 4098 struct btrfs_device_info *devices_info = NULL; 4099 u64 total_avail; 4100 int num_stripes; /* total number of stripes to allocate */ 4101 int data_stripes; /* number of stripes that count for 4102 block group size */ 4103 int sub_stripes; /* sub_stripes info for map */ 4104 int dev_stripes; /* stripes per dev */ 4105 int devs_max; /* max devs to use */ 4106 int devs_min; /* min devs needed */ 4107 int devs_increment; /* ndevs has to be a multiple of this */ 4108 int ncopies; /* how many copies to data has */ 4109 int ret; 4110 u64 max_stripe_size; 4111 u64 max_chunk_size; 4112 u64 stripe_size; 4113 u64 num_bytes; 4114 u64 raid_stripe_len = BTRFS_STRIPE_LEN; 4115 int ndevs; 4116 int i; 4117 int j; 4118 int index; 4119 4120 BUG_ON(!alloc_profile_is_valid(type, 0)); 4121 4122 if (list_empty(&fs_devices->alloc_list)) 4123 return -ENOSPC; 4124 4125 index = __get_raid_index(type); 4126 4127 sub_stripes = btrfs_raid_array[index].sub_stripes; 4128 dev_stripes = btrfs_raid_array[index].dev_stripes; 4129 devs_max = btrfs_raid_array[index].devs_max; 4130 devs_min = btrfs_raid_array[index].devs_min; 4131 devs_increment = btrfs_raid_array[index].devs_increment; 4132 ncopies = btrfs_raid_array[index].ncopies; 4133 4134 if (type & BTRFS_BLOCK_GROUP_DATA) { 4135 max_stripe_size = 1024 * 1024 * 1024; 4136 max_chunk_size = 10 * max_stripe_size; 4137 if (!devs_max) 4138 devs_max = BTRFS_MAX_DEVS(info->chunk_root); 4139 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 4140 /* for larger filesystems, use larger metadata chunks */ 4141 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) 4142 max_stripe_size = 1024 * 1024 * 1024; 4143 else 4144 max_stripe_size = 256 * 1024 * 1024; 4145 max_chunk_size = max_stripe_size; 4146 if (!devs_max) 4147 devs_max = BTRFS_MAX_DEVS(info->chunk_root); 4148 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 4149 max_stripe_size = 32 * 1024 * 1024; 4150 max_chunk_size = 2 * max_stripe_size; 4151 if (!devs_max) 4152 devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; 4153 } else { 4154 btrfs_err(info, "invalid chunk type 0x%llx requested", 4155 type); 4156 BUG_ON(1); 4157 } 4158 4159 /* we don't want a chunk larger than 10% of writeable space */ 4160 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 4161 max_chunk_size); 4162 4163 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, 4164 GFP_NOFS); 4165 if (!devices_info) 4166 return -ENOMEM; 4167 4168 cur = fs_devices->alloc_list.next; 4169 4170 /* 4171 * in the first pass through the devices list, we gather information 4172 * about the available holes on each device. 
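* Devices that are not writeable, not in the FS metadata, acting as a dev-replace target, or out of free space are skipped; for the rest, devices_info[] records the start and size of the largest free extent (max_avail) and the total free space, so the list can be sorted below.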
4173 */ 4174 ndevs = 0; 4175 while (cur != &fs_devices->alloc_list) { 4176 struct btrfs_device *device; 4177 u64 max_avail; 4178 u64 dev_offset; 4179 4180 device = list_entry(cur, struct btrfs_device, dev_alloc_list); 4181 4182 cur = cur->next; 4183 4184 if (!device->writeable) { 4185 WARN(1, KERN_ERR 4186 "BTRFS: read-only device in alloc_list\n"); 4187 continue; 4188 } 4189 4190 if (!device->in_fs_metadata || 4191 device->is_tgtdev_for_dev_replace) 4192 continue; 4193 4194 if (device->total_bytes > device->bytes_used) 4195 total_avail = device->total_bytes - device->bytes_used; 4196 else 4197 total_avail = 0; 4198 4199 /* If there is no space on this device, skip it. */ 4200 if (total_avail == 0) 4201 continue; 4202 4203 ret = find_free_dev_extent(trans, device, 4204 max_stripe_size * dev_stripes, 4205 &dev_offset, &max_avail); 4206 if (ret && ret != -ENOSPC) 4207 goto error; 4208 4209 if (ret == 0) 4210 max_avail = max_stripe_size * dev_stripes; 4211 4212 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) 4213 continue; 4214 4215 if (ndevs == fs_devices->rw_devices) { 4216 WARN(1, "%s: found more than %llu devices\n", 4217 __func__, fs_devices->rw_devices); 4218 break; 4219 } 4220 devices_info[ndevs].dev_offset = dev_offset; 4221 devices_info[ndevs].max_avail = max_avail; 4222 devices_info[ndevs].total_avail = total_avail; 4223 devices_info[ndevs].dev = device; 4224 ++ndevs; 4225 } 4226 4227 /* 4228 * now sort the devices by hole size / available space 4229 */ 4230 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 4231 btrfs_cmp_device_info, NULL); 4232 4233 /* round down to number of usable stripes */ 4234 ndevs -= ndevs % devs_increment; 4235 4236 if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { 4237 ret = -ENOSPC; 4238 goto error; 4239 } 4240 4241 if (devs_max && ndevs > devs_max) 4242 ndevs = devs_max; 4243 /* 4244 * the primary goal is to maximize the number of stripes, so use as many 4245 * devices as possible, even if the stripes are not maximum sized. 
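* devices_info[] was sorted by max_avail in descending order, so starting stripe_size from the last (smallest) selected entry guarantees that every chosen device can hold a stripe of that size.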
4246 */ 4247 stripe_size = devices_info[ndevs-1].max_avail; 4248 num_stripes = ndevs * dev_stripes; 4249 4250 /* 4251 * this will have to be fixed for RAID1 and RAID10 over 4252 * more drives 4253 */ 4254 data_stripes = num_stripes / ncopies; 4255 4256 if (type & BTRFS_BLOCK_GROUP_RAID5) { 4257 raid_stripe_len = find_raid56_stripe_len(ndevs - 1, 4258 btrfs_super_stripesize(info->super_copy)); 4259 data_stripes = num_stripes - 1; 4260 } 4261 if (type & BTRFS_BLOCK_GROUP_RAID6) { 4262 raid_stripe_len = find_raid56_stripe_len(ndevs - 2, 4263 btrfs_super_stripesize(info->super_copy)); 4264 data_stripes = num_stripes - 2; 4265 } 4266 4267 /* 4268 * Use the number of data stripes to figure out how big this chunk 4269 * is really going to be in terms of logical address space, 4270 * and compare that answer with the max chunk size 4271 */ 4272 if (stripe_size * data_stripes > max_chunk_size) { 4273 u64 mask = (1ULL << 24) - 1; 4274 stripe_size = max_chunk_size; 4275 do_div(stripe_size, data_stripes); 4276 4277 /* bump the answer up to a 16MB boundary */ 4278 stripe_size = (stripe_size + mask) & ~mask; 4279 4280 /* but don't go higher than the limits we found 4281 * while searching for free extents 4282 */ 4283 if (stripe_size > devices_info[ndevs-1].max_avail) 4284 stripe_size = devices_info[ndevs-1].max_avail; 4285 } 4286 4287 do_div(stripe_size, dev_stripes); 4288 4289 /* align to BTRFS_STRIPE_LEN */ 4290 do_div(stripe_size, raid_stripe_len); 4291 stripe_size *= raid_stripe_len; 4292 4293 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 4294 if (!map) { 4295 ret = -ENOMEM; 4296 goto error; 4297 } 4298 map->num_stripes = num_stripes; 4299 4300 for (i = 0; i < ndevs; ++i) { 4301 for (j = 0; j < dev_stripes; ++j) { 4302 int s = i * dev_stripes + j; 4303 map->stripes[s].dev = devices_info[i].dev; 4304 map->stripes[s].physical = devices_info[i].dev_offset + 4305 j * stripe_size; 4306 } 4307 } 4308 map->sector_size = extent_root->sectorsize; 4309 map->stripe_len = raid_stripe_len; 4310 map->io_align = raid_stripe_len; 4311 map->io_width = raid_stripe_len; 4312 map->type = type; 4313 map->sub_stripes = sub_stripes; 4314 4315 num_bytes = stripe_size * data_stripes; 4316 4317 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); 4318 4319 em = alloc_extent_map(); 4320 if (!em) { 4321 kfree(map); 4322 ret = -ENOMEM; 4323 goto error; 4324 } 4325 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 4326 em->bdev = (struct block_device *)map; 4327 em->start = start; 4328 em->len = num_bytes; 4329 em->block_start = 0; 4330 em->block_len = em->len; 4331 em->orig_block_len = stripe_size; 4332 4333 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 4334 write_lock(&em_tree->lock); 4335 ret = add_extent_mapping(em_tree, em, 0); 4336 if (!ret) { 4337 list_add_tail(&em->list, &trans->transaction->pending_chunks); 4338 atomic_inc(&em->refs); 4339 } 4340 write_unlock(&em_tree->lock); 4341 if (ret) { 4342 free_extent_map(em); 4343 goto error; 4344 } 4345 4346 ret = btrfs_make_block_group(trans, extent_root, 0, type, 4347 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 4348 start, num_bytes); 4349 if (ret) 4350 goto error_del_extent; 4351 4352 free_extent_map(em); 4353 check_raid56_incompat_flag(extent_root->fs_info, type); 4354 4355 kfree(devices_info); 4356 return 0; 4357 4358 error_del_extent: 4359 write_lock(&em_tree->lock); 4360 remove_extent_mapping(em_tree, em); 4361 write_unlock(&em_tree->lock); 4362 4363 /* One for our allocation */ 4364 free_extent_map(em); 4365 /* One for the tree reference */ 4366 
free_extent_map(em); 4367 error: 4368 kfree(devices_info); 4369 return ret; 4370 } 4371 4372 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, 4373 struct btrfs_root *extent_root, 4374 u64 chunk_offset, u64 chunk_size) 4375 { 4376 struct btrfs_key key; 4377 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 4378 struct btrfs_device *device; 4379 struct btrfs_chunk *chunk; 4380 struct btrfs_stripe *stripe; 4381 struct extent_map_tree *em_tree; 4382 struct extent_map *em; 4383 struct map_lookup *map; 4384 size_t item_size; 4385 u64 dev_offset; 4386 u64 stripe_size; 4387 int i = 0; 4388 int ret; 4389 4390 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 4391 read_lock(&em_tree->lock); 4392 em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size); 4393 read_unlock(&em_tree->lock); 4394 4395 if (!em) { 4396 btrfs_crit(extent_root->fs_info, "unable to find logical " 4397 "%Lu len %Lu", chunk_offset, chunk_size); 4398 return -EINVAL; 4399 } 4400 4401 if (em->start != chunk_offset || em->len != chunk_size) { 4402 btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted" 4403 " %Lu-%Lu, found %Lu-%Lu", chunk_offset, 4404 chunk_size, em->start, em->len); 4405 free_extent_map(em); 4406 return -EINVAL; 4407 } 4408 4409 map = (struct map_lookup *)em->bdev; 4410 item_size = btrfs_chunk_item_size(map->num_stripes); 4411 stripe_size = em->orig_block_len; 4412 4413 chunk = kzalloc(item_size, GFP_NOFS); 4414 if (!chunk) { 4415 ret = -ENOMEM; 4416 goto out; 4417 } 4418 4419 for (i = 0; i < map->num_stripes; i++) { 4420 device = map->stripes[i].dev; 4421 dev_offset = map->stripes[i].physical; 4422 4423 device->bytes_used += stripe_size; 4424 ret = btrfs_update_device(trans, device); 4425 if (ret) 4426 goto out; 4427 ret = btrfs_alloc_dev_extent(trans, device, 4428 chunk_root->root_key.objectid, 4429 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 4430 chunk_offset, dev_offset, 4431 stripe_size); 4432 if (ret) 4433 goto out; 4434 } 4435 4436 spin_lock(&extent_root->fs_info->free_chunk_lock); 4437 extent_root->fs_info->free_chunk_space -= (stripe_size * 4438 map->num_stripes); 4439 spin_unlock(&extent_root->fs_info->free_chunk_lock); 4440 4441 stripe = &chunk->stripe; 4442 for (i = 0; i < map->num_stripes; i++) { 4443 device = map->stripes[i].dev; 4444 dev_offset = map->stripes[i].physical; 4445 4446 btrfs_set_stack_stripe_devid(stripe, device->devid); 4447 btrfs_set_stack_stripe_offset(stripe, dev_offset); 4448 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 4449 stripe++; 4450 } 4451 4452 btrfs_set_stack_chunk_length(chunk, chunk_size); 4453 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 4454 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 4455 btrfs_set_stack_chunk_type(chunk, map->type); 4456 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 4457 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 4458 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 4459 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); 4460 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 4461 4462 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 4463 key.type = BTRFS_CHUNK_ITEM_KEY; 4464 key.offset = chunk_offset; 4465 4466 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 4467 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 4468 /* 4469 * TODO: Cleanup of inserted chunk root in case of 4470 * failure. 
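* The copy kept in the superblock's sys_chunk_array is what allows the chunk tree itself to be located at mount time, before any other tree can be read.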
4471 */ 4472 ret = btrfs_add_system_chunk(chunk_root, &key, chunk, 4473 item_size); 4474 } 4475 4476 out: 4477 kfree(chunk); 4478 free_extent_map(em); 4479 return ret; 4480 } 4481 4482 /* 4483 * Chunk allocation falls into two parts. The first part does the work 4484 * that makes the newly allocated chunk usable, but does not do any 4485 * operation that modifies the chunk tree. The second part does the work 4486 * that requires modifying the chunk tree. This division is important for 4487 * the bootstrap process of adding storage to a seed btrfs. 4488 */ 4489 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 4490 struct btrfs_root *extent_root, u64 type) 4491 { 4492 u64 chunk_offset; 4493 4494 chunk_offset = find_next_chunk(extent_root->fs_info); 4495 return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type); 4496 } 4497 4498 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 4499 struct btrfs_root *root, 4500 struct btrfs_device *device) 4501 { 4502 u64 chunk_offset; 4503 u64 sys_chunk_offset; 4504 u64 alloc_profile; 4505 struct btrfs_fs_info *fs_info = root->fs_info; 4506 struct btrfs_root *extent_root = fs_info->extent_root; 4507 int ret; 4508 4509 chunk_offset = find_next_chunk(fs_info); 4510 alloc_profile = btrfs_get_alloc_profile(extent_root, 0); 4511 ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset, 4512 alloc_profile); 4513 if (ret) 4514 return ret; 4515 4516 sys_chunk_offset = find_next_chunk(root->fs_info); 4517 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4518 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset, 4519 alloc_profile); 4520 if (ret) { 4521 btrfs_abort_transaction(trans, root, ret); 4522 goto out; 4523 } 4524 4525 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 4526 if (ret) 4527 btrfs_abort_transaction(trans, root, ret); 4528 out: 4529 return ret; 4530 } 4531 4532 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 4533 { 4534 struct extent_map *em; 4535 struct map_lookup *map; 4536 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 4537 int readonly = 0; 4538 int i; 4539 4540 read_lock(&map_tree->map_tree.lock); 4541 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 4542 read_unlock(&map_tree->map_tree.lock); 4543 if (!em) 4544 return 1; 4545 4546 if (btrfs_test_opt(root, DEGRADED)) { 4547 free_extent_map(em); 4548 return 0; 4549 } 4550 4551 map = (struct map_lookup *)em->bdev; 4552 for (i = 0; i < map->num_stripes; i++) { 4553 if (!map->stripes[i].dev->writeable) { 4554 readonly = 1; 4555 break; 4556 } 4557 } 4558 free_extent_map(em); 4559 return readonly; 4560 } 4561 4562 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 4563 { 4564 extent_map_tree_init(&tree->map_tree); 4565 } 4566 4567 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 4568 { 4569 struct extent_map *em; 4570 4571 while (1) { 4572 write_lock(&tree->map_tree.lock); 4573 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 4574 if (em) 4575 remove_extent_mapping(&tree->map_tree, em); 4576 write_unlock(&tree->map_tree.lock); 4577 if (!em) 4578 break; 4579 /* once for us */ 4580 free_extent_map(em); 4581 /* once for the tree */ 4582 free_extent_map(em); 4583 } 4584 } 4585 4586 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 4587 { 4588 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 4589 struct extent_map *em; 4590 struct map_lookup *map; 4591 struct extent_map_tree *em_tree = &map_tree->map_tree; 4592 int
ret; 4593 4594 read_lock(&em_tree->lock); 4595 em = lookup_extent_mapping(em_tree, logical, len); 4596 read_unlock(&em_tree->lock); 4597 4598 /* 4599 * We could return errors for these cases, but that could get ugly and 4600 * we'd probably do the same thing which is just not do anything else 4601 * and exit, so return 1 so the callers don't try to use other copies. 4602 */ 4603 if (!em) { 4604 btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical, 4605 logical+len); 4606 return 1; 4607 } 4608 4609 if (em->start > logical || em->start + em->len < logical) { 4610 btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got " 4611 "%Lu-%Lu", logical, logical+len, em->start, 4612 em->start + em->len); 4613 free_extent_map(em); 4614 return 1; 4615 } 4616 4617 map = (struct map_lookup *)em->bdev; 4618 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 4619 ret = map->num_stripes; 4620 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4621 ret = map->sub_stripes; 4622 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 4623 ret = 2; 4624 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 4625 ret = 3; 4626 else 4627 ret = 1; 4628 free_extent_map(em); 4629 4630 btrfs_dev_replace_lock(&fs_info->dev_replace); 4631 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) 4632 ret++; 4633 btrfs_dev_replace_unlock(&fs_info->dev_replace); 4634 4635 return ret; 4636 } 4637 4638 unsigned long btrfs_full_stripe_len(struct btrfs_root *root, 4639 struct btrfs_mapping_tree *map_tree, 4640 u64 logical) 4641 { 4642 struct extent_map *em; 4643 struct map_lookup *map; 4644 struct extent_map_tree *em_tree = &map_tree->map_tree; 4645 unsigned long len = root->sectorsize; 4646 4647 read_lock(&em_tree->lock); 4648 em = lookup_extent_mapping(em_tree, logical, len); 4649 read_unlock(&em_tree->lock); 4650 BUG_ON(!em); 4651 4652 BUG_ON(em->start > logical || em->start + em->len < logical); 4653 map = (struct map_lookup *)em->bdev; 4654 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4655 BTRFS_BLOCK_GROUP_RAID6)) { 4656 len = map->stripe_len * nr_data_stripes(map); 4657 } 4658 free_extent_map(em); 4659 return len; 4660 } 4661 4662 int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, 4663 u64 logical, u64 len, int mirror_num) 4664 { 4665 struct extent_map *em; 4666 struct map_lookup *map; 4667 struct extent_map_tree *em_tree = &map_tree->map_tree; 4668 int ret = 0; 4669 4670 read_lock(&em_tree->lock); 4671 em = lookup_extent_mapping(em_tree, logical, len); 4672 read_unlock(&em_tree->lock); 4673 BUG_ON(!em); 4674 4675 BUG_ON(em->start > logical || em->start + em->len < logical); 4676 map = (struct map_lookup *)em->bdev; 4677 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4678 BTRFS_BLOCK_GROUP_RAID6)) 4679 ret = 1; 4680 free_extent_map(em); 4681 return ret; 4682 } 4683 4684 static int find_live_mirror(struct btrfs_fs_info *fs_info, 4685 struct map_lookup *map, int first, int num, 4686 int optimal, int dev_replace_is_ongoing) 4687 { 4688 int i; 4689 int tolerance; 4690 struct btrfs_device *srcdev; 4691 4692 if (dev_replace_is_ongoing && 4693 fs_info->dev_replace.cont_reading_from_srcdev_mode == 4694 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 4695 srcdev = fs_info->dev_replace.srcdev; 4696 else 4697 srcdev = NULL; 4698 4699 /* 4700 * try to avoid the drive that is the source drive for a 4701 * dev-replace procedure, only choose it if no other non-missing 4702 * mirror is available 4703 */ 4704 for (tolerance = 0; tolerance < 2; tolerance++) { 4705 if (map->stripes[optimal].dev->bdev && 4706 (tolerance || 
map->stripes[optimal].dev != srcdev)) 4707 return optimal; 4708 for (i = first; i < first + num; i++) { 4709 if (map->stripes[i].dev->bdev && 4710 (tolerance || map->stripes[i].dev != srcdev)) 4711 return i; 4712 } 4713 } 4714 4715 /* we couldn't find one that doesn't fail. Just return something 4716 * and the io error handling code will clean up eventually 4717 */ 4718 return optimal; 4719 } 4720 4721 static inline int parity_smaller(u64 a, u64 b) 4722 { 4723 return a > b; 4724 } 4725 4726 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 4727 static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) 4728 { 4729 struct btrfs_bio_stripe s; 4730 int i; 4731 u64 l; 4732 int again = 1; 4733 4734 while (again) { 4735 again = 0; 4736 for (i = 0; i < bbio->num_stripes - 1; i++) { 4737 if (parity_smaller(raid_map[i], raid_map[i+1])) { 4738 s = bbio->stripes[i]; 4739 l = raid_map[i]; 4740 bbio->stripes[i] = bbio->stripes[i+1]; 4741 raid_map[i] = raid_map[i+1]; 4742 bbio->stripes[i+1] = s; 4743 raid_map[i+1] = l; 4744 again = 1; 4745 } 4746 } 4747 } 4748 } 4749 4750 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4751 u64 logical, u64 *length, 4752 struct btrfs_bio **bbio_ret, 4753 int mirror_num, u64 **raid_map_ret) 4754 { 4755 struct extent_map *em; 4756 struct map_lookup *map; 4757 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 4758 struct extent_map_tree *em_tree = &map_tree->map_tree; 4759 u64 offset; 4760 u64 stripe_offset; 4761 u64 stripe_end_offset; 4762 u64 stripe_nr; 4763 u64 stripe_nr_orig; 4764 u64 stripe_nr_end; 4765 u64 stripe_len; 4766 u64 *raid_map = NULL; 4767 int stripe_index; 4768 int i; 4769 int ret = 0; 4770 int num_stripes; 4771 int max_errors = 0; 4772 struct btrfs_bio *bbio = NULL; 4773 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 4774 int dev_replace_is_ongoing = 0; 4775 int num_alloc_stripes; 4776 int patch_the_first_stripe_for_dev_replace = 0; 4777 u64 physical_to_patch_in_first_stripe = 0; 4778 u64 raid56_full_stripe_start = (u64)-1; 4779 4780 read_lock(&em_tree->lock); 4781 em = lookup_extent_mapping(em_tree, logical, *length); 4782 read_unlock(&em_tree->lock); 4783 4784 if (!em) { 4785 btrfs_crit(fs_info, "unable to find logical %llu len %llu", 4786 logical, *length); 4787 return -EINVAL; 4788 } 4789 4790 if (em->start > logical || em->start + em->len < logical) { 4791 btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, " 4792 "found %Lu-%Lu", logical, em->start, 4793 em->start + em->len); 4794 free_extent_map(em); 4795 return -EINVAL; 4796 } 4797 4798 map = (struct map_lookup *)em->bdev; 4799 offset = logical - em->start; 4800 4801 stripe_len = map->stripe_len; 4802 stripe_nr = offset; 4803 /* 4804 * stripe_nr counts the total number of stripes we have to stride 4805 * to get to this block 4806 */ 4807 do_div(stripe_nr, stripe_len); 4808 4809 stripe_offset = stripe_nr * stripe_len; 4810 BUG_ON(offset < stripe_offset); 4811 4812 /* stripe_offset is the offset of this block in its stripe*/ 4813 stripe_offset = offset - stripe_offset; 4814 4815 /* if we're here for raid56, we need to know the stripe aligned start */ 4816 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { 4817 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 4818 raid56_full_stripe_start = offset; 4819 4820 /* allow a write of a full stripe, but make sure we don't 4821 * allow straddling of stripes 4822 */ 4823 do_div(raid56_full_stripe_start, full_stripe_len); 4824 
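/* do_div() left the full-stripe index in raid56_full_stripe_start; multiply back up to get the stripe-aligned byte offset */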
raid56_full_stripe_start *= full_stripe_len; 4825 } 4826 4827 if (rw & REQ_DISCARD) { 4828 /* we don't discard raid56 yet */ 4829 if (map->type & 4830 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { 4831 ret = -EOPNOTSUPP; 4832 goto out; 4833 } 4834 *length = min_t(u64, em->len - offset, *length); 4835 } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 4836 u64 max_len; 4837 /* For writes to RAID[56], allow a full stripeset across all disks. 4838 For other RAID types and for RAID[56] reads, just allow a single 4839 stripe (on a single disk). */ 4840 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && 4841 (rw & REQ_WRITE)) { 4842 max_len = stripe_len * nr_data_stripes(map) - 4843 (offset - raid56_full_stripe_start); 4844 } else { 4845 /* we limit the length of each bio to what fits in a stripe */ 4846 max_len = stripe_len - stripe_offset; 4847 } 4848 *length = min_t(u64, em->len - offset, max_len); 4849 } else { 4850 *length = em->len - offset; 4851 } 4852 4853 /* This is for when we're called from btrfs_merge_bio_hook() and all 4854 it cares about is the length */ 4855 if (!bbio_ret) 4856 goto out; 4857 4858 btrfs_dev_replace_lock(dev_replace); 4859 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 4860 if (!dev_replace_is_ongoing) 4861 btrfs_dev_replace_unlock(dev_replace); 4862 4863 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 4864 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && 4865 dev_replace->tgtdev != NULL) { 4866 /* 4867 * in dev-replace case, for repair case (that's the only 4868 * case where the mirror is selected explicitly when 4869 * calling btrfs_map_block), blocks left of the left cursor 4870 * can also be read from the target drive. 4871 * For REQ_GET_READ_MIRRORS, the target drive is added as 4872 * the last one to the array of stripes. For READ, it also 4873 * needs to be supported using the same mirror number. 4874 * If the requested block is not left of the left cursor, 4875 * EIO is returned. This can happen because btrfs_num_copies() 4876 * returns one more in the dev-replace case. 4877 */ 4878 u64 tmp_length = *length; 4879 struct btrfs_bio *tmp_bbio = NULL; 4880 int tmp_num_stripes; 4881 u64 srcdev_devid = dev_replace->srcdev->devid; 4882 int index_srcdev = 0; 4883 int found = 0; 4884 u64 physical_of_found = 0; 4885 4886 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, 4887 logical, &tmp_length, &tmp_bbio, 0, NULL); 4888 if (ret) { 4889 WARN_ON(tmp_bbio != NULL); 4890 goto out; 4891 } 4892 4893 tmp_num_stripes = tmp_bbio->num_stripes; 4894 if (mirror_num > tmp_num_stripes) { 4895 /* 4896 * REQ_GET_READ_MIRRORS does not contain this 4897 * mirror, that means that the requested area 4898 * is not left of the left cursor 4899 */ 4900 ret = -EIO; 4901 kfree(tmp_bbio); 4902 goto out; 4903 } 4904 4905 /* 4906 * process the rest of the function using the mirror_num 4907 * of the source drive. Therefore look it up first. 4908 * At the end, patch the device pointer to the one of the 4909 * target drive. 
4910 */ 4911 for (i = 0; i < tmp_num_stripes; i++) { 4912 if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { 4913 /* 4914 * In case of DUP, in order to keep it 4915 * simple, only add the mirror with the 4916 * lowest physical address 4917 */ 4918 if (found && 4919 physical_of_found <= 4920 tmp_bbio->stripes[i].physical) 4921 continue; 4922 index_srcdev = i; 4923 found = 1; 4924 physical_of_found = 4925 tmp_bbio->stripes[i].physical; 4926 } 4927 } 4928 4929 if (found) { 4930 mirror_num = index_srcdev + 1; 4931 patch_the_first_stripe_for_dev_replace = 1; 4932 physical_to_patch_in_first_stripe = physical_of_found; 4933 } else { 4934 WARN_ON(1); 4935 ret = -EIO; 4936 kfree(tmp_bbio); 4937 goto out; 4938 } 4939 4940 kfree(tmp_bbio); 4941 } else if (mirror_num > map->num_stripes) { 4942 mirror_num = 0; 4943 } 4944 4945 num_stripes = 1; 4946 stripe_index = 0; 4947 stripe_nr_orig = stripe_nr; 4948 stripe_nr_end = ALIGN(offset + *length, map->stripe_len); 4949 do_div(stripe_nr_end, map->stripe_len); 4950 stripe_end_offset = stripe_nr_end * map->stripe_len - 4951 (offset + *length); 4952 4953 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4954 if (rw & REQ_DISCARD) 4955 num_stripes = min_t(u64, map->num_stripes, 4956 stripe_nr_end - stripe_nr_orig); 4957 stripe_index = do_div(stripe_nr, map->num_stripes); 4958 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 4959 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) 4960 num_stripes = map->num_stripes; 4961 else if (mirror_num) 4962 stripe_index = mirror_num - 1; 4963 else { 4964 stripe_index = find_live_mirror(fs_info, map, 0, 4965 map->num_stripes, 4966 current->pid % map->num_stripes, 4967 dev_replace_is_ongoing); 4968 mirror_num = stripe_index + 1; 4969 } 4970 4971 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 4972 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) { 4973 num_stripes = map->num_stripes; 4974 } else if (mirror_num) { 4975 stripe_index = mirror_num - 1; 4976 } else { 4977 mirror_num = 1; 4978 } 4979 4980 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 4981 int factor = map->num_stripes / map->sub_stripes; 4982 4983 stripe_index = do_div(stripe_nr, factor); 4984 stripe_index *= map->sub_stripes; 4985 4986 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) 4987 num_stripes = map->sub_stripes; 4988 else if (rw & REQ_DISCARD) 4989 num_stripes = min_t(u64, map->sub_stripes * 4990 (stripe_nr_end - stripe_nr_orig), 4991 map->num_stripes); 4992 else if (mirror_num) 4993 stripe_index += mirror_num - 1; 4994 else { 4995 int old_stripe_index = stripe_index; 4996 stripe_index = find_live_mirror(fs_info, map, 4997 stripe_index, 4998 map->sub_stripes, stripe_index + 4999 current->pid % map->sub_stripes, 5000 dev_replace_is_ongoing); 5001 mirror_num = stripe_index - old_stripe_index + 1; 5002 } 5003 5004 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 5005 BTRFS_BLOCK_GROUP_RAID6)) { 5006 u64 tmp; 5007 5008 if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) 5009 && raid_map_ret) { 5010 int i, rot; 5011 5012 /* push stripe_nr back to the start of the full stripe */ 5013 stripe_nr = raid56_full_stripe_start; 5014 do_div(stripe_nr, stripe_len); 5015 5016 stripe_index = do_div(stripe_nr, nr_data_stripes(map)); 5017 5018 /* RAID[56] write or recovery. 
Return all stripes */ 5019 num_stripes = map->num_stripes; 5020 max_errors = nr_parity_stripes(map); 5021 5022 raid_map = kmalloc_array(num_stripes, sizeof(u64), 5023 GFP_NOFS); 5024 if (!raid_map) { 5025 ret = -ENOMEM; 5026 goto out; 5027 } 5028 5029 /* Work out the disk rotation on this stripe-set */ 5030 tmp = stripe_nr; 5031 rot = do_div(tmp, num_stripes); 5032 5033 /* Fill in the logical address of each stripe */ 5034 tmp = stripe_nr * nr_data_stripes(map); 5035 for (i = 0; i < nr_data_stripes(map); i++) 5036 raid_map[(i+rot) % num_stripes] = 5037 em->start + (tmp + i) * map->stripe_len; 5038 5039 raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 5040 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5041 raid_map[(i+rot+1) % num_stripes] = 5042 RAID6_Q_STRIPE; 5043 5044 *length = map->stripe_len; 5045 stripe_index = 0; 5046 stripe_offset = 0; 5047 } else { 5048 /* 5049 * Mirror #0 or #1 means the original data block. 5050 * Mirror #2 is RAID5 parity block. 5051 * Mirror #3 is RAID6 Q block. 5052 */ 5053 stripe_index = do_div(stripe_nr, nr_data_stripes(map)); 5054 if (mirror_num > 1) 5055 stripe_index = nr_data_stripes(map) + 5056 mirror_num - 2; 5057 5058 /* We distribute the parity blocks across stripes */ 5059 tmp = stripe_nr + stripe_index; 5060 stripe_index = do_div(tmp, map->num_stripes); 5061 } 5062 } else { 5063 /* 5064 * after this do_div call, stripe_nr is the number of stripes 5065 * on this device we have to walk to find the data, and 5066 * stripe_index is the number of our device in the stripe array 5067 */ 5068 stripe_index = do_div(stripe_nr, map->num_stripes); 5069 mirror_num = stripe_index + 1; 5070 } 5071 BUG_ON(stripe_index >= map->num_stripes); 5072 5073 num_alloc_stripes = num_stripes; 5074 if (dev_replace_is_ongoing) { 5075 if (rw & (REQ_WRITE | REQ_DISCARD)) 5076 num_alloc_stripes <<= 1; 5077 if (rw & REQ_GET_READ_MIRRORS) 5078 num_alloc_stripes++; 5079 } 5080 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); 5081 if (!bbio) { 5082 kfree(raid_map); 5083 ret = -ENOMEM; 5084 goto out; 5085 } 5086 atomic_set(&bbio->error, 0); 5087 5088 if (rw & REQ_DISCARD) { 5089 int factor = 0; 5090 int sub_stripes = 0; 5091 u64 stripes_per_dev = 0; 5092 u32 remaining_stripes = 0; 5093 u32 last_stripe = 0; 5094 5095 if (map->type & 5096 (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { 5097 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5098 sub_stripes = 1; 5099 else 5100 sub_stripes = map->sub_stripes; 5101 5102 factor = map->num_stripes / sub_stripes; 5103 stripes_per_dev = div_u64_rem(stripe_nr_end - 5104 stripe_nr_orig, 5105 factor, 5106 &remaining_stripes); 5107 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 5108 last_stripe *= sub_stripes; 5109 } 5110 5111 for (i = 0; i < num_stripes; i++) { 5112 bbio->stripes[i].physical = 5113 map->stripes[stripe_index].physical + 5114 stripe_offset + stripe_nr * map->stripe_len; 5115 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 5116 5117 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5118 BTRFS_BLOCK_GROUP_RAID10)) { 5119 bbio->stripes[i].length = stripes_per_dev * 5120 map->stripe_len; 5121 5122 if (i / sub_stripes < remaining_stripes) 5123 bbio->stripes[i].length += 5124 map->stripe_len; 5125 5126 /* 5127 * Special for the first stripe and 5128 * the last stripe: 5129 * 5130 * |-------|...|-------| 5131 * |----------| 5132 * off end_off 5133 */ 5134 if (i < sub_stripes) 5135 bbio->stripes[i].length -= 5136 stripe_offset; 5137 5138 if (stripe_index >= last_stripe && 5139 stripe_index <= (last_stripe + 5140 
sub_stripes - 1)) 5141 bbio->stripes[i].length -= 5142 stripe_end_offset; 5143 5144 if (i == sub_stripes - 1) 5145 stripe_offset = 0; 5146 } else 5147 bbio->stripes[i].length = *length; 5148 5149 stripe_index++; 5150 if (stripe_index == map->num_stripes) { 5151 /* This could only happen for RAID0/10 */ 5152 stripe_index = 0; 5153 stripe_nr++; 5154 } 5155 } 5156 } else { 5157 for (i = 0; i < num_stripes; i++) { 5158 bbio->stripes[i].physical = 5159 map->stripes[stripe_index].physical + 5160 stripe_offset + 5161 stripe_nr * map->stripe_len; 5162 bbio->stripes[i].dev = 5163 map->stripes[stripe_index].dev; 5164 stripe_index++; 5165 } 5166 } 5167 5168 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { 5169 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5170 BTRFS_BLOCK_GROUP_RAID10 | 5171 BTRFS_BLOCK_GROUP_RAID5 | 5172 BTRFS_BLOCK_GROUP_DUP)) { 5173 max_errors = 1; 5174 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { 5175 max_errors = 2; 5176 } 5177 } 5178 5179 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && 5180 dev_replace->tgtdev != NULL) { 5181 int index_where_to_add; 5182 u64 srcdev_devid = dev_replace->srcdev->devid; 5183 5184 /* 5185 * duplicate the write operations while the dev replace 5186 * procedure is running. Since the copying of the old disk 5187 * to the new disk takes place at run time while the 5188 * filesystem is mounted writable, the regular write 5189 * operations to the old disk have to be duplicated to go 5190 * to the new disk as well. 5191 * Note that device->missing is handled by the caller, and 5192 * that the write to the old disk is already set up in the 5193 * stripes array. 5194 */ 5195 index_where_to_add = num_stripes; 5196 for (i = 0; i < num_stripes; i++) { 5197 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5198 /* write to new disk, too */ 5199 struct btrfs_bio_stripe *new = 5200 bbio->stripes + index_where_to_add; 5201 struct btrfs_bio_stripe *old = 5202 bbio->stripes + i; 5203 5204 new->physical = old->physical; 5205 new->length = old->length; 5206 new->dev = dev_replace->tgtdev; 5207 index_where_to_add++; 5208 max_errors++; 5209 } 5210 } 5211 num_stripes = index_where_to_add; 5212 } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && 5213 dev_replace->tgtdev != NULL) { 5214 u64 srcdev_devid = dev_replace->srcdev->devid; 5215 int index_srcdev = 0; 5216 int found = 0; 5217 u64 physical_of_found = 0; 5218 5219 /* 5220 * During the dev-replace procedure, the target drive can 5221 * also be used to read data in case it is needed to repair 5222 * a corrupt block elsewhere. This is possible if the 5223 * requested area is left of the left cursor. In this area, 5224 * the target drive is a full copy of the source drive. 
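* The extra stripe for the target drive is only added below when the source stripe ends at or before cursor_left, i.e. that range has already been copied.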
5225 */ 5226 for (i = 0; i < num_stripes; i++) { 5227 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5228 /* 5229 * In case of DUP, in order to keep it 5230 * simple, only add the mirror with the 5231 * lowest physical address 5232 */ 5233 if (found && 5234 physical_of_found <= 5235 bbio->stripes[i].physical) 5236 continue; 5237 index_srcdev = i; 5238 found = 1; 5239 physical_of_found = bbio->stripes[i].physical; 5240 } 5241 } 5242 if (found) { 5243 u64 length = map->stripe_len; 5244 5245 if (physical_of_found + length <= 5246 dev_replace->cursor_left) { 5247 struct btrfs_bio_stripe *tgtdev_stripe = 5248 bbio->stripes + num_stripes; 5249 5250 tgtdev_stripe->physical = physical_of_found; 5251 tgtdev_stripe->length = 5252 bbio->stripes[index_srcdev].length; 5253 tgtdev_stripe->dev = dev_replace->tgtdev; 5254 5255 num_stripes++; 5256 } 5257 } 5258 } 5259 5260 *bbio_ret = bbio; 5261 bbio->num_stripes = num_stripes; 5262 bbio->max_errors = max_errors; 5263 bbio->mirror_num = mirror_num; 5264 5265 /* 5266 * this is the case that REQ_READ && dev_replace_is_ongoing && 5267 * mirror_num == num_stripes + 1 && dev_replace target drive is 5268 * available as a mirror 5269 */ 5270 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 5271 WARN_ON(num_stripes > 1); 5272 bbio->stripes[0].dev = dev_replace->tgtdev; 5273 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 5274 bbio->mirror_num = map->num_stripes + 1; 5275 } 5276 if (raid_map) { 5277 sort_parity_stripes(bbio, raid_map); 5278 *raid_map_ret = raid_map; 5279 } 5280 out: 5281 if (dev_replace_is_ongoing) 5282 btrfs_dev_replace_unlock(dev_replace); 5283 free_extent_map(em); 5284 return ret; 5285 } 5286 5287 int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 5288 u64 logical, u64 *length, 5289 struct btrfs_bio **bbio_ret, int mirror_num) 5290 { 5291 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 5292 mirror_num, NULL); 5293 } 5294 5295 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 5296 u64 chunk_start, u64 physical, u64 devid, 5297 u64 **logical, int *naddrs, int *stripe_len) 5298 { 5299 struct extent_map_tree *em_tree = &map_tree->map_tree; 5300 struct extent_map *em; 5301 struct map_lookup *map; 5302 u64 *buf; 5303 u64 bytenr; 5304 u64 length; 5305 u64 stripe_nr; 5306 u64 rmap_len; 5307 int i, j, nr = 0; 5308 5309 read_lock(&em_tree->lock); 5310 em = lookup_extent_mapping(em_tree, chunk_start, 1); 5311 read_unlock(&em_tree->lock); 5312 5313 if (!em) { 5314 printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n", 5315 chunk_start); 5316 return -EIO; 5317 } 5318 5319 if (em->start != chunk_start) { 5320 printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n", 5321 em->start, chunk_start); 5322 free_extent_map(em); 5323 return -EIO; 5324 } 5325 map = (struct map_lookup *)em->bdev; 5326 5327 length = em->len; 5328 rmap_len = map->stripe_len; 5329 5330 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5331 do_div(length, map->num_stripes / map->sub_stripes); 5332 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5333 do_div(length, map->num_stripes); 5334 else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 5335 BTRFS_BLOCK_GROUP_RAID6)) { 5336 do_div(length, nr_data_stripes(map)); 5337 rmap_len = map->stripe_len * nr_data_stripes(map); 5338 } 5339 5340 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 5341 BUG_ON(!buf); /* -ENOMEM */ 5342 5343 for (i = 0; i < map->num_stripes; i++) { 5344 if (devid && map->stripes[i].dev->devid != devid) 5345 continue; 5346 if (map->stripes[i].physical > 
physical || 5347 map->stripes[i].physical + length <= physical) 5348 continue; 5349 5350 stripe_nr = physical - map->stripes[i].physical; 5351 do_div(stripe_nr, map->stripe_len); 5352 5353 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5354 stripe_nr = stripe_nr * map->num_stripes + i; 5355 do_div(stripe_nr, map->sub_stripes); 5356 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5357 stripe_nr = stripe_nr * map->num_stripes + i; 5358 } /* else if RAID[56], multiply by nr_data_stripes(). 5359 * Alternatively, just use rmap_len below instead of 5360 * map->stripe_len */ 5361 5362 bytenr = chunk_start + stripe_nr * rmap_len; 5363 WARN_ON(nr >= map->num_stripes); 5364 for (j = 0; j < nr; j++) { 5365 if (buf[j] == bytenr) 5366 break; 5367 } 5368 if (j == nr) { 5369 WARN_ON(nr >= map->num_stripes); 5370 buf[nr++] = bytenr; 5371 } 5372 } 5373 5374 *logical = buf; 5375 *naddrs = nr; 5376 *stripe_len = rmap_len; 5377 5378 free_extent_map(em); 5379 return 0; 5380 } 5381 5382 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err) 5383 { 5384 if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED)) 5385 bio_endio_nodec(bio, err); 5386 else 5387 bio_endio(bio, err); 5388 kfree(bbio); 5389 } 5390 5391 static void btrfs_end_bio(struct bio *bio, int err) 5392 { 5393 struct btrfs_bio *bbio = bio->bi_private; 5394 struct btrfs_device *dev = bbio->stripes[0].dev; 5395 int is_orig_bio = 0; 5396 5397 if (err) { 5398 atomic_inc(&bbio->error); 5399 if (err == -EIO || err == -EREMOTEIO) { 5400 unsigned int stripe_index = 5401 btrfs_io_bio(bio)->stripe_index; 5402 5403 BUG_ON(stripe_index >= bbio->num_stripes); 5404 dev = bbio->stripes[stripe_index].dev; 5405 if (dev->bdev) { 5406 if (bio->bi_rw & WRITE) 5407 btrfs_dev_stat_inc(dev, 5408 BTRFS_DEV_STAT_WRITE_ERRS); 5409 else 5410 btrfs_dev_stat_inc(dev, 5411 BTRFS_DEV_STAT_READ_ERRS); 5412 if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH) 5413 btrfs_dev_stat_inc(dev, 5414 BTRFS_DEV_STAT_FLUSH_ERRS); 5415 btrfs_dev_stat_print_on_error(dev); 5416 } 5417 } 5418 } 5419 5420 if (bio == bbio->orig_bio) 5421 is_orig_bio = 1; 5422 5423 btrfs_bio_counter_dec(bbio->fs_info); 5424 5425 if (atomic_dec_and_test(&bbio->stripes_pending)) { 5426 if (!is_orig_bio) { 5427 bio_put(bio); 5428 bio = bbio->orig_bio; 5429 } 5430 5431 bio->bi_private = bbio->private; 5432 bio->bi_end_io = bbio->end_io; 5433 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 5434 /* only send an error to the higher layers if it is 5435 * beyond the tolerance of the btrfs bio 5436 */ 5437 if (atomic_read(&bbio->error) > bbio->max_errors) { 5438 err = -EIO; 5439 } else { 5440 /* 5441 * this bio is actually up to date, we didn't 5442 * go over the max number of errors 5443 */ 5444 set_bit(BIO_UPTODATE, &bio->bi_flags); 5445 err = 0; 5446 } 5447 5448 btrfs_end_bbio(bbio, bio, err); 5449 } else if (!is_orig_bio) { 5450 bio_put(bio); 5451 } 5452 } 5453 5454 /* 5455 * see run_scheduled_bios for a description of why bios are collected for 5456 * async submit. 5457 * 5458 * This will add one bio to the pending list for a device and make sure 5459 * the work struct is scheduled. 
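* Reads bypass the queue and are submitted directly; writes are put on the device's sync or regular pending list and the per-device worker is queued unless it is already running.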
5460 */ 5461 static noinline void btrfs_schedule_bio(struct btrfs_root *root, 5462 struct btrfs_device *device, 5463 int rw, struct bio *bio) 5464 { 5465 int should_queue = 1; 5466 struct btrfs_pending_bios *pending_bios; 5467 5468 if (device->missing || !device->bdev) { 5469 bio_endio(bio, -EIO); 5470 return; 5471 } 5472 5473 /* don't bother with additional async steps for reads, right now */ 5474 if (!(rw & REQ_WRITE)) { 5475 bio_get(bio); 5476 btrfsic_submit_bio(rw, bio); 5477 bio_put(bio); 5478 return; 5479 } 5480 5481 /* 5482 * nr_async_bios allows us to reliably return congestion to the 5483 * higher layers. Otherwise, the async bio makes it appear we have 5484 * made progress against dirty pages when we've really just put it 5485 * on a queue for later 5486 */ 5487 atomic_inc(&root->fs_info->nr_async_bios); 5488 WARN_ON(bio->bi_next); 5489 bio->bi_next = NULL; 5490 bio->bi_rw |= rw; 5491 5492 spin_lock(&device->io_lock); 5493 if (bio->bi_rw & REQ_SYNC) 5494 pending_bios = &device->pending_sync_bios; 5495 else 5496 pending_bios = &device->pending_bios; 5497 5498 if (pending_bios->tail) 5499 pending_bios->tail->bi_next = bio; 5500 5501 pending_bios->tail = bio; 5502 if (!pending_bios->head) 5503 pending_bios->head = bio; 5504 if (device->running_pending) 5505 should_queue = 0; 5506 5507 spin_unlock(&device->io_lock); 5508 5509 if (should_queue) 5510 btrfs_queue_work(root->fs_info->submit_workers, 5511 &device->work); 5512 } 5513 5514 static int bio_size_ok(struct block_device *bdev, struct bio *bio, 5515 sector_t sector) 5516 { 5517 struct bio_vec *prev; 5518 struct request_queue *q = bdev_get_queue(bdev); 5519 unsigned int max_sectors = queue_max_sectors(q); 5520 struct bvec_merge_data bvm = { 5521 .bi_bdev = bdev, 5522 .bi_sector = sector, 5523 .bi_rw = bio->bi_rw, 5524 }; 5525 5526 if (WARN_ON(bio->bi_vcnt == 0)) 5527 return 1; 5528 5529 prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; 5530 if (bio_sectors(bio) > max_sectors) 5531 return 0; 5532 5533 if (!q->merge_bvec_fn) 5534 return 1; 5535 5536 bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len; 5537 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) 5538 return 0; 5539 return 1; 5540 } 5541 5542 static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, 5543 struct bio *bio, u64 physical, int dev_nr, 5544 int rw, int async) 5545 { 5546 struct btrfs_device *dev = bbio->stripes[dev_nr].dev; 5547 5548 bio->bi_private = bbio; 5549 btrfs_io_bio(bio)->stripe_index = dev_nr; 5550 bio->bi_end_io = btrfs_end_bio; 5551 bio->bi_iter.bi_sector = physical >> 9; 5552 #ifdef DEBUG 5553 { 5554 struct rcu_string *name; 5555 5556 rcu_read_lock(); 5557 name = rcu_dereference(dev->name); 5558 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " 5559 "(%s id %llu), size=%u\n", rw, 5560 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, 5561 name->str, dev->devid, bio->bi_size); 5562 rcu_read_unlock(); 5563 } 5564 #endif 5565 bio->bi_bdev = dev->bdev; 5566 5567 btrfs_bio_counter_inc_noblocked(root->fs_info); 5568 5569 if (async) 5570 btrfs_schedule_bio(root, dev, rw, bio); 5571 else 5572 btrfsic_submit_bio(rw, bio); 5573 } 5574 5575 static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, 5576 struct bio *first_bio, struct btrfs_device *dev, 5577 int dev_nr, int rw, int async) 5578 { 5579 struct bio_vec *bvec = first_bio->bi_io_vec; 5580 struct bio *bio; 5581 int nr_vecs = bio_get_nr_vecs(dev->bdev); 5582 u64 physical = bbio->stripes[dev_nr].physical; 5583 5584 again: 5585 bio = btrfs_bio_alloc(dev->bdev, physical 
>> 9, nr_vecs, GFP_NOFS); 5586 if (!bio) 5587 return -ENOMEM; 5588 5589 while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { 5590 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, 5591 bvec->bv_offset) < bvec->bv_len) { 5592 u64 len = bio->bi_iter.bi_size; 5593 5594 atomic_inc(&bbio->stripes_pending); 5595 submit_stripe_bio(root, bbio, bio, physical, dev_nr, 5596 rw, async); 5597 physical += len; 5598 goto again; 5599 } 5600 bvec++; 5601 } 5602 5603 submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async); 5604 return 0; 5605 } 5606 5607 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 5608 { 5609 atomic_inc(&bbio->error); 5610 if (atomic_dec_and_test(&bbio->stripes_pending)) { 5611 /* Should be the original bio. */ 5612 WARN_ON(bio != bbio->orig_bio); 5613 5614 bio->bi_private = bbio->private; 5615 bio->bi_end_io = bbio->end_io; 5616 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 5617 bio->bi_iter.bi_sector = logical >> 9; 5618 5619 btrfs_end_bbio(bbio, bio, -EIO); 5620 } 5621 } 5622 5623 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 5624 int mirror_num, int async_submit) 5625 { 5626 struct btrfs_device *dev; 5627 struct bio *first_bio = bio; 5628 u64 logical = (u64)bio->bi_iter.bi_sector << 9; 5629 u64 length = 0; 5630 u64 map_length; 5631 u64 *raid_map = NULL; 5632 int ret; 5633 int dev_nr = 0; 5634 int total_devs = 1; 5635 struct btrfs_bio *bbio = NULL; 5636 5637 length = bio->bi_iter.bi_size; 5638 map_length = length; 5639 5640 btrfs_bio_counter_inc_blocked(root->fs_info); 5641 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5642 mirror_num, &raid_map); 5643 if (ret) { 5644 btrfs_bio_counter_dec(root->fs_info); 5645 return ret; 5646 } 5647 5648 total_devs = bbio->num_stripes; 5649 bbio->orig_bio = first_bio; 5650 bbio->private = first_bio->bi_private; 5651 bbio->end_io = first_bio->bi_end_io; 5652 bbio->fs_info = root->fs_info; 5653 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 5654 5655 if (raid_map) { 5656 /* In this case, map_length has been set to the length of 5657 a single stripe, not the whole write */ 5658 if (rw & WRITE) { 5659 ret = raid56_parity_write(root, bio, bbio, 5660 raid_map, map_length); 5661 } else { 5662 ret = raid56_parity_recover(root, bio, bbio, 5663 raid_map, map_length, 5664 mirror_num); 5665 } 5666 /* 5667 * FIXME: replace doesn't support raid56 yet, please fix 5668 * it in the future. 5669 */ 5670 btrfs_bio_counter_dec(root->fs_info); 5671 return ret; 5672 } 5673 5674 if (map_length < length) { 5675 btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu", 5676 logical, length, map_length); 5677 BUG(); 5678 } 5679 5680 while (dev_nr < total_devs) { 5681 dev = bbio->stripes[dev_nr].dev; 5682 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { 5683 bbio_error(bbio, first_bio, logical); 5684 dev_nr++; 5685 continue; 5686 } 5687 5688 /* 5689 * Check and see if we're ok with this bio based on its size 5690 * and offset with the given device.
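* bio_size_ok() checks the queue limits (max_sectors and any merge_bvec_fn); if the bio does not fit, breakup_stripe_bio() re-submits it in smaller pieces.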
5691 */ 5692 if (!bio_size_ok(dev->bdev, first_bio, 5693 bbio->stripes[dev_nr].physical >> 9)) { 5694 ret = breakup_stripe_bio(root, bbio, first_bio, dev, 5695 dev_nr, rw, async_submit); 5696 BUG_ON(ret); 5697 dev_nr++; 5698 continue; 5699 } 5700 5701 if (dev_nr < total_devs - 1) { 5702 bio = btrfs_bio_clone(first_bio, GFP_NOFS); 5703 BUG_ON(!bio); /* -ENOMEM */ 5704 } else { 5705 bio = first_bio; 5706 bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED; 5707 } 5708 5709 submit_stripe_bio(root, bbio, bio, 5710 bbio->stripes[dev_nr].physical, dev_nr, rw, 5711 async_submit); 5712 dev_nr++; 5713 } 5714 btrfs_bio_counter_dec(root->fs_info); 5715 return 0; 5716 } 5717 5718 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 5719 u8 *uuid, u8 *fsid) 5720 { 5721 struct btrfs_device *device; 5722 struct btrfs_fs_devices *cur_devices; 5723 5724 cur_devices = fs_info->fs_devices; 5725 while (cur_devices) { 5726 if (!fsid || 5727 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 5728 device = __find_device(&cur_devices->devices, 5729 devid, uuid); 5730 if (device) 5731 return device; 5732 } 5733 cur_devices = cur_devices->seed; 5734 } 5735 return NULL; 5736 } 5737 5738 static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 5739 u64 devid, u8 *dev_uuid) 5740 { 5741 struct btrfs_device *device; 5742 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 5743 5744 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 5745 if (IS_ERR(device)) 5746 return NULL; 5747 5748 list_add(&device->dev_list, &fs_devices->devices); 5749 device->fs_devices = fs_devices; 5750 fs_devices->num_devices++; 5751 5752 device->missing = 1; 5753 fs_devices->missing_devices++; 5754 5755 return device; 5756 } 5757 5758 /** 5759 * btrfs_alloc_device - allocate struct btrfs_device 5760 * @fs_info: used only for generating a new devid, can be NULL if 5761 * devid is provided (i.e. @devid != NULL). 5762 * @devid: a pointer to devid for this device. If NULL a new devid 5763 * is generated. 5764 * @uuid: a pointer to UUID for this device. If NULL a new UUID 5765 * is generated. 5766 * 5767 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 5768 * on error. Returned struct is not linked onto any lists and can be 5769 * destroyed with kfree() right away. 
5770 */ 5771 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 5772 const u64 *devid, 5773 const u8 *uuid) 5774 { 5775 struct btrfs_device *dev; 5776 u64 tmp; 5777 5778 if (WARN_ON(!devid && !fs_info)) 5779 return ERR_PTR(-EINVAL); 5780 5781 dev = __alloc_device(); 5782 if (IS_ERR(dev)) 5783 return dev; 5784 5785 if (devid) 5786 tmp = *devid; 5787 else { 5788 int ret; 5789 5790 ret = find_next_devid(fs_info, &tmp); 5791 if (ret) { 5792 kfree(dev); 5793 return ERR_PTR(ret); 5794 } 5795 } 5796 dev->devid = tmp; 5797 5798 if (uuid) 5799 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 5800 else 5801 generate_random_uuid(dev->uuid); 5802 5803 btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL); 5804 5805 return dev; 5806 } 5807 5808 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, 5809 struct extent_buffer *leaf, 5810 struct btrfs_chunk *chunk) 5811 { 5812 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 5813 struct map_lookup *map; 5814 struct extent_map *em; 5815 u64 logical; 5816 u64 length; 5817 u64 devid; 5818 u8 uuid[BTRFS_UUID_SIZE]; 5819 int num_stripes; 5820 int ret; 5821 int i; 5822 5823 logical = key->offset; 5824 length = btrfs_chunk_length(leaf, chunk); 5825 5826 read_lock(&map_tree->map_tree.lock); 5827 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 5828 read_unlock(&map_tree->map_tree.lock); 5829 5830 /* already mapped? */ 5831 if (em && em->start <= logical && em->start + em->len > logical) { 5832 free_extent_map(em); 5833 return 0; 5834 } else if (em) { 5835 free_extent_map(em); 5836 } 5837 5838 em = alloc_extent_map(); 5839 if (!em) 5840 return -ENOMEM; 5841 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 5842 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 5843 if (!map) { 5844 free_extent_map(em); 5845 return -ENOMEM; 5846 } 5847 5848 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 5849 em->bdev = (struct block_device *)map; 5850 em->start = logical; 5851 em->len = length; 5852 em->orig_start = 0; 5853 em->block_start = 0; 5854 em->block_len = em->len; 5855 5856 map->num_stripes = num_stripes; 5857 map->io_width = btrfs_chunk_io_width(leaf, chunk); 5858 map->io_align = btrfs_chunk_io_align(leaf, chunk); 5859 map->sector_size = btrfs_chunk_sector_size(leaf, chunk); 5860 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 5861 map->type = btrfs_chunk_type(leaf, chunk); 5862 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 5863 for (i = 0; i < num_stripes; i++) { 5864 map->stripes[i].physical = 5865 btrfs_stripe_offset_nr(leaf, chunk, i); 5866 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 5867 read_extent_buffer(leaf, uuid, (unsigned long) 5868 btrfs_stripe_dev_uuid_nr(chunk, i), 5869 BTRFS_UUID_SIZE); 5870 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, 5871 uuid, NULL); 5872 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 5873 free_extent_map(em); 5874 return -EIO; 5875 } 5876 if (!map->stripes[i].dev) { 5877 map->stripes[i].dev = 5878 add_missing_dev(root, devid, uuid); 5879 if (!map->stripes[i].dev) { 5880 free_extent_map(em); 5881 return -EIO; 5882 } 5883 } 5884 map->stripes[i].dev->in_fs_metadata = 1; 5885 } 5886 5887 write_lock(&map_tree->map_tree.lock); 5888 ret = add_extent_mapping(&map_tree->map_tree, em, 0); 5889 write_unlock(&map_tree->map_tree.lock); 5890 BUG_ON(ret); /* Tree corruption */ 5891 free_extent_map(em); 5892 5893 return 0; 5894 } 5895 5896 static void fill_device_from_item(struct extent_buffer *leaf, 5897 struct btrfs_dev_item 
*dev_item, 5898 struct btrfs_device *device) 5899 { 5900 unsigned long ptr; 5901 5902 device->devid = btrfs_device_id(leaf, dev_item); 5903 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 5904 device->total_bytes = device->disk_total_bytes; 5905 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 5906 device->type = btrfs_device_type(leaf, dev_item); 5907 device->io_align = btrfs_device_io_align(leaf, dev_item); 5908 device->io_width = btrfs_device_io_width(leaf, dev_item); 5909 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 5910 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 5911 device->is_tgtdev_for_dev_replace = 0; 5912 5913 ptr = btrfs_device_uuid(dev_item); 5914 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 5915 } 5916 5917 static int open_seed_devices(struct btrfs_root *root, u8 *fsid) 5918 { 5919 struct btrfs_fs_devices *fs_devices; 5920 int ret; 5921 5922 BUG_ON(!mutex_is_locked(&uuid_mutex)); 5923 5924 fs_devices = root->fs_info->fs_devices->seed; 5925 while (fs_devices) { 5926 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 5927 ret = 0; 5928 goto out; 5929 } 5930 fs_devices = fs_devices->seed; 5931 } 5932 5933 fs_devices = find_fsid(fsid); 5934 if (!fs_devices) { 5935 ret = -ENOENT; 5936 goto out; 5937 } 5938 5939 fs_devices = clone_fs_devices(fs_devices); 5940 if (IS_ERR(fs_devices)) { 5941 ret = PTR_ERR(fs_devices); 5942 goto out; 5943 } 5944 5945 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 5946 root->fs_info->bdev_holder); 5947 if (ret) { 5948 free_fs_devices(fs_devices); 5949 goto out; 5950 } 5951 5952 if (!fs_devices->seeding) { 5953 __btrfs_close_devices(fs_devices); 5954 free_fs_devices(fs_devices); 5955 ret = -EINVAL; 5956 goto out; 5957 } 5958 5959 fs_devices->seed = root->fs_info->fs_devices->seed; 5960 root->fs_info->fs_devices->seed = fs_devices; 5961 out: 5962 return ret; 5963 } 5964 5965 static int read_one_dev(struct btrfs_root *root, 5966 struct extent_buffer *leaf, 5967 struct btrfs_dev_item *dev_item) 5968 { 5969 struct btrfs_device *device; 5970 u64 devid; 5971 int ret; 5972 u8 fs_uuid[BTRFS_UUID_SIZE]; 5973 u8 dev_uuid[BTRFS_UUID_SIZE]; 5974 5975 devid = btrfs_device_id(leaf, dev_item); 5976 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 5977 BTRFS_UUID_SIZE); 5978 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 5979 BTRFS_UUID_SIZE); 5980 5981 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 5982 ret = open_seed_devices(root, fs_uuid); 5983 if (ret && !btrfs_test_opt(root, DEGRADED)) 5984 return ret; 5985 } 5986 5987 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); 5988 if (!device || !device->bdev) { 5989 if (!btrfs_test_opt(root, DEGRADED)) 5990 return -EIO; 5991 5992 if (!device) { 5993 btrfs_warn(root->fs_info, "devid %llu missing", devid); 5994 device = add_missing_dev(root, devid, dev_uuid); 5995 if (!device) 5996 return -ENOMEM; 5997 } else if (!device->missing) { 5998 /* 5999 * this happens when a device that was properly setup 6000 * in the device info lists suddenly goes bad. 
			 * device->bdev is NULL, and so we have to set
			 * device->missing to one here.
			 */
			root->fs_info->fs_devices->missing_devices++;
			device->missing = 1;
		}
	}

	if (device->fs_devices != root->fs_info->fs_devices) {
		BUG_ON(device->writeable);
		if (device->generation !=
		    btrfs_device_generation(leaf, dev_item))
			return -EINVAL;
	}

	fill_device_from_item(leaf, dev_item, device);
	device->in_fs_metadata = 1;
	if (device->writeable && !device->is_tgtdev_for_dev_replace) {
		device->fs_devices->total_rw_bytes += device->total_bytes;
		spin_lock(&root->fs_info->free_chunk_lock);
		root->fs_info->free_chunk_space += device->total_bytes -
			device->bytes_used;
		spin_unlock(&root->fs_info->free_chunk_lock);
	}
	ret = 0;
	return ret;
}

/*
 * Read the chunks stored in the superblock's sys_chunk_array; these
 * describe the system chunks needed before the chunk tree itself can
 * be read.
 */
int btrfs_read_sys_array(struct btrfs_root *root)
{
	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
	struct extent_buffer *sb;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	unsigned long sb_ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
					  BTRFS_SUPER_INFO_SIZE);
	if (!sb)
		return -ENOMEM;
	btrfs_set_buffer_uptodate(sb);
	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
	/*
	 * The sb extent buffer is artificial and just used to read the
	 * system array.  The btrfs_set_buffer_uptodate() call does not
	 * properly mark all of its pages up-to-date when the page is
	 * larger: the extent does not cover the whole page, so
	 * check_page_uptodate does not find all the page's extents
	 * up-to-date (the hole beyond sb), and write_extent_buffer then
	 * triggers a WARN_ON.
	 *
	 * Regular short extents go through the
	 * mark_extent_buffer_dirty/writeback cycle, but sb spans only this
	 * function.  Add an explicit SetPageUptodate call to silence the
	 * warning, e.g. on PowerPC 64.
	 */
	if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
		SetPageUptodate(sb->pages[0]);

	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);
		ptr += len;
		sb_ptr += len;
		cur += len;

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)sb_ptr;
			ret = read_one_chunk(root, &key, sb, chunk);
			if (ret)
				break;
			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
			len = btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		ptr += len;
		sb_ptr += len;
		cur += len;
	}
	free_extent_buffer(sb);
	return ret;
}

int btrfs_read_chunk_tree(struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	mutex_lock(&uuid_mutex);
	lock_chunks(root);

	/*
	 * Read all device items, and then all the chunk items. All
	 * device items are found before any chunk item (their object id
	 * is smaller than the lowest possible object id for a chunk
	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
			struct btrfs_dev_item *dev_item;
			dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
			ret = read_one_dev(root, leaf, dev_item);
			if (ret)
				goto error;
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(root, &found_key, leaf, chunk);
			if (ret)
				goto error;
		}
		path->slots[0]++;
	}
	ret = 0;
error:
	unlock_chunks(root);
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}

void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;

	while (fs_devices) {
		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry(device, &fs_devices->devices, dev_list)
			device->dev_root = fs_info->dev_root;
		mutex_unlock(&fs_devices->device_list_mutex);

		fs_devices = fs_devices->seed;
	}
}

static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_dev_stat_reset(dev, i);
}
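/*
 * Note on the on-disk format handled below (sketch; see the
 * btrfs_dev_stats_item definition in ctree.h): device statistics live
 * in the device tree as BTRFS_DEV_STATS_KEY items with objectid 0 and
 * offset equal to the devid, the payload being an array of __le64
 * counters.  btrfs_init_dev_stats() tolerates items shorter than
 * BTRFS_DEV_STAT_VALUES_MAX entries, e.g. items written by an older
 * kernel that knew fewer counters; the missing counters are simply
 * reset.
 */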
/*
 * Load the persisted I/O error statistics for every device, resetting
 * the in-memory counters for devices that have no dev_stats item yet.
 */
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct extent_buffer *eb;
	int slot;
	int ret = 0;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int i;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		int item_size;
		struct btrfs_dev_stats_item *ptr;

		key.objectid = 0;
		key.type = BTRFS_DEV_STATS_KEY;
		key.offset = device->devid;
		ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
		if (ret) {
			__btrfs_reset_dev_stats(device);
			device->dev_stats_valid = 1;
			btrfs_release_path(path);
			continue;
		}
		slot = path->slots[0];
		eb = path->nodes[0];
		btrfs_item_key_to_cpu(eb, &found_key, slot);
		item_size = btrfs_item_size_nr(eb, slot);

		ptr = btrfs_item_ptr(eb, slot,
				     struct btrfs_dev_stats_item);

		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (item_size >= (1 + i) * sizeof(__le64))
				btrfs_dev_stat_set(device, i,
					btrfs_dev_stats_value(eb, ptr, i));
			else
				btrfs_dev_stat_reset(device, i);
		}

		device->dev_stats_valid = 1;
		btrfs_dev_stat_print_on_load(device);
		btrfs_release_path(path);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}

/*
 * Write the in-memory error counters of @device to its dev_stats item,
 * re-creating the item at full size if the existing one is too small.
 */
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_root *dev_root,
				struct btrfs_device *device)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = 0;
	key.type = BTRFS_DEV_STATS_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	BUG_ON(!path);
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		printk_in_rcu(KERN_WARNING "BTRFS: "
			"error %d while searching for dev_stats item for device %s!\n",
			      ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			printk_in_rcu(KERN_WARNING "BTRFS: "
				"delete too small dev_stats item for device %s failed %d!\n",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			printk_in_rcu(KERN_WARNING "BTRFS: "
				"insert dev_stats item for device %s failed %d!\n",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}
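/*
 * Note on the delete/re-insert sequence in update_dev_stat_item()
 * above: if an existing dev_stats item is smaller than the current
 * struct btrfs_dev_stats_item (typically because it was written by an
 * older kernel with fewer counters), it cannot simply be overwritten,
 * so it is deleted and re-created at the full size before all counters
 * are copied in.
 */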
/*
 * Called from commit_transaction(). Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
			struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (!device->dev_stats_valid || !device->dev_stats_dirty)
			continue;

		ret = update_dev_stat_item(trans, dev_root, device);
		if (!ret)
			device->dev_stats_dirty = 0;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
		rcu_str_deref(dev->name),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	printk_in_rcu(KERN_INFO "BTRFS: "
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
		rcu_str_deref(dev->name),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

/*
 * Copy the error counters of the device identified by @stats->devid
 * into @stats, optionally resetting them when BTRFS_DEV_STATS_RESET
 * is set.
 */
int btrfs_get_dev_stats(struct btrfs_root *root,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(root->fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_reset(dev, i);
		}
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}

/*
 * Wipe the btrfs magic from the superblock of @device so the device is
 * no longer recognized as part of a filesystem.
 */
int btrfs_scratch_superblock(struct btrfs_device *device)
{
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;

	bh = btrfs_read_dev_super(device->bdev);
	if (!bh)
		return -EINVAL;
	disk_super = (struct btrfs_super_block *)bh->b_data;

	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
	set_buffer_dirty(bh);
	sync_dirty_buffer(bh);
	brelse(bh);

	return 0;
}