/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

static struct btrfs_fs_devices *__alloc_fs_devices(void)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->resized_devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->list);

	return fs_devs;
}

/**
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid: a pointer to the UUID for this FS. If NULL a new UUID is
 *	  generated.
 *
 * Return: a pointer to a new &struct btrfs_fs_devices on success;
 * ERR_PTR() on error. Returned struct is not linked onto any lists and
 * can be destroyed with kfree() right away.
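 *
 * A minimal caller sketch (illustrative only; error handling and the
 * eventual linking onto fs_uuids are elided):
 *
 *	fs_devs = alloc_fs_devices(NULL);
 *	if (IS_ERR(fs_devs))
 *		return PTR_ERR(fs_devs);
 *	...
 *	kfree(fs_devs);
 *
 * Passing NULL asks for a freshly generated UUID; passing an existing
 * fsid copies it into the new struct.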
82 */ 83 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) 84 { 85 struct btrfs_fs_devices *fs_devs; 86 87 fs_devs = __alloc_fs_devices(); 88 if (IS_ERR(fs_devs)) 89 return fs_devs; 90 91 if (fsid) 92 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); 93 else 94 generate_random_uuid(fs_devs->fsid); 95 96 return fs_devs; 97 } 98 99 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 100 { 101 struct btrfs_device *device; 102 WARN_ON(fs_devices->opened); 103 while (!list_empty(&fs_devices->devices)) { 104 device = list_entry(fs_devices->devices.next, 105 struct btrfs_device, dev_list); 106 list_del(&device->dev_list); 107 rcu_string_free(device->name); 108 kfree(device); 109 } 110 kfree(fs_devices); 111 } 112 113 static void btrfs_kobject_uevent(struct block_device *bdev, 114 enum kobject_action action) 115 { 116 int ret; 117 118 ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); 119 if (ret) 120 pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", 121 action, 122 kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), 123 &disk_to_dev(bdev->bd_disk)->kobj); 124 } 125 126 void btrfs_cleanup_fs_uuids(void) 127 { 128 struct btrfs_fs_devices *fs_devices; 129 130 while (!list_empty(&fs_uuids)) { 131 fs_devices = list_entry(fs_uuids.next, 132 struct btrfs_fs_devices, list); 133 list_del(&fs_devices->list); 134 free_fs_devices(fs_devices); 135 } 136 } 137 138 static struct btrfs_device *__alloc_device(void) 139 { 140 struct btrfs_device *dev; 141 142 dev = kzalloc(sizeof(*dev), GFP_NOFS); 143 if (!dev) 144 return ERR_PTR(-ENOMEM); 145 146 INIT_LIST_HEAD(&dev->dev_list); 147 INIT_LIST_HEAD(&dev->dev_alloc_list); 148 INIT_LIST_HEAD(&dev->resized_list); 149 150 spin_lock_init(&dev->io_lock); 151 152 spin_lock_init(&dev->reada_lock); 153 atomic_set(&dev->reada_in_flight, 0); 154 atomic_set(&dev->dev_stats_ccnt, 0); 155 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT); 156 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT); 157 158 return dev; 159 } 160 161 static noinline struct btrfs_device *__find_device(struct list_head *head, 162 u64 devid, u8 *uuid) 163 { 164 struct btrfs_device *dev; 165 166 list_for_each_entry(dev, head, dev_list) { 167 if (dev->devid == devid && 168 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 169 return dev; 170 } 171 } 172 return NULL; 173 } 174 175 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 176 { 177 struct btrfs_fs_devices *fs_devices; 178 179 list_for_each_entry(fs_devices, &fs_uuids, list) { 180 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 181 return fs_devices; 182 } 183 return NULL; 184 } 185 186 static int 187 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, 188 int flush, struct block_device **bdev, 189 struct buffer_head **bh) 190 { 191 int ret; 192 193 *bdev = blkdev_get_by_path(device_path, flags, holder); 194 195 if (IS_ERR(*bdev)) { 196 ret = PTR_ERR(*bdev); 197 printk(KERN_INFO "BTRFS: open %s failed\n", device_path); 198 goto error; 199 } 200 201 if (flush) 202 filemap_write_and_wait((*bdev)->bd_inode->i_mapping); 203 ret = set_blocksize(*bdev, 4096); 204 if (ret) { 205 blkdev_put(*bdev, flags); 206 goto error; 207 } 208 invalidate_bdev(*bdev); 209 *bh = btrfs_read_dev_super(*bdev); 210 if (!*bh) { 211 ret = -EINVAL; 212 blkdev_put(*bdev, flags); 213 goto error; 214 } 215 216 return 0; 217 218 error: 219 *bdev = NULL; 220 *bh = NULL; 221 return ret; 222 } 223 224 static void requeue_list(struct btrfs_pending_bios *pending_bios, 
225 struct bio *head, struct bio *tail) 226 { 227 228 struct bio *old_head; 229 230 old_head = pending_bios->head; 231 pending_bios->head = head; 232 if (pending_bios->tail) 233 tail->bi_next = old_head; 234 else 235 pending_bios->tail = tail; 236 } 237 238 /* 239 * we try to collect pending bios for a device so we don't get a large 240 * number of procs sending bios down to the same device. This greatly 241 * improves the schedulers ability to collect and merge the bios. 242 * 243 * But, it also turns into a long list of bios to process and that is sure 244 * to eventually make the worker thread block. The solution here is to 245 * make some progress and then put this work struct back at the end of 246 * the list if the block device is congested. This way, multiple devices 247 * can make progress from a single worker thread. 248 */ 249 static noinline void run_scheduled_bios(struct btrfs_device *device) 250 { 251 struct bio *pending; 252 struct backing_dev_info *bdi; 253 struct btrfs_fs_info *fs_info; 254 struct btrfs_pending_bios *pending_bios; 255 struct bio *tail; 256 struct bio *cur; 257 int again = 0; 258 unsigned long num_run; 259 unsigned long batch_run = 0; 260 unsigned long limit; 261 unsigned long last_waited = 0; 262 int force_reg = 0; 263 int sync_pending = 0; 264 struct blk_plug plug; 265 266 /* 267 * this function runs all the bios we've collected for 268 * a particular device. We don't want to wander off to 269 * another device without first sending all of these down. 270 * So, setup a plug here and finish it off before we return 271 */ 272 blk_start_plug(&plug); 273 274 bdi = blk_get_backing_dev_info(device->bdev); 275 fs_info = device->dev_root->fs_info; 276 limit = btrfs_async_submit_limit(fs_info); 277 limit = limit * 2 / 3; 278 279 loop: 280 spin_lock(&device->io_lock); 281 282 loop_lock: 283 num_run = 0; 284 285 /* take all the bios off the list at once and process them 286 * later on (without the lock held). But, remember the 287 * tail and other pointers so the bios can be properly reinserted 288 * into the list if we hit congestion 289 */ 290 if (!force_reg && device->pending_sync_bios.head) { 291 pending_bios = &device->pending_sync_bios; 292 force_reg = 1; 293 } else { 294 pending_bios = &device->pending_bios; 295 force_reg = 0; 296 } 297 298 pending = pending_bios->head; 299 tail = pending_bios->tail; 300 WARN_ON(pending && !tail); 301 302 /* 303 * if pending was null this time around, no bios need processing 304 * at all and we can stop. Otherwise it'll loop back up again 305 * and do an additional check so no bios are missed. 306 * 307 * device->running_pending is used to synchronize with the 308 * schedule_bio code. 
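	 *
	 * A simplified sketch of the other side (not a verbatim quote of the
	 * schedule_bio code): it appends the bio under io_lock, then only
	 * queues this work item when running_pending is clear:
	 *
	 *	spin_lock(&device->io_lock);
	 *	...add bio to the pending (or pending_sync) list...
	 *	should_queue = !device->running_pending;
	 *	spin_unlock(&device->io_lock);
	 *	if (should_queue)
	 *		btrfs_queue_work(fs_info->submit_workers, &device->work);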
309 */ 310 if (device->pending_sync_bios.head == NULL && 311 device->pending_bios.head == NULL) { 312 again = 0; 313 device->running_pending = 0; 314 } else { 315 again = 1; 316 device->running_pending = 1; 317 } 318 319 pending_bios->head = NULL; 320 pending_bios->tail = NULL; 321 322 spin_unlock(&device->io_lock); 323 324 while (pending) { 325 326 rmb(); 327 /* we want to work on both lists, but do more bios on the 328 * sync list than the regular list 329 */ 330 if ((num_run > 32 && 331 pending_bios != &device->pending_sync_bios && 332 device->pending_sync_bios.head) || 333 (num_run > 64 && pending_bios == &device->pending_sync_bios && 334 device->pending_bios.head)) { 335 spin_lock(&device->io_lock); 336 requeue_list(pending_bios, pending, tail); 337 goto loop_lock; 338 } 339 340 cur = pending; 341 pending = pending->bi_next; 342 cur->bi_next = NULL; 343 344 if (atomic_dec_return(&fs_info->nr_async_bios) < limit && 345 waitqueue_active(&fs_info->async_submit_wait)) 346 wake_up(&fs_info->async_submit_wait); 347 348 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 349 350 /* 351 * if we're doing the sync list, record that our 352 * plug has some sync requests on it 353 * 354 * If we're doing the regular list and there are 355 * sync requests sitting around, unplug before 356 * we add more 357 */ 358 if (pending_bios == &device->pending_sync_bios) { 359 sync_pending = 1; 360 } else if (sync_pending) { 361 blk_finish_plug(&plug); 362 blk_start_plug(&plug); 363 sync_pending = 0; 364 } 365 366 btrfsic_submit_bio(cur->bi_rw, cur); 367 num_run++; 368 batch_run++; 369 370 cond_resched(); 371 372 /* 373 * we made progress, there is more work to do and the bdi 374 * is now congested. Back off and let other work structs 375 * run instead 376 */ 377 if (pending && bdi_write_congested(bdi) && batch_run > 8 && 378 fs_info->fs_devices->open_devices > 1) { 379 struct io_context *ioc; 380 381 ioc = current->io_context; 382 383 /* 384 * the main goal here is that we don't want to 385 * block if we're going to be able to submit 386 * more requests without blocking. 387 * 388 * This code does two great things, it pokes into 389 * the elevator code from a filesystem _and_ 390 * it makes assumptions about how batching works. 391 */ 392 if (ioc && ioc->nr_batch_requests > 0 && 393 time_before(jiffies, ioc->last_waited + HZ/50UL) && 394 (last_waited == 0 || 395 ioc->last_waited == last_waited)) { 396 /* 397 * we want to go through our batch of 398 * requests and stop. 
So, we copy out 399 * the ioc->last_waited time and test 400 * against it before looping 401 */ 402 last_waited = ioc->last_waited; 403 cond_resched(); 404 continue; 405 } 406 spin_lock(&device->io_lock); 407 requeue_list(pending_bios, pending, tail); 408 device->running_pending = 1; 409 410 spin_unlock(&device->io_lock); 411 btrfs_queue_work(fs_info->submit_workers, 412 &device->work); 413 goto done; 414 } 415 /* unplug every 64 requests just for good measure */ 416 if (batch_run % 64 == 0) { 417 blk_finish_plug(&plug); 418 blk_start_plug(&plug); 419 sync_pending = 0; 420 } 421 } 422 423 cond_resched(); 424 if (again) 425 goto loop; 426 427 spin_lock(&device->io_lock); 428 if (device->pending_bios.head || device->pending_sync_bios.head) 429 goto loop_lock; 430 spin_unlock(&device->io_lock); 431 432 done: 433 blk_finish_plug(&plug); 434 } 435 436 static void pending_bios_fn(struct btrfs_work *work) 437 { 438 struct btrfs_device *device; 439 440 device = container_of(work, struct btrfs_device, work); 441 run_scheduled_bios(device); 442 } 443 444 /* 445 * Add new device to list of registered devices 446 * 447 * Returns: 448 * 1 - first time device is seen 449 * 0 - device already known 450 * < 0 - error 451 */ 452 static noinline int device_list_add(const char *path, 453 struct btrfs_super_block *disk_super, 454 u64 devid, struct btrfs_fs_devices **fs_devices_ret) 455 { 456 struct btrfs_device *device; 457 struct btrfs_fs_devices *fs_devices; 458 struct rcu_string *name; 459 int ret = 0; 460 u64 found_transid = btrfs_super_generation(disk_super); 461 462 fs_devices = find_fsid(disk_super->fsid); 463 if (!fs_devices) { 464 fs_devices = alloc_fs_devices(disk_super->fsid); 465 if (IS_ERR(fs_devices)) 466 return PTR_ERR(fs_devices); 467 468 list_add(&fs_devices->list, &fs_uuids); 469 470 device = NULL; 471 } else { 472 device = __find_device(&fs_devices->devices, devid, 473 disk_super->dev_item.uuid); 474 } 475 476 if (!device) { 477 if (fs_devices->opened) 478 return -EBUSY; 479 480 device = btrfs_alloc_device(NULL, &devid, 481 disk_super->dev_item.uuid); 482 if (IS_ERR(device)) { 483 /* we can safely leave the fs_devices entry around */ 484 return PTR_ERR(device); 485 } 486 487 name = rcu_string_strdup(path, GFP_NOFS); 488 if (!name) { 489 kfree(device); 490 return -ENOMEM; 491 } 492 rcu_assign_pointer(device->name, name); 493 494 mutex_lock(&fs_devices->device_list_mutex); 495 list_add_rcu(&device->dev_list, &fs_devices->devices); 496 fs_devices->num_devices++; 497 mutex_unlock(&fs_devices->device_list_mutex); 498 499 ret = 1; 500 device->fs_devices = fs_devices; 501 } else if (!device->name || strcmp(device->name->str, path)) { 502 /* 503 * When FS is already mounted. 504 * 1. If you are here and if the device->name is NULL that 505 * means this device was missing at time of FS mount. 506 * 2. If you are here and if the device->name is different 507 * from 'path' that means either 508 * a. The same device disappeared and reappeared with 509 * different name. or 510 * b. The missing-disk-which-was-replaced, has 511 * reappeared now. 512 * 513 * We must allow 1 and 2a above. But 2b would be a spurious 514 * and unintentional. 515 * 516 * Further in case of 1 and 2a above, the disk at 'path' 517 * would have missed some transaction when it was away and 518 * in case of 2a the stale bdev has to be updated as well. 519 * 2b must not be allowed at all time. 520 */ 521 522 /* 523 * For now, we do allow update to btrfs_fs_device through the 524 * btrfs dev scan cli after FS has been mounted. 
We're still 525 * tracking a problem where systems fail mount by subvolume id 526 * when we reject replacement on a mounted FS. 527 */ 528 if (!fs_devices->opened && found_transid < device->generation) { 529 /* 530 * That is if the FS is _not_ mounted and if you 531 * are here, that means there is more than one 532 * disk with same uuid and devid.We keep the one 533 * with larger generation number or the last-in if 534 * generation are equal. 535 */ 536 return -EEXIST; 537 } 538 539 name = rcu_string_strdup(path, GFP_NOFS); 540 if (!name) 541 return -ENOMEM; 542 rcu_string_free(device->name); 543 rcu_assign_pointer(device->name, name); 544 if (device->missing) { 545 fs_devices->missing_devices--; 546 device->missing = 0; 547 } 548 } 549 550 /* 551 * Unmount does not free the btrfs_device struct but would zero 552 * generation along with most of the other members. So just update 553 * it back. We need it to pick the disk with largest generation 554 * (as above). 555 */ 556 if (!fs_devices->opened) 557 device->generation = found_transid; 558 559 *fs_devices_ret = fs_devices; 560 561 return ret; 562 } 563 564 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 565 { 566 struct btrfs_fs_devices *fs_devices; 567 struct btrfs_device *device; 568 struct btrfs_device *orig_dev; 569 570 fs_devices = alloc_fs_devices(orig->fsid); 571 if (IS_ERR(fs_devices)) 572 return fs_devices; 573 574 mutex_lock(&orig->device_list_mutex); 575 fs_devices->total_devices = orig->total_devices; 576 577 /* We have held the volume lock, it is safe to get the devices. */ 578 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 579 struct rcu_string *name; 580 581 device = btrfs_alloc_device(NULL, &orig_dev->devid, 582 orig_dev->uuid); 583 if (IS_ERR(device)) 584 goto error; 585 586 /* 587 * This is ok to do without rcu read locked because we hold the 588 * uuid mutex so nothing we touch in here is going to disappear. 589 */ 590 if (orig_dev->name) { 591 name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); 592 if (!name) { 593 kfree(device); 594 goto error; 595 } 596 rcu_assign_pointer(device->name, name); 597 } 598 599 list_add(&device->dev_list, &fs_devices->devices); 600 device->fs_devices = fs_devices; 601 fs_devices->num_devices++; 602 } 603 mutex_unlock(&orig->device_list_mutex); 604 return fs_devices; 605 error: 606 mutex_unlock(&orig->device_list_mutex); 607 free_fs_devices(fs_devices); 608 return ERR_PTR(-ENOMEM); 609 } 610 611 void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step) 612 { 613 struct btrfs_device *device, *next; 614 struct btrfs_device *latest_dev = NULL; 615 616 mutex_lock(&uuid_mutex); 617 again: 618 /* This is the initialized path, it is safe to release the devices. */ 619 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 620 if (device->in_fs_metadata) { 621 if (!device->is_tgtdev_for_dev_replace && 622 (!latest_dev || 623 device->generation > latest_dev->generation)) { 624 latest_dev = device; 625 } 626 continue; 627 } 628 629 if (device->devid == BTRFS_DEV_REPLACE_DEVID) { 630 /* 631 * In the first step, keep the device which has 632 * the correct fsid and the devid that is used 633 * for the dev_replace procedure. 634 * In the second step, the dev_replace state is 635 * read from the device tree and it is known 636 * whether the procedure is really active or 637 * not, which means whether this device is 638 * used or whether it should be removed. 
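			 *
			 * Summarized (no new behaviour implied): with
			 * step == 0 a devid == BTRFS_DEV_REPLACE_DEVID
			 * device is always kept; with step != 0 it is kept
			 * only while it is still flagged as the active
			 * replace target, otherwise it falls through to the
			 * release code below.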
639 */ 640 if (step == 0 || device->is_tgtdev_for_dev_replace) { 641 continue; 642 } 643 } 644 if (device->bdev) { 645 blkdev_put(device->bdev, device->mode); 646 device->bdev = NULL; 647 fs_devices->open_devices--; 648 } 649 if (device->writeable) { 650 list_del_init(&device->dev_alloc_list); 651 device->writeable = 0; 652 if (!device->is_tgtdev_for_dev_replace) 653 fs_devices->rw_devices--; 654 } 655 list_del_init(&device->dev_list); 656 fs_devices->num_devices--; 657 rcu_string_free(device->name); 658 kfree(device); 659 } 660 661 if (fs_devices->seed) { 662 fs_devices = fs_devices->seed; 663 goto again; 664 } 665 666 fs_devices->latest_bdev = latest_dev->bdev; 667 668 mutex_unlock(&uuid_mutex); 669 } 670 671 static void __free_device(struct work_struct *work) 672 { 673 struct btrfs_device *device; 674 675 device = container_of(work, struct btrfs_device, rcu_work); 676 677 if (device->bdev) 678 blkdev_put(device->bdev, device->mode); 679 680 rcu_string_free(device->name); 681 kfree(device); 682 } 683 684 static void free_device(struct rcu_head *head) 685 { 686 struct btrfs_device *device; 687 688 device = container_of(head, struct btrfs_device, rcu); 689 690 INIT_WORK(&device->rcu_work, __free_device); 691 schedule_work(&device->rcu_work); 692 } 693 694 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 695 { 696 struct btrfs_device *device; 697 698 if (--fs_devices->opened > 0) 699 return 0; 700 701 mutex_lock(&fs_devices->device_list_mutex); 702 list_for_each_entry(device, &fs_devices->devices, dev_list) { 703 struct btrfs_device *new_device; 704 struct rcu_string *name; 705 706 if (device->bdev) 707 fs_devices->open_devices--; 708 709 if (device->writeable && 710 device->devid != BTRFS_DEV_REPLACE_DEVID) { 711 list_del_init(&device->dev_alloc_list); 712 fs_devices->rw_devices--; 713 } 714 715 if (device->missing) 716 fs_devices->missing_devices--; 717 718 new_device = btrfs_alloc_device(NULL, &device->devid, 719 device->uuid); 720 BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ 721 722 /* Safe because we are under uuid_mutex */ 723 if (device->name) { 724 name = rcu_string_strdup(device->name->str, GFP_NOFS); 725 BUG_ON(!name); /* -ENOMEM */ 726 rcu_assign_pointer(new_device->name, name); 727 } 728 729 list_replace_rcu(&device->dev_list, &new_device->dev_list); 730 new_device->fs_devices = device->fs_devices; 731 732 call_rcu(&device->rcu, free_device); 733 } 734 mutex_unlock(&fs_devices->device_list_mutex); 735 736 WARN_ON(fs_devices->open_devices); 737 WARN_ON(fs_devices->rw_devices); 738 fs_devices->opened = 0; 739 fs_devices->seeding = 0; 740 741 return 0; 742 } 743 744 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 745 { 746 struct btrfs_fs_devices *seed_devices = NULL; 747 int ret; 748 749 mutex_lock(&uuid_mutex); 750 ret = __btrfs_close_devices(fs_devices); 751 if (!fs_devices->opened) { 752 seed_devices = fs_devices->seed; 753 fs_devices->seed = NULL; 754 } 755 mutex_unlock(&uuid_mutex); 756 757 while (seed_devices) { 758 fs_devices = seed_devices; 759 seed_devices = fs_devices->seed; 760 __btrfs_close_devices(fs_devices); 761 free_fs_devices(fs_devices); 762 } 763 /* 764 * Wait for rcu kworkers under __btrfs_close_devices 765 * to finish all blkdev_puts so device is really 766 * free when umount is done. 
767 */ 768 rcu_barrier(); 769 return ret; 770 } 771 772 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 773 fmode_t flags, void *holder) 774 { 775 struct request_queue *q; 776 struct block_device *bdev; 777 struct list_head *head = &fs_devices->devices; 778 struct btrfs_device *device; 779 struct btrfs_device *latest_dev = NULL; 780 struct buffer_head *bh; 781 struct btrfs_super_block *disk_super; 782 u64 devid; 783 int seeding = 1; 784 int ret = 0; 785 786 flags |= FMODE_EXCL; 787 788 list_for_each_entry(device, head, dev_list) { 789 if (device->bdev) 790 continue; 791 if (!device->name) 792 continue; 793 794 /* Just open everything we can; ignore failures here */ 795 if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, 796 &bdev, &bh)) 797 continue; 798 799 disk_super = (struct btrfs_super_block *)bh->b_data; 800 devid = btrfs_stack_device_id(&disk_super->dev_item); 801 if (devid != device->devid) 802 goto error_brelse; 803 804 if (memcmp(device->uuid, disk_super->dev_item.uuid, 805 BTRFS_UUID_SIZE)) 806 goto error_brelse; 807 808 device->generation = btrfs_super_generation(disk_super); 809 if (!latest_dev || 810 device->generation > latest_dev->generation) 811 latest_dev = device; 812 813 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 814 device->writeable = 0; 815 } else { 816 device->writeable = !bdev_read_only(bdev); 817 seeding = 0; 818 } 819 820 q = bdev_get_queue(bdev); 821 if (blk_queue_discard(q)) 822 device->can_discard = 1; 823 824 device->bdev = bdev; 825 device->in_fs_metadata = 0; 826 device->mode = flags; 827 828 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 829 fs_devices->rotating = 1; 830 831 fs_devices->open_devices++; 832 if (device->writeable && 833 device->devid != BTRFS_DEV_REPLACE_DEVID) { 834 fs_devices->rw_devices++; 835 list_add(&device->dev_alloc_list, 836 &fs_devices->alloc_list); 837 } 838 brelse(bh); 839 continue; 840 841 error_brelse: 842 brelse(bh); 843 blkdev_put(bdev, flags); 844 continue; 845 } 846 if (fs_devices->open_devices == 0) { 847 ret = -EINVAL; 848 goto out; 849 } 850 fs_devices->seeding = seeding; 851 fs_devices->opened = 1; 852 fs_devices->latest_bdev = latest_dev->bdev; 853 fs_devices->total_rw_bytes = 0; 854 out: 855 return ret; 856 } 857 858 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 859 fmode_t flags, void *holder) 860 { 861 int ret; 862 863 mutex_lock(&uuid_mutex); 864 if (fs_devices->opened) { 865 fs_devices->opened++; 866 ret = 0; 867 } else { 868 ret = __btrfs_open_devices(fs_devices, flags, holder); 869 } 870 mutex_unlock(&uuid_mutex); 871 return ret; 872 } 873 874 /* 875 * Look for a btrfs signature on a device. This may be called out of the mount path 876 * and we are not allowed to call set_blocksize during the scan. The superblock 877 * is read via pagecache 878 */ 879 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 880 struct btrfs_fs_devices **fs_devices_ret) 881 { 882 struct btrfs_super_block *disk_super; 883 struct block_device *bdev; 884 struct page *page; 885 void *p; 886 int ret = -EINVAL; 887 u64 devid; 888 u64 transid; 889 u64 total_devices; 890 u64 bytenr; 891 pgoff_t index; 892 893 /* 894 * we would like to check all the supers, but that would make 895 * a btrfs mount succeed after a mkfs from a different FS. 
896 * So, we need to add a special mount option to scan for 897 * later supers, using BTRFS_SUPER_MIRROR_MAX instead 898 */ 899 bytenr = btrfs_sb_offset(0); 900 flags |= FMODE_EXCL; 901 mutex_lock(&uuid_mutex); 902 903 bdev = blkdev_get_by_path(path, flags, holder); 904 905 if (IS_ERR(bdev)) { 906 ret = PTR_ERR(bdev); 907 goto error; 908 } 909 910 /* make sure our super fits in the device */ 911 if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode)) 912 goto error_bdev_put; 913 914 /* make sure our super fits in the page */ 915 if (sizeof(*disk_super) > PAGE_CACHE_SIZE) 916 goto error_bdev_put; 917 918 /* make sure our super doesn't straddle pages on disk */ 919 index = bytenr >> PAGE_CACHE_SHIFT; 920 if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index) 921 goto error_bdev_put; 922 923 /* pull in the page with our super */ 924 page = read_cache_page_gfp(bdev->bd_inode->i_mapping, 925 index, GFP_NOFS); 926 927 if (IS_ERR_OR_NULL(page)) 928 goto error_bdev_put; 929 930 p = kmap(page); 931 932 /* align our pointer to the offset of the super block */ 933 disk_super = p + (bytenr & ~PAGE_CACHE_MASK); 934 935 if (btrfs_super_bytenr(disk_super) != bytenr || 936 btrfs_super_magic(disk_super) != BTRFS_MAGIC) 937 goto error_unmap; 938 939 devid = btrfs_stack_device_id(&disk_super->dev_item); 940 transid = btrfs_super_generation(disk_super); 941 total_devices = btrfs_super_num_devices(disk_super); 942 943 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 944 if (ret > 0) { 945 if (disk_super->label[0]) { 946 if (disk_super->label[BTRFS_LABEL_SIZE - 1]) 947 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; 948 printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); 949 } else { 950 printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); 951 } 952 953 printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); 954 ret = 0; 955 } 956 if (!ret && fs_devices_ret) 957 (*fs_devices_ret)->total_devices = total_devices; 958 959 error_unmap: 960 kunmap(page); 961 page_cache_release(page); 962 963 error_bdev_put: 964 blkdev_put(bdev, flags); 965 error: 966 mutex_unlock(&uuid_mutex); 967 return ret; 968 } 969 970 /* helper to account the used device space in the range */ 971 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 972 u64 end, u64 *length) 973 { 974 struct btrfs_key key; 975 struct btrfs_root *root = device->dev_root; 976 struct btrfs_dev_extent *dev_extent; 977 struct btrfs_path *path; 978 u64 extent_end; 979 int ret; 980 int slot; 981 struct extent_buffer *l; 982 983 *length = 0; 984 985 if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace) 986 return 0; 987 988 path = btrfs_alloc_path(); 989 if (!path) 990 return -ENOMEM; 991 path->reada = 2; 992 993 key.objectid = device->devid; 994 key.offset = start; 995 key.type = BTRFS_DEV_EXTENT_KEY; 996 997 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 998 if (ret < 0) 999 goto out; 1000 if (ret > 0) { 1001 ret = btrfs_previous_item(root, path, key.objectid, key.type); 1002 if (ret < 0) 1003 goto out; 1004 } 1005 1006 while (1) { 1007 l = path->nodes[0]; 1008 slot = path->slots[0]; 1009 if (slot >= btrfs_header_nritems(l)) { 1010 ret = btrfs_next_leaf(root, path); 1011 if (ret == 0) 1012 continue; 1013 if (ret < 0) 1014 goto out; 1015 1016 break; 1017 } 1018 btrfs_item_key_to_cpu(l, &key, slot); 1019 1020 if (key.objectid < device->devid) 1021 goto next; 1022 1023 if (key.objectid > device->devid) 1024 break; 1025 1026 if (key.type != 
BTRFS_DEV_EXTENT_KEY) 1027 goto next; 1028 1029 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1030 extent_end = key.offset + btrfs_dev_extent_length(l, 1031 dev_extent); 1032 if (key.offset <= start && extent_end > end) { 1033 *length = end - start + 1; 1034 break; 1035 } else if (key.offset <= start && extent_end > start) 1036 *length += extent_end - start; 1037 else if (key.offset > start && extent_end <= end) 1038 *length += extent_end - key.offset; 1039 else if (key.offset > start && key.offset <= end) { 1040 *length += end - key.offset + 1; 1041 break; 1042 } else if (key.offset > end) 1043 break; 1044 1045 next: 1046 path->slots[0]++; 1047 } 1048 ret = 0; 1049 out: 1050 btrfs_free_path(path); 1051 return ret; 1052 } 1053 1054 static int contains_pending_extent(struct btrfs_trans_handle *trans, 1055 struct btrfs_device *device, 1056 u64 *start, u64 len) 1057 { 1058 struct extent_map *em; 1059 struct list_head *search_list = &trans->transaction->pending_chunks; 1060 int ret = 0; 1061 1062 again: 1063 list_for_each_entry(em, search_list, list) { 1064 struct map_lookup *map; 1065 int i; 1066 1067 map = (struct map_lookup *)em->bdev; 1068 for (i = 0; i < map->num_stripes; i++) { 1069 if (map->stripes[i].dev != device) 1070 continue; 1071 if (map->stripes[i].physical >= *start + len || 1072 map->stripes[i].physical + em->orig_block_len <= 1073 *start) 1074 continue; 1075 *start = map->stripes[i].physical + 1076 em->orig_block_len; 1077 ret = 1; 1078 } 1079 } 1080 if (search_list == &trans->transaction->pending_chunks) { 1081 search_list = &trans->root->fs_info->pinned_chunks; 1082 goto again; 1083 } 1084 1085 return ret; 1086 } 1087 1088 1089 /* 1090 * find_free_dev_extent - find free space in the specified device 1091 * @device: the device which we search the free space in 1092 * @num_bytes: the size of the free space that we need 1093 * @start: store the start of the free space. 1094 * @len: the size of the free space. that we find, or the size of the max 1095 * free space if we don't find suitable free space 1096 * 1097 * this uses a pretty simple search, the expectation is that it is 1098 * called very infrequently and that a given device has a small number 1099 * of extents 1100 * 1101 * @start is used to store the start of the free space if we find. But if we 1102 * don't find suitable free space, it will be used to store the start position 1103 * of the max free space. 1104 * 1105 * @len is used to store the size of the free space that we find. 1106 * But if we don't find suitable free space, it is used to store the size of 1107 * the max free space. 
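 *
 * Hypothetical caller sketch (names invented for illustration):
 *
 *	u64 start, len;
 *	int ret = find_free_dev_extent(trans, device, num_bytes, &start, &len);
 *
 * On success, start/len describe a hole of at least num_bytes; on -ENOSPC
 * they still describe the largest hole that was found, so a caller can
 * fall back to a smaller allocation.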
1108 */ 1109 int find_free_dev_extent(struct btrfs_trans_handle *trans, 1110 struct btrfs_device *device, u64 num_bytes, 1111 u64 *start, u64 *len) 1112 { 1113 struct btrfs_key key; 1114 struct btrfs_root *root = device->dev_root; 1115 struct btrfs_dev_extent *dev_extent; 1116 struct btrfs_path *path; 1117 u64 hole_size; 1118 u64 max_hole_start; 1119 u64 max_hole_size; 1120 u64 extent_end; 1121 u64 search_start; 1122 u64 search_end = device->total_bytes; 1123 int ret; 1124 int slot; 1125 struct extent_buffer *l; 1126 1127 /* FIXME use last free of some kind */ 1128 1129 /* we don't want to overwrite the superblock on the drive, 1130 * so we make sure to start at an offset of at least 1MB 1131 */ 1132 search_start = max(root->fs_info->alloc_start, 1024ull * 1024); 1133 1134 path = btrfs_alloc_path(); 1135 if (!path) 1136 return -ENOMEM; 1137 1138 max_hole_start = search_start; 1139 max_hole_size = 0; 1140 1141 again: 1142 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { 1143 ret = -ENOSPC; 1144 goto out; 1145 } 1146 1147 path->reada = 2; 1148 path->search_commit_root = 1; 1149 path->skip_locking = 1; 1150 1151 key.objectid = device->devid; 1152 key.offset = search_start; 1153 key.type = BTRFS_DEV_EXTENT_KEY; 1154 1155 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1156 if (ret < 0) 1157 goto out; 1158 if (ret > 0) { 1159 ret = btrfs_previous_item(root, path, key.objectid, key.type); 1160 if (ret < 0) 1161 goto out; 1162 } 1163 1164 while (1) { 1165 l = path->nodes[0]; 1166 slot = path->slots[0]; 1167 if (slot >= btrfs_header_nritems(l)) { 1168 ret = btrfs_next_leaf(root, path); 1169 if (ret == 0) 1170 continue; 1171 if (ret < 0) 1172 goto out; 1173 1174 break; 1175 } 1176 btrfs_item_key_to_cpu(l, &key, slot); 1177 1178 if (key.objectid < device->devid) 1179 goto next; 1180 1181 if (key.objectid > device->devid) 1182 break; 1183 1184 if (key.type != BTRFS_DEV_EXTENT_KEY) 1185 goto next; 1186 1187 if (key.offset > search_start) { 1188 hole_size = key.offset - search_start; 1189 1190 /* 1191 * Have to check before we set max_hole_start, otherwise 1192 * we could end up sending back this offset anyway. 1193 */ 1194 if (contains_pending_extent(trans, device, 1195 &search_start, 1196 hole_size)) 1197 hole_size = 0; 1198 1199 if (hole_size > max_hole_size) { 1200 max_hole_start = search_start; 1201 max_hole_size = hole_size; 1202 } 1203 1204 /* 1205 * If this free space is greater than which we need, 1206 * it must be the max free space that we have found 1207 * until now, so max_hole_start must point to the start 1208 * of this free space and the length of this free space 1209 * is stored in max_hole_size. Thus, we return 1210 * max_hole_start and max_hole_size and go back to the 1211 * caller. 1212 */ 1213 if (hole_size >= num_bytes) { 1214 ret = 0; 1215 goto out; 1216 } 1217 } 1218 1219 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1220 extent_end = key.offset + btrfs_dev_extent_length(l, 1221 dev_extent); 1222 if (extent_end > search_start) 1223 search_start = extent_end; 1224 next: 1225 path->slots[0]++; 1226 cond_resched(); 1227 } 1228 1229 /* 1230 * At this point, search_start should be the end of 1231 * allocated dev extents, and when shrinking the device, 1232 * search_end may be smaller than search_start. 
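	 * For example, while shrinking, device->total_bytes may already have
	 * been lowered below the end of the last allocated dev extent, so the
	 * subtraction below is guarded by the search_end > search_start check
	 * to avoid an underflowed hole size.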
1233 */ 1234 if (search_end > search_start) { 1235 hole_size = search_end - search_start; 1236 1237 if (contains_pending_extent(trans, device, &search_start, 1238 hole_size)) { 1239 btrfs_release_path(path); 1240 goto again; 1241 } 1242 1243 if (hole_size > max_hole_size) { 1244 max_hole_start = search_start; 1245 max_hole_size = hole_size; 1246 } 1247 } 1248 1249 /* See above. */ 1250 if (max_hole_size < num_bytes) 1251 ret = -ENOSPC; 1252 else 1253 ret = 0; 1254 1255 out: 1256 btrfs_free_path(path); 1257 *start = max_hole_start; 1258 if (len) 1259 *len = max_hole_size; 1260 return ret; 1261 } 1262 1263 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1264 struct btrfs_device *device, 1265 u64 start, u64 *dev_extent_len) 1266 { 1267 int ret; 1268 struct btrfs_path *path; 1269 struct btrfs_root *root = device->dev_root; 1270 struct btrfs_key key; 1271 struct btrfs_key found_key; 1272 struct extent_buffer *leaf = NULL; 1273 struct btrfs_dev_extent *extent = NULL; 1274 1275 path = btrfs_alloc_path(); 1276 if (!path) 1277 return -ENOMEM; 1278 1279 key.objectid = device->devid; 1280 key.offset = start; 1281 key.type = BTRFS_DEV_EXTENT_KEY; 1282 again: 1283 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1284 if (ret > 0) { 1285 ret = btrfs_previous_item(root, path, key.objectid, 1286 BTRFS_DEV_EXTENT_KEY); 1287 if (ret) 1288 goto out; 1289 leaf = path->nodes[0]; 1290 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1291 extent = btrfs_item_ptr(leaf, path->slots[0], 1292 struct btrfs_dev_extent); 1293 BUG_ON(found_key.offset > start || found_key.offset + 1294 btrfs_dev_extent_length(leaf, extent) < start); 1295 key = found_key; 1296 btrfs_release_path(path); 1297 goto again; 1298 } else if (ret == 0) { 1299 leaf = path->nodes[0]; 1300 extent = btrfs_item_ptr(leaf, path->slots[0], 1301 struct btrfs_dev_extent); 1302 } else { 1303 btrfs_error(root->fs_info, ret, "Slot search failed"); 1304 goto out; 1305 } 1306 1307 *dev_extent_len = btrfs_dev_extent_length(leaf, extent); 1308 1309 ret = btrfs_del_item(trans, root, path); 1310 if (ret) { 1311 btrfs_error(root->fs_info, ret, 1312 "Failed to remove dev extent item"); 1313 } else { 1314 trans->transaction->have_free_bgs = 1; 1315 } 1316 out: 1317 btrfs_free_path(path); 1318 return ret; 1319 } 1320 1321 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 1322 struct btrfs_device *device, 1323 u64 chunk_tree, u64 chunk_objectid, 1324 u64 chunk_offset, u64 start, u64 num_bytes) 1325 { 1326 int ret; 1327 struct btrfs_path *path; 1328 struct btrfs_root *root = device->dev_root; 1329 struct btrfs_dev_extent *extent; 1330 struct extent_buffer *leaf; 1331 struct btrfs_key key; 1332 1333 WARN_ON(!device->in_fs_metadata); 1334 WARN_ON(device->is_tgtdev_for_dev_replace); 1335 path = btrfs_alloc_path(); 1336 if (!path) 1337 return -ENOMEM; 1338 1339 key.objectid = device->devid; 1340 key.offset = start; 1341 key.type = BTRFS_DEV_EXTENT_KEY; 1342 ret = btrfs_insert_empty_item(trans, root, path, &key, 1343 sizeof(*extent)); 1344 if (ret) 1345 goto out; 1346 1347 leaf = path->nodes[0]; 1348 extent = btrfs_item_ptr(leaf, path->slots[0], 1349 struct btrfs_dev_extent); 1350 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); 1351 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); 1352 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 1353 1354 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, 1355 btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE); 1356 1357 
btrfs_set_dev_extent_length(leaf, extent, num_bytes); 1358 btrfs_mark_buffer_dirty(leaf); 1359 out: 1360 btrfs_free_path(path); 1361 return ret; 1362 } 1363 1364 static u64 find_next_chunk(struct btrfs_fs_info *fs_info) 1365 { 1366 struct extent_map_tree *em_tree; 1367 struct extent_map *em; 1368 struct rb_node *n; 1369 u64 ret = 0; 1370 1371 em_tree = &fs_info->mapping_tree.map_tree; 1372 read_lock(&em_tree->lock); 1373 n = rb_last(&em_tree->map); 1374 if (n) { 1375 em = rb_entry(n, struct extent_map, rb_node); 1376 ret = em->start + em->len; 1377 } 1378 read_unlock(&em_tree->lock); 1379 1380 return ret; 1381 } 1382 1383 static noinline int find_next_devid(struct btrfs_fs_info *fs_info, 1384 u64 *devid_ret) 1385 { 1386 int ret; 1387 struct btrfs_key key; 1388 struct btrfs_key found_key; 1389 struct btrfs_path *path; 1390 1391 path = btrfs_alloc_path(); 1392 if (!path) 1393 return -ENOMEM; 1394 1395 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1396 key.type = BTRFS_DEV_ITEM_KEY; 1397 key.offset = (u64)-1; 1398 1399 ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); 1400 if (ret < 0) 1401 goto error; 1402 1403 BUG_ON(ret == 0); /* Corruption */ 1404 1405 ret = btrfs_previous_item(fs_info->chunk_root, path, 1406 BTRFS_DEV_ITEMS_OBJECTID, 1407 BTRFS_DEV_ITEM_KEY); 1408 if (ret) { 1409 *devid_ret = 1; 1410 } else { 1411 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1412 path->slots[0]); 1413 *devid_ret = found_key.offset + 1; 1414 } 1415 ret = 0; 1416 error: 1417 btrfs_free_path(path); 1418 return ret; 1419 } 1420 1421 /* 1422 * the device information is stored in the chunk root 1423 * the btrfs_device struct should be fully filled in 1424 */ 1425 static int btrfs_add_device(struct btrfs_trans_handle *trans, 1426 struct btrfs_root *root, 1427 struct btrfs_device *device) 1428 { 1429 int ret; 1430 struct btrfs_path *path; 1431 struct btrfs_dev_item *dev_item; 1432 struct extent_buffer *leaf; 1433 struct btrfs_key key; 1434 unsigned long ptr; 1435 1436 root = root->fs_info->chunk_root; 1437 1438 path = btrfs_alloc_path(); 1439 if (!path) 1440 return -ENOMEM; 1441 1442 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1443 key.type = BTRFS_DEV_ITEM_KEY; 1444 key.offset = device->devid; 1445 1446 ret = btrfs_insert_empty_item(trans, root, path, &key, 1447 sizeof(*dev_item)); 1448 if (ret) 1449 goto out; 1450 1451 leaf = path->nodes[0]; 1452 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1453 1454 btrfs_set_device_id(leaf, dev_item, device->devid); 1455 btrfs_set_device_generation(leaf, dev_item, 0); 1456 btrfs_set_device_type(leaf, dev_item, device->type); 1457 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1458 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1459 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1460 btrfs_set_device_total_bytes(leaf, dev_item, 1461 btrfs_device_get_disk_total_bytes(device)); 1462 btrfs_set_device_bytes_used(leaf, dev_item, 1463 btrfs_device_get_bytes_used(device)); 1464 btrfs_set_device_group(leaf, dev_item, 0); 1465 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1466 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1467 btrfs_set_device_start_offset(leaf, dev_item, 0); 1468 1469 ptr = btrfs_device_uuid(dev_item); 1470 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1471 ptr = btrfs_device_fsid(dev_item); 1472 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); 1473 btrfs_mark_buffer_dirty(leaf); 1474 1475 ret = 0; 1476 out: 1477 btrfs_free_path(path); 
1478 return ret; 1479 } 1480 1481 /* 1482 * Function to update ctime/mtime for a given device path. 1483 * Mainly used for ctime/mtime based probe like libblkid. 1484 */ 1485 static void update_dev_time(char *path_name) 1486 { 1487 struct file *filp; 1488 1489 filp = filp_open(path_name, O_RDWR, 0); 1490 if (IS_ERR(filp)) 1491 return; 1492 file_update_time(filp); 1493 filp_close(filp, NULL); 1494 return; 1495 } 1496 1497 static int btrfs_rm_dev_item(struct btrfs_root *root, 1498 struct btrfs_device *device) 1499 { 1500 int ret; 1501 struct btrfs_path *path; 1502 struct btrfs_key key; 1503 struct btrfs_trans_handle *trans; 1504 1505 root = root->fs_info->chunk_root; 1506 1507 path = btrfs_alloc_path(); 1508 if (!path) 1509 return -ENOMEM; 1510 1511 trans = btrfs_start_transaction(root, 0); 1512 if (IS_ERR(trans)) { 1513 btrfs_free_path(path); 1514 return PTR_ERR(trans); 1515 } 1516 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1517 key.type = BTRFS_DEV_ITEM_KEY; 1518 key.offset = device->devid; 1519 1520 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1521 if (ret < 0) 1522 goto out; 1523 1524 if (ret > 0) { 1525 ret = -ENOENT; 1526 goto out; 1527 } 1528 1529 ret = btrfs_del_item(trans, root, path); 1530 if (ret) 1531 goto out; 1532 out: 1533 btrfs_free_path(path); 1534 btrfs_commit_transaction(trans, root); 1535 return ret; 1536 } 1537 1538 int btrfs_rm_device(struct btrfs_root *root, char *device_path) 1539 { 1540 struct btrfs_device *device; 1541 struct btrfs_device *next_device; 1542 struct block_device *bdev; 1543 struct buffer_head *bh = NULL; 1544 struct btrfs_super_block *disk_super; 1545 struct btrfs_fs_devices *cur_devices; 1546 u64 all_avail; 1547 u64 devid; 1548 u64 num_devices; 1549 u8 *dev_uuid; 1550 unsigned seq; 1551 int ret = 0; 1552 bool clear_super = false; 1553 1554 mutex_lock(&uuid_mutex); 1555 1556 do { 1557 seq = read_seqbegin(&root->fs_info->profiles_lock); 1558 1559 all_avail = root->fs_info->avail_data_alloc_bits | 1560 root->fs_info->avail_system_alloc_bits | 1561 root->fs_info->avail_metadata_alloc_bits; 1562 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 1563 1564 num_devices = root->fs_info->fs_devices->num_devices; 1565 btrfs_dev_replace_lock(&root->fs_info->dev_replace); 1566 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { 1567 WARN_ON(num_devices < 1); 1568 num_devices--; 1569 } 1570 btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1571 1572 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1573 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; 1574 goto out; 1575 } 1576 1577 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { 1578 ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; 1579 goto out; 1580 } 1581 1582 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && 1583 root->fs_info->fs_devices->rw_devices <= 2) { 1584 ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; 1585 goto out; 1586 } 1587 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && 1588 root->fs_info->fs_devices->rw_devices <= 3) { 1589 ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; 1590 goto out; 1591 } 1592 1593 if (strcmp(device_path, "missing") == 0) { 1594 struct list_head *devices; 1595 struct btrfs_device *tmp; 1596 1597 device = NULL; 1598 devices = &root->fs_info->fs_devices->devices; 1599 /* 1600 * It is safe to read the devices since the volume_mutex 1601 * is held. 
1602 */ 1603 list_for_each_entry(tmp, devices, dev_list) { 1604 if (tmp->in_fs_metadata && 1605 !tmp->is_tgtdev_for_dev_replace && 1606 !tmp->bdev) { 1607 device = tmp; 1608 break; 1609 } 1610 } 1611 bdev = NULL; 1612 bh = NULL; 1613 disk_super = NULL; 1614 if (!device) { 1615 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 1616 goto out; 1617 } 1618 } else { 1619 ret = btrfs_get_bdev_and_sb(device_path, 1620 FMODE_WRITE | FMODE_EXCL, 1621 root->fs_info->bdev_holder, 0, 1622 &bdev, &bh); 1623 if (ret) 1624 goto out; 1625 disk_super = (struct btrfs_super_block *)bh->b_data; 1626 devid = btrfs_stack_device_id(&disk_super->dev_item); 1627 dev_uuid = disk_super->dev_item.uuid; 1628 device = btrfs_find_device(root->fs_info, devid, dev_uuid, 1629 disk_super->fsid); 1630 if (!device) { 1631 ret = -ENOENT; 1632 goto error_brelse; 1633 } 1634 } 1635 1636 if (device->is_tgtdev_for_dev_replace) { 1637 ret = BTRFS_ERROR_DEV_TGT_REPLACE; 1638 goto error_brelse; 1639 } 1640 1641 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1642 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; 1643 goto error_brelse; 1644 } 1645 1646 if (device->writeable) { 1647 lock_chunks(root); 1648 list_del_init(&device->dev_alloc_list); 1649 device->fs_devices->rw_devices--; 1650 unlock_chunks(root); 1651 clear_super = true; 1652 } 1653 1654 mutex_unlock(&uuid_mutex); 1655 ret = btrfs_shrink_device(device, 0); 1656 mutex_lock(&uuid_mutex); 1657 if (ret) 1658 goto error_undo; 1659 1660 /* 1661 * TODO: the superblock still includes this device in its num_devices 1662 * counter although write_all_supers() is not locked out. This 1663 * could give a filesystem state which requires a degraded mount. 1664 */ 1665 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1666 if (ret) 1667 goto error_undo; 1668 1669 device->in_fs_metadata = 0; 1670 btrfs_scrub_cancel_dev(root->fs_info, device); 1671 1672 /* 1673 * the device list mutex makes sure that we don't change 1674 * the device list while someone else is writing out all 1675 * the device supers. Whoever is writing all supers, should 1676 * lock the device list mutex before getting the number of 1677 * devices in the super block (super_copy). Conversely, 1678 * whoever updates the number of devices in the super block 1679 * (super_copy) should hold the device list mutex. 
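	 *
	 * The expected pattern on the super-writing side is roughly (a
	 * sketch, not a quote of write_all_supers()):
	 *
	 *	mutex_lock(&fs_devices->device_list_mutex);
	 *	...read num_devices from super_copy and write out each super...
	 *	mutex_unlock(&fs_devices->device_list_mutex);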
1680 */ 1681 1682 cur_devices = device->fs_devices; 1683 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1684 list_del_rcu(&device->dev_list); 1685 1686 device->fs_devices->num_devices--; 1687 device->fs_devices->total_devices--; 1688 1689 if (device->missing) 1690 device->fs_devices->missing_devices--; 1691 1692 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1693 struct btrfs_device, dev_list); 1694 if (device->bdev == root->fs_info->sb->s_bdev) 1695 root->fs_info->sb->s_bdev = next_device->bdev; 1696 if (device->bdev == root->fs_info->fs_devices->latest_bdev) 1697 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1698 1699 if (device->bdev) { 1700 device->fs_devices->open_devices--; 1701 /* remove sysfs entry */ 1702 btrfs_kobj_rm_device(root->fs_info, device); 1703 } 1704 1705 call_rcu(&device->rcu, free_device); 1706 1707 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; 1708 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); 1709 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1710 1711 if (cur_devices->open_devices == 0) { 1712 struct btrfs_fs_devices *fs_devices; 1713 fs_devices = root->fs_info->fs_devices; 1714 while (fs_devices) { 1715 if (fs_devices->seed == cur_devices) { 1716 fs_devices->seed = cur_devices->seed; 1717 break; 1718 } 1719 fs_devices = fs_devices->seed; 1720 } 1721 cur_devices->seed = NULL; 1722 __btrfs_close_devices(cur_devices); 1723 free_fs_devices(cur_devices); 1724 } 1725 1726 root->fs_info->num_tolerated_disk_barrier_failures = 1727 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); 1728 1729 /* 1730 * at this point, the device is zero sized. We want to 1731 * remove it from the devices list and zero out the old super 1732 */ 1733 if (clear_super && disk_super) { 1734 u64 bytenr; 1735 int i; 1736 1737 /* make sure this device isn't detected as part of 1738 * the FS anymore 1739 */ 1740 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 1741 set_buffer_dirty(bh); 1742 sync_dirty_buffer(bh); 1743 1744 /* clear the mirror copies of super block on the disk 1745 * being removed, 0th copy is been taken care above and 1746 * the below would take of the rest 1747 */ 1748 for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) { 1749 bytenr = btrfs_sb_offset(i); 1750 if (bytenr + BTRFS_SUPER_INFO_SIZE >= 1751 i_size_read(bdev->bd_inode)) 1752 break; 1753 1754 brelse(bh); 1755 bh = __bread(bdev, bytenr / 4096, 1756 BTRFS_SUPER_INFO_SIZE); 1757 if (!bh) 1758 continue; 1759 1760 disk_super = (struct btrfs_super_block *)bh->b_data; 1761 1762 if (btrfs_super_bytenr(disk_super) != bytenr || 1763 btrfs_super_magic(disk_super) != BTRFS_MAGIC) { 1764 continue; 1765 } 1766 memset(&disk_super->magic, 0, 1767 sizeof(disk_super->magic)); 1768 set_buffer_dirty(bh); 1769 sync_dirty_buffer(bh); 1770 } 1771 } 1772 1773 ret = 0; 1774 1775 if (bdev) { 1776 /* Notify udev that device has changed */ 1777 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 1778 1779 /* Update ctime/mtime for device path for libblkid */ 1780 update_dev_time(device_path); 1781 } 1782 1783 error_brelse: 1784 brelse(bh); 1785 if (bdev) 1786 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1787 out: 1788 mutex_unlock(&uuid_mutex); 1789 return ret; 1790 error_undo: 1791 if (device->writeable) { 1792 lock_chunks(root); 1793 list_add(&device->dev_alloc_list, 1794 &root->fs_info->fs_devices->alloc_list); 1795 device->fs_devices->rw_devices++; 1796 unlock_chunks(root); 1797 } 1798 goto error_brelse; 1799 } 1800 1801 void 
btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, 1802 struct btrfs_device *srcdev) 1803 { 1804 struct btrfs_fs_devices *fs_devices; 1805 1806 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); 1807 1808 /* 1809 * in case of fs with no seed, srcdev->fs_devices will point 1810 * to fs_devices of fs_info. However when the dev being replaced is 1811 * a seed dev it will point to the seed's local fs_devices. In short 1812 * srcdev will have its correct fs_devices in both the cases. 1813 */ 1814 fs_devices = srcdev->fs_devices; 1815 1816 list_del_rcu(&srcdev->dev_list); 1817 list_del_rcu(&srcdev->dev_alloc_list); 1818 fs_devices->num_devices--; 1819 if (srcdev->missing) 1820 fs_devices->missing_devices--; 1821 1822 if (srcdev->writeable) { 1823 fs_devices->rw_devices--; 1824 /* zero out the old super if it is writable */ 1825 btrfs_scratch_superblock(srcdev); 1826 } 1827 1828 if (srcdev->bdev) 1829 fs_devices->open_devices--; 1830 } 1831 1832 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, 1833 struct btrfs_device *srcdev) 1834 { 1835 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 1836 1837 call_rcu(&srcdev->rcu, free_device); 1838 1839 /* 1840 * unless fs_devices is seed fs, num_devices shouldn't go 1841 * zero 1842 */ 1843 BUG_ON(!fs_devices->num_devices && !fs_devices->seeding); 1844 1845 /* if this is no devs we rather delete the fs_devices */ 1846 if (!fs_devices->num_devices) { 1847 struct btrfs_fs_devices *tmp_fs_devices; 1848 1849 tmp_fs_devices = fs_info->fs_devices; 1850 while (tmp_fs_devices) { 1851 if (tmp_fs_devices->seed == fs_devices) { 1852 tmp_fs_devices->seed = fs_devices->seed; 1853 break; 1854 } 1855 tmp_fs_devices = tmp_fs_devices->seed; 1856 } 1857 fs_devices->seed = NULL; 1858 __btrfs_close_devices(fs_devices); 1859 free_fs_devices(fs_devices); 1860 } 1861 } 1862 1863 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 1864 struct btrfs_device *tgtdev) 1865 { 1866 struct btrfs_device *next_device; 1867 1868 mutex_lock(&uuid_mutex); 1869 WARN_ON(!tgtdev); 1870 mutex_lock(&fs_info->fs_devices->device_list_mutex); 1871 if (tgtdev->bdev) { 1872 btrfs_scratch_superblock(tgtdev); 1873 fs_info->fs_devices->open_devices--; 1874 } 1875 fs_info->fs_devices->num_devices--; 1876 1877 next_device = list_entry(fs_info->fs_devices->devices.next, 1878 struct btrfs_device, dev_list); 1879 if (tgtdev->bdev == fs_info->sb->s_bdev) 1880 fs_info->sb->s_bdev = next_device->bdev; 1881 if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) 1882 fs_info->fs_devices->latest_bdev = next_device->bdev; 1883 list_del_rcu(&tgtdev->dev_list); 1884 1885 call_rcu(&tgtdev->rcu, free_device); 1886 1887 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 1888 mutex_unlock(&uuid_mutex); 1889 } 1890 1891 static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, 1892 struct btrfs_device **device) 1893 { 1894 int ret = 0; 1895 struct btrfs_super_block *disk_super; 1896 u64 devid; 1897 u8 *dev_uuid; 1898 struct block_device *bdev; 1899 struct buffer_head *bh; 1900 1901 *device = NULL; 1902 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 1903 root->fs_info->bdev_holder, 0, &bdev, &bh); 1904 if (ret) 1905 return ret; 1906 disk_super = (struct btrfs_super_block *)bh->b_data; 1907 devid = btrfs_stack_device_id(&disk_super->dev_item); 1908 dev_uuid = disk_super->dev_item.uuid; 1909 *device = btrfs_find_device(root->fs_info, devid, dev_uuid, 1910 disk_super->fsid); 1911 brelse(bh); 1912 if (!*device) 1913 ret 
= -ENOENT; 1914 blkdev_put(bdev, FMODE_READ); 1915 return ret; 1916 } 1917 1918 int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, 1919 char *device_path, 1920 struct btrfs_device **device) 1921 { 1922 *device = NULL; 1923 if (strcmp(device_path, "missing") == 0) { 1924 struct list_head *devices; 1925 struct btrfs_device *tmp; 1926 1927 devices = &root->fs_info->fs_devices->devices; 1928 /* 1929 * It is safe to read the devices since the volume_mutex 1930 * is held by the caller. 1931 */ 1932 list_for_each_entry(tmp, devices, dev_list) { 1933 if (tmp->in_fs_metadata && !tmp->bdev) { 1934 *device = tmp; 1935 break; 1936 } 1937 } 1938 1939 if (!*device) { 1940 btrfs_err(root->fs_info, "no missing device found"); 1941 return -ENOENT; 1942 } 1943 1944 return 0; 1945 } else { 1946 return btrfs_find_device_by_path(root, device_path, device); 1947 } 1948 } 1949 1950 /* 1951 * does all the dirty work required for changing file system's UUID. 1952 */ 1953 static int btrfs_prepare_sprout(struct btrfs_root *root) 1954 { 1955 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1956 struct btrfs_fs_devices *old_devices; 1957 struct btrfs_fs_devices *seed_devices; 1958 struct btrfs_super_block *disk_super = root->fs_info->super_copy; 1959 struct btrfs_device *device; 1960 u64 super_flags; 1961 1962 BUG_ON(!mutex_is_locked(&uuid_mutex)); 1963 if (!fs_devices->seeding) 1964 return -EINVAL; 1965 1966 seed_devices = __alloc_fs_devices(); 1967 if (IS_ERR(seed_devices)) 1968 return PTR_ERR(seed_devices); 1969 1970 old_devices = clone_fs_devices(fs_devices); 1971 if (IS_ERR(old_devices)) { 1972 kfree(seed_devices); 1973 return PTR_ERR(old_devices); 1974 } 1975 1976 list_add(&old_devices->list, &fs_uuids); 1977 1978 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 1979 seed_devices->opened = 1; 1980 INIT_LIST_HEAD(&seed_devices->devices); 1981 INIT_LIST_HEAD(&seed_devices->alloc_list); 1982 mutex_init(&seed_devices->device_list_mutex); 1983 1984 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1985 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 1986 synchronize_rcu); 1987 list_for_each_entry(device, &seed_devices->devices, dev_list) 1988 device->fs_devices = seed_devices; 1989 1990 lock_chunks(root); 1991 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 1992 unlock_chunks(root); 1993 1994 fs_devices->seeding = 0; 1995 fs_devices->num_devices = 0; 1996 fs_devices->open_devices = 0; 1997 fs_devices->missing_devices = 0; 1998 fs_devices->rotating = 0; 1999 fs_devices->seed = seed_devices; 2000 2001 generate_random_uuid(fs_devices->fsid); 2002 memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2003 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2004 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2005 2006 super_flags = btrfs_super_flags(disk_super) & 2007 ~BTRFS_SUPER_FLAG_SEEDING; 2008 btrfs_set_super_flags(disk_super, super_flags); 2009 2010 return 0; 2011 } 2012 2013 /* 2014 * strore the expected generation for seed devices in device items. 
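 * (btrfs_finish_sprout() below walks every DEV_ITEM in the chunk root and,
 * for devices that still belong to a seeding fs_devices, records the
 * device's current generation in the item.)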
2015 */ 2016 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 2017 struct btrfs_root *root) 2018 { 2019 struct btrfs_path *path; 2020 struct extent_buffer *leaf; 2021 struct btrfs_dev_item *dev_item; 2022 struct btrfs_device *device; 2023 struct btrfs_key key; 2024 u8 fs_uuid[BTRFS_UUID_SIZE]; 2025 u8 dev_uuid[BTRFS_UUID_SIZE]; 2026 u64 devid; 2027 int ret; 2028 2029 path = btrfs_alloc_path(); 2030 if (!path) 2031 return -ENOMEM; 2032 2033 root = root->fs_info->chunk_root; 2034 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2035 key.offset = 0; 2036 key.type = BTRFS_DEV_ITEM_KEY; 2037 2038 while (1) { 2039 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2040 if (ret < 0) 2041 goto error; 2042 2043 leaf = path->nodes[0]; 2044 next_slot: 2045 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2046 ret = btrfs_next_leaf(root, path); 2047 if (ret > 0) 2048 break; 2049 if (ret < 0) 2050 goto error; 2051 leaf = path->nodes[0]; 2052 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2053 btrfs_release_path(path); 2054 continue; 2055 } 2056 2057 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2058 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2059 key.type != BTRFS_DEV_ITEM_KEY) 2060 break; 2061 2062 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2063 struct btrfs_dev_item); 2064 devid = btrfs_device_id(leaf, dev_item); 2065 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2066 BTRFS_UUID_SIZE); 2067 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2068 BTRFS_UUID_SIZE); 2069 device = btrfs_find_device(root->fs_info, devid, dev_uuid, 2070 fs_uuid); 2071 BUG_ON(!device); /* Logic error */ 2072 2073 if (device->fs_devices->seeding) { 2074 btrfs_set_device_generation(leaf, dev_item, 2075 device->generation); 2076 btrfs_mark_buffer_dirty(leaf); 2077 } 2078 2079 path->slots[0]++; 2080 goto next_slot; 2081 } 2082 ret = 0; 2083 error: 2084 btrfs_free_path(path); 2085 return ret; 2086 } 2087 2088 int btrfs_init_new_device(struct btrfs_root *root, char *device_path) 2089 { 2090 struct request_queue *q; 2091 struct btrfs_trans_handle *trans; 2092 struct btrfs_device *device; 2093 struct block_device *bdev; 2094 struct list_head *devices; 2095 struct super_block *sb = root->fs_info->sb; 2096 struct rcu_string *name; 2097 u64 tmp; 2098 int seeding_dev = 0; 2099 int ret = 0; 2100 2101 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 2102 return -EROFS; 2103 2104 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2105 root->fs_info->bdev_holder); 2106 if (IS_ERR(bdev)) 2107 return PTR_ERR(bdev); 2108 2109 if (root->fs_info->fs_devices->seeding) { 2110 seeding_dev = 1; 2111 down_write(&sb->s_umount); 2112 mutex_lock(&uuid_mutex); 2113 } 2114 2115 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2116 2117 devices = &root->fs_info->fs_devices->devices; 2118 2119 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2120 list_for_each_entry(device, devices, dev_list) { 2121 if (device->bdev == bdev) { 2122 ret = -EEXIST; 2123 mutex_unlock( 2124 &root->fs_info->fs_devices->device_list_mutex); 2125 goto error; 2126 } 2127 } 2128 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2129 2130 device = btrfs_alloc_device(root->fs_info, NULL, NULL); 2131 if (IS_ERR(device)) { 2132 /* we can safely leave the fs_devices entry around */ 2133 ret = PTR_ERR(device); 2134 goto error; 2135 } 2136 2137 name = rcu_string_strdup(device_path, GFP_NOFS); 2138 if (!name) { 2139 kfree(device); 2140 ret = -ENOMEM; 2141 goto error; 
2142 } 2143 rcu_assign_pointer(device->name, name); 2144 2145 trans = btrfs_start_transaction(root, 0); 2146 if (IS_ERR(trans)) { 2147 rcu_string_free(device->name); 2148 kfree(device); 2149 ret = PTR_ERR(trans); 2150 goto error; 2151 } 2152 2153 q = bdev_get_queue(bdev); 2154 if (blk_queue_discard(q)) 2155 device->can_discard = 1; 2156 device->writeable = 1; 2157 device->generation = trans->transid; 2158 device->io_width = root->sectorsize; 2159 device->io_align = root->sectorsize; 2160 device->sector_size = root->sectorsize; 2161 device->total_bytes = i_size_read(bdev->bd_inode); 2162 device->disk_total_bytes = device->total_bytes; 2163 device->commit_total_bytes = device->total_bytes; 2164 device->dev_root = root->fs_info->dev_root; 2165 device->bdev = bdev; 2166 device->in_fs_metadata = 1; 2167 device->is_tgtdev_for_dev_replace = 0; 2168 device->mode = FMODE_EXCL; 2169 device->dev_stats_valid = 1; 2170 set_blocksize(device->bdev, 4096); 2171 2172 if (seeding_dev) { 2173 sb->s_flags &= ~MS_RDONLY; 2174 ret = btrfs_prepare_sprout(root); 2175 BUG_ON(ret); /* -ENOMEM */ 2176 } 2177 2178 device->fs_devices = root->fs_info->fs_devices; 2179 2180 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2181 lock_chunks(root); 2182 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); 2183 list_add(&device->dev_alloc_list, 2184 &root->fs_info->fs_devices->alloc_list); 2185 root->fs_info->fs_devices->num_devices++; 2186 root->fs_info->fs_devices->open_devices++; 2187 root->fs_info->fs_devices->rw_devices++; 2188 root->fs_info->fs_devices->total_devices++; 2189 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 2190 2191 spin_lock(&root->fs_info->free_chunk_lock); 2192 root->fs_info->free_chunk_space += device->total_bytes; 2193 spin_unlock(&root->fs_info->free_chunk_lock); 2194 2195 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 2196 root->fs_info->fs_devices->rotating = 1; 2197 2198 tmp = btrfs_super_total_bytes(root->fs_info->super_copy); 2199 btrfs_set_super_total_bytes(root->fs_info->super_copy, 2200 tmp + device->total_bytes); 2201 2202 tmp = btrfs_super_num_devices(root->fs_info->super_copy); 2203 btrfs_set_super_num_devices(root->fs_info->super_copy, 2204 tmp + 1); 2205 2206 /* add sysfs device entry */ 2207 btrfs_kobj_add_device(root->fs_info, device); 2208 2209 /* 2210 * we've got more storage, clear any full flags on the space 2211 * infos 2212 */ 2213 btrfs_clear_space_info_full(root->fs_info); 2214 2215 unlock_chunks(root); 2216 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2217 2218 if (seeding_dev) { 2219 lock_chunks(root); 2220 ret = init_first_rw_device(trans, root, device); 2221 unlock_chunks(root); 2222 if (ret) { 2223 btrfs_abort_transaction(trans, root, ret); 2224 goto error_trans; 2225 } 2226 } 2227 2228 ret = btrfs_add_device(trans, root, device); 2229 if (ret) { 2230 btrfs_abort_transaction(trans, root, ret); 2231 goto error_trans; 2232 } 2233 2234 if (seeding_dev) { 2235 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; 2236 2237 ret = btrfs_finish_sprout(trans, root); 2238 if (ret) { 2239 btrfs_abort_transaction(trans, root, ret); 2240 goto error_trans; 2241 } 2242 2243 /* Sprouting would change fsid of the mounted root, 2244 * so rename the fsid on the sysfs 2245 */ 2246 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", 2247 root->fs_info->fsid); 2248 if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) 2249 goto error_trans; 2250 } 2251 2252 root->fs_info->num_tolerated_disk_barrier_failures = 2253 
btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); 2254 ret = btrfs_commit_transaction(trans, root); 2255 2256 if (seeding_dev) { 2257 mutex_unlock(&uuid_mutex); 2258 up_write(&sb->s_umount); 2259 2260 if (ret) /* transaction commit */ 2261 return ret; 2262 2263 ret = btrfs_relocate_sys_chunks(root); 2264 if (ret < 0) 2265 btrfs_error(root->fs_info, ret, 2266 "Failed to relocate sys chunks after " 2267 "device initialization. This can be fixed " 2268 "using the \"btrfs balance\" command."); 2269 trans = btrfs_attach_transaction(root); 2270 if (IS_ERR(trans)) { 2271 if (PTR_ERR(trans) == -ENOENT) 2272 return 0; 2273 return PTR_ERR(trans); 2274 } 2275 ret = btrfs_commit_transaction(trans, root); 2276 } 2277 2278 /* Update ctime/mtime for libblkid */ 2279 update_dev_time(device_path); 2280 return ret; 2281 2282 error_trans: 2283 btrfs_end_transaction(trans, root); 2284 rcu_string_free(device->name); 2285 btrfs_kobj_rm_device(root->fs_info, device); 2286 kfree(device); 2287 error: 2288 blkdev_put(bdev, FMODE_EXCL); 2289 if (seeding_dev) { 2290 mutex_unlock(&uuid_mutex); 2291 up_write(&sb->s_umount); 2292 } 2293 return ret; 2294 } 2295 2296 int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, 2297 struct btrfs_device *srcdev, 2298 struct btrfs_device **device_out) 2299 { 2300 struct request_queue *q; 2301 struct btrfs_device *device; 2302 struct block_device *bdev; 2303 struct btrfs_fs_info *fs_info = root->fs_info; 2304 struct list_head *devices; 2305 struct rcu_string *name; 2306 u64 devid = BTRFS_DEV_REPLACE_DEVID; 2307 int ret = 0; 2308 2309 *device_out = NULL; 2310 if (fs_info->fs_devices->seeding) { 2311 btrfs_err(fs_info, "the filesystem is a seed filesystem!"); 2312 return -EINVAL; 2313 } 2314 2315 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2316 fs_info->bdev_holder); 2317 if (IS_ERR(bdev)) { 2318 btrfs_err(fs_info, "target device %s is invalid!", device_path); 2319 return PTR_ERR(bdev); 2320 } 2321 2322 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2323 2324 devices = &fs_info->fs_devices->devices; 2325 list_for_each_entry(device, devices, dev_list) { 2326 if (device->bdev == bdev) { 2327 btrfs_err(fs_info, "target device is in the filesystem!"); 2328 ret = -EEXIST; 2329 goto error; 2330 } 2331 } 2332 2333 2334 if (i_size_read(bdev->bd_inode) < 2335 btrfs_device_get_total_bytes(srcdev)) { 2336 btrfs_err(fs_info, "target device is smaller than source device!"); 2337 ret = -EINVAL; 2338 goto error; 2339 } 2340 2341 2342 device = btrfs_alloc_device(NULL, &devid, NULL); 2343 if (IS_ERR(device)) { 2344 ret = PTR_ERR(device); 2345 goto error; 2346 } 2347 2348 name = rcu_string_strdup(device_path, GFP_NOFS); 2349 if (!name) { 2350 kfree(device); 2351 ret = -ENOMEM; 2352 goto error; 2353 } 2354 rcu_assign_pointer(device->name, name); 2355 2356 q = bdev_get_queue(bdev); 2357 if (blk_queue_discard(q)) 2358 device->can_discard = 1; 2359 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2360 device->writeable = 1; 2361 device->generation = 0; 2362 device->io_width = root->sectorsize; 2363 device->io_align = root->sectorsize; 2364 device->sector_size = root->sectorsize; 2365 device->total_bytes = btrfs_device_get_total_bytes(srcdev); 2366 device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); 2367 device->bytes_used = btrfs_device_get_bytes_used(srcdev); 2368 ASSERT(list_empty(&srcdev->resized_list)); 2369 device->commit_total_bytes = srcdev->commit_total_bytes; 2370 device->commit_bytes_used = 
device->bytes_used; 2371 device->dev_root = fs_info->dev_root; 2372 device->bdev = bdev; 2373 device->in_fs_metadata = 1; 2374 device->is_tgtdev_for_dev_replace = 1; 2375 device->mode = FMODE_EXCL; 2376 device->dev_stats_valid = 1; 2377 set_blocksize(device->bdev, 4096); 2378 device->fs_devices = fs_info->fs_devices; 2379 list_add(&device->dev_list, &fs_info->fs_devices->devices); 2380 fs_info->fs_devices->num_devices++; 2381 fs_info->fs_devices->open_devices++; 2382 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2383 2384 *device_out = device; 2385 return ret; 2386 2387 error: 2388 blkdev_put(bdev, FMODE_EXCL); 2389 return ret; 2390 } 2391 2392 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 2393 struct btrfs_device *tgtdev) 2394 { 2395 WARN_ON(fs_info->fs_devices->rw_devices == 0); 2396 tgtdev->io_width = fs_info->dev_root->sectorsize; 2397 tgtdev->io_align = fs_info->dev_root->sectorsize; 2398 tgtdev->sector_size = fs_info->dev_root->sectorsize; 2399 tgtdev->dev_root = fs_info->dev_root; 2400 tgtdev->in_fs_metadata = 1; 2401 } 2402 2403 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2404 struct btrfs_device *device) 2405 { 2406 int ret; 2407 struct btrfs_path *path; 2408 struct btrfs_root *root; 2409 struct btrfs_dev_item *dev_item; 2410 struct extent_buffer *leaf; 2411 struct btrfs_key key; 2412 2413 root = device->dev_root->fs_info->chunk_root; 2414 2415 path = btrfs_alloc_path(); 2416 if (!path) 2417 return -ENOMEM; 2418 2419 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2420 key.type = BTRFS_DEV_ITEM_KEY; 2421 key.offset = device->devid; 2422 2423 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2424 if (ret < 0) 2425 goto out; 2426 2427 if (ret > 0) { 2428 ret = -ENOENT; 2429 goto out; 2430 } 2431 2432 leaf = path->nodes[0]; 2433 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2434 2435 btrfs_set_device_id(leaf, dev_item, device->devid); 2436 btrfs_set_device_type(leaf, dev_item, device->type); 2437 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2438 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2439 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2440 btrfs_set_device_total_bytes(leaf, dev_item, 2441 btrfs_device_get_disk_total_bytes(device)); 2442 btrfs_set_device_bytes_used(leaf, dev_item, 2443 btrfs_device_get_bytes_used(device)); 2444 btrfs_mark_buffer_dirty(leaf); 2445 2446 out: 2447 btrfs_free_path(path); 2448 return ret; 2449 } 2450 2451 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2452 struct btrfs_device *device, u64 new_size) 2453 { 2454 struct btrfs_super_block *super_copy = 2455 device->dev_root->fs_info->super_copy; 2456 struct btrfs_fs_devices *fs_devices; 2457 u64 old_total; 2458 u64 diff; 2459 2460 if (!device->writeable) 2461 return -EACCES; 2462 2463 lock_chunks(device->dev_root); 2464 old_total = btrfs_super_total_bytes(super_copy); 2465 diff = new_size - device->total_bytes; 2466 2467 if (new_size <= device->total_bytes || 2468 device->is_tgtdev_for_dev_replace) { 2469 unlock_chunks(device->dev_root); 2470 return -EINVAL; 2471 } 2472 2473 fs_devices = device->dev_root->fs_info->fs_devices; 2474 2475 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2476 device->fs_devices->total_rw_bytes += diff; 2477 2478 btrfs_device_set_total_bytes(device, new_size); 2479 btrfs_device_set_disk_total_bytes(device, new_size); 2480 btrfs_clear_space_info_full(device->dev_root->fs_info); 2481 if 
(list_empty(&device->resized_list))
                list_add_tail(&device->resized_list,
                              &fs_devices->resized_devices);
        unlock_chunks(device->dev_root);

        return btrfs_update_device(trans, device);
}

static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root, u64 chunk_objectid,
                            u64 chunk_offset)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_key key;

        root = root->fs_info->chunk_root;
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = chunk_objectid;
        key.offset = chunk_offset;
        key.type = BTRFS_CHUNK_ITEM_KEY;

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0)
                goto out;
        else if (ret > 0) { /* Logic error or corruption */
                btrfs_error(root->fs_info, -ENOENT,
                            "Failed lookup while freeing chunk.");
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, root, path);
        if (ret < 0)
                btrfs_error(root->fs_info, ret,
                            "Failed to delete chunk item.");
out:
        btrfs_free_path(path);
        return ret;
}

static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid,
                               u64 chunk_offset)
{
        struct btrfs_super_block *super_copy = root->fs_info->super_copy;
        struct btrfs_disk_key *disk_key;
        struct btrfs_chunk *chunk;
        u8 *ptr;
        int ret = 0;
        u32 num_stripes;
        u32 array_size;
        u32 len = 0;
        u32 cur;
        struct btrfs_key key;

        lock_chunks(root);
        array_size = btrfs_super_sys_array_size(super_copy);

        ptr = super_copy->sys_chunk_array;
        cur = 0;

        while (cur < array_size) {
                disk_key = (struct btrfs_disk_key *)ptr;
                btrfs_disk_key_to_cpu(&key, disk_key);

                len = sizeof(*disk_key);

                if (key.type == BTRFS_CHUNK_ITEM_KEY) {
                        chunk = (struct btrfs_chunk *)(ptr + len);
                        num_stripes = btrfs_stack_chunk_num_stripes(chunk);
                        len += btrfs_chunk_item_size(num_stripes);
                } else {
                        ret = -EIO;
                        break;
                }
                if (key.objectid == chunk_objectid &&
                    key.offset == chunk_offset) {
                        memmove(ptr, ptr + len, array_size - (cur + len));
                        array_size -= len;
                        btrfs_set_super_sys_array_size(super_copy, array_size);
                } else {
                        ptr += len;
                        cur += len;
                }
        }
        unlock_chunks(root);
        return ret;
}

int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, u64 chunk_offset)
{
        struct extent_map_tree *em_tree;
        struct extent_map *em;
        struct btrfs_root *extent_root = root->fs_info->extent_root;
        struct map_lookup *map;
        u64 dev_extent_len = 0;
        u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
        int i, ret = 0;

        /* Just in case */
        root = root->fs_info->chunk_root;
        em_tree = &root->fs_info->mapping_tree.map_tree;

        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, chunk_offset, 1);
        read_unlock(&em_tree->lock);

        if (!em || em->start > chunk_offset ||
            em->start + em->len < chunk_offset) {
                /*
                 * This is a logic error, but we don't want to just rely on the
                 * user having built with ASSERT enabled, so if ASSERT doesn't
                 * do anything we still error out.
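                 * (The extent map covering chunk_offset must exist here; a
                 * missing or non-covering mapping means the caller handed us
                 * a bogus chunk_offset.)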
2598 */ 2599 ASSERT(0); 2600 if (em) 2601 free_extent_map(em); 2602 return -EINVAL; 2603 } 2604 map = (struct map_lookup *)em->bdev; 2605 2606 for (i = 0; i < map->num_stripes; i++) { 2607 struct btrfs_device *device = map->stripes[i].dev; 2608 ret = btrfs_free_dev_extent(trans, device, 2609 map->stripes[i].physical, 2610 &dev_extent_len); 2611 if (ret) { 2612 btrfs_abort_transaction(trans, root, ret); 2613 goto out; 2614 } 2615 2616 if (device->bytes_used > 0) { 2617 lock_chunks(root); 2618 btrfs_device_set_bytes_used(device, 2619 device->bytes_used - dev_extent_len); 2620 spin_lock(&root->fs_info->free_chunk_lock); 2621 root->fs_info->free_chunk_space += dev_extent_len; 2622 spin_unlock(&root->fs_info->free_chunk_lock); 2623 btrfs_clear_space_info_full(root->fs_info); 2624 unlock_chunks(root); 2625 } 2626 2627 if (map->stripes[i].dev) { 2628 ret = btrfs_update_device(trans, map->stripes[i].dev); 2629 if (ret) { 2630 btrfs_abort_transaction(trans, root, ret); 2631 goto out; 2632 } 2633 } 2634 } 2635 ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset); 2636 if (ret) { 2637 btrfs_abort_transaction(trans, root, ret); 2638 goto out; 2639 } 2640 2641 trace_btrfs_chunk_free(root, map, chunk_offset, em->len); 2642 2643 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2644 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 2645 if (ret) { 2646 btrfs_abort_transaction(trans, root, ret); 2647 goto out; 2648 } 2649 } 2650 2651 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em); 2652 if (ret) { 2653 btrfs_abort_transaction(trans, extent_root, ret); 2654 goto out; 2655 } 2656 2657 out: 2658 /* once for us */ 2659 free_extent_map(em); 2660 return ret; 2661 } 2662 2663 static int btrfs_relocate_chunk(struct btrfs_root *root, 2664 u64 chunk_objectid, 2665 u64 chunk_offset) 2666 { 2667 struct btrfs_root *extent_root; 2668 struct btrfs_trans_handle *trans; 2669 int ret; 2670 2671 root = root->fs_info->chunk_root; 2672 extent_root = root->fs_info->extent_root; 2673 2674 ret = btrfs_can_relocate(extent_root, chunk_offset); 2675 if (ret) 2676 return -ENOSPC; 2677 2678 /* step one, relocate all the extents inside this chunk */ 2679 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 2680 if (ret) 2681 return ret; 2682 2683 trans = btrfs_start_transaction(root, 0); 2684 if (IS_ERR(trans)) { 2685 ret = PTR_ERR(trans); 2686 btrfs_std_error(root->fs_info, ret); 2687 return ret; 2688 } 2689 2690 /* 2691 * step two, delete the device extents and the 2692 * chunk tree entries 2693 */ 2694 ret = btrfs_remove_chunk(trans, root, chunk_offset); 2695 btrfs_end_transaction(trans, root); 2696 return ret; 2697 } 2698 2699 static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 2700 { 2701 struct btrfs_root *chunk_root = root->fs_info->chunk_root; 2702 struct btrfs_path *path; 2703 struct extent_buffer *leaf; 2704 struct btrfs_chunk *chunk; 2705 struct btrfs_key key; 2706 struct btrfs_key found_key; 2707 u64 chunk_type; 2708 bool retried = false; 2709 int failed = 0; 2710 int ret; 2711 2712 path = btrfs_alloc_path(); 2713 if (!path) 2714 return -ENOMEM; 2715 2716 again: 2717 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2718 key.offset = (u64)-1; 2719 key.type = BTRFS_CHUNK_ITEM_KEY; 2720 2721 while (1) { 2722 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2723 if (ret < 0) 2724 goto error; 2725 BUG_ON(ret == 0); /* Corruption */ 2726 2727 ret = btrfs_previous_item(chunk_root, path, key.objectid, 2728 key.type); 2729 if (ret < 0) 2730 goto error; 2731 if 
(ret > 0) 2732 break; 2733 2734 leaf = path->nodes[0]; 2735 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2736 2737 chunk = btrfs_item_ptr(leaf, path->slots[0], 2738 struct btrfs_chunk); 2739 chunk_type = btrfs_chunk_type(leaf, chunk); 2740 btrfs_release_path(path); 2741 2742 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 2743 ret = btrfs_relocate_chunk(chunk_root, 2744 found_key.objectid, 2745 found_key.offset); 2746 if (ret == -ENOSPC) 2747 failed++; 2748 else 2749 BUG_ON(ret); 2750 } 2751 2752 if (found_key.offset == 0) 2753 break; 2754 key.offset = found_key.offset - 1; 2755 } 2756 ret = 0; 2757 if (failed && !retried) { 2758 failed = 0; 2759 retried = true; 2760 goto again; 2761 } else if (WARN_ON(failed && retried)) { 2762 ret = -ENOSPC; 2763 } 2764 error: 2765 btrfs_free_path(path); 2766 return ret; 2767 } 2768 2769 static int insert_balance_item(struct btrfs_root *root, 2770 struct btrfs_balance_control *bctl) 2771 { 2772 struct btrfs_trans_handle *trans; 2773 struct btrfs_balance_item *item; 2774 struct btrfs_disk_balance_args disk_bargs; 2775 struct btrfs_path *path; 2776 struct extent_buffer *leaf; 2777 struct btrfs_key key; 2778 int ret, err; 2779 2780 path = btrfs_alloc_path(); 2781 if (!path) 2782 return -ENOMEM; 2783 2784 trans = btrfs_start_transaction(root, 0); 2785 if (IS_ERR(trans)) { 2786 btrfs_free_path(path); 2787 return PTR_ERR(trans); 2788 } 2789 2790 key.objectid = BTRFS_BALANCE_OBJECTID; 2791 key.type = BTRFS_BALANCE_ITEM_KEY; 2792 key.offset = 0; 2793 2794 ret = btrfs_insert_empty_item(trans, root, path, &key, 2795 sizeof(*item)); 2796 if (ret) 2797 goto out; 2798 2799 leaf = path->nodes[0]; 2800 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 2801 2802 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); 2803 2804 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 2805 btrfs_set_balance_data(leaf, item, &disk_bargs); 2806 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 2807 btrfs_set_balance_meta(leaf, item, &disk_bargs); 2808 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 2809 btrfs_set_balance_sys(leaf, item, &disk_bargs); 2810 2811 btrfs_set_balance_flags(leaf, item, bctl->flags); 2812 2813 btrfs_mark_buffer_dirty(leaf); 2814 out: 2815 btrfs_free_path(path); 2816 err = btrfs_commit_transaction(trans, root); 2817 if (err && !ret) 2818 ret = err; 2819 return ret; 2820 } 2821 2822 static int del_balance_item(struct btrfs_root *root) 2823 { 2824 struct btrfs_trans_handle *trans; 2825 struct btrfs_path *path; 2826 struct btrfs_key key; 2827 int ret, err; 2828 2829 path = btrfs_alloc_path(); 2830 if (!path) 2831 return -ENOMEM; 2832 2833 trans = btrfs_start_transaction(root, 0); 2834 if (IS_ERR(trans)) { 2835 btrfs_free_path(path); 2836 return PTR_ERR(trans); 2837 } 2838 2839 key.objectid = BTRFS_BALANCE_OBJECTID; 2840 key.type = BTRFS_BALANCE_ITEM_KEY; 2841 key.offset = 0; 2842 2843 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2844 if (ret < 0) 2845 goto out; 2846 if (ret > 0) { 2847 ret = -ENOENT; 2848 goto out; 2849 } 2850 2851 ret = btrfs_del_item(trans, root, path); 2852 out: 2853 btrfs_free_path(path); 2854 err = btrfs_commit_transaction(trans, root); 2855 if (err && !ret) 2856 ret = err; 2857 return ret; 2858 } 2859 2860 /* 2861 * This is a heuristic used to reduce the number of chunks balanced on 2862 * resume after balance was interrupted. 
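 *
 * Concretely (see update_balance_args() below): chunk types that were being
 * converted get BTRFS_BALANCE_ARGS_SOFT, and the remaining types pick up a
 * usage filter of 90 (unless one is already set), so chunks that were
 * already balanced, and are therefore expected to be reasonably full, are
 * skipped on resume.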
 */
static void update_balance_args(struct btrfs_balance_control *bctl)
{
        /*
         * Turn on soft mode for chunk types that were being converted.
         */
        if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
                bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
        if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
                bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
        if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
                bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;

        /*
         * Turn on the usage filter if it is not already used. The idea is
         * that chunks that we have already balanced should be
         * reasonably full. Don't do it for chunks that are being
         * converted - that will keep us from relocating unconverted
         * (albeit full) chunks.
         */
        if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
            !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
                bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
                bctl->data.usage = 90;
        }
        if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
            !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
                bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
                bctl->sys.usage = 90;
        }
        if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
            !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
                bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
                bctl->meta.usage = 90;
        }
}

/*
 * Should be called with both balance and volume mutexes held to
 * serialize other volume operations (add_dev/rm_dev/resize) with
 * restriper. Same goes for unset_balance_control.
 */
static void set_balance_control(struct btrfs_balance_control *bctl)
{
        struct btrfs_fs_info *fs_info = bctl->fs_info;

        BUG_ON(fs_info->balance_ctl);

        spin_lock(&fs_info->balance_lock);
        fs_info->balance_ctl = bctl;
        spin_unlock(&fs_info->balance_lock);
}

static void unset_balance_control(struct btrfs_fs_info *fs_info)
{
        struct btrfs_balance_control *bctl = fs_info->balance_ctl;

        BUG_ON(!fs_info->balance_ctl);

        spin_lock(&fs_info->balance_lock);
        fs_info->balance_ctl = NULL;
        spin_unlock(&fs_info->balance_lock);

        kfree(bctl);
}

/*
 * Balance filters. Return 1 if chunk should be filtered out
 * (should not be balanced).
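 *
 * should_balance_chunk() applies these in order: profiles, usage, devid,
 * drange, vrange, soft convert and, last of all, the limit counter.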
2932 */ 2933 static int chunk_profiles_filter(u64 chunk_type, 2934 struct btrfs_balance_args *bargs) 2935 { 2936 chunk_type = chunk_to_extended(chunk_type) & 2937 BTRFS_EXTENDED_PROFILE_MASK; 2938 2939 if (bargs->profiles & chunk_type) 2940 return 0; 2941 2942 return 1; 2943 } 2944 2945 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 2946 struct btrfs_balance_args *bargs) 2947 { 2948 struct btrfs_block_group_cache *cache; 2949 u64 chunk_used, user_thresh; 2950 int ret = 1; 2951 2952 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2953 chunk_used = btrfs_block_group_used(&cache->item); 2954 2955 if (bargs->usage == 0) 2956 user_thresh = 1; 2957 else if (bargs->usage > 100) 2958 user_thresh = cache->key.offset; 2959 else 2960 user_thresh = div_factor_fine(cache->key.offset, 2961 bargs->usage); 2962 2963 if (chunk_used < user_thresh) 2964 ret = 0; 2965 2966 btrfs_put_block_group(cache); 2967 return ret; 2968 } 2969 2970 static int chunk_devid_filter(struct extent_buffer *leaf, 2971 struct btrfs_chunk *chunk, 2972 struct btrfs_balance_args *bargs) 2973 { 2974 struct btrfs_stripe *stripe; 2975 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 2976 int i; 2977 2978 for (i = 0; i < num_stripes; i++) { 2979 stripe = btrfs_stripe_nr(chunk, i); 2980 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 2981 return 0; 2982 } 2983 2984 return 1; 2985 } 2986 2987 /* [pstart, pend) */ 2988 static int chunk_drange_filter(struct extent_buffer *leaf, 2989 struct btrfs_chunk *chunk, 2990 u64 chunk_offset, 2991 struct btrfs_balance_args *bargs) 2992 { 2993 struct btrfs_stripe *stripe; 2994 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 2995 u64 stripe_offset; 2996 u64 stripe_length; 2997 int factor; 2998 int i; 2999 3000 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3001 return 0; 3002 3003 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 3004 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { 3005 factor = num_stripes / 2; 3006 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { 3007 factor = num_stripes - 1; 3008 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { 3009 factor = num_stripes - 2; 3010 } else { 3011 factor = num_stripes; 3012 } 3013 3014 for (i = 0; i < num_stripes; i++) { 3015 stripe = btrfs_stripe_nr(chunk, i); 3016 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3017 continue; 3018 3019 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3020 stripe_length = btrfs_chunk_length(leaf, chunk); 3021 stripe_length = div_u64(stripe_length, factor); 3022 3023 if (stripe_offset < bargs->pend && 3024 stripe_offset + stripe_length > bargs->pstart) 3025 return 0; 3026 } 3027 3028 return 1; 3029 } 3030 3031 /* [vstart, vend) */ 3032 static int chunk_vrange_filter(struct extent_buffer *leaf, 3033 struct btrfs_chunk *chunk, 3034 u64 chunk_offset, 3035 struct btrfs_balance_args *bargs) 3036 { 3037 if (chunk_offset < bargs->vend && 3038 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3039 /* at least part of the chunk is inside this vrange */ 3040 return 0; 3041 3042 return 1; 3043 } 3044 3045 static int chunk_soft_convert_filter(u64 chunk_type, 3046 struct btrfs_balance_args *bargs) 3047 { 3048 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3049 return 0; 3050 3051 chunk_type = chunk_to_extended(chunk_type) & 3052 BTRFS_EXTENDED_PROFILE_MASK; 3053 3054 if (bargs->target == chunk_type) 3055 return 1; 3056 3057 return 0; 3058 } 3059 3060 static int should_balance_chunk(struct 
btrfs_root *root, 3061 struct extent_buffer *leaf, 3062 struct btrfs_chunk *chunk, u64 chunk_offset) 3063 { 3064 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; 3065 struct btrfs_balance_args *bargs = NULL; 3066 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3067 3068 /* type filter */ 3069 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3070 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3071 return 0; 3072 } 3073 3074 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3075 bargs = &bctl->data; 3076 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3077 bargs = &bctl->sys; 3078 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3079 bargs = &bctl->meta; 3080 3081 /* profiles filter */ 3082 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3083 chunk_profiles_filter(chunk_type, bargs)) { 3084 return 0; 3085 } 3086 3087 /* usage filter */ 3088 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3089 chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { 3090 return 0; 3091 } 3092 3093 /* devid filter */ 3094 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3095 chunk_devid_filter(leaf, chunk, bargs)) { 3096 return 0; 3097 } 3098 3099 /* drange filter, makes sense only with devid filter */ 3100 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3101 chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { 3102 return 0; 3103 } 3104 3105 /* vrange filter */ 3106 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3107 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3108 return 0; 3109 } 3110 3111 /* soft profile changing mode */ 3112 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3113 chunk_soft_convert_filter(chunk_type, bargs)) { 3114 return 0; 3115 } 3116 3117 /* 3118 * limited by count, must be the last filter 3119 */ 3120 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3121 if (bargs->limit == 0) 3122 return 0; 3123 else 3124 bargs->limit--; 3125 } 3126 3127 return 1; 3128 } 3129 3130 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3131 { 3132 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3133 struct btrfs_root *chunk_root = fs_info->chunk_root; 3134 struct btrfs_root *dev_root = fs_info->dev_root; 3135 struct list_head *devices; 3136 struct btrfs_device *device; 3137 u64 old_size; 3138 u64 size_to_free; 3139 struct btrfs_chunk *chunk; 3140 struct btrfs_path *path; 3141 struct btrfs_key key; 3142 struct btrfs_key found_key; 3143 struct btrfs_trans_handle *trans; 3144 struct extent_buffer *leaf; 3145 int slot; 3146 int ret; 3147 int enospc_errors = 0; 3148 bool counting = true; 3149 u64 limit_data = bctl->data.limit; 3150 u64 limit_meta = bctl->meta.limit; 3151 u64 limit_sys = bctl->sys.limit; 3152 3153 /* step one make some room on all the devices */ 3154 devices = &fs_info->fs_devices->devices; 3155 list_for_each_entry(device, devices, dev_list) { 3156 old_size = btrfs_device_get_total_bytes(device); 3157 size_to_free = div_factor(old_size, 1); 3158 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 3159 if (!device->writeable || 3160 btrfs_device_get_total_bytes(device) - 3161 btrfs_device_get_bytes_used(device) > size_to_free || 3162 device->is_tgtdev_for_dev_replace) 3163 continue; 3164 3165 ret = btrfs_shrink_device(device, old_size - size_to_free); 3166 if (ret == -ENOSPC) 3167 break; 3168 BUG_ON(ret); 3169 3170 trans = btrfs_start_transaction(dev_root, 0); 3171 BUG_ON(IS_ERR(trans)); 3172 3173 ret = btrfs_grow_device(trans, device, old_size); 3174 BUG_ON(ret); 3175 3176 btrfs_end_transaction(trans, dev_root); 3177 } 3178 3179 /* step two, 
relocate all the chunks */ 3180 path = btrfs_alloc_path(); 3181 if (!path) { 3182 ret = -ENOMEM; 3183 goto error; 3184 } 3185 3186 /* zero out stat counters */ 3187 spin_lock(&fs_info->balance_lock); 3188 memset(&bctl->stat, 0, sizeof(bctl->stat)); 3189 spin_unlock(&fs_info->balance_lock); 3190 again: 3191 if (!counting) { 3192 bctl->data.limit = limit_data; 3193 bctl->meta.limit = limit_meta; 3194 bctl->sys.limit = limit_sys; 3195 } 3196 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3197 key.offset = (u64)-1; 3198 key.type = BTRFS_CHUNK_ITEM_KEY; 3199 3200 while (1) { 3201 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3202 atomic_read(&fs_info->balance_cancel_req)) { 3203 ret = -ECANCELED; 3204 goto error; 3205 } 3206 3207 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3208 if (ret < 0) 3209 goto error; 3210 3211 /* 3212 * this shouldn't happen, it means the last relocate 3213 * failed 3214 */ 3215 if (ret == 0) 3216 BUG(); /* FIXME break ? */ 3217 3218 ret = btrfs_previous_item(chunk_root, path, 0, 3219 BTRFS_CHUNK_ITEM_KEY); 3220 if (ret) { 3221 ret = 0; 3222 break; 3223 } 3224 3225 leaf = path->nodes[0]; 3226 slot = path->slots[0]; 3227 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3228 3229 if (found_key.objectid != key.objectid) 3230 break; 3231 3232 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3233 3234 if (!counting) { 3235 spin_lock(&fs_info->balance_lock); 3236 bctl->stat.considered++; 3237 spin_unlock(&fs_info->balance_lock); 3238 } 3239 3240 ret = should_balance_chunk(chunk_root, leaf, chunk, 3241 found_key.offset); 3242 btrfs_release_path(path); 3243 if (!ret) 3244 goto loop; 3245 3246 if (counting) { 3247 spin_lock(&fs_info->balance_lock); 3248 bctl->stat.expected++; 3249 spin_unlock(&fs_info->balance_lock); 3250 goto loop; 3251 } 3252 3253 ret = btrfs_relocate_chunk(chunk_root, 3254 found_key.objectid, 3255 found_key.offset); 3256 if (ret && ret != -ENOSPC) 3257 goto error; 3258 if (ret == -ENOSPC) { 3259 enospc_errors++; 3260 } else { 3261 spin_lock(&fs_info->balance_lock); 3262 bctl->stat.completed++; 3263 spin_unlock(&fs_info->balance_lock); 3264 } 3265 loop: 3266 if (found_key.offset == 0) 3267 break; 3268 key.offset = found_key.offset - 1; 3269 } 3270 3271 if (counting) { 3272 btrfs_release_path(path); 3273 counting = false; 3274 goto again; 3275 } 3276 error: 3277 btrfs_free_path(path); 3278 if (enospc_errors) { 3279 btrfs_info(fs_info, "%d enospc errors during balance", 3280 enospc_errors); 3281 if (!ret) 3282 ret = -ENOSPC; 3283 } 3284 3285 return ret; 3286 } 3287 3288 /** 3289 * alloc_profile_is_valid - see if a given profile is valid and reduced 3290 * @flags: profile to validate 3291 * @extended: if true @flags is treated as an extended profile 3292 */ 3293 static int alloc_profile_is_valid(u64 flags, int extended) 3294 { 3295 u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 3296 BTRFS_BLOCK_GROUP_PROFILE_MASK); 3297 3298 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 3299 3300 /* 1) check that all other bits are zeroed */ 3301 if (flags & ~mask) 3302 return 0; 3303 3304 /* 2) see if profile is reduced */ 3305 if (flags == 0) 3306 return !extended; /* "0" is valid for usual profiles */ 3307 3308 /* true if exactly one bit set */ 3309 return (flags & (flags - 1)) == 0; 3310 } 3311 3312 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3313 { 3314 /* cancel requested || normal exit path */ 3315 return atomic_read(&fs_info->balance_cancel_req) || 3316 (atomic_read(&fs_info->balance_pause_req) == 0 && 3317 atomic_read(&fs_info->balance_cancel_req) == 0); 3318 } 3319 3320 static void __cancel_balance(struct btrfs_fs_info *fs_info) 3321 { 3322 int ret; 3323 3324 unset_balance_control(fs_info); 3325 ret = del_balance_item(fs_info->tree_root); 3326 if (ret) 3327 btrfs_std_error(fs_info, ret); 3328 3329 atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 3330 } 3331 3332 /* 3333 * Should be called with both balance and volume mutexes held 3334 */ 3335 int btrfs_balance(struct btrfs_balance_control *bctl, 3336 struct btrfs_ioctl_balance_args *bargs) 3337 { 3338 struct btrfs_fs_info *fs_info = bctl->fs_info; 3339 u64 allowed; 3340 int mixed = 0; 3341 int ret; 3342 u64 num_devices; 3343 unsigned seq; 3344 3345 if (btrfs_fs_closing(fs_info) || 3346 atomic_read(&fs_info->balance_pause_req) || 3347 atomic_read(&fs_info->balance_cancel_req)) { 3348 ret = -EINVAL; 3349 goto out; 3350 } 3351 3352 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 3353 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 3354 mixed = 1; 3355 3356 /* 3357 * In case of mixed groups both data and meta should be picked, 3358 * and identical options should be given for both of them. 
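         * For example, converting only the data chunks of a mixed
         * filesystem, without an identical conversion of metadata, is
         * rejected with -EINVAL below.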
3359 */ 3360 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 3361 if (mixed && (bctl->flags & allowed)) { 3362 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 3363 !(bctl->flags & BTRFS_BALANCE_METADATA) || 3364 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 3365 btrfs_err(fs_info, "with mixed groups data and " 3366 "metadata balance options must be the same"); 3367 ret = -EINVAL; 3368 goto out; 3369 } 3370 } 3371 3372 num_devices = fs_info->fs_devices->num_devices; 3373 btrfs_dev_replace_lock(&fs_info->dev_replace); 3374 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 3375 BUG_ON(num_devices < 1); 3376 num_devices--; 3377 } 3378 btrfs_dev_replace_unlock(&fs_info->dev_replace); 3379 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3380 if (num_devices == 1) 3381 allowed |= BTRFS_BLOCK_GROUP_DUP; 3382 else if (num_devices > 1) 3383 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3384 if (num_devices > 2) 3385 allowed |= BTRFS_BLOCK_GROUP_RAID5; 3386 if (num_devices > 3) 3387 allowed |= (BTRFS_BLOCK_GROUP_RAID10 | 3388 BTRFS_BLOCK_GROUP_RAID6); 3389 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3390 (!alloc_profile_is_valid(bctl->data.target, 1) || 3391 (bctl->data.target & ~allowed))) { 3392 btrfs_err(fs_info, "unable to start balance with target " 3393 "data profile %llu", 3394 bctl->data.target); 3395 ret = -EINVAL; 3396 goto out; 3397 } 3398 if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3399 (!alloc_profile_is_valid(bctl->meta.target, 1) || 3400 (bctl->meta.target & ~allowed))) { 3401 btrfs_err(fs_info, 3402 "unable to start balance with target metadata profile %llu", 3403 bctl->meta.target); 3404 ret = -EINVAL; 3405 goto out; 3406 } 3407 if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3408 (!alloc_profile_is_valid(bctl->sys.target, 1) || 3409 (bctl->sys.target & ~allowed))) { 3410 btrfs_err(fs_info, 3411 "unable to start balance with target system profile %llu", 3412 bctl->sys.target); 3413 ret = -EINVAL; 3414 goto out; 3415 } 3416 3417 /* allow dup'ed data chunks only in mixed mode */ 3418 if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3419 (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { 3420 btrfs_err(fs_info, "dup for data is not allowed"); 3421 ret = -EINVAL; 3422 goto out; 3423 } 3424 3425 /* allow to reduce meta or sys integrity only if force set */ 3426 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3427 BTRFS_BLOCK_GROUP_RAID10 | 3428 BTRFS_BLOCK_GROUP_RAID5 | 3429 BTRFS_BLOCK_GROUP_RAID6; 3430 do { 3431 seq = read_seqbegin(&fs_info->profiles_lock); 3432 3433 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3434 (fs_info->avail_system_alloc_bits & allowed) && 3435 !(bctl->sys.target & allowed)) || 3436 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3437 (fs_info->avail_metadata_alloc_bits & allowed) && 3438 !(bctl->meta.target & allowed))) { 3439 if (bctl->flags & BTRFS_BALANCE_FORCE) { 3440 btrfs_info(fs_info, "force reducing metadata integrity"); 3441 } else { 3442 btrfs_err(fs_info, "balance will reduce metadata " 3443 "integrity, use force if you want this"); 3444 ret = -EINVAL; 3445 goto out; 3446 } 3447 } 3448 } while (read_seqretry(&fs_info->profiles_lock, seq)); 3449 3450 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3451 int num_tolerated_disk_barrier_failures; 3452 u64 target = bctl->sys.target; 3453 3454 num_tolerated_disk_barrier_failures = 3455 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 3456 if (num_tolerated_disk_barrier_failures > 0 && 3457 (target & 
3458 (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | 3459 BTRFS_AVAIL_ALLOC_BIT_SINGLE))) 3460 num_tolerated_disk_barrier_failures = 0; 3461 else if (num_tolerated_disk_barrier_failures > 1 && 3462 (target & 3463 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))) 3464 num_tolerated_disk_barrier_failures = 1; 3465 3466 fs_info->num_tolerated_disk_barrier_failures = 3467 num_tolerated_disk_barrier_failures; 3468 } 3469 3470 ret = insert_balance_item(fs_info->tree_root, bctl); 3471 if (ret && ret != -EEXIST) 3472 goto out; 3473 3474 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 3475 BUG_ON(ret == -EEXIST); 3476 set_balance_control(bctl); 3477 } else { 3478 BUG_ON(ret != -EEXIST); 3479 spin_lock(&fs_info->balance_lock); 3480 update_balance_args(bctl); 3481 spin_unlock(&fs_info->balance_lock); 3482 } 3483 3484 atomic_inc(&fs_info->balance_running); 3485 mutex_unlock(&fs_info->balance_mutex); 3486 3487 ret = __btrfs_balance(fs_info); 3488 3489 mutex_lock(&fs_info->balance_mutex); 3490 atomic_dec(&fs_info->balance_running); 3491 3492 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3493 fs_info->num_tolerated_disk_barrier_failures = 3494 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 3495 } 3496 3497 if (bargs) { 3498 memset(bargs, 0, sizeof(*bargs)); 3499 update_ioctl_balance_args(fs_info, 0, bargs); 3500 } 3501 3502 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 3503 balance_need_close(fs_info)) { 3504 __cancel_balance(fs_info); 3505 } 3506 3507 wake_up(&fs_info->balance_wait_q); 3508 3509 return ret; 3510 out: 3511 if (bctl->flags & BTRFS_BALANCE_RESUME) 3512 __cancel_balance(fs_info); 3513 else { 3514 kfree(bctl); 3515 atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 3516 } 3517 return ret; 3518 } 3519 3520 static int balance_kthread(void *data) 3521 { 3522 struct btrfs_fs_info *fs_info = data; 3523 int ret = 0; 3524 3525 mutex_lock(&fs_info->volume_mutex); 3526 mutex_lock(&fs_info->balance_mutex); 3527 3528 if (fs_info->balance_ctl) { 3529 btrfs_info(fs_info, "continuing balance"); 3530 ret = btrfs_balance(fs_info->balance_ctl, NULL); 3531 } 3532 3533 mutex_unlock(&fs_info->balance_mutex); 3534 mutex_unlock(&fs_info->volume_mutex); 3535 3536 return ret; 3537 } 3538 3539 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 3540 { 3541 struct task_struct *tsk; 3542 3543 spin_lock(&fs_info->balance_lock); 3544 if (!fs_info->balance_ctl) { 3545 spin_unlock(&fs_info->balance_lock); 3546 return 0; 3547 } 3548 spin_unlock(&fs_info->balance_lock); 3549 3550 if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { 3551 btrfs_info(fs_info, "force skipping balance"); 3552 return 0; 3553 } 3554 3555 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3556 return PTR_ERR_OR_ZERO(tsk); 3557 } 3558 3559 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 3560 { 3561 struct btrfs_balance_control *bctl; 3562 struct btrfs_balance_item *item; 3563 struct btrfs_disk_balance_args disk_bargs; 3564 struct btrfs_path *path; 3565 struct extent_buffer *leaf; 3566 struct btrfs_key key; 3567 int ret; 3568 3569 path = btrfs_alloc_path(); 3570 if (!path) 3571 return -ENOMEM; 3572 3573 key.objectid = BTRFS_BALANCE_OBJECTID; 3574 key.type = BTRFS_BALANCE_ITEM_KEY; 3575 key.offset = 0; 3576 3577 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 3578 if (ret < 0) 3579 goto out; 3580 if (ret > 0) { /* ret = -ENOENT; */ 3581 ret = 0; 3582 goto out; 3583 } 3584 3585 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 3586 if (!bctl) { 3587 ret = -ENOMEM; 3588 
goto out; 3589 } 3590 3591 leaf = path->nodes[0]; 3592 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3593 3594 bctl->fs_info = fs_info; 3595 bctl->flags = btrfs_balance_flags(leaf, item); 3596 bctl->flags |= BTRFS_BALANCE_RESUME; 3597 3598 btrfs_balance_data(leaf, item, &disk_bargs); 3599 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 3600 btrfs_balance_meta(leaf, item, &disk_bargs); 3601 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 3602 btrfs_balance_sys(leaf, item, &disk_bargs); 3603 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 3604 3605 WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); 3606 3607 mutex_lock(&fs_info->volume_mutex); 3608 mutex_lock(&fs_info->balance_mutex); 3609 3610 set_balance_control(bctl); 3611 3612 mutex_unlock(&fs_info->balance_mutex); 3613 mutex_unlock(&fs_info->volume_mutex); 3614 out: 3615 btrfs_free_path(path); 3616 return ret; 3617 } 3618 3619 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 3620 { 3621 int ret = 0; 3622 3623 mutex_lock(&fs_info->balance_mutex); 3624 if (!fs_info->balance_ctl) { 3625 mutex_unlock(&fs_info->balance_mutex); 3626 return -ENOTCONN; 3627 } 3628 3629 if (atomic_read(&fs_info->balance_running)) { 3630 atomic_inc(&fs_info->balance_pause_req); 3631 mutex_unlock(&fs_info->balance_mutex); 3632 3633 wait_event(fs_info->balance_wait_q, 3634 atomic_read(&fs_info->balance_running) == 0); 3635 3636 mutex_lock(&fs_info->balance_mutex); 3637 /* we are good with balance_ctl ripped off from under us */ 3638 BUG_ON(atomic_read(&fs_info->balance_running)); 3639 atomic_dec(&fs_info->balance_pause_req); 3640 } else { 3641 ret = -ENOTCONN; 3642 } 3643 3644 mutex_unlock(&fs_info->balance_mutex); 3645 return ret; 3646 } 3647 3648 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 3649 { 3650 if (fs_info->sb->s_flags & MS_RDONLY) 3651 return -EROFS; 3652 3653 mutex_lock(&fs_info->balance_mutex); 3654 if (!fs_info->balance_ctl) { 3655 mutex_unlock(&fs_info->balance_mutex); 3656 return -ENOTCONN; 3657 } 3658 3659 atomic_inc(&fs_info->balance_cancel_req); 3660 /* 3661 * if we are running just wait and return, balance item is 3662 * deleted in btrfs_balance in this case 3663 */ 3664 if (atomic_read(&fs_info->balance_running)) { 3665 mutex_unlock(&fs_info->balance_mutex); 3666 wait_event(fs_info->balance_wait_q, 3667 atomic_read(&fs_info->balance_running) == 0); 3668 mutex_lock(&fs_info->balance_mutex); 3669 } else { 3670 /* __cancel_balance needs volume_mutex */ 3671 mutex_unlock(&fs_info->balance_mutex); 3672 mutex_lock(&fs_info->volume_mutex); 3673 mutex_lock(&fs_info->balance_mutex); 3674 3675 if (fs_info->balance_ctl) 3676 __cancel_balance(fs_info); 3677 3678 mutex_unlock(&fs_info->volume_mutex); 3679 } 3680 3681 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); 3682 atomic_dec(&fs_info->balance_cancel_req); 3683 mutex_unlock(&fs_info->balance_mutex); 3684 return 0; 3685 } 3686 3687 static int btrfs_uuid_scan_kthread(void *data) 3688 { 3689 struct btrfs_fs_info *fs_info = data; 3690 struct btrfs_root *root = fs_info->tree_root; 3691 struct btrfs_key key; 3692 struct btrfs_key max_key; 3693 struct btrfs_path *path = NULL; 3694 int ret = 0; 3695 struct extent_buffer *eb; 3696 int slot; 3697 struct btrfs_root_item root_item; 3698 u32 item_size; 3699 struct btrfs_trans_handle *trans = NULL; 3700 3701 path = btrfs_alloc_path(); 3702 if (!path) { 3703 ret = -ENOMEM; 3704 goto out; 3705 } 3706 3707 key.objectid = 0; 3708 key.type = 
BTRFS_ROOT_ITEM_KEY;
        key.offset = 0;

        max_key.objectid = (u64)-1;
        max_key.type = BTRFS_ROOT_ITEM_KEY;
        max_key.offset = (u64)-1;

        while (1) {
                ret = btrfs_search_forward(root, &key, path, 0);
                if (ret) {
                        if (ret > 0)
                                ret = 0;
                        break;
                }

                if (key.type != BTRFS_ROOT_ITEM_KEY ||
                    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
                     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
                    key.objectid > BTRFS_LAST_FREE_OBJECTID)
                        goto skip;

                eb = path->nodes[0];
                slot = path->slots[0];
                item_size = btrfs_item_size_nr(eb, slot);
                if (item_size < sizeof(root_item))
                        goto skip;

                read_extent_buffer(eb, &root_item,
                                   btrfs_item_ptr_offset(eb, slot),
                                   (int)sizeof(root_item));
                if (btrfs_root_refs(&root_item) == 0)
                        goto skip;

                if (!btrfs_is_empty_uuid(root_item.uuid) ||
                    !btrfs_is_empty_uuid(root_item.received_uuid)) {
                        if (trans)
                                goto update_tree;

                        btrfs_release_path(path);
                        /*
                         * 1 - subvol uuid item
                         * 1 - received_subvol uuid item
                         */
                        trans = btrfs_start_transaction(fs_info->uuid_root, 2);
                        if (IS_ERR(trans)) {
                                ret = PTR_ERR(trans);
                                break;
                        }
                        continue;
                } else {
                        goto skip;
                }
update_tree:
                if (!btrfs_is_empty_uuid(root_item.uuid)) {
                        ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
                                                  root_item.uuid,
                                                  BTRFS_UUID_KEY_SUBVOL,
                                                  key.objectid);
                        if (ret < 0) {
                                btrfs_warn(fs_info, "uuid_tree_add failed %d",
                                           ret);
                                break;
                        }
                }

                if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
                        ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
                                                  root_item.received_uuid,
                                                  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
                                                  key.objectid);
                        if (ret < 0) {
                                btrfs_warn(fs_info, "uuid_tree_add failed %d",
                                           ret);
                                break;
                        }
                }

skip:
                if (trans) {
                        ret = btrfs_end_transaction(trans, fs_info->uuid_root);
                        trans = NULL;
                        if (ret)
                                break;
                }

                btrfs_release_path(path);
                if (key.offset < (u64)-1) {
                        key.offset++;
                } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
                        key.offset = 0;
                        key.type = BTRFS_ROOT_ITEM_KEY;
                } else if (key.objectid < (u64)-1) {
                        key.offset = 0;
                        key.type = BTRFS_ROOT_ITEM_KEY;
                        key.objectid++;
                } else {
                        break;
                }
                cond_resched();
        }

out:
        btrfs_free_path(path);
        if (trans && !IS_ERR(trans))
                btrfs_end_transaction(trans, fs_info->uuid_root);
        if (ret)
                btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
        else
                fs_info->update_uuid_tree_gen = 1;
        up(&fs_info->uuid_tree_rescan_sem);
        return 0;
}

/*
 * Callback for btrfs_uuid_tree_iterate().
 * returns:
 *      0    check succeeded, the entry is not outdated.
 *      < 0  if an error occurred.
 *      > 0  if the check failed, which means the caller shall remove the entry.
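 *
 * Used by btrfs_uuid_rescan_kthread() below via btrfs_uuid_tree_iterate()
 * to drop UUID tree entries whose subvolume no longer exists or whose
 * UUIDs no longer match.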
3827 */ 3828 static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, 3829 u8 *uuid, u8 type, u64 subid) 3830 { 3831 struct btrfs_key key; 3832 int ret = 0; 3833 struct btrfs_root *subvol_root; 3834 3835 if (type != BTRFS_UUID_KEY_SUBVOL && 3836 type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) 3837 goto out; 3838 3839 key.objectid = subid; 3840 key.type = BTRFS_ROOT_ITEM_KEY; 3841 key.offset = (u64)-1; 3842 subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); 3843 if (IS_ERR(subvol_root)) { 3844 ret = PTR_ERR(subvol_root); 3845 if (ret == -ENOENT) 3846 ret = 1; 3847 goto out; 3848 } 3849 3850 switch (type) { 3851 case BTRFS_UUID_KEY_SUBVOL: 3852 if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) 3853 ret = 1; 3854 break; 3855 case BTRFS_UUID_KEY_RECEIVED_SUBVOL: 3856 if (memcmp(uuid, subvol_root->root_item.received_uuid, 3857 BTRFS_UUID_SIZE)) 3858 ret = 1; 3859 break; 3860 } 3861 3862 out: 3863 return ret; 3864 } 3865 3866 static int btrfs_uuid_rescan_kthread(void *data) 3867 { 3868 struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; 3869 int ret; 3870 3871 /* 3872 * 1st step is to iterate through the existing UUID tree and 3873 * to delete all entries that contain outdated data. 3874 * 2nd step is to add all missing entries to the UUID tree. 3875 */ 3876 ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); 3877 if (ret < 0) { 3878 btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); 3879 up(&fs_info->uuid_tree_rescan_sem); 3880 return ret; 3881 } 3882 return btrfs_uuid_scan_kthread(data); 3883 } 3884 3885 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 3886 { 3887 struct btrfs_trans_handle *trans; 3888 struct btrfs_root *tree_root = fs_info->tree_root; 3889 struct btrfs_root *uuid_root; 3890 struct task_struct *task; 3891 int ret; 3892 3893 /* 3894 * 1 - root node 3895 * 1 - root item 3896 */ 3897 trans = btrfs_start_transaction(tree_root, 2); 3898 if (IS_ERR(trans)) 3899 return PTR_ERR(trans); 3900 3901 uuid_root = btrfs_create_tree(trans, fs_info, 3902 BTRFS_UUID_TREE_OBJECTID); 3903 if (IS_ERR(uuid_root)) { 3904 btrfs_abort_transaction(trans, tree_root, 3905 PTR_ERR(uuid_root)); 3906 return PTR_ERR(uuid_root); 3907 } 3908 3909 fs_info->uuid_root = uuid_root; 3910 3911 ret = btrfs_commit_transaction(trans, tree_root); 3912 if (ret) 3913 return ret; 3914 3915 down(&fs_info->uuid_tree_rescan_sem); 3916 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 3917 if (IS_ERR(task)) { 3918 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 3919 btrfs_warn(fs_info, "failed to start uuid_scan task"); 3920 up(&fs_info->uuid_tree_rescan_sem); 3921 return PTR_ERR(task); 3922 } 3923 3924 return 0; 3925 } 3926 3927 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) 3928 { 3929 struct task_struct *task; 3930 3931 down(&fs_info->uuid_tree_rescan_sem); 3932 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); 3933 if (IS_ERR(task)) { 3934 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 3935 btrfs_warn(fs_info, "failed to start uuid_rescan task"); 3936 up(&fs_info->uuid_tree_rescan_sem); 3937 return PTR_ERR(task); 3938 } 3939 3940 return 0; 3941 } 3942 3943 /* 3944 * shrinking a device means finding all of the device extents past 3945 * the new size, and then following the back refs to the chunks. 
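 * Each such chunk is relocated with btrfs_relocate_chunk(), one at a time.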
3946 * The chunk relocation code actually frees the device extent 3947 */ 3948 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 3949 { 3950 struct btrfs_trans_handle *trans; 3951 struct btrfs_root *root = device->dev_root; 3952 struct btrfs_dev_extent *dev_extent = NULL; 3953 struct btrfs_path *path; 3954 u64 length; 3955 u64 chunk_objectid; 3956 u64 chunk_offset; 3957 int ret; 3958 int slot; 3959 int failed = 0; 3960 bool retried = false; 3961 struct extent_buffer *l; 3962 struct btrfs_key key; 3963 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 3964 u64 old_total = btrfs_super_total_bytes(super_copy); 3965 u64 old_size = btrfs_device_get_total_bytes(device); 3966 u64 diff = old_size - new_size; 3967 3968 if (device->is_tgtdev_for_dev_replace) 3969 return -EINVAL; 3970 3971 path = btrfs_alloc_path(); 3972 if (!path) 3973 return -ENOMEM; 3974 3975 path->reada = 2; 3976 3977 lock_chunks(root); 3978 3979 btrfs_device_set_total_bytes(device, new_size); 3980 if (device->writeable) { 3981 device->fs_devices->total_rw_bytes -= diff; 3982 spin_lock(&root->fs_info->free_chunk_lock); 3983 root->fs_info->free_chunk_space -= diff; 3984 spin_unlock(&root->fs_info->free_chunk_lock); 3985 } 3986 unlock_chunks(root); 3987 3988 again: 3989 key.objectid = device->devid; 3990 key.offset = (u64)-1; 3991 key.type = BTRFS_DEV_EXTENT_KEY; 3992 3993 do { 3994 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3995 if (ret < 0) 3996 goto done; 3997 3998 ret = btrfs_previous_item(root, path, 0, key.type); 3999 if (ret < 0) 4000 goto done; 4001 if (ret) { 4002 ret = 0; 4003 btrfs_release_path(path); 4004 break; 4005 } 4006 4007 l = path->nodes[0]; 4008 slot = path->slots[0]; 4009 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4010 4011 if (key.objectid != device->devid) { 4012 btrfs_release_path(path); 4013 break; 4014 } 4015 4016 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4017 length = btrfs_dev_extent_length(l, dev_extent); 4018 4019 if (key.offset + length <= new_size) { 4020 btrfs_release_path(path); 4021 break; 4022 } 4023 4024 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 4025 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4026 btrfs_release_path(path); 4027 4028 ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset); 4029 if (ret && ret != -ENOSPC) 4030 goto done; 4031 if (ret == -ENOSPC) 4032 failed++; 4033 } while (key.offset-- > 0); 4034 4035 if (failed && !retried) { 4036 failed = 0; 4037 retried = true; 4038 goto again; 4039 } else if (failed && retried) { 4040 ret = -ENOSPC; 4041 lock_chunks(root); 4042 4043 btrfs_device_set_total_bytes(device, old_size); 4044 if (device->writeable) 4045 device->fs_devices->total_rw_bytes += diff; 4046 spin_lock(&root->fs_info->free_chunk_lock); 4047 root->fs_info->free_chunk_space += diff; 4048 spin_unlock(&root->fs_info->free_chunk_lock); 4049 unlock_chunks(root); 4050 goto done; 4051 } 4052 4053 /* Shrinking succeeded, else we would be at "done". 
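         * All that is left is to shrink the on-disk view: update the
         * device's disk_total_bytes, queue it on the resized list and
         * reduce the superblock's total_bytes inside a short transaction.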
*/ 4054 trans = btrfs_start_transaction(root, 0); 4055 if (IS_ERR(trans)) { 4056 ret = PTR_ERR(trans); 4057 goto done; 4058 } 4059 4060 lock_chunks(root); 4061 btrfs_device_set_disk_total_bytes(device, new_size); 4062 if (list_empty(&device->resized_list)) 4063 list_add_tail(&device->resized_list, 4064 &root->fs_info->fs_devices->resized_devices); 4065 4066 WARN_ON(diff > old_total); 4067 btrfs_set_super_total_bytes(super_copy, old_total - diff); 4068 unlock_chunks(root); 4069 4070 /* Now btrfs_update_device() will change the on-disk size. */ 4071 ret = btrfs_update_device(trans, device); 4072 btrfs_end_transaction(trans, root); 4073 done: 4074 btrfs_free_path(path); 4075 return ret; 4076 } 4077 4078 static int btrfs_add_system_chunk(struct btrfs_root *root, 4079 struct btrfs_key *key, 4080 struct btrfs_chunk *chunk, int item_size) 4081 { 4082 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 4083 struct btrfs_disk_key disk_key; 4084 u32 array_size; 4085 u8 *ptr; 4086 4087 lock_chunks(root); 4088 array_size = btrfs_super_sys_array_size(super_copy); 4089 if (array_size + item_size + sizeof(disk_key) 4090 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { 4091 unlock_chunks(root); 4092 return -EFBIG; 4093 } 4094 4095 ptr = super_copy->sys_chunk_array + array_size; 4096 btrfs_cpu_key_to_disk(&disk_key, key); 4097 memcpy(ptr, &disk_key, sizeof(disk_key)); 4098 ptr += sizeof(disk_key); 4099 memcpy(ptr, chunk, item_size); 4100 item_size += sizeof(disk_key); 4101 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 4102 unlock_chunks(root); 4103 4104 return 0; 4105 } 4106 4107 /* 4108 * sort the devices in descending order by max_avail, total_avail 4109 */ 4110 static int btrfs_cmp_device_info(const void *a, const void *b) 4111 { 4112 const struct btrfs_device_info *di_a = a; 4113 const struct btrfs_device_info *di_b = b; 4114 4115 if (di_a->max_avail > di_b->max_avail) 4116 return -1; 4117 if (di_a->max_avail < di_b->max_avail) 4118 return 1; 4119 if (di_a->total_avail > di_b->total_avail) 4120 return -1; 4121 if (di_a->total_avail < di_b->total_avail) 4122 return 1; 4123 return 0; 4124 } 4125 4126 static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 4127 [BTRFS_RAID_RAID10] = { 4128 .sub_stripes = 2, 4129 .dev_stripes = 1, 4130 .devs_max = 0, /* 0 == as many as possible */ 4131 .devs_min = 4, 4132 .devs_increment = 2, 4133 .ncopies = 2, 4134 }, 4135 [BTRFS_RAID_RAID1] = { 4136 .sub_stripes = 1, 4137 .dev_stripes = 1, 4138 .devs_max = 2, 4139 .devs_min = 2, 4140 .devs_increment = 2, 4141 .ncopies = 2, 4142 }, 4143 [BTRFS_RAID_DUP] = { 4144 .sub_stripes = 1, 4145 .dev_stripes = 2, 4146 .devs_max = 1, 4147 .devs_min = 1, 4148 .devs_increment = 1, 4149 .ncopies = 2, 4150 }, 4151 [BTRFS_RAID_RAID0] = { 4152 .sub_stripes = 1, 4153 .dev_stripes = 1, 4154 .devs_max = 0, 4155 .devs_min = 2, 4156 .devs_increment = 1, 4157 .ncopies = 1, 4158 }, 4159 [BTRFS_RAID_SINGLE] = { 4160 .sub_stripes = 1, 4161 .dev_stripes = 1, 4162 .devs_max = 1, 4163 .devs_min = 1, 4164 .devs_increment = 1, 4165 .ncopies = 1, 4166 }, 4167 [BTRFS_RAID_RAID5] = { 4168 .sub_stripes = 1, 4169 .dev_stripes = 1, 4170 .devs_max = 0, 4171 .devs_min = 2, 4172 .devs_increment = 1, 4173 .ncopies = 2, 4174 }, 4175 [BTRFS_RAID_RAID6] = { 4176 .sub_stripes = 1, 4177 .dev_stripes = 1, 4178 .devs_max = 0, 4179 .devs_min = 3, 4180 .devs_increment = 1, 4181 .ncopies = 3, 4182 }, 4183 }; 4184 4185 static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) 4186 { 4187 /* TODO allow them to set 
a preferred stripe size */ 4188 return 64 * 1024; 4189 } 4190 4191 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 4192 { 4193 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 4194 return; 4195 4196 btrfs_set_fs_incompat(info, RAID56); 4197 } 4198 4199 #define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \ 4200 - sizeof(struct btrfs_item) \ 4201 - sizeof(struct btrfs_chunk)) \ 4202 / sizeof(struct btrfs_stripe) + 1) 4203 4204 #define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ 4205 - 2 * sizeof(struct btrfs_disk_key) \ 4206 - 2 * sizeof(struct btrfs_chunk)) \ 4207 / sizeof(struct btrfs_stripe) + 1) 4208 4209 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 4210 struct btrfs_root *extent_root, u64 start, 4211 u64 type) 4212 { 4213 struct btrfs_fs_info *info = extent_root->fs_info; 4214 struct btrfs_fs_devices *fs_devices = info->fs_devices; 4215 struct list_head *cur; 4216 struct map_lookup *map = NULL; 4217 struct extent_map_tree *em_tree; 4218 struct extent_map *em; 4219 struct btrfs_device_info *devices_info = NULL; 4220 u64 total_avail; 4221 int num_stripes; /* total number of stripes to allocate */ 4222 int data_stripes; /* number of stripes that count for 4223 block group size */ 4224 int sub_stripes; /* sub_stripes info for map */ 4225 int dev_stripes; /* stripes per dev */ 4226 int devs_max; /* max devs to use */ 4227 int devs_min; /* min devs needed */ 4228 int devs_increment; /* ndevs has to be a multiple of this */ 4229 int ncopies; /* how many copies to data has */ 4230 int ret; 4231 u64 max_stripe_size; 4232 u64 max_chunk_size; 4233 u64 stripe_size; 4234 u64 num_bytes; 4235 u64 raid_stripe_len = BTRFS_STRIPE_LEN; 4236 int ndevs; 4237 int i; 4238 int j; 4239 int index; 4240 4241 BUG_ON(!alloc_profile_is_valid(type, 0)); 4242 4243 if (list_empty(&fs_devices->alloc_list)) 4244 return -ENOSPC; 4245 4246 index = __get_raid_index(type); 4247 4248 sub_stripes = btrfs_raid_array[index].sub_stripes; 4249 dev_stripes = btrfs_raid_array[index].dev_stripes; 4250 devs_max = btrfs_raid_array[index].devs_max; 4251 devs_min = btrfs_raid_array[index].devs_min; 4252 devs_increment = btrfs_raid_array[index].devs_increment; 4253 ncopies = btrfs_raid_array[index].ncopies; 4254 4255 if (type & BTRFS_BLOCK_GROUP_DATA) { 4256 max_stripe_size = 1024 * 1024 * 1024; 4257 max_chunk_size = 10 * max_stripe_size; 4258 if (!devs_max) 4259 devs_max = BTRFS_MAX_DEVS(info->chunk_root); 4260 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 4261 /* for larger filesystems, use larger metadata chunks */ 4262 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) 4263 max_stripe_size = 1024 * 1024 * 1024; 4264 else 4265 max_stripe_size = 256 * 1024 * 1024; 4266 max_chunk_size = max_stripe_size; 4267 if (!devs_max) 4268 devs_max = BTRFS_MAX_DEVS(info->chunk_root); 4269 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 4270 max_stripe_size = 32 * 1024 * 1024; 4271 max_chunk_size = 2 * max_stripe_size; 4272 if (!devs_max) 4273 devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; 4274 } else { 4275 btrfs_err(info, "invalid chunk type 0x%llx requested", 4276 type); 4277 BUG_ON(1); 4278 } 4279 4280 /* we don't want a chunk larger than 10% of writeable space */ 4281 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 4282 max_chunk_size); 4283 4284 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 4285 GFP_NOFS); 4286 if (!devices_info) 4287 return -ENOMEM; 4288 4289 cur = fs_devices->alloc_list.next; 4290 4291 /* 4292 * in the first pass through the 
devices list, we gather information 4293 * about the available holes on each device. 4294 */ 4295 ndevs = 0; 4296 while (cur != &fs_devices->alloc_list) { 4297 struct btrfs_device *device; 4298 u64 max_avail; 4299 u64 dev_offset; 4300 4301 device = list_entry(cur, struct btrfs_device, dev_alloc_list); 4302 4303 cur = cur->next; 4304 4305 if (!device->writeable) { 4306 WARN(1, KERN_ERR 4307 "BTRFS: read-only device in alloc_list\n"); 4308 continue; 4309 } 4310 4311 if (!device->in_fs_metadata || 4312 device->is_tgtdev_for_dev_replace) 4313 continue; 4314 4315 if (device->total_bytes > device->bytes_used) 4316 total_avail = device->total_bytes - device->bytes_used; 4317 else 4318 total_avail = 0; 4319 4320 /* If there is no space on this device, skip it. */ 4321 if (total_avail == 0) 4322 continue; 4323 4324 ret = find_free_dev_extent(trans, device, 4325 max_stripe_size * dev_stripes, 4326 &dev_offset, &max_avail); 4327 if (ret && ret != -ENOSPC) 4328 goto error; 4329 4330 if (ret == 0) 4331 max_avail = max_stripe_size * dev_stripes; 4332 4333 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) 4334 continue; 4335 4336 if (ndevs == fs_devices->rw_devices) { 4337 WARN(1, "%s: found more than %llu devices\n", 4338 __func__, fs_devices->rw_devices); 4339 break; 4340 } 4341 devices_info[ndevs].dev_offset = dev_offset; 4342 devices_info[ndevs].max_avail = max_avail; 4343 devices_info[ndevs].total_avail = total_avail; 4344 devices_info[ndevs].dev = device; 4345 ++ndevs; 4346 } 4347 4348 /* 4349 * now sort the devices by hole size / available space 4350 */ 4351 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 4352 btrfs_cmp_device_info, NULL); 4353 4354 /* round down to number of usable stripes */ 4355 ndevs -= ndevs % devs_increment; 4356 4357 if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { 4358 ret = -ENOSPC; 4359 goto error; 4360 } 4361 4362 if (devs_max && ndevs > devs_max) 4363 ndevs = devs_max; 4364 /* 4365 * the primary goal is to maximize the number of stripes, so use as many 4366 * devices as possible, even if the stripes are not maximum sized. 
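 *
 * For example (illustrative numbers only): with three usable devices whose
 * largest holes are 10GiB, 8GiB and 4GiB, the sorted devices_info[] ends
 * with the 4GiB hole, so stripe_size below starts at 4GiB and num_stripes
 * becomes 3 * dev_stripes instead of using fewer, larger stripes.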
4367 */ 4368 stripe_size = devices_info[ndevs-1].max_avail; 4369 num_stripes = ndevs * dev_stripes; 4370 4371 /* 4372 * this will have to be fixed for RAID1 and RAID10 over 4373 * more drives 4374 */ 4375 data_stripes = num_stripes / ncopies; 4376 4377 if (type & BTRFS_BLOCK_GROUP_RAID5) { 4378 raid_stripe_len = find_raid56_stripe_len(ndevs - 1, 4379 btrfs_super_stripesize(info->super_copy)); 4380 data_stripes = num_stripes - 1; 4381 } 4382 if (type & BTRFS_BLOCK_GROUP_RAID6) { 4383 raid_stripe_len = find_raid56_stripe_len(ndevs - 2, 4384 btrfs_super_stripesize(info->super_copy)); 4385 data_stripes = num_stripes - 2; 4386 } 4387 4388 /* 4389 * Use the number of data stripes to figure out how big this chunk 4390 * is really going to be in terms of logical address space, 4391 * and compare that answer with the max chunk size 4392 */ 4393 if (stripe_size * data_stripes > max_chunk_size) { 4394 u64 mask = (1ULL << 24) - 1; 4395 4396 stripe_size = div_u64(max_chunk_size, data_stripes); 4397 4398 /* bump the answer up to a 16MB boundary */ 4399 stripe_size = (stripe_size + mask) & ~mask; 4400 4401 /* but don't go higher than the limits we found 4402 * while searching for free extents 4403 */ 4404 if (stripe_size > devices_info[ndevs-1].max_avail) 4405 stripe_size = devices_info[ndevs-1].max_avail; 4406 } 4407 4408 stripe_size = div_u64(stripe_size, dev_stripes); 4409 4410 /* align to BTRFS_STRIPE_LEN */ 4411 stripe_size = div_u64(stripe_size, raid_stripe_len); 4412 stripe_size *= raid_stripe_len; 4413 4414 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 4415 if (!map) { 4416 ret = -ENOMEM; 4417 goto error; 4418 } 4419 map->num_stripes = num_stripes; 4420 4421 for (i = 0; i < ndevs; ++i) { 4422 for (j = 0; j < dev_stripes; ++j) { 4423 int s = i * dev_stripes + j; 4424 map->stripes[s].dev = devices_info[i].dev; 4425 map->stripes[s].physical = devices_info[i].dev_offset + 4426 j * stripe_size; 4427 } 4428 } 4429 map->sector_size = extent_root->sectorsize; 4430 map->stripe_len = raid_stripe_len; 4431 map->io_align = raid_stripe_len; 4432 map->io_width = raid_stripe_len; 4433 map->type = type; 4434 map->sub_stripes = sub_stripes; 4435 4436 num_bytes = stripe_size * data_stripes; 4437 4438 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); 4439 4440 em = alloc_extent_map(); 4441 if (!em) { 4442 kfree(map); 4443 ret = -ENOMEM; 4444 goto error; 4445 } 4446 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 4447 em->bdev = (struct block_device *)map; 4448 em->start = start; 4449 em->len = num_bytes; 4450 em->block_start = 0; 4451 em->block_len = em->len; 4452 em->orig_block_len = stripe_size; 4453 4454 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 4455 write_lock(&em_tree->lock); 4456 ret = add_extent_mapping(em_tree, em, 0); 4457 if (!ret) { 4458 list_add_tail(&em->list, &trans->transaction->pending_chunks); 4459 atomic_inc(&em->refs); 4460 } 4461 write_unlock(&em_tree->lock); 4462 if (ret) { 4463 free_extent_map(em); 4464 goto error; 4465 } 4466 4467 ret = btrfs_make_block_group(trans, extent_root, 0, type, 4468 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 4469 start, num_bytes); 4470 if (ret) 4471 goto error_del_extent; 4472 4473 for (i = 0; i < map->num_stripes; i++) { 4474 num_bytes = map->stripes[i].dev->bytes_used + stripe_size; 4475 btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); 4476 } 4477 4478 spin_lock(&extent_root->fs_info->free_chunk_lock); 4479 extent_root->fs_info->free_chunk_space -= (stripe_size * 4480 map->num_stripes); 4481 
spin_unlock(&extent_root->fs_info->free_chunk_lock); 4482 4483 free_extent_map(em); 4484 check_raid56_incompat_flag(extent_root->fs_info, type); 4485 4486 kfree(devices_info); 4487 return 0; 4488 4489 error_del_extent: 4490 write_lock(&em_tree->lock); 4491 remove_extent_mapping(em_tree, em); 4492 write_unlock(&em_tree->lock); 4493 4494 /* One for our allocation */ 4495 free_extent_map(em); 4496 /* One for the tree reference */ 4497 free_extent_map(em); 4498 /* One for the pending_chunks list reference */ 4499 free_extent_map(em); 4500 error: 4501 kfree(devices_info); 4502 return ret; 4503 } 4504 4505 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, 4506 struct btrfs_root *extent_root, 4507 u64 chunk_offset, u64 chunk_size) 4508 { 4509 struct btrfs_key key; 4510 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 4511 struct btrfs_device *device; 4512 struct btrfs_chunk *chunk; 4513 struct btrfs_stripe *stripe; 4514 struct extent_map_tree *em_tree; 4515 struct extent_map *em; 4516 struct map_lookup *map; 4517 size_t item_size; 4518 u64 dev_offset; 4519 u64 stripe_size; 4520 int i = 0; 4521 int ret; 4522 4523 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 4524 read_lock(&em_tree->lock); 4525 em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size); 4526 read_unlock(&em_tree->lock); 4527 4528 if (!em) { 4529 btrfs_crit(extent_root->fs_info, "unable to find logical " 4530 "%Lu len %Lu", chunk_offset, chunk_size); 4531 return -EINVAL; 4532 } 4533 4534 if (em->start != chunk_offset || em->len != chunk_size) { 4535 btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted" 4536 " %Lu-%Lu, found %Lu-%Lu", chunk_offset, 4537 chunk_size, em->start, em->len); 4538 free_extent_map(em); 4539 return -EINVAL; 4540 } 4541 4542 map = (struct map_lookup *)em->bdev; 4543 item_size = btrfs_chunk_item_size(map->num_stripes); 4544 stripe_size = em->orig_block_len; 4545 4546 chunk = kzalloc(item_size, GFP_NOFS); 4547 if (!chunk) { 4548 ret = -ENOMEM; 4549 goto out; 4550 } 4551 4552 for (i = 0; i < map->num_stripes; i++) { 4553 device = map->stripes[i].dev; 4554 dev_offset = map->stripes[i].physical; 4555 4556 ret = btrfs_update_device(trans, device); 4557 if (ret) 4558 goto out; 4559 ret = btrfs_alloc_dev_extent(trans, device, 4560 chunk_root->root_key.objectid, 4561 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 4562 chunk_offset, dev_offset, 4563 stripe_size); 4564 if (ret) 4565 goto out; 4566 } 4567 4568 stripe = &chunk->stripe; 4569 for (i = 0; i < map->num_stripes; i++) { 4570 device = map->stripes[i].dev; 4571 dev_offset = map->stripes[i].physical; 4572 4573 btrfs_set_stack_stripe_devid(stripe, device->devid); 4574 btrfs_set_stack_stripe_offset(stripe, dev_offset); 4575 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 4576 stripe++; 4577 } 4578 4579 btrfs_set_stack_chunk_length(chunk, chunk_size); 4580 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 4581 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 4582 btrfs_set_stack_chunk_type(chunk, map->type); 4583 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 4584 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 4585 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 4586 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); 4587 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 4588 4589 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 4590 key.type = BTRFS_CHUNK_ITEM_KEY; 4591 key.offset = chunk_offset; 4592 4593 ret = btrfs_insert_item(trans, 
chunk_root, &key, chunk, item_size); 4594 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 4595 /* 4596 * TODO: Cleanup of inserted chunk root in case of 4597 * failure. 4598 */ 4599 ret = btrfs_add_system_chunk(chunk_root, &key, chunk, 4600 item_size); 4601 } 4602 4603 out: 4604 kfree(chunk); 4605 free_extent_map(em); 4606 return ret; 4607 } 4608 4609 /* 4610 * Chunk allocation falls into two parts. The first part does works 4611 * that make the new allocated chunk useable, but not do any operation 4612 * that modifies the chunk tree. The second part does the works that 4613 * require modifying the chunk tree. This division is important for the 4614 * bootstrap process of adding storage to a seed btrfs. 4615 */ 4616 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 4617 struct btrfs_root *extent_root, u64 type) 4618 { 4619 u64 chunk_offset; 4620 4621 chunk_offset = find_next_chunk(extent_root->fs_info); 4622 return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type); 4623 } 4624 4625 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 4626 struct btrfs_root *root, 4627 struct btrfs_device *device) 4628 { 4629 u64 chunk_offset; 4630 u64 sys_chunk_offset; 4631 u64 alloc_profile; 4632 struct btrfs_fs_info *fs_info = root->fs_info; 4633 struct btrfs_root *extent_root = fs_info->extent_root; 4634 int ret; 4635 4636 chunk_offset = find_next_chunk(fs_info); 4637 alloc_profile = btrfs_get_alloc_profile(extent_root, 0); 4638 ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset, 4639 alloc_profile); 4640 if (ret) 4641 return ret; 4642 4643 sys_chunk_offset = find_next_chunk(root->fs_info); 4644 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4645 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset, 4646 alloc_profile); 4647 return ret; 4648 } 4649 4650 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 4651 { 4652 int max_errors; 4653 4654 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4655 BTRFS_BLOCK_GROUP_RAID10 | 4656 BTRFS_BLOCK_GROUP_RAID5 | 4657 BTRFS_BLOCK_GROUP_DUP)) { 4658 max_errors = 1; 4659 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { 4660 max_errors = 2; 4661 } else { 4662 max_errors = 0; 4663 } 4664 4665 return max_errors; 4666 } 4667 4668 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 4669 { 4670 struct extent_map *em; 4671 struct map_lookup *map; 4672 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 4673 int readonly = 0; 4674 int miss_ndevs = 0; 4675 int i; 4676 4677 read_lock(&map_tree->map_tree.lock); 4678 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 4679 read_unlock(&map_tree->map_tree.lock); 4680 if (!em) 4681 return 1; 4682 4683 map = (struct map_lookup *)em->bdev; 4684 for (i = 0; i < map->num_stripes; i++) { 4685 if (map->stripes[i].dev->missing) { 4686 miss_ndevs++; 4687 continue; 4688 } 4689 4690 if (!map->stripes[i].dev->writeable) { 4691 readonly = 1; 4692 goto end; 4693 } 4694 } 4695 4696 /* 4697 * If the number of missing devices is larger than max errors, 4698 * we can not write the data into that chunk successfully, so 4699 * set it readonly. 
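 *
 * For example, a RAID1 or RAID10 chunk tolerates one missing device
 * (btrfs_chunk_max_errors() returns 1): it stays writeable with a single
 * stripe missing and is forced read-only once two are gone.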
4700 */ 4701 if (miss_ndevs > btrfs_chunk_max_errors(map)) 4702 readonly = 1; 4703 end: 4704 free_extent_map(em); 4705 return readonly; 4706 } 4707 4708 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 4709 { 4710 extent_map_tree_init(&tree->map_tree); 4711 } 4712 4713 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 4714 { 4715 struct extent_map *em; 4716 4717 while (1) { 4718 write_lock(&tree->map_tree.lock); 4719 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 4720 if (em) 4721 remove_extent_mapping(&tree->map_tree, em); 4722 write_unlock(&tree->map_tree.lock); 4723 if (!em) 4724 break; 4725 /* once for us */ 4726 free_extent_map(em); 4727 /* once for the tree */ 4728 free_extent_map(em); 4729 } 4730 } 4731 4732 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 4733 { 4734 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 4735 struct extent_map *em; 4736 struct map_lookup *map; 4737 struct extent_map_tree *em_tree = &map_tree->map_tree; 4738 int ret; 4739 4740 read_lock(&em_tree->lock); 4741 em = lookup_extent_mapping(em_tree, logical, len); 4742 read_unlock(&em_tree->lock); 4743 4744 /* 4745 * We could return errors for these cases, but that could get ugly and 4746 * we'd probably do the same thing which is just not do anything else 4747 * and exit, so return 1 so the callers don't try to use other copies. 4748 */ 4749 if (!em) { 4750 btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical, 4751 logical+len); 4752 return 1; 4753 } 4754 4755 if (em->start > logical || em->start + em->len < logical) { 4756 btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got " 4757 "%Lu-%Lu", logical, logical+len, em->start, 4758 em->start + em->len); 4759 free_extent_map(em); 4760 return 1; 4761 } 4762 4763 map = (struct map_lookup *)em->bdev; 4764 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 4765 ret = map->num_stripes; 4766 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4767 ret = map->sub_stripes; 4768 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 4769 ret = 2; 4770 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 4771 ret = 3; 4772 else 4773 ret = 1; 4774 free_extent_map(em); 4775 4776 btrfs_dev_replace_lock(&fs_info->dev_replace); 4777 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) 4778 ret++; 4779 btrfs_dev_replace_unlock(&fs_info->dev_replace); 4780 4781 return ret; 4782 } 4783 4784 unsigned long btrfs_full_stripe_len(struct btrfs_root *root, 4785 struct btrfs_mapping_tree *map_tree, 4786 u64 logical) 4787 { 4788 struct extent_map *em; 4789 struct map_lookup *map; 4790 struct extent_map_tree *em_tree = &map_tree->map_tree; 4791 unsigned long len = root->sectorsize; 4792 4793 read_lock(&em_tree->lock); 4794 em = lookup_extent_mapping(em_tree, logical, len); 4795 read_unlock(&em_tree->lock); 4796 BUG_ON(!em); 4797 4798 BUG_ON(em->start > logical || em->start + em->len < logical); 4799 map = (struct map_lookup *)em->bdev; 4800 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 4801 len = map->stripe_len * nr_data_stripes(map); 4802 free_extent_map(em); 4803 return len; 4804 } 4805 4806 int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, 4807 u64 logical, u64 len, int mirror_num) 4808 { 4809 struct extent_map *em; 4810 struct map_lookup *map; 4811 struct extent_map_tree *em_tree = &map_tree->map_tree; 4812 int ret = 0; 4813 4814 read_lock(&em_tree->lock); 4815 em = lookup_extent_mapping(em_tree, logical, len); 4816 read_unlock(&em_tree->lock); 4817 BUG_ON(!em); 4818 4819 BUG_ON(em->start > logical || 
em->start + em->len < logical); 4820 map = (struct map_lookup *)em->bdev; 4821 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 4822 ret = 1; 4823 free_extent_map(em); 4824 return ret; 4825 } 4826 4827 static int find_live_mirror(struct btrfs_fs_info *fs_info, 4828 struct map_lookup *map, int first, int num, 4829 int optimal, int dev_replace_is_ongoing) 4830 { 4831 int i; 4832 int tolerance; 4833 struct btrfs_device *srcdev; 4834 4835 if (dev_replace_is_ongoing && 4836 fs_info->dev_replace.cont_reading_from_srcdev_mode == 4837 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 4838 srcdev = fs_info->dev_replace.srcdev; 4839 else 4840 srcdev = NULL; 4841 4842 /* 4843 * try to avoid the drive that is the source drive for a 4844 * dev-replace procedure, only choose it if no other non-missing 4845 * mirror is available 4846 */ 4847 for (tolerance = 0; tolerance < 2; tolerance++) { 4848 if (map->stripes[optimal].dev->bdev && 4849 (tolerance || map->stripes[optimal].dev != srcdev)) 4850 return optimal; 4851 for (i = first; i < first + num; i++) { 4852 if (map->stripes[i].dev->bdev && 4853 (tolerance || map->stripes[i].dev != srcdev)) 4854 return i; 4855 } 4856 } 4857 4858 /* we couldn't find one that doesn't fail. Just return something 4859 * and the io error handling code will clean up eventually 4860 */ 4861 return optimal; 4862 } 4863 4864 static inline int parity_smaller(u64 a, u64 b) 4865 { 4866 return a > b; 4867 } 4868 4869 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 4870 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) 4871 { 4872 struct btrfs_bio_stripe s; 4873 int i; 4874 u64 l; 4875 int again = 1; 4876 4877 while (again) { 4878 again = 0; 4879 for (i = 0; i < num_stripes - 1; i++) { 4880 if (parity_smaller(bbio->raid_map[i], 4881 bbio->raid_map[i+1])) { 4882 s = bbio->stripes[i]; 4883 l = bbio->raid_map[i]; 4884 bbio->stripes[i] = bbio->stripes[i+1]; 4885 bbio->raid_map[i] = bbio->raid_map[i+1]; 4886 bbio->stripes[i+1] = s; 4887 bbio->raid_map[i+1] = l; 4888 4889 again = 1; 4890 } 4891 } 4892 } 4893 } 4894 4895 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) 4896 { 4897 struct btrfs_bio *bbio = kzalloc( 4898 /* the size of the btrfs_bio */ 4899 sizeof(struct btrfs_bio) + 4900 /* plus the variable array for the stripes */ 4901 sizeof(struct btrfs_bio_stripe) * (total_stripes) + 4902 /* plus the variable array for the tgt dev */ 4903 sizeof(int) * (real_stripes) + 4904 /* 4905 * plus the raid_map, which includes both the tgt dev 4906 * and the stripes 4907 */ 4908 sizeof(u64) * (total_stripes), 4909 GFP_NOFS); 4910 if (!bbio) 4911 return NULL; 4912 4913 atomic_set(&bbio->error, 0); 4914 atomic_set(&bbio->refs, 1); 4915 4916 return bbio; 4917 } 4918 4919 void btrfs_get_bbio(struct btrfs_bio *bbio) 4920 { 4921 WARN_ON(!atomic_read(&bbio->refs)); 4922 atomic_inc(&bbio->refs); 4923 } 4924 4925 void btrfs_put_bbio(struct btrfs_bio *bbio) 4926 { 4927 if (!bbio) 4928 return; 4929 if (atomic_dec_and_test(&bbio->refs)) 4930 kfree(bbio); 4931 } 4932 4933 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4934 u64 logical, u64 *length, 4935 struct btrfs_bio **bbio_ret, 4936 int mirror_num, int need_raid_map) 4937 { 4938 struct extent_map *em; 4939 struct map_lookup *map; 4940 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 4941 struct extent_map_tree *em_tree = &map_tree->map_tree; 4942 u64 offset; 4943 u64 stripe_offset; 4944 u64 stripe_end_offset; 4945 u64 stripe_nr; 4946 u64 
stripe_nr_orig; 4947 u64 stripe_nr_end; 4948 u64 stripe_len; 4949 u32 stripe_index; 4950 int i; 4951 int ret = 0; 4952 int num_stripes; 4953 int max_errors = 0; 4954 int tgtdev_indexes = 0; 4955 struct btrfs_bio *bbio = NULL; 4956 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 4957 int dev_replace_is_ongoing = 0; 4958 int num_alloc_stripes; 4959 int patch_the_first_stripe_for_dev_replace = 0; 4960 u64 physical_to_patch_in_first_stripe = 0; 4961 u64 raid56_full_stripe_start = (u64)-1; 4962 4963 read_lock(&em_tree->lock); 4964 em = lookup_extent_mapping(em_tree, logical, *length); 4965 read_unlock(&em_tree->lock); 4966 4967 if (!em) { 4968 btrfs_crit(fs_info, "unable to find logical %llu len %llu", 4969 logical, *length); 4970 return -EINVAL; 4971 } 4972 4973 if (em->start > logical || em->start + em->len < logical) { 4974 btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, " 4975 "found %Lu-%Lu", logical, em->start, 4976 em->start + em->len); 4977 free_extent_map(em); 4978 return -EINVAL; 4979 } 4980 4981 map = (struct map_lookup *)em->bdev; 4982 offset = logical - em->start; 4983 4984 stripe_len = map->stripe_len; 4985 stripe_nr = offset; 4986 /* 4987 * stripe_nr counts the total number of stripes we have to stride 4988 * to get to this block 4989 */ 4990 stripe_nr = div64_u64(stripe_nr, stripe_len); 4991 4992 stripe_offset = stripe_nr * stripe_len; 4993 BUG_ON(offset < stripe_offset); 4994 4995 /* stripe_offset is the offset of this block in its stripe*/ 4996 stripe_offset = offset - stripe_offset; 4997 4998 /* if we're here for raid56, we need to know the stripe aligned start */ 4999 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5000 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 5001 raid56_full_stripe_start = offset; 5002 5003 /* allow a write of a full stripe, but make sure we don't 5004 * allow straddling of stripes 5005 */ 5006 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 5007 full_stripe_len); 5008 raid56_full_stripe_start *= full_stripe_len; 5009 } 5010 5011 if (rw & REQ_DISCARD) { 5012 /* we don't discard raid56 yet */ 5013 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5014 ret = -EOPNOTSUPP; 5015 goto out; 5016 } 5017 *length = min_t(u64, em->len - offset, *length); 5018 } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 5019 u64 max_len; 5020 /* For writes to RAID[56], allow a full stripeset across all disks. 5021 For other RAID types and for RAID[56] reads, just allow a single 5022 stripe (on a single disk). 
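		   Illustrative arithmetic: with a 64K stripe_len and three data
		   stripes, a RAID[56] write starting 16K into its full stripe may
		   cover up to 3 * 64K - 16K = 176K, while a read at the same offset
		   is capped at 64K - 16K = 48K.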
*/ 5023 if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 5024 (rw & REQ_WRITE)) { 5025 max_len = stripe_len * nr_data_stripes(map) - 5026 (offset - raid56_full_stripe_start); 5027 } else { 5028 /* we limit the length of each bio to what fits in a stripe */ 5029 max_len = stripe_len - stripe_offset; 5030 } 5031 *length = min_t(u64, em->len - offset, max_len); 5032 } else { 5033 *length = em->len - offset; 5034 } 5035 5036 /* This is for when we're called from btrfs_merge_bio_hook() and all 5037 it cares about is the length */ 5038 if (!bbio_ret) 5039 goto out; 5040 5041 btrfs_dev_replace_lock(dev_replace); 5042 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 5043 if (!dev_replace_is_ongoing) 5044 btrfs_dev_replace_unlock(dev_replace); 5045 5046 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 5047 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && 5048 dev_replace->tgtdev != NULL) { 5049 /* 5050 * in dev-replace case, for repair case (that's the only 5051 * case where the mirror is selected explicitly when 5052 * calling btrfs_map_block), blocks left of the left cursor 5053 * can also be read from the target drive. 5054 * For REQ_GET_READ_MIRRORS, the target drive is added as 5055 * the last one to the array of stripes. For READ, it also 5056 * needs to be supported using the same mirror number. 5057 * If the requested block is not left of the left cursor, 5058 * EIO is returned. This can happen because btrfs_num_copies() 5059 * returns one more in the dev-replace case. 5060 */ 5061 u64 tmp_length = *length; 5062 struct btrfs_bio *tmp_bbio = NULL; 5063 int tmp_num_stripes; 5064 u64 srcdev_devid = dev_replace->srcdev->devid; 5065 int index_srcdev = 0; 5066 int found = 0; 5067 u64 physical_of_found = 0; 5068 5069 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, 5070 logical, &tmp_length, &tmp_bbio, 0, 0); 5071 if (ret) { 5072 WARN_ON(tmp_bbio != NULL); 5073 goto out; 5074 } 5075 5076 tmp_num_stripes = tmp_bbio->num_stripes; 5077 if (mirror_num > tmp_num_stripes) { 5078 /* 5079 * REQ_GET_READ_MIRRORS does not contain this 5080 * mirror, that means that the requested area 5081 * is not left of the left cursor 5082 */ 5083 ret = -EIO; 5084 btrfs_put_bbio(tmp_bbio); 5085 goto out; 5086 } 5087 5088 /* 5089 * process the rest of the function using the mirror_num 5090 * of the source drive. Therefore look it up first. 5091 * At the end, patch the device pointer to the one of the 5092 * target drive. 
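 * The loop below remembers index_srcdev/physical_of_found for the stripe
 * that sits on the source device (for DUP, the copy at the lowest physical
 * address) so the first stripe can then be redirected to the target drive.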
5093 */ 5094 for (i = 0; i < tmp_num_stripes; i++) { 5095 if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { 5096 /* 5097 * In case of DUP, in order to keep it 5098 * simple, only add the mirror with the 5099 * lowest physical address 5100 */ 5101 if (found && 5102 physical_of_found <= 5103 tmp_bbio->stripes[i].physical) 5104 continue; 5105 index_srcdev = i; 5106 found = 1; 5107 physical_of_found = 5108 tmp_bbio->stripes[i].physical; 5109 } 5110 } 5111 5112 if (found) { 5113 mirror_num = index_srcdev + 1; 5114 patch_the_first_stripe_for_dev_replace = 1; 5115 physical_to_patch_in_first_stripe = physical_of_found; 5116 } else { 5117 WARN_ON(1); 5118 ret = -EIO; 5119 btrfs_put_bbio(tmp_bbio); 5120 goto out; 5121 } 5122 5123 btrfs_put_bbio(tmp_bbio); 5124 } else if (mirror_num > map->num_stripes) { 5125 mirror_num = 0; 5126 } 5127 5128 num_stripes = 1; 5129 stripe_index = 0; 5130 stripe_nr_orig = stripe_nr; 5131 stripe_nr_end = ALIGN(offset + *length, map->stripe_len); 5132 stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len); 5133 stripe_end_offset = stripe_nr_end * map->stripe_len - 5134 (offset + *length); 5135 5136 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5137 if (rw & REQ_DISCARD) 5138 num_stripes = min_t(u64, map->num_stripes, 5139 stripe_nr_end - stripe_nr_orig); 5140 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5141 &stripe_index); 5142 if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))) 5143 mirror_num = 1; 5144 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 5145 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) 5146 num_stripes = map->num_stripes; 5147 else if (mirror_num) 5148 stripe_index = mirror_num - 1; 5149 else { 5150 stripe_index = find_live_mirror(fs_info, map, 0, 5151 map->num_stripes, 5152 current->pid % map->num_stripes, 5153 dev_replace_is_ongoing); 5154 mirror_num = stripe_index + 1; 5155 } 5156 5157 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 5158 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) { 5159 num_stripes = map->num_stripes; 5160 } else if (mirror_num) { 5161 stripe_index = mirror_num - 1; 5162 } else { 5163 mirror_num = 1; 5164 } 5165 5166 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5167 u32 factor = map->num_stripes / map->sub_stripes; 5168 5169 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5170 stripe_index *= map->sub_stripes; 5171 5172 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) 5173 num_stripes = map->sub_stripes; 5174 else if (rw & REQ_DISCARD) 5175 num_stripes = min_t(u64, map->sub_stripes * 5176 (stripe_nr_end - stripe_nr_orig), 5177 map->num_stripes); 5178 else if (mirror_num) 5179 stripe_index += mirror_num - 1; 5180 else { 5181 int old_stripe_index = stripe_index; 5182 stripe_index = find_live_mirror(fs_info, map, 5183 stripe_index, 5184 map->sub_stripes, stripe_index + 5185 current->pid % map->sub_stripes, 5186 dev_replace_is_ongoing); 5187 mirror_num = stripe_index - old_stripe_index + 1; 5188 } 5189 5190 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5191 if (need_raid_map && 5192 ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || 5193 mirror_num > 1)) { 5194 /* push stripe_nr back to the start of the full stripe */ 5195 stripe_nr = div_u64(raid56_full_stripe_start, 5196 stripe_len * nr_data_stripes(map)); 5197 5198 /* RAID[56] write or recovery. 
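			   Every member of the full stripe is needed so that parity
			   can be computed or the missing data rebuilt.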
Return all stripes */ 5199 num_stripes = map->num_stripes; 5200 max_errors = nr_parity_stripes(map); 5201 5202 *length = map->stripe_len; 5203 stripe_index = 0; 5204 stripe_offset = 0; 5205 } else { 5206 /* 5207 * Mirror #0 or #1 means the original data block. 5208 * Mirror #2 is RAID5 parity block. 5209 * Mirror #3 is RAID6 Q block. 5210 */ 5211 stripe_nr = div_u64_rem(stripe_nr, 5212 nr_data_stripes(map), &stripe_index); 5213 if (mirror_num > 1) 5214 stripe_index = nr_data_stripes(map) + 5215 mirror_num - 2; 5216 5217 /* We distribute the parity blocks across stripes */ 5218 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 5219 &stripe_index); 5220 if (!(rw & (REQ_WRITE | REQ_DISCARD | 5221 REQ_GET_READ_MIRRORS)) && mirror_num <= 1) 5222 mirror_num = 1; 5223 } 5224 } else { 5225 /* 5226 * after this, stripe_nr is the number of stripes on this 5227 * device we have to walk to find the data, and stripe_index is 5228 * the number of our device in the stripe array 5229 */ 5230 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5231 &stripe_index); 5232 mirror_num = stripe_index + 1; 5233 } 5234 BUG_ON(stripe_index >= map->num_stripes); 5235 5236 num_alloc_stripes = num_stripes; 5237 if (dev_replace_is_ongoing) { 5238 if (rw & (REQ_WRITE | REQ_DISCARD)) 5239 num_alloc_stripes <<= 1; 5240 if (rw & REQ_GET_READ_MIRRORS) 5241 num_alloc_stripes++; 5242 tgtdev_indexes = num_stripes; 5243 } 5244 5245 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); 5246 if (!bbio) { 5247 ret = -ENOMEM; 5248 goto out; 5249 } 5250 if (dev_replace_is_ongoing) 5251 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); 5252 5253 /* build raid_map */ 5254 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && 5255 need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || 5256 mirror_num > 1)) { 5257 u64 tmp; 5258 unsigned rot; 5259 5260 bbio->raid_map = (u64 *)((void *)bbio->stripes + 5261 sizeof(struct btrfs_bio_stripe) * 5262 num_alloc_stripes + 5263 sizeof(int) * tgtdev_indexes); 5264 5265 /* Work out the disk rotation on this stripe-set */ 5266 div_u64_rem(stripe_nr, num_stripes, &rot); 5267 5268 /* Fill in the logical address of each stripe */ 5269 tmp = stripe_nr * nr_data_stripes(map); 5270 for (i = 0; i < nr_data_stripes(map); i++) 5271 bbio->raid_map[(i+rot) % num_stripes] = 5272 em->start + (tmp + i) * map->stripe_len; 5273 5274 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 5275 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5276 bbio->raid_map[(i+rot+1) % num_stripes] = 5277 RAID6_Q_STRIPE; 5278 } 5279 5280 if (rw & REQ_DISCARD) { 5281 u32 factor = 0; 5282 u32 sub_stripes = 0; 5283 u64 stripes_per_dev = 0; 5284 u32 remaining_stripes = 0; 5285 u32 last_stripe = 0; 5286 5287 if (map->type & 5288 (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { 5289 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5290 sub_stripes = 1; 5291 else 5292 sub_stripes = map->sub_stripes; 5293 5294 factor = map->num_stripes / sub_stripes; 5295 stripes_per_dev = div_u64_rem(stripe_nr_end - 5296 stripe_nr_orig, 5297 factor, 5298 &remaining_stripes); 5299 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 5300 last_stripe *= sub_stripes; 5301 } 5302 5303 for (i = 0; i < num_stripes; i++) { 5304 bbio->stripes[i].physical = 5305 map->stripes[stripe_index].physical + 5306 stripe_offset + stripe_nr * map->stripe_len; 5307 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 5308 5309 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5310 BTRFS_BLOCK_GROUP_RAID10)) { 5311 bbio->stripes[i].length = 
stripes_per_dev * 5312 map->stripe_len; 5313 5314 if (i / sub_stripes < remaining_stripes) 5315 bbio->stripes[i].length += 5316 map->stripe_len; 5317 5318 /* 5319 * Special for the first stripe and 5320 * the last stripe: 5321 * 5322 * |-------|...|-------| 5323 * |----------| 5324 * off end_off 5325 */ 5326 if (i < sub_stripes) 5327 bbio->stripes[i].length -= 5328 stripe_offset; 5329 5330 if (stripe_index >= last_stripe && 5331 stripe_index <= (last_stripe + 5332 sub_stripes - 1)) 5333 bbio->stripes[i].length -= 5334 stripe_end_offset; 5335 5336 if (i == sub_stripes - 1) 5337 stripe_offset = 0; 5338 } else 5339 bbio->stripes[i].length = *length; 5340 5341 stripe_index++; 5342 if (stripe_index == map->num_stripes) { 5343 /* This could only happen for RAID0/10 */ 5344 stripe_index = 0; 5345 stripe_nr++; 5346 } 5347 } 5348 } else { 5349 for (i = 0; i < num_stripes; i++) { 5350 bbio->stripes[i].physical = 5351 map->stripes[stripe_index].physical + 5352 stripe_offset + 5353 stripe_nr * map->stripe_len; 5354 bbio->stripes[i].dev = 5355 map->stripes[stripe_index].dev; 5356 stripe_index++; 5357 } 5358 } 5359 5360 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) 5361 max_errors = btrfs_chunk_max_errors(map); 5362 5363 if (bbio->raid_map) 5364 sort_parity_stripes(bbio, num_stripes); 5365 5366 tgtdev_indexes = 0; 5367 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && 5368 dev_replace->tgtdev != NULL) { 5369 int index_where_to_add; 5370 u64 srcdev_devid = dev_replace->srcdev->devid; 5371 5372 /* 5373 * duplicate the write operations while the dev replace 5374 * procedure is running. Since the copying of the old disk 5375 * to the new disk takes place at run time while the 5376 * filesystem is mounted writable, the regular write 5377 * operations to the old disk have to be duplicated to go 5378 * to the new disk as well. 5379 * Note that device->missing is handled by the caller, and 5380 * that the write to the old disk is already set up in the 5381 * stripes array. 5382 */ 5383 index_where_to_add = num_stripes; 5384 for (i = 0; i < num_stripes; i++) { 5385 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5386 /* write to new disk, too */ 5387 struct btrfs_bio_stripe *new = 5388 bbio->stripes + index_where_to_add; 5389 struct btrfs_bio_stripe *old = 5390 bbio->stripes + i; 5391 5392 new->physical = old->physical; 5393 new->length = old->length; 5394 new->dev = dev_replace->tgtdev; 5395 bbio->tgtdev_map[i] = index_where_to_add; 5396 index_where_to_add++; 5397 max_errors++; 5398 tgtdev_indexes++; 5399 } 5400 } 5401 num_stripes = index_where_to_add; 5402 } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && 5403 dev_replace->tgtdev != NULL) { 5404 u64 srcdev_devid = dev_replace->srcdev->devid; 5405 int index_srcdev = 0; 5406 int found = 0; 5407 u64 physical_of_found = 0; 5408 5409 /* 5410 * During the dev-replace procedure, the target drive can 5411 * also be used to read data in case it is needed to repair 5412 * a corrupt block elsewhere. This is possible if the 5413 * requested area is left of the left cursor. In this area, 5414 * the target drive is a full copy of the source drive. 
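 * The extra target stripe is only appended below when
 * physical_of_found + map->stripe_len <= dev_replace->cursor_left,
 * i.e. the whole stripe has already been copied to the new disk.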
5415 */ 5416 for (i = 0; i < num_stripes; i++) { 5417 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5418 /* 5419 * In case of DUP, in order to keep it 5420 * simple, only add the mirror with the 5421 * lowest physical address 5422 */ 5423 if (found && 5424 physical_of_found <= 5425 bbio->stripes[i].physical) 5426 continue; 5427 index_srcdev = i; 5428 found = 1; 5429 physical_of_found = bbio->stripes[i].physical; 5430 } 5431 } 5432 if (found) { 5433 if (physical_of_found + map->stripe_len <= 5434 dev_replace->cursor_left) { 5435 struct btrfs_bio_stripe *tgtdev_stripe = 5436 bbio->stripes + num_stripes; 5437 5438 tgtdev_stripe->physical = physical_of_found; 5439 tgtdev_stripe->length = 5440 bbio->stripes[index_srcdev].length; 5441 tgtdev_stripe->dev = dev_replace->tgtdev; 5442 bbio->tgtdev_map[index_srcdev] = num_stripes; 5443 5444 tgtdev_indexes++; 5445 num_stripes++; 5446 } 5447 } 5448 } 5449 5450 *bbio_ret = bbio; 5451 bbio->map_type = map->type; 5452 bbio->num_stripes = num_stripes; 5453 bbio->max_errors = max_errors; 5454 bbio->mirror_num = mirror_num; 5455 bbio->num_tgtdevs = tgtdev_indexes; 5456 5457 /* 5458 * this is the case that REQ_READ && dev_replace_is_ongoing && 5459 * mirror_num == num_stripes + 1 && dev_replace target drive is 5460 * available as a mirror 5461 */ 5462 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 5463 WARN_ON(num_stripes > 1); 5464 bbio->stripes[0].dev = dev_replace->tgtdev; 5465 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 5466 bbio->mirror_num = map->num_stripes + 1; 5467 } 5468 out: 5469 if (dev_replace_is_ongoing) 5470 btrfs_dev_replace_unlock(dev_replace); 5471 free_extent_map(em); 5472 return ret; 5473 } 5474 5475 int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 5476 u64 logical, u64 *length, 5477 struct btrfs_bio **bbio_ret, int mirror_num) 5478 { 5479 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 5480 mirror_num, 0); 5481 } 5482 5483 /* For Scrub/replace */ 5484 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, 5485 u64 logical, u64 *length, 5486 struct btrfs_bio **bbio_ret, int mirror_num, 5487 int need_raid_map) 5488 { 5489 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 5490 mirror_num, need_raid_map); 5491 } 5492 5493 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 5494 u64 chunk_start, u64 physical, u64 devid, 5495 u64 **logical, int *naddrs, int *stripe_len) 5496 { 5497 struct extent_map_tree *em_tree = &map_tree->map_tree; 5498 struct extent_map *em; 5499 struct map_lookup *map; 5500 u64 *buf; 5501 u64 bytenr; 5502 u64 length; 5503 u64 stripe_nr; 5504 u64 rmap_len; 5505 int i, j, nr = 0; 5506 5507 read_lock(&em_tree->lock); 5508 em = lookup_extent_mapping(em_tree, chunk_start, 1); 5509 read_unlock(&em_tree->lock); 5510 5511 if (!em) { 5512 printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n", 5513 chunk_start); 5514 return -EIO; 5515 } 5516 5517 if (em->start != chunk_start) { 5518 printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n", 5519 em->start, chunk_start); 5520 free_extent_map(em); 5521 return -EIO; 5522 } 5523 map = (struct map_lookup *)em->bdev; 5524 5525 length = em->len; 5526 rmap_len = map->stripe_len; 5527 5528 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5529 length = div_u64(length, map->num_stripes / map->sub_stripes); 5530 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5531 length = div_u64(length, map->num_stripes); 5532 else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5533 length = div_u64(length, 
nr_data_stripes(map)); 5534 rmap_len = map->stripe_len * nr_data_stripes(map); 5535 } 5536 5537 buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); 5538 BUG_ON(!buf); /* -ENOMEM */ 5539 5540 for (i = 0; i < map->num_stripes; i++) { 5541 if (devid && map->stripes[i].dev->devid != devid) 5542 continue; 5543 if (map->stripes[i].physical > physical || 5544 map->stripes[i].physical + length <= physical) 5545 continue; 5546 5547 stripe_nr = physical - map->stripes[i].physical; 5548 stripe_nr = div_u64(stripe_nr, map->stripe_len); 5549 5550 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5551 stripe_nr = stripe_nr * map->num_stripes + i; 5552 stripe_nr = div_u64(stripe_nr, map->sub_stripes); 5553 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5554 stripe_nr = stripe_nr * map->num_stripes + i; 5555 } /* else if RAID[56], multiply by nr_data_stripes(). 5556 * Alternatively, just use rmap_len below instead of 5557 * map->stripe_len */ 5558 5559 bytenr = chunk_start + stripe_nr * rmap_len; 5560 WARN_ON(nr >= map->num_stripes); 5561 for (j = 0; j < nr; j++) { 5562 if (buf[j] == bytenr) 5563 break; 5564 } 5565 if (j == nr) { 5566 WARN_ON(nr >= map->num_stripes); 5567 buf[nr++] = bytenr; 5568 } 5569 } 5570 5571 *logical = buf; 5572 *naddrs = nr; 5573 *stripe_len = rmap_len; 5574 5575 free_extent_map(em); 5576 return 0; 5577 } 5578 5579 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err) 5580 { 5581 if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED)) 5582 bio_endio_nodec(bio, err); 5583 else 5584 bio_endio(bio, err); 5585 btrfs_put_bbio(bbio); 5586 } 5587 5588 static void btrfs_end_bio(struct bio *bio, int err) 5589 { 5590 struct btrfs_bio *bbio = bio->bi_private; 5591 struct btrfs_device *dev = bbio->stripes[0].dev; 5592 int is_orig_bio = 0; 5593 5594 if (err) { 5595 atomic_inc(&bbio->error); 5596 if (err == -EIO || err == -EREMOTEIO) { 5597 unsigned int stripe_index = 5598 btrfs_io_bio(bio)->stripe_index; 5599 5600 BUG_ON(stripe_index >= bbio->num_stripes); 5601 dev = bbio->stripes[stripe_index].dev; 5602 if (dev->bdev) { 5603 if (bio->bi_rw & WRITE) 5604 btrfs_dev_stat_inc(dev, 5605 BTRFS_DEV_STAT_WRITE_ERRS); 5606 else 5607 btrfs_dev_stat_inc(dev, 5608 BTRFS_DEV_STAT_READ_ERRS); 5609 if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH) 5610 btrfs_dev_stat_inc(dev, 5611 BTRFS_DEV_STAT_FLUSH_ERRS); 5612 btrfs_dev_stat_print_on_error(dev); 5613 } 5614 } 5615 } 5616 5617 if (bio == bbio->orig_bio) 5618 is_orig_bio = 1; 5619 5620 btrfs_bio_counter_dec(bbio->fs_info); 5621 5622 if (atomic_dec_and_test(&bbio->stripes_pending)) { 5623 if (!is_orig_bio) { 5624 bio_put(bio); 5625 bio = bbio->orig_bio; 5626 } 5627 5628 bio->bi_private = bbio->private; 5629 bio->bi_end_io = bbio->end_io; 5630 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 5631 /* only send an error to the higher layers if it is 5632 * beyond the tolerance of the btrfs bio 5633 */ 5634 if (atomic_read(&bbio->error) > bbio->max_errors) { 5635 err = -EIO; 5636 } else { 5637 /* 5638 * this bio is actually up to date, we didn't 5639 * go over the max number of errors 5640 */ 5641 set_bit(BIO_UPTODATE, &bio->bi_flags); 5642 err = 0; 5643 } 5644 5645 btrfs_end_bbio(bbio, bio, err); 5646 } else if (!is_orig_bio) { 5647 bio_put(bio); 5648 } 5649 } 5650 5651 /* 5652 * see run_scheduled_bios for a description of why bios are collected for 5653 * async submit. 5654 * 5655 * This will add one bio to the pending list for a device and make sure 5656 * the work struct is scheduled. 
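 *
 * Reads are submitted right away; writes are appended to either
 * pending_sync_bios or pending_bios (depending on REQ_SYNC) and the
 * device worker is queued only if it is not already running.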
5657 */ 5658 static noinline void btrfs_schedule_bio(struct btrfs_root *root, 5659 struct btrfs_device *device, 5660 int rw, struct bio *bio) 5661 { 5662 int should_queue = 1; 5663 struct btrfs_pending_bios *pending_bios; 5664 5665 if (device->missing || !device->bdev) { 5666 bio_endio(bio, -EIO); 5667 return; 5668 } 5669 5670 /* don't bother with additional async steps for reads, right now */ 5671 if (!(rw & REQ_WRITE)) { 5672 bio_get(bio); 5673 btrfsic_submit_bio(rw, bio); 5674 bio_put(bio); 5675 return; 5676 } 5677 5678 /* 5679 * nr_async_bios allows us to reliably return congestion to the 5680 * higher layers. Otherwise, the async bio makes it appear we have 5681 * made progress against dirty pages when we've really just put it 5682 * on a queue for later 5683 */ 5684 atomic_inc(&root->fs_info->nr_async_bios); 5685 WARN_ON(bio->bi_next); 5686 bio->bi_next = NULL; 5687 bio->bi_rw |= rw; 5688 5689 spin_lock(&device->io_lock); 5690 if (bio->bi_rw & REQ_SYNC) 5691 pending_bios = &device->pending_sync_bios; 5692 else 5693 pending_bios = &device->pending_bios; 5694 5695 if (pending_bios->tail) 5696 pending_bios->tail->bi_next = bio; 5697 5698 pending_bios->tail = bio; 5699 if (!pending_bios->head) 5700 pending_bios->head = bio; 5701 if (device->running_pending) 5702 should_queue = 0; 5703 5704 spin_unlock(&device->io_lock); 5705 5706 if (should_queue) 5707 btrfs_queue_work(root->fs_info->submit_workers, 5708 &device->work); 5709 } 5710 5711 static int bio_size_ok(struct block_device *bdev, struct bio *bio, 5712 sector_t sector) 5713 { 5714 struct bio_vec *prev; 5715 struct request_queue *q = bdev_get_queue(bdev); 5716 unsigned int max_sectors = queue_max_sectors(q); 5717 struct bvec_merge_data bvm = { 5718 .bi_bdev = bdev, 5719 .bi_sector = sector, 5720 .bi_rw = bio->bi_rw, 5721 }; 5722 5723 if (WARN_ON(bio->bi_vcnt == 0)) 5724 return 1; 5725 5726 prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; 5727 if (bio_sectors(bio) > max_sectors) 5728 return 0; 5729 5730 if (!q->merge_bvec_fn) 5731 return 1; 5732 5733 bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len; 5734 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) 5735 return 0; 5736 return 1; 5737 } 5738 5739 static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, 5740 struct bio *bio, u64 physical, int dev_nr, 5741 int rw, int async) 5742 { 5743 struct btrfs_device *dev = bbio->stripes[dev_nr].dev; 5744 5745 bio->bi_private = bbio; 5746 btrfs_io_bio(bio)->stripe_index = dev_nr; 5747 bio->bi_end_io = btrfs_end_bio; 5748 bio->bi_iter.bi_sector = physical >> 9; 5749 #ifdef DEBUG 5750 { 5751 struct rcu_string *name; 5752 5753 rcu_read_lock(); 5754 name = rcu_dereference(dev->name); 5755 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " 5756 "(%s id %llu), size=%u\n", rw, 5757 (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev, 5758 name->str, dev->devid, bio->bi_iter.bi_size); 5759 rcu_read_unlock(); 5760 } 5761 #endif 5762 bio->bi_bdev = dev->bdev; 5763 5764 btrfs_bio_counter_inc_noblocked(root->fs_info); 5765 5766 if (async) 5767 btrfs_schedule_bio(root, dev, rw, bio); 5768 else 5769 btrfsic_submit_bio(rw, bio); 5770 } 5771 5772 static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, 5773 struct bio *first_bio, struct btrfs_device *dev, 5774 int dev_nr, int rw, int async) 5775 { 5776 struct bio_vec *bvec = first_bio->bi_io_vec; 5777 struct bio *bio; 5778 int nr_vecs = bio_get_nr_vecs(dev->bdev); 5779 u64 physical = bbio->stripes[dev_nr].physical; 5780 5781 again: 5782 bio = 
btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS); 5783 if (!bio) 5784 return -ENOMEM; 5785 5786 while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { 5787 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, 5788 bvec->bv_offset) < bvec->bv_len) { 5789 u64 len = bio->bi_iter.bi_size; 5790 5791 atomic_inc(&bbio->stripes_pending); 5792 submit_stripe_bio(root, bbio, bio, physical, dev_nr, 5793 rw, async); 5794 physical += len; 5795 goto again; 5796 } 5797 bvec++; 5798 } 5799 5800 submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async); 5801 return 0; 5802 } 5803 5804 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 5805 { 5806 atomic_inc(&bbio->error); 5807 if (atomic_dec_and_test(&bbio->stripes_pending)) { 5808 /* Shoud be the original bio. */ 5809 WARN_ON(bio != bbio->orig_bio); 5810 5811 bio->bi_private = bbio->private; 5812 bio->bi_end_io = bbio->end_io; 5813 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 5814 bio->bi_iter.bi_sector = logical >> 9; 5815 5816 btrfs_end_bbio(bbio, bio, -EIO); 5817 } 5818 } 5819 5820 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 5821 int mirror_num, int async_submit) 5822 { 5823 struct btrfs_device *dev; 5824 struct bio *first_bio = bio; 5825 u64 logical = (u64)bio->bi_iter.bi_sector << 9; 5826 u64 length = 0; 5827 u64 map_length; 5828 int ret; 5829 int dev_nr; 5830 int total_devs; 5831 struct btrfs_bio *bbio = NULL; 5832 5833 length = bio->bi_iter.bi_size; 5834 map_length = length; 5835 5836 btrfs_bio_counter_inc_blocked(root->fs_info); 5837 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5838 mirror_num, 1); 5839 if (ret) { 5840 btrfs_bio_counter_dec(root->fs_info); 5841 return ret; 5842 } 5843 5844 total_devs = bbio->num_stripes; 5845 bbio->orig_bio = first_bio; 5846 bbio->private = first_bio->bi_private; 5847 bbio->end_io = first_bio->bi_end_io; 5848 bbio->fs_info = root->fs_info; 5849 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 5850 5851 if (bbio->raid_map) { 5852 /* In this case, map_length has been set to the length of 5853 a single stripe; not the whole write */ 5854 if (rw & WRITE) { 5855 ret = raid56_parity_write(root, bio, bbio, map_length); 5856 } else { 5857 ret = raid56_parity_recover(root, bio, bbio, map_length, 5858 mirror_num, 1); 5859 } 5860 5861 btrfs_bio_counter_dec(root->fs_info); 5862 return ret; 5863 } 5864 5865 if (map_length < length) { 5866 btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu", 5867 logical, length, map_length); 5868 BUG(); 5869 } 5870 5871 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { 5872 dev = bbio->stripes[dev_nr].dev; 5873 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { 5874 bbio_error(bbio, first_bio, logical); 5875 continue; 5876 } 5877 5878 /* 5879 * Check and see if we're ok with this bio based on it's size 5880 * and offset with the given device. 
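 * If it does not fit (bio_size_ok() fails), breakup_stripe_bio() re-adds
 * the pages to smaller bios and submits each piece on its own.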
5881 */ 5882 if (!bio_size_ok(dev->bdev, first_bio, 5883 bbio->stripes[dev_nr].physical >> 9)) { 5884 ret = breakup_stripe_bio(root, bbio, first_bio, dev, 5885 dev_nr, rw, async_submit); 5886 BUG_ON(ret); 5887 continue; 5888 } 5889 5890 if (dev_nr < total_devs - 1) { 5891 bio = btrfs_bio_clone(first_bio, GFP_NOFS); 5892 BUG_ON(!bio); /* -ENOMEM */ 5893 } else { 5894 bio = first_bio; 5895 bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED; 5896 } 5897 5898 submit_stripe_bio(root, bbio, bio, 5899 bbio->stripes[dev_nr].physical, dev_nr, rw, 5900 async_submit); 5901 } 5902 btrfs_bio_counter_dec(root->fs_info); 5903 return 0; 5904 } 5905 5906 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 5907 u8 *uuid, u8 *fsid) 5908 { 5909 struct btrfs_device *device; 5910 struct btrfs_fs_devices *cur_devices; 5911 5912 cur_devices = fs_info->fs_devices; 5913 while (cur_devices) { 5914 if (!fsid || 5915 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 5916 device = __find_device(&cur_devices->devices, 5917 devid, uuid); 5918 if (device) 5919 return device; 5920 } 5921 cur_devices = cur_devices->seed; 5922 } 5923 return NULL; 5924 } 5925 5926 static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 5927 struct btrfs_fs_devices *fs_devices, 5928 u64 devid, u8 *dev_uuid) 5929 { 5930 struct btrfs_device *device; 5931 5932 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 5933 if (IS_ERR(device)) 5934 return NULL; 5935 5936 list_add(&device->dev_list, &fs_devices->devices); 5937 device->fs_devices = fs_devices; 5938 fs_devices->num_devices++; 5939 5940 device->missing = 1; 5941 fs_devices->missing_devices++; 5942 5943 return device; 5944 } 5945 5946 /** 5947 * btrfs_alloc_device - allocate struct btrfs_device 5948 * @fs_info: used only for generating a new devid, can be NULL if 5949 * devid is provided (i.e. @devid != NULL). 5950 * @devid: a pointer to devid for this device. If NULL a new devid 5951 * is generated. 5952 * @uuid: a pointer to UUID for this device. If NULL a new UUID 5953 * is generated. 5954 * 5955 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 5956 * on error. Returned struct is not linked onto any lists and can be 5957 * destroyed with kfree() right away. 
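 *
 * Minimal usage sketch (illustrative only); when a devid is supplied,
 * @fs_info may be NULL and a random UUID is generated:
 *
 *	dev = btrfs_alloc_device(NULL, &devid, NULL);
 *	if (IS_ERR(dev))
 *		return PTR_ERR(dev);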
5958 */ 5959 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 5960 const u64 *devid, 5961 const u8 *uuid) 5962 { 5963 struct btrfs_device *dev; 5964 u64 tmp; 5965 5966 if (WARN_ON(!devid && !fs_info)) 5967 return ERR_PTR(-EINVAL); 5968 5969 dev = __alloc_device(); 5970 if (IS_ERR(dev)) 5971 return dev; 5972 5973 if (devid) 5974 tmp = *devid; 5975 else { 5976 int ret; 5977 5978 ret = find_next_devid(fs_info, &tmp); 5979 if (ret) { 5980 kfree(dev); 5981 return ERR_PTR(ret); 5982 } 5983 } 5984 dev->devid = tmp; 5985 5986 if (uuid) 5987 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 5988 else 5989 generate_random_uuid(dev->uuid); 5990 5991 btrfs_init_work(&dev->work, btrfs_submit_helper, 5992 pending_bios_fn, NULL, NULL); 5993 5994 return dev; 5995 } 5996 5997 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, 5998 struct extent_buffer *leaf, 5999 struct btrfs_chunk *chunk) 6000 { 6001 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 6002 struct map_lookup *map; 6003 struct extent_map *em; 6004 u64 logical; 6005 u64 length; 6006 u64 devid; 6007 u8 uuid[BTRFS_UUID_SIZE]; 6008 int num_stripes; 6009 int ret; 6010 int i; 6011 6012 logical = key->offset; 6013 length = btrfs_chunk_length(leaf, chunk); 6014 6015 read_lock(&map_tree->map_tree.lock); 6016 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 6017 read_unlock(&map_tree->map_tree.lock); 6018 6019 /* already mapped? */ 6020 if (em && em->start <= logical && em->start + em->len > logical) { 6021 free_extent_map(em); 6022 return 0; 6023 } else if (em) { 6024 free_extent_map(em); 6025 } 6026 6027 em = alloc_extent_map(); 6028 if (!em) 6029 return -ENOMEM; 6030 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6031 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 6032 if (!map) { 6033 free_extent_map(em); 6034 return -ENOMEM; 6035 } 6036 6037 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 6038 em->bdev = (struct block_device *)map; 6039 em->start = logical; 6040 em->len = length; 6041 em->orig_start = 0; 6042 em->block_start = 0; 6043 em->block_len = em->len; 6044 6045 map->num_stripes = num_stripes; 6046 map->io_width = btrfs_chunk_io_width(leaf, chunk); 6047 map->io_align = btrfs_chunk_io_align(leaf, chunk); 6048 map->sector_size = btrfs_chunk_sector_size(leaf, chunk); 6049 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6050 map->type = btrfs_chunk_type(leaf, chunk); 6051 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6052 for (i = 0; i < num_stripes; i++) { 6053 map->stripes[i].physical = 6054 btrfs_stripe_offset_nr(leaf, chunk, i); 6055 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 6056 read_extent_buffer(leaf, uuid, (unsigned long) 6057 btrfs_stripe_dev_uuid_nr(chunk, i), 6058 BTRFS_UUID_SIZE); 6059 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, 6060 uuid, NULL); 6061 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 6062 free_extent_map(em); 6063 return -EIO; 6064 } 6065 if (!map->stripes[i].dev) { 6066 map->stripes[i].dev = 6067 add_missing_dev(root, root->fs_info->fs_devices, 6068 devid, uuid); 6069 if (!map->stripes[i].dev) { 6070 free_extent_map(em); 6071 return -EIO; 6072 } 6073 } 6074 map->stripes[i].dev->in_fs_metadata = 1; 6075 } 6076 6077 write_lock(&map_tree->map_tree.lock); 6078 ret = add_extent_mapping(&map_tree->map_tree, em, 0); 6079 write_unlock(&map_tree->map_tree.lock); 6080 BUG_ON(ret); /* Tree corruption */ 6081 free_extent_map(em); 6082 6083 return 0; 6084 } 6085 6086 static void 
fill_device_from_item(struct extent_buffer *leaf, 6087 struct btrfs_dev_item *dev_item, 6088 struct btrfs_device *device) 6089 { 6090 unsigned long ptr; 6091 6092 device->devid = btrfs_device_id(leaf, dev_item); 6093 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 6094 device->total_bytes = device->disk_total_bytes; 6095 device->commit_total_bytes = device->disk_total_bytes; 6096 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 6097 device->commit_bytes_used = device->bytes_used; 6098 device->type = btrfs_device_type(leaf, dev_item); 6099 device->io_align = btrfs_device_io_align(leaf, dev_item); 6100 device->io_width = btrfs_device_io_width(leaf, dev_item); 6101 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 6102 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 6103 device->is_tgtdev_for_dev_replace = 0; 6104 6105 ptr = btrfs_device_uuid(dev_item); 6106 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 6107 } 6108 6109 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root, 6110 u8 *fsid) 6111 { 6112 struct btrfs_fs_devices *fs_devices; 6113 int ret; 6114 6115 BUG_ON(!mutex_is_locked(&uuid_mutex)); 6116 6117 fs_devices = root->fs_info->fs_devices->seed; 6118 while (fs_devices) { 6119 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) 6120 return fs_devices; 6121 6122 fs_devices = fs_devices->seed; 6123 } 6124 6125 fs_devices = find_fsid(fsid); 6126 if (!fs_devices) { 6127 if (!btrfs_test_opt(root, DEGRADED)) 6128 return ERR_PTR(-ENOENT); 6129 6130 fs_devices = alloc_fs_devices(fsid); 6131 if (IS_ERR(fs_devices)) 6132 return fs_devices; 6133 6134 fs_devices->seeding = 1; 6135 fs_devices->opened = 1; 6136 return fs_devices; 6137 } 6138 6139 fs_devices = clone_fs_devices(fs_devices); 6140 if (IS_ERR(fs_devices)) 6141 return fs_devices; 6142 6143 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 6144 root->fs_info->bdev_holder); 6145 if (ret) { 6146 free_fs_devices(fs_devices); 6147 fs_devices = ERR_PTR(ret); 6148 goto out; 6149 } 6150 6151 if (!fs_devices->seeding) { 6152 __btrfs_close_devices(fs_devices); 6153 free_fs_devices(fs_devices); 6154 fs_devices = ERR_PTR(-EINVAL); 6155 goto out; 6156 } 6157 6158 fs_devices->seed = root->fs_info->fs_devices->seed; 6159 root->fs_info->fs_devices->seed = fs_devices; 6160 out: 6161 return fs_devices; 6162 } 6163 6164 static int read_one_dev(struct btrfs_root *root, 6165 struct extent_buffer *leaf, 6166 struct btrfs_dev_item *dev_item) 6167 { 6168 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 6169 struct btrfs_device *device; 6170 u64 devid; 6171 int ret; 6172 u8 fs_uuid[BTRFS_UUID_SIZE]; 6173 u8 dev_uuid[BTRFS_UUID_SIZE]; 6174 6175 devid = btrfs_device_id(leaf, dev_item); 6176 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 6177 BTRFS_UUID_SIZE); 6178 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 6179 BTRFS_UUID_SIZE); 6180 6181 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 6182 fs_devices = open_seed_devices(root, fs_uuid); 6183 if (IS_ERR(fs_devices)) 6184 return PTR_ERR(fs_devices); 6185 } 6186 6187 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); 6188 if (!device) { 6189 if (!btrfs_test_opt(root, DEGRADED)) 6190 return -EIO; 6191 6192 btrfs_warn(root->fs_info, "devid %llu missing", devid); 6193 device = add_missing_dev(root, fs_devices, devid, dev_uuid); 6194 if (!device) 6195 return -ENOMEM; 6196 } else { 6197 if (!device->bdev && !btrfs_test_opt(root, DEGRADED)) 6198 return 
				-EIO;
6199 
6200 		if (!device->bdev && !device->missing) {
6201 			/*
6202 			 * This happens when a device that was properly set up
6203 			 * in the device info lists suddenly goes bad.
6204 			 * device->bdev is NULL, so we have to set
6205 			 * device->missing to one here.
6206 			 */
6207 			device->fs_devices->missing_devices++;
6208 			device->missing = 1;
6209 		}
6210 
6211 		/* Move the device to its own fs_devices */
6212 		if (device->fs_devices != fs_devices) {
6213 			ASSERT(device->missing);
6214 
6215 			list_move(&device->dev_list, &fs_devices->devices);
6216 			device->fs_devices->num_devices--;
6217 			fs_devices->num_devices++;
6218 
6219 			device->fs_devices->missing_devices--;
6220 			fs_devices->missing_devices++;
6221 
6222 			device->fs_devices = fs_devices;
6223 		}
6224 	}
6225 
6226 	if (device->fs_devices != root->fs_info->fs_devices) {
6227 		BUG_ON(device->writeable);
6228 		if (device->generation !=
6229 		    btrfs_device_generation(leaf, dev_item))
6230 			return -EINVAL;
6231 	}
6232 
6233 	fill_device_from_item(leaf, dev_item, device);
6234 	device->in_fs_metadata = 1;
6235 	if (device->writeable && !device->is_tgtdev_for_dev_replace) {
6236 		device->fs_devices->total_rw_bytes += device->total_bytes;
6237 		spin_lock(&root->fs_info->free_chunk_lock);
6238 		root->fs_info->free_chunk_space += device->total_bytes -
6239 			device->bytes_used;
6240 		spin_unlock(&root->fs_info->free_chunk_lock);
6241 	}
6242 	ret = 0;
6243 	return ret;
6244 }
6245 
6246 int btrfs_read_sys_array(struct btrfs_root *root)
6247 {
6248 	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
6249 	struct extent_buffer *sb;
6250 	struct btrfs_disk_key *disk_key;
6251 	struct btrfs_chunk *chunk;
6252 	u8 *array_ptr;
6253 	unsigned long sb_array_offset;
6254 	int ret = 0;
6255 	u32 num_stripes;
6256 	u32 array_size;
6257 	u32 len = 0;
6258 	u32 cur_offset;
6259 	struct btrfs_key key;
6260 
6261 	ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
6262 	/*
6263 	 * This will create an extent buffer of nodesize; the superblock size
6264 	 * is fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
6265 	 * overallocate, but we can keep it as-is, only the first page is used.
6266 	 */
6267 	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
6268 	if (!sb)
6269 		return -ENOMEM;
6270 	btrfs_set_buffer_uptodate(sb);
6271 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
6272 	/*
6273 	 * The sb extent buffer is artificial and just used to read the system array.
6274 	 * The btrfs_set_buffer_uptodate() call does not properly mark all its
6275 	 * pages up-to-date when the page is larger: the extent does not cover the
6276 	 * whole page and consequently check_page_uptodate does not find all
6277 	 * the page's extents up-to-date (the hole beyond sb), and
6278 	 * write_extent_buffer then triggers a WARN_ON.
6279 	 *
6280 	 * Regular short extents go through the mark_extent_buffer_dirty/writeback cycle,
6281 	 * but sb spans only this function. Add an explicit SetPageUptodate call
6282 	 * to silence the warning, e.g. on PowerPC 64.
6283 */ 6284 if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) 6285 SetPageUptodate(sb->pages[0]); 6286 6287 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 6288 array_size = btrfs_super_sys_array_size(super_copy); 6289 6290 array_ptr = super_copy->sys_chunk_array; 6291 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 6292 cur_offset = 0; 6293 6294 while (cur_offset < array_size) { 6295 disk_key = (struct btrfs_disk_key *)array_ptr; 6296 len = sizeof(*disk_key); 6297 if (cur_offset + len > array_size) 6298 goto out_short_read; 6299 6300 btrfs_disk_key_to_cpu(&key, disk_key); 6301 6302 array_ptr += len; 6303 sb_array_offset += len; 6304 cur_offset += len; 6305 6306 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 6307 chunk = (struct btrfs_chunk *)sb_array_offset; 6308 /* 6309 * At least one btrfs_chunk with one stripe must be 6310 * present, exact stripe count check comes afterwards 6311 */ 6312 len = btrfs_chunk_item_size(1); 6313 if (cur_offset + len > array_size) 6314 goto out_short_read; 6315 6316 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 6317 len = btrfs_chunk_item_size(num_stripes); 6318 if (cur_offset + len > array_size) 6319 goto out_short_read; 6320 6321 ret = read_one_chunk(root, &key, sb, chunk); 6322 if (ret) 6323 break; 6324 } else { 6325 ret = -EIO; 6326 break; 6327 } 6328 array_ptr += len; 6329 sb_array_offset += len; 6330 cur_offset += len; 6331 } 6332 free_extent_buffer(sb); 6333 return ret; 6334 6335 out_short_read: 6336 printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n", 6337 len, cur_offset); 6338 free_extent_buffer(sb); 6339 return -EIO; 6340 } 6341 6342 int btrfs_read_chunk_tree(struct btrfs_root *root) 6343 { 6344 struct btrfs_path *path; 6345 struct extent_buffer *leaf; 6346 struct btrfs_key key; 6347 struct btrfs_key found_key; 6348 int ret; 6349 int slot; 6350 6351 root = root->fs_info->chunk_root; 6352 6353 path = btrfs_alloc_path(); 6354 if (!path) 6355 return -ENOMEM; 6356 6357 mutex_lock(&uuid_mutex); 6358 lock_chunks(root); 6359 6360 /* 6361 * Read all device items, and then all the chunk items. All 6362 * device items are found before any chunk item (their object id 6363 * is smaller than the lowest possible object id for a chunk 6364 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 
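	 * Reading the device items first matters because read_one_chunk()
	 * resolves each stripe's devid through btrfs_find_device(), so
	 * walking the tree forward from BTRFS_DEV_ITEMS_OBJECTID ensures
	 * every device item has been processed before the chunks that
	 * reference it.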
6365 */ 6366 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 6367 key.offset = 0; 6368 key.type = 0; 6369 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6370 if (ret < 0) 6371 goto error; 6372 while (1) { 6373 leaf = path->nodes[0]; 6374 slot = path->slots[0]; 6375 if (slot >= btrfs_header_nritems(leaf)) { 6376 ret = btrfs_next_leaf(root, path); 6377 if (ret == 0) 6378 continue; 6379 if (ret < 0) 6380 goto error; 6381 break; 6382 } 6383 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6384 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 6385 struct btrfs_dev_item *dev_item; 6386 dev_item = btrfs_item_ptr(leaf, slot, 6387 struct btrfs_dev_item); 6388 ret = read_one_dev(root, leaf, dev_item); 6389 if (ret) 6390 goto error; 6391 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 6392 struct btrfs_chunk *chunk; 6393 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 6394 ret = read_one_chunk(root, &found_key, leaf, chunk); 6395 if (ret) 6396 goto error; 6397 } 6398 path->slots[0]++; 6399 } 6400 ret = 0; 6401 error: 6402 unlock_chunks(root); 6403 mutex_unlock(&uuid_mutex); 6404 6405 btrfs_free_path(path); 6406 return ret; 6407 } 6408 6409 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 6410 { 6411 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6412 struct btrfs_device *device; 6413 6414 while (fs_devices) { 6415 mutex_lock(&fs_devices->device_list_mutex); 6416 list_for_each_entry(device, &fs_devices->devices, dev_list) 6417 device->dev_root = fs_info->dev_root; 6418 mutex_unlock(&fs_devices->device_list_mutex); 6419 6420 fs_devices = fs_devices->seed; 6421 } 6422 } 6423 6424 static void __btrfs_reset_dev_stats(struct btrfs_device *dev) 6425 { 6426 int i; 6427 6428 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 6429 btrfs_dev_stat_reset(dev, i); 6430 } 6431 6432 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 6433 { 6434 struct btrfs_key key; 6435 struct btrfs_key found_key; 6436 struct btrfs_root *dev_root = fs_info->dev_root; 6437 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6438 struct extent_buffer *eb; 6439 int slot; 6440 int ret = 0; 6441 struct btrfs_device *device; 6442 struct btrfs_path *path = NULL; 6443 int i; 6444 6445 path = btrfs_alloc_path(); 6446 if (!path) { 6447 ret = -ENOMEM; 6448 goto out; 6449 } 6450 6451 mutex_lock(&fs_devices->device_list_mutex); 6452 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6453 int item_size; 6454 struct btrfs_dev_stats_item *ptr; 6455 6456 key.objectid = 0; 6457 key.type = BTRFS_DEV_STATS_KEY; 6458 key.offset = device->devid; 6459 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 6460 if (ret) { 6461 __btrfs_reset_dev_stats(device); 6462 device->dev_stats_valid = 1; 6463 btrfs_release_path(path); 6464 continue; 6465 } 6466 slot = path->slots[0]; 6467 eb = path->nodes[0]; 6468 btrfs_item_key_to_cpu(eb, &found_key, slot); 6469 item_size = btrfs_item_size_nr(eb, slot); 6470 6471 ptr = btrfs_item_ptr(eb, slot, 6472 struct btrfs_dev_stats_item); 6473 6474 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 6475 if (item_size >= (1 + i) * sizeof(__le64)) 6476 btrfs_dev_stat_set(device, i, 6477 btrfs_dev_stats_value(eb, ptr, i)); 6478 else 6479 btrfs_dev_stat_reset(device, i); 6480 } 6481 6482 device->dev_stats_valid = 1; 6483 btrfs_dev_stat_print_on_load(device); 6484 btrfs_release_path(path); 6485 } 6486 mutex_unlock(&fs_devices->device_list_mutex); 6487 6488 out: 6489 btrfs_free_path(path); 6490 return ret < 0 ? 
ret : 0; 6491 } 6492 6493 static int update_dev_stat_item(struct btrfs_trans_handle *trans, 6494 struct btrfs_root *dev_root, 6495 struct btrfs_device *device) 6496 { 6497 struct btrfs_path *path; 6498 struct btrfs_key key; 6499 struct extent_buffer *eb; 6500 struct btrfs_dev_stats_item *ptr; 6501 int ret; 6502 int i; 6503 6504 key.objectid = 0; 6505 key.type = BTRFS_DEV_STATS_KEY; 6506 key.offset = device->devid; 6507 6508 path = btrfs_alloc_path(); 6509 BUG_ON(!path); 6510 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 6511 if (ret < 0) { 6512 printk_in_rcu(KERN_WARNING "BTRFS: " 6513 "error %d while searching for dev_stats item for device %s!\n", 6514 ret, rcu_str_deref(device->name)); 6515 goto out; 6516 } 6517 6518 if (ret == 0 && 6519 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 6520 /* need to delete old one and insert a new one */ 6521 ret = btrfs_del_item(trans, dev_root, path); 6522 if (ret != 0) { 6523 printk_in_rcu(KERN_WARNING "BTRFS: " 6524 "delete too small dev_stats item for device %s failed %d!\n", 6525 rcu_str_deref(device->name), ret); 6526 goto out; 6527 } 6528 ret = 1; 6529 } 6530 6531 if (ret == 1) { 6532 /* need to insert a new item */ 6533 btrfs_release_path(path); 6534 ret = btrfs_insert_empty_item(trans, dev_root, path, 6535 &key, sizeof(*ptr)); 6536 if (ret < 0) { 6537 printk_in_rcu(KERN_WARNING "BTRFS: " 6538 "insert dev_stats item for device %s failed %d!\n", 6539 rcu_str_deref(device->name), ret); 6540 goto out; 6541 } 6542 } 6543 6544 eb = path->nodes[0]; 6545 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 6546 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 6547 btrfs_set_dev_stats_value(eb, ptr, i, 6548 btrfs_dev_stat_read(device, i)); 6549 btrfs_mark_buffer_dirty(eb); 6550 6551 out: 6552 btrfs_free_path(path); 6553 return ret; 6554 } 6555 6556 /* 6557 * called from commit_transaction. Writes all changed device stats to disk. 
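 *
 * The per-device change counter (dev_stats_ccnt) is sampled before the
 * item is written and only that sample is subtracted afterwards, so any
 * errors recorded while the update is in flight keep the device dirty
 * and are written out by a later commit.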
6558 */ 6559 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 6560 struct btrfs_fs_info *fs_info) 6561 { 6562 struct btrfs_root *dev_root = fs_info->dev_root; 6563 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6564 struct btrfs_device *device; 6565 int stats_cnt; 6566 int ret = 0; 6567 6568 mutex_lock(&fs_devices->device_list_mutex); 6569 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6570 if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device)) 6571 continue; 6572 6573 stats_cnt = atomic_read(&device->dev_stats_ccnt); 6574 ret = update_dev_stat_item(trans, dev_root, device); 6575 if (!ret) 6576 atomic_sub(stats_cnt, &device->dev_stats_ccnt); 6577 } 6578 mutex_unlock(&fs_devices->device_list_mutex); 6579 6580 return ret; 6581 } 6582 6583 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) 6584 { 6585 btrfs_dev_stat_inc(dev, index); 6586 btrfs_dev_stat_print_on_error(dev); 6587 } 6588 6589 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) 6590 { 6591 if (!dev->dev_stats_valid) 6592 return; 6593 printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " 6594 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", 6595 rcu_str_deref(dev->name), 6596 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 6597 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 6598 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 6599 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 6600 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 6601 } 6602 6603 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 6604 { 6605 int i; 6606 6607 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 6608 if (btrfs_dev_stat_read(dev, i) != 0) 6609 break; 6610 if (i == BTRFS_DEV_STAT_VALUES_MAX) 6611 return; /* all values == 0, suppress message */ 6612 6613 printk_in_rcu(KERN_INFO "BTRFS: " 6614 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", 6615 rcu_str_deref(dev->name), 6616 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 6617 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 6618 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 6619 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 6620 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 6621 } 6622 6623 int btrfs_get_dev_stats(struct btrfs_root *root, 6624 struct btrfs_ioctl_get_dev_stats *stats) 6625 { 6626 struct btrfs_device *dev; 6627 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 6628 int i; 6629 6630 mutex_lock(&fs_devices->device_list_mutex); 6631 dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL); 6632 mutex_unlock(&fs_devices->device_list_mutex); 6633 6634 if (!dev) { 6635 btrfs_warn(root->fs_info, "get dev_stats failed, device not found"); 6636 return -ENODEV; 6637 } else if (!dev->dev_stats_valid) { 6638 btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid"); 6639 return -ENODEV; 6640 } else if (stats->flags & BTRFS_DEV_STATS_RESET) { 6641 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 6642 if (stats->nr_items > i) 6643 stats->values[i] = 6644 btrfs_dev_stat_read_and_reset(dev, i); 6645 else 6646 btrfs_dev_stat_reset(dev, i); 6647 } 6648 } else { 6649 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 6650 if (stats->nr_items > i) 6651 stats->values[i] = btrfs_dev_stat_read(dev, i); 6652 } 6653 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) 6654 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 6655 return 0; 6656 } 6657 6658 int btrfs_scratch_superblock(struct btrfs_device *device) 
6659 { 6660 struct buffer_head *bh; 6661 struct btrfs_super_block *disk_super; 6662 6663 bh = btrfs_read_dev_super(device->bdev); 6664 if (!bh) 6665 return -EINVAL; 6666 disk_super = (struct btrfs_super_block *)bh->b_data; 6667 6668 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 6669 set_buffer_dirty(bh); 6670 sync_dirty_buffer(bh); 6671 brelse(bh); 6672 6673 return 0; 6674 } 6675 6676 /* 6677 * Update the size of all devices, which is used for writing out the 6678 * super blocks. 6679 */ 6680 void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) 6681 { 6682 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6683 struct btrfs_device *curr, *next; 6684 6685 if (list_empty(&fs_devices->resized_devices)) 6686 return; 6687 6688 mutex_lock(&fs_devices->device_list_mutex); 6689 lock_chunks(fs_info->dev_root); 6690 list_for_each_entry_safe(curr, next, &fs_devices->resized_devices, 6691 resized_list) { 6692 list_del_init(&curr->resized_list); 6693 curr->commit_total_bytes = curr->disk_total_bytes; 6694 } 6695 unlock_chunks(fs_info->dev_root); 6696 mutex_unlock(&fs_devices->device_list_mutex); 6697 } 6698 6699 /* Must be invoked during the transaction commit */ 6700 void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, 6701 struct btrfs_transaction *transaction) 6702 { 6703 struct extent_map *em; 6704 struct map_lookup *map; 6705 struct btrfs_device *dev; 6706 int i; 6707 6708 if (list_empty(&transaction->pending_chunks)) 6709 return; 6710 6711 /* In order to kick the device replace finish process */ 6712 lock_chunks(root); 6713 list_for_each_entry(em, &transaction->pending_chunks, list) { 6714 map = (struct map_lookup *)em->bdev; 6715 6716 for (i = 0; i < map->num_stripes; i++) { 6717 dev = map->stripes[i].dev; 6718 dev->commit_bytes_used = dev->bytes_used; 6719 } 6720 } 6721 unlock_chunks(root); 6722 } 6723