/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);

static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

static void lock_chunks(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
	mutex_unlock(&root->fs_info->chunk_mutex);
}

static struct btrfs_fs_devices *__alloc_fs_devices(void)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->list);

	return fs_devs;
}

/**
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:	a pointer to the UUID for this FS.  If NULL, a new UUID is
 *		generated.
 *
 * Return: a pointer to a new &struct btrfs_fs_devices on success;
 * ERR_PTR() on error.  The returned struct is not linked onto any lists
 * and can be destroyed with kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = __alloc_fs_devices();
	if (IS_ERR(fs_devs))
		return fs_devs;

	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
	else
		generate_random_uuid(fs_devs->fsid);

	return fs_devs;
}
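
/*
 * Illustrative sketch (the local names here are hypothetical, not part of
 * the code above): the calling pattern implied by the kernel-doc for
 * alloc_fs_devices(), which follows the usual ERR_PTR() convention and
 * returns a struct that is not yet linked onto fs_uuids:
 *
 *	struct btrfs_fs_devices *fs_devs;
 *
 *	fs_devs = alloc_fs_devices(NULL);
 *	if (IS_ERR(fs_devs))
 *		return PTR_ERR(fs_devs);
 *	(a NULL fsid asks for a freshly generated UUID; until the struct is
 *	 added to a list it can simply be released again with kfree())
 */
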
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		rcu_string_free(device->name);
		kfree(device);
	}
	kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
				 enum kobject_action action)
{
	int ret;

	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
	if (ret)
		pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
			action,
			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
			&disk_to_dev(bdev->bd_disk)->kobj);
}

void btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, list);
		list_del(&fs_devices->list);
		free_fs_devices(fs_devices);
	}
}

static struct btrfs_device *__alloc_device(void)
{
	struct btrfs_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_NOFS);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);

	spin_lock_init(&dev->io_lock);

	spin_lock_init(&dev->reada_lock);
	atomic_set(&dev->reada_in_flight, 0);
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);

	return dev;
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
						   u64 devid, u8 *uuid)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		printk(KERN_INFO "btrfs: open %s failed\n", device_path);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, 4096);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*bh = btrfs_read_dev_super(*bdev);
	if (!*bh) {
		ret = -EINVAL;
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	*bh = NULL;
	return ret;
}
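
/*
 * Illustrative sketch of how callers later in this file (for example the
 * device removal and lookup paths) use btrfs_get_bdev_and_sb(); "path" and
 * "holder" below are placeholders for the caller's values:
 *
 *	struct block_device *bdev;
 *	struct buffer_head *bh;
 *	struct btrfs_super_block *disk_super;
 *	int ret;
 *
 *	ret = btrfs_get_bdev_and_sb(path, FMODE_READ, holder, 0, &bdev, &bh);
 *	if (ret)
 *		return ret;	(on failure *bdev and *bh are set to NULL)
 *	disk_super = (struct btrfs_super_block *)bh->b_data;
 *	... read the needed fields from disk_super ...
 *	brelse(bh);
 *	blkdev_put(bdev, FMODE_READ);
 */
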
static void requeue_list(struct btrfs_pending_bios *pending_bios,
			 struct bio *head, struct bio *tail)
{

	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_fs_info *fs_info;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long limit;
	unsigned long last_waited = 0;
	int force_reg = 0;
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device.  We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = blk_get_backing_dev_info(device->bdev);
	fs_info = device->dev_root->fs_info;
	limit = btrfs_async_submit_limit(fs_info);
	limit = limit * 2 / 3;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
316 */ 317 if (device->pending_sync_bios.head == NULL && 318 device->pending_bios.head == NULL) { 319 again = 0; 320 device->running_pending = 0; 321 } else { 322 again = 1; 323 device->running_pending = 1; 324 } 325 326 pending_bios->head = NULL; 327 pending_bios->tail = NULL; 328 329 spin_unlock(&device->io_lock); 330 331 while (pending) { 332 333 rmb(); 334 /* we want to work on both lists, but do more bios on the 335 * sync list than the regular list 336 */ 337 if ((num_run > 32 && 338 pending_bios != &device->pending_sync_bios && 339 device->pending_sync_bios.head) || 340 (num_run > 64 && pending_bios == &device->pending_sync_bios && 341 device->pending_bios.head)) { 342 spin_lock(&device->io_lock); 343 requeue_list(pending_bios, pending, tail); 344 goto loop_lock; 345 } 346 347 cur = pending; 348 pending = pending->bi_next; 349 cur->bi_next = NULL; 350 351 if (atomic_dec_return(&fs_info->nr_async_bios) < limit && 352 waitqueue_active(&fs_info->async_submit_wait)) 353 wake_up(&fs_info->async_submit_wait); 354 355 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 356 357 /* 358 * if we're doing the sync list, record that our 359 * plug has some sync requests on it 360 * 361 * If we're doing the regular list and there are 362 * sync requests sitting around, unplug before 363 * we add more 364 */ 365 if (pending_bios == &device->pending_sync_bios) { 366 sync_pending = 1; 367 } else if (sync_pending) { 368 blk_finish_plug(&plug); 369 blk_start_plug(&plug); 370 sync_pending = 0; 371 } 372 373 btrfsic_submit_bio(cur->bi_rw, cur); 374 num_run++; 375 batch_run++; 376 if (need_resched()) 377 cond_resched(); 378 379 /* 380 * we made progress, there is more work to do and the bdi 381 * is now congested. Back off and let other work structs 382 * run instead 383 */ 384 if (pending && bdi_write_congested(bdi) && batch_run > 8 && 385 fs_info->fs_devices->open_devices > 1) { 386 struct io_context *ioc; 387 388 ioc = current->io_context; 389 390 /* 391 * the main goal here is that we don't want to 392 * block if we're going to be able to submit 393 * more requests without blocking. 394 * 395 * This code does two great things, it pokes into 396 * the elevator code from a filesystem _and_ 397 * it makes assumptions about how batching works. 398 */ 399 if (ioc && ioc->nr_batch_requests > 0 && 400 time_before(jiffies, ioc->last_waited + HZ/50UL) && 401 (last_waited == 0 || 402 ioc->last_waited == last_waited)) { 403 /* 404 * we want to go through our batch of 405 * requests and stop. 
So, we copy out 406 * the ioc->last_waited time and test 407 * against it before looping 408 */ 409 last_waited = ioc->last_waited; 410 if (need_resched()) 411 cond_resched(); 412 continue; 413 } 414 spin_lock(&device->io_lock); 415 requeue_list(pending_bios, pending, tail); 416 device->running_pending = 1; 417 418 spin_unlock(&device->io_lock); 419 btrfs_requeue_work(&device->work); 420 goto done; 421 } 422 /* unplug every 64 requests just for good measure */ 423 if (batch_run % 64 == 0) { 424 blk_finish_plug(&plug); 425 blk_start_plug(&plug); 426 sync_pending = 0; 427 } 428 } 429 430 cond_resched(); 431 if (again) 432 goto loop; 433 434 spin_lock(&device->io_lock); 435 if (device->pending_bios.head || device->pending_sync_bios.head) 436 goto loop_lock; 437 spin_unlock(&device->io_lock); 438 439 done: 440 blk_finish_plug(&plug); 441 } 442 443 static void pending_bios_fn(struct btrfs_work *work) 444 { 445 struct btrfs_device *device; 446 447 device = container_of(work, struct btrfs_device, work); 448 run_scheduled_bios(device); 449 } 450 451 static noinline int device_list_add(const char *path, 452 struct btrfs_super_block *disk_super, 453 u64 devid, struct btrfs_fs_devices **fs_devices_ret) 454 { 455 struct btrfs_device *device; 456 struct btrfs_fs_devices *fs_devices; 457 struct rcu_string *name; 458 u64 found_transid = btrfs_super_generation(disk_super); 459 460 fs_devices = find_fsid(disk_super->fsid); 461 if (!fs_devices) { 462 fs_devices = alloc_fs_devices(disk_super->fsid); 463 if (IS_ERR(fs_devices)) 464 return PTR_ERR(fs_devices); 465 466 list_add(&fs_devices->list, &fs_uuids); 467 fs_devices->latest_devid = devid; 468 fs_devices->latest_trans = found_transid; 469 470 device = NULL; 471 } else { 472 device = __find_device(&fs_devices->devices, devid, 473 disk_super->dev_item.uuid); 474 } 475 if (!device) { 476 if (fs_devices->opened) 477 return -EBUSY; 478 479 device = btrfs_alloc_device(NULL, &devid, 480 disk_super->dev_item.uuid); 481 if (IS_ERR(device)) { 482 /* we can safely leave the fs_devices entry around */ 483 return PTR_ERR(device); 484 } 485 486 name = rcu_string_strdup(path, GFP_NOFS); 487 if (!name) { 488 kfree(device); 489 return -ENOMEM; 490 } 491 rcu_assign_pointer(device->name, name); 492 493 mutex_lock(&fs_devices->device_list_mutex); 494 list_add_rcu(&device->dev_list, &fs_devices->devices); 495 mutex_unlock(&fs_devices->device_list_mutex); 496 497 device->fs_devices = fs_devices; 498 fs_devices->num_devices++; 499 } else if (!device->name || strcmp(device->name->str, path)) { 500 name = rcu_string_strdup(path, GFP_NOFS); 501 if (!name) 502 return -ENOMEM; 503 rcu_string_free(device->name); 504 rcu_assign_pointer(device->name, name); 505 if (device->missing) { 506 fs_devices->missing_devices--; 507 device->missing = 0; 508 } 509 } 510 511 if (found_transid > fs_devices->latest_trans) { 512 fs_devices->latest_devid = devid; 513 fs_devices->latest_trans = found_transid; 514 } 515 *fs_devices_ret = fs_devices; 516 return 0; 517 } 518 519 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 520 { 521 struct btrfs_fs_devices *fs_devices; 522 struct btrfs_device *device; 523 struct btrfs_device *orig_dev; 524 525 fs_devices = alloc_fs_devices(orig->fsid); 526 if (IS_ERR(fs_devices)) 527 return fs_devices; 528 529 fs_devices->latest_devid = orig->latest_devid; 530 fs_devices->latest_trans = orig->latest_trans; 531 fs_devices->total_devices = orig->total_devices; 532 533 /* We have held the volume lock, it is safe to get the devices. 
*/ 534 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 535 struct rcu_string *name; 536 537 device = btrfs_alloc_device(NULL, &orig_dev->devid, 538 orig_dev->uuid); 539 if (IS_ERR(device)) 540 goto error; 541 542 /* 543 * This is ok to do without rcu read locked because we hold the 544 * uuid mutex so nothing we touch in here is going to disappear. 545 */ 546 name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); 547 if (!name) { 548 kfree(device); 549 goto error; 550 } 551 rcu_assign_pointer(device->name, name); 552 553 list_add(&device->dev_list, &fs_devices->devices); 554 device->fs_devices = fs_devices; 555 fs_devices->num_devices++; 556 } 557 return fs_devices; 558 error: 559 free_fs_devices(fs_devices); 560 return ERR_PTR(-ENOMEM); 561 } 562 563 void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, 564 struct btrfs_fs_devices *fs_devices, int step) 565 { 566 struct btrfs_device *device, *next; 567 568 struct block_device *latest_bdev = NULL; 569 u64 latest_devid = 0; 570 u64 latest_transid = 0; 571 572 mutex_lock(&uuid_mutex); 573 again: 574 /* This is the initialized path, it is safe to release the devices. */ 575 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 576 if (device->in_fs_metadata) { 577 if (!device->is_tgtdev_for_dev_replace && 578 (!latest_transid || 579 device->generation > latest_transid)) { 580 latest_devid = device->devid; 581 latest_transid = device->generation; 582 latest_bdev = device->bdev; 583 } 584 continue; 585 } 586 587 if (device->devid == BTRFS_DEV_REPLACE_DEVID) { 588 /* 589 * In the first step, keep the device which has 590 * the correct fsid and the devid that is used 591 * for the dev_replace procedure. 592 * In the second step, the dev_replace state is 593 * read from the device tree and it is known 594 * whether the procedure is really active or 595 * not, which means whether this device is 596 * used or whether it should be removed. 
597 */ 598 if (step == 0 || device->is_tgtdev_for_dev_replace) { 599 continue; 600 } 601 } 602 if (device->bdev) { 603 blkdev_put(device->bdev, device->mode); 604 device->bdev = NULL; 605 fs_devices->open_devices--; 606 } 607 if (device->writeable) { 608 list_del_init(&device->dev_alloc_list); 609 device->writeable = 0; 610 if (!device->is_tgtdev_for_dev_replace) 611 fs_devices->rw_devices--; 612 } 613 list_del_init(&device->dev_list); 614 fs_devices->num_devices--; 615 rcu_string_free(device->name); 616 kfree(device); 617 } 618 619 if (fs_devices->seed) { 620 fs_devices = fs_devices->seed; 621 goto again; 622 } 623 624 fs_devices->latest_bdev = latest_bdev; 625 fs_devices->latest_devid = latest_devid; 626 fs_devices->latest_trans = latest_transid; 627 628 mutex_unlock(&uuid_mutex); 629 } 630 631 static void __free_device(struct work_struct *work) 632 { 633 struct btrfs_device *device; 634 635 device = container_of(work, struct btrfs_device, rcu_work); 636 637 if (device->bdev) 638 blkdev_put(device->bdev, device->mode); 639 640 rcu_string_free(device->name); 641 kfree(device); 642 } 643 644 static void free_device(struct rcu_head *head) 645 { 646 struct btrfs_device *device; 647 648 device = container_of(head, struct btrfs_device, rcu); 649 650 INIT_WORK(&device->rcu_work, __free_device); 651 schedule_work(&device->rcu_work); 652 } 653 654 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 655 { 656 struct btrfs_device *device; 657 658 if (--fs_devices->opened > 0) 659 return 0; 660 661 mutex_lock(&fs_devices->device_list_mutex); 662 list_for_each_entry(device, &fs_devices->devices, dev_list) { 663 struct btrfs_device *new_device; 664 struct rcu_string *name; 665 666 if (device->bdev) 667 fs_devices->open_devices--; 668 669 if (device->writeable && !device->is_tgtdev_for_dev_replace) { 670 list_del_init(&device->dev_alloc_list); 671 fs_devices->rw_devices--; 672 } 673 674 if (device->can_discard) 675 fs_devices->num_can_discard--; 676 677 new_device = btrfs_alloc_device(NULL, &device->devid, 678 device->uuid); 679 BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ 680 681 /* Safe because we are under uuid_mutex */ 682 if (device->name) { 683 name = rcu_string_strdup(device->name->str, GFP_NOFS); 684 BUG_ON(!name); /* -ENOMEM */ 685 rcu_assign_pointer(new_device->name, name); 686 } 687 688 list_replace_rcu(&device->dev_list, &new_device->dev_list); 689 new_device->fs_devices = device->fs_devices; 690 691 call_rcu(&device->rcu, free_device); 692 } 693 mutex_unlock(&fs_devices->device_list_mutex); 694 695 WARN_ON(fs_devices->open_devices); 696 WARN_ON(fs_devices->rw_devices); 697 fs_devices->opened = 0; 698 fs_devices->seeding = 0; 699 700 return 0; 701 } 702 703 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 704 { 705 struct btrfs_fs_devices *seed_devices = NULL; 706 int ret; 707 708 mutex_lock(&uuid_mutex); 709 ret = __btrfs_close_devices(fs_devices); 710 if (!fs_devices->opened) { 711 seed_devices = fs_devices->seed; 712 fs_devices->seed = NULL; 713 } 714 mutex_unlock(&uuid_mutex); 715 716 while (seed_devices) { 717 fs_devices = seed_devices; 718 seed_devices = fs_devices->seed; 719 __btrfs_close_devices(fs_devices); 720 free_fs_devices(fs_devices); 721 } 722 /* 723 * Wait for rcu kworkers under __btrfs_close_devices 724 * to finish all blkdev_puts so device is really 725 * free when umount is done. 
726 */ 727 rcu_barrier(); 728 return ret; 729 } 730 731 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 732 fmode_t flags, void *holder) 733 { 734 struct request_queue *q; 735 struct block_device *bdev; 736 struct list_head *head = &fs_devices->devices; 737 struct btrfs_device *device; 738 struct block_device *latest_bdev = NULL; 739 struct buffer_head *bh; 740 struct btrfs_super_block *disk_super; 741 u64 latest_devid = 0; 742 u64 latest_transid = 0; 743 u64 devid; 744 int seeding = 1; 745 int ret = 0; 746 747 flags |= FMODE_EXCL; 748 749 list_for_each_entry(device, head, dev_list) { 750 if (device->bdev) 751 continue; 752 if (!device->name) 753 continue; 754 755 /* Just open everything we can; ignore failures here */ 756 if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, 757 &bdev, &bh)) 758 continue; 759 760 disk_super = (struct btrfs_super_block *)bh->b_data; 761 devid = btrfs_stack_device_id(&disk_super->dev_item); 762 if (devid != device->devid) 763 goto error_brelse; 764 765 if (memcmp(device->uuid, disk_super->dev_item.uuid, 766 BTRFS_UUID_SIZE)) 767 goto error_brelse; 768 769 device->generation = btrfs_super_generation(disk_super); 770 if (!latest_transid || device->generation > latest_transid) { 771 latest_devid = devid; 772 latest_transid = device->generation; 773 latest_bdev = bdev; 774 } 775 776 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 777 device->writeable = 0; 778 } else { 779 device->writeable = !bdev_read_only(bdev); 780 seeding = 0; 781 } 782 783 q = bdev_get_queue(bdev); 784 if (blk_queue_discard(q)) { 785 device->can_discard = 1; 786 fs_devices->num_can_discard++; 787 } 788 789 device->bdev = bdev; 790 device->in_fs_metadata = 0; 791 device->mode = flags; 792 793 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 794 fs_devices->rotating = 1; 795 796 fs_devices->open_devices++; 797 if (device->writeable && !device->is_tgtdev_for_dev_replace) { 798 fs_devices->rw_devices++; 799 list_add(&device->dev_alloc_list, 800 &fs_devices->alloc_list); 801 } 802 brelse(bh); 803 continue; 804 805 error_brelse: 806 brelse(bh); 807 blkdev_put(bdev, flags); 808 continue; 809 } 810 if (fs_devices->open_devices == 0) { 811 ret = -EINVAL; 812 goto out; 813 } 814 fs_devices->seeding = seeding; 815 fs_devices->opened = 1; 816 fs_devices->latest_bdev = latest_bdev; 817 fs_devices->latest_devid = latest_devid; 818 fs_devices->latest_trans = latest_transid; 819 fs_devices->total_rw_bytes = 0; 820 out: 821 return ret; 822 } 823 824 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 825 fmode_t flags, void *holder) 826 { 827 int ret; 828 829 mutex_lock(&uuid_mutex); 830 if (fs_devices->opened) { 831 fs_devices->opened++; 832 ret = 0; 833 } else { 834 ret = __btrfs_open_devices(fs_devices, flags, holder); 835 } 836 mutex_unlock(&uuid_mutex); 837 return ret; 838 } 839 840 /* 841 * Look for a btrfs signature on a device. This may be called out of the mount path 842 * and we are not allowed to call set_blocksize during the scan. 
The superblock 843 * is read via pagecache 844 */ 845 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 846 struct btrfs_fs_devices **fs_devices_ret) 847 { 848 struct btrfs_super_block *disk_super; 849 struct block_device *bdev; 850 struct page *page; 851 void *p; 852 int ret = -EINVAL; 853 u64 devid; 854 u64 transid; 855 u64 total_devices; 856 u64 bytenr; 857 pgoff_t index; 858 859 /* 860 * we would like to check all the supers, but that would make 861 * a btrfs mount succeed after a mkfs from a different FS. 862 * So, we need to add a special mount option to scan for 863 * later supers, using BTRFS_SUPER_MIRROR_MAX instead 864 */ 865 bytenr = btrfs_sb_offset(0); 866 flags |= FMODE_EXCL; 867 mutex_lock(&uuid_mutex); 868 869 bdev = blkdev_get_by_path(path, flags, holder); 870 871 if (IS_ERR(bdev)) { 872 ret = PTR_ERR(bdev); 873 goto error; 874 } 875 876 /* make sure our super fits in the device */ 877 if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode)) 878 goto error_bdev_put; 879 880 /* make sure our super fits in the page */ 881 if (sizeof(*disk_super) > PAGE_CACHE_SIZE) 882 goto error_bdev_put; 883 884 /* make sure our super doesn't straddle pages on disk */ 885 index = bytenr >> PAGE_CACHE_SHIFT; 886 if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index) 887 goto error_bdev_put; 888 889 /* pull in the page with our super */ 890 page = read_cache_page_gfp(bdev->bd_inode->i_mapping, 891 index, GFP_NOFS); 892 893 if (IS_ERR_OR_NULL(page)) 894 goto error_bdev_put; 895 896 p = kmap(page); 897 898 /* align our pointer to the offset of the super block */ 899 disk_super = p + (bytenr & ~PAGE_CACHE_MASK); 900 901 if (btrfs_super_bytenr(disk_super) != bytenr || 902 btrfs_super_magic(disk_super) != BTRFS_MAGIC) 903 goto error_unmap; 904 905 devid = btrfs_stack_device_id(&disk_super->dev_item); 906 transid = btrfs_super_generation(disk_super); 907 total_devices = btrfs_super_num_devices(disk_super); 908 909 if (disk_super->label[0]) { 910 if (disk_super->label[BTRFS_LABEL_SIZE - 1]) 911 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; 912 printk(KERN_INFO "device label %s ", disk_super->label); 913 } else { 914 printk(KERN_INFO "device fsid %pU ", disk_super->fsid); 915 } 916 917 printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); 918 919 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 920 if (!ret && fs_devices_ret) 921 (*fs_devices_ret)->total_devices = total_devices; 922 923 error_unmap: 924 kunmap(page); 925 page_cache_release(page); 926 927 error_bdev_put: 928 blkdev_put(bdev, flags); 929 error: 930 mutex_unlock(&uuid_mutex); 931 return ret; 932 } 933 934 /* helper to account the used device space in the range */ 935 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 936 u64 end, u64 *length) 937 { 938 struct btrfs_key key; 939 struct btrfs_root *root = device->dev_root; 940 struct btrfs_dev_extent *dev_extent; 941 struct btrfs_path *path; 942 u64 extent_end; 943 int ret; 944 int slot; 945 struct extent_buffer *l; 946 947 *length = 0; 948 949 if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace) 950 return 0; 951 952 path = btrfs_alloc_path(); 953 if (!path) 954 return -ENOMEM; 955 path->reada = 2; 956 957 key.objectid = device->devid; 958 key.offset = start; 959 key.type = BTRFS_DEV_EXTENT_KEY; 960 961 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 962 if (ret < 0) 963 goto out; 964 if (ret > 0) { 965 ret = btrfs_previous_item(root, path, key.objectid, key.type); 
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
			goto next;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (key.offset <= start && extent_end > end) {
			*length = end - start + 1;
			break;
		} else if (key.offset <= start && extent_end > start)
			*length += extent_end - start;
		else if (key.offset > start && extent_end <= end)
			*length += extent_end - key.offset;
		else if (key.offset > start && key.offset <= end) {
			*length += end - key.offset + 1;
			break;
		} else if (key.offset > end)
			break;

next:
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

static int contains_pending_extent(struct btrfs_trans_handle *trans,
				   struct btrfs_device *device,
				   u64 *start, u64 len)
{
	struct extent_map *em;
	int ret = 0;

	list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
		struct map_lookup *map;
		int i;

		map = (struct map_lookup *)em->bdev;
		for (i = 0; i < map->num_stripes; i++) {
			if (map->stripes[i].dev != device)
				continue;
			if (map->stripes[i].physical >= *start + len ||
			    map->stripes[i].physical + em->orig_block_len <=
			    *start)
				continue;
			*start = map->stripes[i].physical +
				 em->orig_block_len;
			ret = 1;
		}
	}

	return ret;
}


/*
 * find_free_dev_extent - find free space in the specified device
 * @device:	the device which we search the free space in
 * @num_bytes:	the size of the free space that we need
 * @start:	used to store the start of the free space that we find
 * @len:	the size of the free space that we find, or the size of the
 *		max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space that we find.  But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
1066 */ 1067 int find_free_dev_extent(struct btrfs_trans_handle *trans, 1068 struct btrfs_device *device, u64 num_bytes, 1069 u64 *start, u64 *len) 1070 { 1071 struct btrfs_key key; 1072 struct btrfs_root *root = device->dev_root; 1073 struct btrfs_dev_extent *dev_extent; 1074 struct btrfs_path *path; 1075 u64 hole_size; 1076 u64 max_hole_start; 1077 u64 max_hole_size; 1078 u64 extent_end; 1079 u64 search_start; 1080 u64 search_end = device->total_bytes; 1081 int ret; 1082 int slot; 1083 struct extent_buffer *l; 1084 1085 /* FIXME use last free of some kind */ 1086 1087 /* we don't want to overwrite the superblock on the drive, 1088 * so we make sure to start at an offset of at least 1MB 1089 */ 1090 search_start = max(root->fs_info->alloc_start, 1024ull * 1024); 1091 1092 path = btrfs_alloc_path(); 1093 if (!path) 1094 return -ENOMEM; 1095 again: 1096 max_hole_start = search_start; 1097 max_hole_size = 0; 1098 hole_size = 0; 1099 1100 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { 1101 ret = -ENOSPC; 1102 goto out; 1103 } 1104 1105 path->reada = 2; 1106 path->search_commit_root = 1; 1107 path->skip_locking = 1; 1108 1109 key.objectid = device->devid; 1110 key.offset = search_start; 1111 key.type = BTRFS_DEV_EXTENT_KEY; 1112 1113 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1114 if (ret < 0) 1115 goto out; 1116 if (ret > 0) { 1117 ret = btrfs_previous_item(root, path, key.objectid, key.type); 1118 if (ret < 0) 1119 goto out; 1120 } 1121 1122 while (1) { 1123 l = path->nodes[0]; 1124 slot = path->slots[0]; 1125 if (slot >= btrfs_header_nritems(l)) { 1126 ret = btrfs_next_leaf(root, path); 1127 if (ret == 0) 1128 continue; 1129 if (ret < 0) 1130 goto out; 1131 1132 break; 1133 } 1134 btrfs_item_key_to_cpu(l, &key, slot); 1135 1136 if (key.objectid < device->devid) 1137 goto next; 1138 1139 if (key.objectid > device->devid) 1140 break; 1141 1142 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 1143 goto next; 1144 1145 if (key.offset > search_start) { 1146 hole_size = key.offset - search_start; 1147 1148 /* 1149 * Have to check before we set max_hole_start, otherwise 1150 * we could end up sending back this offset anyway. 1151 */ 1152 if (contains_pending_extent(trans, device, 1153 &search_start, 1154 hole_size)) 1155 hole_size = 0; 1156 1157 if (hole_size > max_hole_size) { 1158 max_hole_start = search_start; 1159 max_hole_size = hole_size; 1160 } 1161 1162 /* 1163 * If this free space is greater than which we need, 1164 * it must be the max free space that we have found 1165 * until now, so max_hole_start must point to the start 1166 * of this free space and the length of this free space 1167 * is stored in max_hole_size. Thus, we return 1168 * max_hole_start and max_hole_size and go back to the 1169 * caller. 1170 */ 1171 if (hole_size >= num_bytes) { 1172 ret = 0; 1173 goto out; 1174 } 1175 } 1176 1177 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1178 extent_end = key.offset + btrfs_dev_extent_length(l, 1179 dev_extent); 1180 if (extent_end > search_start) 1181 search_start = extent_end; 1182 next: 1183 path->slots[0]++; 1184 cond_resched(); 1185 } 1186 1187 /* 1188 * At this point, search_start should be the end of 1189 * allocated dev extents, and when shrinking the device, 1190 * search_end may be smaller than search_start. 
1191 */ 1192 if (search_end > search_start) 1193 hole_size = search_end - search_start; 1194 1195 if (hole_size > max_hole_size) { 1196 max_hole_start = search_start; 1197 max_hole_size = hole_size; 1198 } 1199 1200 if (contains_pending_extent(trans, device, &search_start, hole_size)) { 1201 btrfs_release_path(path); 1202 goto again; 1203 } 1204 1205 /* See above. */ 1206 if (hole_size < num_bytes) 1207 ret = -ENOSPC; 1208 else 1209 ret = 0; 1210 1211 out: 1212 btrfs_free_path(path); 1213 *start = max_hole_start; 1214 if (len) 1215 *len = max_hole_size; 1216 return ret; 1217 } 1218 1219 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1220 struct btrfs_device *device, 1221 u64 start) 1222 { 1223 int ret; 1224 struct btrfs_path *path; 1225 struct btrfs_root *root = device->dev_root; 1226 struct btrfs_key key; 1227 struct btrfs_key found_key; 1228 struct extent_buffer *leaf = NULL; 1229 struct btrfs_dev_extent *extent = NULL; 1230 1231 path = btrfs_alloc_path(); 1232 if (!path) 1233 return -ENOMEM; 1234 1235 key.objectid = device->devid; 1236 key.offset = start; 1237 key.type = BTRFS_DEV_EXTENT_KEY; 1238 again: 1239 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1240 if (ret > 0) { 1241 ret = btrfs_previous_item(root, path, key.objectid, 1242 BTRFS_DEV_EXTENT_KEY); 1243 if (ret) 1244 goto out; 1245 leaf = path->nodes[0]; 1246 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1247 extent = btrfs_item_ptr(leaf, path->slots[0], 1248 struct btrfs_dev_extent); 1249 BUG_ON(found_key.offset > start || found_key.offset + 1250 btrfs_dev_extent_length(leaf, extent) < start); 1251 key = found_key; 1252 btrfs_release_path(path); 1253 goto again; 1254 } else if (ret == 0) { 1255 leaf = path->nodes[0]; 1256 extent = btrfs_item_ptr(leaf, path->slots[0], 1257 struct btrfs_dev_extent); 1258 } else { 1259 btrfs_error(root->fs_info, ret, "Slot search failed"); 1260 goto out; 1261 } 1262 1263 if (device->bytes_used > 0) { 1264 u64 len = btrfs_dev_extent_length(leaf, extent); 1265 device->bytes_used -= len; 1266 spin_lock(&root->fs_info->free_chunk_lock); 1267 root->fs_info->free_chunk_space += len; 1268 spin_unlock(&root->fs_info->free_chunk_lock); 1269 } 1270 ret = btrfs_del_item(trans, root, path); 1271 if (ret) { 1272 btrfs_error(root->fs_info, ret, 1273 "Failed to remove dev extent item"); 1274 } 1275 out: 1276 btrfs_free_path(path); 1277 return ret; 1278 } 1279 1280 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 1281 struct btrfs_device *device, 1282 u64 chunk_tree, u64 chunk_objectid, 1283 u64 chunk_offset, u64 start, u64 num_bytes) 1284 { 1285 int ret; 1286 struct btrfs_path *path; 1287 struct btrfs_root *root = device->dev_root; 1288 struct btrfs_dev_extent *extent; 1289 struct extent_buffer *leaf; 1290 struct btrfs_key key; 1291 1292 WARN_ON(!device->in_fs_metadata); 1293 WARN_ON(device->is_tgtdev_for_dev_replace); 1294 path = btrfs_alloc_path(); 1295 if (!path) 1296 return -ENOMEM; 1297 1298 key.objectid = device->devid; 1299 key.offset = start; 1300 key.type = BTRFS_DEV_EXTENT_KEY; 1301 ret = btrfs_insert_empty_item(trans, root, path, &key, 1302 sizeof(*extent)); 1303 if (ret) 1304 goto out; 1305 1306 leaf = path->nodes[0]; 1307 extent = btrfs_item_ptr(leaf, path->slots[0], 1308 struct btrfs_dev_extent); 1309 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); 1310 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); 1311 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 1312 1313 write_extent_buffer(leaf, 
root->fs_info->chunk_tree_uuid, 1314 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), 1315 BTRFS_UUID_SIZE); 1316 1317 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 1318 btrfs_mark_buffer_dirty(leaf); 1319 out: 1320 btrfs_free_path(path); 1321 return ret; 1322 } 1323 1324 static u64 find_next_chunk(struct btrfs_fs_info *fs_info) 1325 { 1326 struct extent_map_tree *em_tree; 1327 struct extent_map *em; 1328 struct rb_node *n; 1329 u64 ret = 0; 1330 1331 em_tree = &fs_info->mapping_tree.map_tree; 1332 read_lock(&em_tree->lock); 1333 n = rb_last(&em_tree->map); 1334 if (n) { 1335 em = rb_entry(n, struct extent_map, rb_node); 1336 ret = em->start + em->len; 1337 } 1338 read_unlock(&em_tree->lock); 1339 1340 return ret; 1341 } 1342 1343 static noinline int find_next_devid(struct btrfs_fs_info *fs_info, 1344 u64 *devid_ret) 1345 { 1346 int ret; 1347 struct btrfs_key key; 1348 struct btrfs_key found_key; 1349 struct btrfs_path *path; 1350 1351 path = btrfs_alloc_path(); 1352 if (!path) 1353 return -ENOMEM; 1354 1355 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1356 key.type = BTRFS_DEV_ITEM_KEY; 1357 key.offset = (u64)-1; 1358 1359 ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); 1360 if (ret < 0) 1361 goto error; 1362 1363 BUG_ON(ret == 0); /* Corruption */ 1364 1365 ret = btrfs_previous_item(fs_info->chunk_root, path, 1366 BTRFS_DEV_ITEMS_OBJECTID, 1367 BTRFS_DEV_ITEM_KEY); 1368 if (ret) { 1369 *devid_ret = 1; 1370 } else { 1371 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1372 path->slots[0]); 1373 *devid_ret = found_key.offset + 1; 1374 } 1375 ret = 0; 1376 error: 1377 btrfs_free_path(path); 1378 return ret; 1379 } 1380 1381 /* 1382 * the device information is stored in the chunk root 1383 * the btrfs_device struct should be fully filled in 1384 */ 1385 static int btrfs_add_device(struct btrfs_trans_handle *trans, 1386 struct btrfs_root *root, 1387 struct btrfs_device *device) 1388 { 1389 int ret; 1390 struct btrfs_path *path; 1391 struct btrfs_dev_item *dev_item; 1392 struct extent_buffer *leaf; 1393 struct btrfs_key key; 1394 unsigned long ptr; 1395 1396 root = root->fs_info->chunk_root; 1397 1398 path = btrfs_alloc_path(); 1399 if (!path) 1400 return -ENOMEM; 1401 1402 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1403 key.type = BTRFS_DEV_ITEM_KEY; 1404 key.offset = device->devid; 1405 1406 ret = btrfs_insert_empty_item(trans, root, path, &key, 1407 sizeof(*dev_item)); 1408 if (ret) 1409 goto out; 1410 1411 leaf = path->nodes[0]; 1412 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1413 1414 btrfs_set_device_id(leaf, dev_item, device->devid); 1415 btrfs_set_device_generation(leaf, dev_item, 0); 1416 btrfs_set_device_type(leaf, dev_item, device->type); 1417 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1418 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1419 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1420 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1421 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1422 btrfs_set_device_group(leaf, dev_item, 0); 1423 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1424 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1425 btrfs_set_device_start_offset(leaf, dev_item, 0); 1426 1427 ptr = (unsigned long)btrfs_device_uuid(dev_item); 1428 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1429 ptr = (unsigned long)btrfs_device_fsid(dev_item); 1430 write_extent_buffer(leaf, root->fs_info->fsid, ptr, 
BTRFS_UUID_SIZE); 1431 btrfs_mark_buffer_dirty(leaf); 1432 1433 ret = 0; 1434 out: 1435 btrfs_free_path(path); 1436 return ret; 1437 } 1438 1439 static int btrfs_rm_dev_item(struct btrfs_root *root, 1440 struct btrfs_device *device) 1441 { 1442 int ret; 1443 struct btrfs_path *path; 1444 struct btrfs_key key; 1445 struct btrfs_trans_handle *trans; 1446 1447 root = root->fs_info->chunk_root; 1448 1449 path = btrfs_alloc_path(); 1450 if (!path) 1451 return -ENOMEM; 1452 1453 trans = btrfs_start_transaction(root, 0); 1454 if (IS_ERR(trans)) { 1455 btrfs_free_path(path); 1456 return PTR_ERR(trans); 1457 } 1458 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1459 key.type = BTRFS_DEV_ITEM_KEY; 1460 key.offset = device->devid; 1461 lock_chunks(root); 1462 1463 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1464 if (ret < 0) 1465 goto out; 1466 1467 if (ret > 0) { 1468 ret = -ENOENT; 1469 goto out; 1470 } 1471 1472 ret = btrfs_del_item(trans, root, path); 1473 if (ret) 1474 goto out; 1475 out: 1476 btrfs_free_path(path); 1477 unlock_chunks(root); 1478 btrfs_commit_transaction(trans, root); 1479 return ret; 1480 } 1481 1482 int btrfs_rm_device(struct btrfs_root *root, char *device_path) 1483 { 1484 struct btrfs_device *device; 1485 struct btrfs_device *next_device; 1486 struct block_device *bdev; 1487 struct buffer_head *bh = NULL; 1488 struct btrfs_super_block *disk_super; 1489 struct btrfs_fs_devices *cur_devices; 1490 u64 all_avail; 1491 u64 devid; 1492 u64 num_devices; 1493 u8 *dev_uuid; 1494 unsigned seq; 1495 int ret = 0; 1496 bool clear_super = false; 1497 1498 mutex_lock(&uuid_mutex); 1499 1500 do { 1501 seq = read_seqbegin(&root->fs_info->profiles_lock); 1502 1503 all_avail = root->fs_info->avail_data_alloc_bits | 1504 root->fs_info->avail_system_alloc_bits | 1505 root->fs_info->avail_metadata_alloc_bits; 1506 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 1507 1508 num_devices = root->fs_info->fs_devices->num_devices; 1509 btrfs_dev_replace_lock(&root->fs_info->dev_replace); 1510 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { 1511 WARN_ON(num_devices < 1); 1512 num_devices--; 1513 } 1514 btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1515 1516 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1517 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; 1518 goto out; 1519 } 1520 1521 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { 1522 ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; 1523 goto out; 1524 } 1525 1526 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && 1527 root->fs_info->fs_devices->rw_devices <= 2) { 1528 ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; 1529 goto out; 1530 } 1531 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && 1532 root->fs_info->fs_devices->rw_devices <= 3) { 1533 ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; 1534 goto out; 1535 } 1536 1537 if (strcmp(device_path, "missing") == 0) { 1538 struct list_head *devices; 1539 struct btrfs_device *tmp; 1540 1541 device = NULL; 1542 devices = &root->fs_info->fs_devices->devices; 1543 /* 1544 * It is safe to read the devices since the volume_mutex 1545 * is held. 
1546 */ 1547 list_for_each_entry(tmp, devices, dev_list) { 1548 if (tmp->in_fs_metadata && 1549 !tmp->is_tgtdev_for_dev_replace && 1550 !tmp->bdev) { 1551 device = tmp; 1552 break; 1553 } 1554 } 1555 bdev = NULL; 1556 bh = NULL; 1557 disk_super = NULL; 1558 if (!device) { 1559 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 1560 goto out; 1561 } 1562 } else { 1563 ret = btrfs_get_bdev_and_sb(device_path, 1564 FMODE_WRITE | FMODE_EXCL, 1565 root->fs_info->bdev_holder, 0, 1566 &bdev, &bh); 1567 if (ret) 1568 goto out; 1569 disk_super = (struct btrfs_super_block *)bh->b_data; 1570 devid = btrfs_stack_device_id(&disk_super->dev_item); 1571 dev_uuid = disk_super->dev_item.uuid; 1572 device = btrfs_find_device(root->fs_info, devid, dev_uuid, 1573 disk_super->fsid); 1574 if (!device) { 1575 ret = -ENOENT; 1576 goto error_brelse; 1577 } 1578 } 1579 1580 if (device->is_tgtdev_for_dev_replace) { 1581 ret = BTRFS_ERROR_DEV_TGT_REPLACE; 1582 goto error_brelse; 1583 } 1584 1585 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1586 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; 1587 goto error_brelse; 1588 } 1589 1590 if (device->writeable) { 1591 lock_chunks(root); 1592 list_del_init(&device->dev_alloc_list); 1593 unlock_chunks(root); 1594 root->fs_info->fs_devices->rw_devices--; 1595 clear_super = true; 1596 } 1597 1598 mutex_unlock(&uuid_mutex); 1599 ret = btrfs_shrink_device(device, 0); 1600 mutex_lock(&uuid_mutex); 1601 if (ret) 1602 goto error_undo; 1603 1604 /* 1605 * TODO: the superblock still includes this device in its num_devices 1606 * counter although write_all_supers() is not locked out. This 1607 * could give a filesystem state which requires a degraded mount. 1608 */ 1609 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1610 if (ret) 1611 goto error_undo; 1612 1613 spin_lock(&root->fs_info->free_chunk_lock); 1614 root->fs_info->free_chunk_space = device->total_bytes - 1615 device->bytes_used; 1616 spin_unlock(&root->fs_info->free_chunk_lock); 1617 1618 device->in_fs_metadata = 0; 1619 btrfs_scrub_cancel_dev(root->fs_info, device); 1620 1621 /* 1622 * the device list mutex makes sure that we don't change 1623 * the device list while someone else is writing out all 1624 * the device supers. 
1625 */ 1626 1627 cur_devices = device->fs_devices; 1628 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1629 list_del_rcu(&device->dev_list); 1630 1631 device->fs_devices->num_devices--; 1632 device->fs_devices->total_devices--; 1633 1634 if (device->missing) 1635 root->fs_info->fs_devices->missing_devices--; 1636 1637 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1638 struct btrfs_device, dev_list); 1639 if (device->bdev == root->fs_info->sb->s_bdev) 1640 root->fs_info->sb->s_bdev = next_device->bdev; 1641 if (device->bdev == root->fs_info->fs_devices->latest_bdev) 1642 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1643 1644 if (device->bdev) 1645 device->fs_devices->open_devices--; 1646 1647 call_rcu(&device->rcu, free_device); 1648 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1649 1650 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; 1651 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); 1652 1653 if (cur_devices->open_devices == 0) { 1654 struct btrfs_fs_devices *fs_devices; 1655 fs_devices = root->fs_info->fs_devices; 1656 while (fs_devices) { 1657 if (fs_devices->seed == cur_devices) 1658 break; 1659 fs_devices = fs_devices->seed; 1660 } 1661 fs_devices->seed = cur_devices->seed; 1662 cur_devices->seed = NULL; 1663 lock_chunks(root); 1664 __btrfs_close_devices(cur_devices); 1665 unlock_chunks(root); 1666 free_fs_devices(cur_devices); 1667 } 1668 1669 root->fs_info->num_tolerated_disk_barrier_failures = 1670 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); 1671 1672 /* 1673 * at this point, the device is zero sized. We want to 1674 * remove it from the devices list and zero out the old super 1675 */ 1676 if (clear_super && disk_super) { 1677 /* make sure this device isn't detected as part of 1678 * the FS anymore 1679 */ 1680 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 1681 set_buffer_dirty(bh); 1682 sync_dirty_buffer(bh); 1683 } 1684 1685 ret = 0; 1686 1687 /* Notify udev that device has changed */ 1688 if (bdev) 1689 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 1690 1691 error_brelse: 1692 brelse(bh); 1693 if (bdev) 1694 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1695 out: 1696 mutex_unlock(&uuid_mutex); 1697 return ret; 1698 error_undo: 1699 if (device->writeable) { 1700 lock_chunks(root); 1701 list_add(&device->dev_alloc_list, 1702 &root->fs_info->fs_devices->alloc_list); 1703 unlock_chunks(root); 1704 root->fs_info->fs_devices->rw_devices++; 1705 } 1706 goto error_brelse; 1707 } 1708 1709 void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, 1710 struct btrfs_device *srcdev) 1711 { 1712 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); 1713 list_del_rcu(&srcdev->dev_list); 1714 list_del_rcu(&srcdev->dev_alloc_list); 1715 fs_info->fs_devices->num_devices--; 1716 if (srcdev->missing) { 1717 fs_info->fs_devices->missing_devices--; 1718 fs_info->fs_devices->rw_devices++; 1719 } 1720 if (srcdev->can_discard) 1721 fs_info->fs_devices->num_can_discard--; 1722 if (srcdev->bdev) 1723 fs_info->fs_devices->open_devices--; 1724 1725 call_rcu(&srcdev->rcu, free_device); 1726 } 1727 1728 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 1729 struct btrfs_device *tgtdev) 1730 { 1731 struct btrfs_device *next_device; 1732 1733 WARN_ON(!tgtdev); 1734 mutex_lock(&fs_info->fs_devices->device_list_mutex); 1735 if (tgtdev->bdev) { 1736 btrfs_scratch_superblock(tgtdev); 1737 fs_info->fs_devices->open_devices--; 1738 } 1739 
fs_info->fs_devices->num_devices--; 1740 if (tgtdev->can_discard) 1741 fs_info->fs_devices->num_can_discard++; 1742 1743 next_device = list_entry(fs_info->fs_devices->devices.next, 1744 struct btrfs_device, dev_list); 1745 if (tgtdev->bdev == fs_info->sb->s_bdev) 1746 fs_info->sb->s_bdev = next_device->bdev; 1747 if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) 1748 fs_info->fs_devices->latest_bdev = next_device->bdev; 1749 list_del_rcu(&tgtdev->dev_list); 1750 1751 call_rcu(&tgtdev->rcu, free_device); 1752 1753 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 1754 } 1755 1756 static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, 1757 struct btrfs_device **device) 1758 { 1759 int ret = 0; 1760 struct btrfs_super_block *disk_super; 1761 u64 devid; 1762 u8 *dev_uuid; 1763 struct block_device *bdev; 1764 struct buffer_head *bh; 1765 1766 *device = NULL; 1767 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 1768 root->fs_info->bdev_holder, 0, &bdev, &bh); 1769 if (ret) 1770 return ret; 1771 disk_super = (struct btrfs_super_block *)bh->b_data; 1772 devid = btrfs_stack_device_id(&disk_super->dev_item); 1773 dev_uuid = disk_super->dev_item.uuid; 1774 *device = btrfs_find_device(root->fs_info, devid, dev_uuid, 1775 disk_super->fsid); 1776 brelse(bh); 1777 if (!*device) 1778 ret = -ENOENT; 1779 blkdev_put(bdev, FMODE_READ); 1780 return ret; 1781 } 1782 1783 int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, 1784 char *device_path, 1785 struct btrfs_device **device) 1786 { 1787 *device = NULL; 1788 if (strcmp(device_path, "missing") == 0) { 1789 struct list_head *devices; 1790 struct btrfs_device *tmp; 1791 1792 devices = &root->fs_info->fs_devices->devices; 1793 /* 1794 * It is safe to read the devices since the volume_mutex 1795 * is held by the caller. 1796 */ 1797 list_for_each_entry(tmp, devices, dev_list) { 1798 if (tmp->in_fs_metadata && !tmp->bdev) { 1799 *device = tmp; 1800 break; 1801 } 1802 } 1803 1804 if (!*device) { 1805 pr_err("btrfs: no missing device found\n"); 1806 return -ENOENT; 1807 } 1808 1809 return 0; 1810 } else { 1811 return btrfs_find_device_by_path(root, device_path, device); 1812 } 1813 } 1814 1815 /* 1816 * does all the dirty work required for changing file system's UUID. 
 */
static int btrfs_prepare_sprout(struct btrfs_root *root)
{
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = root->fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	BUG_ON(!mutex_is_locked(&uuid_mutex));
	if (!fs_devices->seeding)
		return -EINVAL;

	seed_devices = __alloc_fs_devices();
	if (IS_ERR(seed_devices))
		return PTR_ERR(seed_devices);

	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			     synchronize_rcu);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
	list_for_each_entry(device, &seed_devices->devices, dev_list) {
		device->fs_devices = seed_devices;
	}

	fs_devices->seeding = 0;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->total_devices = 0;
	fs_devices->seed = seed_devices;

	generate_random_uuid(fs_devices->fsid);
	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}

/*
 * store the expected generation for seed devices in device items.
1877 */ 1878 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 1879 struct btrfs_root *root) 1880 { 1881 struct btrfs_path *path; 1882 struct extent_buffer *leaf; 1883 struct btrfs_dev_item *dev_item; 1884 struct btrfs_device *device; 1885 struct btrfs_key key; 1886 u8 fs_uuid[BTRFS_UUID_SIZE]; 1887 u8 dev_uuid[BTRFS_UUID_SIZE]; 1888 u64 devid; 1889 int ret; 1890 1891 path = btrfs_alloc_path(); 1892 if (!path) 1893 return -ENOMEM; 1894 1895 root = root->fs_info->chunk_root; 1896 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1897 key.offset = 0; 1898 key.type = BTRFS_DEV_ITEM_KEY; 1899 1900 while (1) { 1901 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1902 if (ret < 0) 1903 goto error; 1904 1905 leaf = path->nodes[0]; 1906 next_slot: 1907 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1908 ret = btrfs_next_leaf(root, path); 1909 if (ret > 0) 1910 break; 1911 if (ret < 0) 1912 goto error; 1913 leaf = path->nodes[0]; 1914 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1915 btrfs_release_path(path); 1916 continue; 1917 } 1918 1919 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1920 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 1921 key.type != BTRFS_DEV_ITEM_KEY) 1922 break; 1923 1924 dev_item = btrfs_item_ptr(leaf, path->slots[0], 1925 struct btrfs_dev_item); 1926 devid = btrfs_device_id(leaf, dev_item); 1927 read_extent_buffer(leaf, dev_uuid, 1928 (unsigned long)btrfs_device_uuid(dev_item), 1929 BTRFS_UUID_SIZE); 1930 read_extent_buffer(leaf, fs_uuid, 1931 (unsigned long)btrfs_device_fsid(dev_item), 1932 BTRFS_UUID_SIZE); 1933 device = btrfs_find_device(root->fs_info, devid, dev_uuid, 1934 fs_uuid); 1935 BUG_ON(!device); /* Logic error */ 1936 1937 if (device->fs_devices->seeding) { 1938 btrfs_set_device_generation(leaf, dev_item, 1939 device->generation); 1940 btrfs_mark_buffer_dirty(leaf); 1941 } 1942 1943 path->slots[0]++; 1944 goto next_slot; 1945 } 1946 ret = 0; 1947 error: 1948 btrfs_free_path(path); 1949 return ret; 1950 } 1951 1952 int btrfs_init_new_device(struct btrfs_root *root, char *device_path) 1953 { 1954 struct request_queue *q; 1955 struct btrfs_trans_handle *trans; 1956 struct btrfs_device *device; 1957 struct block_device *bdev; 1958 struct list_head *devices; 1959 struct super_block *sb = root->fs_info->sb; 1960 struct rcu_string *name; 1961 u64 total_bytes; 1962 int seeding_dev = 0; 1963 int ret = 0; 1964 1965 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1966 return -EROFS; 1967 1968 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 1969 root->fs_info->bdev_holder); 1970 if (IS_ERR(bdev)) 1971 return PTR_ERR(bdev); 1972 1973 if (root->fs_info->fs_devices->seeding) { 1974 seeding_dev = 1; 1975 down_write(&sb->s_umount); 1976 mutex_lock(&uuid_mutex); 1977 } 1978 1979 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1980 1981 devices = &root->fs_info->fs_devices->devices; 1982 1983 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1984 list_for_each_entry(device, devices, dev_list) { 1985 if (device->bdev == bdev) { 1986 ret = -EEXIST; 1987 mutex_unlock( 1988 &root->fs_info->fs_devices->device_list_mutex); 1989 goto error; 1990 } 1991 } 1992 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1993 1994 device = btrfs_alloc_device(root->fs_info, NULL, NULL); 1995 if (IS_ERR(device)) { 1996 /* we can safely leave the fs_devices entry around */ 1997 ret = PTR_ERR(device); 1998 goto error; 1999 } 2000 2001 name = rcu_string_strdup(device_path, GFP_NOFS); 2002 if (!name) { 2003 
kfree(device); 2004 ret = -ENOMEM; 2005 goto error; 2006 } 2007 rcu_assign_pointer(device->name, name); 2008 2009 trans = btrfs_start_transaction(root, 0); 2010 if (IS_ERR(trans)) { 2011 rcu_string_free(device->name); 2012 kfree(device); 2013 ret = PTR_ERR(trans); 2014 goto error; 2015 } 2016 2017 lock_chunks(root); 2018 2019 q = bdev_get_queue(bdev); 2020 if (blk_queue_discard(q)) 2021 device->can_discard = 1; 2022 device->writeable = 1; 2023 device->generation = trans->transid; 2024 device->io_width = root->sectorsize; 2025 device->io_align = root->sectorsize; 2026 device->sector_size = root->sectorsize; 2027 device->total_bytes = i_size_read(bdev->bd_inode); 2028 device->disk_total_bytes = device->total_bytes; 2029 device->dev_root = root->fs_info->dev_root; 2030 device->bdev = bdev; 2031 device->in_fs_metadata = 1; 2032 device->is_tgtdev_for_dev_replace = 0; 2033 device->mode = FMODE_EXCL; 2034 set_blocksize(device->bdev, 4096); 2035 2036 if (seeding_dev) { 2037 sb->s_flags &= ~MS_RDONLY; 2038 ret = btrfs_prepare_sprout(root); 2039 BUG_ON(ret); /* -ENOMEM */ 2040 } 2041 2042 device->fs_devices = root->fs_info->fs_devices; 2043 2044 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2045 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); 2046 list_add(&device->dev_alloc_list, 2047 &root->fs_info->fs_devices->alloc_list); 2048 root->fs_info->fs_devices->num_devices++; 2049 root->fs_info->fs_devices->open_devices++; 2050 root->fs_info->fs_devices->rw_devices++; 2051 root->fs_info->fs_devices->total_devices++; 2052 if (device->can_discard) 2053 root->fs_info->fs_devices->num_can_discard++; 2054 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 2055 2056 spin_lock(&root->fs_info->free_chunk_lock); 2057 root->fs_info->free_chunk_space += device->total_bytes; 2058 spin_unlock(&root->fs_info->free_chunk_lock); 2059 2060 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 2061 root->fs_info->fs_devices->rotating = 1; 2062 2063 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); 2064 btrfs_set_super_total_bytes(root->fs_info->super_copy, 2065 total_bytes + device->total_bytes); 2066 2067 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); 2068 btrfs_set_super_num_devices(root->fs_info->super_copy, 2069 total_bytes + 1); 2070 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2071 2072 if (seeding_dev) { 2073 ret = init_first_rw_device(trans, root, device); 2074 if (ret) { 2075 btrfs_abort_transaction(trans, root, ret); 2076 goto error_trans; 2077 } 2078 ret = btrfs_finish_sprout(trans, root); 2079 if (ret) { 2080 btrfs_abort_transaction(trans, root, ret); 2081 goto error_trans; 2082 } 2083 } else { 2084 ret = btrfs_add_device(trans, root, device); 2085 if (ret) { 2086 btrfs_abort_transaction(trans, root, ret); 2087 goto error_trans; 2088 } 2089 } 2090 2091 /* 2092 * we've got more storage, clear any full flags on the space 2093 * infos 2094 */ 2095 btrfs_clear_space_info_full(root->fs_info); 2096 2097 unlock_chunks(root); 2098 root->fs_info->num_tolerated_disk_barrier_failures = 2099 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); 2100 ret = btrfs_commit_transaction(trans, root); 2101 2102 if (seeding_dev) { 2103 mutex_unlock(&uuid_mutex); 2104 up_write(&sb->s_umount); 2105 2106 if (ret) /* transaction commit */ 2107 return ret; 2108 2109 ret = btrfs_relocate_sys_chunks(root); 2110 if (ret < 0) 2111 btrfs_error(root->fs_info, ret, 2112 "Failed to relocate sys chunks after " 2113 "device initialization. 
This can be fixed " 2114 "using the \"btrfs balance\" command."); 2115 trans = btrfs_attach_transaction(root); 2116 if (IS_ERR(trans)) { 2117 if (PTR_ERR(trans) == -ENOENT) 2118 return 0; 2119 return PTR_ERR(trans); 2120 } 2121 ret = btrfs_commit_transaction(trans, root); 2122 } 2123 2124 return ret; 2125 2126 error_trans: 2127 unlock_chunks(root); 2128 btrfs_end_transaction(trans, root); 2129 rcu_string_free(device->name); 2130 kfree(device); 2131 error: 2132 blkdev_put(bdev, FMODE_EXCL); 2133 if (seeding_dev) { 2134 mutex_unlock(&uuid_mutex); 2135 up_write(&sb->s_umount); 2136 } 2137 return ret; 2138 } 2139 2140 int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, 2141 struct btrfs_device **device_out) 2142 { 2143 struct request_queue *q; 2144 struct btrfs_device *device; 2145 struct block_device *bdev; 2146 struct btrfs_fs_info *fs_info = root->fs_info; 2147 struct list_head *devices; 2148 struct rcu_string *name; 2149 u64 devid = BTRFS_DEV_REPLACE_DEVID; 2150 int ret = 0; 2151 2152 *device_out = NULL; 2153 if (fs_info->fs_devices->seeding) 2154 return -EINVAL; 2155 2156 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2157 fs_info->bdev_holder); 2158 if (IS_ERR(bdev)) 2159 return PTR_ERR(bdev); 2160 2161 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2162 2163 devices = &fs_info->fs_devices->devices; 2164 list_for_each_entry(device, devices, dev_list) { 2165 if (device->bdev == bdev) { 2166 ret = -EEXIST; 2167 goto error; 2168 } 2169 } 2170 2171 device = btrfs_alloc_device(NULL, &devid, NULL); 2172 if (IS_ERR(device)) { 2173 ret = PTR_ERR(device); 2174 goto error; 2175 } 2176 2177 name = rcu_string_strdup(device_path, GFP_NOFS); 2178 if (!name) { 2179 kfree(device); 2180 ret = -ENOMEM; 2181 goto error; 2182 } 2183 rcu_assign_pointer(device->name, name); 2184 2185 q = bdev_get_queue(bdev); 2186 if (blk_queue_discard(q)) 2187 device->can_discard = 1; 2188 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2189 device->writeable = 1; 2190 device->generation = 0; 2191 device->io_width = root->sectorsize; 2192 device->io_align = root->sectorsize; 2193 device->sector_size = root->sectorsize; 2194 device->total_bytes = i_size_read(bdev->bd_inode); 2195 device->disk_total_bytes = device->total_bytes; 2196 device->dev_root = fs_info->dev_root; 2197 device->bdev = bdev; 2198 device->in_fs_metadata = 1; 2199 device->is_tgtdev_for_dev_replace = 1; 2200 device->mode = FMODE_EXCL; 2201 set_blocksize(device->bdev, 4096); 2202 device->fs_devices = fs_info->fs_devices; 2203 list_add(&device->dev_list, &fs_info->fs_devices->devices); 2204 fs_info->fs_devices->num_devices++; 2205 fs_info->fs_devices->open_devices++; 2206 if (device->can_discard) 2207 fs_info->fs_devices->num_can_discard++; 2208 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2209 2210 *device_out = device; 2211 return ret; 2212 2213 error: 2214 blkdev_put(bdev, FMODE_EXCL); 2215 return ret; 2216 } 2217 2218 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 2219 struct btrfs_device *tgtdev) 2220 { 2221 WARN_ON(fs_info->fs_devices->rw_devices == 0); 2222 tgtdev->io_width = fs_info->dev_root->sectorsize; 2223 tgtdev->io_align = fs_info->dev_root->sectorsize; 2224 tgtdev->sector_size = fs_info->dev_root->sectorsize; 2225 tgtdev->dev_root = fs_info->dev_root; 2226 tgtdev->in_fs_metadata = 1; 2227 } 2228 2229 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2230 struct btrfs_device *device) 2231 { 2232 int ret; 2233 struct 
btrfs_path *path; 2234 struct btrfs_root *root; 2235 struct btrfs_dev_item *dev_item; 2236 struct extent_buffer *leaf; 2237 struct btrfs_key key; 2238 2239 root = device->dev_root->fs_info->chunk_root; 2240 2241 path = btrfs_alloc_path(); 2242 if (!path) 2243 return -ENOMEM; 2244 2245 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2246 key.type = BTRFS_DEV_ITEM_KEY; 2247 key.offset = device->devid; 2248 2249 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2250 if (ret < 0) 2251 goto out; 2252 2253 if (ret > 0) { 2254 ret = -ENOENT; 2255 goto out; 2256 } 2257 2258 leaf = path->nodes[0]; 2259 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2260 2261 btrfs_set_device_id(leaf, dev_item, device->devid); 2262 btrfs_set_device_type(leaf, dev_item, device->type); 2263 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2264 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2265 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2266 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); 2267 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 2268 btrfs_mark_buffer_dirty(leaf); 2269 2270 out: 2271 btrfs_free_path(path); 2272 return ret; 2273 } 2274 2275 static int __btrfs_grow_device(struct btrfs_trans_handle *trans, 2276 struct btrfs_device *device, u64 new_size) 2277 { 2278 struct btrfs_super_block *super_copy = 2279 device->dev_root->fs_info->super_copy; 2280 u64 old_total = btrfs_super_total_bytes(super_copy); 2281 u64 diff = new_size - device->total_bytes; 2282 2283 if (!device->writeable) 2284 return -EACCES; 2285 if (new_size <= device->total_bytes || 2286 device->is_tgtdev_for_dev_replace) 2287 return -EINVAL; 2288 2289 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2290 device->fs_devices->total_rw_bytes += diff; 2291 2292 device->total_bytes = new_size; 2293 device->disk_total_bytes = new_size; 2294 btrfs_clear_space_info_full(device->dev_root->fs_info); 2295 2296 return btrfs_update_device(trans, device); 2297 } 2298 2299 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2300 struct btrfs_device *device, u64 new_size) 2301 { 2302 int ret; 2303 lock_chunks(device->dev_root); 2304 ret = __btrfs_grow_device(trans, device, new_size); 2305 unlock_chunks(device->dev_root); 2306 return ret; 2307 } 2308 2309 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 2310 struct btrfs_root *root, 2311 u64 chunk_tree, u64 chunk_objectid, 2312 u64 chunk_offset) 2313 { 2314 int ret; 2315 struct btrfs_path *path; 2316 struct btrfs_key key; 2317 2318 root = root->fs_info->chunk_root; 2319 path = btrfs_alloc_path(); 2320 if (!path) 2321 return -ENOMEM; 2322 2323 key.objectid = chunk_objectid; 2324 key.offset = chunk_offset; 2325 key.type = BTRFS_CHUNK_ITEM_KEY; 2326 2327 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2328 if (ret < 0) 2329 goto out; 2330 else if (ret > 0) { /* Logic error or corruption */ 2331 btrfs_error(root->fs_info, -ENOENT, 2332 "Failed lookup while freeing chunk."); 2333 ret = -ENOENT; 2334 goto out; 2335 } 2336 2337 ret = btrfs_del_item(trans, root, path); 2338 if (ret < 0) 2339 btrfs_error(root->fs_info, ret, 2340 "Failed to delete chunk item."); 2341 out: 2342 btrfs_free_path(path); 2343 return ret; 2344 } 2345 2346 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 2347 chunk_offset) 2348 { 2349 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 2350 struct btrfs_disk_key *disk_key; 2351 struct btrfs_chunk 
*chunk; 2352 u8 *ptr; 2353 int ret = 0; 2354 u32 num_stripes; 2355 u32 array_size; 2356 u32 len = 0; 2357 u32 cur; 2358 struct btrfs_key key; 2359 2360 array_size = btrfs_super_sys_array_size(super_copy); 2361 2362 ptr = super_copy->sys_chunk_array; 2363 cur = 0; 2364 2365 while (cur < array_size) { 2366 disk_key = (struct btrfs_disk_key *)ptr; 2367 btrfs_disk_key_to_cpu(&key, disk_key); 2368 2369 len = sizeof(*disk_key); 2370 2371 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 2372 chunk = (struct btrfs_chunk *)(ptr + len); 2373 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 2374 len += btrfs_chunk_item_size(num_stripes); 2375 } else { 2376 ret = -EIO; 2377 break; 2378 } 2379 if (key.objectid == chunk_objectid && 2380 key.offset == chunk_offset) { 2381 memmove(ptr, ptr + len, array_size - (cur + len)); 2382 array_size -= len; 2383 btrfs_set_super_sys_array_size(super_copy, array_size); 2384 } else { 2385 ptr += len; 2386 cur += len; 2387 } 2388 } 2389 return ret; 2390 } 2391 2392 static int btrfs_relocate_chunk(struct btrfs_root *root, 2393 u64 chunk_tree, u64 chunk_objectid, 2394 u64 chunk_offset) 2395 { 2396 struct extent_map_tree *em_tree; 2397 struct btrfs_root *extent_root; 2398 struct btrfs_trans_handle *trans; 2399 struct extent_map *em; 2400 struct map_lookup *map; 2401 int ret; 2402 int i; 2403 2404 root = root->fs_info->chunk_root; 2405 extent_root = root->fs_info->extent_root; 2406 em_tree = &root->fs_info->mapping_tree.map_tree; 2407 2408 ret = btrfs_can_relocate(extent_root, chunk_offset); 2409 if (ret) 2410 return -ENOSPC; 2411 2412 /* step one, relocate all the extents inside this chunk */ 2413 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 2414 if (ret) 2415 return ret; 2416 2417 trans = btrfs_start_transaction(root, 0); 2418 if (IS_ERR(trans)) { 2419 ret = PTR_ERR(trans); 2420 btrfs_std_error(root->fs_info, ret); 2421 return ret; 2422 } 2423 2424 lock_chunks(root); 2425 2426 /* 2427 * step two, delete the device extents and the 2428 * chunk tree entries 2429 */ 2430 read_lock(&em_tree->lock); 2431 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 2432 read_unlock(&em_tree->lock); 2433 2434 BUG_ON(!em || em->start > chunk_offset || 2435 em->start + em->len < chunk_offset); 2436 map = (struct map_lookup *)em->bdev; 2437 2438 for (i = 0; i < map->num_stripes; i++) { 2439 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, 2440 map->stripes[i].physical); 2441 BUG_ON(ret); 2442 2443 if (map->stripes[i].dev) { 2444 ret = btrfs_update_device(trans, map->stripes[i].dev); 2445 BUG_ON(ret); 2446 } 2447 } 2448 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, 2449 chunk_offset); 2450 2451 BUG_ON(ret); 2452 2453 trace_btrfs_chunk_free(root, map, chunk_offset, em->len); 2454 2455 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2456 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 2457 BUG_ON(ret); 2458 } 2459 2460 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 2461 BUG_ON(ret); 2462 2463 write_lock(&em_tree->lock); 2464 remove_extent_mapping(em_tree, em); 2465 write_unlock(&em_tree->lock); 2466 2467 kfree(map); 2468 em->bdev = NULL; 2469 2470 /* once for the tree */ 2471 free_extent_map(em); 2472 /* once for us */ 2473 free_extent_map(em); 2474 2475 unlock_chunks(root); 2476 btrfs_end_transaction(trans, root); 2477 return 0; 2478 } 2479 2480 static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 2481 { 2482 struct btrfs_root *chunk_root = root->fs_info->chunk_root; 2483 struct btrfs_path *path; 2484 struct 
extent_buffer *leaf; 2485 struct btrfs_chunk *chunk; 2486 struct btrfs_key key; 2487 struct btrfs_key found_key; 2488 u64 chunk_tree = chunk_root->root_key.objectid; 2489 u64 chunk_type; 2490 bool retried = false; 2491 int failed = 0; 2492 int ret; 2493 2494 path = btrfs_alloc_path(); 2495 if (!path) 2496 return -ENOMEM; 2497 2498 again: 2499 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2500 key.offset = (u64)-1; 2501 key.type = BTRFS_CHUNK_ITEM_KEY; 2502 2503 while (1) { 2504 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2505 if (ret < 0) 2506 goto error; 2507 BUG_ON(ret == 0); /* Corruption */ 2508 2509 ret = btrfs_previous_item(chunk_root, path, key.objectid, 2510 key.type); 2511 if (ret < 0) 2512 goto error; 2513 if (ret > 0) 2514 break; 2515 2516 leaf = path->nodes[0]; 2517 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2518 2519 chunk = btrfs_item_ptr(leaf, path->slots[0], 2520 struct btrfs_chunk); 2521 chunk_type = btrfs_chunk_type(leaf, chunk); 2522 btrfs_release_path(path); 2523 2524 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 2525 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 2526 found_key.objectid, 2527 found_key.offset); 2528 if (ret == -ENOSPC) 2529 failed++; 2530 else if (ret) 2531 BUG(); 2532 } 2533 2534 if (found_key.offset == 0) 2535 break; 2536 key.offset = found_key.offset - 1; 2537 } 2538 ret = 0; 2539 if (failed && !retried) { 2540 failed = 0; 2541 retried = true; 2542 goto again; 2543 } else if (failed && retried) { 2544 WARN_ON(1); 2545 ret = -ENOSPC; 2546 } 2547 error: 2548 btrfs_free_path(path); 2549 return ret; 2550 } 2551 2552 static int insert_balance_item(struct btrfs_root *root, 2553 struct btrfs_balance_control *bctl) 2554 { 2555 struct btrfs_trans_handle *trans; 2556 struct btrfs_balance_item *item; 2557 struct btrfs_disk_balance_args disk_bargs; 2558 struct btrfs_path *path; 2559 struct extent_buffer *leaf; 2560 struct btrfs_key key; 2561 int ret, err; 2562 2563 path = btrfs_alloc_path(); 2564 if (!path) 2565 return -ENOMEM; 2566 2567 trans = btrfs_start_transaction(root, 0); 2568 if (IS_ERR(trans)) { 2569 btrfs_free_path(path); 2570 return PTR_ERR(trans); 2571 } 2572 2573 key.objectid = BTRFS_BALANCE_OBJECTID; 2574 key.type = BTRFS_BALANCE_ITEM_KEY; 2575 key.offset = 0; 2576 2577 ret = btrfs_insert_empty_item(trans, root, path, &key, 2578 sizeof(*item)); 2579 if (ret) 2580 goto out; 2581 2582 leaf = path->nodes[0]; 2583 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 2584 2585 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); 2586 2587 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 2588 btrfs_set_balance_data(leaf, item, &disk_bargs); 2589 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 2590 btrfs_set_balance_meta(leaf, item, &disk_bargs); 2591 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 2592 btrfs_set_balance_sys(leaf, item, &disk_bargs); 2593 2594 btrfs_set_balance_flags(leaf, item, bctl->flags); 2595 2596 btrfs_mark_buffer_dirty(leaf); 2597 out: 2598 btrfs_free_path(path); 2599 err = btrfs_commit_transaction(trans, root); 2600 if (err && !ret) 2601 ret = err; 2602 return ret; 2603 } 2604 2605 static int del_balance_item(struct btrfs_root *root) 2606 { 2607 struct btrfs_trans_handle *trans; 2608 struct btrfs_path *path; 2609 struct btrfs_key key; 2610 int ret, err; 2611 2612 path = btrfs_alloc_path(); 2613 if (!path) 2614 return -ENOMEM; 2615 2616 trans = btrfs_start_transaction(root, 0); 2617 if (IS_ERR(trans)) { 2618 btrfs_free_path(path); 2619 
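/* no transaction was started, so there is nothing to commit on this path */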
return PTR_ERR(trans); 2620 } 2621 2622 key.objectid = BTRFS_BALANCE_OBJECTID; 2623 key.type = BTRFS_BALANCE_ITEM_KEY; 2624 key.offset = 0; 2625 2626 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2627 if (ret < 0) 2628 goto out; 2629 if (ret > 0) { 2630 ret = -ENOENT; 2631 goto out; 2632 } 2633 2634 ret = btrfs_del_item(trans, root, path); 2635 out: 2636 btrfs_free_path(path); 2637 err = btrfs_commit_transaction(trans, root); 2638 if (err && !ret) 2639 ret = err; 2640 return ret; 2641 } 2642 2643 /* 2644 * This is a heuristic used to reduce the number of chunks balanced on 2645 * resume after balance was interrupted. 2646 */ 2647 static void update_balance_args(struct btrfs_balance_control *bctl) 2648 { 2649 /* 2650 * Turn on soft mode for chunk types that were being converted. 2651 */ 2652 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 2653 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 2654 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 2655 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 2656 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 2657 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 2658 2659 /* 2660 * Turn on usage filter if is not already used. The idea is 2661 * that chunks that we have already balanced should be 2662 * reasonably full. Don't do it for chunks that are being 2663 * converted - that will keep us from relocating unconverted 2664 * (albeit full) chunks. 2665 */ 2666 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 2667 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 2668 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 2669 bctl->data.usage = 90; 2670 } 2671 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 2672 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 2673 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 2674 bctl->sys.usage = 90; 2675 } 2676 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 2677 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 2678 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 2679 bctl->meta.usage = 90; 2680 } 2681 } 2682 2683 /* 2684 * Should be called with both balance and volume mutexes held to 2685 * serialize other volume operations (add_dev/rm_dev/resize) with 2686 * restriper. Same goes for unset_balance_control. 2687 */ 2688 static void set_balance_control(struct btrfs_balance_control *bctl) 2689 { 2690 struct btrfs_fs_info *fs_info = bctl->fs_info; 2691 2692 BUG_ON(fs_info->balance_ctl); 2693 2694 spin_lock(&fs_info->balance_lock); 2695 fs_info->balance_ctl = bctl; 2696 spin_unlock(&fs_info->balance_lock); 2697 } 2698 2699 static void unset_balance_control(struct btrfs_fs_info *fs_info) 2700 { 2701 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2702 2703 BUG_ON(!fs_info->balance_ctl); 2704 2705 spin_lock(&fs_info->balance_lock); 2706 fs_info->balance_ctl = NULL; 2707 spin_unlock(&fs_info->balance_lock); 2708 2709 kfree(bctl); 2710 } 2711 2712 /* 2713 * Balance filters. Return 1 if chunk should be filtered out 2714 * (should not be balanced). 
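 * A return of 0 means the chunk passes the filter and remains a candidate
 * for relocation.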
2715 */ 2716 static int chunk_profiles_filter(u64 chunk_type, 2717 struct btrfs_balance_args *bargs) 2718 { 2719 chunk_type = chunk_to_extended(chunk_type) & 2720 BTRFS_EXTENDED_PROFILE_MASK; 2721 2722 if (bargs->profiles & chunk_type) 2723 return 0; 2724 2725 return 1; 2726 } 2727 2728 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 2729 struct btrfs_balance_args *bargs) 2730 { 2731 struct btrfs_block_group_cache *cache; 2732 u64 chunk_used, user_thresh; 2733 int ret = 1; 2734 2735 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2736 chunk_used = btrfs_block_group_used(&cache->item); 2737 2738 if (bargs->usage == 0) 2739 user_thresh = 1; 2740 else if (bargs->usage > 100) 2741 user_thresh = cache->key.offset; 2742 else 2743 user_thresh = div_factor_fine(cache->key.offset, 2744 bargs->usage); 2745 2746 if (chunk_used < user_thresh) 2747 ret = 0; 2748 2749 btrfs_put_block_group(cache); 2750 return ret; 2751 } 2752 2753 static int chunk_devid_filter(struct extent_buffer *leaf, 2754 struct btrfs_chunk *chunk, 2755 struct btrfs_balance_args *bargs) 2756 { 2757 struct btrfs_stripe *stripe; 2758 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 2759 int i; 2760 2761 for (i = 0; i < num_stripes; i++) { 2762 stripe = btrfs_stripe_nr(chunk, i); 2763 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 2764 return 0; 2765 } 2766 2767 return 1; 2768 } 2769 2770 /* [pstart, pend) */ 2771 static int chunk_drange_filter(struct extent_buffer *leaf, 2772 struct btrfs_chunk *chunk, 2773 u64 chunk_offset, 2774 struct btrfs_balance_args *bargs) 2775 { 2776 struct btrfs_stripe *stripe; 2777 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 2778 u64 stripe_offset; 2779 u64 stripe_length; 2780 int factor; 2781 int i; 2782 2783 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 2784 return 0; 2785 2786 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 2787 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { 2788 factor = num_stripes / 2; 2789 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { 2790 factor = num_stripes - 1; 2791 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { 2792 factor = num_stripes - 2; 2793 } else { 2794 factor = num_stripes; 2795 } 2796 2797 for (i = 0; i < num_stripes; i++) { 2798 stripe = btrfs_stripe_nr(chunk, i); 2799 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 2800 continue; 2801 2802 stripe_offset = btrfs_stripe_offset(leaf, stripe); 2803 stripe_length = btrfs_chunk_length(leaf, chunk); 2804 do_div(stripe_length, factor); 2805 2806 if (stripe_offset < bargs->pend && 2807 stripe_offset + stripe_length > bargs->pstart) 2808 return 0; 2809 } 2810 2811 return 1; 2812 } 2813 2814 /* [vstart, vend) */ 2815 static int chunk_vrange_filter(struct extent_buffer *leaf, 2816 struct btrfs_chunk *chunk, 2817 u64 chunk_offset, 2818 struct btrfs_balance_args *bargs) 2819 { 2820 if (chunk_offset < bargs->vend && 2821 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 2822 /* at least part of the chunk is inside this vrange */ 2823 return 0; 2824 2825 return 1; 2826 } 2827 2828 static int chunk_soft_convert_filter(u64 chunk_type, 2829 struct btrfs_balance_args *bargs) 2830 { 2831 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 2832 return 0; 2833 2834 chunk_type = chunk_to_extended(chunk_type) & 2835 BTRFS_EXTENDED_PROFILE_MASK; 2836 2837 if (bargs->target == chunk_type) 2838 return 1; 2839 2840 return 0; 2841 } 2842 2843 static int should_balance_chunk(struct btrfs_root 
*root, 2844 struct extent_buffer *leaf, 2845 struct btrfs_chunk *chunk, u64 chunk_offset) 2846 { 2847 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; 2848 struct btrfs_balance_args *bargs = NULL; 2849 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 2850 2851 /* type filter */ 2852 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 2853 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 2854 return 0; 2855 } 2856 2857 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 2858 bargs = &bctl->data; 2859 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 2860 bargs = &bctl->sys; 2861 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 2862 bargs = &bctl->meta; 2863 2864 /* profiles filter */ 2865 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 2866 chunk_profiles_filter(chunk_type, bargs)) { 2867 return 0; 2868 } 2869 2870 /* usage filter */ 2871 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 2872 chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { 2873 return 0; 2874 } 2875 2876 /* devid filter */ 2877 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 2878 chunk_devid_filter(leaf, chunk, bargs)) { 2879 return 0; 2880 } 2881 2882 /* drange filter, makes sense only with devid filter */ 2883 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 2884 chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { 2885 return 0; 2886 } 2887 2888 /* vrange filter */ 2889 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 2890 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 2891 return 0; 2892 } 2893 2894 /* soft profile changing mode */ 2895 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 2896 chunk_soft_convert_filter(chunk_type, bargs)) { 2897 return 0; 2898 } 2899 2900 return 1; 2901 } 2902 2903 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 2904 { 2905 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2906 struct btrfs_root *chunk_root = fs_info->chunk_root; 2907 struct btrfs_root *dev_root = fs_info->dev_root; 2908 struct list_head *devices; 2909 struct btrfs_device *device; 2910 u64 old_size; 2911 u64 size_to_free; 2912 struct btrfs_chunk *chunk; 2913 struct btrfs_path *path; 2914 struct btrfs_key key; 2915 struct btrfs_key found_key; 2916 struct btrfs_trans_handle *trans; 2917 struct extent_buffer *leaf; 2918 int slot; 2919 int ret; 2920 int enospc_errors = 0; 2921 bool counting = true; 2922 2923 /* step one make some room on all the devices */ 2924 devices = &fs_info->fs_devices->devices; 2925 list_for_each_entry(device, devices, dev_list) { 2926 old_size = device->total_bytes; 2927 size_to_free = div_factor(old_size, 1); 2928 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 2929 if (!device->writeable || 2930 device->total_bytes - device->bytes_used > size_to_free || 2931 device->is_tgtdev_for_dev_replace) 2932 continue; 2933 2934 ret = btrfs_shrink_device(device, old_size - size_to_free); 2935 if (ret == -ENOSPC) 2936 break; 2937 BUG_ON(ret); 2938 2939 trans = btrfs_start_transaction(dev_root, 0); 2940 BUG_ON(IS_ERR(trans)); 2941 2942 ret = btrfs_grow_device(trans, device, old_size); 2943 BUG_ON(ret); 2944 2945 btrfs_end_transaction(trans, dev_root); 2946 } 2947 2948 /* step two, relocate all the chunks */ 2949 path = btrfs_alloc_path(); 2950 if (!path) { 2951 ret = -ENOMEM; 2952 goto error; 2953 } 2954 2955 /* zero out stat counters */ 2956 spin_lock(&fs_info->balance_lock); 2957 memset(&bctl->stat, 0, sizeof(bctl->stat)); 2958 spin_unlock(&fs_info->balance_lock); 2959 again: 2960 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2961 key.offset = (u64)-1; 2962 
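/* offset (u64)-1: start the search past the last chunk item and walk backwards with btrfs_previous_item() */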
key.type = BTRFS_CHUNK_ITEM_KEY; 2963 2964 while (1) { 2965 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 2966 atomic_read(&fs_info->balance_cancel_req)) { 2967 ret = -ECANCELED; 2968 goto error; 2969 } 2970 2971 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2972 if (ret < 0) 2973 goto error; 2974 2975 /* 2976 * this shouldn't happen, it means the last relocate 2977 * failed 2978 */ 2979 if (ret == 0) 2980 BUG(); /* FIXME break ? */ 2981 2982 ret = btrfs_previous_item(chunk_root, path, 0, 2983 BTRFS_CHUNK_ITEM_KEY); 2984 if (ret) { 2985 ret = 0; 2986 break; 2987 } 2988 2989 leaf = path->nodes[0]; 2990 slot = path->slots[0]; 2991 btrfs_item_key_to_cpu(leaf, &found_key, slot); 2992 2993 if (found_key.objectid != key.objectid) 2994 break; 2995 2996 /* chunk zero is special */ 2997 if (found_key.offset == 0) 2998 break; 2999 3000 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3001 3002 if (!counting) { 3003 spin_lock(&fs_info->balance_lock); 3004 bctl->stat.considered++; 3005 spin_unlock(&fs_info->balance_lock); 3006 } 3007 3008 ret = should_balance_chunk(chunk_root, leaf, chunk, 3009 found_key.offset); 3010 btrfs_release_path(path); 3011 if (!ret) 3012 goto loop; 3013 3014 if (counting) { 3015 spin_lock(&fs_info->balance_lock); 3016 bctl->stat.expected++; 3017 spin_unlock(&fs_info->balance_lock); 3018 goto loop; 3019 } 3020 3021 ret = btrfs_relocate_chunk(chunk_root, 3022 chunk_root->root_key.objectid, 3023 found_key.objectid, 3024 found_key.offset); 3025 if (ret && ret != -ENOSPC) 3026 goto error; 3027 if (ret == -ENOSPC) { 3028 enospc_errors++; 3029 } else { 3030 spin_lock(&fs_info->balance_lock); 3031 bctl->stat.completed++; 3032 spin_unlock(&fs_info->balance_lock); 3033 } 3034 loop: 3035 key.offset = found_key.offset - 1; 3036 } 3037 3038 if (counting) { 3039 btrfs_release_path(path); 3040 counting = false; 3041 goto again; 3042 } 3043 error: 3044 btrfs_free_path(path); 3045 if (enospc_errors) { 3046 printk(KERN_INFO "btrfs: %d enospc errors during balance\n", 3047 enospc_errors); 3048 if (!ret) 3049 ret = -ENOSPC; 3050 } 3051 3052 return ret; 3053 } 3054 3055 /** 3056 * alloc_profile_is_valid - see if a given profile is valid and reduced 3057 * @flags: profile to validate 3058 * @extended: if true @flags is treated as an extended profile 3059 */ 3060 static int alloc_profile_is_valid(u64 flags, int extended) 3061 { 3062 u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 3063 BTRFS_BLOCK_GROUP_PROFILE_MASK); 3064 3065 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 3066 3067 /* 1) check that all other bits are zeroed */ 3068 if (flags & ~mask) 3069 return 0; 3070 3071 /* 2) see if profile is reduced */ 3072 if (flags == 0) 3073 return !extended; /* "0" is valid for usual profiles */ 3074 3075 /* true if exactly one bit set */ 3076 return (flags & (flags - 1)) == 0; 3077 } 3078 3079 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3080 { 3081 /* cancel requested || normal exit path */ 3082 return atomic_read(&fs_info->balance_cancel_req) || 3083 (atomic_read(&fs_info->balance_pause_req) == 0 && 3084 atomic_read(&fs_info->balance_cancel_req) == 0); 3085 } 3086 3087 static void __cancel_balance(struct btrfs_fs_info *fs_info) 3088 { 3089 int ret; 3090 3091 unset_balance_control(fs_info); 3092 ret = del_balance_item(fs_info->tree_root); 3093 if (ret) 3094 btrfs_std_error(fs_info, ret); 3095 3096 atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 3097 } 3098 3099 /* 3100 * Should be called with both balance and volume mutexes held 3101 */ 3102 int btrfs_balance(struct btrfs_balance_control *bctl, 3103 struct btrfs_ioctl_balance_args *bargs) 3104 { 3105 struct btrfs_fs_info *fs_info = bctl->fs_info; 3106 u64 allowed; 3107 int mixed = 0; 3108 int ret; 3109 u64 num_devices; 3110 unsigned seq; 3111 3112 if (btrfs_fs_closing(fs_info) || 3113 atomic_read(&fs_info->balance_pause_req) || 3114 atomic_read(&fs_info->balance_cancel_req)) { 3115 ret = -EINVAL; 3116 goto out; 3117 } 3118 3119 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 3120 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 3121 mixed = 1; 3122 3123 /* 3124 * In case of mixed groups both data and meta should be picked, 3125 * and identical options should be given for both of them. 
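 * (With mixed block groups, data and metadata live in the same chunks, so
 * they cannot be balanced with different filters.)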
3126 */ 3127 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 3128 if (mixed && (bctl->flags & allowed)) { 3129 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 3130 !(bctl->flags & BTRFS_BALANCE_METADATA) || 3131 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 3132 printk(KERN_ERR "btrfs: with mixed groups data and " 3133 "metadata balance options must be the same\n"); 3134 ret = -EINVAL; 3135 goto out; 3136 } 3137 } 3138 3139 num_devices = fs_info->fs_devices->num_devices; 3140 btrfs_dev_replace_lock(&fs_info->dev_replace); 3141 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 3142 BUG_ON(num_devices < 1); 3143 num_devices--; 3144 } 3145 btrfs_dev_replace_unlock(&fs_info->dev_replace); 3146 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3147 if (num_devices == 1) 3148 allowed |= BTRFS_BLOCK_GROUP_DUP; 3149 else if (num_devices > 1) 3150 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3151 if (num_devices > 2) 3152 allowed |= BTRFS_BLOCK_GROUP_RAID5; 3153 if (num_devices > 3) 3154 allowed |= (BTRFS_BLOCK_GROUP_RAID10 | 3155 BTRFS_BLOCK_GROUP_RAID6); 3156 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3157 (!alloc_profile_is_valid(bctl->data.target, 1) || 3158 (bctl->data.target & ~allowed))) { 3159 printk(KERN_ERR "btrfs: unable to start balance with target " 3160 "data profile %llu\n", 3161 bctl->data.target); 3162 ret = -EINVAL; 3163 goto out; 3164 } 3165 if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3166 (!alloc_profile_is_valid(bctl->meta.target, 1) || 3167 (bctl->meta.target & ~allowed))) { 3168 printk(KERN_ERR "btrfs: unable to start balance with target " 3169 "metadata profile %llu\n", 3170 bctl->meta.target); 3171 ret = -EINVAL; 3172 goto out; 3173 } 3174 if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3175 (!alloc_profile_is_valid(bctl->sys.target, 1) || 3176 (bctl->sys.target & ~allowed))) { 3177 printk(KERN_ERR "btrfs: unable to start balance with target " 3178 "system profile %llu\n", 3179 bctl->sys.target); 3180 ret = -EINVAL; 3181 goto out; 3182 } 3183 3184 /* allow dup'ed data chunks only in mixed mode */ 3185 if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3186 (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { 3187 printk(KERN_ERR "btrfs: dup for data is not allowed\n"); 3188 ret = -EINVAL; 3189 goto out; 3190 } 3191 3192 /* allow to reduce meta or sys integrity only if force set */ 3193 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3194 BTRFS_BLOCK_GROUP_RAID10 | 3195 BTRFS_BLOCK_GROUP_RAID5 | 3196 BTRFS_BLOCK_GROUP_RAID6; 3197 do { 3198 seq = read_seqbegin(&fs_info->profiles_lock); 3199 3200 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3201 (fs_info->avail_system_alloc_bits & allowed) && 3202 !(bctl->sys.target & allowed)) || 3203 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3204 (fs_info->avail_metadata_alloc_bits & allowed) && 3205 !(bctl->meta.target & allowed))) { 3206 if (bctl->flags & BTRFS_BALANCE_FORCE) { 3207 printk(KERN_INFO "btrfs: force reducing metadata " 3208 "integrity\n"); 3209 } else { 3210 printk(KERN_ERR "btrfs: balance will reduce metadata " 3211 "integrity, use force if you want this\n"); 3212 ret = -EINVAL; 3213 goto out; 3214 } 3215 } 3216 } while (read_seqretry(&fs_info->profiles_lock, seq)); 3217 3218 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3219 int num_tolerated_disk_barrier_failures; 3220 u64 target = bctl->sys.target; 3221 3222 num_tolerated_disk_barrier_failures = 3223 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 3224 if 
(num_tolerated_disk_barrier_failures > 0 && 3225 (target & 3226 (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | 3227 BTRFS_AVAIL_ALLOC_BIT_SINGLE))) 3228 num_tolerated_disk_barrier_failures = 0; 3229 else if (num_tolerated_disk_barrier_failures > 1 && 3230 (target & 3231 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))) 3232 num_tolerated_disk_barrier_failures = 1; 3233 3234 fs_info->num_tolerated_disk_barrier_failures = 3235 num_tolerated_disk_barrier_failures; 3236 } 3237 3238 ret = insert_balance_item(fs_info->tree_root, bctl); 3239 if (ret && ret != -EEXIST) 3240 goto out; 3241 3242 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 3243 BUG_ON(ret == -EEXIST); 3244 set_balance_control(bctl); 3245 } else { 3246 BUG_ON(ret != -EEXIST); 3247 spin_lock(&fs_info->balance_lock); 3248 update_balance_args(bctl); 3249 spin_unlock(&fs_info->balance_lock); 3250 } 3251 3252 atomic_inc(&fs_info->balance_running); 3253 mutex_unlock(&fs_info->balance_mutex); 3254 3255 ret = __btrfs_balance(fs_info); 3256 3257 mutex_lock(&fs_info->balance_mutex); 3258 atomic_dec(&fs_info->balance_running); 3259 3260 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3261 fs_info->num_tolerated_disk_barrier_failures = 3262 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 3263 } 3264 3265 if (bargs) { 3266 memset(bargs, 0, sizeof(*bargs)); 3267 update_ioctl_balance_args(fs_info, 0, bargs); 3268 } 3269 3270 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 3271 balance_need_close(fs_info)) { 3272 __cancel_balance(fs_info); 3273 } 3274 3275 wake_up(&fs_info->balance_wait_q); 3276 3277 return ret; 3278 out: 3279 if (bctl->flags & BTRFS_BALANCE_RESUME) 3280 __cancel_balance(fs_info); 3281 else { 3282 kfree(bctl); 3283 atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 3284 } 3285 return ret; 3286 } 3287 3288 static int balance_kthread(void *data) 3289 { 3290 struct btrfs_fs_info *fs_info = data; 3291 int ret = 0; 3292 3293 mutex_lock(&fs_info->volume_mutex); 3294 mutex_lock(&fs_info->balance_mutex); 3295 3296 if (fs_info->balance_ctl) { 3297 printk(KERN_INFO "btrfs: continuing balance\n"); 3298 ret = btrfs_balance(fs_info->balance_ctl, NULL); 3299 } 3300 3301 mutex_unlock(&fs_info->balance_mutex); 3302 mutex_unlock(&fs_info->volume_mutex); 3303 3304 return ret; 3305 } 3306 3307 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 3308 { 3309 struct task_struct *tsk; 3310 3311 spin_lock(&fs_info->balance_lock); 3312 if (!fs_info->balance_ctl) { 3313 spin_unlock(&fs_info->balance_lock); 3314 return 0; 3315 } 3316 spin_unlock(&fs_info->balance_lock); 3317 3318 if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { 3319 printk(KERN_INFO "btrfs: force skipping balance\n"); 3320 return 0; 3321 } 3322 3323 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3324 return PTR_RET(tsk); 3325 } 3326 3327 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 3328 { 3329 struct btrfs_balance_control *bctl; 3330 struct btrfs_balance_item *item; 3331 struct btrfs_disk_balance_args disk_bargs; 3332 struct btrfs_path *path; 3333 struct extent_buffer *leaf; 3334 struct btrfs_key key; 3335 int ret; 3336 3337 path = btrfs_alloc_path(); 3338 if (!path) 3339 return -ENOMEM; 3340 3341 key.objectid = BTRFS_BALANCE_OBJECTID; 3342 key.type = BTRFS_BALANCE_ITEM_KEY; 3343 key.offset = 0; 3344 3345 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 3346 if (ret < 0) 3347 goto out; 3348 if (ret > 0) { /* ret = -ENOENT; */ 3349 ret = 0; 3350 goto out; 3351 } 3352 3353 bctl = 
kzalloc(sizeof(*bctl), GFP_NOFS); 3354 if (!bctl) { 3355 ret = -ENOMEM; 3356 goto out; 3357 } 3358 3359 leaf = path->nodes[0]; 3360 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3361 3362 bctl->fs_info = fs_info; 3363 bctl->flags = btrfs_balance_flags(leaf, item); 3364 bctl->flags |= BTRFS_BALANCE_RESUME; 3365 3366 btrfs_balance_data(leaf, item, &disk_bargs); 3367 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 3368 btrfs_balance_meta(leaf, item, &disk_bargs); 3369 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 3370 btrfs_balance_sys(leaf, item, &disk_bargs); 3371 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 3372 3373 WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); 3374 3375 mutex_lock(&fs_info->volume_mutex); 3376 mutex_lock(&fs_info->balance_mutex); 3377 3378 set_balance_control(bctl); 3379 3380 mutex_unlock(&fs_info->balance_mutex); 3381 mutex_unlock(&fs_info->volume_mutex); 3382 out: 3383 btrfs_free_path(path); 3384 return ret; 3385 } 3386 3387 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 3388 { 3389 int ret = 0; 3390 3391 mutex_lock(&fs_info->balance_mutex); 3392 if (!fs_info->balance_ctl) { 3393 mutex_unlock(&fs_info->balance_mutex); 3394 return -ENOTCONN; 3395 } 3396 3397 if (atomic_read(&fs_info->balance_running)) { 3398 atomic_inc(&fs_info->balance_pause_req); 3399 mutex_unlock(&fs_info->balance_mutex); 3400 3401 wait_event(fs_info->balance_wait_q, 3402 atomic_read(&fs_info->balance_running) == 0); 3403 3404 mutex_lock(&fs_info->balance_mutex); 3405 /* we are good with balance_ctl ripped off from under us */ 3406 BUG_ON(atomic_read(&fs_info->balance_running)); 3407 atomic_dec(&fs_info->balance_pause_req); 3408 } else { 3409 ret = -ENOTCONN; 3410 } 3411 3412 mutex_unlock(&fs_info->balance_mutex); 3413 return ret; 3414 } 3415 3416 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 3417 { 3418 mutex_lock(&fs_info->balance_mutex); 3419 if (!fs_info->balance_ctl) { 3420 mutex_unlock(&fs_info->balance_mutex); 3421 return -ENOTCONN; 3422 } 3423 3424 atomic_inc(&fs_info->balance_cancel_req); 3425 /* 3426 * if we are running just wait and return, balance item is 3427 * deleted in btrfs_balance in this case 3428 */ 3429 if (atomic_read(&fs_info->balance_running)) { 3430 mutex_unlock(&fs_info->balance_mutex); 3431 wait_event(fs_info->balance_wait_q, 3432 atomic_read(&fs_info->balance_running) == 0); 3433 mutex_lock(&fs_info->balance_mutex); 3434 } else { 3435 /* __cancel_balance needs volume_mutex */ 3436 mutex_unlock(&fs_info->balance_mutex); 3437 mutex_lock(&fs_info->volume_mutex); 3438 mutex_lock(&fs_info->balance_mutex); 3439 3440 if (fs_info->balance_ctl) 3441 __cancel_balance(fs_info); 3442 3443 mutex_unlock(&fs_info->volume_mutex); 3444 } 3445 3446 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); 3447 atomic_dec(&fs_info->balance_cancel_req); 3448 mutex_unlock(&fs_info->balance_mutex); 3449 return 0; 3450 } 3451 3452 static int btrfs_uuid_scan_kthread(void *data) 3453 { 3454 struct btrfs_fs_info *fs_info = data; 3455 struct btrfs_root *root = fs_info->tree_root; 3456 struct btrfs_key key; 3457 struct btrfs_key max_key; 3458 struct btrfs_path *path = NULL; 3459 int ret = 0; 3460 struct extent_buffer *eb; 3461 int slot; 3462 struct btrfs_root_item root_item; 3463 u32 item_size; 3464 struct btrfs_trans_handle *trans; 3465 3466 path = btrfs_alloc_path(); 3467 if (!path) { 3468 ret = -ENOMEM; 3469 goto out; 3470 } 3471 3472 key.objectid = 0; 3473 key.type = 
BTRFS_ROOT_ITEM_KEY; 3474 key.offset = 0; 3475 3476 max_key.objectid = (u64)-1; 3477 max_key.type = BTRFS_ROOT_ITEM_KEY; 3478 max_key.offset = (u64)-1; 3479 3480 path->keep_locks = 1; 3481 3482 while (1) { 3483 ret = btrfs_search_forward(root, &key, &max_key, path, 0); 3484 if (ret) { 3485 if (ret > 0) 3486 ret = 0; 3487 break; 3488 } 3489 3490 if (key.type != BTRFS_ROOT_ITEM_KEY || 3491 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 3492 key.objectid != BTRFS_FS_TREE_OBJECTID) || 3493 key.objectid > BTRFS_LAST_FREE_OBJECTID) 3494 goto skip; 3495 3496 eb = path->nodes[0]; 3497 slot = path->slots[0]; 3498 item_size = btrfs_item_size_nr(eb, slot); 3499 if (item_size < sizeof(root_item)) 3500 goto skip; 3501 3502 trans = NULL; 3503 read_extent_buffer(eb, &root_item, 3504 btrfs_item_ptr_offset(eb, slot), 3505 (int)sizeof(root_item)); 3506 if (btrfs_root_refs(&root_item) == 0) 3507 goto skip; 3508 if (!btrfs_is_empty_uuid(root_item.uuid)) { 3509 /* 3510 * 1 - subvol uuid item 3511 * 1 - received_subvol uuid item 3512 */ 3513 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 3514 if (IS_ERR(trans)) { 3515 ret = PTR_ERR(trans); 3516 break; 3517 } 3518 ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, 3519 root_item.uuid, 3520 BTRFS_UUID_KEY_SUBVOL, 3521 key.objectid); 3522 if (ret < 0) { 3523 pr_warn("btrfs: uuid_tree_add failed %d\n", 3524 ret); 3525 btrfs_end_transaction(trans, 3526 fs_info->uuid_root); 3527 break; 3528 } 3529 } 3530 3531 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 3532 if (!trans) { 3533 /* 1 - received_subvol uuid item */ 3534 trans = btrfs_start_transaction( 3535 fs_info->uuid_root, 1); 3536 if (IS_ERR(trans)) { 3537 ret = PTR_ERR(trans); 3538 break; 3539 } 3540 } 3541 ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, 3542 root_item.received_uuid, 3543 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 3544 key.objectid); 3545 if (ret < 0) { 3546 pr_warn("btrfs: uuid_tree_add failed %d\n", 3547 ret); 3548 btrfs_end_transaction(trans, 3549 fs_info->uuid_root); 3550 break; 3551 } 3552 } 3553 3554 if (trans) { 3555 ret = btrfs_end_transaction(trans, fs_info->uuid_root); 3556 if (ret) 3557 break; 3558 } 3559 3560 skip: 3561 btrfs_release_path(path); 3562 if (key.offset < (u64)-1) { 3563 key.offset++; 3564 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 3565 key.offset = 0; 3566 key.type = BTRFS_ROOT_ITEM_KEY; 3567 } else if (key.objectid < (u64)-1) { 3568 key.offset = 0; 3569 key.type = BTRFS_ROOT_ITEM_KEY; 3570 key.objectid++; 3571 } else { 3572 break; 3573 } 3574 cond_resched(); 3575 } 3576 3577 out: 3578 btrfs_free_path(path); 3579 if (ret) 3580 pr_warn("btrfs: btrfs_uuid_scan_kthread failed %d\n", ret); 3581 else 3582 fs_info->update_uuid_tree_gen = 1; 3583 up(&fs_info->uuid_tree_rescan_sem); 3584 return 0; 3585 } 3586 3587 /* 3588 * Callback for btrfs_uuid_tree_iterate(). 3589 * returns: 3590 * 0 check succeeded, the entry is not outdated. 3591 * < 0 if an error occured. 3592 * > 0 if the check failed, which means the caller shall remove the entry. 
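 * An entry is stale when the referenced subvolume no longer exists or when
 * its uuid/received_uuid no longer matches the value recorded for it.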
3593 */ 3594 static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, 3595 u8 *uuid, u8 type, u64 subid) 3596 { 3597 struct btrfs_key key; 3598 int ret = 0; 3599 struct btrfs_root *subvol_root; 3600 3601 if (type != BTRFS_UUID_KEY_SUBVOL && 3602 type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) 3603 goto out; 3604 3605 key.objectid = subid; 3606 key.type = BTRFS_ROOT_ITEM_KEY; 3607 key.offset = (u64)-1; 3608 subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); 3609 if (IS_ERR(subvol_root)) { 3610 ret = PTR_ERR(subvol_root); 3611 if (ret == -ENOENT) 3612 ret = 1; 3613 goto out; 3614 } 3615 3616 switch (type) { 3617 case BTRFS_UUID_KEY_SUBVOL: 3618 if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) 3619 ret = 1; 3620 break; 3621 case BTRFS_UUID_KEY_RECEIVED_SUBVOL: 3622 if (memcmp(uuid, subvol_root->root_item.received_uuid, 3623 BTRFS_UUID_SIZE)) 3624 ret = 1; 3625 break; 3626 } 3627 3628 out: 3629 return ret; 3630 } 3631 3632 static int btrfs_uuid_rescan_kthread(void *data) 3633 { 3634 struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; 3635 int ret; 3636 3637 /* 3638 * 1st step is to iterate through the existing UUID tree and 3639 * to delete all entries that contain outdated data. 3640 * 2nd step is to add all missing entries to the UUID tree. 3641 */ 3642 ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); 3643 if (ret < 0) { 3644 pr_warn("btrfs: iterating uuid_tree failed %d\n", ret); 3645 up(&fs_info->uuid_tree_rescan_sem); 3646 return ret; 3647 } 3648 return btrfs_uuid_scan_kthread(data); 3649 } 3650 3651 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 3652 { 3653 struct btrfs_trans_handle *trans; 3654 struct btrfs_root *tree_root = fs_info->tree_root; 3655 struct btrfs_root *uuid_root; 3656 struct task_struct *task; 3657 int ret; 3658 3659 /* 3660 * 1 - root node 3661 * 1 - root item 3662 */ 3663 trans = btrfs_start_transaction(tree_root, 2); 3664 if (IS_ERR(trans)) 3665 return PTR_ERR(trans); 3666 3667 uuid_root = btrfs_create_tree(trans, fs_info, 3668 BTRFS_UUID_TREE_OBJECTID); 3669 if (IS_ERR(uuid_root)) { 3670 btrfs_abort_transaction(trans, tree_root, 3671 PTR_ERR(uuid_root)); 3672 return PTR_ERR(uuid_root); 3673 } 3674 3675 fs_info->uuid_root = uuid_root; 3676 3677 ret = btrfs_commit_transaction(trans, tree_root); 3678 if (ret) 3679 return ret; 3680 3681 down(&fs_info->uuid_tree_rescan_sem); 3682 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 3683 if (IS_ERR(task)) { 3684 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 3685 pr_warn("btrfs: failed to start uuid_scan task\n"); 3686 up(&fs_info->uuid_tree_rescan_sem); 3687 return PTR_ERR(task); 3688 } 3689 3690 return 0; 3691 } 3692 3693 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) 3694 { 3695 struct task_struct *task; 3696 3697 down(&fs_info->uuid_tree_rescan_sem); 3698 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); 3699 if (IS_ERR(task)) { 3700 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 3701 pr_warn("btrfs: failed to start uuid_rescan task\n"); 3702 up(&fs_info->uuid_tree_rescan_sem); 3703 return PTR_ERR(task); 3704 } 3705 3706 return 0; 3707 } 3708 3709 /* 3710 * shrinking a device means finding all of the device extents past 3711 * the new size, and then following the back refs to the chunks. 
3712 * The chunk relocation code actually frees the device extent 3713 */ 3714 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 3715 { 3716 struct btrfs_trans_handle *trans; 3717 struct btrfs_root *root = device->dev_root; 3718 struct btrfs_dev_extent *dev_extent = NULL; 3719 struct btrfs_path *path; 3720 u64 length; 3721 u64 chunk_tree; 3722 u64 chunk_objectid; 3723 u64 chunk_offset; 3724 int ret; 3725 int slot; 3726 int failed = 0; 3727 bool retried = false; 3728 struct extent_buffer *l; 3729 struct btrfs_key key; 3730 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 3731 u64 old_total = btrfs_super_total_bytes(super_copy); 3732 u64 old_size = device->total_bytes; 3733 u64 diff = device->total_bytes - new_size; 3734 3735 if (device->is_tgtdev_for_dev_replace) 3736 return -EINVAL; 3737 3738 path = btrfs_alloc_path(); 3739 if (!path) 3740 return -ENOMEM; 3741 3742 path->reada = 2; 3743 3744 lock_chunks(root); 3745 3746 device->total_bytes = new_size; 3747 if (device->writeable) { 3748 device->fs_devices->total_rw_bytes -= diff; 3749 spin_lock(&root->fs_info->free_chunk_lock); 3750 root->fs_info->free_chunk_space -= diff; 3751 spin_unlock(&root->fs_info->free_chunk_lock); 3752 } 3753 unlock_chunks(root); 3754 3755 again: 3756 key.objectid = device->devid; 3757 key.offset = (u64)-1; 3758 key.type = BTRFS_DEV_EXTENT_KEY; 3759 3760 do { 3761 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3762 if (ret < 0) 3763 goto done; 3764 3765 ret = btrfs_previous_item(root, path, 0, key.type); 3766 if (ret < 0) 3767 goto done; 3768 if (ret) { 3769 ret = 0; 3770 btrfs_release_path(path); 3771 break; 3772 } 3773 3774 l = path->nodes[0]; 3775 slot = path->slots[0]; 3776 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 3777 3778 if (key.objectid != device->devid) { 3779 btrfs_release_path(path); 3780 break; 3781 } 3782 3783 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 3784 length = btrfs_dev_extent_length(l, dev_extent); 3785 3786 if (key.offset + length <= new_size) { 3787 btrfs_release_path(path); 3788 break; 3789 } 3790 3791 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 3792 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 3793 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 3794 btrfs_release_path(path); 3795 3796 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 3797 chunk_offset); 3798 if (ret && ret != -ENOSPC) 3799 goto done; 3800 if (ret == -ENOSPC) 3801 failed++; 3802 } while (key.offset-- > 0); 3803 3804 if (failed && !retried) { 3805 failed = 0; 3806 retried = true; 3807 goto again; 3808 } else if (failed && retried) { 3809 ret = -ENOSPC; 3810 lock_chunks(root); 3811 3812 device->total_bytes = old_size; 3813 if (device->writeable) 3814 device->fs_devices->total_rw_bytes += diff; 3815 spin_lock(&root->fs_info->free_chunk_lock); 3816 root->fs_info->free_chunk_space += diff; 3817 spin_unlock(&root->fs_info->free_chunk_lock); 3818 unlock_chunks(root); 3819 goto done; 3820 } 3821 3822 /* Shrinking succeeded, else we would be at "done". */ 3823 trans = btrfs_start_transaction(root, 0); 3824 if (IS_ERR(trans)) { 3825 ret = PTR_ERR(trans); 3826 goto done; 3827 } 3828 3829 lock_chunks(root); 3830 3831 device->disk_total_bytes = new_size; 3832 /* Now btrfs_update_device() will change the on-disk size. 
*/ 3833 ret = btrfs_update_device(trans, device); 3834 if (ret) { 3835 unlock_chunks(root); 3836 btrfs_end_transaction(trans, root); 3837 goto done; 3838 } 3839 WARN_ON(diff > old_total); 3840 btrfs_set_super_total_bytes(super_copy, old_total - diff); 3841 unlock_chunks(root); 3842 btrfs_end_transaction(trans, root); 3843 done: 3844 btrfs_free_path(path); 3845 return ret; 3846 } 3847 3848 static int btrfs_add_system_chunk(struct btrfs_root *root, 3849 struct btrfs_key *key, 3850 struct btrfs_chunk *chunk, int item_size) 3851 { 3852 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 3853 struct btrfs_disk_key disk_key; 3854 u32 array_size; 3855 u8 *ptr; 3856 3857 array_size = btrfs_super_sys_array_size(super_copy); 3858 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 3859 return -EFBIG; 3860 3861 ptr = super_copy->sys_chunk_array + array_size; 3862 btrfs_cpu_key_to_disk(&disk_key, key); 3863 memcpy(ptr, &disk_key, sizeof(disk_key)); 3864 ptr += sizeof(disk_key); 3865 memcpy(ptr, chunk, item_size); 3866 item_size += sizeof(disk_key); 3867 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 3868 return 0; 3869 } 3870 3871 /* 3872 * sort the devices in descending order by max_avail, total_avail 3873 */ 3874 static int btrfs_cmp_device_info(const void *a, const void *b) 3875 { 3876 const struct btrfs_device_info *di_a = a; 3877 const struct btrfs_device_info *di_b = b; 3878 3879 if (di_a->max_avail > di_b->max_avail) 3880 return -1; 3881 if (di_a->max_avail < di_b->max_avail) 3882 return 1; 3883 if (di_a->total_avail > di_b->total_avail) 3884 return -1; 3885 if (di_a->total_avail < di_b->total_avail) 3886 return 1; 3887 return 0; 3888 } 3889 3890 static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 3891 [BTRFS_RAID_RAID10] = { 3892 .sub_stripes = 2, 3893 .dev_stripes = 1, 3894 .devs_max = 0, /* 0 == as many as possible */ 3895 .devs_min = 4, 3896 .devs_increment = 2, 3897 .ncopies = 2, 3898 }, 3899 [BTRFS_RAID_RAID1] = { 3900 .sub_stripes = 1, 3901 .dev_stripes = 1, 3902 .devs_max = 2, 3903 .devs_min = 2, 3904 .devs_increment = 2, 3905 .ncopies = 2, 3906 }, 3907 [BTRFS_RAID_DUP] = { 3908 .sub_stripes = 1, 3909 .dev_stripes = 2, 3910 .devs_max = 1, 3911 .devs_min = 1, 3912 .devs_increment = 1, 3913 .ncopies = 2, 3914 }, 3915 [BTRFS_RAID_RAID0] = { 3916 .sub_stripes = 1, 3917 .dev_stripes = 1, 3918 .devs_max = 0, 3919 .devs_min = 2, 3920 .devs_increment = 1, 3921 .ncopies = 1, 3922 }, 3923 [BTRFS_RAID_SINGLE] = { 3924 .sub_stripes = 1, 3925 .dev_stripes = 1, 3926 .devs_max = 1, 3927 .devs_min = 1, 3928 .devs_increment = 1, 3929 .ncopies = 1, 3930 }, 3931 [BTRFS_RAID_RAID5] = { 3932 .sub_stripes = 1, 3933 .dev_stripes = 1, 3934 .devs_max = 0, 3935 .devs_min = 2, 3936 .devs_increment = 1, 3937 .ncopies = 2, 3938 }, 3939 [BTRFS_RAID_RAID6] = { 3940 .sub_stripes = 1, 3941 .dev_stripes = 1, 3942 .devs_max = 0, 3943 .devs_min = 3, 3944 .devs_increment = 1, 3945 .ncopies = 3, 3946 }, 3947 }; 3948 3949 static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) 3950 { 3951 /* TODO allow them to set a preferred stripe size */ 3952 return 64 * 1024; 3953 } 3954 3955 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 3956 { 3957 if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) 3958 return; 3959 3960 btrfs_set_fs_incompat(info, RAID56); 3961 } 3962 3963 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3964 struct btrfs_root *extent_root, u64 start, 3965 u64 type) 3966 { 
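/*
 * Rough outline of the allocator below: gather the biggest free hole on
 * every usable, writeable device, sort the devices by hole size, derive
 * num_stripes and stripe_size from the RAID profile limits, then clamp the
 * result so the chunk stays under max_chunk_size (itself capped at ~10% of
 * the writeable space).
 *
 * Illustrative example, matching btrfs_raid_array above: RAID10 on four
 * devices has dev_stripes = 1, sub_stripes = 2 and ncopies = 2, so
 * num_stripes = 4 and data_stripes = 2; a 1GiB stripe_size therefore yields
 * a chunk with a 2GiB logical size.
 */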
3967 struct btrfs_fs_info *info = extent_root->fs_info; 3968 struct btrfs_fs_devices *fs_devices = info->fs_devices; 3969 struct list_head *cur; 3970 struct map_lookup *map = NULL; 3971 struct extent_map_tree *em_tree; 3972 struct extent_map *em; 3973 struct btrfs_device_info *devices_info = NULL; 3974 u64 total_avail; 3975 int num_stripes; /* total number of stripes to allocate */ 3976 int data_stripes; /* number of stripes that count for 3977 block group size */ 3978 int sub_stripes; /* sub_stripes info for map */ 3979 int dev_stripes; /* stripes per dev */ 3980 int devs_max; /* max devs to use */ 3981 int devs_min; /* min devs needed */ 3982 int devs_increment; /* ndevs has to be a multiple of this */ 3983 int ncopies; /* how many copies to data has */ 3984 int ret; 3985 u64 max_stripe_size; 3986 u64 max_chunk_size; 3987 u64 stripe_size; 3988 u64 num_bytes; 3989 u64 raid_stripe_len = BTRFS_STRIPE_LEN; 3990 int ndevs; 3991 int i; 3992 int j; 3993 int index; 3994 3995 BUG_ON(!alloc_profile_is_valid(type, 0)); 3996 3997 if (list_empty(&fs_devices->alloc_list)) 3998 return -ENOSPC; 3999 4000 index = __get_raid_index(type); 4001 4002 sub_stripes = btrfs_raid_array[index].sub_stripes; 4003 dev_stripes = btrfs_raid_array[index].dev_stripes; 4004 devs_max = btrfs_raid_array[index].devs_max; 4005 devs_min = btrfs_raid_array[index].devs_min; 4006 devs_increment = btrfs_raid_array[index].devs_increment; 4007 ncopies = btrfs_raid_array[index].ncopies; 4008 4009 if (type & BTRFS_BLOCK_GROUP_DATA) { 4010 max_stripe_size = 1024 * 1024 * 1024; 4011 max_chunk_size = 10 * max_stripe_size; 4012 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 4013 /* for larger filesystems, use larger metadata chunks */ 4014 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) 4015 max_stripe_size = 1024 * 1024 * 1024; 4016 else 4017 max_stripe_size = 256 * 1024 * 1024; 4018 max_chunk_size = max_stripe_size; 4019 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 4020 max_stripe_size = 32 * 1024 * 1024; 4021 max_chunk_size = 2 * max_stripe_size; 4022 } else { 4023 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", 4024 type); 4025 BUG_ON(1); 4026 } 4027 4028 /* we don't want a chunk larger than 10% of writeable space */ 4029 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 4030 max_chunk_size); 4031 4032 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, 4033 GFP_NOFS); 4034 if (!devices_info) 4035 return -ENOMEM; 4036 4037 cur = fs_devices->alloc_list.next; 4038 4039 /* 4040 * in the first pass through the devices list, we gather information 4041 * about the available holes on each device. 4042 */ 4043 ndevs = 0; 4044 while (cur != &fs_devices->alloc_list) { 4045 struct btrfs_device *device; 4046 u64 max_avail; 4047 u64 dev_offset; 4048 4049 device = list_entry(cur, struct btrfs_device, dev_alloc_list); 4050 4051 cur = cur->next; 4052 4053 if (!device->writeable) { 4054 WARN(1, KERN_ERR 4055 "btrfs: read-only device in alloc_list\n"); 4056 continue; 4057 } 4058 4059 if (!device->in_fs_metadata || 4060 device->is_tgtdev_for_dev_replace) 4061 continue; 4062 4063 if (device->total_bytes > device->bytes_used) 4064 total_avail = device->total_bytes - device->bytes_used; 4065 else 4066 total_avail = 0; 4067 4068 /* If there is no space on this device, skip it. 
*/ 4069 if (total_avail == 0) 4070 continue; 4071 4072 ret = find_free_dev_extent(trans, device, 4073 max_stripe_size * dev_stripes, 4074 &dev_offset, &max_avail); 4075 if (ret && ret != -ENOSPC) 4076 goto error; 4077 4078 if (ret == 0) 4079 max_avail = max_stripe_size * dev_stripes; 4080 4081 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) 4082 continue; 4083 4084 if (ndevs == fs_devices->rw_devices) { 4085 WARN(1, "%s: found more than %llu devices\n", 4086 __func__, fs_devices->rw_devices); 4087 break; 4088 } 4089 devices_info[ndevs].dev_offset = dev_offset; 4090 devices_info[ndevs].max_avail = max_avail; 4091 devices_info[ndevs].total_avail = total_avail; 4092 devices_info[ndevs].dev = device; 4093 ++ndevs; 4094 } 4095 4096 /* 4097 * now sort the devices by hole size / available space 4098 */ 4099 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 4100 btrfs_cmp_device_info, NULL); 4101 4102 /* round down to number of usable stripes */ 4103 ndevs -= ndevs % devs_increment; 4104 4105 if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { 4106 ret = -ENOSPC; 4107 goto error; 4108 } 4109 4110 if (devs_max && ndevs > devs_max) 4111 ndevs = devs_max; 4112 /* 4113 * the primary goal is to maximize the number of stripes, so use as many 4114 * devices as possible, even if the stripes are not maximum sized. 4115 */ 4116 stripe_size = devices_info[ndevs-1].max_avail; 4117 num_stripes = ndevs * dev_stripes; 4118 4119 /* 4120 * this will have to be fixed for RAID1 and RAID10 over 4121 * more drives 4122 */ 4123 data_stripes = num_stripes / ncopies; 4124 4125 if (type & BTRFS_BLOCK_GROUP_RAID5) { 4126 raid_stripe_len = find_raid56_stripe_len(ndevs - 1, 4127 btrfs_super_stripesize(info->super_copy)); 4128 data_stripes = num_stripes - 1; 4129 } 4130 if (type & BTRFS_BLOCK_GROUP_RAID6) { 4131 raid_stripe_len = find_raid56_stripe_len(ndevs - 2, 4132 btrfs_super_stripesize(info->super_copy)); 4133 data_stripes = num_stripes - 2; 4134 } 4135 4136 /* 4137 * Use the number of data stripes to figure out how big this chunk 4138 * is really going to be in terms of logical address space, 4139 * and compare that answer with the max chunk size 4140 */ 4141 if (stripe_size * data_stripes > max_chunk_size) { 4142 u64 mask = (1ULL << 24) - 1; 4143 stripe_size = max_chunk_size; 4144 do_div(stripe_size, data_stripes); 4145 4146 /* bump the answer up to a 16MB boundary */ 4147 stripe_size = (stripe_size + mask) & ~mask; 4148 4149 /* but don't go higher than the limits we found 4150 * while searching for free extents 4151 */ 4152 if (stripe_size > devices_info[ndevs-1].max_avail) 4153 stripe_size = devices_info[ndevs-1].max_avail; 4154 } 4155 4156 do_div(stripe_size, dev_stripes); 4157 4158 /* align to BTRFS_STRIPE_LEN */ 4159 do_div(stripe_size, raid_stripe_len); 4160 stripe_size *= raid_stripe_len; 4161 4162 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 4163 if (!map) { 4164 ret = -ENOMEM; 4165 goto error; 4166 } 4167 map->num_stripes = num_stripes; 4168 4169 for (i = 0; i < ndevs; ++i) { 4170 for (j = 0; j < dev_stripes; ++j) { 4171 int s = i * dev_stripes + j; 4172 map->stripes[s].dev = devices_info[i].dev; 4173 map->stripes[s].physical = devices_info[i].dev_offset + 4174 j * stripe_size; 4175 } 4176 } 4177 map->sector_size = extent_root->sectorsize; 4178 map->stripe_len = raid_stripe_len; 4179 map->io_align = raid_stripe_len; 4180 map->io_width = raid_stripe_len; 4181 map->type = type; 4182 map->sub_stripes = sub_stripes; 4183 4184 num_bytes = stripe_size * data_stripes; 4185 4186 
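	/*
	 * Worked example of the sizing above: a RAID0 data chunk over four
	 * mostly-empty 1 TiB devices gets num_stripes = data_stripes = 4 and
	 * each device offers max_stripe_size = 1 GiB.  stripe_size *
	 * data_stripes = 4 GiB is well under the 10 GiB max_chunk_size, so
	 * nothing is clamped and num_bytes covers 4 GiB of logical address
	 * space backed by 1 GiB on each device.
	 */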
trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); 4187 4188 em = alloc_extent_map(); 4189 if (!em) { 4190 ret = -ENOMEM; 4191 goto error; 4192 } 4193 em->bdev = (struct block_device *)map; 4194 em->start = start; 4195 em->len = num_bytes; 4196 em->block_start = 0; 4197 em->block_len = em->len; 4198 em->orig_block_len = stripe_size; 4199 4200 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 4201 write_lock(&em_tree->lock); 4202 ret = add_extent_mapping(em_tree, em, 0); 4203 if (!ret) { 4204 list_add_tail(&em->list, &trans->transaction->pending_chunks); 4205 atomic_inc(&em->refs); 4206 } 4207 write_unlock(&em_tree->lock); 4208 if (ret) { 4209 free_extent_map(em); 4210 goto error; 4211 } 4212 4213 ret = btrfs_make_block_group(trans, extent_root, 0, type, 4214 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 4215 start, num_bytes); 4216 if (ret) 4217 goto error_del_extent; 4218 4219 free_extent_map(em); 4220 check_raid56_incompat_flag(extent_root->fs_info, type); 4221 4222 kfree(devices_info); 4223 return 0; 4224 4225 error_del_extent: 4226 write_lock(&em_tree->lock); 4227 remove_extent_mapping(em_tree, em); 4228 write_unlock(&em_tree->lock); 4229 4230 /* One for our allocation */ 4231 free_extent_map(em); 4232 /* One for the tree reference */ 4233 free_extent_map(em); 4234 error: 4235 kfree(map); 4236 kfree(devices_info); 4237 return ret; 4238 } 4239 4240 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, 4241 struct btrfs_root *extent_root, 4242 u64 chunk_offset, u64 chunk_size) 4243 { 4244 struct btrfs_key key; 4245 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 4246 struct btrfs_device *device; 4247 struct btrfs_chunk *chunk; 4248 struct btrfs_stripe *stripe; 4249 struct extent_map_tree *em_tree; 4250 struct extent_map *em; 4251 struct map_lookup *map; 4252 size_t item_size; 4253 u64 dev_offset; 4254 u64 stripe_size; 4255 int i = 0; 4256 int ret; 4257 4258 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 4259 read_lock(&em_tree->lock); 4260 em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size); 4261 read_unlock(&em_tree->lock); 4262 4263 if (!em) { 4264 btrfs_crit(extent_root->fs_info, "unable to find logical " 4265 "%Lu len %Lu", chunk_offset, chunk_size); 4266 return -EINVAL; 4267 } 4268 4269 if (em->start != chunk_offset || em->len != chunk_size) { 4270 btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted" 4271 " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset, 4272 chunk_size, em->start, em->len); 4273 free_extent_map(em); 4274 return -EINVAL; 4275 } 4276 4277 map = (struct map_lookup *)em->bdev; 4278 item_size = btrfs_chunk_item_size(map->num_stripes); 4279 stripe_size = em->orig_block_len; 4280 4281 chunk = kzalloc(item_size, GFP_NOFS); 4282 if (!chunk) { 4283 ret = -ENOMEM; 4284 goto out; 4285 } 4286 4287 for (i = 0; i < map->num_stripes; i++) { 4288 device = map->stripes[i].dev; 4289 dev_offset = map->stripes[i].physical; 4290 4291 device->bytes_used += stripe_size; 4292 ret = btrfs_update_device(trans, device); 4293 if (ret) 4294 goto out; 4295 ret = btrfs_alloc_dev_extent(trans, device, 4296 chunk_root->root_key.objectid, 4297 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 4298 chunk_offset, dev_offset, 4299 stripe_size); 4300 if (ret) 4301 goto out; 4302 } 4303 4304 spin_lock(&extent_root->fs_info->free_chunk_lock); 4305 extent_root->fs_info->free_chunk_space -= (stripe_size * 4306 map->num_stripes); 4307 spin_unlock(&extent_root->fs_info->free_chunk_lock); 4308 4309 stripe = &chunk->stripe; 4310 for (i = 0; i < map->num_stripes; i++) { 
4311 device = map->stripes[i].dev; 4312 dev_offset = map->stripes[i].physical; 4313 4314 btrfs_set_stack_stripe_devid(stripe, device->devid); 4315 btrfs_set_stack_stripe_offset(stripe, dev_offset); 4316 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 4317 stripe++; 4318 } 4319 4320 btrfs_set_stack_chunk_length(chunk, chunk_size); 4321 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 4322 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 4323 btrfs_set_stack_chunk_type(chunk, map->type); 4324 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 4325 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 4326 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 4327 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); 4328 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 4329 4330 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 4331 key.type = BTRFS_CHUNK_ITEM_KEY; 4332 key.offset = chunk_offset; 4333 4334 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 4335 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 4336 /* 4337 * TODO: Cleanup of inserted chunk root in case of 4338 * failure. 4339 */ 4340 ret = btrfs_add_system_chunk(chunk_root, &key, chunk, 4341 item_size); 4342 } 4343 4344 out: 4345 kfree(chunk); 4346 free_extent_map(em); 4347 return ret; 4348 } 4349 4350 /* 4351 * Chunk allocation falls into two parts. The first part does works 4352 * that make the new allocated chunk useable, but not do any operation 4353 * that modifies the chunk tree. The second part does the works that 4354 * require modifying the chunk tree. This division is important for the 4355 * bootstrap process of adding storage to a seed btrfs. 4356 */ 4357 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 4358 struct btrfs_root *extent_root, u64 type) 4359 { 4360 u64 chunk_offset; 4361 4362 chunk_offset = find_next_chunk(extent_root->fs_info); 4363 return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type); 4364 } 4365 4366 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 4367 struct btrfs_root *root, 4368 struct btrfs_device *device) 4369 { 4370 u64 chunk_offset; 4371 u64 sys_chunk_offset; 4372 u64 alloc_profile; 4373 struct btrfs_fs_info *fs_info = root->fs_info; 4374 struct btrfs_root *extent_root = fs_info->extent_root; 4375 int ret; 4376 4377 chunk_offset = find_next_chunk(fs_info); 4378 alloc_profile = btrfs_get_alloc_profile(extent_root, 0); 4379 ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset, 4380 alloc_profile); 4381 if (ret) 4382 return ret; 4383 4384 sys_chunk_offset = find_next_chunk(root->fs_info); 4385 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4386 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset, 4387 alloc_profile); 4388 if (ret) { 4389 btrfs_abort_transaction(trans, root, ret); 4390 goto out; 4391 } 4392 4393 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 4394 if (ret) 4395 btrfs_abort_transaction(trans, root, ret); 4396 out: 4397 return ret; 4398 } 4399 4400 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 4401 { 4402 struct extent_map *em; 4403 struct map_lookup *map; 4404 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 4405 int readonly = 0; 4406 int i; 4407 4408 read_lock(&map_tree->map_tree.lock); 4409 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 4410 read_unlock(&map_tree->map_tree.lock); 4411 if (!em) 4412 return 1; 4413 4414 if 
(btrfs_test_opt(root, DEGRADED)) { 4415 free_extent_map(em); 4416 return 0; 4417 } 4418 4419 map = (struct map_lookup *)em->bdev; 4420 for (i = 0; i < map->num_stripes; i++) { 4421 if (!map->stripes[i].dev->writeable) { 4422 readonly = 1; 4423 break; 4424 } 4425 } 4426 free_extent_map(em); 4427 return readonly; 4428 } 4429 4430 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 4431 { 4432 extent_map_tree_init(&tree->map_tree); 4433 } 4434 4435 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 4436 { 4437 struct extent_map *em; 4438 4439 while (1) { 4440 write_lock(&tree->map_tree.lock); 4441 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 4442 if (em) 4443 remove_extent_mapping(&tree->map_tree, em); 4444 write_unlock(&tree->map_tree.lock); 4445 if (!em) 4446 break; 4447 kfree(em->bdev); 4448 /* once for us */ 4449 free_extent_map(em); 4450 /* once for the tree */ 4451 free_extent_map(em); 4452 } 4453 } 4454 4455 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 4456 { 4457 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 4458 struct extent_map *em; 4459 struct map_lookup *map; 4460 struct extent_map_tree *em_tree = &map_tree->map_tree; 4461 int ret; 4462 4463 read_lock(&em_tree->lock); 4464 em = lookup_extent_mapping(em_tree, logical, len); 4465 read_unlock(&em_tree->lock); 4466 4467 /* 4468 * We could return errors for these cases, but that could get ugly and 4469 * we'd probably do the same thing which is just not do anything else 4470 * and exit, so return 1 so the callers don't try to use other copies. 4471 */ 4472 if (!em) { 4473 btrfs_crit(fs_info, "No mapping for %Lu-%Lu\n", logical, 4474 logical+len); 4475 return 1; 4476 } 4477 4478 if (em->start > logical || em->start + em->len < logical) { 4479 btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got " 4480 "%Lu-%Lu\n", logical, logical+len, em->start, 4481 em->start + em->len); 4482 return 1; 4483 } 4484 4485 map = (struct map_lookup *)em->bdev; 4486 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 4487 ret = map->num_stripes; 4488 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4489 ret = map->sub_stripes; 4490 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 4491 ret = 2; 4492 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 4493 ret = 3; 4494 else 4495 ret = 1; 4496 free_extent_map(em); 4497 4498 btrfs_dev_replace_lock(&fs_info->dev_replace); 4499 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) 4500 ret++; 4501 btrfs_dev_replace_unlock(&fs_info->dev_replace); 4502 4503 return ret; 4504 } 4505 4506 unsigned long btrfs_full_stripe_len(struct btrfs_root *root, 4507 struct btrfs_mapping_tree *map_tree, 4508 u64 logical) 4509 { 4510 struct extent_map *em; 4511 struct map_lookup *map; 4512 struct extent_map_tree *em_tree = &map_tree->map_tree; 4513 unsigned long len = root->sectorsize; 4514 4515 read_lock(&em_tree->lock); 4516 em = lookup_extent_mapping(em_tree, logical, len); 4517 read_unlock(&em_tree->lock); 4518 BUG_ON(!em); 4519 4520 BUG_ON(em->start > logical || em->start + em->len < logical); 4521 map = (struct map_lookup *)em->bdev; 4522 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4523 BTRFS_BLOCK_GROUP_RAID6)) { 4524 len = map->stripe_len * nr_data_stripes(map); 4525 } 4526 free_extent_map(em); 4527 return len; 4528 } 4529 4530 int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, 4531 u64 logical, u64 len, int mirror_num) 4532 { 4533 struct extent_map *em; 4534 struct map_lookup *map; 4535 struct extent_map_tree *em_tree = 
&map_tree->map_tree; 4536 int ret = 0; 4537 4538 read_lock(&em_tree->lock); 4539 em = lookup_extent_mapping(em_tree, logical, len); 4540 read_unlock(&em_tree->lock); 4541 BUG_ON(!em); 4542 4543 BUG_ON(em->start > logical || em->start + em->len < logical); 4544 map = (struct map_lookup *)em->bdev; 4545 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4546 BTRFS_BLOCK_GROUP_RAID6)) 4547 ret = 1; 4548 free_extent_map(em); 4549 return ret; 4550 } 4551 4552 static int find_live_mirror(struct btrfs_fs_info *fs_info, 4553 struct map_lookup *map, int first, int num, 4554 int optimal, int dev_replace_is_ongoing) 4555 { 4556 int i; 4557 int tolerance; 4558 struct btrfs_device *srcdev; 4559 4560 if (dev_replace_is_ongoing && 4561 fs_info->dev_replace.cont_reading_from_srcdev_mode == 4562 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 4563 srcdev = fs_info->dev_replace.srcdev; 4564 else 4565 srcdev = NULL; 4566 4567 /* 4568 * try to avoid the drive that is the source drive for a 4569 * dev-replace procedure, only choose it if no other non-missing 4570 * mirror is available 4571 */ 4572 for (tolerance = 0; tolerance < 2; tolerance++) { 4573 if (map->stripes[optimal].dev->bdev && 4574 (tolerance || map->stripes[optimal].dev != srcdev)) 4575 return optimal; 4576 for (i = first; i < first + num; i++) { 4577 if (map->stripes[i].dev->bdev && 4578 (tolerance || map->stripes[i].dev != srcdev)) 4579 return i; 4580 } 4581 } 4582 4583 /* we couldn't find one that doesn't fail. Just return something 4584 * and the io error handling code will clean up eventually 4585 */ 4586 return optimal; 4587 } 4588 4589 static inline int parity_smaller(u64 a, u64 b) 4590 { 4591 return a > b; 4592 } 4593 4594 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 4595 static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) 4596 { 4597 struct btrfs_bio_stripe s; 4598 int i; 4599 u64 l; 4600 int again = 1; 4601 4602 while (again) { 4603 again = 0; 4604 for (i = 0; i < bbio->num_stripes - 1; i++) { 4605 if (parity_smaller(raid_map[i], raid_map[i+1])) { 4606 s = bbio->stripes[i]; 4607 l = raid_map[i]; 4608 bbio->stripes[i] = bbio->stripes[i+1]; 4609 raid_map[i] = raid_map[i+1]; 4610 bbio->stripes[i+1] = s; 4611 raid_map[i+1] = l; 4612 again = 1; 4613 } 4614 } 4615 } 4616 } 4617 4618 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4619 u64 logical, u64 *length, 4620 struct btrfs_bio **bbio_ret, 4621 int mirror_num, u64 **raid_map_ret) 4622 { 4623 struct extent_map *em; 4624 struct map_lookup *map; 4625 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 4626 struct extent_map_tree *em_tree = &map_tree->map_tree; 4627 u64 offset; 4628 u64 stripe_offset; 4629 u64 stripe_end_offset; 4630 u64 stripe_nr; 4631 u64 stripe_nr_orig; 4632 u64 stripe_nr_end; 4633 u64 stripe_len; 4634 u64 *raid_map = NULL; 4635 int stripe_index; 4636 int i; 4637 int ret = 0; 4638 int num_stripes; 4639 int max_errors = 0; 4640 struct btrfs_bio *bbio = NULL; 4641 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 4642 int dev_replace_is_ongoing = 0; 4643 int num_alloc_stripes; 4644 int patch_the_first_stripe_for_dev_replace = 0; 4645 u64 physical_to_patch_in_first_stripe = 0; 4646 u64 raid56_full_stripe_start = (u64)-1; 4647 4648 read_lock(&em_tree->lock); 4649 em = lookup_extent_mapping(em_tree, logical, *length); 4650 read_unlock(&em_tree->lock); 4651 4652 if (!em) { 4653 btrfs_crit(fs_info, "unable to find logical %llu len %llu", 4654 logical, *length); 4655 return -EINVAL; 4656 
} 4657 4658 if (em->start > logical || em->start + em->len < logical) { 4659 btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, " 4660 "found %Lu-%Lu\n", logical, em->start, 4661 em->start + em->len); 4662 return -EINVAL; 4663 } 4664 4665 map = (struct map_lookup *)em->bdev; 4666 offset = logical - em->start; 4667 4668 stripe_len = map->stripe_len; 4669 stripe_nr = offset; 4670 /* 4671 * stripe_nr counts the total number of stripes we have to stride 4672 * to get to this block 4673 */ 4674 do_div(stripe_nr, stripe_len); 4675 4676 stripe_offset = stripe_nr * stripe_len; 4677 BUG_ON(offset < stripe_offset); 4678 4679 /* stripe_offset is the offset of this block in its stripe*/ 4680 stripe_offset = offset - stripe_offset; 4681 4682 /* if we're here for raid56, we need to know the stripe aligned start */ 4683 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { 4684 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 4685 raid56_full_stripe_start = offset; 4686 4687 /* allow a write of a full stripe, but make sure we don't 4688 * allow straddling of stripes 4689 */ 4690 do_div(raid56_full_stripe_start, full_stripe_len); 4691 raid56_full_stripe_start *= full_stripe_len; 4692 } 4693 4694 if (rw & REQ_DISCARD) { 4695 /* we don't discard raid56 yet */ 4696 if (map->type & 4697 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { 4698 ret = -EOPNOTSUPP; 4699 goto out; 4700 } 4701 *length = min_t(u64, em->len - offset, *length); 4702 } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 4703 u64 max_len; 4704 /* For writes to RAID[56], allow a full stripeset across all disks. 4705 For other RAID types and for RAID[56] reads, just allow a single 4706 stripe (on a single disk). */ 4707 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && 4708 (rw & REQ_WRITE)) { 4709 max_len = stripe_len * nr_data_stripes(map) - 4710 (offset - raid56_full_stripe_start); 4711 } else { 4712 /* we limit the length of each bio to what fits in a stripe */ 4713 max_len = stripe_len - stripe_offset; 4714 } 4715 *length = min_t(u64, em->len - offset, max_len); 4716 } else { 4717 *length = em->len - offset; 4718 } 4719 4720 /* This is for when we're called from btrfs_merge_bio_hook() and all 4721 it cares about is the length */ 4722 if (!bbio_ret) 4723 goto out; 4724 4725 btrfs_dev_replace_lock(dev_replace); 4726 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 4727 if (!dev_replace_is_ongoing) 4728 btrfs_dev_replace_unlock(dev_replace); 4729 4730 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 4731 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && 4732 dev_replace->tgtdev != NULL) { 4733 /* 4734 * in dev-replace case, for repair case (that's the only 4735 * case where the mirror is selected explicitly when 4736 * calling btrfs_map_block), blocks left of the left cursor 4737 * can also be read from the target drive. 4738 * For REQ_GET_READ_MIRRORS, the target drive is added as 4739 * the last one to the array of stripes. For READ, it also 4740 * needs to be supported using the same mirror number. 4741 * If the requested block is not left of the left cursor, 4742 * EIO is returned. This can happen because btrfs_num_copies() 4743 * returns one more in the dev-replace case. 
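		 * In short, mirror_num == num_stripes + 1 names the replace
		 * target, and the target is only a valid source for blocks
		 * that have already been copied, i.e. blocks left of
		 * dev_replace->cursor_left.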
4744 */ 4745 u64 tmp_length = *length; 4746 struct btrfs_bio *tmp_bbio = NULL; 4747 int tmp_num_stripes; 4748 u64 srcdev_devid = dev_replace->srcdev->devid; 4749 int index_srcdev = 0; 4750 int found = 0; 4751 u64 physical_of_found = 0; 4752 4753 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, 4754 logical, &tmp_length, &tmp_bbio, 0, NULL); 4755 if (ret) { 4756 WARN_ON(tmp_bbio != NULL); 4757 goto out; 4758 } 4759 4760 tmp_num_stripes = tmp_bbio->num_stripes; 4761 if (mirror_num > tmp_num_stripes) { 4762 /* 4763 * REQ_GET_READ_MIRRORS does not contain this 4764 * mirror, that means that the requested area 4765 * is not left of the left cursor 4766 */ 4767 ret = -EIO; 4768 kfree(tmp_bbio); 4769 goto out; 4770 } 4771 4772 /* 4773 * process the rest of the function using the mirror_num 4774 * of the source drive. Therefore look it up first. 4775 * At the end, patch the device pointer to the one of the 4776 * target drive. 4777 */ 4778 for (i = 0; i < tmp_num_stripes; i++) { 4779 if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { 4780 /* 4781 * In case of DUP, in order to keep it 4782 * simple, only add the mirror with the 4783 * lowest physical address 4784 */ 4785 if (found && 4786 physical_of_found <= 4787 tmp_bbio->stripes[i].physical) 4788 continue; 4789 index_srcdev = i; 4790 found = 1; 4791 physical_of_found = 4792 tmp_bbio->stripes[i].physical; 4793 } 4794 } 4795 4796 if (found) { 4797 mirror_num = index_srcdev + 1; 4798 patch_the_first_stripe_for_dev_replace = 1; 4799 physical_to_patch_in_first_stripe = physical_of_found; 4800 } else { 4801 WARN_ON(1); 4802 ret = -EIO; 4803 kfree(tmp_bbio); 4804 goto out; 4805 } 4806 4807 kfree(tmp_bbio); 4808 } else if (mirror_num > map->num_stripes) { 4809 mirror_num = 0; 4810 } 4811 4812 num_stripes = 1; 4813 stripe_index = 0; 4814 stripe_nr_orig = stripe_nr; 4815 stripe_nr_end = ALIGN(offset + *length, map->stripe_len); 4816 do_div(stripe_nr_end, map->stripe_len); 4817 stripe_end_offset = stripe_nr_end * map->stripe_len - 4818 (offset + *length); 4819 4820 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4821 if (rw & REQ_DISCARD) 4822 num_stripes = min_t(u64, map->num_stripes, 4823 stripe_nr_end - stripe_nr_orig); 4824 stripe_index = do_div(stripe_nr, map->num_stripes); 4825 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 4826 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) 4827 num_stripes = map->num_stripes; 4828 else if (mirror_num) 4829 stripe_index = mirror_num - 1; 4830 else { 4831 stripe_index = find_live_mirror(fs_info, map, 0, 4832 map->num_stripes, 4833 current->pid % map->num_stripes, 4834 dev_replace_is_ongoing); 4835 mirror_num = stripe_index + 1; 4836 } 4837 4838 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 4839 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) { 4840 num_stripes = map->num_stripes; 4841 } else if (mirror_num) { 4842 stripe_index = mirror_num - 1; 4843 } else { 4844 mirror_num = 1; 4845 } 4846 4847 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 4848 int factor = map->num_stripes / map->sub_stripes; 4849 4850 stripe_index = do_div(stripe_nr, factor); 4851 stripe_index *= map->sub_stripes; 4852 4853 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) 4854 num_stripes = map->sub_stripes; 4855 else if (rw & REQ_DISCARD) 4856 num_stripes = min_t(u64, map->sub_stripes * 4857 (stripe_nr_end - stripe_nr_orig), 4858 map->num_stripes); 4859 else if (mirror_num) 4860 stripe_index += mirror_num - 1; 4861 else { 4862 int old_stripe_index = stripe_index; 4863 stripe_index = find_live_mirror(fs_info, map, 
4864 stripe_index, 4865 map->sub_stripes, stripe_index + 4866 current->pid % map->sub_stripes, 4867 dev_replace_is_ongoing); 4868 mirror_num = stripe_index - old_stripe_index + 1; 4869 } 4870 4871 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4872 BTRFS_BLOCK_GROUP_RAID6)) { 4873 u64 tmp; 4874 4875 if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) 4876 && raid_map_ret) { 4877 int i, rot; 4878 4879 /* push stripe_nr back to the start of the full stripe */ 4880 stripe_nr = raid56_full_stripe_start; 4881 do_div(stripe_nr, stripe_len); 4882 4883 stripe_index = do_div(stripe_nr, nr_data_stripes(map)); 4884 4885 /* RAID[56] write or recovery. Return all stripes */ 4886 num_stripes = map->num_stripes; 4887 max_errors = nr_parity_stripes(map); 4888 4889 raid_map = kmalloc(sizeof(u64) * num_stripes, 4890 GFP_NOFS); 4891 if (!raid_map) { 4892 ret = -ENOMEM; 4893 goto out; 4894 } 4895 4896 /* Work out the disk rotation on this stripe-set */ 4897 tmp = stripe_nr; 4898 rot = do_div(tmp, num_stripes); 4899 4900 /* Fill in the logical address of each stripe */ 4901 tmp = stripe_nr * nr_data_stripes(map); 4902 for (i = 0; i < nr_data_stripes(map); i++) 4903 raid_map[(i+rot) % num_stripes] = 4904 em->start + (tmp + i) * map->stripe_len; 4905 4906 raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 4907 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 4908 raid_map[(i+rot+1) % num_stripes] = 4909 RAID6_Q_STRIPE; 4910 4911 *length = map->stripe_len; 4912 stripe_index = 0; 4913 stripe_offset = 0; 4914 } else { 4915 /* 4916 * Mirror #0 or #1 means the original data block. 4917 * Mirror #2 is RAID5 parity block. 4918 * Mirror #3 is RAID6 Q block. 4919 */ 4920 stripe_index = do_div(stripe_nr, nr_data_stripes(map)); 4921 if (mirror_num > 1) 4922 stripe_index = nr_data_stripes(map) + 4923 mirror_num - 2; 4924 4925 /* We distribute the parity blocks across stripes */ 4926 tmp = stripe_nr + stripe_index; 4927 stripe_index = do_div(tmp, map->num_stripes); 4928 } 4929 } else { 4930 /* 4931 * after this do_div call, stripe_nr is the number of stripes 4932 * on this device we have to walk to find the data, and 4933 * stripe_index is the number of our device in the stripe array 4934 */ 4935 stripe_index = do_div(stripe_nr, map->num_stripes); 4936 mirror_num = stripe_index + 1; 4937 } 4938 BUG_ON(stripe_index >= map->num_stripes); 4939 4940 num_alloc_stripes = num_stripes; 4941 if (dev_replace_is_ongoing) { 4942 if (rw & (REQ_WRITE | REQ_DISCARD)) 4943 num_alloc_stripes <<= 1; 4944 if (rw & REQ_GET_READ_MIRRORS) 4945 num_alloc_stripes++; 4946 } 4947 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); 4948 if (!bbio) { 4949 kfree(raid_map); 4950 ret = -ENOMEM; 4951 goto out; 4952 } 4953 atomic_set(&bbio->error, 0); 4954 4955 if (rw & REQ_DISCARD) { 4956 int factor = 0; 4957 int sub_stripes = 0; 4958 u64 stripes_per_dev = 0; 4959 u32 remaining_stripes = 0; 4960 u32 last_stripe = 0; 4961 4962 if (map->type & 4963 (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { 4964 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 4965 sub_stripes = 1; 4966 else 4967 sub_stripes = map->sub_stripes; 4968 4969 factor = map->num_stripes / sub_stripes; 4970 stripes_per_dev = div_u64_rem(stripe_nr_end - 4971 stripe_nr_orig, 4972 factor, 4973 &remaining_stripes); 4974 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 4975 last_stripe *= sub_stripes; 4976 } 4977 4978 for (i = 0; i < num_stripes; i++) { 4979 bbio->stripes[i].physical = 4980 map->stripes[stripe_index].physical + 4981 stripe_offset + stripe_nr * map->stripe_len; 4982 
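		/*
		 * For RAID0/RAID10 discards each device is assigned
		 * stripes_per_dev full stripes (one more where the remainder
		 * reaches); the first and last stripes of the range are then
		 * trimmed below via stripe_offset and stripe_end_offset so
		 * only the requested byte range is discarded.
		 */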
bbio->stripes[i].dev = map->stripes[stripe_index].dev; 4983 4984 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 4985 BTRFS_BLOCK_GROUP_RAID10)) { 4986 bbio->stripes[i].length = stripes_per_dev * 4987 map->stripe_len; 4988 4989 if (i / sub_stripes < remaining_stripes) 4990 bbio->stripes[i].length += 4991 map->stripe_len; 4992 4993 /* 4994 * Special for the first stripe and 4995 * the last stripe: 4996 * 4997 * |-------|...|-------| 4998 * |----------| 4999 * off end_off 5000 */ 5001 if (i < sub_stripes) 5002 bbio->stripes[i].length -= 5003 stripe_offset; 5004 5005 if (stripe_index >= last_stripe && 5006 stripe_index <= (last_stripe + 5007 sub_stripes - 1)) 5008 bbio->stripes[i].length -= 5009 stripe_end_offset; 5010 5011 if (i == sub_stripes - 1) 5012 stripe_offset = 0; 5013 } else 5014 bbio->stripes[i].length = *length; 5015 5016 stripe_index++; 5017 if (stripe_index == map->num_stripes) { 5018 /* This could only happen for RAID0/10 */ 5019 stripe_index = 0; 5020 stripe_nr++; 5021 } 5022 } 5023 } else { 5024 for (i = 0; i < num_stripes; i++) { 5025 bbio->stripes[i].physical = 5026 map->stripes[stripe_index].physical + 5027 stripe_offset + 5028 stripe_nr * map->stripe_len; 5029 bbio->stripes[i].dev = 5030 map->stripes[stripe_index].dev; 5031 stripe_index++; 5032 } 5033 } 5034 5035 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { 5036 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5037 BTRFS_BLOCK_GROUP_RAID10 | 5038 BTRFS_BLOCK_GROUP_RAID5 | 5039 BTRFS_BLOCK_GROUP_DUP)) { 5040 max_errors = 1; 5041 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { 5042 max_errors = 2; 5043 } 5044 } 5045 5046 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && 5047 dev_replace->tgtdev != NULL) { 5048 int index_where_to_add; 5049 u64 srcdev_devid = dev_replace->srcdev->devid; 5050 5051 /* 5052 * duplicate the write operations while the dev replace 5053 * procedure is running. Since the copying of the old disk 5054 * to the new disk takes place at run time while the 5055 * filesystem is mounted writable, the regular write 5056 * operations to the old disk have to be duplicated to go 5057 * to the new disk as well. 5058 * Note that device->missing is handled by the caller, and 5059 * that the write to the old disk is already set up in the 5060 * stripes array. 5061 */ 5062 index_where_to_add = num_stripes; 5063 for (i = 0; i < num_stripes; i++) { 5064 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5065 /* write to new disk, too */ 5066 struct btrfs_bio_stripe *new = 5067 bbio->stripes + index_where_to_add; 5068 struct btrfs_bio_stripe *old = 5069 bbio->stripes + i; 5070 5071 new->physical = old->physical; 5072 new->length = old->length; 5073 new->dev = dev_replace->tgtdev; 5074 index_where_to_add++; 5075 max_errors++; 5076 } 5077 } 5078 num_stripes = index_where_to_add; 5079 } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && 5080 dev_replace->tgtdev != NULL) { 5081 u64 srcdev_devid = dev_replace->srcdev->devid; 5082 int index_srcdev = 0; 5083 int found = 0; 5084 u64 physical_of_found = 0; 5085 5086 /* 5087 * During the dev-replace procedure, the target drive can 5088 * also be used to read data in case it is needed to repair 5089 * a corrupt block elsewhere. This is possible if the 5090 * requested area is left of the left cursor. In this area, 5091 * the target drive is a full copy of the source drive. 
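		 * Only a single extra stripe is appended here, and only when
		 * the whole stripe (map->stripe_len bytes starting at
		 * physical_of_found) already lies before
		 * dev_replace->cursor_left.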
5092 */ 5093 for (i = 0; i < num_stripes; i++) { 5094 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5095 /* 5096 * In case of DUP, in order to keep it 5097 * simple, only add the mirror with the 5098 * lowest physical address 5099 */ 5100 if (found && 5101 physical_of_found <= 5102 bbio->stripes[i].physical) 5103 continue; 5104 index_srcdev = i; 5105 found = 1; 5106 physical_of_found = bbio->stripes[i].physical; 5107 } 5108 } 5109 if (found) { 5110 u64 length = map->stripe_len; 5111 5112 if (physical_of_found + length <= 5113 dev_replace->cursor_left) { 5114 struct btrfs_bio_stripe *tgtdev_stripe = 5115 bbio->stripes + num_stripes; 5116 5117 tgtdev_stripe->physical = physical_of_found; 5118 tgtdev_stripe->length = 5119 bbio->stripes[index_srcdev].length; 5120 tgtdev_stripe->dev = dev_replace->tgtdev; 5121 5122 num_stripes++; 5123 } 5124 } 5125 } 5126 5127 *bbio_ret = bbio; 5128 bbio->num_stripes = num_stripes; 5129 bbio->max_errors = max_errors; 5130 bbio->mirror_num = mirror_num; 5131 5132 /* 5133 * this is the case that REQ_READ && dev_replace_is_ongoing && 5134 * mirror_num == num_stripes + 1 && dev_replace target drive is 5135 * available as a mirror 5136 */ 5137 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 5138 WARN_ON(num_stripes > 1); 5139 bbio->stripes[0].dev = dev_replace->tgtdev; 5140 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 5141 bbio->mirror_num = map->num_stripes + 1; 5142 } 5143 if (raid_map) { 5144 sort_parity_stripes(bbio, raid_map); 5145 *raid_map_ret = raid_map; 5146 } 5147 out: 5148 if (dev_replace_is_ongoing) 5149 btrfs_dev_replace_unlock(dev_replace); 5150 free_extent_map(em); 5151 return ret; 5152 } 5153 5154 int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 5155 u64 logical, u64 *length, 5156 struct btrfs_bio **bbio_ret, int mirror_num) 5157 { 5158 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 5159 mirror_num, NULL); 5160 } 5161 5162 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 5163 u64 chunk_start, u64 physical, u64 devid, 5164 u64 **logical, int *naddrs, int *stripe_len) 5165 { 5166 struct extent_map_tree *em_tree = &map_tree->map_tree; 5167 struct extent_map *em; 5168 struct map_lookup *map; 5169 u64 *buf; 5170 u64 bytenr; 5171 u64 length; 5172 u64 stripe_nr; 5173 u64 rmap_len; 5174 int i, j, nr = 0; 5175 5176 read_lock(&em_tree->lock); 5177 em = lookup_extent_mapping(em_tree, chunk_start, 1); 5178 read_unlock(&em_tree->lock); 5179 5180 if (!em) { 5181 printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n", 5182 chunk_start); 5183 return -EIO; 5184 } 5185 5186 if (em->start != chunk_start) { 5187 printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n", 5188 em->start, chunk_start); 5189 free_extent_map(em); 5190 return -EIO; 5191 } 5192 map = (struct map_lookup *)em->bdev; 5193 5194 length = em->len; 5195 rmap_len = map->stripe_len; 5196 5197 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5198 do_div(length, map->num_stripes / map->sub_stripes); 5199 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5200 do_div(length, map->num_stripes); 5201 else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 5202 BTRFS_BLOCK_GROUP_RAID6)) { 5203 do_div(length, nr_data_stripes(map)); 5204 rmap_len = map->stripe_len * nr_data_stripes(map); 5205 } 5206 5207 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 5208 BUG_ON(!buf); /* -ENOMEM */ 5209 5210 for (i = 0; i < map->num_stripes; i++) { 5211 if (devid && map->stripes[i].dev->devid != devid) 5212 continue; 5213 if (map->stripes[i].physical > 
physical || 5214 map->stripes[i].physical + length <= physical) 5215 continue; 5216 5217 stripe_nr = physical - map->stripes[i].physical; 5218 do_div(stripe_nr, map->stripe_len); 5219 5220 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5221 stripe_nr = stripe_nr * map->num_stripes + i; 5222 do_div(stripe_nr, map->sub_stripes); 5223 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5224 stripe_nr = stripe_nr * map->num_stripes + i; 5225 } /* else if RAID[56], multiply by nr_data_stripes(). 5226 * Alternatively, just use rmap_len below instead of 5227 * map->stripe_len */ 5228 5229 bytenr = chunk_start + stripe_nr * rmap_len; 5230 WARN_ON(nr >= map->num_stripes); 5231 for (j = 0; j < nr; j++) { 5232 if (buf[j] == bytenr) 5233 break; 5234 } 5235 if (j == nr) { 5236 WARN_ON(nr >= map->num_stripes); 5237 buf[nr++] = bytenr; 5238 } 5239 } 5240 5241 *logical = buf; 5242 *naddrs = nr; 5243 *stripe_len = rmap_len; 5244 5245 free_extent_map(em); 5246 return 0; 5247 } 5248 5249 static void btrfs_end_bio(struct bio *bio, int err) 5250 { 5251 struct btrfs_bio *bbio = bio->bi_private; 5252 int is_orig_bio = 0; 5253 5254 if (err) { 5255 atomic_inc(&bbio->error); 5256 if (err == -EIO || err == -EREMOTEIO) { 5257 unsigned int stripe_index = 5258 btrfs_io_bio(bio)->stripe_index; 5259 struct btrfs_device *dev; 5260 5261 BUG_ON(stripe_index >= bbio->num_stripes); 5262 dev = bbio->stripes[stripe_index].dev; 5263 if (dev->bdev) { 5264 if (bio->bi_rw & WRITE) 5265 btrfs_dev_stat_inc(dev, 5266 BTRFS_DEV_STAT_WRITE_ERRS); 5267 else 5268 btrfs_dev_stat_inc(dev, 5269 BTRFS_DEV_STAT_READ_ERRS); 5270 if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH) 5271 btrfs_dev_stat_inc(dev, 5272 BTRFS_DEV_STAT_FLUSH_ERRS); 5273 btrfs_dev_stat_print_on_error(dev); 5274 } 5275 } 5276 } 5277 5278 if (bio == bbio->orig_bio) 5279 is_orig_bio = 1; 5280 5281 if (atomic_dec_and_test(&bbio->stripes_pending)) { 5282 if (!is_orig_bio) { 5283 bio_put(bio); 5284 bio = bbio->orig_bio; 5285 } 5286 bio->bi_private = bbio->private; 5287 bio->bi_end_io = bbio->end_io; 5288 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 5289 /* only send an error to the higher layers if it is 5290 * beyond the tolerance of the btrfs bio 5291 */ 5292 if (atomic_read(&bbio->error) > bbio->max_errors) { 5293 err = -EIO; 5294 } else { 5295 /* 5296 * this bio is actually up to date, we didn't 5297 * go over the max number of errors 5298 */ 5299 set_bit(BIO_UPTODATE, &bio->bi_flags); 5300 err = 0; 5301 } 5302 kfree(bbio); 5303 5304 bio_endio(bio, err); 5305 } else if (!is_orig_bio) { 5306 bio_put(bio); 5307 } 5308 } 5309 5310 struct async_sched { 5311 struct bio *bio; 5312 int rw; 5313 struct btrfs_fs_info *info; 5314 struct btrfs_work work; 5315 }; 5316 5317 /* 5318 * see run_scheduled_bios for a description of why bios are collected for 5319 * async submit. 5320 * 5321 * This will add one bio to the pending list for a device and make sure 5322 * the work struct is scheduled. 
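 * Reads are submitted directly via btrfsic_submit_bio(); only writes are
 * queued, on the sync or regular pending list depending on REQ_SYNC, and
 * the submit_workers worker is kicked unless the device is already running
 * its pending queue.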
5323 */ 5324 static noinline void btrfs_schedule_bio(struct btrfs_root *root, 5325 struct btrfs_device *device, 5326 int rw, struct bio *bio) 5327 { 5328 int should_queue = 1; 5329 struct btrfs_pending_bios *pending_bios; 5330 5331 if (device->missing || !device->bdev) { 5332 bio_endio(bio, -EIO); 5333 return; 5334 } 5335 5336 /* don't bother with additional async steps for reads, right now */ 5337 if (!(rw & REQ_WRITE)) { 5338 bio_get(bio); 5339 btrfsic_submit_bio(rw, bio); 5340 bio_put(bio); 5341 return; 5342 } 5343 5344 /* 5345 * nr_async_bios allows us to reliably return congestion to the 5346 * higher layers. Otherwise, the async bio makes it appear we have 5347 * made progress against dirty pages when we've really just put it 5348 * on a queue for later 5349 */ 5350 atomic_inc(&root->fs_info->nr_async_bios); 5351 WARN_ON(bio->bi_next); 5352 bio->bi_next = NULL; 5353 bio->bi_rw |= rw; 5354 5355 spin_lock(&device->io_lock); 5356 if (bio->bi_rw & REQ_SYNC) 5357 pending_bios = &device->pending_sync_bios; 5358 else 5359 pending_bios = &device->pending_bios; 5360 5361 if (pending_bios->tail) 5362 pending_bios->tail->bi_next = bio; 5363 5364 pending_bios->tail = bio; 5365 if (!pending_bios->head) 5366 pending_bios->head = bio; 5367 if (device->running_pending) 5368 should_queue = 0; 5369 5370 spin_unlock(&device->io_lock); 5371 5372 if (should_queue) 5373 btrfs_queue_worker(&root->fs_info->submit_workers, 5374 &device->work); 5375 } 5376 5377 static int bio_size_ok(struct block_device *bdev, struct bio *bio, 5378 sector_t sector) 5379 { 5380 struct bio_vec *prev; 5381 struct request_queue *q = bdev_get_queue(bdev); 5382 unsigned short max_sectors = queue_max_sectors(q); 5383 struct bvec_merge_data bvm = { 5384 .bi_bdev = bdev, 5385 .bi_sector = sector, 5386 .bi_rw = bio->bi_rw, 5387 }; 5388 5389 if (bio->bi_vcnt == 0) { 5390 WARN_ON(1); 5391 return 1; 5392 } 5393 5394 prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; 5395 if (bio_sectors(bio) > max_sectors) 5396 return 0; 5397 5398 if (!q->merge_bvec_fn) 5399 return 1; 5400 5401 bvm.bi_size = bio->bi_size - prev->bv_len; 5402 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) 5403 return 0; 5404 return 1; 5405 } 5406 5407 static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, 5408 struct bio *bio, u64 physical, int dev_nr, 5409 int rw, int async) 5410 { 5411 struct btrfs_device *dev = bbio->stripes[dev_nr].dev; 5412 5413 bio->bi_private = bbio; 5414 btrfs_io_bio(bio)->stripe_index = dev_nr; 5415 bio->bi_end_io = btrfs_end_bio; 5416 bio->bi_sector = physical >> 9; 5417 #ifdef DEBUG 5418 { 5419 struct rcu_string *name; 5420 5421 rcu_read_lock(); 5422 name = rcu_dereference(dev->name); 5423 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " 5424 "(%s id %llu), size=%u\n", rw, 5425 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, 5426 name->str, dev->devid, bio->bi_size); 5427 rcu_read_unlock(); 5428 } 5429 #endif 5430 bio->bi_bdev = dev->bdev; 5431 if (async) 5432 btrfs_schedule_bio(root, dev, rw, bio); 5433 else 5434 btrfsic_submit_bio(rw, bio); 5435 } 5436 5437 static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, 5438 struct bio *first_bio, struct btrfs_device *dev, 5439 int dev_nr, int rw, int async) 5440 { 5441 struct bio_vec *bvec = first_bio->bi_io_vec; 5442 struct bio *bio; 5443 int nr_vecs = bio_get_nr_vecs(dev->bdev); 5444 u64 physical = bbio->stripes[dev_nr].physical; 5445 5446 again: 5447 bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS); 5448 if (!bio) 5449 return 
-ENOMEM; 5450 5451 while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { 5452 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, 5453 bvec->bv_offset) < bvec->bv_len) { 5454 u64 len = bio->bi_size; 5455 5456 atomic_inc(&bbio->stripes_pending); 5457 submit_stripe_bio(root, bbio, bio, physical, dev_nr, 5458 rw, async); 5459 physical += len; 5460 goto again; 5461 } 5462 bvec++; 5463 } 5464 5465 submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async); 5466 return 0; 5467 } 5468 5469 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 5470 { 5471 atomic_inc(&bbio->error); 5472 if (atomic_dec_and_test(&bbio->stripes_pending)) { 5473 bio->bi_private = bbio->private; 5474 bio->bi_end_io = bbio->end_io; 5475 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 5476 bio->bi_sector = logical >> 9; 5477 kfree(bbio); 5478 bio_endio(bio, -EIO); 5479 } 5480 } 5481 5482 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 5483 int mirror_num, int async_submit) 5484 { 5485 struct btrfs_device *dev; 5486 struct bio *first_bio = bio; 5487 u64 logical = (u64)bio->bi_sector << 9; 5488 u64 length = 0; 5489 u64 map_length; 5490 u64 *raid_map = NULL; 5491 int ret; 5492 int dev_nr = 0; 5493 int total_devs = 1; 5494 struct btrfs_bio *bbio = NULL; 5495 5496 length = bio->bi_size; 5497 map_length = length; 5498 5499 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5500 mirror_num, &raid_map); 5501 if (ret) /* -ENOMEM */ 5502 return ret; 5503 5504 total_devs = bbio->num_stripes; 5505 bbio->orig_bio = first_bio; 5506 bbio->private = first_bio->bi_private; 5507 bbio->end_io = first_bio->bi_end_io; 5508 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 5509 5510 if (raid_map) { 5511 /* In this case, map_length has been set to the length of 5512 a single stripe; not the whole write */ 5513 if (rw & WRITE) { 5514 return raid56_parity_write(root, bio, bbio, 5515 raid_map, map_length); 5516 } else { 5517 return raid56_parity_recover(root, bio, bbio, 5518 raid_map, map_length, 5519 mirror_num); 5520 } 5521 } 5522 5523 if (map_length < length) { 5524 btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu", 5525 logical, length, map_length); 5526 BUG(); 5527 } 5528 5529 while (dev_nr < total_devs) { 5530 dev = bbio->stripes[dev_nr].dev; 5531 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { 5532 bbio_error(bbio, first_bio, logical); 5533 dev_nr++; 5534 continue; 5535 } 5536 5537 /* 5538 * Check and see if we're ok with this bio based on it's size 5539 * and offset with the given device. 
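		 * bio_size_ok() checks the queue's max_sectors limit and, if
		 * present, its merge_bvec_fn; a bio that does not fit is
		 * rebuilt page by page in breakup_stripe_bio() and submitted
		 * in pieces.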
5540 */ 5541 if (!bio_size_ok(dev->bdev, first_bio, 5542 bbio->stripes[dev_nr].physical >> 9)) { 5543 ret = breakup_stripe_bio(root, bbio, first_bio, dev, 5544 dev_nr, rw, async_submit); 5545 BUG_ON(ret); 5546 dev_nr++; 5547 continue; 5548 } 5549 5550 if (dev_nr < total_devs - 1) { 5551 bio = btrfs_bio_clone(first_bio, GFP_NOFS); 5552 BUG_ON(!bio); /* -ENOMEM */ 5553 } else { 5554 bio = first_bio; 5555 } 5556 5557 submit_stripe_bio(root, bbio, bio, 5558 bbio->stripes[dev_nr].physical, dev_nr, rw, 5559 async_submit); 5560 dev_nr++; 5561 } 5562 return 0; 5563 } 5564 5565 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 5566 u8 *uuid, u8 *fsid) 5567 { 5568 struct btrfs_device *device; 5569 struct btrfs_fs_devices *cur_devices; 5570 5571 cur_devices = fs_info->fs_devices; 5572 while (cur_devices) { 5573 if (!fsid || 5574 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 5575 device = __find_device(&cur_devices->devices, 5576 devid, uuid); 5577 if (device) 5578 return device; 5579 } 5580 cur_devices = cur_devices->seed; 5581 } 5582 return NULL; 5583 } 5584 5585 static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 5586 u64 devid, u8 *dev_uuid) 5587 { 5588 struct btrfs_device *device; 5589 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 5590 5591 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 5592 if (IS_ERR(device)) 5593 return NULL; 5594 5595 list_add(&device->dev_list, &fs_devices->devices); 5596 device->fs_devices = fs_devices; 5597 fs_devices->num_devices++; 5598 5599 device->missing = 1; 5600 fs_devices->missing_devices++; 5601 5602 return device; 5603 } 5604 5605 /** 5606 * btrfs_alloc_device - allocate struct btrfs_device 5607 * @fs_info: used only for generating a new devid, can be NULL if 5608 * devid is provided (i.e. @devid != NULL). 5609 * @devid: a pointer to devid for this device. If NULL a new devid 5610 * is generated. 5611 * @uuid: a pointer to UUID for this device. If NULL a new UUID 5612 * is generated. 5613 * 5614 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 5615 * on error. Returned struct is not linked onto any lists and can be 5616 * destroyed with kfree() right away. 
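 *
 * For instance, a caller that wants a brand new entry with a generated
 * devid and uuid for @fs_info could do:
 *
 *	dev = btrfs_alloc_device(fs_info, NULL, NULL);
 *	if (IS_ERR(dev))
 *		return PTR_ERR(dev);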
5617 */ 5618 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 5619 const u64 *devid, 5620 const u8 *uuid) 5621 { 5622 struct btrfs_device *dev; 5623 u64 tmp; 5624 5625 if (!devid && !fs_info) { 5626 WARN_ON(1); 5627 return ERR_PTR(-EINVAL); 5628 } 5629 5630 dev = __alloc_device(); 5631 if (IS_ERR(dev)) 5632 return dev; 5633 5634 if (devid) 5635 tmp = *devid; 5636 else { 5637 int ret; 5638 5639 ret = find_next_devid(fs_info, &tmp); 5640 if (ret) { 5641 kfree(dev); 5642 return ERR_PTR(ret); 5643 } 5644 } 5645 dev->devid = tmp; 5646 5647 if (uuid) 5648 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 5649 else 5650 generate_random_uuid(dev->uuid); 5651 5652 dev->work.func = pending_bios_fn; 5653 5654 return dev; 5655 } 5656 5657 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, 5658 struct extent_buffer *leaf, 5659 struct btrfs_chunk *chunk) 5660 { 5661 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 5662 struct map_lookup *map; 5663 struct extent_map *em; 5664 u64 logical; 5665 u64 length; 5666 u64 devid; 5667 u8 uuid[BTRFS_UUID_SIZE]; 5668 int num_stripes; 5669 int ret; 5670 int i; 5671 5672 logical = key->offset; 5673 length = btrfs_chunk_length(leaf, chunk); 5674 5675 read_lock(&map_tree->map_tree.lock); 5676 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 5677 read_unlock(&map_tree->map_tree.lock); 5678 5679 /* already mapped? */ 5680 if (em && em->start <= logical && em->start + em->len > logical) { 5681 free_extent_map(em); 5682 return 0; 5683 } else if (em) { 5684 free_extent_map(em); 5685 } 5686 5687 em = alloc_extent_map(); 5688 if (!em) 5689 return -ENOMEM; 5690 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 5691 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 5692 if (!map) { 5693 free_extent_map(em); 5694 return -ENOMEM; 5695 } 5696 5697 em->bdev = (struct block_device *)map; 5698 em->start = logical; 5699 em->len = length; 5700 em->orig_start = 0; 5701 em->block_start = 0; 5702 em->block_len = em->len; 5703 5704 map->num_stripes = num_stripes; 5705 map->io_width = btrfs_chunk_io_width(leaf, chunk); 5706 map->io_align = btrfs_chunk_io_align(leaf, chunk); 5707 map->sector_size = btrfs_chunk_sector_size(leaf, chunk); 5708 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 5709 map->type = btrfs_chunk_type(leaf, chunk); 5710 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 5711 for (i = 0; i < num_stripes; i++) { 5712 map->stripes[i].physical = 5713 btrfs_stripe_offset_nr(leaf, chunk, i); 5714 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 5715 read_extent_buffer(leaf, uuid, (unsigned long) 5716 btrfs_stripe_dev_uuid_nr(chunk, i), 5717 BTRFS_UUID_SIZE); 5718 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, 5719 uuid, NULL); 5720 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 5721 kfree(map); 5722 free_extent_map(em); 5723 return -EIO; 5724 } 5725 if (!map->stripes[i].dev) { 5726 map->stripes[i].dev = 5727 add_missing_dev(root, devid, uuid); 5728 if (!map->stripes[i].dev) { 5729 kfree(map); 5730 free_extent_map(em); 5731 return -EIO; 5732 } 5733 } 5734 map->stripes[i].dev->in_fs_metadata = 1; 5735 } 5736 5737 write_lock(&map_tree->map_tree.lock); 5738 ret = add_extent_mapping(&map_tree->map_tree, em, 0); 5739 write_unlock(&map_tree->map_tree.lock); 5740 BUG_ON(ret); /* Tree corruption */ 5741 free_extent_map(em); 5742 5743 return 0; 5744 } 5745 5746 static void fill_device_from_item(struct extent_buffer *leaf, 5747 struct btrfs_dev_item *dev_item, 5748 struct 
btrfs_device *device) 5749 { 5750 unsigned long ptr; 5751 5752 device->devid = btrfs_device_id(leaf, dev_item); 5753 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 5754 device->total_bytes = device->disk_total_bytes; 5755 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 5756 device->type = btrfs_device_type(leaf, dev_item); 5757 device->io_align = btrfs_device_io_align(leaf, dev_item); 5758 device->io_width = btrfs_device_io_width(leaf, dev_item); 5759 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 5760 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 5761 device->is_tgtdev_for_dev_replace = 0; 5762 5763 ptr = (unsigned long)btrfs_device_uuid(dev_item); 5764 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 5765 } 5766 5767 static int open_seed_devices(struct btrfs_root *root, u8 *fsid) 5768 { 5769 struct btrfs_fs_devices *fs_devices; 5770 int ret; 5771 5772 BUG_ON(!mutex_is_locked(&uuid_mutex)); 5773 5774 fs_devices = root->fs_info->fs_devices->seed; 5775 while (fs_devices) { 5776 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 5777 ret = 0; 5778 goto out; 5779 } 5780 fs_devices = fs_devices->seed; 5781 } 5782 5783 fs_devices = find_fsid(fsid); 5784 if (!fs_devices) { 5785 ret = -ENOENT; 5786 goto out; 5787 } 5788 5789 fs_devices = clone_fs_devices(fs_devices); 5790 if (IS_ERR(fs_devices)) { 5791 ret = PTR_ERR(fs_devices); 5792 goto out; 5793 } 5794 5795 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 5796 root->fs_info->bdev_holder); 5797 if (ret) { 5798 free_fs_devices(fs_devices); 5799 goto out; 5800 } 5801 5802 if (!fs_devices->seeding) { 5803 __btrfs_close_devices(fs_devices); 5804 free_fs_devices(fs_devices); 5805 ret = -EINVAL; 5806 goto out; 5807 } 5808 5809 fs_devices->seed = root->fs_info->fs_devices->seed; 5810 root->fs_info->fs_devices->seed = fs_devices; 5811 out: 5812 return ret; 5813 } 5814 5815 static int read_one_dev(struct btrfs_root *root, 5816 struct extent_buffer *leaf, 5817 struct btrfs_dev_item *dev_item) 5818 { 5819 struct btrfs_device *device; 5820 u64 devid; 5821 int ret; 5822 u8 fs_uuid[BTRFS_UUID_SIZE]; 5823 u8 dev_uuid[BTRFS_UUID_SIZE]; 5824 5825 devid = btrfs_device_id(leaf, dev_item); 5826 read_extent_buffer(leaf, dev_uuid, 5827 (unsigned long)btrfs_device_uuid(dev_item), 5828 BTRFS_UUID_SIZE); 5829 read_extent_buffer(leaf, fs_uuid, 5830 (unsigned long)btrfs_device_fsid(dev_item), 5831 BTRFS_UUID_SIZE); 5832 5833 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 5834 ret = open_seed_devices(root, fs_uuid); 5835 if (ret && !btrfs_test_opt(root, DEGRADED)) 5836 return ret; 5837 } 5838 5839 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); 5840 if (!device || !device->bdev) { 5841 if (!btrfs_test_opt(root, DEGRADED)) 5842 return -EIO; 5843 5844 if (!device) { 5845 btrfs_warn(root->fs_info, "devid %llu missing", devid); 5846 device = add_missing_dev(root, devid, dev_uuid); 5847 if (!device) 5848 return -ENOMEM; 5849 } else if (!device->missing) { 5850 /* 5851 * this happens when a device that was properly setup 5852 * in the device info lists suddenly goes bad. 
5853			 * device->bdev is NULL, and so we have to set
5854			 * device->missing to one here
5855			 */
5856			root->fs_info->fs_devices->missing_devices++;
5857			device->missing = 1;
5858		}
5859	}
5860
5861	if (device->fs_devices != root->fs_info->fs_devices) {
5862		BUG_ON(device->writeable);
5863		if (device->generation !=
5864		    btrfs_device_generation(leaf, dev_item))
5865			return -EINVAL;
5866	}
5867
5868	fill_device_from_item(leaf, dev_item, device);
5869	device->in_fs_metadata = 1;
5870	if (device->writeable && !device->is_tgtdev_for_dev_replace) {
5871		device->fs_devices->total_rw_bytes += device->total_bytes;
5872		spin_lock(&root->fs_info->free_chunk_lock);
5873		root->fs_info->free_chunk_space += device->total_bytes -
5874			device->bytes_used;
5875		spin_unlock(&root->fs_info->free_chunk_lock);
5876	}
5877	ret = 0;
5878	return ret;
5879 }
5880
5881 int btrfs_read_sys_array(struct btrfs_root *root)
5882 {
5883	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
5884	struct extent_buffer *sb;
5885	struct btrfs_disk_key *disk_key;
5886	struct btrfs_chunk *chunk;
5887	u8 *ptr;
5888	unsigned long sb_ptr;
5889	int ret = 0;
5890	u32 num_stripes;
5891	u32 array_size;
5892	u32 len = 0;
5893	u32 cur;
5894	struct btrfs_key key;
5895
5896	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
5897					  BTRFS_SUPER_INFO_SIZE);
5898	if (!sb)
5899		return -ENOMEM;
5900	btrfs_set_buffer_uptodate(sb);
5901	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
5902	/*
5903	 * The sb extent buffer is artificial and just used to read the system array.
5904	 * btrfs_set_buffer_uptodate() call does not properly mark all its
5905	 * pages up-to-date when the page is larger: extent does not cover the
5906	 * whole page and consequently check_page_uptodate does not find all
5907	 * the page's extents up-to-date (the hole beyond sb),
5908	 * write_extent_buffer then triggers a WARN_ON.
5909	 *
5910	 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
5911	 * but sb spans only this function. Add an explicit SetPageUptodate call
5912	 * to silence the warning e.g. on PowerPC 64.
5913 */ 5914 if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) 5915 SetPageUptodate(sb->pages[0]); 5916 5917 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 5918 array_size = btrfs_super_sys_array_size(super_copy); 5919 5920 ptr = super_copy->sys_chunk_array; 5921 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); 5922 cur = 0; 5923 5924 while (cur < array_size) { 5925 disk_key = (struct btrfs_disk_key *)ptr; 5926 btrfs_disk_key_to_cpu(&key, disk_key); 5927 5928 len = sizeof(*disk_key); ptr += len; 5929 sb_ptr += len; 5930 cur += len; 5931 5932 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 5933 chunk = (struct btrfs_chunk *)sb_ptr; 5934 ret = read_one_chunk(root, &key, sb, chunk); 5935 if (ret) 5936 break; 5937 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 5938 len = btrfs_chunk_item_size(num_stripes); 5939 } else { 5940 ret = -EIO; 5941 break; 5942 } 5943 ptr += len; 5944 sb_ptr += len; 5945 cur += len; 5946 } 5947 free_extent_buffer(sb); 5948 return ret; 5949 } 5950 5951 int btrfs_read_chunk_tree(struct btrfs_root *root) 5952 { 5953 struct btrfs_path *path; 5954 struct extent_buffer *leaf; 5955 struct btrfs_key key; 5956 struct btrfs_key found_key; 5957 int ret; 5958 int slot; 5959 5960 root = root->fs_info->chunk_root; 5961 5962 path = btrfs_alloc_path(); 5963 if (!path) 5964 return -ENOMEM; 5965 5966 mutex_lock(&uuid_mutex); 5967 lock_chunks(root); 5968 5969 /* 5970 * Read all device items, and then all the chunk items. All 5971 * device items are found before any chunk item (their object id 5972 * is smaller than the lowest possible object id for a chunk 5973 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 5974 */ 5975 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 5976 key.offset = 0; 5977 key.type = 0; 5978 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5979 if (ret < 0) 5980 goto error; 5981 while (1) { 5982 leaf = path->nodes[0]; 5983 slot = path->slots[0]; 5984 if (slot >= btrfs_header_nritems(leaf)) { 5985 ret = btrfs_next_leaf(root, path); 5986 if (ret == 0) 5987 continue; 5988 if (ret < 0) 5989 goto error; 5990 break; 5991 } 5992 btrfs_item_key_to_cpu(leaf, &found_key, slot); 5993 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 5994 struct btrfs_dev_item *dev_item; 5995 dev_item = btrfs_item_ptr(leaf, slot, 5996 struct btrfs_dev_item); 5997 ret = read_one_dev(root, leaf, dev_item); 5998 if (ret) 5999 goto error; 6000 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 6001 struct btrfs_chunk *chunk; 6002 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 6003 ret = read_one_chunk(root, &found_key, leaf, chunk); 6004 if (ret) 6005 goto error; 6006 } 6007 path->slots[0]++; 6008 } 6009 ret = 0; 6010 error: 6011 unlock_chunks(root); 6012 mutex_unlock(&uuid_mutex); 6013 6014 btrfs_free_path(path); 6015 return ret; 6016 } 6017 6018 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 6019 { 6020 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6021 struct btrfs_device *device; 6022 6023 mutex_lock(&fs_devices->device_list_mutex); 6024 list_for_each_entry(device, &fs_devices->devices, dev_list) 6025 device->dev_root = fs_info->dev_root; 6026 mutex_unlock(&fs_devices->device_list_mutex); 6027 } 6028 6029 static void __btrfs_reset_dev_stats(struct btrfs_device *dev) 6030 { 6031 int i; 6032 6033 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 6034 btrfs_dev_stat_reset(dev, i); 6035 } 6036 6037 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 6038 { 6039 struct btrfs_key key; 6040 struct btrfs_key found_key; 6041 struct btrfs_root *dev_root = 
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct extent_buffer *eb;
	int slot;
	int ret = 0;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int i;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		int item_size;
		struct btrfs_dev_stats_item *ptr;

		key.objectid = 0;
		key.type = BTRFS_DEV_STATS_KEY;
		key.offset = device->devid;
		ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
		if (ret) {
			__btrfs_reset_dev_stats(device);
			device->dev_stats_valid = 1;
			btrfs_release_path(path);
			continue;
		}
		slot = path->slots[0];
		eb = path->nodes[0];
		btrfs_item_key_to_cpu(eb, &found_key, slot);
		item_size = btrfs_item_size_nr(eb, slot);

		ptr = btrfs_item_ptr(eb, slot,
				     struct btrfs_dev_stats_item);

		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (item_size >= (1 + i) * sizeof(__le64))
				btrfs_dev_stat_set(device, i,
					btrfs_dev_stats_value(eb, ptr, i));
			else
				btrfs_dev_stat_reset(device, i);
		}

		device->dev_stats_valid = 1;
		btrfs_dev_stat_print_on_load(device);
		btrfs_release_path(path);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}

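/*
 * Write the in-memory error counters of one device back to its
 * BTRFS_DEV_STATS_KEY item: an existing item that is too small is deleted
 * and re-created, and a missing item is inserted.
 */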
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_root *dev_root,
				struct btrfs_device *device)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = 0;
	key.type = BTRFS_DEV_STATS_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
			      ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Called from commit_transaction. Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
			struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (!device->dev_stats_valid || !device->dev_stats_dirty)
			continue;

		ret = update_dev_stat_item(trans, dev_root, device);
		if (!ret)
			device->dev_stats_dirty = 0;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	printk_ratelimited_in_rcu(KERN_ERR
		"btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
		rcu_str_deref(dev->name),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
		rcu_str_deref(dev->name),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

int btrfs_get_dev_stats(struct btrfs_root *root,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		printk(KERN_WARNING
		       "btrfs: get dev_stats failed, device not found\n");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		printk(KERN_WARNING
		       "btrfs: get dev_stats failed, not yet valid\n");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_reset(dev, i);
		}
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}

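/*
 * Zero out the btrfs magic in the super block of @device so that the device
 * is no longer detected as part of a btrfs filesystem on subsequent scans.
 */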
int btrfs_scratch_superblock(struct btrfs_device *device)
{
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;

	bh = btrfs_read_dev_super(device->bdev);
	if (!bh)
		return -EINVAL;
	disk_super = (struct btrfs_super_block *)bh->b_data;

	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
	set_buffer_dirty(bh);
	sync_dirty_buffer(bh);
	brelse(bh);

	return 0;
}