1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 #include <linux/sched.h> 19 #include <linux/bio.h> 20 #include <linux/slab.h> 21 #include <linux/buffer_head.h> 22 #include <linux/blkdev.h> 23 #include <linux/random.h> 24 #include <linux/iocontext.h> 25 #include <linux/capability.h> 26 #include <linux/kthread.h> 27 #include <asm/div64.h> 28 #include "compat.h" 29 #include "ctree.h" 30 #include "extent_map.h" 31 #include "disk-io.h" 32 #include "transaction.h" 33 #include "print-tree.h" 34 #include "volumes.h" 35 #include "async-thread.h" 36 #include "check-integrity.h" 37 38 static int init_first_rw_device(struct btrfs_trans_handle *trans, 39 struct btrfs_root *root, 40 struct btrfs_device *device); 41 static int btrfs_relocate_sys_chunks(struct btrfs_root *root); 42 43 static DEFINE_MUTEX(uuid_mutex); 44 static LIST_HEAD(fs_uuids); 45 46 static void lock_chunks(struct btrfs_root *root) 47 { 48 mutex_lock(&root->fs_info->chunk_mutex); 49 } 50 51 static void unlock_chunks(struct btrfs_root *root) 52 { 53 mutex_unlock(&root->fs_info->chunk_mutex); 54 } 55 56 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 57 { 58 struct btrfs_device *device; 59 WARN_ON(fs_devices->opened); 60 while (!list_empty(&fs_devices->devices)) { 61 device = list_entry(fs_devices->devices.next, 62 struct btrfs_device, dev_list); 63 list_del(&device->dev_list); 64 kfree(device->name); 65 kfree(device); 66 } 67 kfree(fs_devices); 68 } 69 70 void btrfs_cleanup_fs_uuids(void) 71 { 72 struct btrfs_fs_devices *fs_devices; 73 74 while (!list_empty(&fs_uuids)) { 75 fs_devices = list_entry(fs_uuids.next, 76 struct btrfs_fs_devices, list); 77 list_del(&fs_devices->list); 78 free_fs_devices(fs_devices); 79 } 80 } 81 82 static noinline struct btrfs_device *__find_device(struct list_head *head, 83 u64 devid, u8 *uuid) 84 { 85 struct btrfs_device *dev; 86 87 list_for_each_entry(dev, head, dev_list) { 88 if (dev->devid == devid && 89 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 90 return dev; 91 } 92 } 93 return NULL; 94 } 95 96 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 97 { 98 struct btrfs_fs_devices *fs_devices; 99 100 list_for_each_entry(fs_devices, &fs_uuids, list) { 101 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 102 return fs_devices; 103 } 104 return NULL; 105 } 106 107 static void requeue_list(struct btrfs_pending_bios *pending_bios, 108 struct bio *head, struct bio *tail) 109 { 110 111 struct bio *old_head; 112 113 old_head = pending_bios->head; 114 pending_bios->head = head; 115 if (pending_bios->tail) 116 tail->bi_next = old_head; 117 else 118 pending_bios->tail = tail; 119 } 120 121 /* 122 * we try to collect pending bios for a device so we don't get a large 123 * number of procs sending bios down to the same device. 
This greatly 124 * improves the scheduler's ability to collect and merge the bios. 125 * 126 * But, it also turns into a long list of bios to process and that is sure 127 * to eventually make the worker thread block. The solution here is to 128 * make some progress and then put this work struct back at the end of 129 * the list if the block device is congested. This way, multiple devices 130 * can make progress from a single worker thread. 131 */ 132 static noinline void run_scheduled_bios(struct btrfs_device *device) 133 { 134 struct bio *pending; 135 struct backing_dev_info *bdi; 136 struct btrfs_fs_info *fs_info; 137 struct btrfs_pending_bios *pending_bios; 138 struct bio *tail; 139 struct bio *cur; 140 int again = 0; 141 unsigned long num_run; 142 unsigned long batch_run = 0; 143 unsigned long limit; 144 unsigned long last_waited = 0; 145 int force_reg = 0; 146 int sync_pending = 0; 147 struct blk_plug plug; 148 149 /* 150 * this function runs all the bios we've collected for 151 * a particular device. We don't want to wander off to 152 * another device without first sending all of these down. 153 * So, set up a plug here and finish it off before we return 154 */ 155 blk_start_plug(&plug); 156 157 bdi = blk_get_backing_dev_info(device->bdev); 158 fs_info = device->dev_root->fs_info; 159 limit = btrfs_async_submit_limit(fs_info); 160 limit = limit * 2 / 3; 161 162 loop: 163 spin_lock(&device->io_lock); 164 165 loop_lock: 166 num_run = 0; 167 168 /* take all the bios off the list at once and process them 169 * later on (without the lock held). But, remember the 170 * tail and other pointers so the bios can be properly reinserted 171 * into the list if we hit congestion 172 */ 173 if (!force_reg && device->pending_sync_bios.head) { 174 pending_bios = &device->pending_sync_bios; 175 force_reg = 1; 176 } else { 177 pending_bios = &device->pending_bios; 178 force_reg = 0; 179 } 180 181 pending = pending_bios->head; 182 tail = pending_bios->tail; 183 WARN_ON(pending && !tail); 184 185 /* 186 * if pending was null this time around, no bios need processing 187 * at all and we can stop. Otherwise it'll loop back up again 188 * and do an additional check so no bios are missed. 189 * 190 * device->running_pending is used to synchronize with the 191 * schedule_bio code.
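* The loop below also alternates between the two pending lists: it prefers the sync list, but caps how many bios it takes from one list in a row (32 regular, 64 sync) so that neither list starves the other.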
192 */ 193 if (device->pending_sync_bios.head == NULL && 194 device->pending_bios.head == NULL) { 195 again = 0; 196 device->running_pending = 0; 197 } else { 198 again = 1; 199 device->running_pending = 1; 200 } 201 202 pending_bios->head = NULL; 203 pending_bios->tail = NULL; 204 205 spin_unlock(&device->io_lock); 206 207 while (pending) { 208 209 rmb(); 210 /* we want to work on both lists, but do more bios on the 211 * sync list than the regular list 212 */ 213 if ((num_run > 32 && 214 pending_bios != &device->pending_sync_bios && 215 device->pending_sync_bios.head) || 216 (num_run > 64 && pending_bios == &device->pending_sync_bios && 217 device->pending_bios.head)) { 218 spin_lock(&device->io_lock); 219 requeue_list(pending_bios, pending, tail); 220 goto loop_lock; 221 } 222 223 cur = pending; 224 pending = pending->bi_next; 225 cur->bi_next = NULL; 226 atomic_dec(&fs_info->nr_async_bios); 227 228 if (atomic_read(&fs_info->nr_async_bios) < limit && 229 waitqueue_active(&fs_info->async_submit_wait)) 230 wake_up(&fs_info->async_submit_wait); 231 232 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 233 234 /* 235 * if we're doing the sync list, record that our 236 * plug has some sync requests on it 237 * 238 * If we're doing the regular list and there are 239 * sync requests sitting around, unplug before 240 * we add more 241 */ 242 if (pending_bios == &device->pending_sync_bios) { 243 sync_pending = 1; 244 } else if (sync_pending) { 245 blk_finish_plug(&plug); 246 blk_start_plug(&plug); 247 sync_pending = 0; 248 } 249 250 btrfsic_submit_bio(cur->bi_rw, cur); 251 num_run++; 252 batch_run++; 253 if (need_resched()) 254 cond_resched(); 255 256 /* 257 * we made progress, there is more work to do and the bdi 258 * is now congested. Back off and let other work structs 259 * run instead 260 */ 261 if (pending && bdi_write_congested(bdi) && batch_run > 8 && 262 fs_info->fs_devices->open_devices > 1) { 263 struct io_context *ioc; 264 265 ioc = current->io_context; 266 267 /* 268 * the main goal here is that we don't want to 269 * block if we're going to be able to submit 270 * more requests without blocking. 271 * 272 * This code does two great things, it pokes into 273 * the elevator code from a filesystem _and_ 274 * it makes assumptions about how batching works. 275 */ 276 if (ioc && ioc->nr_batch_requests > 0 && 277 time_before(jiffies, ioc->last_waited + HZ/50UL) && 278 (last_waited == 0 || 279 ioc->last_waited == last_waited)) { 280 /* 281 * we want to go through our batch of 282 * requests and stop. 
So, we copy out 283 * the ioc->last_waited time and test 284 * against it before looping 285 */ 286 last_waited = ioc->last_waited; 287 if (need_resched()) 288 cond_resched(); 289 continue; 290 } 291 spin_lock(&device->io_lock); 292 requeue_list(pending_bios, pending, tail); 293 device->running_pending = 1; 294 295 spin_unlock(&device->io_lock); 296 btrfs_requeue_work(&device->work); 297 goto done; 298 } 299 /* unplug every 64 requests just for good measure */ 300 if (batch_run % 64 == 0) { 301 blk_finish_plug(&plug); 302 blk_start_plug(&plug); 303 sync_pending = 0; 304 } 305 } 306 307 cond_resched(); 308 if (again) 309 goto loop; 310 311 spin_lock(&device->io_lock); 312 if (device->pending_bios.head || device->pending_sync_bios.head) 313 goto loop_lock; 314 spin_unlock(&device->io_lock); 315 316 done: 317 blk_finish_plug(&plug); 318 } 319 320 static void pending_bios_fn(struct btrfs_work *work) 321 { 322 struct btrfs_device *device; 323 324 device = container_of(work, struct btrfs_device, work); 325 run_scheduled_bios(device); 326 } 327 328 static noinline int device_list_add(const char *path, 329 struct btrfs_super_block *disk_super, 330 u64 devid, struct btrfs_fs_devices **fs_devices_ret) 331 { 332 struct btrfs_device *device; 333 struct btrfs_fs_devices *fs_devices; 334 u64 found_transid = btrfs_super_generation(disk_super); 335 char *name; 336 337 fs_devices = find_fsid(disk_super->fsid); 338 if (!fs_devices) { 339 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 340 if (!fs_devices) 341 return -ENOMEM; 342 INIT_LIST_HEAD(&fs_devices->devices); 343 INIT_LIST_HEAD(&fs_devices->alloc_list); 344 list_add(&fs_devices->list, &fs_uuids); 345 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 346 fs_devices->latest_devid = devid; 347 fs_devices->latest_trans = found_transid; 348 mutex_init(&fs_devices->device_list_mutex); 349 device = NULL; 350 } else { 351 device = __find_device(&fs_devices->devices, devid, 352 disk_super->dev_item.uuid); 353 } 354 if (!device) { 355 if (fs_devices->opened) 356 return -EBUSY; 357 358 device = kzalloc(sizeof(*device), GFP_NOFS); 359 if (!device) { 360 /* we can safely leave the fs_devices entry around */ 361 return -ENOMEM; 362 } 363 device->devid = devid; 364 device->work.func = pending_bios_fn; 365 memcpy(device->uuid, disk_super->dev_item.uuid, 366 BTRFS_UUID_SIZE); 367 spin_lock_init(&device->io_lock); 368 device->name = kstrdup(path, GFP_NOFS); 369 if (!device->name) { 370 kfree(device); 371 return -ENOMEM; 372 } 373 INIT_LIST_HEAD(&device->dev_alloc_list); 374 375 /* init readahead state */ 376 spin_lock_init(&device->reada_lock); 377 device->reada_curr_zone = NULL; 378 atomic_set(&device->reada_in_flight, 0); 379 device->reada_next = 0; 380 INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT); 381 INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT); 382 383 mutex_lock(&fs_devices->device_list_mutex); 384 list_add_rcu(&device->dev_list, &fs_devices->devices); 385 mutex_unlock(&fs_devices->device_list_mutex); 386 387 device->fs_devices = fs_devices; 388 fs_devices->num_devices++; 389 } else if (!device->name || strcmp(device->name, path)) { 390 name = kstrdup(path, GFP_NOFS); 391 if (!name) 392 return -ENOMEM; 393 kfree(device->name); 394 device->name = name; 395 if (device->missing) { 396 fs_devices->missing_devices--; 397 device->missing = 0; 398 } 399 } 400 401 if (found_transid > fs_devices->latest_trans) { 402 fs_devices->latest_devid = devid; 403 fs_devices->latest_trans = found_transid; 404 } 405 *fs_devices_ret 
= fs_devices; 406 return 0; 407 } 408 409 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 410 { 411 struct btrfs_fs_devices *fs_devices; 412 struct btrfs_device *device; 413 struct btrfs_device *orig_dev; 414 415 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 416 if (!fs_devices) 417 return ERR_PTR(-ENOMEM); 418 419 INIT_LIST_HEAD(&fs_devices->devices); 420 INIT_LIST_HEAD(&fs_devices->alloc_list); 421 INIT_LIST_HEAD(&fs_devices->list); 422 mutex_init(&fs_devices->device_list_mutex); 423 fs_devices->latest_devid = orig->latest_devid; 424 fs_devices->latest_trans = orig->latest_trans; 425 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 426 427 /* We have held the volume lock, it is safe to get the devices. */ 428 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 429 device = kzalloc(sizeof(*device), GFP_NOFS); 430 if (!device) 431 goto error; 432 433 device->name = kstrdup(orig_dev->name, GFP_NOFS); 434 if (!device->name) { 435 kfree(device); 436 goto error; 437 } 438 439 device->devid = orig_dev->devid; 440 device->work.func = pending_bios_fn; 441 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 442 spin_lock_init(&device->io_lock); 443 INIT_LIST_HEAD(&device->dev_list); 444 INIT_LIST_HEAD(&device->dev_alloc_list); 445 446 list_add(&device->dev_list, &fs_devices->devices); 447 device->fs_devices = fs_devices; 448 fs_devices->num_devices++; 449 } 450 return fs_devices; 451 error: 452 free_fs_devices(fs_devices); 453 return ERR_PTR(-ENOMEM); 454 } 455 456 void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 457 { 458 struct btrfs_device *device, *next; 459 460 struct block_device *latest_bdev = NULL; 461 u64 latest_devid = 0; 462 u64 latest_transid = 0; 463 464 mutex_lock(&uuid_mutex); 465 again: 466 /* This is the initialized path, it is safe to release the devices. 
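* Any device that never made it into the fs metadata is closed and freed here; while walking the list we also remember the device with the highest generation so that latest_bdev, latest_devid and latest_trans stay valid.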
*/ 467 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 468 if (device->in_fs_metadata) { 469 if (!latest_transid || 470 device->generation > latest_transid) { 471 latest_devid = device->devid; 472 latest_transid = device->generation; 473 latest_bdev = device->bdev; 474 } 475 continue; 476 } 477 478 if (device->bdev) { 479 blkdev_put(device->bdev, device->mode); 480 device->bdev = NULL; 481 fs_devices->open_devices--; 482 } 483 if (device->writeable) { 484 list_del_init(&device->dev_alloc_list); 485 device->writeable = 0; 486 fs_devices->rw_devices--; 487 } 488 list_del_init(&device->dev_list); 489 fs_devices->num_devices--; 490 kfree(device->name); 491 kfree(device); 492 } 493 494 if (fs_devices->seed) { 495 fs_devices = fs_devices->seed; 496 goto again; 497 } 498 499 fs_devices->latest_bdev = latest_bdev; 500 fs_devices->latest_devid = latest_devid; 501 fs_devices->latest_trans = latest_transid; 502 503 mutex_unlock(&uuid_mutex); 504 } 505 506 static void __free_device(struct work_struct *work) 507 { 508 struct btrfs_device *device; 509 510 device = container_of(work, struct btrfs_device, rcu_work); 511 512 if (device->bdev) 513 blkdev_put(device->bdev, device->mode); 514 515 kfree(device->name); 516 kfree(device); 517 } 518 519 static void free_device(struct rcu_head *head) 520 { 521 struct btrfs_device *device; 522 523 device = container_of(head, struct btrfs_device, rcu); 524 525 INIT_WORK(&device->rcu_work, __free_device); 526 schedule_work(&device->rcu_work); 527 } 528 529 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 530 { 531 struct btrfs_device *device; 532 533 if (--fs_devices->opened > 0) 534 return 0; 535 536 mutex_lock(&fs_devices->device_list_mutex); 537 list_for_each_entry(device, &fs_devices->devices, dev_list) { 538 struct btrfs_device *new_device; 539 540 if (device->bdev) 541 fs_devices->open_devices--; 542 543 if (device->writeable) { 544 list_del_init(&device->dev_alloc_list); 545 fs_devices->rw_devices--; 546 } 547 548 if (device->can_discard) 549 fs_devices->num_can_discard--; 550 551 new_device = kmalloc(sizeof(*new_device), GFP_NOFS); 552 BUG_ON(!new_device); /* -ENOMEM */ 553 memcpy(new_device, device, sizeof(*new_device)); 554 new_device->name = kstrdup(device->name, GFP_NOFS); 555 BUG_ON(device->name && !new_device->name); /* -ENOMEM */ 556 new_device->bdev = NULL; 557 new_device->writeable = 0; 558 new_device->in_fs_metadata = 0; 559 new_device->can_discard = 0; 560 list_replace_rcu(&device->dev_list, &new_device->dev_list); 561 562 call_rcu(&device->rcu, free_device); 563 } 564 mutex_unlock(&fs_devices->device_list_mutex); 565 566 WARN_ON(fs_devices->open_devices); 567 WARN_ON(fs_devices->rw_devices); 568 fs_devices->opened = 0; 569 fs_devices->seeding = 0; 570 571 return 0; 572 } 573 574 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 575 { 576 struct btrfs_fs_devices *seed_devices = NULL; 577 int ret; 578 579 mutex_lock(&uuid_mutex); 580 ret = __btrfs_close_devices(fs_devices); 581 if (!fs_devices->opened) { 582 seed_devices = fs_devices->seed; 583 fs_devices->seed = NULL; 584 } 585 mutex_unlock(&uuid_mutex); 586 587 while (seed_devices) { 588 fs_devices = seed_devices; 589 seed_devices = fs_devices->seed; 590 __btrfs_close_devices(fs_devices); 591 free_fs_devices(fs_devices); 592 } 593 return ret; 594 } 595 596 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 597 fmode_t flags, void *holder) 598 { 599 struct request_queue *q; 600 struct block_device *bdev; 601 struct list_head 
*head = &fs_devices->devices; 602 struct btrfs_device *device; 603 struct block_device *latest_bdev = NULL; 604 struct buffer_head *bh; 605 struct btrfs_super_block *disk_super; 606 u64 latest_devid = 0; 607 u64 latest_transid = 0; 608 u64 devid; 609 int seeding = 1; 610 int ret = 0; 611 612 flags |= FMODE_EXCL; 613 614 list_for_each_entry(device, head, dev_list) { 615 if (device->bdev) 616 continue; 617 if (!device->name) 618 continue; 619 620 bdev = blkdev_get_by_path(device->name, flags, holder); 621 if (IS_ERR(bdev)) { 622 printk(KERN_INFO "open %s failed\n", device->name); 623 goto error; 624 } 625 filemap_write_and_wait(bdev->bd_inode->i_mapping); 626 invalidate_bdev(bdev); 627 set_blocksize(bdev, 4096); 628 629 bh = btrfs_read_dev_super(bdev); 630 if (!bh) 631 goto error_close; 632 633 disk_super = (struct btrfs_super_block *)bh->b_data; 634 devid = btrfs_stack_device_id(&disk_super->dev_item); 635 if (devid != device->devid) 636 goto error_brelse; 637 638 if (memcmp(device->uuid, disk_super->dev_item.uuid, 639 BTRFS_UUID_SIZE)) 640 goto error_brelse; 641 642 device->generation = btrfs_super_generation(disk_super); 643 if (!latest_transid || device->generation > latest_transid) { 644 latest_devid = devid; 645 latest_transid = device->generation; 646 latest_bdev = bdev; 647 } 648 649 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 650 device->writeable = 0; 651 } else { 652 device->writeable = !bdev_read_only(bdev); 653 seeding = 0; 654 } 655 656 q = bdev_get_queue(bdev); 657 if (blk_queue_discard(q)) { 658 device->can_discard = 1; 659 fs_devices->num_can_discard++; 660 } 661 662 device->bdev = bdev; 663 device->in_fs_metadata = 0; 664 device->mode = flags; 665 666 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 667 fs_devices->rotating = 1; 668 669 fs_devices->open_devices++; 670 if (device->writeable) { 671 fs_devices->rw_devices++; 672 list_add(&device->dev_alloc_list, 673 &fs_devices->alloc_list); 674 } 675 brelse(bh); 676 continue; 677 678 error_brelse: 679 brelse(bh); 680 error_close: 681 blkdev_put(bdev, flags); 682 error: 683 continue; 684 } 685 if (fs_devices->open_devices == 0) { 686 ret = -EINVAL; 687 goto out; 688 } 689 fs_devices->seeding = seeding; 690 fs_devices->opened = 1; 691 fs_devices->latest_bdev = latest_bdev; 692 fs_devices->latest_devid = latest_devid; 693 fs_devices->latest_trans = latest_transid; 694 fs_devices->total_rw_bytes = 0; 695 out: 696 return ret; 697 } 698 699 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 700 fmode_t flags, void *holder) 701 { 702 int ret; 703 704 mutex_lock(&uuid_mutex); 705 if (fs_devices->opened) { 706 fs_devices->opened++; 707 ret = 0; 708 } else { 709 ret = __btrfs_open_devices(fs_devices, flags, holder); 710 } 711 mutex_unlock(&uuid_mutex); 712 return ret; 713 } 714 715 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 716 struct btrfs_fs_devices **fs_devices_ret) 717 { 718 struct btrfs_super_block *disk_super; 719 struct block_device *bdev; 720 struct buffer_head *bh; 721 int ret; 722 u64 devid; 723 u64 transid; 724 725 flags |= FMODE_EXCL; 726 bdev = blkdev_get_by_path(path, flags, holder); 727 728 if (IS_ERR(bdev)) { 729 ret = PTR_ERR(bdev); 730 goto error; 731 } 732 733 mutex_lock(&uuid_mutex); 734 ret = set_blocksize(bdev, 4096); 735 if (ret) 736 goto error_close; 737 bh = btrfs_read_dev_super(bdev); 738 if (!bh) { 739 ret = -EINVAL; 740 goto error_close; 741 } 742 disk_super = (struct btrfs_super_block *)bh->b_data; 743 devid = 
btrfs_stack_device_id(&disk_super->dev_item); 744 transid = btrfs_super_generation(disk_super); 745 if (disk_super->label[0]) 746 printk(KERN_INFO "device label %s ", disk_super->label); 747 else 748 printk(KERN_INFO "device fsid %pU ", disk_super->fsid); 749 printk(KERN_CONT "devid %llu transid %llu %s\n", 750 (unsigned long long)devid, (unsigned long long)transid, path); 751 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 752 753 brelse(bh); 754 error_close: 755 mutex_unlock(&uuid_mutex); 756 blkdev_put(bdev, flags); 757 error: 758 return ret; 759 } 760 761 /* helper to account the used device space in the range */ 762 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 763 u64 end, u64 *length) 764 { 765 struct btrfs_key key; 766 struct btrfs_root *root = device->dev_root; 767 struct btrfs_dev_extent *dev_extent; 768 struct btrfs_path *path; 769 u64 extent_end; 770 int ret; 771 int slot; 772 struct extent_buffer *l; 773 774 *length = 0; 775 776 if (start >= device->total_bytes) 777 return 0; 778 779 path = btrfs_alloc_path(); 780 if (!path) 781 return -ENOMEM; 782 path->reada = 2; 783 784 key.objectid = device->devid; 785 key.offset = start; 786 key.type = BTRFS_DEV_EXTENT_KEY; 787 788 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 789 if (ret < 0) 790 goto out; 791 if (ret > 0) { 792 ret = btrfs_previous_item(root, path, key.objectid, key.type); 793 if (ret < 0) 794 goto out; 795 } 796 797 while (1) { 798 l = path->nodes[0]; 799 slot = path->slots[0]; 800 if (slot >= btrfs_header_nritems(l)) { 801 ret = btrfs_next_leaf(root, path); 802 if (ret == 0) 803 continue; 804 if (ret < 0) 805 goto out; 806 807 break; 808 } 809 btrfs_item_key_to_cpu(l, &key, slot); 810 811 if (key.objectid < device->devid) 812 goto next; 813 814 if (key.objectid > device->devid) 815 break; 816 817 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 818 goto next; 819 820 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 821 extent_end = key.offset + btrfs_dev_extent_length(l, 822 dev_extent); 823 if (key.offset <= start && extent_end > end) { 824 *length = end - start + 1; 825 break; 826 } else if (key.offset <= start && extent_end > start) 827 *length += extent_end - start; 828 else if (key.offset > start && extent_end <= end) 829 *length += extent_end - key.offset; 830 else if (key.offset > start && key.offset <= end) { 831 *length += end - key.offset + 1; 832 break; 833 } else if (key.offset > end) 834 break; 835 836 next: 837 path->slots[0]++; 838 } 839 ret = 0; 840 out: 841 btrfs_free_path(path); 842 return ret; 843 } 844 845 /* 846 * find_free_dev_extent - find free space in the specified device 847 * @device: the device which we search the free space in 848 * @num_bytes: the size of the free space that we need 849 * @start: store the start of the free space. 850 * @len: the size of the free space. that we find, or the size of the max 851 * free space if we don't find suitable free space 852 * 853 * this uses a pretty simple search, the expectation is that it is 854 * called very infrequently and that a given device has a small number 855 * of extents 856 * 857 * @start is used to store the start of the free space if we find. But if we 858 * don't find suitable free space, it will be used to store the start position 859 * of the max free space. 860 * 861 * @len is used to store the size of the free space that we find. 862 * But if we don't find suitable free space, it is used to store the size of 863 * the max free space. 
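* Returns 0 if a hole of at least @num_bytes was found, -ENOSPC if only a smaller (or no) hole exists (@start and @len then describe the largest hole), or another negative error if the tree search fails.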
864 */ 865 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 866 u64 *start, u64 *len) 867 { 868 struct btrfs_key key; 869 struct btrfs_root *root = device->dev_root; 870 struct btrfs_dev_extent *dev_extent; 871 struct btrfs_path *path; 872 u64 hole_size; 873 u64 max_hole_start; 874 u64 max_hole_size; 875 u64 extent_end; 876 u64 search_start; 877 u64 search_end = device->total_bytes; 878 int ret; 879 int slot; 880 struct extent_buffer *l; 881 882 /* FIXME use last free of some kind */ 883 884 /* we don't want to overwrite the superblock on the drive, 885 * so we make sure to start at an offset of at least 1MB 886 */ 887 search_start = max(root->fs_info->alloc_start, 1024ull * 1024); 888 889 max_hole_start = search_start; 890 max_hole_size = 0; 891 hole_size = 0; 892 893 if (search_start >= search_end) { 894 ret = -ENOSPC; 895 goto error; 896 } 897 898 path = btrfs_alloc_path(); 899 if (!path) { 900 ret = -ENOMEM; 901 goto error; 902 } 903 path->reada = 2; 904 905 key.objectid = device->devid; 906 key.offset = search_start; 907 key.type = BTRFS_DEV_EXTENT_KEY; 908 909 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 910 if (ret < 0) 911 goto out; 912 if (ret > 0) { 913 ret = btrfs_previous_item(root, path, key.objectid, key.type); 914 if (ret < 0) 915 goto out; 916 } 917 918 while (1) { 919 l = path->nodes[0]; 920 slot = path->slots[0]; 921 if (slot >= btrfs_header_nritems(l)) { 922 ret = btrfs_next_leaf(root, path); 923 if (ret == 0) 924 continue; 925 if (ret < 0) 926 goto out; 927 928 break; 929 } 930 btrfs_item_key_to_cpu(l, &key, slot); 931 932 if (key.objectid < device->devid) 933 goto next; 934 935 if (key.objectid > device->devid) 936 break; 937 938 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 939 goto next; 940 941 if (key.offset > search_start) { 942 hole_size = key.offset - search_start; 943 944 if (hole_size > max_hole_size) { 945 max_hole_start = search_start; 946 max_hole_size = hole_size; 947 } 948 949 /* 950 * If this free space is greater than which we need, 951 * it must be the max free space that we have found 952 * until now, so max_hole_start must point to the start 953 * of this free space and the length of this free space 954 * is stored in max_hole_size. Thus, we return 955 * max_hole_start and max_hole_size and go back to the 956 * caller. 957 */ 958 if (hole_size >= num_bytes) { 959 ret = 0; 960 goto out; 961 } 962 } 963 964 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 965 extent_end = key.offset + btrfs_dev_extent_length(l, 966 dev_extent); 967 if (extent_end > search_start) 968 search_start = extent_end; 969 next: 970 path->slots[0]++; 971 cond_resched(); 972 } 973 974 /* 975 * At this point, search_start should be the end of 976 * allocated dev extents, and when shrinking the device, 977 * search_end may be smaller than search_start. 978 */ 979 if (search_end > search_start) 980 hole_size = search_end - search_start; 981 982 if (hole_size > max_hole_size) { 983 max_hole_start = search_start; 984 max_hole_size = hole_size; 985 } 986 987 /* See above. 
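* Report the largest hole we found even when it is smaller than @num_bytes; -ENOSPC tells the caller it was not big enough.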
*/ 988 if (hole_size < num_bytes) 989 ret = -ENOSPC; 990 else 991 ret = 0; 992 993 out: 994 btrfs_free_path(path); 995 error: 996 *start = max_hole_start; 997 if (len) 998 *len = max_hole_size; 999 return ret; 1000 } 1001 1002 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1003 struct btrfs_device *device, 1004 u64 start) 1005 { 1006 int ret; 1007 struct btrfs_path *path; 1008 struct btrfs_root *root = device->dev_root; 1009 struct btrfs_key key; 1010 struct btrfs_key found_key; 1011 struct extent_buffer *leaf = NULL; 1012 struct btrfs_dev_extent *extent = NULL; 1013 1014 path = btrfs_alloc_path(); 1015 if (!path) 1016 return -ENOMEM; 1017 1018 key.objectid = device->devid; 1019 key.offset = start; 1020 key.type = BTRFS_DEV_EXTENT_KEY; 1021 again: 1022 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1023 if (ret > 0) { 1024 ret = btrfs_previous_item(root, path, key.objectid, 1025 BTRFS_DEV_EXTENT_KEY); 1026 if (ret) 1027 goto out; 1028 leaf = path->nodes[0]; 1029 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1030 extent = btrfs_item_ptr(leaf, path->slots[0], 1031 struct btrfs_dev_extent); 1032 BUG_ON(found_key.offset > start || found_key.offset + 1033 btrfs_dev_extent_length(leaf, extent) < start); 1034 key = found_key; 1035 btrfs_release_path(path); 1036 goto again; 1037 } else if (ret == 0) { 1038 leaf = path->nodes[0]; 1039 extent = btrfs_item_ptr(leaf, path->slots[0], 1040 struct btrfs_dev_extent); 1041 } else { 1042 btrfs_error(root->fs_info, ret, "Slot search failed"); 1043 goto out; 1044 } 1045 1046 if (device->bytes_used > 0) { 1047 u64 len = btrfs_dev_extent_length(leaf, extent); 1048 device->bytes_used -= len; 1049 spin_lock(&root->fs_info->free_chunk_lock); 1050 root->fs_info->free_chunk_space += len; 1051 spin_unlock(&root->fs_info->free_chunk_lock); 1052 } 1053 ret = btrfs_del_item(trans, root, path); 1054 if (ret) { 1055 btrfs_error(root->fs_info, ret, 1056 "Failed to remove dev extent item"); 1057 } 1058 out: 1059 btrfs_free_path(path); 1060 return ret; 1061 } 1062 1063 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 1064 struct btrfs_device *device, 1065 u64 chunk_tree, u64 chunk_objectid, 1066 u64 chunk_offset, u64 start, u64 num_bytes) 1067 { 1068 int ret; 1069 struct btrfs_path *path; 1070 struct btrfs_root *root = device->dev_root; 1071 struct btrfs_dev_extent *extent; 1072 struct extent_buffer *leaf; 1073 struct btrfs_key key; 1074 1075 WARN_ON(!device->in_fs_metadata); 1076 path = btrfs_alloc_path(); 1077 if (!path) 1078 return -ENOMEM; 1079 1080 key.objectid = device->devid; 1081 key.offset = start; 1082 key.type = BTRFS_DEV_EXTENT_KEY; 1083 ret = btrfs_insert_empty_item(trans, root, path, &key, 1084 sizeof(*extent)); 1085 if (ret) 1086 goto out; 1087 1088 leaf = path->nodes[0]; 1089 extent = btrfs_item_ptr(leaf, path->slots[0], 1090 struct btrfs_dev_extent); 1091 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); 1092 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); 1093 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 1094 1095 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, 1096 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), 1097 BTRFS_UUID_SIZE); 1098 1099 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 1100 btrfs_mark_buffer_dirty(leaf); 1101 out: 1102 btrfs_free_path(path); 1103 return ret; 1104 } 1105 1106 static noinline int find_next_chunk(struct btrfs_root *root, 1107 u64 objectid, u64 *offset) 1108 { 1109 struct btrfs_path *path; 1110 int 
ret; 1111 struct btrfs_key key; 1112 struct btrfs_chunk *chunk; 1113 struct btrfs_key found_key; 1114 1115 path = btrfs_alloc_path(); 1116 if (!path) 1117 return -ENOMEM; 1118 1119 key.objectid = objectid; 1120 key.offset = (u64)-1; 1121 key.type = BTRFS_CHUNK_ITEM_KEY; 1122 1123 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1124 if (ret < 0) 1125 goto error; 1126 1127 BUG_ON(ret == 0); /* Corruption */ 1128 1129 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); 1130 if (ret) { 1131 *offset = 0; 1132 } else { 1133 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1134 path->slots[0]); 1135 if (found_key.objectid != objectid) 1136 *offset = 0; 1137 else { 1138 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], 1139 struct btrfs_chunk); 1140 *offset = found_key.offset + 1141 btrfs_chunk_length(path->nodes[0], chunk); 1142 } 1143 } 1144 ret = 0; 1145 error: 1146 btrfs_free_path(path); 1147 return ret; 1148 } 1149 1150 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) 1151 { 1152 int ret; 1153 struct btrfs_key key; 1154 struct btrfs_key found_key; 1155 struct btrfs_path *path; 1156 1157 root = root->fs_info->chunk_root; 1158 1159 path = btrfs_alloc_path(); 1160 if (!path) 1161 return -ENOMEM; 1162 1163 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1164 key.type = BTRFS_DEV_ITEM_KEY; 1165 key.offset = (u64)-1; 1166 1167 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1168 if (ret < 0) 1169 goto error; 1170 1171 BUG_ON(ret == 0); /* Corruption */ 1172 1173 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, 1174 BTRFS_DEV_ITEM_KEY); 1175 if (ret) { 1176 *objectid = 1; 1177 } else { 1178 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1179 path->slots[0]); 1180 *objectid = found_key.offset + 1; 1181 } 1182 ret = 0; 1183 error: 1184 btrfs_free_path(path); 1185 return ret; 1186 } 1187 1188 /* 1189 * the device information is stored in the chunk root 1190 * the btrfs_device struct should be fully filled in 1191 */ 1192 int btrfs_add_device(struct btrfs_trans_handle *trans, 1193 struct btrfs_root *root, 1194 struct btrfs_device *device) 1195 { 1196 int ret; 1197 struct btrfs_path *path; 1198 struct btrfs_dev_item *dev_item; 1199 struct extent_buffer *leaf; 1200 struct btrfs_key key; 1201 unsigned long ptr; 1202 1203 root = root->fs_info->chunk_root; 1204 1205 path = btrfs_alloc_path(); 1206 if (!path) 1207 return -ENOMEM; 1208 1209 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1210 key.type = BTRFS_DEV_ITEM_KEY; 1211 key.offset = device->devid; 1212 1213 ret = btrfs_insert_empty_item(trans, root, path, &key, 1214 sizeof(*dev_item)); 1215 if (ret) 1216 goto out; 1217 1218 leaf = path->nodes[0]; 1219 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1220 1221 btrfs_set_device_id(leaf, dev_item, device->devid); 1222 btrfs_set_device_generation(leaf, dev_item, 0); 1223 btrfs_set_device_type(leaf, dev_item, device->type); 1224 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1225 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1226 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1227 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1228 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1229 btrfs_set_device_group(leaf, dev_item, 0); 1230 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1231 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1232 btrfs_set_device_start_offset(leaf, dev_item, 0); 1233 1234 ptr = (unsigned 
long)btrfs_device_uuid(dev_item); 1235 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1236 ptr = (unsigned long)btrfs_device_fsid(dev_item); 1237 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); 1238 btrfs_mark_buffer_dirty(leaf); 1239 1240 ret = 0; 1241 out: 1242 btrfs_free_path(path); 1243 return ret; 1244 } 1245 1246 static int btrfs_rm_dev_item(struct btrfs_root *root, 1247 struct btrfs_device *device) 1248 { 1249 int ret; 1250 struct btrfs_path *path; 1251 struct btrfs_key key; 1252 struct btrfs_trans_handle *trans; 1253 1254 root = root->fs_info->chunk_root; 1255 1256 path = btrfs_alloc_path(); 1257 if (!path) 1258 return -ENOMEM; 1259 1260 trans = btrfs_start_transaction(root, 0); 1261 if (IS_ERR(trans)) { 1262 btrfs_free_path(path); 1263 return PTR_ERR(trans); 1264 } 1265 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1266 key.type = BTRFS_DEV_ITEM_KEY; 1267 key.offset = device->devid; 1268 lock_chunks(root); 1269 1270 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1271 if (ret < 0) 1272 goto out; 1273 1274 if (ret > 0) { 1275 ret = -ENOENT; 1276 goto out; 1277 } 1278 1279 ret = btrfs_del_item(trans, root, path); 1280 if (ret) 1281 goto out; 1282 out: 1283 btrfs_free_path(path); 1284 unlock_chunks(root); 1285 btrfs_commit_transaction(trans, root); 1286 return ret; 1287 } 1288 1289 int btrfs_rm_device(struct btrfs_root *root, char *device_path) 1290 { 1291 struct btrfs_device *device; 1292 struct btrfs_device *next_device; 1293 struct block_device *bdev; 1294 struct buffer_head *bh = NULL; 1295 struct btrfs_super_block *disk_super; 1296 struct btrfs_fs_devices *cur_devices; 1297 u64 all_avail; 1298 u64 devid; 1299 u64 num_devices; 1300 u8 *dev_uuid; 1301 int ret = 0; 1302 bool clear_super = false; 1303 1304 mutex_lock(&uuid_mutex); 1305 1306 all_avail = root->fs_info->avail_data_alloc_bits | 1307 root->fs_info->avail_system_alloc_bits | 1308 root->fs_info->avail_metadata_alloc_bits; 1309 1310 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1311 root->fs_info->fs_devices->num_devices <= 4) { 1312 printk(KERN_ERR "btrfs: unable to go below four devices " 1313 "on raid10\n"); 1314 ret = -EINVAL; 1315 goto out; 1316 } 1317 1318 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1319 root->fs_info->fs_devices->num_devices <= 2) { 1320 printk(KERN_ERR "btrfs: unable to go below two " 1321 "devices on raid1\n"); 1322 ret = -EINVAL; 1323 goto out; 1324 } 1325 1326 if (strcmp(device_path, "missing") == 0) { 1327 struct list_head *devices; 1328 struct btrfs_device *tmp; 1329 1330 device = NULL; 1331 devices = &root->fs_info->fs_devices->devices; 1332 /* 1333 * It is safe to read the devices since the volume_mutex 1334 * is held. 
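* We pick the first device that is present in the fs metadata but has no open bdev, i.e. the missing one.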
1335 */ 1336 list_for_each_entry(tmp, devices, dev_list) { 1337 if (tmp->in_fs_metadata && !tmp->bdev) { 1338 device = tmp; 1339 break; 1340 } 1341 } 1342 bdev = NULL; 1343 bh = NULL; 1344 disk_super = NULL; 1345 if (!device) { 1346 printk(KERN_ERR "btrfs: no missing devices found to " 1347 "remove\n"); 1348 goto out; 1349 } 1350 } else { 1351 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, 1352 root->fs_info->bdev_holder); 1353 if (IS_ERR(bdev)) { 1354 ret = PTR_ERR(bdev); 1355 goto out; 1356 } 1357 1358 set_blocksize(bdev, 4096); 1359 invalidate_bdev(bdev); 1360 bh = btrfs_read_dev_super(bdev); 1361 if (!bh) { 1362 ret = -EINVAL; 1363 goto error_close; 1364 } 1365 disk_super = (struct btrfs_super_block *)bh->b_data; 1366 devid = btrfs_stack_device_id(&disk_super->dev_item); 1367 dev_uuid = disk_super->dev_item.uuid; 1368 device = btrfs_find_device(root, devid, dev_uuid, 1369 disk_super->fsid); 1370 if (!device) { 1371 ret = -ENOENT; 1372 goto error_brelse; 1373 } 1374 } 1375 1376 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1377 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1378 "device\n"); 1379 ret = -EINVAL; 1380 goto error_brelse; 1381 } 1382 1383 if (device->writeable) { 1384 lock_chunks(root); 1385 list_del_init(&device->dev_alloc_list); 1386 unlock_chunks(root); 1387 root->fs_info->fs_devices->rw_devices--; 1388 clear_super = true; 1389 } 1390 1391 ret = btrfs_shrink_device(device, 0); 1392 if (ret) 1393 goto error_undo; 1394 1395 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1396 if (ret) 1397 goto error_undo; 1398 1399 spin_lock(&root->fs_info->free_chunk_lock); 1400 root->fs_info->free_chunk_space = device->total_bytes - 1401 device->bytes_used; 1402 spin_unlock(&root->fs_info->free_chunk_lock); 1403 1404 device->in_fs_metadata = 0; 1405 btrfs_scrub_cancel_dev(root, device); 1406 1407 /* 1408 * the device list mutex makes sure that we don't change 1409 * the device list while someone else is writing out all 1410 * the device supers. 1411 */ 1412 1413 cur_devices = device->fs_devices; 1414 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1415 list_del_rcu(&device->dev_list); 1416 1417 device->fs_devices->num_devices--; 1418 1419 if (device->missing) 1420 root->fs_info->fs_devices->missing_devices--; 1421 1422 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1423 struct btrfs_device, dev_list); 1424 if (device->bdev == root->fs_info->sb->s_bdev) 1425 root->fs_info->sb->s_bdev = next_device->bdev; 1426 if (device->bdev == root->fs_info->fs_devices->latest_bdev) 1427 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1428 1429 if (device->bdev) 1430 device->fs_devices->open_devices--; 1431 1432 call_rcu(&device->rcu, free_device); 1433 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1434 1435 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; 1436 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); 1437 1438 if (cur_devices->open_devices == 0) { 1439 struct btrfs_fs_devices *fs_devices; 1440 fs_devices = root->fs_info->fs_devices; 1441 while (fs_devices) { 1442 if (fs_devices->seed == cur_devices) 1443 break; 1444 fs_devices = fs_devices->seed; 1445 } 1446 fs_devices->seed = cur_devices->seed; 1447 cur_devices->seed = NULL; 1448 lock_chunks(root); 1449 __btrfs_close_devices(cur_devices); 1450 unlock_chunks(root); 1451 free_fs_devices(cur_devices); 1452 } 1453 1454 /* 1455 * at this point, the device is zero sized. 
We want to 1456 * remove it from the devices list and zero out the old super 1457 */ 1458 if (clear_super) { 1459 /* make sure this device isn't detected as part of 1460 * the FS anymore 1461 */ 1462 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 1463 set_buffer_dirty(bh); 1464 sync_dirty_buffer(bh); 1465 } 1466 1467 ret = 0; 1468 1469 error_brelse: 1470 brelse(bh); 1471 error_close: 1472 if (bdev) 1473 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1474 out: 1475 mutex_unlock(&uuid_mutex); 1476 return ret; 1477 error_undo: 1478 if (device->writeable) { 1479 lock_chunks(root); 1480 list_add(&device->dev_alloc_list, 1481 &root->fs_info->fs_devices->alloc_list); 1482 unlock_chunks(root); 1483 root->fs_info->fs_devices->rw_devices++; 1484 } 1485 goto error_brelse; 1486 } 1487 1488 /* 1489 * does all the dirty work required for changing the file system's UUID. 1490 */ 1491 static int btrfs_prepare_sprout(struct btrfs_root *root) 1492 { 1493 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1494 struct btrfs_fs_devices *old_devices; 1495 struct btrfs_fs_devices *seed_devices; 1496 struct btrfs_super_block *disk_super = root->fs_info->super_copy; 1497 struct btrfs_device *device; 1498 u64 super_flags; 1499 1500 BUG_ON(!mutex_is_locked(&uuid_mutex)); 1501 if (!fs_devices->seeding) 1502 return -EINVAL; 1503 1504 seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 1505 if (!seed_devices) 1506 return -ENOMEM; 1507 1508 old_devices = clone_fs_devices(fs_devices); 1509 if (IS_ERR(old_devices)) { 1510 kfree(seed_devices); 1511 return PTR_ERR(old_devices); 1512 } 1513 1514 list_add(&old_devices->list, &fs_uuids); 1515 1516 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 1517 seed_devices->opened = 1; 1518 INIT_LIST_HEAD(&seed_devices->devices); 1519 INIT_LIST_HEAD(&seed_devices->alloc_list); 1520 mutex_init(&seed_devices->device_list_mutex); 1521 1522 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1523 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 1524 synchronize_rcu); 1525 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1526 1527 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 1528 list_for_each_entry(device, &seed_devices->devices, dev_list) { 1529 device->fs_devices = seed_devices; 1530 } 1531 1532 fs_devices->seeding = 0; 1533 fs_devices->num_devices = 0; 1534 fs_devices->open_devices = 0; 1535 fs_devices->seed = seed_devices; 1536 1537 generate_random_uuid(fs_devices->fsid); 1538 memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 1539 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 1540 super_flags = btrfs_super_flags(disk_super) & 1541 ~BTRFS_SUPER_FLAG_SEEDING; 1542 btrfs_set_super_flags(disk_super, super_flags); 1543 1544 return 0; 1545 } 1546 1547 /* 1548 * store the expected generation for seed devices in device items.
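* This walks every dev item in the chunk tree and, for devices that still belong to a seeding fs_devices, copies the in-memory generation into the on-disk item.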
1549 */ 1550 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 1551 struct btrfs_root *root) 1552 { 1553 struct btrfs_path *path; 1554 struct extent_buffer *leaf; 1555 struct btrfs_dev_item *dev_item; 1556 struct btrfs_device *device; 1557 struct btrfs_key key; 1558 u8 fs_uuid[BTRFS_UUID_SIZE]; 1559 u8 dev_uuid[BTRFS_UUID_SIZE]; 1560 u64 devid; 1561 int ret; 1562 1563 path = btrfs_alloc_path(); 1564 if (!path) 1565 return -ENOMEM; 1566 1567 root = root->fs_info->chunk_root; 1568 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1569 key.offset = 0; 1570 key.type = BTRFS_DEV_ITEM_KEY; 1571 1572 while (1) { 1573 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1574 if (ret < 0) 1575 goto error; 1576 1577 leaf = path->nodes[0]; 1578 next_slot: 1579 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1580 ret = btrfs_next_leaf(root, path); 1581 if (ret > 0) 1582 break; 1583 if (ret < 0) 1584 goto error; 1585 leaf = path->nodes[0]; 1586 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1587 btrfs_release_path(path); 1588 continue; 1589 } 1590 1591 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1592 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 1593 key.type != BTRFS_DEV_ITEM_KEY) 1594 break; 1595 1596 dev_item = btrfs_item_ptr(leaf, path->slots[0], 1597 struct btrfs_dev_item); 1598 devid = btrfs_device_id(leaf, dev_item); 1599 read_extent_buffer(leaf, dev_uuid, 1600 (unsigned long)btrfs_device_uuid(dev_item), 1601 BTRFS_UUID_SIZE); 1602 read_extent_buffer(leaf, fs_uuid, 1603 (unsigned long)btrfs_device_fsid(dev_item), 1604 BTRFS_UUID_SIZE); 1605 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1606 BUG_ON(!device); /* Logic error */ 1607 1608 if (device->fs_devices->seeding) { 1609 btrfs_set_device_generation(leaf, dev_item, 1610 device->generation); 1611 btrfs_mark_buffer_dirty(leaf); 1612 } 1613 1614 path->slots[0]++; 1615 goto next_slot; 1616 } 1617 ret = 0; 1618 error: 1619 btrfs_free_path(path); 1620 return ret; 1621 } 1622 1623 int btrfs_init_new_device(struct btrfs_root *root, char *device_path) 1624 { 1625 struct request_queue *q; 1626 struct btrfs_trans_handle *trans; 1627 struct btrfs_device *device; 1628 struct block_device *bdev; 1629 struct list_head *devices; 1630 struct super_block *sb = root->fs_info->sb; 1631 u64 total_bytes; 1632 int seeding_dev = 0; 1633 int ret = 0; 1634 1635 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1636 return -EINVAL; 1637 1638 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 1639 root->fs_info->bdev_holder); 1640 if (IS_ERR(bdev)) 1641 return PTR_ERR(bdev); 1642 1643 if (root->fs_info->fs_devices->seeding) { 1644 seeding_dev = 1; 1645 down_write(&sb->s_umount); 1646 mutex_lock(&uuid_mutex); 1647 } 1648 1649 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1650 1651 devices = &root->fs_info->fs_devices->devices; 1652 /* 1653 * we have the volume lock, so we don't need the extra 1654 * device list mutex while reading the list here. 
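* The device_list_mutex is taken further down, before the new device is actually linked into the list.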
1655 */ 1656 list_for_each_entry(device, devices, dev_list) { 1657 if (device->bdev == bdev) { 1658 ret = -EEXIST; 1659 goto error; 1660 } 1661 } 1662 1663 device = kzalloc(sizeof(*device), GFP_NOFS); 1664 if (!device) { 1665 /* we can safely leave the fs_devices entry around */ 1666 ret = -ENOMEM; 1667 goto error; 1668 } 1669 1670 device->name = kstrdup(device_path, GFP_NOFS); 1671 if (!device->name) { 1672 kfree(device); 1673 ret = -ENOMEM; 1674 goto error; 1675 } 1676 1677 ret = find_next_devid(root, &device->devid); 1678 if (ret) { 1679 kfree(device->name); 1680 kfree(device); 1681 goto error; 1682 } 1683 1684 trans = btrfs_start_transaction(root, 0); 1685 if (IS_ERR(trans)) { 1686 kfree(device->name); 1687 kfree(device); 1688 ret = PTR_ERR(trans); 1689 goto error; 1690 } 1691 1692 lock_chunks(root); 1693 1694 q = bdev_get_queue(bdev); 1695 if (blk_queue_discard(q)) 1696 device->can_discard = 1; 1697 device->writeable = 1; 1698 device->work.func = pending_bios_fn; 1699 generate_random_uuid(device->uuid); 1700 spin_lock_init(&device->io_lock); 1701 device->generation = trans->transid; 1702 device->io_width = root->sectorsize; 1703 device->io_align = root->sectorsize; 1704 device->sector_size = root->sectorsize; 1705 device->total_bytes = i_size_read(bdev->bd_inode); 1706 device->disk_total_bytes = device->total_bytes; 1707 device->dev_root = root->fs_info->dev_root; 1708 device->bdev = bdev; 1709 device->in_fs_metadata = 1; 1710 device->mode = FMODE_EXCL; 1711 set_blocksize(device->bdev, 4096); 1712 1713 if (seeding_dev) { 1714 sb->s_flags &= ~MS_RDONLY; 1715 ret = btrfs_prepare_sprout(root); 1716 BUG_ON(ret); /* -ENOMEM */ 1717 } 1718 1719 device->fs_devices = root->fs_info->fs_devices; 1720 1721 /* 1722 * we don't want write_supers to jump in here with our device 1723 * half setup 1724 */ 1725 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1726 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); 1727 list_add(&device->dev_alloc_list, 1728 &root->fs_info->fs_devices->alloc_list); 1729 root->fs_info->fs_devices->num_devices++; 1730 root->fs_info->fs_devices->open_devices++; 1731 root->fs_info->fs_devices->rw_devices++; 1732 if (device->can_discard) 1733 root->fs_info->fs_devices->num_can_discard++; 1734 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1735 1736 spin_lock(&root->fs_info->free_chunk_lock); 1737 root->fs_info->free_chunk_space += device->total_bytes; 1738 spin_unlock(&root->fs_info->free_chunk_lock); 1739 1740 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1741 root->fs_info->fs_devices->rotating = 1; 1742 1743 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); 1744 btrfs_set_super_total_bytes(root->fs_info->super_copy, 1745 total_bytes + device->total_bytes); 1746 1747 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); 1748 btrfs_set_super_num_devices(root->fs_info->super_copy, 1749 total_bytes + 1); 1750 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1751 1752 if (seeding_dev) { 1753 ret = init_first_rw_device(trans, root, device); 1754 if (ret) 1755 goto error_trans; 1756 ret = btrfs_finish_sprout(trans, root); 1757 if (ret) 1758 goto error_trans; 1759 } else { 1760 ret = btrfs_add_device(trans, root, device); 1761 if (ret) 1762 goto error_trans; 1763 } 1764 1765 /* 1766 * we've got more storage, clear any full flags on the space 1767 * infos 1768 */ 1769 btrfs_clear_space_info_full(root->fs_info); 1770 1771 unlock_chunks(root); 1772 ret = btrfs_commit_transaction(trans, root); 1773 
1774 if (seeding_dev) { 1775 mutex_unlock(&uuid_mutex); 1776 up_write(&sb->s_umount); 1777 1778 if (ret) /* transaction commit */ 1779 return ret; 1780 1781 ret = btrfs_relocate_sys_chunks(root); 1782 if (ret < 0) 1783 btrfs_error(root->fs_info, ret, 1784 "Failed to relocate sys chunks after " 1785 "device initialization. This can be fixed " 1786 "using the \"btrfs balance\" command."); 1787 } 1788 1789 return ret; 1790 1791 error_trans: 1792 unlock_chunks(root); 1793 btrfs_abort_transaction(trans, root, ret); 1794 btrfs_end_transaction(trans, root); 1795 kfree(device->name); 1796 kfree(device); 1797 error: 1798 blkdev_put(bdev, FMODE_EXCL); 1799 if (seeding_dev) { 1800 mutex_unlock(&uuid_mutex); 1801 up_write(&sb->s_umount); 1802 } 1803 return ret; 1804 } 1805 1806 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 1807 struct btrfs_device *device) 1808 { 1809 int ret; 1810 struct btrfs_path *path; 1811 struct btrfs_root *root; 1812 struct btrfs_dev_item *dev_item; 1813 struct extent_buffer *leaf; 1814 struct btrfs_key key; 1815 1816 root = device->dev_root->fs_info->chunk_root; 1817 1818 path = btrfs_alloc_path(); 1819 if (!path) 1820 return -ENOMEM; 1821 1822 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1823 key.type = BTRFS_DEV_ITEM_KEY; 1824 key.offset = device->devid; 1825 1826 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1827 if (ret < 0) 1828 goto out; 1829 1830 if (ret > 0) { 1831 ret = -ENOENT; 1832 goto out; 1833 } 1834 1835 leaf = path->nodes[0]; 1836 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1837 1838 btrfs_set_device_id(leaf, dev_item, device->devid); 1839 btrfs_set_device_type(leaf, dev_item, device->type); 1840 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1841 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1842 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1843 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); 1844 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1845 btrfs_mark_buffer_dirty(leaf); 1846 1847 out: 1848 btrfs_free_path(path); 1849 return ret; 1850 } 1851 1852 static int __btrfs_grow_device(struct btrfs_trans_handle *trans, 1853 struct btrfs_device *device, u64 new_size) 1854 { 1855 struct btrfs_super_block *super_copy = 1856 device->dev_root->fs_info->super_copy; 1857 u64 old_total = btrfs_super_total_bytes(super_copy); 1858 u64 diff = new_size - device->total_bytes; 1859 1860 if (!device->writeable) 1861 return -EACCES; 1862 if (new_size <= device->total_bytes) 1863 return -EINVAL; 1864 1865 btrfs_set_super_total_bytes(super_copy, old_total + diff); 1866 device->fs_devices->total_rw_bytes += diff; 1867 1868 device->total_bytes = new_size; 1869 device->disk_total_bytes = new_size; 1870 btrfs_clear_space_info_full(device->dev_root->fs_info); 1871 1872 return btrfs_update_device(trans, device); 1873 } 1874 1875 int btrfs_grow_device(struct btrfs_trans_handle *trans, 1876 struct btrfs_device *device, u64 new_size) 1877 { 1878 int ret; 1879 lock_chunks(device->dev_root); 1880 ret = __btrfs_grow_device(trans, device, new_size); 1881 unlock_chunks(device->dev_root); 1882 return ret; 1883 } 1884 1885 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 1886 struct btrfs_root *root, 1887 u64 chunk_tree, u64 chunk_objectid, 1888 u64 chunk_offset) 1889 { 1890 int ret; 1891 struct btrfs_path *path; 1892 struct btrfs_key key; 1893 1894 root = root->fs_info->chunk_root; 1895 path = btrfs_alloc_path(); 1896 if (!path) 
1897 return -ENOMEM; 1898 1899 key.objectid = chunk_objectid; 1900 key.offset = chunk_offset; 1901 key.type = BTRFS_CHUNK_ITEM_KEY; 1902 1903 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1904 if (ret < 0) 1905 goto out; 1906 else if (ret > 0) { /* Logic error or corruption */ 1907 btrfs_error(root->fs_info, -ENOENT, 1908 "Failed lookup while freeing chunk."); 1909 ret = -ENOENT; 1910 goto out; 1911 } 1912 1913 ret = btrfs_del_item(trans, root, path); 1914 if (ret < 0) 1915 btrfs_error(root->fs_info, ret, 1916 "Failed to delete chunk item."); 1917 out: 1918 btrfs_free_path(path); 1919 return ret; 1920 } 1921 1922 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1923 chunk_offset) 1924 { 1925 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 1926 struct btrfs_disk_key *disk_key; 1927 struct btrfs_chunk *chunk; 1928 u8 *ptr; 1929 int ret = 0; 1930 u32 num_stripes; 1931 u32 array_size; 1932 u32 len = 0; 1933 u32 cur; 1934 struct btrfs_key key; 1935 1936 array_size = btrfs_super_sys_array_size(super_copy); 1937 1938 ptr = super_copy->sys_chunk_array; 1939 cur = 0; 1940 1941 while (cur < array_size) { 1942 disk_key = (struct btrfs_disk_key *)ptr; 1943 btrfs_disk_key_to_cpu(&key, disk_key); 1944 1945 len = sizeof(*disk_key); 1946 1947 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 1948 chunk = (struct btrfs_chunk *)(ptr + len); 1949 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 1950 len += btrfs_chunk_item_size(num_stripes); 1951 } else { 1952 ret = -EIO; 1953 break; 1954 } 1955 if (key.objectid == chunk_objectid && 1956 key.offset == chunk_offset) { 1957 memmove(ptr, ptr + len, array_size - (cur + len)); 1958 array_size -= len; 1959 btrfs_set_super_sys_array_size(super_copy, array_size); 1960 } else { 1961 ptr += len; 1962 cur += len; 1963 } 1964 } 1965 return ret; 1966 } 1967 1968 static int btrfs_relocate_chunk(struct btrfs_root *root, 1969 u64 chunk_tree, u64 chunk_objectid, 1970 u64 chunk_offset) 1971 { 1972 struct extent_map_tree *em_tree; 1973 struct btrfs_root *extent_root; 1974 struct btrfs_trans_handle *trans; 1975 struct extent_map *em; 1976 struct map_lookup *map; 1977 int ret; 1978 int i; 1979 1980 root = root->fs_info->chunk_root; 1981 extent_root = root->fs_info->extent_root; 1982 em_tree = &root->fs_info->mapping_tree.map_tree; 1983 1984 ret = btrfs_can_relocate(extent_root, chunk_offset); 1985 if (ret) 1986 return -ENOSPC; 1987 1988 /* step one, relocate all the extents inside this chunk */ 1989 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1990 if (ret) 1991 return ret; 1992 1993 trans = btrfs_start_transaction(root, 0); 1994 BUG_ON(IS_ERR(trans)); 1995 1996 lock_chunks(root); 1997 1998 /* 1999 * step two, delete the device extents and the 2000 * chunk tree entries 2001 */ 2002 read_lock(&em_tree->lock); 2003 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 2004 read_unlock(&em_tree->lock); 2005 2006 BUG_ON(!em || em->start > chunk_offset || 2007 em->start + em->len < chunk_offset); 2008 map = (struct map_lookup *)em->bdev; 2009 2010 for (i = 0; i < map->num_stripes; i++) { 2011 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, 2012 map->stripes[i].physical); 2013 BUG_ON(ret); 2014 2015 if (map->stripes[i].dev) { 2016 ret = btrfs_update_device(trans, map->stripes[i].dev); 2017 BUG_ON(ret); 2018 } 2019 } 2020 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, 2021 chunk_offset); 2022 2023 BUG_ON(ret); 2024 2025 trace_btrfs_chunk_free(root, map, chunk_offset, em->len); 2026 2027 if 
(map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2028 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 2029 BUG_ON(ret); 2030 } 2031 2032 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 2033 BUG_ON(ret); 2034 2035 write_lock(&em_tree->lock); 2036 remove_extent_mapping(em_tree, em); 2037 write_unlock(&em_tree->lock); 2038 2039 kfree(map); 2040 em->bdev = NULL; 2041 2042 /* once for the tree */ 2043 free_extent_map(em); 2044 /* once for us */ 2045 free_extent_map(em); 2046 2047 unlock_chunks(root); 2048 btrfs_end_transaction(trans, root); 2049 return 0; 2050 } 2051 2052 static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 2053 { 2054 struct btrfs_root *chunk_root = root->fs_info->chunk_root; 2055 struct btrfs_path *path; 2056 struct extent_buffer *leaf; 2057 struct btrfs_chunk *chunk; 2058 struct btrfs_key key; 2059 struct btrfs_key found_key; 2060 u64 chunk_tree = chunk_root->root_key.objectid; 2061 u64 chunk_type; 2062 bool retried = false; 2063 int failed = 0; 2064 int ret; 2065 2066 path = btrfs_alloc_path(); 2067 if (!path) 2068 return -ENOMEM; 2069 2070 again: 2071 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2072 key.offset = (u64)-1; 2073 key.type = BTRFS_CHUNK_ITEM_KEY; 2074 2075 while (1) { 2076 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2077 if (ret < 0) 2078 goto error; 2079 BUG_ON(ret == 0); /* Corruption */ 2080 2081 ret = btrfs_previous_item(chunk_root, path, key.objectid, 2082 key.type); 2083 if (ret < 0) 2084 goto error; 2085 if (ret > 0) 2086 break; 2087 2088 leaf = path->nodes[0]; 2089 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2090 2091 chunk = btrfs_item_ptr(leaf, path->slots[0], 2092 struct btrfs_chunk); 2093 chunk_type = btrfs_chunk_type(leaf, chunk); 2094 btrfs_release_path(path); 2095 2096 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 2097 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 2098 found_key.objectid, 2099 found_key.offset); 2100 if (ret == -ENOSPC) 2101 failed++; 2102 else if (ret) 2103 BUG(); 2104 } 2105 2106 if (found_key.offset == 0) 2107 break; 2108 key.offset = found_key.offset - 1; 2109 } 2110 ret = 0; 2111 if (failed && !retried) { 2112 failed = 0; 2113 retried = true; 2114 goto again; 2115 } else if (failed && retried) { 2116 WARN_ON(1); 2117 ret = -ENOSPC; 2118 } 2119 error: 2120 btrfs_free_path(path); 2121 return ret; 2122 } 2123 2124 static int insert_balance_item(struct btrfs_root *root, 2125 struct btrfs_balance_control *bctl) 2126 { 2127 struct btrfs_trans_handle *trans; 2128 struct btrfs_balance_item *item; 2129 struct btrfs_disk_balance_args disk_bargs; 2130 struct btrfs_path *path; 2131 struct extent_buffer *leaf; 2132 struct btrfs_key key; 2133 int ret, err; 2134 2135 path = btrfs_alloc_path(); 2136 if (!path) 2137 return -ENOMEM; 2138 2139 trans = btrfs_start_transaction(root, 0); 2140 if (IS_ERR(trans)) { 2141 btrfs_free_path(path); 2142 return PTR_ERR(trans); 2143 } 2144 2145 key.objectid = BTRFS_BALANCE_OBJECTID; 2146 key.type = BTRFS_BALANCE_ITEM_KEY; 2147 key.offset = 0; 2148 2149 ret = btrfs_insert_empty_item(trans, root, path, &key, 2150 sizeof(*item)); 2151 if (ret) 2152 goto out; 2153 2154 leaf = path->nodes[0]; 2155 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 2156 2157 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); 2158 2159 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 2160 btrfs_set_balance_data(leaf, item, &disk_bargs); 2161 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 2162 
btrfs_set_balance_meta(leaf, item, &disk_bargs); 2163 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 2164 btrfs_set_balance_sys(leaf, item, &disk_bargs); 2165 2166 btrfs_set_balance_flags(leaf, item, bctl->flags); 2167 2168 btrfs_mark_buffer_dirty(leaf); 2169 out: 2170 btrfs_free_path(path); 2171 err = btrfs_commit_transaction(trans, root); 2172 if (err && !ret) 2173 ret = err; 2174 return ret; 2175 } 2176 2177 static int del_balance_item(struct btrfs_root *root) 2178 { 2179 struct btrfs_trans_handle *trans; 2180 struct btrfs_path *path; 2181 struct btrfs_key key; 2182 int ret, err; 2183 2184 path = btrfs_alloc_path(); 2185 if (!path) 2186 return -ENOMEM; 2187 2188 trans = btrfs_start_transaction(root, 0); 2189 if (IS_ERR(trans)) { 2190 btrfs_free_path(path); 2191 return PTR_ERR(trans); 2192 } 2193 2194 key.objectid = BTRFS_BALANCE_OBJECTID; 2195 key.type = BTRFS_BALANCE_ITEM_KEY; 2196 key.offset = 0; 2197 2198 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2199 if (ret < 0) 2200 goto out; 2201 if (ret > 0) { 2202 ret = -ENOENT; 2203 goto out; 2204 } 2205 2206 ret = btrfs_del_item(trans, root, path); 2207 out: 2208 btrfs_free_path(path); 2209 err = btrfs_commit_transaction(trans, root); 2210 if (err && !ret) 2211 ret = err; 2212 return ret; 2213 } 2214 2215 /* 2216 * This is a heuristic used to reduce the number of chunks balanced on 2217 * resume after balance was interrupted. 2218 */ 2219 static void update_balance_args(struct btrfs_balance_control *bctl) 2220 { 2221 /* 2222 * Turn on soft mode for chunk types that were being converted. 2223 */ 2224 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 2225 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 2226 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 2227 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 2228 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 2229 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 2230 2231 /* 2232 * Turn on the usage filter if it is not already in use. The idea is 2233 * that chunks that we have already balanced should be 2234 * reasonably full. Don't do it for chunks that are being 2235 * converted - that will keep us from relocating unconverted 2236 * (albeit full) chunks. 2237 */ 2238 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 2239 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 2240 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 2241 bctl->data.usage = 90; 2242 } 2243 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 2244 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 2245 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 2246 bctl->sys.usage = 90; 2247 } 2248 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 2249 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 2250 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 2251 bctl->meta.usage = 90; 2252 } 2253 } 2254 2255 /* 2256 * Should be called with both balance and volume mutexes held to 2257 * serialize other volume operations (add_dev/rm_dev/resize) with 2258 * the restriper. Same goes for unset_balance_control.
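 *
 * A minimal sketch of the expected call pattern (this mirrors what
 * balance_kthread() below does when resuming a balance; illustrative,
 * not a spec):
 *
 *	mutex_lock(&fs_info->volume_mutex);
 *	mutex_lock(&fs_info->balance_mutex);
 *	set_balance_control(bctl);
 *	...
 *	mutex_unlock(&fs_info->balance_mutex);
 *	mutex_unlock(&fs_info->volume_mutex);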
2259 */ 2260 static void set_balance_control(struct btrfs_balance_control *bctl) 2261 { 2262 struct btrfs_fs_info *fs_info = bctl->fs_info; 2263 2264 BUG_ON(fs_info->balance_ctl); 2265 2266 spin_lock(&fs_info->balance_lock); 2267 fs_info->balance_ctl = bctl; 2268 spin_unlock(&fs_info->balance_lock); 2269 } 2270 2271 static void unset_balance_control(struct btrfs_fs_info *fs_info) 2272 { 2273 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2274 2275 BUG_ON(!fs_info->balance_ctl); 2276 2277 spin_lock(&fs_info->balance_lock); 2278 fs_info->balance_ctl = NULL; 2279 spin_unlock(&fs_info->balance_lock); 2280 2281 kfree(bctl); 2282 } 2283 2284 /* 2285 * Balance filters. Return 1 if chunk should be filtered out 2286 * (should not be balanced). 2287 */ 2288 static int chunk_profiles_filter(u64 chunk_type, 2289 struct btrfs_balance_args *bargs) 2290 { 2291 chunk_type = chunk_to_extended(chunk_type) & 2292 BTRFS_EXTENDED_PROFILE_MASK; 2293 2294 if (bargs->profiles & chunk_type) 2295 return 0; 2296 2297 return 1; 2298 } 2299 2300 static u64 div_factor_fine(u64 num, int factor) 2301 { 2302 if (factor <= 0) 2303 return 0; 2304 if (factor >= 100) 2305 return num; 2306 2307 num *= factor; 2308 do_div(num, 100); 2309 return num; 2310 } 2311 2312 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 2313 struct btrfs_balance_args *bargs) 2314 { 2315 struct btrfs_block_group_cache *cache; 2316 u64 chunk_used, user_thresh; 2317 int ret = 1; 2318 2319 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2320 chunk_used = btrfs_block_group_used(&cache->item); 2321 2322 user_thresh = div_factor_fine(cache->key.offset, bargs->usage); 2323 if (chunk_used < user_thresh) 2324 ret = 0; 2325 2326 btrfs_put_block_group(cache); 2327 return ret; 2328 } 2329 2330 static int chunk_devid_filter(struct extent_buffer *leaf, 2331 struct btrfs_chunk *chunk, 2332 struct btrfs_balance_args *bargs) 2333 { 2334 struct btrfs_stripe *stripe; 2335 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 2336 int i; 2337 2338 for (i = 0; i < num_stripes; i++) { 2339 stripe = btrfs_stripe_nr(chunk, i); 2340 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 2341 return 0; 2342 } 2343 2344 return 1; 2345 } 2346 2347 /* [pstart, pend) */ 2348 static int chunk_drange_filter(struct extent_buffer *leaf, 2349 struct btrfs_chunk *chunk, 2350 u64 chunk_offset, 2351 struct btrfs_balance_args *bargs) 2352 { 2353 struct btrfs_stripe *stripe; 2354 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 2355 u64 stripe_offset; 2356 u64 stripe_length; 2357 int factor; 2358 int i; 2359 2360 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 2361 return 0; 2362 2363 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 2364 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) 2365 factor = 2; 2366 else 2367 factor = 1; 2368 factor = num_stripes / factor; 2369 2370 for (i = 0; i < num_stripes; i++) { 2371 stripe = btrfs_stripe_nr(chunk, i); 2372 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 2373 continue; 2374 2375 stripe_offset = btrfs_stripe_offset(leaf, stripe); 2376 stripe_length = btrfs_chunk_length(leaf, chunk); 2377 do_div(stripe_length, factor); 2378 2379 if (stripe_offset < bargs->pend && 2380 stripe_offset + stripe_length > bargs->pstart) 2381 return 0; 2382 } 2383 2384 return 1; 2385 } 2386 2387 /* [vstart, vend) */ 2388 static int chunk_vrange_filter(struct extent_buffer *leaf, 2389 struct btrfs_chunk *chunk, 2390 u64 chunk_offset, 2391 struct btrfs_balance_args *bargs) 2392 { 2393 if 
(chunk_offset < bargs->vend && 2394 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 2395 /* at least part of the chunk is inside this vrange */ 2396 return 0; 2397 2398 return 1; 2399 } 2400 2401 static int chunk_soft_convert_filter(u64 chunk_type, 2402 struct btrfs_balance_args *bargs) 2403 { 2404 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 2405 return 0; 2406 2407 chunk_type = chunk_to_extended(chunk_type) & 2408 BTRFS_EXTENDED_PROFILE_MASK; 2409 2410 if (bargs->target == chunk_type) 2411 return 1; 2412 2413 return 0; 2414 } 2415 2416 static int should_balance_chunk(struct btrfs_root *root, 2417 struct extent_buffer *leaf, 2418 struct btrfs_chunk *chunk, u64 chunk_offset) 2419 { 2420 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; 2421 struct btrfs_balance_args *bargs = NULL; 2422 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 2423 2424 /* type filter */ 2425 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 2426 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 2427 return 0; 2428 } 2429 2430 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 2431 bargs = &bctl->data; 2432 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 2433 bargs = &bctl->sys; 2434 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 2435 bargs = &bctl->meta; 2436 2437 /* profiles filter */ 2438 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 2439 chunk_profiles_filter(chunk_type, bargs)) { 2440 return 0; 2441 } 2442 2443 /* usage filter */ 2444 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 2445 chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { 2446 return 0; 2447 } 2448 2449 /* devid filter */ 2450 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 2451 chunk_devid_filter(leaf, chunk, bargs)) { 2452 return 0; 2453 } 2454 2455 /* drange filter, makes sense only with devid filter */ 2456 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 2457 chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { 2458 return 0; 2459 } 2460 2461 /* vrange filter */ 2462 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 2463 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 2464 return 0; 2465 } 2466 2467 /* soft profile changing mode */ 2468 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 2469 chunk_soft_convert_filter(chunk_type, bargs)) { 2470 return 0; 2471 } 2472 2473 return 1; 2474 } 2475 2476 static u64 div_factor(u64 num, int factor) 2477 { 2478 if (factor == 10) 2479 return num; 2480 num *= factor; 2481 do_div(num, 10); 2482 return num; 2483 } 2484 2485 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 2486 { 2487 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2488 struct btrfs_root *chunk_root = fs_info->chunk_root; 2489 struct btrfs_root *dev_root = fs_info->dev_root; 2490 struct list_head *devices; 2491 struct btrfs_device *device; 2492 u64 old_size; 2493 u64 size_to_free; 2494 struct btrfs_chunk *chunk; 2495 struct btrfs_path *path; 2496 struct btrfs_key key; 2497 struct btrfs_key found_key; 2498 struct btrfs_trans_handle *trans; 2499 struct extent_buffer *leaf; 2500 int slot; 2501 int ret; 2502 int enospc_errors = 0; 2503 bool counting = true; 2504 2505 /* step one make some room on all the devices */ 2506 devices = &fs_info->fs_devices->devices; 2507 list_for_each_entry(device, devices, dev_list) { 2508 old_size = device->total_bytes; 2509 size_to_free = div_factor(old_size, 1); 2510 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 2511 if (!device->writeable || 2512 device->total_bytes - device->bytes_used > size_to_free) 2513 continue; 2514 2515 ret = 
btrfs_shrink_device(device, old_size - size_to_free); 2516 if (ret == -ENOSPC) 2517 break; 2518 BUG_ON(ret); 2519 2520 trans = btrfs_start_transaction(dev_root, 0); 2521 BUG_ON(IS_ERR(trans)); 2522 2523 ret = btrfs_grow_device(trans, device, old_size); 2524 BUG_ON(ret); 2525 2526 btrfs_end_transaction(trans, dev_root); 2527 } 2528 2529 /* step two, relocate all the chunks */ 2530 path = btrfs_alloc_path(); 2531 if (!path) { 2532 ret = -ENOMEM; 2533 goto error; 2534 } 2535 2536 /* zero out stat counters */ 2537 spin_lock(&fs_info->balance_lock); 2538 memset(&bctl->stat, 0, sizeof(bctl->stat)); 2539 spin_unlock(&fs_info->balance_lock); 2540 again: 2541 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2542 key.offset = (u64)-1; 2543 key.type = BTRFS_CHUNK_ITEM_KEY; 2544 2545 while (1) { 2546 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 2547 atomic_read(&fs_info->balance_cancel_req)) { 2548 ret = -ECANCELED; 2549 goto error; 2550 } 2551 2552 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2553 if (ret < 0) 2554 goto error; 2555 2556 /* 2557 * this shouldn't happen, it means the last relocate 2558 * failed 2559 */ 2560 if (ret == 0) 2561 BUG(); /* FIXME break ? */ 2562 2563 ret = btrfs_previous_item(chunk_root, path, 0, 2564 BTRFS_CHUNK_ITEM_KEY); 2565 if (ret) { 2566 ret = 0; 2567 break; 2568 } 2569 2570 leaf = path->nodes[0]; 2571 slot = path->slots[0]; 2572 btrfs_item_key_to_cpu(leaf, &found_key, slot); 2573 2574 if (found_key.objectid != key.objectid) 2575 break; 2576 2577 /* chunk zero is special */ 2578 if (found_key.offset == 0) 2579 break; 2580 2581 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 2582 2583 if (!counting) { 2584 spin_lock(&fs_info->balance_lock); 2585 bctl->stat.considered++; 2586 spin_unlock(&fs_info->balance_lock); 2587 } 2588 2589 ret = should_balance_chunk(chunk_root, leaf, chunk, 2590 found_key.offset); 2591 btrfs_release_path(path); 2592 if (!ret) 2593 goto loop; 2594 2595 if (counting) { 2596 spin_lock(&fs_info->balance_lock); 2597 bctl->stat.expected++; 2598 spin_unlock(&fs_info->balance_lock); 2599 goto loop; 2600 } 2601 2602 ret = btrfs_relocate_chunk(chunk_root, 2603 chunk_root->root_key.objectid, 2604 found_key.objectid, 2605 found_key.offset); 2606 if (ret && ret != -ENOSPC) 2607 goto error; 2608 if (ret == -ENOSPC) { 2609 enospc_errors++; 2610 } else { 2611 spin_lock(&fs_info->balance_lock); 2612 bctl->stat.completed++; 2613 spin_unlock(&fs_info->balance_lock); 2614 } 2615 loop: 2616 key.offset = found_key.offset - 1; 2617 } 2618 2619 if (counting) { 2620 btrfs_release_path(path); 2621 counting = false; 2622 goto again; 2623 } 2624 error: 2625 btrfs_free_path(path); 2626 if (enospc_errors) { 2627 printk(KERN_INFO "btrfs: %d enospc errors during balance\n", 2628 enospc_errors); 2629 if (!ret) 2630 ret = -ENOSPC; 2631 } 2632 2633 return ret; 2634 } 2635 2636 /** 2637 * alloc_profile_is_valid - see if a given profile is valid and reduced 2638 * @flags: profile to validate 2639 * @extended: if true @flags is treated as an extended profile 2640 */ 2641 static int alloc_profile_is_valid(u64 flags, int extended) 2642 { 2643 u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 2644 BTRFS_BLOCK_GROUP_PROFILE_MASK); 2645 2646 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 2647 2648 /* 1) check that all other bits are zeroed */ 2649 if (flags & ~mask) 2650 return 0; 2651 2652 /* 2) see if profile is reduced */ 2653 if (flags == 0) 2654 return !extended; /* "0" is valid for usual profiles */ 2655 2656 /* true if exactly one bit set */ 2657 return (flags & (flags - 1)) == 0; 2658 } 2659 2660 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 2661 { 2662 /* cancel requested || normal exit path */ 2663 return atomic_read(&fs_info->balance_cancel_req) || 2664 (atomic_read(&fs_info->balance_pause_req) == 0 && 2665 atomic_read(&fs_info->balance_cancel_req) == 0); 2666 } 2667 2668 static void __cancel_balance(struct btrfs_fs_info *fs_info) 2669 { 2670 int ret; 2671 2672 unset_balance_control(fs_info); 2673 ret = del_balance_item(fs_info->tree_root); 2674 BUG_ON(ret); 2675 } 2676 2677 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, 2678 struct btrfs_ioctl_balance_args *bargs); 2679 2680 /* 2681 * Should be called with both balance and volume mutexes held 2682 */ 2683 int btrfs_balance(struct btrfs_balance_control *bctl, 2684 struct btrfs_ioctl_balance_args *bargs) 2685 { 2686 struct btrfs_fs_info *fs_info = bctl->fs_info; 2687 u64 allowed; 2688 int mixed = 0; 2689 int ret; 2690 2691 if (btrfs_fs_closing(fs_info) || 2692 atomic_read(&fs_info->balance_pause_req) || 2693 atomic_read(&fs_info->balance_cancel_req)) { 2694 ret = -EINVAL; 2695 goto out; 2696 } 2697 2698 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 2699 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 2700 mixed = 1; 2701 2702 /* 2703 * In case of mixed groups both data and meta should be picked, 2704 * and identical options should be given for both of them. 
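 *
 * For example (illustrative only), on a mixed-bg filesystem a balance
 * request that sets both BTRFS_BALANCE_DATA and BTRFS_BALANCE_METADATA
 * with byte-for-byte identical args passes the check below, while one
 * that sets only BTRFS_BALANCE_DATA is rejected with -EINVAL.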
2705 */ 2706 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 2707 if (mixed && (bctl->flags & allowed)) { 2708 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 2709 !(bctl->flags & BTRFS_BALANCE_METADATA) || 2710 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 2711 printk(KERN_ERR "btrfs: with mixed groups data and " 2712 "metadata balance options must be the same\n"); 2713 ret = -EINVAL; 2714 goto out; 2715 } 2716 } 2717 2718 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 2719 if (fs_info->fs_devices->num_devices == 1) 2720 allowed |= BTRFS_BLOCK_GROUP_DUP; 2721 else if (fs_info->fs_devices->num_devices < 4) 2722 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 2723 else 2724 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 2725 BTRFS_BLOCK_GROUP_RAID10); 2726 2727 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 2728 (!alloc_profile_is_valid(bctl->data.target, 1) || 2729 (bctl->data.target & ~allowed))) { 2730 printk(KERN_ERR "btrfs: unable to start balance with target " 2731 "data profile %llu\n", 2732 (unsigned long long)bctl->data.target); 2733 ret = -EINVAL; 2734 goto out; 2735 } 2736 if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 2737 (!alloc_profile_is_valid(bctl->meta.target, 1) || 2738 (bctl->meta.target & ~allowed))) { 2739 printk(KERN_ERR "btrfs: unable to start balance with target " 2740 "metadata profile %llu\n", 2741 (unsigned long long)bctl->meta.target); 2742 ret = -EINVAL; 2743 goto out; 2744 } 2745 if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 2746 (!alloc_profile_is_valid(bctl->sys.target, 1) || 2747 (bctl->sys.target & ~allowed))) { 2748 printk(KERN_ERR "btrfs: unable to start balance with target " 2749 "system profile %llu\n", 2750 (unsigned long long)bctl->sys.target); 2751 ret = -EINVAL; 2752 goto out; 2753 } 2754 2755 /* allow dup'ed data chunks only in mixed mode */ 2756 if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 2757 (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { 2758 printk(KERN_ERR "btrfs: dup for data is not allowed\n"); 2759 ret = -EINVAL; 2760 goto out; 2761 } 2762 2763 /* allow to reduce meta or sys integrity only if force set */ 2764 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 2765 BTRFS_BLOCK_GROUP_RAID10; 2766 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 2767 (fs_info->avail_system_alloc_bits & allowed) && 2768 !(bctl->sys.target & allowed)) || 2769 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 2770 (fs_info->avail_metadata_alloc_bits & allowed) && 2771 !(bctl->meta.target & allowed))) { 2772 if (bctl->flags & BTRFS_BALANCE_FORCE) { 2773 printk(KERN_INFO "btrfs: force reducing metadata " 2774 "integrity\n"); 2775 } else { 2776 printk(KERN_ERR "btrfs: balance will reduce metadata " 2777 "integrity, use force if you want this\n"); 2778 ret = -EINVAL; 2779 goto out; 2780 } 2781 } 2782 2783 ret = insert_balance_item(fs_info->tree_root, bctl); 2784 if (ret && ret != -EEXIST) 2785 goto out; 2786 2787 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 2788 BUG_ON(ret == -EEXIST); 2789 set_balance_control(bctl); 2790 } else { 2791 BUG_ON(ret != -EEXIST); 2792 spin_lock(&fs_info->balance_lock); 2793 update_balance_args(bctl); 2794 spin_unlock(&fs_info->balance_lock); 2795 } 2796 2797 atomic_inc(&fs_info->balance_running); 2798 mutex_unlock(&fs_info->balance_mutex); 2799 2800 ret = __btrfs_balance(fs_info); 2801 2802 mutex_lock(&fs_info->balance_mutex); 2803 atomic_dec(&fs_info->balance_running); 2804 2805 if (bargs) { 2806 memset(bargs, 0, sizeof(*bargs)); 2807 
update_ioctl_balance_args(fs_info, 0, bargs); 2808 } 2809 2810 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 2811 balance_need_close(fs_info)) { 2812 __cancel_balance(fs_info); 2813 } 2814 2815 wake_up(&fs_info->balance_wait_q); 2816 2817 return ret; 2818 out: 2819 if (bctl->flags & BTRFS_BALANCE_RESUME) 2820 __cancel_balance(fs_info); 2821 else 2822 kfree(bctl); 2823 return ret; 2824 } 2825 2826 static int balance_kthread(void *data) 2827 { 2828 struct btrfs_balance_control *bctl = 2829 (struct btrfs_balance_control *)data; 2830 struct btrfs_fs_info *fs_info = bctl->fs_info; 2831 int ret = 0; 2832 2833 mutex_lock(&fs_info->volume_mutex); 2834 mutex_lock(&fs_info->balance_mutex); 2835 2836 set_balance_control(bctl); 2837 2838 if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { 2839 printk(KERN_INFO "btrfs: force skipping balance\n"); 2840 } else { 2841 printk(KERN_INFO "btrfs: continuing balance\n"); 2842 ret = btrfs_balance(bctl, NULL); 2843 } 2844 2845 mutex_unlock(&fs_info->balance_mutex); 2846 mutex_unlock(&fs_info->volume_mutex); 2847 return ret; 2848 } 2849 2850 int btrfs_recover_balance(struct btrfs_root *tree_root) 2851 { 2852 struct task_struct *tsk; 2853 struct btrfs_balance_control *bctl; 2854 struct btrfs_balance_item *item; 2855 struct btrfs_disk_balance_args disk_bargs; 2856 struct btrfs_path *path; 2857 struct extent_buffer *leaf; 2858 struct btrfs_key key; 2859 int ret; 2860 2861 path = btrfs_alloc_path(); 2862 if (!path) 2863 return -ENOMEM; 2864 2865 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 2866 if (!bctl) { 2867 ret = -ENOMEM; 2868 goto out; 2869 } 2870 2871 key.objectid = BTRFS_BALANCE_OBJECTID; 2872 key.type = BTRFS_BALANCE_ITEM_KEY; 2873 key.offset = 0; 2874 2875 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); 2876 if (ret < 0) 2877 goto out_bctl; 2878 if (ret > 0) { /* ret = -ENOENT; */ 2879 ret = 0; 2880 goto out_bctl; 2881 } 2882 2883 leaf = path->nodes[0]; 2884 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 2885 2886 bctl->fs_info = tree_root->fs_info; 2887 bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME; 2888 2889 btrfs_balance_data(leaf, item, &disk_bargs); 2890 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 2891 btrfs_balance_meta(leaf, item, &disk_bargs); 2892 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 2893 btrfs_balance_sys(leaf, item, &disk_bargs); 2894 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 2895 2896 tsk = kthread_run(balance_kthread, bctl, "btrfs-balance"); 2897 if (IS_ERR(tsk)) 2898 ret = PTR_ERR(tsk); 2899 else 2900 goto out; 2901 2902 out_bctl: 2903 kfree(bctl); 2904 out: 2905 btrfs_free_path(path); 2906 return ret; 2907 } 2908 2909 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 2910 { 2911 int ret = 0; 2912 2913 mutex_lock(&fs_info->balance_mutex); 2914 if (!fs_info->balance_ctl) { 2915 mutex_unlock(&fs_info->balance_mutex); 2916 return -ENOTCONN; 2917 } 2918 2919 if (atomic_read(&fs_info->balance_running)) { 2920 atomic_inc(&fs_info->balance_pause_req); 2921 mutex_unlock(&fs_info->balance_mutex); 2922 2923 wait_event(fs_info->balance_wait_q, 2924 atomic_read(&fs_info->balance_running) == 0); 2925 2926 mutex_lock(&fs_info->balance_mutex); 2927 /* we are good with balance_ctl ripped off from under us */ 2928 BUG_ON(atomic_read(&fs_info->balance_running)); 2929 atomic_dec(&fs_info->balance_pause_req); 2930 } else { 2931 ret = -ENOTCONN; 2932 } 2933 2934 mutex_unlock(&fs_info->balance_mutex); 2935 return ret; 2936 } 2937 2938 int 
btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 2939 { 2940 mutex_lock(&fs_info->balance_mutex); 2941 if (!fs_info->balance_ctl) { 2942 mutex_unlock(&fs_info->balance_mutex); 2943 return -ENOTCONN; 2944 } 2945 2946 atomic_inc(&fs_info->balance_cancel_req); 2947 /* 2948 * if we are running just wait and return, balance item is 2949 * deleted in btrfs_balance in this case 2950 */ 2951 if (atomic_read(&fs_info->balance_running)) { 2952 mutex_unlock(&fs_info->balance_mutex); 2953 wait_event(fs_info->balance_wait_q, 2954 atomic_read(&fs_info->balance_running) == 0); 2955 mutex_lock(&fs_info->balance_mutex); 2956 } else { 2957 /* __cancel_balance needs volume_mutex */ 2958 mutex_unlock(&fs_info->balance_mutex); 2959 mutex_lock(&fs_info->volume_mutex); 2960 mutex_lock(&fs_info->balance_mutex); 2961 2962 if (fs_info->balance_ctl) 2963 __cancel_balance(fs_info); 2964 2965 mutex_unlock(&fs_info->volume_mutex); 2966 } 2967 2968 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); 2969 atomic_dec(&fs_info->balance_cancel_req); 2970 mutex_unlock(&fs_info->balance_mutex); 2971 return 0; 2972 } 2973 2974 /* 2975 * shrinking a device means finding all of the device extents past 2976 * the new size, and then following the back refs to the chunks. 2977 * The chunk relocation code actually frees the device extent 2978 */ 2979 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 2980 { 2981 struct btrfs_trans_handle *trans; 2982 struct btrfs_root *root = device->dev_root; 2983 struct btrfs_dev_extent *dev_extent = NULL; 2984 struct btrfs_path *path; 2985 u64 length; 2986 u64 chunk_tree; 2987 u64 chunk_objectid; 2988 u64 chunk_offset; 2989 int ret; 2990 int slot; 2991 int failed = 0; 2992 bool retried = false; 2993 struct extent_buffer *l; 2994 struct btrfs_key key; 2995 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 2996 u64 old_total = btrfs_super_total_bytes(super_copy); 2997 u64 old_size = device->total_bytes; 2998 u64 diff = device->total_bytes - new_size; 2999 3000 if (new_size >= device->total_bytes) 3001 return -EINVAL; 3002 3003 path = btrfs_alloc_path(); 3004 if (!path) 3005 return -ENOMEM; 3006 3007 path->reada = 2; 3008 3009 lock_chunks(root); 3010 3011 device->total_bytes = new_size; 3012 if (device->writeable) { 3013 device->fs_devices->total_rw_bytes -= diff; 3014 spin_lock(&root->fs_info->free_chunk_lock); 3015 root->fs_info->free_chunk_space -= diff; 3016 spin_unlock(&root->fs_info->free_chunk_lock); 3017 } 3018 unlock_chunks(root); 3019 3020 again: 3021 key.objectid = device->devid; 3022 key.offset = (u64)-1; 3023 key.type = BTRFS_DEV_EXTENT_KEY; 3024 3025 do { 3026 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3027 if (ret < 0) 3028 goto done; 3029 3030 ret = btrfs_previous_item(root, path, 0, key.type); 3031 if (ret < 0) 3032 goto done; 3033 if (ret) { 3034 ret = 0; 3035 btrfs_release_path(path); 3036 break; 3037 } 3038 3039 l = path->nodes[0]; 3040 slot = path->slots[0]; 3041 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 3042 3043 if (key.objectid != device->devid) { 3044 btrfs_release_path(path); 3045 break; 3046 } 3047 3048 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 3049 length = btrfs_dev_extent_length(l, dev_extent); 3050 3051 if (key.offset + length <= new_size) { 3052 btrfs_release_path(path); 3053 break; 3054 } 3055 3056 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 3057 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 3058 chunk_offset = btrfs_dev_extent_chunk_offset(l, 
dev_extent); 3059 btrfs_release_path(path); 3060 3061 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 3062 chunk_offset); 3063 if (ret && ret != -ENOSPC) 3064 goto done; 3065 if (ret == -ENOSPC) 3066 failed++; 3067 } while (key.offset-- > 0); 3068 3069 if (failed && !retried) { 3070 failed = 0; 3071 retried = true; 3072 goto again; 3073 } else if (failed && retried) { 3074 ret = -ENOSPC; 3075 lock_chunks(root); 3076 3077 device->total_bytes = old_size; 3078 if (device->writeable) 3079 device->fs_devices->total_rw_bytes += diff; 3080 spin_lock(&root->fs_info->free_chunk_lock); 3081 root->fs_info->free_chunk_space += diff; 3082 spin_unlock(&root->fs_info->free_chunk_lock); 3083 unlock_chunks(root); 3084 goto done; 3085 } 3086 3087 /* Shrinking succeeded, else we would be at "done". */ 3088 trans = btrfs_start_transaction(root, 0); 3089 if (IS_ERR(trans)) { 3090 ret = PTR_ERR(trans); 3091 goto done; 3092 } 3093 3094 lock_chunks(root); 3095 3096 device->disk_total_bytes = new_size; 3097 /* Now btrfs_update_device() will change the on-disk size. */ 3098 ret = btrfs_update_device(trans, device); 3099 if (ret) { 3100 unlock_chunks(root); 3101 btrfs_end_transaction(trans, root); 3102 goto done; 3103 } 3104 WARN_ON(diff > old_total); 3105 btrfs_set_super_total_bytes(super_copy, old_total - diff); 3106 unlock_chunks(root); 3107 btrfs_end_transaction(trans, root); 3108 done: 3109 btrfs_free_path(path); 3110 return ret; 3111 } 3112 3113 static int btrfs_add_system_chunk(struct btrfs_root *root, 3114 struct btrfs_key *key, 3115 struct btrfs_chunk *chunk, int item_size) 3116 { 3117 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 3118 struct btrfs_disk_key disk_key; 3119 u32 array_size; 3120 u8 *ptr; 3121 3122 array_size = btrfs_super_sys_array_size(super_copy); 3123 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 3124 return -EFBIG; 3125 3126 ptr = super_copy->sys_chunk_array + array_size; 3127 btrfs_cpu_key_to_disk(&disk_key, key); 3128 memcpy(ptr, &disk_key, sizeof(disk_key)); 3129 ptr += sizeof(disk_key); 3130 memcpy(ptr, chunk, item_size); 3131 item_size += sizeof(disk_key); 3132 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 3133 return 0; 3134 } 3135 3136 /* 3137 * sort the devices in descending order by max_avail, total_avail 3138 */ 3139 static int btrfs_cmp_device_info(const void *a, const void *b) 3140 { 3141 const struct btrfs_device_info *di_a = a; 3142 const struct btrfs_device_info *di_b = b; 3143 3144 if (di_a->max_avail > di_b->max_avail) 3145 return -1; 3146 if (di_a->max_avail < di_b->max_avail) 3147 return 1; 3148 if (di_a->total_avail > di_b->total_avail) 3149 return -1; 3150 if (di_a->total_avail < di_b->total_avail) 3151 return 1; 3152 return 0; 3153 } 3154 3155 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3156 struct btrfs_root *extent_root, 3157 struct map_lookup **map_ret, 3158 u64 *num_bytes_out, u64 *stripe_size_out, 3159 u64 start, u64 type) 3160 { 3161 struct btrfs_fs_info *info = extent_root->fs_info; 3162 struct btrfs_fs_devices *fs_devices = info->fs_devices; 3163 struct list_head *cur; 3164 struct map_lookup *map = NULL; 3165 struct extent_map_tree *em_tree; 3166 struct extent_map *em; 3167 struct btrfs_device_info *devices_info = NULL; 3168 u64 total_avail; 3169 int num_stripes; /* total number of stripes to allocate */ 3170 int sub_stripes; /* sub_stripes info for map */ 3171 int dev_stripes; /* stripes per dev */ 3172 int devs_max; /* max devs to use */ 3173 int devs_min; /* min devs 
needed */ 3174 int devs_increment; /* ndevs has to be a multiple of this */ 3175 int ncopies; /* how many copies of the data there are */ 3176 int ret; 3177 u64 max_stripe_size; 3178 u64 max_chunk_size; 3179 u64 stripe_size; 3180 u64 num_bytes; 3181 int ndevs; 3182 int i; 3183 int j; 3184 3185 BUG_ON(!alloc_profile_is_valid(type, 0)); 3186 3187 if (list_empty(&fs_devices->alloc_list)) 3188 return -ENOSPC; 3189 3190 sub_stripes = 1; 3191 dev_stripes = 1; 3192 devs_increment = 1; 3193 ncopies = 1; 3194 devs_max = 0; /* 0 == as many as possible */ 3195 devs_min = 1; 3196 3197 /* 3198 * define the properties of each RAID type. 3199 * FIXME: move this to a global table and use it in all RAID 3200 * calculation code 3201 */ 3202 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 3203 dev_stripes = 2; 3204 ncopies = 2; 3205 devs_max = 1; 3206 } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 3207 devs_min = 2; 3208 } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 3209 devs_increment = 2; 3210 ncopies = 2; 3211 devs_max = 2; 3212 devs_min = 2; 3213 } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 3214 sub_stripes = 2; 3215 devs_increment = 2; 3216 ncopies = 2; 3217 devs_min = 4; 3218 } else { 3219 devs_max = 1; 3220 } 3221 3222 if (type & BTRFS_BLOCK_GROUP_DATA) { 3223 max_stripe_size = 1024 * 1024 * 1024; 3224 max_chunk_size = 10 * max_stripe_size; 3225 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 3226 /* for larger filesystems, use larger metadata chunks */ 3227 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) 3228 max_stripe_size = 1024 * 1024 * 1024; 3229 else 3230 max_stripe_size = 256 * 1024 * 1024; 3231 max_chunk_size = max_stripe_size; 3232 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 3233 max_stripe_size = 32 * 1024 * 1024; 3234 max_chunk_size = 2 * max_stripe_size; 3235 } else { 3236 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", 3237 type); 3238 BUG_ON(1); 3239 } 3240 3241 /* we don't want a chunk larger than 10% of writeable space */ 3242 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 3243 max_chunk_size); 3244 3245 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, 3246 GFP_NOFS); 3247 if (!devices_info) 3248 return -ENOMEM; 3249 3250 cur = fs_devices->alloc_list.next; 3251 3252 /* 3253 * in the first pass through the devices list, we gather information 3254 * about the available holes on each device. 3255 */ 3256 ndevs = 0; 3257 while (cur != &fs_devices->alloc_list) { 3258 struct btrfs_device *device; 3259 u64 max_avail; 3260 u64 dev_offset; 3261 3262 device = list_entry(cur, struct btrfs_device, dev_alloc_list); 3263 3264 cur = cur->next; 3265 3266 if (!device->writeable) { 3267 printk(KERN_ERR 3268 "btrfs: read-only device in alloc_list\n"); 3269 WARN_ON(1); 3270 continue; 3271 } 3272 3273 if (!device->in_fs_metadata) 3274 continue; 3275 3276 if (device->total_bytes > device->bytes_used) 3277 total_avail = device->total_bytes - device->bytes_used; 3278 else 3279 total_avail = 0; 3280 3281 /* If there is no space on this device, skip it.
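 * (total_avail is only a coarse per-device estimate; the actual
 * contiguous hole a stripe could land in is measured by the
 * find_free_dev_extent() call just below.)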
*/ 3282 if (total_avail == 0) 3283 continue; 3284 3285 ret = find_free_dev_extent(device, 3286 max_stripe_size * dev_stripes, 3287 &dev_offset, &max_avail); 3288 if (ret && ret != -ENOSPC) 3289 goto error; 3290 3291 if (ret == 0) 3292 max_avail = max_stripe_size * dev_stripes; 3293 3294 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) 3295 continue; 3296 3297 devices_info[ndevs].dev_offset = dev_offset; 3298 devices_info[ndevs].max_avail = max_avail; 3299 devices_info[ndevs].total_avail = total_avail; 3300 devices_info[ndevs].dev = device; 3301 ++ndevs; 3302 } 3303 3304 /* 3305 * now sort the devices by hole size / available space 3306 */ 3307 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 3308 btrfs_cmp_device_info, NULL); 3309 3310 /* round down to number of usable stripes */ 3311 ndevs -= ndevs % devs_increment; 3312 3313 if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { 3314 ret = -ENOSPC; 3315 goto error; 3316 } 3317 3318 if (devs_max && ndevs > devs_max) 3319 ndevs = devs_max; 3320 /* 3321 * the primary goal is to maximize the number of stripes, so use as many 3322 * devices as possible, even if the stripes are not maximum sized. 3323 */ 3324 stripe_size = devices_info[ndevs-1].max_avail; 3325 num_stripes = ndevs * dev_stripes; 3326 3327 if (stripe_size * ndevs > max_chunk_size * ncopies) { 3328 stripe_size = max_chunk_size * ncopies; 3329 do_div(stripe_size, ndevs); 3330 } 3331 3332 do_div(stripe_size, dev_stripes); 3333 3334 /* align to BTRFS_STRIPE_LEN */ 3335 do_div(stripe_size, BTRFS_STRIPE_LEN); 3336 stripe_size *= BTRFS_STRIPE_LEN; 3337 3338 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3339 if (!map) { 3340 ret = -ENOMEM; 3341 goto error; 3342 } 3343 map->num_stripes = num_stripes; 3344 3345 for (i = 0; i < ndevs; ++i) { 3346 for (j = 0; j < dev_stripes; ++j) { 3347 int s = i * dev_stripes + j; 3348 map->stripes[s].dev = devices_info[i].dev; 3349 map->stripes[s].physical = devices_info[i].dev_offset + 3350 j * stripe_size; 3351 } 3352 } 3353 map->sector_size = extent_root->sectorsize; 3354 map->stripe_len = BTRFS_STRIPE_LEN; 3355 map->io_align = BTRFS_STRIPE_LEN; 3356 map->io_width = BTRFS_STRIPE_LEN; 3357 map->type = type; 3358 map->sub_stripes = sub_stripes; 3359 3360 *map_ret = map; 3361 num_bytes = stripe_size * (num_stripes / ncopies); 3362 3363 *stripe_size_out = stripe_size; 3364 *num_bytes_out = num_bytes; 3365 3366 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); 3367 3368 em = alloc_extent_map(); 3369 if (!em) { 3370 ret = -ENOMEM; 3371 goto error; 3372 } 3373 em->bdev = (struct block_device *)map; 3374 em->start = start; 3375 em->len = num_bytes; 3376 em->block_start = 0; 3377 em->block_len = em->len; 3378 3379 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 3380 write_lock(&em_tree->lock); 3381 ret = add_extent_mapping(em_tree, em); 3382 write_unlock(&em_tree->lock); 3383 free_extent_map(em); 3384 if (ret) 3385 goto error; 3386 3387 ret = btrfs_make_block_group(trans, extent_root, 0, type, 3388 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 3389 start, num_bytes); 3390 if (ret) 3391 goto error; 3392 3393 for (i = 0; i < map->num_stripes; ++i) { 3394 struct btrfs_device *device; 3395 u64 dev_offset; 3396 3397 device = map->stripes[i].dev; 3398 dev_offset = map->stripes[i].physical; 3399 3400 ret = btrfs_alloc_dev_extent(trans, device, 3401 info->chunk_root->root_key.objectid, 3402 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 3403 start, dev_offset, stripe_size); 3404 if (ret) { 3405 btrfs_abort_transaction(trans, extent_root, ret); 
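/* the transaction is already aborted; the error label below frees map and devices_info */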
3406 goto error; 3407 } 3408 } 3409 3410 kfree(devices_info); 3411 return 0; 3412 3413 error: 3414 kfree(map); 3415 kfree(devices_info); 3416 return ret; 3417 } 3418 3419 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 3420 struct btrfs_root *extent_root, 3421 struct map_lookup *map, u64 chunk_offset, 3422 u64 chunk_size, u64 stripe_size) 3423 { 3424 u64 dev_offset; 3425 struct btrfs_key key; 3426 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 3427 struct btrfs_device *device; 3428 struct btrfs_chunk *chunk; 3429 struct btrfs_stripe *stripe; 3430 size_t item_size = btrfs_chunk_item_size(map->num_stripes); 3431 int index = 0; 3432 int ret; 3433 3434 chunk = kzalloc(item_size, GFP_NOFS); 3435 if (!chunk) 3436 return -ENOMEM; 3437 3438 index = 0; 3439 while (index < map->num_stripes) { 3440 device = map->stripes[index].dev; 3441 device->bytes_used += stripe_size; 3442 ret = btrfs_update_device(trans, device); 3443 if (ret) 3444 goto out_free; 3445 index++; 3446 } 3447 3448 spin_lock(&extent_root->fs_info->free_chunk_lock); 3449 extent_root->fs_info->free_chunk_space -= (stripe_size * 3450 map->num_stripes); 3451 spin_unlock(&extent_root->fs_info->free_chunk_lock); 3452 3453 index = 0; 3454 stripe = &chunk->stripe; 3455 while (index < map->num_stripes) { 3456 device = map->stripes[index].dev; 3457 dev_offset = map->stripes[index].physical; 3458 3459 btrfs_set_stack_stripe_devid(stripe, device->devid); 3460 btrfs_set_stack_stripe_offset(stripe, dev_offset); 3461 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 3462 stripe++; 3463 index++; 3464 } 3465 3466 btrfs_set_stack_chunk_length(chunk, chunk_size); 3467 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 3468 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 3469 btrfs_set_stack_chunk_type(chunk, map->type); 3470 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 3471 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 3472 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 3473 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); 3474 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 3475 3476 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3477 key.type = BTRFS_CHUNK_ITEM_KEY; 3478 key.offset = chunk_offset; 3479 3480 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 3481 3482 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3483 /* 3484 * TODO: Cleanup of inserted chunk root in case of 3485 * failure. 3486 */ 3487 ret = btrfs_add_system_chunk(chunk_root, &key, chunk, 3488 item_size); 3489 } 3490 3491 out_free: 3492 kfree(chunk); 3493 return ret; 3494 } 3495 3496 /* 3497 * Chunk allocation falls into two parts. The first part does the work 3498 * that makes the newly allocated chunk usable, but does not perform any 3499 * operation that modifies the chunk tree. The second part does the work 3500 * that requires modifying the chunk tree. This division is important for 3501 * the bootstrap process of adding storage to a seed btrfs.
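 *
 * A rough sketch of the two phases as driven by btrfs_alloc_chunk()
 * below (a reading of this file, not a spec):
 *
 *	__btrfs_alloc_chunk()	- pick devices, build the map_lookup,
 *				  insert the extent mapping, create the
 *				  block group and the per-device extents
 *	__finish_chunk_alloc()	- update the dev items, insert the chunk
 *				  item and, for system chunks, append it
 *				  to the superblock sys_chunk_array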
3502 */ 3503 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3504 struct btrfs_root *extent_root, u64 type) 3505 { 3506 u64 chunk_offset; 3507 u64 chunk_size; 3508 u64 stripe_size; 3509 struct map_lookup *map; 3510 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 3511 int ret; 3512 3513 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, 3514 &chunk_offset); 3515 if (ret) 3516 return ret; 3517 3518 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 3519 &stripe_size, chunk_offset, type); 3520 if (ret) 3521 return ret; 3522 3523 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 3524 chunk_size, stripe_size); 3525 if (ret) 3526 return ret; 3527 return 0; 3528 } 3529 3530 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 3531 struct btrfs_root *root, 3532 struct btrfs_device *device) 3533 { 3534 u64 chunk_offset; 3535 u64 sys_chunk_offset; 3536 u64 chunk_size; 3537 u64 sys_chunk_size; 3538 u64 stripe_size; 3539 u64 sys_stripe_size; 3540 u64 alloc_profile; 3541 struct map_lookup *map; 3542 struct map_lookup *sys_map; 3543 struct btrfs_fs_info *fs_info = root->fs_info; 3544 struct btrfs_root *extent_root = fs_info->extent_root; 3545 int ret; 3546 3547 ret = find_next_chunk(fs_info->chunk_root, 3548 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); 3549 if (ret) 3550 return ret; 3551 3552 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 3553 fs_info->avail_metadata_alloc_bits; 3554 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 3555 3556 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 3557 &stripe_size, chunk_offset, alloc_profile); 3558 if (ret) 3559 return ret; 3560 3561 sys_chunk_offset = chunk_offset + chunk_size; 3562 3563 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 3564 fs_info->avail_system_alloc_bits; 3565 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 3566 3567 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 3568 &sys_chunk_size, &sys_stripe_size, 3569 sys_chunk_offset, alloc_profile); 3570 if (ret) 3571 goto abort; 3572 3573 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 3574 if (ret) 3575 goto abort; 3576 3577 /* 3578 * Modifying the chunk tree requires allocating new blocks from both 3579 * the system block group and the metadata block group. So we can 3580 * only perform operations that modify the chunk tree after both 3581 * block groups have been created.
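 * (That is why both __btrfs_alloc_chunk() calls above run before
 * either __finish_chunk_alloc() call below.)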
3582 */ 3583 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 3584 chunk_size, stripe_size); 3585 if (ret) 3586 goto abort; 3587 3588 ret = __finish_chunk_alloc(trans, extent_root, sys_map, 3589 sys_chunk_offset, sys_chunk_size, 3590 sys_stripe_size); 3591 if (ret) 3592 goto abort; 3593 3594 return 0; 3595 3596 abort: 3597 btrfs_abort_transaction(trans, root, ret); 3598 return ret; 3599 } 3600 3601 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 3602 { 3603 struct extent_map *em; 3604 struct map_lookup *map; 3605 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 3606 int readonly = 0; 3607 int i; 3608 3609 read_lock(&map_tree->map_tree.lock); 3610 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 3611 read_unlock(&map_tree->map_tree.lock); 3612 if (!em) 3613 return 1; 3614 3615 if (btrfs_test_opt(root, DEGRADED)) { 3616 free_extent_map(em); 3617 return 0; 3618 } 3619 3620 map = (struct map_lookup *)em->bdev; 3621 for (i = 0; i < map->num_stripes; i++) { 3622 if (!map->stripes[i].dev->writeable) { 3623 readonly = 1; 3624 break; 3625 } 3626 } 3627 free_extent_map(em); 3628 return readonly; 3629 } 3630 3631 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 3632 { 3633 extent_map_tree_init(&tree->map_tree); 3634 } 3635 3636 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 3637 { 3638 struct extent_map *em; 3639 3640 while (1) { 3641 write_lock(&tree->map_tree.lock); 3642 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 3643 if (em) 3644 remove_extent_mapping(&tree->map_tree, em); 3645 write_unlock(&tree->map_tree.lock); 3646 if (!em) 3647 break; 3648 kfree(em->bdev); 3649 /* once for us */ 3650 free_extent_map(em); 3651 /* once for the tree */ 3652 free_extent_map(em); 3653 } 3654 } 3655 3656 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 3657 { 3658 struct extent_map *em; 3659 struct map_lookup *map; 3660 struct extent_map_tree *em_tree = &map_tree->map_tree; 3661 int ret; 3662 3663 read_lock(&em_tree->lock); 3664 em = lookup_extent_mapping(em_tree, logical, len); 3665 read_unlock(&em_tree->lock); 3666 BUG_ON(!em); 3667 3668 BUG_ON(em->start > logical || em->start + em->len < logical); 3669 map = (struct map_lookup *)em->bdev; 3670 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 3671 ret = map->num_stripes; 3672 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 3673 ret = map->sub_stripes; 3674 else 3675 ret = 1; 3676 free_extent_map(em); 3677 return ret; 3678 } 3679 3680 static int find_live_mirror(struct map_lookup *map, int first, int num, 3681 int optimal) 3682 { 3683 int i; 3684 if (map->stripes[optimal].dev->bdev) 3685 return optimal; 3686 for (i = first; i < first + num; i++) { 3687 if (map->stripes[i].dev->bdev) 3688 return i; 3689 } 3690 /* we couldn't find one that doesn't fail. 
Just return something 3691 * and the io error handling code will clean up eventually 3692 */ 3693 return optimal; 3694 } 3695 3696 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3697 u64 logical, u64 *length, 3698 struct btrfs_bio **bbio_ret, 3699 int mirror_num) 3700 { 3701 struct extent_map *em; 3702 struct map_lookup *map; 3703 struct extent_map_tree *em_tree = &map_tree->map_tree; 3704 u64 offset; 3705 u64 stripe_offset; 3706 u64 stripe_end_offset; 3707 u64 stripe_nr; 3708 u64 stripe_nr_orig; 3709 u64 stripe_nr_end; 3710 int stripe_index; 3711 int i; 3712 int ret = 0; 3713 int num_stripes; 3714 int max_errors = 0; 3715 struct btrfs_bio *bbio = NULL; 3716 3717 read_lock(&em_tree->lock); 3718 em = lookup_extent_mapping(em_tree, logical, *length); 3719 read_unlock(&em_tree->lock); 3720 3721 if (!em) { 3722 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 3723 (unsigned long long)logical, 3724 (unsigned long long)*length); 3725 BUG(); 3726 } 3727 3728 BUG_ON(em->start > logical || em->start + em->len < logical); 3729 map = (struct map_lookup *)em->bdev; 3730 offset = logical - em->start; 3731 3732 if (mirror_num > map->num_stripes) 3733 mirror_num = 0; 3734 3735 stripe_nr = offset; 3736 /* 3737 * stripe_nr counts the total number of stripes we have to stride 3738 * to get to this block 3739 */ 3740 do_div(stripe_nr, map->stripe_len); 3741 3742 stripe_offset = stripe_nr * map->stripe_len; 3743 BUG_ON(offset < stripe_offset); 3744 3745 /* stripe_offset is the offset of this block in its stripe*/ 3746 stripe_offset = offset - stripe_offset; 3747 3748 if (rw & REQ_DISCARD) 3749 *length = min_t(u64, em->len - offset, *length); 3750 else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 3751 /* we limit the length of each bio to what fits in a stripe */ 3752 *length = min_t(u64, em->len - offset, 3753 map->stripe_len - stripe_offset); 3754 } else { 3755 *length = em->len - offset; 3756 } 3757 3758 if (!bbio_ret) 3759 goto out; 3760 3761 num_stripes = 1; 3762 stripe_index = 0; 3763 stripe_nr_orig = stripe_nr; 3764 stripe_nr_end = (offset + *length + map->stripe_len - 1) & 3765 (~(map->stripe_len - 1)); 3766 do_div(stripe_nr_end, map->stripe_len); 3767 stripe_end_offset = stripe_nr_end * map->stripe_len - 3768 (offset + *length); 3769 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3770 if (rw & REQ_DISCARD) 3771 num_stripes = min_t(u64, map->num_stripes, 3772 stripe_nr_end - stripe_nr_orig); 3773 stripe_index = do_div(stripe_nr, map->num_stripes); 3774 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 3775 if (rw & (REQ_WRITE | REQ_DISCARD)) 3776 num_stripes = map->num_stripes; 3777 else if (mirror_num) 3778 stripe_index = mirror_num - 1; 3779 else { 3780 stripe_index = find_live_mirror(map, 0, 3781 map->num_stripes, 3782 current->pid % map->num_stripes); 3783 mirror_num = stripe_index + 1; 3784 } 3785 3786 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3787 if (rw & (REQ_WRITE | REQ_DISCARD)) { 3788 num_stripes = map->num_stripes; 3789 } else if (mirror_num) { 3790 stripe_index = mirror_num - 1; 3791 } else { 3792 mirror_num = 1; 3793 } 3794 3795 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3796 int factor = map->num_stripes / map->sub_stripes; 3797 3798 stripe_index = do_div(stripe_nr, factor); 3799 stripe_index *= map->sub_stripes; 3800 3801 if (rw & REQ_WRITE) 3802 num_stripes = map->sub_stripes; 3803 else if (rw & REQ_DISCARD) 3804 num_stripes = min_t(u64, map->sub_stripes * 3805 (stripe_nr_end - stripe_nr_orig), 3806 map->num_stripes); 3807 else if 
(mirror_num) 3808 stripe_index += mirror_num - 1; 3809 else { 3810 int old_stripe_index = stripe_index; 3811 stripe_index = find_live_mirror(map, stripe_index, 3812 map->sub_stripes, stripe_index + 3813 current->pid % map->sub_stripes); 3814 mirror_num = stripe_index - old_stripe_index + 1; 3815 } 3816 } else { 3817 /* 3818 * after this do_div call, stripe_nr is the number of stripes 3819 * on this device we have to walk to find the data, and 3820 * stripe_index is the number of our device in the stripe array 3821 */ 3822 stripe_index = do_div(stripe_nr, map->num_stripes); 3823 mirror_num = stripe_index + 1; 3824 } 3825 BUG_ON(stripe_index >= map->num_stripes); 3826 3827 bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); 3828 if (!bbio) { 3829 ret = -ENOMEM; 3830 goto out; 3831 } 3832 atomic_set(&bbio->error, 0); 3833 3834 if (rw & REQ_DISCARD) { 3835 int factor = 0; 3836 int sub_stripes = 0; 3837 u64 stripes_per_dev = 0; 3838 u32 remaining_stripes = 0; 3839 u32 last_stripe = 0; 3840 3841 if (map->type & 3842 (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { 3843 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 3844 sub_stripes = 1; 3845 else 3846 sub_stripes = map->sub_stripes; 3847 3848 factor = map->num_stripes / sub_stripes; 3849 stripes_per_dev = div_u64_rem(stripe_nr_end - 3850 stripe_nr_orig, 3851 factor, 3852 &remaining_stripes); 3853 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 3854 last_stripe *= sub_stripes; 3855 } 3856 3857 for (i = 0; i < num_stripes; i++) { 3858 bbio->stripes[i].physical = 3859 map->stripes[stripe_index].physical + 3860 stripe_offset + stripe_nr * map->stripe_len; 3861 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 3862 3863 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 3864 BTRFS_BLOCK_GROUP_RAID10)) { 3865 bbio->stripes[i].length = stripes_per_dev * 3866 map->stripe_len; 3867 3868 if (i / sub_stripes < remaining_stripes) 3869 bbio->stripes[i].length += 3870 map->stripe_len; 3871 3872 /* 3873 * Special for the first stripe and 3874 * the last stripe: 3875 * 3876 * |-------|...|-------| 3877 * |----------| 3878 * off end_off 3879 */ 3880 if (i < sub_stripes) 3881 bbio->stripes[i].length -= 3882 stripe_offset; 3883 3884 if (stripe_index >= last_stripe && 3885 stripe_index <= (last_stripe + 3886 sub_stripes - 1)) 3887 bbio->stripes[i].length -= 3888 stripe_end_offset; 3889 3890 if (i == sub_stripes - 1) 3891 stripe_offset = 0; 3892 } else 3893 bbio->stripes[i].length = *length; 3894 3895 stripe_index++; 3896 if (stripe_index == map->num_stripes) { 3897 /* This could only happen for RAID0/10 */ 3898 stripe_index = 0; 3899 stripe_nr++; 3900 } 3901 } 3902 } else { 3903 for (i = 0; i < num_stripes; i++) { 3904 bbio->stripes[i].physical = 3905 map->stripes[stripe_index].physical + 3906 stripe_offset + 3907 stripe_nr * map->stripe_len; 3908 bbio->stripes[i].dev = 3909 map->stripes[stripe_index].dev; 3910 stripe_index++; 3911 } 3912 } 3913 3914 if (rw & REQ_WRITE) { 3915 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 3916 BTRFS_BLOCK_GROUP_RAID10 | 3917 BTRFS_BLOCK_GROUP_DUP)) { 3918 max_errors = 1; 3919 } 3920 } 3921 3922 *bbio_ret = bbio; 3923 bbio->num_stripes = num_stripes; 3924 bbio->max_errors = max_errors; 3925 bbio->mirror_num = mirror_num; 3926 out: 3927 free_extent_map(em); 3928 return ret; 3929 } 3930 3931 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3932 u64 logical, u64 *length, 3933 struct btrfs_bio **bbio_ret, int mirror_num) 3934 { 3935 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, 3936 mirror_num); 
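/*
 * Worked example for __btrfs_map_block() above, with made-up numbers:
 * a RAID0 chunk over 2 devices with stripe_len 64K, read at logical
 * offset 192K into the chunk.  stripe_nr = 192K / 64K = 3,
 * stripe_offset = 0, stripe_index = 3 % 2 = 1, stripe_nr becomes
 * 3 / 2 = 1, so the bio is sent to stripes[1] at
 * stripes[1].physical + 1 * 64K + 0.  (Illustrative only.)
 */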
3937 } 3938 3939 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 3940 u64 chunk_start, u64 physical, u64 devid, 3941 u64 **logical, int *naddrs, int *stripe_len) 3942 { 3943 struct extent_map_tree *em_tree = &map_tree->map_tree; 3944 struct extent_map *em; 3945 struct map_lookup *map; 3946 u64 *buf; 3947 u64 bytenr; 3948 u64 length; 3949 u64 stripe_nr; 3950 int i, j, nr = 0; 3951 3952 read_lock(&em_tree->lock); 3953 em = lookup_extent_mapping(em_tree, chunk_start, 1); 3954 read_unlock(&em_tree->lock); 3955 3956 BUG_ON(!em || em->start != chunk_start); 3957 map = (struct map_lookup *)em->bdev; 3958 3959 length = em->len; 3960 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 3961 do_div(length, map->num_stripes / map->sub_stripes); 3962 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 3963 do_div(length, map->num_stripes); 3964 3965 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 3966 BUG_ON(!buf); /* -ENOMEM */ 3967 3968 for (i = 0; i < map->num_stripes; i++) { 3969 if (devid && map->stripes[i].dev->devid != devid) 3970 continue; 3971 if (map->stripes[i].physical > physical || 3972 map->stripes[i].physical + length <= physical) 3973 continue; 3974 3975 stripe_nr = physical - map->stripes[i].physical; 3976 do_div(stripe_nr, map->stripe_len); 3977 3978 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3979 stripe_nr = stripe_nr * map->num_stripes + i; 3980 do_div(stripe_nr, map->sub_stripes); 3981 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3982 stripe_nr = stripe_nr * map->num_stripes + i; 3983 } 3984 bytenr = chunk_start + stripe_nr * map->stripe_len; 3985 WARN_ON(nr >= map->num_stripes); 3986 for (j = 0; j < nr; j++) { 3987 if (buf[j] == bytenr) 3988 break; 3989 } 3990 if (j == nr) { 3991 WARN_ON(nr >= map->num_stripes); 3992 buf[nr++] = bytenr; 3993 } 3994 } 3995 3996 *logical = buf; 3997 *naddrs = nr; 3998 *stripe_len = map->stripe_len; 3999 4000 free_extent_map(em); 4001 return 0; 4002 } 4003 4004 static void btrfs_end_bio(struct bio *bio, int err) 4005 { 4006 struct btrfs_bio *bbio = bio->bi_private; 4007 int is_orig_bio = 0; 4008 4009 if (err) 4010 atomic_inc(&bbio->error); 4011 4012 if (bio == bbio->orig_bio) 4013 is_orig_bio = 1; 4014 4015 if (atomic_dec_and_test(&bbio->stripes_pending)) { 4016 if (!is_orig_bio) { 4017 bio_put(bio); 4018 bio = bbio->orig_bio; 4019 } 4020 bio->bi_private = bbio->private; 4021 bio->bi_end_io = bbio->end_io; 4022 bio->bi_bdev = (struct block_device *) 4023 (unsigned long)bbio->mirror_num; 4024 /* only send an error to the higher layers if it is 4025 * beyond the tolerance of the multi-bio 4026 */ 4027 if (atomic_read(&bbio->error) > bbio->max_errors) { 4028 err = -EIO; 4029 } else { 4030 /* 4031 * this bio is actually up to date, we didn't 4032 * go over the max number of errors 4033 */ 4034 set_bit(BIO_UPTODATE, &bio->bi_flags); 4035 err = 0; 4036 } 4037 kfree(bbio); 4038 4039 bio_endio(bio, err); 4040 } else if (!is_orig_bio) { 4041 bio_put(bio); 4042 } 4043 } 4044 4045 struct async_sched { 4046 struct bio *bio; 4047 int rw; 4048 struct btrfs_fs_info *info; 4049 struct btrfs_work work; 4050 }; 4051 4052 /* 4053 * see run_scheduled_bios for a description of why bios are collected for 4054 * async submit. 4055 * 4056 * This will add one bio to the pending list for a device and make sure 4057 * the work struct is scheduled. 
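 *
 * As implemented below, reads are submitted directly and skip the
 * queues; sync writes (REQ_SYNC) go on device->pending_sync_bios and
 * all other writes on device->pending_bios.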

/*
 * see run_scheduled_bios for a description of why bios are collected for
 * async submit.
 *
 * This will add one bio to the pending list for a device and make sure
 * the work struct is scheduled.
 */
static noinline void schedule_bio(struct btrfs_root *root,
				  struct btrfs_device *device,
				  int rw, struct bio *bio)
{
	int should_queue = 1;
	struct btrfs_pending_bios *pending_bios;

	/* don't bother with additional async steps for reads, right now */
	if (!(rw & REQ_WRITE)) {
		bio_get(bio);
		btrfsic_submit_bio(rw, bio);
		bio_put(bio);
		return;
	}

	/*
	 * nr_async_bios allows us to reliably return congestion to the
	 * higher layers.  Otherwise, the async bio makes it appear we have
	 * made progress against dirty pages when we've really just put it
	 * on a queue for later
	 */
	atomic_inc(&root->fs_info->nr_async_bios);
	WARN_ON(bio->bi_next);
	bio->bi_next = NULL;
	bio->bi_rw |= rw;

	spin_lock(&device->io_lock);
	if (bio->bi_rw & REQ_SYNC)
		pending_bios = &device->pending_sync_bios;
	else
		pending_bios = &device->pending_bios;

	if (pending_bios->tail)
		pending_bios->tail->bi_next = bio;

	pending_bios->tail = bio;
	if (!pending_bios->head)
		pending_bios->head = bio;
	if (device->running_pending)
		should_queue = 0;

	spin_unlock(&device->io_lock);

	if (should_queue)
		btrfs_queue_worker(&root->fs_info->submit_workers,
				   &device->work);
}

int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
		  int mirror_num, int async_submit)
{
	struct btrfs_mapping_tree *map_tree;
	struct btrfs_device *dev;
	struct bio *first_bio = bio;
	u64 logical = (u64)bio->bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	int ret;
	int dev_nr = 0;
	int total_devs = 1;
	struct btrfs_bio *bbio = NULL;

	length = bio->bi_size;
	map_tree = &root->fs_info->mapping_tree;
	map_length = length;

	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
			      mirror_num);
	if (ret) /* -ENOMEM */
		return ret;

	total_devs = bbio->num_stripes;
	if (map_length < length) {
		printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
		       "len %llu\n", (unsigned long long)logical,
		       (unsigned long long)length,
		       (unsigned long long)map_length);
		BUG();
	}

	bbio->orig_bio = first_bio;
	bbio->private = first_bio->bi_private;
	bbio->end_io = first_bio->bi_end_io;
	atomic_set(&bbio->stripes_pending, bbio->num_stripes);

	while (dev_nr < total_devs) {
		if (dev_nr < total_devs - 1) {
			bio = bio_clone(first_bio, GFP_NOFS);
			BUG_ON(!bio); /* -ENOMEM */
		} else {
			bio = first_bio;
		}
		bio->bi_private = bbio;
		bio->bi_end_io = btrfs_end_bio;
		bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
		dev = bbio->stripes[dev_nr].dev;
		if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
			pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
				 "(%s id %llu), size=%u\n", rw,
				 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
				 dev->name, dev->devid, bio->bi_size);
			bio->bi_bdev = dev->bdev;
			if (async_submit)
				schedule_bio(root, dev, rw, bio);
			else
				btrfsic_submit_bio(rw, bio);
		} else {
			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
			bio->bi_sector = logical >> 9;
			bio_endio(bio, -EIO);
		}
		dev_nr++;
	}
	return 0;
}
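
/*
 * Illustrative sketch only (hypothetical helper, never called): the test
 * btrfs_map_bio() applies above before handing a cloned bio to a stripe's
 * device - the device must be present, opened, and writeable when the
 * request is a write; otherwise the bio is failed with -EIO.
 */
static inline int example_stripe_submittable(struct btrfs_device *dev, int rw)
{
	return dev && dev->bdev && (rw != WRITE || dev->writeable);
}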

struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
				       u8 *uuid, u8 *fsid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;

	cur_devices = root->fs_info->fs_devices;
	while (cur_devices) {
		if (!fsid ||
		    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
			device = __find_device(&cur_devices->devices,
					       devid, uuid);
			if (device)
				return device;
		}
		cur_devices = cur_devices->seed;
	}
	return NULL;
}

static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
					    u64 devid, u8 *dev_uuid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;

	device = kzalloc(sizeof(*device), GFP_NOFS);
	if (!device)
		return NULL;
	list_add(&device->dev_list,
		 &fs_devices->devices);
	device->dev_root = root->fs_info->dev_root;
	device->devid = devid;
	device->work.func = pending_bios_fn;
	device->fs_devices = fs_devices;
	device->missing = 1;
	fs_devices->num_devices++;
	fs_devices->missing_devices++;
	spin_lock_init(&device->io_lock);
	INIT_LIST_HEAD(&device->dev_alloc_list);
	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
	return device;
}
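
/*
 * Illustrative sketch only (hypothetical helper, never called):
 * add_missing_dev() above inserts a placeholder btrfs_device with no
 * backing bdev so a degraded mount can still assemble its chunk maps.
 * This walks an fs_devices list and counts such placeholders, using only
 * fields initialised by add_missing_dev().
 */
static inline int example_count_missing(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *dev;
	int missing = 0;

	list_for_each_entry(dev, &fs_devices->devices, dev_list)
		if (dev->missing)
			missing++;
	return missing;
}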

static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
			  struct extent_buffer *leaf,
			  struct btrfs_chunk *chunk)
{
	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	u64 logical;
	u64 length;
	u64 devid;
	u8 uuid[BTRFS_UUID_SIZE];
	int num_stripes;
	int ret;
	int i;

	logical = key->offset;
	length = btrfs_chunk_length(leaf, chunk);

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
	read_unlock(&map_tree->map_tree.lock);

	/* already mapped? */
	if (em && em->start <= logical && em->start + em->len > logical) {
		free_extent_map(em);
		return 0;
	} else if (em) {
		free_extent_map(em);
	}

	em = alloc_extent_map();
	if (!em)
		return -ENOMEM;
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		free_extent_map(em);
		return -ENOMEM;
	}

	em->bdev = (struct block_device *)map;
	em->start = logical;
	em->len = length;
	em->block_start = 0;
	em->block_len = em->len;

	map->num_stripes = num_stripes;
	map->io_width = btrfs_chunk_io_width(leaf, chunk);
	map->io_align = btrfs_chunk_io_align(leaf, chunk);
	map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
	map->type = btrfs_chunk_type(leaf, chunk);
	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
	for (i = 0; i < num_stripes; i++) {
		map->stripes[i].physical =
			btrfs_stripe_offset_nr(leaf, chunk, i);
		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
		read_extent_buffer(leaf, uuid, (unsigned long)
				   btrfs_stripe_dev_uuid_nr(chunk, i),
				   BTRFS_UUID_SIZE);
		map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
							NULL);
		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
			kfree(map);
			free_extent_map(em);
			return -EIO;
		}
		if (!map->stripes[i].dev) {
			map->stripes[i].dev =
				add_missing_dev(root, devid, uuid);
			if (!map->stripes[i].dev) {
				kfree(map);
				free_extent_map(em);
				return -EIO;
			}
		}
		map->stripes[i].dev->in_fs_metadata = 1;
	}

	write_lock(&map_tree->map_tree.lock);
	ret = add_extent_mapping(&map_tree->map_tree, em);
	write_unlock(&map_tree->map_tree.lock);
	BUG_ON(ret); /* Tree corruption */
	free_extent_map(em);

	return 0;
}

static void fill_device_from_item(struct extent_buffer *leaf,
				  struct btrfs_dev_item *dev_item,
				  struct btrfs_device *device)
{
	unsigned long ptr;

	device->devid = btrfs_device_id(leaf, dev_item);
	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
	device->total_bytes = device->disk_total_bytes;
	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
	device->type = btrfs_device_type(leaf, dev_item);
	device->io_align = btrfs_device_io_align(leaf, dev_item);
	device->io_width = btrfs_device_io_width(leaf, dev_item);
	device->sector_size = btrfs_device_sector_size(leaf, dev_item);

	ptr = (unsigned long)btrfs_device_uuid(dev_item);
	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}
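
/*
 * Illustrative sketch only (hypothetical helper, never called):
 * fill_device_from_item() above mirrors the on-disk dev item into the
 * in-memory btrfs_device.  read_one_dev() later credits the unallocated
 * portion of a writeable device to fs_info->free_chunk_space; this is
 * that quantity in isolation.
 */
static inline u64 example_device_unallocated(struct btrfs_device *device)
{
	/* on a consistent filesystem bytes_used never exceeds total_bytes */
	return device->total_bytes - device->bytes_used;
}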

static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

	BUG_ON(!mutex_is_locked(&uuid_mutex));

	fs_devices = root->fs_info->fs_devices->seed;
	while (fs_devices) {
		if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
			ret = 0;
			goto out;
		}
		fs_devices = fs_devices->seed;
	}

	fs_devices = find_fsid(fsid);
	if (!fs_devices) {
		ret = -ENOENT;
		goto out;
	}

	fs_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(fs_devices)) {
		ret = PTR_ERR(fs_devices);
		goto out;
	}

	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
				   root->fs_info->bdev_holder);
	if (ret) {
		free_fs_devices(fs_devices);
		goto out;
	}

	if (!fs_devices->seeding) {
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
		ret = -EINVAL;
		goto out;
	}

	fs_devices->seed = root->fs_info->fs_devices->seed;
	root->fs_info->fs_devices->seed = fs_devices;
out:
	return ret;
}

static int read_one_dev(struct btrfs_root *root,
			struct extent_buffer *leaf,
			struct btrfs_dev_item *dev_item)
{
	struct btrfs_device *device;
	u64 devid;
	int ret;
	u8 fs_uuid[BTRFS_UUID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];

	devid = btrfs_device_id(leaf, dev_item);
	read_extent_buffer(leaf, dev_uuid,
			   (unsigned long)btrfs_device_uuid(dev_item),
			   BTRFS_UUID_SIZE);
	read_extent_buffer(leaf, fs_uuid,
			   (unsigned long)btrfs_device_fsid(dev_item),
			   BTRFS_UUID_SIZE);

	if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
		ret = open_seed_devices(root, fs_uuid);
		if (ret && !btrfs_test_opt(root, DEGRADED))
			return ret;
	}

	device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
	if (!device || !device->bdev) {
		if (!btrfs_test_opt(root, DEGRADED))
			return -EIO;

		if (!device) {
			printk(KERN_WARNING "warning devid %llu missing\n",
			       (unsigned long long)devid);
			device = add_missing_dev(root, devid, dev_uuid);
			if (!device)
				return -ENOMEM;
		} else if (!device->missing) {
			/*
			 * this happens when a device that was properly setup
			 * in the device info lists suddenly goes bad.
			 * device->bdev is NULL, and so we have to set
			 * device->missing to one here
			 */
			root->fs_info->fs_devices->missing_devices++;
			device->missing = 1;
		}
	}

	if (device->fs_devices != root->fs_info->fs_devices) {
		BUG_ON(device->writeable);
		if (device->generation !=
		    btrfs_device_generation(leaf, dev_item))
			return -EINVAL;
	}

	fill_device_from_item(leaf, dev_item, device);
	device->dev_root = root->fs_info->dev_root;
	device->in_fs_metadata = 1;
	if (device->writeable) {
		device->fs_devices->total_rw_bytes += device->total_bytes;
		spin_lock(&root->fs_info->free_chunk_lock);
		root->fs_info->free_chunk_space += device->total_bytes -
			device->bytes_used;
		spin_unlock(&root->fs_info->free_chunk_lock);
	}
	ret = 0;
	return ret;
}
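
/*
 * Illustrative sketch only (hypothetical helper, never called): the
 * superblock's sys_chunk_array parsed by btrfs_read_sys_array() below is
 * a packed sequence of (struct btrfs_disk_key, struct btrfs_chunk plus
 * inline stripes) pairs.  This is the number of bytes one such pair
 * occupies, built from the same btrfs_chunk_item_size() the parser uses.
 */
static inline unsigned long example_sys_array_entry_size(int num_stripes)
{
	return sizeof(struct btrfs_disk_key) +
	       btrfs_chunk_item_size(num_stripes);
}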

int btrfs_read_sys_array(struct btrfs_root *root)
{
	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
	struct extent_buffer *sb;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	unsigned long sb_ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
					  BTRFS_SUPER_INFO_SIZE);
	if (!sb)
		return -ENOMEM;
	btrfs_set_buffer_uptodate(sb);
	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
	/*
	 * The sb extent buffer is artificial and just used to read the system array.
	 * btrfs_set_buffer_uptodate() call does not properly mark all its
	 * pages up-to-date when the page is larger: extent does not cover the
	 * whole page and consequently check_page_uptodate does not find all
	 * the page's extents up-to-date (the hole beyond sb),
	 * write_extent_buffer then triggers a WARN_ON.
	 *
	 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
	 * but sb spans only this function. Add an explicit SetPageUptodate call
	 * to silence the warning e.g. on PowerPC 64.
	 */
	if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
		SetPageUptodate(sb->pages[0]);

	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key); ptr += len;
		sb_ptr += len;
		cur += len;

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)sb_ptr;
			ret = read_one_chunk(root, &key, sb, chunk);
			if (ret)
				break;
			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
			len = btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		ptr += len;
		sb_ptr += len;
		cur += len;
	}
	free_extent_buffer(sb);
	return ret;
}

int btrfs_read_chunk_tree(struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	mutex_lock(&uuid_mutex);
	lock_chunks(root);

	/* first we search for all of the device items, and then we
	 * read in all of the chunk items. This way we can create chunk
	 * mappings that reference all of the devices that are found
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
again:
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
			if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
				break;
			if (found_key.type == BTRFS_DEV_ITEM_KEY) {
				struct btrfs_dev_item *dev_item;
				dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
				ret = read_one_dev(root, leaf, dev_item);
				if (ret)
					goto error;
			}
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(root, &found_key, leaf, chunk);
			if (ret)
				goto error;
		}
		path->slots[0]++;
	}
	if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
		key.objectid = 0;
		btrfs_release_path(path);
		goto again;
	}
	ret = 0;
error:
	unlock_chunks(root);
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}
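
/*
 * Illustrative sketch only (hypothetical helper, never called):
 * btrfs_read_chunk_tree() above makes two passes over the chunk root,
 * first with the key below to pick up BTRFS_DEV_ITEMS_OBJECTID device
 * items, then again from objectid 0 for the chunk items, so every chunk
 * mapping can reference a device that has already been read in.
 */
static inline void example_first_pass_key(struct btrfs_key *key)
{
	key->objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key->type = 0;
	key->offset = 0;
}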