/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);

static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

static void lock_chunks(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
	mutex_unlock(&root->fs_info->chunk_mutex);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		kfree(device->name);
		kfree(device);
	}
	kfree(fs_devices);
}

int btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, list);
		list_del(&fs_devices->list);
		free_fs_devices(fs_devices);
	}
	return 0;
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
						   u64 devid, u8 *uuid)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

static void requeue_list(struct btrfs_pending_bios *pending_bios,
			 struct bio *head, struct bio *tail)
{
	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline int run_scheduled_bios(struct btrfs_device *device)
{
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_fs_info *fs_info;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long limit;
	unsigned long last_waited = 0;
	int force_reg = 0;
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device.  We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = blk_get_backing_dev_info(device->bdev);
	fs_info = device->dev_root->fs_info;
	limit = btrfs_async_submit_limit(fs_info);
	limit = limit * 2 / 3;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		    pending_bios != &device->pending_sync_bios &&
		    device->pending_sync_bios.head) ||
		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		    device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;
		atomic_dec(&fs_info->nr_async_bios);

		if (atomic_read(&fs_info->nr_async_bios) < limit &&
		    waitqueue_active(&fs_info->async_submit_wait))
			wake_up(&fs_info->async_submit_wait);

		BUG_ON(atomic_read(&cur->bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		submit_bio(cur->bi_rw, cur);
		num_run++;
		batch_run++;
		if (need_resched())
			cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop.
So, we copy out 282 * the ioc->last_waited time and test 283 * against it before looping 284 */ 285 last_waited = ioc->last_waited; 286 if (need_resched()) 287 cond_resched(); 288 continue; 289 } 290 spin_lock(&device->io_lock); 291 requeue_list(pending_bios, pending, tail); 292 device->running_pending = 1; 293 294 spin_unlock(&device->io_lock); 295 btrfs_requeue_work(&device->work); 296 goto done; 297 } 298 } 299 300 cond_resched(); 301 if (again) 302 goto loop; 303 304 spin_lock(&device->io_lock); 305 if (device->pending_bios.head || device->pending_sync_bios.head) 306 goto loop_lock; 307 spin_unlock(&device->io_lock); 308 309 done: 310 blk_finish_plug(&plug); 311 return 0; 312 } 313 314 static void pending_bios_fn(struct btrfs_work *work) 315 { 316 struct btrfs_device *device; 317 318 device = container_of(work, struct btrfs_device, work); 319 run_scheduled_bios(device); 320 } 321 322 static noinline int device_list_add(const char *path, 323 struct btrfs_super_block *disk_super, 324 u64 devid, struct btrfs_fs_devices **fs_devices_ret) 325 { 326 struct btrfs_device *device; 327 struct btrfs_fs_devices *fs_devices; 328 u64 found_transid = btrfs_super_generation(disk_super); 329 char *name; 330 331 fs_devices = find_fsid(disk_super->fsid); 332 if (!fs_devices) { 333 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 334 if (!fs_devices) 335 return -ENOMEM; 336 INIT_LIST_HEAD(&fs_devices->devices); 337 INIT_LIST_HEAD(&fs_devices->alloc_list); 338 list_add(&fs_devices->list, &fs_uuids); 339 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 340 fs_devices->latest_devid = devid; 341 fs_devices->latest_trans = found_transid; 342 mutex_init(&fs_devices->device_list_mutex); 343 device = NULL; 344 } else { 345 device = __find_device(&fs_devices->devices, devid, 346 disk_super->dev_item.uuid); 347 } 348 if (!device) { 349 if (fs_devices->opened) 350 return -EBUSY; 351 352 device = kzalloc(sizeof(*device), GFP_NOFS); 353 if (!device) { 354 /* we can safely leave the fs_devices entry around */ 355 return -ENOMEM; 356 } 357 device->devid = devid; 358 device->work.func = pending_bios_fn; 359 memcpy(device->uuid, disk_super->dev_item.uuid, 360 BTRFS_UUID_SIZE); 361 spin_lock_init(&device->io_lock); 362 device->name = kstrdup(path, GFP_NOFS); 363 if (!device->name) { 364 kfree(device); 365 return -ENOMEM; 366 } 367 INIT_LIST_HEAD(&device->dev_alloc_list); 368 369 mutex_lock(&fs_devices->device_list_mutex); 370 list_add_rcu(&device->dev_list, &fs_devices->devices); 371 mutex_unlock(&fs_devices->device_list_mutex); 372 373 device->fs_devices = fs_devices; 374 fs_devices->num_devices++; 375 } else if (!device->name || strcmp(device->name, path)) { 376 name = kstrdup(path, GFP_NOFS); 377 if (!name) 378 return -ENOMEM; 379 kfree(device->name); 380 device->name = name; 381 if (device->missing) { 382 fs_devices->missing_devices--; 383 device->missing = 0; 384 } 385 } 386 387 if (found_transid > fs_devices->latest_trans) { 388 fs_devices->latest_devid = devid; 389 fs_devices->latest_trans = found_transid; 390 } 391 *fs_devices_ret = fs_devices; 392 return 0; 393 } 394 395 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 396 { 397 struct btrfs_fs_devices *fs_devices; 398 struct btrfs_device *device; 399 struct btrfs_device *orig_dev; 400 401 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 402 if (!fs_devices) 403 return ERR_PTR(-ENOMEM); 404 405 INIT_LIST_HEAD(&fs_devices->devices); 406 INIT_LIST_HEAD(&fs_devices->alloc_list); 407 INIT_LIST_HEAD(&fs_devices->list); 
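	/* copy the identifying fields of the original, then duplicate each device entry below */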
408 mutex_init(&fs_devices->device_list_mutex); 409 fs_devices->latest_devid = orig->latest_devid; 410 fs_devices->latest_trans = orig->latest_trans; 411 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 412 413 /* We have held the volume lock, it is safe to get the devices. */ 414 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 415 device = kzalloc(sizeof(*device), GFP_NOFS); 416 if (!device) 417 goto error; 418 419 device->name = kstrdup(orig_dev->name, GFP_NOFS); 420 if (!device->name) { 421 kfree(device); 422 goto error; 423 } 424 425 device->devid = orig_dev->devid; 426 device->work.func = pending_bios_fn; 427 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 428 spin_lock_init(&device->io_lock); 429 INIT_LIST_HEAD(&device->dev_list); 430 INIT_LIST_HEAD(&device->dev_alloc_list); 431 432 list_add(&device->dev_list, &fs_devices->devices); 433 device->fs_devices = fs_devices; 434 fs_devices->num_devices++; 435 } 436 return fs_devices; 437 error: 438 free_fs_devices(fs_devices); 439 return ERR_PTR(-ENOMEM); 440 } 441 442 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 443 { 444 struct btrfs_device *device, *next; 445 446 mutex_lock(&uuid_mutex); 447 again: 448 /* This is the initialized path, it is safe to release the devices. */ 449 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 450 if (device->in_fs_metadata) 451 continue; 452 453 if (device->bdev) { 454 blkdev_put(device->bdev, device->mode); 455 device->bdev = NULL; 456 fs_devices->open_devices--; 457 } 458 if (device->writeable) { 459 list_del_init(&device->dev_alloc_list); 460 device->writeable = 0; 461 fs_devices->rw_devices--; 462 } 463 list_del_init(&device->dev_list); 464 fs_devices->num_devices--; 465 kfree(device->name); 466 kfree(device); 467 } 468 469 if (fs_devices->seed) { 470 fs_devices = fs_devices->seed; 471 goto again; 472 } 473 474 mutex_unlock(&uuid_mutex); 475 return 0; 476 } 477 478 static void __free_device(struct work_struct *work) 479 { 480 struct btrfs_device *device; 481 482 device = container_of(work, struct btrfs_device, rcu_work); 483 484 if (device->bdev) 485 blkdev_put(device->bdev, device->mode); 486 487 kfree(device->name); 488 kfree(device); 489 } 490 491 static void free_device(struct rcu_head *head) 492 { 493 struct btrfs_device *device; 494 495 device = container_of(head, struct btrfs_device, rcu); 496 497 INIT_WORK(&device->rcu_work, __free_device); 498 schedule_work(&device->rcu_work); 499 } 500 501 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 502 { 503 struct btrfs_device *device; 504 505 if (--fs_devices->opened > 0) 506 return 0; 507 508 mutex_lock(&fs_devices->device_list_mutex); 509 list_for_each_entry(device, &fs_devices->devices, dev_list) { 510 struct btrfs_device *new_device; 511 512 if (device->bdev) 513 fs_devices->open_devices--; 514 515 if (device->writeable) { 516 list_del_init(&device->dev_alloc_list); 517 fs_devices->rw_devices--; 518 } 519 520 if (device->can_discard) 521 fs_devices->num_can_discard--; 522 523 new_device = kmalloc(sizeof(*new_device), GFP_NOFS); 524 BUG_ON(!new_device); 525 memcpy(new_device, device, sizeof(*new_device)); 526 new_device->name = kstrdup(device->name, GFP_NOFS); 527 BUG_ON(device->name && !new_device->name); 528 new_device->bdev = NULL; 529 new_device->writeable = 0; 530 new_device->in_fs_metadata = 0; 531 new_device->can_discard = 0; 532 list_replace_rcu(&device->dev_list, &new_device->dev_list); 533 534 call_rcu(&device->rcu, free_device); 535 } 
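	/*
	 * every opened device has been replaced in the list by a bare copy;
	 * the original structs are freed by free_device() after an RCU
	 * grace period
	 */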
536 mutex_unlock(&fs_devices->device_list_mutex); 537 538 WARN_ON(fs_devices->open_devices); 539 WARN_ON(fs_devices->rw_devices); 540 fs_devices->opened = 0; 541 fs_devices->seeding = 0; 542 543 return 0; 544 } 545 546 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 547 { 548 struct btrfs_fs_devices *seed_devices = NULL; 549 int ret; 550 551 mutex_lock(&uuid_mutex); 552 ret = __btrfs_close_devices(fs_devices); 553 if (!fs_devices->opened) { 554 seed_devices = fs_devices->seed; 555 fs_devices->seed = NULL; 556 } 557 mutex_unlock(&uuid_mutex); 558 559 while (seed_devices) { 560 fs_devices = seed_devices; 561 seed_devices = fs_devices->seed; 562 __btrfs_close_devices(fs_devices); 563 free_fs_devices(fs_devices); 564 } 565 return ret; 566 } 567 568 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 569 fmode_t flags, void *holder) 570 { 571 struct request_queue *q; 572 struct block_device *bdev; 573 struct list_head *head = &fs_devices->devices; 574 struct btrfs_device *device; 575 struct block_device *latest_bdev = NULL; 576 struct buffer_head *bh; 577 struct btrfs_super_block *disk_super; 578 u64 latest_devid = 0; 579 u64 latest_transid = 0; 580 u64 devid; 581 int seeding = 1; 582 int ret = 0; 583 584 flags |= FMODE_EXCL; 585 586 list_for_each_entry(device, head, dev_list) { 587 if (device->bdev) 588 continue; 589 if (!device->name) 590 continue; 591 592 bdev = blkdev_get_by_path(device->name, flags, holder); 593 if (IS_ERR(bdev)) { 594 printk(KERN_INFO "open %s failed\n", device->name); 595 goto error; 596 } 597 set_blocksize(bdev, 4096); 598 599 bh = btrfs_read_dev_super(bdev); 600 if (!bh) { 601 ret = -EINVAL; 602 goto error_close; 603 } 604 605 disk_super = (struct btrfs_super_block *)bh->b_data; 606 devid = btrfs_stack_device_id(&disk_super->dev_item); 607 if (devid != device->devid) 608 goto error_brelse; 609 610 if (memcmp(device->uuid, disk_super->dev_item.uuid, 611 BTRFS_UUID_SIZE)) 612 goto error_brelse; 613 614 device->generation = btrfs_super_generation(disk_super); 615 if (!latest_transid || device->generation > latest_transid) { 616 latest_devid = devid; 617 latest_transid = device->generation; 618 latest_bdev = bdev; 619 } 620 621 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 622 device->writeable = 0; 623 } else { 624 device->writeable = !bdev_read_only(bdev); 625 seeding = 0; 626 } 627 628 q = bdev_get_queue(bdev); 629 if (blk_queue_discard(q)) { 630 device->can_discard = 1; 631 fs_devices->num_can_discard++; 632 } 633 634 device->bdev = bdev; 635 device->in_fs_metadata = 0; 636 device->mode = flags; 637 638 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 639 fs_devices->rotating = 1; 640 641 fs_devices->open_devices++; 642 if (device->writeable) { 643 fs_devices->rw_devices++; 644 list_add(&device->dev_alloc_list, 645 &fs_devices->alloc_list); 646 } 647 brelse(bh); 648 continue; 649 650 error_brelse: 651 brelse(bh); 652 error_close: 653 blkdev_put(bdev, flags); 654 error: 655 continue; 656 } 657 if (fs_devices->open_devices == 0) { 658 ret = -EIO; 659 goto out; 660 } 661 fs_devices->seeding = seeding; 662 fs_devices->opened = 1; 663 fs_devices->latest_bdev = latest_bdev; 664 fs_devices->latest_devid = latest_devid; 665 fs_devices->latest_trans = latest_transid; 666 fs_devices->total_rw_bytes = 0; 667 out: 668 return ret; 669 } 670 671 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 672 fmode_t flags, void *holder) 673 { 674 int ret; 675 676 mutex_lock(&uuid_mutex); 677 if (fs_devices->opened) { 678 
fs_devices->opened++; 679 ret = 0; 680 } else { 681 ret = __btrfs_open_devices(fs_devices, flags, holder); 682 } 683 mutex_unlock(&uuid_mutex); 684 return ret; 685 } 686 687 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 688 struct btrfs_fs_devices **fs_devices_ret) 689 { 690 struct btrfs_super_block *disk_super; 691 struct block_device *bdev; 692 struct buffer_head *bh; 693 int ret; 694 u64 devid; 695 u64 transid; 696 697 mutex_lock(&uuid_mutex); 698 699 flags |= FMODE_EXCL; 700 bdev = blkdev_get_by_path(path, flags, holder); 701 702 if (IS_ERR(bdev)) { 703 ret = PTR_ERR(bdev); 704 goto error; 705 } 706 707 ret = set_blocksize(bdev, 4096); 708 if (ret) 709 goto error_close; 710 bh = btrfs_read_dev_super(bdev); 711 if (!bh) { 712 ret = -EINVAL; 713 goto error_close; 714 } 715 disk_super = (struct btrfs_super_block *)bh->b_data; 716 devid = btrfs_stack_device_id(&disk_super->dev_item); 717 transid = btrfs_super_generation(disk_super); 718 if (disk_super->label[0]) 719 printk(KERN_INFO "device label %s ", disk_super->label); 720 else 721 printk(KERN_INFO "device fsid %pU ", disk_super->fsid); 722 printk(KERN_CONT "devid %llu transid %llu %s\n", 723 (unsigned long long)devid, (unsigned long long)transid, path); 724 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 725 726 brelse(bh); 727 error_close: 728 blkdev_put(bdev, flags); 729 error: 730 mutex_unlock(&uuid_mutex); 731 return ret; 732 } 733 734 /* helper to account the used device space in the range */ 735 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 736 u64 end, u64 *length) 737 { 738 struct btrfs_key key; 739 struct btrfs_root *root = device->dev_root; 740 struct btrfs_dev_extent *dev_extent; 741 struct btrfs_path *path; 742 u64 extent_end; 743 int ret; 744 int slot; 745 struct extent_buffer *l; 746 747 *length = 0; 748 749 if (start >= device->total_bytes) 750 return 0; 751 752 path = btrfs_alloc_path(); 753 if (!path) 754 return -ENOMEM; 755 path->reada = 2; 756 757 key.objectid = device->devid; 758 key.offset = start; 759 key.type = BTRFS_DEV_EXTENT_KEY; 760 761 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 762 if (ret < 0) 763 goto out; 764 if (ret > 0) { 765 ret = btrfs_previous_item(root, path, key.objectid, key.type); 766 if (ret < 0) 767 goto out; 768 } 769 770 while (1) { 771 l = path->nodes[0]; 772 slot = path->slots[0]; 773 if (slot >= btrfs_header_nritems(l)) { 774 ret = btrfs_next_leaf(root, path); 775 if (ret == 0) 776 continue; 777 if (ret < 0) 778 goto out; 779 780 break; 781 } 782 btrfs_item_key_to_cpu(l, &key, slot); 783 784 if (key.objectid < device->devid) 785 goto next; 786 787 if (key.objectid > device->devid) 788 break; 789 790 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 791 goto next; 792 793 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 794 extent_end = key.offset + btrfs_dev_extent_length(l, 795 dev_extent); 796 if (key.offset <= start && extent_end > end) { 797 *length = end - start + 1; 798 break; 799 } else if (key.offset <= start && extent_end > start) 800 *length += extent_end - start; 801 else if (key.offset > start && extent_end <= end) 802 *length += extent_end - key.offset; 803 else if (key.offset > start && key.offset <= end) { 804 *length += end - key.offset + 1; 805 break; 806 } else if (key.offset > end) 807 break; 808 809 next: 810 path->slots[0]++; 811 } 812 ret = 0; 813 out: 814 btrfs_free_path(path); 815 return ret; 816 } 817 818 /* 819 * find_free_dev_extent - find free space in the specified 
device
 * @trans:	transaction handle
 * @device:	the device which we search the free space in
 * @num_bytes:	the size of the free space that we need
 * @start:	store the start of the free space
 * @len:	the size of the free space that we find, or the size of the max
 *		free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_start;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	/* FIXME use last free of some kind */

	/* we don't want to overwrite the superblock on the drive,
	 * so we make sure to start at an offset of at least 1MB
	 */
	search_start = max(root->fs_info->alloc_start, 1024ull * 1024);

	max_hole_start = search_start;
	max_hole_size = 0;
	hole_size = 0;

	if (search_start >= search_end) {
		ret = -ENOSPC;
		goto error;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}
	path->reada = 2;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size.  Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
932 */ 933 if (hole_size >= num_bytes) { 934 ret = 0; 935 goto out; 936 } 937 } 938 939 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 940 extent_end = key.offset + btrfs_dev_extent_length(l, 941 dev_extent); 942 if (extent_end > search_start) 943 search_start = extent_end; 944 next: 945 path->slots[0]++; 946 cond_resched(); 947 } 948 949 /* 950 * At this point, search_start should be the end of 951 * allocated dev extents, and when shrinking the device, 952 * search_end may be smaller than search_start. 953 */ 954 if (search_end > search_start) 955 hole_size = search_end - search_start; 956 957 if (hole_size > max_hole_size) { 958 max_hole_start = search_start; 959 max_hole_size = hole_size; 960 } 961 962 /* See above. */ 963 if (hole_size < num_bytes) 964 ret = -ENOSPC; 965 else 966 ret = 0; 967 968 out: 969 btrfs_free_path(path); 970 error: 971 *start = max_hole_start; 972 if (len) 973 *len = max_hole_size; 974 return ret; 975 } 976 977 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 978 struct btrfs_device *device, 979 u64 start) 980 { 981 int ret; 982 struct btrfs_path *path; 983 struct btrfs_root *root = device->dev_root; 984 struct btrfs_key key; 985 struct btrfs_key found_key; 986 struct extent_buffer *leaf = NULL; 987 struct btrfs_dev_extent *extent = NULL; 988 989 path = btrfs_alloc_path(); 990 if (!path) 991 return -ENOMEM; 992 993 key.objectid = device->devid; 994 key.offset = start; 995 key.type = BTRFS_DEV_EXTENT_KEY; 996 997 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 998 if (ret > 0) { 999 ret = btrfs_previous_item(root, path, key.objectid, 1000 BTRFS_DEV_EXTENT_KEY); 1001 if (ret) 1002 goto out; 1003 leaf = path->nodes[0]; 1004 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1005 extent = btrfs_item_ptr(leaf, path->slots[0], 1006 struct btrfs_dev_extent); 1007 BUG_ON(found_key.offset > start || found_key.offset + 1008 btrfs_dev_extent_length(leaf, extent) < start); 1009 } else if (ret == 0) { 1010 leaf = path->nodes[0]; 1011 extent = btrfs_item_ptr(leaf, path->slots[0], 1012 struct btrfs_dev_extent); 1013 } 1014 BUG_ON(ret); 1015 1016 if (device->bytes_used > 0) 1017 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 1018 ret = btrfs_del_item(trans, root, path); 1019 1020 out: 1021 btrfs_free_path(path); 1022 return ret; 1023 } 1024 1025 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 1026 struct btrfs_device *device, 1027 u64 chunk_tree, u64 chunk_objectid, 1028 u64 chunk_offset, u64 start, u64 num_bytes) 1029 { 1030 int ret; 1031 struct btrfs_path *path; 1032 struct btrfs_root *root = device->dev_root; 1033 struct btrfs_dev_extent *extent; 1034 struct extent_buffer *leaf; 1035 struct btrfs_key key; 1036 1037 WARN_ON(!device->in_fs_metadata); 1038 path = btrfs_alloc_path(); 1039 if (!path) 1040 return -ENOMEM; 1041 1042 key.objectid = device->devid; 1043 key.offset = start; 1044 key.type = BTRFS_DEV_EXTENT_KEY; 1045 ret = btrfs_insert_empty_item(trans, root, path, &key, 1046 sizeof(*extent)); 1047 BUG_ON(ret); 1048 1049 leaf = path->nodes[0]; 1050 extent = btrfs_item_ptr(leaf, path->slots[0], 1051 struct btrfs_dev_extent); 1052 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); 1053 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); 1054 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 1055 1056 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, 1057 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), 1058 BTRFS_UUID_SIZE); 1059 1060 
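	/* record the extent length and mark the leaf dirty so it is written out */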
btrfs_set_dev_extent_length(leaf, extent, num_bytes); 1061 btrfs_mark_buffer_dirty(leaf); 1062 btrfs_free_path(path); 1063 return ret; 1064 } 1065 1066 static noinline int find_next_chunk(struct btrfs_root *root, 1067 u64 objectid, u64 *offset) 1068 { 1069 struct btrfs_path *path; 1070 int ret; 1071 struct btrfs_key key; 1072 struct btrfs_chunk *chunk; 1073 struct btrfs_key found_key; 1074 1075 path = btrfs_alloc_path(); 1076 if (!path) 1077 return -ENOMEM; 1078 1079 key.objectid = objectid; 1080 key.offset = (u64)-1; 1081 key.type = BTRFS_CHUNK_ITEM_KEY; 1082 1083 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1084 if (ret < 0) 1085 goto error; 1086 1087 BUG_ON(ret == 0); 1088 1089 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); 1090 if (ret) { 1091 *offset = 0; 1092 } else { 1093 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1094 path->slots[0]); 1095 if (found_key.objectid != objectid) 1096 *offset = 0; 1097 else { 1098 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], 1099 struct btrfs_chunk); 1100 *offset = found_key.offset + 1101 btrfs_chunk_length(path->nodes[0], chunk); 1102 } 1103 } 1104 ret = 0; 1105 error: 1106 btrfs_free_path(path); 1107 return ret; 1108 } 1109 1110 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) 1111 { 1112 int ret; 1113 struct btrfs_key key; 1114 struct btrfs_key found_key; 1115 struct btrfs_path *path; 1116 1117 root = root->fs_info->chunk_root; 1118 1119 path = btrfs_alloc_path(); 1120 if (!path) 1121 return -ENOMEM; 1122 1123 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1124 key.type = BTRFS_DEV_ITEM_KEY; 1125 key.offset = (u64)-1; 1126 1127 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1128 if (ret < 0) 1129 goto error; 1130 1131 BUG_ON(ret == 0); 1132 1133 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, 1134 BTRFS_DEV_ITEM_KEY); 1135 if (ret) { 1136 *objectid = 1; 1137 } else { 1138 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1139 path->slots[0]); 1140 *objectid = found_key.offset + 1; 1141 } 1142 ret = 0; 1143 error: 1144 btrfs_free_path(path); 1145 return ret; 1146 } 1147 1148 /* 1149 * the device information is stored in the chunk root 1150 * the btrfs_device struct should be fully filled in 1151 */ 1152 int btrfs_add_device(struct btrfs_trans_handle *trans, 1153 struct btrfs_root *root, 1154 struct btrfs_device *device) 1155 { 1156 int ret; 1157 struct btrfs_path *path; 1158 struct btrfs_dev_item *dev_item; 1159 struct extent_buffer *leaf; 1160 struct btrfs_key key; 1161 unsigned long ptr; 1162 1163 root = root->fs_info->chunk_root; 1164 1165 path = btrfs_alloc_path(); 1166 if (!path) 1167 return -ENOMEM; 1168 1169 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1170 key.type = BTRFS_DEV_ITEM_KEY; 1171 key.offset = device->devid; 1172 1173 ret = btrfs_insert_empty_item(trans, root, path, &key, 1174 sizeof(*dev_item)); 1175 if (ret) 1176 goto out; 1177 1178 leaf = path->nodes[0]; 1179 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1180 1181 btrfs_set_device_id(leaf, dev_item, device->devid); 1182 btrfs_set_device_generation(leaf, dev_item, 0); 1183 btrfs_set_device_type(leaf, dev_item, device->type); 1184 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1185 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1186 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1187 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1188 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1189 
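	/* these fields are not used yet, store zeros for now */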
btrfs_set_device_group(leaf, dev_item, 0); 1190 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1191 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1192 btrfs_set_device_start_offset(leaf, dev_item, 0); 1193 1194 ptr = (unsigned long)btrfs_device_uuid(dev_item); 1195 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1196 ptr = (unsigned long)btrfs_device_fsid(dev_item); 1197 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); 1198 btrfs_mark_buffer_dirty(leaf); 1199 1200 ret = 0; 1201 out: 1202 btrfs_free_path(path); 1203 return ret; 1204 } 1205 1206 static int btrfs_rm_dev_item(struct btrfs_root *root, 1207 struct btrfs_device *device) 1208 { 1209 int ret; 1210 struct btrfs_path *path; 1211 struct btrfs_key key; 1212 struct btrfs_trans_handle *trans; 1213 1214 root = root->fs_info->chunk_root; 1215 1216 path = btrfs_alloc_path(); 1217 if (!path) 1218 return -ENOMEM; 1219 1220 trans = btrfs_start_transaction(root, 0); 1221 if (IS_ERR(trans)) { 1222 btrfs_free_path(path); 1223 return PTR_ERR(trans); 1224 } 1225 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1226 key.type = BTRFS_DEV_ITEM_KEY; 1227 key.offset = device->devid; 1228 lock_chunks(root); 1229 1230 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1231 if (ret < 0) 1232 goto out; 1233 1234 if (ret > 0) { 1235 ret = -ENOENT; 1236 goto out; 1237 } 1238 1239 ret = btrfs_del_item(trans, root, path); 1240 if (ret) 1241 goto out; 1242 out: 1243 btrfs_free_path(path); 1244 unlock_chunks(root); 1245 btrfs_commit_transaction(trans, root); 1246 return ret; 1247 } 1248 1249 int btrfs_rm_device(struct btrfs_root *root, char *device_path) 1250 { 1251 struct btrfs_device *device; 1252 struct btrfs_device *next_device; 1253 struct block_device *bdev; 1254 struct buffer_head *bh = NULL; 1255 struct btrfs_super_block *disk_super; 1256 struct btrfs_fs_devices *cur_devices; 1257 u64 all_avail; 1258 u64 devid; 1259 u64 num_devices; 1260 u8 *dev_uuid; 1261 int ret = 0; 1262 bool clear_super = false; 1263 1264 mutex_lock(&uuid_mutex); 1265 mutex_lock(&root->fs_info->volume_mutex); 1266 1267 all_avail = root->fs_info->avail_data_alloc_bits | 1268 root->fs_info->avail_system_alloc_bits | 1269 root->fs_info->avail_metadata_alloc_bits; 1270 1271 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1272 root->fs_info->fs_devices->num_devices <= 4) { 1273 printk(KERN_ERR "btrfs: unable to go below four devices " 1274 "on raid10\n"); 1275 ret = -EINVAL; 1276 goto out; 1277 } 1278 1279 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1280 root->fs_info->fs_devices->num_devices <= 2) { 1281 printk(KERN_ERR "btrfs: unable to go below two " 1282 "devices on raid1\n"); 1283 ret = -EINVAL; 1284 goto out; 1285 } 1286 1287 if (strcmp(device_path, "missing") == 0) { 1288 struct list_head *devices; 1289 struct btrfs_device *tmp; 1290 1291 device = NULL; 1292 devices = &root->fs_info->fs_devices->devices; 1293 /* 1294 * It is safe to read the devices since the volume_mutex 1295 * is held. 
1296 */ 1297 list_for_each_entry(tmp, devices, dev_list) { 1298 if (tmp->in_fs_metadata && !tmp->bdev) { 1299 device = tmp; 1300 break; 1301 } 1302 } 1303 bdev = NULL; 1304 bh = NULL; 1305 disk_super = NULL; 1306 if (!device) { 1307 printk(KERN_ERR "btrfs: no missing devices found to " 1308 "remove\n"); 1309 goto out; 1310 } 1311 } else { 1312 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, 1313 root->fs_info->bdev_holder); 1314 if (IS_ERR(bdev)) { 1315 ret = PTR_ERR(bdev); 1316 goto out; 1317 } 1318 1319 set_blocksize(bdev, 4096); 1320 bh = btrfs_read_dev_super(bdev); 1321 if (!bh) { 1322 ret = -EINVAL; 1323 goto error_close; 1324 } 1325 disk_super = (struct btrfs_super_block *)bh->b_data; 1326 devid = btrfs_stack_device_id(&disk_super->dev_item); 1327 dev_uuid = disk_super->dev_item.uuid; 1328 device = btrfs_find_device(root, devid, dev_uuid, 1329 disk_super->fsid); 1330 if (!device) { 1331 ret = -ENOENT; 1332 goto error_brelse; 1333 } 1334 } 1335 1336 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1337 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1338 "device\n"); 1339 ret = -EINVAL; 1340 goto error_brelse; 1341 } 1342 1343 if (device->writeable) { 1344 lock_chunks(root); 1345 list_del_init(&device->dev_alloc_list); 1346 unlock_chunks(root); 1347 root->fs_info->fs_devices->rw_devices--; 1348 clear_super = true; 1349 } 1350 1351 ret = btrfs_shrink_device(device, 0); 1352 if (ret) 1353 goto error_undo; 1354 1355 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1356 if (ret) 1357 goto error_undo; 1358 1359 device->in_fs_metadata = 0; 1360 btrfs_scrub_cancel_dev(root, device); 1361 1362 /* 1363 * the device list mutex makes sure that we don't change 1364 * the device list while someone else is writing out all 1365 * the device supers. 1366 */ 1367 1368 cur_devices = device->fs_devices; 1369 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1370 list_del_rcu(&device->dev_list); 1371 1372 device->fs_devices->num_devices--; 1373 1374 if (device->missing) 1375 root->fs_info->fs_devices->missing_devices--; 1376 1377 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1378 struct btrfs_device, dev_list); 1379 if (device->bdev == root->fs_info->sb->s_bdev) 1380 root->fs_info->sb->s_bdev = next_device->bdev; 1381 if (device->bdev == root->fs_info->fs_devices->latest_bdev) 1382 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1383 1384 if (device->bdev) 1385 device->fs_devices->open_devices--; 1386 1387 call_rcu(&device->rcu, free_device); 1388 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1389 1390 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1391 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1392 1393 if (cur_devices->open_devices == 0) { 1394 struct btrfs_fs_devices *fs_devices; 1395 fs_devices = root->fs_info->fs_devices; 1396 while (fs_devices) { 1397 if (fs_devices->seed == cur_devices) 1398 break; 1399 fs_devices = fs_devices->seed; 1400 } 1401 fs_devices->seed = cur_devices->seed; 1402 cur_devices->seed = NULL; 1403 lock_chunks(root); 1404 __btrfs_close_devices(cur_devices); 1405 unlock_chunks(root); 1406 free_fs_devices(cur_devices); 1407 } 1408 1409 /* 1410 * at this point, the device is zero sized. 
We want to
 * remove it from the devices list and zero out the old super
 */
	if (clear_super) {
		/* make sure this device isn't detected as part of
		 * the FS anymore
		 */
		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
		set_buffer_dirty(bh);
		sync_dirty_buffer(bh);
	}

	ret = 0;

error_brelse:
	brelse(bh);
error_close:
	if (bdev)
		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
	mutex_unlock(&root->fs_info->volume_mutex);
	mutex_unlock(&uuid_mutex);
	return ret;
error_undo:
	if (device->writeable) {
		lock_chunks(root);
		list_add(&device->dev_alloc_list,
			 &root->fs_info->fs_devices->alloc_list);
		unlock_chunks(root);
		root->fs_info->fs_devices->rw_devices++;
	}
	goto error_brelse;
}

/*
 * does all the dirty work required for changing file system's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	BUG_ON(!mutex_is_locked(&uuid_mutex));
	if (!fs_devices->seeding)
		return -EINVAL;

	seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
	if (!seed_devices)
		return -ENOMEM;

	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			     synchronize_rcu);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
	list_for_each_entry(device, &seed_devices->devices, dev_list) {
		device->fs_devices = seed_devices;
	}

	fs_devices->seeding = 0;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->seed = seed_devices;

	generate_random_uuid(fs_devices->fsid);
	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}

/*
 * store the expected generation for seed devices in device items.
1506 */ 1507 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 1508 struct btrfs_root *root) 1509 { 1510 struct btrfs_path *path; 1511 struct extent_buffer *leaf; 1512 struct btrfs_dev_item *dev_item; 1513 struct btrfs_device *device; 1514 struct btrfs_key key; 1515 u8 fs_uuid[BTRFS_UUID_SIZE]; 1516 u8 dev_uuid[BTRFS_UUID_SIZE]; 1517 u64 devid; 1518 int ret; 1519 1520 path = btrfs_alloc_path(); 1521 if (!path) 1522 return -ENOMEM; 1523 1524 root = root->fs_info->chunk_root; 1525 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1526 key.offset = 0; 1527 key.type = BTRFS_DEV_ITEM_KEY; 1528 1529 while (1) { 1530 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1531 if (ret < 0) 1532 goto error; 1533 1534 leaf = path->nodes[0]; 1535 next_slot: 1536 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1537 ret = btrfs_next_leaf(root, path); 1538 if (ret > 0) 1539 break; 1540 if (ret < 0) 1541 goto error; 1542 leaf = path->nodes[0]; 1543 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1544 btrfs_release_path(path); 1545 continue; 1546 } 1547 1548 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1549 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 1550 key.type != BTRFS_DEV_ITEM_KEY) 1551 break; 1552 1553 dev_item = btrfs_item_ptr(leaf, path->slots[0], 1554 struct btrfs_dev_item); 1555 devid = btrfs_device_id(leaf, dev_item); 1556 read_extent_buffer(leaf, dev_uuid, 1557 (unsigned long)btrfs_device_uuid(dev_item), 1558 BTRFS_UUID_SIZE); 1559 read_extent_buffer(leaf, fs_uuid, 1560 (unsigned long)btrfs_device_fsid(dev_item), 1561 BTRFS_UUID_SIZE); 1562 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1563 BUG_ON(!device); 1564 1565 if (device->fs_devices->seeding) { 1566 btrfs_set_device_generation(leaf, dev_item, 1567 device->generation); 1568 btrfs_mark_buffer_dirty(leaf); 1569 } 1570 1571 path->slots[0]++; 1572 goto next_slot; 1573 } 1574 ret = 0; 1575 error: 1576 btrfs_free_path(path); 1577 return ret; 1578 } 1579 1580 int btrfs_init_new_device(struct btrfs_root *root, char *device_path) 1581 { 1582 struct request_queue *q; 1583 struct btrfs_trans_handle *trans; 1584 struct btrfs_device *device; 1585 struct block_device *bdev; 1586 struct list_head *devices; 1587 struct super_block *sb = root->fs_info->sb; 1588 u64 total_bytes; 1589 int seeding_dev = 0; 1590 int ret = 0; 1591 1592 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1593 return -EINVAL; 1594 1595 bdev = blkdev_get_by_path(device_path, FMODE_EXCL, 1596 root->fs_info->bdev_holder); 1597 if (IS_ERR(bdev)) 1598 return PTR_ERR(bdev); 1599 1600 if (root->fs_info->fs_devices->seeding) { 1601 seeding_dev = 1; 1602 down_write(&sb->s_umount); 1603 mutex_lock(&uuid_mutex); 1604 } 1605 1606 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1607 mutex_lock(&root->fs_info->volume_mutex); 1608 1609 devices = &root->fs_info->fs_devices->devices; 1610 /* 1611 * we have the volume lock, so we don't need the extra 1612 * device list mutex while reading the list here. 
1613 */ 1614 list_for_each_entry(device, devices, dev_list) { 1615 if (device->bdev == bdev) { 1616 ret = -EEXIST; 1617 goto error; 1618 } 1619 } 1620 1621 device = kzalloc(sizeof(*device), GFP_NOFS); 1622 if (!device) { 1623 /* we can safely leave the fs_devices entry around */ 1624 ret = -ENOMEM; 1625 goto error; 1626 } 1627 1628 device->name = kstrdup(device_path, GFP_NOFS); 1629 if (!device->name) { 1630 kfree(device); 1631 ret = -ENOMEM; 1632 goto error; 1633 } 1634 1635 ret = find_next_devid(root, &device->devid); 1636 if (ret) { 1637 kfree(device->name); 1638 kfree(device); 1639 goto error; 1640 } 1641 1642 trans = btrfs_start_transaction(root, 0); 1643 if (IS_ERR(trans)) { 1644 kfree(device->name); 1645 kfree(device); 1646 ret = PTR_ERR(trans); 1647 goto error; 1648 } 1649 1650 lock_chunks(root); 1651 1652 q = bdev_get_queue(bdev); 1653 if (blk_queue_discard(q)) 1654 device->can_discard = 1; 1655 device->writeable = 1; 1656 device->work.func = pending_bios_fn; 1657 generate_random_uuid(device->uuid); 1658 spin_lock_init(&device->io_lock); 1659 device->generation = trans->transid; 1660 device->io_width = root->sectorsize; 1661 device->io_align = root->sectorsize; 1662 device->sector_size = root->sectorsize; 1663 device->total_bytes = i_size_read(bdev->bd_inode); 1664 device->disk_total_bytes = device->total_bytes; 1665 device->dev_root = root->fs_info->dev_root; 1666 device->bdev = bdev; 1667 device->in_fs_metadata = 1; 1668 device->mode = FMODE_EXCL; 1669 set_blocksize(device->bdev, 4096); 1670 1671 if (seeding_dev) { 1672 sb->s_flags &= ~MS_RDONLY; 1673 ret = btrfs_prepare_sprout(trans, root); 1674 BUG_ON(ret); 1675 } 1676 1677 device->fs_devices = root->fs_info->fs_devices; 1678 1679 /* 1680 * we don't want write_supers to jump in here with our device 1681 * half setup 1682 */ 1683 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1684 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); 1685 list_add(&device->dev_alloc_list, 1686 &root->fs_info->fs_devices->alloc_list); 1687 root->fs_info->fs_devices->num_devices++; 1688 root->fs_info->fs_devices->open_devices++; 1689 root->fs_info->fs_devices->rw_devices++; 1690 if (device->can_discard) 1691 root->fs_info->fs_devices->num_can_discard++; 1692 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1693 1694 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1695 root->fs_info->fs_devices->rotating = 1; 1696 1697 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1698 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1699 total_bytes + device->total_bytes); 1700 1701 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1702 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1703 total_bytes + 1); 1704 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1705 1706 if (seeding_dev) { 1707 ret = init_first_rw_device(trans, root, device); 1708 BUG_ON(ret); 1709 ret = btrfs_finish_sprout(trans, root); 1710 BUG_ON(ret); 1711 } else { 1712 ret = btrfs_add_device(trans, root, device); 1713 } 1714 1715 /* 1716 * we've got more storage, clear any full flags on the space 1717 * infos 1718 */ 1719 btrfs_clear_space_info_full(root->fs_info); 1720 1721 unlock_chunks(root); 1722 btrfs_commit_transaction(trans, root); 1723 1724 if (seeding_dev) { 1725 mutex_unlock(&uuid_mutex); 1726 up_write(&sb->s_umount); 1727 1728 ret = btrfs_relocate_sys_chunks(root); 1729 BUG_ON(ret); 1730 } 1731 out: 1732 mutex_unlock(&root->fs_info->volume_mutex); 1733 return ret; 1734 error: 
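	/*
	 * we failed before the new device was added to the filesystem;
	 * release the bdev and, if this was a seeding fs, drop the locks
	 * taken above
	 */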
1735 blkdev_put(bdev, FMODE_EXCL); 1736 if (seeding_dev) { 1737 mutex_unlock(&uuid_mutex); 1738 up_write(&sb->s_umount); 1739 } 1740 goto out; 1741 } 1742 1743 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 1744 struct btrfs_device *device) 1745 { 1746 int ret; 1747 struct btrfs_path *path; 1748 struct btrfs_root *root; 1749 struct btrfs_dev_item *dev_item; 1750 struct extent_buffer *leaf; 1751 struct btrfs_key key; 1752 1753 root = device->dev_root->fs_info->chunk_root; 1754 1755 path = btrfs_alloc_path(); 1756 if (!path) 1757 return -ENOMEM; 1758 1759 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1760 key.type = BTRFS_DEV_ITEM_KEY; 1761 key.offset = device->devid; 1762 1763 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1764 if (ret < 0) 1765 goto out; 1766 1767 if (ret > 0) { 1768 ret = -ENOENT; 1769 goto out; 1770 } 1771 1772 leaf = path->nodes[0]; 1773 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1774 1775 btrfs_set_device_id(leaf, dev_item, device->devid); 1776 btrfs_set_device_type(leaf, dev_item, device->type); 1777 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1778 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1779 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1780 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); 1781 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1782 btrfs_mark_buffer_dirty(leaf); 1783 1784 out: 1785 btrfs_free_path(path); 1786 return ret; 1787 } 1788 1789 static int __btrfs_grow_device(struct btrfs_trans_handle *trans, 1790 struct btrfs_device *device, u64 new_size) 1791 { 1792 struct btrfs_super_block *super_copy = 1793 &device->dev_root->fs_info->super_copy; 1794 u64 old_total = btrfs_super_total_bytes(super_copy); 1795 u64 diff = new_size - device->total_bytes; 1796 1797 if (!device->writeable) 1798 return -EACCES; 1799 if (new_size <= device->total_bytes) 1800 return -EINVAL; 1801 1802 btrfs_set_super_total_bytes(super_copy, old_total + diff); 1803 device->fs_devices->total_rw_bytes += diff; 1804 1805 device->total_bytes = new_size; 1806 device->disk_total_bytes = new_size; 1807 btrfs_clear_space_info_full(device->dev_root->fs_info); 1808 1809 return btrfs_update_device(trans, device); 1810 } 1811 1812 int btrfs_grow_device(struct btrfs_trans_handle *trans, 1813 struct btrfs_device *device, u64 new_size) 1814 { 1815 int ret; 1816 lock_chunks(device->dev_root); 1817 ret = __btrfs_grow_device(trans, device, new_size); 1818 unlock_chunks(device->dev_root); 1819 return ret; 1820 } 1821 1822 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 1823 struct btrfs_root *root, 1824 u64 chunk_tree, u64 chunk_objectid, 1825 u64 chunk_offset) 1826 { 1827 int ret; 1828 struct btrfs_path *path; 1829 struct btrfs_key key; 1830 1831 root = root->fs_info->chunk_root; 1832 path = btrfs_alloc_path(); 1833 if (!path) 1834 return -ENOMEM; 1835 1836 key.objectid = chunk_objectid; 1837 key.offset = chunk_offset; 1838 key.type = BTRFS_CHUNK_ITEM_KEY; 1839 1840 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1841 BUG_ON(ret); 1842 1843 ret = btrfs_del_item(trans, root, path); 1844 1845 btrfs_free_path(path); 1846 return ret; 1847 } 1848 1849 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1850 chunk_offset) 1851 { 1852 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1853 struct btrfs_disk_key *disk_key; 1854 struct btrfs_chunk *chunk; 1855 u8 *ptr; 1856 int ret = 0; 
1857 u32 num_stripes; 1858 u32 array_size; 1859 u32 len = 0; 1860 u32 cur; 1861 struct btrfs_key key; 1862 1863 array_size = btrfs_super_sys_array_size(super_copy); 1864 1865 ptr = super_copy->sys_chunk_array; 1866 cur = 0; 1867 1868 while (cur < array_size) { 1869 disk_key = (struct btrfs_disk_key *)ptr; 1870 btrfs_disk_key_to_cpu(&key, disk_key); 1871 1872 len = sizeof(*disk_key); 1873 1874 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 1875 chunk = (struct btrfs_chunk *)(ptr + len); 1876 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 1877 len += btrfs_chunk_item_size(num_stripes); 1878 } else { 1879 ret = -EIO; 1880 break; 1881 } 1882 if (key.objectid == chunk_objectid && 1883 key.offset == chunk_offset) { 1884 memmove(ptr, ptr + len, array_size - (cur + len)); 1885 array_size -= len; 1886 btrfs_set_super_sys_array_size(super_copy, array_size); 1887 } else { 1888 ptr += len; 1889 cur += len; 1890 } 1891 } 1892 return ret; 1893 } 1894 1895 static int btrfs_relocate_chunk(struct btrfs_root *root, 1896 u64 chunk_tree, u64 chunk_objectid, 1897 u64 chunk_offset) 1898 { 1899 struct extent_map_tree *em_tree; 1900 struct btrfs_root *extent_root; 1901 struct btrfs_trans_handle *trans; 1902 struct extent_map *em; 1903 struct map_lookup *map; 1904 int ret; 1905 int i; 1906 1907 root = root->fs_info->chunk_root; 1908 extent_root = root->fs_info->extent_root; 1909 em_tree = &root->fs_info->mapping_tree.map_tree; 1910 1911 ret = btrfs_can_relocate(extent_root, chunk_offset); 1912 if (ret) 1913 return -ENOSPC; 1914 1915 /* step one, relocate all the extents inside this chunk */ 1916 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1917 if (ret) 1918 return ret; 1919 1920 trans = btrfs_start_transaction(root, 0); 1921 BUG_ON(IS_ERR(trans)); 1922 1923 lock_chunks(root); 1924 1925 /* 1926 * step two, delete the device extents and the 1927 * chunk tree entries 1928 */ 1929 read_lock(&em_tree->lock); 1930 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 1931 read_unlock(&em_tree->lock); 1932 1933 BUG_ON(em->start > chunk_offset || 1934 em->start + em->len < chunk_offset); 1935 map = (struct map_lookup *)em->bdev; 1936 1937 for (i = 0; i < map->num_stripes; i++) { 1938 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, 1939 map->stripes[i].physical); 1940 BUG_ON(ret); 1941 1942 if (map->stripes[i].dev) { 1943 ret = btrfs_update_device(trans, map->stripes[i].dev); 1944 BUG_ON(ret); 1945 } 1946 } 1947 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, 1948 chunk_offset); 1949 1950 BUG_ON(ret); 1951 1952 trace_btrfs_chunk_free(root, map, chunk_offset, em->len); 1953 1954 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 1955 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 1956 BUG_ON(ret); 1957 } 1958 1959 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 1960 BUG_ON(ret); 1961 1962 write_lock(&em_tree->lock); 1963 remove_extent_mapping(em_tree, em); 1964 write_unlock(&em_tree->lock); 1965 1966 kfree(map); 1967 em->bdev = NULL; 1968 1969 /* once for the tree */ 1970 free_extent_map(em); 1971 /* once for us */ 1972 free_extent_map(em); 1973 1974 unlock_chunks(root); 1975 btrfs_end_transaction(trans, root); 1976 return 0; 1977 } 1978 1979 static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 1980 { 1981 struct btrfs_root *chunk_root = root->fs_info->chunk_root; 1982 struct btrfs_path *path; 1983 struct extent_buffer *leaf; 1984 struct btrfs_chunk *chunk; 1985 struct btrfs_key key; 1986 struct btrfs_key found_key; 1987 u64 chunk_tree = 
chunk_root->root_key.objectid; 1988 u64 chunk_type; 1989 bool retried = false; 1990 int failed = 0; 1991 int ret; 1992 1993 path = btrfs_alloc_path(); 1994 if (!path) 1995 return -ENOMEM; 1996 1997 again: 1998 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 1999 key.offset = (u64)-1; 2000 key.type = BTRFS_CHUNK_ITEM_KEY; 2001 2002 while (1) { 2003 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2004 if (ret < 0) 2005 goto error; 2006 BUG_ON(ret == 0); 2007 2008 ret = btrfs_previous_item(chunk_root, path, key.objectid, 2009 key.type); 2010 if (ret < 0) 2011 goto error; 2012 if (ret > 0) 2013 break; 2014 2015 leaf = path->nodes[0]; 2016 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2017 2018 chunk = btrfs_item_ptr(leaf, path->slots[0], 2019 struct btrfs_chunk); 2020 chunk_type = btrfs_chunk_type(leaf, chunk); 2021 btrfs_release_path(path); 2022 2023 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 2024 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 2025 found_key.objectid, 2026 found_key.offset); 2027 if (ret == -ENOSPC) 2028 failed++; 2029 else if (ret) 2030 BUG(); 2031 } 2032 2033 if (found_key.offset == 0) 2034 break; 2035 key.offset = found_key.offset - 1; 2036 } 2037 ret = 0; 2038 if (failed && !retried) { 2039 failed = 0; 2040 retried = true; 2041 goto again; 2042 } else if (failed && retried) { 2043 WARN_ON(1); 2044 ret = -ENOSPC; 2045 } 2046 error: 2047 btrfs_free_path(path); 2048 return ret; 2049 } 2050 2051 static u64 div_factor(u64 num, int factor) 2052 { 2053 if (factor == 10) 2054 return num; 2055 num *= factor; 2056 do_div(num, 10); 2057 return num; 2058 } 2059 2060 int btrfs_balance(struct btrfs_root *dev_root) 2061 { 2062 int ret; 2063 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 2064 struct btrfs_device *device; 2065 u64 old_size; 2066 u64 size_to_free; 2067 struct btrfs_path *path; 2068 struct btrfs_key key; 2069 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; 2070 struct btrfs_trans_handle *trans; 2071 struct btrfs_key found_key; 2072 2073 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2074 return -EROFS; 2075 2076 if (!capable(CAP_SYS_ADMIN)) 2077 return -EPERM; 2078 2079 mutex_lock(&dev_root->fs_info->volume_mutex); 2080 dev_root = dev_root->fs_info->dev_root; 2081 2082 /* step one make some room on all the devices */ 2083 list_for_each_entry(device, devices, dev_list) { 2084 old_size = device->total_bytes; 2085 size_to_free = div_factor(old_size, 1); 2086 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 2087 if (!device->writeable || 2088 device->total_bytes - device->bytes_used > size_to_free) 2089 continue; 2090 2091 ret = btrfs_shrink_device(device, old_size - size_to_free); 2092 if (ret == -ENOSPC) 2093 break; 2094 BUG_ON(ret); 2095 2096 trans = btrfs_start_transaction(dev_root, 0); 2097 BUG_ON(IS_ERR(trans)); 2098 2099 ret = btrfs_grow_device(trans, device, old_size); 2100 BUG_ON(ret); 2101 2102 btrfs_end_transaction(trans, dev_root); 2103 } 2104 2105 /* step two, relocate all the chunks */ 2106 path = btrfs_alloc_path(); 2107 if (!path) { 2108 ret = -ENOMEM; 2109 goto error; 2110 } 2111 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2112 key.offset = (u64)-1; 2113 key.type = BTRFS_CHUNK_ITEM_KEY; 2114 2115 while (1) { 2116 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2117 if (ret < 0) 2118 goto error; 2119 2120 /* 2121 * this shouldn't happen, it means the last relocate 2122 * failed 2123 */ 2124 if (ret == 0) 2125 break; 2126 2127 ret = btrfs_previous_item(chunk_root, path, 0, 
2128 BTRFS_CHUNK_ITEM_KEY); 2129 if (ret) 2130 break; 2131 2132 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2133 path->slots[0]); 2134 if (found_key.objectid != key.objectid) 2135 break; 2136 2137 /* chunk zero is special */ 2138 if (found_key.offset == 0) 2139 break; 2140 2141 btrfs_release_path(path); 2142 ret = btrfs_relocate_chunk(chunk_root, 2143 chunk_root->root_key.objectid, 2144 found_key.objectid, 2145 found_key.offset); 2146 if (ret && ret != -ENOSPC) 2147 goto error; 2148 key.offset = found_key.offset - 1; 2149 } 2150 ret = 0; 2151 error: 2152 btrfs_free_path(path); 2153 mutex_unlock(&dev_root->fs_info->volume_mutex); 2154 return ret; 2155 } 2156 2157 /* 2158 * shrinking a device means finding all of the device extents past 2159 * the new size, and then following the back refs to the chunks. 2160 * The chunk relocation code actually frees the device extent 2161 */ 2162 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 2163 { 2164 struct btrfs_trans_handle *trans; 2165 struct btrfs_root *root = device->dev_root; 2166 struct btrfs_dev_extent *dev_extent = NULL; 2167 struct btrfs_path *path; 2168 u64 length; 2169 u64 chunk_tree; 2170 u64 chunk_objectid; 2171 u64 chunk_offset; 2172 int ret; 2173 int slot; 2174 int failed = 0; 2175 bool retried = false; 2176 struct extent_buffer *l; 2177 struct btrfs_key key; 2178 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2179 u64 old_total = btrfs_super_total_bytes(super_copy); 2180 u64 old_size = device->total_bytes; 2181 u64 diff = device->total_bytes - new_size; 2182 2183 if (new_size >= device->total_bytes) 2184 return -EINVAL; 2185 2186 path = btrfs_alloc_path(); 2187 if (!path) 2188 return -ENOMEM; 2189 2190 path->reada = 2; 2191 2192 lock_chunks(root); 2193 2194 device->total_bytes = new_size; 2195 if (device->writeable) 2196 device->fs_devices->total_rw_bytes -= diff; 2197 unlock_chunks(root); 2198 2199 again: 2200 key.objectid = device->devid; 2201 key.offset = (u64)-1; 2202 key.type = BTRFS_DEV_EXTENT_KEY; 2203 2204 while (1) { 2205 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2206 if (ret < 0) 2207 goto done; 2208 2209 ret = btrfs_previous_item(root, path, 0, key.type); 2210 if (ret < 0) 2211 goto done; 2212 if (ret) { 2213 ret = 0; 2214 btrfs_release_path(path); 2215 break; 2216 } 2217 2218 l = path->nodes[0]; 2219 slot = path->slots[0]; 2220 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 2221 2222 if (key.objectid != device->devid) { 2223 btrfs_release_path(path); 2224 break; 2225 } 2226 2227 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2228 length = btrfs_dev_extent_length(l, dev_extent); 2229 2230 if (key.offset + length <= new_size) { 2231 btrfs_release_path(path); 2232 break; 2233 } 2234 2235 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 2236 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 2237 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 2238 btrfs_release_path(path); 2239 2240 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 2241 chunk_offset); 2242 if (ret && ret != -ENOSPC) 2243 goto done; 2244 if (ret == -ENOSPC) 2245 failed++; 2246 key.offset -= 1; 2247 } 2248 2249 if (failed && !retried) { 2250 failed = 0; 2251 retried = true; 2252 goto again; 2253 } else if (failed && retried) { 2254 ret = -ENOSPC; 2255 lock_chunks(root); 2256 2257 device->total_bytes = old_size; 2258 if (device->writeable) 2259 device->fs_devices->total_rw_bytes += diff; 2260 unlock_chunks(root); 2261 goto done; 2262 } 2263 
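/*
 * The in-memory size (device->total_bytes) was already lowered before the
 * relocation loop, so the allocator stopped handing out extents past the
 * shrink point while the old ones were moved away.  All that is left is to
 * persist the change: update the device item and the super block total
 * bytes inside a transaction.
 */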
2264 /* Shrinking succeeded, else we would be at "done". */ 2265 trans = btrfs_start_transaction(root, 0); 2266 if (IS_ERR(trans)) { 2267 ret = PTR_ERR(trans); 2268 goto done; 2269 } 2270 2271 lock_chunks(root); 2272 2273 device->disk_total_bytes = new_size; 2274 /* Now btrfs_update_device() will change the on-disk size. */ 2275 ret = btrfs_update_device(trans, device); 2276 if (ret) { 2277 unlock_chunks(root); 2278 btrfs_end_transaction(trans, root); 2279 goto done; 2280 } 2281 WARN_ON(diff > old_total); 2282 btrfs_set_super_total_bytes(super_copy, old_total - diff); 2283 unlock_chunks(root); 2284 btrfs_end_transaction(trans, root); 2285 done: 2286 btrfs_free_path(path); 2287 return ret; 2288 } 2289 2290 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, 2291 struct btrfs_root *root, 2292 struct btrfs_key *key, 2293 struct btrfs_chunk *chunk, int item_size) 2294 { 2295 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2296 struct btrfs_disk_key disk_key; 2297 u32 array_size; 2298 u8 *ptr; 2299 2300 array_size = btrfs_super_sys_array_size(super_copy); 2301 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 2302 return -EFBIG; 2303 2304 ptr = super_copy->sys_chunk_array + array_size; 2305 btrfs_cpu_key_to_disk(&disk_key, key); 2306 memcpy(ptr, &disk_key, sizeof(disk_key)); 2307 ptr += sizeof(disk_key); 2308 memcpy(ptr, chunk, item_size); 2309 item_size += sizeof(disk_key); 2310 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 2311 return 0; 2312 } 2313 2314 /* 2315 * sort the devices in descending order by max_avail, total_avail 2316 */ 2317 static int btrfs_cmp_device_info(const void *a, const void *b) 2318 { 2319 const struct btrfs_device_info *di_a = a; 2320 const struct btrfs_device_info *di_b = b; 2321 2322 if (di_a->max_avail > di_b->max_avail) 2323 return -1; 2324 if (di_a->max_avail < di_b->max_avail) 2325 return 1; 2326 if (di_a->total_avail > di_b->total_avail) 2327 return -1; 2328 if (di_a->total_avail < di_b->total_avail) 2329 return 1; 2330 return 0; 2331 } 2332 2333 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2334 struct btrfs_root *extent_root, 2335 struct map_lookup **map_ret, 2336 u64 *num_bytes_out, u64 *stripe_size_out, 2337 u64 start, u64 type) 2338 { 2339 struct btrfs_fs_info *info = extent_root->fs_info; 2340 struct btrfs_fs_devices *fs_devices = info->fs_devices; 2341 struct list_head *cur; 2342 struct map_lookup *map = NULL; 2343 struct extent_map_tree *em_tree; 2344 struct extent_map *em; 2345 struct btrfs_device_info *devices_info = NULL; 2346 u64 total_avail; 2347 int num_stripes; /* total number of stripes to allocate */ 2348 int sub_stripes; /* sub_stripes info for map */ 2349 int dev_stripes; /* stripes per dev */ 2350 int devs_max; /* max devs to use */ 2351 int devs_min; /* min devs needed */ 2352 int devs_increment; /* ndevs has to be a multiple of this */ 2353 int ncopies; /* how many copies to data has */ 2354 int ret; 2355 u64 max_stripe_size; 2356 u64 max_chunk_size; 2357 u64 stripe_size; 2358 u64 num_bytes; 2359 int ndevs; 2360 int i; 2361 int j; 2362 2363 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2364 (type & BTRFS_BLOCK_GROUP_DUP)) { 2365 WARN_ON(1); 2366 type &= ~BTRFS_BLOCK_GROUP_DUP; 2367 } 2368 2369 if (list_empty(&fs_devices->alloc_list)) 2370 return -ENOSPC; 2371 2372 sub_stripes = 1; 2373 dev_stripes = 1; 2374 devs_increment = 1; 2375 ncopies = 1; 2376 devs_max = 0; /* 0 == as many as possible */ 2377 devs_min = 1; 2378 2379 /* 2380 * define the properties of each 
RAID type. 2381 * FIXME: move this to a global table and use it in all RAID 2382 * calculation code 2383 */ 2384 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2385 dev_stripes = 2; 2386 ncopies = 2; 2387 devs_max = 1; 2388 } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2389 devs_min = 2; 2390 } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2391 devs_increment = 2; 2392 ncopies = 2; 2393 devs_max = 2; 2394 devs_min = 2; 2395 } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2396 sub_stripes = 2; 2397 devs_increment = 2; 2398 ncopies = 2; 2399 devs_min = 4; 2400 } else { 2401 devs_max = 1; 2402 } 2403 2404 if (type & BTRFS_BLOCK_GROUP_DATA) { 2405 max_stripe_size = 1024 * 1024 * 1024; 2406 max_chunk_size = 10 * max_stripe_size; 2407 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 2408 max_stripe_size = 256 * 1024 * 1024; 2409 max_chunk_size = max_stripe_size; 2410 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 2411 max_stripe_size = 8 * 1024 * 1024; 2412 max_chunk_size = 2 * max_stripe_size; 2413 } else { 2414 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", 2415 type); 2416 BUG_ON(1); 2417 } 2418 2419 /* we don't want a chunk larger than 10% of writeable space */ 2420 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2421 max_chunk_size); 2422 2423 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, 2424 GFP_NOFS); 2425 if (!devices_info) 2426 return -ENOMEM; 2427 2428 cur = fs_devices->alloc_list.next; 2429 2430 /* 2431 * in the first pass through the devices list, we gather information 2432 * about the available holes on each device. 2433 */ 2434 ndevs = 0; 2435 while (cur != &fs_devices->alloc_list) { 2436 struct btrfs_device *device; 2437 u64 max_avail; 2438 u64 dev_offset; 2439 2440 device = list_entry(cur, struct btrfs_device, dev_alloc_list); 2441 2442 cur = cur->next; 2443 2444 if (!device->writeable) { 2445 printk(KERN_ERR 2446 "btrfs: read-only device in alloc_list\n"); 2447 WARN_ON(1); 2448 continue; 2449 } 2450 2451 if (!device->in_fs_metadata) 2452 continue; 2453 2454 if (device->total_bytes > device->bytes_used) 2455 total_avail = device->total_bytes - device->bytes_used; 2456 else 2457 total_avail = 0; 2458 2459 /* If there is no space on this device, skip it. */ 2460 if (total_avail == 0) 2461 continue; 2462 2463 ret = find_free_dev_extent(trans, device, 2464 max_stripe_size * dev_stripes, 2465 &dev_offset, &max_avail); 2466 if (ret && ret != -ENOSPC) 2467 goto error; 2468 2469 if (ret == 0) 2470 max_avail = max_stripe_size * dev_stripes; 2471 2472 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) 2473 continue; 2474 2475 devices_info[ndevs].dev_offset = dev_offset; 2476 devices_info[ndevs].max_avail = max_avail; 2477 devices_info[ndevs].total_avail = total_avail; 2478 devices_info[ndevs].dev = device; 2479 ++ndevs; 2480 } 2481 2482 /* 2483 * now sort the devices by hole size / available space 2484 */ 2485 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 2486 btrfs_cmp_device_info, NULL); 2487 2488 /* round down to number of usable stripes */ 2489 ndevs -= ndevs % devs_increment; 2490 2491 if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { 2492 ret = -ENOSPC; 2493 goto error; 2494 } 2495 2496 if (devs_max && ndevs > devs_max) 2497 ndevs = devs_max; 2498 /* 2499 * the primary goal is to maximize the number of stripes, so use as many 2500 * devices as possible, even if the stripes are not maximum sized. 
2501 */ 2502 stripe_size = devices_info[ndevs-1].max_avail; 2503 num_stripes = ndevs * dev_stripes; 2504 2505 if (stripe_size * num_stripes > max_chunk_size * ncopies) { 2506 stripe_size = max_chunk_size * ncopies; 2507 do_div(stripe_size, num_stripes); 2508 } 2509 2510 do_div(stripe_size, dev_stripes); 2511 do_div(stripe_size, BTRFS_STRIPE_LEN); 2512 stripe_size *= BTRFS_STRIPE_LEN; 2513 2514 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 2515 if (!map) { 2516 ret = -ENOMEM; 2517 goto error; 2518 } 2519 map->num_stripes = num_stripes; 2520 2521 for (i = 0; i < ndevs; ++i) { 2522 for (j = 0; j < dev_stripes; ++j) { 2523 int s = i * dev_stripes + j; 2524 map->stripes[s].dev = devices_info[i].dev; 2525 map->stripes[s].physical = devices_info[i].dev_offset + 2526 j * stripe_size; 2527 } 2528 } 2529 map->sector_size = extent_root->sectorsize; 2530 map->stripe_len = BTRFS_STRIPE_LEN; 2531 map->io_align = BTRFS_STRIPE_LEN; 2532 map->io_width = BTRFS_STRIPE_LEN; 2533 map->type = type; 2534 map->sub_stripes = sub_stripes; 2535 2536 *map_ret = map; 2537 num_bytes = stripe_size * (num_stripes / ncopies); 2538 2539 *stripe_size_out = stripe_size; 2540 *num_bytes_out = num_bytes; 2541 2542 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); 2543 2544 em = alloc_extent_map(); 2545 if (!em) { 2546 ret = -ENOMEM; 2547 goto error; 2548 } 2549 em->bdev = (struct block_device *)map; 2550 em->start = start; 2551 em->len = num_bytes; 2552 em->block_start = 0; 2553 em->block_len = em->len; 2554 2555 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 2556 write_lock(&em_tree->lock); 2557 ret = add_extent_mapping(em_tree, em); 2558 write_unlock(&em_tree->lock); 2559 BUG_ON(ret); 2560 free_extent_map(em); 2561 2562 ret = btrfs_make_block_group(trans, extent_root, 0, type, 2563 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2564 start, num_bytes); 2565 BUG_ON(ret); 2566 2567 for (i = 0; i < map->num_stripes; ++i) { 2568 struct btrfs_device *device; 2569 u64 dev_offset; 2570 2571 device = map->stripes[i].dev; 2572 dev_offset = map->stripes[i].physical; 2573 2574 ret = btrfs_alloc_dev_extent(trans, device, 2575 info->chunk_root->root_key.objectid, 2576 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2577 start, dev_offset, stripe_size); 2578 BUG_ON(ret); 2579 } 2580 2581 kfree(devices_info); 2582 return 0; 2583 2584 error: 2585 kfree(map); 2586 kfree(devices_info); 2587 return ret; 2588 } 2589 2590 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2591 struct btrfs_root *extent_root, 2592 struct map_lookup *map, u64 chunk_offset, 2593 u64 chunk_size, u64 stripe_size) 2594 { 2595 u64 dev_offset; 2596 struct btrfs_key key; 2597 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 2598 struct btrfs_device *device; 2599 struct btrfs_chunk *chunk; 2600 struct btrfs_stripe *stripe; 2601 size_t item_size = btrfs_chunk_item_size(map->num_stripes); 2602 int index = 0; 2603 int ret; 2604 2605 chunk = kzalloc(item_size, GFP_NOFS); 2606 if (!chunk) 2607 return -ENOMEM; 2608 2609 index = 0; 2610 while (index < map->num_stripes) { 2611 device = map->stripes[index].dev; 2612 device->bytes_used += stripe_size; 2613 ret = btrfs_update_device(trans, device); 2614 BUG_ON(ret); 2615 index++; 2616 } 2617 2618 index = 0; 2619 stripe = &chunk->stripe; 2620 while (index < map->num_stripes) { 2621 device = map->stripes[index].dev; 2622 dev_offset = map->stripes[index].physical; 2623 2624 btrfs_set_stack_stripe_devid(stripe, device->devid); 2625 btrfs_set_stack_stripe_offset(stripe, dev_offset); 2626 
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 2627 stripe++; 2628 index++; 2629 } 2630 2631 btrfs_set_stack_chunk_length(chunk, chunk_size); 2632 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 2633 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 2634 btrfs_set_stack_chunk_type(chunk, map->type); 2635 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 2636 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 2637 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 2638 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); 2639 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 2640 2641 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2642 key.type = BTRFS_CHUNK_ITEM_KEY; 2643 key.offset = chunk_offset; 2644 2645 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 2646 BUG_ON(ret); 2647 2648 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2649 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, 2650 item_size); 2651 BUG_ON(ret); 2652 } 2653 2654 kfree(chunk); 2655 return 0; 2656 } 2657 2658 /* 2659 * Chunk allocation falls into two parts. The first part does the work 2660 * that makes the newly allocated chunk usable, but does not do any 2661 * operation that modifies the chunk tree. The second part does the work 2662 * that requires modifying the chunk tree. This division is important for 2663 * the bootstrap process of adding storage to a seed btrfs. 2664 */ 2665 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2666 struct btrfs_root *extent_root, u64 type) 2667 { 2668 u64 chunk_offset; 2669 u64 chunk_size; 2670 u64 stripe_size; 2671 struct map_lookup *map; 2672 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 2673 int ret; 2674 2675 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2676 &chunk_offset); 2677 if (ret) 2678 return ret; 2679 2680 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 2681 &stripe_size, chunk_offset, type); 2682 if (ret) 2683 return ret; 2684 2685 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 2686 chunk_size, stripe_size); 2687 BUG_ON(ret); 2688 return 0; 2689 } 2690 2691 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 2692 struct btrfs_root *root, 2693 struct btrfs_device *device) 2694 { 2695 u64 chunk_offset; 2696 u64 sys_chunk_offset; 2697 u64 chunk_size; 2698 u64 sys_chunk_size; 2699 u64 stripe_size; 2700 u64 sys_stripe_size; 2701 u64 alloc_profile; 2702 struct map_lookup *map; 2703 struct map_lookup *sys_map; 2704 struct btrfs_fs_info *fs_info = root->fs_info; 2705 struct btrfs_root *extent_root = fs_info->extent_root; 2706 int ret; 2707 2708 ret = find_next_chunk(fs_info->chunk_root, 2709 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); 2710 if (ret) 2711 return ret; 2712 2713 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 2714 (fs_info->metadata_alloc_profile & 2715 fs_info->avail_metadata_alloc_bits); 2716 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 2717 2718 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 2719 &stripe_size, chunk_offset, alloc_profile); 2720 BUG_ON(ret); 2721 2722 sys_chunk_offset = chunk_offset + chunk_size; 2723 2724 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 2725 (fs_info->system_alloc_profile & 2726 fs_info->avail_system_alloc_bits); 2727 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 2728 2729 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 2730 &sys_chunk_size, &sys_stripe_size,
2731 sys_chunk_offset, alloc_profile); 2732 BUG_ON(ret); 2733 2734 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 2735 BUG_ON(ret); 2736 2737 /* 2738 * Modifying the chunk tree requires allocating new blocks from both 2739 * the system block group and the metadata block group. So we can 2740 * only do operations that modify the chunk tree after both 2741 * block groups have been created. 2742 */ 2743 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 2744 chunk_size, stripe_size); 2745 BUG_ON(ret); 2746 2747 ret = __finish_chunk_alloc(trans, extent_root, sys_map, 2748 sys_chunk_offset, sys_chunk_size, 2749 sys_stripe_size); 2750 BUG_ON(ret); 2751 return 0; 2752 } 2753 2754 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 2755 { 2756 struct extent_map *em; 2757 struct map_lookup *map; 2758 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 2759 int readonly = 0; 2760 int i; 2761 2762 read_lock(&map_tree->map_tree.lock); 2763 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2764 read_unlock(&map_tree->map_tree.lock); 2765 if (!em) 2766 return 1; 2767 2768 if (btrfs_test_opt(root, DEGRADED)) { 2769 free_extent_map(em); 2770 return 0; 2771 } 2772 2773 map = (struct map_lookup *)em->bdev; 2774 for (i = 0; i < map->num_stripes; i++) { 2775 if (!map->stripes[i].dev->writeable) { 2776 readonly = 1; 2777 break; 2778 } 2779 } 2780 free_extent_map(em); 2781 return readonly; 2782 } 2783 2784 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 2785 { 2786 extent_map_tree_init(&tree->map_tree); 2787 } 2788 2789 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 2790 { 2791 struct extent_map *em; 2792 2793 while (1) { 2794 write_lock(&tree->map_tree.lock); 2795 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 2796 if (em) 2797 remove_extent_mapping(&tree->map_tree, em); 2798 write_unlock(&tree->map_tree.lock); 2799 if (!em) 2800 break; 2801 kfree(em->bdev); 2802 /* once for us */ 2803 free_extent_map(em); 2804 /* once for the tree */ 2805 free_extent_map(em); 2806 } 2807 } 2808 2809 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 2810 { 2811 struct extent_map *em; 2812 struct map_lookup *map; 2813 struct extent_map_tree *em_tree = &map_tree->map_tree; 2814 int ret; 2815 2816 read_lock(&em_tree->lock); 2817 em = lookup_extent_mapping(em_tree, logical, len); 2818 read_unlock(&em_tree->lock); 2819 BUG_ON(!em); 2820 2821 BUG_ON(em->start > logical || em->start + em->len < logical); 2822 map = (struct map_lookup *)em->bdev; 2823 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 2824 ret = map->num_stripes; 2825 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 2826 ret = map->sub_stripes; 2827 else 2828 ret = 1; 2829 free_extent_map(em); 2830 return ret; 2831 } 2832 2833 static int find_live_mirror(struct map_lookup *map, int first, int num, 2834 int optimal) 2835 { 2836 int i; 2837 if (map->stripes[optimal].dev->bdev) 2838 return optimal; 2839 for (i = first; i < first + num; i++) { 2840 if (map->stripes[i].dev->bdev) 2841 return i; 2842 } 2843 /* we couldn't find one that doesn't fail.
Just return something 2844 * and the io error handling code will clean up eventually 2845 */ 2846 return optimal; 2847 } 2848 2849 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2850 u64 logical, u64 *length, 2851 struct btrfs_multi_bio **multi_ret, 2852 int mirror_num) 2853 { 2854 struct extent_map *em; 2855 struct map_lookup *map; 2856 struct extent_map_tree *em_tree = &map_tree->map_tree; 2857 u64 offset; 2858 u64 stripe_offset; 2859 u64 stripe_end_offset; 2860 u64 stripe_nr; 2861 u64 stripe_nr_orig; 2862 u64 stripe_nr_end; 2863 int stripes_allocated = 8; 2864 int stripes_required = 1; 2865 int stripe_index; 2866 int i; 2867 int num_stripes; 2868 int max_errors = 0; 2869 struct btrfs_multi_bio *multi = NULL; 2870 2871 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) 2872 stripes_allocated = 1; 2873 again: 2874 if (multi_ret) { 2875 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), 2876 GFP_NOFS); 2877 if (!multi) 2878 return -ENOMEM; 2879 2880 atomic_set(&multi->error, 0); 2881 } 2882 2883 read_lock(&em_tree->lock); 2884 em = lookup_extent_mapping(em_tree, logical, *length); 2885 read_unlock(&em_tree->lock); 2886 2887 if (!em) { 2888 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2889 (unsigned long long)logical, 2890 (unsigned long long)*length); 2891 BUG(); 2892 } 2893 2894 BUG_ON(em->start > logical || em->start + em->len < logical); 2895 map = (struct map_lookup *)em->bdev; 2896 offset = logical - em->start; 2897 2898 if (mirror_num > map->num_stripes) 2899 mirror_num = 0; 2900 2901 /* if our multi bio struct is too small, back off and try again */ 2902 if (rw & REQ_WRITE) { 2903 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2904 BTRFS_BLOCK_GROUP_DUP)) { 2905 stripes_required = map->num_stripes; 2906 max_errors = 1; 2907 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2908 stripes_required = map->sub_stripes; 2909 max_errors = 1; 2910 } 2911 } 2912 if (rw & REQ_DISCARD) { 2913 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2914 BTRFS_BLOCK_GROUP_RAID1 | 2915 BTRFS_BLOCK_GROUP_DUP | 2916 BTRFS_BLOCK_GROUP_RAID10)) { 2917 stripes_required = map->num_stripes; 2918 } 2919 } 2920 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && 2921 stripes_allocated < stripes_required) { 2922 stripes_allocated = map->num_stripes; 2923 free_extent_map(em); 2924 kfree(multi); 2925 goto again; 2926 } 2927 stripe_nr = offset; 2928 /* 2929 * stripe_nr counts the total number of stripes we have to stride 2930 * to get to this block 2931 */ 2932 do_div(stripe_nr, map->stripe_len); 2933 2934 stripe_offset = stripe_nr * map->stripe_len; 2935 BUG_ON(offset < stripe_offset); 2936 2937 /* stripe_offset is the offset of this block in its stripe*/ 2938 stripe_offset = offset - stripe_offset; 2939 2940 if (rw & REQ_DISCARD) 2941 *length = min_t(u64, em->len - offset, *length); 2942 else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2943 BTRFS_BLOCK_GROUP_RAID1 | 2944 BTRFS_BLOCK_GROUP_RAID10 | 2945 BTRFS_BLOCK_GROUP_DUP)) { 2946 /* we limit the length of each bio to what fits in a stripe */ 2947 *length = min_t(u64, em->len - offset, 2948 map->stripe_len - stripe_offset); 2949 } else { 2950 *length = em->len - offset; 2951 } 2952 2953 if (!multi_ret) 2954 goto out; 2955 2956 num_stripes = 1; 2957 stripe_index = 0; 2958 stripe_nr_orig = stripe_nr; 2959 stripe_nr_end = (offset + *length + map->stripe_len - 1) & 2960 (~(map->stripe_len - 1)); 2961 do_div(stripe_nr_end, map->stripe_len); 2962 stripe_end_offset = stripe_nr_end * map->stripe_len - 2963 (offset + *length); 2964 
if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 2965 if (rw & REQ_DISCARD) 2966 num_stripes = min_t(u64, map->num_stripes, 2967 stripe_nr_end - stripe_nr_orig); 2968 stripe_index = do_div(stripe_nr, map->num_stripes); 2969 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 2970 if (rw & (REQ_WRITE | REQ_DISCARD)) 2971 num_stripes = map->num_stripes; 2972 else if (mirror_num) 2973 stripe_index = mirror_num - 1; 2974 else { 2975 stripe_index = find_live_mirror(map, 0, 2976 map->num_stripes, 2977 current->pid % map->num_stripes); 2978 } 2979 2980 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 2981 if (rw & (REQ_WRITE | REQ_DISCARD)) 2982 num_stripes = map->num_stripes; 2983 else if (mirror_num) 2984 stripe_index = mirror_num - 1; 2985 2986 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2987 int factor = map->num_stripes / map->sub_stripes; 2988 2989 stripe_index = do_div(stripe_nr, factor); 2990 stripe_index *= map->sub_stripes; 2991 2992 if (rw & REQ_WRITE) 2993 num_stripes = map->sub_stripes; 2994 else if (rw & REQ_DISCARD) 2995 num_stripes = min_t(u64, map->sub_stripes * 2996 (stripe_nr_end - stripe_nr_orig), 2997 map->num_stripes); 2998 else if (mirror_num) 2999 stripe_index += mirror_num - 1; 3000 else { 3001 stripe_index = find_live_mirror(map, stripe_index, 3002 map->sub_stripes, stripe_index + 3003 current->pid % map->sub_stripes); 3004 } 3005 } else { 3006 /* 3007 * after this do_div call, stripe_nr is the number of stripes 3008 * on this device we have to walk to find the data, and 3009 * stripe_index is the number of our device in the stripe array 3010 */ 3011 stripe_index = do_div(stripe_nr, map->num_stripes); 3012 } 3013 BUG_ON(stripe_index >= map->num_stripes); 3014 3015 if (rw & REQ_DISCARD) { 3016 for (i = 0; i < num_stripes; i++) { 3017 multi->stripes[i].physical = 3018 map->stripes[stripe_index].physical + 3019 stripe_offset + stripe_nr * map->stripe_len; 3020 multi->stripes[i].dev = map->stripes[stripe_index].dev; 3021 3022 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3023 u64 stripes; 3024 u32 last_stripe = 0; 3025 int j; 3026 3027 div_u64_rem(stripe_nr_end - 1, 3028 map->num_stripes, 3029 &last_stripe); 3030 3031 for (j = 0; j < map->num_stripes; j++) { 3032 u32 test; 3033 3034 div_u64_rem(stripe_nr_end - 1 - j, 3035 map->num_stripes, &test); 3036 if (test == stripe_index) 3037 break; 3038 } 3039 stripes = stripe_nr_end - 1 - j; 3040 do_div(stripes, map->num_stripes); 3041 multi->stripes[i].length = map->stripe_len * 3042 (stripes - stripe_nr + 1); 3043 3044 if (i == 0) { 3045 multi->stripes[i].length -= 3046 stripe_offset; 3047 stripe_offset = 0; 3048 } 3049 if (stripe_index == last_stripe) 3050 multi->stripes[i].length -= 3051 stripe_end_offset; 3052 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3053 u64 stripes; 3054 int j; 3055 int factor = map->num_stripes / 3056 map->sub_stripes; 3057 u32 last_stripe = 0; 3058 3059 div_u64_rem(stripe_nr_end - 1, 3060 factor, &last_stripe); 3061 last_stripe *= map->sub_stripes; 3062 3063 for (j = 0; j < factor; j++) { 3064 u32 test; 3065 3066 div_u64_rem(stripe_nr_end - 1 - j, 3067 factor, &test); 3068 3069 if (test == 3070 stripe_index / map->sub_stripes) 3071 break; 3072 } 3073 stripes = stripe_nr_end - 1 - j; 3074 do_div(stripes, factor); 3075 multi->stripes[i].length = map->stripe_len * 3076 (stripes - stripe_nr + 1); 3077 3078 if (i < map->sub_stripes) { 3079 multi->stripes[i].length -= 3080 stripe_offset; 3081 if (i == map->sub_stripes - 1) 3082 stripe_offset = 0; 3083 } 3084 if (stripe_index >= last_stripe && 3085 
stripe_index <= (last_stripe + 3086 map->sub_stripes - 1)) { 3087 multi->stripes[i].length -= 3088 stripe_end_offset; 3089 } 3090 } else 3091 multi->stripes[i].length = *length; 3092 3093 stripe_index++; 3094 if (stripe_index == map->num_stripes) { 3095 /* This could only happen for RAID0/10 */ 3096 stripe_index = 0; 3097 stripe_nr++; 3098 } 3099 } 3100 } else { 3101 for (i = 0; i < num_stripes; i++) { 3102 multi->stripes[i].physical = 3103 map->stripes[stripe_index].physical + 3104 stripe_offset + 3105 stripe_nr * map->stripe_len; 3106 multi->stripes[i].dev = 3107 map->stripes[stripe_index].dev; 3108 stripe_index++; 3109 } 3110 } 3111 if (multi_ret) { 3112 *multi_ret = multi; 3113 multi->num_stripes = num_stripes; 3114 multi->max_errors = max_errors; 3115 } 3116 out: 3117 free_extent_map(em); 3118 return 0; 3119 } 3120 3121 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3122 u64 logical, u64 *length, 3123 struct btrfs_multi_bio **multi_ret, int mirror_num) 3124 { 3125 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3126 mirror_num); 3127 } 3128 3129 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 3130 u64 chunk_start, u64 physical, u64 devid, 3131 u64 **logical, int *naddrs, int *stripe_len) 3132 { 3133 struct extent_map_tree *em_tree = &map_tree->map_tree; 3134 struct extent_map *em; 3135 struct map_lookup *map; 3136 u64 *buf; 3137 u64 bytenr; 3138 u64 length; 3139 u64 stripe_nr; 3140 int i, j, nr = 0; 3141 3142 read_lock(&em_tree->lock); 3143 em = lookup_extent_mapping(em_tree, chunk_start, 1); 3144 read_unlock(&em_tree->lock); 3145 3146 BUG_ON(!em || em->start != chunk_start); 3147 map = (struct map_lookup *)em->bdev; 3148 3149 length = em->len; 3150 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 3151 do_div(length, map->num_stripes / map->sub_stripes); 3152 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 3153 do_div(length, map->num_stripes); 3154 3155 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 3156 BUG_ON(!buf); 3157 3158 for (i = 0; i < map->num_stripes; i++) { 3159 if (devid && map->stripes[i].dev->devid != devid) 3160 continue; 3161 if (map->stripes[i].physical > physical || 3162 map->stripes[i].physical + length <= physical) 3163 continue; 3164 3165 stripe_nr = physical - map->stripes[i].physical; 3166 do_div(stripe_nr, map->stripe_len); 3167 3168 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3169 stripe_nr = stripe_nr * map->num_stripes + i; 3170 do_div(stripe_nr, map->sub_stripes); 3171 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3172 stripe_nr = stripe_nr * map->num_stripes + i; 3173 } 3174 bytenr = chunk_start + stripe_nr * map->stripe_len; 3175 WARN_ON(nr >= map->num_stripes); 3176 for (j = 0; j < nr; j++) { 3177 if (buf[j] == bytenr) 3178 break; 3179 } 3180 if (j == nr) { 3181 WARN_ON(nr >= map->num_stripes); 3182 buf[nr++] = bytenr; 3183 } 3184 } 3185 3186 *logical = buf; 3187 *naddrs = nr; 3188 *stripe_len = map->stripe_len; 3189 3190 free_extent_map(em); 3191 return 0; 3192 } 3193 3194 static void end_bio_multi_stripe(struct bio *bio, int err) 3195 { 3196 struct btrfs_multi_bio *multi = bio->bi_private; 3197 int is_orig_bio = 0; 3198 3199 if (err) 3200 atomic_inc(&multi->error); 3201 3202 if (bio == multi->orig_bio) 3203 is_orig_bio = 1; 3204 3205 if (atomic_dec_and_test(&multi->stripes_pending)) { 3206 if (!is_orig_bio) { 3207 bio_put(bio); 3208 bio = multi->orig_bio; 3209 } 3210 bio->bi_private = multi->private; 3211 bio->bi_end_io = multi->end_io; 3212 /* only send an error to the higher layers if it is 3213 * 
beyond the tolerance of the multi-bio 3214 */ 3215 if (atomic_read(&multi->error) > multi->max_errors) { 3216 err = -EIO; 3217 } else if (err) { 3218 /* 3219 * this bio is actually up to date, we didn't 3220 * go over the max number of errors 3221 */ 3222 set_bit(BIO_UPTODATE, &bio->bi_flags); 3223 err = 0; 3224 } 3225 kfree(multi); 3226 3227 bio_endio(bio, err); 3228 } else if (!is_orig_bio) { 3229 bio_put(bio); 3230 } 3231 } 3232 3233 struct async_sched { 3234 struct bio *bio; 3235 int rw; 3236 struct btrfs_fs_info *info; 3237 struct btrfs_work work; 3238 }; 3239 3240 /* 3241 * see run_scheduled_bios for a description of why bios are collected for 3242 * async submit. 3243 * 3244 * This will add one bio to the pending list for a device and make sure 3245 * the work struct is scheduled. 3246 */ 3247 static noinline int schedule_bio(struct btrfs_root *root, 3248 struct btrfs_device *device, 3249 int rw, struct bio *bio) 3250 { 3251 int should_queue = 1; 3252 struct btrfs_pending_bios *pending_bios; 3253 3254 /* don't bother with additional async steps for reads, right now */ 3255 if (!(rw & REQ_WRITE)) { 3256 bio_get(bio); 3257 submit_bio(rw, bio); 3258 bio_put(bio); 3259 return 0; 3260 } 3261 3262 /* 3263 * nr_async_bios allows us to reliably return congestion to the 3264 * higher layers. Otherwise, the async bio makes it appear we have 3265 * made progress against dirty pages when we've really just put it 3266 * on a queue for later 3267 */ 3268 atomic_inc(&root->fs_info->nr_async_bios); 3269 WARN_ON(bio->bi_next); 3270 bio->bi_next = NULL; 3271 bio->bi_rw |= rw; 3272 3273 spin_lock(&device->io_lock); 3274 if (bio->bi_rw & REQ_SYNC) 3275 pending_bios = &device->pending_sync_bios; 3276 else 3277 pending_bios = &device->pending_bios; 3278 3279 if (pending_bios->tail) 3280 pending_bios->tail->bi_next = bio; 3281 3282 pending_bios->tail = bio; 3283 if (!pending_bios->head) 3284 pending_bios->head = bio; 3285 if (device->running_pending) 3286 should_queue = 0; 3287 3288 spin_unlock(&device->io_lock); 3289 3290 if (should_queue) 3291 btrfs_queue_worker(&root->fs_info->submit_workers, 3292 &device->work); 3293 return 0; 3294 } 3295 3296 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 3297 int mirror_num, int async_submit) 3298 { 3299 struct btrfs_mapping_tree *map_tree; 3300 struct btrfs_device *dev; 3301 struct bio *first_bio = bio; 3302 u64 logical = (u64)bio->bi_sector << 9; 3303 u64 length = 0; 3304 u64 map_length; 3305 struct btrfs_multi_bio *multi = NULL; 3306 int ret; 3307 int dev_nr = 0; 3308 int total_devs = 1; 3309 3310 length = bio->bi_size; 3311 map_tree = &root->fs_info->mapping_tree; 3312 map_length = length; 3313 3314 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, 3315 mirror_num); 3316 BUG_ON(ret); 3317 3318 total_devs = multi->num_stripes; 3319 if (map_length < length) { 3320 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 3321 "len %llu\n", (unsigned long long)logical, 3322 (unsigned long long)length, 3323 (unsigned long long)map_length); 3324 BUG(); 3325 } 3326 multi->end_io = first_bio->bi_end_io; 3327 multi->private = first_bio->bi_private; 3328 multi->orig_bio = first_bio; 3329 atomic_set(&multi->stripes_pending, multi->num_stripes); 3330 3331 while (dev_nr < total_devs) { 3332 if (total_devs > 1) { 3333 if (dev_nr < total_devs - 1) { 3334 bio = bio_clone(first_bio, GFP_NOFS); 3335 BUG_ON(!bio); 3336 } else { 3337 bio = first_bio; 3338 } 3339 bio->bi_private = multi; 3340 bio->bi_end_io = end_bio_multi_stripe; 3341 } 3342 
bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3343 dev = multi->stripes[dev_nr].dev; 3344 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { 3345 bio->bi_bdev = dev->bdev; 3346 if (async_submit) 3347 schedule_bio(root, dev, rw, bio); 3348 else 3349 submit_bio(rw, bio); 3350 } else { 3351 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; 3352 bio->bi_sector = logical >> 9; 3353 bio_endio(bio, -EIO); 3354 } 3355 dev_nr++; 3356 } 3357 if (total_devs == 1) 3358 kfree(multi); 3359 return 0; 3360 } 3361 3362 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 3363 u8 *uuid, u8 *fsid) 3364 { 3365 struct btrfs_device *device; 3366 struct btrfs_fs_devices *cur_devices; 3367 3368 cur_devices = root->fs_info->fs_devices; 3369 while (cur_devices) { 3370 if (!fsid || 3371 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 3372 device = __find_device(&cur_devices->devices, 3373 devid, uuid); 3374 if (device) 3375 return device; 3376 } 3377 cur_devices = cur_devices->seed; 3378 } 3379 return NULL; 3380 } 3381 3382 static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 3383 u64 devid, u8 *dev_uuid) 3384 { 3385 struct btrfs_device *device; 3386 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 3387 3388 device = kzalloc(sizeof(*device), GFP_NOFS); 3389 if (!device) 3390 return NULL; 3391 list_add(&device->dev_list, 3392 &fs_devices->devices); 3393 device->dev_root = root->fs_info->dev_root; 3394 device->devid = devid; 3395 device->work.func = pending_bios_fn; 3396 device->fs_devices = fs_devices; 3397 device->missing = 1; 3398 fs_devices->num_devices++; 3399 fs_devices->missing_devices++; 3400 spin_lock_init(&device->io_lock); 3401 INIT_LIST_HEAD(&device->dev_alloc_list); 3402 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 3403 return device; 3404 } 3405 3406 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, 3407 struct extent_buffer *leaf, 3408 struct btrfs_chunk *chunk) 3409 { 3410 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 3411 struct map_lookup *map; 3412 struct extent_map *em; 3413 u64 logical; 3414 u64 length; 3415 u64 devid; 3416 u8 uuid[BTRFS_UUID_SIZE]; 3417 int num_stripes; 3418 int ret; 3419 int i; 3420 3421 logical = key->offset; 3422 length = btrfs_chunk_length(leaf, chunk); 3423 3424 read_lock(&map_tree->map_tree.lock); 3425 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 3426 read_unlock(&map_tree->map_tree.lock); 3427 3428 /* already mapped? 
*/ 3429 if (em && em->start <= logical && em->start + em->len > logical) { 3430 free_extent_map(em); 3431 return 0; 3432 } else if (em) { 3433 free_extent_map(em); 3434 } 3435 3436 em = alloc_extent_map(); 3437 if (!em) 3438 return -ENOMEM; 3439 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3440 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3441 if (!map) { 3442 free_extent_map(em); 3443 return -ENOMEM; 3444 } 3445 3446 em->bdev = (struct block_device *)map; 3447 em->start = logical; 3448 em->len = length; 3449 em->block_start = 0; 3450 em->block_len = em->len; 3451 3452 map->num_stripes = num_stripes; 3453 map->io_width = btrfs_chunk_io_width(leaf, chunk); 3454 map->io_align = btrfs_chunk_io_align(leaf, chunk); 3455 map->sector_size = btrfs_chunk_sector_size(leaf, chunk); 3456 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 3457 map->type = btrfs_chunk_type(leaf, chunk); 3458 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 3459 for (i = 0; i < num_stripes; i++) { 3460 map->stripes[i].physical = 3461 btrfs_stripe_offset_nr(leaf, chunk, i); 3462 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 3463 read_extent_buffer(leaf, uuid, (unsigned long) 3464 btrfs_stripe_dev_uuid_nr(chunk, i), 3465 BTRFS_UUID_SIZE); 3466 map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 3467 NULL); 3468 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 3469 kfree(map); 3470 free_extent_map(em); 3471 return -EIO; 3472 } 3473 if (!map->stripes[i].dev) { 3474 map->stripes[i].dev = 3475 add_missing_dev(root, devid, uuid); 3476 if (!map->stripes[i].dev) { 3477 kfree(map); 3478 free_extent_map(em); 3479 return -EIO; 3480 } 3481 } 3482 map->stripes[i].dev->in_fs_metadata = 1; 3483 } 3484 3485 write_lock(&map_tree->map_tree.lock); 3486 ret = add_extent_mapping(&map_tree->map_tree, em); 3487 write_unlock(&map_tree->map_tree.lock); 3488 BUG_ON(ret); 3489 free_extent_map(em); 3490 3491 return 0; 3492 } 3493 3494 static int fill_device_from_item(struct extent_buffer *leaf, 3495 struct btrfs_dev_item *dev_item, 3496 struct btrfs_device *device) 3497 { 3498 unsigned long ptr; 3499 3500 device->devid = btrfs_device_id(leaf, dev_item); 3501 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 3502 device->total_bytes = device->disk_total_bytes; 3503 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 3504 device->type = btrfs_device_type(leaf, dev_item); 3505 device->io_align = btrfs_device_io_align(leaf, dev_item); 3506 device->io_width = btrfs_device_io_width(leaf, dev_item); 3507 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 3508 3509 ptr = (unsigned long)btrfs_device_uuid(dev_item); 3510 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 3511 3512 return 0; 3513 } 3514 3515 static int open_seed_devices(struct btrfs_root *root, u8 *fsid) 3516 { 3517 struct btrfs_fs_devices *fs_devices; 3518 int ret; 3519 3520 mutex_lock(&uuid_mutex); 3521 3522 fs_devices = root->fs_info->fs_devices->seed; 3523 while (fs_devices) { 3524 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 3525 ret = 0; 3526 goto out; 3527 } 3528 fs_devices = fs_devices->seed; 3529 } 3530 3531 fs_devices = find_fsid(fsid); 3532 if (!fs_devices) { 3533 ret = -ENOENT; 3534 goto out; 3535 } 3536 3537 fs_devices = clone_fs_devices(fs_devices); 3538 if (IS_ERR(fs_devices)) { 3539 ret = PTR_ERR(fs_devices); 3540 goto out; 3541 } 3542 3543 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 3544 root->fs_info->bdev_holder); 3545 if (ret) 3546 goto out; 3547 3548 if 
(!fs_devices->seeding) { 3549 __btrfs_close_devices(fs_devices); 3550 free_fs_devices(fs_devices); 3551 ret = -EINVAL; 3552 goto out; 3553 } 3554 3555 fs_devices->seed = root->fs_info->fs_devices->seed; 3556 root->fs_info->fs_devices->seed = fs_devices; 3557 out: 3558 mutex_unlock(&uuid_mutex); 3559 return ret; 3560 } 3561 3562 static int read_one_dev(struct btrfs_root *root, 3563 struct extent_buffer *leaf, 3564 struct btrfs_dev_item *dev_item) 3565 { 3566 struct btrfs_device *device; 3567 u64 devid; 3568 int ret; 3569 u8 fs_uuid[BTRFS_UUID_SIZE]; 3570 u8 dev_uuid[BTRFS_UUID_SIZE]; 3571 3572 devid = btrfs_device_id(leaf, dev_item); 3573 read_extent_buffer(leaf, dev_uuid, 3574 (unsigned long)btrfs_device_uuid(dev_item), 3575 BTRFS_UUID_SIZE); 3576 read_extent_buffer(leaf, fs_uuid, 3577 (unsigned long)btrfs_device_fsid(dev_item), 3578 BTRFS_UUID_SIZE); 3579 3580 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 3581 ret = open_seed_devices(root, fs_uuid); 3582 if (ret && !btrfs_test_opt(root, DEGRADED)) 3583 return ret; 3584 } 3585 3586 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 3587 if (!device || !device->bdev) { 3588 if (!btrfs_test_opt(root, DEGRADED)) 3589 return -EIO; 3590 3591 if (!device) { 3592 printk(KERN_WARNING "warning devid %llu missing\n", 3593 (unsigned long long)devid); 3594 device = add_missing_dev(root, devid, dev_uuid); 3595 if (!device) 3596 return -ENOMEM; 3597 } else if (!device->missing) { 3598 /* 3599 * this happens when a device that was properly setup 3600 * in the device info lists suddenly goes bad. 3601 * device->bdev is NULL, and so we have to set 3602 * device->missing to one here 3603 */ 3604 root->fs_info->fs_devices->missing_devices++; 3605 device->missing = 1; 3606 } 3607 } 3608 3609 if (device->fs_devices != root->fs_info->fs_devices) { 3610 BUG_ON(device->writeable); 3611 if (device->generation != 3612 btrfs_device_generation(leaf, dev_item)) 3613 return -EINVAL; 3614 } 3615 3616 fill_device_from_item(leaf, dev_item, device); 3617 device->dev_root = root->fs_info->dev_root; 3618 device->in_fs_metadata = 1; 3619 if (device->writeable) 3620 device->fs_devices->total_rw_bytes += device->total_bytes; 3621 ret = 0; 3622 return ret; 3623 } 3624 3625 int btrfs_read_sys_array(struct btrfs_root *root) 3626 { 3627 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 3628 struct extent_buffer *sb; 3629 struct btrfs_disk_key *disk_key; 3630 struct btrfs_chunk *chunk; 3631 u8 *ptr; 3632 unsigned long sb_ptr; 3633 int ret = 0; 3634 u32 num_stripes; 3635 u32 array_size; 3636 u32 len = 0; 3637 u32 cur; 3638 struct btrfs_key key; 3639 3640 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, 3641 BTRFS_SUPER_INFO_SIZE); 3642 if (!sb) 3643 return -ENOMEM; 3644 btrfs_set_buffer_uptodate(sb); 3645 btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); 3646 3647 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 3648 array_size = btrfs_super_sys_array_size(super_copy); 3649 3650 ptr = super_copy->sys_chunk_array; 3651 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); 3652 cur = 0; 3653 3654 while (cur < array_size) { 3655 disk_key = (struct btrfs_disk_key *)ptr; 3656 btrfs_disk_key_to_cpu(&key, disk_key); 3657 3658 len = sizeof(*disk_key); ptr += len; 3659 sb_ptr += len; 3660 cur += len; 3661 3662 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 3663 chunk = (struct btrfs_chunk *)sb_ptr; 3664 ret = read_one_chunk(root, &key, sb, chunk); 3665 if (ret) 3666 break; 3667 num_stripes = 
btrfs_chunk_num_stripes(sb, chunk); 3668 len = btrfs_chunk_item_size(num_stripes); 3669 } else { 3670 ret = -EIO; 3671 break; 3672 } 3673 ptr += len; 3674 sb_ptr += len; 3675 cur += len; 3676 } 3677 free_extent_buffer(sb); 3678 return ret; 3679 } 3680 3681 int btrfs_read_chunk_tree(struct btrfs_root *root) 3682 { 3683 struct btrfs_path *path; 3684 struct extent_buffer *leaf; 3685 struct btrfs_key key; 3686 struct btrfs_key found_key; 3687 int ret; 3688 int slot; 3689 3690 root = root->fs_info->chunk_root; 3691 3692 path = btrfs_alloc_path(); 3693 if (!path) 3694 return -ENOMEM; 3695 3696 /* first we search for all of the device items, and then we 3697 * read in all of the chunk items. This way we can create chunk 3698 * mappings that reference all of the devices that are found 3699 */ 3700 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 3701 key.offset = 0; 3702 key.type = 0; 3703 again: 3704 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3705 if (ret < 0) 3706 goto error; 3707 while (1) { 3708 leaf = path->nodes[0]; 3709 slot = path->slots[0]; 3710 if (slot >= btrfs_header_nritems(leaf)) { 3711 ret = btrfs_next_leaf(root, path); 3712 if (ret == 0) 3713 continue; 3714 if (ret < 0) 3715 goto error; 3716 break; 3717 } 3718 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3719 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3720 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) 3721 break; 3722 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 3723 struct btrfs_dev_item *dev_item; 3724 dev_item = btrfs_item_ptr(leaf, slot, 3725 struct btrfs_dev_item); 3726 ret = read_one_dev(root, leaf, dev_item); 3727 if (ret) 3728 goto error; 3729 } 3730 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 3731 struct btrfs_chunk *chunk; 3732 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3733 ret = read_one_chunk(root, &found_key, leaf, chunk); 3734 if (ret) 3735 goto error; 3736 } 3737 path->slots[0]++; 3738 } 3739 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3740 key.objectid = 0; 3741 btrfs_release_path(path); 3742 goto again; 3743 } 3744 ret = 0; 3745 error: 3746 btrfs_free_path(path); 3747 return ret; 3748 } 3749
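/*
 * Illustrative, userspace-only sketch -- not part of this file.  It mirrors
 * the single-copy (RAID0-style) arithmetic used by __btrfs_map_block()
 * above, where do_div() reduces the logical offset to a stripe number and
 * the remainder selects the device in the stripe array.  The chunk start,
 * stripe length and stripe count are made-up example values (64K matches
 * the BTRFS_STRIPE_LEN used in the allocator above), not data read from a
 * real filesystem.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t chunk_start = 1 << 20;	/* example chunk logical start */
	const uint64_t stripe_len = 64 * 1024;	/* example stripe length (64K) */
	const int num_stripes = 3;		/* example RAID0 width */
	uint64_t logical = chunk_start + 200 * 1024; /* example logical address */

	uint64_t offset = logical - chunk_start;
	/* do_div(stripe_nr, map->stripe_len): quotient = stripe count walked */
	uint64_t stripe_nr = offset / stripe_len;
	/* offset of this block inside its stripe */
	uint64_t stripe_offset = offset - stripe_nr * stripe_len;
	/* do_div(stripe_nr, map->num_stripes): remainder picks the device ... */
	int stripe_index = (int)(stripe_nr % num_stripes);
	/* ... and the quotient is the stripe count on that one device */
	uint64_t stripe_on_dev = stripe_nr / num_stripes;

	printf("offset %llu -> stripe_nr %llu, device index %d, "
	       "byte offset within that device's extent %llu\n",
	       (unsigned long long)offset,
	       (unsigned long long)stripe_nr,
	       stripe_index,
	       (unsigned long long)(stripe_on_dev * stripe_len + stripe_offset));
	return 0;
}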