/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"

struct map_lookup {
        u64 type;
        int io_align;
        int io_width;
        int stripe_len;
        int sector_size;
        int num_stripes;
        int sub_stripes;
        struct btrfs_bio_stripe stripes[];
};

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);

#define map_lookup_size(n) (sizeof(struct map_lookup) + \
                            (sizeof(struct btrfs_bio_stripe) * (n)))

static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

void btrfs_lock_volumes(void)
{
        mutex_lock(&uuid_mutex);
}

void btrfs_unlock_volumes(void)
{
        mutex_unlock(&uuid_mutex);
}

static void lock_chunks(struct btrfs_root *root)
{
        mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
        mutex_unlock(&root->fs_info->chunk_mutex);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;
        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                kfree(device->name);
                kfree(device);
        }
        kfree(fs_devices);
}

int btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, list);
                list_del(&fs_devices->list);
                free_fs_devices(fs_devices);
        }
        return 0;
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
                                                   u64 devid, u8 *uuid)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, head, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
        return NULL;
}

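/*
 * Splice a partially processed run of bios ([head, tail]) back onto the
 * front of @pending_bios so the run is picked up again in its original
 * order.  Both callers do this while holding device->io_lock.
 */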
static void requeue_list(struct btrfs_pending_bios *pending_bios,
                         struct bio *head, struct bio *tail)
{
        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline int run_scheduled_bios(struct btrfs_device *device)
{
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_fs_info *fs_info;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long num_sync_run;
        unsigned long limit;
        unsigned long last_waited = 0;

        bdi = blk_get_backing_dev_info(device->bdev);
        fs_info = device->dev_root->fs_info;
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;

        /* we want to make sure that every time we switch from the sync
         * list to the normal list, we unplug
         */
        num_sync_run = 0;

loop:
        spin_lock(&device->io_lock);
        num_run = 0;

loop_lock:

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (device->pending_sync_bios.head)
                pending_bios = &device->pending_sync_bios;
        else
                pending_bios = &device->pending_bios;

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
204 */ 205 if (device->pending_sync_bios.head == NULL && 206 device->pending_bios.head == NULL) { 207 again = 0; 208 device->running_pending = 0; 209 } else { 210 again = 1; 211 device->running_pending = 1; 212 } 213 214 pending_bios->head = NULL; 215 pending_bios->tail = NULL; 216 217 spin_unlock(&device->io_lock); 218 219 /* 220 * if we're doing the regular priority list, make sure we unplug 221 * for any high prio bios we've sent down 222 */ 223 if (pending_bios == &device->pending_bios && num_sync_run > 0) { 224 num_sync_run = 0; 225 blk_run_backing_dev(bdi, NULL); 226 } 227 228 while (pending) { 229 230 rmb(); 231 if (pending_bios != &device->pending_sync_bios && 232 device->pending_sync_bios.head && 233 num_run > 16) { 234 cond_resched(); 235 spin_lock(&device->io_lock); 236 requeue_list(pending_bios, pending, tail); 237 goto loop_lock; 238 } 239 240 cur = pending; 241 pending = pending->bi_next; 242 cur->bi_next = NULL; 243 atomic_dec(&fs_info->nr_async_bios); 244 245 if (atomic_read(&fs_info->nr_async_bios) < limit && 246 waitqueue_active(&fs_info->async_submit_wait)) 247 wake_up(&fs_info->async_submit_wait); 248 249 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 250 submit_bio(cur->bi_rw, cur); 251 num_run++; 252 if (bio_sync(cur)) 253 num_sync_run++; 254 255 if (need_resched()) { 256 if (num_sync_run) { 257 blk_run_backing_dev(bdi, NULL); 258 num_sync_run = 0; 259 } 260 cond_resched(); 261 } 262 263 /* 264 * we made progress, there is more work to do and the bdi 265 * is now congested. Back off and let other work structs 266 * run instead 267 */ 268 if (pending && bdi_write_congested(bdi) && num_run > 16 && 269 fs_info->fs_devices->open_devices > 1) { 270 struct io_context *ioc; 271 272 ioc = current->io_context; 273 274 /* 275 * the main goal here is that we don't want to 276 * block if we're going to be able to submit 277 * more requests without blocking. 278 * 279 * This code does two great things, it pokes into 280 * the elevator code from a filesystem _and_ 281 * it makes assumptions about how batching works. 282 */ 283 if (ioc && ioc->nr_batch_requests > 0 && 284 time_before(jiffies, ioc->last_waited + HZ/50UL) && 285 (last_waited == 0 || 286 ioc->last_waited == last_waited)) { 287 /* 288 * we want to go through our batch of 289 * requests and stop. So, we copy out 290 * the ioc->last_waited time and test 291 * against it before looping 292 */ 293 last_waited = ioc->last_waited; 294 if (need_resched()) { 295 if (num_sync_run) { 296 blk_run_backing_dev(bdi, NULL); 297 num_sync_run = 0; 298 } 299 cond_resched(); 300 } 301 continue; 302 } 303 spin_lock(&device->io_lock); 304 requeue_list(pending_bios, pending, tail); 305 device->running_pending = 1; 306 307 spin_unlock(&device->io_lock); 308 btrfs_requeue_work(&device->work); 309 goto done; 310 } 311 } 312 313 if (num_sync_run) { 314 num_sync_run = 0; 315 blk_run_backing_dev(bdi, NULL); 316 } 317 318 cond_resched(); 319 if (again) 320 goto loop; 321 322 spin_lock(&device->io_lock); 323 if (device->pending_bios.head || device->pending_sync_bios.head) 324 goto loop_lock; 325 spin_unlock(&device->io_lock); 326 327 /* 328 * IO has already been through a long path to get here. Checksumming, 329 * async helper threads, perhaps compression. We've done a pretty 330 * good job of collecting a batch of IO and should just unplug 331 * the device right away. 332 * 333 * This will help anyone who is waiting on the IO, they might have 334 * already unplugged, but managed to do so before the bio they 335 * cared about found its way down here. 
336 */ 337 blk_run_backing_dev(bdi, NULL); 338 done: 339 return 0; 340 } 341 342 static void pending_bios_fn(struct btrfs_work *work) 343 { 344 struct btrfs_device *device; 345 346 device = container_of(work, struct btrfs_device, work); 347 run_scheduled_bios(device); 348 } 349 350 static noinline int device_list_add(const char *path, 351 struct btrfs_super_block *disk_super, 352 u64 devid, struct btrfs_fs_devices **fs_devices_ret) 353 { 354 struct btrfs_device *device; 355 struct btrfs_fs_devices *fs_devices; 356 u64 found_transid = btrfs_super_generation(disk_super); 357 358 fs_devices = find_fsid(disk_super->fsid); 359 if (!fs_devices) { 360 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 361 if (!fs_devices) 362 return -ENOMEM; 363 INIT_LIST_HEAD(&fs_devices->devices); 364 INIT_LIST_HEAD(&fs_devices->alloc_list); 365 list_add(&fs_devices->list, &fs_uuids); 366 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 367 fs_devices->latest_devid = devid; 368 fs_devices->latest_trans = found_transid; 369 device = NULL; 370 } else { 371 device = __find_device(&fs_devices->devices, devid, 372 disk_super->dev_item.uuid); 373 } 374 if (!device) { 375 if (fs_devices->opened) 376 return -EBUSY; 377 378 device = kzalloc(sizeof(*device), GFP_NOFS); 379 if (!device) { 380 /* we can safely leave the fs_devices entry around */ 381 return -ENOMEM; 382 } 383 device->devid = devid; 384 device->work.func = pending_bios_fn; 385 memcpy(device->uuid, disk_super->dev_item.uuid, 386 BTRFS_UUID_SIZE); 387 device->barriers = 1; 388 spin_lock_init(&device->io_lock); 389 device->name = kstrdup(path, GFP_NOFS); 390 if (!device->name) { 391 kfree(device); 392 return -ENOMEM; 393 } 394 INIT_LIST_HEAD(&device->dev_alloc_list); 395 list_add(&device->dev_list, &fs_devices->devices); 396 device->fs_devices = fs_devices; 397 fs_devices->num_devices++; 398 } 399 400 if (found_transid > fs_devices->latest_trans) { 401 fs_devices->latest_devid = devid; 402 fs_devices->latest_trans = found_transid; 403 } 404 *fs_devices_ret = fs_devices; 405 return 0; 406 } 407 408 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 409 { 410 struct btrfs_fs_devices *fs_devices; 411 struct btrfs_device *device; 412 struct btrfs_device *orig_dev; 413 414 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 415 if (!fs_devices) 416 return ERR_PTR(-ENOMEM); 417 418 INIT_LIST_HEAD(&fs_devices->devices); 419 INIT_LIST_HEAD(&fs_devices->alloc_list); 420 INIT_LIST_HEAD(&fs_devices->list); 421 fs_devices->latest_devid = orig->latest_devid; 422 fs_devices->latest_trans = orig->latest_trans; 423 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 424 425 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 426 device = kzalloc(sizeof(*device), GFP_NOFS); 427 if (!device) 428 goto error; 429 430 device->name = kstrdup(orig_dev->name, GFP_NOFS); 431 if (!device->name) 432 goto error; 433 434 device->devid = orig_dev->devid; 435 device->work.func = pending_bios_fn; 436 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 437 device->barriers = 1; 438 spin_lock_init(&device->io_lock); 439 INIT_LIST_HEAD(&device->dev_list); 440 INIT_LIST_HEAD(&device->dev_alloc_list); 441 442 list_add(&device->dev_list, &fs_devices->devices); 443 device->fs_devices = fs_devices; 444 fs_devices->num_devices++; 445 } 446 return fs_devices; 447 error: 448 free_fs_devices(fs_devices); 449 return ERR_PTR(-ENOMEM); 450 } 451 452 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 453 { 454 struct 
btrfs_device *device, *next; 455 456 mutex_lock(&uuid_mutex); 457 again: 458 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 459 if (device->in_fs_metadata) 460 continue; 461 462 if (device->bdev) { 463 close_bdev_exclusive(device->bdev, device->mode); 464 device->bdev = NULL; 465 fs_devices->open_devices--; 466 } 467 if (device->writeable) { 468 list_del_init(&device->dev_alloc_list); 469 device->writeable = 0; 470 fs_devices->rw_devices--; 471 } 472 list_del_init(&device->dev_list); 473 fs_devices->num_devices--; 474 kfree(device->name); 475 kfree(device); 476 } 477 478 if (fs_devices->seed) { 479 fs_devices = fs_devices->seed; 480 goto again; 481 } 482 483 mutex_unlock(&uuid_mutex); 484 return 0; 485 } 486 487 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 488 { 489 struct btrfs_device *device; 490 491 if (--fs_devices->opened > 0) 492 return 0; 493 494 list_for_each_entry(device, &fs_devices->devices, dev_list) { 495 if (device->bdev) { 496 close_bdev_exclusive(device->bdev, device->mode); 497 fs_devices->open_devices--; 498 } 499 if (device->writeable) { 500 list_del_init(&device->dev_alloc_list); 501 fs_devices->rw_devices--; 502 } 503 504 device->bdev = NULL; 505 device->writeable = 0; 506 device->in_fs_metadata = 0; 507 } 508 WARN_ON(fs_devices->open_devices); 509 WARN_ON(fs_devices->rw_devices); 510 fs_devices->opened = 0; 511 fs_devices->seeding = 0; 512 513 return 0; 514 } 515 516 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 517 { 518 struct btrfs_fs_devices *seed_devices = NULL; 519 int ret; 520 521 mutex_lock(&uuid_mutex); 522 ret = __btrfs_close_devices(fs_devices); 523 if (!fs_devices->opened) { 524 seed_devices = fs_devices->seed; 525 fs_devices->seed = NULL; 526 } 527 mutex_unlock(&uuid_mutex); 528 529 while (seed_devices) { 530 fs_devices = seed_devices; 531 seed_devices = fs_devices->seed; 532 __btrfs_close_devices(fs_devices); 533 free_fs_devices(fs_devices); 534 } 535 return ret; 536 } 537 538 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 539 fmode_t flags, void *holder) 540 { 541 struct block_device *bdev; 542 struct list_head *head = &fs_devices->devices; 543 struct btrfs_device *device; 544 struct block_device *latest_bdev = NULL; 545 struct buffer_head *bh; 546 struct btrfs_super_block *disk_super; 547 u64 latest_devid = 0; 548 u64 latest_transid = 0; 549 u64 devid; 550 int seeding = 1; 551 int ret = 0; 552 553 list_for_each_entry(device, head, dev_list) { 554 if (device->bdev) 555 continue; 556 if (!device->name) 557 continue; 558 559 bdev = open_bdev_exclusive(device->name, flags, holder); 560 if (IS_ERR(bdev)) { 561 printk(KERN_INFO "open %s failed\n", device->name); 562 goto error; 563 } 564 set_blocksize(bdev, 4096); 565 566 bh = btrfs_read_dev_super(bdev); 567 if (!bh) 568 goto error_close; 569 570 disk_super = (struct btrfs_super_block *)bh->b_data; 571 devid = le64_to_cpu(disk_super->dev_item.devid); 572 if (devid != device->devid) 573 goto error_brelse; 574 575 if (memcmp(device->uuid, disk_super->dev_item.uuid, 576 BTRFS_UUID_SIZE)) 577 goto error_brelse; 578 579 device->generation = btrfs_super_generation(disk_super); 580 if (!latest_transid || device->generation > latest_transid) { 581 latest_devid = devid; 582 latest_transid = device->generation; 583 latest_bdev = bdev; 584 } 585 586 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 587 device->writeable = 0; 588 } else { 589 device->writeable = !bdev_read_only(bdev); 590 seeding = 0; 591 } 592 593 
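                /*
                 * The super block matched this device: keep the open
                 * handle and, for writeable devices, put it on the
                 * alloc_list so the chunk allocator can place new
                 * extents on it.
                 */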
                device->bdev = bdev;
                device->in_fs_metadata = 0;
                device->mode = flags;

                fs_devices->open_devices++;
                if (device->writeable) {
                        fs_devices->rw_devices++;
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
                }
                continue;

error_brelse:
                brelse(bh);
error_close:
                close_bdev_exclusive(bdev, FMODE_READ);
error:
                continue;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EIO;
                goto out;
        }
        fs_devices->seeding = seeding;
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_bdev;
        fs_devices->latest_devid = latest_devid;
        fs_devices->latest_trans = latest_transid;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        mutex_lock(&uuid_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                ret = __btrfs_open_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&uuid_mutex);
        return ret;
}

int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_super_block *disk_super;
        struct block_device *bdev;
        struct buffer_head *bh;
        int ret;
        u64 devid;
        u64 transid;

        mutex_lock(&uuid_mutex);

        bdev = open_bdev_exclusive(path, flags, holder);

        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
                goto error;
        }

        ret = set_blocksize(bdev, 4096);
        if (ret)
                goto error_close;
        bh = btrfs_read_dev_super(bdev);
        if (!bh) {
                ret = -EIO;
                goto error_close;
        }
        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = le64_to_cpu(disk_super->dev_item.devid);
        transid = btrfs_super_generation(disk_super);
        if (disk_super->label[0])
                printk(KERN_INFO "device label %s ", disk_super->label);
        else {
                /* FIXME, make a real uuid parser */
                printk(KERN_INFO "device fsid %llx-%llx ",
                       *(unsigned long long *)disk_super->fsid,
                       *(unsigned long long *)(disk_super->fsid + 8));
        }
        printk(KERN_CONT "devid %llu transid %llu %s\n",
               (unsigned long long)devid, (unsigned long long)transid, path);
        ret = device_list_add(path, disk_super, devid, fs_devices_ret);

        brelse(bh);
error_close:
        close_bdev_exclusive(bdev, flags);
error:
        mutex_unlock(&uuid_mutex);
        return ret;
}

/*
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 */
static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
                                         struct btrfs_device *device,
                                         u64 num_bytes, u64 *start)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent = NULL;
        struct btrfs_path *path;
        u64 hole_size = 0;
        u64 last_byte = 0;
        u64 search_start = 0;
        u64 search_end = device->total_bytes;
        int ret;
        int slot = 0;
        int start_found;
        struct extent_buffer *l;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = 2;
        start_found = 0;

        /* FIXME use last free of some kind */

        /* we don't want to overwrite the superblock on the drive,
         * so we make sure to start at an offset of at least 1MB
         */
        search_start = max((u64)1024 * 1024, search_start);

        if (root->fs_info->alloc_start +
num_bytes <= device->total_bytes) 728 search_start = max(root->fs_info->alloc_start, search_start); 729 730 key.objectid = device->devid; 731 key.offset = search_start; 732 key.type = BTRFS_DEV_EXTENT_KEY; 733 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 734 if (ret < 0) 735 goto error; 736 ret = btrfs_previous_item(root, path, 0, key.type); 737 if (ret < 0) 738 goto error; 739 l = path->nodes[0]; 740 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 741 while (1) { 742 l = path->nodes[0]; 743 slot = path->slots[0]; 744 if (slot >= btrfs_header_nritems(l)) { 745 ret = btrfs_next_leaf(root, path); 746 if (ret == 0) 747 continue; 748 if (ret < 0) 749 goto error; 750 no_more_items: 751 if (!start_found) { 752 if (search_start >= search_end) { 753 ret = -ENOSPC; 754 goto error; 755 } 756 *start = search_start; 757 start_found = 1; 758 goto check_pending; 759 } 760 *start = last_byte > search_start ? 761 last_byte : search_start; 762 if (search_end <= *start) { 763 ret = -ENOSPC; 764 goto error; 765 } 766 goto check_pending; 767 } 768 btrfs_item_key_to_cpu(l, &key, slot); 769 770 if (key.objectid < device->devid) 771 goto next; 772 773 if (key.objectid > device->devid) 774 goto no_more_items; 775 776 if (key.offset >= search_start && key.offset > last_byte && 777 start_found) { 778 if (last_byte < search_start) 779 last_byte = search_start; 780 hole_size = key.offset - last_byte; 781 if (key.offset > last_byte && 782 hole_size >= num_bytes) { 783 *start = last_byte; 784 goto check_pending; 785 } 786 } 787 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 788 goto next; 789 790 start_found = 1; 791 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 792 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); 793 next: 794 path->slots[0]++; 795 cond_resched(); 796 } 797 check_pending: 798 /* we have to make sure we didn't find an extent that has already 799 * been allocated by the map tree or the original allocation 800 */ 801 BUG_ON(*start < search_start); 802 803 if (*start + num_bytes > search_end) { 804 ret = -ENOSPC; 805 goto error; 806 } 807 /* check for pending inserts here */ 808 ret = 0; 809 810 error: 811 btrfs_free_path(path); 812 return ret; 813 } 814 815 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 816 struct btrfs_device *device, 817 u64 start) 818 { 819 int ret; 820 struct btrfs_path *path; 821 struct btrfs_root *root = device->dev_root; 822 struct btrfs_key key; 823 struct btrfs_key found_key; 824 struct extent_buffer *leaf = NULL; 825 struct btrfs_dev_extent *extent = NULL; 826 827 path = btrfs_alloc_path(); 828 if (!path) 829 return -ENOMEM; 830 831 key.objectid = device->devid; 832 key.offset = start; 833 key.type = BTRFS_DEV_EXTENT_KEY; 834 835 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 836 if (ret > 0) { 837 ret = btrfs_previous_item(root, path, key.objectid, 838 BTRFS_DEV_EXTENT_KEY); 839 BUG_ON(ret); 840 leaf = path->nodes[0]; 841 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 842 extent = btrfs_item_ptr(leaf, path->slots[0], 843 struct btrfs_dev_extent); 844 BUG_ON(found_key.offset > start || found_key.offset + 845 btrfs_dev_extent_length(leaf, extent) < start); 846 ret = 0; 847 } else if (ret == 0) { 848 leaf = path->nodes[0]; 849 extent = btrfs_item_ptr(leaf, path->slots[0], 850 struct btrfs_dev_extent); 851 } 852 BUG_ON(ret); 853 854 if (device->bytes_used > 0) 855 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 856 ret = btrfs_del_item(trans, root, path); 857 BUG_ON(ret); 858 859 
btrfs_free_path(path); 860 return ret; 861 } 862 863 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 864 struct btrfs_device *device, 865 u64 chunk_tree, u64 chunk_objectid, 866 u64 chunk_offset, u64 start, u64 num_bytes) 867 { 868 int ret; 869 struct btrfs_path *path; 870 struct btrfs_root *root = device->dev_root; 871 struct btrfs_dev_extent *extent; 872 struct extent_buffer *leaf; 873 struct btrfs_key key; 874 875 WARN_ON(!device->in_fs_metadata); 876 path = btrfs_alloc_path(); 877 if (!path) 878 return -ENOMEM; 879 880 key.objectid = device->devid; 881 key.offset = start; 882 key.type = BTRFS_DEV_EXTENT_KEY; 883 ret = btrfs_insert_empty_item(trans, root, path, &key, 884 sizeof(*extent)); 885 BUG_ON(ret); 886 887 leaf = path->nodes[0]; 888 extent = btrfs_item_ptr(leaf, path->slots[0], 889 struct btrfs_dev_extent); 890 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); 891 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); 892 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 893 894 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, 895 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), 896 BTRFS_UUID_SIZE); 897 898 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 899 btrfs_mark_buffer_dirty(leaf); 900 btrfs_free_path(path); 901 return ret; 902 } 903 904 static noinline int find_next_chunk(struct btrfs_root *root, 905 u64 objectid, u64 *offset) 906 { 907 struct btrfs_path *path; 908 int ret; 909 struct btrfs_key key; 910 struct btrfs_chunk *chunk; 911 struct btrfs_key found_key; 912 913 path = btrfs_alloc_path(); 914 BUG_ON(!path); 915 916 key.objectid = objectid; 917 key.offset = (u64)-1; 918 key.type = BTRFS_CHUNK_ITEM_KEY; 919 920 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 921 if (ret < 0) 922 goto error; 923 924 BUG_ON(ret == 0); 925 926 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); 927 if (ret) { 928 *offset = 0; 929 } else { 930 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 931 path->slots[0]); 932 if (found_key.objectid != objectid) 933 *offset = 0; 934 else { 935 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], 936 struct btrfs_chunk); 937 *offset = found_key.offset + 938 btrfs_chunk_length(path->nodes[0], chunk); 939 } 940 } 941 ret = 0; 942 error: 943 btrfs_free_path(path); 944 return ret; 945 } 946 947 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) 948 { 949 int ret; 950 struct btrfs_key key; 951 struct btrfs_key found_key; 952 struct btrfs_path *path; 953 954 root = root->fs_info->chunk_root; 955 956 path = btrfs_alloc_path(); 957 if (!path) 958 return -ENOMEM; 959 960 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 961 key.type = BTRFS_DEV_ITEM_KEY; 962 key.offset = (u64)-1; 963 964 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 965 if (ret < 0) 966 goto error; 967 968 BUG_ON(ret == 0); 969 970 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, 971 BTRFS_DEV_ITEM_KEY); 972 if (ret) { 973 *objectid = 1; 974 } else { 975 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 976 path->slots[0]); 977 *objectid = found_key.offset + 1; 978 } 979 ret = 0; 980 error: 981 btrfs_free_path(path); 982 return ret; 983 } 984 985 /* 986 * the device information is stored in the chunk root 987 * the btrfs_device struct should be fully filled in 988 */ 989 int btrfs_add_device(struct btrfs_trans_handle *trans, 990 struct btrfs_root *root, 991 struct btrfs_device *device) 992 { 993 int ret; 994 struct btrfs_path *path; 995 struct btrfs_dev_item 
*dev_item; 996 struct extent_buffer *leaf; 997 struct btrfs_key key; 998 unsigned long ptr; 999 1000 root = root->fs_info->chunk_root; 1001 1002 path = btrfs_alloc_path(); 1003 if (!path) 1004 return -ENOMEM; 1005 1006 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1007 key.type = BTRFS_DEV_ITEM_KEY; 1008 key.offset = device->devid; 1009 1010 ret = btrfs_insert_empty_item(trans, root, path, &key, 1011 sizeof(*dev_item)); 1012 if (ret) 1013 goto out; 1014 1015 leaf = path->nodes[0]; 1016 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1017 1018 btrfs_set_device_id(leaf, dev_item, device->devid); 1019 btrfs_set_device_generation(leaf, dev_item, 0); 1020 btrfs_set_device_type(leaf, dev_item, device->type); 1021 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1022 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1023 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1024 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1025 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1026 btrfs_set_device_group(leaf, dev_item, 0); 1027 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1028 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1029 btrfs_set_device_start_offset(leaf, dev_item, 0); 1030 1031 ptr = (unsigned long)btrfs_device_uuid(dev_item); 1032 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1033 ptr = (unsigned long)btrfs_device_fsid(dev_item); 1034 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); 1035 btrfs_mark_buffer_dirty(leaf); 1036 1037 ret = 0; 1038 out: 1039 btrfs_free_path(path); 1040 return ret; 1041 } 1042 1043 static int btrfs_rm_dev_item(struct btrfs_root *root, 1044 struct btrfs_device *device) 1045 { 1046 int ret; 1047 struct btrfs_path *path; 1048 struct btrfs_key key; 1049 struct btrfs_trans_handle *trans; 1050 1051 root = root->fs_info->chunk_root; 1052 1053 path = btrfs_alloc_path(); 1054 if (!path) 1055 return -ENOMEM; 1056 1057 trans = btrfs_start_transaction(root, 1); 1058 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1059 key.type = BTRFS_DEV_ITEM_KEY; 1060 key.offset = device->devid; 1061 lock_chunks(root); 1062 1063 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1064 if (ret < 0) 1065 goto out; 1066 1067 if (ret > 0) { 1068 ret = -ENOENT; 1069 goto out; 1070 } 1071 1072 ret = btrfs_del_item(trans, root, path); 1073 if (ret) 1074 goto out; 1075 out: 1076 btrfs_free_path(path); 1077 unlock_chunks(root); 1078 btrfs_commit_transaction(trans, root); 1079 return ret; 1080 } 1081 1082 int btrfs_rm_device(struct btrfs_root *root, char *device_path) 1083 { 1084 struct btrfs_device *device; 1085 struct btrfs_device *next_device; 1086 struct block_device *bdev; 1087 struct buffer_head *bh = NULL; 1088 struct btrfs_super_block *disk_super; 1089 u64 all_avail; 1090 u64 devid; 1091 u64 num_devices; 1092 u8 *dev_uuid; 1093 int ret = 0; 1094 1095 mutex_lock(&uuid_mutex); 1096 mutex_lock(&root->fs_info->volume_mutex); 1097 1098 all_avail = root->fs_info->avail_data_alloc_bits | 1099 root->fs_info->avail_system_alloc_bits | 1100 root->fs_info->avail_metadata_alloc_bits; 1101 1102 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1103 root->fs_info->fs_devices->rw_devices <= 4) { 1104 printk(KERN_ERR "btrfs: unable to go below four devices " 1105 "on raid10\n"); 1106 ret = -EINVAL; 1107 goto out; 1108 } 1109 1110 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1111 root->fs_info->fs_devices->rw_devices <= 2) { 1112 printk(KERN_ERR "btrfs: unable to go below two " 
1113 "devices on raid1\n"); 1114 ret = -EINVAL; 1115 goto out; 1116 } 1117 1118 if (strcmp(device_path, "missing") == 0) { 1119 struct list_head *devices; 1120 struct btrfs_device *tmp; 1121 1122 device = NULL; 1123 devices = &root->fs_info->fs_devices->devices; 1124 list_for_each_entry(tmp, devices, dev_list) { 1125 if (tmp->in_fs_metadata && !tmp->bdev) { 1126 device = tmp; 1127 break; 1128 } 1129 } 1130 bdev = NULL; 1131 bh = NULL; 1132 disk_super = NULL; 1133 if (!device) { 1134 printk(KERN_ERR "btrfs: no missing devices found to " 1135 "remove\n"); 1136 goto out; 1137 } 1138 } else { 1139 bdev = open_bdev_exclusive(device_path, FMODE_READ, 1140 root->fs_info->bdev_holder); 1141 if (IS_ERR(bdev)) { 1142 ret = PTR_ERR(bdev); 1143 goto out; 1144 } 1145 1146 set_blocksize(bdev, 4096); 1147 bh = btrfs_read_dev_super(bdev); 1148 if (!bh) { 1149 ret = -EIO; 1150 goto error_close; 1151 } 1152 disk_super = (struct btrfs_super_block *)bh->b_data; 1153 devid = le64_to_cpu(disk_super->dev_item.devid); 1154 dev_uuid = disk_super->dev_item.uuid; 1155 device = btrfs_find_device(root, devid, dev_uuid, 1156 disk_super->fsid); 1157 if (!device) { 1158 ret = -ENOENT; 1159 goto error_brelse; 1160 } 1161 } 1162 1163 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1164 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1165 "device\n"); 1166 ret = -EINVAL; 1167 goto error_brelse; 1168 } 1169 1170 if (device->writeable) { 1171 list_del_init(&device->dev_alloc_list); 1172 root->fs_info->fs_devices->rw_devices--; 1173 } 1174 1175 ret = btrfs_shrink_device(device, 0); 1176 if (ret) 1177 goto error_brelse; 1178 1179 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1180 if (ret) 1181 goto error_brelse; 1182 1183 device->in_fs_metadata = 0; 1184 list_del_init(&device->dev_list); 1185 device->fs_devices->num_devices--; 1186 1187 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1188 struct btrfs_device, dev_list); 1189 if (device->bdev == root->fs_info->sb->s_bdev) 1190 root->fs_info->sb->s_bdev = next_device->bdev; 1191 if (device->bdev == root->fs_info->fs_devices->latest_bdev) 1192 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1193 1194 if (device->bdev) { 1195 close_bdev_exclusive(device->bdev, device->mode); 1196 device->bdev = NULL; 1197 device->fs_devices->open_devices--; 1198 } 1199 1200 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1201 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1202 1203 if (device->fs_devices->open_devices == 0) { 1204 struct btrfs_fs_devices *fs_devices; 1205 fs_devices = root->fs_info->fs_devices; 1206 while (fs_devices) { 1207 if (fs_devices->seed == device->fs_devices) 1208 break; 1209 fs_devices = fs_devices->seed; 1210 } 1211 fs_devices->seed = device->fs_devices->seed; 1212 device->fs_devices->seed = NULL; 1213 __btrfs_close_devices(device->fs_devices); 1214 free_fs_devices(device->fs_devices); 1215 } 1216 1217 /* 1218 * at this point, the device is zero sized. 
We want to
 * remove it from the devices list and zero out the old super
 */
        if (device->writeable) {
                /* make sure this device isn't detected as part of
                 * the FS anymore
                 */
                memset(&disk_super->magic, 0, sizeof(disk_super->magic));
                set_buffer_dirty(bh);
                sync_dirty_buffer(bh);
        }

        kfree(device->name);
        kfree(device);
        ret = 0;

error_brelse:
        brelse(bh);
error_close:
        if (bdev)
                close_bdev_exclusive(bdev, FMODE_READ);
out:
        mutex_unlock(&root->fs_info->volume_mutex);
        mutex_unlock(&uuid_mutex);
        return ret;
}

/*
 * does all the dirty work required for changing the file system's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root)
{
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
        struct btrfs_fs_devices *old_devices;
        struct btrfs_fs_devices *seed_devices;
        struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
        struct btrfs_device *device;
        u64 super_flags;

        BUG_ON(!mutex_is_locked(&uuid_mutex));
        if (!fs_devices->seeding)
                return -EINVAL;

        seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
        if (!seed_devices)
                return -ENOMEM;

        old_devices = clone_fs_devices(fs_devices);
        if (IS_ERR(old_devices)) {
                kfree(seed_devices);
                return PTR_ERR(old_devices);
        }

        list_add(&old_devices->list, &fs_uuids);

        memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
        seed_devices->opened = 1;
        INIT_LIST_HEAD(&seed_devices->devices);
        INIT_LIST_HEAD(&seed_devices->alloc_list);
        list_splice_init(&fs_devices->devices, &seed_devices->devices);
        list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
        list_for_each_entry(device, &seed_devices->devices, dev_list) {
                device->fs_devices = seed_devices;
        }

        fs_devices->seeding = 0;
        fs_devices->num_devices = 0;
        fs_devices->open_devices = 0;
        fs_devices->seed = seed_devices;

        generate_random_uuid(fs_devices->fsid);
        memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
        memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
        super_flags = btrfs_super_flags(disk_super) &
                      ~BTRFS_SUPER_FLAG_SEEDING;
        btrfs_set_super_flags(disk_super, super_flags);

        return 0;
}

/*
 * store the expected generation for seed devices in device items.
1301 */ 1302 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 1303 struct btrfs_root *root) 1304 { 1305 struct btrfs_path *path; 1306 struct extent_buffer *leaf; 1307 struct btrfs_dev_item *dev_item; 1308 struct btrfs_device *device; 1309 struct btrfs_key key; 1310 u8 fs_uuid[BTRFS_UUID_SIZE]; 1311 u8 dev_uuid[BTRFS_UUID_SIZE]; 1312 u64 devid; 1313 int ret; 1314 1315 path = btrfs_alloc_path(); 1316 if (!path) 1317 return -ENOMEM; 1318 1319 root = root->fs_info->chunk_root; 1320 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1321 key.offset = 0; 1322 key.type = BTRFS_DEV_ITEM_KEY; 1323 1324 while (1) { 1325 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1326 if (ret < 0) 1327 goto error; 1328 1329 leaf = path->nodes[0]; 1330 next_slot: 1331 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1332 ret = btrfs_next_leaf(root, path); 1333 if (ret > 0) 1334 break; 1335 if (ret < 0) 1336 goto error; 1337 leaf = path->nodes[0]; 1338 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1339 btrfs_release_path(root, path); 1340 continue; 1341 } 1342 1343 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1344 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 1345 key.type != BTRFS_DEV_ITEM_KEY) 1346 break; 1347 1348 dev_item = btrfs_item_ptr(leaf, path->slots[0], 1349 struct btrfs_dev_item); 1350 devid = btrfs_device_id(leaf, dev_item); 1351 read_extent_buffer(leaf, dev_uuid, 1352 (unsigned long)btrfs_device_uuid(dev_item), 1353 BTRFS_UUID_SIZE); 1354 read_extent_buffer(leaf, fs_uuid, 1355 (unsigned long)btrfs_device_fsid(dev_item), 1356 BTRFS_UUID_SIZE); 1357 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1358 BUG_ON(!device); 1359 1360 if (device->fs_devices->seeding) { 1361 btrfs_set_device_generation(leaf, dev_item, 1362 device->generation); 1363 btrfs_mark_buffer_dirty(leaf); 1364 } 1365 1366 path->slots[0]++; 1367 goto next_slot; 1368 } 1369 ret = 0; 1370 error: 1371 btrfs_free_path(path); 1372 return ret; 1373 } 1374 1375 int btrfs_init_new_device(struct btrfs_root *root, char *device_path) 1376 { 1377 struct btrfs_trans_handle *trans; 1378 struct btrfs_device *device; 1379 struct block_device *bdev; 1380 struct list_head *devices; 1381 struct super_block *sb = root->fs_info->sb; 1382 u64 total_bytes; 1383 int seeding_dev = 0; 1384 int ret = 0; 1385 1386 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1387 return -EINVAL; 1388 1389 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); 1390 if (!bdev) 1391 return -EIO; 1392 1393 if (root->fs_info->fs_devices->seeding) { 1394 seeding_dev = 1; 1395 down_write(&sb->s_umount); 1396 mutex_lock(&uuid_mutex); 1397 } 1398 1399 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1400 mutex_lock(&root->fs_info->volume_mutex); 1401 1402 devices = &root->fs_info->fs_devices->devices; 1403 list_for_each_entry(device, devices, dev_list) { 1404 if (device->bdev == bdev) { 1405 ret = -EEXIST; 1406 goto error; 1407 } 1408 } 1409 1410 device = kzalloc(sizeof(*device), GFP_NOFS); 1411 if (!device) { 1412 /* we can safely leave the fs_devices entry around */ 1413 ret = -ENOMEM; 1414 goto error; 1415 } 1416 1417 device->name = kstrdup(device_path, GFP_NOFS); 1418 if (!device->name) { 1419 kfree(device); 1420 ret = -ENOMEM; 1421 goto error; 1422 } 1423 1424 ret = find_next_devid(root, &device->devid); 1425 if (ret) { 1426 kfree(device); 1427 goto error; 1428 } 1429 1430 trans = btrfs_start_transaction(root, 1); 1431 lock_chunks(root); 1432 1433 device->barriers = 1; 1434 device->writeable = 1; 1435 
device->work.func = pending_bios_fn; 1436 generate_random_uuid(device->uuid); 1437 spin_lock_init(&device->io_lock); 1438 device->generation = trans->transid; 1439 device->io_width = root->sectorsize; 1440 device->io_align = root->sectorsize; 1441 device->sector_size = root->sectorsize; 1442 device->total_bytes = i_size_read(bdev->bd_inode); 1443 device->disk_total_bytes = device->total_bytes; 1444 device->dev_root = root->fs_info->dev_root; 1445 device->bdev = bdev; 1446 device->in_fs_metadata = 1; 1447 device->mode = 0; 1448 set_blocksize(device->bdev, 4096); 1449 1450 if (seeding_dev) { 1451 sb->s_flags &= ~MS_RDONLY; 1452 ret = btrfs_prepare_sprout(trans, root); 1453 BUG_ON(ret); 1454 } 1455 1456 device->fs_devices = root->fs_info->fs_devices; 1457 list_add(&device->dev_list, &root->fs_info->fs_devices->devices); 1458 list_add(&device->dev_alloc_list, 1459 &root->fs_info->fs_devices->alloc_list); 1460 root->fs_info->fs_devices->num_devices++; 1461 root->fs_info->fs_devices->open_devices++; 1462 root->fs_info->fs_devices->rw_devices++; 1463 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1464 1465 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1466 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1467 total_bytes + device->total_bytes); 1468 1469 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1470 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1471 total_bytes + 1); 1472 1473 if (seeding_dev) { 1474 ret = init_first_rw_device(trans, root, device); 1475 BUG_ON(ret); 1476 ret = btrfs_finish_sprout(trans, root); 1477 BUG_ON(ret); 1478 } else { 1479 ret = btrfs_add_device(trans, root, device); 1480 } 1481 1482 /* 1483 * we've got more storage, clear any full flags on the space 1484 * infos 1485 */ 1486 btrfs_clear_space_info_full(root->fs_info); 1487 1488 unlock_chunks(root); 1489 btrfs_commit_transaction(trans, root); 1490 1491 if (seeding_dev) { 1492 mutex_unlock(&uuid_mutex); 1493 up_write(&sb->s_umount); 1494 1495 ret = btrfs_relocate_sys_chunks(root); 1496 BUG_ON(ret); 1497 } 1498 out: 1499 mutex_unlock(&root->fs_info->volume_mutex); 1500 return ret; 1501 error: 1502 close_bdev_exclusive(bdev, 0); 1503 if (seeding_dev) { 1504 mutex_unlock(&uuid_mutex); 1505 up_write(&sb->s_umount); 1506 } 1507 goto out; 1508 } 1509 1510 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 1511 struct btrfs_device *device) 1512 { 1513 int ret; 1514 struct btrfs_path *path; 1515 struct btrfs_root *root; 1516 struct btrfs_dev_item *dev_item; 1517 struct extent_buffer *leaf; 1518 struct btrfs_key key; 1519 1520 root = device->dev_root->fs_info->chunk_root; 1521 1522 path = btrfs_alloc_path(); 1523 if (!path) 1524 return -ENOMEM; 1525 1526 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1527 key.type = BTRFS_DEV_ITEM_KEY; 1528 key.offset = device->devid; 1529 1530 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1531 if (ret < 0) 1532 goto out; 1533 1534 if (ret > 0) { 1535 ret = -ENOENT; 1536 goto out; 1537 } 1538 1539 leaf = path->nodes[0]; 1540 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1541 1542 btrfs_set_device_id(leaf, dev_item, device->devid); 1543 btrfs_set_device_type(leaf, dev_item, device->type); 1544 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1545 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1546 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1547 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); 
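        /*
         * note: the item's total_bytes above is taken from
         * disk_total_bytes, which is what lets btrfs_shrink_device()
         * lower the in-memory total_bytes first and only change the
         * on-disk size once relocation has finished.
         */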
1548 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1549 btrfs_mark_buffer_dirty(leaf); 1550 1551 out: 1552 btrfs_free_path(path); 1553 return ret; 1554 } 1555 1556 static int __btrfs_grow_device(struct btrfs_trans_handle *trans, 1557 struct btrfs_device *device, u64 new_size) 1558 { 1559 struct btrfs_super_block *super_copy = 1560 &device->dev_root->fs_info->super_copy; 1561 u64 old_total = btrfs_super_total_bytes(super_copy); 1562 u64 diff = new_size - device->total_bytes; 1563 1564 if (!device->writeable) 1565 return -EACCES; 1566 if (new_size <= device->total_bytes) 1567 return -EINVAL; 1568 1569 btrfs_set_super_total_bytes(super_copy, old_total + diff); 1570 device->fs_devices->total_rw_bytes += diff; 1571 1572 device->total_bytes = new_size; 1573 btrfs_clear_space_info_full(device->dev_root->fs_info); 1574 1575 return btrfs_update_device(trans, device); 1576 } 1577 1578 int btrfs_grow_device(struct btrfs_trans_handle *trans, 1579 struct btrfs_device *device, u64 new_size) 1580 { 1581 int ret; 1582 lock_chunks(device->dev_root); 1583 ret = __btrfs_grow_device(trans, device, new_size); 1584 unlock_chunks(device->dev_root); 1585 return ret; 1586 } 1587 1588 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 1589 struct btrfs_root *root, 1590 u64 chunk_tree, u64 chunk_objectid, 1591 u64 chunk_offset) 1592 { 1593 int ret; 1594 struct btrfs_path *path; 1595 struct btrfs_key key; 1596 1597 root = root->fs_info->chunk_root; 1598 path = btrfs_alloc_path(); 1599 if (!path) 1600 return -ENOMEM; 1601 1602 key.objectid = chunk_objectid; 1603 key.offset = chunk_offset; 1604 key.type = BTRFS_CHUNK_ITEM_KEY; 1605 1606 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1607 BUG_ON(ret); 1608 1609 ret = btrfs_del_item(trans, root, path); 1610 BUG_ON(ret); 1611 1612 btrfs_free_path(path); 1613 return 0; 1614 } 1615 1616 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1617 chunk_offset) 1618 { 1619 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1620 struct btrfs_disk_key *disk_key; 1621 struct btrfs_chunk *chunk; 1622 u8 *ptr; 1623 int ret = 0; 1624 u32 num_stripes; 1625 u32 array_size; 1626 u32 len = 0; 1627 u32 cur; 1628 struct btrfs_key key; 1629 1630 array_size = btrfs_super_sys_array_size(super_copy); 1631 1632 ptr = super_copy->sys_chunk_array; 1633 cur = 0; 1634 1635 while (cur < array_size) { 1636 disk_key = (struct btrfs_disk_key *)ptr; 1637 btrfs_disk_key_to_cpu(&key, disk_key); 1638 1639 len = sizeof(*disk_key); 1640 1641 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 1642 chunk = (struct btrfs_chunk *)(ptr + len); 1643 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 1644 len += btrfs_chunk_item_size(num_stripes); 1645 } else { 1646 ret = -EIO; 1647 break; 1648 } 1649 if (key.objectid == chunk_objectid && 1650 key.offset == chunk_offset) { 1651 memmove(ptr, ptr + len, array_size - (cur + len)); 1652 array_size -= len; 1653 btrfs_set_super_sys_array_size(super_copy, array_size); 1654 } else { 1655 ptr += len; 1656 cur += len; 1657 } 1658 } 1659 return ret; 1660 } 1661 1662 static int btrfs_relocate_chunk(struct btrfs_root *root, 1663 u64 chunk_tree, u64 chunk_objectid, 1664 u64 chunk_offset) 1665 { 1666 struct extent_map_tree *em_tree; 1667 struct btrfs_root *extent_root; 1668 struct btrfs_trans_handle *trans; 1669 struct extent_map *em; 1670 struct map_lookup *map; 1671 int ret; 1672 int i; 1673 1674 printk(KERN_INFO "btrfs relocating chunk %llu\n", 1675 (unsigned long long)chunk_offset); 1676 root = 
root->fs_info->chunk_root; 1677 extent_root = root->fs_info->extent_root; 1678 em_tree = &root->fs_info->mapping_tree.map_tree; 1679 1680 /* step one, relocate all the extents inside this chunk */ 1681 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1682 BUG_ON(ret); 1683 1684 trans = btrfs_start_transaction(root, 1); 1685 BUG_ON(!trans); 1686 1687 lock_chunks(root); 1688 1689 /* 1690 * step two, delete the device extents and the 1691 * chunk tree entries 1692 */ 1693 spin_lock(&em_tree->lock); 1694 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 1695 spin_unlock(&em_tree->lock); 1696 1697 BUG_ON(em->start > chunk_offset || 1698 em->start + em->len < chunk_offset); 1699 map = (struct map_lookup *)em->bdev; 1700 1701 for (i = 0; i < map->num_stripes; i++) { 1702 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, 1703 map->stripes[i].physical); 1704 BUG_ON(ret); 1705 1706 if (map->stripes[i].dev) { 1707 ret = btrfs_update_device(trans, map->stripes[i].dev); 1708 BUG_ON(ret); 1709 } 1710 } 1711 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, 1712 chunk_offset); 1713 1714 BUG_ON(ret); 1715 1716 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 1717 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 1718 BUG_ON(ret); 1719 } 1720 1721 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 1722 BUG_ON(ret); 1723 1724 spin_lock(&em_tree->lock); 1725 remove_extent_mapping(em_tree, em); 1726 spin_unlock(&em_tree->lock); 1727 1728 kfree(map); 1729 em->bdev = NULL; 1730 1731 /* once for the tree */ 1732 free_extent_map(em); 1733 /* once for us */ 1734 free_extent_map(em); 1735 1736 unlock_chunks(root); 1737 btrfs_end_transaction(trans, root); 1738 return 0; 1739 } 1740 1741 static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 1742 { 1743 struct btrfs_root *chunk_root = root->fs_info->chunk_root; 1744 struct btrfs_path *path; 1745 struct extent_buffer *leaf; 1746 struct btrfs_chunk *chunk; 1747 struct btrfs_key key; 1748 struct btrfs_key found_key; 1749 u64 chunk_tree = chunk_root->root_key.objectid; 1750 u64 chunk_type; 1751 int ret; 1752 1753 path = btrfs_alloc_path(); 1754 if (!path) 1755 return -ENOMEM; 1756 1757 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 1758 key.offset = (u64)-1; 1759 key.type = BTRFS_CHUNK_ITEM_KEY; 1760 1761 while (1) { 1762 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 1763 if (ret < 0) 1764 goto error; 1765 BUG_ON(ret == 0); 1766 1767 ret = btrfs_previous_item(chunk_root, path, key.objectid, 1768 key.type); 1769 if (ret < 0) 1770 goto error; 1771 if (ret > 0) 1772 break; 1773 1774 leaf = path->nodes[0]; 1775 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1776 1777 chunk = btrfs_item_ptr(leaf, path->slots[0], 1778 struct btrfs_chunk); 1779 chunk_type = btrfs_chunk_type(leaf, chunk); 1780 btrfs_release_path(chunk_root, path); 1781 1782 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 1783 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 1784 found_key.objectid, 1785 found_key.offset); 1786 BUG_ON(ret); 1787 } 1788 1789 if (found_key.offset == 0) 1790 break; 1791 key.offset = found_key.offset - 1; 1792 } 1793 ret = 0; 1794 error: 1795 btrfs_free_path(path); 1796 return ret; 1797 } 1798 1799 static u64 div_factor(u64 num, int factor) 1800 { 1801 if (factor == 10) 1802 return num; 1803 num *= factor; 1804 do_div(num, 10); 1805 return num; 1806 } 1807 1808 int btrfs_balance(struct btrfs_root *dev_root) 1809 { 1810 int ret; 1811 struct list_head *devices = 
&dev_root->fs_info->fs_devices->devices; 1812 struct btrfs_device *device; 1813 u64 old_size; 1814 u64 size_to_free; 1815 struct btrfs_path *path; 1816 struct btrfs_key key; 1817 struct btrfs_chunk *chunk; 1818 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; 1819 struct btrfs_trans_handle *trans; 1820 struct btrfs_key found_key; 1821 1822 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 1823 return -EROFS; 1824 1825 mutex_lock(&dev_root->fs_info->volume_mutex); 1826 dev_root = dev_root->fs_info->dev_root; 1827 1828 /* step one make some room on all the devices */ 1829 list_for_each_entry(device, devices, dev_list) { 1830 old_size = device->total_bytes; 1831 size_to_free = div_factor(old_size, 1); 1832 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 1833 if (!device->writeable || 1834 device->total_bytes - device->bytes_used > size_to_free) 1835 continue; 1836 1837 ret = btrfs_shrink_device(device, old_size - size_to_free); 1838 BUG_ON(ret); 1839 1840 trans = btrfs_start_transaction(dev_root, 1); 1841 BUG_ON(!trans); 1842 1843 ret = btrfs_grow_device(trans, device, old_size); 1844 BUG_ON(ret); 1845 1846 btrfs_end_transaction(trans, dev_root); 1847 } 1848 1849 /* step two, relocate all the chunks */ 1850 path = btrfs_alloc_path(); 1851 BUG_ON(!path); 1852 1853 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 1854 key.offset = (u64)-1; 1855 key.type = BTRFS_CHUNK_ITEM_KEY; 1856 1857 while (1) { 1858 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 1859 if (ret < 0) 1860 goto error; 1861 1862 /* 1863 * this shouldn't happen, it means the last relocate 1864 * failed 1865 */ 1866 if (ret == 0) 1867 break; 1868 1869 ret = btrfs_previous_item(chunk_root, path, 0, 1870 BTRFS_CHUNK_ITEM_KEY); 1871 if (ret) 1872 break; 1873 1874 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1875 path->slots[0]); 1876 if (found_key.objectid != key.objectid) 1877 break; 1878 1879 chunk = btrfs_item_ptr(path->nodes[0], 1880 path->slots[0], 1881 struct btrfs_chunk); 1882 key.offset = found_key.offset; 1883 /* chunk zero is special */ 1884 if (key.offset == 0) 1885 break; 1886 1887 btrfs_release_path(chunk_root, path); 1888 ret = btrfs_relocate_chunk(chunk_root, 1889 chunk_root->root_key.objectid, 1890 found_key.objectid, 1891 found_key.offset); 1892 BUG_ON(ret); 1893 } 1894 ret = 0; 1895 error: 1896 btrfs_free_path(path); 1897 mutex_unlock(&dev_root->fs_info->volume_mutex); 1898 return ret; 1899 } 1900 1901 /* 1902 * shrinking a device means finding all of the device extents past 1903 * the new size, and then following the back refs to the chunks. 
1904 * The chunk relocation code actually frees the device extent 1905 */ 1906 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 1907 { 1908 struct btrfs_trans_handle *trans; 1909 struct btrfs_root *root = device->dev_root; 1910 struct btrfs_dev_extent *dev_extent = NULL; 1911 struct btrfs_path *path; 1912 u64 length; 1913 u64 chunk_tree; 1914 u64 chunk_objectid; 1915 u64 chunk_offset; 1916 int ret; 1917 int slot; 1918 struct extent_buffer *l; 1919 struct btrfs_key key; 1920 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1921 u64 old_total = btrfs_super_total_bytes(super_copy); 1922 u64 diff = device->total_bytes - new_size; 1923 1924 if (new_size >= device->total_bytes) 1925 return -EINVAL; 1926 1927 path = btrfs_alloc_path(); 1928 if (!path) 1929 return -ENOMEM; 1930 1931 trans = btrfs_start_transaction(root, 1); 1932 if (!trans) { 1933 ret = -ENOMEM; 1934 goto done; 1935 } 1936 1937 path->reada = 2; 1938 1939 lock_chunks(root); 1940 1941 device->total_bytes = new_size; 1942 if (device->writeable) 1943 device->fs_devices->total_rw_bytes -= diff; 1944 unlock_chunks(root); 1945 btrfs_end_transaction(trans, root); 1946 1947 key.objectid = device->devid; 1948 key.offset = (u64)-1; 1949 key.type = BTRFS_DEV_EXTENT_KEY; 1950 1951 while (1) { 1952 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1953 if (ret < 0) 1954 goto done; 1955 1956 ret = btrfs_previous_item(root, path, 0, key.type); 1957 if (ret < 0) 1958 goto done; 1959 if (ret) { 1960 ret = 0; 1961 goto done; 1962 } 1963 1964 l = path->nodes[0]; 1965 slot = path->slots[0]; 1966 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 1967 1968 if (key.objectid != device->devid) 1969 goto done; 1970 1971 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1972 length = btrfs_dev_extent_length(l, dev_extent); 1973 1974 if (key.offset + length <= new_size) 1975 break; 1976 1977 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 1978 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 1979 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 1980 btrfs_release_path(root, path); 1981 1982 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 1983 chunk_offset); 1984 if (ret) 1985 goto done; 1986 } 1987 1988 /* Shrinking succeeded, else we would be at "done". */ 1989 trans = btrfs_start_transaction(root, 1); 1990 if (!trans) { 1991 ret = -ENOMEM; 1992 goto done; 1993 } 1994 lock_chunks(root); 1995 1996 device->disk_total_bytes = new_size; 1997 /* Now btrfs_update_device() will change the on-disk size. 
*/ 1998 ret = btrfs_update_device(trans, device); 1999 if (ret) { 2000 unlock_chunks(root); 2001 btrfs_end_transaction(trans, root); 2002 goto done; 2003 } 2004 WARN_ON(diff > old_total); 2005 btrfs_set_super_total_bytes(super_copy, old_total - diff); 2006 unlock_chunks(root); 2007 btrfs_end_transaction(trans, root); 2008 done: 2009 btrfs_free_path(path); 2010 return ret; 2011 } 2012 2013 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, 2014 struct btrfs_root *root, 2015 struct btrfs_key *key, 2016 struct btrfs_chunk *chunk, int item_size) 2017 { 2018 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2019 struct btrfs_disk_key disk_key; 2020 u32 array_size; 2021 u8 *ptr; 2022 2023 array_size = btrfs_super_sys_array_size(super_copy); 2024 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 2025 return -EFBIG; 2026 2027 ptr = super_copy->sys_chunk_array + array_size; 2028 btrfs_cpu_key_to_disk(&disk_key, key); 2029 memcpy(ptr, &disk_key, sizeof(disk_key)); 2030 ptr += sizeof(disk_key); 2031 memcpy(ptr, chunk, item_size); 2032 item_size += sizeof(disk_key); 2033 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 2034 return 0; 2035 } 2036 2037 static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, 2038 int num_stripes, int sub_stripes) 2039 { 2040 if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) 2041 return calc_size; 2042 else if (type & BTRFS_BLOCK_GROUP_RAID10) 2043 return calc_size * (num_stripes / sub_stripes); 2044 else 2045 return calc_size * num_stripes; 2046 } 2047 2048 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2049 struct btrfs_root *extent_root, 2050 struct map_lookup **map_ret, 2051 u64 *num_bytes, u64 *stripe_size, 2052 u64 start, u64 type) 2053 { 2054 struct btrfs_fs_info *info = extent_root->fs_info; 2055 struct btrfs_device *device = NULL; 2056 struct btrfs_fs_devices *fs_devices = info->fs_devices; 2057 struct list_head *cur; 2058 struct map_lookup *map = NULL; 2059 struct extent_map_tree *em_tree; 2060 struct extent_map *em; 2061 struct list_head private_devs; 2062 int min_stripe_size = 1 * 1024 * 1024; 2063 u64 calc_size = 1024 * 1024 * 1024; 2064 u64 max_chunk_size = calc_size; 2065 u64 min_free; 2066 u64 avail; 2067 u64 max_avail = 0; 2068 u64 dev_offset; 2069 int num_stripes = 1; 2070 int min_stripes = 1; 2071 int sub_stripes = 0; 2072 int looped = 0; 2073 int ret; 2074 int index; 2075 int stripe_len = 64 * 1024; 2076 2077 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2078 (type & BTRFS_BLOCK_GROUP_DUP)) { 2079 WARN_ON(1); 2080 type &= ~BTRFS_BLOCK_GROUP_DUP; 2081 } 2082 if (list_empty(&fs_devices->alloc_list)) 2083 return -ENOSPC; 2084 2085 if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2086 num_stripes = fs_devices->rw_devices; 2087 min_stripes = 2; 2088 } 2089 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2090 num_stripes = 2; 2091 min_stripes = 2; 2092 } 2093 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2094 num_stripes = min_t(u64, 2, fs_devices->rw_devices); 2095 if (num_stripes < 2) 2096 return -ENOSPC; 2097 min_stripes = 2; 2098 } 2099 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2100 num_stripes = fs_devices->rw_devices; 2101 if (num_stripes < 4) 2102 return -ENOSPC; 2103 num_stripes &= ~(u32)1; 2104 sub_stripes = 2; 2105 min_stripes = 4; 2106 } 2107 2108 if (type & BTRFS_BLOCK_GROUP_DATA) { 2109 max_chunk_size = 10 * calc_size; 2110 min_stripe_size = 64 * 1024 * 1024; 2111 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 2112 max_chunk_size = 4 * calc_size; 2113 min_stripe_size = 32 * 
1024 * 1024; 2114 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 2115 calc_size = 8 * 1024 * 1024; 2116 max_chunk_size = calc_size * 2; 2117 min_stripe_size = 1 * 1024 * 1024; 2118 } 2119 2120 /* we don't want a chunk larger than 10% of writeable space */ 2121 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2122 max_chunk_size); 2123 2124 again: 2125 if (!map || map->num_stripes != num_stripes) { 2126 kfree(map); 2127 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 2128 if (!map) 2129 return -ENOMEM; 2130 map->num_stripes = num_stripes; 2131 } 2132 2133 if (calc_size * num_stripes > max_chunk_size) { 2134 calc_size = max_chunk_size; 2135 do_div(calc_size, num_stripes); 2136 do_div(calc_size, stripe_len); 2137 calc_size *= stripe_len; 2138 } 2139 /* we don't want tiny stripes */ 2140 calc_size = max_t(u64, min_stripe_size, calc_size); 2141 2142 do_div(calc_size, stripe_len); 2143 calc_size *= stripe_len; 2144 2145 cur = fs_devices->alloc_list.next; 2146 index = 0; 2147 2148 if (type & BTRFS_BLOCK_GROUP_DUP) 2149 min_free = calc_size * 2; 2150 else 2151 min_free = calc_size; 2152 2153 /* 2154 * we add 1MB because we never use the first 1MB of the device, unless 2155 * we've looped, then we are likely allocating the maximum amount of 2156 * space left already 2157 */ 2158 if (!looped) 2159 min_free += 1024 * 1024; 2160 2161 INIT_LIST_HEAD(&private_devs); 2162 while (index < num_stripes) { 2163 device = list_entry(cur, struct btrfs_device, dev_alloc_list); 2164 BUG_ON(!device->writeable); 2165 if (device->total_bytes > device->bytes_used) 2166 avail = device->total_bytes - device->bytes_used; 2167 else 2168 avail = 0; 2169 cur = cur->next; 2170 2171 if (device->in_fs_metadata && avail >= min_free) { 2172 ret = find_free_dev_extent(trans, device, 2173 min_free, &dev_offset); 2174 if (ret == 0) { 2175 list_move_tail(&device->dev_alloc_list, 2176 &private_devs); 2177 map->stripes[index].dev = device; 2178 map->stripes[index].physical = dev_offset; 2179 index++; 2180 if (type & BTRFS_BLOCK_GROUP_DUP) { 2181 map->stripes[index].dev = device; 2182 map->stripes[index].physical = 2183 dev_offset + calc_size; 2184 index++; 2185 } 2186 } 2187 } else if (device->in_fs_metadata && avail > max_avail) 2188 max_avail = avail; 2189 if (cur == &fs_devices->alloc_list) 2190 break; 2191 } 2192 list_splice(&private_devs, &fs_devices->alloc_list); 2193 if (index < num_stripes) { 2194 if (index >= min_stripes) { 2195 num_stripes = index; 2196 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2197 num_stripes /= sub_stripes; 2198 num_stripes *= sub_stripes; 2199 } 2200 looped = 1; 2201 goto again; 2202 } 2203 if (!looped && max_avail > 0) { 2204 looped = 1; 2205 calc_size = max_avail; 2206 goto again; 2207 } 2208 kfree(map); 2209 return -ENOSPC; 2210 } 2211 map->sector_size = extent_root->sectorsize; 2212 map->stripe_len = stripe_len; 2213 map->io_align = stripe_len; 2214 map->io_width = stripe_len; 2215 map->type = type; 2216 map->num_stripes = num_stripes; 2217 map->sub_stripes = sub_stripes; 2218 2219 *map_ret = map; 2220 *stripe_size = calc_size; 2221 *num_bytes = chunk_bytes_by_type(type, calc_size, 2222 num_stripes, sub_stripes); 2223 2224 em = alloc_extent_map(GFP_NOFS); 2225 if (!em) { 2226 kfree(map); 2227 return -ENOMEM; 2228 } 2229 em->bdev = (struct block_device *)map; 2230 em->start = start; 2231 em->len = *num_bytes; 2232 em->block_start = 0; 2233 em->block_len = em->len; 2234 2235 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 2236 spin_lock(&em_tree->lock); 2237 ret = 
add_extent_mapping(em_tree, em); 2238 spin_unlock(&em_tree->lock); 2239 BUG_ON(ret); 2240 free_extent_map(em); 2241 2242 ret = btrfs_make_block_group(trans, extent_root, 0, type, 2243 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2244 start, *num_bytes); 2245 BUG_ON(ret); 2246 2247 index = 0; 2248 while (index < map->num_stripes) { 2249 device = map->stripes[index].dev; 2250 dev_offset = map->stripes[index].physical; 2251 2252 ret = btrfs_alloc_dev_extent(trans, device, 2253 info->chunk_root->root_key.objectid, 2254 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2255 start, dev_offset, calc_size); 2256 BUG_ON(ret); 2257 index++; 2258 } 2259 2260 return 0; 2261 } 2262 2263 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2264 struct btrfs_root *extent_root, 2265 struct map_lookup *map, u64 chunk_offset, 2266 u64 chunk_size, u64 stripe_size) 2267 { 2268 u64 dev_offset; 2269 struct btrfs_key key; 2270 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 2271 struct btrfs_device *device; 2272 struct btrfs_chunk *chunk; 2273 struct btrfs_stripe *stripe; 2274 size_t item_size = btrfs_chunk_item_size(map->num_stripes); 2275 int index = 0; 2276 int ret; 2277 2278 chunk = kzalloc(item_size, GFP_NOFS); 2279 if (!chunk) 2280 return -ENOMEM; 2281 2282 index = 0; 2283 while (index < map->num_stripes) { 2284 device = map->stripes[index].dev; 2285 device->bytes_used += stripe_size; 2286 ret = btrfs_update_device(trans, device); 2287 BUG_ON(ret); 2288 index++; 2289 } 2290 2291 index = 0; 2292 stripe = &chunk->stripe; 2293 while (index < map->num_stripes) { 2294 device = map->stripes[index].dev; 2295 dev_offset = map->stripes[index].physical; 2296 2297 btrfs_set_stack_stripe_devid(stripe, device->devid); 2298 btrfs_set_stack_stripe_offset(stripe, dev_offset); 2299 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 2300 stripe++; 2301 index++; 2302 } 2303 2304 btrfs_set_stack_chunk_length(chunk, chunk_size); 2305 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 2306 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 2307 btrfs_set_stack_chunk_type(chunk, map->type); 2308 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 2309 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 2310 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 2311 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); 2312 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 2313 2314 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2315 key.type = BTRFS_CHUNK_ITEM_KEY; 2316 key.offset = chunk_offset; 2317 2318 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 2319 BUG_ON(ret); 2320 2321 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2322 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, 2323 item_size); 2324 BUG_ON(ret); 2325 } 2326 kfree(chunk); 2327 return 0; 2328 } 2329 2330 /* 2331 * Chunk allocation falls into two parts. The first part does works 2332 * that make the new allocated chunk useable, but not do any operation 2333 * that modifies the chunk tree. The second part does the works that 2334 * require modifying the chunk tree. This division is important for the 2335 * bootstrap process of adding storage to a seed btrfs. 
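/*
 * Illustrative sketch (userspace, with invented struct names): the chunk item
 * built by __finish_chunk_alloc() above is a fixed header followed directly
 * by one stripe record per device, so the item size grows linearly with
 * map->num_stripes, in the same spirit as btrfs_chunk_item_size(). The real
 * on-disk layout lives in ctree.h; this only shows the shape.
 */
#include <stdint.h>
#include <stddef.h>

struct stripe_sketch {
	uint64_t devid;          /* device the stripe lives on */
	uint64_t offset;         /* physical byte offset on that device */
	uint8_t  dev_uuid[16];   /* identifies the device across mounts */
};

struct chunk_sketch {
	uint64_t length;         /* logical bytes covered by the chunk */
	uint64_t stripe_len;
	uint16_t num_stripes;
	struct stripe_sketch stripes[];   /* one record per stripe */
};

static size_t chunk_item_size_sketch(uint16_t num_stripes)
{
	return sizeof(struct chunk_sketch) +
	       (size_t)num_stripes * sizeof(struct stripe_sketch);
}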
2336 */ 2337 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2338 struct btrfs_root *extent_root, u64 type) 2339 { 2340 u64 chunk_offset; 2341 u64 chunk_size; 2342 u64 stripe_size; 2343 struct map_lookup *map; 2344 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 2345 int ret; 2346 2347 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2348 &chunk_offset); 2349 if (ret) 2350 return ret; 2351 2352 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 2353 &stripe_size, chunk_offset, type); 2354 if (ret) 2355 return ret; 2356 2357 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 2358 chunk_size, stripe_size); 2359 BUG_ON(ret); 2360 return 0; 2361 } 2362 2363 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 2364 struct btrfs_root *root, 2365 struct btrfs_device *device) 2366 { 2367 u64 chunk_offset; 2368 u64 sys_chunk_offset; 2369 u64 chunk_size; 2370 u64 sys_chunk_size; 2371 u64 stripe_size; 2372 u64 sys_stripe_size; 2373 u64 alloc_profile; 2374 struct map_lookup *map; 2375 struct map_lookup *sys_map; 2376 struct btrfs_fs_info *fs_info = root->fs_info; 2377 struct btrfs_root *extent_root = fs_info->extent_root; 2378 int ret; 2379 2380 ret = find_next_chunk(fs_info->chunk_root, 2381 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); 2382 BUG_ON(ret); 2383 2384 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 2385 (fs_info->metadata_alloc_profile & 2386 fs_info->avail_metadata_alloc_bits); 2387 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 2388 2389 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 2390 &stripe_size, chunk_offset, alloc_profile); 2391 BUG_ON(ret); 2392 2393 sys_chunk_offset = chunk_offset + chunk_size; 2394 2395 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 2396 (fs_info->system_alloc_profile & 2397 fs_info->avail_system_alloc_bits); 2398 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 2399 2400 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 2401 &sys_chunk_size, &sys_stripe_size, 2402 sys_chunk_offset, alloc_profile); 2403 BUG_ON(ret); 2404 2405 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 2406 BUG_ON(ret); 2407 2408 /* 2409 * Modifying chunk tree needs allocating new blocks from both 2410 * system block group and metadata block group. So we only can 2411 * do operations require modifying the chunk tree after both 2412 * block groups were created. 
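/*
 * Illustrative sketch, not kernel code: the SYSTEM chunk allocated above is
 * also copied into the superblock's sys_chunk_array (btrfs_add_system_chunk),
 * which is a flat byte buffer of (disk key, chunk item) pairs appended back
 * to back. The names and the 2048-byte capacity below are stand-ins for the
 * real definitions in ctree.h.
 */
#include <stdint.h>
#include <string.h>

#define SYS_ARRAY_CAP_SKETCH 2048

struct sys_array_sketch {
	uint8_t  data[SYS_ARRAY_CAP_SKETCH];
	uint32_t used;   /* plays the role of btrfs_super_sys_array_size() */
};

static int sys_array_append_sketch(struct sys_array_sketch *a,
				   const void *key, size_t key_size,
				   const void *chunk, size_t chunk_size)
{
	if (a->used + key_size + chunk_size > SYS_ARRAY_CAP_SKETCH)
		return -1;   /* the kernel returns -EFBIG here */
	memcpy(a->data + a->used, key, key_size);
	memcpy(a->data + a->used + key_size, chunk, chunk_size);
	a->used += (uint32_t)(key_size + chunk_size);
	return 0;
}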
2413 */ 2414 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 2415 chunk_size, stripe_size); 2416 BUG_ON(ret); 2417 2418 ret = __finish_chunk_alloc(trans, extent_root, sys_map, 2419 sys_chunk_offset, sys_chunk_size, 2420 sys_stripe_size); 2421 BUG_ON(ret); 2422 return 0; 2423 } 2424 2425 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 2426 { 2427 struct extent_map *em; 2428 struct map_lookup *map; 2429 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 2430 int readonly = 0; 2431 int i; 2432 2433 spin_lock(&map_tree->map_tree.lock); 2434 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2435 spin_unlock(&map_tree->map_tree.lock); 2436 if (!em) 2437 return 1; 2438 2439 map = (struct map_lookup *)em->bdev; 2440 for (i = 0; i < map->num_stripes; i++) { 2441 if (!map->stripes[i].dev->writeable) { 2442 readonly = 1; 2443 break; 2444 } 2445 } 2446 free_extent_map(em); 2447 return readonly; 2448 } 2449 2450 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 2451 { 2452 extent_map_tree_init(&tree->map_tree, GFP_NOFS); 2453 } 2454 2455 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 2456 { 2457 struct extent_map *em; 2458 2459 while (1) { 2460 spin_lock(&tree->map_tree.lock); 2461 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 2462 if (em) 2463 remove_extent_mapping(&tree->map_tree, em); 2464 spin_unlock(&tree->map_tree.lock); 2465 if (!em) 2466 break; 2467 kfree(em->bdev); 2468 /* once for us */ 2469 free_extent_map(em); 2470 /* once for the tree */ 2471 free_extent_map(em); 2472 } 2473 } 2474 2475 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 2476 { 2477 struct extent_map *em; 2478 struct map_lookup *map; 2479 struct extent_map_tree *em_tree = &map_tree->map_tree; 2480 int ret; 2481 2482 spin_lock(&em_tree->lock); 2483 em = lookup_extent_mapping(em_tree, logical, len); 2484 spin_unlock(&em_tree->lock); 2485 BUG_ON(!em); 2486 2487 BUG_ON(em->start > logical || em->start + em->len < logical); 2488 map = (struct map_lookup *)em->bdev; 2489 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 2490 ret = map->num_stripes; 2491 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 2492 ret = map->sub_stripes; 2493 else 2494 ret = 1; 2495 free_extent_map(em); 2496 return ret; 2497 } 2498 2499 static int find_live_mirror(struct map_lookup *map, int first, int num, 2500 int optimal) 2501 { 2502 int i; 2503 if (map->stripes[optimal].dev->bdev) 2504 return optimal; 2505 for (i = first; i < first + num; i++) { 2506 if (map->stripes[i].dev->bdev) 2507 return i; 2508 } 2509 /* we couldn't find one that doesn't fail. 
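/*
 * Illustrative sketch (userspace): __btrfs_map_block() below reduces a
 * logical offset inside a chunk to (device index, physical offset) by
 * striding over the stripes. This models the plain striped (RAID0-style)
 * case only; names are invented and the per-device start offsets are
 * example values, not anything read from a real filesystem.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t stripe_len = 64 * 1024;       /* matches map->stripe_len */
	const uint64_t dev_start[2] = { 1ULL << 20, 2ULL << 20 };
	const int num_stripes = 2;

	uint64_t offset = 200 * 1024;                /* logical offset in the chunk */

	uint64_t stripe_nr = offset / stripe_len;    /* stripes to stride in total */
	uint64_t stripe_offset = offset - stripe_nr * stripe_len;
	int stripe_index = (int)(stripe_nr % num_stripes);   /* which device */

	stripe_nr /= num_stripes;                    /* full stripes on that device */

	uint64_t physical = dev_start[stripe_index] +
			    stripe_offset + stripe_nr * stripe_len;

	/* 200K in: global stripe 3 -> device 1, its second stripe, 8K into it */
	printf("dev %d physical %llu\n", stripe_index,
	       (unsigned long long)physical);
	return 0;
}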
Just return something 2510 * and the io error handling code will clean up eventually 2511 */ 2512 return optimal; 2513 } 2514 2515 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2516 u64 logical, u64 *length, 2517 struct btrfs_multi_bio **multi_ret, 2518 int mirror_num, struct page *unplug_page) 2519 { 2520 struct extent_map *em; 2521 struct map_lookup *map; 2522 struct extent_map_tree *em_tree = &map_tree->map_tree; 2523 u64 offset; 2524 u64 stripe_offset; 2525 u64 stripe_nr; 2526 int stripes_allocated = 8; 2527 int stripes_required = 1; 2528 int stripe_index; 2529 int i; 2530 int num_stripes; 2531 int max_errors = 0; 2532 struct btrfs_multi_bio *multi = NULL; 2533 2534 if (multi_ret && !(rw & (1 << BIO_RW))) 2535 stripes_allocated = 1; 2536 again: 2537 if (multi_ret) { 2538 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), 2539 GFP_NOFS); 2540 if (!multi) 2541 return -ENOMEM; 2542 2543 atomic_set(&multi->error, 0); 2544 } 2545 2546 spin_lock(&em_tree->lock); 2547 em = lookup_extent_mapping(em_tree, logical, *length); 2548 spin_unlock(&em_tree->lock); 2549 2550 if (!em && unplug_page) 2551 return 0; 2552 2553 if (!em) { 2554 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2555 (unsigned long long)logical, 2556 (unsigned long long)*length); 2557 BUG(); 2558 } 2559 2560 BUG_ON(em->start > logical || em->start + em->len < logical); 2561 map = (struct map_lookup *)em->bdev; 2562 offset = logical - em->start; 2563 2564 if (mirror_num > map->num_stripes) 2565 mirror_num = 0; 2566 2567 /* if our multi bio struct is too small, back off and try again */ 2568 if (rw & (1 << BIO_RW)) { 2569 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2570 BTRFS_BLOCK_GROUP_DUP)) { 2571 stripes_required = map->num_stripes; 2572 max_errors = 1; 2573 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2574 stripes_required = map->sub_stripes; 2575 max_errors = 1; 2576 } 2577 } 2578 if (multi_ret && (rw & (1 << BIO_RW)) && 2579 stripes_allocated < stripes_required) { 2580 stripes_allocated = map->num_stripes; 2581 free_extent_map(em); 2582 kfree(multi); 2583 goto again; 2584 } 2585 stripe_nr = offset; 2586 /* 2587 * stripe_nr counts the total number of stripes we have to stride 2588 * to get to this block 2589 */ 2590 do_div(stripe_nr, map->stripe_len); 2591 2592 stripe_offset = stripe_nr * map->stripe_len; 2593 BUG_ON(offset < stripe_offset); 2594 2595 /* stripe_offset is the offset of this block in its stripe*/ 2596 stripe_offset = offset - stripe_offset; 2597 2598 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 2599 BTRFS_BLOCK_GROUP_RAID10 | 2600 BTRFS_BLOCK_GROUP_DUP)) { 2601 /* we limit the length of each bio to what fits in a stripe */ 2602 *length = min_t(u64, em->len - offset, 2603 map->stripe_len - stripe_offset); 2604 } else { 2605 *length = em->len - offset; 2606 } 2607 2608 if (!multi_ret && !unplug_page) 2609 goto out; 2610 2611 num_stripes = 1; 2612 stripe_index = 0; 2613 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 2614 if (unplug_page || (rw & (1 << BIO_RW))) 2615 num_stripes = map->num_stripes; 2616 else if (mirror_num) 2617 stripe_index = mirror_num - 1; 2618 else { 2619 stripe_index = find_live_mirror(map, 0, 2620 map->num_stripes, 2621 current->pid % map->num_stripes); 2622 } 2623 2624 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 2625 if (rw & (1 << BIO_RW)) 2626 num_stripes = map->num_stripes; 2627 else if (mirror_num) 2628 stripe_index = mirror_num - 1; 2629 2630 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2631 int factor = 
map->num_stripes / map->sub_stripes; 2632 2633 stripe_index = do_div(stripe_nr, factor); 2634 stripe_index *= map->sub_stripes; 2635 2636 if (unplug_page || (rw & (1 << BIO_RW))) 2637 num_stripes = map->sub_stripes; 2638 else if (mirror_num) 2639 stripe_index += mirror_num - 1; 2640 else { 2641 stripe_index = find_live_mirror(map, stripe_index, 2642 map->sub_stripes, stripe_index + 2643 current->pid % map->sub_stripes); 2644 } 2645 } else { 2646 /* 2647 * after this do_div call, stripe_nr is the number of stripes 2648 * on this device we have to walk to find the data, and 2649 * stripe_index is the number of our device in the stripe array 2650 */ 2651 stripe_index = do_div(stripe_nr, map->num_stripes); 2652 } 2653 BUG_ON(stripe_index >= map->num_stripes); 2654 2655 for (i = 0; i < num_stripes; i++) { 2656 if (unplug_page) { 2657 struct btrfs_device *device; 2658 struct backing_dev_info *bdi; 2659 2660 device = map->stripes[stripe_index].dev; 2661 if (device->bdev) { 2662 bdi = blk_get_backing_dev_info(device->bdev); 2663 if (bdi->unplug_io_fn) 2664 bdi->unplug_io_fn(bdi, unplug_page); 2665 } 2666 } else { 2667 multi->stripes[i].physical = 2668 map->stripes[stripe_index].physical + 2669 stripe_offset + stripe_nr * map->stripe_len; 2670 multi->stripes[i].dev = map->stripes[stripe_index].dev; 2671 } 2672 stripe_index++; 2673 } 2674 if (multi_ret) { 2675 *multi_ret = multi; 2676 multi->num_stripes = num_stripes; 2677 multi->max_errors = max_errors; 2678 } 2679 out: 2680 free_extent_map(em); 2681 return 0; 2682 } 2683 2684 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2685 u64 logical, u64 *length, 2686 struct btrfs_multi_bio **multi_ret, int mirror_num) 2687 { 2688 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 2689 mirror_num, NULL); 2690 } 2691 2692 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 2693 u64 chunk_start, u64 physical, u64 devid, 2694 u64 **logical, int *naddrs, int *stripe_len) 2695 { 2696 struct extent_map_tree *em_tree = &map_tree->map_tree; 2697 struct extent_map *em; 2698 struct map_lookup *map; 2699 u64 *buf; 2700 u64 bytenr; 2701 u64 length; 2702 u64 stripe_nr; 2703 int i, j, nr = 0; 2704 2705 spin_lock(&em_tree->lock); 2706 em = lookup_extent_mapping(em_tree, chunk_start, 1); 2707 spin_unlock(&em_tree->lock); 2708 2709 BUG_ON(!em || em->start != chunk_start); 2710 map = (struct map_lookup *)em->bdev; 2711 2712 length = em->len; 2713 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 2714 do_div(length, map->num_stripes / map->sub_stripes); 2715 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 2716 do_div(length, map->num_stripes); 2717 2718 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 2719 BUG_ON(!buf); 2720 2721 for (i = 0; i < map->num_stripes; i++) { 2722 if (devid && map->stripes[i].dev->devid != devid) 2723 continue; 2724 if (map->stripes[i].physical > physical || 2725 map->stripes[i].physical + length <= physical) 2726 continue; 2727 2728 stripe_nr = physical - map->stripes[i].physical; 2729 do_div(stripe_nr, map->stripe_len); 2730 2731 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2732 stripe_nr = stripe_nr * map->num_stripes + i; 2733 do_div(stripe_nr, map->sub_stripes); 2734 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 2735 stripe_nr = stripe_nr * map->num_stripes + i; 2736 } 2737 bytenr = chunk_start + stripe_nr * map->stripe_len; 2738 WARN_ON(nr >= map->num_stripes); 2739 for (j = 0; j < nr; j++) { 2740 if (buf[j] == bytenr) 2741 break; 2742 } 2743 if (j == nr) { 2744 WARN_ON(nr >= map->num_stripes); 2745 
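		/*
		 * Worked example of the reverse mapping just computed
		 * (illustration only; the numbers are arbitrary): with a 64K
		 * stripe_len, two stripes, and stripes[1].physical at 2M, a
		 * physical address of 2M + 72K gives
		 *   stripe_nr = 72K / 64K = 1
		 *   stripe_nr = stripe_nr * num_stripes + i = 1 * 2 + 1 = 3
		 *   bytenr    = chunk_start + 3 * 64K = chunk_start + 192K
		 * i.e. the inverse of the striding done in __btrfs_map_block().
		 */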
buf[nr++] = bytenr; 2746 } 2747 } 2748 2749 for (i = 0; i > nr; i++) { 2750 struct btrfs_multi_bio *multi; 2751 struct btrfs_bio_stripe *stripe; 2752 int ret; 2753 2754 length = 1; 2755 ret = btrfs_map_block(map_tree, WRITE, buf[i], 2756 &length, &multi, 0); 2757 BUG_ON(ret); 2758 2759 stripe = multi->stripes; 2760 for (j = 0; j < multi->num_stripes; j++) { 2761 if (stripe->physical >= physical && 2762 physical < stripe->physical + length) 2763 break; 2764 } 2765 BUG_ON(j >= multi->num_stripes); 2766 kfree(multi); 2767 } 2768 2769 *logical = buf; 2770 *naddrs = nr; 2771 *stripe_len = map->stripe_len; 2772 2773 free_extent_map(em); 2774 return 0; 2775 } 2776 2777 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, 2778 u64 logical, struct page *page) 2779 { 2780 u64 length = PAGE_CACHE_SIZE; 2781 return __btrfs_map_block(map_tree, READ, logical, &length, 2782 NULL, 0, page); 2783 } 2784 2785 static void end_bio_multi_stripe(struct bio *bio, int err) 2786 { 2787 struct btrfs_multi_bio *multi = bio->bi_private; 2788 int is_orig_bio = 0; 2789 2790 if (err) 2791 atomic_inc(&multi->error); 2792 2793 if (bio == multi->orig_bio) 2794 is_orig_bio = 1; 2795 2796 if (atomic_dec_and_test(&multi->stripes_pending)) { 2797 if (!is_orig_bio) { 2798 bio_put(bio); 2799 bio = multi->orig_bio; 2800 } 2801 bio->bi_private = multi->private; 2802 bio->bi_end_io = multi->end_io; 2803 /* only send an error to the higher layers if it is 2804 * beyond the tolerance of the multi-bio 2805 */ 2806 if (atomic_read(&multi->error) > multi->max_errors) { 2807 err = -EIO; 2808 } else if (err) { 2809 /* 2810 * this bio is actually up to date, we didn't 2811 * go over the max number of errors 2812 */ 2813 set_bit(BIO_UPTODATE, &bio->bi_flags); 2814 err = 0; 2815 } 2816 kfree(multi); 2817 2818 bio_endio(bio, err); 2819 } else if (!is_orig_bio) { 2820 bio_put(bio); 2821 } 2822 } 2823 2824 struct async_sched { 2825 struct bio *bio; 2826 int rw; 2827 struct btrfs_fs_info *info; 2828 struct btrfs_work work; 2829 }; 2830 2831 /* 2832 * see run_scheduled_bios for a description of why bios are collected for 2833 * async submit. 2834 * 2835 * This will add one bio to the pending list for a device and make sure 2836 * the work struct is scheduled. 2837 */ 2838 static noinline int schedule_bio(struct btrfs_root *root, 2839 struct btrfs_device *device, 2840 int rw, struct bio *bio) 2841 { 2842 int should_queue = 1; 2843 struct btrfs_pending_bios *pending_bios; 2844 2845 /* don't bother with additional async steps for reads, right now */ 2846 if (!(rw & (1 << BIO_RW))) { 2847 bio_get(bio); 2848 submit_bio(rw, bio); 2849 bio_put(bio); 2850 return 0; 2851 } 2852 2853 /* 2854 * nr_async_bios allows us to reliably return congestion to the 2855 * higher layers. 
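/*
 * Illustrative sketch (userspace, invented types): schedule_bio() keeps the
 * per-device backlog as a singly linked FIFO with head and tail pointers, so
 * appending one request is O(1). This models just that append; the real list
 * links requests through bio->bi_next.
 */
#include <stddef.h>

struct req_sketch {
	struct req_sketch *next;
};

struct pending_sketch {
	struct req_sketch *head;
	struct req_sketch *tail;
};

static void pending_append_sketch(struct pending_sketch *p, struct req_sketch *r)
{
	r->next = NULL;
	if (p->tail)
		p->tail->next = r;   /* link after the current last request */
	p->tail = r;
	if (!p->head)
		p->head = r;         /* list was empty */
}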
Otherwise, the async bio makes it appear we have 2856 * made progress against dirty pages when we've really just put it 2857 * on a queue for later 2858 */ 2859 atomic_inc(&root->fs_info->nr_async_bios); 2860 WARN_ON(bio->bi_next); 2861 bio->bi_next = NULL; 2862 bio->bi_rw |= rw; 2863 2864 spin_lock(&device->io_lock); 2865 if (bio_sync(bio)) 2866 pending_bios = &device->pending_sync_bios; 2867 else 2868 pending_bios = &device->pending_bios; 2869 2870 if (pending_bios->tail) 2871 pending_bios->tail->bi_next = bio; 2872 2873 pending_bios->tail = bio; 2874 if (!pending_bios->head) 2875 pending_bios->head = bio; 2876 if (device->running_pending) 2877 should_queue = 0; 2878 2879 spin_unlock(&device->io_lock); 2880 2881 if (should_queue) 2882 btrfs_queue_worker(&root->fs_info->submit_workers, 2883 &device->work); 2884 return 0; 2885 } 2886 2887 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 2888 int mirror_num, int async_submit) 2889 { 2890 struct btrfs_mapping_tree *map_tree; 2891 struct btrfs_device *dev; 2892 struct bio *first_bio = bio; 2893 u64 logical = (u64)bio->bi_sector << 9; 2894 u64 length = 0; 2895 u64 map_length; 2896 struct btrfs_multi_bio *multi = NULL; 2897 int ret; 2898 int dev_nr = 0; 2899 int total_devs = 1; 2900 2901 length = bio->bi_size; 2902 map_tree = &root->fs_info->mapping_tree; 2903 map_length = length; 2904 2905 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, 2906 mirror_num); 2907 BUG_ON(ret); 2908 2909 total_devs = multi->num_stripes; 2910 if (map_length < length) { 2911 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 2912 "len %llu\n", (unsigned long long)logical, 2913 (unsigned long long)length, 2914 (unsigned long long)map_length); 2915 BUG(); 2916 } 2917 multi->end_io = first_bio->bi_end_io; 2918 multi->private = first_bio->bi_private; 2919 multi->orig_bio = first_bio; 2920 atomic_set(&multi->stripes_pending, multi->num_stripes); 2921 2922 while (dev_nr < total_devs) { 2923 if (total_devs > 1) { 2924 if (dev_nr < total_devs - 1) { 2925 bio = bio_clone(first_bio, GFP_NOFS); 2926 BUG_ON(!bio); 2927 } else { 2928 bio = first_bio; 2929 } 2930 bio->bi_private = multi; 2931 bio->bi_end_io = end_bio_multi_stripe; 2932 } 2933 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 2934 dev = multi->stripes[dev_nr].dev; 2935 BUG_ON(rw == WRITE && !dev->writeable); 2936 if (dev && dev->bdev) { 2937 bio->bi_bdev = dev->bdev; 2938 if (async_submit) 2939 schedule_bio(root, dev, rw, bio); 2940 else 2941 submit_bio(rw, bio); 2942 } else { 2943 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; 2944 bio->bi_sector = logical >> 9; 2945 bio_endio(bio, -EIO); 2946 } 2947 dev_nr++; 2948 } 2949 if (total_devs == 1) 2950 kfree(multi); 2951 return 0; 2952 } 2953 2954 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 2955 u8 *uuid, u8 *fsid) 2956 { 2957 struct btrfs_device *device; 2958 struct btrfs_fs_devices *cur_devices; 2959 2960 cur_devices = root->fs_info->fs_devices; 2961 while (cur_devices) { 2962 if (!fsid || 2963 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 2964 device = __find_device(&cur_devices->devices, 2965 devid, uuid); 2966 if (device) 2967 return device; 2968 } 2969 cur_devices = cur_devices->seed; 2970 } 2971 return NULL; 2972 } 2973 2974 static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 2975 u64 devid, u8 *dev_uuid) 2976 { 2977 struct btrfs_device *device; 2978 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 2979 2980 device = 
kzalloc(sizeof(*device), GFP_NOFS); 2981 if (!device) 2982 return NULL; 2983 list_add(&device->dev_list, 2984 &fs_devices->devices); 2985 device->barriers = 1; 2986 device->dev_root = root->fs_info->dev_root; 2987 device->devid = devid; 2988 device->work.func = pending_bios_fn; 2989 device->fs_devices = fs_devices; 2990 fs_devices->num_devices++; 2991 spin_lock_init(&device->io_lock); 2992 INIT_LIST_HEAD(&device->dev_alloc_list); 2993 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 2994 return device; 2995 } 2996 2997 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, 2998 struct extent_buffer *leaf, 2999 struct btrfs_chunk *chunk) 3000 { 3001 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 3002 struct map_lookup *map; 3003 struct extent_map *em; 3004 u64 logical; 3005 u64 length; 3006 u64 devid; 3007 u8 uuid[BTRFS_UUID_SIZE]; 3008 int num_stripes; 3009 int ret; 3010 int i; 3011 3012 logical = key->offset; 3013 length = btrfs_chunk_length(leaf, chunk); 3014 3015 spin_lock(&map_tree->map_tree.lock); 3016 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 3017 spin_unlock(&map_tree->map_tree.lock); 3018 3019 /* already mapped? */ 3020 if (em && em->start <= logical && em->start + em->len > logical) { 3021 free_extent_map(em); 3022 return 0; 3023 } else if (em) { 3024 free_extent_map(em); 3025 } 3026 3027 em = alloc_extent_map(GFP_NOFS); 3028 if (!em) 3029 return -ENOMEM; 3030 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3031 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3032 if (!map) { 3033 free_extent_map(em); 3034 return -ENOMEM; 3035 } 3036 3037 em->bdev = (struct block_device *)map; 3038 em->start = logical; 3039 em->len = length; 3040 em->block_start = 0; 3041 em->block_len = em->len; 3042 3043 map->num_stripes = num_stripes; 3044 map->io_width = btrfs_chunk_io_width(leaf, chunk); 3045 map->io_align = btrfs_chunk_io_align(leaf, chunk); 3046 map->sector_size = btrfs_chunk_sector_size(leaf, chunk); 3047 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 3048 map->type = btrfs_chunk_type(leaf, chunk); 3049 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 3050 for (i = 0; i < num_stripes; i++) { 3051 map->stripes[i].physical = 3052 btrfs_stripe_offset_nr(leaf, chunk, i); 3053 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 3054 read_extent_buffer(leaf, uuid, (unsigned long) 3055 btrfs_stripe_dev_uuid_nr(chunk, i), 3056 BTRFS_UUID_SIZE); 3057 map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 3058 NULL); 3059 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 3060 kfree(map); 3061 free_extent_map(em); 3062 return -EIO; 3063 } 3064 if (!map->stripes[i].dev) { 3065 map->stripes[i].dev = 3066 add_missing_dev(root, devid, uuid); 3067 if (!map->stripes[i].dev) { 3068 kfree(map); 3069 free_extent_map(em); 3070 return -EIO; 3071 } 3072 } 3073 map->stripes[i].dev->in_fs_metadata = 1; 3074 } 3075 3076 spin_lock(&map_tree->map_tree.lock); 3077 ret = add_extent_mapping(&map_tree->map_tree, em); 3078 spin_unlock(&map_tree->map_tree.lock); 3079 BUG_ON(ret); 3080 free_extent_map(em); 3081 3082 return 0; 3083 } 3084 3085 static int fill_device_from_item(struct extent_buffer *leaf, 3086 struct btrfs_dev_item *dev_item, 3087 struct btrfs_device *device) 3088 { 3089 unsigned long ptr; 3090 3091 device->devid = btrfs_device_id(leaf, dev_item); 3092 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 3093 device->total_bytes = device->disk_total_bytes; 3094 device->bytes_used = 
btrfs_device_bytes_used(leaf, dev_item); 3095 device->type = btrfs_device_type(leaf, dev_item); 3096 device->io_align = btrfs_device_io_align(leaf, dev_item); 3097 device->io_width = btrfs_device_io_width(leaf, dev_item); 3098 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 3099 3100 ptr = (unsigned long)btrfs_device_uuid(dev_item); 3101 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 3102 3103 return 0; 3104 } 3105 3106 static int open_seed_devices(struct btrfs_root *root, u8 *fsid) 3107 { 3108 struct btrfs_fs_devices *fs_devices; 3109 int ret; 3110 3111 mutex_lock(&uuid_mutex); 3112 3113 fs_devices = root->fs_info->fs_devices->seed; 3114 while (fs_devices) { 3115 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 3116 ret = 0; 3117 goto out; 3118 } 3119 fs_devices = fs_devices->seed; 3120 } 3121 3122 fs_devices = find_fsid(fsid); 3123 if (!fs_devices) { 3124 ret = -ENOENT; 3125 goto out; 3126 } 3127 3128 fs_devices = clone_fs_devices(fs_devices); 3129 if (IS_ERR(fs_devices)) { 3130 ret = PTR_ERR(fs_devices); 3131 goto out; 3132 } 3133 3134 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 3135 root->fs_info->bdev_holder); 3136 if (ret) 3137 goto out; 3138 3139 if (!fs_devices->seeding) { 3140 __btrfs_close_devices(fs_devices); 3141 free_fs_devices(fs_devices); 3142 ret = -EINVAL; 3143 goto out; 3144 } 3145 3146 fs_devices->seed = root->fs_info->fs_devices->seed; 3147 root->fs_info->fs_devices->seed = fs_devices; 3148 out: 3149 mutex_unlock(&uuid_mutex); 3150 return ret; 3151 } 3152 3153 static int read_one_dev(struct btrfs_root *root, 3154 struct extent_buffer *leaf, 3155 struct btrfs_dev_item *dev_item) 3156 { 3157 struct btrfs_device *device; 3158 u64 devid; 3159 int ret; 3160 u8 fs_uuid[BTRFS_UUID_SIZE]; 3161 u8 dev_uuid[BTRFS_UUID_SIZE]; 3162 3163 devid = btrfs_device_id(leaf, dev_item); 3164 read_extent_buffer(leaf, dev_uuid, 3165 (unsigned long)btrfs_device_uuid(dev_item), 3166 BTRFS_UUID_SIZE); 3167 read_extent_buffer(leaf, fs_uuid, 3168 (unsigned long)btrfs_device_fsid(dev_item), 3169 BTRFS_UUID_SIZE); 3170 3171 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 3172 ret = open_seed_devices(root, fs_uuid); 3173 if (ret && !btrfs_test_opt(root, DEGRADED)) 3174 return ret; 3175 } 3176 3177 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 3178 if (!device || !device->bdev) { 3179 if (!btrfs_test_opt(root, DEGRADED)) 3180 return -EIO; 3181 3182 if (!device) { 3183 printk(KERN_WARNING "warning devid %llu missing\n", 3184 (unsigned long long)devid); 3185 device = add_missing_dev(root, devid, dev_uuid); 3186 if (!device) 3187 return -ENOMEM; 3188 } 3189 } 3190 3191 if (device->fs_devices != root->fs_info->fs_devices) { 3192 BUG_ON(device->writeable); 3193 if (device->generation != 3194 btrfs_device_generation(leaf, dev_item)) 3195 return -EINVAL; 3196 } 3197 3198 fill_device_from_item(leaf, dev_item, device); 3199 device->dev_root = root->fs_info->dev_root; 3200 device->in_fs_metadata = 1; 3201 if (device->writeable) 3202 device->fs_devices->total_rw_bytes += device->total_bytes; 3203 ret = 0; 3204 return ret; 3205 } 3206 3207 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf) 3208 { 3209 struct btrfs_dev_item *dev_item; 3210 3211 dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block, 3212 dev_item); 3213 return read_one_dev(root, buf, dev_item); 3214 } 3215 3216 int btrfs_read_sys_array(struct btrfs_root *root) 3217 { 3218 struct btrfs_super_block *super_copy = 
&root->fs_info->super_copy; 3219 struct extent_buffer *sb; 3220 struct btrfs_disk_key *disk_key; 3221 struct btrfs_chunk *chunk; 3222 u8 *ptr; 3223 unsigned long sb_ptr; 3224 int ret = 0; 3225 u32 num_stripes; 3226 u32 array_size; 3227 u32 len = 0; 3228 u32 cur; 3229 struct btrfs_key key; 3230 3231 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, 3232 BTRFS_SUPER_INFO_SIZE); 3233 if (!sb) 3234 return -ENOMEM; 3235 btrfs_set_buffer_uptodate(sb); 3236 btrfs_set_buffer_lockdep_class(sb, 0); 3237 3238 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 3239 array_size = btrfs_super_sys_array_size(super_copy); 3240 3241 ptr = super_copy->sys_chunk_array; 3242 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); 3243 cur = 0; 3244 3245 while (cur < array_size) { 3246 disk_key = (struct btrfs_disk_key *)ptr; 3247 btrfs_disk_key_to_cpu(&key, disk_key); 3248 3249 len = sizeof(*disk_key); ptr += len; 3250 sb_ptr += len; 3251 cur += len; 3252 3253 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 3254 chunk = (struct btrfs_chunk *)sb_ptr; 3255 ret = read_one_chunk(root, &key, sb, chunk); 3256 if (ret) 3257 break; 3258 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 3259 len = btrfs_chunk_item_size(num_stripes); 3260 } else { 3261 ret = -EIO; 3262 break; 3263 } 3264 ptr += len; 3265 sb_ptr += len; 3266 cur += len; 3267 } 3268 free_extent_buffer(sb); 3269 return ret; 3270 } 3271 3272 int btrfs_read_chunk_tree(struct btrfs_root *root) 3273 { 3274 struct btrfs_path *path; 3275 struct extent_buffer *leaf; 3276 struct btrfs_key key; 3277 struct btrfs_key found_key; 3278 int ret; 3279 int slot; 3280 3281 root = root->fs_info->chunk_root; 3282 3283 path = btrfs_alloc_path(); 3284 if (!path) 3285 return -ENOMEM; 3286 3287 /* first we search for all of the device items, and then we 3288 * read in all of the chunk items. This way we can create chunk 3289 * mappings that reference all of the devices that are afound 3290 */ 3291 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 3292 key.offset = 0; 3293 key.type = 0; 3294 again: 3295 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3296 while (1) { 3297 leaf = path->nodes[0]; 3298 slot = path->slots[0]; 3299 if (slot >= btrfs_header_nritems(leaf)) { 3300 ret = btrfs_next_leaf(root, path); 3301 if (ret == 0) 3302 continue; 3303 if (ret < 0) 3304 goto error; 3305 break; 3306 } 3307 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3308 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3309 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) 3310 break; 3311 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 3312 struct btrfs_dev_item *dev_item; 3313 dev_item = btrfs_item_ptr(leaf, slot, 3314 struct btrfs_dev_item); 3315 ret = read_one_dev(root, leaf, dev_item); 3316 if (ret) 3317 goto error; 3318 } 3319 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 3320 struct btrfs_chunk *chunk; 3321 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3322 ret = read_one_chunk(root, &found_key, leaf, chunk); 3323 if (ret) 3324 goto error; 3325 } 3326 path->slots[0]++; 3327 } 3328 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3329 key.objectid = 0; 3330 btrfs_release_path(root, path); 3331 goto again; 3332 } 3333 ret = 0; 3334 error: 3335 btrfs_free_path(path); 3336 return ret; 3337 } 3338
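/*
 * Illustrative sketch (userspace): btrfs_read_sys_array() above walks the
 * superblock's packed (disk key, chunk item) pairs by advancing a cursor
 * first over the fixed-size key and then over the variable-sized chunk item,
 * whose length depends on its stripe count. The struct layouts and names are
 * invented for the example; only the cursor arithmetic is being modelled.
 */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

struct disk_key_sketch   { uint64_t objectid; uint8_t type; uint64_t offset; };
struct sys_stripe_sketch { uint64_t devid; uint64_t offset; uint8_t dev_uuid[16]; };
struct sys_chunk_sketch  { uint64_t length; uint16_t num_stripes;
			   struct sys_stripe_sketch stripes[]; };

/* returns the number of chunk items seen, or -1 if the array is malformed */
static int walk_sys_array_sketch(const uint8_t *array, uint32_t array_size)
{
	uint32_t cur = 0;
	int found = 0;

	while (cur < array_size) {
		uint16_t num_stripes;

		if (cur + sizeof(struct disk_key_sketch) > array_size)
			return -1;
		cur += sizeof(struct disk_key_sketch);       /* step over the key */

		if (cur + sizeof(struct sys_chunk_sketch) > array_size)
			return -1;
		memcpy(&num_stripes,
		       array + cur + offsetof(struct sys_chunk_sketch, num_stripes),
		       sizeof(num_stripes));                 /* item size depends on this */
		cur += sizeof(struct sys_chunk_sketch) +
		       (uint32_t)num_stripes * sizeof(struct sys_stripe_sketch);
		found++;
	}
	return found;
}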