1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 #include <linux/sched.h> 19 #include <linux/bio.h> 20 #include <linux/buffer_head.h> 21 #include <linux/blkdev.h> 22 #include <linux/random.h> 23 #include <linux/iocontext.h> 24 #include <asm/div64.h> 25 #include "compat.h" 26 #include "ctree.h" 27 #include "extent_map.h" 28 #include "disk-io.h" 29 #include "transaction.h" 30 #include "print-tree.h" 31 #include "volumes.h" 32 #include "async-thread.h" 33 34 struct map_lookup { 35 u64 type; 36 int io_align; 37 int io_width; 38 int stripe_len; 39 int sector_size; 40 int num_stripes; 41 int sub_stripes; 42 struct btrfs_bio_stripe stripes[]; 43 }; 44 45 static int init_first_rw_device(struct btrfs_trans_handle *trans, 46 struct btrfs_root *root, 47 struct btrfs_device *device); 48 static int btrfs_relocate_sys_chunks(struct btrfs_root *root); 49 50 #define map_lookup_size(n) (sizeof(struct map_lookup) + \ 51 (sizeof(struct btrfs_bio_stripe) * (n))) 52 53 static DEFINE_MUTEX(uuid_mutex); 54 static LIST_HEAD(fs_uuids); 55 56 void btrfs_lock_volumes(void) 57 { 58 mutex_lock(&uuid_mutex); 59 } 60 61 void btrfs_unlock_volumes(void) 62 { 63 mutex_unlock(&uuid_mutex); 64 } 65 66 static void lock_chunks(struct btrfs_root *root) 67 { 68 mutex_lock(&root->fs_info->chunk_mutex); 69 } 70 71 static void unlock_chunks(struct btrfs_root *root) 72 { 73 mutex_unlock(&root->fs_info->chunk_mutex); 74 } 75 76 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 77 { 78 struct btrfs_device *device; 79 WARN_ON(fs_devices->opened); 80 while (!list_empty(&fs_devices->devices)) { 81 device = list_entry(fs_devices->devices.next, 82 struct btrfs_device, dev_list); 83 list_del(&device->dev_list); 84 kfree(device->name); 85 kfree(device); 86 } 87 kfree(fs_devices); 88 } 89 90 int btrfs_cleanup_fs_uuids(void) 91 { 92 struct btrfs_fs_devices *fs_devices; 93 94 while (!list_empty(&fs_uuids)) { 95 fs_devices = list_entry(fs_uuids.next, 96 struct btrfs_fs_devices, list); 97 list_del(&fs_devices->list); 98 free_fs_devices(fs_devices); 99 } 100 return 0; 101 } 102 103 static noinline struct btrfs_device *__find_device(struct list_head *head, 104 u64 devid, u8 *uuid) 105 { 106 struct btrfs_device *dev; 107 108 list_for_each_entry(dev, head, dev_list) { 109 if (dev->devid == devid && 110 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 111 return dev; 112 } 113 } 114 return NULL; 115 } 116 117 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 118 { 119 struct btrfs_fs_devices *fs_devices; 120 121 list_for_each_entry(fs_devices, &fs_uuids, list) { 122 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 123 return fs_devices; 124 } 125 return NULL; 126 } 127 128 /* 129 * we try to collect pending bios for a device so we don't get a large 130 * number of procs sending bios down to the same device. 
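 * (Bios are queued on device->pending_bios by the schedule_bio code
 * and drained here in one batch by a single worker per device.)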
This greatly 131 * improves the schedulers ability to collect and merge the bios. 132 * 133 * But, it also turns into a long list of bios to process and that is sure 134 * to eventually make the worker thread block. The solution here is to 135 * make some progress and then put this work struct back at the end of 136 * the list if the block device is congested. This way, multiple devices 137 * can make progress from a single worker thread. 138 */ 139 static noinline int run_scheduled_bios(struct btrfs_device *device) 140 { 141 struct bio *pending; 142 struct backing_dev_info *bdi; 143 struct btrfs_fs_info *fs_info; 144 struct bio *tail; 145 struct bio *cur; 146 int again = 0; 147 unsigned long num_run = 0; 148 unsigned long limit; 149 unsigned long last_waited = 0; 150 151 bdi = blk_get_backing_dev_info(device->bdev); 152 fs_info = device->dev_root->fs_info; 153 limit = btrfs_async_submit_limit(fs_info); 154 limit = limit * 2 / 3; 155 156 loop: 157 spin_lock(&device->io_lock); 158 159 loop_lock: 160 /* take all the bios off the list at once and process them 161 * later on (without the lock held). But, remember the 162 * tail and other pointers so the bios can be properly reinserted 163 * into the list if we hit congestion 164 */ 165 pending = device->pending_bios; 166 tail = device->pending_bio_tail; 167 WARN_ON(pending && !tail); 168 device->pending_bios = NULL; 169 device->pending_bio_tail = NULL; 170 171 /* 172 * if pending was null this time around, no bios need processing 173 * at all and we can stop. Otherwise it'll loop back up again 174 * and do an additional check so no bios are missed. 175 * 176 * device->running_pending is used to synchronize with the 177 * schedule_bio code. 178 */ 179 if (pending) { 180 again = 1; 181 device->running_pending = 1; 182 } else { 183 again = 0; 184 device->running_pending = 0; 185 } 186 spin_unlock(&device->io_lock); 187 188 while (pending) { 189 cur = pending; 190 pending = pending->bi_next; 191 cur->bi_next = NULL; 192 atomic_dec(&fs_info->nr_async_bios); 193 194 if (atomic_read(&fs_info->nr_async_bios) < limit && 195 waitqueue_active(&fs_info->async_submit_wait)) 196 wake_up(&fs_info->async_submit_wait); 197 198 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 199 bio_get(cur); 200 submit_bio(cur->bi_rw, cur); 201 bio_put(cur); 202 num_run++; 203 204 /* 205 * we made progress, there is more work to do and the bdi 206 * is now congested. Back off and let other work structs 207 * run instead 208 */ 209 if (pending && bdi_write_congested(bdi) && num_run > 16 && 210 fs_info->fs_devices->open_devices > 1) { 211 struct bio *old_head; 212 struct io_context *ioc; 213 214 ioc = current->io_context; 215 216 /* 217 * the main goal here is that we don't want to 218 * block if we're going to be able to submit 219 * more requests without blocking. 220 * 221 * This code does two great things, it pokes into 222 * the elevator code from a filesystem _and_ 223 * it makes assumptions about how batching works. 224 */ 225 if (ioc && ioc->nr_batch_requests > 0 && 226 time_before(jiffies, ioc->last_waited + HZ/50UL) && 227 (last_waited == 0 || 228 ioc->last_waited == last_waited)) { 229 /* 230 * we want to go through our batch of 231 * requests and stop. 
So, we copy out 232 * the ioc->last_waited time and test 233 * against it before looping 234 */ 235 last_waited = ioc->last_waited; 236 continue; 237 } 238 spin_lock(&device->io_lock); 239 240 old_head = device->pending_bios; 241 device->pending_bios = pending; 242 if (device->pending_bio_tail) 243 tail->bi_next = old_head; 244 else 245 device->pending_bio_tail = tail; 246 247 device->running_pending = 1; 248 249 spin_unlock(&device->io_lock); 250 btrfs_requeue_work(&device->work); 251 goto done; 252 } 253 } 254 if (again) 255 goto loop; 256 257 spin_lock(&device->io_lock); 258 if (device->pending_bios) 259 goto loop_lock; 260 spin_unlock(&device->io_lock); 261 262 /* 263 * IO has already been through a long path to get here. Checksumming, 264 * async helper threads, perhaps compression. We've done a pretty 265 * good job of collecting a batch of IO and should just unplug 266 * the device right away. 267 * 268 * This will help anyone who is waiting on the IO, they might have 269 * already unplugged, but managed to do so before the bio they 270 * cared about found its way down here. 271 */ 272 blk_run_backing_dev(bdi, NULL); 273 done: 274 return 0; 275 } 276 277 static void pending_bios_fn(struct btrfs_work *work) 278 { 279 struct btrfs_device *device; 280 281 device = container_of(work, struct btrfs_device, work); 282 run_scheduled_bios(device); 283 } 284 285 static noinline int device_list_add(const char *path, 286 struct btrfs_super_block *disk_super, 287 u64 devid, struct btrfs_fs_devices **fs_devices_ret) 288 { 289 struct btrfs_device *device; 290 struct btrfs_fs_devices *fs_devices; 291 u64 found_transid = btrfs_super_generation(disk_super); 292 293 fs_devices = find_fsid(disk_super->fsid); 294 if (!fs_devices) { 295 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 296 if (!fs_devices) 297 return -ENOMEM; 298 INIT_LIST_HEAD(&fs_devices->devices); 299 INIT_LIST_HEAD(&fs_devices->alloc_list); 300 list_add(&fs_devices->list, &fs_uuids); 301 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 302 fs_devices->latest_devid = devid; 303 fs_devices->latest_trans = found_transid; 304 device = NULL; 305 } else { 306 device = __find_device(&fs_devices->devices, devid, 307 disk_super->dev_item.uuid); 308 } 309 if (!device) { 310 if (fs_devices->opened) 311 return -EBUSY; 312 313 device = kzalloc(sizeof(*device), GFP_NOFS); 314 if (!device) { 315 /* we can safely leave the fs_devices entry around */ 316 return -ENOMEM; 317 } 318 device->devid = devid; 319 device->work.func = pending_bios_fn; 320 memcpy(device->uuid, disk_super->dev_item.uuid, 321 BTRFS_UUID_SIZE); 322 device->barriers = 1; 323 spin_lock_init(&device->io_lock); 324 device->name = kstrdup(path, GFP_NOFS); 325 if (!device->name) { 326 kfree(device); 327 return -ENOMEM; 328 } 329 INIT_LIST_HEAD(&device->dev_alloc_list); 330 list_add(&device->dev_list, &fs_devices->devices); 331 device->fs_devices = fs_devices; 332 fs_devices->num_devices++; 333 } 334 335 if (found_transid > fs_devices->latest_trans) { 336 fs_devices->latest_devid = devid; 337 fs_devices->latest_trans = found_transid; 338 } 339 *fs_devices_ret = fs_devices; 340 return 0; 341 } 342 343 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 344 { 345 struct btrfs_fs_devices *fs_devices; 346 struct btrfs_device *device; 347 struct btrfs_device *orig_dev; 348 349 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 350 if (!fs_devices) 351 return ERR_PTR(-ENOMEM); 352 353 INIT_LIST_HEAD(&fs_devices->devices); 354 
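	/*
	 * the clone gets fresh list heads; each device is duplicated
	 * below, so the copy never shares list entries or name strings
	 * with the original fs_devices
	 */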
INIT_LIST_HEAD(&fs_devices->alloc_list); 355 INIT_LIST_HEAD(&fs_devices->list); 356 fs_devices->latest_devid = orig->latest_devid; 357 fs_devices->latest_trans = orig->latest_trans; 358 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 359 360 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 361 device = kzalloc(sizeof(*device), GFP_NOFS); 362 if (!device) 363 goto error; 364 365 device->name = kstrdup(orig_dev->name, GFP_NOFS); 366 if (!device->name) 367 goto error; 368 369 device->devid = orig_dev->devid; 370 device->work.func = pending_bios_fn; 371 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 372 device->barriers = 1; 373 spin_lock_init(&device->io_lock); 374 INIT_LIST_HEAD(&device->dev_list); 375 INIT_LIST_HEAD(&device->dev_alloc_list); 376 377 list_add(&device->dev_list, &fs_devices->devices); 378 device->fs_devices = fs_devices; 379 fs_devices->num_devices++; 380 } 381 return fs_devices; 382 error: 383 free_fs_devices(fs_devices); 384 return ERR_PTR(-ENOMEM); 385 } 386 387 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 388 { 389 struct btrfs_device *device, *next; 390 391 mutex_lock(&uuid_mutex); 392 again: 393 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 394 if (device->in_fs_metadata) 395 continue; 396 397 if (device->bdev) { 398 close_bdev_exclusive(device->bdev, device->mode); 399 device->bdev = NULL; 400 fs_devices->open_devices--; 401 } 402 if (device->writeable) { 403 list_del_init(&device->dev_alloc_list); 404 device->writeable = 0; 405 fs_devices->rw_devices--; 406 } 407 list_del_init(&device->dev_list); 408 fs_devices->num_devices--; 409 kfree(device->name); 410 kfree(device); 411 } 412 413 if (fs_devices->seed) { 414 fs_devices = fs_devices->seed; 415 goto again; 416 } 417 418 mutex_unlock(&uuid_mutex); 419 return 0; 420 } 421 422 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 423 { 424 struct btrfs_device *device; 425 426 if (--fs_devices->opened > 0) 427 return 0; 428 429 list_for_each_entry(device, &fs_devices->devices, dev_list) { 430 if (device->bdev) { 431 close_bdev_exclusive(device->bdev, device->mode); 432 fs_devices->open_devices--; 433 } 434 if (device->writeable) { 435 list_del_init(&device->dev_alloc_list); 436 fs_devices->rw_devices--; 437 } 438 439 device->bdev = NULL; 440 device->writeable = 0; 441 device->in_fs_metadata = 0; 442 } 443 WARN_ON(fs_devices->open_devices); 444 WARN_ON(fs_devices->rw_devices); 445 fs_devices->opened = 0; 446 fs_devices->seeding = 0; 447 448 return 0; 449 } 450 451 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 452 { 453 struct btrfs_fs_devices *seed_devices = NULL; 454 int ret; 455 456 mutex_lock(&uuid_mutex); 457 ret = __btrfs_close_devices(fs_devices); 458 if (!fs_devices->opened) { 459 seed_devices = fs_devices->seed; 460 fs_devices->seed = NULL; 461 } 462 mutex_unlock(&uuid_mutex); 463 464 while (seed_devices) { 465 fs_devices = seed_devices; 466 seed_devices = fs_devices->seed; 467 __btrfs_close_devices(fs_devices); 468 free_fs_devices(fs_devices); 469 } 470 return ret; 471 } 472 473 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 474 fmode_t flags, void *holder) 475 { 476 struct block_device *bdev; 477 struct list_head *head = &fs_devices->devices; 478 struct btrfs_device *device; 479 struct block_device *latest_bdev = NULL; 480 struct buffer_head *bh; 481 struct btrfs_super_block *disk_super; 482 u64 latest_devid = 0; 483 u64 latest_transid = 0; 484 u64 devid; 485 int seeding = 
1; 486 int ret = 0; 487 488 list_for_each_entry(device, head, dev_list) { 489 if (device->bdev) 490 continue; 491 if (!device->name) 492 continue; 493 494 bdev = open_bdev_exclusive(device->name, flags, holder); 495 if (IS_ERR(bdev)) { 496 printk(KERN_INFO "open %s failed\n", device->name); 497 goto error; 498 } 499 set_blocksize(bdev, 4096); 500 501 bh = btrfs_read_dev_super(bdev); 502 if (!bh) 503 goto error_close; 504 505 disk_super = (struct btrfs_super_block *)bh->b_data; 506 devid = le64_to_cpu(disk_super->dev_item.devid); 507 if (devid != device->devid) 508 goto error_brelse; 509 510 if (memcmp(device->uuid, disk_super->dev_item.uuid, 511 BTRFS_UUID_SIZE)) 512 goto error_brelse; 513 514 device->generation = btrfs_super_generation(disk_super); 515 if (!latest_transid || device->generation > latest_transid) { 516 latest_devid = devid; 517 latest_transid = device->generation; 518 latest_bdev = bdev; 519 } 520 521 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 522 device->writeable = 0; 523 } else { 524 device->writeable = !bdev_read_only(bdev); 525 seeding = 0; 526 } 527 528 device->bdev = bdev; 529 device->in_fs_metadata = 0; 530 device->mode = flags; 531 532 fs_devices->open_devices++; 533 if (device->writeable) { 534 fs_devices->rw_devices++; 535 list_add(&device->dev_alloc_list, 536 &fs_devices->alloc_list); 537 } 538 continue; 539 540 error_brelse: 541 brelse(bh); 542 error_close: 543 close_bdev_exclusive(bdev, FMODE_READ); 544 error: 545 continue; 546 } 547 if (fs_devices->open_devices == 0) { 548 ret = -EIO; 549 goto out; 550 } 551 fs_devices->seeding = seeding; 552 fs_devices->opened = 1; 553 fs_devices->latest_bdev = latest_bdev; 554 fs_devices->latest_devid = latest_devid; 555 fs_devices->latest_trans = latest_transid; 556 fs_devices->total_rw_bytes = 0; 557 out: 558 return ret; 559 } 560 561 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 562 fmode_t flags, void *holder) 563 { 564 int ret; 565 566 mutex_lock(&uuid_mutex); 567 if (fs_devices->opened) { 568 fs_devices->opened++; 569 ret = 0; 570 } else { 571 ret = __btrfs_open_devices(fs_devices, flags, holder); 572 } 573 mutex_unlock(&uuid_mutex); 574 return ret; 575 } 576 577 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 578 struct btrfs_fs_devices **fs_devices_ret) 579 { 580 struct btrfs_super_block *disk_super; 581 struct block_device *bdev; 582 struct buffer_head *bh; 583 int ret; 584 u64 devid; 585 u64 transid; 586 587 mutex_lock(&uuid_mutex); 588 589 bdev = open_bdev_exclusive(path, flags, holder); 590 591 if (IS_ERR(bdev)) { 592 ret = PTR_ERR(bdev); 593 goto error; 594 } 595 596 ret = set_blocksize(bdev, 4096); 597 if (ret) 598 goto error_close; 599 bh = btrfs_read_dev_super(bdev); 600 if (!bh) { 601 ret = -EIO; 602 goto error_close; 603 } 604 disk_super = (struct btrfs_super_block *)bh->b_data; 605 devid = le64_to_cpu(disk_super->dev_item.devid); 606 transid = btrfs_super_generation(disk_super); 607 if (disk_super->label[0]) 608 printk(KERN_INFO "device label %s ", disk_super->label); 609 else { 610 /* FIXME, make a readl uuid parser */ 611 printk(KERN_INFO "device fsid %llx-%llx ", 612 *(unsigned long long *)disk_super->fsid, 613 *(unsigned long long *)(disk_super->fsid + 8)); 614 } 615 printk(KERN_CONT "devid %llu transid %llu %s\n", 616 (unsigned long long)devid, (unsigned long long)transid, path); 617 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 618 619 brelse(bh); 620 error_close: 621 close_bdev_exclusive(bdev, flags); 622 error: 623 
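	/*
	 * every path out of this function falls through here, so the
	 * uuid_mutex taken above is always released
	 */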
mutex_unlock(&uuid_mutex); 624 return ret; 625 } 626 627 /* 628 * this uses a pretty simple search, the expectation is that it is 629 * called very infrequently and that a given device has a small number 630 * of extents 631 */ 632 static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, 633 struct btrfs_device *device, 634 u64 num_bytes, u64 *start) 635 { 636 struct btrfs_key key; 637 struct btrfs_root *root = device->dev_root; 638 struct btrfs_dev_extent *dev_extent = NULL; 639 struct btrfs_path *path; 640 u64 hole_size = 0; 641 u64 last_byte = 0; 642 u64 search_start = 0; 643 u64 search_end = device->total_bytes; 644 int ret; 645 int slot = 0; 646 int start_found; 647 struct extent_buffer *l; 648 649 path = btrfs_alloc_path(); 650 if (!path) 651 return -ENOMEM; 652 path->reada = 2; 653 start_found = 0; 654 655 /* FIXME use last free of some kind */ 656 657 /* we don't want to overwrite the superblock on the drive, 658 * so we make sure to start at an offset of at least 1MB 659 */ 660 search_start = max((u64)1024 * 1024, search_start); 661 662 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) 663 search_start = max(root->fs_info->alloc_start, search_start); 664 665 key.objectid = device->devid; 666 key.offset = search_start; 667 key.type = BTRFS_DEV_EXTENT_KEY; 668 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 669 if (ret < 0) 670 goto error; 671 ret = btrfs_previous_item(root, path, 0, key.type); 672 if (ret < 0) 673 goto error; 674 l = path->nodes[0]; 675 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 676 while (1) { 677 l = path->nodes[0]; 678 slot = path->slots[0]; 679 if (slot >= btrfs_header_nritems(l)) { 680 ret = btrfs_next_leaf(root, path); 681 if (ret == 0) 682 continue; 683 if (ret < 0) 684 goto error; 685 no_more_items: 686 if (!start_found) { 687 if (search_start >= search_end) { 688 ret = -ENOSPC; 689 goto error; 690 } 691 *start = search_start; 692 start_found = 1; 693 goto check_pending; 694 } 695 *start = last_byte > search_start ? 
696 last_byte : search_start; 697 if (search_end <= *start) { 698 ret = -ENOSPC; 699 goto error; 700 } 701 goto check_pending; 702 } 703 btrfs_item_key_to_cpu(l, &key, slot); 704 705 if (key.objectid < device->devid) 706 goto next; 707 708 if (key.objectid > device->devid) 709 goto no_more_items; 710 711 if (key.offset >= search_start && key.offset > last_byte && 712 start_found) { 713 if (last_byte < search_start) 714 last_byte = search_start; 715 hole_size = key.offset - last_byte; 716 if (key.offset > last_byte && 717 hole_size >= num_bytes) { 718 *start = last_byte; 719 goto check_pending; 720 } 721 } 722 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 723 goto next; 724 725 start_found = 1; 726 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 727 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); 728 next: 729 path->slots[0]++; 730 cond_resched(); 731 } 732 check_pending: 733 /* we have to make sure we didn't find an extent that has already 734 * been allocated by the map tree or the original allocation 735 */ 736 BUG_ON(*start < search_start); 737 738 if (*start + num_bytes > search_end) { 739 ret = -ENOSPC; 740 goto error; 741 } 742 /* check for pending inserts here */ 743 ret = 0; 744 745 error: 746 btrfs_free_path(path); 747 return ret; 748 } 749 750 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 751 struct btrfs_device *device, 752 u64 start) 753 { 754 int ret; 755 struct btrfs_path *path; 756 struct btrfs_root *root = device->dev_root; 757 struct btrfs_key key; 758 struct btrfs_key found_key; 759 struct extent_buffer *leaf = NULL; 760 struct btrfs_dev_extent *extent = NULL; 761 762 path = btrfs_alloc_path(); 763 if (!path) 764 return -ENOMEM; 765 766 key.objectid = device->devid; 767 key.offset = start; 768 key.type = BTRFS_DEV_EXTENT_KEY; 769 770 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 771 if (ret > 0) { 772 ret = btrfs_previous_item(root, path, key.objectid, 773 BTRFS_DEV_EXTENT_KEY); 774 BUG_ON(ret); 775 leaf = path->nodes[0]; 776 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 777 extent = btrfs_item_ptr(leaf, path->slots[0], 778 struct btrfs_dev_extent); 779 BUG_ON(found_key.offset > start || found_key.offset + 780 btrfs_dev_extent_length(leaf, extent) < start); 781 ret = 0; 782 } else if (ret == 0) { 783 leaf = path->nodes[0]; 784 extent = btrfs_item_ptr(leaf, path->slots[0], 785 struct btrfs_dev_extent); 786 } 787 BUG_ON(ret); 788 789 if (device->bytes_used > 0) 790 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 791 ret = btrfs_del_item(trans, root, path); 792 BUG_ON(ret); 793 794 btrfs_free_path(path); 795 return ret; 796 } 797 798 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 799 struct btrfs_device *device, 800 u64 chunk_tree, u64 chunk_objectid, 801 u64 chunk_offset, u64 start, u64 num_bytes) 802 { 803 int ret; 804 struct btrfs_path *path; 805 struct btrfs_root *root = device->dev_root; 806 struct btrfs_dev_extent *extent; 807 struct extent_buffer *leaf; 808 struct btrfs_key key; 809 810 WARN_ON(!device->in_fs_metadata); 811 path = btrfs_alloc_path(); 812 if (!path) 813 return -ENOMEM; 814 815 key.objectid = device->devid; 816 key.offset = start; 817 key.type = BTRFS_DEV_EXTENT_KEY; 818 ret = btrfs_insert_empty_item(trans, root, path, &key, 819 sizeof(*extent)); 820 BUG_ON(ret); 821 822 leaf = path->nodes[0]; 823 extent = btrfs_item_ptr(leaf, path->slots[0], 824 struct btrfs_dev_extent); 825 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); 826 
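	/*
	 * the dev extent item carries back references to its owning
	 * chunk (chunk tree, objectid, offset and the chunk tree uuid),
	 * so the chunk can be located again from the device side
	 */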
btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); 827 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 828 829 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, 830 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), 831 BTRFS_UUID_SIZE); 832 833 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 834 btrfs_mark_buffer_dirty(leaf); 835 btrfs_free_path(path); 836 return ret; 837 } 838 839 static noinline int find_next_chunk(struct btrfs_root *root, 840 u64 objectid, u64 *offset) 841 { 842 struct btrfs_path *path; 843 int ret; 844 struct btrfs_key key; 845 struct btrfs_chunk *chunk; 846 struct btrfs_key found_key; 847 848 path = btrfs_alloc_path(); 849 BUG_ON(!path); 850 851 key.objectid = objectid; 852 key.offset = (u64)-1; 853 key.type = BTRFS_CHUNK_ITEM_KEY; 854 855 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 856 if (ret < 0) 857 goto error; 858 859 BUG_ON(ret == 0); 860 861 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); 862 if (ret) { 863 *offset = 0; 864 } else { 865 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 866 path->slots[0]); 867 if (found_key.objectid != objectid) 868 *offset = 0; 869 else { 870 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], 871 struct btrfs_chunk); 872 *offset = found_key.offset + 873 btrfs_chunk_length(path->nodes[0], chunk); 874 } 875 } 876 ret = 0; 877 error: 878 btrfs_free_path(path); 879 return ret; 880 } 881 882 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) 883 { 884 int ret; 885 struct btrfs_key key; 886 struct btrfs_key found_key; 887 struct btrfs_path *path; 888 889 root = root->fs_info->chunk_root; 890 891 path = btrfs_alloc_path(); 892 if (!path) 893 return -ENOMEM; 894 895 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 896 key.type = BTRFS_DEV_ITEM_KEY; 897 key.offset = (u64)-1; 898 899 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 900 if (ret < 0) 901 goto error; 902 903 BUG_ON(ret == 0); 904 905 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, 906 BTRFS_DEV_ITEM_KEY); 907 if (ret) { 908 *objectid = 1; 909 } else { 910 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 911 path->slots[0]); 912 *objectid = found_key.offset + 1; 913 } 914 ret = 0; 915 error: 916 btrfs_free_path(path); 917 return ret; 918 } 919 920 /* 921 * the device information is stored in the chunk root 922 * the btrfs_device struct should be fully filled in 923 */ 924 int btrfs_add_device(struct btrfs_trans_handle *trans, 925 struct btrfs_root *root, 926 struct btrfs_device *device) 927 { 928 int ret; 929 struct btrfs_path *path; 930 struct btrfs_dev_item *dev_item; 931 struct extent_buffer *leaf; 932 struct btrfs_key key; 933 unsigned long ptr; 934 935 root = root->fs_info->chunk_root; 936 937 path = btrfs_alloc_path(); 938 if (!path) 939 return -ENOMEM; 940 941 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 942 key.type = BTRFS_DEV_ITEM_KEY; 943 key.offset = device->devid; 944 945 ret = btrfs_insert_empty_item(trans, root, path, &key, 946 sizeof(*dev_item)); 947 if (ret) 948 goto out; 949 950 leaf = path->nodes[0]; 951 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 952 953 btrfs_set_device_id(leaf, dev_item, device->devid); 954 btrfs_set_device_generation(leaf, dev_item, 0); 955 btrfs_set_device_type(leaf, dev_item, device->type); 956 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 957 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 958 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 
959 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 960 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 961 btrfs_set_device_group(leaf, dev_item, 0); 962 btrfs_set_device_seek_speed(leaf, dev_item, 0); 963 btrfs_set_device_bandwidth(leaf, dev_item, 0); 964 btrfs_set_device_start_offset(leaf, dev_item, 0); 965 966 ptr = (unsigned long)btrfs_device_uuid(dev_item); 967 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 968 ptr = (unsigned long)btrfs_device_fsid(dev_item); 969 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); 970 btrfs_mark_buffer_dirty(leaf); 971 972 ret = 0; 973 out: 974 btrfs_free_path(path); 975 return ret; 976 } 977 978 static int btrfs_rm_dev_item(struct btrfs_root *root, 979 struct btrfs_device *device) 980 { 981 int ret; 982 struct btrfs_path *path; 983 struct btrfs_key key; 984 struct btrfs_trans_handle *trans; 985 986 root = root->fs_info->chunk_root; 987 988 path = btrfs_alloc_path(); 989 if (!path) 990 return -ENOMEM; 991 992 trans = btrfs_start_transaction(root, 1); 993 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 994 key.type = BTRFS_DEV_ITEM_KEY; 995 key.offset = device->devid; 996 lock_chunks(root); 997 998 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 999 if (ret < 0) 1000 goto out; 1001 1002 if (ret > 0) { 1003 ret = -ENOENT; 1004 goto out; 1005 } 1006 1007 ret = btrfs_del_item(trans, root, path); 1008 if (ret) 1009 goto out; 1010 out: 1011 btrfs_free_path(path); 1012 unlock_chunks(root); 1013 btrfs_commit_transaction(trans, root); 1014 return ret; 1015 } 1016 1017 int btrfs_rm_device(struct btrfs_root *root, char *device_path) 1018 { 1019 struct btrfs_device *device; 1020 struct btrfs_device *next_device; 1021 struct block_device *bdev; 1022 struct buffer_head *bh = NULL; 1023 struct btrfs_super_block *disk_super; 1024 u64 all_avail; 1025 u64 devid; 1026 u64 num_devices; 1027 u8 *dev_uuid; 1028 int ret = 0; 1029 1030 mutex_lock(&uuid_mutex); 1031 mutex_lock(&root->fs_info->volume_mutex); 1032 1033 all_avail = root->fs_info->avail_data_alloc_bits | 1034 root->fs_info->avail_system_alloc_bits | 1035 root->fs_info->avail_metadata_alloc_bits; 1036 1037 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1038 root->fs_info->fs_devices->rw_devices <= 4) { 1039 printk(KERN_ERR "btrfs: unable to go below four devices " 1040 "on raid10\n"); 1041 ret = -EINVAL; 1042 goto out; 1043 } 1044 1045 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1046 root->fs_info->fs_devices->rw_devices <= 2) { 1047 printk(KERN_ERR "btrfs: unable to go below two " 1048 "devices on raid1\n"); 1049 ret = -EINVAL; 1050 goto out; 1051 } 1052 1053 if (strcmp(device_path, "missing") == 0) { 1054 struct list_head *devices; 1055 struct btrfs_device *tmp; 1056 1057 device = NULL; 1058 devices = &root->fs_info->fs_devices->devices; 1059 list_for_each_entry(tmp, devices, dev_list) { 1060 if (tmp->in_fs_metadata && !tmp->bdev) { 1061 device = tmp; 1062 break; 1063 } 1064 } 1065 bdev = NULL; 1066 bh = NULL; 1067 disk_super = NULL; 1068 if (!device) { 1069 printk(KERN_ERR "btrfs: no missing devices found to " 1070 "remove\n"); 1071 goto out; 1072 } 1073 } else { 1074 bdev = open_bdev_exclusive(device_path, FMODE_READ, 1075 root->fs_info->bdev_holder); 1076 if (IS_ERR(bdev)) { 1077 ret = PTR_ERR(bdev); 1078 goto out; 1079 } 1080 1081 set_blocksize(bdev, 4096); 1082 bh = btrfs_read_dev_super(bdev); 1083 if (!bh) { 1084 ret = -EIO; 1085 goto error_close; 1086 } 1087 disk_super = (struct btrfs_super_block *)bh->b_data; 1088 devid = 
le64_to_cpu(disk_super->dev_item.devid); 1089 dev_uuid = disk_super->dev_item.uuid; 1090 device = btrfs_find_device(root, devid, dev_uuid, 1091 disk_super->fsid); 1092 if (!device) { 1093 ret = -ENOENT; 1094 goto error_brelse; 1095 } 1096 } 1097 1098 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1099 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1100 "device\n"); 1101 ret = -EINVAL; 1102 goto error_brelse; 1103 } 1104 1105 if (device->writeable) { 1106 list_del_init(&device->dev_alloc_list); 1107 root->fs_info->fs_devices->rw_devices--; 1108 } 1109 1110 ret = btrfs_shrink_device(device, 0); 1111 if (ret) 1112 goto error_brelse; 1113 1114 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1115 if (ret) 1116 goto error_brelse; 1117 1118 device->in_fs_metadata = 0; 1119 list_del_init(&device->dev_list); 1120 device->fs_devices->num_devices--; 1121 1122 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1123 struct btrfs_device, dev_list); 1124 if (device->bdev == root->fs_info->sb->s_bdev) 1125 root->fs_info->sb->s_bdev = next_device->bdev; 1126 if (device->bdev == root->fs_info->fs_devices->latest_bdev) 1127 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1128 1129 if (device->bdev) { 1130 close_bdev_exclusive(device->bdev, device->mode); 1131 device->bdev = NULL; 1132 device->fs_devices->open_devices--; 1133 } 1134 1135 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1136 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1137 1138 if (device->fs_devices->open_devices == 0) { 1139 struct btrfs_fs_devices *fs_devices; 1140 fs_devices = root->fs_info->fs_devices; 1141 while (fs_devices) { 1142 if (fs_devices->seed == device->fs_devices) 1143 break; 1144 fs_devices = fs_devices->seed; 1145 } 1146 fs_devices->seed = device->fs_devices->seed; 1147 device->fs_devices->seed = NULL; 1148 __btrfs_close_devices(device->fs_devices); 1149 free_fs_devices(device->fs_devices); 1150 } 1151 1152 /* 1153 * at this point, the device is zero sized. We want to 1154 * remove it from the devices list and zero out the old super 1155 */ 1156 if (device->writeable) { 1157 /* make sure this device isn't detected as part of 1158 * the FS anymore 1159 */ 1160 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 1161 set_buffer_dirty(bh); 1162 sync_dirty_buffer(bh); 1163 } 1164 1165 kfree(device->name); 1166 kfree(device); 1167 ret = 0; 1168 1169 error_brelse: 1170 brelse(bh); 1171 error_close: 1172 if (bdev) 1173 close_bdev_exclusive(bdev, FMODE_READ); 1174 out: 1175 mutex_unlock(&root->fs_info->volume_mutex); 1176 mutex_unlock(&uuid_mutex); 1177 return ret; 1178 } 1179 1180 /* 1181 * does all the dirty work required for changing file system's UUID. 
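 * The existing devices are handed off to a new seed btrfs_fs_devices,
 * a fresh fsid is generated for the mounted filesystem, and the
 * SEEDING flag is cleared from the in-memory superblock copy.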
1182 */ 1183 static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, 1184 struct btrfs_root *root) 1185 { 1186 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1187 struct btrfs_fs_devices *old_devices; 1188 struct btrfs_fs_devices *seed_devices; 1189 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1190 struct btrfs_device *device; 1191 u64 super_flags; 1192 1193 BUG_ON(!mutex_is_locked(&uuid_mutex)); 1194 if (!fs_devices->seeding) 1195 return -EINVAL; 1196 1197 seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 1198 if (!seed_devices) 1199 return -ENOMEM; 1200 1201 old_devices = clone_fs_devices(fs_devices); 1202 if (IS_ERR(old_devices)) { 1203 kfree(seed_devices); 1204 return PTR_ERR(old_devices); 1205 } 1206 1207 list_add(&old_devices->list, &fs_uuids); 1208 1209 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 1210 seed_devices->opened = 1; 1211 INIT_LIST_HEAD(&seed_devices->devices); 1212 INIT_LIST_HEAD(&seed_devices->alloc_list); 1213 list_splice_init(&fs_devices->devices, &seed_devices->devices); 1214 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 1215 list_for_each_entry(device, &seed_devices->devices, dev_list) { 1216 device->fs_devices = seed_devices; 1217 } 1218 1219 fs_devices->seeding = 0; 1220 fs_devices->num_devices = 0; 1221 fs_devices->open_devices = 0; 1222 fs_devices->seed = seed_devices; 1223 1224 generate_random_uuid(fs_devices->fsid); 1225 memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 1226 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 1227 super_flags = btrfs_super_flags(disk_super) & 1228 ~BTRFS_SUPER_FLAG_SEEDING; 1229 btrfs_set_super_flags(disk_super, super_flags); 1230 1231 return 0; 1232 } 1233 1234 /* 1235 * store the expected generation for seed devices in device items.
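 * The walk below visits every device item and, for devices that now
 * belong to a seeding fs_devices, rewrites the item's generation from
 * device->generation (captured when the device was opened), which is
 * the expected value a later mount can check against.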
1236 */ 1237 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 1238 struct btrfs_root *root) 1239 { 1240 struct btrfs_path *path; 1241 struct extent_buffer *leaf; 1242 struct btrfs_dev_item *dev_item; 1243 struct btrfs_device *device; 1244 struct btrfs_key key; 1245 u8 fs_uuid[BTRFS_UUID_SIZE]; 1246 u8 dev_uuid[BTRFS_UUID_SIZE]; 1247 u64 devid; 1248 int ret; 1249 1250 path = btrfs_alloc_path(); 1251 if (!path) 1252 return -ENOMEM; 1253 1254 root = root->fs_info->chunk_root; 1255 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1256 key.offset = 0; 1257 key.type = BTRFS_DEV_ITEM_KEY; 1258 1259 while (1) { 1260 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1261 if (ret < 0) 1262 goto error; 1263 1264 leaf = path->nodes[0]; 1265 next_slot: 1266 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1267 ret = btrfs_next_leaf(root, path); 1268 if (ret > 0) 1269 break; 1270 if (ret < 0) 1271 goto error; 1272 leaf = path->nodes[0]; 1273 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1274 btrfs_release_path(root, path); 1275 continue; 1276 } 1277 1278 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1279 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 1280 key.type != BTRFS_DEV_ITEM_KEY) 1281 break; 1282 1283 dev_item = btrfs_item_ptr(leaf, path->slots[0], 1284 struct btrfs_dev_item); 1285 devid = btrfs_device_id(leaf, dev_item); 1286 read_extent_buffer(leaf, dev_uuid, 1287 (unsigned long)btrfs_device_uuid(dev_item), 1288 BTRFS_UUID_SIZE); 1289 read_extent_buffer(leaf, fs_uuid, 1290 (unsigned long)btrfs_device_fsid(dev_item), 1291 BTRFS_UUID_SIZE); 1292 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1293 BUG_ON(!device); 1294 1295 if (device->fs_devices->seeding) { 1296 btrfs_set_device_generation(leaf, dev_item, 1297 device->generation); 1298 btrfs_mark_buffer_dirty(leaf); 1299 } 1300 1301 path->slots[0]++; 1302 goto next_slot; 1303 } 1304 ret = 0; 1305 error: 1306 btrfs_free_path(path); 1307 return ret; 1308 } 1309 1310 int btrfs_init_new_device(struct btrfs_root *root, char *device_path) 1311 { 1312 struct btrfs_trans_handle *trans; 1313 struct btrfs_device *device; 1314 struct block_device *bdev; 1315 struct list_head *devices; 1316 struct super_block *sb = root->fs_info->sb; 1317 u64 total_bytes; 1318 int seeding_dev = 0; 1319 int ret = 0; 1320 1321 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1322 return -EINVAL; 1323 1324 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); 1325 if (!bdev) 1326 return -EIO; 1327 1328 if (root->fs_info->fs_devices->seeding) { 1329 seeding_dev = 1; 1330 down_write(&sb->s_umount); 1331 mutex_lock(&uuid_mutex); 1332 } 1333 1334 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1335 mutex_lock(&root->fs_info->volume_mutex); 1336 1337 devices = &root->fs_info->fs_devices->devices; 1338 list_for_each_entry(device, devices, dev_list) { 1339 if (device->bdev == bdev) { 1340 ret = -EEXIST; 1341 goto error; 1342 } 1343 } 1344 1345 device = kzalloc(sizeof(*device), GFP_NOFS); 1346 if (!device) { 1347 /* we can safely leave the fs_devices entry around */ 1348 ret = -ENOMEM; 1349 goto error; 1350 } 1351 1352 device->name = kstrdup(device_path, GFP_NOFS); 1353 if (!device->name) { 1354 kfree(device); 1355 ret = -ENOMEM; 1356 goto error; 1357 } 1358 1359 ret = find_next_devid(root, &device->devid); 1360 if (ret) { 1361 kfree(device); 1362 goto error; 1363 } 1364 1365 trans = btrfs_start_transaction(root, 1); 1366 lock_chunks(root); 1367 1368 device->barriers = 1; 1369 device->writeable = 1; 1370 
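	/*
	 * the new device gets a freshly generated uuid, takes its
	 * alignment and sector size from the root and its capacity
	 * from the underlying block device (set just below)
	 */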
device->work.func = pending_bios_fn; 1371 generate_random_uuid(device->uuid); 1372 spin_lock_init(&device->io_lock); 1373 device->generation = trans->transid; 1374 device->io_width = root->sectorsize; 1375 device->io_align = root->sectorsize; 1376 device->sector_size = root->sectorsize; 1377 device->total_bytes = i_size_read(bdev->bd_inode); 1378 device->dev_root = root->fs_info->dev_root; 1379 device->bdev = bdev; 1380 device->in_fs_metadata = 1; 1381 device->mode = 0; 1382 set_blocksize(device->bdev, 4096); 1383 1384 if (seeding_dev) { 1385 sb->s_flags &= ~MS_RDONLY; 1386 ret = btrfs_prepare_sprout(trans, root); 1387 BUG_ON(ret); 1388 } 1389 1390 device->fs_devices = root->fs_info->fs_devices; 1391 list_add(&device->dev_list, &root->fs_info->fs_devices->devices); 1392 list_add(&device->dev_alloc_list, 1393 &root->fs_info->fs_devices->alloc_list); 1394 root->fs_info->fs_devices->num_devices++; 1395 root->fs_info->fs_devices->open_devices++; 1396 root->fs_info->fs_devices->rw_devices++; 1397 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1398 1399 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1400 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1401 total_bytes + device->total_bytes); 1402 1403 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1404 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1405 total_bytes + 1); 1406 1407 if (seeding_dev) { 1408 ret = init_first_rw_device(trans, root, device); 1409 BUG_ON(ret); 1410 ret = btrfs_finish_sprout(trans, root); 1411 BUG_ON(ret); 1412 } else { 1413 ret = btrfs_add_device(trans, root, device); 1414 } 1415 1416 /* 1417 * we've got more storage, clear any full flags on the space 1418 * infos 1419 */ 1420 btrfs_clear_space_info_full(root->fs_info); 1421 1422 unlock_chunks(root); 1423 btrfs_commit_transaction(trans, root); 1424 1425 if (seeding_dev) { 1426 mutex_unlock(&uuid_mutex); 1427 up_write(&sb->s_umount); 1428 1429 ret = btrfs_relocate_sys_chunks(root); 1430 BUG_ON(ret); 1431 } 1432 out: 1433 mutex_unlock(&root->fs_info->volume_mutex); 1434 return ret; 1435 error: 1436 close_bdev_exclusive(bdev, 0); 1437 if (seeding_dev) { 1438 mutex_unlock(&uuid_mutex); 1439 up_write(&sb->s_umount); 1440 } 1441 goto out; 1442 } 1443 1444 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 1445 struct btrfs_device *device) 1446 { 1447 int ret; 1448 struct btrfs_path *path; 1449 struct btrfs_root *root; 1450 struct btrfs_dev_item *dev_item; 1451 struct extent_buffer *leaf; 1452 struct btrfs_key key; 1453 1454 root = device->dev_root->fs_info->chunk_root; 1455 1456 path = btrfs_alloc_path(); 1457 if (!path) 1458 return -ENOMEM; 1459 1460 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1461 key.type = BTRFS_DEV_ITEM_KEY; 1462 key.offset = device->devid; 1463 1464 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1465 if (ret < 0) 1466 goto out; 1467 1468 if (ret > 0) { 1469 ret = -ENOENT; 1470 goto out; 1471 } 1472 1473 leaf = path->nodes[0]; 1474 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1475 1476 btrfs_set_device_id(leaf, dev_item, device->devid); 1477 btrfs_set_device_type(leaf, dev_item, device->type); 1478 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1479 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1480 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1481 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1482 btrfs_set_device_bytes_used(leaf, dev_item, 
device->bytes_used); 1483 btrfs_mark_buffer_dirty(leaf); 1484 1485 out: 1486 btrfs_free_path(path); 1487 return ret; 1488 } 1489 1490 static int __btrfs_grow_device(struct btrfs_trans_handle *trans, 1491 struct btrfs_device *device, u64 new_size) 1492 { 1493 struct btrfs_super_block *super_copy = 1494 &device->dev_root->fs_info->super_copy; 1495 u64 old_total = btrfs_super_total_bytes(super_copy); 1496 u64 diff = new_size - device->total_bytes; 1497 1498 if (!device->writeable) 1499 return -EACCES; 1500 if (new_size <= device->total_bytes) 1501 return -EINVAL; 1502 1503 btrfs_set_super_total_bytes(super_copy, old_total + diff); 1504 device->fs_devices->total_rw_bytes += diff; 1505 1506 device->total_bytes = new_size; 1507 btrfs_clear_space_info_full(device->dev_root->fs_info); 1508 1509 return btrfs_update_device(trans, device); 1510 } 1511 1512 int btrfs_grow_device(struct btrfs_trans_handle *trans, 1513 struct btrfs_device *device, u64 new_size) 1514 { 1515 int ret; 1516 lock_chunks(device->dev_root); 1517 ret = __btrfs_grow_device(trans, device, new_size); 1518 unlock_chunks(device->dev_root); 1519 return ret; 1520 } 1521 1522 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 1523 struct btrfs_root *root, 1524 u64 chunk_tree, u64 chunk_objectid, 1525 u64 chunk_offset) 1526 { 1527 int ret; 1528 struct btrfs_path *path; 1529 struct btrfs_key key; 1530 1531 root = root->fs_info->chunk_root; 1532 path = btrfs_alloc_path(); 1533 if (!path) 1534 return -ENOMEM; 1535 1536 key.objectid = chunk_objectid; 1537 key.offset = chunk_offset; 1538 key.type = BTRFS_CHUNK_ITEM_KEY; 1539 1540 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1541 BUG_ON(ret); 1542 1543 ret = btrfs_del_item(trans, root, path); 1544 BUG_ON(ret); 1545 1546 btrfs_free_path(path); 1547 return 0; 1548 } 1549 1550 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1551 chunk_offset) 1552 { 1553 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1554 struct btrfs_disk_key *disk_key; 1555 struct btrfs_chunk *chunk; 1556 u8 *ptr; 1557 int ret = 0; 1558 u32 num_stripes; 1559 u32 array_size; 1560 u32 len = 0; 1561 u32 cur; 1562 struct btrfs_key key; 1563 1564 array_size = btrfs_super_sys_array_size(super_copy); 1565 1566 ptr = super_copy->sys_chunk_array; 1567 cur = 0; 1568 1569 while (cur < array_size) { 1570 disk_key = (struct btrfs_disk_key *)ptr; 1571 btrfs_disk_key_to_cpu(&key, disk_key); 1572 1573 len = sizeof(*disk_key); 1574 1575 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 1576 chunk = (struct btrfs_chunk *)(ptr + len); 1577 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 1578 len += btrfs_chunk_item_size(num_stripes); 1579 } else { 1580 ret = -EIO; 1581 break; 1582 } 1583 if (key.objectid == chunk_objectid && 1584 key.offset == chunk_offset) { 1585 memmove(ptr, ptr + len, array_size - (cur + len)); 1586 array_size -= len; 1587 btrfs_set_super_sys_array_size(super_copy, array_size); 1588 } else { 1589 ptr += len; 1590 cur += len; 1591 } 1592 } 1593 return ret; 1594 } 1595 1596 static int btrfs_relocate_chunk(struct btrfs_root *root, 1597 u64 chunk_tree, u64 chunk_objectid, 1598 u64 chunk_offset) 1599 { 1600 struct extent_map_tree *em_tree; 1601 struct btrfs_root *extent_root; 1602 struct btrfs_trans_handle *trans; 1603 struct extent_map *em; 1604 struct map_lookup *map; 1605 int ret; 1606 int i; 1607 1608 printk(KERN_INFO "btrfs relocating chunk %llu\n", 1609 (unsigned long long)chunk_offset); 1610 root = root->fs_info->chunk_root; 1611 extent_root = 
root->fs_info->extent_root; 1612 em_tree = &root->fs_info->mapping_tree.map_tree; 1613 1614 /* step one, relocate all the extents inside this chunk */ 1615 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1616 BUG_ON(ret); 1617 1618 trans = btrfs_start_transaction(root, 1); 1619 BUG_ON(!trans); 1620 1621 lock_chunks(root); 1622 1623 /* 1624 * step two, delete the device extents and the 1625 * chunk tree entries 1626 */ 1627 spin_lock(&em_tree->lock); 1628 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 1629 spin_unlock(&em_tree->lock); 1630 1631 BUG_ON(em->start > chunk_offset || 1632 em->start + em->len < chunk_offset); 1633 map = (struct map_lookup *)em->bdev; 1634 1635 for (i = 0; i < map->num_stripes; i++) { 1636 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, 1637 map->stripes[i].physical); 1638 BUG_ON(ret); 1639 1640 if (map->stripes[i].dev) { 1641 ret = btrfs_update_device(trans, map->stripes[i].dev); 1642 BUG_ON(ret); 1643 } 1644 } 1645 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, 1646 chunk_offset); 1647 1648 BUG_ON(ret); 1649 1650 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 1651 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 1652 BUG_ON(ret); 1653 } 1654 1655 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 1656 BUG_ON(ret); 1657 1658 spin_lock(&em_tree->lock); 1659 remove_extent_mapping(em_tree, em); 1660 spin_unlock(&em_tree->lock); 1661 1662 kfree(map); 1663 em->bdev = NULL; 1664 1665 /* once for the tree */ 1666 free_extent_map(em); 1667 /* once for us */ 1668 free_extent_map(em); 1669 1670 unlock_chunks(root); 1671 btrfs_end_transaction(trans, root); 1672 return 0; 1673 } 1674 1675 static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 1676 { 1677 struct btrfs_root *chunk_root = root->fs_info->chunk_root; 1678 struct btrfs_path *path; 1679 struct extent_buffer *leaf; 1680 struct btrfs_chunk *chunk; 1681 struct btrfs_key key; 1682 struct btrfs_key found_key; 1683 u64 chunk_tree = chunk_root->root_key.objectid; 1684 u64 chunk_type; 1685 int ret; 1686 1687 path = btrfs_alloc_path(); 1688 if (!path) 1689 return -ENOMEM; 1690 1691 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 1692 key.offset = (u64)-1; 1693 key.type = BTRFS_CHUNK_ITEM_KEY; 1694 1695 while (1) { 1696 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 1697 if (ret < 0) 1698 goto error; 1699 BUG_ON(ret == 0); 1700 1701 ret = btrfs_previous_item(chunk_root, path, key.objectid, 1702 key.type); 1703 if (ret < 0) 1704 goto error; 1705 if (ret > 0) 1706 break; 1707 1708 leaf = path->nodes[0]; 1709 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1710 1711 chunk = btrfs_item_ptr(leaf, path->slots[0], 1712 struct btrfs_chunk); 1713 chunk_type = btrfs_chunk_type(leaf, chunk); 1714 btrfs_release_path(chunk_root, path); 1715 1716 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 1717 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 1718 found_key.objectid, 1719 found_key.offset); 1720 BUG_ON(ret); 1721 } 1722 1723 if (found_key.offset == 0) 1724 break; 1725 key.offset = found_key.offset - 1; 1726 } 1727 ret = 0; 1728 error: 1729 btrfs_free_path(path); 1730 return ret; 1731 } 1732 1733 static u64 div_factor(u64 num, int factor) 1734 { 1735 if (factor == 10) 1736 return num; 1737 num *= factor; 1738 do_div(num, 10); 1739 return num; 1740 } 1741 1742 int btrfs_balance(struct btrfs_root *dev_root) 1743 { 1744 int ret; 1745 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 1746 struct btrfs_device *device; 1747 
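	/*
	 * note: div_factor(x, 1) above evaluates to x * 1 / 10, i.e.
	 * roughly 10%, which is how both the per-device shrink target
	 * below and the chunk size cap in __btrfs_alloc_chunk() are
	 * derived
	 */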
u64 old_size; 1748 u64 size_to_free; 1749 struct btrfs_path *path; 1750 struct btrfs_key key; 1751 struct btrfs_chunk *chunk; 1752 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; 1753 struct btrfs_trans_handle *trans; 1754 struct btrfs_key found_key; 1755 1756 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 1757 return -EROFS; 1758 1759 mutex_lock(&dev_root->fs_info->volume_mutex); 1760 dev_root = dev_root->fs_info->dev_root; 1761 1762 /* step one make some room on all the devices */ 1763 list_for_each_entry(device, devices, dev_list) { 1764 old_size = device->total_bytes; 1765 size_to_free = div_factor(old_size, 1); 1766 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 1767 if (!device->writeable || 1768 device->total_bytes - device->bytes_used > size_to_free) 1769 continue; 1770 1771 ret = btrfs_shrink_device(device, old_size - size_to_free); 1772 BUG_ON(ret); 1773 1774 trans = btrfs_start_transaction(dev_root, 1); 1775 BUG_ON(!trans); 1776 1777 ret = btrfs_grow_device(trans, device, old_size); 1778 BUG_ON(ret); 1779 1780 btrfs_end_transaction(trans, dev_root); 1781 } 1782 1783 /* step two, relocate all the chunks */ 1784 path = btrfs_alloc_path(); 1785 BUG_ON(!path); 1786 1787 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 1788 key.offset = (u64)-1; 1789 key.type = BTRFS_CHUNK_ITEM_KEY; 1790 1791 while (1) { 1792 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 1793 if (ret < 0) 1794 goto error; 1795 1796 /* 1797 * this shouldn't happen, it means the last relocate 1798 * failed 1799 */ 1800 if (ret == 0) 1801 break; 1802 1803 ret = btrfs_previous_item(chunk_root, path, 0, 1804 BTRFS_CHUNK_ITEM_KEY); 1805 if (ret) 1806 break; 1807 1808 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1809 path->slots[0]); 1810 if (found_key.objectid != key.objectid) 1811 break; 1812 1813 chunk = btrfs_item_ptr(path->nodes[0], 1814 path->slots[0], 1815 struct btrfs_chunk); 1816 key.offset = found_key.offset; 1817 /* chunk zero is special */ 1818 if (key.offset == 0) 1819 break; 1820 1821 btrfs_release_path(chunk_root, path); 1822 ret = btrfs_relocate_chunk(chunk_root, 1823 chunk_root->root_key.objectid, 1824 found_key.objectid, 1825 found_key.offset); 1826 BUG_ON(ret); 1827 } 1828 ret = 0; 1829 error: 1830 btrfs_free_path(path); 1831 mutex_unlock(&dev_root->fs_info->volume_mutex); 1832 return ret; 1833 } 1834 1835 /* 1836 * shrinking a device means finding all of the device extents past 1837 * the new size, and then following the back refs to the chunks. 
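 * Each device extent found past new_size names the chunk tree, objectid
 * and offset of the chunk that owns it, and that chunk is relocated
 * with btrfs_relocate_chunk().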
1838 * The chunk relocation code actually frees the device extent 1839 */ 1840 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 1841 { 1842 struct btrfs_trans_handle *trans; 1843 struct btrfs_root *root = device->dev_root; 1844 struct btrfs_dev_extent *dev_extent = NULL; 1845 struct btrfs_path *path; 1846 u64 length; 1847 u64 chunk_tree; 1848 u64 chunk_objectid; 1849 u64 chunk_offset; 1850 int ret; 1851 int slot; 1852 struct extent_buffer *l; 1853 struct btrfs_key key; 1854 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1855 u64 old_total = btrfs_super_total_bytes(super_copy); 1856 u64 diff = device->total_bytes - new_size; 1857 1858 if (new_size >= device->total_bytes) 1859 return -EINVAL; 1860 1861 path = btrfs_alloc_path(); 1862 if (!path) 1863 return -ENOMEM; 1864 1865 trans = btrfs_start_transaction(root, 1); 1866 if (!trans) { 1867 ret = -ENOMEM; 1868 goto done; 1869 } 1870 1871 path->reada = 2; 1872 1873 lock_chunks(root); 1874 1875 device->total_bytes = new_size; 1876 if (device->writeable) 1877 device->fs_devices->total_rw_bytes -= diff; 1878 ret = btrfs_update_device(trans, device); 1879 if (ret) { 1880 unlock_chunks(root); 1881 btrfs_end_transaction(trans, root); 1882 goto done; 1883 } 1884 WARN_ON(diff > old_total); 1885 btrfs_set_super_total_bytes(super_copy, old_total - diff); 1886 unlock_chunks(root); 1887 btrfs_end_transaction(trans, root); 1888 1889 key.objectid = device->devid; 1890 key.offset = (u64)-1; 1891 key.type = BTRFS_DEV_EXTENT_KEY; 1892 1893 while (1) { 1894 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1895 if (ret < 0) 1896 goto done; 1897 1898 ret = btrfs_previous_item(root, path, 0, key.type); 1899 if (ret < 0) 1900 goto done; 1901 if (ret) { 1902 ret = 0; 1903 goto done; 1904 } 1905 1906 l = path->nodes[0]; 1907 slot = path->slots[0]; 1908 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 1909 1910 if (key.objectid != device->devid) 1911 goto done; 1912 1913 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1914 length = btrfs_dev_extent_length(l, dev_extent); 1915 1916 if (key.offset + length <= new_size) 1917 goto done; 1918 1919 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 1920 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 1921 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 1922 btrfs_release_path(root, path); 1923 1924 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 1925 chunk_offset); 1926 if (ret) 1927 goto done; 1928 } 1929 1930 done: 1931 btrfs_free_path(path); 1932 return ret; 1933 } 1934 1935 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, 1936 struct btrfs_root *root, 1937 struct btrfs_key *key, 1938 struct btrfs_chunk *chunk, int item_size) 1939 { 1940 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1941 struct btrfs_disk_key disk_key; 1942 u32 array_size; 1943 u8 *ptr; 1944 1945 array_size = btrfs_super_sys_array_size(super_copy); 1946 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 1947 return -EFBIG; 1948 1949 ptr = super_copy->sys_chunk_array + array_size; 1950 btrfs_cpu_key_to_disk(&disk_key, key); 1951 memcpy(ptr, &disk_key, sizeof(disk_key)); 1952 ptr += sizeof(disk_key); 1953 memcpy(ptr, chunk, item_size); 1954 item_size += sizeof(disk_key); 1955 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 1956 return 0; 1957 } 1958 1959 static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, 1960 int num_stripes, int sub_stripes) 1961 { 1962 if (type & 
(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) 1963 return calc_size; 1964 else if (type & BTRFS_BLOCK_GROUP_RAID10) 1965 return calc_size * (num_stripes / sub_stripes); 1966 else 1967 return calc_size * num_stripes; 1968 } 1969 1970 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 1971 struct btrfs_root *extent_root, 1972 struct map_lookup **map_ret, 1973 u64 *num_bytes, u64 *stripe_size, 1974 u64 start, u64 type) 1975 { 1976 struct btrfs_fs_info *info = extent_root->fs_info; 1977 struct btrfs_device *device = NULL; 1978 struct btrfs_fs_devices *fs_devices = info->fs_devices; 1979 struct list_head *cur; 1980 struct map_lookup *map = NULL; 1981 struct extent_map_tree *em_tree; 1982 struct extent_map *em; 1983 struct list_head private_devs; 1984 int min_stripe_size = 1 * 1024 * 1024; 1985 u64 calc_size = 1024 * 1024 * 1024; 1986 u64 max_chunk_size = calc_size; 1987 u64 min_free; 1988 u64 avail; 1989 u64 max_avail = 0; 1990 u64 dev_offset; 1991 int num_stripes = 1; 1992 int min_stripes = 1; 1993 int sub_stripes = 0; 1994 int looped = 0; 1995 int ret; 1996 int index; 1997 int stripe_len = 64 * 1024; 1998 1999 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2000 (type & BTRFS_BLOCK_GROUP_DUP)) { 2001 WARN_ON(1); 2002 type &= ~BTRFS_BLOCK_GROUP_DUP; 2003 } 2004 if (list_empty(&fs_devices->alloc_list)) 2005 return -ENOSPC; 2006 2007 if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2008 num_stripes = fs_devices->rw_devices; 2009 min_stripes = 2; 2010 } 2011 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2012 num_stripes = 2; 2013 min_stripes = 2; 2014 } 2015 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2016 num_stripes = min_t(u64, 2, fs_devices->rw_devices); 2017 if (num_stripes < 2) 2018 return -ENOSPC; 2019 min_stripes = 2; 2020 } 2021 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2022 num_stripes = fs_devices->rw_devices; 2023 if (num_stripes < 4) 2024 return -ENOSPC; 2025 num_stripes &= ~(u32)1; 2026 sub_stripes = 2; 2027 min_stripes = 4; 2028 } 2029 2030 if (type & BTRFS_BLOCK_GROUP_DATA) { 2031 max_chunk_size = 10 * calc_size; 2032 min_stripe_size = 64 * 1024 * 1024; 2033 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 2034 max_chunk_size = 4 * calc_size; 2035 min_stripe_size = 32 * 1024 * 1024; 2036 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 2037 calc_size = 8 * 1024 * 1024; 2038 max_chunk_size = calc_size * 2; 2039 min_stripe_size = 1 * 1024 * 1024; 2040 } 2041 2042 /* we don't want a chunk larger than 10% of writeable space */ 2043 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2044 max_chunk_size); 2045 2046 again: 2047 if (!map || map->num_stripes != num_stripes) { 2048 kfree(map); 2049 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 2050 if (!map) 2051 return -ENOMEM; 2052 map->num_stripes = num_stripes; 2053 } 2054 2055 if (calc_size * num_stripes > max_chunk_size) { 2056 calc_size = max_chunk_size; 2057 do_div(calc_size, num_stripes); 2058 do_div(calc_size, stripe_len); 2059 calc_size *= stripe_len; 2060 } 2061 /* we don't want tiny stripes */ 2062 calc_size = max_t(u64, min_stripe_size, calc_size); 2063 2064 do_div(calc_size, stripe_len); 2065 calc_size *= stripe_len; 2066 2067 cur = fs_devices->alloc_list.next; 2068 index = 0; 2069 2070 if (type & BTRFS_BLOCK_GROUP_DUP) 2071 min_free = calc_size * 2; 2072 else 2073 min_free = calc_size; 2074 2075 /* 2076 * we add 1MB because we never use the first 1MB of the device, unless 2077 * we've looped, then we are likely allocating the maximum amount of 2078 * space left already 2079 */ 2080 if (!looped) 2081 min_free += 1024 * 
	INIT_LIST_HEAD(&private_devs);
	while (index < num_stripes) {
		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
		BUG_ON(!device->writeable);
		if (device->total_bytes > device->bytes_used)
			avail = device->total_bytes - device->bytes_used;
		else
			avail = 0;
		cur = cur->next;

		if (device->in_fs_metadata && avail >= min_free) {
			ret = find_free_dev_extent(trans, device,
						   min_free, &dev_offset);
			if (ret == 0) {
				list_move_tail(&device->dev_alloc_list,
					       &private_devs);
				map->stripes[index].dev = device;
				map->stripes[index].physical = dev_offset;
				index++;
				if (type & BTRFS_BLOCK_GROUP_DUP) {
					map->stripes[index].dev = device;
					map->stripes[index].physical =
						dev_offset + calc_size;
					index++;
				}
			}
		} else if (device->in_fs_metadata && avail > max_avail)
			max_avail = avail;
		if (cur == &fs_devices->alloc_list)
			break;
	}
	list_splice(&private_devs, &fs_devices->alloc_list);
	if (index < num_stripes) {
		if (index >= min_stripes) {
			num_stripes = index;
			if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
				num_stripes /= sub_stripes;
				num_stripes *= sub_stripes;
			}
			looped = 1;
			goto again;
		}
		if (!looped && max_avail > 0) {
			looped = 1;
			calc_size = max_avail;
			goto again;
		}
		kfree(map);
		return -ENOSPC;
	}
	map->sector_size = extent_root->sectorsize;
	map->stripe_len = stripe_len;
	map->io_align = stripe_len;
	map->io_width = stripe_len;
	map->type = type;
	map->num_stripes = num_stripes;
	map->sub_stripes = sub_stripes;

	*map_ret = map;
	*stripe_size = calc_size;
	*num_bytes = chunk_bytes_by_type(type, calc_size,
					 num_stripes, sub_stripes);

	em = alloc_extent_map(GFP_NOFS);
	if (!em) {
		kfree(map);
		return -ENOMEM;
	}
	em->bdev = (struct block_device *)map;
	em->start = start;
	em->len = *num_bytes;
	em->block_start = 0;
	em->block_len = em->len;

	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
	spin_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em);
	spin_unlock(&em_tree->lock);
	BUG_ON(ret);
	free_extent_map(em);

	ret = btrfs_make_block_group(trans, extent_root, 0, type,
				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
				     start, *num_bytes);
	BUG_ON(ret);

	index = 0;
	while (index < map->num_stripes) {
		device = map->stripes[index].dev;
		dev_offset = map->stripes[index].physical;

		ret = btrfs_alloc_dev_extent(trans, device,
					info->chunk_root->root_key.objectid,
					BTRFS_FIRST_CHUNK_TREE_OBJECTID,
					start, dev_offset, calc_size);
		BUG_ON(ret);
		index++;
	}

	return 0;
}

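/*
 * Second phase of chunk allocation: record the chunk in the chunk tree.
 * This bumps bytes_used on every device that received a stripe, builds
 * the on-disk chunk item (one btrfs_stripe per stripe) and inserts it,
 * and for SYSTEM chunks also appends a copy to the superblock's
 * sys_chunk_array via btrfs_add_system_chunk().
 */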
static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
				struct btrfs_root *extent_root,
				struct map_lookup *map, u64 chunk_offset,
				u64 chunk_size, u64 stripe_size)
{
	u64 dev_offset;
	struct btrfs_key key;
	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
	struct btrfs_device *device;
	struct btrfs_chunk *chunk;
	struct btrfs_stripe *stripe;
	size_t item_size = btrfs_chunk_item_size(map->num_stripes);
	int index = 0;
	int ret;

	chunk = kzalloc(item_size, GFP_NOFS);
	if (!chunk)
		return -ENOMEM;

	index = 0;
	while (index < map->num_stripes) {
		device = map->stripes[index].dev;
		device->bytes_used += stripe_size;
		ret = btrfs_update_device(trans, device);
		BUG_ON(ret);
		index++;
	}

	index = 0;
	stripe = &chunk->stripe;
	while (index < map->num_stripes) {
		device = map->stripes[index].dev;
		dev_offset = map->stripes[index].physical;

		btrfs_set_stack_stripe_devid(stripe, device->devid);
		btrfs_set_stack_stripe_offset(stripe, dev_offset);
		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
		stripe++;
		index++;
	}

	btrfs_set_stack_chunk_length(chunk, chunk_size);
	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
	btrfs_set_stack_chunk_type(chunk, map->type);
	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = chunk_offset;

	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
	BUG_ON(ret);

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
					     item_size);
		BUG_ON(ret);
	}
	kfree(chunk);
	return 0;
}

/*
 * Chunk allocation falls into two parts.  The first part does the work
 * that makes the newly allocated chunk usable, but does not do any
 * operation that modifies the chunk tree.  The second part does the work
 * that requires modifying the chunk tree.  This division is important for
 * the bootstrap process of adding storage to a seed btrfs.
 */
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
		      struct btrfs_root *extent_root, u64 type)
{
	u64 chunk_offset;
	u64 chunk_size;
	u64 stripe_size;
	struct map_lookup *map;
	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
	int ret;

	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
			      &chunk_offset);
	if (ret)
		return ret;

	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
				  &stripe_size, chunk_offset, type);
	if (ret)
		return ret;

	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
				   chunk_size, stripe_size);
	BUG_ON(ret);
	return 0;
}

static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_device *device)
{
	u64 chunk_offset;
	u64 sys_chunk_offset;
	u64 chunk_size;
	u64 sys_chunk_size;
	u64 stripe_size;
	u64 sys_stripe_size;
	u64 alloc_profile;
	struct map_lookup *map;
	struct map_lookup *sys_map;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
	int ret;

	ret = find_next_chunk(fs_info->chunk_root,
			      BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
	BUG_ON(ret);

	alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
			(fs_info->metadata_alloc_profile &
			 fs_info->avail_metadata_alloc_bits);
	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);

	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
				  &stripe_size, chunk_offset, alloc_profile);
	BUG_ON(ret);

	sys_chunk_offset = chunk_offset + chunk_size;

	alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
			(fs_info->system_alloc_profile &
			 fs_info->avail_system_alloc_bits);
	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);

	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
				  &sys_chunk_size, &sys_stripe_size,
				  sys_chunk_offset, alloc_profile);
	BUG_ON(ret);

	ret = btrfs_add_device(trans, fs_info->chunk_root, device);
	BUG_ON(ret);

	/*
	 * Modifying the chunk tree needs to allocate new blocks from both
	 * the system block group and the metadata block group.  So we can
	 * only do operations that require modifying the chunk tree after
	 * both block groups have been created.
	 */
	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
				   chunk_size, stripe_size);
	BUG_ON(ret);

	ret = __finish_chunk_alloc(trans, extent_root, sys_map,
				   sys_chunk_offset, sys_chunk_size,
				   sys_stripe_size);
	BUG_ON(ret);
	return 0;
}

int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
	int readonly = 0;
	int i;

	spin_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
	spin_unlock(&map_tree->map_tree.lock);
	if (!em)
		return 1;

	map = (struct map_lookup *)em->bdev;
	for (i = 0; i < map->num_stripes; i++) {
		if (!map->stripes[i].dev->writeable) {
			readonly = 1;
			break;
		}
	}
	free_extent_map(em);
	return readonly;
}

void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
{
	extent_map_tree_init(&tree->map_tree, GFP_NOFS);
}

void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
{
	struct extent_map *em;

	while (1) {
		spin_lock(&tree->map_tree.lock);
		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
		if (em)
			remove_extent_mapping(&tree->map_tree, em);
		spin_unlock(&tree->map_tree.lock);
		if (!em)
			break;
		kfree(em->bdev);
		/* once for us */
		free_extent_map(em);
		/* once for the tree */
		free_extent_map(em);
	}
}

int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct extent_map_tree *em_tree = &map_tree->map_tree;
	int ret;

	spin_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, len);
	spin_unlock(&em_tree->lock);
	BUG_ON(!em);

	BUG_ON(em->start > logical || em->start + em->len < logical);
	map = (struct map_lookup *)em->bdev;
	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
		ret = map->num_stripes;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		ret = map->sub_stripes;
	else
		ret = 1;
	free_extent_map(em);
	return ret;
}

static int find_live_mirror(struct map_lookup *map, int first, int num,
			    int optimal)
{
	int i;
	if (map->stripes[optimal].dev->bdev)
		return optimal;
	for (i = first; i < first + num; i++) {
		if (map->stripes[i].dev->bdev)
			return i;
	}
	/* we couldn't find one that doesn't fail.  Just return something
	 * and the io error handling code will clean up eventually
	 */
	return optimal;
}

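/*
 * Map a logical byte range in the chunk address space onto the physical
 * stripes that back it.  For writes every mirror is returned (RAID1/DUP:
 * all stripes, RAID10: one group of sub_stripes); for reads a single
 * live mirror is chosen.  The returned *length is clipped so the caller
 * never crosses a stripe boundary in one bio.
 *
 * For example, with stripe_len = 64K and a 3-device RAID0 chunk, a
 * logical offset of 300K from the chunk start gives stripe_nr = 4 and
 * stripe_offset = 44K; do_div(stripe_nr, 3) leaves stripe_nr = 1 with
 * stripe_index = 1, so the physical address is
 * stripes[1].physical + 1 * 64K + 44K.
 */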
static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
			     u64 logical, u64 *length,
			     struct btrfs_multi_bio **multi_ret,
			     int mirror_num, struct page *unplug_page)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct extent_map_tree *em_tree = &map_tree->map_tree;
	u64 offset;
	u64 stripe_offset;
	u64 stripe_nr;
	int stripes_allocated = 8;
	int stripes_required = 1;
	int stripe_index;
	int i;
	int num_stripes;
	int max_errors = 0;
	struct btrfs_multi_bio *multi = NULL;

	if (multi_ret && !(rw & (1 << BIO_RW)))
		stripes_allocated = 1;
again:
	if (multi_ret) {
		multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
				GFP_NOFS);
		if (!multi)
			return -ENOMEM;

		atomic_set(&multi->error, 0);
	}

	spin_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, *length);
	spin_unlock(&em_tree->lock);

	if (!em && unplug_page)
		return 0;

	if (!em) {
		printk(KERN_CRIT "unable to find logical %llu len %llu\n",
		       (unsigned long long)logical,
		       (unsigned long long)*length);
		BUG();
	}

	BUG_ON(em->start > logical || em->start + em->len < logical);
	map = (struct map_lookup *)em->bdev;
	offset = logical - em->start;

	if (mirror_num > map->num_stripes)
		mirror_num = 0;

	/* if our multi bio struct is too small, back off and try again */
	if (rw & (1 << BIO_RW)) {
		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
				 BTRFS_BLOCK_GROUP_DUP)) {
			stripes_required = map->num_stripes;
			max_errors = 1;
		} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
			stripes_required = map->sub_stripes;
			max_errors = 1;
		}
	}
	if (multi_ret && rw == WRITE &&
	    stripes_allocated < stripes_required) {
		stripes_allocated = map->num_stripes;
		free_extent_map(em);
		kfree(multi);
		goto again;
	}
	stripe_nr = offset;
	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	do_div(stripe_nr, map->stripe_len);

	stripe_offset = stripe_nr * map->stripe_len;
	BUG_ON(offset < stripe_offset);

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_offset;

	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
			 BTRFS_BLOCK_GROUP_RAID10 |
			 BTRFS_BLOCK_GROUP_DUP)) {
		/* we limit the length of each bio to what fits in a stripe */
		*length = min_t(u64, em->len - offset,
				map->stripe_len - stripe_offset);
	} else {
		*length = em->len - offset;
	}

	if (!multi_ret && !unplug_page)
		goto out;

	num_stripes = 1;
	stripe_index = 0;
	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
		if (unplug_page || (rw & (1 << BIO_RW)))
			num_stripes = map->num_stripes;
		else if (mirror_num)
			stripe_index = mirror_num - 1;
		else {
			stripe_index = find_live_mirror(map, 0,
					      map->num_stripes,
					      current->pid % map->num_stripes);
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		if (rw & (1 << BIO_RW))
			num_stripes = map->num_stripes;
		else if (mirror_num)
			stripe_index = mirror_num - 1;

	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
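		/*
		 * RAID10 lays the stripes out as num_stripes / sub_stripes
		 * groups, each group holding sub_stripes copies of the same
		 * data.  do_div() below picks the group for this stripe_nr
		 * and stripe_index points at the first mirror in that group.
		 */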
		int factor = map->num_stripes / map->sub_stripes;

		stripe_index = do_div(stripe_nr, factor);
		stripe_index *= map->sub_stripes;

		if (unplug_page || (rw & (1 << BIO_RW)))
			num_stripes = map->sub_stripes;
		else if (mirror_num)
			stripe_index += mirror_num - 1;
		else {
			stripe_index = find_live_mirror(map, stripe_index,
					      map->sub_stripes, stripe_index +
					      current->pid % map->sub_stripes);
		}
	} else {
		/*
		 * after this do_div call, stripe_nr is the number of stripes
		 * on this device we have to walk to find the data, and
		 * stripe_index is the number of our device in the stripe array
		 */
		stripe_index = do_div(stripe_nr, map->num_stripes);
	}
	BUG_ON(stripe_index >= map->num_stripes);

	for (i = 0; i < num_stripes; i++) {
		if (unplug_page) {
			struct btrfs_device *device;
			struct backing_dev_info *bdi;

			device = map->stripes[stripe_index].dev;
			if (device->bdev) {
				bdi = blk_get_backing_dev_info(device->bdev);
				if (bdi->unplug_io_fn)
					bdi->unplug_io_fn(bdi, unplug_page);
			}
		} else {
			multi->stripes[i].physical =
				map->stripes[stripe_index].physical +
				stripe_offset + stripe_nr * map->stripe_len;
			multi->stripes[i].dev = map->stripes[stripe_index].dev;
		}
		stripe_index++;
	}
	if (multi_ret) {
		*multi_ret = multi;
		multi->num_stripes = num_stripes;
		multi->max_errors = max_errors;
	}
out:
	free_extent_map(em);
	return 0;
}

int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
		    u64 logical, u64 *length,
		    struct btrfs_multi_bio **multi_ret, int mirror_num)
{
	return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
				 mirror_num, NULL);
}

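/*
 * Reverse mapping: given a chunk start and a physical byte offset on one
 * device (or on any device when devid is 0), work out which logical
 * addresses inside the chunk land on that physical stripe.  The unique
 * logical addresses are returned in a kzalloc'd array via *logical.
 */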
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
		     u64 chunk_start, u64 physical, u64 devid,
		     u64 **logical, int *naddrs, int *stripe_len)
{
	struct extent_map_tree *em_tree = &map_tree->map_tree;
	struct extent_map *em;
	struct map_lookup *map;
	u64 *buf;
	u64 bytenr;
	u64 length;
	u64 stripe_nr;
	int i, j, nr = 0;

	spin_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_start, 1);
	spin_unlock(&em_tree->lock);

	BUG_ON(!em || em->start != chunk_start);
	map = (struct map_lookup *)em->bdev;

	length = em->len;
	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		do_div(length, map->num_stripes / map->sub_stripes);
	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
		do_div(length, map->num_stripes);

	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
	BUG_ON(!buf);

	for (i = 0; i < map->num_stripes; i++) {
		if (devid && map->stripes[i].dev->devid != devid)
			continue;
		if (map->stripes[i].physical > physical ||
		    map->stripes[i].physical + length <= physical)
			continue;

		stripe_nr = physical - map->stripes[i].physical;
		do_div(stripe_nr, map->stripe_len);

		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
			stripe_nr = stripe_nr * map->num_stripes + i;
			do_div(stripe_nr, map->sub_stripes);
		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
			stripe_nr = stripe_nr * map->num_stripes + i;
		}
		bytenr = chunk_start + stripe_nr * map->stripe_len;
		WARN_ON(nr >= map->num_stripes);
		for (j = 0; j < nr; j++) {
			if (buf[j] == bytenr)
				break;
		}
		if (j == nr) {
			WARN_ON(nr >= map->num_stripes);
			buf[nr++] = bytenr;
		}
	}

	/* sanity check: each logical address we found must map back here */
	for (i = 0; i < nr; i++) {
		struct btrfs_multi_bio *multi;
		struct btrfs_bio_stripe *stripe;
		int ret;

		length = 1;
		ret = btrfs_map_block(map_tree, WRITE, buf[i],
				      &length, &multi, 0);
		BUG_ON(ret);

		stripe = multi->stripes;
		for (j = 0; j < multi->num_stripes; j++) {
			if (stripe->physical >= physical &&
			    physical < stripe->physical + length)
				break;
		}
		BUG_ON(j >= multi->num_stripes);
		kfree(multi);
	}

	*logical = buf;
	*naddrs = nr;
	*stripe_len = map->stripe_len;

	free_extent_map(em);
	return 0;
}

int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
		      u64 logical, struct page *page)
{
	u64 length = PAGE_CACHE_SIZE;
	return __btrfs_map_block(map_tree, READ, logical, &length,
				 NULL, 0, page);
}

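/*
 * Completion handler for bios that were cloned across several stripes.
 * Each failing clone bumps multi->error; only when the last outstanding
 * clone finishes do we restore the original bio's end_io/private, and we
 * report -EIO only if more stripes failed than max_errors allows.
 */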
static void end_bio_multi_stripe(struct bio *bio, int err)
{
	struct btrfs_multi_bio *multi = bio->bi_private;
	int is_orig_bio = 0;

	if (err)
		atomic_inc(&multi->error);

	if (bio == multi->orig_bio)
		is_orig_bio = 1;

	if (atomic_dec_and_test(&multi->stripes_pending)) {
		if (!is_orig_bio) {
			bio_put(bio);
			bio = multi->orig_bio;
		}
		bio->bi_private = multi->private;
		bio->bi_end_io = multi->end_io;
		/* only send an error to the higher layers if it is
		 * beyond the tolerance of the multi-bio
		 */
		if (atomic_read(&multi->error) > multi->max_errors) {
			err = -EIO;
		} else if (err) {
			/*
			 * this bio is actually up to date, we didn't
			 * go over the max number of errors
			 */
			set_bit(BIO_UPTODATE, &bio->bi_flags);
			err = 0;
		}
		kfree(multi);

		bio_endio(bio, err);
	} else if (!is_orig_bio) {
		bio_put(bio);
	}
}

struct async_sched {
	struct bio *bio;
	int rw;
	struct btrfs_fs_info *info;
	struct btrfs_work work;
};

/*
 * see run_scheduled_bios for a description of why bios are collected for
 * async submit.
 *
 * This will add one bio to the pending list for a device and make sure
 * the work struct is scheduled.
 */
static noinline int schedule_bio(struct btrfs_root *root,
				 struct btrfs_device *device,
				 int rw, struct bio *bio)
{
	int should_queue = 1;

	/* don't bother with additional async steps for reads, right now */
	if (!(rw & (1 << BIO_RW))) {
		bio_get(bio);
		submit_bio(rw, bio);
		bio_put(bio);
		return 0;
	}

	/*
	 * nr_async_bios allows us to reliably return congestion to the
	 * higher layers.  Otherwise, the async bio makes it appear we have
	 * made progress against dirty pages when we've really just put it
	 * on a queue for later
	 */
	atomic_inc(&root->fs_info->nr_async_bios);
	WARN_ON(bio->bi_next);
	bio->bi_next = NULL;
	bio->bi_rw |= rw;

	spin_lock(&device->io_lock);

	if (device->pending_bio_tail)
		device->pending_bio_tail->bi_next = bio;

	device->pending_bio_tail = bio;
	if (!device->pending_bios)
		device->pending_bios = bio;
	if (device->running_pending)
		should_queue = 0;

	spin_unlock(&device->io_lock);

	if (should_queue)
		btrfs_queue_worker(&root->fs_info->submit_workers,
				   &device->work);
	return 0;
}

int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
		  int mirror_num, int async_submit)
{
	struct btrfs_mapping_tree *map_tree;
	struct btrfs_device *dev;
	struct bio *first_bio = bio;
	u64 logical = (u64)bio->bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	struct btrfs_multi_bio *multi = NULL;
	int ret;
	int dev_nr = 0;
	int total_devs = 1;

	length = bio->bi_size;
	map_tree = &root->fs_info->mapping_tree;
	map_length = length;

	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
			      mirror_num);
	BUG_ON(ret);

	total_devs = multi->num_stripes;
	if (map_length < length) {
		printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
		       "len %llu\n", (unsigned long long)logical,
		       (unsigned long long)length,
		       (unsigned long long)map_length);
		BUG();
	}
	multi->end_io = first_bio->bi_end_io;
	multi->private = first_bio->bi_private;
	multi->orig_bio = first_bio;
	atomic_set(&multi->stripes_pending, multi->num_stripes);

	while (dev_nr < total_devs) {
		if (total_devs > 1) {
			if (dev_nr < total_devs - 1) {
				bio = bio_clone(first_bio, GFP_NOFS);
				BUG_ON(!bio);
			} else {
				bio = first_bio;
			}
			bio->bi_private = multi;
			bio->bi_end_io = end_bio_multi_stripe;
		}
		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
		dev = multi->stripes[dev_nr].dev;
		BUG_ON(rw == WRITE && !dev->writeable);
		if (dev && dev->bdev) {
			bio->bi_bdev = dev->bdev;
			if (async_submit)
				schedule_bio(root, dev, rw, bio);
			else
				submit_bio(rw, bio);
		} else {
			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
			bio->bi_sector = logical >> 9;
			bio_endio(bio, -EIO);
		}
		dev_nr++;
	}
	if (total_devs == 1)
		kfree(multi);
	return 0;
}

struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
				       u8 *uuid, u8 *fsid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;

	cur_devices = root->fs_info->fs_devices;
	while (cur_devices) {
		if (!fsid ||
		    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
			device = __find_device(&cur_devices->devices,
					       devid, uuid);
			if (device)
				return device;
		}
		cur_devices = cur_devices->seed;
	}
	return NULL;
}

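/*
 * The metadata references a devid we don't have an open device for
 * (e.g. a mount with -o degraded).  Create a stub btrfs_device with no
 * bdev so the chunk map can still be built; IO routed to it will fail
 * and be handled by the mirror/error paths.
 */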
static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
					    u64 devid, u8 *dev_uuid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;

	device = kzalloc(sizeof(*device), GFP_NOFS);
	if (!device)
		return NULL;
	list_add(&device->dev_list,
		 &fs_devices->devices);
	device->barriers = 1;
	device->dev_root = root->fs_info->dev_root;
	device->devid = devid;
	device->work.func = pending_bios_fn;
	device->fs_devices = fs_devices;
	fs_devices->num_devices++;
	spin_lock_init(&device->io_lock);
	INIT_LIST_HEAD(&device->dev_alloc_list);
	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
	return device;
}

static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
			  struct extent_buffer *leaf,
			  struct btrfs_chunk *chunk)
{
	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	u64 logical;
	u64 length;
	u64 devid;
	u8 uuid[BTRFS_UUID_SIZE];
	int num_stripes;
	int ret;
	int i;

	logical = key->offset;
	length = btrfs_chunk_length(leaf, chunk);

	spin_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
	spin_unlock(&map_tree->map_tree.lock);

	/* already mapped? */
	if (em && em->start <= logical && em->start + em->len > logical) {
		free_extent_map(em);
		return 0;
	} else if (em) {
		free_extent_map(em);
	}

	em = alloc_extent_map(GFP_NOFS);
	if (!em)
		return -ENOMEM;
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		free_extent_map(em);
		return -ENOMEM;
	}

	em->bdev = (struct block_device *)map;
	em->start = logical;
	em->len = length;
	em->block_start = 0;
	em->block_len = em->len;

	map->num_stripes = num_stripes;
	map->io_width = btrfs_chunk_io_width(leaf, chunk);
	map->io_align = btrfs_chunk_io_align(leaf, chunk);
	map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
	map->type = btrfs_chunk_type(leaf, chunk);
	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
	for (i = 0; i < num_stripes; i++) {
		map->stripes[i].physical =
			btrfs_stripe_offset_nr(leaf, chunk, i);
		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
		read_extent_buffer(leaf, uuid, (unsigned long)
				   btrfs_stripe_dev_uuid_nr(chunk, i),
				   BTRFS_UUID_SIZE);
		map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
							NULL);
		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
			kfree(map);
			free_extent_map(em);
			return -EIO;
		}
		if (!map->stripes[i].dev) {
			map->stripes[i].dev =
				add_missing_dev(root, devid, uuid);
			if (!map->stripes[i].dev) {
				kfree(map);
				free_extent_map(em);
				return -EIO;
			}
		}
		map->stripes[i].dev->in_fs_metadata = 1;
	}

	spin_lock(&map_tree->map_tree.lock);
	ret = add_extent_mapping(&map_tree->map_tree, em);
	spin_unlock(&map_tree->map_tree.lock);
	BUG_ON(ret);
	free_extent_map(em);

	return 0;
}

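/*
 * Copy the persistent fields of a dev item (sizes, alignment, uuid) out
 * of the leaf into the in-memory btrfs_device it describes.
 */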
static int fill_device_from_item(struct extent_buffer *leaf,
				 struct btrfs_dev_item *dev_item,
				 struct btrfs_device *device)
{
	unsigned long ptr;

	device->devid = btrfs_device_id(leaf, dev_item);
	device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
	device->type = btrfs_device_type(leaf, dev_item);
	device->io_align = btrfs_device_io_align(leaf, dev_item);
	device->io_width = btrfs_device_io_width(leaf, dev_item);
	device->sector_size = btrfs_device_sector_size(leaf, dev_item);

	ptr = (unsigned long)btrfs_device_uuid(dev_item);
	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);

	return 0;
}

static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

	mutex_lock(&uuid_mutex);

	fs_devices = root->fs_info->fs_devices->seed;
	while (fs_devices) {
		if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
			ret = 0;
			goto out;
		}
		fs_devices = fs_devices->seed;
	}

	fs_devices = find_fsid(fsid);
	if (!fs_devices) {
		ret = -ENOENT;
		goto out;
	}

	fs_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(fs_devices)) {
		ret = PTR_ERR(fs_devices);
		goto out;
	}

	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
				   root->fs_info->bdev_holder);
	if (ret)
		goto out;

	if (!fs_devices->seeding) {
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
		ret = -EINVAL;
		goto out;
	}

	fs_devices->seed = root->fs_info->fs_devices->seed;
	root->fs_info->fs_devices->seed = fs_devices;
out:
	mutex_unlock(&uuid_mutex);
	return ret;
}

static int read_one_dev(struct btrfs_root *root,
			struct extent_buffer *leaf,
			struct btrfs_dev_item *dev_item)
{
	struct btrfs_device *device;
	u64 devid;
	int ret;
	u8 fs_uuid[BTRFS_UUID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];

	devid = btrfs_device_id(leaf, dev_item);
	read_extent_buffer(leaf, dev_uuid,
			   (unsigned long)btrfs_device_uuid(dev_item),
			   BTRFS_UUID_SIZE);
	read_extent_buffer(leaf, fs_uuid,
			   (unsigned long)btrfs_device_fsid(dev_item),
			   BTRFS_UUID_SIZE);

	if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
		ret = open_seed_devices(root, fs_uuid);
		if (ret && !btrfs_test_opt(root, DEGRADED))
			return ret;
	}

	device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
	if (!device || !device->bdev) {
		if (!btrfs_test_opt(root, DEGRADED))
			return -EIO;

		if (!device) {
			printk(KERN_WARNING "warning devid %llu missing\n",
			       (unsigned long long)devid);
			device = add_missing_dev(root, devid, dev_uuid);
			if (!device)
				return -ENOMEM;
		}
	}

	if (device->fs_devices != root->fs_info->fs_devices) {
		BUG_ON(device->writeable);
		if (device->generation !=
		    btrfs_device_generation(leaf, dev_item))
			return -EINVAL;
	}

	fill_device_from_item(leaf, dev_item, device);
	device->dev_root = root->fs_info->dev_root;
	device->in_fs_metadata = 1;
	if (device->writeable)
		device->fs_devices->total_rw_bytes += device->total_bytes;
	ret = 0;
	return ret;
}

int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
{
	struct btrfs_dev_item *dev_item;

	dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
						     dev_item);
	return read_one_dev(root, buf, dev_item);
}

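/*
 * The superblock embeds a small sys_chunk_array of (disk key, chunk item)
 * pairs describing the SYSTEM chunks.  Parse it here, before the chunk
 * tree itself can be read, by copying the super block into a dummy extent
 * buffer so the regular chunk item accessors and read_one_chunk() work.
 */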
int btrfs_read_sys_array(struct btrfs_root *root)
{
	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
	struct extent_buffer *sb;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	unsigned long sb_ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
					  BTRFS_SUPER_INFO_SIZE);
	if (!sb)
		return -ENOMEM;
	btrfs_set_buffer_uptodate(sb);
	btrfs_set_buffer_lockdep_class(sb, 0);

	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key); ptr += len;
		sb_ptr += len;
		cur += len;

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)sb_ptr;
			ret = read_one_chunk(root, &key, sb, chunk);
			if (ret)
				break;
			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
			len = btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		ptr += len;
		sb_ptr += len;
		cur += len;
	}
	free_extent_buffer(sb);
	return ret;
}

int btrfs_read_chunk_tree(struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* first we search for all of the device items, and then we
	 * read in all of the chunk items.  This way we can create chunk
	 * mappings that reference all of the devices that are found
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
again:
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
			if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
				break;
			if (found_key.type == BTRFS_DEV_ITEM_KEY) {
				struct btrfs_dev_item *dev_item;
				dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
				ret = read_one_dev(root, leaf, dev_item);
				if (ret)
					goto error;
			}
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(root, &found_key, leaf, chunk);
			if (ret)
				goto error;
		}
		path->slots[0]++;
	}
	if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
		key.objectid = 0;
		btrfs_release_path(root, path);
		goto again;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}