/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"

struct map_lookup {
	u64 type;
	int io_align;
	int io_width;
	int stripe_len;
	int sector_size;
	int num_stripes;
	int sub_stripes;
	struct btrfs_bio_stripe stripes[];
};

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);

#define map_lookup_size(n) (sizeof(struct map_lookup) + \
			    (sizeof(struct btrfs_bio_stripe) * (n)))

static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

void btrfs_lock_volumes(void)
{
	mutex_lock(&uuid_mutex);
}

void btrfs_unlock_volumes(void)
{
	mutex_unlock(&uuid_mutex);
}

static void lock_chunks(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
	mutex_unlock(&root->fs_info->chunk_mutex);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		kfree(device->name);
		kfree(device);
	}
	kfree(fs_devices);
}

int btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, list);
		list_del(&fs_devices->list);
		free_fs_devices(fs_devices);
	}
	return 0;
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
						   u64 devid, u8 *uuid)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline int run_scheduled_bios(struct btrfs_device *device)
{
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_fs_info *fs_info;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run = 0;
	unsigned long limit;

	bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
	fs_info = device->dev_root->fs_info;
	limit = btrfs_async_submit_limit(fs_info);
	limit = limit * 2 / 3;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	pending = device->pending_bios;
	tail = device->pending_bio_tail;
	WARN_ON(pending && !tail);
	device->pending_bios = NULL;
	device->pending_bio_tail = NULL;

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (pending) {
		again = 1;
		device->running_pending = 1;
	} else {
		again = 0;
		device->running_pending = 0;
	}
	spin_unlock(&device->io_lock);

	while (pending) {
		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;
		atomic_dec(&fs_info->nr_async_bios);

		if (atomic_read(&fs_info->nr_async_bios) < limit &&
		    waitqueue_active(&fs_info->async_submit_wait))
			wake_up(&fs_info->async_submit_wait);

		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
		bio_get(cur);
		submit_bio(cur->bi_rw, cur);
		bio_put(cur);
		num_run++;

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && num_run > 16 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct bio *old_head;

			spin_lock(&device->io_lock);

			old_head = device->pending_bios;
			device->pending_bios = pending;
			if (device->pending_bio_tail)
				tail->bi_next = old_head;
			else
				device->pending_bio_tail = tail;

			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_requeue_work(&device->work);
			goto done;
		}
	}
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios)
		goto loop_lock;
	spin_unlock(&device->io_lock);
done:
	return 0;
}

static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}

static noinline int device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	u64 found_transid = btrfs_super_generation(disk_super);

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
		if (!fs_devices)
			return -ENOMEM;
		INIT_LIST_HEAD(&fs_devices->devices);
		INIT_LIST_HEAD(&fs_devices->alloc_list);
		list_add(&fs_devices->list, &fs_uuids);
		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
		fs_devices->latest_devid = devid;
		fs_devices->latest_trans = found_transid;
		device = NULL;
	} else {
		device = __find_device(&fs_devices->devices, devid,
				       disk_super->dev_item.uuid);
	}
	if (!device) {
		if (fs_devices->opened)
			return -EBUSY;

		device = kzalloc(sizeof(*device), GFP_NOFS);
		if (!device) {
			/* we can safely leave the fs_devices entry around */
			return -ENOMEM;
		}
		device->devid = devid;
		device->work.func = pending_bios_fn;
		memcpy(device->uuid, disk_super->dev_item.uuid,
		       BTRFS_UUID_SIZE);
		device->barriers = 1;
		spin_lock_init(&device->io_lock);
		device->name = kstrdup(path, GFP_NOFS);
		if (!device->name) {
			kfree(device);
			return -ENOMEM;
		}
		INIT_LIST_HEAD(&device->dev_alloc_list);
		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}

	if (found_transid > fs_devices->latest_trans) {
		fs_devices->latest_devid = devid;
		fs_devices->latest_trans = found_transid;
	}
	*fs_devices_ret = fs_devices;
	return 0;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
	if (!fs_devices)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&fs_devices->devices);
	INIT_LIST_HEAD(&fs_devices->alloc_list);
	INIT_LIST_HEAD(&fs_devices->list);
	fs_devices->latest_devid = orig->latest_devid;
	fs_devices->latest_trans = orig->latest_trans;
	memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));

	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		device = kzalloc(sizeof(*device), GFP_NOFS);
		if (!device)
			goto error;

		device->name = kstrdup(orig_dev->name, GFP_NOFS);
		if (!device->name)
			goto error;

		device->devid = orig_dev->devid;
		device->work.func = pending_bios_fn;
		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
		device->barriers = 1;
		spin_lock_init(&device->io_lock);
		INIT_LIST_HEAD(&device->dev_list);
		INIT_LIST_HEAD(&device->dev_alloc_list);

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	return fs_devices;
error:
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *next;

	mutex_lock(&uuid_mutex);
again:
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (device->in_fs_metadata)
			continue;

		if (device->bdev) {
			close_bdev_exclusive(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (device->writeable) {
			list_del_init(&device->dev_alloc_list);
			device->writeable = 0;
			fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		kfree(device->name);
		kfree(device);
	}

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	mutex_unlock(&uuid_mutex);
	return 0;
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	if (--fs_devices->opened > 0)
		return 0;

	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (device->bdev) {
			close_bdev_exclusive(device->bdev, device->mode);
			fs_devices->open_devices--;
		}
		if (device->writeable) {
			list_del_init(&device->dev_alloc_list);
			fs_devices->rw_devices--;
		}

		device->bdev = NULL;
		device->writeable = 0;
		device->in_fs_metadata = 0;
	}
	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = __btrfs_close_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	return ret;
}

static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct block_device *bdev;
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *device;
	struct block_device *latest_bdev = NULL;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 latest_devid = 0;
	u64 latest_transid = 0;
	u64 devid;
	int seeding = 1;
	int ret = 0;

	list_for_each_entry(device, head, dev_list) {
		if (device->bdev)
			continue;
		if (!device->name)
			continue;

		bdev = open_bdev_exclusive(device->name, flags, holder);
		if (IS_ERR(bdev)) {
			printk(KERN_INFO "open %s failed\n", device->name);
			goto error;
		}
		set_blocksize(bdev, 4096);

		bh = btrfs_read_dev_super(bdev);
		if (!bh)
			goto error_close;

		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = le64_to_cpu(disk_super->dev_item.devid);
		if (devid != device->devid)
			goto error_brelse;

		if (memcmp(device->uuid, disk_super->dev_item.uuid,
			   BTRFS_UUID_SIZE))
			goto error_brelse;

		device->generation = btrfs_super_generation(disk_super);
		if (!latest_transid || device->generation > latest_transid) {
			latest_devid = devid;
			latest_transid = device->generation;
			latest_bdev = bdev;
		}

		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
			device->writeable = 0;
		} else {
			device->writeable = !bdev_read_only(bdev);
			seeding = 0;
		}

		device->bdev = bdev;
		device->in_fs_metadata = 0;
		device->mode = flags;

		fs_devices->open_devices++;
		if (device->writeable) {
			fs_devices->rw_devices++;
			list_add(&device->dev_alloc_list,
				 &fs_devices->alloc_list);
		}
		continue;

error_brelse:
		brelse(bh);
error_close:
		close_bdev_exclusive(bdev, FMODE_READ);
error:
		continue;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EIO;
		goto out;
	}
	fs_devices->seeding = seeding;
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_bdev;
	fs_devices->latest_devid = latest_devid;
	fs_devices->latest_trans = latest_transid;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	mutex_lock(&uuid_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		ret = __btrfs_open_devices(fs_devices, flags, holder);
	}
	mutex_unlock(&uuid_mutex);
	return ret;
}

int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *disk_super;
	struct block_device *bdev;
	struct buffer_head *bh;
	int ret;
	u64 devid;
	u64 transid;

	mutex_lock(&uuid_mutex);

	bdev = open_bdev_exclusive(path, flags, holder);

	if (IS_ERR(bdev)) {
		ret = PTR_ERR(bdev);
		goto error;
	}

	ret = set_blocksize(bdev, 4096);
	if (ret)
		goto error_close;
	bh = btrfs_read_dev_super(bdev);
	if (!bh) {
		ret = -EIO;
		goto error_close;
	}
	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = le64_to_cpu(disk_super->dev_item.devid);
	transid = btrfs_super_generation(disk_super);
	if (disk_super->label[0])
		printk(KERN_INFO "device label %s ", disk_super->label);
	else {
		/* FIXME, make a real uuid parser */
		printk(KERN_INFO "device fsid %llx-%llx ",
		       *(unsigned long long *)disk_super->fsid,
		       *(unsigned long long *)(disk_super->fsid + 8));
	}
	printk(KERN_CONT "devid %llu transid %llu %s\n",
	       (unsigned long long)devid, (unsigned long long)transid, path);
	ret = device_list_add(path, disk_super, devid, fs_devices_ret);

	brelse(bh);
error_close:
	close_bdev_exclusive(bdev, flags);
error:
	mutex_unlock(&uuid_mutex);
	return ret;
}

/*
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 */
static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
					 struct btrfs_device *device,
					 u64 num_bytes, u64 *start)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 hole_size = 0;
	u64 last_byte = 0;
	u64 search_start = 0;
	u64 search_end = device->total_bytes;
	int ret;
	int slot = 0;
	int start_found;
	struct extent_buffer *l;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = 2;
	start_found = 0;

	/* FIXME use last free of some kind */

	/* we don't want to overwrite the superblock on the drive,
	 * so we make sure to start at an offset of at least 1MB
	 */
	search_start = max((u64)1024 * 1024, search_start);

	if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
		search_start = max(root->fs_info->alloc_start, search_start);

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	ret = btrfs_previous_item(root, path, 0, key.type);
	if (ret < 0)
		goto error;
	l = path->nodes[0];
	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
no_more_items:
			if (!start_found) {
				if (search_start >= search_end) {
					ret = -ENOSPC;
					goto error;
				}
				*start = search_start;
				start_found = 1;
				goto check_pending;
			}
			*start = last_byte > search_start ?
				last_byte : search_start;
			if (search_end <= *start) {
				ret = -ENOSPC;
				goto error;
			}
			goto check_pending;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			goto no_more_items;

		if (key.offset >= search_start && key.offset > last_byte &&
		    start_found) {
			if (last_byte < search_start)
				last_byte = search_start;
			hole_size = key.offset - last_byte;
			if (key.offset > last_byte &&
			    hole_size >= num_bytes) {
				*start = last_byte;
				goto check_pending;
			}
		}
		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
			goto next;

		start_found = 1;
		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
next:
		path->slots[0]++;
		cond_resched();
	}
check_pending:
	/* we have to make sure we didn't find an extent that has already
	 * been allocated by the map tree or the original allocation
	 */
	BUG_ON(*start < search_start);

	if (*start + num_bytes > search_end) {
		ret = -ENOSPC;
		goto error;
	}
	/* check for pending inserts here */
	ret = 0;

error:
	btrfs_free_path(path);
	return ret;
}

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				 struct btrfs_device *device,
				 u64 start)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		BUG_ON(ret);
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		ret = 0;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	}
	BUG_ON(ret);

	if (device->bytes_used > 0)
		device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
	ret = btrfs_del_item(trans, root, path);
	BUG_ON(ret);

	btrfs_free_path(path);
	return ret;
}

int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
			   struct btrfs_device *device,
			   u64 chunk_tree, u64 chunk_objectid,
			   u64 chunk_offset, u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	WARN_ON(!device->in_fs_metadata);
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
	BUG_ON(ret);

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);
	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
		    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
		    BTRFS_UUID_SIZE);

	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);
	return ret;
}

static noinline int find_next_chunk(struct btrfs_root *root,
				    u64 objectid, u64 *offset)
{
	struct btrfs_path *path;
	int ret;
	struct btrfs_key key;
	struct btrfs_chunk *chunk;
	struct btrfs_key found_key;

	path = btrfs_alloc_path();
	BUG_ON(!path);

	key.objectid = objectid;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0);

	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
	if (ret) {
		*offset = 0;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		if (found_key.objectid != objectid)
			*offset = 0;
		else {
			chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
					       struct btrfs_chunk);
			*offset = found_key.offset +
				btrfs_chunk_length(path->nodes[0], chunk);
		}
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

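	/*
	 * the search key used (u64)-1 as the offset, which no item can have,
	 * so an exact match here would mean a corrupted chunk tree;
	 * btrfs_previous_item below walks back to the last real DEV_ITEM.
	 */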
	BUG_ON(ret == 0);

	ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*objectid = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*objectid = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
int btrfs_add_device(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = (unsigned long)btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = (unsigned long)btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

static int btrfs_rm_dev_item(struct btrfs_root *root,
			     struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 1);
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;
	lock_chunks(root);

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret)
		goto out;
out:
	btrfs_free_path(path);
	unlock_chunks(root);
	btrfs_commit_transaction(trans, root);
	return ret;
}

int btrfs_rm_device(struct btrfs_root *root, char *device_path)
{
	struct btrfs_device *device;
	struct btrfs_device *next_device;
	struct block_device *bdev;
	struct buffer_head *bh = NULL;
	struct btrfs_super_block *disk_super;
	u64 all_avail;
	u64 devid;
	u64 num_devices;
	u8 *dev_uuid;
	int ret = 0;

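	/*
	 * removal touches both the global uuid list and this filesystem's
	 * device lists, so hold uuid_mutex and the per-fs volume_mutex for
	 * the whole operation.
	 */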
	mutex_lock(&uuid_mutex);
	mutex_lock(&root->fs_info->volume_mutex);

	all_avail = root->fs_info->avail_data_alloc_bits |
		root->fs_info->avail_system_alloc_bits |
		root->fs_info->avail_metadata_alloc_bits;

	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
	    root->fs_info->fs_devices->rw_devices <= 4) {
		printk(KERN_ERR "btrfs: unable to go below four devices "
		       "on raid10\n");
		ret = -EINVAL;
		goto out;
	}

	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
	    root->fs_info->fs_devices->rw_devices <= 2) {
		printk(KERN_ERR "btrfs: unable to go below two "
		       "devices on raid1\n");
		ret = -EINVAL;
		goto out;
	}

	if (strcmp(device_path, "missing") == 0) {
		struct list_head *devices;
		struct btrfs_device *tmp;

		device = NULL;
		devices = &root->fs_info->fs_devices->devices;
		list_for_each_entry(tmp, devices, dev_list) {
			if (tmp->in_fs_metadata && !tmp->bdev) {
				device = tmp;
				break;
			}
		}
		bdev = NULL;
		bh = NULL;
		disk_super = NULL;
		if (!device) {
			printk(KERN_ERR "btrfs: no missing devices found to "
			       "remove\n");
			goto out;
		}
	} else {
		bdev = open_bdev_exclusive(device_path, FMODE_READ,
					   root->fs_info->bdev_holder);
		if (IS_ERR(bdev)) {
			ret = PTR_ERR(bdev);
			goto out;
		}

		set_blocksize(bdev, 4096);
		bh = btrfs_read_dev_super(bdev);
		if (!bh) {
			ret = -EIO;
			goto error_close;
		}
		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = le64_to_cpu(disk_super->dev_item.devid);
		dev_uuid = disk_super->dev_item.uuid;
		device = btrfs_find_device(root, devid, dev_uuid,
					   disk_super->fsid);
		if (!device) {
			ret = -ENOENT;
			goto error_brelse;
		}
	}

	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
		printk(KERN_ERR "btrfs: unable to remove the only writeable "
		       "device\n");
		ret = -EINVAL;
		goto error_brelse;
	}

	if (device->writeable) {
		list_del_init(&device->dev_alloc_list);
		root->fs_info->fs_devices->rw_devices--;
	}

	ret = btrfs_shrink_device(device, 0);
	if (ret)
		goto error_brelse;

	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
	if (ret)
		goto error_brelse;

	device->in_fs_metadata = 0;
	list_del_init(&device->dev_list);
	device->fs_devices->num_devices--;

	next_device = list_entry(root->fs_info->fs_devices->devices.next,
				 struct btrfs_device, dev_list);
	if (device->bdev == root->fs_info->sb->s_bdev)
		root->fs_info->sb->s_bdev = next_device->bdev;
	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
		root->fs_info->fs_devices->latest_bdev = next_device->bdev;

	if (device->bdev) {
		close_bdev_exclusive(device->bdev, device->mode);
		device->bdev = NULL;
		device->fs_devices->open_devices--;
	}

	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);

	if (device->fs_devices->open_devices == 0) {
		struct btrfs_fs_devices *fs_devices;
		fs_devices = root->fs_info->fs_devices;
		while (fs_devices) {
			if (fs_devices->seed == device->fs_devices)
				break;
			fs_devices = fs_devices->seed;
		}
		fs_devices->seed = device->fs_devices->seed;
		device->fs_devices->seed = NULL;
		__btrfs_close_devices(device->fs_devices);
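		/* nothing in this fs_devices is open any more; free it */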
		free_fs_devices(device->fs_devices);
	}

	/*
	 * at this point, the device is zero sized.  We want to
	 * remove it from the devices list and zero out the old super
	 */
	if (device->writeable) {
		/* make sure this device isn't detected as part of
		 * the FS anymore
		 */
		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
		set_buffer_dirty(bh);
		sync_dirty_buffer(bh);
	}

	kfree(device->name);
	kfree(device);
	ret = 0;

error_brelse:
	brelse(bh);
error_close:
	if (bdev)
		close_bdev_exclusive(bdev, FMODE_READ);
out:
	mutex_unlock(&root->fs_info->volume_mutex);
	mutex_unlock(&uuid_mutex);
	return ret;
}

/*
 * does all the dirty work required for changing the file system's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	BUG_ON(!mutex_is_locked(&uuid_mutex));
	if (!fs_devices->seeding)
		return -EINVAL;

	seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
	if (!seed_devices)
		return -ENOMEM;

	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	list_splice_init(&fs_devices->devices, &seed_devices->devices);
	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
	list_for_each_entry(device, &seed_devices->devices, dev_list) {
		device->fs_devices = seed_devices;
	}

	fs_devices->seeding = 0;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->seed = seed_devices;

	generate_random_uuid(fs_devices->fsid);
	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}

/*
 * store the expected generation for seed devices in device items.
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dev_item *dev_item;
	struct btrfs_device *device;
	struct btrfs_key key;
	u8 fs_uuid[BTRFS_UUID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];
	u64 devid;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	root = root->fs_info->chunk_root;
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_DEV_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		if (ret < 0)
			goto error;

		leaf = path->nodes[0];
next_slot:
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;
			if (ret < 0)
				goto error;
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			btrfs_release_path(root, path);
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
		    key.type != BTRFS_DEV_ITEM_KEY)
			break;

		dev_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_dev_item);
		devid = btrfs_device_id(leaf, dev_item);
		read_extent_buffer(leaf, dev_uuid,
				   (unsigned long)btrfs_device_uuid(dev_item),
				   BTRFS_UUID_SIZE);
		read_extent_buffer(leaf, fs_uuid,
				   (unsigned long)btrfs_device_fsid(dev_item),
				   BTRFS_UUID_SIZE);
		device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
		BUG_ON(!device);

		if (device->fs_devices->seeding) {
			btrfs_set_device_generation(leaf, dev_item,
						    device->generation);
			btrfs_mark_buffer_dirty(leaf);
		}

		path->slots[0]++;
		goto next_slot;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct list_head *devices;
	struct super_block *sb = root->fs_info->sb;
	u64 total_bytes;
	int seeding_dev = 0;
	int ret = 0;

	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
		return -EINVAL;

	bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
	if (!bdev)
		return -EIO;

	if (root->fs_info->fs_devices->seeding) {
		seeding_dev = 1;
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
	}

	filemap_write_and_wait(bdev->bd_inode->i_mapping);
	mutex_lock(&root->fs_info->volume_mutex);

	devices = &root->fs_info->fs_devices->devices;
	list_for_each_entry(device, devices, dev_list) {
		if (device->bdev == bdev) {
			ret = -EEXIST;
			goto error;
		}
	}

	device = kzalloc(sizeof(*device), GFP_NOFS);
	if (!device) {
		/* we can safely leave the fs_devices entry around */
		ret = -ENOMEM;
		goto error;
	}

	device->name = kstrdup(device_path, GFP_NOFS);
	if (!device->name) {
		kfree(device);
		ret = -ENOMEM;
		goto error;
	}

	ret = find_next_devid(root, &device->devid);
	if (ret) {
		kfree(device);
		goto error;
	}

	trans = btrfs_start_transaction(root, 1);
	lock_chunks(root);

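	/*
	 * fill in the rest of the in-memory device while holding the chunk
	 * mutex, before it is linked into the per-fs lists below.
	 */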
	device->barriers = 1;
	device->writeable = 1;
	device->work.func = pending_bios_fn;
	generate_random_uuid(device->uuid);
	spin_lock_init(&device->io_lock);
	device->generation = trans->transid;
	device->io_width = root->sectorsize;
	device->io_align = root->sectorsize;
	device->sector_size = root->sectorsize;
	device->total_bytes = i_size_read(bdev->bd_inode);
	device->dev_root = root->fs_info->dev_root;
	device->bdev = bdev;
	device->in_fs_metadata = 1;
	device->mode = 0;
	set_blocksize(device->bdev, 4096);

	if (seeding_dev) {
		sb->s_flags &= ~MS_RDONLY;
		ret = btrfs_prepare_sprout(trans, root);
		BUG_ON(ret);
	}

	device->fs_devices = root->fs_info->fs_devices;
	list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
	list_add(&device->dev_alloc_list,
		 &root->fs_info->fs_devices->alloc_list);
	root->fs_info->fs_devices->num_devices++;
	root->fs_info->fs_devices->open_devices++;
	root->fs_info->fs_devices->rw_devices++;
	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;

	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
				    total_bytes + device->total_bytes);

	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
	btrfs_set_super_num_devices(&root->fs_info->super_copy,
				    total_bytes + 1);

	if (seeding_dev) {
		ret = init_first_rw_device(trans, root, device);
		BUG_ON(ret);
		ret = btrfs_finish_sprout(trans, root);
		BUG_ON(ret);
	} else {
		ret = btrfs_add_device(trans, root, device);
	}

	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
	btrfs_clear_space_info_full(root->fs_info);

	unlock_chunks(root);
	btrfs_commit_transaction(trans, root);

	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);

		ret = btrfs_relocate_sys_chunks(root);
		BUG_ON(ret);
	}
out:
	mutex_unlock(&root->fs_info->volume_mutex);
	return ret;
error:
	close_bdev_exclusive(bdev, 0);
	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
	goto out;
}

static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
					struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	root = device->dev_root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
	btrfs_mark_buffer_dirty(leaf);

out:
	btrfs_free_path(path);
	return ret;
}

static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
			       struct btrfs_device *device, u64 new_size)
{
	struct btrfs_super_block *super_copy =
		&device->dev_root->fs_info->super_copy;
	u64 old_total = btrfs_super_total_bytes(super_copy);
	u64 diff = new_size - device->total_bytes;

	if (!device->writeable)
		return -EACCES;
	if (new_size <= device->total_bytes)
		return -EINVAL;

	btrfs_set_super_total_bytes(super_copy, old_total + diff);
	device->fs_devices->total_rw_bytes += diff;

	device->total_bytes = new_size;
	btrfs_clear_space_info_full(device->dev_root->fs_info);

	return btrfs_update_device(trans, device);
}

int btrfs_grow_device(struct btrfs_trans_handle *trans,
		      struct btrfs_device *device, u64 new_size)
{
	int ret;
	lock_chunks(device->dev_root);
	ret = __btrfs_grow_device(trans, device, new_size);
	unlock_chunks(device->dev_root);
	return ret;
}

static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    u64 chunk_tree, u64 chunk_objectid,
			    u64 chunk_offset)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;

	root = root->fs_info->chunk_root;
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = chunk_objectid;
	key.offset = chunk_offset;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	BUG_ON(ret);

	ret = btrfs_del_item(trans, root, path);
	BUG_ON(ret);

	btrfs_free_path(path);
	return 0;
}

static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
			       chunk_offset)
{
	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)(ptr + len);
			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			len += btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		if (key.objectid == chunk_objectid &&
		    key.offset == chunk_offset) {
			memmove(ptr, ptr + len, array_size - (cur + len));
			array_size -= len;
			btrfs_set_super_sys_array_size(super_copy, array_size);
		} else {
			ptr += len;
			cur += len;
		}
	}
	return ret;
}

static int btrfs_relocate_chunk(struct btrfs_root *root,
				u64 chunk_tree, u64 chunk_objectid,
				u64 chunk_offset)
{
	struct extent_map_tree *em_tree;
	struct btrfs_root *extent_root;
	struct btrfs_trans_handle *trans;
	struct extent_map *em;
	struct map_lookup *map;
	int ret;
	int i;

	printk(KERN_INFO "btrfs relocating chunk %llu\n",
	       (unsigned long long)chunk_offset);
	root = root->fs_info->chunk_root;
	extent_root = root->fs_info->extent_root;
	em_tree = &root->fs_info->mapping_tree.map_tree;

	/* step one, relocate all the extents inside this chunk */
	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
	BUG_ON(ret);

	trans = btrfs_start_transaction(root, 1);
	BUG_ON(!trans);

	lock_chunks(root);

	/*
	 * step two, delete the device extents and the
	 * chunk tree entries
	 */
	spin_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	spin_unlock(&em_tree->lock);

	BUG_ON(em->start > chunk_offset ||
	       em->start + em->len < chunk_offset);
	map = (struct map_lookup *)em->bdev;

	for (i = 0; i < map->num_stripes; i++) {
		ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
					    map->stripes[i].physical);
		BUG_ON(ret);

		if (map->stripes[i].dev) {
			ret = btrfs_update_device(trans, map->stripes[i].dev);
			BUG_ON(ret);
		}
	}
	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
			       chunk_offset);

	BUG_ON(ret);

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
		BUG_ON(ret);
	}

	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
	BUG_ON(ret);

	spin_lock(&em_tree->lock);
	remove_extent_mapping(em_tree, em);
	spin_unlock(&em_tree->lock);

	kfree(map);
	em->bdev = NULL;

	/* once for the tree */
	free_extent_map(em);
	/* once for us */
	free_extent_map(em);

	unlock_chunks(root);
	btrfs_end_transaction(trans, root);
	return 0;
}

static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
{
	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_tree = chunk_root->root_key.objectid;
	u64 chunk_type;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0)
			goto error;
		BUG_ON(ret == 0);

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
		btrfs_release_path(chunk_root, path);

		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
			ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
						   found_key.objectid,
						   found_key.offset);
			BUG_ON(ret);
		}

		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

static u64 div_factor(u64 num, int factor)
{
	if (factor == 10)
		return num;
	num *= factor;
	do_div(num, 10);
	return num;
}

int btrfs_balance(struct btrfs_root *dev_root)
{
	int ret;
	struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
	struct btrfs_device *device;
	u64 old_size;
	u64 size_to_free;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_chunk *chunk;
	struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_key found_key;

	if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
		return -EROFS;

	mutex_lock(&dev_root->fs_info->volume_mutex);
	dev_root = dev_root->fs_info->dev_root;

	/* step one make some room on all the devices */
	list_for_each_entry(device, devices, dev_list) {
		old_size = device->total_bytes;
		size_to_free = div_factor(old_size, 1);
		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
		if (!device->writeable ||
		    device->total_bytes - device->bytes_used > size_to_free)
			continue;

		ret = btrfs_shrink_device(device, old_size - size_to_free);
		BUG_ON(ret);

		trans = btrfs_start_transaction(dev_root, 1);
		BUG_ON(!trans);

		ret = btrfs_grow_device(trans, device, old_size);
		BUG_ON(ret);

		btrfs_end_transaction(trans, dev_root);
	}

	/* step two, relocate all the chunks */
	path = btrfs_alloc_path();
	BUG_ON(!path);

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0)
			goto error;

		/*
		 * this shouldn't happen, it means the last relocate
		 * failed
		 */
		if (ret == 0)
			break;

		ret = btrfs_previous_item(chunk_root, path, 0,
					  BTRFS_CHUNK_ITEM_KEY);
		if (ret)
			break;

		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		if (found_key.objectid != key.objectid)
			break;

		chunk = btrfs_item_ptr(path->nodes[0],
				       path->slots[0],
				       struct btrfs_chunk);
		key.offset = found_key.offset;
		/* chunk zero is special */
		if (key.offset == 0)
			break;

		btrfs_release_path(chunk_root, path);
		ret = btrfs_relocate_chunk(chunk_root,
					   chunk_root->root_key.objectid,
					   found_key.objectid,
					   found_key.offset);
		BUG_ON(ret);
	}
	ret = 0;
error:
	btrfs_free_path(path);
	mutex_unlock(&dev_root->fs_info->volume_mutex);
	return ret;
}

/*
 * shrinking a device means finding all of the device extents past
 * the new size, and then following the back refs to the chunks.
 * The chunk relocation code actually frees the device extent.
 */
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 length;
	u64 chunk_tree;
	u64 chunk_objectid;
	u64 chunk_offset;
	int ret;
	int slot;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
	u64 old_total = btrfs_super_total_bytes(super_copy);
	u64 diff = device->total_bytes - new_size;

	if (new_size >= device->total_bytes)
		return -EINVAL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 1);
	if (!trans) {
		ret = -ENOMEM;
		goto done;
	}

	path->reada = 2;

	lock_chunks(root);

	device->total_bytes = new_size;
	if (device->writeable)
		device->fs_devices->total_rw_bytes -= diff;
	ret = btrfs_update_device(trans, device);
	if (ret) {
		unlock_chunks(root);
		btrfs_end_transaction(trans, root);
		goto done;
	}
	WARN_ON(diff > old_total);
	btrfs_set_super_total_bytes(super_copy, old_total - diff);
	unlock_chunks(root);
	btrfs_end_transaction(trans, root);

	key.objectid = device->devid;
	key.offset = (u64)-1;
	key.type = BTRFS_DEV_EXTENT_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto done;

		ret = btrfs_previous_item(root, path, 0, key.type);
		if (ret < 0)
			goto done;
		if (ret) {
			ret = 0;
			goto done;
		}

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, path->slots[0]);

		if (key.objectid != device->devid)
			goto done;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		if (key.offset + length <= new_size)
			goto done;

		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
		btrfs_release_path(root, path);

		ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
					   chunk_offset);
		if (ret)
			goto done;
	}

done:
	btrfs_free_path(path);
	return ret;
}

static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_key *key,
				  struct btrfs_chunk *chunk, int item_size)
{
	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
	struct btrfs_disk_key disk_key;
	u32 array_size;
	u8 *ptr;

	array_size = btrfs_super_sys_array_size(super_copy);
	if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
		return -EFBIG;

	ptr = super_copy->sys_chunk_array + array_size;
	btrfs_cpu_key_to_disk(&disk_key, key);
	memcpy(ptr, &disk_key, sizeof(disk_key));
	ptr += sizeof(disk_key);
	memcpy(ptr, chunk, item_size);
	item_size += sizeof(disk_key);
	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
	return 0;
}

static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
					int num_stripes, int sub_stripes)
{
	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
		return calc_size;
	else if (type & BTRFS_BLOCK_GROUP_RAID10)
		return calc_size * (num_stripes / sub_stripes);
	else
		return calc_size * num_stripes;
}

static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
			       struct btrfs_root *extent_root,
			       struct map_lookup **map_ret,
			       u64 *num_bytes, u64 *stripe_size,
			       u64 start, u64 type)
{
	struct btrfs_fs_info *info = extent_root->fs_info;
	struct btrfs_device *device = NULL;
	struct btrfs_fs_devices *fs_devices = info->fs_devices;
	struct list_head *cur;
	struct map_lookup *map = NULL;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct list_head private_devs;
	int min_stripe_size = 1 * 1024 * 1024;
	u64 calc_size = 1024 * 1024 * 1024;
	u64 max_chunk_size = calc_size;
	u64 min_free;
	u64 avail;
	u64 max_avail = 0;
	u64 dev_offset;
	int num_stripes = 1;
	int min_stripes = 1;
	int sub_stripes = 0;
	int looped = 0;
	int ret;
	int index;
	int stripe_len = 64 * 1024;

	if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
	    (type & BTRFS_BLOCK_GROUP_DUP)) {
		WARN_ON(1);
		type &= ~BTRFS_BLOCK_GROUP_DUP;
	}
	if (list_empty(&fs_devices->alloc_list))
		return -ENOSPC;

	if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
		num_stripes = fs_devices->rw_devices;
		min_stripes = 2;
	}
	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
		num_stripes = 2;
		min_stripes = 2;
	}
	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
		num_stripes = min_t(u64, 2, fs_devices->rw_devices);
		if (num_stripes < 2)
			return -ENOSPC;
		min_stripes = 2;
	}
	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
		num_stripes = fs_devices->rw_devices;
		if (num_stripes < 4)
			return -ENOSPC;
		num_stripes &= ~(u32)1;
		sub_stripes = 2;
		min_stripes = 4;
	}

	if (type & BTRFS_BLOCK_GROUP_DATA) {
		max_chunk_size = 10 * calc_size;
		min_stripe_size = 64 * 1024 * 1024;
	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
		max_chunk_size = 4 * calc_size;
		min_stripe_size = 32 * 1024 * 1024;
	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
		calc_size = 8 * 1024 * 1024;
		max_chunk_size = calc_size * 2;
		min_stripe_size = 1 * 1024 * 1024;
	}

	/* we don't want a chunk larger than 10% of writeable space */
	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
			     max_chunk_size);

again:
	if (!map || map->num_stripes != num_stripes) {
		kfree(map);
		map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
		if (!map)
			return -ENOMEM;
		map->num_stripes = num_stripes;
	}

	if (calc_size * num_stripes > max_chunk_size) {
		calc_size = max_chunk_size;
		do_div(calc_size, num_stripes);
		do_div(calc_size, stripe_len);
		calc_size *= stripe_len;
	}
	/* we don't want tiny stripes */
	calc_size = max_t(u64, min_stripe_size, calc_size);

	do_div(calc_size, stripe_len);
	calc_size *= stripe_len;

	cur = fs_devices->alloc_list.next;
	index = 0;

	if (type & BTRFS_BLOCK_GROUP_DUP)
		min_free = calc_size * 2;
	else
		min_free = calc_size;

	/*
	 * we add 1MB because we never use the first 1MB of the device, unless
	 * we've looped, then we are likely allocating the maximum amount of
	 * space left already
	 */
	if (!looped)
		min_free += 1024 * 1024;
1024; 2043 2044 INIT_LIST_HEAD(&private_devs); 2045 while (index < num_stripes) { 2046 device = list_entry(cur, struct btrfs_device, dev_alloc_list); 2047 BUG_ON(!device->writeable); 2048 if (device->total_bytes > device->bytes_used) 2049 avail = device->total_bytes - device->bytes_used; 2050 else 2051 avail = 0; 2052 cur = cur->next; 2053 2054 if (device->in_fs_metadata && avail >= min_free) { 2055 ret = find_free_dev_extent(trans, device, 2056 min_free, &dev_offset); 2057 if (ret == 0) { 2058 list_move_tail(&device->dev_alloc_list, 2059 &private_devs); 2060 map->stripes[index].dev = device; 2061 map->stripes[index].physical = dev_offset; 2062 index++; 2063 if (type & BTRFS_BLOCK_GROUP_DUP) { 2064 map->stripes[index].dev = device; 2065 map->stripes[index].physical = 2066 dev_offset + calc_size; 2067 index++; 2068 } 2069 } 2070 } else if (device->in_fs_metadata && avail > max_avail) 2071 max_avail = avail; 2072 if (cur == &fs_devices->alloc_list) 2073 break; 2074 } 2075 list_splice(&private_devs, &fs_devices->alloc_list); 2076 if (index < num_stripes) { 2077 if (index >= min_stripes) { 2078 num_stripes = index; 2079 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2080 num_stripes /= sub_stripes; 2081 num_stripes *= sub_stripes; 2082 } 2083 looped = 1; 2084 goto again; 2085 } 2086 if (!looped && max_avail > 0) { 2087 looped = 1; 2088 calc_size = max_avail; 2089 goto again; 2090 } 2091 kfree(map); 2092 return -ENOSPC; 2093 } 2094 map->sector_size = extent_root->sectorsize; 2095 map->stripe_len = stripe_len; 2096 map->io_align = stripe_len; 2097 map->io_width = stripe_len; 2098 map->type = type; 2099 map->num_stripes = num_stripes; 2100 map->sub_stripes = sub_stripes; 2101 2102 *map_ret = map; 2103 *stripe_size = calc_size; 2104 *num_bytes = chunk_bytes_by_type(type, calc_size, 2105 num_stripes, sub_stripes); 2106 2107 em = alloc_extent_map(GFP_NOFS); 2108 if (!em) { 2109 kfree(map); 2110 return -ENOMEM; 2111 } 2112 em->bdev = (struct block_device *)map; 2113 em->start = start; 2114 em->len = *num_bytes; 2115 em->block_start = 0; 2116 em->block_len = em->len; 2117 2118 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 2119 spin_lock(&em_tree->lock); 2120 ret = add_extent_mapping(em_tree, em); 2121 spin_unlock(&em_tree->lock); 2122 BUG_ON(ret); 2123 free_extent_map(em); 2124 2125 ret = btrfs_make_block_group(trans, extent_root, 0, type, 2126 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2127 start, *num_bytes); 2128 BUG_ON(ret); 2129 2130 index = 0; 2131 while (index < map->num_stripes) { 2132 device = map->stripes[index].dev; 2133 dev_offset = map->stripes[index].physical; 2134 2135 ret = btrfs_alloc_dev_extent(trans, device, 2136 info->chunk_root->root_key.objectid, 2137 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2138 start, dev_offset, calc_size); 2139 BUG_ON(ret); 2140 index++; 2141 } 2142 2143 return 0; 2144 } 2145 2146 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2147 struct btrfs_root *extent_root, 2148 struct map_lookup *map, u64 chunk_offset, 2149 u64 chunk_size, u64 stripe_size) 2150 { 2151 u64 dev_offset; 2152 struct btrfs_key key; 2153 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 2154 struct btrfs_device *device; 2155 struct btrfs_chunk *chunk; 2156 struct btrfs_stripe *stripe; 2157 size_t item_size = btrfs_chunk_item_size(map->num_stripes); 2158 int index = 0; 2159 int ret; 2160 2161 chunk = kzalloc(item_size, GFP_NOFS); 2162 if (!chunk) 2163 return -ENOMEM; 2164 2165 index = 0; 2166 while (index < map->num_stripes) { 2167 device = map->stripes[index].dev; 2168 
device->bytes_used += stripe_size;
2169 ret = btrfs_update_device(trans, device);
2170 BUG_ON(ret);
2171 index++;
2172 }
2173
2174 index = 0;
2175 stripe = &chunk->stripe;
2176 while (index < map->num_stripes) {
2177 device = map->stripes[index].dev;
2178 dev_offset = map->stripes[index].physical;
2179
2180 btrfs_set_stack_stripe_devid(stripe, device->devid);
2181 btrfs_set_stack_stripe_offset(stripe, dev_offset);
2182 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2183 stripe++;
2184 index++;
2185 }
2186
2187 btrfs_set_stack_chunk_length(chunk, chunk_size);
2188 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2189 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
2190 btrfs_set_stack_chunk_type(chunk, map->type);
2191 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
2192 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
2193 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
2194 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
2195 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
2196
2197 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2198 key.type = BTRFS_CHUNK_ITEM_KEY;
2199 key.offset = chunk_offset;
2200
2201 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
2202 BUG_ON(ret);
2203
2204 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2205 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
2206 item_size);
2207 BUG_ON(ret);
2208 }
2209 kfree(chunk);
2210 return 0;
2211 }
2212
2213 /*
2214 * Chunk allocation falls into two parts. The first part does the work
2215 * that makes the newly allocated chunk usable, but does not do anything
2216 * that modifies the chunk tree. The second part does the work that
2217 * requires modifying the chunk tree. This split matters for the
2218 * bootstrap process of adding storage to a seed btrfs.
2219 */
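/*
 * [Editor's note: illustrative sketch, not part of the original file.]
 * In the common, non-seed case both phases run back to back inside
 * btrfs_alloc_chunk() below. A hypothetical caller that wants more data
 * space inside an open transaction only has to pick an allocation profile
 * and make the one call. The fields data_alloc_profile and
 * avail_data_alloc_bits are assumed here by analogy with the metadata
 * fields used in init_first_rw_device() further down.
 */
static int example_grow_data_space(struct btrfs_trans_handle *trans,
				   struct btrfs_root *extent_root)
{
	struct btrfs_fs_info *fs_info = extent_root->fs_info;
	u64 profile;

	/* assumed data_* counterparts of the metadata fields used below */
	profile = BTRFS_BLOCK_GROUP_DATA |
		  (fs_info->data_alloc_profile & fs_info->avail_data_alloc_bits);
	profile = btrfs_reduce_alloc_profile(extent_root, profile);

	/*
	 * phase one (__btrfs_alloc_chunk) reserves device extents and sets
	 * up the logical->physical mapping; phase two (__finish_chunk_alloc)
	 * inserts the chunk item into the chunk tree.
	 */
	return btrfs_alloc_chunk(trans, extent_root, profile);
}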
2220 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2221 struct btrfs_root *extent_root, u64 type)
2222 {
2223 u64 chunk_offset;
2224 u64 chunk_size;
2225 u64 stripe_size;
2226 struct map_lookup *map;
2227 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2228 int ret;
2229
2230 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2231 &chunk_offset);
2232 if (ret)
2233 return ret;
2234
2235 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2236 &stripe_size, chunk_offset, type);
2237 if (ret)
2238 return ret;
2239
2240 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2241 chunk_size, stripe_size);
2242 BUG_ON(ret);
2243 return 0;
2244 }
2245
2246 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2247 struct btrfs_root *root,
2248 struct btrfs_device *device)
2249 {
2250 u64 chunk_offset;
2251 u64 sys_chunk_offset;
2252 u64 chunk_size;
2253 u64 sys_chunk_size;
2254 u64 stripe_size;
2255 u64 sys_stripe_size;
2256 u64 alloc_profile;
2257 struct map_lookup *map;
2258 struct map_lookup *sys_map;
2259 struct btrfs_fs_info *fs_info = root->fs_info;
2260 struct btrfs_root *extent_root = fs_info->extent_root;
2261 int ret;
2262
2263 ret = find_next_chunk(fs_info->chunk_root,
2264 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2265 BUG_ON(ret);
2266
2267 alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2268 (fs_info->metadata_alloc_profile &
2269 fs_info->avail_metadata_alloc_bits);
2270 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2271
2272 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2273 &stripe_size, chunk_offset, alloc_profile);
2274 BUG_ON(ret);
2275
2276 sys_chunk_offset = chunk_offset + chunk_size;
2277
2278 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2279 (fs_info->system_alloc_profile &
2280 fs_info->avail_system_alloc_bits);
2281 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2282
2283 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
2284 &sys_chunk_size, &sys_stripe_size,
2285 sys_chunk_offset, alloc_profile);
2286 BUG_ON(ret);
2287
2288 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
2289 BUG_ON(ret);
2290
2291 /*
2292 * Modifying the chunk tree requires allocating new blocks from both
2293 * the system block group and the metadata block group, so operations
2294 * that modify the chunk tree can only be done after both block groups
2295 * have been created.
2296 */ 2297 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 2298 chunk_size, stripe_size); 2299 BUG_ON(ret); 2300 2301 ret = __finish_chunk_alloc(trans, extent_root, sys_map, 2302 sys_chunk_offset, sys_chunk_size, 2303 sys_stripe_size); 2304 BUG_ON(ret); 2305 return 0; 2306 } 2307 2308 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 2309 { 2310 struct extent_map *em; 2311 struct map_lookup *map; 2312 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 2313 int readonly = 0; 2314 int i; 2315 2316 spin_lock(&map_tree->map_tree.lock); 2317 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2318 spin_unlock(&map_tree->map_tree.lock); 2319 if (!em) 2320 return 1; 2321 2322 map = (struct map_lookup *)em->bdev; 2323 for (i = 0; i < map->num_stripes; i++) { 2324 if (!map->stripes[i].dev->writeable) { 2325 readonly = 1; 2326 break; 2327 } 2328 } 2329 free_extent_map(em); 2330 return readonly; 2331 } 2332 2333 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 2334 { 2335 extent_map_tree_init(&tree->map_tree, GFP_NOFS); 2336 } 2337 2338 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 2339 { 2340 struct extent_map *em; 2341 2342 while (1) { 2343 spin_lock(&tree->map_tree.lock); 2344 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 2345 if (em) 2346 remove_extent_mapping(&tree->map_tree, em); 2347 spin_unlock(&tree->map_tree.lock); 2348 if (!em) 2349 break; 2350 kfree(em->bdev); 2351 /* once for us */ 2352 free_extent_map(em); 2353 /* once for the tree */ 2354 free_extent_map(em); 2355 } 2356 } 2357 2358 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 2359 { 2360 struct extent_map *em; 2361 struct map_lookup *map; 2362 struct extent_map_tree *em_tree = &map_tree->map_tree; 2363 int ret; 2364 2365 spin_lock(&em_tree->lock); 2366 em = lookup_extent_mapping(em_tree, logical, len); 2367 spin_unlock(&em_tree->lock); 2368 BUG_ON(!em); 2369 2370 BUG_ON(em->start > logical || em->start + em->len < logical); 2371 map = (struct map_lookup *)em->bdev; 2372 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 2373 ret = map->num_stripes; 2374 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 2375 ret = map->sub_stripes; 2376 else 2377 ret = 1; 2378 free_extent_map(em); 2379 return ret; 2380 } 2381 2382 static int find_live_mirror(struct map_lookup *map, int first, int num, 2383 int optimal) 2384 { 2385 int i; 2386 if (map->stripes[optimal].dev->bdev) 2387 return optimal; 2388 for (i = first; i < first + num; i++) { 2389 if (map->stripes[i].dev->bdev) 2390 return i; 2391 } 2392 /* we couldn't find one that doesn't fail. 
Just return something 2393 * and the io error handling code will clean up eventually 2394 */ 2395 return optimal; 2396 } 2397 2398 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2399 u64 logical, u64 *length, 2400 struct btrfs_multi_bio **multi_ret, 2401 int mirror_num, struct page *unplug_page) 2402 { 2403 struct extent_map *em; 2404 struct map_lookup *map; 2405 struct extent_map_tree *em_tree = &map_tree->map_tree; 2406 u64 offset; 2407 u64 stripe_offset; 2408 u64 stripe_nr; 2409 int stripes_allocated = 8; 2410 int stripes_required = 1; 2411 int stripe_index; 2412 int i; 2413 int num_stripes; 2414 int max_errors = 0; 2415 struct btrfs_multi_bio *multi = NULL; 2416 2417 if (multi_ret && !(rw & (1 << BIO_RW))) 2418 stripes_allocated = 1; 2419 again: 2420 if (multi_ret) { 2421 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), 2422 GFP_NOFS); 2423 if (!multi) 2424 return -ENOMEM; 2425 2426 atomic_set(&multi->error, 0); 2427 } 2428 2429 spin_lock(&em_tree->lock); 2430 em = lookup_extent_mapping(em_tree, logical, *length); 2431 spin_unlock(&em_tree->lock); 2432 2433 if (!em && unplug_page) 2434 return 0; 2435 2436 if (!em) { 2437 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2438 (unsigned long long)logical, 2439 (unsigned long long)*length); 2440 BUG(); 2441 } 2442 2443 BUG_ON(em->start > logical || em->start + em->len < logical); 2444 map = (struct map_lookup *)em->bdev; 2445 offset = logical - em->start; 2446 2447 if (mirror_num > map->num_stripes) 2448 mirror_num = 0; 2449 2450 /* if our multi bio struct is too small, back off and try again */ 2451 if (rw & (1 << BIO_RW)) { 2452 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2453 BTRFS_BLOCK_GROUP_DUP)) { 2454 stripes_required = map->num_stripes; 2455 max_errors = 1; 2456 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2457 stripes_required = map->sub_stripes; 2458 max_errors = 1; 2459 } 2460 } 2461 if (multi_ret && rw == WRITE && 2462 stripes_allocated < stripes_required) { 2463 stripes_allocated = map->num_stripes; 2464 free_extent_map(em); 2465 kfree(multi); 2466 goto again; 2467 } 2468 stripe_nr = offset; 2469 /* 2470 * stripe_nr counts the total number of stripes we have to stride 2471 * to get to this block 2472 */ 2473 do_div(stripe_nr, map->stripe_len); 2474 2475 stripe_offset = stripe_nr * map->stripe_len; 2476 BUG_ON(offset < stripe_offset); 2477 2478 /* stripe_offset is the offset of this block in its stripe*/ 2479 stripe_offset = offset - stripe_offset; 2480 2481 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 2482 BTRFS_BLOCK_GROUP_RAID10 | 2483 BTRFS_BLOCK_GROUP_DUP)) { 2484 /* we limit the length of each bio to what fits in a stripe */ 2485 *length = min_t(u64, em->len - offset, 2486 map->stripe_len - stripe_offset); 2487 } else { 2488 *length = em->len - offset; 2489 } 2490 2491 if (!multi_ret && !unplug_page) 2492 goto out; 2493 2494 num_stripes = 1; 2495 stripe_index = 0; 2496 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 2497 if (unplug_page || (rw & (1 << BIO_RW))) 2498 num_stripes = map->num_stripes; 2499 else if (mirror_num) 2500 stripe_index = mirror_num - 1; 2501 else { 2502 stripe_index = find_live_mirror(map, 0, 2503 map->num_stripes, 2504 current->pid % map->num_stripes); 2505 } 2506 2507 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 2508 if (rw & (1 << BIO_RW)) 2509 num_stripes = map->num_stripes; 2510 else if (mirror_num) 2511 stripe_index = mirror_num - 1; 2512 2513 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2514 int factor = 
map->num_stripes / map->sub_stripes; 2515 2516 stripe_index = do_div(stripe_nr, factor); 2517 stripe_index *= map->sub_stripes; 2518 2519 if (unplug_page || (rw & (1 << BIO_RW))) 2520 num_stripes = map->sub_stripes; 2521 else if (mirror_num) 2522 stripe_index += mirror_num - 1; 2523 else { 2524 stripe_index = find_live_mirror(map, stripe_index, 2525 map->sub_stripes, stripe_index + 2526 current->pid % map->sub_stripes); 2527 } 2528 } else { 2529 /* 2530 * after this do_div call, stripe_nr is the number of stripes 2531 * on this device we have to walk to find the data, and 2532 * stripe_index is the number of our device in the stripe array 2533 */ 2534 stripe_index = do_div(stripe_nr, map->num_stripes); 2535 } 2536 BUG_ON(stripe_index >= map->num_stripes); 2537 2538 for (i = 0; i < num_stripes; i++) { 2539 if (unplug_page) { 2540 struct btrfs_device *device; 2541 struct backing_dev_info *bdi; 2542 2543 device = map->stripes[stripe_index].dev; 2544 if (device->bdev) { 2545 bdi = blk_get_backing_dev_info(device->bdev); 2546 if (bdi->unplug_io_fn) 2547 bdi->unplug_io_fn(bdi, unplug_page); 2548 } 2549 } else { 2550 multi->stripes[i].physical = 2551 map->stripes[stripe_index].physical + 2552 stripe_offset + stripe_nr * map->stripe_len; 2553 multi->stripes[i].dev = map->stripes[stripe_index].dev; 2554 } 2555 stripe_index++; 2556 } 2557 if (multi_ret) { 2558 *multi_ret = multi; 2559 multi->num_stripes = num_stripes; 2560 multi->max_errors = max_errors; 2561 } 2562 out: 2563 free_extent_map(em); 2564 return 0; 2565 } 2566 2567 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2568 u64 logical, u64 *length, 2569 struct btrfs_multi_bio **multi_ret, int mirror_num) 2570 { 2571 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 2572 mirror_num, NULL); 2573 } 2574 2575 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 2576 u64 chunk_start, u64 physical, u64 devid, 2577 u64 **logical, int *naddrs, int *stripe_len) 2578 { 2579 struct extent_map_tree *em_tree = &map_tree->map_tree; 2580 struct extent_map *em; 2581 struct map_lookup *map; 2582 u64 *buf; 2583 u64 bytenr; 2584 u64 length; 2585 u64 stripe_nr; 2586 int i, j, nr = 0; 2587 2588 spin_lock(&em_tree->lock); 2589 em = lookup_extent_mapping(em_tree, chunk_start, 1); 2590 spin_unlock(&em_tree->lock); 2591 2592 BUG_ON(!em || em->start != chunk_start); 2593 map = (struct map_lookup *)em->bdev; 2594 2595 length = em->len; 2596 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 2597 do_div(length, map->num_stripes / map->sub_stripes); 2598 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 2599 do_div(length, map->num_stripes); 2600 2601 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 2602 BUG_ON(!buf); 2603 2604 for (i = 0; i < map->num_stripes; i++) { 2605 if (devid && map->stripes[i].dev->devid != devid) 2606 continue; 2607 if (map->stripes[i].physical > physical || 2608 map->stripes[i].physical + length <= physical) 2609 continue; 2610 2611 stripe_nr = physical - map->stripes[i].physical; 2612 do_div(stripe_nr, map->stripe_len); 2613 2614 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2615 stripe_nr = stripe_nr * map->num_stripes + i; 2616 do_div(stripe_nr, map->sub_stripes); 2617 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 2618 stripe_nr = stripe_nr * map->num_stripes + i; 2619 } 2620 bytenr = chunk_start + stripe_nr * map->stripe_len; 2621 WARN_ON(nr >= map->num_stripes); 2622 for (j = 0; j < nr; j++) { 2623 if (buf[j] == bytenr) 2624 break; 2625 } 2626 if (j == nr) { 2627 WARN_ON(nr >= map->num_stripes); 2628 
buf[nr++] = bytenr; 2629 } 2630 } 2631 2632 for (i = 0; i > nr; i++) { 2633 struct btrfs_multi_bio *multi; 2634 struct btrfs_bio_stripe *stripe; 2635 int ret; 2636 2637 length = 1; 2638 ret = btrfs_map_block(map_tree, WRITE, buf[i], 2639 &length, &multi, 0); 2640 BUG_ON(ret); 2641 2642 stripe = multi->stripes; 2643 for (j = 0; j < multi->num_stripes; j++) { 2644 if (stripe->physical >= physical && 2645 physical < stripe->physical + length) 2646 break; 2647 } 2648 BUG_ON(j >= multi->num_stripes); 2649 kfree(multi); 2650 } 2651 2652 *logical = buf; 2653 *naddrs = nr; 2654 *stripe_len = map->stripe_len; 2655 2656 free_extent_map(em); 2657 return 0; 2658 } 2659 2660 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, 2661 u64 logical, struct page *page) 2662 { 2663 u64 length = PAGE_CACHE_SIZE; 2664 return __btrfs_map_block(map_tree, READ, logical, &length, 2665 NULL, 0, page); 2666 } 2667 2668 static void end_bio_multi_stripe(struct bio *bio, int err) 2669 { 2670 struct btrfs_multi_bio *multi = bio->bi_private; 2671 int is_orig_bio = 0; 2672 2673 if (err) 2674 atomic_inc(&multi->error); 2675 2676 if (bio == multi->orig_bio) 2677 is_orig_bio = 1; 2678 2679 if (atomic_dec_and_test(&multi->stripes_pending)) { 2680 if (!is_orig_bio) { 2681 bio_put(bio); 2682 bio = multi->orig_bio; 2683 } 2684 bio->bi_private = multi->private; 2685 bio->bi_end_io = multi->end_io; 2686 /* only send an error to the higher layers if it is 2687 * beyond the tolerance of the multi-bio 2688 */ 2689 if (atomic_read(&multi->error) > multi->max_errors) { 2690 err = -EIO; 2691 } else if (err) { 2692 /* 2693 * this bio is actually up to date, we didn't 2694 * go over the max number of errors 2695 */ 2696 set_bit(BIO_UPTODATE, &bio->bi_flags); 2697 err = 0; 2698 } 2699 kfree(multi); 2700 2701 bio_endio(bio, err); 2702 } else if (!is_orig_bio) { 2703 bio_put(bio); 2704 } 2705 } 2706 2707 struct async_sched { 2708 struct bio *bio; 2709 int rw; 2710 struct btrfs_fs_info *info; 2711 struct btrfs_work work; 2712 }; 2713 2714 /* 2715 * see run_scheduled_bios for a description of why bios are collected for 2716 * async submit. 2717 * 2718 * This will add one bio to the pending list for a device and make sure 2719 * the work struct is scheduled. 2720 */ 2721 static noinline int schedule_bio(struct btrfs_root *root, 2722 struct btrfs_device *device, 2723 int rw, struct bio *bio) 2724 { 2725 int should_queue = 1; 2726 2727 /* don't bother with additional async steps for reads, right now */ 2728 if (!(rw & (1 << BIO_RW))) { 2729 bio_get(bio); 2730 submit_bio(rw, bio); 2731 bio_put(bio); 2732 return 0; 2733 } 2734 2735 /* 2736 * nr_async_bios allows us to reliably return congestion to the 2737 * higher layers. 
Otherwise, the async bio makes it appear we have 2738 * made progress against dirty pages when we've really just put it 2739 * on a queue for later 2740 */ 2741 atomic_inc(&root->fs_info->nr_async_bios); 2742 WARN_ON(bio->bi_next); 2743 bio->bi_next = NULL; 2744 bio->bi_rw |= rw; 2745 2746 spin_lock(&device->io_lock); 2747 2748 if (device->pending_bio_tail) 2749 device->pending_bio_tail->bi_next = bio; 2750 2751 device->pending_bio_tail = bio; 2752 if (!device->pending_bios) 2753 device->pending_bios = bio; 2754 if (device->running_pending) 2755 should_queue = 0; 2756 2757 spin_unlock(&device->io_lock); 2758 2759 if (should_queue) 2760 btrfs_queue_worker(&root->fs_info->submit_workers, 2761 &device->work); 2762 return 0; 2763 } 2764 2765 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 2766 int mirror_num, int async_submit) 2767 { 2768 struct btrfs_mapping_tree *map_tree; 2769 struct btrfs_device *dev; 2770 struct bio *first_bio = bio; 2771 u64 logical = (u64)bio->bi_sector << 9; 2772 u64 length = 0; 2773 u64 map_length; 2774 struct btrfs_multi_bio *multi = NULL; 2775 int ret; 2776 int dev_nr = 0; 2777 int total_devs = 1; 2778 2779 length = bio->bi_size; 2780 map_tree = &root->fs_info->mapping_tree; 2781 map_length = length; 2782 2783 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, 2784 mirror_num); 2785 BUG_ON(ret); 2786 2787 total_devs = multi->num_stripes; 2788 if (map_length < length) { 2789 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 2790 "len %llu\n", (unsigned long long)logical, 2791 (unsigned long long)length, 2792 (unsigned long long)map_length); 2793 BUG(); 2794 } 2795 multi->end_io = first_bio->bi_end_io; 2796 multi->private = first_bio->bi_private; 2797 multi->orig_bio = first_bio; 2798 atomic_set(&multi->stripes_pending, multi->num_stripes); 2799 2800 while (dev_nr < total_devs) { 2801 if (total_devs > 1) { 2802 if (dev_nr < total_devs - 1) { 2803 bio = bio_clone(first_bio, GFP_NOFS); 2804 BUG_ON(!bio); 2805 } else { 2806 bio = first_bio; 2807 } 2808 bio->bi_private = multi; 2809 bio->bi_end_io = end_bio_multi_stripe; 2810 } 2811 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 2812 dev = multi->stripes[dev_nr].dev; 2813 BUG_ON(rw == WRITE && !dev->writeable); 2814 if (dev && dev->bdev) { 2815 bio->bi_bdev = dev->bdev; 2816 if (async_submit) 2817 schedule_bio(root, dev, rw, bio); 2818 else 2819 submit_bio(rw, bio); 2820 } else { 2821 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; 2822 bio->bi_sector = logical >> 9; 2823 bio_endio(bio, -EIO); 2824 } 2825 dev_nr++; 2826 } 2827 if (total_devs == 1) 2828 kfree(multi); 2829 return 0; 2830 } 2831 2832 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 2833 u8 *uuid, u8 *fsid) 2834 { 2835 struct btrfs_device *device; 2836 struct btrfs_fs_devices *cur_devices; 2837 2838 cur_devices = root->fs_info->fs_devices; 2839 while (cur_devices) { 2840 if (!fsid || 2841 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 2842 device = __find_device(&cur_devices->devices, 2843 devid, uuid); 2844 if (device) 2845 return device; 2846 } 2847 cur_devices = cur_devices->seed; 2848 } 2849 return NULL; 2850 } 2851 2852 static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 2853 u64 devid, u8 *dev_uuid) 2854 { 2855 struct btrfs_device *device; 2856 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 2857 2858 device = kzalloc(sizeof(*device), GFP_NOFS); 2859 if (!device) 2860 return NULL; 2861 list_add(&device->dev_list, 2862 
&fs_devices->devices); 2863 device->barriers = 1; 2864 device->dev_root = root->fs_info->dev_root; 2865 device->devid = devid; 2866 device->work.func = pending_bios_fn; 2867 device->fs_devices = fs_devices; 2868 fs_devices->num_devices++; 2869 spin_lock_init(&device->io_lock); 2870 INIT_LIST_HEAD(&device->dev_alloc_list); 2871 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 2872 return device; 2873 } 2874 2875 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, 2876 struct extent_buffer *leaf, 2877 struct btrfs_chunk *chunk) 2878 { 2879 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 2880 struct map_lookup *map; 2881 struct extent_map *em; 2882 u64 logical; 2883 u64 length; 2884 u64 devid; 2885 u8 uuid[BTRFS_UUID_SIZE]; 2886 int num_stripes; 2887 int ret; 2888 int i; 2889 2890 logical = key->offset; 2891 length = btrfs_chunk_length(leaf, chunk); 2892 2893 spin_lock(&map_tree->map_tree.lock); 2894 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 2895 spin_unlock(&map_tree->map_tree.lock); 2896 2897 /* already mapped? */ 2898 if (em && em->start <= logical && em->start + em->len > logical) { 2899 free_extent_map(em); 2900 return 0; 2901 } else if (em) { 2902 free_extent_map(em); 2903 } 2904 2905 em = alloc_extent_map(GFP_NOFS); 2906 if (!em) 2907 return -ENOMEM; 2908 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 2909 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 2910 if (!map) { 2911 free_extent_map(em); 2912 return -ENOMEM; 2913 } 2914 2915 em->bdev = (struct block_device *)map; 2916 em->start = logical; 2917 em->len = length; 2918 em->block_start = 0; 2919 em->block_len = em->len; 2920 2921 map->num_stripes = num_stripes; 2922 map->io_width = btrfs_chunk_io_width(leaf, chunk); 2923 map->io_align = btrfs_chunk_io_align(leaf, chunk); 2924 map->sector_size = btrfs_chunk_sector_size(leaf, chunk); 2925 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 2926 map->type = btrfs_chunk_type(leaf, chunk); 2927 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 2928 for (i = 0; i < num_stripes; i++) { 2929 map->stripes[i].physical = 2930 btrfs_stripe_offset_nr(leaf, chunk, i); 2931 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 2932 read_extent_buffer(leaf, uuid, (unsigned long) 2933 btrfs_stripe_dev_uuid_nr(chunk, i), 2934 BTRFS_UUID_SIZE); 2935 map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 2936 NULL); 2937 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 2938 kfree(map); 2939 free_extent_map(em); 2940 return -EIO; 2941 } 2942 if (!map->stripes[i].dev) { 2943 map->stripes[i].dev = 2944 add_missing_dev(root, devid, uuid); 2945 if (!map->stripes[i].dev) { 2946 kfree(map); 2947 free_extent_map(em); 2948 return -EIO; 2949 } 2950 } 2951 map->stripes[i].dev->in_fs_metadata = 1; 2952 } 2953 2954 spin_lock(&map_tree->map_tree.lock); 2955 ret = add_extent_mapping(&map_tree->map_tree, em); 2956 spin_unlock(&map_tree->map_tree.lock); 2957 BUG_ON(ret); 2958 free_extent_map(em); 2959 2960 return 0; 2961 } 2962 2963 static int fill_device_from_item(struct extent_buffer *leaf, 2964 struct btrfs_dev_item *dev_item, 2965 struct btrfs_device *device) 2966 { 2967 unsigned long ptr; 2968 2969 device->devid = btrfs_device_id(leaf, dev_item); 2970 device->total_bytes = btrfs_device_total_bytes(leaf, dev_item); 2971 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 2972 device->type = btrfs_device_type(leaf, dev_item); 2973 device->io_align = btrfs_device_io_align(leaf, dev_item); 2974 device->io_width = 
btrfs_device_io_width(leaf, dev_item); 2975 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 2976 2977 ptr = (unsigned long)btrfs_device_uuid(dev_item); 2978 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 2979 2980 return 0; 2981 } 2982 2983 static int open_seed_devices(struct btrfs_root *root, u8 *fsid) 2984 { 2985 struct btrfs_fs_devices *fs_devices; 2986 int ret; 2987 2988 mutex_lock(&uuid_mutex); 2989 2990 fs_devices = root->fs_info->fs_devices->seed; 2991 while (fs_devices) { 2992 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 2993 ret = 0; 2994 goto out; 2995 } 2996 fs_devices = fs_devices->seed; 2997 } 2998 2999 fs_devices = find_fsid(fsid); 3000 if (!fs_devices) { 3001 ret = -ENOENT; 3002 goto out; 3003 } 3004 3005 fs_devices = clone_fs_devices(fs_devices); 3006 if (IS_ERR(fs_devices)) { 3007 ret = PTR_ERR(fs_devices); 3008 goto out; 3009 } 3010 3011 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 3012 root->fs_info->bdev_holder); 3013 if (ret) 3014 goto out; 3015 3016 if (!fs_devices->seeding) { 3017 __btrfs_close_devices(fs_devices); 3018 free_fs_devices(fs_devices); 3019 ret = -EINVAL; 3020 goto out; 3021 } 3022 3023 fs_devices->seed = root->fs_info->fs_devices->seed; 3024 root->fs_info->fs_devices->seed = fs_devices; 3025 out: 3026 mutex_unlock(&uuid_mutex); 3027 return ret; 3028 } 3029 3030 static int read_one_dev(struct btrfs_root *root, 3031 struct extent_buffer *leaf, 3032 struct btrfs_dev_item *dev_item) 3033 { 3034 struct btrfs_device *device; 3035 u64 devid; 3036 int ret; 3037 u8 fs_uuid[BTRFS_UUID_SIZE]; 3038 u8 dev_uuid[BTRFS_UUID_SIZE]; 3039 3040 devid = btrfs_device_id(leaf, dev_item); 3041 read_extent_buffer(leaf, dev_uuid, 3042 (unsigned long)btrfs_device_uuid(dev_item), 3043 BTRFS_UUID_SIZE); 3044 read_extent_buffer(leaf, fs_uuid, 3045 (unsigned long)btrfs_device_fsid(dev_item), 3046 BTRFS_UUID_SIZE); 3047 3048 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 3049 ret = open_seed_devices(root, fs_uuid); 3050 if (ret && !btrfs_test_opt(root, DEGRADED)) 3051 return ret; 3052 } 3053 3054 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 3055 if (!device || !device->bdev) { 3056 if (!btrfs_test_opt(root, DEGRADED)) 3057 return -EIO; 3058 3059 if (!device) { 3060 printk(KERN_WARNING "warning devid %llu missing\n", 3061 (unsigned long long)devid); 3062 device = add_missing_dev(root, devid, dev_uuid); 3063 if (!device) 3064 return -ENOMEM; 3065 } 3066 } 3067 3068 if (device->fs_devices != root->fs_info->fs_devices) { 3069 BUG_ON(device->writeable); 3070 if (device->generation != 3071 btrfs_device_generation(leaf, dev_item)) 3072 return -EINVAL; 3073 } 3074 3075 fill_device_from_item(leaf, dev_item, device); 3076 device->dev_root = root->fs_info->dev_root; 3077 device->in_fs_metadata = 1; 3078 if (device->writeable) 3079 device->fs_devices->total_rw_bytes += device->total_bytes; 3080 ret = 0; 3081 return ret; 3082 } 3083 3084 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf) 3085 { 3086 struct btrfs_dev_item *dev_item; 3087 3088 dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block, 3089 dev_item); 3090 return read_one_dev(root, buf, dev_item); 3091 } 3092 3093 int btrfs_read_sys_array(struct btrfs_root *root) 3094 { 3095 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 3096 struct extent_buffer *sb; 3097 struct btrfs_disk_key *disk_key; 3098 struct btrfs_chunk *chunk; 3099 u8 *ptr; 3100 unsigned long sb_ptr; 3101 int ret = 0; 3102 u32 
num_stripes; 3103 u32 array_size; 3104 u32 len = 0; 3105 u32 cur; 3106 struct btrfs_key key; 3107 3108 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, 3109 BTRFS_SUPER_INFO_SIZE); 3110 if (!sb) 3111 return -ENOMEM; 3112 btrfs_set_buffer_uptodate(sb); 3113 btrfs_set_buffer_lockdep_class(sb, 0); 3114 3115 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 3116 array_size = btrfs_super_sys_array_size(super_copy); 3117 3118 ptr = super_copy->sys_chunk_array; 3119 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); 3120 cur = 0; 3121 3122 while (cur < array_size) { 3123 disk_key = (struct btrfs_disk_key *)ptr; 3124 btrfs_disk_key_to_cpu(&key, disk_key); 3125 3126 len = sizeof(*disk_key); ptr += len; 3127 sb_ptr += len; 3128 cur += len; 3129 3130 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 3131 chunk = (struct btrfs_chunk *)sb_ptr; 3132 ret = read_one_chunk(root, &key, sb, chunk); 3133 if (ret) 3134 break; 3135 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 3136 len = btrfs_chunk_item_size(num_stripes); 3137 } else { 3138 ret = -EIO; 3139 break; 3140 } 3141 ptr += len; 3142 sb_ptr += len; 3143 cur += len; 3144 } 3145 free_extent_buffer(sb); 3146 return ret; 3147 } 3148 3149 int btrfs_read_chunk_tree(struct btrfs_root *root) 3150 { 3151 struct btrfs_path *path; 3152 struct extent_buffer *leaf; 3153 struct btrfs_key key; 3154 struct btrfs_key found_key; 3155 int ret; 3156 int slot; 3157 3158 root = root->fs_info->chunk_root; 3159 3160 path = btrfs_alloc_path(); 3161 if (!path) 3162 return -ENOMEM; 3163 3164 /* first we search for all of the device items, and then we 3165 * read in all of the chunk items. This way we can create chunk 3166 * mappings that reference all of the devices that are afound 3167 */ 3168 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 3169 key.offset = 0; 3170 key.type = 0; 3171 again: 3172 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3173 while (1) { 3174 leaf = path->nodes[0]; 3175 slot = path->slots[0]; 3176 if (slot >= btrfs_header_nritems(leaf)) { 3177 ret = btrfs_next_leaf(root, path); 3178 if (ret == 0) 3179 continue; 3180 if (ret < 0) 3181 goto error; 3182 break; 3183 } 3184 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3185 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3186 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) 3187 break; 3188 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 3189 struct btrfs_dev_item *dev_item; 3190 dev_item = btrfs_item_ptr(leaf, slot, 3191 struct btrfs_dev_item); 3192 ret = read_one_dev(root, leaf, dev_item); 3193 if (ret) 3194 goto error; 3195 } 3196 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 3197 struct btrfs_chunk *chunk; 3198 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3199 ret = read_one_chunk(root, &found_key, leaf, chunk); 3200 if (ret) 3201 goto error; 3202 } 3203 path->slots[0]++; 3204 } 3205 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3206 key.objectid = 0; 3207 btrfs_release_path(root, path); 3208 goto again; 3209 } 3210 ret = 0; 3211 error: 3212 btrfs_free_path(path); 3213 return ret; 3214 } 3215
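/*
 * [Editor's note: worked example, not part of the original file.]
 * Concrete numbers for the sizing logic in __btrfs_alloc_chunk(), assuming
 * a RAID0 data chunk on two writeable devices with 100 GiB of total
 * writeable space:
 *
 *   calc_size starts at 1 GiB; DATA makes max_chunk_size = 10 * calc_size
 *   max_chunk_size = min(10% of 100 GiB, 10 GiB) = 10 GiB
 *   num_stripes = rw_devices = 2
 *   calc_size * num_stripes = 2 GiB <= max_chunk_size, so nothing is clamped
 *   calc_size is already a multiple of the 64 KiB stripe_len
 *
 * The result is *stripe_size = 1 GiB reserved on each device and
 * *num_bytes = chunk_bytes_by_type() = calc_size * num_stripes = 2 GiB of
 * logical chunk space. Had the 10% cap been smaller than
 * calc_size * num_stripes, calc_size would have been divided back down and
 * rounded to a stripe_len multiple before any device extents were reserved.
 */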
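/*
 * [Editor's note: illustrative sketch, not part of the original file.]
 * The do_div() calls in __btrfs_map_block() divide in place and return the
 * remainder, which makes the striping arithmetic easy to misread. The
 * hypothetical helper below redoes the plain RAID0 case with ordinary
 * division so the path from a logical address to a device offset is
 * explicit. Plain 64-bit division is used only for clarity; real kernel
 * code must keep using do_div() so the math also works on 32-bit hosts.
 */
static u64 example_raid0_physical(struct map_lookup *map, u64 chunk_start,
				  u64 logical, int *stripe_index_ret)
{
	u64 offset = logical - chunk_start;	/* offset inside the chunk */
	u64 stripe_nr = offset / map->stripe_len; /* stripes strided so far */
	u64 stripe_offset = offset - stripe_nr * map->stripe_len;
	int stripe_index = stripe_nr % map->num_stripes; /* which device */

	/* complete passes over all the devices before this stripe */
	stripe_nr = stripe_nr / map->num_stripes;

	*stripe_index_ret = stripe_index;
	return map->stripes[stripe_index].physical +
	       stripe_offset + stripe_nr * map->stripe_len;
}
/*
 * RAID10 does the same arithmetic per mirror group: with
 * factor = num_stripes / sub_stripes, stripe_index becomes
 * (stripe_nr % factor) * sub_stripes and the copies of the block live on
 * stripes stripe_index .. stripe_index + sub_stripes - 1.
 */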
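/*
 * [Editor's note: illustrative sketch, not part of the original file.]
 * btrfs_read_sys_array() above walks super_copy->sys_chunk_array, which is
 * a packed byte array of (disk key, chunk item) pairs:
 *
 *   [btrfs_disk_key][btrfs_chunk + stripes][btrfs_disk_key][...]
 *
 * Each iteration advances the cursor by sizeof(struct btrfs_disk_key) and
 * then by btrfs_chunk_item_size(num_stripes). The hypothetical helper
 * below just computes how many bytes one entry consumes, e.g. to check
 * that another SYSTEM chunk still fits before it is appended by
 * btrfs_add_system_chunk().
 */
static unsigned long example_sys_array_entry_size(int num_stripes)
{
	/* struct btrfs_chunk already embeds the first stripe */
	return sizeof(struct btrfs_disk_key) +
	       btrfs_chunk_item_size(num_stripes);
}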