1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 #include <linux/sched.h> 19 #include <linux/bio.h> 20 #include <linux/buffer_head.h> 21 #include <linux/blkdev.h> 22 #include <linux/random.h> 23 #include <linux/iocontext.h> 24 #include <asm/div64.h> 25 #include "compat.h" 26 #include "ctree.h" 27 #include "extent_map.h" 28 #include "disk-io.h" 29 #include "transaction.h" 30 #include "print-tree.h" 31 #include "volumes.h" 32 #include "async-thread.h" 33 34 struct map_lookup { 35 u64 type; 36 int io_align; 37 int io_width; 38 int stripe_len; 39 int sector_size; 40 int num_stripes; 41 int sub_stripes; 42 struct btrfs_bio_stripe stripes[]; 43 }; 44 45 static int init_first_rw_device(struct btrfs_trans_handle *trans, 46 struct btrfs_root *root, 47 struct btrfs_device *device); 48 static int btrfs_relocate_sys_chunks(struct btrfs_root *root); 49 50 #define map_lookup_size(n) (sizeof(struct map_lookup) + \ 51 (sizeof(struct btrfs_bio_stripe) * (n))) 52 53 static DEFINE_MUTEX(uuid_mutex); 54 static LIST_HEAD(fs_uuids); 55 56 void btrfs_lock_volumes(void) 57 { 58 mutex_lock(&uuid_mutex); 59 } 60 61 void btrfs_unlock_volumes(void) 62 { 63 mutex_unlock(&uuid_mutex); 64 } 65 66 static void lock_chunks(struct btrfs_root *root) 67 { 68 mutex_lock(&root->fs_info->chunk_mutex); 69 } 70 71 static void unlock_chunks(struct btrfs_root *root) 72 { 73 mutex_unlock(&root->fs_info->chunk_mutex); 74 } 75 76 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 77 { 78 struct btrfs_device *device; 79 WARN_ON(fs_devices->opened); 80 while (!list_empty(&fs_devices->devices)) { 81 device = list_entry(fs_devices->devices.next, 82 struct btrfs_device, dev_list); 83 list_del(&device->dev_list); 84 kfree(device->name); 85 kfree(device); 86 } 87 kfree(fs_devices); 88 } 89 90 int btrfs_cleanup_fs_uuids(void) 91 { 92 struct btrfs_fs_devices *fs_devices; 93 94 while (!list_empty(&fs_uuids)) { 95 fs_devices = list_entry(fs_uuids.next, 96 struct btrfs_fs_devices, list); 97 list_del(&fs_devices->list); 98 free_fs_devices(fs_devices); 99 } 100 return 0; 101 } 102 103 static noinline struct btrfs_device *__find_device(struct list_head *head, 104 u64 devid, u8 *uuid) 105 { 106 struct btrfs_device *dev; 107 108 list_for_each_entry(dev, head, dev_list) { 109 if (dev->devid == devid && 110 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 111 return dev; 112 } 113 } 114 return NULL; 115 } 116 117 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 118 { 119 struct btrfs_fs_devices *fs_devices; 120 121 list_for_each_entry(fs_devices, &fs_uuids, list) { 122 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 123 return fs_devices; 124 } 125 return NULL; 126 } 127 128 static void requeue_list(struct btrfs_pending_bios *pending_bios, 129 struct bio *head, struct bio *tail) 130 { 131 132 struct bio *old_head; 133 134 old_head = 
pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline int run_scheduled_bios(struct btrfs_device *device)
{
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_fs_info *fs_info;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long num_sync_run;
        unsigned long batch_run = 0;
        unsigned long limit;
        unsigned long last_waited = 0;
        int force_reg = 0;

        bdi = blk_get_backing_dev_info(device->bdev);
        fs_info = device->dev_root->fs_info;
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;

        /* we want to make sure that every time we switch from the sync
         * list to the normal list, we unplug
         */
        num_sync_run = 0;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
209 */ 210 if (device->pending_sync_bios.head == NULL && 211 device->pending_bios.head == NULL) { 212 again = 0; 213 device->running_pending = 0; 214 } else { 215 again = 1; 216 device->running_pending = 1; 217 } 218 219 pending_bios->head = NULL; 220 pending_bios->tail = NULL; 221 222 spin_unlock(&device->io_lock); 223 224 /* 225 * if we're doing the regular priority list, make sure we unplug 226 * for any high prio bios we've sent down 227 */ 228 if (pending_bios == &device->pending_bios && num_sync_run > 0) { 229 num_sync_run = 0; 230 blk_run_backing_dev(bdi, NULL); 231 } 232 233 while (pending) { 234 235 rmb(); 236 /* we want to work on both lists, but do more bios on the 237 * sync list than the regular list 238 */ 239 if ((num_run > 32 && 240 pending_bios != &device->pending_sync_bios && 241 device->pending_sync_bios.head) || 242 (num_run > 64 && pending_bios == &device->pending_sync_bios && 243 device->pending_bios.head)) { 244 spin_lock(&device->io_lock); 245 requeue_list(pending_bios, pending, tail); 246 goto loop_lock; 247 } 248 249 cur = pending; 250 pending = pending->bi_next; 251 cur->bi_next = NULL; 252 atomic_dec(&fs_info->nr_async_bios); 253 254 if (atomic_read(&fs_info->nr_async_bios) < limit && 255 waitqueue_active(&fs_info->async_submit_wait)) 256 wake_up(&fs_info->async_submit_wait); 257 258 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 259 submit_bio(cur->bi_rw, cur); 260 num_run++; 261 batch_run++; 262 263 if (bio_sync(cur)) 264 num_sync_run++; 265 266 if (need_resched()) { 267 if (num_sync_run) { 268 blk_run_backing_dev(bdi, NULL); 269 num_sync_run = 0; 270 } 271 cond_resched(); 272 } 273 274 /* 275 * we made progress, there is more work to do and the bdi 276 * is now congested. Back off and let other work structs 277 * run instead 278 */ 279 if (pending && bdi_write_congested(bdi) && batch_run > 32 && 280 fs_info->fs_devices->open_devices > 1) { 281 struct io_context *ioc; 282 283 ioc = current->io_context; 284 285 /* 286 * the main goal here is that we don't want to 287 * block if we're going to be able to submit 288 * more requests without blocking. 289 * 290 * This code does two great things, it pokes into 291 * the elevator code from a filesystem _and_ 292 * it makes assumptions about how batching works. 293 */ 294 if (ioc && ioc->nr_batch_requests > 0 && 295 time_before(jiffies, ioc->last_waited + HZ/50UL) && 296 (last_waited == 0 || 297 ioc->last_waited == last_waited)) { 298 /* 299 * we want to go through our batch of 300 * requests and stop. So, we copy out 301 * the ioc->last_waited time and test 302 * against it before looping 303 */ 304 last_waited = ioc->last_waited; 305 if (need_resched()) { 306 if (num_sync_run) { 307 blk_run_backing_dev(bdi, NULL); 308 num_sync_run = 0; 309 } 310 cond_resched(); 311 } 312 continue; 313 } 314 spin_lock(&device->io_lock); 315 requeue_list(pending_bios, pending, tail); 316 device->running_pending = 1; 317 318 spin_unlock(&device->io_lock); 319 btrfs_requeue_work(&device->work); 320 goto done; 321 } 322 } 323 324 if (num_sync_run) { 325 num_sync_run = 0; 326 blk_run_backing_dev(bdi, NULL); 327 } 328 329 cond_resched(); 330 if (again) 331 goto loop; 332 333 spin_lock(&device->io_lock); 334 if (device->pending_bios.head || device->pending_sync_bios.head) 335 goto loop_lock; 336 spin_unlock(&device->io_lock); 337 338 /* 339 * IO has already been through a long path to get here. Checksumming, 340 * async helper threads, perhaps compression. 
We've done a pretty 341 * good job of collecting a batch of IO and should just unplug 342 * the device right away. 343 * 344 * This will help anyone who is waiting on the IO, they might have 345 * already unplugged, but managed to do so before the bio they 346 * cared about found its way down here. 347 */ 348 blk_run_backing_dev(bdi, NULL); 349 done: 350 return 0; 351 } 352 353 static void pending_bios_fn(struct btrfs_work *work) 354 { 355 struct btrfs_device *device; 356 357 device = container_of(work, struct btrfs_device, work); 358 run_scheduled_bios(device); 359 } 360 361 static noinline int device_list_add(const char *path, 362 struct btrfs_super_block *disk_super, 363 u64 devid, struct btrfs_fs_devices **fs_devices_ret) 364 { 365 struct btrfs_device *device; 366 struct btrfs_fs_devices *fs_devices; 367 u64 found_transid = btrfs_super_generation(disk_super); 368 369 fs_devices = find_fsid(disk_super->fsid); 370 if (!fs_devices) { 371 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 372 if (!fs_devices) 373 return -ENOMEM; 374 INIT_LIST_HEAD(&fs_devices->devices); 375 INIT_LIST_HEAD(&fs_devices->alloc_list); 376 list_add(&fs_devices->list, &fs_uuids); 377 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 378 fs_devices->latest_devid = devid; 379 fs_devices->latest_trans = found_transid; 380 mutex_init(&fs_devices->device_list_mutex); 381 device = NULL; 382 } else { 383 device = __find_device(&fs_devices->devices, devid, 384 disk_super->dev_item.uuid); 385 } 386 if (!device) { 387 if (fs_devices->opened) 388 return -EBUSY; 389 390 device = kzalloc(sizeof(*device), GFP_NOFS); 391 if (!device) { 392 /* we can safely leave the fs_devices entry around */ 393 return -ENOMEM; 394 } 395 device->devid = devid; 396 device->work.func = pending_bios_fn; 397 memcpy(device->uuid, disk_super->dev_item.uuid, 398 BTRFS_UUID_SIZE); 399 device->barriers = 1; 400 spin_lock_init(&device->io_lock); 401 device->name = kstrdup(path, GFP_NOFS); 402 if (!device->name) { 403 kfree(device); 404 return -ENOMEM; 405 } 406 INIT_LIST_HEAD(&device->dev_alloc_list); 407 408 mutex_lock(&fs_devices->device_list_mutex); 409 list_add(&device->dev_list, &fs_devices->devices); 410 mutex_unlock(&fs_devices->device_list_mutex); 411 412 device->fs_devices = fs_devices; 413 fs_devices->num_devices++; 414 } 415 416 if (found_transid > fs_devices->latest_trans) { 417 fs_devices->latest_devid = devid; 418 fs_devices->latest_trans = found_transid; 419 } 420 *fs_devices_ret = fs_devices; 421 return 0; 422 } 423 424 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 425 { 426 struct btrfs_fs_devices *fs_devices; 427 struct btrfs_device *device; 428 struct btrfs_device *orig_dev; 429 430 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 431 if (!fs_devices) 432 return ERR_PTR(-ENOMEM); 433 434 INIT_LIST_HEAD(&fs_devices->devices); 435 INIT_LIST_HEAD(&fs_devices->alloc_list); 436 INIT_LIST_HEAD(&fs_devices->list); 437 mutex_init(&fs_devices->device_list_mutex); 438 fs_devices->latest_devid = orig->latest_devid; 439 fs_devices->latest_trans = orig->latest_trans; 440 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 441 442 mutex_lock(&orig->device_list_mutex); 443 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 444 device = kzalloc(sizeof(*device), GFP_NOFS); 445 if (!device) 446 goto error; 447 448 device->name = kstrdup(orig_dev->name, GFP_NOFS); 449 if (!device->name) 450 goto error; 451 452 device->devid = orig_dev->devid; 453 device->work.func = pending_bios_fn; 
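                /*
                 * the clone only carries identifying metadata (name, devid,
                 * uuid); device->bdev is left NULL by the kzalloc() above,
                 * so these devices must be reopened (see
                 * __btrfs_open_devices()) before they can do any IO.
                 */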
454 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 455 device->barriers = 1; 456 spin_lock_init(&device->io_lock); 457 INIT_LIST_HEAD(&device->dev_list); 458 INIT_LIST_HEAD(&device->dev_alloc_list); 459 460 list_add(&device->dev_list, &fs_devices->devices); 461 device->fs_devices = fs_devices; 462 fs_devices->num_devices++; 463 } 464 mutex_unlock(&orig->device_list_mutex); 465 return fs_devices; 466 error: 467 mutex_unlock(&orig->device_list_mutex); 468 free_fs_devices(fs_devices); 469 return ERR_PTR(-ENOMEM); 470 } 471 472 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 473 { 474 struct btrfs_device *device, *next; 475 476 mutex_lock(&uuid_mutex); 477 again: 478 mutex_lock(&fs_devices->device_list_mutex); 479 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 480 if (device->in_fs_metadata) 481 continue; 482 483 if (device->bdev) { 484 close_bdev_exclusive(device->bdev, device->mode); 485 device->bdev = NULL; 486 fs_devices->open_devices--; 487 } 488 if (device->writeable) { 489 list_del_init(&device->dev_alloc_list); 490 device->writeable = 0; 491 fs_devices->rw_devices--; 492 } 493 list_del_init(&device->dev_list); 494 fs_devices->num_devices--; 495 kfree(device->name); 496 kfree(device); 497 } 498 mutex_unlock(&fs_devices->device_list_mutex); 499 500 if (fs_devices->seed) { 501 fs_devices = fs_devices->seed; 502 goto again; 503 } 504 505 mutex_unlock(&uuid_mutex); 506 return 0; 507 } 508 509 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 510 { 511 struct btrfs_device *device; 512 513 if (--fs_devices->opened > 0) 514 return 0; 515 516 list_for_each_entry(device, &fs_devices->devices, dev_list) { 517 if (device->bdev) { 518 close_bdev_exclusive(device->bdev, device->mode); 519 fs_devices->open_devices--; 520 } 521 if (device->writeable) { 522 list_del_init(&device->dev_alloc_list); 523 fs_devices->rw_devices--; 524 } 525 526 device->bdev = NULL; 527 device->writeable = 0; 528 device->in_fs_metadata = 0; 529 } 530 WARN_ON(fs_devices->open_devices); 531 WARN_ON(fs_devices->rw_devices); 532 fs_devices->opened = 0; 533 fs_devices->seeding = 0; 534 535 return 0; 536 } 537 538 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 539 { 540 struct btrfs_fs_devices *seed_devices = NULL; 541 int ret; 542 543 mutex_lock(&uuid_mutex); 544 ret = __btrfs_close_devices(fs_devices); 545 if (!fs_devices->opened) { 546 seed_devices = fs_devices->seed; 547 fs_devices->seed = NULL; 548 } 549 mutex_unlock(&uuid_mutex); 550 551 while (seed_devices) { 552 fs_devices = seed_devices; 553 seed_devices = fs_devices->seed; 554 __btrfs_close_devices(fs_devices); 555 free_fs_devices(fs_devices); 556 } 557 return ret; 558 } 559 560 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 561 fmode_t flags, void *holder) 562 { 563 struct block_device *bdev; 564 struct list_head *head = &fs_devices->devices; 565 struct btrfs_device *device; 566 struct block_device *latest_bdev = NULL; 567 struct buffer_head *bh; 568 struct btrfs_super_block *disk_super; 569 u64 latest_devid = 0; 570 u64 latest_transid = 0; 571 u64 devid; 572 int seeding = 1; 573 int ret = 0; 574 575 list_for_each_entry(device, head, dev_list) { 576 if (device->bdev) 577 continue; 578 if (!device->name) 579 continue; 580 581 bdev = open_bdev_exclusive(device->name, flags, holder); 582 if (IS_ERR(bdev)) { 583 printk(KERN_INFO "open %s failed\n", device->name); 584 goto error; 585 } 586 set_blocksize(bdev, 4096); 587 588 bh = btrfs_read_dev_super(bdev); 589 if 
(!bh)
                        goto error_close;

                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = le64_to_cpu(disk_super->dev_item.devid);
                if (devid != device->devid)
                        goto error_brelse;

                if (memcmp(device->uuid, disk_super->dev_item.uuid,
                           BTRFS_UUID_SIZE))
                        goto error_brelse;

                device->generation = btrfs_super_generation(disk_super);
                if (!latest_transid || device->generation > latest_transid) {
                        latest_devid = devid;
                        latest_transid = device->generation;
                        latest_bdev = bdev;
                }

                if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                        device->writeable = 0;
                } else {
                        device->writeable = !bdev_read_only(bdev);
                        seeding = 0;
                }

                device->bdev = bdev;
                device->in_fs_metadata = 0;
                device->mode = flags;

                if (!blk_queue_nonrot(bdev_get_queue(bdev)))
                        fs_devices->rotating = 1;

                fs_devices->open_devices++;
                if (device->writeable) {
                        fs_devices->rw_devices++;
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
                }
                continue;

error_brelse:
                brelse(bh);
error_close:
                close_bdev_exclusive(bdev, FMODE_READ);
error:
                continue;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EIO;
                goto out;
        }
        fs_devices->seeding = seeding;
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_bdev;
        fs_devices->latest_devid = latest_devid;
        fs_devices->latest_trans = latest_transid;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        mutex_lock(&uuid_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                ret = __btrfs_open_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&uuid_mutex);
        return ret;
}

int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_super_block *disk_super;
        struct block_device *bdev;
        struct buffer_head *bh;
        int ret;
        u64 devid;
        u64 transid;

        mutex_lock(&uuid_mutex);

        bdev = open_bdev_exclusive(path, flags, holder);

        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
                goto error;
        }

        ret = set_blocksize(bdev, 4096);
        if (ret)
                goto error_close;
        bh = btrfs_read_dev_super(bdev);
        if (!bh) {
                ret = -EIO;
                goto error_close;
        }
        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = le64_to_cpu(disk_super->dev_item.devid);
        transid = btrfs_super_generation(disk_super);
        if (disk_super->label[0])
                printk(KERN_INFO "device label %s ", disk_super->label);
        else {
                /* FIXME, make a real uuid parser */
                printk(KERN_INFO "device fsid %llx-%llx ",
                       *(unsigned long long *)disk_super->fsid,
                       *(unsigned long long *)(disk_super->fsid + 8));
        }
        printk(KERN_CONT "devid %llu transid %llu %s\n",
               (unsigned long long)devid, (unsigned long long)transid, path);
        ret = device_list_add(path, disk_super, devid, fs_devices_ret);

        brelse(bh);
error_close:
        close_bdev_exclusive(bdev, flags);
error:
        mutex_unlock(&uuid_mutex);
        return ret;
}

/*
 * this uses a pretty simple search; the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 */
static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, 724 u64 num_bytes, u64 *start, 725 u64 *max_avail) 726 { 727 struct btrfs_key key; 728 struct btrfs_root *root = device->dev_root; 729 struct btrfs_dev_extent *dev_extent = NULL; 730 struct btrfs_path *path; 731 u64 hole_size = 0; 732 u64 last_byte = 0; 733 u64 search_start = 0; 734 u64 search_end = device->total_bytes; 735 int ret; 736 int slot = 0; 737 int start_found; 738 struct extent_buffer *l; 739 740 path = btrfs_alloc_path(); 741 if (!path) 742 return -ENOMEM; 743 path->reada = 2; 744 start_found = 0; 745 746 /* FIXME use last free of some kind */ 747 748 /* we don't want to overwrite the superblock on the drive, 749 * so we make sure to start at an offset of at least 1MB 750 */ 751 search_start = max((u64)1024 * 1024, search_start); 752 753 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) 754 search_start = max(root->fs_info->alloc_start, search_start); 755 756 key.objectid = device->devid; 757 key.offset = search_start; 758 key.type = BTRFS_DEV_EXTENT_KEY; 759 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 760 if (ret < 0) 761 goto error; 762 if (ret > 0) { 763 ret = btrfs_previous_item(root, path, key.objectid, key.type); 764 if (ret < 0) 765 goto error; 766 if (ret > 0) 767 start_found = 1; 768 } 769 l = path->nodes[0]; 770 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 771 while (1) { 772 l = path->nodes[0]; 773 slot = path->slots[0]; 774 if (slot >= btrfs_header_nritems(l)) { 775 ret = btrfs_next_leaf(root, path); 776 if (ret == 0) 777 continue; 778 if (ret < 0) 779 goto error; 780 no_more_items: 781 if (!start_found) { 782 if (search_start >= search_end) { 783 ret = -ENOSPC; 784 goto error; 785 } 786 *start = search_start; 787 start_found = 1; 788 goto check_pending; 789 } 790 *start = last_byte > search_start ? 
791 last_byte : search_start; 792 if (search_end <= *start) { 793 ret = -ENOSPC; 794 goto error; 795 } 796 goto check_pending; 797 } 798 btrfs_item_key_to_cpu(l, &key, slot); 799 800 if (key.objectid < device->devid) 801 goto next; 802 803 if (key.objectid > device->devid) 804 goto no_more_items; 805 806 if (key.offset >= search_start && key.offset > last_byte && 807 start_found) { 808 if (last_byte < search_start) 809 last_byte = search_start; 810 hole_size = key.offset - last_byte; 811 812 if (hole_size > *max_avail) 813 *max_avail = hole_size; 814 815 if (key.offset > last_byte && 816 hole_size >= num_bytes) { 817 *start = last_byte; 818 goto check_pending; 819 } 820 } 821 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 822 goto next; 823 824 start_found = 1; 825 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 826 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); 827 next: 828 path->slots[0]++; 829 cond_resched(); 830 } 831 check_pending: 832 /* we have to make sure we didn't find an extent that has already 833 * been allocated by the map tree or the original allocation 834 */ 835 BUG_ON(*start < search_start); 836 837 if (*start + num_bytes > search_end) { 838 ret = -ENOSPC; 839 goto error; 840 } 841 /* check for pending inserts here */ 842 ret = 0; 843 844 error: 845 btrfs_free_path(path); 846 return ret; 847 } 848 849 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 850 struct btrfs_device *device, 851 u64 start) 852 { 853 int ret; 854 struct btrfs_path *path; 855 struct btrfs_root *root = device->dev_root; 856 struct btrfs_key key; 857 struct btrfs_key found_key; 858 struct extent_buffer *leaf = NULL; 859 struct btrfs_dev_extent *extent = NULL; 860 861 path = btrfs_alloc_path(); 862 if (!path) 863 return -ENOMEM; 864 865 key.objectid = device->devid; 866 key.offset = start; 867 key.type = BTRFS_DEV_EXTENT_KEY; 868 869 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 870 if (ret > 0) { 871 ret = btrfs_previous_item(root, path, key.objectid, 872 BTRFS_DEV_EXTENT_KEY); 873 BUG_ON(ret); 874 leaf = path->nodes[0]; 875 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 876 extent = btrfs_item_ptr(leaf, path->slots[0], 877 struct btrfs_dev_extent); 878 BUG_ON(found_key.offset > start || found_key.offset + 879 btrfs_dev_extent_length(leaf, extent) < start); 880 ret = 0; 881 } else if (ret == 0) { 882 leaf = path->nodes[0]; 883 extent = btrfs_item_ptr(leaf, path->slots[0], 884 struct btrfs_dev_extent); 885 } 886 BUG_ON(ret); 887 888 if (device->bytes_used > 0) 889 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 890 ret = btrfs_del_item(trans, root, path); 891 BUG_ON(ret); 892 893 btrfs_free_path(path); 894 return ret; 895 } 896 897 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 898 struct btrfs_device *device, 899 u64 chunk_tree, u64 chunk_objectid, 900 u64 chunk_offset, u64 start, u64 num_bytes) 901 { 902 int ret; 903 struct btrfs_path *path; 904 struct btrfs_root *root = device->dev_root; 905 struct btrfs_dev_extent *extent; 906 struct extent_buffer *leaf; 907 struct btrfs_key key; 908 909 WARN_ON(!device->in_fs_metadata); 910 path = btrfs_alloc_path(); 911 if (!path) 912 return -ENOMEM; 913 914 key.objectid = device->devid; 915 key.offset = start; 916 key.type = BTRFS_DEV_EXTENT_KEY; 917 ret = btrfs_insert_empty_item(trans, root, path, &key, 918 sizeof(*extent)); 919 BUG_ON(ret); 920 921 leaf = path->nodes[0]; 922 extent = btrfs_item_ptr(leaf, path->slots[0], 923 struct btrfs_dev_extent); 924 
btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); 925 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); 926 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 927 928 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, 929 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), 930 BTRFS_UUID_SIZE); 931 932 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 933 btrfs_mark_buffer_dirty(leaf); 934 btrfs_free_path(path); 935 return ret; 936 } 937 938 static noinline int find_next_chunk(struct btrfs_root *root, 939 u64 objectid, u64 *offset) 940 { 941 struct btrfs_path *path; 942 int ret; 943 struct btrfs_key key; 944 struct btrfs_chunk *chunk; 945 struct btrfs_key found_key; 946 947 path = btrfs_alloc_path(); 948 BUG_ON(!path); 949 950 key.objectid = objectid; 951 key.offset = (u64)-1; 952 key.type = BTRFS_CHUNK_ITEM_KEY; 953 954 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 955 if (ret < 0) 956 goto error; 957 958 BUG_ON(ret == 0); 959 960 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); 961 if (ret) { 962 *offset = 0; 963 } else { 964 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 965 path->slots[0]); 966 if (found_key.objectid != objectid) 967 *offset = 0; 968 else { 969 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], 970 struct btrfs_chunk); 971 *offset = found_key.offset + 972 btrfs_chunk_length(path->nodes[0], chunk); 973 } 974 } 975 ret = 0; 976 error: 977 btrfs_free_path(path); 978 return ret; 979 } 980 981 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) 982 { 983 int ret; 984 struct btrfs_key key; 985 struct btrfs_key found_key; 986 struct btrfs_path *path; 987 988 root = root->fs_info->chunk_root; 989 990 path = btrfs_alloc_path(); 991 if (!path) 992 return -ENOMEM; 993 994 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 995 key.type = BTRFS_DEV_ITEM_KEY; 996 key.offset = (u64)-1; 997 998 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 999 if (ret < 0) 1000 goto error; 1001 1002 BUG_ON(ret == 0); 1003 1004 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, 1005 BTRFS_DEV_ITEM_KEY); 1006 if (ret) { 1007 *objectid = 1; 1008 } else { 1009 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1010 path->slots[0]); 1011 *objectid = found_key.offset + 1; 1012 } 1013 ret = 0; 1014 error: 1015 btrfs_free_path(path); 1016 return ret; 1017 } 1018 1019 /* 1020 * the device information is stored in the chunk root 1021 * the btrfs_device struct should be fully filled in 1022 */ 1023 int btrfs_add_device(struct btrfs_trans_handle *trans, 1024 struct btrfs_root *root, 1025 struct btrfs_device *device) 1026 { 1027 int ret; 1028 struct btrfs_path *path; 1029 struct btrfs_dev_item *dev_item; 1030 struct extent_buffer *leaf; 1031 struct btrfs_key key; 1032 unsigned long ptr; 1033 1034 root = root->fs_info->chunk_root; 1035 1036 path = btrfs_alloc_path(); 1037 if (!path) 1038 return -ENOMEM; 1039 1040 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1041 key.type = BTRFS_DEV_ITEM_KEY; 1042 key.offset = device->devid; 1043 1044 ret = btrfs_insert_empty_item(trans, root, path, &key, 1045 sizeof(*dev_item)); 1046 if (ret) 1047 goto out; 1048 1049 leaf = path->nodes[0]; 1050 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1051 1052 btrfs_set_device_id(leaf, dev_item, device->devid); 1053 btrfs_set_device_generation(leaf, dev_item, 0); 1054 btrfs_set_device_type(leaf, dev_item, device->type); 1055 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1056 
btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1057 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1058 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1059 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1060 btrfs_set_device_group(leaf, dev_item, 0); 1061 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1062 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1063 btrfs_set_device_start_offset(leaf, dev_item, 0); 1064 1065 ptr = (unsigned long)btrfs_device_uuid(dev_item); 1066 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1067 ptr = (unsigned long)btrfs_device_fsid(dev_item); 1068 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); 1069 btrfs_mark_buffer_dirty(leaf); 1070 1071 ret = 0; 1072 out: 1073 btrfs_free_path(path); 1074 return ret; 1075 } 1076 1077 static int btrfs_rm_dev_item(struct btrfs_root *root, 1078 struct btrfs_device *device) 1079 { 1080 int ret; 1081 struct btrfs_path *path; 1082 struct btrfs_key key; 1083 struct btrfs_trans_handle *trans; 1084 1085 root = root->fs_info->chunk_root; 1086 1087 path = btrfs_alloc_path(); 1088 if (!path) 1089 return -ENOMEM; 1090 1091 trans = btrfs_start_transaction(root, 1); 1092 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1093 key.type = BTRFS_DEV_ITEM_KEY; 1094 key.offset = device->devid; 1095 lock_chunks(root); 1096 1097 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1098 if (ret < 0) 1099 goto out; 1100 1101 if (ret > 0) { 1102 ret = -ENOENT; 1103 goto out; 1104 } 1105 1106 ret = btrfs_del_item(trans, root, path); 1107 if (ret) 1108 goto out; 1109 out: 1110 btrfs_free_path(path); 1111 unlock_chunks(root); 1112 btrfs_commit_transaction(trans, root); 1113 return ret; 1114 } 1115 1116 int btrfs_rm_device(struct btrfs_root *root, char *device_path) 1117 { 1118 struct btrfs_device *device; 1119 struct btrfs_device *next_device; 1120 struct block_device *bdev; 1121 struct buffer_head *bh = NULL; 1122 struct btrfs_super_block *disk_super; 1123 u64 all_avail; 1124 u64 devid; 1125 u64 num_devices; 1126 u8 *dev_uuid; 1127 int ret = 0; 1128 1129 mutex_lock(&uuid_mutex); 1130 mutex_lock(&root->fs_info->volume_mutex); 1131 1132 all_avail = root->fs_info->avail_data_alloc_bits | 1133 root->fs_info->avail_system_alloc_bits | 1134 root->fs_info->avail_metadata_alloc_bits; 1135 1136 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1137 root->fs_info->fs_devices->rw_devices <= 4) { 1138 printk(KERN_ERR "btrfs: unable to go below four devices " 1139 "on raid10\n"); 1140 ret = -EINVAL; 1141 goto out; 1142 } 1143 1144 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1145 root->fs_info->fs_devices->rw_devices <= 2) { 1146 printk(KERN_ERR "btrfs: unable to go below two " 1147 "devices on raid1\n"); 1148 ret = -EINVAL; 1149 goto out; 1150 } 1151 1152 if (strcmp(device_path, "missing") == 0) { 1153 struct list_head *devices; 1154 struct btrfs_device *tmp; 1155 1156 device = NULL; 1157 devices = &root->fs_info->fs_devices->devices; 1158 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1159 list_for_each_entry(tmp, devices, dev_list) { 1160 if (tmp->in_fs_metadata && !tmp->bdev) { 1161 device = tmp; 1162 break; 1163 } 1164 } 1165 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1166 bdev = NULL; 1167 bh = NULL; 1168 disk_super = NULL; 1169 if (!device) { 1170 printk(KERN_ERR "btrfs: no missing devices found to " 1171 "remove\n"); 1172 goto out; 1173 } 1174 } else { 1175 bdev = open_bdev_exclusive(device_path, FMODE_READ, 1176 
root->fs_info->bdev_holder); 1177 if (IS_ERR(bdev)) { 1178 ret = PTR_ERR(bdev); 1179 goto out; 1180 } 1181 1182 set_blocksize(bdev, 4096); 1183 bh = btrfs_read_dev_super(bdev); 1184 if (!bh) { 1185 ret = -EIO; 1186 goto error_close; 1187 } 1188 disk_super = (struct btrfs_super_block *)bh->b_data; 1189 devid = le64_to_cpu(disk_super->dev_item.devid); 1190 dev_uuid = disk_super->dev_item.uuid; 1191 device = btrfs_find_device(root, devid, dev_uuid, 1192 disk_super->fsid); 1193 if (!device) { 1194 ret = -ENOENT; 1195 goto error_brelse; 1196 } 1197 } 1198 1199 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1200 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1201 "device\n"); 1202 ret = -EINVAL; 1203 goto error_brelse; 1204 } 1205 1206 if (device->writeable) { 1207 list_del_init(&device->dev_alloc_list); 1208 root->fs_info->fs_devices->rw_devices--; 1209 } 1210 1211 ret = btrfs_shrink_device(device, 0); 1212 if (ret) 1213 goto error_brelse; 1214 1215 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1216 if (ret) 1217 goto error_brelse; 1218 1219 device->in_fs_metadata = 0; 1220 1221 /* 1222 * the device list mutex makes sure that we don't change 1223 * the device list while someone else is writing out all 1224 * the device supers. 1225 */ 1226 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1227 list_del_init(&device->dev_list); 1228 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1229 1230 device->fs_devices->num_devices--; 1231 1232 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1233 struct btrfs_device, dev_list); 1234 if (device->bdev == root->fs_info->sb->s_bdev) 1235 root->fs_info->sb->s_bdev = next_device->bdev; 1236 if (device->bdev == root->fs_info->fs_devices->latest_bdev) 1237 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1238 1239 if (device->bdev) { 1240 close_bdev_exclusive(device->bdev, device->mode); 1241 device->bdev = NULL; 1242 device->fs_devices->open_devices--; 1243 } 1244 1245 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1246 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1247 1248 if (device->fs_devices->open_devices == 0) { 1249 struct btrfs_fs_devices *fs_devices; 1250 fs_devices = root->fs_info->fs_devices; 1251 while (fs_devices) { 1252 if (fs_devices->seed == device->fs_devices) 1253 break; 1254 fs_devices = fs_devices->seed; 1255 } 1256 fs_devices->seed = device->fs_devices->seed; 1257 device->fs_devices->seed = NULL; 1258 __btrfs_close_devices(device->fs_devices); 1259 free_fs_devices(device->fs_devices); 1260 } 1261 1262 /* 1263 * at this point, the device is zero sized. We want to 1264 * remove it from the devices list and zero out the old super 1265 */ 1266 if (device->writeable) { 1267 /* make sure this device isn't detected as part of 1268 * the FS anymore 1269 */ 1270 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 1271 set_buffer_dirty(bh); 1272 sync_dirty_buffer(bh); 1273 } 1274 1275 kfree(device->name); 1276 kfree(device); 1277 ret = 0; 1278 1279 error_brelse: 1280 brelse(bh); 1281 error_close: 1282 if (bdev) 1283 close_bdev_exclusive(bdev, FMODE_READ); 1284 out: 1285 mutex_unlock(&root->fs_info->volume_mutex); 1286 mutex_unlock(&uuid_mutex); 1287 return ret; 1288 } 1289 1290 /* 1291 * does all the dirty work required for changing file system's UUID. 
 */
static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root)
{
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
        struct btrfs_fs_devices *old_devices;
        struct btrfs_fs_devices *seed_devices;
        struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
        struct btrfs_device *device;
        u64 super_flags;

        BUG_ON(!mutex_is_locked(&uuid_mutex));
        if (!fs_devices->seeding)
                return -EINVAL;

        seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
        if (!seed_devices)
                return -ENOMEM;

        old_devices = clone_fs_devices(fs_devices);
        if (IS_ERR(old_devices)) {
                kfree(seed_devices);
                return PTR_ERR(old_devices);
        }

        list_add(&old_devices->list, &fs_uuids);

        memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
        seed_devices->opened = 1;
        INIT_LIST_HEAD(&seed_devices->devices);
        INIT_LIST_HEAD(&seed_devices->alloc_list);
        mutex_init(&seed_devices->device_list_mutex);
        list_splice_init(&fs_devices->devices, &seed_devices->devices);
        list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
        list_for_each_entry(device, &seed_devices->devices, dev_list) {
                device->fs_devices = seed_devices;
        }

        fs_devices->seeding = 0;
        fs_devices->num_devices = 0;
        fs_devices->open_devices = 0;
        fs_devices->seed = seed_devices;

        generate_random_uuid(fs_devices->fsid);
        memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
        memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
        super_flags = btrfs_super_flags(disk_super) &
                      ~BTRFS_SUPER_FLAG_SEEDING;
        btrfs_set_super_flags(disk_super, super_flags);

        return 0;
}

/*
 * store the expected generation for seed devices in device items.
1347 */ 1348 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 1349 struct btrfs_root *root) 1350 { 1351 struct btrfs_path *path; 1352 struct extent_buffer *leaf; 1353 struct btrfs_dev_item *dev_item; 1354 struct btrfs_device *device; 1355 struct btrfs_key key; 1356 u8 fs_uuid[BTRFS_UUID_SIZE]; 1357 u8 dev_uuid[BTRFS_UUID_SIZE]; 1358 u64 devid; 1359 int ret; 1360 1361 path = btrfs_alloc_path(); 1362 if (!path) 1363 return -ENOMEM; 1364 1365 root = root->fs_info->chunk_root; 1366 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1367 key.offset = 0; 1368 key.type = BTRFS_DEV_ITEM_KEY; 1369 1370 while (1) { 1371 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1372 if (ret < 0) 1373 goto error; 1374 1375 leaf = path->nodes[0]; 1376 next_slot: 1377 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1378 ret = btrfs_next_leaf(root, path); 1379 if (ret > 0) 1380 break; 1381 if (ret < 0) 1382 goto error; 1383 leaf = path->nodes[0]; 1384 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1385 btrfs_release_path(root, path); 1386 continue; 1387 } 1388 1389 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1390 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 1391 key.type != BTRFS_DEV_ITEM_KEY) 1392 break; 1393 1394 dev_item = btrfs_item_ptr(leaf, path->slots[0], 1395 struct btrfs_dev_item); 1396 devid = btrfs_device_id(leaf, dev_item); 1397 read_extent_buffer(leaf, dev_uuid, 1398 (unsigned long)btrfs_device_uuid(dev_item), 1399 BTRFS_UUID_SIZE); 1400 read_extent_buffer(leaf, fs_uuid, 1401 (unsigned long)btrfs_device_fsid(dev_item), 1402 BTRFS_UUID_SIZE); 1403 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1404 BUG_ON(!device); 1405 1406 if (device->fs_devices->seeding) { 1407 btrfs_set_device_generation(leaf, dev_item, 1408 device->generation); 1409 btrfs_mark_buffer_dirty(leaf); 1410 } 1411 1412 path->slots[0]++; 1413 goto next_slot; 1414 } 1415 ret = 0; 1416 error: 1417 btrfs_free_path(path); 1418 return ret; 1419 } 1420 1421 int btrfs_init_new_device(struct btrfs_root *root, char *device_path) 1422 { 1423 struct btrfs_trans_handle *trans; 1424 struct btrfs_device *device; 1425 struct block_device *bdev; 1426 struct list_head *devices; 1427 struct super_block *sb = root->fs_info->sb; 1428 u64 total_bytes; 1429 int seeding_dev = 0; 1430 int ret = 0; 1431 1432 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1433 return -EINVAL; 1434 1435 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); 1436 if (!bdev) 1437 return -EIO; 1438 1439 if (root->fs_info->fs_devices->seeding) { 1440 seeding_dev = 1; 1441 down_write(&sb->s_umount); 1442 mutex_lock(&uuid_mutex); 1443 } 1444 1445 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1446 mutex_lock(&root->fs_info->volume_mutex); 1447 1448 devices = &root->fs_info->fs_devices->devices; 1449 /* 1450 * we have the volume lock, so we don't need the extra 1451 * device list mutex while reading the list here. 
1452 */ 1453 list_for_each_entry(device, devices, dev_list) { 1454 if (device->bdev == bdev) { 1455 ret = -EEXIST; 1456 goto error; 1457 } 1458 } 1459 1460 device = kzalloc(sizeof(*device), GFP_NOFS); 1461 if (!device) { 1462 /* we can safely leave the fs_devices entry around */ 1463 ret = -ENOMEM; 1464 goto error; 1465 } 1466 1467 device->name = kstrdup(device_path, GFP_NOFS); 1468 if (!device->name) { 1469 kfree(device); 1470 ret = -ENOMEM; 1471 goto error; 1472 } 1473 1474 ret = find_next_devid(root, &device->devid); 1475 if (ret) { 1476 kfree(device); 1477 goto error; 1478 } 1479 1480 trans = btrfs_start_transaction(root, 1); 1481 lock_chunks(root); 1482 1483 device->barriers = 1; 1484 device->writeable = 1; 1485 device->work.func = pending_bios_fn; 1486 generate_random_uuid(device->uuid); 1487 spin_lock_init(&device->io_lock); 1488 device->generation = trans->transid; 1489 device->io_width = root->sectorsize; 1490 device->io_align = root->sectorsize; 1491 device->sector_size = root->sectorsize; 1492 device->total_bytes = i_size_read(bdev->bd_inode); 1493 device->disk_total_bytes = device->total_bytes; 1494 device->dev_root = root->fs_info->dev_root; 1495 device->bdev = bdev; 1496 device->in_fs_metadata = 1; 1497 device->mode = 0; 1498 set_blocksize(device->bdev, 4096); 1499 1500 if (seeding_dev) { 1501 sb->s_flags &= ~MS_RDONLY; 1502 ret = btrfs_prepare_sprout(trans, root); 1503 BUG_ON(ret); 1504 } 1505 1506 device->fs_devices = root->fs_info->fs_devices; 1507 1508 /* 1509 * we don't want write_supers to jump in here with our device 1510 * half setup 1511 */ 1512 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1513 list_add(&device->dev_list, &root->fs_info->fs_devices->devices); 1514 list_add(&device->dev_alloc_list, 1515 &root->fs_info->fs_devices->alloc_list); 1516 root->fs_info->fs_devices->num_devices++; 1517 root->fs_info->fs_devices->open_devices++; 1518 root->fs_info->fs_devices->rw_devices++; 1519 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1520 1521 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1522 root->fs_info->fs_devices->rotating = 1; 1523 1524 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1525 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1526 total_bytes + device->total_bytes); 1527 1528 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1529 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1530 total_bytes + 1); 1531 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1532 1533 if (seeding_dev) { 1534 ret = init_first_rw_device(trans, root, device); 1535 BUG_ON(ret); 1536 ret = btrfs_finish_sprout(trans, root); 1537 BUG_ON(ret); 1538 } else { 1539 ret = btrfs_add_device(trans, root, device); 1540 } 1541 1542 /* 1543 * we've got more storage, clear any full flags on the space 1544 * infos 1545 */ 1546 btrfs_clear_space_info_full(root->fs_info); 1547 1548 unlock_chunks(root); 1549 btrfs_commit_transaction(trans, root); 1550 1551 if (seeding_dev) { 1552 mutex_unlock(&uuid_mutex); 1553 up_write(&sb->s_umount); 1554 1555 ret = btrfs_relocate_sys_chunks(root); 1556 BUG_ON(ret); 1557 } 1558 out: 1559 mutex_unlock(&root->fs_info->volume_mutex); 1560 return ret; 1561 error: 1562 close_bdev_exclusive(bdev, 0); 1563 if (seeding_dev) { 1564 mutex_unlock(&uuid_mutex); 1565 up_write(&sb->s_umount); 1566 } 1567 goto out; 1568 } 1569 1570 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 1571 struct btrfs_device *device) 1572 { 1573 int ret; 1574 struct btrfs_path 
*path; 1575 struct btrfs_root *root; 1576 struct btrfs_dev_item *dev_item; 1577 struct extent_buffer *leaf; 1578 struct btrfs_key key; 1579 1580 root = device->dev_root->fs_info->chunk_root; 1581 1582 path = btrfs_alloc_path(); 1583 if (!path) 1584 return -ENOMEM; 1585 1586 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1587 key.type = BTRFS_DEV_ITEM_KEY; 1588 key.offset = device->devid; 1589 1590 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1591 if (ret < 0) 1592 goto out; 1593 1594 if (ret > 0) { 1595 ret = -ENOENT; 1596 goto out; 1597 } 1598 1599 leaf = path->nodes[0]; 1600 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1601 1602 btrfs_set_device_id(leaf, dev_item, device->devid); 1603 btrfs_set_device_type(leaf, dev_item, device->type); 1604 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1605 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1606 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1607 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); 1608 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1609 btrfs_mark_buffer_dirty(leaf); 1610 1611 out: 1612 btrfs_free_path(path); 1613 return ret; 1614 } 1615 1616 static int __btrfs_grow_device(struct btrfs_trans_handle *trans, 1617 struct btrfs_device *device, u64 new_size) 1618 { 1619 struct btrfs_super_block *super_copy = 1620 &device->dev_root->fs_info->super_copy; 1621 u64 old_total = btrfs_super_total_bytes(super_copy); 1622 u64 diff = new_size - device->total_bytes; 1623 1624 if (!device->writeable) 1625 return -EACCES; 1626 if (new_size <= device->total_bytes) 1627 return -EINVAL; 1628 1629 btrfs_set_super_total_bytes(super_copy, old_total + diff); 1630 device->fs_devices->total_rw_bytes += diff; 1631 1632 device->total_bytes = new_size; 1633 device->disk_total_bytes = new_size; 1634 btrfs_clear_space_info_full(device->dev_root->fs_info); 1635 1636 return btrfs_update_device(trans, device); 1637 } 1638 1639 int btrfs_grow_device(struct btrfs_trans_handle *trans, 1640 struct btrfs_device *device, u64 new_size) 1641 { 1642 int ret; 1643 lock_chunks(device->dev_root); 1644 ret = __btrfs_grow_device(trans, device, new_size); 1645 unlock_chunks(device->dev_root); 1646 return ret; 1647 } 1648 1649 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 1650 struct btrfs_root *root, 1651 u64 chunk_tree, u64 chunk_objectid, 1652 u64 chunk_offset) 1653 { 1654 int ret; 1655 struct btrfs_path *path; 1656 struct btrfs_key key; 1657 1658 root = root->fs_info->chunk_root; 1659 path = btrfs_alloc_path(); 1660 if (!path) 1661 return -ENOMEM; 1662 1663 key.objectid = chunk_objectid; 1664 key.offset = chunk_offset; 1665 key.type = BTRFS_CHUNK_ITEM_KEY; 1666 1667 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1668 BUG_ON(ret); 1669 1670 ret = btrfs_del_item(trans, root, path); 1671 BUG_ON(ret); 1672 1673 btrfs_free_path(path); 1674 return 0; 1675 } 1676 1677 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1678 chunk_offset) 1679 { 1680 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1681 struct btrfs_disk_key *disk_key; 1682 struct btrfs_chunk *chunk; 1683 u8 *ptr; 1684 int ret = 0; 1685 u32 num_stripes; 1686 u32 array_size; 1687 u32 len = 0; 1688 u32 cur; 1689 struct btrfs_key key; 1690 1691 array_size = btrfs_super_sys_array_size(super_copy); 1692 1693 ptr = super_copy->sys_chunk_array; 1694 cur = 0; 1695 1696 while (cur < array_size) { 1697 disk_key = (struct 
btrfs_disk_key *)ptr; 1698 btrfs_disk_key_to_cpu(&key, disk_key); 1699 1700 len = sizeof(*disk_key); 1701 1702 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 1703 chunk = (struct btrfs_chunk *)(ptr + len); 1704 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 1705 len += btrfs_chunk_item_size(num_stripes); 1706 } else { 1707 ret = -EIO; 1708 break; 1709 } 1710 if (key.objectid == chunk_objectid && 1711 key.offset == chunk_offset) { 1712 memmove(ptr, ptr + len, array_size - (cur + len)); 1713 array_size -= len; 1714 btrfs_set_super_sys_array_size(super_copy, array_size); 1715 } else { 1716 ptr += len; 1717 cur += len; 1718 } 1719 } 1720 return ret; 1721 } 1722 1723 static int btrfs_relocate_chunk(struct btrfs_root *root, 1724 u64 chunk_tree, u64 chunk_objectid, 1725 u64 chunk_offset) 1726 { 1727 struct extent_map_tree *em_tree; 1728 struct btrfs_root *extent_root; 1729 struct btrfs_trans_handle *trans; 1730 struct extent_map *em; 1731 struct map_lookup *map; 1732 int ret; 1733 int i; 1734 1735 root = root->fs_info->chunk_root; 1736 extent_root = root->fs_info->extent_root; 1737 em_tree = &root->fs_info->mapping_tree.map_tree; 1738 1739 /* step one, relocate all the extents inside this chunk */ 1740 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1741 BUG_ON(ret); 1742 1743 trans = btrfs_start_transaction(root, 1); 1744 BUG_ON(!trans); 1745 1746 lock_chunks(root); 1747 1748 /* 1749 * step two, delete the device extents and the 1750 * chunk tree entries 1751 */ 1752 spin_lock(&em_tree->lock); 1753 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 1754 spin_unlock(&em_tree->lock); 1755 1756 BUG_ON(em->start > chunk_offset || 1757 em->start + em->len < chunk_offset); 1758 map = (struct map_lookup *)em->bdev; 1759 1760 for (i = 0; i < map->num_stripes; i++) { 1761 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, 1762 map->stripes[i].physical); 1763 BUG_ON(ret); 1764 1765 if (map->stripes[i].dev) { 1766 ret = btrfs_update_device(trans, map->stripes[i].dev); 1767 BUG_ON(ret); 1768 } 1769 } 1770 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, 1771 chunk_offset); 1772 1773 BUG_ON(ret); 1774 1775 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 1776 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 1777 BUG_ON(ret); 1778 } 1779 1780 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 1781 BUG_ON(ret); 1782 1783 spin_lock(&em_tree->lock); 1784 remove_extent_mapping(em_tree, em); 1785 spin_unlock(&em_tree->lock); 1786 1787 kfree(map); 1788 em->bdev = NULL; 1789 1790 /* once for the tree */ 1791 free_extent_map(em); 1792 /* once for us */ 1793 free_extent_map(em); 1794 1795 unlock_chunks(root); 1796 btrfs_end_transaction(trans, root); 1797 return 0; 1798 } 1799 1800 static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 1801 { 1802 struct btrfs_root *chunk_root = root->fs_info->chunk_root; 1803 struct btrfs_path *path; 1804 struct extent_buffer *leaf; 1805 struct btrfs_chunk *chunk; 1806 struct btrfs_key key; 1807 struct btrfs_key found_key; 1808 u64 chunk_tree = chunk_root->root_key.objectid; 1809 u64 chunk_type; 1810 int ret; 1811 1812 path = btrfs_alloc_path(); 1813 if (!path) 1814 return -ENOMEM; 1815 1816 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 1817 key.offset = (u64)-1; 1818 key.type = BTRFS_CHUNK_ITEM_KEY; 1819 1820 while (1) { 1821 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 1822 if (ret < 0) 1823 goto error; 1824 BUG_ON(ret == 0); 1825 1826 ret = btrfs_previous_item(chunk_root, path, key.objectid, 1827 
key.type); 1828 if (ret < 0) 1829 goto error; 1830 if (ret > 0) 1831 break; 1832 1833 leaf = path->nodes[0]; 1834 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1835 1836 chunk = btrfs_item_ptr(leaf, path->slots[0], 1837 struct btrfs_chunk); 1838 chunk_type = btrfs_chunk_type(leaf, chunk); 1839 btrfs_release_path(chunk_root, path); 1840 1841 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 1842 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 1843 found_key.objectid, 1844 found_key.offset); 1845 BUG_ON(ret); 1846 } 1847 1848 if (found_key.offset == 0) 1849 break; 1850 key.offset = found_key.offset - 1; 1851 } 1852 ret = 0; 1853 error: 1854 btrfs_free_path(path); 1855 return ret; 1856 } 1857 1858 static u64 div_factor(u64 num, int factor) 1859 { 1860 if (factor == 10) 1861 return num; 1862 num *= factor; 1863 do_div(num, 10); 1864 return num; 1865 } 1866 1867 int btrfs_balance(struct btrfs_root *dev_root) 1868 { 1869 int ret; 1870 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 1871 struct btrfs_device *device; 1872 u64 old_size; 1873 u64 size_to_free; 1874 struct btrfs_path *path; 1875 struct btrfs_key key; 1876 struct btrfs_chunk *chunk; 1877 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; 1878 struct btrfs_trans_handle *trans; 1879 struct btrfs_key found_key; 1880 1881 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 1882 return -EROFS; 1883 1884 mutex_lock(&dev_root->fs_info->volume_mutex); 1885 dev_root = dev_root->fs_info->dev_root; 1886 1887 /* step one make some room on all the devices */ 1888 list_for_each_entry(device, devices, dev_list) { 1889 old_size = device->total_bytes; 1890 size_to_free = div_factor(old_size, 1); 1891 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 1892 if (!device->writeable || 1893 device->total_bytes - device->bytes_used > size_to_free) 1894 continue; 1895 1896 ret = btrfs_shrink_device(device, old_size - size_to_free); 1897 BUG_ON(ret); 1898 1899 trans = btrfs_start_transaction(dev_root, 1); 1900 BUG_ON(!trans); 1901 1902 ret = btrfs_grow_device(trans, device, old_size); 1903 BUG_ON(ret); 1904 1905 btrfs_end_transaction(trans, dev_root); 1906 } 1907 1908 /* step two, relocate all the chunks */ 1909 path = btrfs_alloc_path(); 1910 BUG_ON(!path); 1911 1912 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 1913 key.offset = (u64)-1; 1914 key.type = BTRFS_CHUNK_ITEM_KEY; 1915 1916 while (1) { 1917 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 1918 if (ret < 0) 1919 goto error; 1920 1921 /* 1922 * this shouldn't happen, it means the last relocate 1923 * failed 1924 */ 1925 if (ret == 0) 1926 break; 1927 1928 ret = btrfs_previous_item(chunk_root, path, 0, 1929 BTRFS_CHUNK_ITEM_KEY); 1930 if (ret) 1931 break; 1932 1933 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1934 path->slots[0]); 1935 if (found_key.objectid != key.objectid) 1936 break; 1937 1938 chunk = btrfs_item_ptr(path->nodes[0], 1939 path->slots[0], 1940 struct btrfs_chunk); 1941 key.offset = found_key.offset; 1942 /* chunk zero is special */ 1943 if (key.offset == 0) 1944 break; 1945 1946 btrfs_release_path(chunk_root, path); 1947 ret = btrfs_relocate_chunk(chunk_root, 1948 chunk_root->root_key.objectid, 1949 found_key.objectid, 1950 found_key.offset); 1951 BUG_ON(ret); 1952 } 1953 ret = 0; 1954 error: 1955 btrfs_free_path(path); 1956 mutex_unlock(&dev_root->fs_info->volume_mutex); 1957 return ret; 1958 } 1959 1960 /* 1961 * shrinking a device means finding all of the device extents past 1962 * the new size, and then following the 
back refs to the chunks. 1963 * The chunk relocation code actually frees the device extent 1964 */ 1965 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 1966 { 1967 struct btrfs_trans_handle *trans; 1968 struct btrfs_root *root = device->dev_root; 1969 struct btrfs_dev_extent *dev_extent = NULL; 1970 struct btrfs_path *path; 1971 u64 length; 1972 u64 chunk_tree; 1973 u64 chunk_objectid; 1974 u64 chunk_offset; 1975 int ret; 1976 int slot; 1977 struct extent_buffer *l; 1978 struct btrfs_key key; 1979 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1980 u64 old_total = btrfs_super_total_bytes(super_copy); 1981 u64 diff = device->total_bytes - new_size; 1982 1983 if (new_size >= device->total_bytes) 1984 return -EINVAL; 1985 1986 path = btrfs_alloc_path(); 1987 if (!path) 1988 return -ENOMEM; 1989 1990 trans = btrfs_start_transaction(root, 1); 1991 if (!trans) { 1992 ret = -ENOMEM; 1993 goto done; 1994 } 1995 1996 path->reada = 2; 1997 1998 lock_chunks(root); 1999 2000 device->total_bytes = new_size; 2001 if (device->writeable) 2002 device->fs_devices->total_rw_bytes -= diff; 2003 unlock_chunks(root); 2004 btrfs_end_transaction(trans, root); 2005 2006 key.objectid = device->devid; 2007 key.offset = (u64)-1; 2008 key.type = BTRFS_DEV_EXTENT_KEY; 2009 2010 while (1) { 2011 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2012 if (ret < 0) 2013 goto done; 2014 2015 ret = btrfs_previous_item(root, path, 0, key.type); 2016 if (ret < 0) 2017 goto done; 2018 if (ret) { 2019 ret = 0; 2020 break; 2021 } 2022 2023 l = path->nodes[0]; 2024 slot = path->slots[0]; 2025 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 2026 2027 if (key.objectid != device->devid) 2028 break; 2029 2030 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2031 length = btrfs_dev_extent_length(l, dev_extent); 2032 2033 if (key.offset + length <= new_size) 2034 break; 2035 2036 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 2037 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 2038 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 2039 btrfs_release_path(root, path); 2040 2041 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 2042 chunk_offset); 2043 if (ret) 2044 goto done; 2045 } 2046 2047 /* Shrinking succeeded, else we would be at "done". */ 2048 trans = btrfs_start_transaction(root, 1); 2049 if (!trans) { 2050 ret = -ENOMEM; 2051 goto done; 2052 } 2053 lock_chunks(root); 2054 2055 device->disk_total_bytes = new_size; 2056 /* Now btrfs_update_device() will change the on-disk size. 
*/ 2057 ret = btrfs_update_device(trans, device); 2058 if (ret) { 2059 unlock_chunks(root); 2060 btrfs_end_transaction(trans, root); 2061 goto done; 2062 } 2063 WARN_ON(diff > old_total); 2064 btrfs_set_super_total_bytes(super_copy, old_total - diff); 2065 unlock_chunks(root); 2066 btrfs_end_transaction(trans, root); 2067 done: 2068 btrfs_free_path(path); 2069 return ret; 2070 } 2071 2072 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, 2073 struct btrfs_root *root, 2074 struct btrfs_key *key, 2075 struct btrfs_chunk *chunk, int item_size) 2076 { 2077 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2078 struct btrfs_disk_key disk_key; 2079 u32 array_size; 2080 u8 *ptr; 2081 2082 array_size = btrfs_super_sys_array_size(super_copy); 2083 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 2084 return -EFBIG; 2085 2086 ptr = super_copy->sys_chunk_array + array_size; 2087 btrfs_cpu_key_to_disk(&disk_key, key); 2088 memcpy(ptr, &disk_key, sizeof(disk_key)); 2089 ptr += sizeof(disk_key); 2090 memcpy(ptr, chunk, item_size); 2091 item_size += sizeof(disk_key); 2092 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 2093 return 0; 2094 } 2095 2096 static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, 2097 int num_stripes, int sub_stripes) 2098 { 2099 if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) 2100 return calc_size; 2101 else if (type & BTRFS_BLOCK_GROUP_RAID10) 2102 return calc_size * (num_stripes / sub_stripes); 2103 else 2104 return calc_size * num_stripes; 2105 } 2106 2107 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2108 struct btrfs_root *extent_root, 2109 struct map_lookup **map_ret, 2110 u64 *num_bytes, u64 *stripe_size, 2111 u64 start, u64 type) 2112 { 2113 struct btrfs_fs_info *info = extent_root->fs_info; 2114 struct btrfs_device *device = NULL; 2115 struct btrfs_fs_devices *fs_devices = info->fs_devices; 2116 struct list_head *cur; 2117 struct map_lookup *map = NULL; 2118 struct extent_map_tree *em_tree; 2119 struct extent_map *em; 2120 struct list_head private_devs; 2121 int min_stripe_size = 1 * 1024 * 1024; 2122 u64 calc_size = 1024 * 1024 * 1024; 2123 u64 max_chunk_size = calc_size; 2124 u64 min_free; 2125 u64 avail; 2126 u64 max_avail = 0; 2127 u64 dev_offset; 2128 int num_stripes = 1; 2129 int min_stripes = 1; 2130 int sub_stripes = 0; 2131 int looped = 0; 2132 int ret; 2133 int index; 2134 int stripe_len = 64 * 1024; 2135 2136 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2137 (type & BTRFS_BLOCK_GROUP_DUP)) { 2138 WARN_ON(1); 2139 type &= ~BTRFS_BLOCK_GROUP_DUP; 2140 } 2141 if (list_empty(&fs_devices->alloc_list)) 2142 return -ENOSPC; 2143 2144 if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2145 num_stripes = fs_devices->rw_devices; 2146 min_stripes = 2; 2147 } 2148 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2149 num_stripes = 2; 2150 min_stripes = 2; 2151 } 2152 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2153 num_stripes = min_t(u64, 2, fs_devices->rw_devices); 2154 if (num_stripes < 2) 2155 return -ENOSPC; 2156 min_stripes = 2; 2157 } 2158 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2159 num_stripes = fs_devices->rw_devices; 2160 if (num_stripes < 4) 2161 return -ENOSPC; 2162 num_stripes &= ~(u32)1; 2163 sub_stripes = 2; 2164 min_stripes = 4; 2165 } 2166 2167 if (type & BTRFS_BLOCK_GROUP_DATA) { 2168 max_chunk_size = 10 * calc_size; 2169 min_stripe_size = 64 * 1024 * 1024; 2170 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 2171 max_chunk_size = 4 * calc_size; 2172 min_stripe_size = 32 * 
1024 * 1024; 2173 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 2174 calc_size = 8 * 1024 * 1024; 2175 max_chunk_size = calc_size * 2; 2176 min_stripe_size = 1 * 1024 * 1024; 2177 } 2178 2179 /* we don't want a chunk larger than 10% of writeable space */ 2180 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2181 max_chunk_size); 2182 2183 again: 2184 max_avail = 0; 2185 if (!map || map->num_stripes != num_stripes) { 2186 kfree(map); 2187 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 2188 if (!map) 2189 return -ENOMEM; 2190 map->num_stripes = num_stripes; 2191 } 2192 2193 if (calc_size * num_stripes > max_chunk_size) { 2194 calc_size = max_chunk_size; 2195 do_div(calc_size, num_stripes); 2196 do_div(calc_size, stripe_len); 2197 calc_size *= stripe_len; 2198 } 2199 /* we don't want tiny stripes */ 2200 calc_size = max_t(u64, min_stripe_size, calc_size); 2201 2202 do_div(calc_size, stripe_len); 2203 calc_size *= stripe_len; 2204 2205 cur = fs_devices->alloc_list.next; 2206 index = 0; 2207 2208 if (type & BTRFS_BLOCK_GROUP_DUP) 2209 min_free = calc_size * 2; 2210 else 2211 min_free = calc_size; 2212 2213 /* 2214 * we add 1MB because we never use the first 1MB of the device, unless 2215 * we've looped, then we are likely allocating the maximum amount of 2216 * space left already 2217 */ 2218 if (!looped) 2219 min_free += 1024 * 1024; 2220 2221 INIT_LIST_HEAD(&private_devs); 2222 while (index < num_stripes) { 2223 device = list_entry(cur, struct btrfs_device, dev_alloc_list); 2224 BUG_ON(!device->writeable); 2225 if (device->total_bytes > device->bytes_used) 2226 avail = device->total_bytes - device->bytes_used; 2227 else 2228 avail = 0; 2229 cur = cur->next; 2230 2231 if (device->in_fs_metadata && avail >= min_free) { 2232 ret = find_free_dev_extent(trans, device, 2233 min_free, &dev_offset, 2234 &max_avail); 2235 if (ret == 0) { 2236 list_move_tail(&device->dev_alloc_list, 2237 &private_devs); 2238 map->stripes[index].dev = device; 2239 map->stripes[index].physical = dev_offset; 2240 index++; 2241 if (type & BTRFS_BLOCK_GROUP_DUP) { 2242 map->stripes[index].dev = device; 2243 map->stripes[index].physical = 2244 dev_offset + calc_size; 2245 index++; 2246 } 2247 } 2248 } else if (device->in_fs_metadata && avail > max_avail) 2249 max_avail = avail; 2250 if (cur == &fs_devices->alloc_list) 2251 break; 2252 } 2253 list_splice(&private_devs, &fs_devices->alloc_list); 2254 if (index < num_stripes) { 2255 if (index >= min_stripes) { 2256 num_stripes = index; 2257 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2258 num_stripes /= sub_stripes; 2259 num_stripes *= sub_stripes; 2260 } 2261 looped = 1; 2262 goto again; 2263 } 2264 if (!looped && max_avail > 0) { 2265 looped = 1; 2266 calc_size = max_avail; 2267 goto again; 2268 } 2269 kfree(map); 2270 return -ENOSPC; 2271 } 2272 map->sector_size = extent_root->sectorsize; 2273 map->stripe_len = stripe_len; 2274 map->io_align = stripe_len; 2275 map->io_width = stripe_len; 2276 map->type = type; 2277 map->num_stripes = num_stripes; 2278 map->sub_stripes = sub_stripes; 2279 2280 *map_ret = map; 2281 *stripe_size = calc_size; 2282 *num_bytes = chunk_bytes_by_type(type, calc_size, 2283 num_stripes, sub_stripes); 2284 2285 em = alloc_extent_map(GFP_NOFS); 2286 if (!em) { 2287 kfree(map); 2288 return -ENOMEM; 2289 } 2290 em->bdev = (struct block_device *)map; 2291 em->start = start; 2292 em->len = *num_bytes; 2293 em->block_start = 0; 2294 em->block_len = em->len; 2295 2296 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 2297 
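	/*
	 * Illustrative note (added; the numbers below are hypothetical):
	 * at this point calc_size is the per-device stripe size and
	 * *num_bytes is the logical size of the chunk as computed by
	 * chunk_bytes_by_type().  For a RAID10 chunk with num_stripes = 4,
	 * sub_stripes = 2 and calc_size = 1GB, each of the four devices
	 * donates a 1GB dev extent, but the chunk covers only
	 * (4 / 2) * 1GB = 2GB of logical address space because every
	 * stripe is mirrored once.  The extent_map added below records the
	 * logical range [start, start + *num_bytes) in the in-memory chunk
	 * mapping tree; the chunk item itself is written to the chunk tree
	 * later by __finish_chunk_alloc().
	 */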
	spin_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em);
	spin_unlock(&em_tree->lock);
	BUG_ON(ret);
	free_extent_map(em);

	ret = btrfs_make_block_group(trans, extent_root, 0, type,
				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
				     start, *num_bytes);
	BUG_ON(ret);

	index = 0;
	while (index < map->num_stripes) {
		device = map->stripes[index].dev;
		dev_offset = map->stripes[index].physical;

		ret = btrfs_alloc_dev_extent(trans, device,
				info->chunk_root->root_key.objectid,
				BTRFS_FIRST_CHUNK_TREE_OBJECTID,
				start, dev_offset, calc_size);
		BUG_ON(ret);
		index++;
	}

	return 0;
}

static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
				struct btrfs_root *extent_root,
				struct map_lookup *map, u64 chunk_offset,
				u64 chunk_size, u64 stripe_size)
{
	u64 dev_offset;
	struct btrfs_key key;
	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
	struct btrfs_device *device;
	struct btrfs_chunk *chunk;
	struct btrfs_stripe *stripe;
	size_t item_size = btrfs_chunk_item_size(map->num_stripes);
	int index = 0;
	int ret;

	chunk = kzalloc(item_size, GFP_NOFS);
	if (!chunk)
		return -ENOMEM;

	index = 0;
	while (index < map->num_stripes) {
		device = map->stripes[index].dev;
		device->bytes_used += stripe_size;
		ret = btrfs_update_device(trans, device);
		BUG_ON(ret);
		index++;
	}

	index = 0;
	stripe = &chunk->stripe;
	while (index < map->num_stripes) {
		device = map->stripes[index].dev;
		dev_offset = map->stripes[index].physical;

		btrfs_set_stack_stripe_devid(stripe, device->devid);
		btrfs_set_stack_stripe_offset(stripe, dev_offset);
		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
		stripe++;
		index++;
	}

	btrfs_set_stack_chunk_length(chunk, chunk_size);
	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
	btrfs_set_stack_chunk_type(chunk, map->type);
	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = chunk_offset;

	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
	BUG_ON(ret);

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
					     item_size);
		BUG_ON(ret);
	}
	kfree(chunk);
	return 0;
}

/*
 * Chunk allocation falls into two parts.  The first part does the work
 * that makes the newly allocated chunk usable, but does not perform any
 * operation that modifies the chunk tree.  The second part does the work
 * that requires modifying the chunk tree.  This division is important for
 * the bootstrap process of adding storage to a seed btrfs.
 */
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
		      struct btrfs_root *extent_root, u64 type)
{
	u64 chunk_offset;
	u64 chunk_size;
	u64 stripe_size;
	struct map_lookup *map;
	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
	int ret;

	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
			      &chunk_offset);
	if (ret)
		return ret;

	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
				  &stripe_size, chunk_offset, type);
	if (ret)
		return ret;

	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
				   chunk_size, stripe_size);
	BUG_ON(ret);
	return 0;
}

static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_device *device)
{
	u64 chunk_offset;
	u64 sys_chunk_offset;
	u64 chunk_size;
	u64 sys_chunk_size;
	u64 stripe_size;
	u64 sys_stripe_size;
	u64 alloc_profile;
	struct map_lookup *map;
	struct map_lookup *sys_map;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
	int ret;

	ret = find_next_chunk(fs_info->chunk_root,
			      BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
	BUG_ON(ret);

	alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
			(fs_info->metadata_alloc_profile &
			 fs_info->avail_metadata_alloc_bits);
	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);

	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
				  &stripe_size, chunk_offset, alloc_profile);
	BUG_ON(ret);

	sys_chunk_offset = chunk_offset + chunk_size;

	alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
			(fs_info->system_alloc_profile &
			 fs_info->avail_system_alloc_bits);
	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);

	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
				  &sys_chunk_size, &sys_stripe_size,
				  sys_chunk_offset, alloc_profile);
	BUG_ON(ret);

	ret = btrfs_add_device(trans, fs_info->chunk_root, device);
	BUG_ON(ret);

	/*
	 * Modifying the chunk tree requires allocating new blocks from both
	 * the system block group and the metadata block group, so we can
	 * only perform operations that modify the chunk tree after both
	 * block groups have been created.
2474 */ 2475 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 2476 chunk_size, stripe_size); 2477 BUG_ON(ret); 2478 2479 ret = __finish_chunk_alloc(trans, extent_root, sys_map, 2480 sys_chunk_offset, sys_chunk_size, 2481 sys_stripe_size); 2482 BUG_ON(ret); 2483 return 0; 2484 } 2485 2486 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 2487 { 2488 struct extent_map *em; 2489 struct map_lookup *map; 2490 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 2491 int readonly = 0; 2492 int i; 2493 2494 spin_lock(&map_tree->map_tree.lock); 2495 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2496 spin_unlock(&map_tree->map_tree.lock); 2497 if (!em) 2498 return 1; 2499 2500 map = (struct map_lookup *)em->bdev; 2501 for (i = 0; i < map->num_stripes; i++) { 2502 if (!map->stripes[i].dev->writeable) { 2503 readonly = 1; 2504 break; 2505 } 2506 } 2507 free_extent_map(em); 2508 return readonly; 2509 } 2510 2511 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 2512 { 2513 extent_map_tree_init(&tree->map_tree, GFP_NOFS); 2514 } 2515 2516 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 2517 { 2518 struct extent_map *em; 2519 2520 while (1) { 2521 spin_lock(&tree->map_tree.lock); 2522 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 2523 if (em) 2524 remove_extent_mapping(&tree->map_tree, em); 2525 spin_unlock(&tree->map_tree.lock); 2526 if (!em) 2527 break; 2528 kfree(em->bdev); 2529 /* once for us */ 2530 free_extent_map(em); 2531 /* once for the tree */ 2532 free_extent_map(em); 2533 } 2534 } 2535 2536 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 2537 { 2538 struct extent_map *em; 2539 struct map_lookup *map; 2540 struct extent_map_tree *em_tree = &map_tree->map_tree; 2541 int ret; 2542 2543 spin_lock(&em_tree->lock); 2544 em = lookup_extent_mapping(em_tree, logical, len); 2545 spin_unlock(&em_tree->lock); 2546 BUG_ON(!em); 2547 2548 BUG_ON(em->start > logical || em->start + em->len < logical); 2549 map = (struct map_lookup *)em->bdev; 2550 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 2551 ret = map->num_stripes; 2552 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 2553 ret = map->sub_stripes; 2554 else 2555 ret = 1; 2556 free_extent_map(em); 2557 return ret; 2558 } 2559 2560 static int find_live_mirror(struct map_lookup *map, int first, int num, 2561 int optimal) 2562 { 2563 int i; 2564 if (map->stripes[optimal].dev->bdev) 2565 return optimal; 2566 for (i = first; i < first + num; i++) { 2567 if (map->stripes[i].dev->bdev) 2568 return i; 2569 } 2570 /* we couldn't find one that doesn't fail. 
Just return something 2571 * and the io error handling code will clean up eventually 2572 */ 2573 return optimal; 2574 } 2575 2576 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2577 u64 logical, u64 *length, 2578 struct btrfs_multi_bio **multi_ret, 2579 int mirror_num, struct page *unplug_page) 2580 { 2581 struct extent_map *em; 2582 struct map_lookup *map; 2583 struct extent_map_tree *em_tree = &map_tree->map_tree; 2584 u64 offset; 2585 u64 stripe_offset; 2586 u64 stripe_nr; 2587 int stripes_allocated = 8; 2588 int stripes_required = 1; 2589 int stripe_index; 2590 int i; 2591 int num_stripes; 2592 int max_errors = 0; 2593 struct btrfs_multi_bio *multi = NULL; 2594 2595 if (multi_ret && !(rw & (1 << BIO_RW))) 2596 stripes_allocated = 1; 2597 again: 2598 if (multi_ret) { 2599 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), 2600 GFP_NOFS); 2601 if (!multi) 2602 return -ENOMEM; 2603 2604 atomic_set(&multi->error, 0); 2605 } 2606 2607 spin_lock(&em_tree->lock); 2608 em = lookup_extent_mapping(em_tree, logical, *length); 2609 spin_unlock(&em_tree->lock); 2610 2611 if (!em && unplug_page) 2612 return 0; 2613 2614 if (!em) { 2615 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2616 (unsigned long long)logical, 2617 (unsigned long long)*length); 2618 BUG(); 2619 } 2620 2621 BUG_ON(em->start > logical || em->start + em->len < logical); 2622 map = (struct map_lookup *)em->bdev; 2623 offset = logical - em->start; 2624 2625 if (mirror_num > map->num_stripes) 2626 mirror_num = 0; 2627 2628 /* if our multi bio struct is too small, back off and try again */ 2629 if (rw & (1 << BIO_RW)) { 2630 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2631 BTRFS_BLOCK_GROUP_DUP)) { 2632 stripes_required = map->num_stripes; 2633 max_errors = 1; 2634 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2635 stripes_required = map->sub_stripes; 2636 max_errors = 1; 2637 } 2638 } 2639 if (multi_ret && (rw & (1 << BIO_RW)) && 2640 stripes_allocated < stripes_required) { 2641 stripes_allocated = map->num_stripes; 2642 free_extent_map(em); 2643 kfree(multi); 2644 goto again; 2645 } 2646 stripe_nr = offset; 2647 /* 2648 * stripe_nr counts the total number of stripes we have to stride 2649 * to get to this block 2650 */ 2651 do_div(stripe_nr, map->stripe_len); 2652 2653 stripe_offset = stripe_nr * map->stripe_len; 2654 BUG_ON(offset < stripe_offset); 2655 2656 /* stripe_offset is the offset of this block in its stripe*/ 2657 stripe_offset = offset - stripe_offset; 2658 2659 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 2660 BTRFS_BLOCK_GROUP_RAID10 | 2661 BTRFS_BLOCK_GROUP_DUP)) { 2662 /* we limit the length of each bio to what fits in a stripe */ 2663 *length = min_t(u64, em->len - offset, 2664 map->stripe_len - stripe_offset); 2665 } else { 2666 *length = em->len - offset; 2667 } 2668 2669 if (!multi_ret && !unplug_page) 2670 goto out; 2671 2672 num_stripes = 1; 2673 stripe_index = 0; 2674 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 2675 if (unplug_page || (rw & (1 << BIO_RW))) 2676 num_stripes = map->num_stripes; 2677 else if (mirror_num) 2678 stripe_index = mirror_num - 1; 2679 else { 2680 stripe_index = find_live_mirror(map, 0, 2681 map->num_stripes, 2682 current->pid % map->num_stripes); 2683 } 2684 2685 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 2686 if (rw & (1 << BIO_RW)) 2687 num_stripes = map->num_stripes; 2688 else if (mirror_num) 2689 stripe_index = mirror_num - 1; 2690 2691 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2692 int factor = 
map->num_stripes / map->sub_stripes; 2693 2694 stripe_index = do_div(stripe_nr, factor); 2695 stripe_index *= map->sub_stripes; 2696 2697 if (unplug_page || (rw & (1 << BIO_RW))) 2698 num_stripes = map->sub_stripes; 2699 else if (mirror_num) 2700 stripe_index += mirror_num - 1; 2701 else { 2702 stripe_index = find_live_mirror(map, stripe_index, 2703 map->sub_stripes, stripe_index + 2704 current->pid % map->sub_stripes); 2705 } 2706 } else { 2707 /* 2708 * after this do_div call, stripe_nr is the number of stripes 2709 * on this device we have to walk to find the data, and 2710 * stripe_index is the number of our device in the stripe array 2711 */ 2712 stripe_index = do_div(stripe_nr, map->num_stripes); 2713 } 2714 BUG_ON(stripe_index >= map->num_stripes); 2715 2716 for (i = 0; i < num_stripes; i++) { 2717 if (unplug_page) { 2718 struct btrfs_device *device; 2719 struct backing_dev_info *bdi; 2720 2721 device = map->stripes[stripe_index].dev; 2722 if (device->bdev) { 2723 bdi = blk_get_backing_dev_info(device->bdev); 2724 if (bdi->unplug_io_fn) 2725 bdi->unplug_io_fn(bdi, unplug_page); 2726 } 2727 } else { 2728 multi->stripes[i].physical = 2729 map->stripes[stripe_index].physical + 2730 stripe_offset + stripe_nr * map->stripe_len; 2731 multi->stripes[i].dev = map->stripes[stripe_index].dev; 2732 } 2733 stripe_index++; 2734 } 2735 if (multi_ret) { 2736 *multi_ret = multi; 2737 multi->num_stripes = num_stripes; 2738 multi->max_errors = max_errors; 2739 } 2740 out: 2741 free_extent_map(em); 2742 return 0; 2743 } 2744 2745 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2746 u64 logical, u64 *length, 2747 struct btrfs_multi_bio **multi_ret, int mirror_num) 2748 { 2749 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 2750 mirror_num, NULL); 2751 } 2752 2753 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 2754 u64 chunk_start, u64 physical, u64 devid, 2755 u64 **logical, int *naddrs, int *stripe_len) 2756 { 2757 struct extent_map_tree *em_tree = &map_tree->map_tree; 2758 struct extent_map *em; 2759 struct map_lookup *map; 2760 u64 *buf; 2761 u64 bytenr; 2762 u64 length; 2763 u64 stripe_nr; 2764 int i, j, nr = 0; 2765 2766 spin_lock(&em_tree->lock); 2767 em = lookup_extent_mapping(em_tree, chunk_start, 1); 2768 spin_unlock(&em_tree->lock); 2769 2770 BUG_ON(!em || em->start != chunk_start); 2771 map = (struct map_lookup *)em->bdev; 2772 2773 length = em->len; 2774 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 2775 do_div(length, map->num_stripes / map->sub_stripes); 2776 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 2777 do_div(length, map->num_stripes); 2778 2779 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 2780 BUG_ON(!buf); 2781 2782 for (i = 0; i < map->num_stripes; i++) { 2783 if (devid && map->stripes[i].dev->devid != devid) 2784 continue; 2785 if (map->stripes[i].physical > physical || 2786 map->stripes[i].physical + length <= physical) 2787 continue; 2788 2789 stripe_nr = physical - map->stripes[i].physical; 2790 do_div(stripe_nr, map->stripe_len); 2791 2792 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2793 stripe_nr = stripe_nr * map->num_stripes + i; 2794 do_div(stripe_nr, map->sub_stripes); 2795 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 2796 stripe_nr = stripe_nr * map->num_stripes + i; 2797 } 2798 bytenr = chunk_start + stripe_nr * map->stripe_len; 2799 WARN_ON(nr >= map->num_stripes); 2800 for (j = 0; j < nr; j++) { 2801 if (buf[j] == bytenr) 2802 break; 2803 } 2804 if (j == nr) { 2805 WARN_ON(nr >= map->num_stripes); 2806 
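		/*
		 * Worked example (added, hypothetical numbers): the math
		 * above reverses __btrfs_map_block().  For a RAID0 chunk
		 * with two stripes and a 64K stripe_len, a physical offset
		 * of 128K into stripe i = 1 gives stripe_nr = 2, so the
		 * logical address is chunk_start + (2 * 2 + 1) * 64K =
		 * chunk_start + 320K.  Layouts that mirror data (DUP/RAID1)
		 * can produce the same bytenr for several stripes, which is
		 * why the j loop above filters duplicates before the
		 * address is stored in buf[].
		 */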
buf[nr++] = bytenr; 2807 } 2808 } 2809 2810 *logical = buf; 2811 *naddrs = nr; 2812 *stripe_len = map->stripe_len; 2813 2814 free_extent_map(em); 2815 return 0; 2816 } 2817 2818 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, 2819 u64 logical, struct page *page) 2820 { 2821 u64 length = PAGE_CACHE_SIZE; 2822 return __btrfs_map_block(map_tree, READ, logical, &length, 2823 NULL, 0, page); 2824 } 2825 2826 static void end_bio_multi_stripe(struct bio *bio, int err) 2827 { 2828 struct btrfs_multi_bio *multi = bio->bi_private; 2829 int is_orig_bio = 0; 2830 2831 if (err) 2832 atomic_inc(&multi->error); 2833 2834 if (bio == multi->orig_bio) 2835 is_orig_bio = 1; 2836 2837 if (atomic_dec_and_test(&multi->stripes_pending)) { 2838 if (!is_orig_bio) { 2839 bio_put(bio); 2840 bio = multi->orig_bio; 2841 } 2842 bio->bi_private = multi->private; 2843 bio->bi_end_io = multi->end_io; 2844 /* only send an error to the higher layers if it is 2845 * beyond the tolerance of the multi-bio 2846 */ 2847 if (atomic_read(&multi->error) > multi->max_errors) { 2848 err = -EIO; 2849 } else if (err) { 2850 /* 2851 * this bio is actually up to date, we didn't 2852 * go over the max number of errors 2853 */ 2854 set_bit(BIO_UPTODATE, &bio->bi_flags); 2855 err = 0; 2856 } 2857 kfree(multi); 2858 2859 bio_endio(bio, err); 2860 } else if (!is_orig_bio) { 2861 bio_put(bio); 2862 } 2863 } 2864 2865 struct async_sched { 2866 struct bio *bio; 2867 int rw; 2868 struct btrfs_fs_info *info; 2869 struct btrfs_work work; 2870 }; 2871 2872 /* 2873 * see run_scheduled_bios for a description of why bios are collected for 2874 * async submit. 2875 * 2876 * This will add one bio to the pending list for a device and make sure 2877 * the work struct is scheduled. 2878 */ 2879 static noinline int schedule_bio(struct btrfs_root *root, 2880 struct btrfs_device *device, 2881 int rw, struct bio *bio) 2882 { 2883 int should_queue = 1; 2884 struct btrfs_pending_bios *pending_bios; 2885 2886 /* don't bother with additional async steps for reads, right now */ 2887 if (!(rw & (1 << BIO_RW))) { 2888 bio_get(bio); 2889 submit_bio(rw, bio); 2890 bio_put(bio); 2891 return 0; 2892 } 2893 2894 /* 2895 * nr_async_bios allows us to reliably return congestion to the 2896 * higher layers. 
Otherwise, the async bio makes it appear we have 2897 * made progress against dirty pages when we've really just put it 2898 * on a queue for later 2899 */ 2900 atomic_inc(&root->fs_info->nr_async_bios); 2901 WARN_ON(bio->bi_next); 2902 bio->bi_next = NULL; 2903 bio->bi_rw |= rw; 2904 2905 spin_lock(&device->io_lock); 2906 if (bio_sync(bio)) 2907 pending_bios = &device->pending_sync_bios; 2908 else 2909 pending_bios = &device->pending_bios; 2910 2911 if (pending_bios->tail) 2912 pending_bios->tail->bi_next = bio; 2913 2914 pending_bios->tail = bio; 2915 if (!pending_bios->head) 2916 pending_bios->head = bio; 2917 if (device->running_pending) 2918 should_queue = 0; 2919 2920 spin_unlock(&device->io_lock); 2921 2922 if (should_queue) 2923 btrfs_queue_worker(&root->fs_info->submit_workers, 2924 &device->work); 2925 return 0; 2926 } 2927 2928 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 2929 int mirror_num, int async_submit) 2930 { 2931 struct btrfs_mapping_tree *map_tree; 2932 struct btrfs_device *dev; 2933 struct bio *first_bio = bio; 2934 u64 logical = (u64)bio->bi_sector << 9; 2935 u64 length = 0; 2936 u64 map_length; 2937 struct btrfs_multi_bio *multi = NULL; 2938 int ret; 2939 int dev_nr = 0; 2940 int total_devs = 1; 2941 2942 length = bio->bi_size; 2943 map_tree = &root->fs_info->mapping_tree; 2944 map_length = length; 2945 2946 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, 2947 mirror_num); 2948 BUG_ON(ret); 2949 2950 total_devs = multi->num_stripes; 2951 if (map_length < length) { 2952 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 2953 "len %llu\n", (unsigned long long)logical, 2954 (unsigned long long)length, 2955 (unsigned long long)map_length); 2956 BUG(); 2957 } 2958 multi->end_io = first_bio->bi_end_io; 2959 multi->private = first_bio->bi_private; 2960 multi->orig_bio = first_bio; 2961 atomic_set(&multi->stripes_pending, multi->num_stripes); 2962 2963 while (dev_nr < total_devs) { 2964 if (total_devs > 1) { 2965 if (dev_nr < total_devs - 1) { 2966 bio = bio_clone(first_bio, GFP_NOFS); 2967 BUG_ON(!bio); 2968 } else { 2969 bio = first_bio; 2970 } 2971 bio->bi_private = multi; 2972 bio->bi_end_io = end_bio_multi_stripe; 2973 } 2974 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 2975 dev = multi->stripes[dev_nr].dev; 2976 BUG_ON(rw == WRITE && !dev->writeable); 2977 if (dev && dev->bdev) { 2978 bio->bi_bdev = dev->bdev; 2979 if (async_submit) 2980 schedule_bio(root, dev, rw, bio); 2981 else 2982 submit_bio(rw, bio); 2983 } else { 2984 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; 2985 bio->bi_sector = logical >> 9; 2986 bio_endio(bio, -EIO); 2987 } 2988 dev_nr++; 2989 } 2990 if (total_devs == 1) 2991 kfree(multi); 2992 return 0; 2993 } 2994 2995 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 2996 u8 *uuid, u8 *fsid) 2997 { 2998 struct btrfs_device *device; 2999 struct btrfs_fs_devices *cur_devices; 3000 3001 cur_devices = root->fs_info->fs_devices; 3002 while (cur_devices) { 3003 if (!fsid || 3004 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 3005 device = __find_device(&cur_devices->devices, 3006 devid, uuid); 3007 if (device) 3008 return device; 3009 } 3010 cur_devices = cur_devices->seed; 3011 } 3012 return NULL; 3013 } 3014 3015 static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 3016 u64 devid, u8 *dev_uuid) 3017 { 3018 struct btrfs_device *device; 3019 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 3020 3021 device = 
kzalloc(sizeof(*device), GFP_NOFS); 3022 if (!device) 3023 return NULL; 3024 list_add(&device->dev_list, 3025 &fs_devices->devices); 3026 device->barriers = 1; 3027 device->dev_root = root->fs_info->dev_root; 3028 device->devid = devid; 3029 device->work.func = pending_bios_fn; 3030 device->fs_devices = fs_devices; 3031 fs_devices->num_devices++; 3032 spin_lock_init(&device->io_lock); 3033 INIT_LIST_HEAD(&device->dev_alloc_list); 3034 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 3035 return device; 3036 } 3037 3038 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, 3039 struct extent_buffer *leaf, 3040 struct btrfs_chunk *chunk) 3041 { 3042 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 3043 struct map_lookup *map; 3044 struct extent_map *em; 3045 u64 logical; 3046 u64 length; 3047 u64 devid; 3048 u8 uuid[BTRFS_UUID_SIZE]; 3049 int num_stripes; 3050 int ret; 3051 int i; 3052 3053 logical = key->offset; 3054 length = btrfs_chunk_length(leaf, chunk); 3055 3056 spin_lock(&map_tree->map_tree.lock); 3057 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 3058 spin_unlock(&map_tree->map_tree.lock); 3059 3060 /* already mapped? */ 3061 if (em && em->start <= logical && em->start + em->len > logical) { 3062 free_extent_map(em); 3063 return 0; 3064 } else if (em) { 3065 free_extent_map(em); 3066 } 3067 3068 em = alloc_extent_map(GFP_NOFS); 3069 if (!em) 3070 return -ENOMEM; 3071 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3072 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3073 if (!map) { 3074 free_extent_map(em); 3075 return -ENOMEM; 3076 } 3077 3078 em->bdev = (struct block_device *)map; 3079 em->start = logical; 3080 em->len = length; 3081 em->block_start = 0; 3082 em->block_len = em->len; 3083 3084 map->num_stripes = num_stripes; 3085 map->io_width = btrfs_chunk_io_width(leaf, chunk); 3086 map->io_align = btrfs_chunk_io_align(leaf, chunk); 3087 map->sector_size = btrfs_chunk_sector_size(leaf, chunk); 3088 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 3089 map->type = btrfs_chunk_type(leaf, chunk); 3090 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 3091 for (i = 0; i < num_stripes; i++) { 3092 map->stripes[i].physical = 3093 btrfs_stripe_offset_nr(leaf, chunk, i); 3094 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 3095 read_extent_buffer(leaf, uuid, (unsigned long) 3096 btrfs_stripe_dev_uuid_nr(chunk, i), 3097 BTRFS_UUID_SIZE); 3098 map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 3099 NULL); 3100 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 3101 kfree(map); 3102 free_extent_map(em); 3103 return -EIO; 3104 } 3105 if (!map->stripes[i].dev) { 3106 map->stripes[i].dev = 3107 add_missing_dev(root, devid, uuid); 3108 if (!map->stripes[i].dev) { 3109 kfree(map); 3110 free_extent_map(em); 3111 return -EIO; 3112 } 3113 } 3114 map->stripes[i].dev->in_fs_metadata = 1; 3115 } 3116 3117 spin_lock(&map_tree->map_tree.lock); 3118 ret = add_extent_mapping(&map_tree->map_tree, em); 3119 spin_unlock(&map_tree->map_tree.lock); 3120 BUG_ON(ret); 3121 free_extent_map(em); 3122 3123 return 0; 3124 } 3125 3126 static int fill_device_from_item(struct extent_buffer *leaf, 3127 struct btrfs_dev_item *dev_item, 3128 struct btrfs_device *device) 3129 { 3130 unsigned long ptr; 3131 3132 device->devid = btrfs_device_id(leaf, dev_item); 3133 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 3134 device->total_bytes = device->disk_total_bytes; 3135 device->bytes_used = 
btrfs_device_bytes_used(leaf, dev_item); 3136 device->type = btrfs_device_type(leaf, dev_item); 3137 device->io_align = btrfs_device_io_align(leaf, dev_item); 3138 device->io_width = btrfs_device_io_width(leaf, dev_item); 3139 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 3140 3141 ptr = (unsigned long)btrfs_device_uuid(dev_item); 3142 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 3143 3144 return 0; 3145 } 3146 3147 static int open_seed_devices(struct btrfs_root *root, u8 *fsid) 3148 { 3149 struct btrfs_fs_devices *fs_devices; 3150 int ret; 3151 3152 mutex_lock(&uuid_mutex); 3153 3154 fs_devices = root->fs_info->fs_devices->seed; 3155 while (fs_devices) { 3156 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 3157 ret = 0; 3158 goto out; 3159 } 3160 fs_devices = fs_devices->seed; 3161 } 3162 3163 fs_devices = find_fsid(fsid); 3164 if (!fs_devices) { 3165 ret = -ENOENT; 3166 goto out; 3167 } 3168 3169 fs_devices = clone_fs_devices(fs_devices); 3170 if (IS_ERR(fs_devices)) { 3171 ret = PTR_ERR(fs_devices); 3172 goto out; 3173 } 3174 3175 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 3176 root->fs_info->bdev_holder); 3177 if (ret) 3178 goto out; 3179 3180 if (!fs_devices->seeding) { 3181 __btrfs_close_devices(fs_devices); 3182 free_fs_devices(fs_devices); 3183 ret = -EINVAL; 3184 goto out; 3185 } 3186 3187 fs_devices->seed = root->fs_info->fs_devices->seed; 3188 root->fs_info->fs_devices->seed = fs_devices; 3189 out: 3190 mutex_unlock(&uuid_mutex); 3191 return ret; 3192 } 3193 3194 static int read_one_dev(struct btrfs_root *root, 3195 struct extent_buffer *leaf, 3196 struct btrfs_dev_item *dev_item) 3197 { 3198 struct btrfs_device *device; 3199 u64 devid; 3200 int ret; 3201 u8 fs_uuid[BTRFS_UUID_SIZE]; 3202 u8 dev_uuid[BTRFS_UUID_SIZE]; 3203 3204 devid = btrfs_device_id(leaf, dev_item); 3205 read_extent_buffer(leaf, dev_uuid, 3206 (unsigned long)btrfs_device_uuid(dev_item), 3207 BTRFS_UUID_SIZE); 3208 read_extent_buffer(leaf, fs_uuid, 3209 (unsigned long)btrfs_device_fsid(dev_item), 3210 BTRFS_UUID_SIZE); 3211 3212 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 3213 ret = open_seed_devices(root, fs_uuid); 3214 if (ret && !btrfs_test_opt(root, DEGRADED)) 3215 return ret; 3216 } 3217 3218 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 3219 if (!device || !device->bdev) { 3220 if (!btrfs_test_opt(root, DEGRADED)) 3221 return -EIO; 3222 3223 if (!device) { 3224 printk(KERN_WARNING "warning devid %llu missing\n", 3225 (unsigned long long)devid); 3226 device = add_missing_dev(root, devid, dev_uuid); 3227 if (!device) 3228 return -ENOMEM; 3229 } 3230 } 3231 3232 if (device->fs_devices != root->fs_info->fs_devices) { 3233 BUG_ON(device->writeable); 3234 if (device->generation != 3235 btrfs_device_generation(leaf, dev_item)) 3236 return -EINVAL; 3237 } 3238 3239 fill_device_from_item(leaf, dev_item, device); 3240 device->dev_root = root->fs_info->dev_root; 3241 device->in_fs_metadata = 1; 3242 if (device->writeable) 3243 device->fs_devices->total_rw_bytes += device->total_bytes; 3244 ret = 0; 3245 return ret; 3246 } 3247 3248 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf) 3249 { 3250 struct btrfs_dev_item *dev_item; 3251 3252 dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block, 3253 dev_item); 3254 return read_one_dev(root, buf, dev_item); 3255 } 3256 3257 int btrfs_read_sys_array(struct btrfs_root *root) 3258 { 3259 struct btrfs_super_block *super_copy = 
		&root->fs_info->super_copy;
	struct extent_buffer *sb;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	unsigned long sb_ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
					  BTRFS_SUPER_INFO_SIZE);
	if (!sb)
		return -ENOMEM;
	btrfs_set_buffer_uptodate(sb);
	btrfs_set_buffer_lockdep_class(sb, 0);

	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key); ptr += len;
		sb_ptr += len;
		cur += len;

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)sb_ptr;
			ret = read_one_chunk(root, &key, sb, chunk);
			if (ret)
				break;
			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
			len = btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		ptr += len;
		sb_ptr += len;
		cur += len;
	}
	free_extent_buffer(sb);
	return ret;
}

int btrfs_read_chunk_tree(struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* first we search for all of the device items, and then we
	 * read in all of the chunk items.  This way we can create chunk
	 * mappings that reference all of the devices that are found
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
again:
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
			if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
				break;
			if (found_key.type == BTRFS_DEV_ITEM_KEY) {
				struct btrfs_dev_item *dev_item;
				dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
				ret = read_one_dev(root, leaf, dev_item);
				if (ret)
					goto error;
			}
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(root, &found_key, leaf, chunk);
			if (ret)
				goto error;
		}
		path->slots[0]++;
	}
	if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
		key.objectid = 0;
		btrfs_release_path(root, path);
		goto again;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}
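
/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * how a caller might resolve a logical address to its physical stripes with
 * btrfs_map_block().  The helper below is hypothetical and compiled out;
 * see btrfs_map_bio() above for the real submission path.
 */
#if 0
static int example_print_stripes(struct btrfs_mapping_tree *map_tree,
				 u64 logical, u64 len)
{
	struct btrfs_multi_bio *multi = NULL;
	u64 map_length = len;
	int ret;
	int i;

	/* reads get a single mirror back, writes get every stripe */
	ret = btrfs_map_block(map_tree, READ, logical, &map_length, &multi, 0);
	if (ret)
		return ret;

	for (i = 0; i < multi->num_stripes; i++)
		printk(KERN_INFO "devid %llu physical %llu len %llu\n",
		       (unsigned long long)multi->stripes[i].dev->devid,
		       (unsigned long long)multi->stripes[i].physical,
		       (unsigned long long)map_length);

	kfree(multi);
	return 0;
}
#endif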