/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);

static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

static void lock_chunks(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
	mutex_unlock(&root->fs_info->chunk_mutex);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		kfree(device->name);
		kfree(device);
	}
	kfree(fs_devices);
}

int btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, list);
		list_del(&fs_devices->list);
		free_fs_devices(fs_devices);
	}
	return 0;
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
						   u64 devid, u8 *uuid)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

static void requeue_list(struct btrfs_pending_bios *pending_bios,
			 struct bio *head, struct bio *tail)
{

	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
124 * 125 * But, it also turns into a long list of bios to process and that is sure 126 * to eventually make the worker thread block. The solution here is to 127 * make some progress and then put this work struct back at the end of 128 * the list if the block device is congested. This way, multiple devices 129 * can make progress from a single worker thread. 130 */ 131 static noinline int run_scheduled_bios(struct btrfs_device *device) 132 { 133 struct bio *pending; 134 struct backing_dev_info *bdi; 135 struct btrfs_fs_info *fs_info; 136 struct btrfs_pending_bios *pending_bios; 137 struct bio *tail; 138 struct bio *cur; 139 int again = 0; 140 unsigned long num_run; 141 unsigned long batch_run = 0; 142 unsigned long limit; 143 unsigned long last_waited = 0; 144 int force_reg = 0; 145 struct blk_plug plug; 146 147 /* 148 * this function runs all the bios we've collected for 149 * a particular device. We don't want to wander off to 150 * another device without first sending all of these down. 151 * So, setup a plug here and finish it off before we return 152 */ 153 blk_start_plug(&plug); 154 155 bdi = blk_get_backing_dev_info(device->bdev); 156 fs_info = device->dev_root->fs_info; 157 limit = btrfs_async_submit_limit(fs_info); 158 limit = limit * 2 / 3; 159 160 loop: 161 spin_lock(&device->io_lock); 162 163 loop_lock: 164 num_run = 0; 165 166 /* take all the bios off the list at once and process them 167 * later on (without the lock held). But, remember the 168 * tail and other pointers so the bios can be properly reinserted 169 * into the list if we hit congestion 170 */ 171 if (!force_reg && device->pending_sync_bios.head) { 172 pending_bios = &device->pending_sync_bios; 173 force_reg = 1; 174 } else { 175 pending_bios = &device->pending_bios; 176 force_reg = 0; 177 } 178 179 pending = pending_bios->head; 180 tail = pending_bios->tail; 181 WARN_ON(pending && !tail); 182 183 /* 184 * if pending was null this time around, no bios need processing 185 * at all and we can stop. Otherwise it'll loop back up again 186 * and do an additional check so no bios are missed. 187 * 188 * device->running_pending is used to synchronize with the 189 * schedule_bio code. 190 */ 191 if (device->pending_sync_bios.head == NULL && 192 device->pending_bios.head == NULL) { 193 again = 0; 194 device->running_pending = 0; 195 } else { 196 again = 1; 197 device->running_pending = 1; 198 } 199 200 pending_bios->head = NULL; 201 pending_bios->tail = NULL; 202 203 spin_unlock(&device->io_lock); 204 205 while (pending) { 206 207 rmb(); 208 /* we want to work on both lists, but do more bios on the 209 * sync list than the regular list 210 */ 211 if ((num_run > 32 && 212 pending_bios != &device->pending_sync_bios && 213 device->pending_sync_bios.head) || 214 (num_run > 64 && pending_bios == &device->pending_sync_bios && 215 device->pending_bios.head)) { 216 spin_lock(&device->io_lock); 217 requeue_list(pending_bios, pending, tail); 218 goto loop_lock; 219 } 220 221 cur = pending; 222 pending = pending->bi_next; 223 cur->bi_next = NULL; 224 atomic_dec(&fs_info->nr_async_bios); 225 226 if (atomic_read(&fs_info->nr_async_bios) < limit && 227 waitqueue_active(&fs_info->async_submit_wait)) 228 wake_up(&fs_info->async_submit_wait); 229 230 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 231 232 submit_bio(cur->bi_rw, cur); 233 num_run++; 234 batch_run++; 235 if (need_resched()) 236 cond_resched(); 237 238 /* 239 * we made progress, there is more work to do and the bdi 240 * is now congested. 
Back off and let other work structs 241 * run instead 242 */ 243 if (pending && bdi_write_congested(bdi) && batch_run > 8 && 244 fs_info->fs_devices->open_devices > 1) { 245 struct io_context *ioc; 246 247 ioc = current->io_context; 248 249 /* 250 * the main goal here is that we don't want to 251 * block if we're going to be able to submit 252 * more requests without blocking. 253 * 254 * This code does two great things, it pokes into 255 * the elevator code from a filesystem _and_ 256 * it makes assumptions about how batching works. 257 */ 258 if (ioc && ioc->nr_batch_requests > 0 && 259 time_before(jiffies, ioc->last_waited + HZ/50UL) && 260 (last_waited == 0 || 261 ioc->last_waited == last_waited)) { 262 /* 263 * we want to go through our batch of 264 * requests and stop. So, we copy out 265 * the ioc->last_waited time and test 266 * against it before looping 267 */ 268 last_waited = ioc->last_waited; 269 if (need_resched()) 270 cond_resched(); 271 continue; 272 } 273 spin_lock(&device->io_lock); 274 requeue_list(pending_bios, pending, tail); 275 device->running_pending = 1; 276 277 spin_unlock(&device->io_lock); 278 btrfs_requeue_work(&device->work); 279 goto done; 280 } 281 } 282 283 cond_resched(); 284 if (again) 285 goto loop; 286 287 spin_lock(&device->io_lock); 288 if (device->pending_bios.head || device->pending_sync_bios.head) 289 goto loop_lock; 290 spin_unlock(&device->io_lock); 291 292 done: 293 blk_finish_plug(&plug); 294 return 0; 295 } 296 297 static void pending_bios_fn(struct btrfs_work *work) 298 { 299 struct btrfs_device *device; 300 301 device = container_of(work, struct btrfs_device, work); 302 run_scheduled_bios(device); 303 } 304 305 static noinline int device_list_add(const char *path, 306 struct btrfs_super_block *disk_super, 307 u64 devid, struct btrfs_fs_devices **fs_devices_ret) 308 { 309 struct btrfs_device *device; 310 struct btrfs_fs_devices *fs_devices; 311 u64 found_transid = btrfs_super_generation(disk_super); 312 char *name; 313 314 fs_devices = find_fsid(disk_super->fsid); 315 if (!fs_devices) { 316 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 317 if (!fs_devices) 318 return -ENOMEM; 319 INIT_LIST_HEAD(&fs_devices->devices); 320 INIT_LIST_HEAD(&fs_devices->alloc_list); 321 list_add(&fs_devices->list, &fs_uuids); 322 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 323 fs_devices->latest_devid = devid; 324 fs_devices->latest_trans = found_transid; 325 mutex_init(&fs_devices->device_list_mutex); 326 device = NULL; 327 } else { 328 device = __find_device(&fs_devices->devices, devid, 329 disk_super->dev_item.uuid); 330 } 331 if (!device) { 332 if (fs_devices->opened) 333 return -EBUSY; 334 335 device = kzalloc(sizeof(*device), GFP_NOFS); 336 if (!device) { 337 /* we can safely leave the fs_devices entry around */ 338 return -ENOMEM; 339 } 340 device->devid = devid; 341 device->work.func = pending_bios_fn; 342 memcpy(device->uuid, disk_super->dev_item.uuid, 343 BTRFS_UUID_SIZE); 344 spin_lock_init(&device->io_lock); 345 device->name = kstrdup(path, GFP_NOFS); 346 if (!device->name) { 347 kfree(device); 348 return -ENOMEM; 349 } 350 INIT_LIST_HEAD(&device->dev_alloc_list); 351 352 mutex_lock(&fs_devices->device_list_mutex); 353 list_add_rcu(&device->dev_list, &fs_devices->devices); 354 mutex_unlock(&fs_devices->device_list_mutex); 355 356 device->fs_devices = fs_devices; 357 fs_devices->num_devices++; 358 } else if (!device->name || strcmp(device->name, path)) { 359 name = kstrdup(path, GFP_NOFS); 360 if (!name) 361 return -ENOMEM; 362 
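		/*
		 * Same devid/uuid seen again under a different path (the
		 * device node may have been renamed), so swap in the new
		 * name below and, if the device had been flagged missing,
		 * mark it present again.
		 */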
kfree(device->name); 363 device->name = name; 364 if (device->missing) { 365 fs_devices->missing_devices--; 366 device->missing = 0; 367 } 368 } 369 370 if (found_transid > fs_devices->latest_trans) { 371 fs_devices->latest_devid = devid; 372 fs_devices->latest_trans = found_transid; 373 } 374 *fs_devices_ret = fs_devices; 375 return 0; 376 } 377 378 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 379 { 380 struct btrfs_fs_devices *fs_devices; 381 struct btrfs_device *device; 382 struct btrfs_device *orig_dev; 383 384 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 385 if (!fs_devices) 386 return ERR_PTR(-ENOMEM); 387 388 INIT_LIST_HEAD(&fs_devices->devices); 389 INIT_LIST_HEAD(&fs_devices->alloc_list); 390 INIT_LIST_HEAD(&fs_devices->list); 391 mutex_init(&fs_devices->device_list_mutex); 392 fs_devices->latest_devid = orig->latest_devid; 393 fs_devices->latest_trans = orig->latest_trans; 394 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 395 396 /* We have held the volume lock, it is safe to get the devices. */ 397 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 398 device = kzalloc(sizeof(*device), GFP_NOFS); 399 if (!device) 400 goto error; 401 402 device->name = kstrdup(orig_dev->name, GFP_NOFS); 403 if (!device->name) { 404 kfree(device); 405 goto error; 406 } 407 408 device->devid = orig_dev->devid; 409 device->work.func = pending_bios_fn; 410 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 411 spin_lock_init(&device->io_lock); 412 INIT_LIST_HEAD(&device->dev_list); 413 INIT_LIST_HEAD(&device->dev_alloc_list); 414 415 list_add(&device->dev_list, &fs_devices->devices); 416 device->fs_devices = fs_devices; 417 fs_devices->num_devices++; 418 } 419 return fs_devices; 420 error: 421 free_fs_devices(fs_devices); 422 return ERR_PTR(-ENOMEM); 423 } 424 425 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 426 { 427 struct btrfs_device *device, *next; 428 429 mutex_lock(&uuid_mutex); 430 again: 431 /* This is the initialized path, it is safe to release the devices. 
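	 * Anything that was scanned but is not referenced by the filesystem
	 * metadata (in_fs_metadata == 0) gets closed and dropped from the
	 * list below; only devices that actually belong to the mounted
	 * filesystem stay open.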
*/ 432 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 433 if (device->in_fs_metadata) 434 continue; 435 436 if (device->bdev) { 437 blkdev_put(device->bdev, device->mode); 438 device->bdev = NULL; 439 fs_devices->open_devices--; 440 } 441 if (device->writeable) { 442 list_del_init(&device->dev_alloc_list); 443 device->writeable = 0; 444 fs_devices->rw_devices--; 445 } 446 list_del_init(&device->dev_list); 447 fs_devices->num_devices--; 448 kfree(device->name); 449 kfree(device); 450 } 451 452 if (fs_devices->seed) { 453 fs_devices = fs_devices->seed; 454 goto again; 455 } 456 457 mutex_unlock(&uuid_mutex); 458 return 0; 459 } 460 461 static void __free_device(struct work_struct *work) 462 { 463 struct btrfs_device *device; 464 465 device = container_of(work, struct btrfs_device, rcu_work); 466 467 if (device->bdev) 468 blkdev_put(device->bdev, device->mode); 469 470 kfree(device->name); 471 kfree(device); 472 } 473 474 static void free_device(struct rcu_head *head) 475 { 476 struct btrfs_device *device; 477 478 device = container_of(head, struct btrfs_device, rcu); 479 480 INIT_WORK(&device->rcu_work, __free_device); 481 schedule_work(&device->rcu_work); 482 } 483 484 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 485 { 486 struct btrfs_device *device; 487 488 if (--fs_devices->opened > 0) 489 return 0; 490 491 mutex_lock(&fs_devices->device_list_mutex); 492 list_for_each_entry(device, &fs_devices->devices, dev_list) { 493 struct btrfs_device *new_device; 494 495 if (device->bdev) 496 fs_devices->open_devices--; 497 498 if (device->writeable) { 499 list_del_init(&device->dev_alloc_list); 500 fs_devices->rw_devices--; 501 } 502 503 new_device = kmalloc(sizeof(*new_device), GFP_NOFS); 504 BUG_ON(!new_device); 505 memcpy(new_device, device, sizeof(*new_device)); 506 new_device->name = kstrdup(device->name, GFP_NOFS); 507 BUG_ON(device->name && !new_device->name); 508 new_device->bdev = NULL; 509 new_device->writeable = 0; 510 new_device->in_fs_metadata = 0; 511 list_replace_rcu(&device->dev_list, &new_device->dev_list); 512 513 call_rcu(&device->rcu, free_device); 514 } 515 mutex_unlock(&fs_devices->device_list_mutex); 516 517 WARN_ON(fs_devices->open_devices); 518 WARN_ON(fs_devices->rw_devices); 519 fs_devices->opened = 0; 520 fs_devices->seeding = 0; 521 522 return 0; 523 } 524 525 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 526 { 527 struct btrfs_fs_devices *seed_devices = NULL; 528 int ret; 529 530 mutex_lock(&uuid_mutex); 531 ret = __btrfs_close_devices(fs_devices); 532 if (!fs_devices->opened) { 533 seed_devices = fs_devices->seed; 534 fs_devices->seed = NULL; 535 } 536 mutex_unlock(&uuid_mutex); 537 538 while (seed_devices) { 539 fs_devices = seed_devices; 540 seed_devices = fs_devices->seed; 541 __btrfs_close_devices(fs_devices); 542 free_fs_devices(fs_devices); 543 } 544 return ret; 545 } 546 547 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 548 fmode_t flags, void *holder) 549 { 550 struct block_device *bdev; 551 struct list_head *head = &fs_devices->devices; 552 struct btrfs_device *device; 553 struct block_device *latest_bdev = NULL; 554 struct buffer_head *bh; 555 struct btrfs_super_block *disk_super; 556 u64 latest_devid = 0; 557 u64 latest_transid = 0; 558 u64 devid; 559 int seeding = 1; 560 int ret = 0; 561 562 flags |= FMODE_EXCL; 563 564 list_for_each_entry(device, head, dev_list) { 565 if (device->bdev) 566 continue; 567 if (!device->name) 568 continue; 569 570 bdev = 
blkdev_get_by_path(device->name, flags, holder); 571 if (IS_ERR(bdev)) { 572 printk(KERN_INFO "open %s failed\n", device->name); 573 goto error; 574 } 575 set_blocksize(bdev, 4096); 576 577 bh = btrfs_read_dev_super(bdev); 578 if (!bh) { 579 ret = -EINVAL; 580 goto error_close; 581 } 582 583 disk_super = (struct btrfs_super_block *)bh->b_data; 584 devid = btrfs_stack_device_id(&disk_super->dev_item); 585 if (devid != device->devid) 586 goto error_brelse; 587 588 if (memcmp(device->uuid, disk_super->dev_item.uuid, 589 BTRFS_UUID_SIZE)) 590 goto error_brelse; 591 592 device->generation = btrfs_super_generation(disk_super); 593 if (!latest_transid || device->generation > latest_transid) { 594 latest_devid = devid; 595 latest_transid = device->generation; 596 latest_bdev = bdev; 597 } 598 599 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 600 device->writeable = 0; 601 } else { 602 device->writeable = !bdev_read_only(bdev); 603 seeding = 0; 604 } 605 606 device->bdev = bdev; 607 device->in_fs_metadata = 0; 608 device->mode = flags; 609 610 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 611 fs_devices->rotating = 1; 612 613 fs_devices->open_devices++; 614 if (device->writeable) { 615 fs_devices->rw_devices++; 616 list_add(&device->dev_alloc_list, 617 &fs_devices->alloc_list); 618 } 619 brelse(bh); 620 continue; 621 622 error_brelse: 623 brelse(bh); 624 error_close: 625 blkdev_put(bdev, flags); 626 error: 627 continue; 628 } 629 if (fs_devices->open_devices == 0) { 630 ret = -EIO; 631 goto out; 632 } 633 fs_devices->seeding = seeding; 634 fs_devices->opened = 1; 635 fs_devices->latest_bdev = latest_bdev; 636 fs_devices->latest_devid = latest_devid; 637 fs_devices->latest_trans = latest_transid; 638 fs_devices->total_rw_bytes = 0; 639 out: 640 return ret; 641 } 642 643 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 644 fmode_t flags, void *holder) 645 { 646 int ret; 647 648 mutex_lock(&uuid_mutex); 649 if (fs_devices->opened) { 650 fs_devices->opened++; 651 ret = 0; 652 } else { 653 ret = __btrfs_open_devices(fs_devices, flags, holder); 654 } 655 mutex_unlock(&uuid_mutex); 656 return ret; 657 } 658 659 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 660 struct btrfs_fs_devices **fs_devices_ret) 661 { 662 struct btrfs_super_block *disk_super; 663 struct block_device *bdev; 664 struct buffer_head *bh; 665 int ret; 666 u64 devid; 667 u64 transid; 668 669 mutex_lock(&uuid_mutex); 670 671 flags |= FMODE_EXCL; 672 bdev = blkdev_get_by_path(path, flags, holder); 673 674 if (IS_ERR(bdev)) { 675 ret = PTR_ERR(bdev); 676 goto error; 677 } 678 679 ret = set_blocksize(bdev, 4096); 680 if (ret) 681 goto error_close; 682 bh = btrfs_read_dev_super(bdev); 683 if (!bh) { 684 ret = -EINVAL; 685 goto error_close; 686 } 687 disk_super = (struct btrfs_super_block *)bh->b_data; 688 devid = btrfs_stack_device_id(&disk_super->dev_item); 689 transid = btrfs_super_generation(disk_super); 690 if (disk_super->label[0]) 691 printk(KERN_INFO "device label %s ", disk_super->label); 692 else 693 printk(KERN_INFO "device fsid %pU ", disk_super->fsid); 694 printk(KERN_CONT "devid %llu transid %llu %s\n", 695 (unsigned long long)devid, (unsigned long long)transid, path); 696 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 697 698 brelse(bh); 699 error_close: 700 blkdev_put(bdev, flags); 701 error: 702 mutex_unlock(&uuid_mutex); 703 return ret; 704 } 705 706 /* helper to account the used device space in the range */ 707 int btrfs_account_dev_extents_size(struct 
btrfs_device *device, u64 start, 708 u64 end, u64 *length) 709 { 710 struct btrfs_key key; 711 struct btrfs_root *root = device->dev_root; 712 struct btrfs_dev_extent *dev_extent; 713 struct btrfs_path *path; 714 u64 extent_end; 715 int ret; 716 int slot; 717 struct extent_buffer *l; 718 719 *length = 0; 720 721 if (start >= device->total_bytes) 722 return 0; 723 724 path = btrfs_alloc_path(); 725 if (!path) 726 return -ENOMEM; 727 path->reada = 2; 728 729 key.objectid = device->devid; 730 key.offset = start; 731 key.type = BTRFS_DEV_EXTENT_KEY; 732 733 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 734 if (ret < 0) 735 goto out; 736 if (ret > 0) { 737 ret = btrfs_previous_item(root, path, key.objectid, key.type); 738 if (ret < 0) 739 goto out; 740 } 741 742 while (1) { 743 l = path->nodes[0]; 744 slot = path->slots[0]; 745 if (slot >= btrfs_header_nritems(l)) { 746 ret = btrfs_next_leaf(root, path); 747 if (ret == 0) 748 continue; 749 if (ret < 0) 750 goto out; 751 752 break; 753 } 754 btrfs_item_key_to_cpu(l, &key, slot); 755 756 if (key.objectid < device->devid) 757 goto next; 758 759 if (key.objectid > device->devid) 760 break; 761 762 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 763 goto next; 764 765 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 766 extent_end = key.offset + btrfs_dev_extent_length(l, 767 dev_extent); 768 if (key.offset <= start && extent_end > end) { 769 *length = end - start + 1; 770 break; 771 } else if (key.offset <= start && extent_end > start) 772 *length += extent_end - start; 773 else if (key.offset > start && extent_end <= end) 774 *length += extent_end - key.offset; 775 else if (key.offset > start && key.offset <= end) { 776 *length += end - key.offset + 1; 777 break; 778 } else if (key.offset > end) 779 break; 780 781 next: 782 path->slots[0]++; 783 } 784 ret = 0; 785 out: 786 btrfs_free_path(path); 787 return ret; 788 } 789 790 /* 791 * find_free_dev_extent - find free space in the specified device 792 * @trans: transaction handler 793 * @device: the device which we search the free space in 794 * @num_bytes: the size of the free space that we need 795 * @start: store the start of the free space. 796 * @len: the size of the free space. that we find, or the size of the max 797 * free space if we don't find suitable free space 798 * 799 * this uses a pretty simple search, the expectation is that it is 800 * called very infrequently and that a given device has a small number 801 * of extents 802 * 803 * @start is used to store the start of the free space if we find. But if we 804 * don't find suitable free space, it will be used to store the start position 805 * of the max free space. 806 * 807 * @len is used to store the size of the free space that we find. 808 * But if we don't find suitable free space, it is used to store the size of 809 * the max free space. 
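 *
 * Returns 0 when a hole of at least @num_bytes was found, -ENOSPC when no
 * hole is large enough (other negative errnos on btree search errors).
 * Illustrative example: with dev extents at [1MB, 5MB) and [9MB, 13MB) on a
 * 20MB device and @num_bytes = 4MB, the 4MB hole starting at 5MB satisfies
 * the request and we return with *start = 5MB and *len = 4MB (the search
 * itself never starts below the reserved first 1MB).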
810 */ 811 int find_free_dev_extent(struct btrfs_trans_handle *trans, 812 struct btrfs_device *device, u64 num_bytes, 813 u64 *start, u64 *len) 814 { 815 struct btrfs_key key; 816 struct btrfs_root *root = device->dev_root; 817 struct btrfs_dev_extent *dev_extent; 818 struct btrfs_path *path; 819 u64 hole_size; 820 u64 max_hole_start; 821 u64 max_hole_size; 822 u64 extent_end; 823 u64 search_start; 824 u64 search_end = device->total_bytes; 825 int ret; 826 int slot; 827 struct extent_buffer *l; 828 829 /* FIXME use last free of some kind */ 830 831 /* we don't want to overwrite the superblock on the drive, 832 * so we make sure to start at an offset of at least 1MB 833 */ 834 search_start = max(root->fs_info->alloc_start, 1024ull * 1024); 835 836 max_hole_start = search_start; 837 max_hole_size = 0; 838 839 if (search_start >= search_end) { 840 ret = -ENOSPC; 841 goto error; 842 } 843 844 path = btrfs_alloc_path(); 845 if (!path) { 846 ret = -ENOMEM; 847 goto error; 848 } 849 path->reada = 2; 850 851 key.objectid = device->devid; 852 key.offset = search_start; 853 key.type = BTRFS_DEV_EXTENT_KEY; 854 855 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 856 if (ret < 0) 857 goto out; 858 if (ret > 0) { 859 ret = btrfs_previous_item(root, path, key.objectid, key.type); 860 if (ret < 0) 861 goto out; 862 } 863 864 while (1) { 865 l = path->nodes[0]; 866 slot = path->slots[0]; 867 if (slot >= btrfs_header_nritems(l)) { 868 ret = btrfs_next_leaf(root, path); 869 if (ret == 0) 870 continue; 871 if (ret < 0) 872 goto out; 873 874 break; 875 } 876 btrfs_item_key_to_cpu(l, &key, slot); 877 878 if (key.objectid < device->devid) 879 goto next; 880 881 if (key.objectid > device->devid) 882 break; 883 884 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 885 goto next; 886 887 if (key.offset > search_start) { 888 hole_size = key.offset - search_start; 889 890 if (hole_size > max_hole_size) { 891 max_hole_start = search_start; 892 max_hole_size = hole_size; 893 } 894 895 /* 896 * If this free space is greater than which we need, 897 * it must be the max free space that we have found 898 * until now, so max_hole_start must point to the start 899 * of this free space and the length of this free space 900 * is stored in max_hole_size. Thus, we return 901 * max_hole_start and max_hole_size and go back to the 902 * caller. 903 */ 904 if (hole_size >= num_bytes) { 905 ret = 0; 906 goto out; 907 } 908 } 909 910 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 911 extent_end = key.offset + btrfs_dev_extent_length(l, 912 dev_extent); 913 if (extent_end > search_start) 914 search_start = extent_end; 915 next: 916 path->slots[0]++; 917 cond_resched(); 918 } 919 920 hole_size = search_end- search_start; 921 if (hole_size > max_hole_size) { 922 max_hole_start = search_start; 923 max_hole_size = hole_size; 924 } 925 926 /* See above. 
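	 * (If an earlier hole had already been big enough we would have
	 * returned through the "goto out" above, so at this point only the
	 * trailing hole just computed can still satisfy the request.)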
*/ 927 if (hole_size < num_bytes) 928 ret = -ENOSPC; 929 else 930 ret = 0; 931 932 out: 933 btrfs_free_path(path); 934 error: 935 *start = max_hole_start; 936 if (len) 937 *len = max_hole_size; 938 return ret; 939 } 940 941 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 942 struct btrfs_device *device, 943 u64 start) 944 { 945 int ret; 946 struct btrfs_path *path; 947 struct btrfs_root *root = device->dev_root; 948 struct btrfs_key key; 949 struct btrfs_key found_key; 950 struct extent_buffer *leaf = NULL; 951 struct btrfs_dev_extent *extent = NULL; 952 953 path = btrfs_alloc_path(); 954 if (!path) 955 return -ENOMEM; 956 957 key.objectid = device->devid; 958 key.offset = start; 959 key.type = BTRFS_DEV_EXTENT_KEY; 960 961 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 962 if (ret > 0) { 963 ret = btrfs_previous_item(root, path, key.objectid, 964 BTRFS_DEV_EXTENT_KEY); 965 if (ret) 966 goto out; 967 leaf = path->nodes[0]; 968 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 969 extent = btrfs_item_ptr(leaf, path->slots[0], 970 struct btrfs_dev_extent); 971 BUG_ON(found_key.offset > start || found_key.offset + 972 btrfs_dev_extent_length(leaf, extent) < start); 973 } else if (ret == 0) { 974 leaf = path->nodes[0]; 975 extent = btrfs_item_ptr(leaf, path->slots[0], 976 struct btrfs_dev_extent); 977 } 978 BUG_ON(ret); 979 980 if (device->bytes_used > 0) 981 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 982 ret = btrfs_del_item(trans, root, path); 983 984 out: 985 btrfs_free_path(path); 986 return ret; 987 } 988 989 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 990 struct btrfs_device *device, 991 u64 chunk_tree, u64 chunk_objectid, 992 u64 chunk_offset, u64 start, u64 num_bytes) 993 { 994 int ret; 995 struct btrfs_path *path; 996 struct btrfs_root *root = device->dev_root; 997 struct btrfs_dev_extent *extent; 998 struct extent_buffer *leaf; 999 struct btrfs_key key; 1000 1001 WARN_ON(!device->in_fs_metadata); 1002 path = btrfs_alloc_path(); 1003 if (!path) 1004 return -ENOMEM; 1005 1006 key.objectid = device->devid; 1007 key.offset = start; 1008 key.type = BTRFS_DEV_EXTENT_KEY; 1009 ret = btrfs_insert_empty_item(trans, root, path, &key, 1010 sizeof(*extent)); 1011 BUG_ON(ret); 1012 1013 leaf = path->nodes[0]; 1014 extent = btrfs_item_ptr(leaf, path->slots[0], 1015 struct btrfs_dev_extent); 1016 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); 1017 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); 1018 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 1019 1020 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, 1021 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), 1022 BTRFS_UUID_SIZE); 1023 1024 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 1025 btrfs_mark_buffer_dirty(leaf); 1026 btrfs_free_path(path); 1027 return ret; 1028 } 1029 1030 static noinline int find_next_chunk(struct btrfs_root *root, 1031 u64 objectid, u64 *offset) 1032 { 1033 struct btrfs_path *path; 1034 int ret; 1035 struct btrfs_key key; 1036 struct btrfs_chunk *chunk; 1037 struct btrfs_key found_key; 1038 1039 path = btrfs_alloc_path(); 1040 BUG_ON(!path); 1041 1042 key.objectid = objectid; 1043 key.offset = (u64)-1; 1044 key.type = BTRFS_CHUNK_ITEM_KEY; 1045 1046 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1047 if (ret < 0) 1048 goto error; 1049 1050 BUG_ON(ret == 0); 1051 1052 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); 1053 if (ret) { 1054 *offset = 0; 1055 } else 
{ 1056 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1057 path->slots[0]); 1058 if (found_key.objectid != objectid) 1059 *offset = 0; 1060 else { 1061 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], 1062 struct btrfs_chunk); 1063 *offset = found_key.offset + 1064 btrfs_chunk_length(path->nodes[0], chunk); 1065 } 1066 } 1067 ret = 0; 1068 error: 1069 btrfs_free_path(path); 1070 return ret; 1071 } 1072 1073 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) 1074 { 1075 int ret; 1076 struct btrfs_key key; 1077 struct btrfs_key found_key; 1078 struct btrfs_path *path; 1079 1080 root = root->fs_info->chunk_root; 1081 1082 path = btrfs_alloc_path(); 1083 if (!path) 1084 return -ENOMEM; 1085 1086 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1087 key.type = BTRFS_DEV_ITEM_KEY; 1088 key.offset = (u64)-1; 1089 1090 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1091 if (ret < 0) 1092 goto error; 1093 1094 BUG_ON(ret == 0); 1095 1096 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, 1097 BTRFS_DEV_ITEM_KEY); 1098 if (ret) { 1099 *objectid = 1; 1100 } else { 1101 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1102 path->slots[0]); 1103 *objectid = found_key.offset + 1; 1104 } 1105 ret = 0; 1106 error: 1107 btrfs_free_path(path); 1108 return ret; 1109 } 1110 1111 /* 1112 * the device information is stored in the chunk root 1113 * the btrfs_device struct should be fully filled in 1114 */ 1115 int btrfs_add_device(struct btrfs_trans_handle *trans, 1116 struct btrfs_root *root, 1117 struct btrfs_device *device) 1118 { 1119 int ret; 1120 struct btrfs_path *path; 1121 struct btrfs_dev_item *dev_item; 1122 struct extent_buffer *leaf; 1123 struct btrfs_key key; 1124 unsigned long ptr; 1125 1126 root = root->fs_info->chunk_root; 1127 1128 path = btrfs_alloc_path(); 1129 if (!path) 1130 return -ENOMEM; 1131 1132 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1133 key.type = BTRFS_DEV_ITEM_KEY; 1134 key.offset = device->devid; 1135 1136 ret = btrfs_insert_empty_item(trans, root, path, &key, 1137 sizeof(*dev_item)); 1138 if (ret) 1139 goto out; 1140 1141 leaf = path->nodes[0]; 1142 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1143 1144 btrfs_set_device_id(leaf, dev_item, device->devid); 1145 btrfs_set_device_generation(leaf, dev_item, 0); 1146 btrfs_set_device_type(leaf, dev_item, device->type); 1147 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1148 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1149 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1150 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1151 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1152 btrfs_set_device_group(leaf, dev_item, 0); 1153 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1154 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1155 btrfs_set_device_start_offset(leaf, dev_item, 0); 1156 1157 ptr = (unsigned long)btrfs_device_uuid(dev_item); 1158 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1159 ptr = (unsigned long)btrfs_device_fsid(dev_item); 1160 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); 1161 btrfs_mark_buffer_dirty(leaf); 1162 1163 ret = 0; 1164 out: 1165 btrfs_free_path(path); 1166 return ret; 1167 } 1168 1169 static int btrfs_rm_dev_item(struct btrfs_root *root, 1170 struct btrfs_device *device) 1171 { 1172 int ret; 1173 struct btrfs_path *path; 1174 struct btrfs_key key; 1175 struct btrfs_trans_handle *trans; 1176 1177 root = 
root->fs_info->chunk_root; 1178 1179 path = btrfs_alloc_path(); 1180 if (!path) 1181 return -ENOMEM; 1182 1183 trans = btrfs_start_transaction(root, 0); 1184 if (IS_ERR(trans)) { 1185 btrfs_free_path(path); 1186 return PTR_ERR(trans); 1187 } 1188 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1189 key.type = BTRFS_DEV_ITEM_KEY; 1190 key.offset = device->devid; 1191 lock_chunks(root); 1192 1193 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1194 if (ret < 0) 1195 goto out; 1196 1197 if (ret > 0) { 1198 ret = -ENOENT; 1199 goto out; 1200 } 1201 1202 ret = btrfs_del_item(trans, root, path); 1203 if (ret) 1204 goto out; 1205 out: 1206 btrfs_free_path(path); 1207 unlock_chunks(root); 1208 btrfs_commit_transaction(trans, root); 1209 return ret; 1210 } 1211 1212 int btrfs_rm_device(struct btrfs_root *root, char *device_path) 1213 { 1214 struct btrfs_device *device; 1215 struct btrfs_device *next_device; 1216 struct block_device *bdev; 1217 struct buffer_head *bh = NULL; 1218 struct btrfs_super_block *disk_super; 1219 struct btrfs_fs_devices *cur_devices; 1220 u64 all_avail; 1221 u64 devid; 1222 u64 num_devices; 1223 u8 *dev_uuid; 1224 int ret = 0; 1225 bool clear_super = false; 1226 1227 mutex_lock(&uuid_mutex); 1228 mutex_lock(&root->fs_info->volume_mutex); 1229 1230 all_avail = root->fs_info->avail_data_alloc_bits | 1231 root->fs_info->avail_system_alloc_bits | 1232 root->fs_info->avail_metadata_alloc_bits; 1233 1234 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1235 root->fs_info->fs_devices->num_devices <= 4) { 1236 printk(KERN_ERR "btrfs: unable to go below four devices " 1237 "on raid10\n"); 1238 ret = -EINVAL; 1239 goto out; 1240 } 1241 1242 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1243 root->fs_info->fs_devices->num_devices <= 2) { 1244 printk(KERN_ERR "btrfs: unable to go below two " 1245 "devices on raid1\n"); 1246 ret = -EINVAL; 1247 goto out; 1248 } 1249 1250 if (strcmp(device_path, "missing") == 0) { 1251 struct list_head *devices; 1252 struct btrfs_device *tmp; 1253 1254 device = NULL; 1255 devices = &root->fs_info->fs_devices->devices; 1256 /* 1257 * It is safe to read the devices since the volume_mutex 1258 * is held. 
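		 * The loop below simply picks the first device that the
		 * metadata still references (in_fs_metadata) but that had
		 * no block device present when the filesystem was opened
		 * (bdev == NULL), i.e. the "missing" device being removed.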
1259 */ 1260 list_for_each_entry(tmp, devices, dev_list) { 1261 if (tmp->in_fs_metadata && !tmp->bdev) { 1262 device = tmp; 1263 break; 1264 } 1265 } 1266 bdev = NULL; 1267 bh = NULL; 1268 disk_super = NULL; 1269 if (!device) { 1270 printk(KERN_ERR "btrfs: no missing devices found to " 1271 "remove\n"); 1272 goto out; 1273 } 1274 } else { 1275 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, 1276 root->fs_info->bdev_holder); 1277 if (IS_ERR(bdev)) { 1278 ret = PTR_ERR(bdev); 1279 goto out; 1280 } 1281 1282 set_blocksize(bdev, 4096); 1283 bh = btrfs_read_dev_super(bdev); 1284 if (!bh) { 1285 ret = -EINVAL; 1286 goto error_close; 1287 } 1288 disk_super = (struct btrfs_super_block *)bh->b_data; 1289 devid = btrfs_stack_device_id(&disk_super->dev_item); 1290 dev_uuid = disk_super->dev_item.uuid; 1291 device = btrfs_find_device(root, devid, dev_uuid, 1292 disk_super->fsid); 1293 if (!device) { 1294 ret = -ENOENT; 1295 goto error_brelse; 1296 } 1297 } 1298 1299 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1300 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1301 "device\n"); 1302 ret = -EINVAL; 1303 goto error_brelse; 1304 } 1305 1306 if (device->writeable) { 1307 lock_chunks(root); 1308 list_del_init(&device->dev_alloc_list); 1309 unlock_chunks(root); 1310 root->fs_info->fs_devices->rw_devices--; 1311 clear_super = true; 1312 } 1313 1314 ret = btrfs_shrink_device(device, 0); 1315 if (ret) 1316 goto error_undo; 1317 1318 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1319 if (ret) 1320 goto error_undo; 1321 1322 device->in_fs_metadata = 0; 1323 btrfs_scrub_cancel_dev(root, device); 1324 1325 /* 1326 * the device list mutex makes sure that we don't change 1327 * the device list while someone else is writing out all 1328 * the device supers. 1329 */ 1330 1331 cur_devices = device->fs_devices; 1332 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1333 list_del_rcu(&device->dev_list); 1334 1335 device->fs_devices->num_devices--; 1336 1337 if (device->missing) 1338 root->fs_info->fs_devices->missing_devices--; 1339 1340 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1341 struct btrfs_device, dev_list); 1342 if (device->bdev == root->fs_info->sb->s_bdev) 1343 root->fs_info->sb->s_bdev = next_device->bdev; 1344 if (device->bdev == root->fs_info->fs_devices->latest_bdev) 1345 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1346 1347 if (device->bdev) 1348 device->fs_devices->open_devices--; 1349 1350 call_rcu(&device->rcu, free_device); 1351 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1352 1353 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1354 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1355 1356 if (cur_devices->open_devices == 0) { 1357 struct btrfs_fs_devices *fs_devices; 1358 fs_devices = root->fs_info->fs_devices; 1359 while (fs_devices) { 1360 if (fs_devices->seed == cur_devices) 1361 break; 1362 fs_devices = fs_devices->seed; 1363 } 1364 fs_devices->seed = cur_devices->seed; 1365 cur_devices->seed = NULL; 1366 lock_chunks(root); 1367 __btrfs_close_devices(cur_devices); 1368 unlock_chunks(root); 1369 free_fs_devices(cur_devices); 1370 } 1371 1372 /* 1373 * at this point, the device is zero sized. 
We want to
	 * remove it from the devices list and zero out the old super
	 */
	if (clear_super) {
		/* make sure this device isn't detected as part of
		 * the FS anymore
		 */
		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
		set_buffer_dirty(bh);
		sync_dirty_buffer(bh);
	}

	ret = 0;

error_brelse:
	brelse(bh);
error_close:
	if (bdev)
		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
	mutex_unlock(&root->fs_info->volume_mutex);
	mutex_unlock(&uuid_mutex);
	return ret;
error_undo:
	if (device->writeable) {
		lock_chunks(root);
		list_add(&device->dev_alloc_list,
			 &root->fs_info->fs_devices->alloc_list);
		unlock_chunks(root);
		root->fs_info->fs_devices->rw_devices++;
	}
	goto error_brelse;
}

/*
 * does all the dirty work required for changing the filesystem's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	BUG_ON(!mutex_is_locked(&uuid_mutex));
	if (!fs_devices->seeding)
		return -EINVAL;

	seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
	if (!seed_devices)
		return -ENOMEM;

	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			     synchronize_rcu);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
	list_for_each_entry(device, &seed_devices->devices, dev_list) {
		device->fs_devices = seed_devices;
	}

	fs_devices->seeding = 0;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->seed = seed_devices;

	generate_random_uuid(fs_devices->fsid);
	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}

/*
 * store the expected generation for seed devices in device items.
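 * (The sprouted filesystem keeps using the chunk tree it inherited from
 * the seed, so its device items still describe the seed devices; writing
 * each seed device's current generation into those items presumably gives
 * a later mount something to check the seed devices against.)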
1469 */ 1470 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 1471 struct btrfs_root *root) 1472 { 1473 struct btrfs_path *path; 1474 struct extent_buffer *leaf; 1475 struct btrfs_dev_item *dev_item; 1476 struct btrfs_device *device; 1477 struct btrfs_key key; 1478 u8 fs_uuid[BTRFS_UUID_SIZE]; 1479 u8 dev_uuid[BTRFS_UUID_SIZE]; 1480 u64 devid; 1481 int ret; 1482 1483 path = btrfs_alloc_path(); 1484 if (!path) 1485 return -ENOMEM; 1486 1487 root = root->fs_info->chunk_root; 1488 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1489 key.offset = 0; 1490 key.type = BTRFS_DEV_ITEM_KEY; 1491 1492 while (1) { 1493 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1494 if (ret < 0) 1495 goto error; 1496 1497 leaf = path->nodes[0]; 1498 next_slot: 1499 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1500 ret = btrfs_next_leaf(root, path); 1501 if (ret > 0) 1502 break; 1503 if (ret < 0) 1504 goto error; 1505 leaf = path->nodes[0]; 1506 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1507 btrfs_release_path(path); 1508 continue; 1509 } 1510 1511 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1512 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 1513 key.type != BTRFS_DEV_ITEM_KEY) 1514 break; 1515 1516 dev_item = btrfs_item_ptr(leaf, path->slots[0], 1517 struct btrfs_dev_item); 1518 devid = btrfs_device_id(leaf, dev_item); 1519 read_extent_buffer(leaf, dev_uuid, 1520 (unsigned long)btrfs_device_uuid(dev_item), 1521 BTRFS_UUID_SIZE); 1522 read_extent_buffer(leaf, fs_uuid, 1523 (unsigned long)btrfs_device_fsid(dev_item), 1524 BTRFS_UUID_SIZE); 1525 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1526 BUG_ON(!device); 1527 1528 if (device->fs_devices->seeding) { 1529 btrfs_set_device_generation(leaf, dev_item, 1530 device->generation); 1531 btrfs_mark_buffer_dirty(leaf); 1532 } 1533 1534 path->slots[0]++; 1535 goto next_slot; 1536 } 1537 ret = 0; 1538 error: 1539 btrfs_free_path(path); 1540 return ret; 1541 } 1542 1543 int btrfs_init_new_device(struct btrfs_root *root, char *device_path) 1544 { 1545 struct btrfs_trans_handle *trans; 1546 struct btrfs_device *device; 1547 struct block_device *bdev; 1548 struct list_head *devices; 1549 struct super_block *sb = root->fs_info->sb; 1550 u64 total_bytes; 1551 int seeding_dev = 0; 1552 int ret = 0; 1553 1554 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1555 return -EINVAL; 1556 1557 bdev = blkdev_get_by_path(device_path, FMODE_EXCL, 1558 root->fs_info->bdev_holder); 1559 if (IS_ERR(bdev)) 1560 return PTR_ERR(bdev); 1561 1562 if (root->fs_info->fs_devices->seeding) { 1563 seeding_dev = 1; 1564 down_write(&sb->s_umount); 1565 mutex_lock(&uuid_mutex); 1566 } 1567 1568 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1569 mutex_lock(&root->fs_info->volume_mutex); 1570 1571 devices = &root->fs_info->fs_devices->devices; 1572 /* 1573 * we have the volume lock, so we don't need the extra 1574 * device list mutex while reading the list here. 
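	 * The walk below only needs to notice whether the new block device
	 * is already a member of this filesystem, in which case we bail out
	 * with -EEXIST.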
1575 */ 1576 list_for_each_entry(device, devices, dev_list) { 1577 if (device->bdev == bdev) { 1578 ret = -EEXIST; 1579 goto error; 1580 } 1581 } 1582 1583 device = kzalloc(sizeof(*device), GFP_NOFS); 1584 if (!device) { 1585 /* we can safely leave the fs_devices entry around */ 1586 ret = -ENOMEM; 1587 goto error; 1588 } 1589 1590 device->name = kstrdup(device_path, GFP_NOFS); 1591 if (!device->name) { 1592 kfree(device); 1593 ret = -ENOMEM; 1594 goto error; 1595 } 1596 1597 ret = find_next_devid(root, &device->devid); 1598 if (ret) { 1599 kfree(device->name); 1600 kfree(device); 1601 goto error; 1602 } 1603 1604 trans = btrfs_start_transaction(root, 0); 1605 if (IS_ERR(trans)) { 1606 kfree(device->name); 1607 kfree(device); 1608 ret = PTR_ERR(trans); 1609 goto error; 1610 } 1611 1612 lock_chunks(root); 1613 1614 device->writeable = 1; 1615 device->work.func = pending_bios_fn; 1616 generate_random_uuid(device->uuid); 1617 spin_lock_init(&device->io_lock); 1618 device->generation = trans->transid; 1619 device->io_width = root->sectorsize; 1620 device->io_align = root->sectorsize; 1621 device->sector_size = root->sectorsize; 1622 device->total_bytes = i_size_read(bdev->bd_inode); 1623 device->disk_total_bytes = device->total_bytes; 1624 device->dev_root = root->fs_info->dev_root; 1625 device->bdev = bdev; 1626 device->in_fs_metadata = 1; 1627 device->mode = FMODE_EXCL; 1628 set_blocksize(device->bdev, 4096); 1629 1630 if (seeding_dev) { 1631 sb->s_flags &= ~MS_RDONLY; 1632 ret = btrfs_prepare_sprout(trans, root); 1633 BUG_ON(ret); 1634 } 1635 1636 device->fs_devices = root->fs_info->fs_devices; 1637 1638 /* 1639 * we don't want write_supers to jump in here with our device 1640 * half setup 1641 */ 1642 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1643 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); 1644 list_add(&device->dev_alloc_list, 1645 &root->fs_info->fs_devices->alloc_list); 1646 root->fs_info->fs_devices->num_devices++; 1647 root->fs_info->fs_devices->open_devices++; 1648 root->fs_info->fs_devices->rw_devices++; 1649 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1650 1651 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1652 root->fs_info->fs_devices->rotating = 1; 1653 1654 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1655 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1656 total_bytes + device->total_bytes); 1657 1658 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1659 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1660 total_bytes + 1); 1661 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1662 1663 if (seeding_dev) { 1664 ret = init_first_rw_device(trans, root, device); 1665 BUG_ON(ret); 1666 ret = btrfs_finish_sprout(trans, root); 1667 BUG_ON(ret); 1668 } else { 1669 ret = btrfs_add_device(trans, root, device); 1670 } 1671 1672 /* 1673 * we've got more storage, clear any full flags on the space 1674 * infos 1675 */ 1676 btrfs_clear_space_info_full(root->fs_info); 1677 1678 unlock_chunks(root); 1679 btrfs_commit_transaction(trans, root); 1680 1681 if (seeding_dev) { 1682 mutex_unlock(&uuid_mutex); 1683 up_write(&sb->s_umount); 1684 1685 ret = btrfs_relocate_sys_chunks(root); 1686 BUG_ON(ret); 1687 } 1688 out: 1689 mutex_unlock(&root->fs_info->volume_mutex); 1690 return ret; 1691 error: 1692 blkdev_put(bdev, FMODE_EXCL); 1693 if (seeding_dev) { 1694 mutex_unlock(&uuid_mutex); 1695 up_write(&sb->s_umount); 1696 } 1697 goto out; 1698 } 1699 1700 static 
noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 1701 struct btrfs_device *device) 1702 { 1703 int ret; 1704 struct btrfs_path *path; 1705 struct btrfs_root *root; 1706 struct btrfs_dev_item *dev_item; 1707 struct extent_buffer *leaf; 1708 struct btrfs_key key; 1709 1710 root = device->dev_root->fs_info->chunk_root; 1711 1712 path = btrfs_alloc_path(); 1713 if (!path) 1714 return -ENOMEM; 1715 1716 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1717 key.type = BTRFS_DEV_ITEM_KEY; 1718 key.offset = device->devid; 1719 1720 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1721 if (ret < 0) 1722 goto out; 1723 1724 if (ret > 0) { 1725 ret = -ENOENT; 1726 goto out; 1727 } 1728 1729 leaf = path->nodes[0]; 1730 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1731 1732 btrfs_set_device_id(leaf, dev_item, device->devid); 1733 btrfs_set_device_type(leaf, dev_item, device->type); 1734 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1735 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1736 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1737 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); 1738 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1739 btrfs_mark_buffer_dirty(leaf); 1740 1741 out: 1742 btrfs_free_path(path); 1743 return ret; 1744 } 1745 1746 static int __btrfs_grow_device(struct btrfs_trans_handle *trans, 1747 struct btrfs_device *device, u64 new_size) 1748 { 1749 struct btrfs_super_block *super_copy = 1750 &device->dev_root->fs_info->super_copy; 1751 u64 old_total = btrfs_super_total_bytes(super_copy); 1752 u64 diff = new_size - device->total_bytes; 1753 1754 if (!device->writeable) 1755 return -EACCES; 1756 if (new_size <= device->total_bytes) 1757 return -EINVAL; 1758 1759 btrfs_set_super_total_bytes(super_copy, old_total + diff); 1760 device->fs_devices->total_rw_bytes += diff; 1761 1762 device->total_bytes = new_size; 1763 device->disk_total_bytes = new_size; 1764 btrfs_clear_space_info_full(device->dev_root->fs_info); 1765 1766 return btrfs_update_device(trans, device); 1767 } 1768 1769 int btrfs_grow_device(struct btrfs_trans_handle *trans, 1770 struct btrfs_device *device, u64 new_size) 1771 { 1772 int ret; 1773 lock_chunks(device->dev_root); 1774 ret = __btrfs_grow_device(trans, device, new_size); 1775 unlock_chunks(device->dev_root); 1776 return ret; 1777 } 1778 1779 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 1780 struct btrfs_root *root, 1781 u64 chunk_tree, u64 chunk_objectid, 1782 u64 chunk_offset) 1783 { 1784 int ret; 1785 struct btrfs_path *path; 1786 struct btrfs_key key; 1787 1788 root = root->fs_info->chunk_root; 1789 path = btrfs_alloc_path(); 1790 if (!path) 1791 return -ENOMEM; 1792 1793 key.objectid = chunk_objectid; 1794 key.offset = chunk_offset; 1795 key.type = BTRFS_CHUNK_ITEM_KEY; 1796 1797 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1798 BUG_ON(ret); 1799 1800 ret = btrfs_del_item(trans, root, path); 1801 1802 btrfs_free_path(path); 1803 return ret; 1804 } 1805 1806 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1807 chunk_offset) 1808 { 1809 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1810 struct btrfs_disk_key *disk_key; 1811 struct btrfs_chunk *chunk; 1812 u8 *ptr; 1813 int ret = 0; 1814 u32 num_stripes; 1815 u32 array_size; 1816 u32 len = 0; 1817 u32 cur; 1818 struct btrfs_key key; 1819 1820 array_size = btrfs_super_sys_array_size(super_copy); 
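	/*
	 * The sys_chunk_array in the superblock is a packed sequence of
	 * (struct btrfs_disk_key, struct btrfs_chunk) pairs.  Walk it entry
	 * by entry, sizing each chunk by its stripe count, and splice out
	 * the entry that matches the chunk being deleted.
	 */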
1821 1822 ptr = super_copy->sys_chunk_array; 1823 cur = 0; 1824 1825 while (cur < array_size) { 1826 disk_key = (struct btrfs_disk_key *)ptr; 1827 btrfs_disk_key_to_cpu(&key, disk_key); 1828 1829 len = sizeof(*disk_key); 1830 1831 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 1832 chunk = (struct btrfs_chunk *)(ptr + len); 1833 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 1834 len += btrfs_chunk_item_size(num_stripes); 1835 } else { 1836 ret = -EIO; 1837 break; 1838 } 1839 if (key.objectid == chunk_objectid && 1840 key.offset == chunk_offset) { 1841 memmove(ptr, ptr + len, array_size - (cur + len)); 1842 array_size -= len; 1843 btrfs_set_super_sys_array_size(super_copy, array_size); 1844 } else { 1845 ptr += len; 1846 cur += len; 1847 } 1848 } 1849 return ret; 1850 } 1851 1852 static int btrfs_relocate_chunk(struct btrfs_root *root, 1853 u64 chunk_tree, u64 chunk_objectid, 1854 u64 chunk_offset) 1855 { 1856 struct extent_map_tree *em_tree; 1857 struct btrfs_root *extent_root; 1858 struct btrfs_trans_handle *trans; 1859 struct extent_map *em; 1860 struct map_lookup *map; 1861 int ret; 1862 int i; 1863 1864 root = root->fs_info->chunk_root; 1865 extent_root = root->fs_info->extent_root; 1866 em_tree = &root->fs_info->mapping_tree.map_tree; 1867 1868 ret = btrfs_can_relocate(extent_root, chunk_offset); 1869 if (ret) 1870 return -ENOSPC; 1871 1872 /* step one, relocate all the extents inside this chunk */ 1873 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1874 if (ret) 1875 return ret; 1876 1877 trans = btrfs_start_transaction(root, 0); 1878 BUG_ON(IS_ERR(trans)); 1879 1880 lock_chunks(root); 1881 1882 /* 1883 * step two, delete the device extents and the 1884 * chunk tree entries 1885 */ 1886 read_lock(&em_tree->lock); 1887 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 1888 read_unlock(&em_tree->lock); 1889 1890 BUG_ON(em->start > chunk_offset || 1891 em->start + em->len < chunk_offset); 1892 map = (struct map_lookup *)em->bdev; 1893 1894 for (i = 0; i < map->num_stripes; i++) { 1895 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, 1896 map->stripes[i].physical); 1897 BUG_ON(ret); 1898 1899 if (map->stripes[i].dev) { 1900 ret = btrfs_update_device(trans, map->stripes[i].dev); 1901 BUG_ON(ret); 1902 } 1903 } 1904 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, 1905 chunk_offset); 1906 1907 BUG_ON(ret); 1908 1909 trace_btrfs_chunk_free(root, map, chunk_offset, em->len); 1910 1911 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 1912 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 1913 BUG_ON(ret); 1914 } 1915 1916 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 1917 BUG_ON(ret); 1918 1919 write_lock(&em_tree->lock); 1920 remove_extent_mapping(em_tree, em); 1921 write_unlock(&em_tree->lock); 1922 1923 kfree(map); 1924 em->bdev = NULL; 1925 1926 /* once for the tree */ 1927 free_extent_map(em); 1928 /* once for us */ 1929 free_extent_map(em); 1930 1931 unlock_chunks(root); 1932 btrfs_end_transaction(trans, root); 1933 return 0; 1934 } 1935 1936 static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 1937 { 1938 struct btrfs_root *chunk_root = root->fs_info->chunk_root; 1939 struct btrfs_path *path; 1940 struct extent_buffer *leaf; 1941 struct btrfs_chunk *chunk; 1942 struct btrfs_key key; 1943 struct btrfs_key found_key; 1944 u64 chunk_tree = chunk_root->root_key.objectid; 1945 u64 chunk_type; 1946 bool retried = false; 1947 int failed = 0; 1948 int ret; 1949 1950 path = btrfs_alloc_path(); 1951 if (!path) 1952 
return -ENOMEM; 1953 1954 again: 1955 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 1956 key.offset = (u64)-1; 1957 key.type = BTRFS_CHUNK_ITEM_KEY; 1958 1959 while (1) { 1960 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 1961 if (ret < 0) 1962 goto error; 1963 BUG_ON(ret == 0); 1964 1965 ret = btrfs_previous_item(chunk_root, path, key.objectid, 1966 key.type); 1967 if (ret < 0) 1968 goto error; 1969 if (ret > 0) 1970 break; 1971 1972 leaf = path->nodes[0]; 1973 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1974 1975 chunk = btrfs_item_ptr(leaf, path->slots[0], 1976 struct btrfs_chunk); 1977 chunk_type = btrfs_chunk_type(leaf, chunk); 1978 btrfs_release_path(path); 1979 1980 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 1981 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 1982 found_key.objectid, 1983 found_key.offset); 1984 if (ret == -ENOSPC) 1985 failed++; 1986 else if (ret) 1987 BUG(); 1988 } 1989 1990 if (found_key.offset == 0) 1991 break; 1992 key.offset = found_key.offset - 1; 1993 } 1994 ret = 0; 1995 if (failed && !retried) { 1996 failed = 0; 1997 retried = true; 1998 goto again; 1999 } else if (failed && retried) { 2000 WARN_ON(1); 2001 ret = -ENOSPC; 2002 } 2003 error: 2004 btrfs_free_path(path); 2005 return ret; 2006 } 2007 2008 static u64 div_factor(u64 num, int factor) 2009 { 2010 if (factor == 10) 2011 return num; 2012 num *= factor; 2013 do_div(num, 10); 2014 return num; 2015 } 2016 2017 int btrfs_balance(struct btrfs_root *dev_root) 2018 { 2019 int ret; 2020 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 2021 struct btrfs_device *device; 2022 u64 old_size; 2023 u64 size_to_free; 2024 struct btrfs_path *path; 2025 struct btrfs_key key; 2026 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; 2027 struct btrfs_trans_handle *trans; 2028 struct btrfs_key found_key; 2029 2030 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2031 return -EROFS; 2032 2033 if (!capable(CAP_SYS_ADMIN)) 2034 return -EPERM; 2035 2036 mutex_lock(&dev_root->fs_info->volume_mutex); 2037 dev_root = dev_root->fs_info->dev_root; 2038 2039 /* step one make some room on all the devices */ 2040 list_for_each_entry(device, devices, dev_list) { 2041 old_size = device->total_bytes; 2042 size_to_free = div_factor(old_size, 1); 2043 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 2044 if (!device->writeable || 2045 device->total_bytes - device->bytes_used > size_to_free) 2046 continue; 2047 2048 ret = btrfs_shrink_device(device, old_size - size_to_free); 2049 if (ret == -ENOSPC) 2050 break; 2051 BUG_ON(ret); 2052 2053 trans = btrfs_start_transaction(dev_root, 0); 2054 BUG_ON(IS_ERR(trans)); 2055 2056 ret = btrfs_grow_device(trans, device, old_size); 2057 BUG_ON(ret); 2058 2059 btrfs_end_transaction(trans, dev_root); 2060 } 2061 2062 /* step two, relocate all the chunks */ 2063 path = btrfs_alloc_path(); 2064 BUG_ON(!path); 2065 2066 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2067 key.offset = (u64)-1; 2068 key.type = BTRFS_CHUNK_ITEM_KEY; 2069 2070 while (1) { 2071 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2072 if (ret < 0) 2073 goto error; 2074 2075 /* 2076 * this shouldn't happen, it means the last relocate 2077 * failed 2078 */ 2079 if (ret == 0) 2080 break; 2081 2082 ret = btrfs_previous_item(chunk_root, path, 0, 2083 BTRFS_CHUNK_ITEM_KEY); 2084 if (ret) 2085 break; 2086 2087 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2088 path->slots[0]); 2089 if (found_key.objectid != key.objectid) 2090 break; 2091 2092 /* 
chunk zero is special */ 2093 if (found_key.offset == 0) 2094 break; 2095 2096 btrfs_release_path(path); 2097 ret = btrfs_relocate_chunk(chunk_root, 2098 chunk_root->root_key.objectid, 2099 found_key.objectid, 2100 found_key.offset); 2101 if (ret && ret != -ENOSPC) 2102 goto error; 2103 key.offset = found_key.offset - 1; 2104 } 2105 ret = 0; 2106 error: 2107 btrfs_free_path(path); 2108 mutex_unlock(&dev_root->fs_info->volume_mutex); 2109 return ret; 2110 } 2111 2112 /* 2113 * shrinking a device means finding all of the device extents past 2114 * the new size, and then following the back refs to the chunks. 2115 * The chunk relocation code actually frees the device extent 2116 */ 2117 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 2118 { 2119 struct btrfs_trans_handle *trans; 2120 struct btrfs_root *root = device->dev_root; 2121 struct btrfs_dev_extent *dev_extent = NULL; 2122 struct btrfs_path *path; 2123 u64 length; 2124 u64 chunk_tree; 2125 u64 chunk_objectid; 2126 u64 chunk_offset; 2127 int ret; 2128 int slot; 2129 int failed = 0; 2130 bool retried = false; 2131 struct extent_buffer *l; 2132 struct btrfs_key key; 2133 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2134 u64 old_total = btrfs_super_total_bytes(super_copy); 2135 u64 old_size = device->total_bytes; 2136 u64 diff = device->total_bytes - new_size; 2137 2138 if (new_size >= device->total_bytes) 2139 return -EINVAL; 2140 2141 path = btrfs_alloc_path(); 2142 if (!path) 2143 return -ENOMEM; 2144 2145 path->reada = 2; 2146 2147 lock_chunks(root); 2148 2149 device->total_bytes = new_size; 2150 if (device->writeable) 2151 device->fs_devices->total_rw_bytes -= diff; 2152 unlock_chunks(root); 2153 2154 again: 2155 key.objectid = device->devid; 2156 key.offset = (u64)-1; 2157 key.type = BTRFS_DEV_EXTENT_KEY; 2158 2159 while (1) { 2160 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2161 if (ret < 0) 2162 goto done; 2163 2164 ret = btrfs_previous_item(root, path, 0, key.type); 2165 if (ret < 0) 2166 goto done; 2167 if (ret) { 2168 ret = 0; 2169 btrfs_release_path(path); 2170 break; 2171 } 2172 2173 l = path->nodes[0]; 2174 slot = path->slots[0]; 2175 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 2176 2177 if (key.objectid != device->devid) { 2178 btrfs_release_path(path); 2179 break; 2180 } 2181 2182 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2183 length = btrfs_dev_extent_length(l, dev_extent); 2184 2185 if (key.offset + length <= new_size) { 2186 btrfs_release_path(path); 2187 break; 2188 } 2189 2190 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 2191 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 2192 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 2193 btrfs_release_path(path); 2194 2195 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 2196 chunk_offset); 2197 if (ret && ret != -ENOSPC) 2198 goto done; 2199 if (ret == -ENOSPC) 2200 failed++; 2201 key.offset -= 1; 2202 } 2203 2204 if (failed && !retried) { 2205 failed = 0; 2206 retried = true; 2207 goto again; 2208 } else if (failed && retried) { 2209 ret = -ENOSPC; 2210 lock_chunks(root); 2211 2212 device->total_bytes = old_size; 2213 if (device->writeable) 2214 device->fs_devices->total_rw_bytes += diff; 2215 unlock_chunks(root); 2216 goto done; 2217 } 2218 2219 /* Shrinking succeeded, else we would be at "done". 
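All device extents past new_size have now been relocated away and freed, and device->total_bytes already holds new_size in memory; what remains is to persist the shrink by updating the device item and reducing the superblock's total_bytes by the same diff.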
*/ 2220 trans = btrfs_start_transaction(root, 0); 2221 if (IS_ERR(trans)) { 2222 ret = PTR_ERR(trans); 2223 goto done; 2224 } 2225 2226 lock_chunks(root); 2227 2228 device->disk_total_bytes = new_size; 2229 /* Now btrfs_update_device() will change the on-disk size. */ 2230 ret = btrfs_update_device(trans, device); 2231 if (ret) { 2232 unlock_chunks(root); 2233 btrfs_end_transaction(trans, root); 2234 goto done; 2235 } 2236 WARN_ON(diff > old_total); 2237 btrfs_set_super_total_bytes(super_copy, old_total - diff); 2238 unlock_chunks(root); 2239 btrfs_end_transaction(trans, root); 2240 done: 2241 btrfs_free_path(path); 2242 return ret; 2243 } 2244 2245 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, 2246 struct btrfs_root *root, 2247 struct btrfs_key *key, 2248 struct btrfs_chunk *chunk, int item_size) 2249 { 2250 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2251 struct btrfs_disk_key disk_key; 2252 u32 array_size; 2253 u8 *ptr; 2254 2255 array_size = btrfs_super_sys_array_size(super_copy); 2256 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 2257 return -EFBIG; 2258 2259 ptr = super_copy->sys_chunk_array + array_size; 2260 btrfs_cpu_key_to_disk(&disk_key, key); 2261 memcpy(ptr, &disk_key, sizeof(disk_key)); 2262 ptr += sizeof(disk_key); 2263 memcpy(ptr, chunk, item_size); 2264 item_size += sizeof(disk_key); 2265 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 2266 return 0; 2267 } 2268 2269 /* 2270 * sort the devices in descending order by max_avail, total_avail 2271 */ 2272 static int btrfs_cmp_device_info(const void *a, const void *b) 2273 { 2274 const struct btrfs_device_info *di_a = a; 2275 const struct btrfs_device_info *di_b = b; 2276 2277 if (di_a->max_avail > di_b->max_avail) 2278 return -1; 2279 if (di_a->max_avail < di_b->max_avail) 2280 return 1; 2281 if (di_a->total_avail > di_b->total_avail) 2282 return -1; 2283 if (di_a->total_avail < di_b->total_avail) 2284 return 1; 2285 return 0; 2286 } 2287 2288 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2289 struct btrfs_root *extent_root, 2290 struct map_lookup **map_ret, 2291 u64 *num_bytes_out, u64 *stripe_size_out, 2292 u64 start, u64 type) 2293 { 2294 struct btrfs_fs_info *info = extent_root->fs_info; 2295 struct btrfs_fs_devices *fs_devices = info->fs_devices; 2296 struct list_head *cur; 2297 struct map_lookup *map = NULL; 2298 struct extent_map_tree *em_tree; 2299 struct extent_map *em; 2300 struct btrfs_device_info *devices_info = NULL; 2301 u64 total_avail; 2302 int num_stripes; /* total number of stripes to allocate */ 2303 int sub_stripes; /* sub_stripes info for map */ 2304 int dev_stripes; /* stripes per dev */ 2305 int devs_max; /* max devs to use */ 2306 int devs_min; /* min devs needed */ 2307 int devs_increment; /* ndevs has to be a multiple of this */ 2308 int ncopies; /* how many copies to data has */ 2309 int ret; 2310 u64 max_stripe_size; 2311 u64 max_chunk_size; 2312 u64 stripe_size; 2313 u64 num_bytes; 2314 int ndevs; 2315 int i; 2316 int j; 2317 2318 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2319 (type & BTRFS_BLOCK_GROUP_DUP)) { 2320 WARN_ON(1); 2321 type &= ~BTRFS_BLOCK_GROUP_DUP; 2322 } 2323 2324 if (list_empty(&fs_devices->alloc_list)) 2325 return -ENOSPC; 2326 2327 sub_stripes = 1; 2328 dev_stripes = 1; 2329 devs_increment = 1; 2330 ncopies = 1; 2331 devs_max = 0; /* 0 == as many as possible */ 2332 devs_min = 1; 2333 2334 /* 2335 * define the properties of each RAID type. 
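 * (In the branches below: DUP keeps both copies on one device, so
 * dev_stripes = ncopies = 2 and devs_max = 1; RAID0 only needs at least two
 * devices; RAID1 mirrors across exactly two devices, so ncopies = 2 and
 * devs_min = devs_max = devs_increment = 2; RAID10 stripes over mirrored
 * pairs, so sub_stripes = 2, ncopies = 2 and devs_min = 4; anything else
 * falls back to a single device.)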
2336 * FIXME: move this to a global table and use it in all RAID 2337 * calculation code 2338 */ 2339 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2340 dev_stripes = 2; 2341 ncopies = 2; 2342 devs_max = 1; 2343 } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2344 devs_min = 2; 2345 } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2346 devs_increment = 2; 2347 ncopies = 2; 2348 devs_max = 2; 2349 devs_min = 2; 2350 } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2351 sub_stripes = 2; 2352 devs_increment = 2; 2353 ncopies = 2; 2354 devs_min = 4; 2355 } else { 2356 devs_max = 1; 2357 } 2358 2359 if (type & BTRFS_BLOCK_GROUP_DATA) { 2360 max_stripe_size = 1024 * 1024 * 1024; 2361 max_chunk_size = 10 * max_stripe_size; 2362 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 2363 max_stripe_size = 256 * 1024 * 1024; 2364 max_chunk_size = max_stripe_size; 2365 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 2366 max_stripe_size = 8 * 1024 * 1024; 2367 max_chunk_size = 2 * max_stripe_size; 2368 } else { 2369 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", 2370 type); 2371 BUG_ON(1); 2372 } 2373 2374 /* we don't want a chunk larger than 10% of writeable space */ 2375 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2376 max_chunk_size); 2377 2378 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, 2379 GFP_NOFS); 2380 if (!devices_info) 2381 return -ENOMEM; 2382 2383 cur = fs_devices->alloc_list.next; 2384 2385 /* 2386 * in the first pass through the devices list, we gather information 2387 * about the available holes on each device. 2388 */ 2389 ndevs = 0; 2390 while (cur != &fs_devices->alloc_list) { 2391 struct btrfs_device *device; 2392 u64 max_avail; 2393 u64 dev_offset; 2394 2395 device = list_entry(cur, struct btrfs_device, dev_alloc_list); 2396 2397 cur = cur->next; 2398 2399 if (!device->writeable) { 2400 printk(KERN_ERR 2401 "btrfs: read-only device in alloc_list\n"); 2402 WARN_ON(1); 2403 continue; 2404 } 2405 2406 if (!device->in_fs_metadata) 2407 continue; 2408 2409 if (device->total_bytes > device->bytes_used) 2410 total_avail = device->total_bytes - device->bytes_used; 2411 else 2412 total_avail = 0; 2413 /* avail is off by max(alloc_start, 1MB), but that is the same 2414 * for all devices, so it doesn't hurt the sorting later on 2415 */ 2416 2417 ret = find_free_dev_extent(trans, device, 2418 max_stripe_size * dev_stripes, 2419 &dev_offset, &max_avail); 2420 if (ret && ret != -ENOSPC) 2421 goto error; 2422 2423 if (ret == 0) 2424 max_avail = max_stripe_size * dev_stripes; 2425 2426 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) 2427 continue; 2428 2429 devices_info[ndevs].dev_offset = dev_offset; 2430 devices_info[ndevs].max_avail = max_avail; 2431 devices_info[ndevs].total_avail = total_avail; 2432 devices_info[ndevs].dev = device; 2433 ++ndevs; 2434 } 2435 2436 /* 2437 * now sort the devices by hole size / available space 2438 */ 2439 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 2440 btrfs_cmp_device_info, NULL); 2441 2442 /* round down to number of usable stripes */ 2443 ndevs -= ndevs % devs_increment; 2444 2445 if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { 2446 ret = -ENOSPC; 2447 goto error; 2448 } 2449 2450 if (devs_max && ndevs > devs_max) 2451 ndevs = devs_max; 2452 /* 2453 * the primary goal is to maximize the number of stripes, so use as many 2454 * devices as possible, even if the stripes are not maximum sized. 
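 * Illustrative example (made-up numbers): for a RAID0 data chunk
 * (dev_stripes = 1, ncopies = 1) on three devices whose smallest usable hole
 * is 5GiB, stripe_size starts at 5GiB and num_stripes = 3.  If the 10% cap
 * left max_chunk_size at 10GiB, then 5GiB * 3 exceeds 10GiB * 1, so
 * stripe_size is cut to 10GiB / 3 and rounded down to a whole multiple of
 * BTRFS_STRIPE_LEN, giving a chunk of roughly 10GiB spread evenly over all
 * three devices.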
2455 */ 2456 stripe_size = devices_info[ndevs-1].max_avail; 2457 num_stripes = ndevs * dev_stripes; 2458 2459 if (stripe_size * num_stripes > max_chunk_size * ncopies) { 2460 stripe_size = max_chunk_size * ncopies; 2461 do_div(stripe_size, num_stripes); 2462 } 2463 2464 do_div(stripe_size, dev_stripes); 2465 do_div(stripe_size, BTRFS_STRIPE_LEN); 2466 stripe_size *= BTRFS_STRIPE_LEN; 2467 2468 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 2469 if (!map) { 2470 ret = -ENOMEM; 2471 goto error; 2472 } 2473 map->num_stripes = num_stripes; 2474 2475 for (i = 0; i < ndevs; ++i) { 2476 for (j = 0; j < dev_stripes; ++j) { 2477 int s = i * dev_stripes + j; 2478 map->stripes[s].dev = devices_info[i].dev; 2479 map->stripes[s].physical = devices_info[i].dev_offset + 2480 j * stripe_size; 2481 } 2482 } 2483 map->sector_size = extent_root->sectorsize; 2484 map->stripe_len = BTRFS_STRIPE_LEN; 2485 map->io_align = BTRFS_STRIPE_LEN; 2486 map->io_width = BTRFS_STRIPE_LEN; 2487 map->type = type; 2488 map->sub_stripes = sub_stripes; 2489 2490 *map_ret = map; 2491 num_bytes = stripe_size * (num_stripes / ncopies); 2492 2493 *stripe_size_out = stripe_size; 2494 *num_bytes_out = num_bytes; 2495 2496 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); 2497 2498 em = alloc_extent_map(); 2499 if (!em) { 2500 ret = -ENOMEM; 2501 goto error; 2502 } 2503 em->bdev = (struct block_device *)map; 2504 em->start = start; 2505 em->len = num_bytes; 2506 em->block_start = 0; 2507 em->block_len = em->len; 2508 2509 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 2510 write_lock(&em_tree->lock); 2511 ret = add_extent_mapping(em_tree, em); 2512 write_unlock(&em_tree->lock); 2513 BUG_ON(ret); 2514 free_extent_map(em); 2515 2516 ret = btrfs_make_block_group(trans, extent_root, 0, type, 2517 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2518 start, num_bytes); 2519 BUG_ON(ret); 2520 2521 for (i = 0; i < map->num_stripes; ++i) { 2522 struct btrfs_device *device; 2523 u64 dev_offset; 2524 2525 device = map->stripes[i].dev; 2526 dev_offset = map->stripes[i].physical; 2527 2528 ret = btrfs_alloc_dev_extent(trans, device, 2529 info->chunk_root->root_key.objectid, 2530 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2531 start, dev_offset, stripe_size); 2532 BUG_ON(ret); 2533 } 2534 2535 kfree(devices_info); 2536 return 0; 2537 2538 error: 2539 kfree(map); 2540 kfree(devices_info); 2541 return ret; 2542 } 2543 2544 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2545 struct btrfs_root *extent_root, 2546 struct map_lookup *map, u64 chunk_offset, 2547 u64 chunk_size, u64 stripe_size) 2548 { 2549 u64 dev_offset; 2550 struct btrfs_key key; 2551 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 2552 struct btrfs_device *device; 2553 struct btrfs_chunk *chunk; 2554 struct btrfs_stripe *stripe; 2555 size_t item_size = btrfs_chunk_item_size(map->num_stripes); 2556 int index = 0; 2557 int ret; 2558 2559 chunk = kzalloc(item_size, GFP_NOFS); 2560 if (!chunk) 2561 return -ENOMEM; 2562 2563 index = 0; 2564 while (index < map->num_stripes) { 2565 device = map->stripes[index].dev; 2566 device->bytes_used += stripe_size; 2567 ret = btrfs_update_device(trans, device); 2568 BUG_ON(ret); 2569 index++; 2570 } 2571 2572 index = 0; 2573 stripe = &chunk->stripe; 2574 while (index < map->num_stripes) { 2575 device = map->stripes[index].dev; 2576 dev_offset = map->stripes[index].physical; 2577 2578 btrfs_set_stack_stripe_devid(stripe, device->devid); 2579 btrfs_set_stack_stripe_offset(stripe, dev_offset); 2580 
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 2581 stripe++; 2582 index++; 2583 } 2584 2585 btrfs_set_stack_chunk_length(chunk, chunk_size); 2586 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 2587 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 2588 btrfs_set_stack_chunk_type(chunk, map->type); 2589 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 2590 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 2591 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 2592 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); 2593 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 2594 2595 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2596 key.type = BTRFS_CHUNK_ITEM_KEY; 2597 key.offset = chunk_offset; 2598 2599 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 2600 BUG_ON(ret); 2601 2602 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2603 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, 2604 item_size); 2605 BUG_ON(ret); 2606 } 2607 2608 kfree(chunk); 2609 return 0; 2610 } 2611 2612 /* 2613 * Chunk allocation falls into two parts. The first part does the work 2614 * that makes the newly allocated chunk usable, but does not do anything 2615 * that modifies the chunk tree. The second part does the work that 2616 * requires modifying the chunk tree. This division is important for the 2617 * bootstrap process of adding storage to a seed btrfs. 2618 */ 2619 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2620 struct btrfs_root *extent_root, u64 type) 2621 { 2622 u64 chunk_offset; 2623 u64 chunk_size; 2624 u64 stripe_size; 2625 struct map_lookup *map; 2626 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 2627 int ret; 2628 2629 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2630 &chunk_offset); 2631 if (ret) 2632 return ret; 2633 2634 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 2635 &stripe_size, chunk_offset, type); 2636 if (ret) 2637 return ret; 2638 2639 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 2640 chunk_size, stripe_size); 2641 BUG_ON(ret); 2642 return 0; 2643 } 2644 2645 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 2646 struct btrfs_root *root, 2647 struct btrfs_device *device) 2648 { 2649 u64 chunk_offset; 2650 u64 sys_chunk_offset; 2651 u64 chunk_size; 2652 u64 sys_chunk_size; 2653 u64 stripe_size; 2654 u64 sys_stripe_size; 2655 u64 alloc_profile; 2656 struct map_lookup *map; 2657 struct map_lookup *sys_map; 2658 struct btrfs_fs_info *fs_info = root->fs_info; 2659 struct btrfs_root *extent_root = fs_info->extent_root; 2660 int ret; 2661 2662 ret = find_next_chunk(fs_info->chunk_root, 2663 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); 2664 BUG_ON(ret); 2665 2666 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 2667 (fs_info->metadata_alloc_profile & 2668 fs_info->avail_metadata_alloc_bits); 2669 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 2670 2671 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 2672 &stripe_size, chunk_offset, alloc_profile); 2673 BUG_ON(ret); 2674 2675 sys_chunk_offset = chunk_offset + chunk_size; 2676 2677 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 2678 (fs_info->system_alloc_profile & 2679 fs_info->avail_system_alloc_bits); 2680 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 2681 2682 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 2683 &sys_chunk_size, &sys_stripe_size, 2684 
sys_chunk_offset, alloc_profile); 2685 BUG_ON(ret); 2686 2687 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 2688 BUG_ON(ret); 2689 2690 /* 2691 * Modifying the chunk tree requires allocating new blocks from both 2692 * the system block group and the metadata block group, so we can only 2693 * do operations that modify the chunk tree after both 2694 * block groups have been created. 2695 */ 2696 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 2697 chunk_size, stripe_size); 2698 BUG_ON(ret); 2699 2700 ret = __finish_chunk_alloc(trans, extent_root, sys_map, 2701 sys_chunk_offset, sys_chunk_size, 2702 sys_stripe_size); 2703 BUG_ON(ret); 2704 return 0; 2705 } 2706 2707 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 2708 { 2709 struct extent_map *em; 2710 struct map_lookup *map; 2711 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 2712 int readonly = 0; 2713 int i; 2714 2715 read_lock(&map_tree->map_tree.lock); 2716 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2717 read_unlock(&map_tree->map_tree.lock); 2718 if (!em) 2719 return 1; 2720 2721 if (btrfs_test_opt(root, DEGRADED)) { 2722 free_extent_map(em); 2723 return 0; 2724 } 2725 2726 map = (struct map_lookup *)em->bdev; 2727 for (i = 0; i < map->num_stripes; i++) { 2728 if (!map->stripes[i].dev->writeable) { 2729 readonly = 1; 2730 break; 2731 } 2732 } 2733 free_extent_map(em); 2734 return readonly; 2735 } 2736 2737 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 2738 { 2739 extent_map_tree_init(&tree->map_tree); 2740 } 2741 2742 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 2743 { 2744 struct extent_map *em; 2745 2746 while (1) { 2747 write_lock(&tree->map_tree.lock); 2748 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 2749 if (em) 2750 remove_extent_mapping(&tree->map_tree, em); 2751 write_unlock(&tree->map_tree.lock); 2752 if (!em) 2753 break; 2754 kfree(em->bdev); 2755 /* once for us */ 2756 free_extent_map(em); 2757 /* once for the tree */ 2758 free_extent_map(em); 2759 } 2760 } 2761 2762 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 2763 { 2764 struct extent_map *em; 2765 struct map_lookup *map; 2766 struct extent_map_tree *em_tree = &map_tree->map_tree; 2767 int ret; 2768 2769 read_lock(&em_tree->lock); 2770 em = lookup_extent_mapping(em_tree, logical, len); 2771 read_unlock(&em_tree->lock); 2772 BUG_ON(!em); 2773 2774 BUG_ON(em->start > logical || em->start + em->len < logical); 2775 map = (struct map_lookup *)em->bdev; 2776 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 2777 ret = map->num_stripes; 2778 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 2779 ret = map->sub_stripes; 2780 else 2781 ret = 1; 2782 free_extent_map(em); 2783 return ret; 2784 } 2785 2786 static int find_live_mirror(struct map_lookup *map, int first, int num, 2787 int optimal) 2788 { 2789 int i; 2790 if (map->stripes[optimal].dev->bdev) 2791 return optimal; 2792 for (i = first; i < first + num; i++) { 2793 if (map->stripes[i].dev->bdev) 2794 return i; 2795 } 2796 /* we couldn't find one that doesn't fail. 
Just return something 2797 * and the io error handling code will clean up eventually 2798 */ 2799 return optimal; 2800 } 2801 2802 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2803 u64 logical, u64 *length, 2804 struct btrfs_multi_bio **multi_ret, 2805 int mirror_num) 2806 { 2807 struct extent_map *em; 2808 struct map_lookup *map; 2809 struct extent_map_tree *em_tree = &map_tree->map_tree; 2810 u64 offset; 2811 u64 stripe_offset; 2812 u64 stripe_end_offset; 2813 u64 stripe_nr; 2814 u64 stripe_nr_orig; 2815 u64 stripe_nr_end; 2816 int stripes_allocated = 8; 2817 int stripes_required = 1; 2818 int stripe_index; 2819 int i; 2820 int num_stripes; 2821 int max_errors = 0; 2822 struct btrfs_multi_bio *multi = NULL; 2823 2824 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) 2825 stripes_allocated = 1; 2826 again: 2827 if (multi_ret) { 2828 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), 2829 GFP_NOFS); 2830 if (!multi) 2831 return -ENOMEM; 2832 2833 atomic_set(&multi->error, 0); 2834 } 2835 2836 read_lock(&em_tree->lock); 2837 em = lookup_extent_mapping(em_tree, logical, *length); 2838 read_unlock(&em_tree->lock); 2839 2840 if (!em) { 2841 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2842 (unsigned long long)logical, 2843 (unsigned long long)*length); 2844 BUG(); 2845 } 2846 2847 BUG_ON(em->start > logical || em->start + em->len < logical); 2848 map = (struct map_lookup *)em->bdev; 2849 offset = logical - em->start; 2850 2851 if (mirror_num > map->num_stripes) 2852 mirror_num = 0; 2853 2854 /* if our multi bio struct is too small, back off and try again */ 2855 if (rw & REQ_WRITE) { 2856 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2857 BTRFS_BLOCK_GROUP_DUP)) { 2858 stripes_required = map->num_stripes; 2859 max_errors = 1; 2860 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2861 stripes_required = map->sub_stripes; 2862 max_errors = 1; 2863 } 2864 } 2865 if (rw & REQ_DISCARD) { 2866 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2867 BTRFS_BLOCK_GROUP_RAID1 | 2868 BTRFS_BLOCK_GROUP_DUP | 2869 BTRFS_BLOCK_GROUP_RAID10)) { 2870 stripes_required = map->num_stripes; 2871 } 2872 } 2873 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && 2874 stripes_allocated < stripes_required) { 2875 stripes_allocated = map->num_stripes; 2876 free_extent_map(em); 2877 kfree(multi); 2878 goto again; 2879 } 2880 stripe_nr = offset; 2881 /* 2882 * stripe_nr counts the total number of stripes we have to stride 2883 * to get to this block 2884 */ 2885 do_div(stripe_nr, map->stripe_len); 2886 2887 stripe_offset = stripe_nr * map->stripe_len; 2888 BUG_ON(offset < stripe_offset); 2889 2890 /* stripe_offset is the offset of this block in its stripe*/ 2891 stripe_offset = offset - stripe_offset; 2892 2893 if (rw & REQ_DISCARD) 2894 *length = min_t(u64, em->len - offset, *length); 2895 else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2896 BTRFS_BLOCK_GROUP_RAID1 | 2897 BTRFS_BLOCK_GROUP_RAID10 | 2898 BTRFS_BLOCK_GROUP_DUP)) { 2899 /* we limit the length of each bio to what fits in a stripe */ 2900 *length = min_t(u64, em->len - offset, 2901 map->stripe_len - stripe_offset); 2902 } else { 2903 *length = em->len - offset; 2904 } 2905 2906 if (!multi_ret) 2907 goto out; 2908 2909 num_stripes = 1; 2910 stripe_index = 0; 2911 stripe_nr_orig = stripe_nr; 2912 stripe_nr_end = (offset + *length + map->stripe_len - 1) & 2913 (~(map->stripe_len - 1)); 2914 do_div(stripe_nr_end, map->stripe_len); 2915 stripe_end_offset = stripe_nr_end * map->stripe_len - 2916 (offset + *length); 2917 
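/*
 * Illustrative example (made-up numbers): with a 64KiB stripe_len and a
 * RAID0 chunk over four devices, a block 300KiB into the chunk gives
 * stripe_nr = 4 and stripe_offset = 44KiB above.  The RAID0 branch below
 * then computes stripe_index = 4 % 4 = 0 and stripe_nr = 4 / 4 = 1, so the
 * data lives on map->stripes[0], at 1 * 64KiB + 44KiB past the start of
 * that device's extent.
 */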
if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 2918 if (rw & REQ_DISCARD) 2919 num_stripes = min_t(u64, map->num_stripes, 2920 stripe_nr_end - stripe_nr_orig); 2921 stripe_index = do_div(stripe_nr, map->num_stripes); 2922 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 2923 if (rw & (REQ_WRITE | REQ_DISCARD)) 2924 num_stripes = map->num_stripes; 2925 else if (mirror_num) 2926 stripe_index = mirror_num - 1; 2927 else { 2928 stripe_index = find_live_mirror(map, 0, 2929 map->num_stripes, 2930 current->pid % map->num_stripes); 2931 } 2932 2933 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 2934 if (rw & (REQ_WRITE | REQ_DISCARD)) 2935 num_stripes = map->num_stripes; 2936 else if (mirror_num) 2937 stripe_index = mirror_num - 1; 2938 2939 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2940 int factor = map->num_stripes / map->sub_stripes; 2941 2942 stripe_index = do_div(stripe_nr, factor); 2943 stripe_index *= map->sub_stripes; 2944 2945 if (rw & REQ_WRITE) 2946 num_stripes = map->sub_stripes; 2947 else if (rw & REQ_DISCARD) 2948 num_stripes = min_t(u64, map->sub_stripes * 2949 (stripe_nr_end - stripe_nr_orig), 2950 map->num_stripes); 2951 else if (mirror_num) 2952 stripe_index += mirror_num - 1; 2953 else { 2954 stripe_index = find_live_mirror(map, stripe_index, 2955 map->sub_stripes, stripe_index + 2956 current->pid % map->sub_stripes); 2957 } 2958 } else { 2959 /* 2960 * after this do_div call, stripe_nr is the number of stripes 2961 * on this device we have to walk to find the data, and 2962 * stripe_index is the number of our device in the stripe array 2963 */ 2964 stripe_index = do_div(stripe_nr, map->num_stripes); 2965 } 2966 BUG_ON(stripe_index >= map->num_stripes); 2967 2968 if (rw & REQ_DISCARD) { 2969 for (i = 0; i < num_stripes; i++) { 2970 multi->stripes[i].physical = 2971 map->stripes[stripe_index].physical + 2972 stripe_offset + stripe_nr * map->stripe_len; 2973 multi->stripes[i].dev = map->stripes[stripe_index].dev; 2974 2975 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 2976 u64 stripes; 2977 u32 last_stripe = 0; 2978 int j; 2979 2980 div_u64_rem(stripe_nr_end - 1, 2981 map->num_stripes, 2982 &last_stripe); 2983 2984 for (j = 0; j < map->num_stripes; j++) { 2985 u32 test; 2986 2987 div_u64_rem(stripe_nr_end - 1 - j, 2988 map->num_stripes, &test); 2989 if (test == stripe_index) 2990 break; 2991 } 2992 stripes = stripe_nr_end - 1 - j; 2993 do_div(stripes, map->num_stripes); 2994 multi->stripes[i].length = map->stripe_len * 2995 (stripes - stripe_nr + 1); 2996 2997 if (i == 0) { 2998 multi->stripes[i].length -= 2999 stripe_offset; 3000 stripe_offset = 0; 3001 } 3002 if (stripe_index == last_stripe) 3003 multi->stripes[i].length -= 3004 stripe_end_offset; 3005 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3006 u64 stripes; 3007 int j; 3008 int factor = map->num_stripes / 3009 map->sub_stripes; 3010 u32 last_stripe = 0; 3011 3012 div_u64_rem(stripe_nr_end - 1, 3013 factor, &last_stripe); 3014 last_stripe *= map->sub_stripes; 3015 3016 for (j = 0; j < factor; j++) { 3017 u32 test; 3018 3019 div_u64_rem(stripe_nr_end - 1 - j, 3020 factor, &test); 3021 3022 if (test == 3023 stripe_index / map->sub_stripes) 3024 break; 3025 } 3026 stripes = stripe_nr_end - 1 - j; 3027 do_div(stripes, factor); 3028 multi->stripes[i].length = map->stripe_len * 3029 (stripes - stripe_nr + 1); 3030 3031 if (i < map->sub_stripes) { 3032 multi->stripes[i].length -= 3033 stripe_offset; 3034 if (i == map->sub_stripes - 1) 3035 stripe_offset = 0; 3036 } 3037 if (stripe_index >= last_stripe && 3038 
stripe_index <= (last_stripe + 3039 map->sub_stripes - 1)) { 3040 multi->stripes[i].length -= 3041 stripe_end_offset; 3042 } 3043 } else 3044 multi->stripes[i].length = *length; 3045 3046 stripe_index++; 3047 if (stripe_index == map->num_stripes) { 3048 /* This could only happen for RAID0/10 */ 3049 stripe_index = 0; 3050 stripe_nr++; 3051 } 3052 } 3053 } else { 3054 for (i = 0; i < num_stripes; i++) { 3055 multi->stripes[i].physical = 3056 map->stripes[stripe_index].physical + 3057 stripe_offset + 3058 stripe_nr * map->stripe_len; 3059 multi->stripes[i].dev = 3060 map->stripes[stripe_index].dev; 3061 stripe_index++; 3062 } 3063 } 3064 if (multi_ret) { 3065 *multi_ret = multi; 3066 multi->num_stripes = num_stripes; 3067 multi->max_errors = max_errors; 3068 } 3069 out: 3070 free_extent_map(em); 3071 return 0; 3072 } 3073 3074 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3075 u64 logical, u64 *length, 3076 struct btrfs_multi_bio **multi_ret, int mirror_num) 3077 { 3078 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3079 mirror_num); 3080 } 3081 3082 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 3083 u64 chunk_start, u64 physical, u64 devid, 3084 u64 **logical, int *naddrs, int *stripe_len) 3085 { 3086 struct extent_map_tree *em_tree = &map_tree->map_tree; 3087 struct extent_map *em; 3088 struct map_lookup *map; 3089 u64 *buf; 3090 u64 bytenr; 3091 u64 length; 3092 u64 stripe_nr; 3093 int i, j, nr = 0; 3094 3095 read_lock(&em_tree->lock); 3096 em = lookup_extent_mapping(em_tree, chunk_start, 1); 3097 read_unlock(&em_tree->lock); 3098 3099 BUG_ON(!em || em->start != chunk_start); 3100 map = (struct map_lookup *)em->bdev; 3101 3102 length = em->len; 3103 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 3104 do_div(length, map->num_stripes / map->sub_stripes); 3105 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 3106 do_div(length, map->num_stripes); 3107 3108 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 3109 BUG_ON(!buf); 3110 3111 for (i = 0; i < map->num_stripes; i++) { 3112 if (devid && map->stripes[i].dev->devid != devid) 3113 continue; 3114 if (map->stripes[i].physical > physical || 3115 map->stripes[i].physical + length <= physical) 3116 continue; 3117 3118 stripe_nr = physical - map->stripes[i].physical; 3119 do_div(stripe_nr, map->stripe_len); 3120 3121 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3122 stripe_nr = stripe_nr * map->num_stripes + i; 3123 do_div(stripe_nr, map->sub_stripes); 3124 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3125 stripe_nr = stripe_nr * map->num_stripes + i; 3126 } 3127 bytenr = chunk_start + stripe_nr * map->stripe_len; 3128 WARN_ON(nr >= map->num_stripes); 3129 for (j = 0; j < nr; j++) { 3130 if (buf[j] == bytenr) 3131 break; 3132 } 3133 if (j == nr) { 3134 WARN_ON(nr >= map->num_stripes); 3135 buf[nr++] = bytenr; 3136 } 3137 } 3138 3139 *logical = buf; 3140 *naddrs = nr; 3141 *stripe_len = map->stripe_len; 3142 3143 free_extent_map(em); 3144 return 0; 3145 } 3146 3147 static void end_bio_multi_stripe(struct bio *bio, int err) 3148 { 3149 struct btrfs_multi_bio *multi = bio->bi_private; 3150 int is_orig_bio = 0; 3151 3152 if (err) 3153 atomic_inc(&multi->error); 3154 3155 if (bio == multi->orig_bio) 3156 is_orig_bio = 1; 3157 3158 if (atomic_dec_and_test(&multi->stripes_pending)) { 3159 if (!is_orig_bio) { 3160 bio_put(bio); 3161 bio = multi->orig_bio; 3162 } 3163 bio->bi_private = multi->private; 3164 bio->bi_end_io = multi->end_io; 3165 /* only send an error to the higher layers if it is 3166 * 
beyond the tolerance of the multi-bio 3167 */ 3168 if (atomic_read(&multi->error) > multi->max_errors) { 3169 err = -EIO; 3170 } else if (err) { 3171 /* 3172 * this bio is actually up to date, we didn't 3173 * go over the max number of errors 3174 */ 3175 set_bit(BIO_UPTODATE, &bio->bi_flags); 3176 err = 0; 3177 } 3178 kfree(multi); 3179 3180 bio_endio(bio, err); 3181 } else if (!is_orig_bio) { 3182 bio_put(bio); 3183 } 3184 } 3185 3186 struct async_sched { 3187 struct bio *bio; 3188 int rw; 3189 struct btrfs_fs_info *info; 3190 struct btrfs_work work; 3191 }; 3192 3193 /* 3194 * see run_scheduled_bios for a description of why bios are collected for 3195 * async submit. 3196 * 3197 * This will add one bio to the pending list for a device and make sure 3198 * the work struct is scheduled. 3199 */ 3200 static noinline int schedule_bio(struct btrfs_root *root, 3201 struct btrfs_device *device, 3202 int rw, struct bio *bio) 3203 { 3204 int should_queue = 1; 3205 struct btrfs_pending_bios *pending_bios; 3206 3207 /* don't bother with additional async steps for reads, right now */ 3208 if (!(rw & REQ_WRITE)) { 3209 bio_get(bio); 3210 submit_bio(rw, bio); 3211 bio_put(bio); 3212 return 0; 3213 } 3214 3215 /* 3216 * nr_async_bios allows us to reliably return congestion to the 3217 * higher layers. Otherwise, the async bio makes it appear we have 3218 * made progress against dirty pages when we've really just put it 3219 * on a queue for later 3220 */ 3221 atomic_inc(&root->fs_info->nr_async_bios); 3222 WARN_ON(bio->bi_next); 3223 bio->bi_next = NULL; 3224 bio->bi_rw |= rw; 3225 3226 spin_lock(&device->io_lock); 3227 if (bio->bi_rw & REQ_SYNC) 3228 pending_bios = &device->pending_sync_bios; 3229 else 3230 pending_bios = &device->pending_bios; 3231 3232 if (pending_bios->tail) 3233 pending_bios->tail->bi_next = bio; 3234 3235 pending_bios->tail = bio; 3236 if (!pending_bios->head) 3237 pending_bios->head = bio; 3238 if (device->running_pending) 3239 should_queue = 0; 3240 3241 spin_unlock(&device->io_lock); 3242 3243 if (should_queue) 3244 btrfs_queue_worker(&root->fs_info->submit_workers, 3245 &device->work); 3246 return 0; 3247 } 3248 3249 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 3250 int mirror_num, int async_submit) 3251 { 3252 struct btrfs_mapping_tree *map_tree; 3253 struct btrfs_device *dev; 3254 struct bio *first_bio = bio; 3255 u64 logical = (u64)bio->bi_sector << 9; 3256 u64 length = 0; 3257 u64 map_length; 3258 struct btrfs_multi_bio *multi = NULL; 3259 int ret; 3260 int dev_nr = 0; 3261 int total_devs = 1; 3262 3263 length = bio->bi_size; 3264 map_tree = &root->fs_info->mapping_tree; 3265 map_length = length; 3266 3267 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, 3268 mirror_num); 3269 BUG_ON(ret); 3270 3271 total_devs = multi->num_stripes; 3272 if (map_length < length) { 3273 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 3274 "len %llu\n", (unsigned long long)logical, 3275 (unsigned long long)length, 3276 (unsigned long long)map_length); 3277 BUG(); 3278 } 3279 multi->end_io = first_bio->bi_end_io; 3280 multi->private = first_bio->bi_private; 3281 multi->orig_bio = first_bio; 3282 atomic_set(&multi->stripes_pending, multi->num_stripes); 3283 3284 while (dev_nr < total_devs) { 3285 if (total_devs > 1) { 3286 if (dev_nr < total_devs - 1) { 3287 bio = bio_clone(first_bio, GFP_NOFS); 3288 BUG_ON(!bio); 3289 } else { 3290 bio = first_bio; 3291 } 3292 bio->bi_private = multi; 3293 bio->bi_end_io = end_bio_multi_stripe; 3294 } 3295 
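/*
 * point this bio at its stripe: bi_sector is the stripe's physical byte
 * offset on the target device, converted to 512-byte sectors
 */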
bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3296 dev = multi->stripes[dev_nr].dev; 3297 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { 3298 bio->bi_bdev = dev->bdev; 3299 if (async_submit) 3300 schedule_bio(root, dev, rw, bio); 3301 else 3302 submit_bio(rw, bio); 3303 } else { 3304 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; 3305 bio->bi_sector = logical >> 9; 3306 bio_endio(bio, -EIO); 3307 } 3308 dev_nr++; 3309 } 3310 if (total_devs == 1) 3311 kfree(multi); 3312 return 0; 3313 } 3314 3315 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 3316 u8 *uuid, u8 *fsid) 3317 { 3318 struct btrfs_device *device; 3319 struct btrfs_fs_devices *cur_devices; 3320 3321 cur_devices = root->fs_info->fs_devices; 3322 while (cur_devices) { 3323 if (!fsid || 3324 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 3325 device = __find_device(&cur_devices->devices, 3326 devid, uuid); 3327 if (device) 3328 return device; 3329 } 3330 cur_devices = cur_devices->seed; 3331 } 3332 return NULL; 3333 } 3334 3335 static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 3336 u64 devid, u8 *dev_uuid) 3337 { 3338 struct btrfs_device *device; 3339 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 3340 3341 device = kzalloc(sizeof(*device), GFP_NOFS); 3342 if (!device) 3343 return NULL; 3344 list_add(&device->dev_list, 3345 &fs_devices->devices); 3346 device->dev_root = root->fs_info->dev_root; 3347 device->devid = devid; 3348 device->work.func = pending_bios_fn; 3349 device->fs_devices = fs_devices; 3350 device->missing = 1; 3351 fs_devices->num_devices++; 3352 fs_devices->missing_devices++; 3353 spin_lock_init(&device->io_lock); 3354 INIT_LIST_HEAD(&device->dev_alloc_list); 3355 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 3356 return device; 3357 } 3358 3359 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, 3360 struct extent_buffer *leaf, 3361 struct btrfs_chunk *chunk) 3362 { 3363 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 3364 struct map_lookup *map; 3365 struct extent_map *em; 3366 u64 logical; 3367 u64 length; 3368 u64 devid; 3369 u8 uuid[BTRFS_UUID_SIZE]; 3370 int num_stripes; 3371 int ret; 3372 int i; 3373 3374 logical = key->offset; 3375 length = btrfs_chunk_length(leaf, chunk); 3376 3377 read_lock(&map_tree->map_tree.lock); 3378 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 3379 read_unlock(&map_tree->map_tree.lock); 3380 3381 /* already mapped? 
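If an existing mapping already covers this logical address, the chunk was read earlier (e.g. from the sys_chunk_array) and there is nothing left to do.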
*/ 3382 if (em && em->start <= logical && em->start + em->len > logical) { 3383 free_extent_map(em); 3384 return 0; 3385 } else if (em) { 3386 free_extent_map(em); 3387 } 3388 3389 em = alloc_extent_map(); 3390 if (!em) 3391 return -ENOMEM; 3392 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3393 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3394 if (!map) { 3395 free_extent_map(em); 3396 return -ENOMEM; 3397 } 3398 3399 em->bdev = (struct block_device *)map; 3400 em->start = logical; 3401 em->len = length; 3402 em->block_start = 0; 3403 em->block_len = em->len; 3404 3405 map->num_stripes = num_stripes; 3406 map->io_width = btrfs_chunk_io_width(leaf, chunk); 3407 map->io_align = btrfs_chunk_io_align(leaf, chunk); 3408 map->sector_size = btrfs_chunk_sector_size(leaf, chunk); 3409 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 3410 map->type = btrfs_chunk_type(leaf, chunk); 3411 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 3412 for (i = 0; i < num_stripes; i++) { 3413 map->stripes[i].physical = 3414 btrfs_stripe_offset_nr(leaf, chunk, i); 3415 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 3416 read_extent_buffer(leaf, uuid, (unsigned long) 3417 btrfs_stripe_dev_uuid_nr(chunk, i), 3418 BTRFS_UUID_SIZE); 3419 map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 3420 NULL); 3421 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 3422 kfree(map); 3423 free_extent_map(em); 3424 return -EIO; 3425 } 3426 if (!map->stripes[i].dev) { 3427 map->stripes[i].dev = 3428 add_missing_dev(root, devid, uuid); 3429 if (!map->stripes[i].dev) { 3430 kfree(map); 3431 free_extent_map(em); 3432 return -EIO; 3433 } 3434 } 3435 map->stripes[i].dev->in_fs_metadata = 1; 3436 } 3437 3438 write_lock(&map_tree->map_tree.lock); 3439 ret = add_extent_mapping(&map_tree->map_tree, em); 3440 write_unlock(&map_tree->map_tree.lock); 3441 BUG_ON(ret); 3442 free_extent_map(em); 3443 3444 return 0; 3445 } 3446 3447 static int fill_device_from_item(struct extent_buffer *leaf, 3448 struct btrfs_dev_item *dev_item, 3449 struct btrfs_device *device) 3450 { 3451 unsigned long ptr; 3452 3453 device->devid = btrfs_device_id(leaf, dev_item); 3454 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 3455 device->total_bytes = device->disk_total_bytes; 3456 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 3457 device->type = btrfs_device_type(leaf, dev_item); 3458 device->io_align = btrfs_device_io_align(leaf, dev_item); 3459 device->io_width = btrfs_device_io_width(leaf, dev_item); 3460 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 3461 3462 ptr = (unsigned long)btrfs_device_uuid(dev_item); 3463 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 3464 3465 return 0; 3466 } 3467 3468 static int open_seed_devices(struct btrfs_root *root, u8 *fsid) 3469 { 3470 struct btrfs_fs_devices *fs_devices; 3471 int ret; 3472 3473 mutex_lock(&uuid_mutex); 3474 3475 fs_devices = root->fs_info->fs_devices->seed; 3476 while (fs_devices) { 3477 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 3478 ret = 0; 3479 goto out; 3480 } 3481 fs_devices = fs_devices->seed; 3482 } 3483 3484 fs_devices = find_fsid(fsid); 3485 if (!fs_devices) { 3486 ret = -ENOENT; 3487 goto out; 3488 } 3489 3490 fs_devices = clone_fs_devices(fs_devices); 3491 if (IS_ERR(fs_devices)) { 3492 ret = PTR_ERR(fs_devices); 3493 goto out; 3494 } 3495 3496 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 3497 root->fs_info->bdev_holder); 3498 if (ret) 3499 goto out; 3500 3501 if 
(!fs_devices->seeding) { 3502 __btrfs_close_devices(fs_devices); 3503 free_fs_devices(fs_devices); 3504 ret = -EINVAL; 3505 goto out; 3506 } 3507 3508 fs_devices->seed = root->fs_info->fs_devices->seed; 3509 root->fs_info->fs_devices->seed = fs_devices; 3510 out: 3511 mutex_unlock(&uuid_mutex); 3512 return ret; 3513 } 3514 3515 static int read_one_dev(struct btrfs_root *root, 3516 struct extent_buffer *leaf, 3517 struct btrfs_dev_item *dev_item) 3518 { 3519 struct btrfs_device *device; 3520 u64 devid; 3521 int ret; 3522 u8 fs_uuid[BTRFS_UUID_SIZE]; 3523 u8 dev_uuid[BTRFS_UUID_SIZE]; 3524 3525 devid = btrfs_device_id(leaf, dev_item); 3526 read_extent_buffer(leaf, dev_uuid, 3527 (unsigned long)btrfs_device_uuid(dev_item), 3528 BTRFS_UUID_SIZE); 3529 read_extent_buffer(leaf, fs_uuid, 3530 (unsigned long)btrfs_device_fsid(dev_item), 3531 BTRFS_UUID_SIZE); 3532 3533 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 3534 ret = open_seed_devices(root, fs_uuid); 3535 if (ret && !btrfs_test_opt(root, DEGRADED)) 3536 return ret; 3537 } 3538 3539 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 3540 if (!device || !device->bdev) { 3541 if (!btrfs_test_opt(root, DEGRADED)) 3542 return -EIO; 3543 3544 if (!device) { 3545 printk(KERN_WARNING "warning devid %llu missing\n", 3546 (unsigned long long)devid); 3547 device = add_missing_dev(root, devid, dev_uuid); 3548 if (!device) 3549 return -ENOMEM; 3550 } else if (!device->missing) { 3551 /* 3552 * this happens when a device that was properly setup 3553 * in the device info lists suddenly goes bad. 3554 * device->bdev is NULL, and so we have to set 3555 * device->missing to one here 3556 */ 3557 root->fs_info->fs_devices->missing_devices++; 3558 device->missing = 1; 3559 } 3560 } 3561 3562 if (device->fs_devices != root->fs_info->fs_devices) { 3563 BUG_ON(device->writeable); 3564 if (device->generation != 3565 btrfs_device_generation(leaf, dev_item)) 3566 return -EINVAL; 3567 } 3568 3569 fill_device_from_item(leaf, dev_item, device); 3570 device->dev_root = root->fs_info->dev_root; 3571 device->in_fs_metadata = 1; 3572 if (device->writeable) 3573 device->fs_devices->total_rw_bytes += device->total_bytes; 3574 ret = 0; 3575 return ret; 3576 } 3577 3578 int btrfs_read_sys_array(struct btrfs_root *root) 3579 { 3580 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 3581 struct extent_buffer *sb; 3582 struct btrfs_disk_key *disk_key; 3583 struct btrfs_chunk *chunk; 3584 u8 *ptr; 3585 unsigned long sb_ptr; 3586 int ret = 0; 3587 u32 num_stripes; 3588 u32 array_size; 3589 u32 len = 0; 3590 u32 cur; 3591 struct btrfs_key key; 3592 3593 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, 3594 BTRFS_SUPER_INFO_SIZE); 3595 if (!sb) 3596 return -ENOMEM; 3597 btrfs_set_buffer_uptodate(sb); 3598 btrfs_set_buffer_lockdep_class(sb, 0); 3599 3600 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 3601 array_size = btrfs_super_sys_array_size(super_copy); 3602 3603 ptr = super_copy->sys_chunk_array; 3604 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); 3605 cur = 0; 3606 3607 while (cur < array_size) { 3608 disk_key = (struct btrfs_disk_key *)ptr; 3609 btrfs_disk_key_to_cpu(&key, disk_key); 3610 3611 len = sizeof(*disk_key); ptr += len; 3612 sb_ptr += len; 3613 cur += len; 3614 3615 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 3616 chunk = (struct btrfs_chunk *)sb_ptr; 3617 ret = read_one_chunk(root, &key, sb, chunk); 3618 if (ret) 3619 break; 3620 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 3621 
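/*
 * the sys_chunk_array is a packed sequence of (disk key, chunk item)
 * pairs, and a chunk item's size depends on its stripe count, so the
 * length of this entry can only be computed once num_stripes is known
 */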
len = btrfs_chunk_item_size(num_stripes); 3622 } else { 3623 ret = -EIO; 3624 break; 3625 } 3626 ptr += len; 3627 sb_ptr += len; 3628 cur += len; 3629 } 3630 free_extent_buffer(sb); 3631 return ret; 3632 } 3633 3634 int btrfs_read_chunk_tree(struct btrfs_root *root) 3635 { 3636 struct btrfs_path *path; 3637 struct extent_buffer *leaf; 3638 struct btrfs_key key; 3639 struct btrfs_key found_key; 3640 int ret; 3641 int slot; 3642 3643 root = root->fs_info->chunk_root; 3644 3645 path = btrfs_alloc_path(); 3646 if (!path) 3647 return -ENOMEM; 3648 3649 /* first we search for all of the device items, and then we 3650 * read in all of the chunk items. This way we can create chunk 3651 * mappings that reference all of the devices that are found 3652 */ 3653 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 3654 key.offset = 0; 3655 key.type = 0; 3656 again: 3657 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3658 if (ret < 0) 3659 goto error; 3660 while (1) { 3661 leaf = path->nodes[0]; 3662 slot = path->slots[0]; 3663 if (slot >= btrfs_header_nritems(leaf)) { 3664 ret = btrfs_next_leaf(root, path); 3665 if (ret == 0) 3666 continue; 3667 if (ret < 0) 3668 goto error; 3669 break; 3670 } 3671 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3672 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3673 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) 3674 break; 3675 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 3676 struct btrfs_dev_item *dev_item; 3677 dev_item = btrfs_item_ptr(leaf, slot, 3678 struct btrfs_dev_item); 3679 ret = read_one_dev(root, leaf, dev_item); 3680 if (ret) 3681 goto error; 3682 } 3683 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 3684 struct btrfs_chunk *chunk; 3685 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3686 ret = read_one_chunk(root, &found_key, leaf, chunk); 3687 if (ret) 3688 goto error; 3689 } 3690 path->slots[0]++; 3691 } 3692 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3693 key.objectid = 0; 3694 btrfs_release_path(path); 3695 goto again; 3696 } 3697 ret = 0; 3698 error: 3699 btrfs_free_path(path); 3700 return ret; 3701 } 3702