1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 #include <linux/sched.h> 19 #include <linux/bio.h> 20 #include <linux/slab.h> 21 #include <linux/buffer_head.h> 22 #include <linux/blkdev.h> 23 #include <linux/random.h> 24 #include <linux/iocontext.h> 25 #include <linux/capability.h> 26 #include <asm/div64.h> 27 #include "compat.h" 28 #include "ctree.h" 29 #include "extent_map.h" 30 #include "disk-io.h" 31 #include "transaction.h" 32 #include "print-tree.h" 33 #include "volumes.h" 34 #include "async-thread.h" 35 36 static int init_first_rw_device(struct btrfs_trans_handle *trans, 37 struct btrfs_root *root, 38 struct btrfs_device *device); 39 static int btrfs_relocate_sys_chunks(struct btrfs_root *root); 40 41 #define map_lookup_size(n) (sizeof(struct map_lookup) + \ 42 (sizeof(struct btrfs_bio_stripe) * (n))) 43 44 static DEFINE_MUTEX(uuid_mutex); 45 static LIST_HEAD(fs_uuids); 46 47 void btrfs_lock_volumes(void) 48 { 49 mutex_lock(&uuid_mutex); 50 } 51 52 void btrfs_unlock_volumes(void) 53 { 54 mutex_unlock(&uuid_mutex); 55 } 56 57 static void lock_chunks(struct btrfs_root *root) 58 { 59 mutex_lock(&root->fs_info->chunk_mutex); 60 } 61 62 static void unlock_chunks(struct btrfs_root *root) 63 { 64 mutex_unlock(&root->fs_info->chunk_mutex); 65 } 66 67 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 68 { 69 struct btrfs_device *device; 70 WARN_ON(fs_devices->opened); 71 while (!list_empty(&fs_devices->devices)) { 72 device = list_entry(fs_devices->devices.next, 73 struct btrfs_device, dev_list); 74 list_del(&device->dev_list); 75 kfree(device->name); 76 kfree(device); 77 } 78 kfree(fs_devices); 79 } 80 81 int btrfs_cleanup_fs_uuids(void) 82 { 83 struct btrfs_fs_devices *fs_devices; 84 85 while (!list_empty(&fs_uuids)) { 86 fs_devices = list_entry(fs_uuids.next, 87 struct btrfs_fs_devices, list); 88 list_del(&fs_devices->list); 89 free_fs_devices(fs_devices); 90 } 91 return 0; 92 } 93 94 static noinline struct btrfs_device *__find_device(struct list_head *head, 95 u64 devid, u8 *uuid) 96 { 97 struct btrfs_device *dev; 98 99 list_for_each_entry(dev, head, dev_list) { 100 if (dev->devid == devid && 101 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 102 return dev; 103 } 104 } 105 return NULL; 106 } 107 108 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 109 { 110 struct btrfs_fs_devices *fs_devices; 111 112 list_for_each_entry(fs_devices, &fs_uuids, list) { 113 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 114 return fs_devices; 115 } 116 return NULL; 117 } 118 119 static void requeue_list(struct btrfs_pending_bios *pending_bios, 120 struct bio *head, struct bio *tail) 121 { 122 123 struct bio *old_head; 124 125 old_head = pending_bios->head; 126 pending_bios->head = head; 127 if (pending_bios->tail) 128 tail->bi_next = old_head; 129 else 130 pending_bios->tail = tail; 
131 } 132 133 /* 134 * we try to collect pending bios for a device so we don't get a large 135 * number of procs sending bios down to the same device. This greatly 136 * improves the scheduler's ability to collect and merge the bios. 137 * 138 * But, it also turns into a long list of bios to process and that is sure 139 * to eventually make the worker thread block. The solution here is to 140 * make some progress and then put this work struct back at the end of 141 * the list if the block device is congested. This way, multiple devices 142 * can make progress from a single worker thread. 143 */ 144 static noinline int run_scheduled_bios(struct btrfs_device *device) 145 { 146 struct bio *pending; 147 struct backing_dev_info *bdi; 148 struct btrfs_fs_info *fs_info; 149 struct btrfs_pending_bios *pending_bios; 150 struct bio *tail; 151 struct bio *cur; 152 int again = 0; 153 unsigned long num_run; 154 unsigned long batch_run = 0; 155 unsigned long limit; 156 unsigned long last_waited = 0; 157 int force_reg = 0; 158 struct blk_plug plug; 159 160 /* 161 * this function runs all the bios we've collected for 162 * a particular device. We don't want to wander off to 163 * another device without first sending all of these down. 164 * So, set up a plug here and finish it off before we return 165 */ 166 blk_start_plug(&plug); 167 168 bdi = blk_get_backing_dev_info(device->bdev); 169 fs_info = device->dev_root->fs_info; 170 limit = btrfs_async_submit_limit(fs_info); 171 limit = limit * 2 / 3; 172 173 loop: 174 spin_lock(&device->io_lock); 175 176 loop_lock: 177 num_run = 0; 178 179 /* take all the bios off the list at once and process them 180 * later on (without the lock held). But, remember the 181 * tail and other pointers so the bios can be properly reinserted 182 * into the list if we hit congestion 183 */ 184 if (!force_reg && device->pending_sync_bios.head) { 185 pending_bios = &device->pending_sync_bios; 186 force_reg = 1; 187 } else { 188 pending_bios = &device->pending_bios; 189 force_reg = 0; 190 } 191 192 pending = pending_bios->head; 193 tail = pending_bios->tail; 194 WARN_ON(pending && !tail); 195 196 /* 197 * if pending was null this time around, no bios need processing 198 * at all and we can stop. Otherwise it'll loop back up again 199 * and do an additional check so no bios are missed. 200 * 201 * device->running_pending is used to synchronize with the 202 * schedule_bio code.
203 */ 204 if (device->pending_sync_bios.head == NULL && 205 device->pending_bios.head == NULL) { 206 again = 0; 207 device->running_pending = 0; 208 } else { 209 again = 1; 210 device->running_pending = 1; 211 } 212 213 pending_bios->head = NULL; 214 pending_bios->tail = NULL; 215 216 spin_unlock(&device->io_lock); 217 218 while (pending) { 219 220 rmb(); 221 /* we want to work on both lists, but do more bios on the 222 * sync list than the regular list 223 */ 224 if ((num_run > 32 && 225 pending_bios != &device->pending_sync_bios && 226 device->pending_sync_bios.head) || 227 (num_run > 64 && pending_bios == &device->pending_sync_bios && 228 device->pending_bios.head)) { 229 spin_lock(&device->io_lock); 230 requeue_list(pending_bios, pending, tail); 231 goto loop_lock; 232 } 233 234 cur = pending; 235 pending = pending->bi_next; 236 cur->bi_next = NULL; 237 atomic_dec(&fs_info->nr_async_bios); 238 239 if (atomic_read(&fs_info->nr_async_bios) < limit && 240 waitqueue_active(&fs_info->async_submit_wait)) 241 wake_up(&fs_info->async_submit_wait); 242 243 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 244 245 submit_bio(cur->bi_rw, cur); 246 num_run++; 247 batch_run++; 248 if (need_resched()) 249 cond_resched(); 250 251 /* 252 * we made progress, there is more work to do and the bdi 253 * is now congested. Back off and let other work structs 254 * run instead 255 */ 256 if (pending && bdi_write_congested(bdi) && batch_run > 8 && 257 fs_info->fs_devices->open_devices > 1) { 258 struct io_context *ioc; 259 260 ioc = current->io_context; 261 262 /* 263 * the main goal here is that we don't want to 264 * block if we're going to be able to submit 265 * more requests without blocking. 266 * 267 * This code does two great things, it pokes into 268 * the elevator code from a filesystem _and_ 269 * it makes assumptions about how batching works. 270 */ 271 if (ioc && ioc->nr_batch_requests > 0 && 272 time_before(jiffies, ioc->last_waited + HZ/50UL) && 273 (last_waited == 0 || 274 ioc->last_waited == last_waited)) { 275 /* 276 * we want to go through our batch of 277 * requests and stop. 
So, we copy out 278 * the ioc->last_waited time and test 279 * against it before looping 280 */ 281 last_waited = ioc->last_waited; 282 if (need_resched()) 283 cond_resched(); 284 continue; 285 } 286 spin_lock(&device->io_lock); 287 requeue_list(pending_bios, pending, tail); 288 device->running_pending = 1; 289 290 spin_unlock(&device->io_lock); 291 btrfs_requeue_work(&device->work); 292 goto done; 293 } 294 } 295 296 cond_resched(); 297 if (again) 298 goto loop; 299 300 spin_lock(&device->io_lock); 301 if (device->pending_bios.head || device->pending_sync_bios.head) 302 goto loop_lock; 303 spin_unlock(&device->io_lock); 304 305 done: 306 blk_finish_plug(&plug); 307 return 0; 308 } 309 310 static void pending_bios_fn(struct btrfs_work *work) 311 { 312 struct btrfs_device *device; 313 314 device = container_of(work, struct btrfs_device, work); 315 run_scheduled_bios(device); 316 } 317 318 static noinline int device_list_add(const char *path, 319 struct btrfs_super_block *disk_super, 320 u64 devid, struct btrfs_fs_devices **fs_devices_ret) 321 { 322 struct btrfs_device *device; 323 struct btrfs_fs_devices *fs_devices; 324 u64 found_transid = btrfs_super_generation(disk_super); 325 char *name; 326 327 fs_devices = find_fsid(disk_super->fsid); 328 if (!fs_devices) { 329 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 330 if (!fs_devices) 331 return -ENOMEM; 332 INIT_LIST_HEAD(&fs_devices->devices); 333 INIT_LIST_HEAD(&fs_devices->alloc_list); 334 list_add(&fs_devices->list, &fs_uuids); 335 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 336 fs_devices->latest_devid = devid; 337 fs_devices->latest_trans = found_transid; 338 mutex_init(&fs_devices->device_list_mutex); 339 device = NULL; 340 } else { 341 device = __find_device(&fs_devices->devices, devid, 342 disk_super->dev_item.uuid); 343 } 344 if (!device) { 345 if (fs_devices->opened) 346 return -EBUSY; 347 348 device = kzalloc(sizeof(*device), GFP_NOFS); 349 if (!device) { 350 /* we can safely leave the fs_devices entry around */ 351 return -ENOMEM; 352 } 353 device->devid = devid; 354 device->work.func = pending_bios_fn; 355 memcpy(device->uuid, disk_super->dev_item.uuid, 356 BTRFS_UUID_SIZE); 357 spin_lock_init(&device->io_lock); 358 device->name = kstrdup(path, GFP_NOFS); 359 if (!device->name) { 360 kfree(device); 361 return -ENOMEM; 362 } 363 INIT_LIST_HEAD(&device->dev_alloc_list); 364 365 mutex_lock(&fs_devices->device_list_mutex); 366 list_add(&device->dev_list, &fs_devices->devices); 367 mutex_unlock(&fs_devices->device_list_mutex); 368 369 device->fs_devices = fs_devices; 370 fs_devices->num_devices++; 371 } else if (!device->name || strcmp(device->name, path)) { 372 name = kstrdup(path, GFP_NOFS); 373 if (!name) 374 return -ENOMEM; 375 kfree(device->name); 376 device->name = name; 377 if (device->missing) { 378 fs_devices->missing_devices--; 379 device->missing = 0; 380 } 381 } 382 383 if (found_transid > fs_devices->latest_trans) { 384 fs_devices->latest_devid = devid; 385 fs_devices->latest_trans = found_transid; 386 } 387 *fs_devices_ret = fs_devices; 388 return 0; 389 } 390 391 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 392 { 393 struct btrfs_fs_devices *fs_devices; 394 struct btrfs_device *device; 395 struct btrfs_device *orig_dev; 396 397 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 398 if (!fs_devices) 399 return ERR_PTR(-ENOMEM); 400 401 INIT_LIST_HEAD(&fs_devices->devices); 402 INIT_LIST_HEAD(&fs_devices->alloc_list); 403 INIT_LIST_HEAD(&fs_devices->list); 404 
mutex_init(&fs_devices->device_list_mutex); 405 fs_devices->latest_devid = orig->latest_devid; 406 fs_devices->latest_trans = orig->latest_trans; 407 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 408 409 mutex_lock(&orig->device_list_mutex); 410 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 411 device = kzalloc(sizeof(*device), GFP_NOFS); 412 if (!device) 413 goto error; 414 415 device->name = kstrdup(orig_dev->name, GFP_NOFS); 416 if (!device->name) { 417 kfree(device); 418 goto error; 419 } 420 421 device->devid = orig_dev->devid; 422 device->work.func = pending_bios_fn; 423 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 424 spin_lock_init(&device->io_lock); 425 INIT_LIST_HEAD(&device->dev_list); 426 INIT_LIST_HEAD(&device->dev_alloc_list); 427 428 list_add(&device->dev_list, &fs_devices->devices); 429 device->fs_devices = fs_devices; 430 fs_devices->num_devices++; 431 } 432 mutex_unlock(&orig->device_list_mutex); 433 return fs_devices; 434 error: 435 mutex_unlock(&orig->device_list_mutex); 436 free_fs_devices(fs_devices); 437 return ERR_PTR(-ENOMEM); 438 } 439 440 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 441 { 442 struct btrfs_device *device, *next; 443 444 mutex_lock(&uuid_mutex); 445 again: 446 mutex_lock(&fs_devices->device_list_mutex); 447 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 448 if (device->in_fs_metadata) 449 continue; 450 451 if (device->bdev) { 452 blkdev_put(device->bdev, device->mode); 453 device->bdev = NULL; 454 fs_devices->open_devices--; 455 } 456 if (device->writeable) { 457 list_del_init(&device->dev_alloc_list); 458 device->writeable = 0; 459 fs_devices->rw_devices--; 460 } 461 list_del_init(&device->dev_list); 462 fs_devices->num_devices--; 463 kfree(device->name); 464 kfree(device); 465 } 466 mutex_unlock(&fs_devices->device_list_mutex); 467 468 if (fs_devices->seed) { 469 fs_devices = fs_devices->seed; 470 goto again; 471 } 472 473 mutex_unlock(&uuid_mutex); 474 return 0; 475 } 476 477 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 478 { 479 struct btrfs_device *device; 480 481 if (--fs_devices->opened > 0) 482 return 0; 483 484 list_for_each_entry(device, &fs_devices->devices, dev_list) { 485 if (device->bdev) { 486 blkdev_put(device->bdev, device->mode); 487 fs_devices->open_devices--; 488 } 489 if (device->writeable) { 490 list_del_init(&device->dev_alloc_list); 491 fs_devices->rw_devices--; 492 } 493 494 device->bdev = NULL; 495 device->writeable = 0; 496 device->in_fs_metadata = 0; 497 } 498 WARN_ON(fs_devices->open_devices); 499 WARN_ON(fs_devices->rw_devices); 500 fs_devices->opened = 0; 501 fs_devices->seeding = 0; 502 503 return 0; 504 } 505 506 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 507 { 508 struct btrfs_fs_devices *seed_devices = NULL; 509 int ret; 510 511 mutex_lock(&uuid_mutex); 512 ret = __btrfs_close_devices(fs_devices); 513 if (!fs_devices->opened) { 514 seed_devices = fs_devices->seed; 515 fs_devices->seed = NULL; 516 } 517 mutex_unlock(&uuid_mutex); 518 519 while (seed_devices) { 520 fs_devices = seed_devices; 521 seed_devices = fs_devices->seed; 522 __btrfs_close_devices(fs_devices); 523 free_fs_devices(fs_devices); 524 } 525 return ret; 526 } 527 528 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 529 fmode_t flags, void *holder) 530 { 531 struct block_device *bdev; 532 struct list_head *head = &fs_devices->devices; 533 struct btrfs_device *device; 534 struct block_device 
*latest_bdev = NULL; 535 struct buffer_head *bh; 536 struct btrfs_super_block *disk_super; 537 u64 latest_devid = 0; 538 u64 latest_transid = 0; 539 u64 devid; 540 int seeding = 1; 541 int ret = 0; 542 543 flags |= FMODE_EXCL; 544 545 list_for_each_entry(device, head, dev_list) { 546 if (device->bdev) 547 continue; 548 if (!device->name) 549 continue; 550 551 bdev = blkdev_get_by_path(device->name, flags, holder); 552 if (IS_ERR(bdev)) { 553 printk(KERN_INFO "open %s failed\n", device->name); 554 goto error; 555 } 556 set_blocksize(bdev, 4096); 557 558 bh = btrfs_read_dev_super(bdev); 559 if (!bh) { 560 ret = -EINVAL; 561 goto error_close; 562 } 563 564 disk_super = (struct btrfs_super_block *)bh->b_data; 565 devid = btrfs_stack_device_id(&disk_super->dev_item); 566 if (devid != device->devid) 567 goto error_brelse; 568 569 if (memcmp(device->uuid, disk_super->dev_item.uuid, 570 BTRFS_UUID_SIZE)) 571 goto error_brelse; 572 573 device->generation = btrfs_super_generation(disk_super); 574 if (!latest_transid || device->generation > latest_transid) { 575 latest_devid = devid; 576 latest_transid = device->generation; 577 latest_bdev = bdev; 578 } 579 580 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 581 device->writeable = 0; 582 } else { 583 device->writeable = !bdev_read_only(bdev); 584 seeding = 0; 585 } 586 587 device->bdev = bdev; 588 device->in_fs_metadata = 0; 589 device->mode = flags; 590 591 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 592 fs_devices->rotating = 1; 593 594 fs_devices->open_devices++; 595 if (device->writeable) { 596 fs_devices->rw_devices++; 597 list_add(&device->dev_alloc_list, 598 &fs_devices->alloc_list); 599 } 600 continue; 601 602 error_brelse: 603 brelse(bh); 604 error_close: 605 blkdev_put(bdev, flags); 606 error: 607 continue; 608 } 609 if (fs_devices->open_devices == 0) { 610 ret = -EIO; 611 goto out; 612 } 613 fs_devices->seeding = seeding; 614 fs_devices->opened = 1; 615 fs_devices->latest_bdev = latest_bdev; 616 fs_devices->latest_devid = latest_devid; 617 fs_devices->latest_trans = latest_transid; 618 fs_devices->total_rw_bytes = 0; 619 out: 620 return ret; 621 } 622 623 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 624 fmode_t flags, void *holder) 625 { 626 int ret; 627 628 mutex_lock(&uuid_mutex); 629 if (fs_devices->opened) { 630 fs_devices->opened++; 631 ret = 0; 632 } else { 633 ret = __btrfs_open_devices(fs_devices, flags, holder); 634 } 635 mutex_unlock(&uuid_mutex); 636 return ret; 637 } 638 639 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 640 struct btrfs_fs_devices **fs_devices_ret) 641 { 642 struct btrfs_super_block *disk_super; 643 struct block_device *bdev; 644 struct buffer_head *bh; 645 int ret; 646 u64 devid; 647 u64 transid; 648 649 mutex_lock(&uuid_mutex); 650 651 flags |= FMODE_EXCL; 652 bdev = blkdev_get_by_path(path, flags, holder); 653 654 if (IS_ERR(bdev)) { 655 ret = PTR_ERR(bdev); 656 goto error; 657 } 658 659 ret = set_blocksize(bdev, 4096); 660 if (ret) 661 goto error_close; 662 bh = btrfs_read_dev_super(bdev); 663 if (!bh) { 664 ret = -EINVAL; 665 goto error_close; 666 } 667 disk_super = (struct btrfs_super_block *)bh->b_data; 668 devid = btrfs_stack_device_id(&disk_super->dev_item); 669 transid = btrfs_super_generation(disk_super); 670 if (disk_super->label[0]) 671 printk(KERN_INFO "device label %s ", disk_super->label); 672 else { 673 /* FIXME, make a real uuid parser */ 674 printk(KERN_INFO "device fsid %llx-%llx ", 675 *(unsigned long long *)disk_super->fsid, 676
*(unsigned long long *)(disk_super->fsid + 8)); 677 } 678 printk(KERN_CONT "devid %llu transid %llu %s\n", 679 (unsigned long long)devid, (unsigned long long)transid, path); 680 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 681 682 brelse(bh); 683 error_close: 684 blkdev_put(bdev, flags); 685 error: 686 mutex_unlock(&uuid_mutex); 687 return ret; 688 } 689 690 /* helper to account the used device space in the range */ 691 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 692 u64 end, u64 *length) 693 { 694 struct btrfs_key key; 695 struct btrfs_root *root = device->dev_root; 696 struct btrfs_dev_extent *dev_extent; 697 struct btrfs_path *path; 698 u64 extent_end; 699 int ret; 700 int slot; 701 struct extent_buffer *l; 702 703 *length = 0; 704 705 if (start >= device->total_bytes) 706 return 0; 707 708 path = btrfs_alloc_path(); 709 if (!path) 710 return -ENOMEM; 711 path->reada = 2; 712 713 key.objectid = device->devid; 714 key.offset = start; 715 key.type = BTRFS_DEV_EXTENT_KEY; 716 717 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 718 if (ret < 0) 719 goto out; 720 if (ret > 0) { 721 ret = btrfs_previous_item(root, path, key.objectid, key.type); 722 if (ret < 0) 723 goto out; 724 } 725 726 while (1) { 727 l = path->nodes[0]; 728 slot = path->slots[0]; 729 if (slot >= btrfs_header_nritems(l)) { 730 ret = btrfs_next_leaf(root, path); 731 if (ret == 0) 732 continue; 733 if (ret < 0) 734 goto out; 735 736 break; 737 } 738 btrfs_item_key_to_cpu(l, &key, slot); 739 740 if (key.objectid < device->devid) 741 goto next; 742 743 if (key.objectid > device->devid) 744 break; 745 746 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 747 goto next; 748 749 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 750 extent_end = key.offset + btrfs_dev_extent_length(l, 751 dev_extent); 752 if (key.offset <= start && extent_end > end) { 753 *length = end - start + 1; 754 break; 755 } else if (key.offset <= start && extent_end > start) 756 *length += extent_end - start; 757 else if (key.offset > start && extent_end <= end) 758 *length += extent_end - key.offset; 759 else if (key.offset > start && key.offset <= end) { 760 *length += end - key.offset + 1; 761 break; 762 } else if (key.offset > end) 763 break; 764 765 next: 766 path->slots[0]++; 767 } 768 ret = 0; 769 out: 770 btrfs_free_path(path); 771 return ret; 772 } 773 774 /* 775 * find_free_dev_extent - find free space in the specified device 776 * @trans: transaction handle 777 * @device: the device on which we search for free space 778 * @num_bytes: the size of the free space that we need 779 * @start: store the start of the free space. 780 * @len: the size of the free space that we find, or the size of the max 781 * free space if we don't find suitable free space 782 * 783 * this uses a pretty simple search; the expectation is that it is 784 * called very infrequently and that a given device has a small number 785 * of extents 786 * 787 * @start is used to store the start of the free space if we find it. But if we 788 * don't find suitable free space, it will be used to store the start position 789 * of the max free space. 790 * 791 * @len is used to store the size of the free space that we find. 792 * But if we don't find suitable free space, it is used to store the size of 793 * the max free space.
794 */ 795 int find_free_dev_extent(struct btrfs_trans_handle *trans, 796 struct btrfs_device *device, u64 num_bytes, 797 u64 *start, u64 *len) 798 { 799 struct btrfs_key key; 800 struct btrfs_root *root = device->dev_root; 801 struct btrfs_dev_extent *dev_extent; 802 struct btrfs_path *path; 803 u64 hole_size; 804 u64 max_hole_start; 805 u64 max_hole_size; 806 u64 extent_end; 807 u64 search_start; 808 u64 search_end = device->total_bytes; 809 int ret; 810 int slot; 811 struct extent_buffer *l; 812 813 /* FIXME use last free of some kind */ 814 815 /* we don't want to overwrite the superblock on the drive, 816 * so we make sure to start at an offset of at least 1MB 817 */ 818 search_start = 1024 * 1024; 819 820 if (root->fs_info->alloc_start + num_bytes <= search_end) 821 search_start = max(root->fs_info->alloc_start, search_start); 822 823 max_hole_start = search_start; 824 max_hole_size = 0; 825 826 if (search_start >= search_end) { 827 ret = -ENOSPC; 828 goto error; 829 } 830 831 path = btrfs_alloc_path(); 832 if (!path) { 833 ret = -ENOMEM; 834 goto error; 835 } 836 path->reada = 2; 837 838 key.objectid = device->devid; 839 key.offset = search_start; 840 key.type = BTRFS_DEV_EXTENT_KEY; 841 842 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 843 if (ret < 0) 844 goto out; 845 if (ret > 0) { 846 ret = btrfs_previous_item(root, path, key.objectid, key.type); 847 if (ret < 0) 848 goto out; 849 } 850 851 while (1) { 852 l = path->nodes[0]; 853 slot = path->slots[0]; 854 if (slot >= btrfs_header_nritems(l)) { 855 ret = btrfs_next_leaf(root, path); 856 if (ret == 0) 857 continue; 858 if (ret < 0) 859 goto out; 860 861 break; 862 } 863 btrfs_item_key_to_cpu(l, &key, slot); 864 865 if (key.objectid < device->devid) 866 goto next; 867 868 if (key.objectid > device->devid) 869 break; 870 871 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 872 goto next; 873 874 if (key.offset > search_start) { 875 hole_size = key.offset - search_start; 876 877 if (hole_size > max_hole_size) { 878 max_hole_start = search_start; 879 max_hole_size = hole_size; 880 } 881 882 /* 883 * If this free space is greater than what we need, 884 * it must be the max free space that we have found 885 * until now, so max_hole_start must point to the start 886 * of this free space and the length of this free space 887 * is stored in max_hole_size. Thus, we return 888 * max_hole_start and max_hole_size and go back to the 889 * caller. 890 */ 891 if (hole_size >= num_bytes) { 892 ret = 0; 893 goto out; 894 } 895 } 896 897 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 898 extent_end = key.offset + btrfs_dev_extent_length(l, 899 dev_extent); 900 if (extent_end > search_start) 901 search_start = extent_end; 902 next: 903 path->slots[0]++; 904 cond_resched(); 905 } 906 907 hole_size = search_end - search_start; 908 if (hole_size > max_hole_size) { 909 max_hole_start = search_start; 910 max_hole_size = hole_size; 911 } 912 913 /* See above.
*/ 914 if (hole_size < num_bytes) 915 ret = -ENOSPC; 916 else 917 ret = 0; 918 919 out: 920 btrfs_free_path(path); 921 error: 922 *start = max_hole_start; 923 if (len) 924 *len = max_hole_size; 925 return ret; 926 } 927 928 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 929 struct btrfs_device *device, 930 u64 start) 931 { 932 int ret; 933 struct btrfs_path *path; 934 struct btrfs_root *root = device->dev_root; 935 struct btrfs_key key; 936 struct btrfs_key found_key; 937 struct extent_buffer *leaf = NULL; 938 struct btrfs_dev_extent *extent = NULL; 939 940 path = btrfs_alloc_path(); 941 if (!path) 942 return -ENOMEM; 943 944 key.objectid = device->devid; 945 key.offset = start; 946 key.type = BTRFS_DEV_EXTENT_KEY; 947 948 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 949 if (ret > 0) { 950 ret = btrfs_previous_item(root, path, key.objectid, 951 BTRFS_DEV_EXTENT_KEY); 952 BUG_ON(ret); 953 leaf = path->nodes[0]; 954 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 955 extent = btrfs_item_ptr(leaf, path->slots[0], 956 struct btrfs_dev_extent); 957 BUG_ON(found_key.offset > start || found_key.offset + 958 btrfs_dev_extent_length(leaf, extent) < start); 959 ret = 0; 960 } else if (ret == 0) { 961 leaf = path->nodes[0]; 962 extent = btrfs_item_ptr(leaf, path->slots[0], 963 struct btrfs_dev_extent); 964 } 965 BUG_ON(ret); 966 967 if (device->bytes_used > 0) 968 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 969 ret = btrfs_del_item(trans, root, path); 970 BUG_ON(ret); 971 972 btrfs_free_path(path); 973 return ret; 974 } 975 976 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 977 struct btrfs_device *device, 978 u64 chunk_tree, u64 chunk_objectid, 979 u64 chunk_offset, u64 start, u64 num_bytes) 980 { 981 int ret; 982 struct btrfs_path *path; 983 struct btrfs_root *root = device->dev_root; 984 struct btrfs_dev_extent *extent; 985 struct extent_buffer *leaf; 986 struct btrfs_key key; 987 988 WARN_ON(!device->in_fs_metadata); 989 path = btrfs_alloc_path(); 990 if (!path) 991 return -ENOMEM; 992 993 key.objectid = device->devid; 994 key.offset = start; 995 key.type = BTRFS_DEV_EXTENT_KEY; 996 ret = btrfs_insert_empty_item(trans, root, path, &key, 997 sizeof(*extent)); 998 BUG_ON(ret); 999 1000 leaf = path->nodes[0]; 1001 extent = btrfs_item_ptr(leaf, path->slots[0], 1002 struct btrfs_dev_extent); 1003 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); 1004 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); 1005 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 1006 1007 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, 1008 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), 1009 BTRFS_UUID_SIZE); 1010 1011 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 1012 btrfs_mark_buffer_dirty(leaf); 1013 btrfs_free_path(path); 1014 return ret; 1015 } 1016 1017 static noinline int find_next_chunk(struct btrfs_root *root, 1018 u64 objectid, u64 *offset) 1019 { 1020 struct btrfs_path *path; 1021 int ret; 1022 struct btrfs_key key; 1023 struct btrfs_chunk *chunk; 1024 struct btrfs_key found_key; 1025 1026 path = btrfs_alloc_path(); 1027 BUG_ON(!path); 1028 1029 key.objectid = objectid; 1030 key.offset = (u64)-1; 1031 key.type = BTRFS_CHUNK_ITEM_KEY; 1032 1033 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1034 if (ret < 0) 1035 goto error; 1036 1037 BUG_ON(ret == 0); 1038 1039 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); 1040 if (ret) { 1041 *offset = 0; 1042 } else { 
1043 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1044 path->slots[0]); 1045 if (found_key.objectid != objectid) 1046 *offset = 0; 1047 else { 1048 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], 1049 struct btrfs_chunk); 1050 *offset = found_key.offset + 1051 btrfs_chunk_length(path->nodes[0], chunk); 1052 } 1053 } 1054 ret = 0; 1055 error: 1056 btrfs_free_path(path); 1057 return ret; 1058 } 1059 1060 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) 1061 { 1062 int ret; 1063 struct btrfs_key key; 1064 struct btrfs_key found_key; 1065 struct btrfs_path *path; 1066 1067 root = root->fs_info->chunk_root; 1068 1069 path = btrfs_alloc_path(); 1070 if (!path) 1071 return -ENOMEM; 1072 1073 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1074 key.type = BTRFS_DEV_ITEM_KEY; 1075 key.offset = (u64)-1; 1076 1077 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1078 if (ret < 0) 1079 goto error; 1080 1081 BUG_ON(ret == 0); 1082 1083 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, 1084 BTRFS_DEV_ITEM_KEY); 1085 if (ret) { 1086 *objectid = 1; 1087 } else { 1088 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1089 path->slots[0]); 1090 *objectid = found_key.offset + 1; 1091 } 1092 ret = 0; 1093 error: 1094 btrfs_free_path(path); 1095 return ret; 1096 } 1097 1098 /* 1099 * the device information is stored in the chunk root 1100 * the btrfs_device struct should be fully filled in 1101 */ 1102 int btrfs_add_device(struct btrfs_trans_handle *trans, 1103 struct btrfs_root *root, 1104 struct btrfs_device *device) 1105 { 1106 int ret; 1107 struct btrfs_path *path; 1108 struct btrfs_dev_item *dev_item; 1109 struct extent_buffer *leaf; 1110 struct btrfs_key key; 1111 unsigned long ptr; 1112 1113 root = root->fs_info->chunk_root; 1114 1115 path = btrfs_alloc_path(); 1116 if (!path) 1117 return -ENOMEM; 1118 1119 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1120 key.type = BTRFS_DEV_ITEM_KEY; 1121 key.offset = device->devid; 1122 1123 ret = btrfs_insert_empty_item(trans, root, path, &key, 1124 sizeof(*dev_item)); 1125 if (ret) 1126 goto out; 1127 1128 leaf = path->nodes[0]; 1129 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1130 1131 btrfs_set_device_id(leaf, dev_item, device->devid); 1132 btrfs_set_device_generation(leaf, dev_item, 0); 1133 btrfs_set_device_type(leaf, dev_item, device->type); 1134 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1135 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1136 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1137 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1138 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1139 btrfs_set_device_group(leaf, dev_item, 0); 1140 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1141 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1142 btrfs_set_device_start_offset(leaf, dev_item, 0); 1143 1144 ptr = (unsigned long)btrfs_device_uuid(dev_item); 1145 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1146 ptr = (unsigned long)btrfs_device_fsid(dev_item); 1147 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); 1148 btrfs_mark_buffer_dirty(leaf); 1149 1150 ret = 0; 1151 out: 1152 btrfs_free_path(path); 1153 return ret; 1154 } 1155 1156 static int btrfs_rm_dev_item(struct btrfs_root *root, 1157 struct btrfs_device *device) 1158 { 1159 int ret; 1160 struct btrfs_path *path; 1161 struct btrfs_key key; 1162 struct btrfs_trans_handle *trans; 1163 1164 root = 
root->fs_info->chunk_root; 1165 1166 path = btrfs_alloc_path(); 1167 if (!path) 1168 return -ENOMEM; 1169 1170 trans = btrfs_start_transaction(root, 0); 1171 if (IS_ERR(trans)) { 1172 btrfs_free_path(path); 1173 return PTR_ERR(trans); 1174 } 1175 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1176 key.type = BTRFS_DEV_ITEM_KEY; 1177 key.offset = device->devid; 1178 lock_chunks(root); 1179 1180 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1181 if (ret < 0) 1182 goto out; 1183 1184 if (ret > 0) { 1185 ret = -ENOENT; 1186 goto out; 1187 } 1188 1189 ret = btrfs_del_item(trans, root, path); 1190 if (ret) 1191 goto out; 1192 out: 1193 btrfs_free_path(path); 1194 unlock_chunks(root); 1195 btrfs_commit_transaction(trans, root); 1196 return ret; 1197 } 1198 1199 int btrfs_rm_device(struct btrfs_root *root, char *device_path) 1200 { 1201 struct btrfs_device *device; 1202 struct btrfs_device *next_device; 1203 struct block_device *bdev; 1204 struct buffer_head *bh = NULL; 1205 struct btrfs_super_block *disk_super; 1206 u64 all_avail; 1207 u64 devid; 1208 u64 num_devices; 1209 u8 *dev_uuid; 1210 int ret = 0; 1211 1212 mutex_lock(&uuid_mutex); 1213 mutex_lock(&root->fs_info->volume_mutex); 1214 1215 all_avail = root->fs_info->avail_data_alloc_bits | 1216 root->fs_info->avail_system_alloc_bits | 1217 root->fs_info->avail_metadata_alloc_bits; 1218 1219 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1220 root->fs_info->fs_devices->num_devices <= 4) { 1221 printk(KERN_ERR "btrfs: unable to go below four devices " 1222 "on raid10\n"); 1223 ret = -EINVAL; 1224 goto out; 1225 } 1226 1227 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1228 root->fs_info->fs_devices->num_devices <= 2) { 1229 printk(KERN_ERR "btrfs: unable to go below two " 1230 "devices on raid1\n"); 1231 ret = -EINVAL; 1232 goto out; 1233 } 1234 1235 if (strcmp(device_path, "missing") == 0) { 1236 struct list_head *devices; 1237 struct btrfs_device *tmp; 1238 1239 device = NULL; 1240 devices = &root->fs_info->fs_devices->devices; 1241 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1242 list_for_each_entry(tmp, devices, dev_list) { 1243 if (tmp->in_fs_metadata && !tmp->bdev) { 1244 device = tmp; 1245 break; 1246 } 1247 } 1248 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1249 bdev = NULL; 1250 bh = NULL; 1251 disk_super = NULL; 1252 if (!device) { 1253 printk(KERN_ERR "btrfs: no missing devices found to " 1254 "remove\n"); 1255 goto out; 1256 } 1257 } else { 1258 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, 1259 root->fs_info->bdev_holder); 1260 if (IS_ERR(bdev)) { 1261 ret = PTR_ERR(bdev); 1262 goto out; 1263 } 1264 1265 set_blocksize(bdev, 4096); 1266 bh = btrfs_read_dev_super(bdev); 1267 if (!bh) { 1268 ret = -EINVAL; 1269 goto error_close; 1270 } 1271 disk_super = (struct btrfs_super_block *)bh->b_data; 1272 devid = btrfs_stack_device_id(&disk_super->dev_item); 1273 dev_uuid = disk_super->dev_item.uuid; 1274 device = btrfs_find_device(root, devid, dev_uuid, 1275 disk_super->fsid); 1276 if (!device) { 1277 ret = -ENOENT; 1278 goto error_brelse; 1279 } 1280 } 1281 1282 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1283 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1284 "device\n"); 1285 ret = -EINVAL; 1286 goto error_brelse; 1287 } 1288 1289 if (device->writeable) { 1290 list_del_init(&device->dev_alloc_list); 1291 root->fs_info->fs_devices->rw_devices--; 1292 } 1293 1294 ret = btrfs_shrink_device(device, 0); 1295 if (ret) 1296 goto error_undo; 1297 1298 
ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1299 if (ret) 1300 goto error_undo; 1301 1302 device->in_fs_metadata = 0; 1303 1304 /* 1305 * the device list mutex makes sure that we don't change 1306 * the device list while someone else is writing out all 1307 * the device supers. 1308 */ 1309 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1310 list_del_init(&device->dev_list); 1311 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1312 1313 device->fs_devices->num_devices--; 1314 1315 if (device->missing) 1316 root->fs_info->fs_devices->missing_devices--; 1317 1318 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1319 struct btrfs_device, dev_list); 1320 if (device->bdev == root->fs_info->sb->s_bdev) 1321 root->fs_info->sb->s_bdev = next_device->bdev; 1322 if (device->bdev == root->fs_info->fs_devices->latest_bdev) 1323 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1324 1325 if (device->bdev) { 1326 blkdev_put(device->bdev, device->mode); 1327 device->bdev = NULL; 1328 device->fs_devices->open_devices--; 1329 } 1330 1331 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1332 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1333 1334 if (device->fs_devices->open_devices == 0) { 1335 struct btrfs_fs_devices *fs_devices; 1336 fs_devices = root->fs_info->fs_devices; 1337 while (fs_devices) { 1338 if (fs_devices->seed == device->fs_devices) 1339 break; 1340 fs_devices = fs_devices->seed; 1341 } 1342 fs_devices->seed = device->fs_devices->seed; 1343 device->fs_devices->seed = NULL; 1344 __btrfs_close_devices(device->fs_devices); 1345 free_fs_devices(device->fs_devices); 1346 } 1347 1348 /* 1349 * at this point, the device is zero sized. We want to 1350 * remove it from the devices list and zero out the old super 1351 */ 1352 if (device->writeable) { 1353 /* make sure this device isn't detected as part of 1354 * the FS anymore 1355 */ 1356 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 1357 set_buffer_dirty(bh); 1358 sync_dirty_buffer(bh); 1359 } 1360 1361 kfree(device->name); 1362 kfree(device); 1363 ret = 0; 1364 1365 error_brelse: 1366 brelse(bh); 1367 error_close: 1368 if (bdev) 1369 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1370 out: 1371 mutex_unlock(&root->fs_info->volume_mutex); 1372 mutex_unlock(&uuid_mutex); 1373 return ret; 1374 error_undo: 1375 if (device->writeable) { 1376 list_add(&device->dev_alloc_list, 1377 &root->fs_info->fs_devices->alloc_list); 1378 root->fs_info->fs_devices->rw_devices++; 1379 } 1380 goto error_brelse; 1381 } 1382 1383 /* 1384 * does all the dirty work required for changing file system's UUID. 
1385 */ 1386 static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, 1387 struct btrfs_root *root) 1388 { 1389 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1390 struct btrfs_fs_devices *old_devices; 1391 struct btrfs_fs_devices *seed_devices; 1392 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1393 struct btrfs_device *device; 1394 u64 super_flags; 1395 1396 BUG_ON(!mutex_is_locked(&uuid_mutex)); 1397 if (!fs_devices->seeding) 1398 return -EINVAL; 1399 1400 seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 1401 if (!seed_devices) 1402 return -ENOMEM; 1403 1404 old_devices = clone_fs_devices(fs_devices); 1405 if (IS_ERR(old_devices)) { 1406 kfree(seed_devices); 1407 return PTR_ERR(old_devices); 1408 } 1409 1410 list_add(&old_devices->list, &fs_uuids); 1411 1412 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 1413 seed_devices->opened = 1; 1414 INIT_LIST_HEAD(&seed_devices->devices); 1415 INIT_LIST_HEAD(&seed_devices->alloc_list); 1416 mutex_init(&seed_devices->device_list_mutex); 1417 list_splice_init(&fs_devices->devices, &seed_devices->devices); 1418 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 1419 list_for_each_entry(device, &seed_devices->devices, dev_list) { 1420 device->fs_devices = seed_devices; 1421 } 1422 1423 fs_devices->seeding = 0; 1424 fs_devices->num_devices = 0; 1425 fs_devices->open_devices = 0; 1426 fs_devices->seed = seed_devices; 1427 1428 generate_random_uuid(fs_devices->fsid); 1429 memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 1430 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 1431 super_flags = btrfs_super_flags(disk_super) & 1432 ~BTRFS_SUPER_FLAG_SEEDING; 1433 btrfs_set_super_flags(disk_super, super_flags); 1434 1435 return 0; 1436 } 1437 1438 /* 1439 * store the expected generation for seed devices in device items.
1440 */ 1441 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 1442 struct btrfs_root *root) 1443 { 1444 struct btrfs_path *path; 1445 struct extent_buffer *leaf; 1446 struct btrfs_dev_item *dev_item; 1447 struct btrfs_device *device; 1448 struct btrfs_key key; 1449 u8 fs_uuid[BTRFS_UUID_SIZE]; 1450 u8 dev_uuid[BTRFS_UUID_SIZE]; 1451 u64 devid; 1452 int ret; 1453 1454 path = btrfs_alloc_path(); 1455 if (!path) 1456 return -ENOMEM; 1457 1458 root = root->fs_info->chunk_root; 1459 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1460 key.offset = 0; 1461 key.type = BTRFS_DEV_ITEM_KEY; 1462 1463 while (1) { 1464 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1465 if (ret < 0) 1466 goto error; 1467 1468 leaf = path->nodes[0]; 1469 next_slot: 1470 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1471 ret = btrfs_next_leaf(root, path); 1472 if (ret > 0) 1473 break; 1474 if (ret < 0) 1475 goto error; 1476 leaf = path->nodes[0]; 1477 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1478 btrfs_release_path(root, path); 1479 continue; 1480 } 1481 1482 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1483 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 1484 key.type != BTRFS_DEV_ITEM_KEY) 1485 break; 1486 1487 dev_item = btrfs_item_ptr(leaf, path->slots[0], 1488 struct btrfs_dev_item); 1489 devid = btrfs_device_id(leaf, dev_item); 1490 read_extent_buffer(leaf, dev_uuid, 1491 (unsigned long)btrfs_device_uuid(dev_item), 1492 BTRFS_UUID_SIZE); 1493 read_extent_buffer(leaf, fs_uuid, 1494 (unsigned long)btrfs_device_fsid(dev_item), 1495 BTRFS_UUID_SIZE); 1496 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1497 BUG_ON(!device); 1498 1499 if (device->fs_devices->seeding) { 1500 btrfs_set_device_generation(leaf, dev_item, 1501 device->generation); 1502 btrfs_mark_buffer_dirty(leaf); 1503 } 1504 1505 path->slots[0]++; 1506 goto next_slot; 1507 } 1508 ret = 0; 1509 error: 1510 btrfs_free_path(path); 1511 return ret; 1512 } 1513 1514 int btrfs_init_new_device(struct btrfs_root *root, char *device_path) 1515 { 1516 struct btrfs_trans_handle *trans; 1517 struct btrfs_device *device; 1518 struct block_device *bdev; 1519 struct list_head *devices; 1520 struct super_block *sb = root->fs_info->sb; 1521 u64 total_bytes; 1522 int seeding_dev = 0; 1523 int ret = 0; 1524 1525 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1526 return -EINVAL; 1527 1528 bdev = blkdev_get_by_path(device_path, FMODE_EXCL, 1529 root->fs_info->bdev_holder); 1530 if (IS_ERR(bdev)) 1531 return PTR_ERR(bdev); 1532 1533 if (root->fs_info->fs_devices->seeding) { 1534 seeding_dev = 1; 1535 down_write(&sb->s_umount); 1536 mutex_lock(&uuid_mutex); 1537 } 1538 1539 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1540 mutex_lock(&root->fs_info->volume_mutex); 1541 1542 devices = &root->fs_info->fs_devices->devices; 1543 /* 1544 * we have the volume lock, so we don't need the extra 1545 * device list mutex while reading the list here. 
1546 */ 1547 list_for_each_entry(device, devices, dev_list) { 1548 if (device->bdev == bdev) { 1549 ret = -EEXIST; 1550 goto error; 1551 } 1552 } 1553 1554 device = kzalloc(sizeof(*device), GFP_NOFS); 1555 if (!device) { 1556 /* we can safely leave the fs_devices entry around */ 1557 ret = -ENOMEM; 1558 goto error; 1559 } 1560 1561 device->name = kstrdup(device_path, GFP_NOFS); 1562 if (!device->name) { 1563 kfree(device); 1564 ret = -ENOMEM; 1565 goto error; 1566 } 1567 1568 ret = find_next_devid(root, &device->devid); 1569 if (ret) { 1570 kfree(device->name); 1571 kfree(device); 1572 goto error; 1573 } 1574 1575 trans = btrfs_start_transaction(root, 0); 1576 if (IS_ERR(trans)) { 1577 kfree(device->name); 1578 kfree(device); 1579 ret = PTR_ERR(trans); 1580 goto error; 1581 } 1582 1583 lock_chunks(root); 1584 1585 device->writeable = 1; 1586 device->work.func = pending_bios_fn; 1587 generate_random_uuid(device->uuid); 1588 spin_lock_init(&device->io_lock); 1589 device->generation = trans->transid; 1590 device->io_width = root->sectorsize; 1591 device->io_align = root->sectorsize; 1592 device->sector_size = root->sectorsize; 1593 device->total_bytes = i_size_read(bdev->bd_inode); 1594 device->disk_total_bytes = device->total_bytes; 1595 device->dev_root = root->fs_info->dev_root; 1596 device->bdev = bdev; 1597 device->in_fs_metadata = 1; 1598 device->mode = FMODE_EXCL; 1599 set_blocksize(device->bdev, 4096); 1600 1601 if (seeding_dev) { 1602 sb->s_flags &= ~MS_RDONLY; 1603 ret = btrfs_prepare_sprout(trans, root); 1604 BUG_ON(ret); 1605 } 1606 1607 device->fs_devices = root->fs_info->fs_devices; 1608 1609 /* 1610 * we don't want write_supers to jump in here with our device 1611 * half setup 1612 */ 1613 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1614 list_add(&device->dev_list, &root->fs_info->fs_devices->devices); 1615 list_add(&device->dev_alloc_list, 1616 &root->fs_info->fs_devices->alloc_list); 1617 root->fs_info->fs_devices->num_devices++; 1618 root->fs_info->fs_devices->open_devices++; 1619 root->fs_info->fs_devices->rw_devices++; 1620 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1621 1622 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1623 root->fs_info->fs_devices->rotating = 1; 1624 1625 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1626 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1627 total_bytes + device->total_bytes); 1628 1629 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1630 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1631 total_bytes + 1); 1632 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1633 1634 if (seeding_dev) { 1635 ret = init_first_rw_device(trans, root, device); 1636 BUG_ON(ret); 1637 ret = btrfs_finish_sprout(trans, root); 1638 BUG_ON(ret); 1639 } else { 1640 ret = btrfs_add_device(trans, root, device); 1641 } 1642 1643 /* 1644 * we've got more storage, clear any full flags on the space 1645 * infos 1646 */ 1647 btrfs_clear_space_info_full(root->fs_info); 1648 1649 unlock_chunks(root); 1650 btrfs_commit_transaction(trans, root); 1651 1652 if (seeding_dev) { 1653 mutex_unlock(&uuid_mutex); 1654 up_write(&sb->s_umount); 1655 1656 ret = btrfs_relocate_sys_chunks(root); 1657 BUG_ON(ret); 1658 } 1659 out: 1660 mutex_unlock(&root->fs_info->volume_mutex); 1661 return ret; 1662 error: 1663 blkdev_put(bdev, FMODE_EXCL); 1664 if (seeding_dev) { 1665 mutex_unlock(&uuid_mutex); 1666 up_write(&sb->s_umount); 1667 } 1668 goto out; 1669 } 1670 1671 static noinline 
int btrfs_update_device(struct btrfs_trans_handle *trans, 1672 struct btrfs_device *device) 1673 { 1674 int ret; 1675 struct btrfs_path *path; 1676 struct btrfs_root *root; 1677 struct btrfs_dev_item *dev_item; 1678 struct extent_buffer *leaf; 1679 struct btrfs_key key; 1680 1681 root = device->dev_root->fs_info->chunk_root; 1682 1683 path = btrfs_alloc_path(); 1684 if (!path) 1685 return -ENOMEM; 1686 1687 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1688 key.type = BTRFS_DEV_ITEM_KEY; 1689 key.offset = device->devid; 1690 1691 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1692 if (ret < 0) 1693 goto out; 1694 1695 if (ret > 0) { 1696 ret = -ENOENT; 1697 goto out; 1698 } 1699 1700 leaf = path->nodes[0]; 1701 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1702 1703 btrfs_set_device_id(leaf, dev_item, device->devid); 1704 btrfs_set_device_type(leaf, dev_item, device->type); 1705 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1706 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1707 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1708 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); 1709 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1710 btrfs_mark_buffer_dirty(leaf); 1711 1712 out: 1713 btrfs_free_path(path); 1714 return ret; 1715 } 1716 1717 static int __btrfs_grow_device(struct btrfs_trans_handle *trans, 1718 struct btrfs_device *device, u64 new_size) 1719 { 1720 struct btrfs_super_block *super_copy = 1721 &device->dev_root->fs_info->super_copy; 1722 u64 old_total = btrfs_super_total_bytes(super_copy); 1723 u64 diff = new_size - device->total_bytes; 1724 1725 if (!device->writeable) 1726 return -EACCES; 1727 if (new_size <= device->total_bytes) 1728 return -EINVAL; 1729 1730 btrfs_set_super_total_bytes(super_copy, old_total + diff); 1731 device->fs_devices->total_rw_bytes += diff; 1732 1733 device->total_bytes = new_size; 1734 device->disk_total_bytes = new_size; 1735 btrfs_clear_space_info_full(device->dev_root->fs_info); 1736 1737 return btrfs_update_device(trans, device); 1738 } 1739 1740 int btrfs_grow_device(struct btrfs_trans_handle *trans, 1741 struct btrfs_device *device, u64 new_size) 1742 { 1743 int ret; 1744 lock_chunks(device->dev_root); 1745 ret = __btrfs_grow_device(trans, device, new_size); 1746 unlock_chunks(device->dev_root); 1747 return ret; 1748 } 1749 1750 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 1751 struct btrfs_root *root, 1752 u64 chunk_tree, u64 chunk_objectid, 1753 u64 chunk_offset) 1754 { 1755 int ret; 1756 struct btrfs_path *path; 1757 struct btrfs_key key; 1758 1759 root = root->fs_info->chunk_root; 1760 path = btrfs_alloc_path(); 1761 if (!path) 1762 return -ENOMEM; 1763 1764 key.objectid = chunk_objectid; 1765 key.offset = chunk_offset; 1766 key.type = BTRFS_CHUNK_ITEM_KEY; 1767 1768 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1769 BUG_ON(ret); 1770 1771 ret = btrfs_del_item(trans, root, path); 1772 BUG_ON(ret); 1773 1774 btrfs_free_path(path); 1775 return 0; 1776 } 1777 1778 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1779 chunk_offset) 1780 { 1781 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1782 struct btrfs_disk_key *disk_key; 1783 struct btrfs_chunk *chunk; 1784 u8 *ptr; 1785 int ret = 0; 1786 u32 num_stripes; 1787 u32 array_size; 1788 u32 len = 0; 1789 u32 cur; 1790 struct btrfs_key key; 1791 1792 array_size = 
btrfs_super_sys_array_size(super_copy); 1793 1794 ptr = super_copy->sys_chunk_array; 1795 cur = 0; 1796 1797 while (cur < array_size) { 1798 disk_key = (struct btrfs_disk_key *)ptr; 1799 btrfs_disk_key_to_cpu(&key, disk_key); 1800 1801 len = sizeof(*disk_key); 1802 1803 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 1804 chunk = (struct btrfs_chunk *)(ptr + len); 1805 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 1806 len += btrfs_chunk_item_size(num_stripes); 1807 } else { 1808 ret = -EIO; 1809 break; 1810 } 1811 if (key.objectid == chunk_objectid && 1812 key.offset == chunk_offset) { 1813 memmove(ptr, ptr + len, array_size - (cur + len)); 1814 array_size -= len; 1815 btrfs_set_super_sys_array_size(super_copy, array_size); 1816 } else { 1817 ptr += len; 1818 cur += len; 1819 } 1820 } 1821 return ret; 1822 } 1823 1824 static int btrfs_relocate_chunk(struct btrfs_root *root, 1825 u64 chunk_tree, u64 chunk_objectid, 1826 u64 chunk_offset) 1827 { 1828 struct extent_map_tree *em_tree; 1829 struct btrfs_root *extent_root; 1830 struct btrfs_trans_handle *trans; 1831 struct extent_map *em; 1832 struct map_lookup *map; 1833 int ret; 1834 int i; 1835 1836 root = root->fs_info->chunk_root; 1837 extent_root = root->fs_info->extent_root; 1838 em_tree = &root->fs_info->mapping_tree.map_tree; 1839 1840 ret = btrfs_can_relocate(extent_root, chunk_offset); 1841 if (ret) 1842 return -ENOSPC; 1843 1844 /* step one, relocate all the extents inside this chunk */ 1845 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1846 if (ret) 1847 return ret; 1848 1849 trans = btrfs_start_transaction(root, 0); 1850 BUG_ON(IS_ERR(trans)); 1851 1852 lock_chunks(root); 1853 1854 /* 1855 * step two, delete the device extents and the 1856 * chunk tree entries 1857 */ 1858 read_lock(&em_tree->lock); 1859 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 1860 read_unlock(&em_tree->lock); 1861 1862 BUG_ON(em->start > chunk_offset || 1863 em->start + em->len < chunk_offset); 1864 map = (struct map_lookup *)em->bdev; 1865 1866 for (i = 0; i < map->num_stripes; i++) { 1867 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, 1868 map->stripes[i].physical); 1869 BUG_ON(ret); 1870 1871 if (map->stripes[i].dev) { 1872 ret = btrfs_update_device(trans, map->stripes[i].dev); 1873 BUG_ON(ret); 1874 } 1875 } 1876 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, 1877 chunk_offset); 1878 1879 BUG_ON(ret); 1880 1881 trace_btrfs_chunk_free(root, map, chunk_offset, em->len); 1882 1883 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 1884 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 1885 BUG_ON(ret); 1886 } 1887 1888 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 1889 BUG_ON(ret); 1890 1891 write_lock(&em_tree->lock); 1892 remove_extent_mapping(em_tree, em); 1893 write_unlock(&em_tree->lock); 1894 1895 kfree(map); 1896 em->bdev = NULL; 1897 1898 /* once for the tree */ 1899 free_extent_map(em); 1900 /* once for us */ 1901 free_extent_map(em); 1902 1903 unlock_chunks(root); 1904 btrfs_end_transaction(trans, root); 1905 return 0; 1906 } 1907 1908 static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 1909 { 1910 struct btrfs_root *chunk_root = root->fs_info->chunk_root; 1911 struct btrfs_path *path; 1912 struct extent_buffer *leaf; 1913 struct btrfs_chunk *chunk; 1914 struct btrfs_key key; 1915 struct btrfs_key found_key; 1916 u64 chunk_tree = chunk_root->root_key.objectid; 1917 u64 chunk_type; 1918 bool retried = false; 1919 int failed = 0; 1920 int ret; 1921 1922 path = 
btrfs_alloc_path(); 1923 if (!path) 1924 return -ENOMEM; 1925 1926 again: 1927 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 1928 key.offset = (u64)-1; 1929 key.type = BTRFS_CHUNK_ITEM_KEY; 1930 1931 while (1) { 1932 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 1933 if (ret < 0) 1934 goto error; 1935 BUG_ON(ret == 0); 1936 1937 ret = btrfs_previous_item(chunk_root, path, key.objectid, 1938 key.type); 1939 if (ret < 0) 1940 goto error; 1941 if (ret > 0) 1942 break; 1943 1944 leaf = path->nodes[0]; 1945 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1946 1947 chunk = btrfs_item_ptr(leaf, path->slots[0], 1948 struct btrfs_chunk); 1949 chunk_type = btrfs_chunk_type(leaf, chunk); 1950 btrfs_release_path(chunk_root, path); 1951 1952 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 1953 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 1954 found_key.objectid, 1955 found_key.offset); 1956 if (ret == -ENOSPC) 1957 failed++; 1958 else if (ret) 1959 BUG(); 1960 } 1961 1962 if (found_key.offset == 0) 1963 break; 1964 key.offset = found_key.offset - 1; 1965 } 1966 ret = 0; 1967 if (failed && !retried) { 1968 failed = 0; 1969 retried = true; 1970 goto again; 1971 } else if (failed && retried) { 1972 WARN_ON(1); 1973 ret = -ENOSPC; 1974 } 1975 error: 1976 btrfs_free_path(path); 1977 return ret; 1978 } 1979 1980 static u64 div_factor(u64 num, int factor) 1981 { 1982 if (factor == 10) 1983 return num; 1984 num *= factor; 1985 do_div(num, 10); 1986 return num; 1987 } 1988 1989 int btrfs_balance(struct btrfs_root *dev_root) 1990 { 1991 int ret; 1992 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 1993 struct btrfs_device *device; 1994 u64 old_size; 1995 u64 size_to_free; 1996 struct btrfs_path *path; 1997 struct btrfs_key key; 1998 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; 1999 struct btrfs_trans_handle *trans; 2000 struct btrfs_key found_key; 2001 2002 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2003 return -EROFS; 2004 2005 if (!capable(CAP_SYS_ADMIN)) 2006 return -EPERM; 2007 2008 mutex_lock(&dev_root->fs_info->volume_mutex); 2009 dev_root = dev_root->fs_info->dev_root; 2010 2011 /* step one make some room on all the devices */ 2012 list_for_each_entry(device, devices, dev_list) { 2013 old_size = device->total_bytes; 2014 size_to_free = div_factor(old_size, 1); 2015 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 2016 if (!device->writeable || 2017 device->total_bytes - device->bytes_used > size_to_free) 2018 continue; 2019 2020 ret = btrfs_shrink_device(device, old_size - size_to_free); 2021 if (ret == -ENOSPC) 2022 break; 2023 BUG_ON(ret); 2024 2025 trans = btrfs_start_transaction(dev_root, 0); 2026 BUG_ON(IS_ERR(trans)); 2027 2028 ret = btrfs_grow_device(trans, device, old_size); 2029 BUG_ON(ret); 2030 2031 btrfs_end_transaction(trans, dev_root); 2032 } 2033 2034 /* step two, relocate all the chunks */ 2035 path = btrfs_alloc_path(); 2036 BUG_ON(!path); 2037 2038 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2039 key.offset = (u64)-1; 2040 key.type = BTRFS_CHUNK_ITEM_KEY; 2041 2042 while (1) { 2043 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2044 if (ret < 0) 2045 goto error; 2046 2047 /* 2048 * this shouldn't happen, it means the last relocate 2049 * failed 2050 */ 2051 if (ret == 0) 2052 break; 2053 2054 ret = btrfs_previous_item(chunk_root, path, 0, 2055 BTRFS_CHUNK_ITEM_KEY); 2056 if (ret) 2057 break; 2058 2059 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2060 path->slots[0]); 2061 if 
(found_key.objectid != key.objectid) 2062 break; 2063 2064 /* chunk zero is special */ 2065 if (found_key.offset == 0) 2066 break; 2067 2068 btrfs_release_path(chunk_root, path); 2069 ret = btrfs_relocate_chunk(chunk_root, 2070 chunk_root->root_key.objectid, 2071 found_key.objectid, 2072 found_key.offset); 2073 BUG_ON(ret && ret != -ENOSPC); 2074 key.offset = found_key.offset - 1; 2075 } 2076 ret = 0; 2077 error: 2078 btrfs_free_path(path); 2079 mutex_unlock(&dev_root->fs_info->volume_mutex); 2080 return ret; 2081 } 2082 2083 /* 2084 * shrinking a device means finding all of the device extents past 2085 * the new size, and then following the back refs to the chunks. 2086 * The chunk relocation code actually frees the device extent 2087 */ 2088 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 2089 { 2090 struct btrfs_trans_handle *trans; 2091 struct btrfs_root *root = device->dev_root; 2092 struct btrfs_dev_extent *dev_extent = NULL; 2093 struct btrfs_path *path; 2094 u64 length; 2095 u64 chunk_tree; 2096 u64 chunk_objectid; 2097 u64 chunk_offset; 2098 int ret; 2099 int slot; 2100 int failed = 0; 2101 bool retried = false; 2102 struct extent_buffer *l; 2103 struct btrfs_key key; 2104 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2105 u64 old_total = btrfs_super_total_bytes(super_copy); 2106 u64 old_size = device->total_bytes; 2107 u64 diff = device->total_bytes - new_size; 2108 2109 if (new_size >= device->total_bytes) 2110 return -EINVAL; 2111 2112 path = btrfs_alloc_path(); 2113 if (!path) 2114 return -ENOMEM; 2115 2116 path->reada = 2; 2117 2118 lock_chunks(root); 2119 2120 device->total_bytes = new_size; 2121 if (device->writeable) 2122 device->fs_devices->total_rw_bytes -= diff; 2123 unlock_chunks(root); 2124 2125 again: 2126 key.objectid = device->devid; 2127 key.offset = (u64)-1; 2128 key.type = BTRFS_DEV_EXTENT_KEY; 2129 2130 while (1) { 2131 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2132 if (ret < 0) 2133 goto done; 2134 2135 ret = btrfs_previous_item(root, path, 0, key.type); 2136 if (ret < 0) 2137 goto done; 2138 if (ret) { 2139 ret = 0; 2140 btrfs_release_path(root, path); 2141 break; 2142 } 2143 2144 l = path->nodes[0]; 2145 slot = path->slots[0]; 2146 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 2147 2148 if (key.objectid != device->devid) { 2149 btrfs_release_path(root, path); 2150 break; 2151 } 2152 2153 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2154 length = btrfs_dev_extent_length(l, dev_extent); 2155 2156 if (key.offset + length <= new_size) { 2157 btrfs_release_path(root, path); 2158 break; 2159 } 2160 2161 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 2162 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 2163 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 2164 btrfs_release_path(root, path); 2165 2166 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 2167 chunk_offset); 2168 if (ret && ret != -ENOSPC) 2169 goto done; 2170 if (ret == -ENOSPC) 2171 failed++; 2172 key.offset -= 1; 2173 } 2174 2175 if (failed && !retried) { 2176 failed = 0; 2177 retried = true; 2178 goto again; 2179 } else if (failed && retried) { 2180 ret = -ENOSPC; 2181 lock_chunks(root); 2182 2183 device->total_bytes = old_size; 2184 if (device->writeable) 2185 device->fs_devices->total_rw_bytes += diff; 2186 unlock_chunks(root); 2187 goto done; 2188 } 2189 2190 /* Shrinking succeeded, else we would be at "done". 
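 * All device extents past the new size have been relocated away; what
 * remains is to record the shrunken size in the device item and in the
 * superblock's total_bytes.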
*/ 2191 trans = btrfs_start_transaction(root, 0); 2192 if (IS_ERR(trans)) { 2193 ret = PTR_ERR(trans); 2194 goto done; 2195 } 2196 2197 lock_chunks(root); 2198 2199 device->disk_total_bytes = new_size; 2200 /* Now btrfs_update_device() will change the on-disk size. */ 2201 ret = btrfs_update_device(trans, device); 2202 if (ret) { 2203 unlock_chunks(root); 2204 btrfs_end_transaction(trans, root); 2205 goto done; 2206 } 2207 WARN_ON(diff > old_total); 2208 btrfs_set_super_total_bytes(super_copy, old_total - diff); 2209 unlock_chunks(root); 2210 btrfs_end_transaction(trans, root); 2211 done: 2212 btrfs_free_path(path); 2213 return ret; 2214 } 2215 2216 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, 2217 struct btrfs_root *root, 2218 struct btrfs_key *key, 2219 struct btrfs_chunk *chunk, int item_size) 2220 { 2221 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2222 struct btrfs_disk_key disk_key; 2223 u32 array_size; 2224 u8 *ptr; 2225 2226 array_size = btrfs_super_sys_array_size(super_copy); 2227 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 2228 return -EFBIG; 2229 2230 ptr = super_copy->sys_chunk_array + array_size; 2231 btrfs_cpu_key_to_disk(&disk_key, key); 2232 memcpy(ptr, &disk_key, sizeof(disk_key)); 2233 ptr += sizeof(disk_key); 2234 memcpy(ptr, chunk, item_size); 2235 item_size += sizeof(disk_key); 2236 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 2237 return 0; 2238 } 2239 2240 static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, 2241 int num_stripes, int sub_stripes) 2242 { 2243 if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) 2244 return calc_size; 2245 else if (type & BTRFS_BLOCK_GROUP_RAID10) 2246 return calc_size * (num_stripes / sub_stripes); 2247 else 2248 return calc_size * num_stripes; 2249 } 2250 2251 /* Used to sort the devices by max_avail(descending sort) */ 2252 int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2) 2253 { 2254 if (((struct btrfs_device_info *)dev_info1)->max_avail > 2255 ((struct btrfs_device_info *)dev_info2)->max_avail) 2256 return -1; 2257 else if (((struct btrfs_device_info *)dev_info1)->max_avail < 2258 ((struct btrfs_device_info *)dev_info2)->max_avail) 2259 return 1; 2260 else 2261 return 0; 2262 } 2263 2264 static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type, 2265 int *num_stripes, int *min_stripes, 2266 int *sub_stripes) 2267 { 2268 *num_stripes = 1; 2269 *min_stripes = 1; 2270 *sub_stripes = 0; 2271 2272 if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2273 *num_stripes = fs_devices->rw_devices; 2274 *min_stripes = 2; 2275 } 2276 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2277 *num_stripes = 2; 2278 *min_stripes = 2; 2279 } 2280 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2281 if (fs_devices->rw_devices < 2) 2282 return -ENOSPC; 2283 *num_stripes = 2; 2284 *min_stripes = 2; 2285 } 2286 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2287 *num_stripes = fs_devices->rw_devices; 2288 if (*num_stripes < 4) 2289 return -ENOSPC; 2290 *num_stripes &= ~(u32)1; 2291 *sub_stripes = 2; 2292 *min_stripes = 4; 2293 } 2294 2295 return 0; 2296 } 2297 2298 static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices, 2299 u64 proposed_size, u64 type, 2300 int num_stripes, int small_stripe) 2301 { 2302 int min_stripe_size = 1 * 1024 * 1024; 2303 u64 calc_size = proposed_size; 2304 u64 max_chunk_size = calc_size; 2305 int ncopies = 1; 2306 2307 if (type & (BTRFS_BLOCK_GROUP_RAID1 | 2308 BTRFS_BLOCK_GROUP_DUP | 2309 
BTRFS_BLOCK_GROUP_RAID10)) 2310 ncopies = 2; 2311 2312 if (type & BTRFS_BLOCK_GROUP_DATA) { 2313 max_chunk_size = 10 * calc_size; 2314 min_stripe_size = 64 * 1024 * 1024; 2315 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 2316 max_chunk_size = 256 * 1024 * 1024; 2317 min_stripe_size = 32 * 1024 * 1024; 2318 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 2319 calc_size = 8 * 1024 * 1024; 2320 max_chunk_size = calc_size * 2; 2321 min_stripe_size = 1 * 1024 * 1024; 2322 } 2323 2324 /* we don't want a chunk larger than 10% of writeable space */ 2325 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2326 max_chunk_size); 2327 2328 if (calc_size * num_stripes > max_chunk_size * ncopies) { 2329 calc_size = max_chunk_size * ncopies; 2330 do_div(calc_size, num_stripes); 2331 do_div(calc_size, BTRFS_STRIPE_LEN); 2332 calc_size *= BTRFS_STRIPE_LEN; 2333 } 2334 2335 /* we don't want tiny stripes */ 2336 if (!small_stripe) 2337 calc_size = max_t(u64, min_stripe_size, calc_size); 2338 2339 /* 2340 * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure 2341 * we end up with something bigger than a stripe 2342 */ 2343 calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN); 2344 2345 do_div(calc_size, BTRFS_STRIPE_LEN); 2346 calc_size *= BTRFS_STRIPE_LEN; 2347 2348 return calc_size; 2349 } 2350 2351 static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map, 2352 int num_stripes) 2353 { 2354 struct map_lookup *new; 2355 size_t len = map_lookup_size(num_stripes); 2356 2357 BUG_ON(map->num_stripes < num_stripes); 2358 2359 if (map->num_stripes == num_stripes) 2360 return map; 2361 2362 new = kmalloc(len, GFP_NOFS); 2363 if (!new) { 2364 /* just change map->num_stripes */ 2365 map->num_stripes = num_stripes; 2366 return map; 2367 } 2368 2369 memcpy(new, map, len); 2370 new->num_stripes = num_stripes; 2371 kfree(map); 2372 return new; 2373 } 2374 2375 /* 2376 * helper to allocate device space from btrfs_device_info, in which we stored 2377 * max free space information of every device. It is used when we can not 2378 * allocate chunks by default size. 2379 * 2380 * By this helper, we can allocate a new chunk as larger as possible. 2381 */ 2382 static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans, 2383 struct btrfs_fs_devices *fs_devices, 2384 struct btrfs_device_info *devices, 2385 int nr_device, u64 type, 2386 struct map_lookup **map_lookup, 2387 int min_stripes, u64 *stripe_size) 2388 { 2389 int i, index, sort_again = 0; 2390 int min_devices = min_stripes; 2391 u64 max_avail, min_free; 2392 struct map_lookup *map = *map_lookup; 2393 int ret; 2394 2395 if (nr_device < min_stripes) 2396 return -ENOSPC; 2397 2398 btrfs_descending_sort_devices(devices, nr_device); 2399 2400 max_avail = devices[0].max_avail; 2401 if (!max_avail) 2402 return -ENOSPC; 2403 2404 for (i = 0; i < nr_device; i++) { 2405 /* 2406 * if dev_offset = 0, it means the free space of this device 2407 * is less than what we need, and we didn't search max avail 2408 * extent on this device, so do it now. 
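 * An -ENOSPC result from that search is not fatal here: the device
 * simply stays in the list to be re-sorted below; any other error
 * aborts the allocation.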
2409 */ 2410 if (!devices[i].dev_offset) { 2411 ret = find_free_dev_extent(trans, devices[i].dev, 2412 max_avail, 2413 &devices[i].dev_offset, 2414 &devices[i].max_avail); 2415 if (ret != 0 && ret != -ENOSPC) 2416 return ret; 2417 sort_again = 1; 2418 } 2419 } 2420 2421 /* we update the max avail free extent of each devices, sort again */ 2422 if (sort_again) 2423 btrfs_descending_sort_devices(devices, nr_device); 2424 2425 if (type & BTRFS_BLOCK_GROUP_DUP) 2426 min_devices = 1; 2427 2428 if (!devices[min_devices - 1].max_avail) 2429 return -ENOSPC; 2430 2431 max_avail = devices[min_devices - 1].max_avail; 2432 if (type & BTRFS_BLOCK_GROUP_DUP) 2433 do_div(max_avail, 2); 2434 2435 max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type, 2436 min_stripes, 1); 2437 if (type & BTRFS_BLOCK_GROUP_DUP) 2438 min_free = max_avail * 2; 2439 else 2440 min_free = max_avail; 2441 2442 if (min_free > devices[min_devices - 1].max_avail) 2443 return -ENOSPC; 2444 2445 map = __shrink_map_lookup_stripes(map, min_stripes); 2446 *stripe_size = max_avail; 2447 2448 index = 0; 2449 for (i = 0; i < min_stripes; i++) { 2450 map->stripes[i].dev = devices[index].dev; 2451 map->stripes[i].physical = devices[index].dev_offset; 2452 if (type & BTRFS_BLOCK_GROUP_DUP) { 2453 i++; 2454 map->stripes[i].dev = devices[index].dev; 2455 map->stripes[i].physical = devices[index].dev_offset + 2456 max_avail; 2457 } 2458 index++; 2459 } 2460 *map_lookup = map; 2461 2462 return 0; 2463 } 2464 2465 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2466 struct btrfs_root *extent_root, 2467 struct map_lookup **map_ret, 2468 u64 *num_bytes, u64 *stripe_size, 2469 u64 start, u64 type) 2470 { 2471 struct btrfs_fs_info *info = extent_root->fs_info; 2472 struct btrfs_device *device = NULL; 2473 struct btrfs_fs_devices *fs_devices = info->fs_devices; 2474 struct list_head *cur; 2475 struct map_lookup *map; 2476 struct extent_map_tree *em_tree; 2477 struct extent_map *em; 2478 struct btrfs_device_info *devices_info; 2479 struct list_head private_devs; 2480 u64 calc_size = 1024 * 1024 * 1024; 2481 u64 min_free; 2482 u64 avail; 2483 u64 dev_offset; 2484 int num_stripes; 2485 int min_stripes; 2486 int sub_stripes; 2487 int min_devices; /* the min number of devices we need */ 2488 int i; 2489 int ret; 2490 int index; 2491 2492 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2493 (type & BTRFS_BLOCK_GROUP_DUP)) { 2494 WARN_ON(1); 2495 type &= ~BTRFS_BLOCK_GROUP_DUP; 2496 } 2497 if (list_empty(&fs_devices->alloc_list)) 2498 return -ENOSPC; 2499 2500 ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes, 2501 &min_stripes, &sub_stripes); 2502 if (ret) 2503 return ret; 2504 2505 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, 2506 GFP_NOFS); 2507 if (!devices_info) 2508 return -ENOMEM; 2509 2510 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 2511 if (!map) { 2512 ret = -ENOMEM; 2513 goto error; 2514 } 2515 map->num_stripes = num_stripes; 2516 2517 cur = fs_devices->alloc_list.next; 2518 index = 0; 2519 i = 0; 2520 2521 calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type, 2522 num_stripes, 0); 2523 2524 if (type & BTRFS_BLOCK_GROUP_DUP) { 2525 min_free = calc_size * 2; 2526 min_devices = 1; 2527 } else { 2528 min_free = calc_size; 2529 min_devices = min_stripes; 2530 } 2531 2532 INIT_LIST_HEAD(&private_devs); 2533 while (index < num_stripes) { 2534 device = list_entry(cur, struct btrfs_device, dev_alloc_list); 2535 BUG_ON(!device->writeable); 2536 if (device->total_bytes > 
device->bytes_used) 2537 avail = device->total_bytes - device->bytes_used; 2538 else 2539 avail = 0; 2540 cur = cur->next; 2541 2542 if (device->in_fs_metadata && avail >= min_free) { 2543 ret = find_free_dev_extent(trans, device, min_free, 2544 &devices_info[i].dev_offset, 2545 &devices_info[i].max_avail); 2546 if (ret == 0) { 2547 list_move_tail(&device->dev_alloc_list, 2548 &private_devs); 2549 map->stripes[index].dev = device; 2550 map->stripes[index].physical = 2551 devices_info[i].dev_offset; 2552 index++; 2553 if (type & BTRFS_BLOCK_GROUP_DUP) { 2554 map->stripes[index].dev = device; 2555 map->stripes[index].physical = 2556 devices_info[i].dev_offset + 2557 calc_size; 2558 index++; 2559 } 2560 } else if (ret != -ENOSPC) 2561 goto error; 2562 2563 devices_info[i].dev = device; 2564 i++; 2565 } else if (device->in_fs_metadata && 2566 avail >= BTRFS_STRIPE_LEN) { 2567 devices_info[i].dev = device; 2568 devices_info[i].max_avail = avail; 2569 i++; 2570 } 2571 2572 if (cur == &fs_devices->alloc_list) 2573 break; 2574 } 2575 2576 list_splice(&private_devs, &fs_devices->alloc_list); 2577 if (index < num_stripes) { 2578 if (index >= min_stripes) { 2579 num_stripes = index; 2580 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2581 num_stripes /= sub_stripes; 2582 num_stripes *= sub_stripes; 2583 } 2584 2585 map = __shrink_map_lookup_stripes(map, num_stripes); 2586 } else if (i >= min_devices) { 2587 ret = __btrfs_alloc_tiny_space(trans, fs_devices, 2588 devices_info, i, type, 2589 &map, min_stripes, 2590 &calc_size); 2591 if (ret) 2592 goto error; 2593 } else { 2594 ret = -ENOSPC; 2595 goto error; 2596 } 2597 } 2598 map->sector_size = extent_root->sectorsize; 2599 map->stripe_len = BTRFS_STRIPE_LEN; 2600 map->io_align = BTRFS_STRIPE_LEN; 2601 map->io_width = BTRFS_STRIPE_LEN; 2602 map->type = type; 2603 map->sub_stripes = sub_stripes; 2604 2605 *map_ret = map; 2606 *stripe_size = calc_size; 2607 *num_bytes = chunk_bytes_by_type(type, calc_size, 2608 map->num_stripes, sub_stripes); 2609 2610 trace_btrfs_chunk_alloc(info->chunk_root, map, start, *num_bytes); 2611 2612 em = alloc_extent_map(GFP_NOFS); 2613 if (!em) { 2614 ret = -ENOMEM; 2615 goto error; 2616 } 2617 em->bdev = (struct block_device *)map; 2618 em->start = start; 2619 em->len = *num_bytes; 2620 em->block_start = 0; 2621 em->block_len = em->len; 2622 2623 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 2624 write_lock(&em_tree->lock); 2625 ret = add_extent_mapping(em_tree, em); 2626 write_unlock(&em_tree->lock); 2627 BUG_ON(ret); 2628 free_extent_map(em); 2629 2630 ret = btrfs_make_block_group(trans, extent_root, 0, type, 2631 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2632 start, *num_bytes); 2633 BUG_ON(ret); 2634 2635 index = 0; 2636 while (index < map->num_stripes) { 2637 device = map->stripes[index].dev; 2638 dev_offset = map->stripes[index].physical; 2639 2640 ret = btrfs_alloc_dev_extent(trans, device, 2641 info->chunk_root->root_key.objectid, 2642 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2643 start, dev_offset, calc_size); 2644 BUG_ON(ret); 2645 index++; 2646 } 2647 2648 kfree(devices_info); 2649 return 0; 2650 2651 error: 2652 kfree(map); 2653 kfree(devices_info); 2654 return ret; 2655 } 2656 2657 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2658 struct btrfs_root *extent_root, 2659 struct map_lookup *map, u64 chunk_offset, 2660 u64 chunk_size, u64 stripe_size) 2661 { 2662 u64 dev_offset; 2663 struct btrfs_key key; 2664 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 2665 struct btrfs_device *device; 
2666 struct btrfs_chunk *chunk; 2667 struct btrfs_stripe *stripe; 2668 size_t item_size = btrfs_chunk_item_size(map->num_stripes); 2669 int index = 0; 2670 int ret; 2671 2672 chunk = kzalloc(item_size, GFP_NOFS); 2673 if (!chunk) 2674 return -ENOMEM; 2675 2676 index = 0; 2677 while (index < map->num_stripes) { 2678 device = map->stripes[index].dev; 2679 device->bytes_used += stripe_size; 2680 ret = btrfs_update_device(trans, device); 2681 BUG_ON(ret); 2682 index++; 2683 } 2684 2685 index = 0; 2686 stripe = &chunk->stripe; 2687 while (index < map->num_stripes) { 2688 device = map->stripes[index].dev; 2689 dev_offset = map->stripes[index].physical; 2690 2691 btrfs_set_stack_stripe_devid(stripe, device->devid); 2692 btrfs_set_stack_stripe_offset(stripe, dev_offset); 2693 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 2694 stripe++; 2695 index++; 2696 } 2697 2698 btrfs_set_stack_chunk_length(chunk, chunk_size); 2699 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 2700 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 2701 btrfs_set_stack_chunk_type(chunk, map->type); 2702 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 2703 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 2704 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 2705 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); 2706 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 2707 2708 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2709 key.type = BTRFS_CHUNK_ITEM_KEY; 2710 key.offset = chunk_offset; 2711 2712 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 2713 BUG_ON(ret); 2714 2715 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2716 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, 2717 item_size); 2718 BUG_ON(ret); 2719 } 2720 2721 kfree(chunk); 2722 return 0; 2723 } 2724 2725 /* 2726 * Chunk allocation falls into two parts. The first part does works 2727 * that make the new allocated chunk useable, but not do any operation 2728 * that modifies the chunk tree. The second part does the works that 2729 * require modifying the chunk tree. This division is important for the 2730 * bootstrap process of adding storage to a seed btrfs. 
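 * Here part one is __btrfs_alloc_chunk(), which picks the devices,
 * reserves the device extents and creates the block group; part two is
 * __finish_chunk_alloc(), which updates the device items and inserts
 * the chunk item (plus the superblock sys_chunk_array entry for SYSTEM
 * chunks).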
2731 */ 2732 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2733 struct btrfs_root *extent_root, u64 type) 2734 { 2735 u64 chunk_offset; 2736 u64 chunk_size; 2737 u64 stripe_size; 2738 struct map_lookup *map; 2739 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 2740 int ret; 2741 2742 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2743 &chunk_offset); 2744 if (ret) 2745 return ret; 2746 2747 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 2748 &stripe_size, chunk_offset, type); 2749 if (ret) 2750 return ret; 2751 2752 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 2753 chunk_size, stripe_size); 2754 BUG_ON(ret); 2755 return 0; 2756 } 2757 2758 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 2759 struct btrfs_root *root, 2760 struct btrfs_device *device) 2761 { 2762 u64 chunk_offset; 2763 u64 sys_chunk_offset; 2764 u64 chunk_size; 2765 u64 sys_chunk_size; 2766 u64 stripe_size; 2767 u64 sys_stripe_size; 2768 u64 alloc_profile; 2769 struct map_lookup *map; 2770 struct map_lookup *sys_map; 2771 struct btrfs_fs_info *fs_info = root->fs_info; 2772 struct btrfs_root *extent_root = fs_info->extent_root; 2773 int ret; 2774 2775 ret = find_next_chunk(fs_info->chunk_root, 2776 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); 2777 BUG_ON(ret); 2778 2779 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 2780 (fs_info->metadata_alloc_profile & 2781 fs_info->avail_metadata_alloc_bits); 2782 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 2783 2784 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 2785 &stripe_size, chunk_offset, alloc_profile); 2786 BUG_ON(ret); 2787 2788 sys_chunk_offset = chunk_offset + chunk_size; 2789 2790 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 2791 (fs_info->system_alloc_profile & 2792 fs_info->avail_system_alloc_bits); 2793 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 2794 2795 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 2796 &sys_chunk_size, &sys_stripe_size, 2797 sys_chunk_offset, alloc_profile); 2798 BUG_ON(ret); 2799 2800 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 2801 BUG_ON(ret); 2802 2803 /* 2804 * Modifying chunk tree needs allocating new blocks from both 2805 * system block group and metadata block group. So we only can 2806 * do operations require modifying the chunk tree after both 2807 * block groups were created. 
2808 */ 2809 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 2810 chunk_size, stripe_size); 2811 BUG_ON(ret); 2812 2813 ret = __finish_chunk_alloc(trans, extent_root, sys_map, 2814 sys_chunk_offset, sys_chunk_size, 2815 sys_stripe_size); 2816 BUG_ON(ret); 2817 return 0; 2818 } 2819 2820 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 2821 { 2822 struct extent_map *em; 2823 struct map_lookup *map; 2824 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 2825 int readonly = 0; 2826 int i; 2827 2828 read_lock(&map_tree->map_tree.lock); 2829 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2830 read_unlock(&map_tree->map_tree.lock); 2831 if (!em) 2832 return 1; 2833 2834 if (btrfs_test_opt(root, DEGRADED)) { 2835 free_extent_map(em); 2836 return 0; 2837 } 2838 2839 map = (struct map_lookup *)em->bdev; 2840 for (i = 0; i < map->num_stripes; i++) { 2841 if (!map->stripes[i].dev->writeable) { 2842 readonly = 1; 2843 break; 2844 } 2845 } 2846 free_extent_map(em); 2847 return readonly; 2848 } 2849 2850 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 2851 { 2852 extent_map_tree_init(&tree->map_tree, GFP_NOFS); 2853 } 2854 2855 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 2856 { 2857 struct extent_map *em; 2858 2859 while (1) { 2860 write_lock(&tree->map_tree.lock); 2861 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 2862 if (em) 2863 remove_extent_mapping(&tree->map_tree, em); 2864 write_unlock(&tree->map_tree.lock); 2865 if (!em) 2866 break; 2867 kfree(em->bdev); 2868 /* once for us */ 2869 free_extent_map(em); 2870 /* once for the tree */ 2871 free_extent_map(em); 2872 } 2873 } 2874 2875 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 2876 { 2877 struct extent_map *em; 2878 struct map_lookup *map; 2879 struct extent_map_tree *em_tree = &map_tree->map_tree; 2880 int ret; 2881 2882 read_lock(&em_tree->lock); 2883 em = lookup_extent_mapping(em_tree, logical, len); 2884 read_unlock(&em_tree->lock); 2885 BUG_ON(!em); 2886 2887 BUG_ON(em->start > logical || em->start + em->len < logical); 2888 map = (struct map_lookup *)em->bdev; 2889 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 2890 ret = map->num_stripes; 2891 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 2892 ret = map->sub_stripes; 2893 else 2894 ret = 1; 2895 free_extent_map(em); 2896 return ret; 2897 } 2898 2899 static int find_live_mirror(struct map_lookup *map, int first, int num, 2900 int optimal) 2901 { 2902 int i; 2903 if (map->stripes[optimal].dev->bdev) 2904 return optimal; 2905 for (i = first; i < first + num; i++) { 2906 if (map->stripes[i].dev->bdev) 2907 return i; 2908 } 2909 /* we couldn't find one that doesn't fail. 
Just return something 2910 * and the io error handling code will clean up eventually 2911 */ 2912 return optimal; 2913 } 2914 2915 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2916 u64 logical, u64 *length, 2917 struct btrfs_multi_bio **multi_ret, 2918 int mirror_num) 2919 { 2920 struct extent_map *em; 2921 struct map_lookup *map; 2922 struct extent_map_tree *em_tree = &map_tree->map_tree; 2923 u64 offset; 2924 u64 stripe_offset; 2925 u64 stripe_end_offset; 2926 u64 stripe_nr; 2927 u64 stripe_nr_orig; 2928 u64 stripe_nr_end; 2929 int stripes_allocated = 8; 2930 int stripes_required = 1; 2931 int stripe_index; 2932 int i; 2933 int num_stripes; 2934 int max_errors = 0; 2935 struct btrfs_multi_bio *multi = NULL; 2936 2937 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) 2938 stripes_allocated = 1; 2939 again: 2940 if (multi_ret) { 2941 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), 2942 GFP_NOFS); 2943 if (!multi) 2944 return -ENOMEM; 2945 2946 atomic_set(&multi->error, 0); 2947 } 2948 2949 read_lock(&em_tree->lock); 2950 em = lookup_extent_mapping(em_tree, logical, *length); 2951 read_unlock(&em_tree->lock); 2952 2953 if (!em) { 2954 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2955 (unsigned long long)logical, 2956 (unsigned long long)*length); 2957 BUG(); 2958 } 2959 2960 BUG_ON(em->start > logical || em->start + em->len < logical); 2961 map = (struct map_lookup *)em->bdev; 2962 offset = logical - em->start; 2963 2964 if (mirror_num > map->num_stripes) 2965 mirror_num = 0; 2966 2967 /* if our multi bio struct is too small, back off and try again */ 2968 if (rw & REQ_WRITE) { 2969 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2970 BTRFS_BLOCK_GROUP_DUP)) { 2971 stripes_required = map->num_stripes; 2972 max_errors = 1; 2973 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2974 stripes_required = map->sub_stripes; 2975 max_errors = 1; 2976 } 2977 } 2978 if (rw & REQ_DISCARD) { 2979 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2980 BTRFS_BLOCK_GROUP_RAID1 | 2981 BTRFS_BLOCK_GROUP_DUP | 2982 BTRFS_BLOCK_GROUP_RAID10)) { 2983 stripes_required = map->num_stripes; 2984 } 2985 } 2986 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && 2987 stripes_allocated < stripes_required) { 2988 stripes_allocated = map->num_stripes; 2989 free_extent_map(em); 2990 kfree(multi); 2991 goto again; 2992 } 2993 stripe_nr = offset; 2994 /* 2995 * stripe_nr counts the total number of stripes we have to stride 2996 * to get to this block 2997 */ 2998 do_div(stripe_nr, map->stripe_len); 2999 3000 stripe_offset = stripe_nr * map->stripe_len; 3001 BUG_ON(offset < stripe_offset); 3002 3003 /* stripe_offset is the offset of this block in its stripe*/ 3004 stripe_offset = offset - stripe_offset; 3005 3006 if (rw & REQ_DISCARD) 3007 *length = min_t(u64, em->len - offset, *length); 3008 else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 3009 BTRFS_BLOCK_GROUP_RAID1 | 3010 BTRFS_BLOCK_GROUP_RAID10 | 3011 BTRFS_BLOCK_GROUP_DUP)) { 3012 /* we limit the length of each bio to what fits in a stripe */ 3013 *length = min_t(u64, em->len - offset, 3014 map->stripe_len - stripe_offset); 3015 } else { 3016 *length = em->len - offset; 3017 } 3018 3019 if (!multi_ret) 3020 goto out; 3021 3022 num_stripes = 1; 3023 stripe_index = 0; 3024 stripe_nr_orig = stripe_nr; 3025 stripe_nr_end = (offset + *length + map->stripe_len - 1) & 3026 (~(map->stripe_len - 1)); 3027 do_div(stripe_nr_end, map->stripe_len); 3028 stripe_end_offset = stripe_nr_end * map->stripe_len - 3029 (offset + *length); 3030 
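	/*
	 * Illustrative example (numbers are not from the code above): with
	 * the 64K BTRFS_STRIPE_LEN and a two-device RAID0 chunk, an offset
	 * of 192K into the chunk gives stripe_nr = 3 and stripe_offset = 0;
	 * the RAID0 branch below then yields stripe_index = 3 % 2 = 1 and
	 * stripe_nr = 1, i.e. the second device, one full stripe down.
	 */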
if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3031 if (rw & REQ_DISCARD) 3032 num_stripes = min_t(u64, map->num_stripes, 3033 stripe_nr_end - stripe_nr_orig); 3034 stripe_index = do_div(stripe_nr, map->num_stripes); 3035 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 3036 if (rw & (REQ_WRITE | REQ_DISCARD)) 3037 num_stripes = map->num_stripes; 3038 else if (mirror_num) 3039 stripe_index = mirror_num - 1; 3040 else { 3041 stripe_index = find_live_mirror(map, 0, 3042 map->num_stripes, 3043 current->pid % map->num_stripes); 3044 } 3045 3046 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3047 if (rw & (REQ_WRITE | REQ_DISCARD)) 3048 num_stripes = map->num_stripes; 3049 else if (mirror_num) 3050 stripe_index = mirror_num - 1; 3051 3052 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3053 int factor = map->num_stripes / map->sub_stripes; 3054 3055 stripe_index = do_div(stripe_nr, factor); 3056 stripe_index *= map->sub_stripes; 3057 3058 if (rw & REQ_WRITE) 3059 num_stripes = map->sub_stripes; 3060 else if (rw & REQ_DISCARD) 3061 num_stripes = min_t(u64, map->sub_stripes * 3062 (stripe_nr_end - stripe_nr_orig), 3063 map->num_stripes); 3064 else if (mirror_num) 3065 stripe_index += mirror_num - 1; 3066 else { 3067 stripe_index = find_live_mirror(map, stripe_index, 3068 map->sub_stripes, stripe_index + 3069 current->pid % map->sub_stripes); 3070 } 3071 } else { 3072 /* 3073 * after this do_div call, stripe_nr is the number of stripes 3074 * on this device we have to walk to find the data, and 3075 * stripe_index is the number of our device in the stripe array 3076 */ 3077 stripe_index = do_div(stripe_nr, map->num_stripes); 3078 } 3079 BUG_ON(stripe_index >= map->num_stripes); 3080 3081 if (rw & REQ_DISCARD) { 3082 for (i = 0; i < num_stripes; i++) { 3083 multi->stripes[i].physical = 3084 map->stripes[stripe_index].physical + 3085 stripe_offset + stripe_nr * map->stripe_len; 3086 multi->stripes[i].dev = map->stripes[stripe_index].dev; 3087 3088 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3089 u64 stripes; 3090 u32 last_stripe = 0; 3091 int j; 3092 3093 div_u64_rem(stripe_nr_end - 1, 3094 map->num_stripes, 3095 &last_stripe); 3096 3097 for (j = 0; j < map->num_stripes; j++) { 3098 u32 test; 3099 3100 div_u64_rem(stripe_nr_end - 1 - j, 3101 map->num_stripes, &test); 3102 if (test == stripe_index) 3103 break; 3104 } 3105 stripes = stripe_nr_end - 1 - j; 3106 do_div(stripes, map->num_stripes); 3107 multi->stripes[i].length = map->stripe_len * 3108 (stripes - stripe_nr + 1); 3109 3110 if (i == 0) { 3111 multi->stripes[i].length -= 3112 stripe_offset; 3113 stripe_offset = 0; 3114 } 3115 if (stripe_index == last_stripe) 3116 multi->stripes[i].length -= 3117 stripe_end_offset; 3118 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3119 u64 stripes; 3120 int j; 3121 int factor = map->num_stripes / 3122 map->sub_stripes; 3123 u32 last_stripe = 0; 3124 3125 div_u64_rem(stripe_nr_end - 1, 3126 factor, &last_stripe); 3127 last_stripe *= map->sub_stripes; 3128 3129 for (j = 0; j < factor; j++) { 3130 u32 test; 3131 3132 div_u64_rem(stripe_nr_end - 1 - j, 3133 factor, &test); 3134 3135 if (test == 3136 stripe_index / map->sub_stripes) 3137 break; 3138 } 3139 stripes = stripe_nr_end - 1 - j; 3140 do_div(stripes, factor); 3141 multi->stripes[i].length = map->stripe_len * 3142 (stripes - stripe_nr + 1); 3143 3144 if (i < map->sub_stripes) { 3145 multi->stripes[i].length -= 3146 stripe_offset; 3147 if (i == map->sub_stripes - 1) 3148 stripe_offset = 0; 3149 } 3150 if (stripe_index >= last_stripe && 3151 
stripe_index <= (last_stripe + 3152 map->sub_stripes - 1)) { 3153 multi->stripes[i].length -= 3154 stripe_end_offset; 3155 } 3156 } else 3157 multi->stripes[i].length = *length; 3158 3159 stripe_index++; 3160 if (stripe_index == map->num_stripes) { 3161 /* This could only happen for RAID0/10 */ 3162 stripe_index = 0; 3163 stripe_nr++; 3164 } 3165 } 3166 } else { 3167 for (i = 0; i < num_stripes; i++) { 3168 multi->stripes[i].physical = 3169 map->stripes[stripe_index].physical + 3170 stripe_offset + 3171 stripe_nr * map->stripe_len; 3172 multi->stripes[i].dev = 3173 map->stripes[stripe_index].dev; 3174 stripe_index++; 3175 } 3176 } 3177 if (multi_ret) { 3178 *multi_ret = multi; 3179 multi->num_stripes = num_stripes; 3180 multi->max_errors = max_errors; 3181 } 3182 out: 3183 free_extent_map(em); 3184 return 0; 3185 } 3186 3187 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3188 u64 logical, u64 *length, 3189 struct btrfs_multi_bio **multi_ret, int mirror_num) 3190 { 3191 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3192 mirror_num); 3193 } 3194 3195 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 3196 u64 chunk_start, u64 physical, u64 devid, 3197 u64 **logical, int *naddrs, int *stripe_len) 3198 { 3199 struct extent_map_tree *em_tree = &map_tree->map_tree; 3200 struct extent_map *em; 3201 struct map_lookup *map; 3202 u64 *buf; 3203 u64 bytenr; 3204 u64 length; 3205 u64 stripe_nr; 3206 int i, j, nr = 0; 3207 3208 read_lock(&em_tree->lock); 3209 em = lookup_extent_mapping(em_tree, chunk_start, 1); 3210 read_unlock(&em_tree->lock); 3211 3212 BUG_ON(!em || em->start != chunk_start); 3213 map = (struct map_lookup *)em->bdev; 3214 3215 length = em->len; 3216 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 3217 do_div(length, map->num_stripes / map->sub_stripes); 3218 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 3219 do_div(length, map->num_stripes); 3220 3221 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 3222 BUG_ON(!buf); 3223 3224 for (i = 0; i < map->num_stripes; i++) { 3225 if (devid && map->stripes[i].dev->devid != devid) 3226 continue; 3227 if (map->stripes[i].physical > physical || 3228 map->stripes[i].physical + length <= physical) 3229 continue; 3230 3231 stripe_nr = physical - map->stripes[i].physical; 3232 do_div(stripe_nr, map->stripe_len); 3233 3234 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3235 stripe_nr = stripe_nr * map->num_stripes + i; 3236 do_div(stripe_nr, map->sub_stripes); 3237 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3238 stripe_nr = stripe_nr * map->num_stripes + i; 3239 } 3240 bytenr = chunk_start + stripe_nr * map->stripe_len; 3241 WARN_ON(nr >= map->num_stripes); 3242 for (j = 0; j < nr; j++) { 3243 if (buf[j] == bytenr) 3244 break; 3245 } 3246 if (j == nr) { 3247 WARN_ON(nr >= map->num_stripes); 3248 buf[nr++] = bytenr; 3249 } 3250 } 3251 3252 *logical = buf; 3253 *naddrs = nr; 3254 *stripe_len = map->stripe_len; 3255 3256 free_extent_map(em); 3257 return 0; 3258 } 3259 3260 static void end_bio_multi_stripe(struct bio *bio, int err) 3261 { 3262 struct btrfs_multi_bio *multi = bio->bi_private; 3263 int is_orig_bio = 0; 3264 3265 if (err) 3266 atomic_inc(&multi->error); 3267 3268 if (bio == multi->orig_bio) 3269 is_orig_bio = 1; 3270 3271 if (atomic_dec_and_test(&multi->stripes_pending)) { 3272 if (!is_orig_bio) { 3273 bio_put(bio); 3274 bio = multi->orig_bio; 3275 } 3276 bio->bi_private = multi->private; 3277 bio->bi_end_io = multi->end_io; 3278 /* only send an error to the higher layers if it is 3279 * 
beyond the tolerance of the multi-bio 3280 */ 3281 if (atomic_read(&multi->error) > multi->max_errors) { 3282 err = -EIO; 3283 } else if (err) { 3284 /* 3285 * this bio is actually up to date, we didn't 3286 * go over the max number of errors 3287 */ 3288 set_bit(BIO_UPTODATE, &bio->bi_flags); 3289 err = 0; 3290 } 3291 kfree(multi); 3292 3293 bio_endio(bio, err); 3294 } else if (!is_orig_bio) { 3295 bio_put(bio); 3296 } 3297 } 3298 3299 struct async_sched { 3300 struct bio *bio; 3301 int rw; 3302 struct btrfs_fs_info *info; 3303 struct btrfs_work work; 3304 }; 3305 3306 /* 3307 * see run_scheduled_bios for a description of why bios are collected for 3308 * async submit. 3309 * 3310 * This will add one bio to the pending list for a device and make sure 3311 * the work struct is scheduled. 3312 */ 3313 static noinline int schedule_bio(struct btrfs_root *root, 3314 struct btrfs_device *device, 3315 int rw, struct bio *bio) 3316 { 3317 int should_queue = 1; 3318 struct btrfs_pending_bios *pending_bios; 3319 3320 /* don't bother with additional async steps for reads, right now */ 3321 if (!(rw & REQ_WRITE)) { 3322 bio_get(bio); 3323 submit_bio(rw, bio); 3324 bio_put(bio); 3325 return 0; 3326 } 3327 3328 /* 3329 * nr_async_bios allows us to reliably return congestion to the 3330 * higher layers. Otherwise, the async bio makes it appear we have 3331 * made progress against dirty pages when we've really just put it 3332 * on a queue for later 3333 */ 3334 atomic_inc(&root->fs_info->nr_async_bios); 3335 WARN_ON(bio->bi_next); 3336 bio->bi_next = NULL; 3337 bio->bi_rw |= rw; 3338 3339 spin_lock(&device->io_lock); 3340 if (bio->bi_rw & REQ_SYNC) 3341 pending_bios = &device->pending_sync_bios; 3342 else 3343 pending_bios = &device->pending_bios; 3344 3345 if (pending_bios->tail) 3346 pending_bios->tail->bi_next = bio; 3347 3348 pending_bios->tail = bio; 3349 if (!pending_bios->head) 3350 pending_bios->head = bio; 3351 if (device->running_pending) 3352 should_queue = 0; 3353 3354 spin_unlock(&device->io_lock); 3355 3356 if (should_queue) 3357 btrfs_queue_worker(&root->fs_info->submit_workers, 3358 &device->work); 3359 return 0; 3360 } 3361 3362 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 3363 int mirror_num, int async_submit) 3364 { 3365 struct btrfs_mapping_tree *map_tree; 3366 struct btrfs_device *dev; 3367 struct bio *first_bio = bio; 3368 u64 logical = (u64)bio->bi_sector << 9; 3369 u64 length = 0; 3370 u64 map_length; 3371 struct btrfs_multi_bio *multi = NULL; 3372 int ret; 3373 int dev_nr = 0; 3374 int total_devs = 1; 3375 3376 length = bio->bi_size; 3377 map_tree = &root->fs_info->mapping_tree; 3378 map_length = length; 3379 3380 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, 3381 mirror_num); 3382 BUG_ON(ret); 3383 3384 total_devs = multi->num_stripes; 3385 if (map_length < length) { 3386 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 3387 "len %llu\n", (unsigned long long)logical, 3388 (unsigned long long)length, 3389 (unsigned long long)map_length); 3390 BUG(); 3391 } 3392 multi->end_io = first_bio->bi_end_io; 3393 multi->private = first_bio->bi_private; 3394 multi->orig_bio = first_bio; 3395 atomic_set(&multi->stripes_pending, multi->num_stripes); 3396 3397 while (dev_nr < total_devs) { 3398 if (total_devs > 1) { 3399 if (dev_nr < total_devs - 1) { 3400 bio = bio_clone(first_bio, GFP_NOFS); 3401 BUG_ON(!bio); 3402 } else { 3403 bio = first_bio; 3404 } 3405 bio->bi_private = multi; 3406 bio->bi_end_io = end_bio_multi_stripe; 3407 } 3408 
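		/*
		 * send this copy to its stripe: point the bio at the physical
		 * sector on the stripe's device and submit it, or fail it with
		 * -EIO below if the device is missing or cannot take the write.
		 */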
bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3409 dev = multi->stripes[dev_nr].dev; 3410 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { 3411 bio->bi_bdev = dev->bdev; 3412 if (async_submit) 3413 schedule_bio(root, dev, rw, bio); 3414 else 3415 submit_bio(rw, bio); 3416 } else { 3417 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; 3418 bio->bi_sector = logical >> 9; 3419 bio_endio(bio, -EIO); 3420 } 3421 dev_nr++; 3422 } 3423 if (total_devs == 1) 3424 kfree(multi); 3425 return 0; 3426 } 3427 3428 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 3429 u8 *uuid, u8 *fsid) 3430 { 3431 struct btrfs_device *device; 3432 struct btrfs_fs_devices *cur_devices; 3433 3434 cur_devices = root->fs_info->fs_devices; 3435 while (cur_devices) { 3436 if (!fsid || 3437 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 3438 device = __find_device(&cur_devices->devices, 3439 devid, uuid); 3440 if (device) 3441 return device; 3442 } 3443 cur_devices = cur_devices->seed; 3444 } 3445 return NULL; 3446 } 3447 3448 static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 3449 u64 devid, u8 *dev_uuid) 3450 { 3451 struct btrfs_device *device; 3452 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 3453 3454 device = kzalloc(sizeof(*device), GFP_NOFS); 3455 if (!device) 3456 return NULL; 3457 list_add(&device->dev_list, 3458 &fs_devices->devices); 3459 device->dev_root = root->fs_info->dev_root; 3460 device->devid = devid; 3461 device->work.func = pending_bios_fn; 3462 device->fs_devices = fs_devices; 3463 device->missing = 1; 3464 fs_devices->num_devices++; 3465 fs_devices->missing_devices++; 3466 spin_lock_init(&device->io_lock); 3467 INIT_LIST_HEAD(&device->dev_alloc_list); 3468 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 3469 return device; 3470 } 3471 3472 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, 3473 struct extent_buffer *leaf, 3474 struct btrfs_chunk *chunk) 3475 { 3476 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 3477 struct map_lookup *map; 3478 struct extent_map *em; 3479 u64 logical; 3480 u64 length; 3481 u64 devid; 3482 u8 uuid[BTRFS_UUID_SIZE]; 3483 int num_stripes; 3484 int ret; 3485 int i; 3486 3487 logical = key->offset; 3488 length = btrfs_chunk_length(leaf, chunk); 3489 3490 read_lock(&map_tree->map_tree.lock); 3491 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 3492 read_unlock(&map_tree->map_tree.lock); 3493 3494 /* already mapped? 
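 * read_one_chunk() runs once for the sys_chunk_array and again while
 * scanning the chunk tree, so the mapping may already be present.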
*/ 3495 if (em && em->start <= logical && em->start + em->len > logical) { 3496 free_extent_map(em); 3497 return 0; 3498 } else if (em) { 3499 free_extent_map(em); 3500 } 3501 3502 em = alloc_extent_map(GFP_NOFS); 3503 if (!em) 3504 return -ENOMEM; 3505 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3506 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3507 if (!map) { 3508 free_extent_map(em); 3509 return -ENOMEM; 3510 } 3511 3512 em->bdev = (struct block_device *)map; 3513 em->start = logical; 3514 em->len = length; 3515 em->block_start = 0; 3516 em->block_len = em->len; 3517 3518 map->num_stripes = num_stripes; 3519 map->io_width = btrfs_chunk_io_width(leaf, chunk); 3520 map->io_align = btrfs_chunk_io_align(leaf, chunk); 3521 map->sector_size = btrfs_chunk_sector_size(leaf, chunk); 3522 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 3523 map->type = btrfs_chunk_type(leaf, chunk); 3524 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 3525 for (i = 0; i < num_stripes; i++) { 3526 map->stripes[i].physical = 3527 btrfs_stripe_offset_nr(leaf, chunk, i); 3528 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 3529 read_extent_buffer(leaf, uuid, (unsigned long) 3530 btrfs_stripe_dev_uuid_nr(chunk, i), 3531 BTRFS_UUID_SIZE); 3532 map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 3533 NULL); 3534 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 3535 kfree(map); 3536 free_extent_map(em); 3537 return -EIO; 3538 } 3539 if (!map->stripes[i].dev) { 3540 map->stripes[i].dev = 3541 add_missing_dev(root, devid, uuid); 3542 if (!map->stripes[i].dev) { 3543 kfree(map); 3544 free_extent_map(em); 3545 return -EIO; 3546 } 3547 } 3548 map->stripes[i].dev->in_fs_metadata = 1; 3549 } 3550 3551 write_lock(&map_tree->map_tree.lock); 3552 ret = add_extent_mapping(&map_tree->map_tree, em); 3553 write_unlock(&map_tree->map_tree.lock); 3554 BUG_ON(ret); 3555 free_extent_map(em); 3556 3557 return 0; 3558 } 3559 3560 static int fill_device_from_item(struct extent_buffer *leaf, 3561 struct btrfs_dev_item *dev_item, 3562 struct btrfs_device *device) 3563 { 3564 unsigned long ptr; 3565 3566 device->devid = btrfs_device_id(leaf, dev_item); 3567 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 3568 device->total_bytes = device->disk_total_bytes; 3569 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 3570 device->type = btrfs_device_type(leaf, dev_item); 3571 device->io_align = btrfs_device_io_align(leaf, dev_item); 3572 device->io_width = btrfs_device_io_width(leaf, dev_item); 3573 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 3574 3575 ptr = (unsigned long)btrfs_device_uuid(dev_item); 3576 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 3577 3578 return 0; 3579 } 3580 3581 static int open_seed_devices(struct btrfs_root *root, u8 *fsid) 3582 { 3583 struct btrfs_fs_devices *fs_devices; 3584 int ret; 3585 3586 mutex_lock(&uuid_mutex); 3587 3588 fs_devices = root->fs_info->fs_devices->seed; 3589 while (fs_devices) { 3590 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 3591 ret = 0; 3592 goto out; 3593 } 3594 fs_devices = fs_devices->seed; 3595 } 3596 3597 fs_devices = find_fsid(fsid); 3598 if (!fs_devices) { 3599 ret = -ENOENT; 3600 goto out; 3601 } 3602 3603 fs_devices = clone_fs_devices(fs_devices); 3604 if (IS_ERR(fs_devices)) { 3605 ret = PTR_ERR(fs_devices); 3606 goto out; 3607 } 3608 3609 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 3610 root->fs_info->bdev_holder); 3611 if (ret) 3612 goto out; 3613 
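	/* the fs_devices we just opened must really be a seed filesystem */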
3614 if (!fs_devices->seeding) { 3615 __btrfs_close_devices(fs_devices); 3616 free_fs_devices(fs_devices); 3617 ret = -EINVAL; 3618 goto out; 3619 } 3620 3621 fs_devices->seed = root->fs_info->fs_devices->seed; 3622 root->fs_info->fs_devices->seed = fs_devices; 3623 out: 3624 mutex_unlock(&uuid_mutex); 3625 return ret; 3626 } 3627 3628 static int read_one_dev(struct btrfs_root *root, 3629 struct extent_buffer *leaf, 3630 struct btrfs_dev_item *dev_item) 3631 { 3632 struct btrfs_device *device; 3633 u64 devid; 3634 int ret; 3635 u8 fs_uuid[BTRFS_UUID_SIZE]; 3636 u8 dev_uuid[BTRFS_UUID_SIZE]; 3637 3638 devid = btrfs_device_id(leaf, dev_item); 3639 read_extent_buffer(leaf, dev_uuid, 3640 (unsigned long)btrfs_device_uuid(dev_item), 3641 BTRFS_UUID_SIZE); 3642 read_extent_buffer(leaf, fs_uuid, 3643 (unsigned long)btrfs_device_fsid(dev_item), 3644 BTRFS_UUID_SIZE); 3645 3646 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 3647 ret = open_seed_devices(root, fs_uuid); 3648 if (ret && !btrfs_test_opt(root, DEGRADED)) 3649 return ret; 3650 } 3651 3652 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 3653 if (!device || !device->bdev) { 3654 if (!btrfs_test_opt(root, DEGRADED)) 3655 return -EIO; 3656 3657 if (!device) { 3658 printk(KERN_WARNING "warning devid %llu missing\n", 3659 (unsigned long long)devid); 3660 device = add_missing_dev(root, devid, dev_uuid); 3661 if (!device) 3662 return -ENOMEM; 3663 } else if (!device->missing) { 3664 /* 3665 * this happens when a device that was properly setup 3666 * in the device info lists suddenly goes bad. 3667 * device->bdev is NULL, and so we have to set 3668 * device->missing to one here 3669 */ 3670 root->fs_info->fs_devices->missing_devices++; 3671 device->missing = 1; 3672 } 3673 } 3674 3675 if (device->fs_devices != root->fs_info->fs_devices) { 3676 BUG_ON(device->writeable); 3677 if (device->generation != 3678 btrfs_device_generation(leaf, dev_item)) 3679 return -EINVAL; 3680 } 3681 3682 fill_device_from_item(leaf, dev_item, device); 3683 device->dev_root = root->fs_info->dev_root; 3684 device->in_fs_metadata = 1; 3685 if (device->writeable) 3686 device->fs_devices->total_rw_bytes += device->total_bytes; 3687 ret = 0; 3688 return ret; 3689 } 3690 3691 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf) 3692 { 3693 struct btrfs_dev_item *dev_item; 3694 3695 dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block, 3696 dev_item); 3697 return read_one_dev(root, buf, dev_item); 3698 } 3699 3700 int btrfs_read_sys_array(struct btrfs_root *root) 3701 { 3702 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 3703 struct extent_buffer *sb; 3704 struct btrfs_disk_key *disk_key; 3705 struct btrfs_chunk *chunk; 3706 u8 *ptr; 3707 unsigned long sb_ptr; 3708 int ret = 0; 3709 u32 num_stripes; 3710 u32 array_size; 3711 u32 len = 0; 3712 u32 cur; 3713 struct btrfs_key key; 3714 3715 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, 3716 BTRFS_SUPER_INFO_SIZE); 3717 if (!sb) 3718 return -ENOMEM; 3719 btrfs_set_buffer_uptodate(sb); 3720 btrfs_set_buffer_lockdep_class(sb, 0); 3721 3722 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 3723 array_size = btrfs_super_sys_array_size(super_copy); 3724 3725 ptr = super_copy->sys_chunk_array; 3726 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); 3727 cur = 0; 3728 3729 while (cur < array_size) { 3730 disk_key = (struct btrfs_disk_key *)ptr; 3731 btrfs_disk_key_to_cpu(&key, disk_key); 3732 3733 len = 
sizeof(*disk_key); ptr += len; 3734 sb_ptr += len; 3735 cur += len; 3736 3737 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 3738 chunk = (struct btrfs_chunk *)sb_ptr; 3739 ret = read_one_chunk(root, &key, sb, chunk); 3740 if (ret) 3741 break; 3742 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 3743 len = btrfs_chunk_item_size(num_stripes); 3744 } else { 3745 ret = -EIO; 3746 break; 3747 } 3748 ptr += len; 3749 sb_ptr += len; 3750 cur += len; 3751 } 3752 free_extent_buffer(sb); 3753 return ret; 3754 } 3755 3756 int btrfs_read_chunk_tree(struct btrfs_root *root) 3757 { 3758 struct btrfs_path *path; 3759 struct extent_buffer *leaf; 3760 struct btrfs_key key; 3761 struct btrfs_key found_key; 3762 int ret; 3763 int slot; 3764 3765 root = root->fs_info->chunk_root; 3766 3767 path = btrfs_alloc_path(); 3768 if (!path) 3769 return -ENOMEM; 3770 3771 /* first we search for all of the device items, and then we 3772 * read in all of the chunk items. This way we can create chunk 3773 * mappings that reference all of the devices that are afound 3774 */ 3775 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 3776 key.offset = 0; 3777 key.type = 0; 3778 again: 3779 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3780 if (ret < 0) 3781 goto error; 3782 while (1) { 3783 leaf = path->nodes[0]; 3784 slot = path->slots[0]; 3785 if (slot >= btrfs_header_nritems(leaf)) { 3786 ret = btrfs_next_leaf(root, path); 3787 if (ret == 0) 3788 continue; 3789 if (ret < 0) 3790 goto error; 3791 break; 3792 } 3793 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3794 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3795 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) 3796 break; 3797 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 3798 struct btrfs_dev_item *dev_item; 3799 dev_item = btrfs_item_ptr(leaf, slot, 3800 struct btrfs_dev_item); 3801 ret = read_one_dev(root, leaf, dev_item); 3802 if (ret) 3803 goto error; 3804 } 3805 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 3806 struct btrfs_chunk *chunk; 3807 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3808 ret = read_one_chunk(root, &found_key, leaf, chunk); 3809 if (ret) 3810 goto error; 3811 } 3812 path->slots[0]++; 3813 } 3814 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3815 key.objectid = 0; 3816 btrfs_release_path(root, path); 3817 goto again; 3818 } 3819 ret = 0; 3820 error: 3821 btrfs_free_path(path); 3822 return ret; 3823 } 3824