/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 3,
	},
};

const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
	[BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
	[BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
	[BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
	[BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
};

/*
 * Table to convert BTRFS_RAID_* to the error code if the minimum number of
 * devices condition is not met. Zero means there's no corresponding
 * BTRFS_ERROR_DEV_*_NOT_MET value.
 */
const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	[BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	[BTRFS_RAID_DUP]    = 0,
	[BTRFS_RAID_RAID0]  = 0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	[BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
};

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info,
				struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

static struct btrfs_fs_devices *__alloc_fs_devices(void)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->resized_devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->list);

	return fs_devs;
}

/**
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:	a pointer to UUID for this FS. If NULL a new UUID is
 *		generated.
 *
 * Return: a pointer to a new &struct btrfs_fs_devices on success;
 * ERR_PTR() on error. Returned struct is not linked onto any lists and
 * can be destroyed with kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = __alloc_fs_devices();
	if (IS_ERR(fs_devs))
		return fs_devs;

	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
	else
		generate_random_uuid(fs_devs->fsid);

	return fs_devs;
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		rcu_string_free(device->name);
		kfree(device);
	}
	kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
				 enum kobject_action action)
{
	int ret;

	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
	if (ret)
		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
			action,
			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
			&disk_to_dev(bdev->bd_disk)->kobj);
}

void btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, list);
		list_del(&fs_devices->list);
		free_fs_devices(fs_devices);
	}
}

static struct btrfs_device *__alloc_device(void)
{
	struct btrfs_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->resized_list);

	spin_lock_init(&dev->io_lock);

	spin_lock_init(&dev->reada_lock);
	atomic_set(&dev->reada_in_flight, 0);
	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

	return dev;
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
						   u64 devid, u8 *uuid)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, 4096);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*bh = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*bh)) {
		ret = PTR_ERR(*bh);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	*bh = NULL;
	return ret;
}

static void requeue_list(struct btrfs_pending_bios *pending_bios,
			 struct bio *head, struct bio *tail)
{

	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device. This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block. The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested. This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long limit;
	unsigned long last_waited = 0;
	int force_reg = 0;
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device. We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, set up a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = blk_get_backing_dev_info(device->bdev);
	limit = btrfs_async_submit_limit(fs_info);
	limit = limit * 2 / 3;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held). But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop. Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		     pending_bios != &device->pending_sync_bios &&
		     device->pending_sync_bios.head) ||
		    (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		     device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;

		/*
		 * atomic_dec_return implies a barrier for waitqueue_active
		 */
		if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
		    waitqueue_active(&fs_info->async_submit_wait))
			wake_up(&fs_info->async_submit_wait);

		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		btrfsic_submit_bio(cur);
		num_run++;
		batch_run++;

		cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested. Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop. So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				cond_resched();
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_queue_work(fs_info->submit_workers,
					 &device->work);
			goto done;
		}
		/* unplug every 64 requests just for good measure */
		if (batch_run % 64 == 0) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}


void btrfs_free_stale_device(struct btrfs_device *cur_dev)
{
	struct btrfs_fs_devices *fs_devs;
	struct btrfs_device *dev;

	if (!cur_dev->name)
		return;

	list_for_each_entry(fs_devs, &fs_uuids, list) {
		int del = 1;

		if (fs_devs->opened)
			continue;
		if (fs_devs->seeding)
			continue;

		list_for_each_entry(dev, &fs_devs->devices, dev_list) {

			if (dev == cur_dev)
				continue;
			if (!dev->name)
				continue;

			/*
			 * Todo: This won't be enough. What if the same device
			 * comes back (with a new uuid) via its mapper path?
			 * But for now, this does help as mostly an admin will
			 * either use the mapper or the non-mapper path
			 * throughout.
			 */
			rcu_read_lock();
			del = strcmp(rcu_str_deref(dev->name),
				     rcu_str_deref(cur_dev->name));
			rcu_read_unlock();
			if (!del)
				break;
		}

		if (!del) {
			/* delete the stale device */
			if (fs_devs->num_devices == 1) {
				btrfs_sysfs_remove_fsid(fs_devs);
				list_del(&fs_devs->list);
				free_fs_devices(fs_devs);
			} else {
				fs_devs->num_devices--;
				list_del(&dev->dev_list);
				rcu_string_free(dev->name);
				kfree(dev);
			}
			break;
		}
	}
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * 1   - first time device is seen
 * 0   - device already known
 * < 0 - error
 */
static noinline int device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	struct rcu_string *name;
	int ret = 0;
	u64 found_transid = btrfs_super_generation(disk_super);

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = alloc_fs_devices(disk_super->fsid);
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);

		list_add(&fs_devices->list, &fs_uuids);

		device = NULL;
	} else {
		device = __find_device(&fs_devices->devices, devid,
				       disk_super->dev_item.uuid);
	}

	if (!device) {
		if (fs_devices->opened)
			return -EBUSY;

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			/* we can safely leave the fs_devices entry around */
			return PTR_ERR(device);
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			kfree(device);
			return -ENOMEM;
		}
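		/* publish the device name via RCU before linking it into the list */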
		rcu_assign_pointer(device->name, name);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;
		mutex_unlock(&fs_devices->device_list_mutex);

		ret = 1;
		device->fs_devices = fs_devices;
	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         a different name, or
		 *      b. The missing-disk-which-was-replaced has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further, in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transactions when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted. We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with the same uuid and devid. We keep the one
			 * with the larger generation number or the last-in if
			 * the generations are equal.
			 */
			return -EEXIST;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name)
			return -ENOMEM;
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (device->missing) {
			fs_devices->missing_devices--;
			device->missing = 0;
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * the generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with the largest generation
	 * (as above).
	 */
	if (!fs_devices->opened)
		device->generation = found_transid;

	/*
	 * if there is new btrfs on an already registered device,
	 * then remove the stale device entry.
	 */
	if (ret > 0)
		btrfs_free_stale_device(device);

	*fs_devices_ret = fs_devices;

	return ret;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = alloc_fs_devices(orig->fsid);
	if (IS_ERR(fs_devices))
		return fs_devices;

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

	/* We have held the volume lock, it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device))
			goto error;

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
			if (!name) {
				kfree(device);
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
{
	struct btrfs_device *device, *next;
	struct btrfs_device *latest_dev = NULL;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (device->in_fs_metadata) {
			if (!device->is_tgtdev_for_dev_replace &&
			    (!latest_dev ||
			     device->generation > latest_dev->generation)) {
				latest_dev = device;
			}
			continue;
		}

		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
			 * In the second step, the dev_replace state is
			 * read from the device tree and it is known
			 * whether the procedure is really active or
			 * not, which means whether this device is
			 * used or whether it should be removed.
			 */
			if (step == 0 || device->is_tgtdev_for_dev_replace) {
				continue;
			}
		}
		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (device->writeable) {
			list_del_init(&device->dev_alloc_list);
			device->writeable = 0;
			if (!device->is_tgtdev_for_dev_replace)
				fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		rcu_string_free(device->name);
		kfree(device);
	}

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
}

static void __free_device(struct work_struct *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, rcu_work);
	rcu_string_free(device->name);
	kfree(device);
}

static void free_device(struct rcu_head *head)
{
	struct btrfs_device *device;

	device = container_of(head, struct btrfs_device, rcu);

	INIT_WORK(&device->rcu_work, __free_device);
	schedule_work(&device->rcu_work);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (device->bdev && device->writeable) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	if (device->bdev)
		blkdev_put(device->bdev, device->mode);
}

static void btrfs_prepare_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;
	struct btrfs_device *new_device;
	struct rcu_string *name;

	if (device->bdev)
		fs_devices->open_devices--;

	if (device->writeable &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (device->missing)
		fs_devices->missing_devices--;

	new_device = btrfs_alloc_device(NULL, &device->devid,
					device->uuid);
	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

	/* Safe because we are under uuid_mutex */
	if (device->name) {
		name = rcu_string_strdup(device->name->str, GFP_NOFS);
		BUG_ON(!name); /* -ENOMEM */
		rcu_assign_pointer(new_device->name, name);
	}

	list_replace_rcu(&device->dev_list, &new_device->dev_list);
	new_device->fs_devices = device->fs_devices;
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;
	struct list_head pending_put;

	INIT_LIST_HEAD(&pending_put);

	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
		btrfs_prepare_close_one_device(device);
		list_add(&device->dev_list, &pending_put);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * btrfs_show_devname() uses the device_list_mutex, and a call to
	 * blkdev_put() can sometimes lead to the VFS calling back into
	 * this function. So do the put outside of device_list_mutex,
	 * as of now.
	 */
	while (!list_empty(&pending_put)) {
		device = list_first_entry(&pending_put,
				struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_close_bdev(device);
		call_rcu(&device->rcu, free_device);
	}

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = __btrfs_close_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	/*
	 * Wait for rcu kworkers under __btrfs_close_devices
	 * to finish all blkdev_puts so the devices are really
	 * free when umount is done.
	 */
	rcu_barrier();
	return ret;
}

static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int seeding = 1;
	int ret = 0;

	flags |= FMODE_EXCL;

	list_for_each_entry(device, head, dev_list) {
		if (device->bdev)
			continue;
		if (!device->name)
			continue;

		/* Just open everything we can; ignore failures here */
		if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
					  &bdev, &bh))
			continue;

		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = btrfs_stack_device_id(&disk_super->dev_item);
		if (devid != device->devid)
			goto error_brelse;

		if (memcmp(device->uuid, disk_super->dev_item.uuid,
			   BTRFS_UUID_SIZE))
			goto error_brelse;

		device->generation = btrfs_super_generation(disk_super);
		if (!latest_dev ||
		    device->generation > latest_dev->generation)
			latest_dev = device;

		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
			device->writeable = 0;
		} else {
			device->writeable = !bdev_read_only(bdev);
			seeding = 0;
		}

		q = bdev_get_queue(bdev);
		if (blk_queue_discard(q))
			device->can_discard = 1;

		device->bdev = bdev;
		device->in_fs_metadata = 0;
		device->mode = flags;

		if (!blk_queue_nonrot(bdev_get_queue(bdev)))
			fs_devices->rotating = 1;

		fs_devices->open_devices++;
		if (device->writeable &&
		    device->devid != BTRFS_DEV_REPLACE_DEVID) {
			fs_devices->rw_devices++;
			list_add(&device->dev_alloc_list,
				 &fs_devices->alloc_list);
		}
		brelse(bh);
		continue;

error_brelse:
		brelse(bh);
		blkdev_put(bdev, flags);
		continue;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EINVAL;
		goto out;
	}
	fs_devices->seeding = seeding;
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	mutex_lock(&uuid_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		ret = __btrfs_open_devices(fs_devices, flags, holder);
	}
	mutex_unlock(&uuid_mutex);
	return ret;
}

void btrfs_release_disk_super(struct page *page)
{
	kunmap(page);
	put_page(page);
}

int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
		struct page **page, struct btrfs_super_block **disk_super)
{
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return 1;

	/* make sure our super fits in the page */
	if (sizeof(**disk_super) > PAGE_SIZE)
		return 1;

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
		return 1;

	/* pull in the page with our super */
	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
				    index, GFP_KERNEL);

	if (IS_ERR_OR_NULL(*page))
		return 1;

	p = kmap(*page);

	/* align our pointer to the offset of the super block */
	*disk_super = p + (bytenr & ~PAGE_MASK);

	if (btrfs_super_bytenr(*disk_super) != bytenr ||
	    btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(*page);
		return 1;
	}

	if ((*disk_super)->label[0] &&
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1])
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

	return 0;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount
 * path and we are not allowed to call set_blocksize during the scan. The
 * superblock is read via the pagecache.
 */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *disk_super;
	struct block_device *bdev;
	struct page *page;
	int ret = -EINVAL;
	u64 devid;
	u64 transid;
	u64 total_devices;
	u64 bytenr;

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	bytenr = btrfs_sb_offset(0);
	flags |= FMODE_EXCL;
	mutex_lock(&uuid_mutex);

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev)) {
		ret = PTR_ERR(bdev);
		goto error;
	}

	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
		goto error_bdev_put;

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	transid = btrfs_super_generation(disk_super);
	total_devices = btrfs_super_num_devices(disk_super);

	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
	if (ret > 0) {
		if (disk_super->label[0]) {
			pr_info("BTRFS: device label %s ", disk_super->label);
		} else {
			pr_info("BTRFS: device fsid %pU ", disk_super->fsid);
		}

		pr_cont("devid %llu transid %llu %s\n", devid, transid, path);
		ret = 0;
	}
	if (!ret && fs_devices_ret)
		(*fs_devices_ret)->total_devices = total_devices;

	btrfs_release_disk_super(page);

error_bdev_put:
	blkdev_put(bdev, flags);
error:
	mutex_unlock(&uuid_mutex);
	return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
				   u64 end, u64 *length)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->fs_info->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 extent_end;
	int ret;
	int slot;
	struct extent_buffer *l;

	*length = 0;

	if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
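			/* past the last item in this leaf, move to the next one */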
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (key.offset <= start && extent_end > end) {
			*length = end - start + 1;
			break;
		} else if (key.offset <= start && extent_end > start)
			*length += extent_end - start;
		else if (key.offset > start && extent_end <= end)
			*length += extent_end - key.offset;
		else if (key.offset > start && key.offset <= end) {
			*length += end - key.offset + 1;
			break;
		} else if (key.offset > end)
			break;

next:
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

static int contains_pending_extent(struct btrfs_transaction *transaction,
				   struct btrfs_device *device,
				   u64 *start, u64 len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct extent_map *em;
	struct list_head *search_list = &fs_info->pinned_chunks;
	int ret = 0;
	u64 physical_start = *start;

	if (transaction)
		search_list = &transaction->pending_chunks;
again:
	list_for_each_entry(em, search_list, list) {
		struct map_lookup *map;
		int i;

		map = em->map_lookup;
		for (i = 0; i < map->num_stripes; i++) {
			u64 end;

			if (map->stripes[i].dev != device)
				continue;
			if (map->stripes[i].physical >= physical_start + len ||
			    map->stripes[i].physical + em->orig_block_len <=
			    physical_start)
				continue;
			/*
			 * Make sure that while processing the pinned list we do
			 * not override our *start with a lower value, because
			 * we can have pinned chunks that fall within this
			 * device hole and that have lower physical addresses
			 * than the pending chunks we processed before. If we
			 * do not take this special care we can end up getting
			 * 2 pending chunks that start at the same physical
			 * device offsets because the end offset of a pinned
			 * chunk can be equal to the start offset of some
			 * pending chunk.
			 */
			end = map->stripes[i].physical + em->orig_block_len;
			if (end > *start) {
				*start = end;
				ret = 1;
			}
		}
	}
	if (search_list != &fs_info->pinned_chunks) {
		search_list = &fs_info->pinned_chunks;
		goto again;
	}

	return ret;
}


/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find any. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
			       struct btrfs_device *device, u64 num_bytes,
			       u64 search_start, u64 *start, u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;
	u64 min_search_start;

	/*
	 * We don't want to overwrite the superblock on the drive nor any area
	 * used by the boot loader (grub for example), so we make sure to start
	 * at an offset of at least 1MB.
	 */
	min_search_start = max(fs_info->alloc_start, 1024ull * 1024);
	search_start = max(search_start, min_search_start);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;

			/*
			 * Have to check before we set max_hole_start, otherwise
			 * we could end up sending back this offset anyway.
			 */
			if (contains_pending_extent(transaction, device,
						    &search_start,
						    hole_size)) {
				if (key.offset >= search_start) {
					hole_size = key.offset - search_start;
				} else {
					WARN_ON_ONCE(1);
					hole_size = 0;
				}
			}

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;

		if (contains_pending_extent(transaction, device, &search_start,
					    hole_size)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

int find_free_dev_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(trans->transaction, device,
					  num_bytes, 0, start, len);
}

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				 struct btrfs_device *device,
				 u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to remove dev extent item");
	} else {
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
	}
out:
	btrfs_free_path(path);
	return ret;
}

static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
				  struct btrfs_device *device,
				  u64 chunk_tree, u64 chunk_objectid,
				  u64 chunk_offset, u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	WARN_ON(!device->in_fs_metadata);
	WARN_ON(device->is_tgtdev_for_dev_replace);
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);
	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

	write_extent_buffer_chunk_tree_uuid(leaf, fs_info->chunk_tree_uuid);

	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return ret;
}

static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree.map_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}

static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0); /* Corruption */

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
static int btrfs_add_device(struct btrfs_trans_handle *trans,
			    struct btrfs_fs_info *fs_info,
			    struct btrfs_device *device)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_UUID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Function to update ctime/mtime for a given device path.
 * Mainly used for ctime/mtime based probes like libblkid.
 */
static void update_dev_time(char *path_name)
{
	struct file *filp;

	filp = filp_open(path_name, O_RDWR, 0);
	if (IS_ERR(filp))
		return;
	file_update_time(filp);
	filp_close(filp, NULL);
}

static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
			     struct btrfs_device *device)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret)
		goto out;
out:
	btrfs_free_path(path);
	btrfs_commit_transaction(trans);
	return ret;
}

/*
 * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number regarding e.g.
 * device replace.
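 *
 * For example, with RAID1 (devs_min == 2 in btrfs_raid_array), a removal that
 * would leave only one device makes this return
 * BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET from btrfs_raid_mindev_error.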
1784 */ 1785 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, 1786 u64 num_devices) 1787 { 1788 u64 all_avail; 1789 unsigned seq; 1790 int i; 1791 1792 do { 1793 seq = read_seqbegin(&fs_info->profiles_lock); 1794 1795 all_avail = fs_info->avail_data_alloc_bits | 1796 fs_info->avail_system_alloc_bits | 1797 fs_info->avail_metadata_alloc_bits; 1798 } while (read_seqretry(&fs_info->profiles_lock, seq)); 1799 1800 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 1801 if (!(all_avail & btrfs_raid_group[i])) 1802 continue; 1803 1804 if (num_devices < btrfs_raid_array[i].devs_min) { 1805 int ret = btrfs_raid_mindev_error[i]; 1806 1807 if (ret) 1808 return ret; 1809 } 1810 } 1811 1812 return 0; 1813 } 1814 1815 struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs, 1816 struct btrfs_device *device) 1817 { 1818 struct btrfs_device *next_device; 1819 1820 list_for_each_entry(next_device, &fs_devs->devices, dev_list) { 1821 if (next_device != device && 1822 !next_device->missing && next_device->bdev) 1823 return next_device; 1824 } 1825 1826 return NULL; 1827 } 1828 1829 /* 1830 * Helper function to check if the given device is part of s_bdev / latest_bdev 1831 * and replace it with the provided or the next active device, in the context 1832 * where this function called, there should be always be another device (or 1833 * this_dev) which is active. 1834 */ 1835 void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, 1836 struct btrfs_device *device, struct btrfs_device *this_dev) 1837 { 1838 struct btrfs_device *next_device; 1839 1840 if (this_dev) 1841 next_device = this_dev; 1842 else 1843 next_device = btrfs_find_next_active_device(fs_info->fs_devices, 1844 device); 1845 ASSERT(next_device); 1846 1847 if (fs_info->sb->s_bdev && 1848 (fs_info->sb->s_bdev == device->bdev)) 1849 fs_info->sb->s_bdev = next_device->bdev; 1850 1851 if (fs_info->fs_devices->latest_bdev == device->bdev) 1852 fs_info->fs_devices->latest_bdev = next_device->bdev; 1853 } 1854 1855 int btrfs_rm_device(struct btrfs_fs_info *fs_info, char *device_path, u64 devid) 1856 { 1857 struct btrfs_device *device; 1858 struct btrfs_fs_devices *cur_devices; 1859 u64 num_devices; 1860 int ret = 0; 1861 bool clear_super = false; 1862 1863 mutex_lock(&uuid_mutex); 1864 1865 num_devices = fs_info->fs_devices->num_devices; 1866 btrfs_dev_replace_lock(&fs_info->dev_replace, 0); 1867 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 1868 WARN_ON(num_devices < 1); 1869 num_devices--; 1870 } 1871 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); 1872 1873 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 1874 if (ret) 1875 goto out; 1876 1877 ret = btrfs_find_device_by_devspec(fs_info, devid, device_path, 1878 &device); 1879 if (ret) 1880 goto out; 1881 1882 if (device->is_tgtdev_for_dev_replace) { 1883 ret = BTRFS_ERROR_DEV_TGT_REPLACE; 1884 goto out; 1885 } 1886 1887 if (device->writeable && fs_info->fs_devices->rw_devices == 1) { 1888 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; 1889 goto out; 1890 } 1891 1892 if (device->writeable) { 1893 mutex_lock(&fs_info->chunk_mutex); 1894 list_del_init(&device->dev_alloc_list); 1895 device->fs_devices->rw_devices--; 1896 mutex_unlock(&fs_info->chunk_mutex); 1897 clear_super = true; 1898 } 1899 1900 mutex_unlock(&uuid_mutex); 1901 ret = btrfs_shrink_device(device, 0); 1902 mutex_lock(&uuid_mutex); 1903 if (ret) 1904 goto error_undo; 1905 1906 /* 1907 * TODO: the superblock still includes this device in its num_devices 1908 * counter 
although write_all_supers() is not locked out. This 1909 * could give a filesystem state which requires a degraded mount. 1910 */ 1911 ret = btrfs_rm_dev_item(fs_info, device); 1912 if (ret) 1913 goto error_undo; 1914 1915 device->in_fs_metadata = 0; 1916 btrfs_scrub_cancel_dev(fs_info, device); 1917 1918 /* 1919 * the device list mutex makes sure that we don't change 1920 * the device list while someone else is writing out all 1921 * the device supers. Whoever is writing all supers, should 1922 * lock the device list mutex before getting the number of 1923 * devices in the super block (super_copy). Conversely, 1924 * whoever updates the number of devices in the super block 1925 * (super_copy) should hold the device list mutex. 1926 */ 1927 1928 cur_devices = device->fs_devices; 1929 mutex_lock(&fs_info->fs_devices->device_list_mutex); 1930 list_del_rcu(&device->dev_list); 1931 1932 device->fs_devices->num_devices--; 1933 device->fs_devices->total_devices--; 1934 1935 if (device->missing) 1936 device->fs_devices->missing_devices--; 1937 1938 btrfs_assign_next_active_device(fs_info, device, NULL); 1939 1940 if (device->bdev) { 1941 device->fs_devices->open_devices--; 1942 /* remove sysfs entry */ 1943 btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); 1944 } 1945 1946 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 1947 btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 1948 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 1949 1950 /* 1951 * at this point, the device is zero sized and detached from 1952 * the devices list. All that's left is to zero out the old 1953 * supers and free the device. 1954 */ 1955 if (device->writeable) 1956 btrfs_scratch_superblocks(device->bdev, device->name->str); 1957 1958 btrfs_close_bdev(device); 1959 call_rcu(&device->rcu, free_device); 1960 1961 if (cur_devices->open_devices == 0) { 1962 struct btrfs_fs_devices *fs_devices; 1963 fs_devices = fs_info->fs_devices; 1964 while (fs_devices) { 1965 if (fs_devices->seed == cur_devices) { 1966 fs_devices->seed = cur_devices->seed; 1967 break; 1968 } 1969 fs_devices = fs_devices->seed; 1970 } 1971 cur_devices->seed = NULL; 1972 __btrfs_close_devices(cur_devices); 1973 free_fs_devices(cur_devices); 1974 } 1975 1976 fs_info->num_tolerated_disk_barrier_failures = 1977 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 1978 1979 out: 1980 mutex_unlock(&uuid_mutex); 1981 return ret; 1982 1983 error_undo: 1984 if (device->writeable) { 1985 mutex_lock(&fs_info->chunk_mutex); 1986 list_add(&device->dev_alloc_list, 1987 &fs_info->fs_devices->alloc_list); 1988 device->fs_devices->rw_devices++; 1989 mutex_unlock(&fs_info->chunk_mutex); 1990 } 1991 goto out; 1992 } 1993 1994 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, 1995 struct btrfs_device *srcdev) 1996 { 1997 struct btrfs_fs_devices *fs_devices; 1998 1999 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); 2000 2001 /* 2002 * in case of fs with no seed, srcdev->fs_devices will point 2003 * to fs_devices of fs_info. However when the dev being replaced is 2004 * a seed dev it will point to the seed's local fs_devices. In short 2005 * srcdev will have its correct fs_devices in both the cases. 
2006 */ 2007 fs_devices = srcdev->fs_devices; 2008 2009 list_del_rcu(&srcdev->dev_list); 2010 list_del_rcu(&srcdev->dev_alloc_list); 2011 fs_devices->num_devices--; 2012 if (srcdev->missing) 2013 fs_devices->missing_devices--; 2014 2015 if (srcdev->writeable) 2016 fs_devices->rw_devices--; 2017 2018 if (srcdev->bdev) 2019 fs_devices->open_devices--; 2020 } 2021 2022 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, 2023 struct btrfs_device *srcdev) 2024 { 2025 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 2026 2027 if (srcdev->writeable) { 2028 /* zero out the old super if it is writable */ 2029 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); 2030 } 2031 2032 btrfs_close_bdev(srcdev); 2033 2034 call_rcu(&srcdev->rcu, free_device); 2035 2036 /* 2037 * unless fs_devices is a seed fs, num_devices shouldn't go 2038 * to zero 2039 */ 2040 BUG_ON(!fs_devices->num_devices && !fs_devices->seeding); 2041 2042 /* if there are no devs left, delete the fs_devices */ 2043 if (!fs_devices->num_devices) { 2044 struct btrfs_fs_devices *tmp_fs_devices; 2045 2046 tmp_fs_devices = fs_info->fs_devices; 2047 while (tmp_fs_devices) { 2048 if (tmp_fs_devices->seed == fs_devices) { 2049 tmp_fs_devices->seed = fs_devices->seed; 2050 break; 2051 } 2052 tmp_fs_devices = tmp_fs_devices->seed; 2053 } 2054 fs_devices->seed = NULL; 2055 __btrfs_close_devices(fs_devices); 2056 free_fs_devices(fs_devices); 2057 } 2058 } 2059 2060 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 2061 struct btrfs_device *tgtdev) 2062 { 2063 mutex_lock(&uuid_mutex); 2064 WARN_ON(!tgtdev); 2065 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2066 2067 btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); 2068 2069 if (tgtdev->bdev) 2070 fs_info->fs_devices->open_devices--; 2071 2072 fs_info->fs_devices->num_devices--; 2073 2074 btrfs_assign_next_active_device(fs_info, tgtdev, NULL); 2075 2076 list_del_rcu(&tgtdev->dev_list); 2077 2078 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2079 mutex_unlock(&uuid_mutex); 2080 2081 /* 2082 * The update_dev_time() within btrfs_scratch_superblocks() 2083 * may lead to a call to btrfs_show_devname() which will try 2084 * to hold device_list_mutex. And here this device 2085 * is already out of the device list, so we don't have to hold 2086 * the device_list_mutex lock.
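 * The scratch below wipes the stale btrfs superblocks on the target device,
 * so a later device scan should not pick it up as a member of this
 * filesystem.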
2087 */ 2088 btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); 2089 2090 btrfs_close_bdev(tgtdev); 2091 call_rcu(&tgtdev->rcu, free_device); 2092 } 2093 2094 static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, 2095 char *device_path, 2096 struct btrfs_device **device) 2097 { 2098 int ret = 0; 2099 struct btrfs_super_block *disk_super; 2100 u64 devid; 2101 u8 *dev_uuid; 2102 struct block_device *bdev; 2103 struct buffer_head *bh; 2104 2105 *device = NULL; 2106 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 2107 fs_info->bdev_holder, 0, &bdev, &bh); 2108 if (ret) 2109 return ret; 2110 disk_super = (struct btrfs_super_block *)bh->b_data; 2111 devid = btrfs_stack_device_id(&disk_super->dev_item); 2112 dev_uuid = disk_super->dev_item.uuid; 2113 *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); 2114 brelse(bh); 2115 if (!*device) 2116 ret = -ENOENT; 2117 blkdev_put(bdev, FMODE_READ); 2118 return ret; 2119 } 2120 2121 int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, 2122 char *device_path, 2123 struct btrfs_device **device) 2124 { 2125 *device = NULL; 2126 if (strcmp(device_path, "missing") == 0) { 2127 struct list_head *devices; 2128 struct btrfs_device *tmp; 2129 2130 devices = &fs_info->fs_devices->devices; 2131 /* 2132 * It is safe to read the devices since the volume_mutex 2133 * is held by the caller. 2134 */ 2135 list_for_each_entry(tmp, devices, dev_list) { 2136 if (tmp->in_fs_metadata && !tmp->bdev) { 2137 *device = tmp; 2138 break; 2139 } 2140 } 2141 2142 if (!*device) 2143 return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2144 2145 return 0; 2146 } else { 2147 return btrfs_find_device_by_path(fs_info, device_path, device); 2148 } 2149 } 2150 2151 /* 2152 * Lookup a device given by device id, or the path if the id is 0. 2153 */ 2154 int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, 2155 char *devpath, struct btrfs_device **device) 2156 { 2157 int ret; 2158 2159 if (devid) { 2160 ret = 0; 2161 *device = btrfs_find_device(fs_info, devid, NULL, NULL); 2162 if (!*device) 2163 ret = -ENOENT; 2164 } else { 2165 if (!devpath || !devpath[0]) 2166 return -EINVAL; 2167 2168 ret = btrfs_find_device_missing_or_by_path(fs_info, devpath, 2169 device); 2170 } 2171 return ret; 2172 } 2173 2174 /* 2175 * does all the dirty work required for changing file system's UUID. 
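 *
 * In short: the currently attached (seed) devices are moved onto a freshly
 * allocated seed_devices struct, a clone of the old fs_devices stays on the
 * global fs_uuids list, the mounted fs gets a newly generated fsid, and the
 * SEEDING flag is cleared from the superblock so the sprouted fs is writable.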
2176 */ 2177 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) 2178 { 2179 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2180 struct btrfs_fs_devices *old_devices; 2181 struct btrfs_fs_devices *seed_devices; 2182 struct btrfs_super_block *disk_super = fs_info->super_copy; 2183 struct btrfs_device *device; 2184 u64 super_flags; 2185 2186 BUG_ON(!mutex_is_locked(&uuid_mutex)); 2187 if (!fs_devices->seeding) 2188 return -EINVAL; 2189 2190 seed_devices = __alloc_fs_devices(); 2191 if (IS_ERR(seed_devices)) 2192 return PTR_ERR(seed_devices); 2193 2194 old_devices = clone_fs_devices(fs_devices); 2195 if (IS_ERR(old_devices)) { 2196 kfree(seed_devices); 2197 return PTR_ERR(old_devices); 2198 } 2199 2200 list_add(&old_devices->list, &fs_uuids); 2201 2202 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2203 seed_devices->opened = 1; 2204 INIT_LIST_HEAD(&seed_devices->devices); 2205 INIT_LIST_HEAD(&seed_devices->alloc_list); 2206 mutex_init(&seed_devices->device_list_mutex); 2207 2208 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2209 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 2210 synchronize_rcu); 2211 list_for_each_entry(device, &seed_devices->devices, dev_list) 2212 device->fs_devices = seed_devices; 2213 2214 mutex_lock(&fs_info->chunk_mutex); 2215 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 2216 mutex_unlock(&fs_info->chunk_mutex); 2217 2218 fs_devices->seeding = 0; 2219 fs_devices->num_devices = 0; 2220 fs_devices->open_devices = 0; 2221 fs_devices->missing_devices = 0; 2222 fs_devices->rotating = 0; 2223 fs_devices->seed = seed_devices; 2224 2225 generate_random_uuid(fs_devices->fsid); 2226 memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2227 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2228 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2229 2230 super_flags = btrfs_super_flags(disk_super) & 2231 ~BTRFS_SUPER_FLAG_SEEDING; 2232 btrfs_set_super_flags(disk_super, super_flags); 2233 2234 return 0; 2235 } 2236 2237 /* 2238 * Store the expected generation for seed devices in device items. 
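 * Walks every DEV_ITEM in the chunk tree and, for devices that still belong
 * to a seeding fs_devices, writes the device's in-memory generation into the
 * on-disk item.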
2239 */ 2240 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 2241 struct btrfs_fs_info *fs_info) 2242 { 2243 struct btrfs_root *root = fs_info->chunk_root; 2244 struct btrfs_path *path; 2245 struct extent_buffer *leaf; 2246 struct btrfs_dev_item *dev_item; 2247 struct btrfs_device *device; 2248 struct btrfs_key key; 2249 u8 fs_uuid[BTRFS_UUID_SIZE]; 2250 u8 dev_uuid[BTRFS_UUID_SIZE]; 2251 u64 devid; 2252 int ret; 2253 2254 path = btrfs_alloc_path(); 2255 if (!path) 2256 return -ENOMEM; 2257 2258 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2259 key.offset = 0; 2260 key.type = BTRFS_DEV_ITEM_KEY; 2261 2262 while (1) { 2263 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2264 if (ret < 0) 2265 goto error; 2266 2267 leaf = path->nodes[0]; 2268 next_slot: 2269 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2270 ret = btrfs_next_leaf(root, path); 2271 if (ret > 0) 2272 break; 2273 if (ret < 0) 2274 goto error; 2275 leaf = path->nodes[0]; 2276 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2277 btrfs_release_path(path); 2278 continue; 2279 } 2280 2281 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2282 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2283 key.type != BTRFS_DEV_ITEM_KEY) 2284 break; 2285 2286 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2287 struct btrfs_dev_item); 2288 devid = btrfs_device_id(leaf, dev_item); 2289 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2290 BTRFS_UUID_SIZE); 2291 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2292 BTRFS_UUID_SIZE); 2293 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 2294 BUG_ON(!device); /* Logic error */ 2295 2296 if (device->fs_devices->seeding) { 2297 btrfs_set_device_generation(leaf, dev_item, 2298 device->generation); 2299 btrfs_mark_buffer_dirty(leaf); 2300 } 2301 2302 path->slots[0]++; 2303 goto next_slot; 2304 } 2305 ret = 0; 2306 error: 2307 btrfs_free_path(path); 2308 return ret; 2309 } 2310 2311 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, char *device_path) 2312 { 2313 struct btrfs_root *root = fs_info->dev_root; 2314 struct request_queue *q; 2315 struct btrfs_trans_handle *trans; 2316 struct btrfs_device *device; 2317 struct block_device *bdev; 2318 struct list_head *devices; 2319 struct super_block *sb = fs_info->sb; 2320 struct rcu_string *name; 2321 u64 tmp; 2322 int seeding_dev = 0; 2323 int ret = 0; 2324 2325 if ((sb->s_flags & MS_RDONLY) && !fs_info->fs_devices->seeding) 2326 return -EROFS; 2327 2328 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2329 fs_info->bdev_holder); 2330 if (IS_ERR(bdev)) 2331 return PTR_ERR(bdev); 2332 2333 if (fs_info->fs_devices->seeding) { 2334 seeding_dev = 1; 2335 down_write(&sb->s_umount); 2336 mutex_lock(&uuid_mutex); 2337 } 2338 2339 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2340 2341 devices = &fs_info->fs_devices->devices; 2342 2343 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2344 list_for_each_entry(device, devices, dev_list) { 2345 if (device->bdev == bdev) { 2346 ret = -EEXIST; 2347 mutex_unlock( 2348 &fs_info->fs_devices->device_list_mutex); 2349 goto error; 2350 } 2351 } 2352 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2353 2354 device = btrfs_alloc_device(fs_info, NULL, NULL); 2355 if (IS_ERR(device)) { 2356 /* we can safely leave the fs_devices entry around */ 2357 ret = PTR_ERR(device); 2358 goto error; 2359 } 2360 2361 name = rcu_string_strdup(device_path, GFP_KERNEL); 2362 if (!name) { 2363 kfree(device); 2364 ret = -ENOMEM; 2365 
goto error; 2366 } 2367 rcu_assign_pointer(device->name, name); 2368 2369 trans = btrfs_start_transaction(root, 0); 2370 if (IS_ERR(trans)) { 2371 rcu_string_free(device->name); 2372 kfree(device); 2373 ret = PTR_ERR(trans); 2374 goto error; 2375 } 2376 2377 q = bdev_get_queue(bdev); 2378 if (blk_queue_discard(q)) 2379 device->can_discard = 1; 2380 device->writeable = 1; 2381 device->generation = trans->transid; 2382 device->io_width = fs_info->sectorsize; 2383 device->io_align = fs_info->sectorsize; 2384 device->sector_size = fs_info->sectorsize; 2385 device->total_bytes = i_size_read(bdev->bd_inode); 2386 device->disk_total_bytes = device->total_bytes; 2387 device->commit_total_bytes = device->total_bytes; 2388 device->fs_info = fs_info; 2389 device->bdev = bdev; 2390 device->in_fs_metadata = 1; 2391 device->is_tgtdev_for_dev_replace = 0; 2392 device->mode = FMODE_EXCL; 2393 device->dev_stats_valid = 1; 2394 set_blocksize(device->bdev, 4096); 2395 2396 if (seeding_dev) { 2397 sb->s_flags &= ~MS_RDONLY; 2398 ret = btrfs_prepare_sprout(fs_info); 2399 BUG_ON(ret); /* -ENOMEM */ 2400 } 2401 2402 device->fs_devices = fs_info->fs_devices; 2403 2404 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2405 mutex_lock(&fs_info->chunk_mutex); 2406 list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices); 2407 list_add(&device->dev_alloc_list, 2408 &fs_info->fs_devices->alloc_list); 2409 fs_info->fs_devices->num_devices++; 2410 fs_info->fs_devices->open_devices++; 2411 fs_info->fs_devices->rw_devices++; 2412 fs_info->fs_devices->total_devices++; 2413 fs_info->fs_devices->total_rw_bytes += device->total_bytes; 2414 2415 spin_lock(&fs_info->free_chunk_lock); 2416 fs_info->free_chunk_space += device->total_bytes; 2417 spin_unlock(&fs_info->free_chunk_lock); 2418 2419 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 2420 fs_info->fs_devices->rotating = 1; 2421 2422 tmp = btrfs_super_total_bytes(fs_info->super_copy); 2423 btrfs_set_super_total_bytes(fs_info->super_copy, 2424 tmp + device->total_bytes); 2425 2426 tmp = btrfs_super_num_devices(fs_info->super_copy); 2427 btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1); 2428 2429 /* add sysfs device entry */ 2430 btrfs_sysfs_add_device_link(fs_info->fs_devices, device); 2431 2432 /* 2433 * we've got more storage, clear any full flags on the space 2434 * infos 2435 */ 2436 btrfs_clear_space_info_full(fs_info); 2437 2438 mutex_unlock(&fs_info->chunk_mutex); 2439 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2440 2441 if (seeding_dev) { 2442 mutex_lock(&fs_info->chunk_mutex); 2443 ret = init_first_rw_device(trans, fs_info, device); 2444 mutex_unlock(&fs_info->chunk_mutex); 2445 if (ret) { 2446 btrfs_abort_transaction(trans, ret); 2447 goto error_trans; 2448 } 2449 } 2450 2451 ret = btrfs_add_device(trans, fs_info, device); 2452 if (ret) { 2453 btrfs_abort_transaction(trans, ret); 2454 goto error_trans; 2455 } 2456 2457 if (seeding_dev) { 2458 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; 2459 2460 ret = btrfs_finish_sprout(trans, fs_info); 2461 if (ret) { 2462 btrfs_abort_transaction(trans, ret); 2463 goto error_trans; 2464 } 2465 2466 /* Sprouting would change fsid of the mounted root, 2467 * so rename the fsid on the sysfs 2468 */ 2469 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", 2470 fs_info->fsid); 2471 if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf)) 2472 btrfs_warn(fs_info, 2473 "sysfs: failed to create fsid for sprout"); 2474 } 2475 2476 fs_info->num_tolerated_disk_barrier_failures = 2477 
btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 2478 ret = btrfs_commit_transaction(trans); 2479 2480 if (seeding_dev) { 2481 mutex_unlock(&uuid_mutex); 2482 up_write(&sb->s_umount); 2483 2484 if (ret) /* transaction commit */ 2485 return ret; 2486 2487 ret = btrfs_relocate_sys_chunks(fs_info); 2488 if (ret < 0) 2489 btrfs_handle_fs_error(fs_info, ret, 2490 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2491 trans = btrfs_attach_transaction(root); 2492 if (IS_ERR(trans)) { 2493 if (PTR_ERR(trans) == -ENOENT) 2494 return 0; 2495 return PTR_ERR(trans); 2496 } 2497 ret = btrfs_commit_transaction(trans); 2498 } 2499 2500 /* Update ctime/mtime for libblkid */ 2501 update_dev_time(device_path); 2502 return ret; 2503 2504 error_trans: 2505 btrfs_end_transaction(trans); 2506 rcu_string_free(device->name); 2507 btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); 2508 kfree(device); 2509 error: 2510 blkdev_put(bdev, FMODE_EXCL); 2511 if (seeding_dev) { 2512 mutex_unlock(&uuid_mutex); 2513 up_write(&sb->s_umount); 2514 } 2515 return ret; 2516 } 2517 2518 int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 2519 char *device_path, 2520 struct btrfs_device *srcdev, 2521 struct btrfs_device **device_out) 2522 { 2523 struct request_queue *q; 2524 struct btrfs_device *device; 2525 struct block_device *bdev; 2526 struct list_head *devices; 2527 struct rcu_string *name; 2528 u64 devid = BTRFS_DEV_REPLACE_DEVID; 2529 int ret = 0; 2530 2531 *device_out = NULL; 2532 if (fs_info->fs_devices->seeding) { 2533 btrfs_err(fs_info, "the filesystem is a seed filesystem!"); 2534 return -EINVAL; 2535 } 2536 2537 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2538 fs_info->bdev_holder); 2539 if (IS_ERR(bdev)) { 2540 btrfs_err(fs_info, "target device %s is invalid!", device_path); 2541 return PTR_ERR(bdev); 2542 } 2543 2544 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2545 2546 devices = &fs_info->fs_devices->devices; 2547 list_for_each_entry(device, devices, dev_list) { 2548 if (device->bdev == bdev) { 2549 btrfs_err(fs_info, 2550 "target device is in the filesystem!"); 2551 ret = -EEXIST; 2552 goto error; 2553 } 2554 } 2555 2556 2557 if (i_size_read(bdev->bd_inode) < 2558 btrfs_device_get_total_bytes(srcdev)) { 2559 btrfs_err(fs_info, 2560 "target device is smaller than source device!"); 2561 ret = -EINVAL; 2562 goto error; 2563 } 2564 2565 2566 device = btrfs_alloc_device(NULL, &devid, NULL); 2567 if (IS_ERR(device)) { 2568 ret = PTR_ERR(device); 2569 goto error; 2570 } 2571 2572 name = rcu_string_strdup(device_path, GFP_NOFS); 2573 if (!name) { 2574 kfree(device); 2575 ret = -ENOMEM; 2576 goto error; 2577 } 2578 rcu_assign_pointer(device->name, name); 2579 2580 q = bdev_get_queue(bdev); 2581 if (blk_queue_discard(q)) 2582 device->can_discard = 1; 2583 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2584 device->writeable = 1; 2585 device->generation = 0; 2586 device->io_width = fs_info->sectorsize; 2587 device->io_align = fs_info->sectorsize; 2588 device->sector_size = fs_info->sectorsize; 2589 device->total_bytes = btrfs_device_get_total_bytes(srcdev); 2590 device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); 2591 device->bytes_used = btrfs_device_get_bytes_used(srcdev); 2592 ASSERT(list_empty(&srcdev->resized_list)); 2593 device->commit_total_bytes = srcdev->commit_total_bytes; 2594 device->commit_bytes_used = device->bytes_used; 2595 device->fs_info = fs_info; 2596 
device->bdev = bdev; 2597 device->in_fs_metadata = 1; 2598 device->is_tgtdev_for_dev_replace = 1; 2599 device->mode = FMODE_EXCL; 2600 device->dev_stats_valid = 1; 2601 set_blocksize(device->bdev, 4096); 2602 device->fs_devices = fs_info->fs_devices; 2603 list_add(&device->dev_list, &fs_info->fs_devices->devices); 2604 fs_info->fs_devices->num_devices++; 2605 fs_info->fs_devices->open_devices++; 2606 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2607 2608 *device_out = device; 2609 return ret; 2610 2611 error: 2612 blkdev_put(bdev, FMODE_EXCL); 2613 return ret; 2614 } 2615 2616 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 2617 struct btrfs_device *tgtdev) 2618 { 2619 u32 sectorsize = fs_info->sectorsize; 2620 2621 WARN_ON(fs_info->fs_devices->rw_devices == 0); 2622 tgtdev->io_width = sectorsize; 2623 tgtdev->io_align = sectorsize; 2624 tgtdev->sector_size = sectorsize; 2625 tgtdev->fs_info = fs_info; 2626 tgtdev->in_fs_metadata = 1; 2627 } 2628 2629 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2630 struct btrfs_device *device) 2631 { 2632 int ret; 2633 struct btrfs_path *path; 2634 struct btrfs_root *root = device->fs_info->chunk_root; 2635 struct btrfs_dev_item *dev_item; 2636 struct extent_buffer *leaf; 2637 struct btrfs_key key; 2638 2639 path = btrfs_alloc_path(); 2640 if (!path) 2641 return -ENOMEM; 2642 2643 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2644 key.type = BTRFS_DEV_ITEM_KEY; 2645 key.offset = device->devid; 2646 2647 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2648 if (ret < 0) 2649 goto out; 2650 2651 if (ret > 0) { 2652 ret = -ENOENT; 2653 goto out; 2654 } 2655 2656 leaf = path->nodes[0]; 2657 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2658 2659 btrfs_set_device_id(leaf, dev_item, device->devid); 2660 btrfs_set_device_type(leaf, dev_item, device->type); 2661 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2662 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2663 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2664 btrfs_set_device_total_bytes(leaf, dev_item, 2665 btrfs_device_get_disk_total_bytes(device)); 2666 btrfs_set_device_bytes_used(leaf, dev_item, 2667 btrfs_device_get_bytes_used(device)); 2668 btrfs_mark_buffer_dirty(leaf); 2669 2670 out: 2671 btrfs_free_path(path); 2672 return ret; 2673 } 2674 2675 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2676 struct btrfs_device *device, u64 new_size) 2677 { 2678 struct btrfs_fs_info *fs_info = device->fs_info; 2679 struct btrfs_super_block *super_copy = fs_info->super_copy; 2680 struct btrfs_fs_devices *fs_devices; 2681 u64 old_total; 2682 u64 diff; 2683 2684 if (!device->writeable) 2685 return -EACCES; 2686 2687 mutex_lock(&fs_info->chunk_mutex); 2688 old_total = btrfs_super_total_bytes(super_copy); 2689 diff = new_size - device->total_bytes; 2690 2691 if (new_size <= device->total_bytes || 2692 device->is_tgtdev_for_dev_replace) { 2693 mutex_unlock(&fs_info->chunk_mutex); 2694 return -EINVAL; 2695 } 2696 2697 fs_devices = fs_info->fs_devices; 2698 2699 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2700 device->fs_devices->total_rw_bytes += diff; 2701 2702 btrfs_device_set_total_bytes(device, new_size); 2703 btrfs_device_set_disk_total_bytes(device, new_size); 2704 btrfs_clear_space_info_full(device->fs_info); 2705 if (list_empty(&device->resized_list)) 2706 list_add_tail(&device->resized_list, 2707 &fs_devices->resized_devices); 2708 
mutex_unlock(&fs_info->chunk_mutex); 2709 2710 return btrfs_update_device(trans, device); 2711 } 2712 2713 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 2714 struct btrfs_fs_info *fs_info, u64 chunk_objectid, 2715 u64 chunk_offset) 2716 { 2717 struct btrfs_root *root = fs_info->chunk_root; 2718 int ret; 2719 struct btrfs_path *path; 2720 struct btrfs_key key; 2721 2722 path = btrfs_alloc_path(); 2723 if (!path) 2724 return -ENOMEM; 2725 2726 key.objectid = chunk_objectid; 2727 key.offset = chunk_offset; 2728 key.type = BTRFS_CHUNK_ITEM_KEY; 2729 2730 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2731 if (ret < 0) 2732 goto out; 2733 else if (ret > 0) { /* Logic error or corruption */ 2734 btrfs_handle_fs_error(fs_info, -ENOENT, 2735 "Failed lookup while freeing chunk."); 2736 ret = -ENOENT; 2737 goto out; 2738 } 2739 2740 ret = btrfs_del_item(trans, root, path); 2741 if (ret < 0) 2742 btrfs_handle_fs_error(fs_info, ret, 2743 "Failed to delete chunk item."); 2744 out: 2745 btrfs_free_path(path); 2746 return ret; 2747 } 2748 2749 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, 2750 u64 chunk_objectid, u64 chunk_offset) 2751 { 2752 struct btrfs_super_block *super_copy = fs_info->super_copy; 2753 struct btrfs_disk_key *disk_key; 2754 struct btrfs_chunk *chunk; 2755 u8 *ptr; 2756 int ret = 0; 2757 u32 num_stripes; 2758 u32 array_size; 2759 u32 len = 0; 2760 u32 cur; 2761 struct btrfs_key key; 2762 2763 mutex_lock(&fs_info->chunk_mutex); 2764 array_size = btrfs_super_sys_array_size(super_copy); 2765 2766 ptr = super_copy->sys_chunk_array; 2767 cur = 0; 2768 2769 while (cur < array_size) { 2770 disk_key = (struct btrfs_disk_key *)ptr; 2771 btrfs_disk_key_to_cpu(&key, disk_key); 2772 2773 len = sizeof(*disk_key); 2774 2775 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 2776 chunk = (struct btrfs_chunk *)(ptr + len); 2777 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 2778 len += btrfs_chunk_item_size(num_stripes); 2779 } else { 2780 ret = -EIO; 2781 break; 2782 } 2783 if (key.objectid == chunk_objectid && 2784 key.offset == chunk_offset) { 2785 memmove(ptr, ptr + len, array_size - (cur + len)); 2786 array_size -= len; 2787 btrfs_set_super_sys_array_size(super_copy, array_size); 2788 } else { 2789 ptr += len; 2790 cur += len; 2791 } 2792 } 2793 mutex_unlock(&fs_info->chunk_mutex); 2794 return ret; 2795 } 2796 2797 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, 2798 struct btrfs_fs_info *fs_info, u64 chunk_offset) 2799 { 2800 struct extent_map_tree *em_tree; 2801 struct extent_map *em; 2802 struct map_lookup *map; 2803 u64 dev_extent_len = 0; 2804 u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2805 int i, ret = 0; 2806 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2807 2808 em_tree = &fs_info->mapping_tree.map_tree; 2809 2810 read_lock(&em_tree->lock); 2811 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 2812 read_unlock(&em_tree->lock); 2813 2814 if (!em || em->start > chunk_offset || 2815 em->start + em->len < chunk_offset) { 2816 /* 2817 * This is a logic error, but we don't want to just rely on the 2818 * user having built with ASSERT enabled, so if ASSERT doesn't 2819 * do anything we still error out. 
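 * (i.e. with ASSERT() compiled out we still drop any extent map reference
 * and return -EINVAL below instead of crashing)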
2820 */ 2821 ASSERT(0); 2822 if (em) 2823 free_extent_map(em); 2824 return -EINVAL; 2825 } 2826 map = em->map_lookup; 2827 mutex_lock(&fs_info->chunk_mutex); 2828 check_system_chunk(trans, fs_info, map->type); 2829 mutex_unlock(&fs_info->chunk_mutex); 2830 2831 /* 2832 * Take the device list mutex to prevent races with the final phase of 2833 * a device replace operation that replaces the device object associated 2834 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). 2835 */ 2836 mutex_lock(&fs_devices->device_list_mutex); 2837 for (i = 0; i < map->num_stripes; i++) { 2838 struct btrfs_device *device = map->stripes[i].dev; 2839 ret = btrfs_free_dev_extent(trans, device, 2840 map->stripes[i].physical, 2841 &dev_extent_len); 2842 if (ret) { 2843 mutex_unlock(&fs_devices->device_list_mutex); 2844 btrfs_abort_transaction(trans, ret); 2845 goto out; 2846 } 2847 2848 if (device->bytes_used > 0) { 2849 mutex_lock(&fs_info->chunk_mutex); 2850 btrfs_device_set_bytes_used(device, 2851 device->bytes_used - dev_extent_len); 2852 spin_lock(&fs_info->free_chunk_lock); 2853 fs_info->free_chunk_space += dev_extent_len; 2854 spin_unlock(&fs_info->free_chunk_lock); 2855 btrfs_clear_space_info_full(fs_info); 2856 mutex_unlock(&fs_info->chunk_mutex); 2857 } 2858 2859 if (map->stripes[i].dev) { 2860 ret = btrfs_update_device(trans, map->stripes[i].dev); 2861 if (ret) { 2862 mutex_unlock(&fs_devices->device_list_mutex); 2863 btrfs_abort_transaction(trans, ret); 2864 goto out; 2865 } 2866 } 2867 } 2868 mutex_unlock(&fs_devices->device_list_mutex); 2869 2870 ret = btrfs_free_chunk(trans, fs_info, chunk_objectid, chunk_offset); 2871 if (ret) { 2872 btrfs_abort_transaction(trans, ret); 2873 goto out; 2874 } 2875 2876 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 2877 2878 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2879 ret = btrfs_del_sys_chunk(fs_info, chunk_objectid, 2880 chunk_offset); 2881 if (ret) { 2882 btrfs_abort_transaction(trans, ret); 2883 goto out; 2884 } 2885 } 2886 2887 ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em); 2888 if (ret) { 2889 btrfs_abort_transaction(trans, ret); 2890 goto out; 2891 } 2892 2893 out: 2894 /* once for us */ 2895 free_extent_map(em); 2896 return ret; 2897 } 2898 2899 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2900 { 2901 struct btrfs_root *root = fs_info->chunk_root; 2902 struct btrfs_trans_handle *trans; 2903 int ret; 2904 2905 /* 2906 * Prevent races with automatic removal of unused block groups. 2907 * After we relocate and before we remove the chunk with offset 2908 * chunk_offset, automatic removal of the block group can kick in, 2909 * resulting in a failure when calling btrfs_remove_chunk() below. 2910 * 2911 * Make sure to acquire this mutex before doing a tree search (dev 2912 * or chunk trees) to find chunks. Otherwise the cleaner kthread might 2913 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 2914 * we release the path used to search the chunk/dev tree and before 2915 * the current task acquires this mutex and calls us. 
2916 */ 2917 ASSERT(mutex_is_locked(&fs_info->delete_unused_bgs_mutex)); 2918 2919 ret = btrfs_can_relocate(fs_info, chunk_offset); 2920 if (ret) 2921 return -ENOSPC; 2922 2923 /* step one, relocate all the extents inside this chunk */ 2924 btrfs_scrub_pause(fs_info); 2925 ret = btrfs_relocate_block_group(fs_info, chunk_offset); 2926 btrfs_scrub_continue(fs_info); 2927 if (ret) 2928 return ret; 2929 2930 trans = btrfs_start_trans_remove_block_group(root->fs_info, 2931 chunk_offset); 2932 if (IS_ERR(trans)) { 2933 ret = PTR_ERR(trans); 2934 btrfs_handle_fs_error(root->fs_info, ret, NULL); 2935 return ret; 2936 } 2937 2938 /* 2939 * step two, delete the device extents and the 2940 * chunk tree entries 2941 */ 2942 ret = btrfs_remove_chunk(trans, fs_info, chunk_offset); 2943 btrfs_end_transaction(trans); 2944 return ret; 2945 } 2946 2947 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 2948 { 2949 struct btrfs_root *chunk_root = fs_info->chunk_root; 2950 struct btrfs_path *path; 2951 struct extent_buffer *leaf; 2952 struct btrfs_chunk *chunk; 2953 struct btrfs_key key; 2954 struct btrfs_key found_key; 2955 u64 chunk_type; 2956 bool retried = false; 2957 int failed = 0; 2958 int ret; 2959 2960 path = btrfs_alloc_path(); 2961 if (!path) 2962 return -ENOMEM; 2963 2964 again: 2965 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2966 key.offset = (u64)-1; 2967 key.type = BTRFS_CHUNK_ITEM_KEY; 2968 2969 while (1) { 2970 mutex_lock(&fs_info->delete_unused_bgs_mutex); 2971 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2972 if (ret < 0) { 2973 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 2974 goto error; 2975 } 2976 BUG_ON(ret == 0); /* Corruption */ 2977 2978 ret = btrfs_previous_item(chunk_root, path, key.objectid, 2979 key.type); 2980 if (ret) 2981 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 2982 if (ret < 0) 2983 goto error; 2984 if (ret > 0) 2985 break; 2986 2987 leaf = path->nodes[0]; 2988 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2989 2990 chunk = btrfs_item_ptr(leaf, path->slots[0], 2991 struct btrfs_chunk); 2992 chunk_type = btrfs_chunk_type(leaf, chunk); 2993 btrfs_release_path(path); 2994 2995 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 2996 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 2997 if (ret == -ENOSPC) 2998 failed++; 2999 else 3000 BUG_ON(ret); 3001 } 3002 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3003 3004 if (found_key.offset == 0) 3005 break; 3006 key.offset = found_key.offset - 1; 3007 } 3008 ret = 0; 3009 if (failed && !retried) { 3010 failed = 0; 3011 retried = true; 3012 goto again; 3013 } else if (WARN_ON(failed && retried)) { 3014 ret = -ENOSPC; 3015 } 3016 error: 3017 btrfs_free_path(path); 3018 return ret; 3019 } 3020 3021 static int insert_balance_item(struct btrfs_fs_info *fs_info, 3022 struct btrfs_balance_control *bctl) 3023 { 3024 struct btrfs_root *root = fs_info->tree_root; 3025 struct btrfs_trans_handle *trans; 3026 struct btrfs_balance_item *item; 3027 struct btrfs_disk_balance_args disk_bargs; 3028 struct btrfs_path *path; 3029 struct extent_buffer *leaf; 3030 struct btrfs_key key; 3031 int ret, err; 3032 3033 path = btrfs_alloc_path(); 3034 if (!path) 3035 return -ENOMEM; 3036 3037 trans = btrfs_start_transaction(root, 0); 3038 if (IS_ERR(trans)) { 3039 btrfs_free_path(path); 3040 return PTR_ERR(trans); 3041 } 3042 3043 key.objectid = BTRFS_BALANCE_OBJECTID; 3044 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3045 key.offset = 0; 3046 3047 ret = btrfs_insert_empty_item(trans, root, path, &key, 
3048 sizeof(*item)); 3049 if (ret) 3050 goto out; 3051 3052 leaf = path->nodes[0]; 3053 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3054 3055 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3056 3057 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 3058 btrfs_set_balance_data(leaf, item, &disk_bargs); 3059 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 3060 btrfs_set_balance_meta(leaf, item, &disk_bargs); 3061 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 3062 btrfs_set_balance_sys(leaf, item, &disk_bargs); 3063 3064 btrfs_set_balance_flags(leaf, item, bctl->flags); 3065 3066 btrfs_mark_buffer_dirty(leaf); 3067 out: 3068 btrfs_free_path(path); 3069 err = btrfs_commit_transaction(trans); 3070 if (err && !ret) 3071 ret = err; 3072 return ret; 3073 } 3074 3075 static int del_balance_item(struct btrfs_fs_info *fs_info) 3076 { 3077 struct btrfs_root *root = fs_info->tree_root; 3078 struct btrfs_trans_handle *trans; 3079 struct btrfs_path *path; 3080 struct btrfs_key key; 3081 int ret, err; 3082 3083 path = btrfs_alloc_path(); 3084 if (!path) 3085 return -ENOMEM; 3086 3087 trans = btrfs_start_transaction(root, 0); 3088 if (IS_ERR(trans)) { 3089 btrfs_free_path(path); 3090 return PTR_ERR(trans); 3091 } 3092 3093 key.objectid = BTRFS_BALANCE_OBJECTID; 3094 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3095 key.offset = 0; 3096 3097 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3098 if (ret < 0) 3099 goto out; 3100 if (ret > 0) { 3101 ret = -ENOENT; 3102 goto out; 3103 } 3104 3105 ret = btrfs_del_item(trans, root, path); 3106 out: 3107 btrfs_free_path(path); 3108 err = btrfs_commit_transaction(trans); 3109 if (err && !ret) 3110 ret = err; 3111 return ret; 3112 } 3113 3114 /* 3115 * This is a heuristic used to reduce the number of chunks balanced on 3116 * resume after balance was interrupted. 3117 */ 3118 static void update_balance_args(struct btrfs_balance_control *bctl) 3119 { 3120 /* 3121 * Turn on soft mode for chunk types that were being converted. 3122 */ 3123 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 3124 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 3125 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 3126 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 3127 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 3128 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 3129 3130 /* 3131 * Turn on usage filter if is not already used. The idea is 3132 * that chunks that we have already balanced should be 3133 * reasonably full. Don't do it for chunks that are being 3134 * converted - that will keep us from relocating unconverted 3135 * (albeit full) chunks. 
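 * For example, with the 90% threshold set below, a data chunk that is 95%
 * full is assumed to be already balanced and gets skipped on resume, while
 * one that is only 50% full is still considered for relocation.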
3136 */ 3137 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 3138 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3139 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3140 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 3141 bctl->data.usage = 90; 3142 } 3143 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 3144 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3145 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3146 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 3147 bctl->sys.usage = 90; 3148 } 3149 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 3150 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3151 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3152 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 3153 bctl->meta.usage = 90; 3154 } 3155 } 3156 3157 /* 3158 * Should be called with both balance and volume mutexes held to 3159 * serialize other volume operations (add_dev/rm_dev/resize) with 3160 * restriper. Same goes for unset_balance_control. 3161 */ 3162 static void set_balance_control(struct btrfs_balance_control *bctl) 3163 { 3164 struct btrfs_fs_info *fs_info = bctl->fs_info; 3165 3166 BUG_ON(fs_info->balance_ctl); 3167 3168 spin_lock(&fs_info->balance_lock); 3169 fs_info->balance_ctl = bctl; 3170 spin_unlock(&fs_info->balance_lock); 3171 } 3172 3173 static void unset_balance_control(struct btrfs_fs_info *fs_info) 3174 { 3175 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3176 3177 BUG_ON(!fs_info->balance_ctl); 3178 3179 spin_lock(&fs_info->balance_lock); 3180 fs_info->balance_ctl = NULL; 3181 spin_unlock(&fs_info->balance_lock); 3182 3183 kfree(bctl); 3184 } 3185 3186 /* 3187 * Balance filters. Return 1 if chunk should be filtered out 3188 * (should not be balanced). 3189 */ 3190 static int chunk_profiles_filter(u64 chunk_type, 3191 struct btrfs_balance_args *bargs) 3192 { 3193 chunk_type = chunk_to_extended(chunk_type) & 3194 BTRFS_EXTENDED_PROFILE_MASK; 3195 3196 if (bargs->profiles & chunk_type) 3197 return 0; 3198 3199 return 1; 3200 } 3201 3202 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3203 struct btrfs_balance_args *bargs) 3204 { 3205 struct btrfs_block_group_cache *cache; 3206 u64 chunk_used; 3207 u64 user_thresh_min; 3208 u64 user_thresh_max; 3209 int ret = 1; 3210 3211 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3212 chunk_used = btrfs_block_group_used(&cache->item); 3213 3214 if (bargs->usage_min == 0) 3215 user_thresh_min = 0; 3216 else 3217 user_thresh_min = div_factor_fine(cache->key.offset, 3218 bargs->usage_min); 3219 3220 if (bargs->usage_max == 0) 3221 user_thresh_max = 1; 3222 else if (bargs->usage_max > 100) 3223 user_thresh_max = cache->key.offset; 3224 else 3225 user_thresh_max = div_factor_fine(cache->key.offset, 3226 bargs->usage_max); 3227 3228 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3229 ret = 0; 3230 3231 btrfs_put_block_group(cache); 3232 return ret; 3233 } 3234 3235 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3236 u64 chunk_offset, struct btrfs_balance_args *bargs) 3237 { 3238 struct btrfs_block_group_cache *cache; 3239 u64 chunk_used, user_thresh; 3240 int ret = 1; 3241 3242 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3243 chunk_used = btrfs_block_group_used(&cache->item); 3244 3245 if (bargs->usage_min == 0) 3246 user_thresh = 1; 3247 else if (bargs->usage > 100) 3248 user_thresh = cache->key.offset; 3249 else 3250 user_thresh = div_factor_fine(cache->key.offset, 3251 bargs->usage); 
3252 3253 if (chunk_used < user_thresh) 3254 ret = 0; 3255 3256 btrfs_put_block_group(cache); 3257 return ret; 3258 } 3259 3260 static int chunk_devid_filter(struct extent_buffer *leaf, 3261 struct btrfs_chunk *chunk, 3262 struct btrfs_balance_args *bargs) 3263 { 3264 struct btrfs_stripe *stripe; 3265 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3266 int i; 3267 3268 for (i = 0; i < num_stripes; i++) { 3269 stripe = btrfs_stripe_nr(chunk, i); 3270 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3271 return 0; 3272 } 3273 3274 return 1; 3275 } 3276 3277 /* [pstart, pend) */ 3278 static int chunk_drange_filter(struct extent_buffer *leaf, 3279 struct btrfs_chunk *chunk, 3280 u64 chunk_offset, 3281 struct btrfs_balance_args *bargs) 3282 { 3283 struct btrfs_stripe *stripe; 3284 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3285 u64 stripe_offset; 3286 u64 stripe_length; 3287 int factor; 3288 int i; 3289 3290 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3291 return 0; 3292 3293 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 3294 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { 3295 factor = num_stripes / 2; 3296 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { 3297 factor = num_stripes - 1; 3298 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { 3299 factor = num_stripes - 2; 3300 } else { 3301 factor = num_stripes; 3302 } 3303 3304 for (i = 0; i < num_stripes; i++) { 3305 stripe = btrfs_stripe_nr(chunk, i); 3306 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3307 continue; 3308 3309 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3310 stripe_length = btrfs_chunk_length(leaf, chunk); 3311 stripe_length = div_u64(stripe_length, factor); 3312 3313 if (stripe_offset < bargs->pend && 3314 stripe_offset + stripe_length > bargs->pstart) 3315 return 0; 3316 } 3317 3318 return 1; 3319 } 3320 3321 /* [vstart, vend) */ 3322 static int chunk_vrange_filter(struct extent_buffer *leaf, 3323 struct btrfs_chunk *chunk, 3324 u64 chunk_offset, 3325 struct btrfs_balance_args *bargs) 3326 { 3327 if (chunk_offset < bargs->vend && 3328 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3329 /* at least part of the chunk is inside this vrange */ 3330 return 0; 3331 3332 return 1; 3333 } 3334 3335 static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3336 struct btrfs_chunk *chunk, 3337 struct btrfs_balance_args *bargs) 3338 { 3339 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3340 3341 if (bargs->stripes_min <= num_stripes 3342 && num_stripes <= bargs->stripes_max) 3343 return 0; 3344 3345 return 1; 3346 } 3347 3348 static int chunk_soft_convert_filter(u64 chunk_type, 3349 struct btrfs_balance_args *bargs) 3350 { 3351 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3352 return 0; 3353 3354 chunk_type = chunk_to_extended(chunk_type) & 3355 BTRFS_EXTENDED_PROFILE_MASK; 3356 3357 if (bargs->target == chunk_type) 3358 return 1; 3359 3360 return 0; 3361 } 3362 3363 static int should_balance_chunk(struct btrfs_fs_info *fs_info, 3364 struct extent_buffer *leaf, 3365 struct btrfs_chunk *chunk, u64 chunk_offset) 3366 { 3367 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3368 struct btrfs_balance_args *bargs = NULL; 3369 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3370 3371 /* type filter */ 3372 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3373 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3374 return 0; 3375 } 3376 3377 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3378 bargs 
= &bctl->data; 3379 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3380 bargs = &bctl->sys; 3381 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3382 bargs = &bctl->meta; 3383 3384 /* profiles filter */ 3385 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3386 chunk_profiles_filter(chunk_type, bargs)) { 3387 return 0; 3388 } 3389 3390 /* usage filter */ 3391 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3392 chunk_usage_filter(fs_info, chunk_offset, bargs)) { 3393 return 0; 3394 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3395 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3396 return 0; 3397 } 3398 3399 /* devid filter */ 3400 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3401 chunk_devid_filter(leaf, chunk, bargs)) { 3402 return 0; 3403 } 3404 3405 /* drange filter, makes sense only with devid filter */ 3406 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3407 chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { 3408 return 0; 3409 } 3410 3411 /* vrange filter */ 3412 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3413 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3414 return 0; 3415 } 3416 3417 /* stripes filter */ 3418 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3419 chunk_stripes_range_filter(leaf, chunk, bargs)) { 3420 return 0; 3421 } 3422 3423 /* soft profile changing mode */ 3424 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3425 chunk_soft_convert_filter(chunk_type, bargs)) { 3426 return 0; 3427 } 3428 3429 /* 3430 * limited by count, must be the last filter 3431 */ 3432 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3433 if (bargs->limit == 0) 3434 return 0; 3435 else 3436 bargs->limit--; 3437 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 3438 /* 3439 * Same logic as the 'limit' filter; the minimum cannot be 3440 * determined here because we do not have the global information 3441 * about the count of all chunks that satisfy the filters. 
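 * Only limit_max is decremented here; the limit_min side is applied in
 * __btrfs_balance() after the counting pass has gathered the per-type totals.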
3442 */ 3443 if (bargs->limit_max == 0) 3444 return 0; 3445 else 3446 bargs->limit_max--; 3447 } 3448 3449 return 1; 3450 } 3451 3452 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3453 { 3454 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3455 struct btrfs_root *chunk_root = fs_info->chunk_root; 3456 struct btrfs_root *dev_root = fs_info->dev_root; 3457 struct list_head *devices; 3458 struct btrfs_device *device; 3459 u64 old_size; 3460 u64 size_to_free; 3461 u64 chunk_type; 3462 struct btrfs_chunk *chunk; 3463 struct btrfs_path *path = NULL; 3464 struct btrfs_key key; 3465 struct btrfs_key found_key; 3466 struct btrfs_trans_handle *trans; 3467 struct extent_buffer *leaf; 3468 int slot; 3469 int ret; 3470 int enospc_errors = 0; 3471 bool counting = true; 3472 /* The single value limit and min/max limits use the same bytes in the */ 3473 u64 limit_data = bctl->data.limit; 3474 u64 limit_meta = bctl->meta.limit; 3475 u64 limit_sys = bctl->sys.limit; 3476 u32 count_data = 0; 3477 u32 count_meta = 0; 3478 u32 count_sys = 0; 3479 int chunk_reserved = 0; 3480 u64 bytes_used = 0; 3481 3482 /* step one make some room on all the devices */ 3483 devices = &fs_info->fs_devices->devices; 3484 list_for_each_entry(device, devices, dev_list) { 3485 old_size = btrfs_device_get_total_bytes(device); 3486 size_to_free = div_factor(old_size, 1); 3487 size_to_free = min_t(u64, size_to_free, SZ_1M); 3488 if (!device->writeable || 3489 btrfs_device_get_total_bytes(device) - 3490 btrfs_device_get_bytes_used(device) > size_to_free || 3491 device->is_tgtdev_for_dev_replace) 3492 continue; 3493 3494 ret = btrfs_shrink_device(device, old_size - size_to_free); 3495 if (ret == -ENOSPC) 3496 break; 3497 if (ret) { 3498 /* btrfs_shrink_device never returns ret > 0 */ 3499 WARN_ON(ret > 0); 3500 goto error; 3501 } 3502 3503 trans = btrfs_start_transaction(dev_root, 0); 3504 if (IS_ERR(trans)) { 3505 ret = PTR_ERR(trans); 3506 btrfs_info_in_rcu(fs_info, 3507 "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", 3508 rcu_str_deref(device->name), ret, 3509 old_size, old_size - size_to_free); 3510 goto error; 3511 } 3512 3513 ret = btrfs_grow_device(trans, device, old_size); 3514 if (ret) { 3515 btrfs_end_transaction(trans); 3516 /* btrfs_grow_device never returns ret > 0 */ 3517 WARN_ON(ret > 0); 3518 btrfs_info_in_rcu(fs_info, 3519 "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", 3520 rcu_str_deref(device->name), ret, 3521 old_size, old_size - size_to_free); 3522 goto error; 3523 } 3524 3525 btrfs_end_transaction(trans); 3526 } 3527 3528 /* step two, relocate all the chunks */ 3529 path = btrfs_alloc_path(); 3530 if (!path) { 3531 ret = -ENOMEM; 3532 goto error; 3533 } 3534 3535 /* zero out stat counters */ 3536 spin_lock(&fs_info->balance_lock); 3537 memset(&bctl->stat, 0, sizeof(bctl->stat)); 3538 spin_unlock(&fs_info->balance_lock); 3539 again: 3540 if (!counting) { 3541 /* 3542 * The single value limit and min/max limits use the same bytes 3543 * in the 3544 */ 3545 bctl->data.limit = limit_data; 3546 bctl->meta.limit = limit_meta; 3547 bctl->sys.limit = limit_sys; 3548 } 3549 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3550 key.offset = (u64)-1; 3551 key.type = BTRFS_CHUNK_ITEM_KEY; 3552 3553 while (1) { 3554 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3555 atomic_read(&fs_info->balance_cancel_req)) { 3556 ret = -ECANCELED; 3557 goto error; 3558 } 3559 3560 
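/*
 * Taken before the chunk tree search below; see the comment in
 * btrfs_relocate_chunk() for why the cleaner thread must be kept from
 * removing unused block groups in the meantime.
 */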
mutex_lock(&fs_info->delete_unused_bgs_mutex); 3561 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3562 if (ret < 0) { 3563 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3564 goto error; 3565 } 3566 3567 /* 3568 * this shouldn't happen, it means the last relocate 3569 * failed 3570 */ 3571 if (ret == 0) 3572 BUG(); /* FIXME break ? */ 3573 3574 ret = btrfs_previous_item(chunk_root, path, 0, 3575 BTRFS_CHUNK_ITEM_KEY); 3576 if (ret) { 3577 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3578 ret = 0; 3579 break; 3580 } 3581 3582 leaf = path->nodes[0]; 3583 slot = path->slots[0]; 3584 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3585 3586 if (found_key.objectid != key.objectid) { 3587 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3588 break; 3589 } 3590 3591 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3592 chunk_type = btrfs_chunk_type(leaf, chunk); 3593 3594 if (!counting) { 3595 spin_lock(&fs_info->balance_lock); 3596 bctl->stat.considered++; 3597 spin_unlock(&fs_info->balance_lock); 3598 } 3599 3600 ret = should_balance_chunk(fs_info, leaf, chunk, 3601 found_key.offset); 3602 3603 btrfs_release_path(path); 3604 if (!ret) { 3605 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3606 goto loop; 3607 } 3608 3609 if (counting) { 3610 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3611 spin_lock(&fs_info->balance_lock); 3612 bctl->stat.expected++; 3613 spin_unlock(&fs_info->balance_lock); 3614 3615 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3616 count_data++; 3617 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3618 count_sys++; 3619 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3620 count_meta++; 3621 3622 goto loop; 3623 } 3624 3625 /* 3626 * Apply limit_min filter, no need to check if the LIMITS 3627 * filter is used, limit_min is 0 by default 3628 */ 3629 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 3630 count_data < bctl->data.limit_min) 3631 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 3632 count_meta < bctl->meta.limit_min) 3633 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 3634 count_sys < bctl->sys.limit_min)) { 3635 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3636 goto loop; 3637 } 3638 3639 ASSERT(fs_info->data_sinfo); 3640 spin_lock(&fs_info->data_sinfo->lock); 3641 bytes_used = fs_info->data_sinfo->bytes_used; 3642 spin_unlock(&fs_info->data_sinfo->lock); 3643 3644 if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 3645 !chunk_reserved && !bytes_used) { 3646 trans = btrfs_start_transaction(chunk_root, 0); 3647 if (IS_ERR(trans)) { 3648 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3649 ret = PTR_ERR(trans); 3650 goto error; 3651 } 3652 3653 ret = btrfs_force_chunk_alloc(trans, fs_info, 3654 BTRFS_BLOCK_GROUP_DATA); 3655 btrfs_end_transaction(trans); 3656 if (ret < 0) { 3657 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3658 goto error; 3659 } 3660 chunk_reserved = 1; 3661 } 3662 3663 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3664 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3665 if (ret && ret != -ENOSPC) 3666 goto error; 3667 if (ret == -ENOSPC) { 3668 enospc_errors++; 3669 } else { 3670 spin_lock(&fs_info->balance_lock); 3671 bctl->stat.completed++; 3672 spin_unlock(&fs_info->balance_lock); 3673 } 3674 loop: 3675 if (found_key.offset == 0) 3676 break; 3677 key.offset = found_key.offset - 1; 3678 } 3679 3680 if (counting) { 3681 btrfs_release_path(path); 3682 counting = false; 3683 goto again; 3684 } 3685 error: 3686 btrfs_free_path(path); 3687 if (enospc_errors) { 3688 btrfs_info(fs_info, "%d enospc errors during 
balance", 3689 enospc_errors); 3690 if (!ret) 3691 ret = -ENOSPC; 3692 } 3693 3694 return ret; 3695 } 3696 3697 /** 3698 * alloc_profile_is_valid - see if a given profile is valid and reduced 3699 * @flags: profile to validate 3700 * @extended: if true @flags is treated as an extended profile 3701 */ 3702 static int alloc_profile_is_valid(u64 flags, int extended) 3703 { 3704 u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : 3705 BTRFS_BLOCK_GROUP_PROFILE_MASK); 3706 3707 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 3708 3709 /* 1) check that all other bits are zeroed */ 3710 if (flags & ~mask) 3711 return 0; 3712 3713 /* 2) see if profile is reduced */ 3714 if (flags == 0) 3715 return !extended; /* "0" is valid for usual profiles */ 3716 3717 /* true if exactly one bit set */ 3718 return (flags & (flags - 1)) == 0; 3719 } 3720 3721 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3722 { 3723 /* cancel requested || normal exit path */ 3724 return atomic_read(&fs_info->balance_cancel_req) || 3725 (atomic_read(&fs_info->balance_pause_req) == 0 && 3726 atomic_read(&fs_info->balance_cancel_req) == 0); 3727 } 3728 3729 static void __cancel_balance(struct btrfs_fs_info *fs_info) 3730 { 3731 int ret; 3732 3733 unset_balance_control(fs_info); 3734 ret = del_balance_item(fs_info); 3735 if (ret) 3736 btrfs_handle_fs_error(fs_info, ret, NULL); 3737 3738 atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 3739 } 3740 3741 /* Non-zero return value signifies invalidity */ 3742 static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, 3743 u64 allowed) 3744 { 3745 return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && 3746 (!alloc_profile_is_valid(bctl_arg->target, 1) || 3747 (bctl_arg->target & ~allowed))); 3748 } 3749 3750 /* 3751 * Should be called with both balance and volume mutexes held 3752 */ 3753 int btrfs_balance(struct btrfs_balance_control *bctl, 3754 struct btrfs_ioctl_balance_args *bargs) 3755 { 3756 struct btrfs_fs_info *fs_info = bctl->fs_info; 3757 u64 allowed; 3758 int mixed = 0; 3759 int ret; 3760 u64 num_devices; 3761 unsigned seq; 3762 3763 if (btrfs_fs_closing(fs_info) || 3764 atomic_read(&fs_info->balance_pause_req) || 3765 atomic_read(&fs_info->balance_cancel_req)) { 3766 ret = -EINVAL; 3767 goto out; 3768 } 3769 3770 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 3771 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 3772 mixed = 1; 3773 3774 /* 3775 * In case of mixed groups both data and meta should be picked, 3776 * and identical options should be given for both of them. 
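 * (enforced below: both BTRFS_BALANCE_DATA and BTRFS_BALANCE_METADATA must
 * be set and bctl->data must match bctl->meta byte for byte)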
3777 */ 3778 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 3779 if (mixed && (bctl->flags & allowed)) { 3780 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 3781 !(bctl->flags & BTRFS_BALANCE_METADATA) || 3782 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 3783 btrfs_err(fs_info, 3784 "with mixed groups data and metadata balance options must be the same"); 3785 ret = -EINVAL; 3786 goto out; 3787 } 3788 } 3789 3790 num_devices = fs_info->fs_devices->num_devices; 3791 btrfs_dev_replace_lock(&fs_info->dev_replace, 0); 3792 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 3793 BUG_ON(num_devices < 1); 3794 num_devices--; 3795 } 3796 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); 3797 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; 3798 if (num_devices > 1) 3799 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3800 if (num_devices > 2) 3801 allowed |= BTRFS_BLOCK_GROUP_RAID5; 3802 if (num_devices > 3) 3803 allowed |= (BTRFS_BLOCK_GROUP_RAID10 | 3804 BTRFS_BLOCK_GROUP_RAID6); 3805 if (validate_convert_profile(&bctl->data, allowed)) { 3806 btrfs_err(fs_info, 3807 "unable to start balance with target data profile %llu", 3808 bctl->data.target); 3809 ret = -EINVAL; 3810 goto out; 3811 } 3812 if (validate_convert_profile(&bctl->meta, allowed)) { 3813 btrfs_err(fs_info, 3814 "unable to start balance with target metadata profile %llu", 3815 bctl->meta.target); 3816 ret = -EINVAL; 3817 goto out; 3818 } 3819 if (validate_convert_profile(&bctl->sys, allowed)) { 3820 btrfs_err(fs_info, 3821 "unable to start balance with target system profile %llu", 3822 bctl->sys.target); 3823 ret = -EINVAL; 3824 goto out; 3825 } 3826 3827 /* allow to reduce meta or sys integrity only if force set */ 3828 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3829 BTRFS_BLOCK_GROUP_RAID10 | 3830 BTRFS_BLOCK_GROUP_RAID5 | 3831 BTRFS_BLOCK_GROUP_RAID6; 3832 do { 3833 seq = read_seqbegin(&fs_info->profiles_lock); 3834 3835 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3836 (fs_info->avail_system_alloc_bits & allowed) && 3837 !(bctl->sys.target & allowed)) || 3838 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3839 (fs_info->avail_metadata_alloc_bits & allowed) && 3840 !(bctl->meta.target & allowed))) { 3841 if (bctl->flags & BTRFS_BALANCE_FORCE) { 3842 btrfs_info(fs_info, 3843 "force reducing metadata integrity"); 3844 } else { 3845 btrfs_err(fs_info, 3846 "balance will reduce metadata integrity, use force if you want this"); 3847 ret = -EINVAL; 3848 goto out; 3849 } 3850 } 3851 } while (read_seqretry(&fs_info->profiles_lock, seq)); 3852 3853 if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) < 3854 btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) { 3855 btrfs_warn(fs_info, 3856 "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", 3857 bctl->meta.target, bctl->data.target); 3858 } 3859 3860 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3861 fs_info->num_tolerated_disk_barrier_failures = min( 3862 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info), 3863 btrfs_get_num_tolerated_disk_barrier_failures( 3864 bctl->sys.target)); 3865 } 3866 3867 ret = insert_balance_item(fs_info, bctl); 3868 if (ret && ret != -EEXIST) 3869 goto out; 3870 3871 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 3872 BUG_ON(ret == -EEXIST); 3873 set_balance_control(bctl); 3874 } else { 3875 BUG_ON(ret != -EEXIST); 3876 spin_lock(&fs_info->balance_lock); 3877 update_balance_args(bctl); 3878 
spin_unlock(&fs_info->balance_lock); 3879 } 3880 3881 atomic_inc(&fs_info->balance_running); 3882 mutex_unlock(&fs_info->balance_mutex); 3883 3884 ret = __btrfs_balance(fs_info); 3885 3886 mutex_lock(&fs_info->balance_mutex); 3887 atomic_dec(&fs_info->balance_running); 3888 3889 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3890 fs_info->num_tolerated_disk_barrier_failures = 3891 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 3892 } 3893 3894 if (bargs) { 3895 memset(bargs, 0, sizeof(*bargs)); 3896 update_ioctl_balance_args(fs_info, 0, bargs); 3897 } 3898 3899 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 3900 balance_need_close(fs_info)) { 3901 __cancel_balance(fs_info); 3902 } 3903 3904 wake_up(&fs_info->balance_wait_q); 3905 3906 return ret; 3907 out: 3908 if (bctl->flags & BTRFS_BALANCE_RESUME) 3909 __cancel_balance(fs_info); 3910 else { 3911 kfree(bctl); 3912 atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 3913 } 3914 return ret; 3915 } 3916 3917 static int balance_kthread(void *data) 3918 { 3919 struct btrfs_fs_info *fs_info = data; 3920 int ret = 0; 3921 3922 mutex_lock(&fs_info->volume_mutex); 3923 mutex_lock(&fs_info->balance_mutex); 3924 3925 if (fs_info->balance_ctl) { 3926 btrfs_info(fs_info, "continuing balance"); 3927 ret = btrfs_balance(fs_info->balance_ctl, NULL); 3928 } 3929 3930 mutex_unlock(&fs_info->balance_mutex); 3931 mutex_unlock(&fs_info->volume_mutex); 3932 3933 return ret; 3934 } 3935 3936 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 3937 { 3938 struct task_struct *tsk; 3939 3940 spin_lock(&fs_info->balance_lock); 3941 if (!fs_info->balance_ctl) { 3942 spin_unlock(&fs_info->balance_lock); 3943 return 0; 3944 } 3945 spin_unlock(&fs_info->balance_lock); 3946 3947 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 3948 btrfs_info(fs_info, "force skipping balance"); 3949 return 0; 3950 } 3951 3952 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3953 return PTR_ERR_OR_ZERO(tsk); 3954 } 3955 3956 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 3957 { 3958 struct btrfs_balance_control *bctl; 3959 struct btrfs_balance_item *item; 3960 struct btrfs_disk_balance_args disk_bargs; 3961 struct btrfs_path *path; 3962 struct extent_buffer *leaf; 3963 struct btrfs_key key; 3964 int ret; 3965 3966 path = btrfs_alloc_path(); 3967 if (!path) 3968 return -ENOMEM; 3969 3970 key.objectid = BTRFS_BALANCE_OBJECTID; 3971 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3972 key.offset = 0; 3973 3974 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 3975 if (ret < 0) 3976 goto out; 3977 if (ret > 0) { /* ret = -ENOENT; */ 3978 ret = 0; 3979 goto out; 3980 } 3981 3982 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 3983 if (!bctl) { 3984 ret = -ENOMEM; 3985 goto out; 3986 } 3987 3988 leaf = path->nodes[0]; 3989 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3990 3991 bctl->fs_info = fs_info; 3992 bctl->flags = btrfs_balance_flags(leaf, item); 3993 bctl->flags |= BTRFS_BALANCE_RESUME; 3994 3995 btrfs_balance_data(leaf, item, &disk_bargs); 3996 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 3997 btrfs_balance_meta(leaf, item, &disk_bargs); 3998 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 3999 btrfs_balance_sys(leaf, item, &disk_bargs); 4000 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 4001 4002 WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); 4003 4004 mutex_lock(&fs_info->volume_mutex); 4005 mutex_lock(&fs_info->balance_mutex); 4006 4007 
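/*
 * Stash the recovered control; btrfs_resume_balance_async() can later kick
 * off balance_kthread() to continue the interrupted balance.
 */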
set_balance_control(bctl); 4008 4009 mutex_unlock(&fs_info->balance_mutex); 4010 mutex_unlock(&fs_info->volume_mutex); 4011 out: 4012 btrfs_free_path(path); 4013 return ret; 4014 } 4015 4016 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4017 { 4018 int ret = 0; 4019 4020 mutex_lock(&fs_info->balance_mutex); 4021 if (!fs_info->balance_ctl) { 4022 mutex_unlock(&fs_info->balance_mutex); 4023 return -ENOTCONN; 4024 } 4025 4026 if (atomic_read(&fs_info->balance_running)) { 4027 atomic_inc(&fs_info->balance_pause_req); 4028 mutex_unlock(&fs_info->balance_mutex); 4029 4030 wait_event(fs_info->balance_wait_q, 4031 atomic_read(&fs_info->balance_running) == 0); 4032 4033 mutex_lock(&fs_info->balance_mutex); 4034 /* we are good with balance_ctl ripped off from under us */ 4035 BUG_ON(atomic_read(&fs_info->balance_running)); 4036 atomic_dec(&fs_info->balance_pause_req); 4037 } else { 4038 ret = -ENOTCONN; 4039 } 4040 4041 mutex_unlock(&fs_info->balance_mutex); 4042 return ret; 4043 } 4044 4045 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4046 { 4047 if (fs_info->sb->s_flags & MS_RDONLY) 4048 return -EROFS; 4049 4050 mutex_lock(&fs_info->balance_mutex); 4051 if (!fs_info->balance_ctl) { 4052 mutex_unlock(&fs_info->balance_mutex); 4053 return -ENOTCONN; 4054 } 4055 4056 atomic_inc(&fs_info->balance_cancel_req); 4057 /* 4058 * if we are running just wait and return, balance item is 4059 * deleted in btrfs_balance in this case 4060 */ 4061 if (atomic_read(&fs_info->balance_running)) { 4062 mutex_unlock(&fs_info->balance_mutex); 4063 wait_event(fs_info->balance_wait_q, 4064 atomic_read(&fs_info->balance_running) == 0); 4065 mutex_lock(&fs_info->balance_mutex); 4066 } else { 4067 /* __cancel_balance needs volume_mutex */ 4068 mutex_unlock(&fs_info->balance_mutex); 4069 mutex_lock(&fs_info->volume_mutex); 4070 mutex_lock(&fs_info->balance_mutex); 4071 4072 if (fs_info->balance_ctl) 4073 __cancel_balance(fs_info); 4074 4075 mutex_unlock(&fs_info->volume_mutex); 4076 } 4077 4078 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); 4079 atomic_dec(&fs_info->balance_cancel_req); 4080 mutex_unlock(&fs_info->balance_mutex); 4081 return 0; 4082 } 4083 4084 static int btrfs_uuid_scan_kthread(void *data) 4085 { 4086 struct btrfs_fs_info *fs_info = data; 4087 struct btrfs_root *root = fs_info->tree_root; 4088 struct btrfs_key key; 4089 struct btrfs_key max_key; 4090 struct btrfs_path *path = NULL; 4091 int ret = 0; 4092 struct extent_buffer *eb; 4093 int slot; 4094 struct btrfs_root_item root_item; 4095 u32 item_size; 4096 struct btrfs_trans_handle *trans = NULL; 4097 4098 path = btrfs_alloc_path(); 4099 if (!path) { 4100 ret = -ENOMEM; 4101 goto out; 4102 } 4103 4104 key.objectid = 0; 4105 key.type = BTRFS_ROOT_ITEM_KEY; 4106 key.offset = 0; 4107 4108 max_key.objectid = (u64)-1; 4109 max_key.type = BTRFS_ROOT_ITEM_KEY; 4110 max_key.offset = (u64)-1; 4111 4112 while (1) { 4113 ret = btrfs_search_forward(root, &key, path, 0); 4114 if (ret) { 4115 if (ret > 0) 4116 ret = 0; 4117 break; 4118 } 4119 4120 if (key.type != BTRFS_ROOT_ITEM_KEY || 4121 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4122 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4123 key.objectid > BTRFS_LAST_FREE_OBJECTID) 4124 goto skip; 4125 4126 eb = path->nodes[0]; 4127 slot = path->slots[0]; 4128 item_size = btrfs_item_size_nr(eb, slot); 4129 if (item_size < sizeof(root_item)) 4130 goto skip; 4131 4132 read_extent_buffer(eb, &root_item, 4133 btrfs_item_ptr_offset(eb, slot), 4134 (int)sizeof(root_item)); 4135 if 
(btrfs_root_refs(&root_item) == 0) 4136 goto skip; 4137 4138 if (!btrfs_is_empty_uuid(root_item.uuid) || 4139 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4140 if (trans) 4141 goto update_tree; 4142 4143 btrfs_release_path(path); 4144 /* 4145 * 1 - subvol uuid item 4146 * 1 - received_subvol uuid item 4147 */ 4148 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4149 if (IS_ERR(trans)) { 4150 ret = PTR_ERR(trans); 4151 break; 4152 } 4153 continue; 4154 } else { 4155 goto skip; 4156 } 4157 update_tree: 4158 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4159 ret = btrfs_uuid_tree_add(trans, fs_info, 4160 root_item.uuid, 4161 BTRFS_UUID_KEY_SUBVOL, 4162 key.objectid); 4163 if (ret < 0) { 4164 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4165 ret); 4166 break; 4167 } 4168 } 4169 4170 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4171 ret = btrfs_uuid_tree_add(trans, fs_info, 4172 root_item.received_uuid, 4173 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4174 key.objectid); 4175 if (ret < 0) { 4176 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4177 ret); 4178 break; 4179 } 4180 } 4181 4182 skip: 4183 if (trans) { 4184 ret = btrfs_end_transaction(trans); 4185 trans = NULL; 4186 if (ret) 4187 break; 4188 } 4189 4190 btrfs_release_path(path); 4191 if (key.offset < (u64)-1) { 4192 key.offset++; 4193 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4194 key.offset = 0; 4195 key.type = BTRFS_ROOT_ITEM_KEY; 4196 } else if (key.objectid < (u64)-1) { 4197 key.offset = 0; 4198 key.type = BTRFS_ROOT_ITEM_KEY; 4199 key.objectid++; 4200 } else { 4201 break; 4202 } 4203 cond_resched(); 4204 } 4205 4206 out: 4207 btrfs_free_path(path); 4208 if (trans && !IS_ERR(trans)) 4209 btrfs_end_transaction(trans); 4210 if (ret) 4211 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); 4212 else 4213 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4214 up(&fs_info->uuid_tree_rescan_sem); 4215 return 0; 4216 } 4217 4218 /* 4219 * Callback for btrfs_uuid_tree_iterate(). 4220 * returns: 4221 * 0 check succeeded, the entry is not outdated. 4222 * < 0 if an error occurred. 4223 * > 0 if the check failed, which means the caller shall remove the entry. 4224 */ 4225 static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, 4226 u8 *uuid, u8 type, u64 subid) 4227 { 4228 struct btrfs_key key; 4229 int ret = 0; 4230 struct btrfs_root *subvol_root; 4231 4232 if (type != BTRFS_UUID_KEY_SUBVOL && 4233 type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) 4234 goto out; 4235 4236 key.objectid = subid; 4237 key.type = BTRFS_ROOT_ITEM_KEY; 4238 key.offset = (u64)-1; 4239 subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); 4240 if (IS_ERR(subvol_root)) { 4241 ret = PTR_ERR(subvol_root); 4242 if (ret == -ENOENT) 4243 ret = 1; 4244 goto out; 4245 } 4246 4247 switch (type) { 4248 case BTRFS_UUID_KEY_SUBVOL: 4249 if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) 4250 ret = 1; 4251 break; 4252 case BTRFS_UUID_KEY_RECEIVED_SUBVOL: 4253 if (memcmp(uuid, subvol_root->root_item.received_uuid, 4254 BTRFS_UUID_SIZE)) 4255 ret = 1; 4256 break; 4257 } 4258 4259 out: 4260 return ret; 4261 } 4262 4263 static int btrfs_uuid_rescan_kthread(void *data) 4264 { 4265 struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; 4266 int ret; 4267 4268 /* 4269 * 1st step is to iterate through the existing UUID tree and 4270 * to delete all entries that contain outdated data. 4271 * 2nd step is to add all missing entries to the UUID tree. 
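	 * The uuid_tree_rescan_sem taken by the caller is released either
	 * on the error path below or by btrfs_uuid_scan_kthread() once
	 * step 2 has finished.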
4272 */ 4273 ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); 4274 if (ret < 0) { 4275 btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); 4276 up(&fs_info->uuid_tree_rescan_sem); 4277 return ret; 4278 } 4279 return btrfs_uuid_scan_kthread(data); 4280 } 4281 4282 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4283 { 4284 struct btrfs_trans_handle *trans; 4285 struct btrfs_root *tree_root = fs_info->tree_root; 4286 struct btrfs_root *uuid_root; 4287 struct task_struct *task; 4288 int ret; 4289 4290 /* 4291 * 1 - root node 4292 * 1 - root item 4293 */ 4294 trans = btrfs_start_transaction(tree_root, 2); 4295 if (IS_ERR(trans)) 4296 return PTR_ERR(trans); 4297 4298 uuid_root = btrfs_create_tree(trans, fs_info, 4299 BTRFS_UUID_TREE_OBJECTID); 4300 if (IS_ERR(uuid_root)) { 4301 ret = PTR_ERR(uuid_root); 4302 btrfs_abort_transaction(trans, ret); 4303 btrfs_end_transaction(trans); 4304 return ret; 4305 } 4306 4307 fs_info->uuid_root = uuid_root; 4308 4309 ret = btrfs_commit_transaction(trans); 4310 if (ret) 4311 return ret; 4312 4313 down(&fs_info->uuid_tree_rescan_sem); 4314 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4315 if (IS_ERR(task)) { 4316 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4317 btrfs_warn(fs_info, "failed to start uuid_scan task"); 4318 up(&fs_info->uuid_tree_rescan_sem); 4319 return PTR_ERR(task); 4320 } 4321 4322 return 0; 4323 } 4324 4325 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) 4326 { 4327 struct task_struct *task; 4328 4329 down(&fs_info->uuid_tree_rescan_sem); 4330 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); 4331 if (IS_ERR(task)) { 4332 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4333 btrfs_warn(fs_info, "failed to start uuid_rescan task"); 4334 up(&fs_info->uuid_tree_rescan_sem); 4335 return PTR_ERR(task); 4336 } 4337 4338 return 0; 4339 } 4340 4341 /* 4342 * shrinking a device means finding all of the device extents past 4343 * the new size, and then following the back refs to the chunks. 
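 * Each chunk found this way is handed to btrfs_relocate_chunk().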
4344 * The chunk relocation code actually frees the device extent 4345 */ 4346 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 4347 { 4348 struct btrfs_fs_info *fs_info = device->fs_info; 4349 struct btrfs_root *root = fs_info->dev_root; 4350 struct btrfs_trans_handle *trans; 4351 struct btrfs_dev_extent *dev_extent = NULL; 4352 struct btrfs_path *path; 4353 u64 length; 4354 u64 chunk_offset; 4355 int ret; 4356 int slot; 4357 int failed = 0; 4358 bool retried = false; 4359 bool checked_pending_chunks = false; 4360 struct extent_buffer *l; 4361 struct btrfs_key key; 4362 struct btrfs_super_block *super_copy = fs_info->super_copy; 4363 u64 old_total = btrfs_super_total_bytes(super_copy); 4364 u64 old_size = btrfs_device_get_total_bytes(device); 4365 u64 diff = old_size - new_size; 4366 4367 if (device->is_tgtdev_for_dev_replace) 4368 return -EINVAL; 4369 4370 path = btrfs_alloc_path(); 4371 if (!path) 4372 return -ENOMEM; 4373 4374 path->reada = READA_FORWARD; 4375 4376 mutex_lock(&fs_info->chunk_mutex); 4377 4378 btrfs_device_set_total_bytes(device, new_size); 4379 if (device->writeable) { 4380 device->fs_devices->total_rw_bytes -= diff; 4381 spin_lock(&fs_info->free_chunk_lock); 4382 fs_info->free_chunk_space -= diff; 4383 spin_unlock(&fs_info->free_chunk_lock); 4384 } 4385 mutex_unlock(&fs_info->chunk_mutex); 4386 4387 again: 4388 key.objectid = device->devid; 4389 key.offset = (u64)-1; 4390 key.type = BTRFS_DEV_EXTENT_KEY; 4391 4392 do { 4393 mutex_lock(&fs_info->delete_unused_bgs_mutex); 4394 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4395 if (ret < 0) { 4396 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4397 goto done; 4398 } 4399 4400 ret = btrfs_previous_item(root, path, 0, key.type); 4401 if (ret) 4402 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4403 if (ret < 0) 4404 goto done; 4405 if (ret) { 4406 ret = 0; 4407 btrfs_release_path(path); 4408 break; 4409 } 4410 4411 l = path->nodes[0]; 4412 slot = path->slots[0]; 4413 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4414 4415 if (key.objectid != device->devid) { 4416 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4417 btrfs_release_path(path); 4418 break; 4419 } 4420 4421 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4422 length = btrfs_dev_extent_length(l, dev_extent); 4423 4424 if (key.offset + length <= new_size) { 4425 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4426 btrfs_release_path(path); 4427 break; 4428 } 4429 4430 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4431 btrfs_release_path(path); 4432 4433 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 4434 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4435 if (ret && ret != -ENOSPC) 4436 goto done; 4437 if (ret == -ENOSPC) 4438 failed++; 4439 } while (key.offset-- > 0); 4440 4441 if (failed && !retried) { 4442 failed = 0; 4443 retried = true; 4444 goto again; 4445 } else if (failed && retried) { 4446 ret = -ENOSPC; 4447 goto done; 4448 } 4449 4450 /* Shrinking succeeded, else we would be at "done". */ 4451 trans = btrfs_start_transaction(root, 0); 4452 if (IS_ERR(trans)) { 4453 ret = PTR_ERR(trans); 4454 goto done; 4455 } 4456 4457 mutex_lock(&fs_info->chunk_mutex); 4458 4459 /* 4460 * We checked in the above loop all device extents that were already in 4461 * the device tree. 
However before we have updated the device's 4462 * total_bytes to the new size, we might have had chunk allocations that 4463 * have not complete yet (new block groups attached to transaction 4464 * handles), and therefore their device extents were not yet in the 4465 * device tree and we missed them in the loop above. So if we have any 4466 * pending chunk using a device extent that overlaps the device range 4467 * that we can not use anymore, commit the current transaction and 4468 * repeat the search on the device tree - this way we guarantee we will 4469 * not have chunks using device extents that end beyond 'new_size'. 4470 */ 4471 if (!checked_pending_chunks) { 4472 u64 start = new_size; 4473 u64 len = old_size - new_size; 4474 4475 if (contains_pending_extent(trans->transaction, device, 4476 &start, len)) { 4477 mutex_unlock(&fs_info->chunk_mutex); 4478 checked_pending_chunks = true; 4479 failed = 0; 4480 retried = false; 4481 ret = btrfs_commit_transaction(trans); 4482 if (ret) 4483 goto done; 4484 goto again; 4485 } 4486 } 4487 4488 btrfs_device_set_disk_total_bytes(device, new_size); 4489 if (list_empty(&device->resized_list)) 4490 list_add_tail(&device->resized_list, 4491 &fs_info->fs_devices->resized_devices); 4492 4493 WARN_ON(diff > old_total); 4494 btrfs_set_super_total_bytes(super_copy, old_total - diff); 4495 mutex_unlock(&fs_info->chunk_mutex); 4496 4497 /* Now btrfs_update_device() will change the on-disk size. */ 4498 ret = btrfs_update_device(trans, device); 4499 btrfs_end_transaction(trans); 4500 done: 4501 btrfs_free_path(path); 4502 if (ret) { 4503 mutex_lock(&fs_info->chunk_mutex); 4504 btrfs_device_set_total_bytes(device, old_size); 4505 if (device->writeable) 4506 device->fs_devices->total_rw_bytes += diff; 4507 spin_lock(&fs_info->free_chunk_lock); 4508 fs_info->free_chunk_space += diff; 4509 spin_unlock(&fs_info->free_chunk_lock); 4510 mutex_unlock(&fs_info->chunk_mutex); 4511 } 4512 return ret; 4513 } 4514 4515 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 4516 struct btrfs_key *key, 4517 struct btrfs_chunk *chunk, int item_size) 4518 { 4519 struct btrfs_super_block *super_copy = fs_info->super_copy; 4520 struct btrfs_disk_key disk_key; 4521 u32 array_size; 4522 u8 *ptr; 4523 4524 mutex_lock(&fs_info->chunk_mutex); 4525 array_size = btrfs_super_sys_array_size(super_copy); 4526 if (array_size + item_size + sizeof(disk_key) 4527 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { 4528 mutex_unlock(&fs_info->chunk_mutex); 4529 return -EFBIG; 4530 } 4531 4532 ptr = super_copy->sys_chunk_array + array_size; 4533 btrfs_cpu_key_to_disk(&disk_key, key); 4534 memcpy(ptr, &disk_key, sizeof(disk_key)); 4535 ptr += sizeof(disk_key); 4536 memcpy(ptr, chunk, item_size); 4537 item_size += sizeof(disk_key); 4538 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 4539 mutex_unlock(&fs_info->chunk_mutex); 4540 4541 return 0; 4542 } 4543 4544 /* 4545 * sort the devices in descending order by max_avail, total_avail 4546 */ 4547 static int btrfs_cmp_device_info(const void *a, const void *b) 4548 { 4549 const struct btrfs_device_info *di_a = a; 4550 const struct btrfs_device_info *di_b = b; 4551 4552 if (di_a->max_avail > di_b->max_avail) 4553 return -1; 4554 if (di_a->max_avail < di_b->max_avail) 4555 return 1; 4556 if (di_a->total_avail > di_b->total_avail) 4557 return -1; 4558 if (di_a->total_avail < di_b->total_avail) 4559 return 1; 4560 return 0; 4561 } 4562 4563 static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) 4564 { 4565 /* TODO 
allow them to set a preferred stripe size */ 4566 return SZ_64K; 4567 } 4568 4569 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 4570 { 4571 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 4572 return; 4573 4574 btrfs_set_fs_incompat(info, RAID56); 4575 } 4576 4577 #define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r->fs_info) \ 4578 - sizeof(struct btrfs_chunk)) \ 4579 / sizeof(struct btrfs_stripe) + 1) 4580 4581 #define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ 4582 - 2 * sizeof(struct btrfs_disk_key) \ 4583 - 2 * sizeof(struct btrfs_chunk)) \ 4584 / sizeof(struct btrfs_stripe) + 1) 4585 4586 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 4587 struct btrfs_fs_info *fs_info, u64 start, 4588 u64 type) 4589 { 4590 struct btrfs_fs_info *info = trans->fs_info; 4591 struct btrfs_fs_devices *fs_devices = info->fs_devices; 4592 struct list_head *cur; 4593 struct map_lookup *map = NULL; 4594 struct extent_map_tree *em_tree; 4595 struct extent_map *em; 4596 struct btrfs_device_info *devices_info = NULL; 4597 u64 total_avail; 4598 int num_stripes; /* total number of stripes to allocate */ 4599 int data_stripes; /* number of stripes that count for 4600 block group size */ 4601 int sub_stripes; /* sub_stripes info for map */ 4602 int dev_stripes; /* stripes per dev */ 4603 int devs_max; /* max devs to use */ 4604 int devs_min; /* min devs needed */ 4605 int devs_increment; /* ndevs has to be a multiple of this */ 4606 int ncopies; /* how many copies to data has */ 4607 int ret; 4608 u64 max_stripe_size; 4609 u64 max_chunk_size; 4610 u64 stripe_size; 4611 u64 num_bytes; 4612 u64 raid_stripe_len = BTRFS_STRIPE_LEN; 4613 int ndevs; 4614 int i; 4615 int j; 4616 int index; 4617 4618 BUG_ON(!alloc_profile_is_valid(type, 0)); 4619 4620 if (list_empty(&fs_devices->alloc_list)) 4621 return -ENOSPC; 4622 4623 index = __get_raid_index(type); 4624 4625 sub_stripes = btrfs_raid_array[index].sub_stripes; 4626 dev_stripes = btrfs_raid_array[index].dev_stripes; 4627 devs_max = btrfs_raid_array[index].devs_max; 4628 devs_min = btrfs_raid_array[index].devs_min; 4629 devs_increment = btrfs_raid_array[index].devs_increment; 4630 ncopies = btrfs_raid_array[index].ncopies; 4631 4632 if (type & BTRFS_BLOCK_GROUP_DATA) { 4633 max_stripe_size = SZ_1G; 4634 max_chunk_size = 10 * max_stripe_size; 4635 if (!devs_max) 4636 devs_max = BTRFS_MAX_DEVS(info->chunk_root); 4637 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 4638 /* for larger filesystems, use larger metadata chunks */ 4639 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) 4640 max_stripe_size = SZ_1G; 4641 else 4642 max_stripe_size = SZ_256M; 4643 max_chunk_size = max_stripe_size; 4644 if (!devs_max) 4645 devs_max = BTRFS_MAX_DEVS(info->chunk_root); 4646 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 4647 max_stripe_size = SZ_32M; 4648 max_chunk_size = 2 * max_stripe_size; 4649 if (!devs_max) 4650 devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; 4651 } else { 4652 btrfs_err(info, "invalid chunk type 0x%llx requested", 4653 type); 4654 BUG_ON(1); 4655 } 4656 4657 /* we don't want a chunk larger than 10% of writeable space */ 4658 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 4659 max_chunk_size); 4660 4661 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 4662 GFP_NOFS); 4663 if (!devices_info) 4664 return -ENOMEM; 4665 4666 cur = fs_devices->alloc_list.next; 4667 4668 /* 4669 * in the first pass through the devices list, we gather information 4670 * about the available holes on each 
device. 4671 */ 4672 ndevs = 0; 4673 while (cur != &fs_devices->alloc_list) { 4674 struct btrfs_device *device; 4675 u64 max_avail; 4676 u64 dev_offset; 4677 4678 device = list_entry(cur, struct btrfs_device, dev_alloc_list); 4679 4680 cur = cur->next; 4681 4682 if (!device->writeable) { 4683 WARN(1, KERN_ERR 4684 "BTRFS: read-only device in alloc_list\n"); 4685 continue; 4686 } 4687 4688 if (!device->in_fs_metadata || 4689 device->is_tgtdev_for_dev_replace) 4690 continue; 4691 4692 if (device->total_bytes > device->bytes_used) 4693 total_avail = device->total_bytes - device->bytes_used; 4694 else 4695 total_avail = 0; 4696 4697 /* If there is no space on this device, skip it. */ 4698 if (total_avail == 0) 4699 continue; 4700 4701 ret = find_free_dev_extent(trans, device, 4702 max_stripe_size * dev_stripes, 4703 &dev_offset, &max_avail); 4704 if (ret && ret != -ENOSPC) 4705 goto error; 4706 4707 if (ret == 0) 4708 max_avail = max_stripe_size * dev_stripes; 4709 4710 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) 4711 continue; 4712 4713 if (ndevs == fs_devices->rw_devices) { 4714 WARN(1, "%s: found more than %llu devices\n", 4715 __func__, fs_devices->rw_devices); 4716 break; 4717 } 4718 devices_info[ndevs].dev_offset = dev_offset; 4719 devices_info[ndevs].max_avail = max_avail; 4720 devices_info[ndevs].total_avail = total_avail; 4721 devices_info[ndevs].dev = device; 4722 ++ndevs; 4723 } 4724 4725 /* 4726 * now sort the devices by hole size / available space 4727 */ 4728 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 4729 btrfs_cmp_device_info, NULL); 4730 4731 /* round down to number of usable stripes */ 4732 ndevs -= ndevs % devs_increment; 4733 4734 if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { 4735 ret = -ENOSPC; 4736 goto error; 4737 } 4738 4739 if (devs_max && ndevs > devs_max) 4740 ndevs = devs_max; 4741 /* 4742 * the primary goal is to maximize the number of stripes, so use as many 4743 * devices as possible, even if the stripes are not maximum sized. 
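	 * Because devices_info[] was sorted by max_avail in descending
	 * order, devices_info[ndevs - 1].max_avail is the smallest hole
	 * among the chosen devices and becomes the initial stripe_size.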
4744 */ 4745 stripe_size = devices_info[ndevs-1].max_avail; 4746 num_stripes = ndevs * dev_stripes; 4747 4748 /* 4749 * this will have to be fixed for RAID1 and RAID10 over 4750 * more drives 4751 */ 4752 data_stripes = num_stripes / ncopies; 4753 4754 if (type & BTRFS_BLOCK_GROUP_RAID5) { 4755 raid_stripe_len = find_raid56_stripe_len(ndevs - 1, 4756 info->stripesize); 4757 data_stripes = num_stripes - 1; 4758 } 4759 if (type & BTRFS_BLOCK_GROUP_RAID6) { 4760 raid_stripe_len = find_raid56_stripe_len(ndevs - 2, 4761 info->stripesize); 4762 data_stripes = num_stripes - 2; 4763 } 4764 4765 /* 4766 * Use the number of data stripes to figure out how big this chunk 4767 * is really going to be in terms of logical address space, 4768 * and compare that answer with the max chunk size 4769 */ 4770 if (stripe_size * data_stripes > max_chunk_size) { 4771 u64 mask = (1ULL << 24) - 1; 4772 4773 stripe_size = div_u64(max_chunk_size, data_stripes); 4774 4775 /* bump the answer up to a 16MB boundary */ 4776 stripe_size = (stripe_size + mask) & ~mask; 4777 4778 /* but don't go higher than the limits we found 4779 * while searching for free extents 4780 */ 4781 if (stripe_size > devices_info[ndevs-1].max_avail) 4782 stripe_size = devices_info[ndevs-1].max_avail; 4783 } 4784 4785 stripe_size = div_u64(stripe_size, dev_stripes); 4786 4787 /* align to BTRFS_STRIPE_LEN */ 4788 stripe_size = div_u64(stripe_size, raid_stripe_len); 4789 stripe_size *= raid_stripe_len; 4790 4791 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 4792 if (!map) { 4793 ret = -ENOMEM; 4794 goto error; 4795 } 4796 map->num_stripes = num_stripes; 4797 4798 for (i = 0; i < ndevs; ++i) { 4799 for (j = 0; j < dev_stripes; ++j) { 4800 int s = i * dev_stripes + j; 4801 map->stripes[s].dev = devices_info[i].dev; 4802 map->stripes[s].physical = devices_info[i].dev_offset + 4803 j * stripe_size; 4804 } 4805 } 4806 map->sector_size = info->sectorsize; 4807 map->stripe_len = raid_stripe_len; 4808 map->io_align = raid_stripe_len; 4809 map->io_width = raid_stripe_len; 4810 map->type = type; 4811 map->sub_stripes = sub_stripes; 4812 4813 num_bytes = stripe_size * data_stripes; 4814 4815 trace_btrfs_chunk_alloc(info, map, start, num_bytes); 4816 4817 em = alloc_extent_map(); 4818 if (!em) { 4819 kfree(map); 4820 ret = -ENOMEM; 4821 goto error; 4822 } 4823 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 4824 em->map_lookup = map; 4825 em->start = start; 4826 em->len = num_bytes; 4827 em->block_start = 0; 4828 em->block_len = em->len; 4829 em->orig_block_len = stripe_size; 4830 4831 em_tree = &info->mapping_tree.map_tree; 4832 write_lock(&em_tree->lock); 4833 ret = add_extent_mapping(em_tree, em, 0); 4834 if (!ret) { 4835 list_add_tail(&em->list, &trans->transaction->pending_chunks); 4836 atomic_inc(&em->refs); 4837 } 4838 write_unlock(&em_tree->lock); 4839 if (ret) { 4840 free_extent_map(em); 4841 goto error; 4842 } 4843 4844 ret = btrfs_make_block_group(trans, info, 0, type, 4845 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 4846 start, num_bytes); 4847 if (ret) 4848 goto error_del_extent; 4849 4850 for (i = 0; i < map->num_stripes; i++) { 4851 num_bytes = map->stripes[i].dev->bytes_used + stripe_size; 4852 btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); 4853 } 4854 4855 spin_lock(&info->free_chunk_lock); 4856 info->free_chunk_space -= (stripe_size * map->num_stripes); 4857 spin_unlock(&info->free_chunk_lock); 4858 4859 free_extent_map(em); 4860 check_raid56_incompat_flag(info, type); 4861 4862 kfree(devices_info); 4863 return 0; 4864 4865 
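/*
 * error_del_extent is reached only after the mapping was inserted into
 * the tree and queued on the transaction's pending_chunks list, so the
 * extent map still carries three references at this point.
 */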
error_del_extent: 4866 write_lock(&em_tree->lock); 4867 remove_extent_mapping(em_tree, em); 4868 write_unlock(&em_tree->lock); 4869 4870 /* One for our allocation */ 4871 free_extent_map(em); 4872 /* One for the tree reference */ 4873 free_extent_map(em); 4874 /* One for the pending_chunks list reference */ 4875 free_extent_map(em); 4876 error: 4877 kfree(devices_info); 4878 return ret; 4879 } 4880 4881 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, 4882 struct btrfs_fs_info *fs_info, 4883 u64 chunk_offset, u64 chunk_size) 4884 { 4885 struct btrfs_root *extent_root = fs_info->extent_root; 4886 struct btrfs_root *chunk_root = fs_info->chunk_root; 4887 struct btrfs_key key; 4888 struct btrfs_device *device; 4889 struct btrfs_chunk *chunk; 4890 struct btrfs_stripe *stripe; 4891 struct extent_map_tree *em_tree; 4892 struct extent_map *em; 4893 struct map_lookup *map; 4894 size_t item_size; 4895 u64 dev_offset; 4896 u64 stripe_size; 4897 int i = 0; 4898 int ret = 0; 4899 4900 em_tree = &fs_info->mapping_tree.map_tree; 4901 read_lock(&em_tree->lock); 4902 em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size); 4903 read_unlock(&em_tree->lock); 4904 4905 if (!em) { 4906 btrfs_crit(fs_info, "unable to find logical %Lu len %Lu", 4907 chunk_offset, chunk_size); 4908 return -EINVAL; 4909 } 4910 4911 if (em->start != chunk_offset || em->len != chunk_size) { 4912 btrfs_crit(fs_info, 4913 "found a bad mapping, wanted %Lu-%Lu, found %Lu-%Lu", 4914 chunk_offset, chunk_size, em->start, em->len); 4915 free_extent_map(em); 4916 return -EINVAL; 4917 } 4918 4919 map = em->map_lookup; 4920 item_size = btrfs_chunk_item_size(map->num_stripes); 4921 stripe_size = em->orig_block_len; 4922 4923 chunk = kzalloc(item_size, GFP_NOFS); 4924 if (!chunk) { 4925 ret = -ENOMEM; 4926 goto out; 4927 } 4928 4929 /* 4930 * Take the device list mutex to prevent races with the final phase of 4931 * a device replace operation that replaces the device object associated 4932 * with the map's stripes, because the device object's id can change 4933 * at any time during that final phase of the device replace operation 4934 * (dev-replace.c:btrfs_dev_replace_finishing()). 
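	 * Holding it across both loops below also keeps the devids written
	 * into the chunk item's stripe array consistent with the dev
	 * extents created just before.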
4935 */ 4936 mutex_lock(&fs_info->fs_devices->device_list_mutex); 4937 for (i = 0; i < map->num_stripes; i++) { 4938 device = map->stripes[i].dev; 4939 dev_offset = map->stripes[i].physical; 4940 4941 ret = btrfs_update_device(trans, device); 4942 if (ret) 4943 break; 4944 ret = btrfs_alloc_dev_extent(trans, device, 4945 chunk_root->root_key.objectid, 4946 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 4947 chunk_offset, dev_offset, 4948 stripe_size); 4949 if (ret) 4950 break; 4951 } 4952 if (ret) { 4953 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4954 goto out; 4955 } 4956 4957 stripe = &chunk->stripe; 4958 for (i = 0; i < map->num_stripes; i++) { 4959 device = map->stripes[i].dev; 4960 dev_offset = map->stripes[i].physical; 4961 4962 btrfs_set_stack_stripe_devid(stripe, device->devid); 4963 btrfs_set_stack_stripe_offset(stripe, dev_offset); 4964 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 4965 stripe++; 4966 } 4967 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4968 4969 btrfs_set_stack_chunk_length(chunk, chunk_size); 4970 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 4971 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 4972 btrfs_set_stack_chunk_type(chunk, map->type); 4973 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 4974 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 4975 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 4976 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 4977 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 4978 4979 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 4980 key.type = BTRFS_CHUNK_ITEM_KEY; 4981 key.offset = chunk_offset; 4982 4983 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 4984 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 4985 /* 4986 * TODO: Cleanup of inserted chunk root in case of 4987 * failure. 4988 */ 4989 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 4990 } 4991 4992 out: 4993 kfree(chunk); 4994 free_extent_map(em); 4995 return ret; 4996 } 4997 4998 /* 4999 * Chunk allocation falls into two parts. The first part does works 5000 * that make the new allocated chunk useable, but not do any operation 5001 * that modifies the chunk tree. The second part does the works that 5002 * require modifying the chunk tree. This division is important for the 5003 * bootstrap process of adding storage to a seed btrfs. 
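 * __btrfs_alloc_chunk() does the first part; the chunk tree changes are
 * made later in the same transaction by btrfs_finish_chunk_alloc().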
5004 */ 5005 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 5006 struct btrfs_fs_info *fs_info, u64 type) 5007 { 5008 u64 chunk_offset; 5009 5010 ASSERT(mutex_is_locked(&fs_info->chunk_mutex)); 5011 chunk_offset = find_next_chunk(fs_info); 5012 return __btrfs_alloc_chunk(trans, fs_info, chunk_offset, type); 5013 } 5014 5015 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 5016 struct btrfs_fs_info *fs_info, 5017 struct btrfs_device *device) 5018 { 5019 struct btrfs_root *extent_root = fs_info->extent_root; 5020 u64 chunk_offset; 5021 u64 sys_chunk_offset; 5022 u64 alloc_profile; 5023 int ret; 5024 5025 chunk_offset = find_next_chunk(fs_info); 5026 alloc_profile = btrfs_get_alloc_profile(extent_root, 0); 5027 ret = __btrfs_alloc_chunk(trans, fs_info, chunk_offset, alloc_profile); 5028 if (ret) 5029 return ret; 5030 5031 sys_chunk_offset = find_next_chunk(fs_info); 5032 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 5033 ret = __btrfs_alloc_chunk(trans, fs_info, sys_chunk_offset, 5034 alloc_profile); 5035 return ret; 5036 } 5037 5038 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5039 { 5040 int max_errors; 5041 5042 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5043 BTRFS_BLOCK_GROUP_RAID10 | 5044 BTRFS_BLOCK_GROUP_RAID5 | 5045 BTRFS_BLOCK_GROUP_DUP)) { 5046 max_errors = 1; 5047 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { 5048 max_errors = 2; 5049 } else { 5050 max_errors = 0; 5051 } 5052 5053 return max_errors; 5054 } 5055 5056 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5057 { 5058 struct extent_map *em; 5059 struct map_lookup *map; 5060 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 5061 int readonly = 0; 5062 int miss_ndevs = 0; 5063 int i; 5064 5065 read_lock(&map_tree->map_tree.lock); 5066 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 5067 read_unlock(&map_tree->map_tree.lock); 5068 if (!em) 5069 return 1; 5070 5071 map = em->map_lookup; 5072 for (i = 0; i < map->num_stripes; i++) { 5073 if (map->stripes[i].dev->missing) { 5074 miss_ndevs++; 5075 continue; 5076 } 5077 5078 if (!map->stripes[i].dev->writeable) { 5079 readonly = 1; 5080 goto end; 5081 } 5082 } 5083 5084 /* 5085 * If the number of missing devices is larger than max errors, 5086 * we can not write the data into that chunk successfully, so 5087 * set it readonly. 
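	 * (A present but non-writeable stripe device has already forced
	 * the chunk readonly in the loop above.)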
5088 */ 5089 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5090 readonly = 1; 5091 end: 5092 free_extent_map(em); 5093 return readonly; 5094 } 5095 5096 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 5097 { 5098 extent_map_tree_init(&tree->map_tree); 5099 } 5100 5101 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 5102 { 5103 struct extent_map *em; 5104 5105 while (1) { 5106 write_lock(&tree->map_tree.lock); 5107 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 5108 if (em) 5109 remove_extent_mapping(&tree->map_tree, em); 5110 write_unlock(&tree->map_tree.lock); 5111 if (!em) 5112 break; 5113 /* once for us */ 5114 free_extent_map(em); 5115 /* once for the tree */ 5116 free_extent_map(em); 5117 } 5118 } 5119 5120 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5121 { 5122 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 5123 struct extent_map *em; 5124 struct map_lookup *map; 5125 struct extent_map_tree *em_tree = &map_tree->map_tree; 5126 int ret; 5127 5128 read_lock(&em_tree->lock); 5129 em = lookup_extent_mapping(em_tree, logical, len); 5130 read_unlock(&em_tree->lock); 5131 5132 /* 5133 * We could return errors for these cases, but that could get ugly and 5134 * we'd probably do the same thing which is just not do anything else 5135 * and exit, so return 1 so the callers don't try to use other copies. 5136 */ 5137 if (!em) { 5138 btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical, 5139 logical+len); 5140 return 1; 5141 } 5142 5143 if (em->start > logical || em->start + em->len < logical) { 5144 btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got %Lu-%Lu", 5145 logical, logical+len, em->start, 5146 em->start + em->len); 5147 free_extent_map(em); 5148 return 1; 5149 } 5150 5151 map = em->map_lookup; 5152 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 5153 ret = map->num_stripes; 5154 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5155 ret = map->sub_stripes; 5156 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5157 ret = 2; 5158 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5159 ret = 3; 5160 else 5161 ret = 1; 5162 free_extent_map(em); 5163 5164 btrfs_dev_replace_lock(&fs_info->dev_replace, 0); 5165 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) 5166 ret++; 5167 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); 5168 5169 return ret; 5170 } 5171 5172 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 5173 struct btrfs_mapping_tree *map_tree, 5174 u64 logical) 5175 { 5176 struct extent_map *em; 5177 struct map_lookup *map; 5178 struct extent_map_tree *em_tree = &map_tree->map_tree; 5179 unsigned long len = fs_info->sectorsize; 5180 5181 read_lock(&em_tree->lock); 5182 em = lookup_extent_mapping(em_tree, logical, len); 5183 read_unlock(&em_tree->lock); 5184 BUG_ON(!em); 5185 5186 BUG_ON(em->start > logical || em->start + em->len < logical); 5187 map = em->map_lookup; 5188 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5189 len = map->stripe_len * nr_data_stripes(map); 5190 free_extent_map(em); 5191 return len; 5192 } 5193 5194 int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, 5195 u64 logical, u64 len, int mirror_num) 5196 { 5197 struct extent_map *em; 5198 struct map_lookup *map; 5199 struct extent_map_tree *em_tree = &map_tree->map_tree; 5200 int ret = 0; 5201 5202 read_lock(&em_tree->lock); 5203 em = lookup_extent_mapping(em_tree, logical, len); 5204 read_unlock(&em_tree->lock); 5205 BUG_ON(!em); 5206 5207 BUG_ON(em->start > logical || em->start + 
em->len < logical); 5208 map = em->map_lookup; 5209 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5210 ret = 1; 5211 free_extent_map(em); 5212 return ret; 5213 } 5214 5215 static int find_live_mirror(struct btrfs_fs_info *fs_info, 5216 struct map_lookup *map, int first, int num, 5217 int optimal, int dev_replace_is_ongoing) 5218 { 5219 int i; 5220 int tolerance; 5221 struct btrfs_device *srcdev; 5222 5223 if (dev_replace_is_ongoing && 5224 fs_info->dev_replace.cont_reading_from_srcdev_mode == 5225 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 5226 srcdev = fs_info->dev_replace.srcdev; 5227 else 5228 srcdev = NULL; 5229 5230 /* 5231 * try to avoid the drive that is the source drive for a 5232 * dev-replace procedure, only choose it if no other non-missing 5233 * mirror is available 5234 */ 5235 for (tolerance = 0; tolerance < 2; tolerance++) { 5236 if (map->stripes[optimal].dev->bdev && 5237 (tolerance || map->stripes[optimal].dev != srcdev)) 5238 return optimal; 5239 for (i = first; i < first + num; i++) { 5240 if (map->stripes[i].dev->bdev && 5241 (tolerance || map->stripes[i].dev != srcdev)) 5242 return i; 5243 } 5244 } 5245 5246 /* we couldn't find one that doesn't fail. Just return something 5247 * and the io error handling code will clean up eventually 5248 */ 5249 return optimal; 5250 } 5251 5252 static inline int parity_smaller(u64 a, u64 b) 5253 { 5254 return a > b; 5255 } 5256 5257 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 5258 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) 5259 { 5260 struct btrfs_bio_stripe s; 5261 int i; 5262 u64 l; 5263 int again = 1; 5264 5265 while (again) { 5266 again = 0; 5267 for (i = 0; i < num_stripes - 1; i++) { 5268 if (parity_smaller(bbio->raid_map[i], 5269 bbio->raid_map[i+1])) { 5270 s = bbio->stripes[i]; 5271 l = bbio->raid_map[i]; 5272 bbio->stripes[i] = bbio->stripes[i+1]; 5273 bbio->raid_map[i] = bbio->raid_map[i+1]; 5274 bbio->stripes[i+1] = s; 5275 bbio->raid_map[i+1] = l; 5276 5277 again = 1; 5278 } 5279 } 5280 } 5281 } 5282 5283 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) 5284 { 5285 struct btrfs_bio *bbio = kzalloc( 5286 /* the size of the btrfs_bio */ 5287 sizeof(struct btrfs_bio) + 5288 /* plus the variable array for the stripes */ 5289 sizeof(struct btrfs_bio_stripe) * (total_stripes) + 5290 /* plus the variable array for the tgt dev */ 5291 sizeof(int) * (real_stripes) + 5292 /* 5293 * plus the raid_map, which includes both the tgt dev 5294 * and the stripes 5295 */ 5296 sizeof(u64) * (total_stripes), 5297 GFP_NOFS|__GFP_NOFAIL); 5298 5299 atomic_set(&bbio->error, 0); 5300 atomic_set(&bbio->refs, 1); 5301 5302 return bbio; 5303 } 5304 5305 void btrfs_get_bbio(struct btrfs_bio *bbio) 5306 { 5307 WARN_ON(!atomic_read(&bbio->refs)); 5308 atomic_inc(&bbio->refs); 5309 } 5310 5311 void btrfs_put_bbio(struct btrfs_bio *bbio) 5312 { 5313 if (!bbio) 5314 return; 5315 if (atomic_dec_and_test(&bbio->refs)) 5316 kfree(bbio); 5317 } 5318 5319 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 5320 enum btrfs_map_op op, 5321 u64 logical, u64 *length, 5322 struct btrfs_bio **bbio_ret, 5323 int mirror_num, int need_raid_map) 5324 { 5325 struct extent_map *em; 5326 struct map_lookup *map; 5327 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 5328 struct extent_map_tree *em_tree = &map_tree->map_tree; 5329 u64 offset; 5330 u64 stripe_offset; 5331 u64 stripe_end_offset; 5332 u64 stripe_nr; 5333 u64 stripe_nr_orig; 5334 u64 
stripe_nr_end; 5335 u64 stripe_len; 5336 u32 stripe_index; 5337 int i; 5338 int ret = 0; 5339 int num_stripes; 5340 int max_errors = 0; 5341 int tgtdev_indexes = 0; 5342 struct btrfs_bio *bbio = NULL; 5343 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 5344 int dev_replace_is_ongoing = 0; 5345 int num_alloc_stripes; 5346 int patch_the_first_stripe_for_dev_replace = 0; 5347 u64 physical_to_patch_in_first_stripe = 0; 5348 u64 raid56_full_stripe_start = (u64)-1; 5349 5350 read_lock(&em_tree->lock); 5351 em = lookup_extent_mapping(em_tree, logical, *length); 5352 read_unlock(&em_tree->lock); 5353 5354 if (!em) { 5355 btrfs_crit(fs_info, "unable to find logical %llu len %llu", 5356 logical, *length); 5357 return -EINVAL; 5358 } 5359 5360 if (em->start > logical || em->start + em->len < logical) { 5361 btrfs_crit(fs_info, 5362 "found a bad mapping, wanted %Lu, found %Lu-%Lu", 5363 logical, em->start, em->start + em->len); 5364 free_extent_map(em); 5365 return -EINVAL; 5366 } 5367 5368 map = em->map_lookup; 5369 offset = logical - em->start; 5370 5371 stripe_len = map->stripe_len; 5372 stripe_nr = offset; 5373 /* 5374 * stripe_nr counts the total number of stripes we have to stride 5375 * to get to this block 5376 */ 5377 stripe_nr = div64_u64(stripe_nr, stripe_len); 5378 5379 stripe_offset = stripe_nr * stripe_len; 5380 if (offset < stripe_offset) { 5381 btrfs_crit(fs_info, 5382 "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu", 5383 stripe_offset, offset, em->start, logical, 5384 stripe_len); 5385 free_extent_map(em); 5386 return -EINVAL; 5387 } 5388 5389 /* stripe_offset is the offset of this block in its stripe*/ 5390 stripe_offset = offset - stripe_offset; 5391 5392 /* if we're here for raid56, we need to know the stripe aligned start */ 5393 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5394 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 5395 raid56_full_stripe_start = offset; 5396 5397 /* allow a write of a full stripe, but make sure we don't 5398 * allow straddling of stripes 5399 */ 5400 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 5401 full_stripe_len); 5402 raid56_full_stripe_start *= full_stripe_len; 5403 } 5404 5405 if (op == BTRFS_MAP_DISCARD) { 5406 /* we don't discard raid56 yet */ 5407 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5408 ret = -EOPNOTSUPP; 5409 goto out; 5410 } 5411 *length = min_t(u64, em->len - offset, *length); 5412 } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 5413 u64 max_len; 5414 /* For writes to RAID[56], allow a full stripeset across all disks. 5415 For other RAID types and for RAID[56] reads, just allow a single 5416 stripe (on a single disk). 
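	   E.g. with the default 64K stripe_len on a three-device RAID5
	   chunk (two data stripes), a write aligned to the full stripe
	   start may span up to 128K, while reads are trimmed to what is
	   left of the current 64K stripe.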
*/ 5417 if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 5418 (op == BTRFS_MAP_WRITE)) { 5419 max_len = stripe_len * nr_data_stripes(map) - 5420 (offset - raid56_full_stripe_start); 5421 } else { 5422 /* we limit the length of each bio to what fits in a stripe */ 5423 max_len = stripe_len - stripe_offset; 5424 } 5425 *length = min_t(u64, em->len - offset, max_len); 5426 } else { 5427 *length = em->len - offset; 5428 } 5429 5430 /* This is for when we're called from btrfs_merge_bio_hook() and all 5431 it cares about is the length */ 5432 if (!bbio_ret) 5433 goto out; 5434 5435 btrfs_dev_replace_lock(dev_replace, 0); 5436 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 5437 if (!dev_replace_is_ongoing) 5438 btrfs_dev_replace_unlock(dev_replace, 0); 5439 else 5440 btrfs_dev_replace_set_lock_blocking(dev_replace); 5441 5442 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 5443 op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && 5444 op != BTRFS_MAP_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) { 5445 /* 5446 * in dev-replace case, for repair case (that's the only 5447 * case where the mirror is selected explicitly when 5448 * calling btrfs_map_block), blocks left of the left cursor 5449 * can also be read from the target drive. 5450 * For REQ_GET_READ_MIRRORS, the target drive is added as 5451 * the last one to the array of stripes. For READ, it also 5452 * needs to be supported using the same mirror number. 5453 * If the requested block is not left of the left cursor, 5454 * EIO is returned. This can happen because btrfs_num_copies() 5455 * returns one more in the dev-replace case. 5456 */ 5457 u64 tmp_length = *length; 5458 struct btrfs_bio *tmp_bbio = NULL; 5459 int tmp_num_stripes; 5460 u64 srcdev_devid = dev_replace->srcdev->devid; 5461 int index_srcdev = 0; 5462 int found = 0; 5463 u64 physical_of_found = 0; 5464 5465 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 5466 logical, &tmp_length, &tmp_bbio, 0, 0); 5467 if (ret) { 5468 WARN_ON(tmp_bbio != NULL); 5469 goto out; 5470 } 5471 5472 tmp_num_stripes = tmp_bbio->num_stripes; 5473 if (mirror_num > tmp_num_stripes) { 5474 /* 5475 * BTRFS_MAP_GET_READ_MIRRORS does not contain this 5476 * mirror, that means that the requested area 5477 * is not left of the left cursor 5478 */ 5479 ret = -EIO; 5480 btrfs_put_bbio(tmp_bbio); 5481 goto out; 5482 } 5483 5484 /* 5485 * process the rest of the function using the mirror_num 5486 * of the source drive. Therefore look it up first. 5487 * At the end, patch the device pointer to the one of the 5488 * target drive. 
5489 */ 5490 for (i = 0; i < tmp_num_stripes; i++) { 5491 if (tmp_bbio->stripes[i].dev->devid != srcdev_devid) 5492 continue; 5493 5494 /* 5495 * In case of DUP, in order to keep it simple, only add 5496 * the mirror with the lowest physical address 5497 */ 5498 if (found && 5499 physical_of_found <= tmp_bbio->stripes[i].physical) 5500 continue; 5501 5502 index_srcdev = i; 5503 found = 1; 5504 physical_of_found = tmp_bbio->stripes[i].physical; 5505 } 5506 5507 btrfs_put_bbio(tmp_bbio); 5508 5509 if (!found) { 5510 WARN_ON(1); 5511 ret = -EIO; 5512 goto out; 5513 } 5514 5515 mirror_num = index_srcdev + 1; 5516 patch_the_first_stripe_for_dev_replace = 1; 5517 physical_to_patch_in_first_stripe = physical_of_found; 5518 } else if (mirror_num > map->num_stripes) { 5519 mirror_num = 0; 5520 } 5521 5522 num_stripes = 1; 5523 stripe_index = 0; 5524 stripe_nr_orig = stripe_nr; 5525 stripe_nr_end = ALIGN(offset + *length, map->stripe_len); 5526 stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len); 5527 stripe_end_offset = stripe_nr_end * map->stripe_len - 5528 (offset + *length); 5529 5530 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5531 if (op == BTRFS_MAP_DISCARD) 5532 num_stripes = min_t(u64, map->num_stripes, 5533 stripe_nr_end - stripe_nr_orig); 5534 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5535 &stripe_index); 5536 if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && 5537 op != BTRFS_MAP_GET_READ_MIRRORS) 5538 mirror_num = 1; 5539 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 5540 if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD || 5541 op == BTRFS_MAP_GET_READ_MIRRORS) 5542 num_stripes = map->num_stripes; 5543 else if (mirror_num) 5544 stripe_index = mirror_num - 1; 5545 else { 5546 stripe_index = find_live_mirror(fs_info, map, 0, 5547 map->num_stripes, 5548 current->pid % map->num_stripes, 5549 dev_replace_is_ongoing); 5550 mirror_num = stripe_index + 1; 5551 } 5552 5553 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 5554 if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD || 5555 op == BTRFS_MAP_GET_READ_MIRRORS) { 5556 num_stripes = map->num_stripes; 5557 } else if (mirror_num) { 5558 stripe_index = mirror_num - 1; 5559 } else { 5560 mirror_num = 1; 5561 } 5562 5563 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5564 u32 factor = map->num_stripes / map->sub_stripes; 5565 5566 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5567 stripe_index *= map->sub_stripes; 5568 5569 if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) 5570 num_stripes = map->sub_stripes; 5571 else if (op == BTRFS_MAP_DISCARD) 5572 num_stripes = min_t(u64, map->sub_stripes * 5573 (stripe_nr_end - stripe_nr_orig), 5574 map->num_stripes); 5575 else if (mirror_num) 5576 stripe_index += mirror_num - 1; 5577 else { 5578 int old_stripe_index = stripe_index; 5579 stripe_index = find_live_mirror(fs_info, map, 5580 stripe_index, 5581 map->sub_stripes, stripe_index + 5582 current->pid % map->sub_stripes, 5583 dev_replace_is_ongoing); 5584 mirror_num = stripe_index - old_stripe_index + 1; 5585 } 5586 5587 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5588 if (need_raid_map && 5589 (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS || 5590 mirror_num > 1)) { 5591 /* push stripe_nr back to the start of the full stripe */ 5592 stripe_nr = div_u64(raid56_full_stripe_start, 5593 stripe_len * nr_data_stripes(map)); 5594 5595 /* RAID[56] write or recovery. 
Return all stripes */ 5596 num_stripes = map->num_stripes; 5597 max_errors = nr_parity_stripes(map); 5598 5599 *length = map->stripe_len; 5600 stripe_index = 0; 5601 stripe_offset = 0; 5602 } else { 5603 /* 5604 * Mirror #0 or #1 means the original data block. 5605 * Mirror #2 is RAID5 parity block. 5606 * Mirror #3 is RAID6 Q block. 5607 */ 5608 stripe_nr = div_u64_rem(stripe_nr, 5609 nr_data_stripes(map), &stripe_index); 5610 if (mirror_num > 1) 5611 stripe_index = nr_data_stripes(map) + 5612 mirror_num - 2; 5613 5614 /* We distribute the parity blocks across stripes */ 5615 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 5616 &stripe_index); 5617 if ((op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && 5618 op != BTRFS_MAP_GET_READ_MIRRORS) && mirror_num <= 1) 5619 mirror_num = 1; 5620 } 5621 } else { 5622 /* 5623 * after this, stripe_nr is the number of stripes on this 5624 * device we have to walk to find the data, and stripe_index is 5625 * the number of our device in the stripe array 5626 */ 5627 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5628 &stripe_index); 5629 mirror_num = stripe_index + 1; 5630 } 5631 if (stripe_index >= map->num_stripes) { 5632 btrfs_crit(fs_info, 5633 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 5634 stripe_index, map->num_stripes); 5635 ret = -EINVAL; 5636 goto out; 5637 } 5638 5639 num_alloc_stripes = num_stripes; 5640 if (dev_replace_is_ongoing) { 5641 if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) 5642 num_alloc_stripes <<= 1; 5643 if (op == BTRFS_MAP_GET_READ_MIRRORS) 5644 num_alloc_stripes++; 5645 tgtdev_indexes = num_stripes; 5646 } 5647 5648 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); 5649 if (!bbio) { 5650 ret = -ENOMEM; 5651 goto out; 5652 } 5653 if (dev_replace_is_ongoing) 5654 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); 5655 5656 /* build raid_map */ 5657 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && 5658 need_raid_map && 5659 ((op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) || 5660 mirror_num > 1)) { 5661 u64 tmp; 5662 unsigned rot; 5663 5664 bbio->raid_map = (u64 *)((void *)bbio->stripes + 5665 sizeof(struct btrfs_bio_stripe) * 5666 num_alloc_stripes + 5667 sizeof(int) * tgtdev_indexes); 5668 5669 /* Work out the disk rotation on this stripe-set */ 5670 div_u64_rem(stripe_nr, num_stripes, &rot); 5671 5672 /* Fill in the logical address of each stripe */ 5673 tmp = stripe_nr * nr_data_stripes(map); 5674 for (i = 0; i < nr_data_stripes(map); i++) 5675 bbio->raid_map[(i+rot) % num_stripes] = 5676 em->start + (tmp + i) * map->stripe_len; 5677 5678 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 5679 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5680 bbio->raid_map[(i+rot+1) % num_stripes] = 5681 RAID6_Q_STRIPE; 5682 } 5683 5684 if (op == BTRFS_MAP_DISCARD) { 5685 u32 factor = 0; 5686 u32 sub_stripes = 0; 5687 u64 stripes_per_dev = 0; 5688 u32 remaining_stripes = 0; 5689 u32 last_stripe = 0; 5690 5691 if (map->type & 5692 (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { 5693 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5694 sub_stripes = 1; 5695 else 5696 sub_stripes = map->sub_stripes; 5697 5698 factor = map->num_stripes / sub_stripes; 5699 stripes_per_dev = div_u64_rem(stripe_nr_end - 5700 stripe_nr_orig, 5701 factor, 5702 &remaining_stripes); 5703 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 5704 last_stripe *= sub_stripes; 5705 } 5706 5707 for (i = 0; i < num_stripes; i++) { 5708 bbio->stripes[i].physical = 5709 
map->stripes[stripe_index].physical + 5710 stripe_offset + stripe_nr * map->stripe_len; 5711 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 5712 5713 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5714 BTRFS_BLOCK_GROUP_RAID10)) { 5715 bbio->stripes[i].length = stripes_per_dev * 5716 map->stripe_len; 5717 5718 if (i / sub_stripes < remaining_stripes) 5719 bbio->stripes[i].length += 5720 map->stripe_len; 5721 5722 /* 5723 * Special for the first stripe and 5724 * the last stripe: 5725 * 5726 * |-------|...|-------| 5727 * |----------| 5728 * off end_off 5729 */ 5730 if (i < sub_stripes) 5731 bbio->stripes[i].length -= 5732 stripe_offset; 5733 5734 if (stripe_index >= last_stripe && 5735 stripe_index <= (last_stripe + 5736 sub_stripes - 1)) 5737 bbio->stripes[i].length -= 5738 stripe_end_offset; 5739 5740 if (i == sub_stripes - 1) 5741 stripe_offset = 0; 5742 } else 5743 bbio->stripes[i].length = *length; 5744 5745 stripe_index++; 5746 if (stripe_index == map->num_stripes) { 5747 /* This could only happen for RAID0/10 */ 5748 stripe_index = 0; 5749 stripe_nr++; 5750 } 5751 } 5752 } else { 5753 for (i = 0; i < num_stripes; i++) { 5754 bbio->stripes[i].physical = 5755 map->stripes[stripe_index].physical + 5756 stripe_offset + 5757 stripe_nr * map->stripe_len; 5758 bbio->stripes[i].dev = 5759 map->stripes[stripe_index].dev; 5760 stripe_index++; 5761 } 5762 } 5763 5764 if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) 5765 max_errors = btrfs_chunk_max_errors(map); 5766 5767 if (bbio->raid_map) 5768 sort_parity_stripes(bbio, num_stripes); 5769 5770 tgtdev_indexes = 0; 5771 if (dev_replace_is_ongoing && 5772 (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) && 5773 dev_replace->tgtdev != NULL) { 5774 int index_where_to_add; 5775 u64 srcdev_devid = dev_replace->srcdev->devid; 5776 5777 /* 5778 * duplicate the write operations while the dev replace 5779 * procedure is running. Since the copying of the old disk 5780 * to the new disk takes place at run time while the 5781 * filesystem is mounted writable, the regular write 5782 * operations to the old disk have to be duplicated to go 5783 * to the new disk as well. 5784 * Note that device->missing is handled by the caller, and 5785 * that the write to the old disk is already set up in the 5786 * stripes array. 5787 */ 5788 index_where_to_add = num_stripes; 5789 for (i = 0; i < num_stripes; i++) { 5790 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5791 /* write to new disk, too */ 5792 struct btrfs_bio_stripe *new = 5793 bbio->stripes + index_where_to_add; 5794 struct btrfs_bio_stripe *old = 5795 bbio->stripes + i; 5796 5797 new->physical = old->physical; 5798 new->length = old->length; 5799 new->dev = dev_replace->tgtdev; 5800 bbio->tgtdev_map[i] = index_where_to_add; 5801 index_where_to_add++; 5802 max_errors++; 5803 tgtdev_indexes++; 5804 } 5805 } 5806 num_stripes = index_where_to_add; 5807 } else if (dev_replace_is_ongoing && 5808 op == BTRFS_MAP_GET_READ_MIRRORS && 5809 dev_replace->tgtdev != NULL) { 5810 u64 srcdev_devid = dev_replace->srcdev->devid; 5811 int index_srcdev = 0; 5812 int found = 0; 5813 u64 physical_of_found = 0; 5814 5815 /* 5816 * During the dev-replace procedure, the target drive can 5817 * also be used to read data in case it is needed to repair 5818 * a corrupt block elsewhere. This is possible if the 5819 * requested area is left of the left cursor. In this area, 5820 * the target drive is a full copy of the source drive. 
5821 */ 5822 for (i = 0; i < num_stripes; i++) { 5823 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5824 /* 5825 * In case of DUP, in order to keep it 5826 * simple, only add the mirror with the 5827 * lowest physical address 5828 */ 5829 if (found && 5830 physical_of_found <= 5831 bbio->stripes[i].physical) 5832 continue; 5833 index_srcdev = i; 5834 found = 1; 5835 physical_of_found = bbio->stripes[i].physical; 5836 } 5837 } 5838 if (found) { 5839 struct btrfs_bio_stripe *tgtdev_stripe = 5840 bbio->stripes + num_stripes; 5841 5842 tgtdev_stripe->physical = physical_of_found; 5843 tgtdev_stripe->length = 5844 bbio->stripes[index_srcdev].length; 5845 tgtdev_stripe->dev = dev_replace->tgtdev; 5846 bbio->tgtdev_map[index_srcdev] = num_stripes; 5847 5848 tgtdev_indexes++; 5849 num_stripes++; 5850 } 5851 } 5852 5853 *bbio_ret = bbio; 5854 bbio->map_type = map->type; 5855 bbio->num_stripes = num_stripes; 5856 bbio->max_errors = max_errors; 5857 bbio->mirror_num = mirror_num; 5858 bbio->num_tgtdevs = tgtdev_indexes; 5859 5860 /* 5861 * this is the case that REQ_READ && dev_replace_is_ongoing && 5862 * mirror_num == num_stripes + 1 && dev_replace target drive is 5863 * available as a mirror 5864 */ 5865 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 5866 WARN_ON(num_stripes > 1); 5867 bbio->stripes[0].dev = dev_replace->tgtdev; 5868 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 5869 bbio->mirror_num = map->num_stripes + 1; 5870 } 5871 out: 5872 if (dev_replace_is_ongoing) { 5873 btrfs_dev_replace_clear_lock_blocking(dev_replace); 5874 btrfs_dev_replace_unlock(dev_replace, 0); 5875 } 5876 free_extent_map(em); 5877 return ret; 5878 } 5879 5880 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 5881 u64 logical, u64 *length, 5882 struct btrfs_bio **bbio_ret, int mirror_num) 5883 { 5884 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 5885 mirror_num, 0); 5886 } 5887 5888 /* For Scrub/replace */ 5889 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 5890 u64 logical, u64 *length, 5891 struct btrfs_bio **bbio_ret, int mirror_num, 5892 int need_raid_map) 5893 { 5894 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 5895 mirror_num, need_raid_map); 5896 } 5897 5898 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, 5899 u64 chunk_start, u64 physical, u64 devid, 5900 u64 **logical, int *naddrs, int *stripe_len) 5901 { 5902 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 5903 struct extent_map_tree *em_tree = &map_tree->map_tree; 5904 struct extent_map *em; 5905 struct map_lookup *map; 5906 u64 *buf; 5907 u64 bytenr; 5908 u64 length; 5909 u64 stripe_nr; 5910 u64 rmap_len; 5911 int i, j, nr = 0; 5912 5913 read_lock(&em_tree->lock); 5914 em = lookup_extent_mapping(em_tree, chunk_start, 1); 5915 read_unlock(&em_tree->lock); 5916 5917 if (!em) { 5918 btrfs_err(fs_info, "couldn't find em for chunk %Lu", 5919 chunk_start); 5920 return -EIO; 5921 } 5922 5923 if (em->start != chunk_start) { 5924 btrfs_err(fs_info, "bad chunk start, em=%Lu, wanted=%Lu", 5925 em->start, chunk_start); 5926 free_extent_map(em); 5927 return -EIO; 5928 } 5929 map = em->map_lookup; 5930 5931 length = em->len; 5932 rmap_len = map->stripe_len; 5933 5934 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5935 length = div_u64(length, map->num_stripes / map->sub_stripes); 5936 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5937 length = div_u64(length, map->num_stripes); 5938 else if (map->type & 
BTRFS_BLOCK_GROUP_RAID56_MASK) { 5939 length = div_u64(length, nr_data_stripes(map)); 5940 rmap_len = map->stripe_len * nr_data_stripes(map); 5941 } 5942 5943 buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); 5944 BUG_ON(!buf); /* -ENOMEM */ 5945 5946 for (i = 0; i < map->num_stripes; i++) { 5947 if (devid && map->stripes[i].dev->devid != devid) 5948 continue; 5949 if (map->stripes[i].physical > physical || 5950 map->stripes[i].physical + length <= physical) 5951 continue; 5952 5953 stripe_nr = physical - map->stripes[i].physical; 5954 stripe_nr = div_u64(stripe_nr, map->stripe_len); 5955 5956 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5957 stripe_nr = stripe_nr * map->num_stripes + i; 5958 stripe_nr = div_u64(stripe_nr, map->sub_stripes); 5959 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5960 stripe_nr = stripe_nr * map->num_stripes + i; 5961 } /* else if RAID[56], multiply by nr_data_stripes(). 5962 * Alternatively, just use rmap_len below instead of 5963 * map->stripe_len */ 5964 5965 bytenr = chunk_start + stripe_nr * rmap_len; 5966 WARN_ON(nr >= map->num_stripes); 5967 for (j = 0; j < nr; j++) { 5968 if (buf[j] == bytenr) 5969 break; 5970 } 5971 if (j == nr) { 5972 WARN_ON(nr >= map->num_stripes); 5973 buf[nr++] = bytenr; 5974 } 5975 } 5976 5977 *logical = buf; 5978 *naddrs = nr; 5979 *stripe_len = rmap_len; 5980 5981 free_extent_map(em); 5982 return 0; 5983 } 5984 5985 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) 5986 { 5987 bio->bi_private = bbio->private; 5988 bio->bi_end_io = bbio->end_io; 5989 bio_endio(bio); 5990 5991 btrfs_put_bbio(bbio); 5992 } 5993 5994 static void btrfs_end_bio(struct bio *bio) 5995 { 5996 struct btrfs_bio *bbio = bio->bi_private; 5997 int is_orig_bio = 0; 5998 5999 if (bio->bi_error) { 6000 atomic_inc(&bbio->error); 6001 if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) { 6002 unsigned int stripe_index = 6003 btrfs_io_bio(bio)->stripe_index; 6004 struct btrfs_device *dev; 6005 6006 BUG_ON(stripe_index >= bbio->num_stripes); 6007 dev = bbio->stripes[stripe_index].dev; 6008 if (dev->bdev) { 6009 if (bio_op(bio) == REQ_OP_WRITE) 6010 btrfs_dev_stat_inc(dev, 6011 BTRFS_DEV_STAT_WRITE_ERRS); 6012 else 6013 btrfs_dev_stat_inc(dev, 6014 BTRFS_DEV_STAT_READ_ERRS); 6015 if (bio->bi_opf & REQ_PREFLUSH) 6016 btrfs_dev_stat_inc(dev, 6017 BTRFS_DEV_STAT_FLUSH_ERRS); 6018 btrfs_dev_stat_print_on_error(dev); 6019 } 6020 } 6021 } 6022 6023 if (bio == bbio->orig_bio) 6024 is_orig_bio = 1; 6025 6026 btrfs_bio_counter_dec(bbio->fs_info); 6027 6028 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6029 if (!is_orig_bio) { 6030 bio_put(bio); 6031 bio = bbio->orig_bio; 6032 } 6033 6034 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6035 /* only send an error to the higher layers if it is 6036 * beyond the tolerance of the btrfs bio 6037 */ 6038 if (atomic_read(&bbio->error) > bbio->max_errors) { 6039 bio->bi_error = -EIO; 6040 } else { 6041 /* 6042 * this bio is actually up to date, we didn't 6043 * go over the max number of errors 6044 */ 6045 bio->bi_error = 0; 6046 } 6047 6048 btrfs_end_bbio(bbio, bio); 6049 } else if (!is_orig_bio) { 6050 bio_put(bio); 6051 } 6052 } 6053 6054 /* 6055 * see run_scheduled_bios for a description of why bios are collected for 6056 * async submit. 6057 * 6058 * This will add one bio to the pending list for a device and make sure 6059 * the work struct is scheduled. 
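* Reads skip the queue and are submitted synchronously below; writes are appended to the per-device pending_bios (or pending_sync_bios) list and later drained by the device's work item on the submit_workers queue.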
6060 */ 6061 static noinline void btrfs_schedule_bio(struct btrfs_device *device, 6062 struct bio *bio) 6063 { 6064 struct btrfs_fs_info *fs_info = device->fs_info; 6065 int should_queue = 1; 6066 struct btrfs_pending_bios *pending_bios; 6067 6068 if (device->missing || !device->bdev) { 6069 bio_io_error(bio); 6070 return; 6071 } 6072 6073 /* don't bother with additional async steps for reads, right now */ 6074 if (bio_op(bio) == REQ_OP_READ) { 6075 bio_get(bio); 6076 btrfsic_submit_bio(bio); 6077 bio_put(bio); 6078 return; 6079 } 6080 6081 /* 6082 * nr_async_bios allows us to reliably return congestion to the 6083 * higher layers. Otherwise, the async bio makes it appear we have 6084 * made progress against dirty pages when we've really just put it 6085 * on a queue for later 6086 */ 6087 atomic_inc(&fs_info->nr_async_bios); 6088 WARN_ON(bio->bi_next); 6089 bio->bi_next = NULL; 6090 6091 spin_lock(&device->io_lock); 6092 if (op_is_sync(bio->bi_opf)) 6093 pending_bios = &device->pending_sync_bios; 6094 else 6095 pending_bios = &device->pending_bios; 6096 6097 if (pending_bios->tail) 6098 pending_bios->tail->bi_next = bio; 6099 6100 pending_bios->tail = bio; 6101 if (!pending_bios->head) 6102 pending_bios->head = bio; 6103 if (device->running_pending) 6104 should_queue = 0; 6105 6106 spin_unlock(&device->io_lock); 6107 6108 if (should_queue) 6109 btrfs_queue_work(fs_info->submit_workers, &device->work); 6110 } 6111 6112 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, 6113 u64 physical, int dev_nr, int async) 6114 { 6115 struct btrfs_device *dev = bbio->stripes[dev_nr].dev; 6116 struct btrfs_fs_info *fs_info = bbio->fs_info; 6117 6118 bio->bi_private = bbio; 6119 btrfs_io_bio(bio)->stripe_index = dev_nr; 6120 bio->bi_end_io = btrfs_end_bio; 6121 bio->bi_iter.bi_sector = physical >> 9; 6122 #ifdef DEBUG 6123 { 6124 struct rcu_string *name; 6125 6126 rcu_read_lock(); 6127 name = rcu_dereference(dev->name); 6128 btrfs_debug(fs_info, 6129 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6130 bio_op(bio), bio->bi_opf, 6131 (u64)bio->bi_iter.bi_sector, 6132 (u_long)dev->bdev->bd_dev, name->str, dev->devid, 6133 bio->bi_iter.bi_size); 6134 rcu_read_unlock(); 6135 } 6136 #endif 6137 bio->bi_bdev = dev->bdev; 6138 6139 btrfs_bio_counter_inc_noblocked(fs_info); 6140 6141 if (async) 6142 btrfs_schedule_bio(dev, bio); 6143 else 6144 btrfsic_submit_bio(bio); 6145 } 6146 6147 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 6148 { 6149 atomic_inc(&bbio->error); 6150 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6151 /* Should be the original bio. 
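bbio_error() is only ever handed first_bio, which btrfs_map_bio recorded as bbio->orig_bio.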
*/ 6152 WARN_ON(bio != bbio->orig_bio); 6153 6154 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6155 bio->bi_iter.bi_sector = logical >> 9; 6156 bio->bi_error = -EIO; 6157 btrfs_end_bbio(bbio, bio); 6158 } 6159 } 6160 6161 int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 6162 int mirror_num, int async_submit) 6163 { 6164 struct btrfs_device *dev; 6165 struct bio *first_bio = bio; 6166 u64 logical = (u64)bio->bi_iter.bi_sector << 9; 6167 u64 length = 0; 6168 u64 map_length; 6169 int ret; 6170 int dev_nr; 6171 int total_devs; 6172 struct btrfs_bio *bbio = NULL; 6173 6174 length = bio->bi_iter.bi_size; 6175 map_length = length; 6176 6177 btrfs_bio_counter_inc_blocked(fs_info); 6178 ret = __btrfs_map_block(fs_info, bio_op(bio), logical, 6179 &map_length, &bbio, mirror_num, 1); 6180 if (ret) { 6181 btrfs_bio_counter_dec(fs_info); 6182 return ret; 6183 } 6184 6185 total_devs = bbio->num_stripes; 6186 bbio->orig_bio = first_bio; 6187 bbio->private = first_bio->bi_private; 6188 bbio->end_io = first_bio->bi_end_io; 6189 bbio->fs_info = fs_info; 6190 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 6191 6192 if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 6193 ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { 6194 /* In this case, map_length has been set to the length of 6195 a single stripe; not the whole write */ 6196 if (bio_op(bio) == REQ_OP_WRITE) { 6197 ret = raid56_parity_write(fs_info, bio, bbio, 6198 map_length); 6199 } else { 6200 ret = raid56_parity_recover(fs_info, bio, bbio, 6201 map_length, mirror_num, 1); 6202 } 6203 6204 btrfs_bio_counter_dec(fs_info); 6205 return ret; 6206 } 6207 6208 if (map_length < length) { 6209 btrfs_crit(fs_info, 6210 "mapping failed logical %llu bio len %llu len %llu", 6211 logical, length, map_length); 6212 BUG(); 6213 } 6214 6215 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { 6216 dev = bbio->stripes[dev_nr].dev; 6217 if (!dev || !dev->bdev || 6218 (bio_op(bio) == REQ_OP_WRITE && !dev->writeable)) { 6219 bbio_error(bbio, first_bio, logical); 6220 continue; 6221 } 6222 6223 if (dev_nr < total_devs - 1) { 6224 bio = btrfs_bio_clone(first_bio, GFP_NOFS); 6225 BUG_ON(!bio); /* -ENOMEM */ 6226 } else 6227 bio = first_bio; 6228 6229 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, 6230 dev_nr, async_submit); 6231 } 6232 btrfs_bio_counter_dec(fs_info); 6233 return 0; 6234 } 6235 6236 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 6237 u8 *uuid, u8 *fsid) 6238 { 6239 struct btrfs_device *device; 6240 struct btrfs_fs_devices *cur_devices; 6241 6242 cur_devices = fs_info->fs_devices; 6243 while (cur_devices) { 6244 if (!fsid || 6245 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 6246 device = __find_device(&cur_devices->devices, 6247 devid, uuid); 6248 if (device) 6249 return device; 6250 } 6251 cur_devices = cur_devices->seed; 6252 } 6253 return NULL; 6254 } 6255 6256 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6257 u64 devid, u8 *dev_uuid) 6258 { 6259 struct btrfs_device *device; 6260 6261 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6262 if (IS_ERR(device)) 6263 return NULL; 6264 6265 list_add(&device->dev_list, &fs_devices->devices); 6266 device->fs_devices = fs_devices; 6267 fs_devices->num_devices++; 6268 6269 device->missing = 1; 6270 fs_devices->missing_devices++; 6271 6272 return device; 6273 } 6274 6275 /** 6276 * btrfs_alloc_device - allocate struct btrfs_device 6277 * @fs_info: used only for generating a new devid, 
can be NULL if 6278 * devid is provided (i.e. @devid != NULL). 6279 * @devid: a pointer to devid for this device. If NULL a new devid 6280 * is generated. 6281 * @uuid: a pointer to UUID for this device. If NULL a new UUID 6282 * is generated. 6283 * 6284 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 6285 * on error. Returned struct is not linked onto any lists and can be 6286 * destroyed with kfree() right away. 6287 */ 6288 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 6289 const u64 *devid, 6290 const u8 *uuid) 6291 { 6292 struct btrfs_device *dev; 6293 u64 tmp; 6294 6295 if (WARN_ON(!devid && !fs_info)) 6296 return ERR_PTR(-EINVAL); 6297 6298 dev = __alloc_device(); 6299 if (IS_ERR(dev)) 6300 return dev; 6301 6302 if (devid) 6303 tmp = *devid; 6304 else { 6305 int ret; 6306 6307 ret = find_next_devid(fs_info, &tmp); 6308 if (ret) { 6309 kfree(dev); 6310 return ERR_PTR(ret); 6311 } 6312 } 6313 dev->devid = tmp; 6314 6315 if (uuid) 6316 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 6317 else 6318 generate_random_uuid(dev->uuid); 6319 6320 btrfs_init_work(&dev->work, btrfs_submit_helper, 6321 pending_bios_fn, NULL, NULL); 6322 6323 return dev; 6324 } 6325 6326 /* Return -EIO if any error, otherwise return 0. */ 6327 static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, 6328 struct extent_buffer *leaf, 6329 struct btrfs_chunk *chunk, u64 logical) 6330 { 6331 u64 length; 6332 u64 stripe_len; 6333 u16 num_stripes; 6334 u16 sub_stripes; 6335 u64 type; 6336 6337 length = btrfs_chunk_length(leaf, chunk); 6338 stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6339 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6340 sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6341 type = btrfs_chunk_type(leaf, chunk); 6342 6343 if (!num_stripes) { 6344 btrfs_err(fs_info, "invalid chunk num_stripes: %u", 6345 num_stripes); 6346 return -EIO; 6347 } 6348 if (!IS_ALIGNED(logical, fs_info->sectorsize)) { 6349 btrfs_err(fs_info, "invalid chunk logical %llu", logical); 6350 return -EIO; 6351 } 6352 if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { 6353 btrfs_err(fs_info, "invalid chunk sectorsize %u", 6354 btrfs_chunk_sector_size(leaf, chunk)); 6355 return -EIO; 6356 } 6357 if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { 6358 btrfs_err(fs_info, "invalid chunk length %llu", length); 6359 return -EIO; 6360 } 6361 if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { 6362 btrfs_err(fs_info, "invalid chunk stripe length: %llu", 6363 stripe_len); 6364 return -EIO; 6365 } 6366 if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & 6367 type) { 6368 btrfs_err(fs_info, "unrecognized chunk type: %llu", 6369 ~(BTRFS_BLOCK_GROUP_TYPE_MASK | 6370 BTRFS_BLOCK_GROUP_PROFILE_MASK) & 6371 btrfs_chunk_type(leaf, chunk)); 6372 return -EIO; 6373 } 6374 if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || 6375 (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || 6376 (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || 6377 (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || 6378 (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || 6379 ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && 6380 num_stripes != 1)) { 6381 btrfs_err(fs_info, 6382 "invalid num_stripes:sub_stripes %u:%u for profile %llu", 6383 num_stripes, sub_stripes, 6384 type & BTRFS_BLOCK_GROUP_PROFILE_MASK); 6385 return -EIO; 6386 } 6387 6388 return 0; 6389 } 6390 6391 static int read_one_chunk(struct btrfs_fs_info *fs_info, struct 
btrfs_key *key, 6392 struct extent_buffer *leaf, 6393 struct btrfs_chunk *chunk) 6394 { 6395 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 6396 struct map_lookup *map; 6397 struct extent_map *em; 6398 u64 logical; 6399 u64 length; 6400 u64 stripe_len; 6401 u64 devid; 6402 u8 uuid[BTRFS_UUID_SIZE]; 6403 int num_stripes; 6404 int ret; 6405 int i; 6406 6407 logical = key->offset; 6408 length = btrfs_chunk_length(leaf, chunk); 6409 stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6410 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6411 6412 ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical); 6413 if (ret) 6414 return ret; 6415 6416 read_lock(&map_tree->map_tree.lock); 6417 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 6418 read_unlock(&map_tree->map_tree.lock); 6419 6420 /* already mapped? */ 6421 if (em && em->start <= logical && em->start + em->len > logical) { 6422 free_extent_map(em); 6423 return 0; 6424 } else if (em) { 6425 free_extent_map(em); 6426 } 6427 6428 em = alloc_extent_map(); 6429 if (!em) 6430 return -ENOMEM; 6431 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 6432 if (!map) { 6433 free_extent_map(em); 6434 return -ENOMEM; 6435 } 6436 6437 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 6438 em->map_lookup = map; 6439 em->start = logical; 6440 em->len = length; 6441 em->orig_start = 0; 6442 em->block_start = 0; 6443 em->block_len = em->len; 6444 6445 map->num_stripes = num_stripes; 6446 map->io_width = btrfs_chunk_io_width(leaf, chunk); 6447 map->io_align = btrfs_chunk_io_align(leaf, chunk); 6448 map->sector_size = btrfs_chunk_sector_size(leaf, chunk); 6449 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6450 map->type = btrfs_chunk_type(leaf, chunk); 6451 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6452 for (i = 0; i < num_stripes; i++) { 6453 map->stripes[i].physical = 6454 btrfs_stripe_offset_nr(leaf, chunk, i); 6455 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 6456 read_extent_buffer(leaf, uuid, (unsigned long) 6457 btrfs_stripe_dev_uuid_nr(chunk, i), 6458 BTRFS_UUID_SIZE); 6459 map->stripes[i].dev = btrfs_find_device(fs_info, devid, 6460 uuid, NULL); 6461 if (!map->stripes[i].dev && 6462 !btrfs_test_opt(fs_info, DEGRADED)) { 6463 free_extent_map(em); 6464 return -EIO; 6465 } 6466 if (!map->stripes[i].dev) { 6467 map->stripes[i].dev = 6468 add_missing_dev(fs_info->fs_devices, devid, 6469 uuid); 6470 if (!map->stripes[i].dev) { 6471 free_extent_map(em); 6472 return -EIO; 6473 } 6474 btrfs_warn(fs_info, "devid %llu uuid %pU is missing", 6475 devid, uuid); 6476 } 6477 map->stripes[i].dev->in_fs_metadata = 1; 6478 } 6479 6480 write_lock(&map_tree->map_tree.lock); 6481 ret = add_extent_mapping(&map_tree->map_tree, em, 0); 6482 write_unlock(&map_tree->map_tree.lock); 6483 BUG_ON(ret); /* Tree corruption */ 6484 free_extent_map(em); 6485 6486 return 0; 6487 } 6488 6489 static void fill_device_from_item(struct extent_buffer *leaf, 6490 struct btrfs_dev_item *dev_item, 6491 struct btrfs_device *device) 6492 { 6493 unsigned long ptr; 6494 6495 device->devid = btrfs_device_id(leaf, dev_item); 6496 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 6497 device->total_bytes = device->disk_total_bytes; 6498 device->commit_total_bytes = device->disk_total_bytes; 6499 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 6500 device->commit_bytes_used = device->bytes_used; 6501 device->type = btrfs_device_type(leaf, dev_item); 6502 device->io_align = btrfs_device_io_align(leaf, dev_item); 
6503 device->io_width = btrfs_device_io_width(leaf, dev_item); 6504 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 6505 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 6506 device->is_tgtdev_for_dev_replace = 0; 6507 6508 ptr = btrfs_device_uuid(dev_item); 6509 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 6510 } 6511 6512 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 6513 u8 *fsid) 6514 { 6515 struct btrfs_fs_devices *fs_devices; 6516 int ret; 6517 6518 BUG_ON(!mutex_is_locked(&uuid_mutex)); 6519 6520 fs_devices = fs_info->fs_devices->seed; 6521 while (fs_devices) { 6522 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) 6523 return fs_devices; 6524 6525 fs_devices = fs_devices->seed; 6526 } 6527 6528 fs_devices = find_fsid(fsid); 6529 if (!fs_devices) { 6530 if (!btrfs_test_opt(fs_info, DEGRADED)) 6531 return ERR_PTR(-ENOENT); 6532 6533 fs_devices = alloc_fs_devices(fsid); 6534 if (IS_ERR(fs_devices)) 6535 return fs_devices; 6536 6537 fs_devices->seeding = 1; 6538 fs_devices->opened = 1; 6539 return fs_devices; 6540 } 6541 6542 fs_devices = clone_fs_devices(fs_devices); 6543 if (IS_ERR(fs_devices)) 6544 return fs_devices; 6545 6546 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 6547 fs_info->bdev_holder); 6548 if (ret) { 6549 free_fs_devices(fs_devices); 6550 fs_devices = ERR_PTR(ret); 6551 goto out; 6552 } 6553 6554 if (!fs_devices->seeding) { 6555 __btrfs_close_devices(fs_devices); 6556 free_fs_devices(fs_devices); 6557 fs_devices = ERR_PTR(-EINVAL); 6558 goto out; 6559 } 6560 6561 fs_devices->seed = fs_info->fs_devices->seed; 6562 fs_info->fs_devices->seed = fs_devices; 6563 out: 6564 return fs_devices; 6565 } 6566 6567 static int read_one_dev(struct btrfs_fs_info *fs_info, 6568 struct extent_buffer *leaf, 6569 struct btrfs_dev_item *dev_item) 6570 { 6571 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6572 struct btrfs_device *device; 6573 u64 devid; 6574 int ret; 6575 u8 fs_uuid[BTRFS_UUID_SIZE]; 6576 u8 dev_uuid[BTRFS_UUID_SIZE]; 6577 6578 devid = btrfs_device_id(leaf, dev_item); 6579 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 6580 BTRFS_UUID_SIZE); 6581 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 6582 BTRFS_UUID_SIZE); 6583 6584 if (memcmp(fs_uuid, fs_info->fsid, BTRFS_UUID_SIZE)) { 6585 fs_devices = open_seed_devices(fs_info, fs_uuid); 6586 if (IS_ERR(fs_devices)) 6587 return PTR_ERR(fs_devices); 6588 } 6589 6590 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 6591 if (!device) { 6592 if (!btrfs_test_opt(fs_info, DEGRADED)) 6593 return -EIO; 6594 6595 device = add_missing_dev(fs_devices, devid, dev_uuid); 6596 if (!device) 6597 return -ENOMEM; 6598 btrfs_warn(fs_info, "devid %llu uuid %pU missing", 6599 devid, dev_uuid); 6600 } else { 6601 if (!device->bdev && !btrfs_test_opt(fs_info, DEGRADED)) 6602 return -EIO; 6603 6604 if(!device->bdev && !device->missing) { 6605 /* 6606 * this happens when a device that was properly setup 6607 * in the device info lists suddenly goes bad. 
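* (for example, a disk that was present when the device list was scanned but has since been pulled or has died)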
6608 * device->bdev is NULL, and so we have to set 6609 * device->missing to one here 6610 */ 6611 device->fs_devices->missing_devices++; 6612 device->missing = 1; 6613 } 6614 6615 /* Move the device to its own fs_devices */ 6616 if (device->fs_devices != fs_devices) { 6617 ASSERT(device->missing); 6618 6619 list_move(&device->dev_list, &fs_devices->devices); 6620 device->fs_devices->num_devices--; 6621 fs_devices->num_devices++; 6622 6623 device->fs_devices->missing_devices--; 6624 fs_devices->missing_devices++; 6625 6626 device->fs_devices = fs_devices; 6627 } 6628 } 6629 6630 if (device->fs_devices != fs_info->fs_devices) { 6631 BUG_ON(device->writeable); 6632 if (device->generation != 6633 btrfs_device_generation(leaf, dev_item)) 6634 return -EINVAL; 6635 } 6636 6637 fill_device_from_item(leaf, dev_item, device); 6638 device->in_fs_metadata = 1; 6639 if (device->writeable && !device->is_tgtdev_for_dev_replace) { 6640 device->fs_devices->total_rw_bytes += device->total_bytes; 6641 spin_lock(&fs_info->free_chunk_lock); 6642 fs_info->free_chunk_space += device->total_bytes - 6643 device->bytes_used; 6644 spin_unlock(&fs_info->free_chunk_lock); 6645 } 6646 ret = 0; 6647 return ret; 6648 } 6649 6650 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 6651 { 6652 struct btrfs_root *root = fs_info->tree_root; 6653 struct btrfs_super_block *super_copy = fs_info->super_copy; 6654 struct extent_buffer *sb; 6655 struct btrfs_disk_key *disk_key; 6656 struct btrfs_chunk *chunk; 6657 u8 *array_ptr; 6658 unsigned long sb_array_offset; 6659 int ret = 0; 6660 u32 num_stripes; 6661 u32 array_size; 6662 u32 len = 0; 6663 u32 cur_offset; 6664 u64 type; 6665 struct btrfs_key key; 6666 6667 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 6668 /* 6669 * This will create an extent buffer of nodesize; the superblock size is 6670 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will 6671 * overallocate but we can keep it as-is; only the first page is used. 6672 */ 6673 sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET); 6674 if (IS_ERR(sb)) 6675 return PTR_ERR(sb); 6676 set_extent_buffer_uptodate(sb); 6677 btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); 6678 /* 6679 * The sb extent buffer is artificial and just used to read the system array. 6680 * The set_extent_buffer_uptodate() call does not properly mark all its 6681 * pages up-to-date when the page is larger: extent does not cover the 6682 * whole page and consequently check_page_uptodate does not find all 6683 * the page's extents up-to-date (the hole beyond sb), 6684 * write_extent_buffer then triggers a WARN_ON. 6685 * 6686 * Regular short extents go through the mark_extent_buffer_dirty/writeback cycle, 6687 * but sb spans only this function. Add an explicit SetPageUptodate call 6688 * to silence the warning e.g. on PowerPC 64.
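* (BTRFS_SUPER_INFO_SIZE is 4K, so the explicit call only matters on architectures with a larger page size, such as 64K-page ppc64.)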
6689 */ 6690 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 6691 SetPageUptodate(sb->pages[0]); 6692 6693 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 6694 array_size = btrfs_super_sys_array_size(super_copy); 6695 6696 array_ptr = super_copy->sys_chunk_array; 6697 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 6698 cur_offset = 0; 6699 6700 while (cur_offset < array_size) { 6701 disk_key = (struct btrfs_disk_key *)array_ptr; 6702 len = sizeof(*disk_key); 6703 if (cur_offset + len > array_size) 6704 goto out_short_read; 6705 6706 btrfs_disk_key_to_cpu(&key, disk_key); 6707 6708 array_ptr += len; 6709 sb_array_offset += len; 6710 cur_offset += len; 6711 6712 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 6713 chunk = (struct btrfs_chunk *)sb_array_offset; 6714 /* 6715 * At least one btrfs_chunk with one stripe must be 6716 * present, exact stripe count check comes afterwards 6717 */ 6718 len = btrfs_chunk_item_size(1); 6719 if (cur_offset + len > array_size) 6720 goto out_short_read; 6721 6722 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 6723 if (!num_stripes) { 6724 btrfs_err(fs_info, 6725 "invalid number of stripes %u in sys_array at offset %u", 6726 num_stripes, cur_offset); 6727 ret = -EIO; 6728 break; 6729 } 6730 6731 type = btrfs_chunk_type(sb, chunk); 6732 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 6733 btrfs_err(fs_info, 6734 "invalid chunk type %llu in sys_array at offset %u", 6735 type, cur_offset); 6736 ret = -EIO; 6737 break; 6738 } 6739 6740 len = btrfs_chunk_item_size(num_stripes); 6741 if (cur_offset + len > array_size) 6742 goto out_short_read; 6743 6744 ret = read_one_chunk(fs_info, &key, sb, chunk); 6745 if (ret) 6746 break; 6747 } else { 6748 btrfs_err(fs_info, 6749 "unexpected item type %u in sys_array at offset %u", 6750 (u32)key.type, cur_offset); 6751 ret = -EIO; 6752 break; 6753 } 6754 array_ptr += len; 6755 sb_array_offset += len; 6756 cur_offset += len; 6757 } 6758 clear_extent_buffer_uptodate(sb); 6759 free_extent_buffer_stale(sb); 6760 return ret; 6761 6762 out_short_read: 6763 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 6764 len, cur_offset); 6765 clear_extent_buffer_uptodate(sb); 6766 free_extent_buffer_stale(sb); 6767 return -EIO; 6768 } 6769 6770 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 6771 { 6772 struct btrfs_root *root = fs_info->chunk_root; 6773 struct btrfs_path *path; 6774 struct extent_buffer *leaf; 6775 struct btrfs_key key; 6776 struct btrfs_key found_key; 6777 int ret; 6778 int slot; 6779 u64 total_dev = 0; 6780 6781 path = btrfs_alloc_path(); 6782 if (!path) 6783 return -ENOMEM; 6784 6785 mutex_lock(&uuid_mutex); 6786 mutex_lock(&fs_info->chunk_mutex); 6787 6788 /* 6789 * Read all device items, and then all the chunk items. All 6790 * device items are found before any chunk item (their object id 6791 * is smaller than the lowest possible object id for a chunk 6792 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 
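* (BTRFS_DEV_ITEMS_OBJECTID is 1 and BTRFS_FIRST_CHUNK_TREE_OBJECTID is 256, so a forward search that starts at objectid 1 walks every device item before reaching the first chunk item.)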
6793 */ 6794 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 6795 key.offset = 0; 6796 key.type = 0; 6797 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6798 if (ret < 0) 6799 goto error; 6800 while (1) { 6801 leaf = path->nodes[0]; 6802 slot = path->slots[0]; 6803 if (slot >= btrfs_header_nritems(leaf)) { 6804 ret = btrfs_next_leaf(root, path); 6805 if (ret == 0) 6806 continue; 6807 if (ret < 0) 6808 goto error; 6809 break; 6810 } 6811 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6812 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 6813 struct btrfs_dev_item *dev_item; 6814 dev_item = btrfs_item_ptr(leaf, slot, 6815 struct btrfs_dev_item); 6816 ret = read_one_dev(fs_info, leaf, dev_item); 6817 if (ret) 6818 goto error; 6819 total_dev++; 6820 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 6821 struct btrfs_chunk *chunk; 6822 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 6823 ret = read_one_chunk(fs_info, &found_key, leaf, chunk); 6824 if (ret) 6825 goto error; 6826 } 6827 path->slots[0]++; 6828 } 6829 6830 /* 6831 * After loading chunk tree, we've got all device information, 6832 * do another round of validation checks. 6833 */ 6834 if (total_dev != fs_info->fs_devices->total_devices) { 6835 btrfs_err(fs_info, 6836 "super_num_devices %llu mismatch with num_devices %llu found here", 6837 btrfs_super_num_devices(fs_info->super_copy), 6838 total_dev); 6839 ret = -EINVAL; 6840 goto error; 6841 } 6842 if (btrfs_super_total_bytes(fs_info->super_copy) < 6843 fs_info->fs_devices->total_rw_bytes) { 6844 btrfs_err(fs_info, 6845 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", 6846 btrfs_super_total_bytes(fs_info->super_copy), 6847 fs_info->fs_devices->total_rw_bytes); 6848 ret = -EINVAL; 6849 goto error; 6850 } 6851 ret = 0; 6852 error: 6853 mutex_unlock(&fs_info->chunk_mutex); 6854 mutex_unlock(&uuid_mutex); 6855 6856 btrfs_free_path(path); 6857 return ret; 6858 } 6859 6860 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 6861 { 6862 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6863 struct btrfs_device *device; 6864 6865 while (fs_devices) { 6866 mutex_lock(&fs_devices->device_list_mutex); 6867 list_for_each_entry(device, &fs_devices->devices, dev_list) 6868 device->fs_info = fs_info; 6869 mutex_unlock(&fs_devices->device_list_mutex); 6870 6871 fs_devices = fs_devices->seed; 6872 } 6873 } 6874 6875 static void __btrfs_reset_dev_stats(struct btrfs_device *dev) 6876 { 6877 int i; 6878 6879 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 6880 btrfs_dev_stat_reset(dev, i); 6881 } 6882 6883 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 6884 { 6885 struct btrfs_key key; 6886 struct btrfs_key found_key; 6887 struct btrfs_root *dev_root = fs_info->dev_root; 6888 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6889 struct extent_buffer *eb; 6890 int slot; 6891 int ret = 0; 6892 struct btrfs_device *device; 6893 struct btrfs_path *path = NULL; 6894 int i; 6895 6896 path = btrfs_alloc_path(); 6897 if (!path) { 6898 ret = -ENOMEM; 6899 goto out; 6900 } 6901 6902 mutex_lock(&fs_devices->device_list_mutex); 6903 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6904 int item_size; 6905 struct btrfs_dev_stats_item *ptr; 6906 6907 key.objectid = BTRFS_DEV_STATS_OBJECTID; 6908 key.type = BTRFS_PERSISTENT_ITEM_KEY; 6909 key.offset = device->devid; 6910 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 6911 if (ret) { 6912 __btrfs_reset_dev_stats(device); 6913 device->dev_stats_valid = 1; 6914 
btrfs_release_path(path); 6915 continue; 6916 } 6917 slot = path->slots[0]; 6918 eb = path->nodes[0]; 6919 btrfs_item_key_to_cpu(eb, &found_key, slot); 6920 item_size = btrfs_item_size_nr(eb, slot); 6921 6922 ptr = btrfs_item_ptr(eb, slot, 6923 struct btrfs_dev_stats_item); 6924 6925 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 6926 if (item_size >= (1 + i) * sizeof(__le64)) 6927 btrfs_dev_stat_set(device, i, 6928 btrfs_dev_stats_value(eb, ptr, i)); 6929 else 6930 btrfs_dev_stat_reset(device, i); 6931 } 6932 6933 device->dev_stats_valid = 1; 6934 btrfs_dev_stat_print_on_load(device); 6935 btrfs_release_path(path); 6936 } 6937 mutex_unlock(&fs_devices->device_list_mutex); 6938 6939 out: 6940 btrfs_free_path(path); 6941 return ret < 0 ? ret : 0; 6942 } 6943 6944 static int update_dev_stat_item(struct btrfs_trans_handle *trans, 6945 struct btrfs_fs_info *fs_info, 6946 struct btrfs_device *device) 6947 { 6948 struct btrfs_root *dev_root = fs_info->dev_root; 6949 struct btrfs_path *path; 6950 struct btrfs_key key; 6951 struct extent_buffer *eb; 6952 struct btrfs_dev_stats_item *ptr; 6953 int ret; 6954 int i; 6955 6956 key.objectid = BTRFS_DEV_STATS_OBJECTID; 6957 key.type = BTRFS_PERSISTENT_ITEM_KEY; 6958 key.offset = device->devid; 6959 6960 path = btrfs_alloc_path(); 6961 BUG_ON(!path); 6962 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 6963 if (ret < 0) { 6964 btrfs_warn_in_rcu(fs_info, 6965 "error %d while searching for dev_stats item for device %s", 6966 ret, rcu_str_deref(device->name)); 6967 goto out; 6968 } 6969 6970 if (ret == 0 && 6971 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 6972 /* need to delete old one and insert a new one */ 6973 ret = btrfs_del_item(trans, dev_root, path); 6974 if (ret != 0) { 6975 btrfs_warn_in_rcu(fs_info, 6976 "delete too small dev_stats item for device %s failed %d", 6977 rcu_str_deref(device->name), ret); 6978 goto out; 6979 } 6980 ret = 1; 6981 } 6982 6983 if (ret == 1) { 6984 /* need to insert a new item */ 6985 btrfs_release_path(path); 6986 ret = btrfs_insert_empty_item(trans, dev_root, path, 6987 &key, sizeof(*ptr)); 6988 if (ret < 0) { 6989 btrfs_warn_in_rcu(fs_info, 6990 "insert dev_stats item for device %s failed %d", 6991 rcu_str_deref(device->name), ret); 6992 goto out; 6993 } 6994 } 6995 6996 eb = path->nodes[0]; 6997 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 6998 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 6999 btrfs_set_dev_stats_value(eb, ptr, i, 7000 btrfs_dev_stat_read(device, i)); 7001 btrfs_mark_buffer_dirty(eb); 7002 7003 out: 7004 btrfs_free_path(path); 7005 return ret; 7006 } 7007 7008 /* 7009 * called from commit_transaction. Writes all changed device stats to disk. 
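* Only devices whose counters changed since the last commit (non-zero dev_stats_ccnt) are written out; on success the counter is decreased by the amount that was persisted.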
7010 */ 7011 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 7012 struct btrfs_fs_info *fs_info) 7013 { 7014 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7015 struct btrfs_device *device; 7016 int stats_cnt; 7017 int ret = 0; 7018 7019 mutex_lock(&fs_devices->device_list_mutex); 7020 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7021 if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device)) 7022 continue; 7023 7024 stats_cnt = atomic_read(&device->dev_stats_ccnt); 7025 ret = update_dev_stat_item(trans, fs_info, device); 7026 if (!ret) 7027 atomic_sub(stats_cnt, &device->dev_stats_ccnt); 7028 } 7029 mutex_unlock(&fs_devices->device_list_mutex); 7030 7031 return ret; 7032 } 7033 7034 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) 7035 { 7036 btrfs_dev_stat_inc(dev, index); 7037 btrfs_dev_stat_print_on_error(dev); 7038 } 7039 7040 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) 7041 { 7042 if (!dev->dev_stats_valid) 7043 return; 7044 btrfs_err_rl_in_rcu(dev->fs_info, 7045 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7046 rcu_str_deref(dev->name), 7047 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7048 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7049 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7050 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7051 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7052 } 7053 7054 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 7055 { 7056 int i; 7057 7058 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7059 if (btrfs_dev_stat_read(dev, i) != 0) 7060 break; 7061 if (i == BTRFS_DEV_STAT_VALUES_MAX) 7062 return; /* all values == 0, suppress message */ 7063 7064 btrfs_info_in_rcu(dev->fs_info, 7065 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7066 rcu_str_deref(dev->name), 7067 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7068 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7069 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7070 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7071 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7072 } 7073 7074 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, 7075 struct btrfs_ioctl_get_dev_stats *stats) 7076 { 7077 struct btrfs_device *dev; 7078 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7079 int i; 7080 7081 mutex_lock(&fs_devices->device_list_mutex); 7082 dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL); 7083 mutex_unlock(&fs_devices->device_list_mutex); 7084 7085 if (!dev) { 7086 btrfs_warn(fs_info, "get dev_stats failed, device not found"); 7087 return -ENODEV; 7088 } else if (!dev->dev_stats_valid) { 7089 btrfs_warn(fs_info, "get dev_stats failed, not yet valid"); 7090 return -ENODEV; 7091 } else if (stats->flags & BTRFS_DEV_STATS_RESET) { 7092 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7093 if (stats->nr_items > i) 7094 stats->values[i] = 7095 btrfs_dev_stat_read_and_reset(dev, i); 7096 else 7097 btrfs_dev_stat_reset(dev, i); 7098 } 7099 } else { 7100 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7101 if (stats->nr_items > i) 7102 stats->values[i] = btrfs_dev_stat_read(dev, i); 7103 } 7104 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) 7105 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 7106 return 0; 7107 } 7108 7109 void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path) 7110 { 7111 struct buffer_head *bh; 7112 struct btrfs_super_block 
*disk_super; 7113 int copy_num; 7114 7115 if (!bdev) 7116 return; 7117 7118 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; 7119 copy_num++) { 7120 7121 if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) 7122 continue; 7123 7124 disk_super = (struct btrfs_super_block *)bh->b_data; 7125 7126 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 7127 set_buffer_dirty(bh); 7128 sync_dirty_buffer(bh); 7129 brelse(bh); 7130 } 7131 7132 /* Notify udev that device has changed */ 7133 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 7134 7135 /* Update ctime/mtime for device path for libblkid */ 7136 update_dev_time(device_path); 7137 } 7138 7139 /* 7140 * Update the size of all devices, which is used for writing out the 7141 * super blocks. 7142 */ 7143 void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) 7144 { 7145 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7146 struct btrfs_device *curr, *next; 7147 7148 if (list_empty(&fs_devices->resized_devices)) 7149 return; 7150 7151 mutex_lock(&fs_devices->device_list_mutex); 7152 mutex_lock(&fs_info->chunk_mutex); 7153 list_for_each_entry_safe(curr, next, &fs_devices->resized_devices, 7154 resized_list) { 7155 list_del_init(&curr->resized_list); 7156 curr->commit_total_bytes = curr->disk_total_bytes; 7157 } 7158 mutex_unlock(&fs_info->chunk_mutex); 7159 mutex_unlock(&fs_devices->device_list_mutex); 7160 } 7161 7162 /* Must be invoked during the transaction commit */ 7163 void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info, 7164 struct btrfs_transaction *transaction) 7165 { 7166 struct extent_map *em; 7167 struct map_lookup *map; 7168 struct btrfs_device *dev; 7169 int i; 7170 7171 if (list_empty(&transaction->pending_chunks)) 7172 return; 7173 7174 /* In order to kick the device replace finish process */ 7175 mutex_lock(&fs_info->chunk_mutex); 7176 list_for_each_entry(em, &transaction->pending_chunks, list) { 7177 map = em->map_lookup; 7178 7179 for (i = 0; i < map->num_stripes; i++) { 7180 dev = map->stripes[i].dev; 7181 dev->commit_bytes_used = dev->bytes_used; 7182 } 7183 } 7184 mutex_unlock(&fs_info->chunk_mutex); 7185 } 7186 7187 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) 7188 { 7189 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7190 while (fs_devices) { 7191 fs_devices->fs_info = fs_info; 7192 fs_devices = fs_devices->seed; 7193 } 7194 } 7195 7196 void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) 7197 { 7198 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7199 while (fs_devices) { 7200 fs_devices->fs_info = NULL; 7201 fs_devices = fs_devices->seed; 7202 } 7203 } 7204