/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes = 2,
		.dev_stripes = 1,
		.devs_max = 0,	/* 0 == as many as possible */
		.devs_min = 4,
		.tolerated_failures = 1,
		.devs_increment = 2,
		.ncopies = 2,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 2,
		.devs_min = 2,
		.tolerated_failures = 1,
		.devs_increment = 2,
		.ncopies = 2,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes = 1,
		.dev_stripes = 2,
		.devs_max = 1,
		.devs_min = 1,
		.tolerated_failures = 0,
		.devs_increment = 1,
		.ncopies = 2,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 0,
		.devs_min = 2,
		.tolerated_failures = 0,
		.devs_increment = 1,
		.ncopies = 1,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 1,
		.devs_min = 1,
		.tolerated_failures = 0,
		.devs_increment = 1,
		.ncopies = 1,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 0,
		.devs_min = 2,
		.tolerated_failures = 1,
		.devs_increment = 1,
		.ncopies = 2,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes = 1,
		.dev_stripes = 1,
		.devs_max = 0,
		.devs_min = 3,
		.tolerated_failures = 2,
		.devs_increment = 1,
		.ncopies = 3,
	},
};
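/*
 * Convert a BTRFS_RAID_* table index to the corresponding on-disk
 * BTRFS_BLOCK_GROUP_* profile bit. SINGLE has no dedicated bit, hence 0.
 */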
const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
	[BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1,
	[BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP,
	[BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5,
	[BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6,
};

/*
 * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
 * condition is not met. Zero means there's no corresponding
 * BTRFS_ERROR_DEV_*_NOT_MET value.
 */
const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	[BTRFS_RAID_RAID1] = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	[BTRFS_RAID_DUP] = 0,
	[BTRFS_RAID_RAID0] = 0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5] = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	[BTRFS_RAID_RAID6] = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
};

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid: if not NULL, copy the uuid to fs_devices::fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->resized_devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		rcu_string_free(device->name);
		bio_put(device->flush_bio);
		kfree(device);
	}
	kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
				 enum kobject_action action)
{
	int ret;

	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
	if (ret)
		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
			action,
			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
			&disk_to_dev(bdev->bd_disk)->kobj);
}

void btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, list);
		list_del(&fs_devices->list);
		free_fs_devices(fs_devices);
	}
}

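/*
 * Allocate and zero a new btrfs_device, preallocate its flush bio and
 * initialize the list heads, locks and readahead radix trees. Returns
 * ERR_PTR(-ENOMEM) on allocation failure.
 */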
static struct btrfs_device *__alloc_device(void)
{
	struct btrfs_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate a bio that's always going to be used for flushing device
	 * barriers and matches the device lifespan
	 */
	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
	if (!dev->flush_bio) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}
	bio_get(dev->flush_bio);

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->resized_list);

	spin_lock_init(&dev->io_lock);

	spin_lock_init(&dev->reada_lock);
	atomic_set(&dev->reada_in_flight, 0);
	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

	return dev;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
					u64 devid, const u8 *uuid)
{
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

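/*
 * Open the block device at @device_path, optionally flush and wait for
 * outstanding writes, set the btrfs block size and read the super block
 * into @bh. On error both *bdev and *bh are cleared and an error code is
 * returned.
 */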
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*bh = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*bh)) {
		ret = PTR_ERR(*bh);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	*bh = NULL;
	return ret;
}

static void requeue_list(struct btrfs_pending_bios *pending_bios,
			 struct bio *head, struct bio *tail)
{

	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device. This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block. The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested. This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long last_waited = 0;
	int force_reg = 0;
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device. We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = device->bdev->bd_bdi;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held). But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop. Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		     pending_bios != &device->pending_sync_bios &&
		     device->pending_sync_bios.head) ||
		    (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		     device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;

		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		btrfsic_submit_bio(cur);
		num_run++;
		batch_run++;

		cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested. Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop. So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				cond_resched();
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_queue_work(fs_info->submit_workers,
					 &device->work);
			goto done;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}

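/*
 * Look through all other registered, unopened and non-seeding fs_devices
 * for a device entry whose path matches @cur_dev and drop that stale entry;
 * if it was the last device of its fs_devices, the whole fs_devices is
 * removed from the list and freed.
 */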
static void btrfs_free_stale_device(struct btrfs_device *cur_dev)
{
	struct btrfs_fs_devices *fs_devs;
	struct btrfs_device *dev;

	if (!cur_dev->name)
		return;

	list_for_each_entry(fs_devs, &fs_uuids, list) {
		int del = 1;

		if (fs_devs->opened)
			continue;
		if (fs_devs->seeding)
			continue;

		list_for_each_entry(dev, &fs_devs->devices, dev_list) {

			if (dev == cur_dev)
				continue;
			if (!dev->name)
				continue;

			/*
			 * Todo: This won't be enough. What if the same device
			 * comes back (with new uuid and) with its mapper path?
			 * But for now, this does help as mostly an admin will
			 * either use mapper or non mapper path throughout.
			 */
			rcu_read_lock();
			del = strcmp(rcu_str_deref(dev->name),
				     rcu_str_deref(cur_dev->name));
			rcu_read_unlock();
			if (!del)
				break;
		}

		if (!del) {
			/* delete the stale device */
			if (fs_devs->num_devices == 1) {
				btrfs_sysfs_remove_fsid(fs_devs);
				list_del(&fs_devs->list);
				free_fs_devices(fs_devs);
			} else {
				fs_devs->num_devices--;
				list_del(&dev->dev_list);
				rcu_string_free(dev->name);
				bio_put(dev->flush_bio);
				kfree(dev);
			}
			break;
		}
	}
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * 1   - first time device is seen
 * 0   - device already known
 * < 0 - error
 */
static noinline int device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	struct rcu_string *name;
	int ret = 0;
	u64 found_transid = btrfs_super_generation(disk_super);

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = alloc_fs_devices(disk_super->fsid);
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);

		list_add(&fs_devices->list, &fs_uuids);

		device = NULL;
	} else {
		device = find_device(fs_devices, devid,
				disk_super->dev_item.uuid);
	}

	if (!device) {
		if (fs_devices->opened)
			return -EBUSY;

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			/* we can safely leave the fs_devices entry around */
			return PTR_ERR(device);
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			bio_put(device->flush_bio);
			kfree(device);
			return -ENOMEM;
		}
		rcu_assign_pointer(device->name, name);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;
		mutex_unlock(&fs_devices->device_list_mutex);

		ret = 1;
		device->fs_devices = fs_devices;
	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         different name. or
		 *      b. The missing-disk-which-was-replaced, has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted. We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid. We keep the one
			 * with larger generation number or the last-in if
			 * generations are equal.
			 */
			return -EEXIST;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name)
			return -ENOMEM;
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (device->missing) {
			fs_devices->missing_devices--;
			device->missing = 0;
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened)
		device->generation = found_transid;

	/*
	 * if there is new btrfs on an already registered device,
	 * then remove the stale device entry.
	 */
	if (ret > 0)
		btrfs_free_stale_device(device);

	*fs_devices_ret = fs_devices;

	return ret;
}

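/*
 * Duplicate @orig into a newly allocated btrfs_fs_devices: same fsid and a
 * copy of each device's devid, uuid and name. The copies are not opened.
 * Returns ERR_PTR(-ENOMEM) if any allocation fails.
 */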
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = alloc_fs_devices(orig->fsid);
	if (IS_ERR(fs_devices))
		return fs_devices;

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

	/* We have held the volume lock, it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device))
			goto error;

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
			if (!name) {
				bio_put(device->flush_bio);
				kfree(device);
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

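/*
 * After reading the super blocks, drop every device that is not part of the
 * filesystem's metadata (and, depending on @step, the dev-replace target),
 * closing its bdev and freeing the btrfs_device. Also records the device
 * with the highest generation as fs_devices->latest_bdev.
 */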
void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
{
	struct btrfs_device *device, *next;
	struct btrfs_device *latest_dev = NULL;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (device->in_fs_metadata) {
			if (!device->is_tgtdev_for_dev_replace &&
			    (!latest_dev ||
			     device->generation > latest_dev->generation)) {
				latest_dev = device;
			}
			continue;
		}

		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
			 * In the second step, the dev_replace state is
			 * read from the device tree and it is known
			 * whether the procedure is really active or
			 * not, which means whether this device is
			 * used or whether it should be removed.
			 */
			if (step == 0 || device->is_tgtdev_for_dev_replace) {
				continue;
			}
		}
		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (device->writeable) {
			list_del_init(&device->dev_alloc_list);
			device->writeable = 0;
			if (!device->is_tgtdev_for_dev_replace)
				fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		rcu_string_free(device->name);
		bio_put(device->flush_bio);
		kfree(device);
	}

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
}

static void __free_device(struct work_struct *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, rcu_work);
	rcu_string_free(device->name);
	bio_put(device->flush_bio);
	kfree(device);
}

static void free_device(struct rcu_head *head)
{
	struct btrfs_device *device;

	device = container_of(head, struct btrfs_device, rcu);

	INIT_WORK(&device->rcu_work, __free_device);
	schedule_work(&device->rcu_work);
}

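/*
 * Flush and invalidate a writeable device's page cache and release the
 * underlying block device, if one is open.
 */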
static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (device->bdev && device->writeable) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	if (device->bdev)
		blkdev_put(device->bdev, device->mode);
}

static void btrfs_prepare_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;
	struct btrfs_device *new_device;
	struct rcu_string *name;

	if (device->bdev)
		fs_devices->open_devices--;

	if (device->writeable &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (device->missing)
		fs_devices->missing_devices--;

	new_device = btrfs_alloc_device(NULL, &device->devid,
					device->uuid);
	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

	/* Safe because we are under uuid_mutex */
	if (device->name) {
		name = rcu_string_strdup(device->name->str, GFP_NOFS);
		BUG_ON(!name); /* -ENOMEM */
		rcu_assign_pointer(new_device->name, name);
	}

	list_replace_rcu(&device->dev_list, &new_device->dev_list);
	new_device->fs_devices = device->fs_devices;
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;
	struct list_head pending_put;

	INIT_LIST_HEAD(&pending_put);

	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
		btrfs_prepare_close_one_device(device);
		list_add(&device->dev_list, &pending_put);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * btrfs_show_devname() is using the device_list_mutex,
	 * and sometimes a call to blkdev_put() leads to the vfs calling
	 * back into this function. So do the put outside of
	 * device_list_mutex, as of now.
	 */
	while (!list_empty(&pending_put)) {
		device = list_first_entry(&pending_put,
				struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_close_bdev(device);
		call_rcu(&device->rcu, free_device);
	}

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = __btrfs_close_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	/*
	 * Wait for rcu kworkers under __btrfs_close_devices
	 * to finish all blkdev_puts so device is really
	 * free when umount is done.
	 */
	rcu_barrier();
	return ret;
}

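/*
 * Open every registered device of @fs_devices that has a name, verify its
 * super block still matches the expected devid/uuid, and record writeable,
 * discard and rotational state. Fails with -EINVAL if no device could be
 * opened.
 */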
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int seeding = 1;
	int ret = 0;

	flags |= FMODE_EXCL;

	list_for_each_entry(device, head, dev_list) {
		if (device->bdev)
			continue;
		if (!device->name)
			continue;

		/* Just open everything we can; ignore failures here */
		if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
					  &bdev, &bh))
			continue;

		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = btrfs_stack_device_id(&disk_super->dev_item);
		if (devid != device->devid)
			goto error_brelse;

		if (memcmp(device->uuid, disk_super->dev_item.uuid,
			   BTRFS_UUID_SIZE))
			goto error_brelse;

		device->generation = btrfs_super_generation(disk_super);
		if (!latest_dev ||
		    device->generation > latest_dev->generation)
			latest_dev = device;

		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
			device->writeable = 0;
		} else {
			device->writeable = !bdev_read_only(bdev);
			seeding = 0;
		}

		q = bdev_get_queue(bdev);
		if (blk_queue_discard(q))
			device->can_discard = 1;
		if (!blk_queue_nonrot(q))
			fs_devices->rotating = 1;

		device->bdev = bdev;
		device->in_fs_metadata = 0;
		device->mode = flags;

		fs_devices->open_devices++;
		if (device->writeable &&
		    device->devid != BTRFS_DEV_REPLACE_DEVID) {
			fs_devices->rw_devices++;
			list_add(&device->dev_alloc_list,
				 &fs_devices->alloc_list);
		}
		brelse(bh);
		continue;

error_brelse:
		brelse(bh);
		blkdev_put(bdev, flags);
		continue;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EINVAL;
		goto out;
	}
	fs_devices->seeding = seeding;
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	mutex_lock(&uuid_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		ret = __btrfs_open_devices(fs_devices, flags, holder);
	}
	mutex_unlock(&uuid_mutex);
	return ret;
}

static void btrfs_release_disk_super(struct page *page)
{
	kunmap(page);
	put_page(page);
}

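/*
 * Read the super block at @bytenr from @bdev via the page cache and do basic
 * sanity checks (fits on the device and within one page, bytenr and magic
 * match). On success the page is returned kmapped in *page with *disk_super
 * pointing into it; release it with btrfs_release_disk_super().
 */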
static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
				 struct page **page,
				 struct btrfs_super_block **disk_super)
{
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return 1;

	/* make sure our super fits in the page */
	if (sizeof(**disk_super) > PAGE_SIZE)
		return 1;

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
		return 1;

	/* pull in the page with our super */
	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
				    index, GFP_KERNEL);

	if (IS_ERR_OR_NULL(*page))
		return 1;

	p = kmap(*page);

	/* align our pointer to the offset of the super block */
	*disk_super = p + (bytenr & ~PAGE_MASK);

	if (btrfs_super_bytenr(*disk_super) != bytenr ||
	    btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(*page);
		return 1;
	}

	if ((*disk_super)->label[0] &&
	    (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

	return 0;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount path
 * and we are not allowed to call set_blocksize during the scan. The superblock
 * is read via pagecache
 */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *disk_super;
	struct block_device *bdev;
	struct page *page;
	int ret = -EINVAL;
	u64 devid;
	u64 transid;
	u64 total_devices;
	u64 bytenr;

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	bytenr = btrfs_sb_offset(0);
	flags |= FMODE_EXCL;
	mutex_lock(&uuid_mutex);

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev)) {
		ret = PTR_ERR(bdev);
		goto error;
	}

	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
		goto error_bdev_put;

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	transid = btrfs_super_generation(disk_super);
	total_devices = btrfs_super_num_devices(disk_super);

	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
	if (ret > 0) {
		if (disk_super->label[0]) {
			pr_info("BTRFS: device label %s ", disk_super->label);
		} else {
			pr_info("BTRFS: device fsid %pU ", disk_super->fsid);
		}

		pr_cont("devid %llu transid %llu %s\n", devid, transid, path);
		ret = 0;
	}
	if (!ret && fs_devices_ret)
		(*fs_devices_ret)->total_devices = total_devices;

	btrfs_release_disk_super(page);

error_bdev_put:
	blkdev_put(bdev, flags);
error:
	mutex_unlock(&uuid_mutex);
	return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
				   u64 end, u64 *length)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->fs_info->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 extent_end;
	int ret;
	int slot;
	struct extent_buffer *l;

	*length = 0;

	if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (key.offset <= start && extent_end > end) {
			*length = end - start + 1;
			break;
		} else if (key.offset <= start && extent_end > start)
			*length += extent_end - start;
		else if (key.offset > start && extent_end <= end)
			*length += extent_end - key.offset;
		else if (key.offset > start && key.offset <= end) {
			*length += end - key.offset + 1;
			break;
		} else if (key.offset > end)
			break;

next:
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

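/*
 * Check whether the hole starting at *start on @device overlaps a chunk that
 * is pending in the given transaction or pinned in fs_info. If so, bump
 * *start past the conflicting stripe and return 1 so the caller re-checks
 * the new offset.
 */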
static int contains_pending_extent(struct btrfs_transaction *transaction,
				   struct btrfs_device *device,
				   u64 *start, u64 len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct extent_map *em;
	struct list_head *search_list = &fs_info->pinned_chunks;
	int ret = 0;
	u64 physical_start = *start;

	if (transaction)
		search_list = &transaction->pending_chunks;
again:
	list_for_each_entry(em, search_list, list) {
		struct map_lookup *map;
		int i;

		map = em->map_lookup;
		for (i = 0; i < map->num_stripes; i++) {
			u64 end;

			if (map->stripes[i].dev != device)
				continue;
			if (map->stripes[i].physical >= physical_start + len ||
			    map->stripes[i].physical + em->orig_block_len <=
			    physical_start)
				continue;
			/*
			 * Make sure that while processing the pinned list we do
			 * not override our *start with a lower value, because
			 * we can have pinned chunks that fall within this
			 * device hole and that have lower physical addresses
			 * than the pending chunks we processed before. If we
			 * do not take this special care we can end up getting
			 * 2 pending chunks that start at the same physical
			 * device offsets because the end offset of a pinned
			 * chunk can be equal to the start offset of some
			 * pending chunk.
			 */
			end = map->stripes[i].physical + em->orig_block_len;
			if (end > *start) {
				*start = end;
				ret = 1;
			}
		}
	}
	if (search_list != &fs_info->pinned_chunks) {
		search_list = &fs_info->pinned_chunks;
		goto again;
	}

	return ret;
}


/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find one. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
			       struct btrfs_device *device, u64 num_bytes,
			       u64 search_start, u64 *start, u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	/*
	 * We don't want to overwrite the superblock on the drive nor any area
	 * used by the boot loader (grub for example), so we make sure to start
	 * at an offset of at least 1MB.
	 */
	search_start = max_t(u64, search_start, SZ_1M);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;

			/*
			 * Have to check before we set max_hole_start, otherwise
			 * we could end up sending back this offset anyway.
			 */
			if (contains_pending_extent(transaction, device,
						    &search_start,
						    hole_size)) {
				if (key.offset >= search_start) {
					hole_size = key.offset - search_start;
				} else {
					WARN_ON_ONCE(1);
					hole_size = 0;
				}
			}

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;

		if (contains_pending_extent(transaction, device, &search_start,
					    hole_size)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

int find_free_dev_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(trans->transaction, device,
					  num_bytes, 0, start, len);
}

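/*
 * Remove the dev extent item of @device that covers offset @start and return
 * the extent's length in @dev_extent_len.
 */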
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				 struct btrfs_device *device,
				 u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to remove dev extent item");
	} else {
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
	}
out:
	btrfs_free_path(path);
	return ret;
}

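/*
 * Insert a new dev extent item for @device, mapping @num_bytes starting at
 * device offset @start to the chunk at @chunk_offset.
 */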
static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
				  struct btrfs_device *device,
				  u64 chunk_offset, u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	WARN_ON(!device->in_fs_metadata);
	WARN_ON(device->is_tgtdev_for_dev_replace);
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);
	btrfs_set_dev_extent_chunk_tree(leaf, extent,
					BTRFS_CHUNK_TREE_OBJECTID);
	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return ret;
}

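/*
 * Return the logical offset right after the last chunk in the mapping tree,
 * which is where the next allocated chunk will start.
 */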
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree.map_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}

static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0); /* Corruption */

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
static int btrfs_add_device(struct btrfs_trans_handle *trans,
			    struct btrfs_fs_info *fs_info,
			    struct btrfs_device *device)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Function to update ctime/mtime for a given device path.
 * Mainly used for ctime/mtime based probe like libblkid.
 */
static void update_dev_time(const char *path_name)
{
	struct file *filp;

	filp = filp_open(path_name, O_RDWR, 0);
	if (IS_ERR(filp))
		return;
	file_update_time(filp);
	filp_close(filp, NULL);
}

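/*
 * Remove the dev item of @device from the chunk tree in its own transaction.
 */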
static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
			     struct btrfs_device *device)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);
	if (!ret)
		ret = btrfs_commit_transaction(trans);
	return ret;
}

/*
 * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number regarding eg. device
 * replace.
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
		u64 num_devices)
{
	u64 all_avail;
	unsigned seq;
	int i;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		all_avail = fs_info->avail_data_alloc_bits |
			    fs_info->avail_system_alloc_bits |
			    fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
		if (!(all_avail & btrfs_raid_group[i]))
			continue;

		if (num_devices < btrfs_raid_array[i].devs_min) {
			int ret = btrfs_raid_mindev_error[i];

			if (ret)
				return ret;
		}
	}

	return 0;
}

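/*
 * Return any device in @fs_devs other than @device that is not missing and
 * has an open bdev, or NULL if there is none.
 */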
static struct btrfs_device * btrfs_find_next_active_device(
		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
	struct btrfs_device *next_device;

	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
		if (next_device != device &&
		    !next_device->missing && next_device->bdev)
			return next_device;
	}

	return NULL;
}

/*
 * Helper function to check if the given device is part of s_bdev / latest_bdev
 * and replace it with the provided or the next active device. In the context
 * where this function is called, there should always be another device (or
 * this_dev) which is active.
 */
void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
		struct btrfs_device *device, struct btrfs_device *this_dev)
{
	struct btrfs_device *next_device;

	if (this_dev)
		next_device = this_dev;
	else
		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
								device);
	ASSERT(next_device);

	if (fs_info->sb->s_bdev &&
			(fs_info->sb->s_bdev == device->bdev))
		fs_info->sb->s_bdev = next_device->bdev;

	if (fs_info->fs_devices->latest_bdev == device->bdev)
		fs_info->fs_devices->latest_bdev = next_device->bdev;
}

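/*
 * Remove the device given by @devid or @device_path from a mounted
 * filesystem: verify the remaining devices still satisfy the RAID profile
 * constraints, shrink the device to zero, delete its dev item, unlink it
 * from the device lists, update the super block device count and wipe its
 * btrfs super blocks.
 */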
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
		u64 devid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;
	u64 num_devices;
	int ret = 0;

	mutex_lock(&uuid_mutex);

	num_devices = fs_info->fs_devices->num_devices;
	btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		WARN_ON(num_devices < 1);
		num_devices--;
	}
	btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);

	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
	if (ret)
		goto out;

	ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
					   &device);
	if (ret)
		goto out;

	if (device->is_tgtdev_for_dev_replace) {
		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
		goto out;
	}

	if (device->writeable && fs_info->fs_devices->rw_devices == 1) {
		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
		goto out;
	}

	if (device->writeable) {
		mutex_lock(&fs_info->chunk_mutex);
		list_del_init(&device->dev_alloc_list);
		device->fs_devices->rw_devices--;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	mutex_unlock(&uuid_mutex);
	ret = btrfs_shrink_device(device, 0);
	mutex_lock(&uuid_mutex);
	if (ret)
		goto error_undo;

	/*
	 * TODO: the superblock still includes this device in its num_devices
	 * counter although write_all_supers() is not locked out. This
	 * could give a filesystem state which requires a degraded mount.
	 */
	ret = btrfs_rm_dev_item(fs_info, device);
	if (ret)
		goto error_undo;

	device->in_fs_metadata = 0;
	btrfs_scrub_cancel_dev(fs_info, device);

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
	 * the device supers. Whoever is writing all supers, should
	 * lock the device list mutex before getting the number of
	 * devices in the super block (super_copy). Conversely,
	 * whoever updates the number of devices in the super block
	 * (super_copy) should hold the device list mutex.
	 */

	cur_devices = device->fs_devices;
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	device->fs_devices->num_devices--;
	device->fs_devices->total_devices--;

	if (device->missing)
		device->fs_devices->missing_devices--;

	btrfs_assign_next_active_device(fs_info, device, NULL);

	if (device->bdev) {
		device->fs_devices->open_devices--;
		/* remove sysfs entry */
		btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
	}

	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	/*
	 * at this point, the device is zero sized and detached from
	 * the devices list. All that's left is to zero out the old
	 * supers and free the device.
	 */
	if (device->writeable)
		btrfs_scratch_superblocks(device->bdev, device->name->str);

	btrfs_close_bdev(device);
	call_rcu(&device->rcu, free_device);

	if (cur_devices->open_devices == 0) {
		struct btrfs_fs_devices *fs_devices;
		fs_devices = fs_info->fs_devices;
		while (fs_devices) {
			if (fs_devices->seed == cur_devices) {
				fs_devices->seed = cur_devices->seed;
				break;
			}
			fs_devices = fs_devices->seed;
		}
		cur_devices->seed = NULL;
		__btrfs_close_devices(cur_devices);
		free_fs_devices(cur_devices);
	}

out:
	mutex_unlock(&uuid_mutex);
	return ret;

error_undo:
	if (device->writeable) {
		mutex_lock(&fs_info->chunk_mutex);
		list_add(&device->dev_alloc_list,
			 &fs_info->fs_devices->alloc_list);
		device->fs_devices->rw_devices++;
		mutex_unlock(&fs_info->chunk_mutex);
	}
	goto out;
}

void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
					struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices;

	WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));

	/*
	 * in case of fs with no seed, srcdev->fs_devices will point
	 * to fs_devices of fs_info. However when the dev being replaced is
	 * a seed dev it will point to the seed's local fs_devices. In short
	 * srcdev will have its correct fs_devices in both the cases.
	 */
	fs_devices = srcdev->fs_devices;

	list_del_rcu(&srcdev->dev_list);
	list_del(&srcdev->dev_alloc_list);
	fs_devices->num_devices--;
	if (srcdev->missing)
		fs_devices->missing_devices--;

	if (srcdev->writeable)
		fs_devices->rw_devices--;

	if (srcdev->bdev)
		fs_devices->open_devices--;
}

void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
				      struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;

	if (srcdev->writeable) {
		/* zero out the old super if it is writable */
		btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
	}

	btrfs_close_bdev(srcdev);
	call_rcu(&srcdev->rcu, free_device);

	/* if there are no devices left, delete the fs_devices as well */
	if (!fs_devices->num_devices) {
		struct btrfs_fs_devices *tmp_fs_devices;

		/*
		 * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target is added to the sprout FS, so there will be no more
		 * device left under the seed FS.
		 */
		ASSERT(fs_devices->seeding);

		tmp_fs_devices = fs_info->fs_devices;
		while (tmp_fs_devices) {
			if (tmp_fs_devices->seed == fs_devices) {
				tmp_fs_devices->seed = fs_devices->seed;
				break;
			}
			tmp_fs_devices = tmp_fs_devices->seed;
		}
		fs_devices->seed = NULL;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
}

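/*
 * Tear down the dev-replace target device: remove its sysfs link, unlink it
 * from the device list, adjust the counters, wipe its super blocks and free
 * it.
 */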
In case of a seed device being replaced, the replace 2037 * target is added to the sprout FS, so there will be no 2038 * devices left under the seed FS. 2039 */ 2040 ASSERT(fs_devices->seeding); 2041 2042 tmp_fs_devices = fs_info->fs_devices; 2043 while (tmp_fs_devices) { 2044 if (tmp_fs_devices->seed == fs_devices) { 2045 tmp_fs_devices->seed = fs_devices->seed; 2046 break; 2047 } 2048 tmp_fs_devices = tmp_fs_devices->seed; 2049 } 2050 fs_devices->seed = NULL; 2051 __btrfs_close_devices(fs_devices); 2052 free_fs_devices(fs_devices); 2053 } 2054 } 2055 2056 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 2057 struct btrfs_device *tgtdev) 2058 { 2059 mutex_lock(&uuid_mutex); 2060 WARN_ON(!tgtdev); 2061 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2062 2063 btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); 2064 2065 if (tgtdev->bdev) 2066 fs_info->fs_devices->open_devices--; 2067 2068 fs_info->fs_devices->num_devices--; 2069 2070 btrfs_assign_next_active_device(fs_info, tgtdev, NULL); 2071 2072 list_del_rcu(&tgtdev->dev_list); 2073 2074 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2075 mutex_unlock(&uuid_mutex); 2076 2077 /* 2078 * The update_dev_time() within btrfs_scratch_superblocks() 2079 * may lead to a call to btrfs_show_devname() which will try 2080 * to hold device_list_mutex. And here this device 2081 * is already out of the device list, so we don't have to hold 2082 * the device_list_mutex lock. 2083 */ 2084 btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); 2085 2086 btrfs_close_bdev(tgtdev); 2087 call_rcu(&tgtdev->rcu, free_device); 2088 } 2089 2090 static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, 2091 const char *device_path, 2092 struct btrfs_device **device) 2093 { 2094 int ret = 0; 2095 struct btrfs_super_block *disk_super; 2096 u64 devid; 2097 u8 *dev_uuid; 2098 struct block_device *bdev; 2099 struct buffer_head *bh; 2100 2101 *device = NULL; 2102 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 2103 fs_info->bdev_holder, 0, &bdev, &bh); 2104 if (ret) 2105 return ret; 2106 disk_super = (struct btrfs_super_block *)bh->b_data; 2107 devid = btrfs_stack_device_id(&disk_super->dev_item); 2108 dev_uuid = disk_super->dev_item.uuid; 2109 *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); 2110 brelse(bh); 2111 if (!*device) 2112 ret = -ENOENT; 2113 blkdev_put(bdev, FMODE_READ); 2114 return ret; 2115 } 2116 2117 int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, 2118 const char *device_path, 2119 struct btrfs_device **device) 2120 { 2121 *device = NULL; 2122 if (strcmp(device_path, "missing") == 0) { 2123 struct list_head *devices; 2124 struct btrfs_device *tmp; 2125 2126 devices = &fs_info->fs_devices->devices; 2127 /* 2128 * It is safe to read the devices since the volume_mutex 2129 * is held by the caller. 2130 */ 2131 list_for_each_entry(tmp, devices, dev_list) { 2132 if (tmp->in_fs_metadata && !tmp->bdev) { 2133 *device = tmp; 2134 break; 2135 } 2136 } 2137 2138 if (!*device) 2139 return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2140 2141 return 0; 2142 } else { 2143 return btrfs_find_device_by_path(fs_info, device_path, device); 2144 } 2145 } 2146 2147 /* 2148 * Lookup a device given by device id, or the path if the id is 0.
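* A non-zero devid takes precedence and the path is ignored; with a devid of 0, the special path "missing" selects the first device that is recorded in the metadata but has no backing bdev.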
2149 */ 2150 int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, 2151 const char *devpath, 2152 struct btrfs_device **device) 2153 { 2154 int ret; 2155 2156 if (devid) { 2157 ret = 0; 2158 *device = btrfs_find_device(fs_info, devid, NULL, NULL); 2159 if (!*device) 2160 ret = -ENOENT; 2161 } else { 2162 if (!devpath || !devpath[0]) 2163 return -EINVAL; 2164 2165 ret = btrfs_find_device_missing_or_by_path(fs_info, devpath, 2166 device); 2167 } 2168 return ret; 2169 } 2170 2171 /* 2172 * does all the dirty work required for changing file system's UUID. 2173 */ 2174 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) 2175 { 2176 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2177 struct btrfs_fs_devices *old_devices; 2178 struct btrfs_fs_devices *seed_devices; 2179 struct btrfs_super_block *disk_super = fs_info->super_copy; 2180 struct btrfs_device *device; 2181 u64 super_flags; 2182 2183 BUG_ON(!mutex_is_locked(&uuid_mutex)); 2184 if (!fs_devices->seeding) 2185 return -EINVAL; 2186 2187 seed_devices = alloc_fs_devices(NULL); 2188 if (IS_ERR(seed_devices)) 2189 return PTR_ERR(seed_devices); 2190 2191 old_devices = clone_fs_devices(fs_devices); 2192 if (IS_ERR(old_devices)) { 2193 kfree(seed_devices); 2194 return PTR_ERR(old_devices); 2195 } 2196 2197 list_add(&old_devices->list, &fs_uuids); 2198 2199 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2200 seed_devices->opened = 1; 2201 INIT_LIST_HEAD(&seed_devices->devices); 2202 INIT_LIST_HEAD(&seed_devices->alloc_list); 2203 mutex_init(&seed_devices->device_list_mutex); 2204 2205 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2206 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 2207 synchronize_rcu); 2208 list_for_each_entry(device, &seed_devices->devices, dev_list) 2209 device->fs_devices = seed_devices; 2210 2211 mutex_lock(&fs_info->chunk_mutex); 2212 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 2213 mutex_unlock(&fs_info->chunk_mutex); 2214 2215 fs_devices->seeding = 0; 2216 fs_devices->num_devices = 0; 2217 fs_devices->open_devices = 0; 2218 fs_devices->missing_devices = 0; 2219 fs_devices->rotating = 0; 2220 fs_devices->seed = seed_devices; 2221 2222 generate_random_uuid(fs_devices->fsid); 2223 memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2224 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2225 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2226 2227 super_flags = btrfs_super_flags(disk_super) & 2228 ~BTRFS_SUPER_FLAG_SEEDING; 2229 btrfs_set_super_flags(disk_super, super_flags); 2230 2231 return 0; 2232 } 2233 2234 /* 2235 * Store the expected generation for seed devices in device items. 
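* Walk all DEV_ITEMs in the chunk tree and, for devices that still belong to a seed filesystem, record device->generation in the on-disk item.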
2236 */ 2237 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 2238 struct btrfs_fs_info *fs_info) 2239 { 2240 struct btrfs_root *root = fs_info->chunk_root; 2241 struct btrfs_path *path; 2242 struct extent_buffer *leaf; 2243 struct btrfs_dev_item *dev_item; 2244 struct btrfs_device *device; 2245 struct btrfs_key key; 2246 u8 fs_uuid[BTRFS_FSID_SIZE]; 2247 u8 dev_uuid[BTRFS_UUID_SIZE]; 2248 u64 devid; 2249 int ret; 2250 2251 path = btrfs_alloc_path(); 2252 if (!path) 2253 return -ENOMEM; 2254 2255 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2256 key.offset = 0; 2257 key.type = BTRFS_DEV_ITEM_KEY; 2258 2259 while (1) { 2260 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2261 if (ret < 0) 2262 goto error; 2263 2264 leaf = path->nodes[0]; 2265 next_slot: 2266 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2267 ret = btrfs_next_leaf(root, path); 2268 if (ret > 0) 2269 break; 2270 if (ret < 0) 2271 goto error; 2272 leaf = path->nodes[0]; 2273 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2274 btrfs_release_path(path); 2275 continue; 2276 } 2277 2278 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2279 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2280 key.type != BTRFS_DEV_ITEM_KEY) 2281 break; 2282 2283 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2284 struct btrfs_dev_item); 2285 devid = btrfs_device_id(leaf, dev_item); 2286 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2287 BTRFS_UUID_SIZE); 2288 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2289 BTRFS_FSID_SIZE); 2290 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 2291 BUG_ON(!device); /* Logic error */ 2292 2293 if (device->fs_devices->seeding) { 2294 btrfs_set_device_generation(leaf, dev_item, 2295 device->generation); 2296 btrfs_mark_buffer_dirty(leaf); 2297 } 2298 2299 path->slots[0]++; 2300 goto next_slot; 2301 } 2302 ret = 0; 2303 error: 2304 btrfs_free_path(path); 2305 return ret; 2306 } 2307 2308 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2309 { 2310 struct btrfs_root *root = fs_info->dev_root; 2311 struct request_queue *q; 2312 struct btrfs_trans_handle *trans; 2313 struct btrfs_device *device; 2314 struct block_device *bdev; 2315 struct list_head *devices; 2316 struct super_block *sb = fs_info->sb; 2317 struct rcu_string *name; 2318 u64 tmp; 2319 int seeding_dev = 0; 2320 int ret = 0; 2321 bool unlocked = false; 2322 2323 if (sb_rdonly(sb) && !fs_info->fs_devices->seeding) 2324 return -EROFS; 2325 2326 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2327 fs_info->bdev_holder); 2328 if (IS_ERR(bdev)) 2329 return PTR_ERR(bdev); 2330 2331 if (fs_info->fs_devices->seeding) { 2332 seeding_dev = 1; 2333 down_write(&sb->s_umount); 2334 mutex_lock(&uuid_mutex); 2335 } 2336 2337 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2338 2339 devices = &fs_info->fs_devices->devices; 2340 2341 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2342 list_for_each_entry(device, devices, dev_list) { 2343 if (device->bdev == bdev) { 2344 ret = -EEXIST; 2345 mutex_unlock( 2346 &fs_info->fs_devices->device_list_mutex); 2347 goto error; 2348 } 2349 } 2350 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2351 2352 device = btrfs_alloc_device(fs_info, NULL, NULL); 2353 if (IS_ERR(device)) { 2354 /* we can safely leave the fs_devices entry around */ 2355 ret = PTR_ERR(device); 2356 goto error; 2357 } 2358 2359 name = rcu_string_strdup(device_path, GFP_KERNEL); 2360 if (!name) { 2361 
bio_put(device->flush_bio); 2362 kfree(device); 2363 ret = -ENOMEM; 2364 goto error; 2365 } 2366 rcu_assign_pointer(device->name, name); 2367 2368 trans = btrfs_start_transaction(root, 0); 2369 if (IS_ERR(trans)) { 2370 rcu_string_free(device->name); 2371 bio_put(device->flush_bio); 2372 kfree(device); 2373 ret = PTR_ERR(trans); 2374 goto error; 2375 } 2376 2377 q = bdev_get_queue(bdev); 2378 if (blk_queue_discard(q)) 2379 device->can_discard = 1; 2380 device->writeable = 1; 2381 device->generation = trans->transid; 2382 device->io_width = fs_info->sectorsize; 2383 device->io_align = fs_info->sectorsize; 2384 device->sector_size = fs_info->sectorsize; 2385 device->total_bytes = round_down(i_size_read(bdev->bd_inode), 2386 fs_info->sectorsize); 2387 device->disk_total_bytes = device->total_bytes; 2388 device->commit_total_bytes = device->total_bytes; 2389 device->fs_info = fs_info; 2390 device->bdev = bdev; 2391 device->in_fs_metadata = 1; 2392 device->is_tgtdev_for_dev_replace = 0; 2393 device->mode = FMODE_EXCL; 2394 device->dev_stats_valid = 1; 2395 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2396 2397 if (seeding_dev) { 2398 sb->s_flags &= ~SB_RDONLY; 2399 ret = btrfs_prepare_sprout(fs_info); 2400 if (ret) { 2401 btrfs_abort_transaction(trans, ret); 2402 goto error_trans; 2403 } 2404 } 2405 2406 device->fs_devices = fs_info->fs_devices; 2407 2408 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2409 mutex_lock(&fs_info->chunk_mutex); 2410 list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices); 2411 list_add(&device->dev_alloc_list, 2412 &fs_info->fs_devices->alloc_list); 2413 fs_info->fs_devices->num_devices++; 2414 fs_info->fs_devices->open_devices++; 2415 fs_info->fs_devices->rw_devices++; 2416 fs_info->fs_devices->total_devices++; 2417 fs_info->fs_devices->total_rw_bytes += device->total_bytes; 2418 2419 atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 2420 2421 if (!blk_queue_nonrot(q)) 2422 fs_info->fs_devices->rotating = 1; 2423 2424 tmp = btrfs_super_total_bytes(fs_info->super_copy); 2425 btrfs_set_super_total_bytes(fs_info->super_copy, 2426 round_down(tmp + device->total_bytes, fs_info->sectorsize)); 2427 2428 tmp = btrfs_super_num_devices(fs_info->super_copy); 2429 btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1); 2430 2431 /* add sysfs device entry */ 2432 btrfs_sysfs_add_device_link(fs_info->fs_devices, device); 2433 2434 /* 2435 * we've got more storage, clear any full flags on the space 2436 * infos 2437 */ 2438 btrfs_clear_space_info_full(fs_info); 2439 2440 mutex_unlock(&fs_info->chunk_mutex); 2441 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2442 2443 if (seeding_dev) { 2444 mutex_lock(&fs_info->chunk_mutex); 2445 ret = init_first_rw_device(trans, fs_info); 2446 mutex_unlock(&fs_info->chunk_mutex); 2447 if (ret) { 2448 btrfs_abort_transaction(trans, ret); 2449 goto error_sysfs; 2450 } 2451 } 2452 2453 ret = btrfs_add_device(trans, fs_info, device); 2454 if (ret) { 2455 btrfs_abort_transaction(trans, ret); 2456 goto error_sysfs; 2457 } 2458 2459 if (seeding_dev) { 2460 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; 2461 2462 ret = btrfs_finish_sprout(trans, fs_info); 2463 if (ret) { 2464 btrfs_abort_transaction(trans, ret); 2465 goto error_sysfs; 2466 } 2467 2468 /* Sprouting would change fsid of the mounted root, 2469 * so rename the fsid on the sysfs 2470 */ 2471 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", 2472 fs_info->fsid); 2473 if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf)) 2474 
btrfs_warn(fs_info, 2475 "sysfs: failed to create fsid for sprout"); 2476 } 2477 2478 ret = btrfs_commit_transaction(trans); 2479 2480 if (seeding_dev) { 2481 mutex_unlock(&uuid_mutex); 2482 up_write(&sb->s_umount); 2483 unlocked = true; 2484 2485 if (ret) /* transaction commit */ 2486 return ret; 2487 2488 ret = btrfs_relocate_sys_chunks(fs_info); 2489 if (ret < 0) 2490 btrfs_handle_fs_error(fs_info, ret, 2491 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2492 trans = btrfs_attach_transaction(root); 2493 if (IS_ERR(trans)) { 2494 if (PTR_ERR(trans) == -ENOENT) 2495 return 0; 2496 ret = PTR_ERR(trans); 2497 trans = NULL; 2498 goto error_sysfs; 2499 } 2500 ret = btrfs_commit_transaction(trans); 2501 } 2502 2503 /* Update ctime/mtime for libblkid */ 2504 update_dev_time(device_path); 2505 return ret; 2506 2507 error_sysfs: 2508 btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); 2509 error_trans: 2510 if (seeding_dev) 2511 sb->s_flags |= SB_RDONLY; 2512 if (trans) 2513 btrfs_end_transaction(trans); 2514 rcu_string_free(device->name); 2515 bio_put(device->flush_bio); 2516 kfree(device); 2517 error: 2518 blkdev_put(bdev, FMODE_EXCL); 2519 if (seeding_dev && !unlocked) { 2520 mutex_unlock(&uuid_mutex); 2521 up_write(&sb->s_umount); 2522 } 2523 return ret; 2524 } 2525 2526 int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 2527 const char *device_path, 2528 struct btrfs_device *srcdev, 2529 struct btrfs_device **device_out) 2530 { 2531 struct request_queue *q; 2532 struct btrfs_device *device; 2533 struct block_device *bdev; 2534 struct list_head *devices; 2535 struct rcu_string *name; 2536 u64 devid = BTRFS_DEV_REPLACE_DEVID; 2537 int ret = 0; 2538 2539 *device_out = NULL; 2540 if (fs_info->fs_devices->seeding) { 2541 btrfs_err(fs_info, "the filesystem is a seed filesystem!"); 2542 return -EINVAL; 2543 } 2544 2545 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2546 fs_info->bdev_holder); 2547 if (IS_ERR(bdev)) { 2548 btrfs_err(fs_info, "target device %s is invalid!", device_path); 2549 return PTR_ERR(bdev); 2550 } 2551 2552 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2553 2554 devices = &fs_info->fs_devices->devices; 2555 list_for_each_entry(device, devices, dev_list) { 2556 if (device->bdev == bdev) { 2557 btrfs_err(fs_info, 2558 "target device is in the filesystem!"); 2559 ret = -EEXIST; 2560 goto error; 2561 } 2562 } 2563 2564 2565 if (i_size_read(bdev->bd_inode) < 2566 btrfs_device_get_total_bytes(srcdev)) { 2567 btrfs_err(fs_info, 2568 "target device is smaller than source device!"); 2569 ret = -EINVAL; 2570 goto error; 2571 } 2572 2573 2574 device = btrfs_alloc_device(NULL, &devid, NULL); 2575 if (IS_ERR(device)) { 2576 ret = PTR_ERR(device); 2577 goto error; 2578 } 2579 2580 name = rcu_string_strdup(device_path, GFP_KERNEL); 2581 if (!name) { 2582 bio_put(device->flush_bio); 2583 kfree(device); 2584 ret = -ENOMEM; 2585 goto error; 2586 } 2587 rcu_assign_pointer(device->name, name); 2588 2589 q = bdev_get_queue(bdev); 2590 if (blk_queue_discard(q)) 2591 device->can_discard = 1; 2592 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2593 device->writeable = 1; 2594 device->generation = 0; 2595 device->io_width = fs_info->sectorsize; 2596 device->io_align = fs_info->sectorsize; 2597 device->sector_size = fs_info->sectorsize; 2598 device->total_bytes = btrfs_device_get_total_bytes(srcdev); 2599 device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); 2600 
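/* the target inherits the source device's size fields; it was verified above to be at least as large as the source */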
device->bytes_used = btrfs_device_get_bytes_used(srcdev); 2601 ASSERT(list_empty(&srcdev->resized_list)); 2602 device->commit_total_bytes = srcdev->commit_total_bytes; 2603 device->commit_bytes_used = device->bytes_used; 2604 device->fs_info = fs_info; 2605 device->bdev = bdev; 2606 device->in_fs_metadata = 1; 2607 device->is_tgtdev_for_dev_replace = 1; 2608 device->mode = FMODE_EXCL; 2609 device->dev_stats_valid = 1; 2610 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2611 device->fs_devices = fs_info->fs_devices; 2612 list_add(&device->dev_list, &fs_info->fs_devices->devices); 2613 fs_info->fs_devices->num_devices++; 2614 fs_info->fs_devices->open_devices++; 2615 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2616 2617 *device_out = device; 2618 return ret; 2619 2620 error: 2621 blkdev_put(bdev, FMODE_EXCL); 2622 return ret; 2623 } 2624 2625 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 2626 struct btrfs_device *tgtdev) 2627 { 2628 u32 sectorsize = fs_info->sectorsize; 2629 2630 WARN_ON(fs_info->fs_devices->rw_devices == 0); 2631 tgtdev->io_width = sectorsize; 2632 tgtdev->io_align = sectorsize; 2633 tgtdev->sector_size = sectorsize; 2634 tgtdev->fs_info = fs_info; 2635 tgtdev->in_fs_metadata = 1; 2636 } 2637 2638 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2639 struct btrfs_device *device) 2640 { 2641 int ret; 2642 struct btrfs_path *path; 2643 struct btrfs_root *root = device->fs_info->chunk_root; 2644 struct btrfs_dev_item *dev_item; 2645 struct extent_buffer *leaf; 2646 struct btrfs_key key; 2647 2648 path = btrfs_alloc_path(); 2649 if (!path) 2650 return -ENOMEM; 2651 2652 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2653 key.type = BTRFS_DEV_ITEM_KEY; 2654 key.offset = device->devid; 2655 2656 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2657 if (ret < 0) 2658 goto out; 2659 2660 if (ret > 0) { 2661 ret = -ENOENT; 2662 goto out; 2663 } 2664 2665 leaf = path->nodes[0]; 2666 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2667 2668 btrfs_set_device_id(leaf, dev_item, device->devid); 2669 btrfs_set_device_type(leaf, dev_item, device->type); 2670 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2671 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2672 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2673 btrfs_set_device_total_bytes(leaf, dev_item, 2674 btrfs_device_get_disk_total_bytes(device)); 2675 btrfs_set_device_bytes_used(leaf, dev_item, 2676 btrfs_device_get_bytes_used(device)); 2677 btrfs_mark_buffer_dirty(leaf); 2678 2679 out: 2680 btrfs_free_path(path); 2681 return ret; 2682 } 2683 2684 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2685 struct btrfs_device *device, u64 new_size) 2686 { 2687 struct btrfs_fs_info *fs_info = device->fs_info; 2688 struct btrfs_super_block *super_copy = fs_info->super_copy; 2689 struct btrfs_fs_devices *fs_devices; 2690 u64 old_total; 2691 u64 diff; 2692 2693 if (!device->writeable) 2694 return -EACCES; 2695 2696 new_size = round_down(new_size, fs_info->sectorsize); 2697 2698 mutex_lock(&fs_info->chunk_mutex); 2699 old_total = btrfs_super_total_bytes(super_copy); 2700 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 2701 2702 if (new_size <= device->total_bytes || 2703 device->is_tgtdev_for_dev_replace) { 2704 mutex_unlock(&fs_info->chunk_mutex); 2705 return -EINVAL; 2706 } 2707 2708 fs_devices = fs_info->fs_devices; 2709 2710 btrfs_set_super_total_bytes(super_copy, 
2711 round_down(old_total + diff, fs_info->sectorsize)); 2712 device->fs_devices->total_rw_bytes += diff; 2713 2714 btrfs_device_set_total_bytes(device, new_size); 2715 btrfs_device_set_disk_total_bytes(device, new_size); 2716 btrfs_clear_space_info_full(device->fs_info); 2717 if (list_empty(&device->resized_list)) 2718 list_add_tail(&device->resized_list, 2719 &fs_devices->resized_devices); 2720 mutex_unlock(&fs_info->chunk_mutex); 2721 2722 return btrfs_update_device(trans, device); 2723 } 2724 2725 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 2726 struct btrfs_fs_info *fs_info, u64 chunk_offset) 2727 { 2728 struct btrfs_root *root = fs_info->chunk_root; 2729 int ret; 2730 struct btrfs_path *path; 2731 struct btrfs_key key; 2732 2733 path = btrfs_alloc_path(); 2734 if (!path) 2735 return -ENOMEM; 2736 2737 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2738 key.offset = chunk_offset; 2739 key.type = BTRFS_CHUNK_ITEM_KEY; 2740 2741 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2742 if (ret < 0) 2743 goto out; 2744 else if (ret > 0) { /* Logic error or corruption */ 2745 btrfs_handle_fs_error(fs_info, -ENOENT, 2746 "Failed lookup while freeing chunk."); 2747 ret = -ENOENT; 2748 goto out; 2749 } 2750 2751 ret = btrfs_del_item(trans, root, path); 2752 if (ret < 0) 2753 btrfs_handle_fs_error(fs_info, ret, 2754 "Failed to delete chunk item."); 2755 out: 2756 btrfs_free_path(path); 2757 return ret; 2758 } 2759 2760 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2761 { 2762 struct btrfs_super_block *super_copy = fs_info->super_copy; 2763 struct btrfs_disk_key *disk_key; 2764 struct btrfs_chunk *chunk; 2765 u8 *ptr; 2766 int ret = 0; 2767 u32 num_stripes; 2768 u32 array_size; 2769 u32 len = 0; 2770 u32 cur; 2771 struct btrfs_key key; 2772 2773 mutex_lock(&fs_info->chunk_mutex); 2774 array_size = btrfs_super_sys_array_size(super_copy); 2775 2776 ptr = super_copy->sys_chunk_array; 2777 cur = 0; 2778 2779 while (cur < array_size) { 2780 disk_key = (struct btrfs_disk_key *)ptr; 2781 btrfs_disk_key_to_cpu(&key, disk_key); 2782 2783 len = sizeof(*disk_key); 2784 2785 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 2786 chunk = (struct btrfs_chunk *)(ptr + len); 2787 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 2788 len += btrfs_chunk_item_size(num_stripes); 2789 } else { 2790 ret = -EIO; 2791 break; 2792 } 2793 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 2794 key.offset == chunk_offset) { 2795 memmove(ptr, ptr + len, array_size - (cur + len)); 2796 array_size -= len; 2797 btrfs_set_super_sys_array_size(super_copy, array_size); 2798 } else { 2799 ptr += len; 2800 cur += len; 2801 } 2802 } 2803 mutex_unlock(&fs_info->chunk_mutex); 2804 return ret; 2805 } 2806 2807 static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, 2808 u64 logical, u64 length) 2809 { 2810 struct extent_map_tree *em_tree; 2811 struct extent_map *em; 2812 2813 em_tree = &fs_info->mapping_tree.map_tree; 2814 read_lock(&em_tree->lock); 2815 em = lookup_extent_mapping(em_tree, logical, length); 2816 read_unlock(&em_tree->lock); 2817 2818 if (!em) { 2819 btrfs_crit(fs_info, "unable to find logical %llu length %llu", 2820 logical, length); 2821 return ERR_PTR(-EINVAL); 2822 } 2823 2824 if (em->start > logical || em->start + em->len < logical) { 2825 btrfs_crit(fs_info, 2826 "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 2827 logical, length, em->start, em->start + em->len); 2828 free_extent_map(em); 2829 return ERR_PTR(-EINVAL); 2830 } 2831 
2832 /* callers are responsible for dropping em's ref. */ 2833 return em; 2834 } 2835 2836 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, 2837 struct btrfs_fs_info *fs_info, u64 chunk_offset) 2838 { 2839 struct extent_map *em; 2840 struct map_lookup *map; 2841 u64 dev_extent_len = 0; 2842 int i, ret = 0; 2843 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2844 2845 em = get_chunk_map(fs_info, chunk_offset, 1); 2846 if (IS_ERR(em)) { 2847 /* 2848 * This is a logic error, but we don't want to just rely on the 2849 * user having built with ASSERT enabled, so if ASSERT doesn't 2850 * do anything we still error out. 2851 */ 2852 ASSERT(0); 2853 return PTR_ERR(em); 2854 } 2855 map = em->map_lookup; 2856 mutex_lock(&fs_info->chunk_mutex); 2857 check_system_chunk(trans, fs_info, map->type); 2858 mutex_unlock(&fs_info->chunk_mutex); 2859 2860 /* 2861 * Take the device list mutex to prevent races with the final phase of 2862 * a device replace operation that replaces the device object associated 2863 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). 2864 */ 2865 mutex_lock(&fs_devices->device_list_mutex); 2866 for (i = 0; i < map->num_stripes; i++) { 2867 struct btrfs_device *device = map->stripes[i].dev; 2868 ret = btrfs_free_dev_extent(trans, device, 2869 map->stripes[i].physical, 2870 &dev_extent_len); 2871 if (ret) { 2872 mutex_unlock(&fs_devices->device_list_mutex); 2873 btrfs_abort_transaction(trans, ret); 2874 goto out; 2875 } 2876 2877 if (device->bytes_used > 0) { 2878 mutex_lock(&fs_info->chunk_mutex); 2879 btrfs_device_set_bytes_used(device, 2880 device->bytes_used - dev_extent_len); 2881 atomic64_add(dev_extent_len, &fs_info->free_chunk_space); 2882 btrfs_clear_space_info_full(fs_info); 2883 mutex_unlock(&fs_info->chunk_mutex); 2884 } 2885 2886 if (map->stripes[i].dev) { 2887 ret = btrfs_update_device(trans, map->stripes[i].dev); 2888 if (ret) { 2889 mutex_unlock(&fs_devices->device_list_mutex); 2890 btrfs_abort_transaction(trans, ret); 2891 goto out; 2892 } 2893 } 2894 } 2895 mutex_unlock(&fs_devices->device_list_mutex); 2896 2897 ret = btrfs_free_chunk(trans, fs_info, chunk_offset); 2898 if (ret) { 2899 btrfs_abort_transaction(trans, ret); 2900 goto out; 2901 } 2902 2903 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 2904 2905 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2906 ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 2907 if (ret) { 2908 btrfs_abort_transaction(trans, ret); 2909 goto out; 2910 } 2911 } 2912 2913 ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em); 2914 if (ret) { 2915 btrfs_abort_transaction(trans, ret); 2916 goto out; 2917 } 2918 2919 out: 2920 /* once for us */ 2921 free_extent_map(em); 2922 return ret; 2923 } 2924 2925 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2926 { 2927 struct btrfs_root *root = fs_info->chunk_root; 2928 struct btrfs_trans_handle *trans; 2929 int ret; 2930 2931 /* 2932 * Prevent races with automatic removal of unused block groups. 2933 * After we relocate and before we remove the chunk with offset 2934 * chunk_offset, automatic removal of the block group can kick in, 2935 * resulting in a failure when calling btrfs_remove_chunk() below. 2936 * 2937 * Make sure to acquire this mutex before doing a tree search (dev 2938 * or chunk trees) to find chunks. 
Otherwise the cleaner kthread might 2939 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 2940 * we release the path used to search the chunk/dev tree and before 2941 * the current task acquires this mutex and calls us. 2942 */ 2943 ASSERT(mutex_is_locked(&fs_info->delete_unused_bgs_mutex)); 2944 2945 ret = btrfs_can_relocate(fs_info, chunk_offset); 2946 if (ret) 2947 return -ENOSPC; 2948 2949 /* step one, relocate all the extents inside this chunk */ 2950 btrfs_scrub_pause(fs_info); 2951 ret = btrfs_relocate_block_group(fs_info, chunk_offset); 2952 btrfs_scrub_continue(fs_info); 2953 if (ret) 2954 return ret; 2955 2956 trans = btrfs_start_trans_remove_block_group(root->fs_info, 2957 chunk_offset); 2958 if (IS_ERR(trans)) { 2959 ret = PTR_ERR(trans); 2960 btrfs_handle_fs_error(root->fs_info, ret, NULL); 2961 return ret; 2962 } 2963 2964 /* 2965 * step two, delete the device extents and the 2966 * chunk tree entries 2967 */ 2968 ret = btrfs_remove_chunk(trans, fs_info, chunk_offset); 2969 btrfs_end_transaction(trans); 2970 return ret; 2971 } 2972 2973 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 2974 { 2975 struct btrfs_root *chunk_root = fs_info->chunk_root; 2976 struct btrfs_path *path; 2977 struct extent_buffer *leaf; 2978 struct btrfs_chunk *chunk; 2979 struct btrfs_key key; 2980 struct btrfs_key found_key; 2981 u64 chunk_type; 2982 bool retried = false; 2983 int failed = 0; 2984 int ret; 2985 2986 path = btrfs_alloc_path(); 2987 if (!path) 2988 return -ENOMEM; 2989 2990 again: 2991 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2992 key.offset = (u64)-1; 2993 key.type = BTRFS_CHUNK_ITEM_KEY; 2994 2995 while (1) { 2996 mutex_lock(&fs_info->delete_unused_bgs_mutex); 2997 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2998 if (ret < 0) { 2999 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3000 goto error; 3001 } 3002 BUG_ON(ret == 0); /* Corruption */ 3003 3004 ret = btrfs_previous_item(chunk_root, path, key.objectid, 3005 key.type); 3006 if (ret) 3007 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3008 if (ret < 0) 3009 goto error; 3010 if (ret > 0) 3011 break; 3012 3013 leaf = path->nodes[0]; 3014 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3015 3016 chunk = btrfs_item_ptr(leaf, path->slots[0], 3017 struct btrfs_chunk); 3018 chunk_type = btrfs_chunk_type(leaf, chunk); 3019 btrfs_release_path(path); 3020 3021 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 3022 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3023 if (ret == -ENOSPC) 3024 failed++; 3025 else 3026 BUG_ON(ret); 3027 } 3028 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3029 3030 if (found_key.offset == 0) 3031 break; 3032 key.offset = found_key.offset - 1; 3033 } 3034 ret = 0; 3035 if (failed && !retried) { 3036 failed = 0; 3037 retried = true; 3038 goto again; 3039 } else if (WARN_ON(failed && retried)) { 3040 ret = -ENOSPC; 3041 } 3042 error: 3043 btrfs_free_path(path); 3044 return ret; 3045 } 3046 3047 static int insert_balance_item(struct btrfs_fs_info *fs_info, 3048 struct btrfs_balance_control *bctl) 3049 { 3050 struct btrfs_root *root = fs_info->tree_root; 3051 struct btrfs_trans_handle *trans; 3052 struct btrfs_balance_item *item; 3053 struct btrfs_disk_balance_args disk_bargs; 3054 struct btrfs_path *path; 3055 struct extent_buffer *leaf; 3056 struct btrfs_key key; 3057 int ret, err; 3058 3059 path = btrfs_alloc_path(); 3060 if (!path) 3061 return -ENOMEM; 3062 3063 trans = btrfs_start_transaction(root, 0); 3064 if (IS_ERR(trans)) { 
3065 btrfs_free_path(path); 3066 return PTR_ERR(trans); 3067 } 3068 3069 key.objectid = BTRFS_BALANCE_OBJECTID; 3070 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3071 key.offset = 0; 3072 3073 ret = btrfs_insert_empty_item(trans, root, path, &key, 3074 sizeof(*item)); 3075 if (ret) 3076 goto out; 3077 3078 leaf = path->nodes[0]; 3079 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3080 3081 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3082 3083 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 3084 btrfs_set_balance_data(leaf, item, &disk_bargs); 3085 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 3086 btrfs_set_balance_meta(leaf, item, &disk_bargs); 3087 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 3088 btrfs_set_balance_sys(leaf, item, &disk_bargs); 3089 3090 btrfs_set_balance_flags(leaf, item, bctl->flags); 3091 3092 btrfs_mark_buffer_dirty(leaf); 3093 out: 3094 btrfs_free_path(path); 3095 err = btrfs_commit_transaction(trans); 3096 if (err && !ret) 3097 ret = err; 3098 return ret; 3099 } 3100 3101 static int del_balance_item(struct btrfs_fs_info *fs_info) 3102 { 3103 struct btrfs_root *root = fs_info->tree_root; 3104 struct btrfs_trans_handle *trans; 3105 struct btrfs_path *path; 3106 struct btrfs_key key; 3107 int ret, err; 3108 3109 path = btrfs_alloc_path(); 3110 if (!path) 3111 return -ENOMEM; 3112 3113 trans = btrfs_start_transaction(root, 0); 3114 if (IS_ERR(trans)) { 3115 btrfs_free_path(path); 3116 return PTR_ERR(trans); 3117 } 3118 3119 key.objectid = BTRFS_BALANCE_OBJECTID; 3120 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3121 key.offset = 0; 3122 3123 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3124 if (ret < 0) 3125 goto out; 3126 if (ret > 0) { 3127 ret = -ENOENT; 3128 goto out; 3129 } 3130 3131 ret = btrfs_del_item(trans, root, path); 3132 out: 3133 btrfs_free_path(path); 3134 err = btrfs_commit_transaction(trans); 3135 if (err && !ret) 3136 ret = err; 3137 return ret; 3138 } 3139 3140 /* 3141 * This is a heuristic used to reduce the number of chunks balanced on 3142 * resume after balance was interrupted. 3143 */ 3144 static void update_balance_args(struct btrfs_balance_control *bctl) 3145 { 3146 /* 3147 * Turn on soft mode for chunk types that were being converted. 3148 */ 3149 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 3150 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 3151 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 3152 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 3153 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 3154 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 3155 3156 /* 3157 * Turn on the usage filter if it is not already in use. The idea is 3158 * that chunks that we have already balanced should be 3159 * reasonably full. Don't do it for chunks that are being 3160 * converted - that will keep us from relocating unconverted 3161 * (albeit full) chunks.
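* For example, with the 90% threshold set below, a chunk that is 95% full is assumed to have been produced by the interrupted run and is skipped on resume.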
3162 */ 3163 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 3164 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3165 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3166 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 3167 bctl->data.usage = 90; 3168 } 3169 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 3170 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3171 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3172 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 3173 bctl->sys.usage = 90; 3174 } 3175 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 3176 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3177 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3178 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 3179 bctl->meta.usage = 90; 3180 } 3181 } 3182 3183 /* 3184 * Should be called with both balance and volume mutexes held to 3185 * serialize other volume operations (add_dev/rm_dev/resize) with 3186 * restriper. Same goes for unset_balance_control. 3187 */ 3188 static void set_balance_control(struct btrfs_balance_control *bctl) 3189 { 3190 struct btrfs_fs_info *fs_info = bctl->fs_info; 3191 3192 BUG_ON(fs_info->balance_ctl); 3193 3194 spin_lock(&fs_info->balance_lock); 3195 fs_info->balance_ctl = bctl; 3196 spin_unlock(&fs_info->balance_lock); 3197 } 3198 3199 static void unset_balance_control(struct btrfs_fs_info *fs_info) 3200 { 3201 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3202 3203 BUG_ON(!fs_info->balance_ctl); 3204 3205 spin_lock(&fs_info->balance_lock); 3206 fs_info->balance_ctl = NULL; 3207 spin_unlock(&fs_info->balance_lock); 3208 3209 kfree(bctl); 3210 } 3211 3212 /* 3213 * Balance filters. Return 1 if chunk should be filtered out 3214 * (should not be balanced). 3215 */ 3216 static int chunk_profiles_filter(u64 chunk_type, 3217 struct btrfs_balance_args *bargs) 3218 { 3219 chunk_type = chunk_to_extended(chunk_type) & 3220 BTRFS_EXTENDED_PROFILE_MASK; 3221 3222 if (bargs->profiles & chunk_type) 3223 return 0; 3224 3225 return 1; 3226 } 3227 3228 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3229 struct btrfs_balance_args *bargs) 3230 { 3231 struct btrfs_block_group_cache *cache; 3232 u64 chunk_used; 3233 u64 user_thresh_min; 3234 u64 user_thresh_max; 3235 int ret = 1; 3236 3237 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3238 chunk_used = btrfs_block_group_used(&cache->item); 3239 3240 if (bargs->usage_min == 0) 3241 user_thresh_min = 0; 3242 else 3243 user_thresh_min = div_factor_fine(cache->key.offset, 3244 bargs->usage_min); 3245 3246 if (bargs->usage_max == 0) 3247 user_thresh_max = 1; 3248 else if (bargs->usage_max > 100) 3249 user_thresh_max = cache->key.offset; 3250 else 3251 user_thresh_max = div_factor_fine(cache->key.offset, 3252 bargs->usage_max); 3253 3254 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3255 ret = 0; 3256 3257 btrfs_put_block_group(cache); 3258 return ret; 3259 } 3260 3261 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3262 u64 chunk_offset, struct btrfs_balance_args *bargs) 3263 { 3264 struct btrfs_block_group_cache *cache; 3265 u64 chunk_used, user_thresh; 3266 int ret = 1; 3267 3268 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3269 chunk_used = btrfs_block_group_used(&cache->item); 3270 3271 if (bargs->usage_min == 0) 3272 user_thresh = 1; 3273 else if (bargs->usage > 100) 3274 user_thresh = cache->key.offset; 3275 else 3276 user_thresh = div_factor_fine(cache->key.offset, 3277 bargs->usage); 
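/* balance the chunk only when its used space is strictly below the threshold */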
3278 3279 if (chunk_used < user_thresh) 3280 ret = 0; 3281 3282 btrfs_put_block_group(cache); 3283 return ret; 3284 } 3285 3286 static int chunk_devid_filter(struct extent_buffer *leaf, 3287 struct btrfs_chunk *chunk, 3288 struct btrfs_balance_args *bargs) 3289 { 3290 struct btrfs_stripe *stripe; 3291 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3292 int i; 3293 3294 for (i = 0; i < num_stripes; i++) { 3295 stripe = btrfs_stripe_nr(chunk, i); 3296 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3297 return 0; 3298 } 3299 3300 return 1; 3301 } 3302 3303 /* [pstart, pend) */ 3304 static int chunk_drange_filter(struct extent_buffer *leaf, 3305 struct btrfs_chunk *chunk, 3306 struct btrfs_balance_args *bargs) 3307 { 3308 struct btrfs_stripe *stripe; 3309 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3310 u64 stripe_offset; 3311 u64 stripe_length; 3312 int factor; 3313 int i; 3314 3315 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3316 return 0; 3317 3318 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 3319 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { 3320 factor = num_stripes / 2; 3321 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { 3322 factor = num_stripes - 1; 3323 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { 3324 factor = num_stripes - 2; 3325 } else { 3326 factor = num_stripes; 3327 } 3328 3329 for (i = 0; i < num_stripes; i++) { 3330 stripe = btrfs_stripe_nr(chunk, i); 3331 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3332 continue; 3333 3334 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3335 stripe_length = btrfs_chunk_length(leaf, chunk); 3336 stripe_length = div_u64(stripe_length, factor); 3337 3338 if (stripe_offset < bargs->pend && 3339 stripe_offset + stripe_length > bargs->pstart) 3340 return 0; 3341 } 3342 3343 return 1; 3344 } 3345 3346 /* [vstart, vend) */ 3347 static int chunk_vrange_filter(struct extent_buffer *leaf, 3348 struct btrfs_chunk *chunk, 3349 u64 chunk_offset, 3350 struct btrfs_balance_args *bargs) 3351 { 3352 if (chunk_offset < bargs->vend && 3353 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3354 /* at least part of the chunk is inside this vrange */ 3355 return 0; 3356 3357 return 1; 3358 } 3359 3360 static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3361 struct btrfs_chunk *chunk, 3362 struct btrfs_balance_args *bargs) 3363 { 3364 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3365 3366 if (bargs->stripes_min <= num_stripes 3367 && num_stripes <= bargs->stripes_max) 3368 return 0; 3369 3370 return 1; 3371 } 3372 3373 static int chunk_soft_convert_filter(u64 chunk_type, 3374 struct btrfs_balance_args *bargs) 3375 { 3376 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3377 return 0; 3378 3379 chunk_type = chunk_to_extended(chunk_type) & 3380 BTRFS_EXTENDED_PROFILE_MASK; 3381 3382 if (bargs->target == chunk_type) 3383 return 1; 3384 3385 return 0; 3386 } 3387 3388 static int should_balance_chunk(struct btrfs_fs_info *fs_info, 3389 struct extent_buffer *leaf, 3390 struct btrfs_chunk *chunk, u64 chunk_offset) 3391 { 3392 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3393 struct btrfs_balance_args *bargs = NULL; 3394 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3395 3396 /* type filter */ 3397 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3398 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3399 return 0; 3400 } 3401 3402 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3403 bargs = &bctl->data; 3404 
else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3405 bargs = &bctl->sys; 3406 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3407 bargs = &bctl->meta; 3408 3409 /* profiles filter */ 3410 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3411 chunk_profiles_filter(chunk_type, bargs)) { 3412 return 0; 3413 } 3414 3415 /* usage filter */ 3416 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3417 chunk_usage_filter(fs_info, chunk_offset, bargs)) { 3418 return 0; 3419 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3420 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3421 return 0; 3422 } 3423 3424 /* devid filter */ 3425 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3426 chunk_devid_filter(leaf, chunk, bargs)) { 3427 return 0; 3428 } 3429 3430 /* drange filter, makes sense only with devid filter */ 3431 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3432 chunk_drange_filter(leaf, chunk, bargs)) { 3433 return 0; 3434 } 3435 3436 /* vrange filter */ 3437 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3438 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3439 return 0; 3440 } 3441 3442 /* stripes filter */ 3443 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3444 chunk_stripes_range_filter(leaf, chunk, bargs)) { 3445 return 0; 3446 } 3447 3448 /* soft profile changing mode */ 3449 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3450 chunk_soft_convert_filter(chunk_type, bargs)) { 3451 return 0; 3452 } 3453 3454 /* 3455 * limited by count, must be the last filter 3456 */ 3457 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3458 if (bargs->limit == 0) 3459 return 0; 3460 else 3461 bargs->limit--; 3462 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 3463 /* 3464 * Same logic as the 'limit' filter; the minimum cannot be 3465 * determined here because we do not have the global information 3466 * about the count of all chunks that satisfy the filters. 
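* (limit_min is applied later, in __btrfs_balance(), once the counting pass has produced the per-type chunk counts.)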
3467 */ 3468 if (bargs->limit_max == 0) 3469 return 0; 3470 else 3471 bargs->limit_max--; 3472 } 3473 3474 return 1; 3475 } 3476 3477 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3478 { 3479 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3480 struct btrfs_root *chunk_root = fs_info->chunk_root; 3481 struct btrfs_root *dev_root = fs_info->dev_root; 3482 struct list_head *devices; 3483 struct btrfs_device *device; 3484 u64 old_size; 3485 u64 size_to_free; 3486 u64 chunk_type; 3487 struct btrfs_chunk *chunk; 3488 struct btrfs_path *path = NULL; 3489 struct btrfs_key key; 3490 struct btrfs_key found_key; 3491 struct btrfs_trans_handle *trans; 3492 struct extent_buffer *leaf; 3493 int slot; 3494 int ret; 3495 int enospc_errors = 0; 3496 bool counting = true; 3497 /* The single value limit and the min/max limits share the same bytes in the balance args, so save the single-value limits here and restore them for the second pass */ 3498 u64 limit_data = bctl->data.limit; 3499 u64 limit_meta = bctl->meta.limit; 3500 u64 limit_sys = bctl->sys.limit; 3501 u32 count_data = 0; 3502 u32 count_meta = 0; 3503 u32 count_sys = 0; 3504 int chunk_reserved = 0; 3505 u64 bytes_used = 0; 3506 3507 /* step one, make some room on all the devices */ 3508 devices = &fs_info->fs_devices->devices; 3509 list_for_each_entry(device, devices, dev_list) { 3510 old_size = btrfs_device_get_total_bytes(device); 3511 size_to_free = div_factor(old_size, 1); 3512 size_to_free = min_t(u64, size_to_free, SZ_1M); 3513 if (!device->writeable || 3514 btrfs_device_get_total_bytes(device) - 3515 btrfs_device_get_bytes_used(device) > size_to_free || 3516 device->is_tgtdev_for_dev_replace) 3517 continue; 3518 3519 ret = btrfs_shrink_device(device, old_size - size_to_free); 3520 if (ret == -ENOSPC) 3521 break; 3522 if (ret) { 3523 /* btrfs_shrink_device never returns ret > 0 */ 3524 WARN_ON(ret > 0); 3525 goto error; 3526 } 3527 3528 trans = btrfs_start_transaction(dev_root, 0); 3529 if (IS_ERR(trans)) { 3530 ret = PTR_ERR(trans); 3531 btrfs_info_in_rcu(fs_info, 3532 "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", 3533 rcu_str_deref(device->name), ret, 3534 old_size, old_size - size_to_free); 3535 goto error; 3536 } 3537 3538 ret = btrfs_grow_device(trans, device, old_size); 3539 if (ret) { 3540 btrfs_end_transaction(trans); 3541 /* btrfs_grow_device never returns ret > 0 */ 3542 WARN_ON(ret > 0); 3543 btrfs_info_in_rcu(fs_info, 3544 "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", 3545 rcu_str_deref(device->name), ret, 3546 old_size, old_size - size_to_free); 3547 goto error; 3548 } 3549 3550 btrfs_end_transaction(trans); 3551 } 3552 3553 /* step two, relocate all the chunks */ 3554 path = btrfs_alloc_path(); 3555 if (!path) { 3556 ret = -ENOMEM; 3557 goto error; 3558 } 3559 3560 /* zero out stat counters */ 3561 spin_lock(&fs_info->balance_lock); 3562 memset(&bctl->stat, 0, sizeof(bctl->stat)); 3563 spin_unlock(&fs_info->balance_lock); 3564 again: 3565 if (!counting) { 3566 /* 3567 * The single value limit and min/max limits use the same bytes 3568 * in the balance args, so restore the single-value limits that 3569 * the counting pass consumed. */ 3570 bctl->data.limit = limit_data; 3571 bctl->meta.limit = limit_meta; 3572 bctl->sys.limit = limit_sys; 3573 } 3574 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3575 key.offset = (u64)-1; 3576 key.type = BTRFS_CHUNK_ITEM_KEY; 3577 3578 while (1) { 3579 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3580 atomic_read(&fs_info->balance_cancel_req)) { 3581 ret = -ECANCELED; 3582 goto error; 3583 } 3584 3585
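/* hold delete_unused_bgs_mutex across the search and the relocation below so the cleaner thread cannot remove the block group underneath us */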
mutex_lock(&fs_info->delete_unused_bgs_mutex); 3586 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3587 if (ret < 0) { 3588 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3589 goto error; 3590 } 3591 3592 /* 3593 * this shouldn't happen, it means the last relocate 3594 * failed 3595 */ 3596 if (ret == 0) 3597 BUG(); /* FIXME break ? */ 3598 3599 ret = btrfs_previous_item(chunk_root, path, 0, 3600 BTRFS_CHUNK_ITEM_KEY); 3601 if (ret) { 3602 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3603 ret = 0; 3604 break; 3605 } 3606 3607 leaf = path->nodes[0]; 3608 slot = path->slots[0]; 3609 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3610 3611 if (found_key.objectid != key.objectid) { 3612 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3613 break; 3614 } 3615 3616 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3617 chunk_type = btrfs_chunk_type(leaf, chunk); 3618 3619 if (!counting) { 3620 spin_lock(&fs_info->balance_lock); 3621 bctl->stat.considered++; 3622 spin_unlock(&fs_info->balance_lock); 3623 } 3624 3625 ret = should_balance_chunk(fs_info, leaf, chunk, 3626 found_key.offset); 3627 3628 btrfs_release_path(path); 3629 if (!ret) { 3630 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3631 goto loop; 3632 } 3633 3634 if (counting) { 3635 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3636 spin_lock(&fs_info->balance_lock); 3637 bctl->stat.expected++; 3638 spin_unlock(&fs_info->balance_lock); 3639 3640 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3641 count_data++; 3642 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3643 count_sys++; 3644 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3645 count_meta++; 3646 3647 goto loop; 3648 } 3649 3650 /* 3651 * Apply limit_min filter, no need to check if the LIMITS 3652 * filter is used, limit_min is 0 by default 3653 */ 3654 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 3655 count_data < bctl->data.limit_min) 3656 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 3657 count_meta < bctl->meta.limit_min) 3658 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 3659 count_sys < bctl->sys.limit_min)) { 3660 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3661 goto loop; 3662 } 3663 3664 ASSERT(fs_info->data_sinfo); 3665 spin_lock(&fs_info->data_sinfo->lock); 3666 bytes_used = fs_info->data_sinfo->bytes_used; 3667 spin_unlock(&fs_info->data_sinfo->lock); 3668 3669 if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 3670 !chunk_reserved && !bytes_used) { 3671 trans = btrfs_start_transaction(chunk_root, 0); 3672 if (IS_ERR(trans)) { 3673 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3674 ret = PTR_ERR(trans); 3675 goto error; 3676 } 3677 3678 ret = btrfs_force_chunk_alloc(trans, fs_info, 3679 BTRFS_BLOCK_GROUP_DATA); 3680 btrfs_end_transaction(trans); 3681 if (ret < 0) { 3682 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3683 goto error; 3684 } 3685 chunk_reserved = 1; 3686 } 3687 3688 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3689 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3690 if (ret && ret != -ENOSPC) 3691 goto error; 3692 if (ret == -ENOSPC) { 3693 enospc_errors++; 3694 } else { 3695 spin_lock(&fs_info->balance_lock); 3696 bctl->stat.completed++; 3697 spin_unlock(&fs_info->balance_lock); 3698 } 3699 loop: 3700 if (found_key.offset == 0) 3701 break; 3702 key.offset = found_key.offset - 1; 3703 } 3704 3705 if (counting) { 3706 btrfs_release_path(path); 3707 counting = false; 3708 goto again; 3709 } 3710 error: 3711 btrfs_free_path(path); 3712 if (enospc_errors) { 3713 btrfs_info(fs_info, "%d enospc errors during 
balance", 3714 enospc_errors); 3715 if (!ret) 3716 ret = -ENOSPC; 3717 } 3718 3719 return ret; 3720 } 3721 3722 /** 3723 * alloc_profile_is_valid - see if a given profile is valid and reduced 3724 * @flags: profile to validate 3725 * @extended: if true @flags is treated as an extended profile 3726 */ 3727 static int alloc_profile_is_valid(u64 flags, int extended) 3728 { 3729 u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : 3730 BTRFS_BLOCK_GROUP_PROFILE_MASK); 3731 3732 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 3733 3734 /* 1) check that all other bits are zeroed */ 3735 if (flags & ~mask) 3736 return 0; 3737 3738 /* 2) see if profile is reduced */ 3739 if (flags == 0) 3740 return !extended; /* "0" is valid for usual profiles */ 3741 3742 /* true if exactly one bit set */ 3743 return (flags & (flags - 1)) == 0; 3744 } 3745 3746 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3747 { 3748 /* cancel requested || normal exit path */ 3749 return atomic_read(&fs_info->balance_cancel_req) || 3750 (atomic_read(&fs_info->balance_pause_req) == 0 && 3751 atomic_read(&fs_info->balance_cancel_req) == 0); 3752 } 3753 3754 static void __cancel_balance(struct btrfs_fs_info *fs_info) 3755 { 3756 int ret; 3757 3758 unset_balance_control(fs_info); 3759 ret = del_balance_item(fs_info); 3760 if (ret) 3761 btrfs_handle_fs_error(fs_info, ret, NULL); 3762 3763 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 3764 } 3765 3766 /* Non-zero return value signifies invalidity */ 3767 static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, 3768 u64 allowed) 3769 { 3770 return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && 3771 (!alloc_profile_is_valid(bctl_arg->target, 1) || 3772 (bctl_arg->target & ~allowed))); 3773 } 3774 3775 /* 3776 * Should be called with both balance and volume mutexes held 3777 */ 3778 int btrfs_balance(struct btrfs_balance_control *bctl, 3779 struct btrfs_ioctl_balance_args *bargs) 3780 { 3781 struct btrfs_fs_info *fs_info = bctl->fs_info; 3782 u64 meta_target, data_target; 3783 u64 allowed; 3784 int mixed = 0; 3785 int ret; 3786 u64 num_devices; 3787 unsigned seq; 3788 3789 if (btrfs_fs_closing(fs_info) || 3790 atomic_read(&fs_info->balance_pause_req) || 3791 atomic_read(&fs_info->balance_cancel_req)) { 3792 ret = -EINVAL; 3793 goto out; 3794 } 3795 3796 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 3797 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 3798 mixed = 1; 3799 3800 /* 3801 * In case of mixed groups both data and meta should be picked, 3802 * and identical options should be given for both of them. 
3803 */ 3804 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 3805 if (mixed && (bctl->flags & allowed)) { 3806 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 3807 !(bctl->flags & BTRFS_BALANCE_METADATA) || 3808 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 3809 btrfs_err(fs_info, 3810 "with mixed groups data and metadata balance options must be the same"); 3811 ret = -EINVAL; 3812 goto out; 3813 } 3814 } 3815 3816 num_devices = fs_info->fs_devices->num_devices; 3817 btrfs_dev_replace_lock(&fs_info->dev_replace, 0); 3818 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 3819 BUG_ON(num_devices < 1); 3820 num_devices--; 3821 } 3822 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); 3823 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; 3824 if (num_devices > 1) 3825 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3826 if (num_devices > 2) 3827 allowed |= BTRFS_BLOCK_GROUP_RAID5; 3828 if (num_devices > 3) 3829 allowed |= (BTRFS_BLOCK_GROUP_RAID10 | 3830 BTRFS_BLOCK_GROUP_RAID6); 3831 if (validate_convert_profile(&bctl->data, allowed)) { 3832 btrfs_err(fs_info, 3833 "unable to start balance with target data profile %llu", 3834 bctl->data.target); 3835 ret = -EINVAL; 3836 goto out; 3837 } 3838 if (validate_convert_profile(&bctl->meta, allowed)) { 3839 btrfs_err(fs_info, 3840 "unable to start balance with target metadata profile %llu", 3841 bctl->meta.target); 3842 ret = -EINVAL; 3843 goto out; 3844 } 3845 if (validate_convert_profile(&bctl->sys, allowed)) { 3846 btrfs_err(fs_info, 3847 "unable to start balance with target system profile %llu", 3848 bctl->sys.target); 3849 ret = -EINVAL; 3850 goto out; 3851 } 3852 3853 /* allow to reduce meta or sys integrity only if force set */ 3854 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3855 BTRFS_BLOCK_GROUP_RAID10 | 3856 BTRFS_BLOCK_GROUP_RAID5 | 3857 BTRFS_BLOCK_GROUP_RAID6; 3858 do { 3859 seq = read_seqbegin(&fs_info->profiles_lock); 3860 3861 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3862 (fs_info->avail_system_alloc_bits & allowed) && 3863 !(bctl->sys.target & allowed)) || 3864 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3865 (fs_info->avail_metadata_alloc_bits & allowed) && 3866 !(bctl->meta.target & allowed))) { 3867 if (bctl->flags & BTRFS_BALANCE_FORCE) { 3868 btrfs_info(fs_info, 3869 "force reducing metadata integrity"); 3870 } else { 3871 btrfs_err(fs_info, 3872 "balance will reduce metadata integrity, use force if you want this"); 3873 ret = -EINVAL; 3874 goto out; 3875 } 3876 } 3877 } while (read_seqretry(&fs_info->profiles_lock, seq)); 3878 3879 /* if we're not converting, the target field is uninitialized */ 3880 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 3881 bctl->meta.target : fs_info->avail_metadata_alloc_bits; 3882 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 
3883 bctl->data.target : fs_info->avail_data_alloc_bits; 3884 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 3885 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 3886 btrfs_warn(fs_info, 3887 "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", 3888 meta_target, data_target); 3889 } 3890 3891 ret = insert_balance_item(fs_info, bctl); 3892 if (ret && ret != -EEXIST) 3893 goto out; 3894 3895 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 3896 BUG_ON(ret == -EEXIST); 3897 set_balance_control(bctl); 3898 } else { 3899 BUG_ON(ret != -EEXIST); 3900 spin_lock(&fs_info->balance_lock); 3901 update_balance_args(bctl); 3902 spin_unlock(&fs_info->balance_lock); 3903 } 3904 3905 atomic_inc(&fs_info->balance_running); 3906 mutex_unlock(&fs_info->balance_mutex); 3907 3908 ret = __btrfs_balance(fs_info); 3909 3910 mutex_lock(&fs_info->balance_mutex); 3911 atomic_dec(&fs_info->balance_running); 3912 3913 if (bargs) { 3914 memset(bargs, 0, sizeof(*bargs)); 3915 update_ioctl_balance_args(fs_info, 0, bargs); 3916 } 3917 3918 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 3919 balance_need_close(fs_info)) { 3920 __cancel_balance(fs_info); 3921 } 3922 3923 wake_up(&fs_info->balance_wait_q); 3924 3925 return ret; 3926 out: 3927 if (bctl->flags & BTRFS_BALANCE_RESUME) 3928 __cancel_balance(fs_info); 3929 else { 3930 kfree(bctl); 3931 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 3932 } 3933 return ret; 3934 } 3935 3936 static int balance_kthread(void *data) 3937 { 3938 struct btrfs_fs_info *fs_info = data; 3939 int ret = 0; 3940 3941 mutex_lock(&fs_info->volume_mutex); 3942 mutex_lock(&fs_info->balance_mutex); 3943 3944 if (fs_info->balance_ctl) { 3945 btrfs_info(fs_info, "continuing balance"); 3946 ret = btrfs_balance(fs_info->balance_ctl, NULL); 3947 } 3948 3949 mutex_unlock(&fs_info->balance_mutex); 3950 mutex_unlock(&fs_info->volume_mutex); 3951 3952 return ret; 3953 } 3954 3955 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 3956 { 3957 struct task_struct *tsk; 3958 3959 spin_lock(&fs_info->balance_lock); 3960 if (!fs_info->balance_ctl) { 3961 spin_unlock(&fs_info->balance_lock); 3962 return 0; 3963 } 3964 spin_unlock(&fs_info->balance_lock); 3965 3966 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 3967 btrfs_info(fs_info, "force skipping balance"); 3968 return 0; 3969 } 3970 3971 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3972 return PTR_ERR_OR_ZERO(tsk); 3973 } 3974 3975 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 3976 { 3977 struct btrfs_balance_control *bctl; 3978 struct btrfs_balance_item *item; 3979 struct btrfs_disk_balance_args disk_bargs; 3980 struct btrfs_path *path; 3981 struct extent_buffer *leaf; 3982 struct btrfs_key key; 3983 int ret; 3984 3985 path = btrfs_alloc_path(); 3986 if (!path) 3987 return -ENOMEM; 3988 3989 key.objectid = BTRFS_BALANCE_OBJECTID; 3990 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3991 key.offset = 0; 3992 3993 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 3994 if (ret < 0) 3995 goto out; 3996 if (ret > 0) { /* ret = -ENOENT; */ 3997 ret = 0; 3998 goto out; 3999 } 4000 4001 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 4002 if (!bctl) { 4003 ret = -ENOMEM; 4004 goto out; 4005 } 4006 4007 leaf = path->nodes[0]; 4008 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 4009 4010 bctl->fs_info = fs_info; 4011 bctl->flags = btrfs_balance_flags(leaf, item); 4012 bctl->flags |= BTRFS_BALANCE_RESUME; 4013 4014 btrfs_balance_data(leaf, item, 
&disk_bargs); 4015 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 4016 btrfs_balance_meta(leaf, item, &disk_bargs); 4017 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 4018 btrfs_balance_sys(leaf, item, &disk_bargs); 4019 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 4020 4021 WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); 4022 4023 mutex_lock(&fs_info->volume_mutex); 4024 mutex_lock(&fs_info->balance_mutex); 4025 4026 set_balance_control(bctl); 4027 4028 mutex_unlock(&fs_info->balance_mutex); 4029 mutex_unlock(&fs_info->volume_mutex); 4030 out: 4031 btrfs_free_path(path); 4032 return ret; 4033 } 4034 4035 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4036 { 4037 int ret = 0; 4038 4039 mutex_lock(&fs_info->balance_mutex); 4040 if (!fs_info->balance_ctl) { 4041 mutex_unlock(&fs_info->balance_mutex); 4042 return -ENOTCONN; 4043 } 4044 4045 if (atomic_read(&fs_info->balance_running)) { 4046 atomic_inc(&fs_info->balance_pause_req); 4047 mutex_unlock(&fs_info->balance_mutex); 4048 4049 wait_event(fs_info->balance_wait_q, 4050 atomic_read(&fs_info->balance_running) == 0); 4051 4052 mutex_lock(&fs_info->balance_mutex); 4053 /* we are good with balance_ctl ripped off from under us */ 4054 BUG_ON(atomic_read(&fs_info->balance_running)); 4055 atomic_dec(&fs_info->balance_pause_req); 4056 } else { 4057 ret = -ENOTCONN; 4058 } 4059 4060 mutex_unlock(&fs_info->balance_mutex); 4061 return ret; 4062 } 4063 4064 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4065 { 4066 if (sb_rdonly(fs_info->sb)) 4067 return -EROFS; 4068 4069 mutex_lock(&fs_info->balance_mutex); 4070 if (!fs_info->balance_ctl) { 4071 mutex_unlock(&fs_info->balance_mutex); 4072 return -ENOTCONN; 4073 } 4074 4075 atomic_inc(&fs_info->balance_cancel_req); 4076 /* 4077 * if we are running just wait and return, balance item is 4078 * deleted in btrfs_balance in this case 4079 */ 4080 if (atomic_read(&fs_info->balance_running)) { 4081 mutex_unlock(&fs_info->balance_mutex); 4082 wait_event(fs_info->balance_wait_q, 4083 atomic_read(&fs_info->balance_running) == 0); 4084 mutex_lock(&fs_info->balance_mutex); 4085 } else { 4086 /* __cancel_balance needs volume_mutex */ 4087 mutex_unlock(&fs_info->balance_mutex); 4088 mutex_lock(&fs_info->volume_mutex); 4089 mutex_lock(&fs_info->balance_mutex); 4090 4091 if (fs_info->balance_ctl) 4092 __cancel_balance(fs_info); 4093 4094 mutex_unlock(&fs_info->volume_mutex); 4095 } 4096 4097 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); 4098 atomic_dec(&fs_info->balance_cancel_req); 4099 mutex_unlock(&fs_info->balance_mutex); 4100 return 0; 4101 } 4102 4103 static int btrfs_uuid_scan_kthread(void *data) 4104 { 4105 struct btrfs_fs_info *fs_info = data; 4106 struct btrfs_root *root = fs_info->tree_root; 4107 struct btrfs_key key; 4108 struct btrfs_path *path = NULL; 4109 int ret = 0; 4110 struct extent_buffer *eb; 4111 int slot; 4112 struct btrfs_root_item root_item; 4113 u32 item_size; 4114 struct btrfs_trans_handle *trans = NULL; 4115 4116 path = btrfs_alloc_path(); 4117 if (!path) { 4118 ret = -ENOMEM; 4119 goto out; 4120 } 4121 4122 key.objectid = 0; 4123 key.type = BTRFS_ROOT_ITEM_KEY; 4124 key.offset = 0; 4125 4126 while (1) { 4127 ret = btrfs_search_forward(root, &key, path, 0); 4128 if (ret) { 4129 if (ret > 0) 4130 ret = 0; 4131 break; 4132 } 4133 4134 if (key.type != BTRFS_ROOT_ITEM_KEY || 4135 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4136 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4137 key.objectid > 
BTRFS_LAST_FREE_OBJECTID) 4138 goto skip; 4139 4140 eb = path->nodes[0]; 4141 slot = path->slots[0]; 4142 item_size = btrfs_item_size_nr(eb, slot); 4143 if (item_size < sizeof(root_item)) 4144 goto skip; 4145 4146 read_extent_buffer(eb, &root_item, 4147 btrfs_item_ptr_offset(eb, slot), 4148 (int)sizeof(root_item)); 4149 if (btrfs_root_refs(&root_item) == 0) 4150 goto skip; 4151 4152 if (!btrfs_is_empty_uuid(root_item.uuid) || 4153 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4154 if (trans) 4155 goto update_tree; 4156 4157 btrfs_release_path(path); 4158 /* 4159 * 1 - subvol uuid item 4160 * 1 - received_subvol uuid item 4161 */ 4162 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4163 if (IS_ERR(trans)) { 4164 ret = PTR_ERR(trans); 4165 break; 4166 } 4167 continue; 4168 } else { 4169 goto skip; 4170 } 4171 update_tree: 4172 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4173 ret = btrfs_uuid_tree_add(trans, fs_info, 4174 root_item.uuid, 4175 BTRFS_UUID_KEY_SUBVOL, 4176 key.objectid); 4177 if (ret < 0) { 4178 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4179 ret); 4180 break; 4181 } 4182 } 4183 4184 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4185 ret = btrfs_uuid_tree_add(trans, fs_info, 4186 root_item.received_uuid, 4187 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4188 key.objectid); 4189 if (ret < 0) { 4190 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4191 ret); 4192 break; 4193 } 4194 } 4195 4196 skip: 4197 if (trans) { 4198 ret = btrfs_end_transaction(trans); 4199 trans = NULL; 4200 if (ret) 4201 break; 4202 } 4203 4204 btrfs_release_path(path); 4205 if (key.offset < (u64)-1) { 4206 key.offset++; 4207 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4208 key.offset = 0; 4209 key.type = BTRFS_ROOT_ITEM_KEY; 4210 } else if (key.objectid < (u64)-1) { 4211 key.offset = 0; 4212 key.type = BTRFS_ROOT_ITEM_KEY; 4213 key.objectid++; 4214 } else { 4215 break; 4216 } 4217 cond_resched(); 4218 } 4219 4220 out: 4221 btrfs_free_path(path); 4222 if (trans && !IS_ERR(trans)) 4223 btrfs_end_transaction(trans); 4224 if (ret) 4225 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); 4226 else 4227 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4228 up(&fs_info->uuid_tree_rescan_sem); 4229 return 0; 4230 } 4231 4232 /* 4233 * Callback for btrfs_uuid_tree_iterate(). 4234 * returns: 4235 * 0 check succeeded, the entry is not outdated. 4236 * < 0 if an error occurred. 4237 * > 0 if the check failed, which means the caller shall remove the entry. 
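 *
 * For example, a BTRFS_UUID_KEY_SUBVOL entry whose subvolume can no longer
 * be looked up (the root lookup below returns -ENOENT), or whose stored
 * uuid no longer matches the subvolume's root item, is reported as outdated.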
4238 */ 4239 static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, 4240 u8 *uuid, u8 type, u64 subid) 4241 { 4242 struct btrfs_key key; 4243 int ret = 0; 4244 struct btrfs_root *subvol_root; 4245 4246 if (type != BTRFS_UUID_KEY_SUBVOL && 4247 type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) 4248 goto out; 4249 4250 key.objectid = subid; 4251 key.type = BTRFS_ROOT_ITEM_KEY; 4252 key.offset = (u64)-1; 4253 subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); 4254 if (IS_ERR(subvol_root)) { 4255 ret = PTR_ERR(subvol_root); 4256 if (ret == -ENOENT) 4257 ret = 1; 4258 goto out; 4259 } 4260 4261 switch (type) { 4262 case BTRFS_UUID_KEY_SUBVOL: 4263 if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) 4264 ret = 1; 4265 break; 4266 case BTRFS_UUID_KEY_RECEIVED_SUBVOL: 4267 if (memcmp(uuid, subvol_root->root_item.received_uuid, 4268 BTRFS_UUID_SIZE)) 4269 ret = 1; 4270 break; 4271 } 4272 4273 out: 4274 return ret; 4275 } 4276 4277 static int btrfs_uuid_rescan_kthread(void *data) 4278 { 4279 struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; 4280 int ret; 4281 4282 /* 4283 * 1st step is to iterate through the existing UUID tree and 4284 * to delete all entries that contain outdated data. 4285 * 2nd step is to add all missing entries to the UUID tree. 4286 */ 4287 ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); 4288 if (ret < 0) { 4289 btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); 4290 up(&fs_info->uuid_tree_rescan_sem); 4291 return ret; 4292 } 4293 return btrfs_uuid_scan_kthread(data); 4294 } 4295 4296 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4297 { 4298 struct btrfs_trans_handle *trans; 4299 struct btrfs_root *tree_root = fs_info->tree_root; 4300 struct btrfs_root *uuid_root; 4301 struct task_struct *task; 4302 int ret; 4303 4304 /* 4305 * 1 - root node 4306 * 1 - root item 4307 */ 4308 trans = btrfs_start_transaction(tree_root, 2); 4309 if (IS_ERR(trans)) 4310 return PTR_ERR(trans); 4311 4312 uuid_root = btrfs_create_tree(trans, fs_info, 4313 BTRFS_UUID_TREE_OBJECTID); 4314 if (IS_ERR(uuid_root)) { 4315 ret = PTR_ERR(uuid_root); 4316 btrfs_abort_transaction(trans, ret); 4317 btrfs_end_transaction(trans); 4318 return ret; 4319 } 4320 4321 fs_info->uuid_root = uuid_root; 4322 4323 ret = btrfs_commit_transaction(trans); 4324 if (ret) 4325 return ret; 4326 4327 down(&fs_info->uuid_tree_rescan_sem); 4328 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4329 if (IS_ERR(task)) { 4330 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4331 btrfs_warn(fs_info, "failed to start uuid_scan task"); 4332 up(&fs_info->uuid_tree_rescan_sem); 4333 return PTR_ERR(task); 4334 } 4335 4336 return 0; 4337 } 4338 4339 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) 4340 { 4341 struct task_struct *task; 4342 4343 down(&fs_info->uuid_tree_rescan_sem); 4344 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); 4345 if (IS_ERR(task)) { 4346 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4347 btrfs_warn(fs_info, "failed to start uuid_rescan task"); 4348 up(&fs_info->uuid_tree_rescan_sem); 4349 return PTR_ERR(task); 4350 } 4351 4352 return 0; 4353 } 4354 4355 /* 4356 * shrinking a device means finding all of the device extents past 4357 * the new size, and then following the back refs to the chunks. 
4358 * The chunk relocation code actually frees the device extent 4359 */ 4360 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 4361 { 4362 struct btrfs_fs_info *fs_info = device->fs_info; 4363 struct btrfs_root *root = fs_info->dev_root; 4364 struct btrfs_trans_handle *trans; 4365 struct btrfs_dev_extent *dev_extent = NULL; 4366 struct btrfs_path *path; 4367 u64 length; 4368 u64 chunk_offset; 4369 int ret; 4370 int slot; 4371 int failed = 0; 4372 bool retried = false; 4373 bool checked_pending_chunks = false; 4374 struct extent_buffer *l; 4375 struct btrfs_key key; 4376 struct btrfs_super_block *super_copy = fs_info->super_copy; 4377 u64 old_total = btrfs_super_total_bytes(super_copy); 4378 u64 old_size = btrfs_device_get_total_bytes(device); 4379 u64 diff; 4380 4381 new_size = round_down(new_size, fs_info->sectorsize); 4382 diff = round_down(old_size - new_size, fs_info->sectorsize); 4383 4384 if (device->is_tgtdev_for_dev_replace) 4385 return -EINVAL; 4386 4387 path = btrfs_alloc_path(); 4388 if (!path) 4389 return -ENOMEM; 4390 4391 path->reada = READA_FORWARD; 4392 4393 mutex_lock(&fs_info->chunk_mutex); 4394 4395 btrfs_device_set_total_bytes(device, new_size); 4396 if (device->writeable) { 4397 device->fs_devices->total_rw_bytes -= diff; 4398 atomic64_sub(diff, &fs_info->free_chunk_space); 4399 } 4400 mutex_unlock(&fs_info->chunk_mutex); 4401 4402 again: 4403 key.objectid = device->devid; 4404 key.offset = (u64)-1; 4405 key.type = BTRFS_DEV_EXTENT_KEY; 4406 4407 do { 4408 mutex_lock(&fs_info->delete_unused_bgs_mutex); 4409 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4410 if (ret < 0) { 4411 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4412 goto done; 4413 } 4414 4415 ret = btrfs_previous_item(root, path, 0, key.type); 4416 if (ret) 4417 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4418 if (ret < 0) 4419 goto done; 4420 if (ret) { 4421 ret = 0; 4422 btrfs_release_path(path); 4423 break; 4424 } 4425 4426 l = path->nodes[0]; 4427 slot = path->slots[0]; 4428 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4429 4430 if (key.objectid != device->devid) { 4431 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4432 btrfs_release_path(path); 4433 break; 4434 } 4435 4436 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4437 length = btrfs_dev_extent_length(l, dev_extent); 4438 4439 if (key.offset + length <= new_size) { 4440 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4441 btrfs_release_path(path); 4442 break; 4443 } 4444 4445 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4446 btrfs_release_path(path); 4447 4448 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 4449 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4450 if (ret && ret != -ENOSPC) 4451 goto done; 4452 if (ret == -ENOSPC) 4453 failed++; 4454 } while (key.offset-- > 0); 4455 4456 if (failed && !retried) { 4457 failed = 0; 4458 retried = true; 4459 goto again; 4460 } else if (failed && retried) { 4461 ret = -ENOSPC; 4462 goto done; 4463 } 4464 4465 /* Shrinking succeeded, else we would be at "done". */ 4466 trans = btrfs_start_transaction(root, 0); 4467 if (IS_ERR(trans)) { 4468 ret = PTR_ERR(trans); 4469 goto done; 4470 } 4471 4472 mutex_lock(&fs_info->chunk_mutex); 4473 4474 /* 4475 * We checked in the above loop all device extents that were already in 4476 * the device tree. 
However before we have updated the device's 4477 * total_bytes to the new size, we might have had chunk allocations that 4478 * have not complete yet (new block groups attached to transaction 4479 * handles), and therefore their device extents were not yet in the 4480 * device tree and we missed them in the loop above. So if we have any 4481 * pending chunk using a device extent that overlaps the device range 4482 * that we can not use anymore, commit the current transaction and 4483 * repeat the search on the device tree - this way we guarantee we will 4484 * not have chunks using device extents that end beyond 'new_size'. 4485 */ 4486 if (!checked_pending_chunks) { 4487 u64 start = new_size; 4488 u64 len = old_size - new_size; 4489 4490 if (contains_pending_extent(trans->transaction, device, 4491 &start, len)) { 4492 mutex_unlock(&fs_info->chunk_mutex); 4493 checked_pending_chunks = true; 4494 failed = 0; 4495 retried = false; 4496 ret = btrfs_commit_transaction(trans); 4497 if (ret) 4498 goto done; 4499 goto again; 4500 } 4501 } 4502 4503 btrfs_device_set_disk_total_bytes(device, new_size); 4504 if (list_empty(&device->resized_list)) 4505 list_add_tail(&device->resized_list, 4506 &fs_info->fs_devices->resized_devices); 4507 4508 WARN_ON(diff > old_total); 4509 btrfs_set_super_total_bytes(super_copy, 4510 round_down(old_total - diff, fs_info->sectorsize)); 4511 mutex_unlock(&fs_info->chunk_mutex); 4512 4513 /* Now btrfs_update_device() will change the on-disk size. */ 4514 ret = btrfs_update_device(trans, device); 4515 btrfs_end_transaction(trans); 4516 done: 4517 btrfs_free_path(path); 4518 if (ret) { 4519 mutex_lock(&fs_info->chunk_mutex); 4520 btrfs_device_set_total_bytes(device, old_size); 4521 if (device->writeable) 4522 device->fs_devices->total_rw_bytes += diff; 4523 atomic64_add(diff, &fs_info->free_chunk_space); 4524 mutex_unlock(&fs_info->chunk_mutex); 4525 } 4526 return ret; 4527 } 4528 4529 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 4530 struct btrfs_key *key, 4531 struct btrfs_chunk *chunk, int item_size) 4532 { 4533 struct btrfs_super_block *super_copy = fs_info->super_copy; 4534 struct btrfs_disk_key disk_key; 4535 u32 array_size; 4536 u8 *ptr; 4537 4538 mutex_lock(&fs_info->chunk_mutex); 4539 array_size = btrfs_super_sys_array_size(super_copy); 4540 if (array_size + item_size + sizeof(disk_key) 4541 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { 4542 mutex_unlock(&fs_info->chunk_mutex); 4543 return -EFBIG; 4544 } 4545 4546 ptr = super_copy->sys_chunk_array + array_size; 4547 btrfs_cpu_key_to_disk(&disk_key, key); 4548 memcpy(ptr, &disk_key, sizeof(disk_key)); 4549 ptr += sizeof(disk_key); 4550 memcpy(ptr, chunk, item_size); 4551 item_size += sizeof(disk_key); 4552 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 4553 mutex_unlock(&fs_info->chunk_mutex); 4554 4555 return 0; 4556 } 4557 4558 /* 4559 * sort the devices in descending order by max_avail, total_avail 4560 */ 4561 static int btrfs_cmp_device_info(const void *a, const void *b) 4562 { 4563 const struct btrfs_device_info *di_a = a; 4564 const struct btrfs_device_info *di_b = b; 4565 4566 if (di_a->max_avail > di_b->max_avail) 4567 return -1; 4568 if (di_a->max_avail < di_b->max_avail) 4569 return 1; 4570 if (di_a->total_avail > di_b->total_avail) 4571 return -1; 4572 if (di_a->total_avail < di_b->total_avail) 4573 return 1; 4574 return 0; 4575 } 4576 4577 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 4578 { 4579 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 
4580 return; 4581 4582 btrfs_set_fs_incompat(info, RAID56); 4583 } 4584 4585 #define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r->fs_info) \ 4586 - sizeof(struct btrfs_chunk)) \ 4587 / sizeof(struct btrfs_stripe) + 1) 4588 4589 #define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ 4590 - 2 * sizeof(struct btrfs_disk_key) \ 4591 - 2 * sizeof(struct btrfs_chunk)) \ 4592 / sizeof(struct btrfs_stripe) + 1) 4593 4594 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 4595 u64 start, u64 type) 4596 { 4597 struct btrfs_fs_info *info = trans->fs_info; 4598 struct btrfs_fs_devices *fs_devices = info->fs_devices; 4599 struct btrfs_device *device; 4600 struct map_lookup *map = NULL; 4601 struct extent_map_tree *em_tree; 4602 struct extent_map *em; 4603 struct btrfs_device_info *devices_info = NULL; 4604 u64 total_avail; 4605 int num_stripes; /* total number of stripes to allocate */ 4606 int data_stripes; /* number of stripes that count for 4607 block group size */ 4608 int sub_stripes; /* sub_stripes info for map */ 4609 int dev_stripes; /* stripes per dev */ 4610 int devs_max; /* max devs to use */ 4611 int devs_min; /* min devs needed */ 4612 int devs_increment; /* ndevs has to be a multiple of this */ 4613 int ncopies; /* how many copies to data has */ 4614 int ret; 4615 u64 max_stripe_size; 4616 u64 max_chunk_size; 4617 u64 stripe_size; 4618 u64 num_bytes; 4619 int ndevs; 4620 int i; 4621 int j; 4622 int index; 4623 4624 BUG_ON(!alloc_profile_is_valid(type, 0)); 4625 4626 if (list_empty(&fs_devices->alloc_list)) 4627 return -ENOSPC; 4628 4629 index = __get_raid_index(type); 4630 4631 sub_stripes = btrfs_raid_array[index].sub_stripes; 4632 dev_stripes = btrfs_raid_array[index].dev_stripes; 4633 devs_max = btrfs_raid_array[index].devs_max; 4634 devs_min = btrfs_raid_array[index].devs_min; 4635 devs_increment = btrfs_raid_array[index].devs_increment; 4636 ncopies = btrfs_raid_array[index].ncopies; 4637 4638 if (type & BTRFS_BLOCK_GROUP_DATA) { 4639 max_stripe_size = SZ_1G; 4640 max_chunk_size = 10 * max_stripe_size; 4641 if (!devs_max) 4642 devs_max = BTRFS_MAX_DEVS(info->chunk_root); 4643 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 4644 /* for larger filesystems, use larger metadata chunks */ 4645 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) 4646 max_stripe_size = SZ_1G; 4647 else 4648 max_stripe_size = SZ_256M; 4649 max_chunk_size = max_stripe_size; 4650 if (!devs_max) 4651 devs_max = BTRFS_MAX_DEVS(info->chunk_root); 4652 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 4653 max_stripe_size = SZ_32M; 4654 max_chunk_size = 2 * max_stripe_size; 4655 if (!devs_max) 4656 devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; 4657 } else { 4658 btrfs_err(info, "invalid chunk type 0x%llx requested", 4659 type); 4660 BUG_ON(1); 4661 } 4662 4663 /* we don't want a chunk larger than 10% of writeable space */ 4664 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 4665 max_chunk_size); 4666 4667 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 4668 GFP_NOFS); 4669 if (!devices_info) 4670 return -ENOMEM; 4671 4672 /* 4673 * in the first pass through the devices list, we gather information 4674 * about the available holes on each device. 
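 *
 * A "hole" here is the largest free device extent capable of holding up to
 * max_stripe_size * dev_stripes bytes. Devices that are read-only, not in
 * the fs metadata, dev-replace targets, or cannot fit at least
 * BTRFS_STRIPE_LEN * dev_stripes bytes are skipped.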
4675 */ 4676 ndevs = 0; 4677 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 4678 u64 max_avail; 4679 u64 dev_offset; 4680 4681 if (!device->writeable) { 4682 WARN(1, KERN_ERR 4683 "BTRFS: read-only device in alloc_list\n"); 4684 continue; 4685 } 4686 4687 if (!device->in_fs_metadata || 4688 device->is_tgtdev_for_dev_replace) 4689 continue; 4690 4691 if (device->total_bytes > device->bytes_used) 4692 total_avail = device->total_bytes - device->bytes_used; 4693 else 4694 total_avail = 0; 4695 4696 /* If there is no space on this device, skip it. */ 4697 if (total_avail == 0) 4698 continue; 4699 4700 ret = find_free_dev_extent(trans, device, 4701 max_stripe_size * dev_stripes, 4702 &dev_offset, &max_avail); 4703 if (ret && ret != -ENOSPC) 4704 goto error; 4705 4706 if (ret == 0) 4707 max_avail = max_stripe_size * dev_stripes; 4708 4709 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) 4710 continue; 4711 4712 if (ndevs == fs_devices->rw_devices) { 4713 WARN(1, "%s: found more than %llu devices\n", 4714 __func__, fs_devices->rw_devices); 4715 break; 4716 } 4717 devices_info[ndevs].dev_offset = dev_offset; 4718 devices_info[ndevs].max_avail = max_avail; 4719 devices_info[ndevs].total_avail = total_avail; 4720 devices_info[ndevs].dev = device; 4721 ++ndevs; 4722 } 4723 4724 /* 4725 * now sort the devices by hole size / available space 4726 */ 4727 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 4728 btrfs_cmp_device_info, NULL); 4729 4730 /* round down to number of usable stripes */ 4731 ndevs = round_down(ndevs, devs_increment); 4732 4733 if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { 4734 ret = -ENOSPC; 4735 goto error; 4736 } 4737 4738 ndevs = min(ndevs, devs_max); 4739 4740 /* 4741 * the primary goal is to maximize the number of stripes, so use as many 4742 * devices as possible, even if the stripes are not maximum sized. 
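 *
 * Since devices_info is sorted by max_avail in descending order, the
 * smallest hole among the chosen devices is devices_info[ndevs-1], and
 * that is what bounds stripe_size below.
 *
 * For example, with 4 usable devices and RAID10 (dev_stripes == 1,
 * ncopies == 2) this gives num_stripes == 4 and data_stripes == 2, so the
 * chunk exposes 2 * stripe_size bytes of logical address space.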
4743 */ 4744 stripe_size = devices_info[ndevs-1].max_avail; 4745 num_stripes = ndevs * dev_stripes; 4746 4747 /* 4748 * this will have to be fixed for RAID1 and RAID10 over 4749 * more drives 4750 */ 4751 data_stripes = num_stripes / ncopies; 4752 4753 if (type & BTRFS_BLOCK_GROUP_RAID5) 4754 data_stripes = num_stripes - 1; 4755 4756 if (type & BTRFS_BLOCK_GROUP_RAID6) 4757 data_stripes = num_stripes - 2; 4758 4759 /* 4760 * Use the number of data stripes to figure out how big this chunk 4761 * is really going to be in terms of logical address space, 4762 * and compare that answer with the max chunk size 4763 */ 4764 if (stripe_size * data_stripes > max_chunk_size) { 4765 u64 mask = (1ULL << 24) - 1; 4766 4767 stripe_size = div_u64(max_chunk_size, data_stripes); 4768 4769 /* bump the answer up to a 16MB boundary */ 4770 stripe_size = (stripe_size + mask) & ~mask; 4771 4772 /* but don't go higher than the limits we found 4773 * while searching for free extents 4774 */ 4775 if (stripe_size > devices_info[ndevs-1].max_avail) 4776 stripe_size = devices_info[ndevs-1].max_avail; 4777 } 4778 4779 stripe_size = div_u64(stripe_size, dev_stripes); 4780 4781 /* align to BTRFS_STRIPE_LEN */ 4782 stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); 4783 4784 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 4785 if (!map) { 4786 ret = -ENOMEM; 4787 goto error; 4788 } 4789 map->num_stripes = num_stripes; 4790 4791 for (i = 0; i < ndevs; ++i) { 4792 for (j = 0; j < dev_stripes; ++j) { 4793 int s = i * dev_stripes + j; 4794 map->stripes[s].dev = devices_info[i].dev; 4795 map->stripes[s].physical = devices_info[i].dev_offset + 4796 j * stripe_size; 4797 } 4798 } 4799 map->stripe_len = BTRFS_STRIPE_LEN; 4800 map->io_align = BTRFS_STRIPE_LEN; 4801 map->io_width = BTRFS_STRIPE_LEN; 4802 map->type = type; 4803 map->sub_stripes = sub_stripes; 4804 4805 num_bytes = stripe_size * data_stripes; 4806 4807 trace_btrfs_chunk_alloc(info, map, start, num_bytes); 4808 4809 em = alloc_extent_map(); 4810 if (!em) { 4811 kfree(map); 4812 ret = -ENOMEM; 4813 goto error; 4814 } 4815 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 4816 em->map_lookup = map; 4817 em->start = start; 4818 em->len = num_bytes; 4819 em->block_start = 0; 4820 em->block_len = em->len; 4821 em->orig_block_len = stripe_size; 4822 4823 em_tree = &info->mapping_tree.map_tree; 4824 write_lock(&em_tree->lock); 4825 ret = add_extent_mapping(em_tree, em, 0); 4826 if (ret) { 4827 write_unlock(&em_tree->lock); 4828 free_extent_map(em); 4829 goto error; 4830 } 4831 4832 list_add_tail(&em->list, &trans->transaction->pending_chunks); 4833 refcount_inc(&em->refs); 4834 write_unlock(&em_tree->lock); 4835 4836 ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes); 4837 if (ret) 4838 goto error_del_extent; 4839 4840 for (i = 0; i < map->num_stripes; i++) { 4841 num_bytes = map->stripes[i].dev->bytes_used + stripe_size; 4842 btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); 4843 } 4844 4845 atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); 4846 4847 free_extent_map(em); 4848 check_raid56_incompat_flag(info, type); 4849 4850 kfree(devices_info); 4851 return 0; 4852 4853 error_del_extent: 4854 write_lock(&em_tree->lock); 4855 remove_extent_mapping(em_tree, em); 4856 write_unlock(&em_tree->lock); 4857 4858 /* One for our allocation */ 4859 free_extent_map(em); 4860 /* One for the tree reference */ 4861 free_extent_map(em); 4862 /* One for the pending_chunks list reference */ 4863 free_extent_map(em); 4864 error: 4865 
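	/*
	 * Everything allocated after devices_info (the map and the extent
	 * map) has already been released on the paths that jump here, so
	 * only devices_info itself is left to free.
	 */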
kfree(devices_info); 4866 return ret; 4867 } 4868 4869 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, 4870 struct btrfs_fs_info *fs_info, 4871 u64 chunk_offset, u64 chunk_size) 4872 { 4873 struct btrfs_root *extent_root = fs_info->extent_root; 4874 struct btrfs_root *chunk_root = fs_info->chunk_root; 4875 struct btrfs_key key; 4876 struct btrfs_device *device; 4877 struct btrfs_chunk *chunk; 4878 struct btrfs_stripe *stripe; 4879 struct extent_map *em; 4880 struct map_lookup *map; 4881 size_t item_size; 4882 u64 dev_offset; 4883 u64 stripe_size; 4884 int i = 0; 4885 int ret = 0; 4886 4887 em = get_chunk_map(fs_info, chunk_offset, chunk_size); 4888 if (IS_ERR(em)) 4889 return PTR_ERR(em); 4890 4891 map = em->map_lookup; 4892 item_size = btrfs_chunk_item_size(map->num_stripes); 4893 stripe_size = em->orig_block_len; 4894 4895 chunk = kzalloc(item_size, GFP_NOFS); 4896 if (!chunk) { 4897 ret = -ENOMEM; 4898 goto out; 4899 } 4900 4901 /* 4902 * Take the device list mutex to prevent races with the final phase of 4903 * a device replace operation that replaces the device object associated 4904 * with the map's stripes, because the device object's id can change 4905 * at any time during that final phase of the device replace operation 4906 * (dev-replace.c:btrfs_dev_replace_finishing()). 4907 */ 4908 mutex_lock(&fs_info->fs_devices->device_list_mutex); 4909 for (i = 0; i < map->num_stripes; i++) { 4910 device = map->stripes[i].dev; 4911 dev_offset = map->stripes[i].physical; 4912 4913 ret = btrfs_update_device(trans, device); 4914 if (ret) 4915 break; 4916 ret = btrfs_alloc_dev_extent(trans, device, chunk_offset, 4917 dev_offset, stripe_size); 4918 if (ret) 4919 break; 4920 } 4921 if (ret) { 4922 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4923 goto out; 4924 } 4925 4926 stripe = &chunk->stripe; 4927 for (i = 0; i < map->num_stripes; i++) { 4928 device = map->stripes[i].dev; 4929 dev_offset = map->stripes[i].physical; 4930 4931 btrfs_set_stack_stripe_devid(stripe, device->devid); 4932 btrfs_set_stack_stripe_offset(stripe, dev_offset); 4933 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 4934 stripe++; 4935 } 4936 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4937 4938 btrfs_set_stack_chunk_length(chunk, chunk_size); 4939 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 4940 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 4941 btrfs_set_stack_chunk_type(chunk, map->type); 4942 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 4943 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 4944 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 4945 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 4946 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 4947 4948 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 4949 key.type = BTRFS_CHUNK_ITEM_KEY; 4950 key.offset = chunk_offset; 4951 4952 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 4953 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 4954 /* 4955 * TODO: Cleanup of inserted chunk root in case of 4956 * failure. 4957 */ 4958 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 4959 } 4960 4961 out: 4962 kfree(chunk); 4963 free_extent_map(em); 4964 return ret; 4965 } 4966 4967 /* 4968 * Chunk allocation falls into two parts. The first part does works 4969 * that make the new allocated chunk useable, but not do any operation 4970 * that modifies the chunk tree. 
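 * (__btrfs_alloc_chunk() above implements this first part.)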
The second part does the works that 4971 * require modifying the chunk tree. This division is important for the 4972 * bootstrap process of adding storage to a seed btrfs. 4973 */ 4974 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 4975 struct btrfs_fs_info *fs_info, u64 type) 4976 { 4977 u64 chunk_offset; 4978 4979 ASSERT(mutex_is_locked(&fs_info->chunk_mutex)); 4980 chunk_offset = find_next_chunk(fs_info); 4981 return __btrfs_alloc_chunk(trans, chunk_offset, type); 4982 } 4983 4984 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 4985 struct btrfs_fs_info *fs_info) 4986 { 4987 u64 chunk_offset; 4988 u64 sys_chunk_offset; 4989 u64 alloc_profile; 4990 int ret; 4991 4992 chunk_offset = find_next_chunk(fs_info); 4993 alloc_profile = btrfs_metadata_alloc_profile(fs_info); 4994 ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); 4995 if (ret) 4996 return ret; 4997 4998 sys_chunk_offset = find_next_chunk(fs_info); 4999 alloc_profile = btrfs_system_alloc_profile(fs_info); 5000 ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); 5001 return ret; 5002 } 5003 5004 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5005 { 5006 int max_errors; 5007 5008 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5009 BTRFS_BLOCK_GROUP_RAID10 | 5010 BTRFS_BLOCK_GROUP_RAID5 | 5011 BTRFS_BLOCK_GROUP_DUP)) { 5012 max_errors = 1; 5013 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { 5014 max_errors = 2; 5015 } else { 5016 max_errors = 0; 5017 } 5018 5019 return max_errors; 5020 } 5021 5022 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5023 { 5024 struct extent_map *em; 5025 struct map_lookup *map; 5026 int readonly = 0; 5027 int miss_ndevs = 0; 5028 int i; 5029 5030 em = get_chunk_map(fs_info, chunk_offset, 1); 5031 if (IS_ERR(em)) 5032 return 1; 5033 5034 map = em->map_lookup; 5035 for (i = 0; i < map->num_stripes; i++) { 5036 if (map->stripes[i].dev->missing) { 5037 miss_ndevs++; 5038 continue; 5039 } 5040 5041 if (!map->stripes[i].dev->writeable) { 5042 readonly = 1; 5043 goto end; 5044 } 5045 } 5046 5047 /* 5048 * If the number of missing devices is larger than max errors, 5049 * we can not write the data into that chunk successfully, so 5050 * set it readonly. 5051 */ 5052 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5053 readonly = 1; 5054 end: 5055 free_extent_map(em); 5056 return readonly; 5057 } 5058 5059 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 5060 { 5061 extent_map_tree_init(&tree->map_tree); 5062 } 5063 5064 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 5065 { 5066 struct extent_map *em; 5067 5068 while (1) { 5069 write_lock(&tree->map_tree.lock); 5070 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 5071 if (em) 5072 remove_extent_mapping(&tree->map_tree, em); 5073 write_unlock(&tree->map_tree.lock); 5074 if (!em) 5075 break; 5076 /* once for us */ 5077 free_extent_map(em); 5078 /* once for the tree */ 5079 free_extent_map(em); 5080 } 5081 } 5082 5083 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5084 { 5085 struct extent_map *em; 5086 struct map_lookup *map; 5087 int ret; 5088 5089 em = get_chunk_map(fs_info, logical, len); 5090 if (IS_ERR(em)) 5091 /* 5092 * We could return errors for these cases, but that could get 5093 * ugly and we'd probably do the same thing which is just not do 5094 * anything else and exit, so return 1 so the callers don't try 5095 * to use other copies. 
5096 */ 5097 return 1; 5098 5099 map = em->map_lookup; 5100 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 5101 ret = map->num_stripes; 5102 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5103 ret = map->sub_stripes; 5104 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5105 ret = 2; 5106 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5107 ret = 3; 5108 else 5109 ret = 1; 5110 free_extent_map(em); 5111 5112 btrfs_dev_replace_lock(&fs_info->dev_replace, 0); 5113 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 5114 fs_info->dev_replace.tgtdev) 5115 ret++; 5116 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); 5117 5118 return ret; 5119 } 5120 5121 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 5122 u64 logical) 5123 { 5124 struct extent_map *em; 5125 struct map_lookup *map; 5126 unsigned long len = fs_info->sectorsize; 5127 5128 em = get_chunk_map(fs_info, logical, len); 5129 5130 if (!WARN_ON(IS_ERR(em))) { 5131 map = em->map_lookup; 5132 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5133 len = map->stripe_len * nr_data_stripes(map); 5134 free_extent_map(em); 5135 } 5136 return len; 5137 } 5138 5139 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5140 { 5141 struct extent_map *em; 5142 struct map_lookup *map; 5143 int ret = 0; 5144 5145 em = get_chunk_map(fs_info, logical, len); 5146 5147 if(!WARN_ON(IS_ERR(em))) { 5148 map = em->map_lookup; 5149 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5150 ret = 1; 5151 free_extent_map(em); 5152 } 5153 return ret; 5154 } 5155 5156 static int find_live_mirror(struct btrfs_fs_info *fs_info, 5157 struct map_lookup *map, int first, int num, 5158 int optimal, int dev_replace_is_ongoing) 5159 { 5160 int i; 5161 int tolerance; 5162 struct btrfs_device *srcdev; 5163 5164 if (dev_replace_is_ongoing && 5165 fs_info->dev_replace.cont_reading_from_srcdev_mode == 5166 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 5167 srcdev = fs_info->dev_replace.srcdev; 5168 else 5169 srcdev = NULL; 5170 5171 /* 5172 * try to avoid the drive that is the source drive for a 5173 * dev-replace procedure, only choose it if no other non-missing 5174 * mirror is available 5175 */ 5176 for (tolerance = 0; tolerance < 2; tolerance++) { 5177 if (map->stripes[optimal].dev->bdev && 5178 (tolerance || map->stripes[optimal].dev != srcdev)) 5179 return optimal; 5180 for (i = first; i < first + num; i++) { 5181 if (map->stripes[i].dev->bdev && 5182 (tolerance || map->stripes[i].dev != srcdev)) 5183 return i; 5184 } 5185 } 5186 5187 /* we couldn't find one that doesn't fail. 
Just return something 5188 * and the io error handling code will clean up eventually 5189 */ 5190 return optimal; 5191 } 5192 5193 static inline int parity_smaller(u64 a, u64 b) 5194 { 5195 return a > b; 5196 } 5197 5198 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 5199 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) 5200 { 5201 struct btrfs_bio_stripe s; 5202 int i; 5203 u64 l; 5204 int again = 1; 5205 5206 while (again) { 5207 again = 0; 5208 for (i = 0; i < num_stripes - 1; i++) { 5209 if (parity_smaller(bbio->raid_map[i], 5210 bbio->raid_map[i+1])) { 5211 s = bbio->stripes[i]; 5212 l = bbio->raid_map[i]; 5213 bbio->stripes[i] = bbio->stripes[i+1]; 5214 bbio->raid_map[i] = bbio->raid_map[i+1]; 5215 bbio->stripes[i+1] = s; 5216 bbio->raid_map[i+1] = l; 5217 5218 again = 1; 5219 } 5220 } 5221 } 5222 } 5223 5224 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) 5225 { 5226 struct btrfs_bio *bbio = kzalloc( 5227 /* the size of the btrfs_bio */ 5228 sizeof(struct btrfs_bio) + 5229 /* plus the variable array for the stripes */ 5230 sizeof(struct btrfs_bio_stripe) * (total_stripes) + 5231 /* plus the variable array for the tgt dev */ 5232 sizeof(int) * (real_stripes) + 5233 /* 5234 * plus the raid_map, which includes both the tgt dev 5235 * and the stripes 5236 */ 5237 sizeof(u64) * (total_stripes), 5238 GFP_NOFS|__GFP_NOFAIL); 5239 5240 atomic_set(&bbio->error, 0); 5241 refcount_set(&bbio->refs, 1); 5242 5243 return bbio; 5244 } 5245 5246 void btrfs_get_bbio(struct btrfs_bio *bbio) 5247 { 5248 WARN_ON(!refcount_read(&bbio->refs)); 5249 refcount_inc(&bbio->refs); 5250 } 5251 5252 void btrfs_put_bbio(struct btrfs_bio *bbio) 5253 { 5254 if (!bbio) 5255 return; 5256 if (refcount_dec_and_test(&bbio->refs)) 5257 kfree(bbio); 5258 } 5259 5260 /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ 5261 /* 5262 * Please note that, discard won't be sent to target device of device 5263 * replace. 
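 *
 * RAID5/6 chunks are rejected with -EOPNOTSUPP. For the striped profiles
 * the returned btrfs_bio carries a per-stripe length, so each device is
 * only asked to discard the byte ranges it actually holds within
 * [logical, logical + length).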
5264 */ 5265 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, 5266 u64 logical, u64 length, 5267 struct btrfs_bio **bbio_ret) 5268 { 5269 struct extent_map *em; 5270 struct map_lookup *map; 5271 struct btrfs_bio *bbio; 5272 u64 offset; 5273 u64 stripe_nr; 5274 u64 stripe_nr_end; 5275 u64 stripe_end_offset; 5276 u64 stripe_cnt; 5277 u64 stripe_len; 5278 u64 stripe_offset; 5279 u64 num_stripes; 5280 u32 stripe_index; 5281 u32 factor = 0; 5282 u32 sub_stripes = 0; 5283 u64 stripes_per_dev = 0; 5284 u32 remaining_stripes = 0; 5285 u32 last_stripe = 0; 5286 int ret = 0; 5287 int i; 5288 5289 /* discard always return a bbio */ 5290 ASSERT(bbio_ret); 5291 5292 em = get_chunk_map(fs_info, logical, length); 5293 if (IS_ERR(em)) 5294 return PTR_ERR(em); 5295 5296 map = em->map_lookup; 5297 /* we don't discard raid56 yet */ 5298 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5299 ret = -EOPNOTSUPP; 5300 goto out; 5301 } 5302 5303 offset = logical - em->start; 5304 length = min_t(u64, em->len - offset, length); 5305 5306 stripe_len = map->stripe_len; 5307 /* 5308 * stripe_nr counts the total number of stripes we have to stride 5309 * to get to this block 5310 */ 5311 stripe_nr = div64_u64(offset, stripe_len); 5312 5313 /* stripe_offset is the offset of this block in its stripe */ 5314 stripe_offset = offset - stripe_nr * stripe_len; 5315 5316 stripe_nr_end = round_up(offset + length, map->stripe_len); 5317 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 5318 stripe_cnt = stripe_nr_end - stripe_nr; 5319 stripe_end_offset = stripe_nr_end * map->stripe_len - 5320 (offset + length); 5321 /* 5322 * after this, stripe_nr is the number of stripes on this 5323 * device we have to walk to find the data, and stripe_index is 5324 * the number of our device in the stripe array 5325 */ 5326 num_stripes = 1; 5327 stripe_index = 0; 5328 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5329 BTRFS_BLOCK_GROUP_RAID10)) { 5330 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5331 sub_stripes = 1; 5332 else 5333 sub_stripes = map->sub_stripes; 5334 5335 factor = map->num_stripes / sub_stripes; 5336 num_stripes = min_t(u64, map->num_stripes, 5337 sub_stripes * stripe_cnt); 5338 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5339 stripe_index *= sub_stripes; 5340 stripes_per_dev = div_u64_rem(stripe_cnt, factor, 5341 &remaining_stripes); 5342 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 5343 last_stripe *= sub_stripes; 5344 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5345 BTRFS_BLOCK_GROUP_DUP)) { 5346 num_stripes = map->num_stripes; 5347 } else { 5348 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5349 &stripe_index); 5350 } 5351 5352 bbio = alloc_btrfs_bio(num_stripes, 0); 5353 if (!bbio) { 5354 ret = -ENOMEM; 5355 goto out; 5356 } 5357 5358 for (i = 0; i < num_stripes; i++) { 5359 bbio->stripes[i].physical = 5360 map->stripes[stripe_index].physical + 5361 stripe_offset + stripe_nr * map->stripe_len; 5362 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 5363 5364 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5365 BTRFS_BLOCK_GROUP_RAID10)) { 5366 bbio->stripes[i].length = stripes_per_dev * 5367 map->stripe_len; 5368 5369 if (i / sub_stripes < remaining_stripes) 5370 bbio->stripes[i].length += 5371 map->stripe_len; 5372 5373 /* 5374 * Special for the first stripe and 5375 * the last stripe: 5376 * 5377 * |-------|...|-------| 5378 * |----------| 5379 * off end_off 5380 */ 5381 if (i < sub_stripes) 5382 bbio->stripes[i].length -= 5383 stripe_offset; 5384 5385 if 
(stripe_index >= last_stripe && 5386 stripe_index <= (last_stripe + 5387 sub_stripes - 1)) 5388 bbio->stripes[i].length -= 5389 stripe_end_offset; 5390 5391 if (i == sub_stripes - 1) 5392 stripe_offset = 0; 5393 } else { 5394 bbio->stripes[i].length = length; 5395 } 5396 5397 stripe_index++; 5398 if (stripe_index == map->num_stripes) { 5399 stripe_index = 0; 5400 stripe_nr++; 5401 } 5402 } 5403 5404 *bbio_ret = bbio; 5405 bbio->map_type = map->type; 5406 bbio->num_stripes = num_stripes; 5407 out: 5408 free_extent_map(em); 5409 return ret; 5410 } 5411 5412 /* 5413 * In dev-replace case, for repair case (that's the only case where the mirror 5414 * is selected explicitly when calling btrfs_map_block), blocks left of the 5415 * left cursor can also be read from the target drive. 5416 * 5417 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the 5418 * array of stripes. 5419 * For READ, it also needs to be supported using the same mirror number. 5420 * 5421 * If the requested block is not left of the left cursor, EIO is returned. This 5422 * can happen because btrfs_num_copies() returns one more in the dev-replace 5423 * case. 5424 */ 5425 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, 5426 u64 logical, u64 length, 5427 u64 srcdev_devid, int *mirror_num, 5428 u64 *physical) 5429 { 5430 struct btrfs_bio *bbio = NULL; 5431 int num_stripes; 5432 int index_srcdev = 0; 5433 int found = 0; 5434 u64 physical_of_found = 0; 5435 int i; 5436 int ret = 0; 5437 5438 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 5439 logical, &length, &bbio, 0, 0); 5440 if (ret) { 5441 ASSERT(bbio == NULL); 5442 return ret; 5443 } 5444 5445 num_stripes = bbio->num_stripes; 5446 if (*mirror_num > num_stripes) { 5447 /* 5448 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, 5449 * that means that the requested area is not left of the left 5450 * cursor 5451 */ 5452 btrfs_put_bbio(bbio); 5453 return -EIO; 5454 } 5455 5456 /* 5457 * process the rest of the function using the mirror_num of the source 5458 * drive. Therefore look it up first. At the end, patch the device 5459 * pointer to the one of the target drive. 5460 */ 5461 for (i = 0; i < num_stripes; i++) { 5462 if (bbio->stripes[i].dev->devid != srcdev_devid) 5463 continue; 5464 5465 /* 5466 * In case of DUP, in order to keep it simple, only add the 5467 * mirror with the lowest physical address 5468 */ 5469 if (found && 5470 physical_of_found <= bbio->stripes[i].physical) 5471 continue; 5472 5473 index_srcdev = i; 5474 found = 1; 5475 physical_of_found = bbio->stripes[i].physical; 5476 } 5477 5478 btrfs_put_bbio(bbio); 5479 5480 ASSERT(found); 5481 if (!found) 5482 return -EIO; 5483 5484 *mirror_num = index_srcdev + 1; 5485 *physical = physical_of_found; 5486 return ret; 5487 } 5488 5489 static void handle_ops_on_dev_replace(enum btrfs_map_op op, 5490 struct btrfs_bio **bbio_ret, 5491 struct btrfs_dev_replace *dev_replace, 5492 int *num_stripes_ret, int *max_errors_ret) 5493 { 5494 struct btrfs_bio *bbio = *bbio_ret; 5495 u64 srcdev_devid = dev_replace->srcdev->devid; 5496 int tgtdev_indexes = 0; 5497 int num_stripes = *num_stripes_ret; 5498 int max_errors = *max_errors_ret; 5499 int i; 5500 5501 if (op == BTRFS_MAP_WRITE) { 5502 int index_where_to_add; 5503 5504 /* 5505 * duplicate the write operations while the dev replace 5506 * procedure is running. 
Since the copying of the old disk to 5507 * the new disk takes place at run time while the filesystem is 5508 * mounted writable, the regular write operations to the old 5509 * disk have to be duplicated to go to the new disk as well. 5510 * 5511 * Note that device->missing is handled by the caller, and that 5512 * the write to the old disk is already set up in the stripes 5513 * array. 5514 */ 5515 index_where_to_add = num_stripes; 5516 for (i = 0; i < num_stripes; i++) { 5517 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5518 /* write to new disk, too */ 5519 struct btrfs_bio_stripe *new = 5520 bbio->stripes + index_where_to_add; 5521 struct btrfs_bio_stripe *old = 5522 bbio->stripes + i; 5523 5524 new->physical = old->physical; 5525 new->length = old->length; 5526 new->dev = dev_replace->tgtdev; 5527 bbio->tgtdev_map[i] = index_where_to_add; 5528 index_where_to_add++; 5529 max_errors++; 5530 tgtdev_indexes++; 5531 } 5532 } 5533 num_stripes = index_where_to_add; 5534 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 5535 int index_srcdev = 0; 5536 int found = 0; 5537 u64 physical_of_found = 0; 5538 5539 /* 5540 * During the dev-replace procedure, the target drive can also 5541 * be used to read data in case it is needed to repair a corrupt 5542 * block elsewhere. This is possible if the requested area is 5543 * left of the left cursor. In this area, the target drive is a 5544 * full copy of the source drive. 5545 */ 5546 for (i = 0; i < num_stripes; i++) { 5547 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5548 /* 5549 * In case of DUP, in order to keep it simple, 5550 * only add the mirror with the lowest physical 5551 * address 5552 */ 5553 if (found && 5554 physical_of_found <= 5555 bbio->stripes[i].physical) 5556 continue; 5557 index_srcdev = i; 5558 found = 1; 5559 physical_of_found = bbio->stripes[i].physical; 5560 } 5561 } 5562 if (found) { 5563 struct btrfs_bio_stripe *tgtdev_stripe = 5564 bbio->stripes + num_stripes; 5565 5566 tgtdev_stripe->physical = physical_of_found; 5567 tgtdev_stripe->length = 5568 bbio->stripes[index_srcdev].length; 5569 tgtdev_stripe->dev = dev_replace->tgtdev; 5570 bbio->tgtdev_map[index_srcdev] = num_stripes; 5571 5572 tgtdev_indexes++; 5573 num_stripes++; 5574 } 5575 } 5576 5577 *num_stripes_ret = num_stripes; 5578 *max_errors_ret = max_errors; 5579 bbio->num_tgtdevs = tgtdev_indexes; 5580 *bbio_ret = bbio; 5581 } 5582 5583 static bool need_full_stripe(enum btrfs_map_op op) 5584 { 5585 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 5586 } 5587 5588 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 5589 enum btrfs_map_op op, 5590 u64 logical, u64 *length, 5591 struct btrfs_bio **bbio_ret, 5592 int mirror_num, int need_raid_map) 5593 { 5594 struct extent_map *em; 5595 struct map_lookup *map; 5596 u64 offset; 5597 u64 stripe_offset; 5598 u64 stripe_nr; 5599 u64 stripe_len; 5600 u32 stripe_index; 5601 int i; 5602 int ret = 0; 5603 int num_stripes; 5604 int max_errors = 0; 5605 int tgtdev_indexes = 0; 5606 struct btrfs_bio *bbio = NULL; 5607 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 5608 int dev_replace_is_ongoing = 0; 5609 int num_alloc_stripes; 5610 int patch_the_first_stripe_for_dev_replace = 0; 5611 u64 physical_to_patch_in_first_stripe = 0; 5612 u64 raid56_full_stripe_start = (u64)-1; 5613 5614 if (op == BTRFS_MAP_DISCARD) 5615 return __btrfs_map_block_for_discard(fs_info, logical, 5616 *length, bbio_ret); 5617 5618 em = get_chunk_map(fs_info, logical, *length); 5619 if (IS_ERR(em)) 5620 
return PTR_ERR(em); 5621 5622 map = em->map_lookup; 5623 offset = logical - em->start; 5624 5625 stripe_len = map->stripe_len; 5626 stripe_nr = offset; 5627 /* 5628 * stripe_nr counts the total number of stripes we have to stride 5629 * to get to this block 5630 */ 5631 stripe_nr = div64_u64(stripe_nr, stripe_len); 5632 5633 stripe_offset = stripe_nr * stripe_len; 5634 if (offset < stripe_offset) { 5635 btrfs_crit(fs_info, 5636 "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu", 5637 stripe_offset, offset, em->start, logical, 5638 stripe_len); 5639 free_extent_map(em); 5640 return -EINVAL; 5641 } 5642 5643 /* stripe_offset is the offset of this block in its stripe*/ 5644 stripe_offset = offset - stripe_offset; 5645 5646 /* if we're here for raid56, we need to know the stripe aligned start */ 5647 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5648 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 5649 raid56_full_stripe_start = offset; 5650 5651 /* allow a write of a full stripe, but make sure we don't 5652 * allow straddling of stripes 5653 */ 5654 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 5655 full_stripe_len); 5656 raid56_full_stripe_start *= full_stripe_len; 5657 } 5658 5659 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 5660 u64 max_len; 5661 /* For writes to RAID[56], allow a full stripeset across all disks. 5662 For other RAID types and for RAID[56] reads, just allow a single 5663 stripe (on a single disk). */ 5664 if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 5665 (op == BTRFS_MAP_WRITE)) { 5666 max_len = stripe_len * nr_data_stripes(map) - 5667 (offset - raid56_full_stripe_start); 5668 } else { 5669 /* we limit the length of each bio to what fits in a stripe */ 5670 max_len = stripe_len - stripe_offset; 5671 } 5672 *length = min_t(u64, em->len - offset, max_len); 5673 } else { 5674 *length = em->len - offset; 5675 } 5676 5677 /* This is for when we're called from btrfs_merge_bio_hook() and all 5678 it cares about is the length */ 5679 if (!bbio_ret) 5680 goto out; 5681 5682 btrfs_dev_replace_lock(dev_replace, 0); 5683 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 5684 if (!dev_replace_is_ongoing) 5685 btrfs_dev_replace_unlock(dev_replace, 0); 5686 else 5687 btrfs_dev_replace_set_lock_blocking(dev_replace); 5688 5689 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 5690 !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 5691 ret = get_extra_mirror_from_replace(fs_info, logical, *length, 5692 dev_replace->srcdev->devid, 5693 &mirror_num, 5694 &physical_to_patch_in_first_stripe); 5695 if (ret) 5696 goto out; 5697 else 5698 patch_the_first_stripe_for_dev_replace = 1; 5699 } else if (mirror_num > map->num_stripes) { 5700 mirror_num = 0; 5701 } 5702 5703 num_stripes = 1; 5704 stripe_index = 0; 5705 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5706 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5707 &stripe_index); 5708 if (!need_full_stripe(op)) 5709 mirror_num = 1; 5710 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 5711 if (need_full_stripe(op)) 5712 num_stripes = map->num_stripes; 5713 else if (mirror_num) 5714 stripe_index = mirror_num - 1; 5715 else { 5716 stripe_index = find_live_mirror(fs_info, map, 0, 5717 map->num_stripes, 5718 current->pid % map->num_stripes, 5719 dev_replace_is_ongoing); 5720 mirror_num = stripe_index + 1; 5721 } 5722 5723 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 5724 if (need_full_stripe(op)) { 5725 
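			/*
			 * DUP keeps both copies on the same device, so a
			 * write (or a request for all read mirrors) has to
			 * cover every stripe of the map.
			 */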
num_stripes = map->num_stripes; 5726 } else if (mirror_num) { 5727 stripe_index = mirror_num - 1; 5728 } else { 5729 mirror_num = 1; 5730 } 5731 5732 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5733 u32 factor = map->num_stripes / map->sub_stripes; 5734 5735 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5736 stripe_index *= map->sub_stripes; 5737 5738 if (need_full_stripe(op)) 5739 num_stripes = map->sub_stripes; 5740 else if (mirror_num) 5741 stripe_index += mirror_num - 1; 5742 else { 5743 int old_stripe_index = stripe_index; 5744 stripe_index = find_live_mirror(fs_info, map, 5745 stripe_index, 5746 map->sub_stripes, stripe_index + 5747 current->pid % map->sub_stripes, 5748 dev_replace_is_ongoing); 5749 mirror_num = stripe_index - old_stripe_index + 1; 5750 } 5751 5752 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5753 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 5754 /* push stripe_nr back to the start of the full stripe */ 5755 stripe_nr = div64_u64(raid56_full_stripe_start, 5756 stripe_len * nr_data_stripes(map)); 5757 5758 /* RAID[56] write or recovery. Return all stripes */ 5759 num_stripes = map->num_stripes; 5760 max_errors = nr_parity_stripes(map); 5761 5762 *length = map->stripe_len; 5763 stripe_index = 0; 5764 stripe_offset = 0; 5765 } else { 5766 /* 5767 * Mirror #0 or #1 means the original data block. 5768 * Mirror #2 is RAID5 parity block. 5769 * Mirror #3 is RAID6 Q block. 5770 */ 5771 stripe_nr = div_u64_rem(stripe_nr, 5772 nr_data_stripes(map), &stripe_index); 5773 if (mirror_num > 1) 5774 stripe_index = nr_data_stripes(map) + 5775 mirror_num - 2; 5776 5777 /* We distribute the parity blocks across stripes */ 5778 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 5779 &stripe_index); 5780 if (!need_full_stripe(op) && mirror_num <= 1) 5781 mirror_num = 1; 5782 } 5783 } else { 5784 /* 5785 * after this, stripe_nr is the number of stripes on this 5786 * device we have to walk to find the data, and stripe_index is 5787 * the number of our device in the stripe array 5788 */ 5789 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5790 &stripe_index); 5791 mirror_num = stripe_index + 1; 5792 } 5793 if (stripe_index >= map->num_stripes) { 5794 btrfs_crit(fs_info, 5795 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 5796 stripe_index, map->num_stripes); 5797 ret = -EINVAL; 5798 goto out; 5799 } 5800 5801 num_alloc_stripes = num_stripes; 5802 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 5803 if (op == BTRFS_MAP_WRITE) 5804 num_alloc_stripes <<= 1; 5805 if (op == BTRFS_MAP_GET_READ_MIRRORS) 5806 num_alloc_stripes++; 5807 tgtdev_indexes = num_stripes; 5808 } 5809 5810 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); 5811 if (!bbio) { 5812 ret = -ENOMEM; 5813 goto out; 5814 } 5815 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) 5816 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); 5817 5818 /* build raid_map */ 5819 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 5820 (need_full_stripe(op) || mirror_num > 1)) { 5821 u64 tmp; 5822 unsigned rot; 5823 5824 bbio->raid_map = (u64 *)((void *)bbio->stripes + 5825 sizeof(struct btrfs_bio_stripe) * 5826 num_alloc_stripes + 5827 sizeof(int) * tgtdev_indexes); 5828 5829 /* Work out the disk rotation on this stripe-set */ 5830 div_u64_rem(stripe_nr, num_stripes, &rot); 5831 5832 /* Fill in the logical address of each stripe */ 5833 tmp = stripe_nr * nr_data_stripes(map); 5834 for (i = 0; i < 
nr_data_stripes(map); i++) 5835 bbio->raid_map[(i+rot) % num_stripes] = 5836 em->start + (tmp + i) * map->stripe_len; 5837 5838 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 5839 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5840 bbio->raid_map[(i+rot+1) % num_stripes] = 5841 RAID6_Q_STRIPE; 5842 } 5843 5844 5845 for (i = 0; i < num_stripes; i++) { 5846 bbio->stripes[i].physical = 5847 map->stripes[stripe_index].physical + 5848 stripe_offset + 5849 stripe_nr * map->stripe_len; 5850 bbio->stripes[i].dev = 5851 map->stripes[stripe_index].dev; 5852 stripe_index++; 5853 } 5854 5855 if (need_full_stripe(op)) 5856 max_errors = btrfs_chunk_max_errors(map); 5857 5858 if (bbio->raid_map) 5859 sort_parity_stripes(bbio, num_stripes); 5860 5861 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 5862 need_full_stripe(op)) { 5863 handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, 5864 &max_errors); 5865 } 5866 5867 *bbio_ret = bbio; 5868 bbio->map_type = map->type; 5869 bbio->num_stripes = num_stripes; 5870 bbio->max_errors = max_errors; 5871 bbio->mirror_num = mirror_num; 5872 5873 /* 5874 * this is the case that REQ_READ && dev_replace_is_ongoing && 5875 * mirror_num == num_stripes + 1 && dev_replace target drive is 5876 * available as a mirror 5877 */ 5878 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 5879 WARN_ON(num_stripes > 1); 5880 bbio->stripes[0].dev = dev_replace->tgtdev; 5881 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 5882 bbio->mirror_num = map->num_stripes + 1; 5883 } 5884 out: 5885 if (dev_replace_is_ongoing) { 5886 btrfs_dev_replace_clear_lock_blocking(dev_replace); 5887 btrfs_dev_replace_unlock(dev_replace, 0); 5888 } 5889 free_extent_map(em); 5890 return ret; 5891 } 5892 5893 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 5894 u64 logical, u64 *length, 5895 struct btrfs_bio **bbio_ret, int mirror_num) 5896 { 5897 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 5898 mirror_num, 0); 5899 } 5900 5901 /* For Scrub/replace */ 5902 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 5903 u64 logical, u64 *length, 5904 struct btrfs_bio **bbio_ret) 5905 { 5906 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); 5907 } 5908 5909 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, 5910 u64 chunk_start, u64 physical, u64 devid, 5911 u64 **logical, int *naddrs, int *stripe_len) 5912 { 5913 struct extent_map *em; 5914 struct map_lookup *map; 5915 u64 *buf; 5916 u64 bytenr; 5917 u64 length; 5918 u64 stripe_nr; 5919 u64 rmap_len; 5920 int i, j, nr = 0; 5921 5922 em = get_chunk_map(fs_info, chunk_start, 1); 5923 if (IS_ERR(em)) 5924 return -EIO; 5925 5926 map = em->map_lookup; 5927 length = em->len; 5928 rmap_len = map->stripe_len; 5929 5930 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5931 length = div_u64(length, map->num_stripes / map->sub_stripes); 5932 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5933 length = div_u64(length, map->num_stripes); 5934 else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5935 length = div_u64(length, nr_data_stripes(map)); 5936 rmap_len = map->stripe_len * nr_data_stripes(map); 5937 } 5938 5939 buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); 5940 BUG_ON(!buf); /* -ENOMEM */ 5941 5942 for (i = 0; i < map->num_stripes; i++) { 5943 if (devid && map->stripes[i].dev->devid != devid) 5944 continue; 5945 if (map->stripes[i].physical > physical || 5946 map->stripes[i].physical + length <= physical) 5947 continue; 
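		/*
		 * This stripe covers the physical address; translate the
		 * offset within the stripe back to a logical stripe number
		 * and from there to a logical address inside the chunk.
		 */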
5948 5949 stripe_nr = physical - map->stripes[i].physical; 5950 stripe_nr = div64_u64(stripe_nr, map->stripe_len); 5951 5952 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5953 stripe_nr = stripe_nr * map->num_stripes + i; 5954 stripe_nr = div_u64(stripe_nr, map->sub_stripes); 5955 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5956 stripe_nr = stripe_nr * map->num_stripes + i; 5957 } /* else if RAID[56], multiply by nr_data_stripes(). 5958 * Alternatively, just use rmap_len below instead of 5959 * map->stripe_len */ 5960 5961 bytenr = chunk_start + stripe_nr * rmap_len; 5962 WARN_ON(nr >= map->num_stripes); 5963 for (j = 0; j < nr; j++) { 5964 if (buf[j] == bytenr) 5965 break; 5966 } 5967 if (j == nr) { 5968 WARN_ON(nr >= map->num_stripes); 5969 buf[nr++] = bytenr; 5970 } 5971 } 5972 5973 *logical = buf; 5974 *naddrs = nr; 5975 *stripe_len = rmap_len; 5976 5977 free_extent_map(em); 5978 return 0; 5979 } 5980 5981 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) 5982 { 5983 bio->bi_private = bbio->private; 5984 bio->bi_end_io = bbio->end_io; 5985 bio_endio(bio); 5986 5987 btrfs_put_bbio(bbio); 5988 } 5989 5990 static void btrfs_end_bio(struct bio *bio) 5991 { 5992 struct btrfs_bio *bbio = bio->bi_private; 5993 int is_orig_bio = 0; 5994 5995 if (bio->bi_status) { 5996 atomic_inc(&bbio->error); 5997 if (bio->bi_status == BLK_STS_IOERR || 5998 bio->bi_status == BLK_STS_TARGET) { 5999 unsigned int stripe_index = 6000 btrfs_io_bio(bio)->stripe_index; 6001 struct btrfs_device *dev; 6002 6003 BUG_ON(stripe_index >= bbio->num_stripes); 6004 dev = bbio->stripes[stripe_index].dev; 6005 if (dev->bdev) { 6006 if (bio_op(bio) == REQ_OP_WRITE) 6007 btrfs_dev_stat_inc(dev, 6008 BTRFS_DEV_STAT_WRITE_ERRS); 6009 else 6010 btrfs_dev_stat_inc(dev, 6011 BTRFS_DEV_STAT_READ_ERRS); 6012 if (bio->bi_opf & REQ_PREFLUSH) 6013 btrfs_dev_stat_inc(dev, 6014 BTRFS_DEV_STAT_FLUSH_ERRS); 6015 btrfs_dev_stat_print_on_error(dev); 6016 } 6017 } 6018 } 6019 6020 if (bio == bbio->orig_bio) 6021 is_orig_bio = 1; 6022 6023 btrfs_bio_counter_dec(bbio->fs_info); 6024 6025 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6026 if (!is_orig_bio) { 6027 bio_put(bio); 6028 bio = bbio->orig_bio; 6029 } 6030 6031 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6032 /* only send an error to the higher layers if it is 6033 * beyond the tolerance of the btrfs bio 6034 */ 6035 if (atomic_read(&bbio->error) > bbio->max_errors) { 6036 bio->bi_status = BLK_STS_IOERR; 6037 } else { 6038 /* 6039 * this bio is actually up to date, we didn't 6040 * go over the max number of errors 6041 */ 6042 bio->bi_status = BLK_STS_OK; 6043 } 6044 6045 btrfs_end_bbio(bbio, bio); 6046 } else if (!is_orig_bio) { 6047 bio_put(bio); 6048 } 6049 } 6050 6051 /* 6052 * see run_scheduled_bios for a description of why bios are collected for 6053 * async submit. 6054 * 6055 * This will add one bio to the pending list for a device and make sure 6056 * the work struct is scheduled. 
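*
* Reads skip this path entirely and are submitted directly.  Writes are
* linked onto either pending_bios or, when op_is_sync() is true, onto
* pending_sync_bios so that run_scheduled_bios can give them priority,
* and the per-device worker is only queued if it is not already running.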
6057 */ 6058 static noinline void btrfs_schedule_bio(struct btrfs_device *device, 6059 struct bio *bio) 6060 { 6061 struct btrfs_fs_info *fs_info = device->fs_info; 6062 int should_queue = 1; 6063 struct btrfs_pending_bios *pending_bios; 6064 6065 if (device->missing || !device->bdev) { 6066 bio_io_error(bio); 6067 return; 6068 } 6069 6070 /* don't bother with additional async steps for reads, right now */ 6071 if (bio_op(bio) == REQ_OP_READ) { 6072 bio_get(bio); 6073 btrfsic_submit_bio(bio); 6074 bio_put(bio); 6075 return; 6076 } 6077 6078 WARN_ON(bio->bi_next); 6079 bio->bi_next = NULL; 6080 6081 spin_lock(&device->io_lock); 6082 if (op_is_sync(bio->bi_opf)) 6083 pending_bios = &device->pending_sync_bios; 6084 else 6085 pending_bios = &device->pending_bios; 6086 6087 if (pending_bios->tail) 6088 pending_bios->tail->bi_next = bio; 6089 6090 pending_bios->tail = bio; 6091 if (!pending_bios->head) 6092 pending_bios->head = bio; 6093 if (device->running_pending) 6094 should_queue = 0; 6095 6096 spin_unlock(&device->io_lock); 6097 6098 if (should_queue) 6099 btrfs_queue_work(fs_info->submit_workers, &device->work); 6100 } 6101 6102 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, 6103 u64 physical, int dev_nr, int async) 6104 { 6105 struct btrfs_device *dev = bbio->stripes[dev_nr].dev; 6106 struct btrfs_fs_info *fs_info = bbio->fs_info; 6107 6108 bio->bi_private = bbio; 6109 btrfs_io_bio(bio)->stripe_index = dev_nr; 6110 bio->bi_end_io = btrfs_end_bio; 6111 bio->bi_iter.bi_sector = physical >> 9; 6112 #ifdef DEBUG 6113 { 6114 struct rcu_string *name; 6115 6116 rcu_read_lock(); 6117 name = rcu_dereference(dev->name); 6118 btrfs_debug(fs_info, 6119 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6120 bio_op(bio), bio->bi_opf, 6121 (u64)bio->bi_iter.bi_sector, 6122 (u_long)dev->bdev->bd_dev, name->str, dev->devid, 6123 bio->bi_iter.bi_size); 6124 rcu_read_unlock(); 6125 } 6126 #endif 6127 bio_set_dev(bio, dev->bdev); 6128 6129 btrfs_bio_counter_inc_noblocked(fs_info); 6130 6131 if (async) 6132 btrfs_schedule_bio(dev, bio); 6133 else 6134 btrfsic_submit_bio(bio); 6135 } 6136 6137 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 6138 { 6139 atomic_inc(&bbio->error); 6140 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6141 /* Should be the original bio. 
*/ 6142 WARN_ON(bio != bbio->orig_bio); 6143 6144 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6145 bio->bi_iter.bi_sector = logical >> 9; 6146 if (atomic_read(&bbio->error) > bbio->max_errors) 6147 bio->bi_status = BLK_STS_IOERR; 6148 else 6149 bio->bi_status = BLK_STS_OK; 6150 btrfs_end_bbio(bbio, bio); 6151 } 6152 } 6153 6154 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 6155 int mirror_num, int async_submit) 6156 { 6157 struct btrfs_device *dev; 6158 struct bio *first_bio = bio; 6159 u64 logical = (u64)bio->bi_iter.bi_sector << 9; 6160 u64 length = 0; 6161 u64 map_length; 6162 int ret; 6163 int dev_nr; 6164 int total_devs; 6165 struct btrfs_bio *bbio = NULL; 6166 6167 length = bio->bi_iter.bi_size; 6168 map_length = length; 6169 6170 btrfs_bio_counter_inc_blocked(fs_info); 6171 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, 6172 &map_length, &bbio, mirror_num, 1); 6173 if (ret) { 6174 btrfs_bio_counter_dec(fs_info); 6175 return errno_to_blk_status(ret); 6176 } 6177 6178 total_devs = bbio->num_stripes; 6179 bbio->orig_bio = first_bio; 6180 bbio->private = first_bio->bi_private; 6181 bbio->end_io = first_bio->bi_end_io; 6182 bbio->fs_info = fs_info; 6183 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 6184 6185 if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 6186 ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { 6187 /* In this case, map_length has been set to the length of 6188 a single stripe; not the whole write */ 6189 if (bio_op(bio) == REQ_OP_WRITE) { 6190 ret = raid56_parity_write(fs_info, bio, bbio, 6191 map_length); 6192 } else { 6193 ret = raid56_parity_recover(fs_info, bio, bbio, 6194 map_length, mirror_num, 1); 6195 } 6196 6197 btrfs_bio_counter_dec(fs_info); 6198 return errno_to_blk_status(ret); 6199 } 6200 6201 if (map_length < length) { 6202 btrfs_crit(fs_info, 6203 "mapping failed logical %llu bio len %llu len %llu", 6204 logical, length, map_length); 6205 BUG(); 6206 } 6207 6208 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { 6209 dev = bbio->stripes[dev_nr].dev; 6210 if (!dev || !dev->bdev || 6211 (bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) { 6212 bbio_error(bbio, first_bio, logical); 6213 continue; 6214 } 6215 6216 if (dev_nr < total_devs - 1) 6217 bio = btrfs_bio_clone(first_bio); 6218 else 6219 bio = first_bio; 6220 6221 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, 6222 dev_nr, async_submit); 6223 } 6224 btrfs_bio_counter_dec(fs_info); 6225 return BLK_STS_OK; 6226 } 6227 6228 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 6229 u8 *uuid, u8 *fsid) 6230 { 6231 struct btrfs_device *device; 6232 struct btrfs_fs_devices *cur_devices; 6233 6234 cur_devices = fs_info->fs_devices; 6235 while (cur_devices) { 6236 if (!fsid || 6237 !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) { 6238 device = find_device(cur_devices, devid, uuid); 6239 if (device) 6240 return device; 6241 } 6242 cur_devices = cur_devices->seed; 6243 } 6244 return NULL; 6245 } 6246 6247 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6248 u64 devid, u8 *dev_uuid) 6249 { 6250 struct btrfs_device *device; 6251 6252 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6253 if (IS_ERR(device)) 6254 return device; 6255 6256 list_add(&device->dev_list, &fs_devices->devices); 6257 device->fs_devices = fs_devices; 6258 fs_devices->num_devices++; 6259 6260 device->missing = 1; 6261 fs_devices->missing_devices++; 6262 6263 return device; 6264 } 6265 6266 /** 
6267 * btrfs_alloc_device - allocate struct btrfs_device 6268 * @fs_info: used only for generating a new devid, can be NULL if 6269 * devid is provided (i.e. @devid != NULL). 6270 * @devid: a pointer to devid for this device. If NULL a new devid 6271 * is generated. 6272 * @uuid: a pointer to UUID for this device. If NULL a new UUID 6273 * is generated. 6274 * 6275 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 6276 * on error. Returned struct is not linked onto any lists and can be 6277 * destroyed with kfree() right away. 6278 */ 6279 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 6280 const u64 *devid, 6281 const u8 *uuid) 6282 { 6283 struct btrfs_device *dev; 6284 u64 tmp; 6285 6286 if (WARN_ON(!devid && !fs_info)) 6287 return ERR_PTR(-EINVAL); 6288 6289 dev = __alloc_device(); 6290 if (IS_ERR(dev)) 6291 return dev; 6292 6293 if (devid) 6294 tmp = *devid; 6295 else { 6296 int ret; 6297 6298 ret = find_next_devid(fs_info, &tmp); 6299 if (ret) { 6300 bio_put(dev->flush_bio); 6301 kfree(dev); 6302 return ERR_PTR(ret); 6303 } 6304 } 6305 dev->devid = tmp; 6306 6307 if (uuid) 6308 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 6309 else 6310 generate_random_uuid(dev->uuid); 6311 6312 btrfs_init_work(&dev->work, btrfs_submit_helper, 6313 pending_bios_fn, NULL, NULL); 6314 6315 return dev; 6316 } 6317 6318 /* Return -EIO if any error, otherwise return 0. */ 6319 static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, 6320 struct extent_buffer *leaf, 6321 struct btrfs_chunk *chunk, u64 logical) 6322 { 6323 u64 length; 6324 u64 stripe_len; 6325 u16 num_stripes; 6326 u16 sub_stripes; 6327 u64 type; 6328 6329 length = btrfs_chunk_length(leaf, chunk); 6330 stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6331 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6332 sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6333 type = btrfs_chunk_type(leaf, chunk); 6334 6335 if (!num_stripes) { 6336 btrfs_err(fs_info, "invalid chunk num_stripes: %u", 6337 num_stripes); 6338 return -EIO; 6339 } 6340 if (!IS_ALIGNED(logical, fs_info->sectorsize)) { 6341 btrfs_err(fs_info, "invalid chunk logical %llu", logical); 6342 return -EIO; 6343 } 6344 if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { 6345 btrfs_err(fs_info, "invalid chunk sectorsize %u", 6346 btrfs_chunk_sector_size(leaf, chunk)); 6347 return -EIO; 6348 } 6349 if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { 6350 btrfs_err(fs_info, "invalid chunk length %llu", length); 6351 return -EIO; 6352 } 6353 if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { 6354 btrfs_err(fs_info, "invalid chunk stripe length: %llu", 6355 stripe_len); 6356 return -EIO; 6357 } 6358 if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & 6359 type) { 6360 btrfs_err(fs_info, "unrecognized chunk type: %llu", 6361 ~(BTRFS_BLOCK_GROUP_TYPE_MASK | 6362 BTRFS_BLOCK_GROUP_PROFILE_MASK) & 6363 btrfs_chunk_type(leaf, chunk)); 6364 return -EIO; 6365 } 6366 if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || 6367 (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || 6368 (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || 6369 (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || 6370 (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || 6371 ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && 6372 num_stripes != 1)) { 6373 btrfs_err(fs_info, 6374 "invalid num_stripes:sub_stripes %u:%u for profile %llu", 6375 num_stripes, sub_stripes, 6376 type & 
BTRFS_BLOCK_GROUP_PROFILE_MASK); 6377 return -EIO; 6378 } 6379 6380 return 0; 6381 } 6382 6383 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 6384 u64 devid, u8 *uuid, bool error) 6385 { 6386 if (error) 6387 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 6388 devid, uuid); 6389 else 6390 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 6391 devid, uuid); 6392 } 6393 6394 static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, 6395 struct extent_buffer *leaf, 6396 struct btrfs_chunk *chunk) 6397 { 6398 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 6399 struct map_lookup *map; 6400 struct extent_map *em; 6401 u64 logical; 6402 u64 length; 6403 u64 devid; 6404 u8 uuid[BTRFS_UUID_SIZE]; 6405 int num_stripes; 6406 int ret; 6407 int i; 6408 6409 logical = key->offset; 6410 length = btrfs_chunk_length(leaf, chunk); 6411 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6412 6413 ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical); 6414 if (ret) 6415 return ret; 6416 6417 read_lock(&map_tree->map_tree.lock); 6418 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 6419 read_unlock(&map_tree->map_tree.lock); 6420 6421 /* already mapped? */ 6422 if (em && em->start <= logical && em->start + em->len > logical) { 6423 free_extent_map(em); 6424 return 0; 6425 } else if (em) { 6426 free_extent_map(em); 6427 } 6428 6429 em = alloc_extent_map(); 6430 if (!em) 6431 return -ENOMEM; 6432 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 6433 if (!map) { 6434 free_extent_map(em); 6435 return -ENOMEM; 6436 } 6437 6438 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 6439 em->map_lookup = map; 6440 em->start = logical; 6441 em->len = length; 6442 em->orig_start = 0; 6443 em->block_start = 0; 6444 em->block_len = em->len; 6445 6446 map->num_stripes = num_stripes; 6447 map->io_width = btrfs_chunk_io_width(leaf, chunk); 6448 map->io_align = btrfs_chunk_io_align(leaf, chunk); 6449 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6450 map->type = btrfs_chunk_type(leaf, chunk); 6451 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6452 for (i = 0; i < num_stripes; i++) { 6453 map->stripes[i].physical = 6454 btrfs_stripe_offset_nr(leaf, chunk, i); 6455 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 6456 read_extent_buffer(leaf, uuid, (unsigned long) 6457 btrfs_stripe_dev_uuid_nr(chunk, i), 6458 BTRFS_UUID_SIZE); 6459 map->stripes[i].dev = btrfs_find_device(fs_info, devid, 6460 uuid, NULL); 6461 if (!map->stripes[i].dev && 6462 !btrfs_test_opt(fs_info, DEGRADED)) { 6463 free_extent_map(em); 6464 btrfs_report_missing_device(fs_info, devid, uuid, true); 6465 return -ENOENT; 6466 } 6467 if (!map->stripes[i].dev) { 6468 map->stripes[i].dev = 6469 add_missing_dev(fs_info->fs_devices, devid, 6470 uuid); 6471 if (IS_ERR(map->stripes[i].dev)) { 6472 free_extent_map(em); 6473 btrfs_err(fs_info, 6474 "failed to init missing dev %llu: %ld", 6475 devid, PTR_ERR(map->stripes[i].dev)); 6476 return PTR_ERR(map->stripes[i].dev); 6477 } 6478 btrfs_report_missing_device(fs_info, devid, uuid, false); 6479 } 6480 map->stripes[i].dev->in_fs_metadata = 1; 6481 } 6482 6483 write_lock(&map_tree->map_tree.lock); 6484 ret = add_extent_mapping(&map_tree->map_tree, em, 0); 6485 write_unlock(&map_tree->map_tree.lock); 6486 BUG_ON(ret); /* Tree corruption */ 6487 free_extent_map(em); 6488 6489 return 0; 6490 } 6491 6492 static void fill_device_from_item(struct extent_buffer *leaf, 6493 struct btrfs_dev_item *dev_item, 6494 struct 
btrfs_device *device) 6495 { 6496 unsigned long ptr; 6497 6498 device->devid = btrfs_device_id(leaf, dev_item); 6499 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 6500 device->total_bytes = device->disk_total_bytes; 6501 device->commit_total_bytes = device->disk_total_bytes; 6502 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 6503 device->commit_bytes_used = device->bytes_used; 6504 device->type = btrfs_device_type(leaf, dev_item); 6505 device->io_align = btrfs_device_io_align(leaf, dev_item); 6506 device->io_width = btrfs_device_io_width(leaf, dev_item); 6507 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 6508 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 6509 device->is_tgtdev_for_dev_replace = 0; 6510 6511 ptr = btrfs_device_uuid(dev_item); 6512 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 6513 } 6514 6515 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 6516 u8 *fsid) 6517 { 6518 struct btrfs_fs_devices *fs_devices; 6519 int ret; 6520 6521 BUG_ON(!mutex_is_locked(&uuid_mutex)); 6522 ASSERT(fsid); 6523 6524 fs_devices = fs_info->fs_devices->seed; 6525 while (fs_devices) { 6526 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 6527 return fs_devices; 6528 6529 fs_devices = fs_devices->seed; 6530 } 6531 6532 fs_devices = find_fsid(fsid); 6533 if (!fs_devices) { 6534 if (!btrfs_test_opt(fs_info, DEGRADED)) 6535 return ERR_PTR(-ENOENT); 6536 6537 fs_devices = alloc_fs_devices(fsid); 6538 if (IS_ERR(fs_devices)) 6539 return fs_devices; 6540 6541 fs_devices->seeding = 1; 6542 fs_devices->opened = 1; 6543 return fs_devices; 6544 } 6545 6546 fs_devices = clone_fs_devices(fs_devices); 6547 if (IS_ERR(fs_devices)) 6548 return fs_devices; 6549 6550 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 6551 fs_info->bdev_holder); 6552 if (ret) { 6553 free_fs_devices(fs_devices); 6554 fs_devices = ERR_PTR(ret); 6555 goto out; 6556 } 6557 6558 if (!fs_devices->seeding) { 6559 __btrfs_close_devices(fs_devices); 6560 free_fs_devices(fs_devices); 6561 fs_devices = ERR_PTR(-EINVAL); 6562 goto out; 6563 } 6564 6565 fs_devices->seed = fs_info->fs_devices->seed; 6566 fs_info->fs_devices->seed = fs_devices; 6567 out: 6568 return fs_devices; 6569 } 6570 6571 static int read_one_dev(struct btrfs_fs_info *fs_info, 6572 struct extent_buffer *leaf, 6573 struct btrfs_dev_item *dev_item) 6574 { 6575 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6576 struct btrfs_device *device; 6577 u64 devid; 6578 int ret; 6579 u8 fs_uuid[BTRFS_FSID_SIZE]; 6580 u8 dev_uuid[BTRFS_UUID_SIZE]; 6581 6582 devid = btrfs_device_id(leaf, dev_item); 6583 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 6584 BTRFS_UUID_SIZE); 6585 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 6586 BTRFS_FSID_SIZE); 6587 6588 if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) { 6589 fs_devices = open_seed_devices(fs_info, fs_uuid); 6590 if (IS_ERR(fs_devices)) 6591 return PTR_ERR(fs_devices); 6592 } 6593 6594 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 6595 if (!device) { 6596 if (!btrfs_test_opt(fs_info, DEGRADED)) { 6597 btrfs_report_missing_device(fs_info, devid, 6598 dev_uuid, true); 6599 return -ENOENT; 6600 } 6601 6602 device = add_missing_dev(fs_devices, devid, dev_uuid); 6603 if (IS_ERR(device)) { 6604 btrfs_err(fs_info, 6605 "failed to add missing dev %llu: %ld", 6606 devid, PTR_ERR(device)); 6607 return PTR_ERR(device); 6608 } 6609 btrfs_report_missing_device(fs_info, devid, 
dev_uuid, false);
6610 } else {
6611 if (!device->bdev) {
6612 if (!btrfs_test_opt(fs_info, DEGRADED)) {
6613 btrfs_report_missing_device(fs_info,
6614 devid, dev_uuid, true);
6615 return -ENOENT;
6616 }
6617 btrfs_report_missing_device(fs_info, devid,
6618 dev_uuid, false);
6619 }
6620
6621 if (!device->bdev && !device->missing) {
6622 /*
6623 * This happens when a device that was properly set up
6624 * in the device info lists suddenly goes bad.
6625 * device->bdev is NULL, and so we have to set
6626 * device->missing to 1 here
6627 */
6628 device->fs_devices->missing_devices++;
6629 device->missing = 1;
6630 }
6631
6632 /* Move the device to its own fs_devices */
6633 if (device->fs_devices != fs_devices) {
6634 ASSERT(device->missing);
6635
6636 list_move(&device->dev_list, &fs_devices->devices);
6637 device->fs_devices->num_devices--;
6638 fs_devices->num_devices++;
6639
6640 device->fs_devices->missing_devices--;
6641 fs_devices->missing_devices++;
6642
6643 device->fs_devices = fs_devices;
6644 }
6645 }
6646
6647 if (device->fs_devices != fs_info->fs_devices) {
6648 BUG_ON(device->writeable);
6649 if (device->generation !=
6650 btrfs_device_generation(leaf, dev_item))
6651 return -EINVAL;
6652 }
6653
6654 fill_device_from_item(leaf, dev_item, device);
6655 device->in_fs_metadata = 1;
6656 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
6657 device->fs_devices->total_rw_bytes += device->total_bytes;
6658 atomic64_add(device->total_bytes - device->bytes_used,
6659 &fs_info->free_chunk_space);
6660 }
6661 ret = 0;
6662 return ret;
6663 }
6664
6665 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
6666 {
6667 struct btrfs_root *root = fs_info->tree_root;
6668 struct btrfs_super_block *super_copy = fs_info->super_copy;
6669 struct extent_buffer *sb;
6670 struct btrfs_disk_key *disk_key;
6671 struct btrfs_chunk *chunk;
6672 u8 *array_ptr;
6673 unsigned long sb_array_offset;
6674 int ret = 0;
6675 u32 num_stripes;
6676 u32 array_size;
6677 u32 len = 0;
6678 u32 cur_offset;
6679 u64 type;
6680 struct btrfs_key key;
6681
6682 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
6683 /*
6684 * This will create an extent buffer of nodesize; the superblock size is
6685 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
6686 * overallocate but we can keep it as-is, only the first page is used.
6687 */
6688 sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
6689 if (IS_ERR(sb))
6690 return PTR_ERR(sb);
6691 set_extent_buffer_uptodate(sb);
6692 btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
6693 /*
6694 * The sb extent buffer is artificial and just used to read the system array.
6695 * The set_extent_buffer_uptodate() call does not properly mark all its
6696 * pages up-to-date when the page is larger: the extent does not cover the
6697 * whole page and consequently check_page_uptodate does not find all
6698 * the page's extents up-to-date (the hole beyond sb), and
6699 * write_extent_buffer then triggers a WARN_ON.
6700 *
6701 * Regular short extents go through the mark_extent_buffer_dirty/writeback cycle,
6702 * but sb spans only this function. Add an explicit SetPageUptodate call
6703 * to silence the warning, e.g. on PowerPC 64.
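*
* (BTRFS_SUPER_INFO_SIZE is 4K, so the SetPageUptodate call below only
* matters on configurations where the page size is larger, e.g. 64K
* pages.)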
6704 */ 6705 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 6706 SetPageUptodate(sb->pages[0]); 6707 6708 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 6709 array_size = btrfs_super_sys_array_size(super_copy); 6710 6711 array_ptr = super_copy->sys_chunk_array; 6712 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 6713 cur_offset = 0; 6714 6715 while (cur_offset < array_size) { 6716 disk_key = (struct btrfs_disk_key *)array_ptr; 6717 len = sizeof(*disk_key); 6718 if (cur_offset + len > array_size) 6719 goto out_short_read; 6720 6721 btrfs_disk_key_to_cpu(&key, disk_key); 6722 6723 array_ptr += len; 6724 sb_array_offset += len; 6725 cur_offset += len; 6726 6727 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 6728 chunk = (struct btrfs_chunk *)sb_array_offset; 6729 /* 6730 * At least one btrfs_chunk with one stripe must be 6731 * present, exact stripe count check comes afterwards 6732 */ 6733 len = btrfs_chunk_item_size(1); 6734 if (cur_offset + len > array_size) 6735 goto out_short_read; 6736 6737 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 6738 if (!num_stripes) { 6739 btrfs_err(fs_info, 6740 "invalid number of stripes %u in sys_array at offset %u", 6741 num_stripes, cur_offset); 6742 ret = -EIO; 6743 break; 6744 } 6745 6746 type = btrfs_chunk_type(sb, chunk); 6747 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 6748 btrfs_err(fs_info, 6749 "invalid chunk type %llu in sys_array at offset %u", 6750 type, cur_offset); 6751 ret = -EIO; 6752 break; 6753 } 6754 6755 len = btrfs_chunk_item_size(num_stripes); 6756 if (cur_offset + len > array_size) 6757 goto out_short_read; 6758 6759 ret = read_one_chunk(fs_info, &key, sb, chunk); 6760 if (ret) 6761 break; 6762 } else { 6763 btrfs_err(fs_info, 6764 "unexpected item type %u in sys_array at offset %u", 6765 (u32)key.type, cur_offset); 6766 ret = -EIO; 6767 break; 6768 } 6769 array_ptr += len; 6770 sb_array_offset += len; 6771 cur_offset += len; 6772 } 6773 clear_extent_buffer_uptodate(sb); 6774 free_extent_buffer_stale(sb); 6775 return ret; 6776 6777 out_short_read: 6778 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 6779 len, cur_offset); 6780 clear_extent_buffer_uptodate(sb); 6781 free_extent_buffer_stale(sb); 6782 return -EIO; 6783 } 6784 6785 /* 6786 * Check if all chunks in the fs are OK for read-write degraded mount 6787 * 6788 * Return true if all chunks meet the minimal RW mount requirements. 6789 * Return false if any chunk doesn't meet the minimal RW mount requirements. 6790 */ 6791 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info) 6792 { 6793 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 6794 struct extent_map *em; 6795 u64 next_start = 0; 6796 bool ret = true; 6797 6798 read_lock(&map_tree->map_tree.lock); 6799 em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1); 6800 read_unlock(&map_tree->map_tree.lock); 6801 /* No chunk at all? 
Return false anyway */ 6802 if (!em) { 6803 ret = false; 6804 goto out; 6805 } 6806 while (em) { 6807 struct map_lookup *map; 6808 int missing = 0; 6809 int max_tolerated; 6810 int i; 6811 6812 map = em->map_lookup; 6813 max_tolerated = 6814 btrfs_get_num_tolerated_disk_barrier_failures( 6815 map->type); 6816 for (i = 0; i < map->num_stripes; i++) { 6817 struct btrfs_device *dev = map->stripes[i].dev; 6818 6819 if (!dev || !dev->bdev || dev->missing || 6820 dev->last_flush_error) 6821 missing++; 6822 } 6823 if (missing > max_tolerated) { 6824 btrfs_warn(fs_info, 6825 "chunk %llu missing %d devices, max tolerance is %d for writeable mount", 6826 em->start, missing, max_tolerated); 6827 free_extent_map(em); 6828 ret = false; 6829 goto out; 6830 } 6831 next_start = extent_map_end(em); 6832 free_extent_map(em); 6833 6834 read_lock(&map_tree->map_tree.lock); 6835 em = lookup_extent_mapping(&map_tree->map_tree, next_start, 6836 (u64)(-1) - next_start); 6837 read_unlock(&map_tree->map_tree.lock); 6838 } 6839 out: 6840 return ret; 6841 } 6842 6843 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 6844 { 6845 struct btrfs_root *root = fs_info->chunk_root; 6846 struct btrfs_path *path; 6847 struct extent_buffer *leaf; 6848 struct btrfs_key key; 6849 struct btrfs_key found_key; 6850 int ret; 6851 int slot; 6852 u64 total_dev = 0; 6853 6854 path = btrfs_alloc_path(); 6855 if (!path) 6856 return -ENOMEM; 6857 6858 mutex_lock(&uuid_mutex); 6859 mutex_lock(&fs_info->chunk_mutex); 6860 6861 /* 6862 * Read all device items, and then all the chunk items. All 6863 * device items are found before any chunk item (their object id 6864 * is smaller than the lowest possible object id for a chunk 6865 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 6866 */ 6867 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 6868 key.offset = 0; 6869 key.type = 0; 6870 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6871 if (ret < 0) 6872 goto error; 6873 while (1) { 6874 leaf = path->nodes[0]; 6875 slot = path->slots[0]; 6876 if (slot >= btrfs_header_nritems(leaf)) { 6877 ret = btrfs_next_leaf(root, path); 6878 if (ret == 0) 6879 continue; 6880 if (ret < 0) 6881 goto error; 6882 break; 6883 } 6884 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6885 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 6886 struct btrfs_dev_item *dev_item; 6887 dev_item = btrfs_item_ptr(leaf, slot, 6888 struct btrfs_dev_item); 6889 ret = read_one_dev(fs_info, leaf, dev_item); 6890 if (ret) 6891 goto error; 6892 total_dev++; 6893 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 6894 struct btrfs_chunk *chunk; 6895 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 6896 ret = read_one_chunk(fs_info, &found_key, leaf, chunk); 6897 if (ret) 6898 goto error; 6899 } 6900 path->slots[0]++; 6901 } 6902 6903 /* 6904 * After loading chunk tree, we've got all device information, 6905 * do another round of validation checks. 
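*
* The checks below catch a super_num_devices that does not match the
* number of device items found in the tree, and a super_total_bytes that
* is smaller than the writable bytes of the devices we did find.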
6906 */ 6907 if (total_dev != fs_info->fs_devices->total_devices) { 6908 btrfs_err(fs_info, 6909 "super_num_devices %llu mismatch with num_devices %llu found here", 6910 btrfs_super_num_devices(fs_info->super_copy), 6911 total_dev); 6912 ret = -EINVAL; 6913 goto error; 6914 } 6915 if (btrfs_super_total_bytes(fs_info->super_copy) < 6916 fs_info->fs_devices->total_rw_bytes) { 6917 btrfs_err(fs_info, 6918 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", 6919 btrfs_super_total_bytes(fs_info->super_copy), 6920 fs_info->fs_devices->total_rw_bytes); 6921 ret = -EINVAL; 6922 goto error; 6923 } 6924 ret = 0; 6925 error: 6926 mutex_unlock(&fs_info->chunk_mutex); 6927 mutex_unlock(&uuid_mutex); 6928 6929 btrfs_free_path(path); 6930 return ret; 6931 } 6932 6933 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 6934 { 6935 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6936 struct btrfs_device *device; 6937 6938 while (fs_devices) { 6939 mutex_lock(&fs_devices->device_list_mutex); 6940 list_for_each_entry(device, &fs_devices->devices, dev_list) 6941 device->fs_info = fs_info; 6942 mutex_unlock(&fs_devices->device_list_mutex); 6943 6944 fs_devices = fs_devices->seed; 6945 } 6946 } 6947 6948 static void __btrfs_reset_dev_stats(struct btrfs_device *dev) 6949 { 6950 int i; 6951 6952 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 6953 btrfs_dev_stat_reset(dev, i); 6954 } 6955 6956 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 6957 { 6958 struct btrfs_key key; 6959 struct btrfs_key found_key; 6960 struct btrfs_root *dev_root = fs_info->dev_root; 6961 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6962 struct extent_buffer *eb; 6963 int slot; 6964 int ret = 0; 6965 struct btrfs_device *device; 6966 struct btrfs_path *path = NULL; 6967 int i; 6968 6969 path = btrfs_alloc_path(); 6970 if (!path) { 6971 ret = -ENOMEM; 6972 goto out; 6973 } 6974 6975 mutex_lock(&fs_devices->device_list_mutex); 6976 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6977 int item_size; 6978 struct btrfs_dev_stats_item *ptr; 6979 6980 key.objectid = BTRFS_DEV_STATS_OBJECTID; 6981 key.type = BTRFS_PERSISTENT_ITEM_KEY; 6982 key.offset = device->devid; 6983 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 6984 if (ret) { 6985 __btrfs_reset_dev_stats(device); 6986 device->dev_stats_valid = 1; 6987 btrfs_release_path(path); 6988 continue; 6989 } 6990 slot = path->slots[0]; 6991 eb = path->nodes[0]; 6992 btrfs_item_key_to_cpu(eb, &found_key, slot); 6993 item_size = btrfs_item_size_nr(eb, slot); 6994 6995 ptr = btrfs_item_ptr(eb, slot, 6996 struct btrfs_dev_stats_item); 6997 6998 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 6999 if (item_size >= (1 + i) * sizeof(__le64)) 7000 btrfs_dev_stat_set(device, i, 7001 btrfs_dev_stats_value(eb, ptr, i)); 7002 else 7003 btrfs_dev_stat_reset(device, i); 7004 } 7005 7006 device->dev_stats_valid = 1; 7007 btrfs_dev_stat_print_on_load(device); 7008 btrfs_release_path(path); 7009 } 7010 mutex_unlock(&fs_devices->device_list_mutex); 7011 7012 out: 7013 btrfs_free_path(path); 7014 return ret < 0 ? 
ret : 0; 7015 } 7016 7017 static int update_dev_stat_item(struct btrfs_trans_handle *trans, 7018 struct btrfs_fs_info *fs_info, 7019 struct btrfs_device *device) 7020 { 7021 struct btrfs_root *dev_root = fs_info->dev_root; 7022 struct btrfs_path *path; 7023 struct btrfs_key key; 7024 struct extent_buffer *eb; 7025 struct btrfs_dev_stats_item *ptr; 7026 int ret; 7027 int i; 7028 7029 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7030 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7031 key.offset = device->devid; 7032 7033 path = btrfs_alloc_path(); 7034 if (!path) 7035 return -ENOMEM; 7036 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 7037 if (ret < 0) { 7038 btrfs_warn_in_rcu(fs_info, 7039 "error %d while searching for dev_stats item for device %s", 7040 ret, rcu_str_deref(device->name)); 7041 goto out; 7042 } 7043 7044 if (ret == 0 && 7045 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 7046 /* need to delete old one and insert a new one */ 7047 ret = btrfs_del_item(trans, dev_root, path); 7048 if (ret != 0) { 7049 btrfs_warn_in_rcu(fs_info, 7050 "delete too small dev_stats item for device %s failed %d", 7051 rcu_str_deref(device->name), ret); 7052 goto out; 7053 } 7054 ret = 1; 7055 } 7056 7057 if (ret == 1) { 7058 /* need to insert a new item */ 7059 btrfs_release_path(path); 7060 ret = btrfs_insert_empty_item(trans, dev_root, path, 7061 &key, sizeof(*ptr)); 7062 if (ret < 0) { 7063 btrfs_warn_in_rcu(fs_info, 7064 "insert dev_stats item for device %s failed %d", 7065 rcu_str_deref(device->name), ret); 7066 goto out; 7067 } 7068 } 7069 7070 eb = path->nodes[0]; 7071 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 7072 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7073 btrfs_set_dev_stats_value(eb, ptr, i, 7074 btrfs_dev_stat_read(device, i)); 7075 btrfs_mark_buffer_dirty(eb); 7076 7077 out: 7078 btrfs_free_path(path); 7079 return ret; 7080 } 7081 7082 /* 7083 * called from commit_transaction. Writes all changed device stats to disk. 
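*
* dev_stats_ccnt is sampled before the item is updated and only that many
* increments are subtracted on success, so any error recorded while the
* update was in flight keeps the device dirty for the next commit.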
7084 */ 7085 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 7086 struct btrfs_fs_info *fs_info) 7087 { 7088 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7089 struct btrfs_device *device; 7090 int stats_cnt; 7091 int ret = 0; 7092 7093 mutex_lock(&fs_devices->device_list_mutex); 7094 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7095 if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device)) 7096 continue; 7097 7098 stats_cnt = atomic_read(&device->dev_stats_ccnt); 7099 ret = update_dev_stat_item(trans, fs_info, device); 7100 if (!ret) 7101 atomic_sub(stats_cnt, &device->dev_stats_ccnt); 7102 } 7103 mutex_unlock(&fs_devices->device_list_mutex); 7104 7105 return ret; 7106 } 7107 7108 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) 7109 { 7110 btrfs_dev_stat_inc(dev, index); 7111 btrfs_dev_stat_print_on_error(dev); 7112 } 7113 7114 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) 7115 { 7116 if (!dev->dev_stats_valid) 7117 return; 7118 btrfs_err_rl_in_rcu(dev->fs_info, 7119 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7120 rcu_str_deref(dev->name), 7121 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7122 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7123 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7124 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7125 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7126 } 7127 7128 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 7129 { 7130 int i; 7131 7132 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7133 if (btrfs_dev_stat_read(dev, i) != 0) 7134 break; 7135 if (i == BTRFS_DEV_STAT_VALUES_MAX) 7136 return; /* all values == 0, suppress message */ 7137 7138 btrfs_info_in_rcu(dev->fs_info, 7139 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7140 rcu_str_deref(dev->name), 7141 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7142 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7143 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7144 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7145 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7146 } 7147 7148 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, 7149 struct btrfs_ioctl_get_dev_stats *stats) 7150 { 7151 struct btrfs_device *dev; 7152 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7153 int i; 7154 7155 mutex_lock(&fs_devices->device_list_mutex); 7156 dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL); 7157 mutex_unlock(&fs_devices->device_list_mutex); 7158 7159 if (!dev) { 7160 btrfs_warn(fs_info, "get dev_stats failed, device not found"); 7161 return -ENODEV; 7162 } else if (!dev->dev_stats_valid) { 7163 btrfs_warn(fs_info, "get dev_stats failed, not yet valid"); 7164 return -ENODEV; 7165 } else if (stats->flags & BTRFS_DEV_STATS_RESET) { 7166 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7167 if (stats->nr_items > i) 7168 stats->values[i] = 7169 btrfs_dev_stat_read_and_reset(dev, i); 7170 else 7171 btrfs_dev_stat_reset(dev, i); 7172 } 7173 } else { 7174 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7175 if (stats->nr_items > i) 7176 stats->values[i] = btrfs_dev_stat_read(dev, i); 7177 } 7178 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) 7179 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 7180 return 0; 7181 } 7182 7183 void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path) 7184 { 7185 struct buffer_head *bh; 7186 struct 
btrfs_super_block *disk_super; 7187 int copy_num; 7188 7189 if (!bdev) 7190 return; 7191 7192 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; 7193 copy_num++) { 7194 7195 if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) 7196 continue; 7197 7198 disk_super = (struct btrfs_super_block *)bh->b_data; 7199 7200 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 7201 set_buffer_dirty(bh); 7202 sync_dirty_buffer(bh); 7203 brelse(bh); 7204 } 7205 7206 /* Notify udev that device has changed */ 7207 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 7208 7209 /* Update ctime/mtime for device path for libblkid */ 7210 update_dev_time(device_path); 7211 } 7212 7213 /* 7214 * Update the size of all devices, which is used for writing out the 7215 * super blocks. 7216 */ 7217 void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) 7218 { 7219 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7220 struct btrfs_device *curr, *next; 7221 7222 if (list_empty(&fs_devices->resized_devices)) 7223 return; 7224 7225 mutex_lock(&fs_devices->device_list_mutex); 7226 mutex_lock(&fs_info->chunk_mutex); 7227 list_for_each_entry_safe(curr, next, &fs_devices->resized_devices, 7228 resized_list) { 7229 list_del_init(&curr->resized_list); 7230 curr->commit_total_bytes = curr->disk_total_bytes; 7231 } 7232 mutex_unlock(&fs_info->chunk_mutex); 7233 mutex_unlock(&fs_devices->device_list_mutex); 7234 } 7235 7236 /* Must be invoked during the transaction commit */ 7237 void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info, 7238 struct btrfs_transaction *transaction) 7239 { 7240 struct extent_map *em; 7241 struct map_lookup *map; 7242 struct btrfs_device *dev; 7243 int i; 7244 7245 if (list_empty(&transaction->pending_chunks)) 7246 return; 7247 7248 /* In order to kick the device replace finish process */ 7249 mutex_lock(&fs_info->chunk_mutex); 7250 list_for_each_entry(em, &transaction->pending_chunks, list) { 7251 map = em->map_lookup; 7252 7253 for (i = 0; i < map->num_stripes; i++) { 7254 dev = map->stripes[i].dev; 7255 dev->commit_bytes_used = dev->bytes_used; 7256 } 7257 } 7258 mutex_unlock(&fs_info->chunk_mutex); 7259 } 7260 7261 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) 7262 { 7263 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7264 while (fs_devices) { 7265 fs_devices->fs_info = fs_info; 7266 fs_devices = fs_devices->seed; 7267 } 7268 } 7269 7270 void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) 7271 { 7272 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7273 while (fs_devices) { 7274 fs_devices->fs_info = NULL; 7275 fs_devices = fs_devices->seed; 7276 } 7277 } 7278