10b86a832SChris Mason /* 20b86a832SChris Mason * Copyright (C) 2007 Oracle. All rights reserved. 30b86a832SChris Mason * 40b86a832SChris Mason * This program is free software; you can redistribute it and/or 50b86a832SChris Mason * modify it under the terms of the GNU General Public 60b86a832SChris Mason * License v2 as published by the Free Software Foundation. 70b86a832SChris Mason * 80b86a832SChris Mason * This program is distributed in the hope that it will be useful, 90b86a832SChris Mason * but WITHOUT ANY WARRANTY; without even the implied warranty of 100b86a832SChris Mason * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 110b86a832SChris Mason * General Public License for more details. 120b86a832SChris Mason * 130b86a832SChris Mason * You should have received a copy of the GNU General Public 140b86a832SChris Mason * License along with this program; if not, write to the 150b86a832SChris Mason * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 160b86a832SChris Mason * Boston, MA 021110-1307, USA. 170b86a832SChris Mason */ 180b86a832SChris Mason #include <linux/sched.h> 190b86a832SChris Mason #include <linux/bio.h> 205a0e3ad6STejun Heo #include <linux/slab.h> 218a4b83ccSChris Mason #include <linux/buffer_head.h> 22f2d8d74dSChris Mason #include <linux/blkdev.h> 23b765ead5SChris Mason #include <linux/iocontext.h> 246f88a440SBen Hutchings #include <linux/capability.h> 25442a4f63SStefan Behrens #include <linux/ratelimit.h> 2659641015SIlya Dryomov #include <linux/kthread.h> 2753b381b3SDavid Woodhouse #include <linux/raid/pq.h> 28803b2f54SStefan Behrens #include <linux/semaphore.h> 298da4b8c4SAndy Shevchenko #include <linux/uuid.h> 3053b381b3SDavid Woodhouse #include <asm/div64.h> 310b86a832SChris Mason #include "ctree.h" 320b86a832SChris Mason #include "extent_map.h" 330b86a832SChris Mason #include "disk-io.h" 340b86a832SChris Mason #include "transaction.h" 350b86a832SChris Mason #include "print-tree.h" 360b86a832SChris Mason #include "volumes.h" 3753b381b3SDavid Woodhouse #include "raid56.h" 388b712842SChris Mason #include "async-thread.h" 3921adbd5cSStefan Behrens #include "check-integrity.h" 40606686eeSJosef Bacik #include "rcu-string.h" 413fed40ccSMiao Xie #include "math.h" 428dabb742SStefan Behrens #include "dev-replace.h" 4399994cdeSAnand Jain #include "sysfs.h" 440b86a832SChris Mason 45af902047SZhao Lei const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 46af902047SZhao Lei [BTRFS_RAID_RAID10] = { 47af902047SZhao Lei .sub_stripes = 2, 48af902047SZhao Lei .dev_stripes = 1, 49af902047SZhao Lei .devs_max = 0, /* 0 == as many as possible */ 50af902047SZhao Lei .devs_min = 4, 518789f4feSZhao Lei .tolerated_failures = 1, 52af902047SZhao Lei .devs_increment = 2, 53af902047SZhao Lei .ncopies = 2, 54af902047SZhao Lei }, 55af902047SZhao Lei [BTRFS_RAID_RAID1] = { 56af902047SZhao Lei .sub_stripes = 1, 57af902047SZhao Lei .dev_stripes = 1, 58af902047SZhao Lei .devs_max = 2, 59af902047SZhao Lei .devs_min = 2, 608789f4feSZhao Lei .tolerated_failures = 1, 61af902047SZhao Lei .devs_increment = 2, 62af902047SZhao Lei .ncopies = 2, 63af902047SZhao Lei }, 64af902047SZhao Lei [BTRFS_RAID_DUP] = { 65af902047SZhao Lei .sub_stripes = 1, 66af902047SZhao Lei .dev_stripes = 2, 67af902047SZhao Lei .devs_max = 1, 68af902047SZhao Lei .devs_min = 1, 698789f4feSZhao Lei .tolerated_failures = 0, 70af902047SZhao Lei .devs_increment = 1, 71af902047SZhao Lei .ncopies = 2, 72af902047SZhao Lei }, 73af902047SZhao Lei [BTRFS_RAID_RAID0] = { 74af902047SZhao Lei .sub_stripes = 
1, 75af902047SZhao Lei .dev_stripes = 1, 76af902047SZhao Lei .devs_max = 0, 77af902047SZhao Lei .devs_min = 2, 788789f4feSZhao Lei .tolerated_failures = 0, 79af902047SZhao Lei .devs_increment = 1, 80af902047SZhao Lei .ncopies = 1, 81af902047SZhao Lei }, 82af902047SZhao Lei [BTRFS_RAID_SINGLE] = { 83af902047SZhao Lei .sub_stripes = 1, 84af902047SZhao Lei .dev_stripes = 1, 85af902047SZhao Lei .devs_max = 1, 86af902047SZhao Lei .devs_min = 1, 878789f4feSZhao Lei .tolerated_failures = 0, 88af902047SZhao Lei .devs_increment = 1, 89af902047SZhao Lei .ncopies = 1, 90af902047SZhao Lei }, 91af902047SZhao Lei [BTRFS_RAID_RAID5] = { 92af902047SZhao Lei .sub_stripes = 1, 93af902047SZhao Lei .dev_stripes = 1, 94af902047SZhao Lei .devs_max = 0, 95af902047SZhao Lei .devs_min = 2, 968789f4feSZhao Lei .tolerated_failures = 1, 97af902047SZhao Lei .devs_increment = 1, 98af902047SZhao Lei .ncopies = 2, 99af902047SZhao Lei }, 100af902047SZhao Lei [BTRFS_RAID_RAID6] = { 101af902047SZhao Lei .sub_stripes = 1, 102af902047SZhao Lei .dev_stripes = 1, 103af902047SZhao Lei .devs_max = 0, 104af902047SZhao Lei .devs_min = 3, 1058789f4feSZhao Lei .tolerated_failures = 2, 106af902047SZhao Lei .devs_increment = 1, 107af902047SZhao Lei .ncopies = 3, 108af902047SZhao Lei }, 109af902047SZhao Lei }; 110af902047SZhao Lei 111fb75d857SColin Ian King const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { 112af902047SZhao Lei [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, 113af902047SZhao Lei [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, 114af902047SZhao Lei [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, 115af902047SZhao Lei [BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0, 116af902047SZhao Lei [BTRFS_RAID_SINGLE] = 0, 117af902047SZhao Lei [BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5, 118af902047SZhao Lei [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6, 119af902047SZhao Lei }; 120af902047SZhao Lei 121621292baSDavid Sterba /* 122621292baSDavid Sterba * Table to convert BTRFS_RAID_* to the error code if minimum number of devices 123621292baSDavid Sterba * condition is not met. Zero means there's no corresponding 124621292baSDavid Sterba * BTRFS_ERROR_DEV_*_NOT_MET value. 
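 * For example, btrfs_raid_array above has devs_min == 2 for RAID1, so an
 * operation that would leave a RAID1 filesystem with fewer than two devices
 * maps to BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET below; SINGLE and DUP map to
 * zero because no dedicated error code exists for them.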
 */
const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	[BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	[BTRFS_RAID_DUP]    = 0,
	[BTRFS_RAID_RAID0]  = 0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	[BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
};

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks, but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, i.e. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * volume_mutex
 * ------------
 * coarse lock owned by a mounted filesystem; used to exclude some operations
 * that cannot run in parallel and affect the higher-level properties of the
 * filesystem like: device add/delete/resize/replace, or balance
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
 */

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:	if not NULL, copy the uuid to fs_devices::fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
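 *
 * Typical usage, as in device_list_add() below (shown here only as an
 * illustrative sketch):
 *
 *	fs_devices = alloc_fs_devices(disk_super->fsid);
 *	if (IS_ERR(fs_devices))
 *		return PTR_ERR(fs_devices);
 *	list_add(&fs_devices->list, &fs_uuids);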
2272dfeca9bSDavid Sterba */ 2282dfeca9bSDavid Sterba static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) 2292208a378SIlya Dryomov { 2302208a378SIlya Dryomov struct btrfs_fs_devices *fs_devs; 2312208a378SIlya Dryomov 23278f2c9e6SDavid Sterba fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); 2332208a378SIlya Dryomov if (!fs_devs) 2342208a378SIlya Dryomov return ERR_PTR(-ENOMEM); 2352208a378SIlya Dryomov 2362208a378SIlya Dryomov mutex_init(&fs_devs->device_list_mutex); 2372208a378SIlya Dryomov 2382208a378SIlya Dryomov INIT_LIST_HEAD(&fs_devs->devices); 239935e5cc9SMiao Xie INIT_LIST_HEAD(&fs_devs->resized_devices); 2402208a378SIlya Dryomov INIT_LIST_HEAD(&fs_devs->alloc_list); 2412208a378SIlya Dryomov INIT_LIST_HEAD(&fs_devs->list); 2422208a378SIlya Dryomov if (fsid) 2432208a378SIlya Dryomov memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); 2442208a378SIlya Dryomov 2452208a378SIlya Dryomov return fs_devs; 2462208a378SIlya Dryomov } 2472208a378SIlya Dryomov 24848dae9cfSDavid Sterba static void free_device(struct btrfs_device *device) 24948dae9cfSDavid Sterba { 25048dae9cfSDavid Sterba rcu_string_free(device->name); 25148dae9cfSDavid Sterba bio_put(device->flush_bio); 25248dae9cfSDavid Sterba kfree(device); 25348dae9cfSDavid Sterba } 25448dae9cfSDavid Sterba 255e4404d6eSYan Zheng static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 256e4404d6eSYan Zheng { 257e4404d6eSYan Zheng struct btrfs_device *device; 258e4404d6eSYan Zheng WARN_ON(fs_devices->opened); 259e4404d6eSYan Zheng while (!list_empty(&fs_devices->devices)) { 260e4404d6eSYan Zheng device = list_entry(fs_devices->devices.next, 261e4404d6eSYan Zheng struct btrfs_device, dev_list); 262e4404d6eSYan Zheng list_del(&device->dev_list); 26355de4803SDavid Sterba free_device(device); 264e4404d6eSYan Zheng } 265e4404d6eSYan Zheng kfree(fs_devices); 266e4404d6eSYan Zheng } 267e4404d6eSYan Zheng 268b8b8ff59SLukas Czerner static void btrfs_kobject_uevent(struct block_device *bdev, 269b8b8ff59SLukas Czerner enum kobject_action action) 270b8b8ff59SLukas Czerner { 271b8b8ff59SLukas Czerner int ret; 272b8b8ff59SLukas Czerner 273b8b8ff59SLukas Czerner ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); 274b8b8ff59SLukas Czerner if (ret) 275efe120a0SFrank Holton pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", 276b8b8ff59SLukas Czerner action, 277b8b8ff59SLukas Czerner kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), 278b8b8ff59SLukas Czerner &disk_to_dev(bdev->bd_disk)->kobj); 279b8b8ff59SLukas Czerner } 280b8b8ff59SLukas Czerner 281143bede5SJeff Mahoney void btrfs_cleanup_fs_uuids(void) 2828a4b83ccSChris Mason { 2838a4b83ccSChris Mason struct btrfs_fs_devices *fs_devices; 2848a4b83ccSChris Mason 2852b82032cSYan Zheng while (!list_empty(&fs_uuids)) { 2862b82032cSYan Zheng fs_devices = list_entry(fs_uuids.next, 2872b82032cSYan Zheng struct btrfs_fs_devices, list); 2882b82032cSYan Zheng list_del(&fs_devices->list); 289e4404d6eSYan Zheng free_fs_devices(fs_devices); 2908a4b83ccSChris Mason } 2918a4b83ccSChris Mason } 2928a4b83ccSChris Mason 29348dae9cfSDavid Sterba /* 29448dae9cfSDavid Sterba * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error. 29548dae9cfSDavid Sterba * Returned struct is not linked onto any lists and must be destroyed using 29648dae9cfSDavid Sterba * free_device. 
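 *
 * Note that the preallocated flush_bio is released in free_device(), so a
 * successfully allocated device must always be torn down via free_device()
 * (directly or through free_device_rcu()), never with a bare kfree().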
29748dae9cfSDavid Sterba */ 29812bd2fc0SIlya Dryomov static struct btrfs_device *__alloc_device(void) 29912bd2fc0SIlya Dryomov { 30012bd2fc0SIlya Dryomov struct btrfs_device *dev; 30112bd2fc0SIlya Dryomov 30278f2c9e6SDavid Sterba dev = kzalloc(sizeof(*dev), GFP_KERNEL); 30312bd2fc0SIlya Dryomov if (!dev) 30412bd2fc0SIlya Dryomov return ERR_PTR(-ENOMEM); 30512bd2fc0SIlya Dryomov 306e0ae9994SDavid Sterba /* 307e0ae9994SDavid Sterba * Preallocate a bio that's always going to be used for flushing device 308e0ae9994SDavid Sterba * barriers and matches the device lifespan 309e0ae9994SDavid Sterba */ 310e0ae9994SDavid Sterba dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL); 311e0ae9994SDavid Sterba if (!dev->flush_bio) { 312e0ae9994SDavid Sterba kfree(dev); 313e0ae9994SDavid Sterba return ERR_PTR(-ENOMEM); 314e0ae9994SDavid Sterba } 315e0ae9994SDavid Sterba 31612bd2fc0SIlya Dryomov INIT_LIST_HEAD(&dev->dev_list); 31712bd2fc0SIlya Dryomov INIT_LIST_HEAD(&dev->dev_alloc_list); 318935e5cc9SMiao Xie INIT_LIST_HEAD(&dev->resized_list); 31912bd2fc0SIlya Dryomov 32012bd2fc0SIlya Dryomov spin_lock_init(&dev->io_lock); 32112bd2fc0SIlya Dryomov 32212bd2fc0SIlya Dryomov spin_lock_init(&dev->reada_lock); 32312bd2fc0SIlya Dryomov atomic_set(&dev->reada_in_flight, 0); 324addc3fa7SMiao Xie atomic_set(&dev->dev_stats_ccnt, 0); 325546bed63SSebastian Andrzej Siewior btrfs_device_data_ordered_init(dev); 3269bcaaea7SChris Mason INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 327d0164adcSMel Gorman INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 32812bd2fc0SIlya Dryomov 32912bd2fc0SIlya Dryomov return dev; 33012bd2fc0SIlya Dryomov } 33112bd2fc0SIlya Dryomov 33235c70103SDavid Sterba /* 33335c70103SDavid Sterba * Find a device specified by @devid or @uuid in the list of @fs_devices, or 33435c70103SDavid Sterba * return NULL. 33535c70103SDavid Sterba * 33635c70103SDavid Sterba * If devid and uuid are both specified, the match must be exact, otherwise 33735c70103SDavid Sterba * only devid is used. 
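 *
 * For example, device_list_add() below passes both the devid and the
 * dev_item uuid read from an on-disk superblock, so only the exact device
 * that superblock describes can match.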
33835c70103SDavid Sterba */ 33935c70103SDavid Sterba static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices, 34035c70103SDavid Sterba u64 devid, const u8 *uuid) 3418a4b83ccSChris Mason { 34235c70103SDavid Sterba struct list_head *head = &fs_devices->devices; 3438a4b83ccSChris Mason struct btrfs_device *dev; 3448a4b83ccSChris Mason 345c6e30871SQinghuang Feng list_for_each_entry(dev, head, dev_list) { 346a443755fSChris Mason if (dev->devid == devid && 3478f18cf13SChris Mason (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 3488a4b83ccSChris Mason return dev; 3498a4b83ccSChris Mason } 350a443755fSChris Mason } 3518a4b83ccSChris Mason return NULL; 3528a4b83ccSChris Mason } 3538a4b83ccSChris Mason 354a1b32a59SChris Mason static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 3558a4b83ccSChris Mason { 3568a4b83ccSChris Mason struct btrfs_fs_devices *fs_devices; 3578a4b83ccSChris Mason 358c6e30871SQinghuang Feng list_for_each_entry(fs_devices, &fs_uuids, list) { 3598a4b83ccSChris Mason if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 3608a4b83ccSChris Mason return fs_devices; 3618a4b83ccSChris Mason } 3628a4b83ccSChris Mason return NULL; 3638a4b83ccSChris Mason } 3648a4b83ccSChris Mason 365beaf8ab3SStefan Behrens static int 366beaf8ab3SStefan Behrens btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, 367beaf8ab3SStefan Behrens int flush, struct block_device **bdev, 368beaf8ab3SStefan Behrens struct buffer_head **bh) 369beaf8ab3SStefan Behrens { 370beaf8ab3SStefan Behrens int ret; 371beaf8ab3SStefan Behrens 372beaf8ab3SStefan Behrens *bdev = blkdev_get_by_path(device_path, flags, holder); 373beaf8ab3SStefan Behrens 374beaf8ab3SStefan Behrens if (IS_ERR(*bdev)) { 375beaf8ab3SStefan Behrens ret = PTR_ERR(*bdev); 376beaf8ab3SStefan Behrens goto error; 377beaf8ab3SStefan Behrens } 378beaf8ab3SStefan Behrens 379beaf8ab3SStefan Behrens if (flush) 380beaf8ab3SStefan Behrens filemap_write_and_wait((*bdev)->bd_inode->i_mapping); 3819f6d2510SDavid Sterba ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); 382beaf8ab3SStefan Behrens if (ret) { 383beaf8ab3SStefan Behrens blkdev_put(*bdev, flags); 384beaf8ab3SStefan Behrens goto error; 385beaf8ab3SStefan Behrens } 386beaf8ab3SStefan Behrens invalidate_bdev(*bdev); 387beaf8ab3SStefan Behrens *bh = btrfs_read_dev_super(*bdev); 38892fc03fbSAnand Jain if (IS_ERR(*bh)) { 38992fc03fbSAnand Jain ret = PTR_ERR(*bh); 390beaf8ab3SStefan Behrens blkdev_put(*bdev, flags); 391beaf8ab3SStefan Behrens goto error; 392beaf8ab3SStefan Behrens } 393beaf8ab3SStefan Behrens 394beaf8ab3SStefan Behrens return 0; 395beaf8ab3SStefan Behrens 396beaf8ab3SStefan Behrens error: 397beaf8ab3SStefan Behrens *bdev = NULL; 398beaf8ab3SStefan Behrens *bh = NULL; 399beaf8ab3SStefan Behrens return ret; 400beaf8ab3SStefan Behrens } 401beaf8ab3SStefan Behrens 402ffbd517dSChris Mason static void requeue_list(struct btrfs_pending_bios *pending_bios, 403ffbd517dSChris Mason struct bio *head, struct bio *tail) 404ffbd517dSChris Mason { 405ffbd517dSChris Mason 406ffbd517dSChris Mason struct bio *old_head; 407ffbd517dSChris Mason 408ffbd517dSChris Mason old_head = pending_bios->head; 409ffbd517dSChris Mason pending_bios->head = head; 410ffbd517dSChris Mason if (pending_bios->tail) 411ffbd517dSChris Mason tail->bi_next = old_head; 412ffbd517dSChris Mason else 413ffbd517dSChris Mason pending_bios->tail = tail; 414ffbd517dSChris Mason } 415ffbd517dSChris Mason 4168b712842SChris Mason /* 4178b712842SChris Mason * we try to collect pending bios 
for a device so we don't get a large 4188b712842SChris Mason * number of procs sending bios down to the same device. This greatly 4198b712842SChris Mason * improves the schedulers ability to collect and merge the bios. 4208b712842SChris Mason * 4218b712842SChris Mason * But, it also turns into a long list of bios to process and that is sure 4228b712842SChris Mason * to eventually make the worker thread block. The solution here is to 4238b712842SChris Mason * make some progress and then put this work struct back at the end of 4248b712842SChris Mason * the list if the block device is congested. This way, multiple devices 4258b712842SChris Mason * can make progress from a single worker thread. 4268b712842SChris Mason */ 427143bede5SJeff Mahoney static noinline void run_scheduled_bios(struct btrfs_device *device) 4288b712842SChris Mason { 4290b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = device->fs_info; 4308b712842SChris Mason struct bio *pending; 4318b712842SChris Mason struct backing_dev_info *bdi; 432ffbd517dSChris Mason struct btrfs_pending_bios *pending_bios; 4338b712842SChris Mason struct bio *tail; 4348b712842SChris Mason struct bio *cur; 4358b712842SChris Mason int again = 0; 436ffbd517dSChris Mason unsigned long num_run; 437d644d8a1SChris Mason unsigned long batch_run = 0; 438b765ead5SChris Mason unsigned long last_waited = 0; 439d84275c9SChris Mason int force_reg = 0; 4400e588859SMiao Xie int sync_pending = 0; 441211588adSChris Mason struct blk_plug plug; 442211588adSChris Mason 443211588adSChris Mason /* 444211588adSChris Mason * this function runs all the bios we've collected for 445211588adSChris Mason * a particular device. We don't want to wander off to 446211588adSChris Mason * another device without first sending all of these down. 447211588adSChris Mason * So, setup a plug here and finish it off before we return 448211588adSChris Mason */ 449211588adSChris Mason blk_start_plug(&plug); 4508b712842SChris Mason 451efa7c9f9SJan Kara bdi = device->bdev->bd_bdi; 452b64a2851SChris Mason 4538b712842SChris Mason loop: 4548b712842SChris Mason spin_lock(&device->io_lock); 4558b712842SChris Mason 456a6837051SChris Mason loop_lock: 457d84275c9SChris Mason num_run = 0; 458ffbd517dSChris Mason 4598b712842SChris Mason /* take all the bios off the list at once and process them 4608b712842SChris Mason * later on (without the lock held). But, remember the 4618b712842SChris Mason * tail and other pointers so the bios can be properly reinserted 4628b712842SChris Mason * into the list if we hit congestion 4638b712842SChris Mason */ 464d84275c9SChris Mason if (!force_reg && device->pending_sync_bios.head) { 465ffbd517dSChris Mason pending_bios = &device->pending_sync_bios; 466d84275c9SChris Mason force_reg = 1; 467d84275c9SChris Mason } else { 468ffbd517dSChris Mason pending_bios = &device->pending_bios; 469d84275c9SChris Mason force_reg = 0; 470d84275c9SChris Mason } 471ffbd517dSChris Mason 472ffbd517dSChris Mason pending = pending_bios->head; 473ffbd517dSChris Mason tail = pending_bios->tail; 4748b712842SChris Mason WARN_ON(pending && !tail); 4758b712842SChris Mason 4768b712842SChris Mason /* 4778b712842SChris Mason * if pending was null this time around, no bios need processing 4788b712842SChris Mason * at all and we can stop. Otherwise it'll loop back up again 4798b712842SChris Mason * and do an additional check so no bios are missed. 4808b712842SChris Mason * 4818b712842SChris Mason * device->running_pending is used to synchronize with the 4828b712842SChris Mason * schedule_bio code. 
4838b712842SChris Mason */ 484ffbd517dSChris Mason if (device->pending_sync_bios.head == NULL && 485ffbd517dSChris Mason device->pending_bios.head == NULL) { 4868b712842SChris Mason again = 0; 4878b712842SChris Mason device->running_pending = 0; 488ffbd517dSChris Mason } else { 489ffbd517dSChris Mason again = 1; 490ffbd517dSChris Mason device->running_pending = 1; 4918b712842SChris Mason } 492ffbd517dSChris Mason 493ffbd517dSChris Mason pending_bios->head = NULL; 494ffbd517dSChris Mason pending_bios->tail = NULL; 495ffbd517dSChris Mason 4968b712842SChris Mason spin_unlock(&device->io_lock); 4978b712842SChris Mason 4988b712842SChris Mason while (pending) { 499ffbd517dSChris Mason 500ffbd517dSChris Mason rmb(); 501d84275c9SChris Mason /* we want to work on both lists, but do more bios on the 502d84275c9SChris Mason * sync list than the regular list 503d84275c9SChris Mason */ 504d84275c9SChris Mason if ((num_run > 32 && 505d84275c9SChris Mason pending_bios != &device->pending_sync_bios && 506d84275c9SChris Mason device->pending_sync_bios.head) || 507d84275c9SChris Mason (num_run > 64 && pending_bios == &device->pending_sync_bios && 508d84275c9SChris Mason device->pending_bios.head)) { 509ffbd517dSChris Mason spin_lock(&device->io_lock); 510ffbd517dSChris Mason requeue_list(pending_bios, pending, tail); 511ffbd517dSChris Mason goto loop_lock; 512ffbd517dSChris Mason } 513ffbd517dSChris Mason 5148b712842SChris Mason cur = pending; 5158b712842SChris Mason pending = pending->bi_next; 5168b712842SChris Mason cur->bi_next = NULL; 517b64a2851SChris Mason 518dac56212SJens Axboe BUG_ON(atomic_read(&cur->__bi_cnt) == 0); 519d644d8a1SChris Mason 5202ab1ba68SChris Mason /* 5212ab1ba68SChris Mason * if we're doing the sync list, record that our 5222ab1ba68SChris Mason * plug has some sync requests on it 5232ab1ba68SChris Mason * 5242ab1ba68SChris Mason * If we're doing the regular list and there are 5252ab1ba68SChris Mason * sync requests sitting around, unplug before 5262ab1ba68SChris Mason * we add more 5272ab1ba68SChris Mason */ 5282ab1ba68SChris Mason if (pending_bios == &device->pending_sync_bios) { 5292ab1ba68SChris Mason sync_pending = 1; 5302ab1ba68SChris Mason } else if (sync_pending) { 5312ab1ba68SChris Mason blk_finish_plug(&plug); 5322ab1ba68SChris Mason blk_start_plug(&plug); 5332ab1ba68SChris Mason sync_pending = 0; 5342ab1ba68SChris Mason } 5352ab1ba68SChris Mason 5364e49ea4aSMike Christie btrfsic_submit_bio(cur); 5375ff7ba3aSChris Mason num_run++; 5385ff7ba3aSChris Mason batch_run++; 539853d8ec4SDavid Sterba 540ffbd517dSChris Mason cond_resched(); 5418b712842SChris Mason 5428b712842SChris Mason /* 5438b712842SChris Mason * we made progress, there is more work to do and the bdi 5448b712842SChris Mason * is now congested. Back off and let other work structs 5458b712842SChris Mason * run instead 5468b712842SChris Mason */ 54757fd5a5fSChris Mason if (pending && bdi_write_congested(bdi) && batch_run > 8 && 5485f2cc086SChris Mason fs_info->fs_devices->open_devices > 1) { 549b765ead5SChris Mason struct io_context *ioc; 5508b712842SChris Mason 551b765ead5SChris Mason ioc = current->io_context; 552b765ead5SChris Mason 553b765ead5SChris Mason /* 554b765ead5SChris Mason * the main goal here is that we don't want to 555b765ead5SChris Mason * block if we're going to be able to submit 556b765ead5SChris Mason * more requests without blocking. 
557b765ead5SChris Mason * 558b765ead5SChris Mason * This code does two great things, it pokes into 559b765ead5SChris Mason * the elevator code from a filesystem _and_ 560b765ead5SChris Mason * it makes assumptions about how batching works. 561b765ead5SChris Mason */ 562b765ead5SChris Mason if (ioc && ioc->nr_batch_requests > 0 && 563b765ead5SChris Mason time_before(jiffies, ioc->last_waited + HZ/50UL) && 564b765ead5SChris Mason (last_waited == 0 || 565b765ead5SChris Mason ioc->last_waited == last_waited)) { 566b765ead5SChris Mason /* 567b765ead5SChris Mason * we want to go through our batch of 568b765ead5SChris Mason * requests and stop. So, we copy out 569b765ead5SChris Mason * the ioc->last_waited time and test 570b765ead5SChris Mason * against it before looping 571b765ead5SChris Mason */ 572b765ead5SChris Mason last_waited = ioc->last_waited; 573ffbd517dSChris Mason cond_resched(); 574b765ead5SChris Mason continue; 575b765ead5SChris Mason } 5768b712842SChris Mason spin_lock(&device->io_lock); 577ffbd517dSChris Mason requeue_list(pending_bios, pending, tail); 578a6837051SChris Mason device->running_pending = 1; 5798b712842SChris Mason 5808b712842SChris Mason spin_unlock(&device->io_lock); 581a8c93d4eSQu Wenruo btrfs_queue_work(fs_info->submit_workers, 582a8c93d4eSQu Wenruo &device->work); 5838b712842SChris Mason goto done; 5848b712842SChris Mason } 5858b712842SChris Mason } 586ffbd517dSChris Mason 58751684082SChris Mason cond_resched(); 58851684082SChris Mason if (again) 58951684082SChris Mason goto loop; 59051684082SChris Mason 59151684082SChris Mason spin_lock(&device->io_lock); 59251684082SChris Mason if (device->pending_bios.head || device->pending_sync_bios.head) 59351684082SChris Mason goto loop_lock; 59451684082SChris Mason spin_unlock(&device->io_lock); 59551684082SChris Mason 5968b712842SChris Mason done: 597211588adSChris Mason blk_finish_plug(&plug); 5988b712842SChris Mason } 5998b712842SChris Mason 600b2950863SChristoph Hellwig static void pending_bios_fn(struct btrfs_work *work) 6018b712842SChris Mason { 6028b712842SChris Mason struct btrfs_device *device; 6038b712842SChris Mason 6048b712842SChris Mason device = container_of(work, struct btrfs_device, work); 6058b712842SChris Mason run_scheduled_bios(device); 6068b712842SChris Mason } 6078b712842SChris Mason 6084fde46f0SAnand Jain 609c9162bdfSOmar Sandoval static void btrfs_free_stale_device(struct btrfs_device *cur_dev) 6104fde46f0SAnand Jain { 6114fde46f0SAnand Jain struct btrfs_fs_devices *fs_devs; 6124fde46f0SAnand Jain struct btrfs_device *dev; 6134fde46f0SAnand Jain 6144fde46f0SAnand Jain if (!cur_dev->name) 6154fde46f0SAnand Jain return; 6164fde46f0SAnand Jain 6174fde46f0SAnand Jain list_for_each_entry(fs_devs, &fs_uuids, list) { 6184fde46f0SAnand Jain int del = 1; 6194fde46f0SAnand Jain 6204fde46f0SAnand Jain if (fs_devs->opened) 6214fde46f0SAnand Jain continue; 6224fde46f0SAnand Jain if (fs_devs->seeding) 6234fde46f0SAnand Jain continue; 6244fde46f0SAnand Jain 6254fde46f0SAnand Jain list_for_each_entry(dev, &fs_devs->devices, dev_list) { 6264fde46f0SAnand Jain 6274fde46f0SAnand Jain if (dev == cur_dev) 6284fde46f0SAnand Jain continue; 6294fde46f0SAnand Jain if (!dev->name) 6304fde46f0SAnand Jain continue; 6314fde46f0SAnand Jain 6324fde46f0SAnand Jain /* 6334fde46f0SAnand Jain * Todo: This won't be enough. What if the same device 6344fde46f0SAnand Jain * comes back (with new uuid and) with its mapper path? 
6354fde46f0SAnand Jain * But for now, this does help as mostly an admin will 6364fde46f0SAnand Jain * either use mapper or non mapper path throughout. 6374fde46f0SAnand Jain */ 6384fde46f0SAnand Jain rcu_read_lock(); 6394fde46f0SAnand Jain del = strcmp(rcu_str_deref(dev->name), 6404fde46f0SAnand Jain rcu_str_deref(cur_dev->name)); 6414fde46f0SAnand Jain rcu_read_unlock(); 6424fde46f0SAnand Jain if (!del) 6434fde46f0SAnand Jain break; 6444fde46f0SAnand Jain } 6454fde46f0SAnand Jain 6464fde46f0SAnand Jain if (!del) { 6474fde46f0SAnand Jain /* delete the stale device */ 6484fde46f0SAnand Jain if (fs_devs->num_devices == 1) { 6494fde46f0SAnand Jain btrfs_sysfs_remove_fsid(fs_devs); 6504fde46f0SAnand Jain list_del(&fs_devs->list); 6514fde46f0SAnand Jain free_fs_devices(fs_devs); 6524fde46f0SAnand Jain } else { 6534fde46f0SAnand Jain fs_devs->num_devices--; 6544fde46f0SAnand Jain list_del(&dev->dev_list); 65555de4803SDavid Sterba free_device(dev); 6564fde46f0SAnand Jain } 6574fde46f0SAnand Jain break; 6584fde46f0SAnand Jain } 6594fde46f0SAnand Jain } 6604fde46f0SAnand Jain } 6614fde46f0SAnand Jain 6620fb08bccSAnand Jain static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, 6630fb08bccSAnand Jain struct btrfs_device *device, fmode_t flags, 6640fb08bccSAnand Jain void *holder) 6650fb08bccSAnand Jain { 6660fb08bccSAnand Jain struct request_queue *q; 6670fb08bccSAnand Jain struct block_device *bdev; 6680fb08bccSAnand Jain struct buffer_head *bh; 6690fb08bccSAnand Jain struct btrfs_super_block *disk_super; 6700fb08bccSAnand Jain u64 devid; 6710fb08bccSAnand Jain int ret; 6720fb08bccSAnand Jain 6730fb08bccSAnand Jain if (device->bdev) 6740fb08bccSAnand Jain return -EINVAL; 6750fb08bccSAnand Jain if (!device->name) 6760fb08bccSAnand Jain return -EINVAL; 6770fb08bccSAnand Jain 6780fb08bccSAnand Jain ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, 6790fb08bccSAnand Jain &bdev, &bh); 6800fb08bccSAnand Jain if (ret) 6810fb08bccSAnand Jain return ret; 6820fb08bccSAnand Jain 6830fb08bccSAnand Jain disk_super = (struct btrfs_super_block *)bh->b_data; 6840fb08bccSAnand Jain devid = btrfs_stack_device_id(&disk_super->dev_item); 6850fb08bccSAnand Jain if (devid != device->devid) 6860fb08bccSAnand Jain goto error_brelse; 6870fb08bccSAnand Jain 6880fb08bccSAnand Jain if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) 6890fb08bccSAnand Jain goto error_brelse; 6900fb08bccSAnand Jain 6910fb08bccSAnand Jain device->generation = btrfs_super_generation(disk_super); 6920fb08bccSAnand Jain 6930fb08bccSAnand Jain if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 694ebbede42SAnand Jain clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 6950fb08bccSAnand Jain fs_devices->seeding = 1; 6960fb08bccSAnand Jain } else { 697ebbede42SAnand Jain if (bdev_read_only(bdev)) 698ebbede42SAnand Jain clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 699ebbede42SAnand Jain else 700ebbede42SAnand Jain set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 7010fb08bccSAnand Jain } 7020fb08bccSAnand Jain 7030fb08bccSAnand Jain q = bdev_get_queue(bdev); 7040fb08bccSAnand Jain if (!blk_queue_nonrot(q)) 7050fb08bccSAnand Jain fs_devices->rotating = 1; 7060fb08bccSAnand Jain 7070fb08bccSAnand Jain device->bdev = bdev; 708e12c9621SAnand Jain clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 7090fb08bccSAnand Jain device->mode = flags; 7100fb08bccSAnand Jain 7110fb08bccSAnand Jain fs_devices->open_devices++; 712ebbede42SAnand Jain if 
(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 713ebbede42SAnand Jain device->devid != BTRFS_DEV_REPLACE_DEVID) { 7140fb08bccSAnand Jain fs_devices->rw_devices++; 7150fb08bccSAnand Jain list_add(&device->dev_alloc_list, &fs_devices->alloc_list); 7160fb08bccSAnand Jain } 7170fb08bccSAnand Jain brelse(bh); 7180fb08bccSAnand Jain 7190fb08bccSAnand Jain return 0; 7200fb08bccSAnand Jain 7210fb08bccSAnand Jain error_brelse: 7220fb08bccSAnand Jain brelse(bh); 7230fb08bccSAnand Jain blkdev_put(bdev, flags); 7240fb08bccSAnand Jain 7250fb08bccSAnand Jain return -EINVAL; 7260fb08bccSAnand Jain } 7270fb08bccSAnand Jain 72860999ca4SDavid Sterba /* 72960999ca4SDavid Sterba * Add new device to list of registered devices 73060999ca4SDavid Sterba * 73160999ca4SDavid Sterba * Returns: 73260999ca4SDavid Sterba * 1 - first time device is seen 73360999ca4SDavid Sterba * 0 - device already known 73460999ca4SDavid Sterba * < 0 - error 73560999ca4SDavid Sterba */ 736a1b32a59SChris Mason static noinline int device_list_add(const char *path, 7378a4b83ccSChris Mason struct btrfs_super_block *disk_super, 7388a4b83ccSChris Mason u64 devid, struct btrfs_fs_devices **fs_devices_ret) 7398a4b83ccSChris Mason { 7408a4b83ccSChris Mason struct btrfs_device *device; 7418a4b83ccSChris Mason struct btrfs_fs_devices *fs_devices; 742606686eeSJosef Bacik struct rcu_string *name; 74360999ca4SDavid Sterba int ret = 0; 7448a4b83ccSChris Mason u64 found_transid = btrfs_super_generation(disk_super); 7458a4b83ccSChris Mason 7468a4b83ccSChris Mason fs_devices = find_fsid(disk_super->fsid); 7478a4b83ccSChris Mason if (!fs_devices) { 7482208a378SIlya Dryomov fs_devices = alloc_fs_devices(disk_super->fsid); 7492208a378SIlya Dryomov if (IS_ERR(fs_devices)) 7502208a378SIlya Dryomov return PTR_ERR(fs_devices); 7512208a378SIlya Dryomov 7528a4b83ccSChris Mason list_add(&fs_devices->list, &fs_uuids); 7532208a378SIlya Dryomov 7548a4b83ccSChris Mason device = NULL; 7558a4b83ccSChris Mason } else { 75635c70103SDavid Sterba device = find_device(fs_devices, devid, 757a443755fSChris Mason disk_super->dev_item.uuid); 7588a4b83ccSChris Mason } 759443f24feSMiao Xie 7608a4b83ccSChris Mason if (!device) { 7612b82032cSYan Zheng if (fs_devices->opened) 7622b82032cSYan Zheng return -EBUSY; 7632b82032cSYan Zheng 76412bd2fc0SIlya Dryomov device = btrfs_alloc_device(NULL, &devid, 76512bd2fc0SIlya Dryomov disk_super->dev_item.uuid); 76612bd2fc0SIlya Dryomov if (IS_ERR(device)) { 7678a4b83ccSChris Mason /* we can safely leave the fs_devices entry around */ 76812bd2fc0SIlya Dryomov return PTR_ERR(device); 7698a4b83ccSChris Mason } 770606686eeSJosef Bacik 771606686eeSJosef Bacik name = rcu_string_strdup(path, GFP_NOFS); 772606686eeSJosef Bacik if (!name) { 77355de4803SDavid Sterba free_device(device); 7748a4b83ccSChris Mason return -ENOMEM; 7758a4b83ccSChris Mason } 776606686eeSJosef Bacik rcu_assign_pointer(device->name, name); 77790519d66SArne Jansen 778e5e9a520SChris Mason mutex_lock(&fs_devices->device_list_mutex); 7791f78160cSXiao Guangrong list_add_rcu(&device->dev_list, &fs_devices->devices); 780f7171750SFilipe David Borba Manana fs_devices->num_devices++; 781e5e9a520SChris Mason mutex_unlock(&fs_devices->device_list_mutex); 782e5e9a520SChris Mason 78360999ca4SDavid Sterba ret = 1; 7842b82032cSYan Zheng device->fs_devices = fs_devices; 785606686eeSJosef Bacik } else if (!device->name || strcmp(device->name->str, path)) { 786b96de000SAnand Jain /* 787b96de000SAnand Jain * When FS is already mounted. 788b96de000SAnand Jain * 1. 
If you are here and if the device->name is NULL that
		 *    means this device was missing at the time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path', that means either
		 *	a. The same device disappeared and reappeared with
		 *	   a different name, or
		 *	b. The missing-disk-which-was-replaced has
		 *	   reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further, in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transactions while it was away, and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow updates to btrfs_fs_device through the
		 * btrfs dev scan cli after the FS has been mounted. We're
		 * still tracking a problem where systems fail to mount by
		 * subvolume id when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is, if the FS is _not_ mounted and you are
			 * here, there is more than one disk with the same
			 * uuid and devid. We keep the one with the larger
			 * generation number, or the last-in if the
			 * generations are equal.
			 */
			return -EEXIST;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name)
			return -ENOMEM;
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with the largest generation
	 * (as above).
	 */
	if (!fs_devices->opened)
		device->generation = found_transid;

	/*
	 * If there is a new btrfs on an already registered device,
	 * then remove the stale device entry.
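	 * The (ret > 0) check below relies on the return convention
	 * documented in the comment above this function: 1 means the device
	 * was seen for the first time.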
8464fde46f0SAnand Jain */ 84702feae3cSAnand Jain if (ret > 0) 8484fde46f0SAnand Jain btrfs_free_stale_device(device); 8494fde46f0SAnand Jain 8508a4b83ccSChris Mason *fs_devices_ret = fs_devices; 85160999ca4SDavid Sterba 85260999ca4SDavid Sterba return ret; 8538a4b83ccSChris Mason } 8548a4b83ccSChris Mason 855e4404d6eSYan Zheng static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 856e4404d6eSYan Zheng { 857e4404d6eSYan Zheng struct btrfs_fs_devices *fs_devices; 858e4404d6eSYan Zheng struct btrfs_device *device; 859e4404d6eSYan Zheng struct btrfs_device *orig_dev; 860e4404d6eSYan Zheng 8612208a378SIlya Dryomov fs_devices = alloc_fs_devices(orig->fsid); 8622208a378SIlya Dryomov if (IS_ERR(fs_devices)) 8632208a378SIlya Dryomov return fs_devices; 864e4404d6eSYan Zheng 865adbbb863SMiao Xie mutex_lock(&orig->device_list_mutex); 86602db0844SJosef Bacik fs_devices->total_devices = orig->total_devices; 867e4404d6eSYan Zheng 86846224705SXiao Guangrong /* We have held the volume lock, it is safe to get the devices. */ 869e4404d6eSYan Zheng list_for_each_entry(orig_dev, &orig->devices, dev_list) { 870606686eeSJosef Bacik struct rcu_string *name; 871606686eeSJosef Bacik 87212bd2fc0SIlya Dryomov device = btrfs_alloc_device(NULL, &orig_dev->devid, 87312bd2fc0SIlya Dryomov orig_dev->uuid); 87412bd2fc0SIlya Dryomov if (IS_ERR(device)) 875e4404d6eSYan Zheng goto error; 876e4404d6eSYan Zheng 877606686eeSJosef Bacik /* 878606686eeSJosef Bacik * This is ok to do without rcu read locked because we hold the 879606686eeSJosef Bacik * uuid mutex so nothing we touch in here is going to disappear. 880606686eeSJosef Bacik */ 881e755f780SAnand Jain if (orig_dev->name) { 88278f2c9e6SDavid Sterba name = rcu_string_strdup(orig_dev->name->str, 88378f2c9e6SDavid Sterba GFP_KERNEL); 884606686eeSJosef Bacik if (!name) { 88555de4803SDavid Sterba free_device(device); 886e4404d6eSYan Zheng goto error; 887fd2696f3SJulia Lawall } 888606686eeSJosef Bacik rcu_assign_pointer(device->name, name); 889e755f780SAnand Jain } 890e4404d6eSYan Zheng 891e4404d6eSYan Zheng list_add(&device->dev_list, &fs_devices->devices); 892e4404d6eSYan Zheng device->fs_devices = fs_devices; 893e4404d6eSYan Zheng fs_devices->num_devices++; 894e4404d6eSYan Zheng } 895adbbb863SMiao Xie mutex_unlock(&orig->device_list_mutex); 896e4404d6eSYan Zheng return fs_devices; 897e4404d6eSYan Zheng error: 898adbbb863SMiao Xie mutex_unlock(&orig->device_list_mutex); 899e4404d6eSYan Zheng free_fs_devices(fs_devices); 900e4404d6eSYan Zheng return ERR_PTR(-ENOMEM); 901e4404d6eSYan Zheng } 902e4404d6eSYan Zheng 9039eaed21eSEric Sandeen void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step) 904dfe25020SChris Mason { 905c6e30871SQinghuang Feng struct btrfs_device *device, *next; 906443f24feSMiao Xie struct btrfs_device *latest_dev = NULL; 907a6b0d5c8SChris Mason 908dfe25020SChris Mason mutex_lock(&uuid_mutex); 909dfe25020SChris Mason again: 91046224705SXiao Guangrong /* This is the initialized path, it is safe to release the devices. 
*/ 911c6e30871SQinghuang Feng list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 912e12c9621SAnand Jain if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 913e12c9621SAnand Jain &device->dev_state)) { 914401e29c1SAnand Jain if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 915401e29c1SAnand Jain &device->dev_state) && 916443f24feSMiao Xie (!latest_dev || 917443f24feSMiao Xie device->generation > latest_dev->generation)) { 918443f24feSMiao Xie latest_dev = device; 919a6b0d5c8SChris Mason } 9202b82032cSYan Zheng continue; 921a6b0d5c8SChris Mason } 9222b82032cSYan Zheng 9238dabb742SStefan Behrens if (device->devid == BTRFS_DEV_REPLACE_DEVID) { 9248dabb742SStefan Behrens /* 9258dabb742SStefan Behrens * In the first step, keep the device which has 9268dabb742SStefan Behrens * the correct fsid and the devid that is used 9278dabb742SStefan Behrens * for the dev_replace procedure. 9288dabb742SStefan Behrens * In the second step, the dev_replace state is 9298dabb742SStefan Behrens * read from the device tree and it is known 9308dabb742SStefan Behrens * whether the procedure is really active or 9318dabb742SStefan Behrens * not, which means whether this device is 9328dabb742SStefan Behrens * used or whether it should be removed. 9338dabb742SStefan Behrens */ 934401e29c1SAnand Jain if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 935401e29c1SAnand Jain &device->dev_state)) { 9368dabb742SStefan Behrens continue; 9378dabb742SStefan Behrens } 9388dabb742SStefan Behrens } 939a74a4b97SChris Mason if (device->bdev) { 940d4d77629STejun Heo blkdev_put(device->bdev, device->mode); 9412b82032cSYan Zheng device->bdev = NULL; 942a74a4b97SChris Mason fs_devices->open_devices--; 943a74a4b97SChris Mason } 944ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 9452b82032cSYan Zheng list_del_init(&device->dev_alloc_list); 946ebbede42SAnand Jain clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 947401e29c1SAnand Jain if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 948401e29c1SAnand Jain &device->dev_state)) 9492b82032cSYan Zheng fs_devices->rw_devices--; 9502b82032cSYan Zheng } 9512b82032cSYan Zheng list_del_init(&device->dev_list); 9522b82032cSYan Zheng fs_devices->num_devices--; 95355de4803SDavid Sterba free_device(device); 9542b82032cSYan Zheng } 9552b82032cSYan Zheng 9562b82032cSYan Zheng if (fs_devices->seed) { 9572b82032cSYan Zheng fs_devices = fs_devices->seed; 958dfe25020SChris Mason goto again; 959dfe25020SChris Mason } 9602b82032cSYan Zheng 961443f24feSMiao Xie fs_devices->latest_bdev = latest_dev->bdev; 962a6b0d5c8SChris Mason 963dfe25020SChris Mason mutex_unlock(&uuid_mutex); 964dfe25020SChris Mason } 965a0af469bSChris Mason 966f06c5965SDavid Sterba static void free_device_rcu(struct rcu_head *head) 9671f78160cSXiao Guangrong { 9681f78160cSXiao Guangrong struct btrfs_device *device; 9691f78160cSXiao Guangrong 9701f78160cSXiao Guangrong device = container_of(head, struct btrfs_device, rcu); 97155de4803SDavid Sterba free_device(device); 9721f78160cSXiao Guangrong } 9731f78160cSXiao Guangrong 97414238819SAnand Jain static void btrfs_close_bdev(struct btrfs_device *device) 97514238819SAnand Jain { 97608ffcae8SDavid Sterba if (!device->bdev) 97708ffcae8SDavid Sterba return; 97808ffcae8SDavid Sterba 979ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 98014238819SAnand Jain sync_blockdev(device->bdev); 98114238819SAnand Jain invalidate_bdev(device->bdev); 98214238819SAnand Jain } 98314238819SAnand Jain 98414238819SAnand Jain 
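	/* Drop the bdev reference taken by blkdev_get_by_path() at open time. */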
	blkdev_put(device->bdev, device->mode);
}

static void btrfs_prepare_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;
	struct btrfs_device *new_device;
	struct rcu_string *name;

	if (device->bdev)
		fs_devices->open_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		fs_devices->missing_devices--;

	new_device = btrfs_alloc_device(NULL, &device->devid,
					device->uuid);
	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

	/* Safe because we are under uuid_mutex */
	if (device->name) {
		name = rcu_string_strdup(device->name->str, GFP_NOFS);
		BUG_ON(!name); /* -ENOMEM */
		rcu_assign_pointer(new_device->name, name);
	}

	list_replace_rcu(&device->dev_list, &new_device->dev_list);
	new_device->fs_devices = device->fs_devices;
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;
	struct list_head pending_put;

	INIT_LIST_HEAD(&pending_put);

	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
		btrfs_prepare_close_one_device(device);
		list_add(&device->dev_list, &pending_put);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * btrfs_show_devname() takes the device_list_mutex, and a call to
	 * blkdev_put() can sometimes lead to the VFS calling back into this
	 * function. So, for now, do the put outside of device_list_mutex.
10420ccd0528SAnand Jain */ 10430ccd0528SAnand Jain while (!list_empty(&pending_put)) { 10440ccd0528SAnand Jain device = list_first_entry(&pending_put, 10450ccd0528SAnand Jain struct btrfs_device, dev_list); 10460ccd0528SAnand Jain list_del(&device->dev_list); 10470ccd0528SAnand Jain btrfs_close_bdev(device); 1048f06c5965SDavid Sterba call_rcu(&device->rcu, free_device_rcu); 10490ccd0528SAnand Jain } 10500ccd0528SAnand Jain 1051e4404d6eSYan Zheng WARN_ON(fs_devices->open_devices); 1052e4404d6eSYan Zheng WARN_ON(fs_devices->rw_devices); 10532b82032cSYan Zheng fs_devices->opened = 0; 10542b82032cSYan Zheng fs_devices->seeding = 0; 10552b82032cSYan Zheng 10568a4b83ccSChris Mason return 0; 10578a4b83ccSChris Mason } 10588a4b83ccSChris Mason 10592b82032cSYan Zheng int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 10602b82032cSYan Zheng { 1061e4404d6eSYan Zheng struct btrfs_fs_devices *seed_devices = NULL; 10622b82032cSYan Zheng int ret; 10632b82032cSYan Zheng 10642b82032cSYan Zheng mutex_lock(&uuid_mutex); 10652b82032cSYan Zheng ret = __btrfs_close_devices(fs_devices); 1066e4404d6eSYan Zheng if (!fs_devices->opened) { 1067e4404d6eSYan Zheng seed_devices = fs_devices->seed; 1068e4404d6eSYan Zheng fs_devices->seed = NULL; 1069e4404d6eSYan Zheng } 10702b82032cSYan Zheng mutex_unlock(&uuid_mutex); 1071e4404d6eSYan Zheng 1072e4404d6eSYan Zheng while (seed_devices) { 1073e4404d6eSYan Zheng fs_devices = seed_devices; 1074e4404d6eSYan Zheng seed_devices = fs_devices->seed; 1075e4404d6eSYan Zheng __btrfs_close_devices(fs_devices); 1076e4404d6eSYan Zheng free_fs_devices(fs_devices); 1077e4404d6eSYan Zheng } 10782b82032cSYan Zheng return ret; 10792b82032cSYan Zheng } 10802b82032cSYan Zheng 1081e4404d6eSYan Zheng static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 108297288f2cSChristoph Hellwig fmode_t flags, void *holder) 10838a4b83ccSChris Mason { 10848a4b83ccSChris Mason struct list_head *head = &fs_devices->devices; 10858a4b83ccSChris Mason struct btrfs_device *device; 1086443f24feSMiao Xie struct btrfs_device *latest_dev = NULL; 1087a0af469bSChris Mason int ret = 0; 10888a4b83ccSChris Mason 1089d4d77629STejun Heo flags |= FMODE_EXCL; 1090d4d77629STejun Heo 1091c6e30871SQinghuang Feng list_for_each_entry(device, head, dev_list) { 1092f63e0ccaSEric Sandeen /* Just open everything we can; ignore failures here */ 10930fb08bccSAnand Jain if (btrfs_open_one_device(fs_devices, device, flags, holder)) 1094beaf8ab3SStefan Behrens continue; 1095a0af469bSChris Mason 10969f050db4SAnand Jain if (!latest_dev || 10979f050db4SAnand Jain device->generation > latest_dev->generation) 10989f050db4SAnand Jain latest_dev = device; 10998a4b83ccSChris Mason } 1100a0af469bSChris Mason if (fs_devices->open_devices == 0) { 110120bcd649SIlya Dryomov ret = -EINVAL; 1102a0af469bSChris Mason goto out; 1103a0af469bSChris Mason } 11042b82032cSYan Zheng fs_devices->opened = 1; 1105443f24feSMiao Xie fs_devices->latest_bdev = latest_dev->bdev; 11062b82032cSYan Zheng fs_devices->total_rw_bytes = 0; 1107a0af469bSChris Mason out: 11082b82032cSYan Zheng return ret; 11092b82032cSYan Zheng } 11102b82032cSYan Zheng 11112b82032cSYan Zheng int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 111297288f2cSChristoph Hellwig fmode_t flags, void *holder) 11132b82032cSYan Zheng { 11142b82032cSYan Zheng int ret; 11152b82032cSYan Zheng 11162b82032cSYan Zheng mutex_lock(&uuid_mutex); 11172b82032cSYan Zheng if (fs_devices->opened) { 11182b82032cSYan Zheng fs_devices->opened++; 11192b82032cSYan Zheng ret = 0; 
11202b82032cSYan Zheng } else { 112115916de8SChris Mason ret = __btrfs_open_devices(fs_devices, flags, holder); 11222b82032cSYan Zheng } 11238a4b83ccSChris Mason mutex_unlock(&uuid_mutex); 11248a4b83ccSChris Mason return ret; 11258a4b83ccSChris Mason } 11268a4b83ccSChris Mason 1127c9162bdfSOmar Sandoval static void btrfs_release_disk_super(struct page *page) 11286cf86a00SAnand Jain { 11296cf86a00SAnand Jain kunmap(page); 11306cf86a00SAnand Jain put_page(page); 11316cf86a00SAnand Jain } 11326cf86a00SAnand Jain 1133c9162bdfSOmar Sandoval static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, 1134c9162bdfSOmar Sandoval struct page **page, 1135c9162bdfSOmar Sandoval struct btrfs_super_block **disk_super) 11366cf86a00SAnand Jain { 11376cf86a00SAnand Jain void *p; 11386cf86a00SAnand Jain pgoff_t index; 11396cf86a00SAnand Jain 11406cf86a00SAnand Jain /* make sure our super fits in the device */ 11416cf86a00SAnand Jain if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) 11426cf86a00SAnand Jain return 1; 11436cf86a00SAnand Jain 11446cf86a00SAnand Jain /* make sure our super fits in the page */ 11456cf86a00SAnand Jain if (sizeof(**disk_super) > PAGE_SIZE) 11466cf86a00SAnand Jain return 1; 11476cf86a00SAnand Jain 11486cf86a00SAnand Jain /* make sure our super doesn't straddle pages on disk */ 11496cf86a00SAnand Jain index = bytenr >> PAGE_SHIFT; 11506cf86a00SAnand Jain if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index) 11516cf86a00SAnand Jain return 1; 11526cf86a00SAnand Jain 11536cf86a00SAnand Jain /* pull in the page with our super */ 11546cf86a00SAnand Jain *page = read_cache_page_gfp(bdev->bd_inode->i_mapping, 11556cf86a00SAnand Jain index, GFP_KERNEL); 11566cf86a00SAnand Jain 11576cf86a00SAnand Jain if (IS_ERR_OR_NULL(*page)) 11586cf86a00SAnand Jain return 1; 11596cf86a00SAnand Jain 11606cf86a00SAnand Jain p = kmap(*page); 11616cf86a00SAnand Jain 11626cf86a00SAnand Jain /* align our pointer to the offset of the super block */ 11636cf86a00SAnand Jain *disk_super = p + (bytenr & ~PAGE_MASK); 11646cf86a00SAnand Jain 11656cf86a00SAnand Jain if (btrfs_super_bytenr(*disk_super) != bytenr || 11666cf86a00SAnand Jain btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { 11676cf86a00SAnand Jain btrfs_release_disk_super(*page); 11686cf86a00SAnand Jain return 1; 11696cf86a00SAnand Jain } 11706cf86a00SAnand Jain 11716cf86a00SAnand Jain if ((*disk_super)->label[0] && 11726cf86a00SAnand Jain (*disk_super)->label[BTRFS_LABEL_SIZE - 1]) 11736cf86a00SAnand Jain (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0'; 11746cf86a00SAnand Jain 11756cf86a00SAnand Jain return 0; 11766cf86a00SAnand Jain } 11776cf86a00SAnand Jain 11786f60cbd3SDavid Sterba /* 11796f60cbd3SDavid Sterba * Look for a btrfs signature on a device. This may be called out of the mount path 11806f60cbd3SDavid Sterba * and we are not allowed to call set_blocksize during the scan. 
The superblock 11816f60cbd3SDavid Sterba * is read via pagecache 11826f60cbd3SDavid Sterba */ 118397288f2cSChristoph Hellwig int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 11848a4b83ccSChris Mason struct btrfs_fs_devices **fs_devices_ret) 11858a4b83ccSChris Mason { 11868a4b83ccSChris Mason struct btrfs_super_block *disk_super; 11878a4b83ccSChris Mason struct block_device *bdev; 11886f60cbd3SDavid Sterba struct page *page; 118905a5c55dSAnand Jain int ret; 11908a4b83ccSChris Mason u64 devid; 1191f2984462SChris Mason u64 transid; 119202db0844SJosef Bacik u64 total_devices; 11936f60cbd3SDavid Sterba u64 bytenr; 11948a4b83ccSChris Mason 11956f60cbd3SDavid Sterba /* 11966f60cbd3SDavid Sterba * we would like to check all the supers, but that would make 11976f60cbd3SDavid Sterba * a btrfs mount succeed after a mkfs from a different FS. 11986f60cbd3SDavid Sterba * So, we need to add a special mount option to scan for 11996f60cbd3SDavid Sterba * later supers, using BTRFS_SUPER_MIRROR_MAX instead 12006f60cbd3SDavid Sterba */ 12016f60cbd3SDavid Sterba bytenr = btrfs_sb_offset(0); 1202d4d77629STejun Heo flags |= FMODE_EXCL; 120310f6327bSAl Viro mutex_lock(&uuid_mutex); 12046f60cbd3SDavid Sterba 12056f60cbd3SDavid Sterba bdev = blkdev_get_by_path(path, flags, holder); 12066f60cbd3SDavid Sterba if (IS_ERR(bdev)) { 12076f60cbd3SDavid Sterba ret = PTR_ERR(bdev); 1208beaf8ab3SStefan Behrens goto error; 12096f60cbd3SDavid Sterba } 12106f60cbd3SDavid Sterba 121105a5c55dSAnand Jain if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) { 121205a5c55dSAnand Jain ret = -EINVAL; 12136f60cbd3SDavid Sterba goto error_bdev_put; 121405a5c55dSAnand Jain } 12156f60cbd3SDavid Sterba 1216a343832fSXiao Guangrong devid = btrfs_stack_device_id(&disk_super->dev_item); 1217f2984462SChris Mason transid = btrfs_super_generation(disk_super); 121802db0844SJosef Bacik total_devices = btrfs_super_num_devices(disk_super); 12196f60cbd3SDavid Sterba 122060999ca4SDavid Sterba ret = device_list_add(path, disk_super, devid, fs_devices_ret); 122160999ca4SDavid Sterba if (ret > 0) { 122205a5c55dSAnand Jain if (disk_super->label[0]) 122362e85577SJeff Mahoney pr_info("BTRFS: device label %s ", disk_super->label); 122405a5c55dSAnand Jain else 122562e85577SJeff Mahoney pr_info("BTRFS: device fsid %pU ", disk_super->fsid); 12266f60cbd3SDavid Sterba 122762e85577SJeff Mahoney pr_cont("devid %llu transid %llu %s\n", devid, transid, path); 122860999ca4SDavid Sterba ret = 0; 122960999ca4SDavid Sterba } 123002db0844SJosef Bacik if (!ret && fs_devices_ret) 123102db0844SJosef Bacik (*fs_devices_ret)->total_devices = total_devices; 12326f60cbd3SDavid Sterba 12336cf86a00SAnand Jain btrfs_release_disk_super(page); 12346f60cbd3SDavid Sterba 12356f60cbd3SDavid Sterba error_bdev_put: 1236d4d77629STejun Heo blkdev_put(bdev, flags); 12378a4b83ccSChris Mason error: 1238beaf8ab3SStefan Behrens mutex_unlock(&uuid_mutex); 12398a4b83ccSChris Mason return ret; 12408a4b83ccSChris Mason } 12410b86a832SChris Mason 12426d07bcecSMiao Xie /* helper to account the used device space in the range */ 12436d07bcecSMiao Xie int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 12446d07bcecSMiao Xie u64 end, u64 *length) 12450b86a832SChris Mason { 12460b86a832SChris Mason struct btrfs_key key; 1247fb456252SJeff Mahoney struct btrfs_root *root = device->fs_info->dev_root; 12486d07bcecSMiao Xie struct btrfs_dev_extent *dev_extent; 12492b82032cSYan Zheng struct btrfs_path *path; 12506d07bcecSMiao Xie u64 extent_end; 
12510b86a832SChris Mason int ret; 12526d07bcecSMiao Xie int slot; 12530b86a832SChris Mason struct extent_buffer *l; 12540b86a832SChris Mason 12556d07bcecSMiao Xie *length = 0; 12566d07bcecSMiao Xie 1257401e29c1SAnand Jain if (start >= device->total_bytes || 1258401e29c1SAnand Jain test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 12596d07bcecSMiao Xie return 0; 12606d07bcecSMiao Xie 12612b82032cSYan Zheng path = btrfs_alloc_path(); 12622b82032cSYan Zheng if (!path) 12632b82032cSYan Zheng return -ENOMEM; 1264e4058b54SDavid Sterba path->reada = READA_FORWARD; 12658f18cf13SChris Mason 12660b86a832SChris Mason key.objectid = device->devid; 12676d07bcecSMiao Xie key.offset = start; 12680b86a832SChris Mason key.type = BTRFS_DEV_EXTENT_KEY; 12696d07bcecSMiao Xie 12706d07bcecSMiao Xie ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 12710b86a832SChris Mason if (ret < 0) 12726d07bcecSMiao Xie goto out; 12731fcbac58SYan Zheng if (ret > 0) { 12741fcbac58SYan Zheng ret = btrfs_previous_item(root, path, key.objectid, key.type); 12750b86a832SChris Mason if (ret < 0) 12766d07bcecSMiao Xie goto out; 12771fcbac58SYan Zheng } 12786d07bcecSMiao Xie 12790b86a832SChris Mason while (1) { 12800b86a832SChris Mason l = path->nodes[0]; 12810b86a832SChris Mason slot = path->slots[0]; 12820b86a832SChris Mason if (slot >= btrfs_header_nritems(l)) { 12830b86a832SChris Mason ret = btrfs_next_leaf(root, path); 12840b86a832SChris Mason if (ret == 0) 12850b86a832SChris Mason continue; 12860b86a832SChris Mason if (ret < 0) 12876d07bcecSMiao Xie goto out; 12886d07bcecSMiao Xie 12896d07bcecSMiao Xie break; 12900b86a832SChris Mason } 12910b86a832SChris Mason btrfs_item_key_to_cpu(l, &key, slot); 12920b86a832SChris Mason 12930b86a832SChris Mason if (key.objectid < device->devid) 12940b86a832SChris Mason goto next; 12950b86a832SChris Mason 12960b86a832SChris Mason if (key.objectid > device->devid) 12976d07bcecSMiao Xie break; 12980b86a832SChris Mason 1299962a298fSDavid Sterba if (key.type != BTRFS_DEV_EXTENT_KEY) 13000b86a832SChris Mason goto next; 13010b86a832SChris Mason 13020b86a832SChris Mason dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 13036d07bcecSMiao Xie extent_end = key.offset + btrfs_dev_extent_length(l, 13046d07bcecSMiao Xie dev_extent); 13056d07bcecSMiao Xie if (key.offset <= start && extent_end > end) { 13066d07bcecSMiao Xie *length = end - start + 1; 13076d07bcecSMiao Xie break; 13086d07bcecSMiao Xie } else if (key.offset <= start && extent_end > start) 13096d07bcecSMiao Xie *length += extent_end - start; 13106d07bcecSMiao Xie else if (key.offset > start && extent_end <= end) 13116d07bcecSMiao Xie *length += extent_end - key.offset; 13126d07bcecSMiao Xie else if (key.offset > start && key.offset <= end) { 13136d07bcecSMiao Xie *length += end - key.offset + 1; 13146d07bcecSMiao Xie break; 13156d07bcecSMiao Xie } else if (key.offset > end) 13166d07bcecSMiao Xie break; 13176d07bcecSMiao Xie 13186d07bcecSMiao Xie next: 13196d07bcecSMiao Xie path->slots[0]++; 13206d07bcecSMiao Xie } 13216d07bcecSMiao Xie ret = 0; 13226d07bcecSMiao Xie out: 13236d07bcecSMiao Xie btrfs_free_path(path); 13246d07bcecSMiao Xie return ret; 13256d07bcecSMiao Xie } 13266d07bcecSMiao Xie 1327499f377fSJeff Mahoney static int contains_pending_extent(struct btrfs_transaction *transaction, 13286df9a95eSJosef Bacik struct btrfs_device *device, 13296df9a95eSJosef Bacik u64 *start, u64 len) 13306df9a95eSJosef Bacik { 1331fb456252SJeff Mahoney struct btrfs_fs_info *fs_info = device->fs_info; 13326df9a95eSJosef Bacik 
struct extent_map *em; 1333499f377fSJeff Mahoney struct list_head *search_list = &fs_info->pinned_chunks; 13346df9a95eSJosef Bacik int ret = 0; 13351b984508SForrest Liu u64 physical_start = *start; 13366df9a95eSJosef Bacik 1337499f377fSJeff Mahoney if (transaction) 1338499f377fSJeff Mahoney search_list = &transaction->pending_chunks; 133904216820SFilipe Manana again: 134004216820SFilipe Manana list_for_each_entry(em, search_list, list) { 13416df9a95eSJosef Bacik struct map_lookup *map; 13426df9a95eSJosef Bacik int i; 13436df9a95eSJosef Bacik 134495617d69SJeff Mahoney map = em->map_lookup; 13456df9a95eSJosef Bacik for (i = 0; i < map->num_stripes; i++) { 1346c152b63eSFilipe Manana u64 end; 1347c152b63eSFilipe Manana 13486df9a95eSJosef Bacik if (map->stripes[i].dev != device) 13496df9a95eSJosef Bacik continue; 13501b984508SForrest Liu if (map->stripes[i].physical >= physical_start + len || 13516df9a95eSJosef Bacik map->stripes[i].physical + em->orig_block_len <= 13521b984508SForrest Liu physical_start) 13536df9a95eSJosef Bacik continue; 1354c152b63eSFilipe Manana /* 1355c152b63eSFilipe Manana * Make sure that while processing the pinned list we do 1356c152b63eSFilipe Manana * not override our *start with a lower value, because 1357c152b63eSFilipe Manana * we can have pinned chunks that fall within this 1358c152b63eSFilipe Manana * device hole and that have lower physical addresses 1359c152b63eSFilipe Manana * than the pending chunks we processed before. If we 1360c152b63eSFilipe Manana * do not take this special care we can end up getting 1361c152b63eSFilipe Manana * 2 pending chunks that start at the same physical 1362c152b63eSFilipe Manana * device offsets because the end offset of a pinned 1363c152b63eSFilipe Manana * chunk can be equal to the start offset of some 1364c152b63eSFilipe Manana * pending chunk. 1365c152b63eSFilipe Manana */ 1366c152b63eSFilipe Manana end = map->stripes[i].physical + em->orig_block_len; 1367c152b63eSFilipe Manana if (end > *start) { 1368c152b63eSFilipe Manana *start = end; 13696df9a95eSJosef Bacik ret = 1; 13706df9a95eSJosef Bacik } 13716df9a95eSJosef Bacik } 1372c152b63eSFilipe Manana } 1373499f377fSJeff Mahoney if (search_list != &fs_info->pinned_chunks) { 1374499f377fSJeff Mahoney search_list = &fs_info->pinned_chunks; 137504216820SFilipe Manana goto again; 137604216820SFilipe Manana } 13776df9a95eSJosef Bacik 13786df9a95eSJosef Bacik return ret; 13796df9a95eSJosef Bacik } 13806df9a95eSJosef Bacik 13816df9a95eSJosef Bacik 13820b86a832SChris Mason /* 1383499f377fSJeff Mahoney * find_free_dev_extent_start - find free space in the specified device 13847bfc837dSMiao Xie * @device: the device which we search the free space in 13857bfc837dSMiao Xie * @num_bytes: the size of the free space that we need 1386499f377fSJeff Mahoney * @search_start: the position from which to begin the search 13877bfc837dSMiao Xie * @start: store the start of the free space. 1388499f377fSJeff Mahoney * @len: the size of the free space. that we find, or the size 1389499f377fSJeff Mahoney * of the max free space if we don't find suitable free space 13907bfc837dSMiao Xie * 13910b86a832SChris Mason * this uses a pretty simple search, the expectation is that it is 13920b86a832SChris Mason * called very infrequently and that a given device has a small number 13930b86a832SChris Mason * of extents 13947bfc837dSMiao Xie * 13957bfc837dSMiao Xie * @start is used to store the start of the free space if we find. 
But if we 13967bfc837dSMiao Xie * don't find suitable free space, it will be used to store the start position 13977bfc837dSMiao Xie * of the max free space. 13987bfc837dSMiao Xie * 13997bfc837dSMiao Xie * @len is used to store the size of the free space that we find. 14007bfc837dSMiao Xie * But if we don't find suitable free space, it is used to store the size of 14017bfc837dSMiao Xie * the max free space. 14020b86a832SChris Mason */ 1403499f377fSJeff Mahoney int find_free_dev_extent_start(struct btrfs_transaction *transaction, 14046df9a95eSJosef Bacik struct btrfs_device *device, u64 num_bytes, 1405499f377fSJeff Mahoney u64 search_start, u64 *start, u64 *len) 14060b86a832SChris Mason { 14070b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = device->fs_info; 14080b246afaSJeff Mahoney struct btrfs_root *root = fs_info->dev_root; 14090b86a832SChris Mason struct btrfs_key key; 14107bfc837dSMiao Xie struct btrfs_dev_extent *dev_extent; 14110b86a832SChris Mason struct btrfs_path *path; 14127bfc837dSMiao Xie u64 hole_size; 14137bfc837dSMiao Xie u64 max_hole_start; 14147bfc837dSMiao Xie u64 max_hole_size; 14157bfc837dSMiao Xie u64 extent_end; 14160b86a832SChris Mason u64 search_end = device->total_bytes; 14170b86a832SChris Mason int ret; 14187bfc837dSMiao Xie int slot; 14190b86a832SChris Mason struct extent_buffer *l; 14208cdc7c5bSFilipe Manana 14218cdc7c5bSFilipe Manana /* 14228cdc7c5bSFilipe Manana * We don't want to overwrite the superblock on the drive nor any area 14238cdc7c5bSFilipe Manana * used by the boot loader (grub for example), so we make sure to start 14248cdc7c5bSFilipe Manana * at an offset of at least 1MB. 14258cdc7c5bSFilipe Manana */ 14260d0c71b3SDavid Sterba search_start = max_t(u64, search_start, SZ_1M); 14270b86a832SChris Mason 14286df9a95eSJosef Bacik path = btrfs_alloc_path(); 14296df9a95eSJosef Bacik if (!path) 14306df9a95eSJosef Bacik return -ENOMEM; 1431f2ab7618SZhao Lei 14327bfc837dSMiao Xie max_hole_start = search_start; 14337bfc837dSMiao Xie max_hole_size = 0; 14347bfc837dSMiao Xie 1435f2ab7618SZhao Lei again: 1436401e29c1SAnand Jain if (search_start >= search_end || 1437401e29c1SAnand Jain test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 14387bfc837dSMiao Xie ret = -ENOSPC; 14396df9a95eSJosef Bacik goto out; 14407bfc837dSMiao Xie } 14417bfc837dSMiao Xie 1442e4058b54SDavid Sterba path->reada = READA_FORWARD; 14436df9a95eSJosef Bacik path->search_commit_root = 1; 14446df9a95eSJosef Bacik path->skip_locking = 1; 14457bfc837dSMiao Xie 14460b86a832SChris Mason key.objectid = device->devid; 14470b86a832SChris Mason key.offset = search_start; 14480b86a832SChris Mason key.type = BTRFS_DEV_EXTENT_KEY; 14497bfc837dSMiao Xie 1450125ccb0aSLi Zefan ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 14510b86a832SChris Mason if (ret < 0) 14527bfc837dSMiao Xie goto out; 14530b86a832SChris Mason if (ret > 0) { 14540b86a832SChris Mason ret = btrfs_previous_item(root, path, key.objectid, key.type); 14550b86a832SChris Mason if (ret < 0) 14567bfc837dSMiao Xie goto out; 14570b86a832SChris Mason } 14587bfc837dSMiao Xie 14590b86a832SChris Mason while (1) { 14600b86a832SChris Mason l = path->nodes[0]; 14610b86a832SChris Mason slot = path->slots[0]; 14620b86a832SChris Mason if (slot >= btrfs_header_nritems(l)) { 14630b86a832SChris Mason ret = btrfs_next_leaf(root, path); 14640b86a832SChris Mason if (ret == 0) 14650b86a832SChris Mason continue; 14660b86a832SChris Mason if (ret < 0) 14677bfc837dSMiao Xie goto out; 14687bfc837dSMiao Xie 14697bfc837dSMiao Xie break; 
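			/*
			 * btrfs_next_leaf() returned > 0: no more dev extent
			 * items, so stop scanning and let the code after the
			 * loop account the trailing hole up to search_end.
			 */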
14700b86a832SChris Mason } 14710b86a832SChris Mason btrfs_item_key_to_cpu(l, &key, slot); 14720b86a832SChris Mason 14730b86a832SChris Mason if (key.objectid < device->devid) 14740b86a832SChris Mason goto next; 14750b86a832SChris Mason 14760b86a832SChris Mason if (key.objectid > device->devid) 14777bfc837dSMiao Xie break; 14780b86a832SChris Mason 1479962a298fSDavid Sterba if (key.type != BTRFS_DEV_EXTENT_KEY) 14800b86a832SChris Mason goto next; 14810b86a832SChris Mason 14827bfc837dSMiao Xie if (key.offset > search_start) { 14837bfc837dSMiao Xie hole_size = key.offset - search_start; 14847bfc837dSMiao Xie 14856df9a95eSJosef Bacik /* 14866df9a95eSJosef Bacik * Have to check before we set max_hole_start, otherwise 14876df9a95eSJosef Bacik * we could end up sending back this offset anyway. 14886df9a95eSJosef Bacik */ 1489499f377fSJeff Mahoney if (contains_pending_extent(transaction, device, 14906df9a95eSJosef Bacik &search_start, 14911b984508SForrest Liu hole_size)) { 14921b984508SForrest Liu if (key.offset >= search_start) { 14931b984508SForrest Liu hole_size = key.offset - search_start; 14941b984508SForrest Liu } else { 14951b984508SForrest Liu WARN_ON_ONCE(1); 14966df9a95eSJosef Bacik hole_size = 0; 14971b984508SForrest Liu } 14981b984508SForrest Liu } 14996df9a95eSJosef Bacik 15007bfc837dSMiao Xie if (hole_size > max_hole_size) { 15017bfc837dSMiao Xie max_hole_start = search_start; 15027bfc837dSMiao Xie max_hole_size = hole_size; 15037bfc837dSMiao Xie } 15047bfc837dSMiao Xie 15057bfc837dSMiao Xie /* 15067bfc837dSMiao Xie * If this free space is greater than which we need, 15077bfc837dSMiao Xie * it must be the max free space that we have found 15087bfc837dSMiao Xie * until now, so max_hole_start must point to the start 15097bfc837dSMiao Xie * of this free space and the length of this free space 15107bfc837dSMiao Xie * is stored in max_hole_size. Thus, we return 15117bfc837dSMiao Xie * max_hole_start and max_hole_size and go back to the 15127bfc837dSMiao Xie * caller. 15137bfc837dSMiao Xie */ 15147bfc837dSMiao Xie if (hole_size >= num_bytes) { 15157bfc837dSMiao Xie ret = 0; 15167bfc837dSMiao Xie goto out; 15177bfc837dSMiao Xie } 15187bfc837dSMiao Xie } 15197bfc837dSMiao Xie 15200b86a832SChris Mason dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 15217bfc837dSMiao Xie extent_end = key.offset + btrfs_dev_extent_length(l, 15227bfc837dSMiao Xie dev_extent); 15237bfc837dSMiao Xie if (extent_end > search_start) 15247bfc837dSMiao Xie search_start = extent_end; 15250b86a832SChris Mason next: 15260b86a832SChris Mason path->slots[0]++; 15270b86a832SChris Mason cond_resched(); 15280b86a832SChris Mason } 15290b86a832SChris Mason 153038c01b96Sliubo /* 153138c01b96Sliubo * At this point, search_start should be the end of 153238c01b96Sliubo * allocated dev extents, and when shrinking the device, 153338c01b96Sliubo * search_end may be smaller than search_start. 
153438c01b96Sliubo */ 1535f2ab7618SZhao Lei if (search_end > search_start) { 15367bfc837dSMiao Xie hole_size = search_end - search_start; 153738c01b96Sliubo 1538499f377fSJeff Mahoney if (contains_pending_extent(transaction, device, &search_start, 1539f2ab7618SZhao Lei hole_size)) { 1540f2ab7618SZhao Lei btrfs_release_path(path); 1541f2ab7618SZhao Lei goto again; 1542f2ab7618SZhao Lei } 1543f2ab7618SZhao Lei 15447bfc837dSMiao Xie if (hole_size > max_hole_size) { 15457bfc837dSMiao Xie max_hole_start = search_start; 15467bfc837dSMiao Xie max_hole_size = hole_size; 15470b86a832SChris Mason } 15486df9a95eSJosef Bacik } 15496df9a95eSJosef Bacik 15507bfc837dSMiao Xie /* See above. */ 1551f2ab7618SZhao Lei if (max_hole_size < num_bytes) 15527bfc837dSMiao Xie ret = -ENOSPC; 15537bfc837dSMiao Xie else 15542b82032cSYan Zheng ret = 0; 15550b86a832SChris Mason 15567bfc837dSMiao Xie out: 15572b82032cSYan Zheng btrfs_free_path(path); 15587bfc837dSMiao Xie *start = max_hole_start; 1559b2117a39SMiao Xie if (len) 15607bfc837dSMiao Xie *len = max_hole_size; 15610b86a832SChris Mason return ret; 15620b86a832SChris Mason } 15630b86a832SChris Mason 1564499f377fSJeff Mahoney int find_free_dev_extent(struct btrfs_trans_handle *trans, 1565499f377fSJeff Mahoney struct btrfs_device *device, u64 num_bytes, 1566499f377fSJeff Mahoney u64 *start, u64 *len) 1567499f377fSJeff Mahoney { 1568499f377fSJeff Mahoney /* FIXME use last free of some kind */ 1569499f377fSJeff Mahoney return find_free_dev_extent_start(trans->transaction, device, 15708cdc7c5bSFilipe Manana num_bytes, 0, start, len); 1571499f377fSJeff Mahoney } 1572499f377fSJeff Mahoney 1573b2950863SChristoph Hellwig static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 15748f18cf13SChris Mason struct btrfs_device *device, 15752196d6e8SMiao Xie u64 start, u64 *dev_extent_len) 15768f18cf13SChris Mason { 15770b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = device->fs_info; 15780b246afaSJeff Mahoney struct btrfs_root *root = fs_info->dev_root; 15798f18cf13SChris Mason int ret; 15808f18cf13SChris Mason struct btrfs_path *path; 15818f18cf13SChris Mason struct btrfs_key key; 1582a061fc8dSChris Mason struct btrfs_key found_key; 1583a061fc8dSChris Mason struct extent_buffer *leaf = NULL; 1584a061fc8dSChris Mason struct btrfs_dev_extent *extent = NULL; 15858f18cf13SChris Mason 15868f18cf13SChris Mason path = btrfs_alloc_path(); 15878f18cf13SChris Mason if (!path) 15888f18cf13SChris Mason return -ENOMEM; 15898f18cf13SChris Mason 15908f18cf13SChris Mason key.objectid = device->devid; 15918f18cf13SChris Mason key.offset = start; 15928f18cf13SChris Mason key.type = BTRFS_DEV_EXTENT_KEY; 1593924cd8fbSMiao Xie again: 15948f18cf13SChris Mason ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1595a061fc8dSChris Mason if (ret > 0) { 1596a061fc8dSChris Mason ret = btrfs_previous_item(root, path, key.objectid, 1597a061fc8dSChris Mason BTRFS_DEV_EXTENT_KEY); 1598b0b802d7STsutomu Itoh if (ret) 1599b0b802d7STsutomu Itoh goto out; 1600a061fc8dSChris Mason leaf = path->nodes[0]; 1601a061fc8dSChris Mason btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1602a061fc8dSChris Mason extent = btrfs_item_ptr(leaf, path->slots[0], 1603a061fc8dSChris Mason struct btrfs_dev_extent); 1604a061fc8dSChris Mason BUG_ON(found_key.offset > start || found_key.offset + 1605a061fc8dSChris Mason btrfs_dev_extent_length(leaf, extent) < start); 1606924cd8fbSMiao Xie key = found_key; 1607924cd8fbSMiao Xie btrfs_release_path(path); 1608924cd8fbSMiao Xie goto again; 1609a061fc8dSChris 
Mason } else if (ret == 0) { 1610a061fc8dSChris Mason leaf = path->nodes[0]; 1611a061fc8dSChris Mason extent = btrfs_item_ptr(leaf, path->slots[0], 1612a061fc8dSChris Mason struct btrfs_dev_extent); 161379787eaaSJeff Mahoney } else { 16140b246afaSJeff Mahoney btrfs_handle_fs_error(fs_info, ret, "Slot search failed"); 161579787eaaSJeff Mahoney goto out; 1616a061fc8dSChris Mason } 16178f18cf13SChris Mason 16182196d6e8SMiao Xie *dev_extent_len = btrfs_dev_extent_length(leaf, extent); 16192196d6e8SMiao Xie 16208f18cf13SChris Mason ret = btrfs_del_item(trans, root, path); 162179787eaaSJeff Mahoney if (ret) { 16220b246afaSJeff Mahoney btrfs_handle_fs_error(fs_info, ret, 162379787eaaSJeff Mahoney "Failed to remove dev extent item"); 162413212b54SZhao Lei } else { 16253204d33cSJosef Bacik set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); 162679787eaaSJeff Mahoney } 1627b0b802d7STsutomu Itoh out: 16288f18cf13SChris Mason btrfs_free_path(path); 16298f18cf13SChris Mason return ret; 16308f18cf13SChris Mason } 16318f18cf13SChris Mason 163248a3b636SEric Sandeen static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 16330b86a832SChris Mason struct btrfs_device *device, 16342b82032cSYan Zheng u64 chunk_offset, u64 start, u64 num_bytes) 16350b86a832SChris Mason { 16360b86a832SChris Mason int ret; 16370b86a832SChris Mason struct btrfs_path *path; 16380b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = device->fs_info; 16390b246afaSJeff Mahoney struct btrfs_root *root = fs_info->dev_root; 16400b86a832SChris Mason struct btrfs_dev_extent *extent; 16410b86a832SChris Mason struct extent_buffer *leaf; 16420b86a832SChris Mason struct btrfs_key key; 16430b86a832SChris Mason 1644e12c9621SAnand Jain WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); 1645401e29c1SAnand Jain WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); 16460b86a832SChris Mason path = btrfs_alloc_path(); 16470b86a832SChris Mason if (!path) 16480b86a832SChris Mason return -ENOMEM; 16490b86a832SChris Mason 16500b86a832SChris Mason key.objectid = device->devid; 16512b82032cSYan Zheng key.offset = start; 16520b86a832SChris Mason key.type = BTRFS_DEV_EXTENT_KEY; 16530b86a832SChris Mason ret = btrfs_insert_empty_item(trans, root, path, &key, 16540b86a832SChris Mason sizeof(*extent)); 16552cdcecbcSMark Fasheh if (ret) 16562cdcecbcSMark Fasheh goto out; 16570b86a832SChris Mason 16580b86a832SChris Mason leaf = path->nodes[0]; 16590b86a832SChris Mason extent = btrfs_item_ptr(leaf, path->slots[0], 16600b86a832SChris Mason struct btrfs_dev_extent); 1661b5d9071cSNikolay Borisov btrfs_set_dev_extent_chunk_tree(leaf, extent, 1662b5d9071cSNikolay Borisov BTRFS_CHUNK_TREE_OBJECTID); 16630ca00afbSNikolay Borisov btrfs_set_dev_extent_chunk_objectid(leaf, extent, 16640ca00afbSNikolay Borisov BTRFS_FIRST_CHUNK_TREE_OBJECTID); 1665e17cade2SChris Mason btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 1666e17cade2SChris Mason 16670b86a832SChris Mason btrfs_set_dev_extent_length(leaf, extent, num_bytes); 16680b86a832SChris Mason btrfs_mark_buffer_dirty(leaf); 16692cdcecbcSMark Fasheh out: 16700b86a832SChris Mason btrfs_free_path(path); 16710b86a832SChris Mason return ret; 16720b86a832SChris Mason } 16730b86a832SChris Mason 16746df9a95eSJosef Bacik static u64 find_next_chunk(struct btrfs_fs_info *fs_info) 16750b86a832SChris Mason { 16766df9a95eSJosef Bacik struct extent_map_tree *em_tree; 16776df9a95eSJosef Bacik struct extent_map *em; 16786df9a95eSJosef Bacik struct rb_node *n; 
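	/*
	 * Chunk mappings are kept in an rbtree ordered by logical start, so
	 * the last entry marks the highest logical address in use; the next
	 * chunk begins right after it.
	 */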
16796df9a95eSJosef Bacik u64 ret = 0; 16800b86a832SChris Mason 16816df9a95eSJosef Bacik em_tree = &fs_info->mapping_tree.map_tree; 16826df9a95eSJosef Bacik read_lock(&em_tree->lock); 16836df9a95eSJosef Bacik n = rb_last(&em_tree->map); 16846df9a95eSJosef Bacik if (n) { 16856df9a95eSJosef Bacik em = rb_entry(n, struct extent_map, rb_node); 16866df9a95eSJosef Bacik ret = em->start + em->len; 1687e17cade2SChris Mason } 16886df9a95eSJosef Bacik read_unlock(&em_tree->lock); 16896df9a95eSJosef Bacik 16900b86a832SChris Mason return ret; 16910b86a832SChris Mason } 16920b86a832SChris Mason 169353f10659SIlya Dryomov static noinline int find_next_devid(struct btrfs_fs_info *fs_info, 169453f10659SIlya Dryomov u64 *devid_ret) 16950b86a832SChris Mason { 16960b86a832SChris Mason int ret; 16970b86a832SChris Mason struct btrfs_key key; 16980b86a832SChris Mason struct btrfs_key found_key; 16992b82032cSYan Zheng struct btrfs_path *path; 17002b82032cSYan Zheng 17012b82032cSYan Zheng path = btrfs_alloc_path(); 17022b82032cSYan Zheng if (!path) 17032b82032cSYan Zheng return -ENOMEM; 17040b86a832SChris Mason 17050b86a832SChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 17060b86a832SChris Mason key.type = BTRFS_DEV_ITEM_KEY; 17070b86a832SChris Mason key.offset = (u64)-1; 17080b86a832SChris Mason 170953f10659SIlya Dryomov ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); 17100b86a832SChris Mason if (ret < 0) 17110b86a832SChris Mason goto error; 17120b86a832SChris Mason 171379787eaaSJeff Mahoney BUG_ON(ret == 0); /* Corruption */ 17140b86a832SChris Mason 171553f10659SIlya Dryomov ret = btrfs_previous_item(fs_info->chunk_root, path, 171653f10659SIlya Dryomov BTRFS_DEV_ITEMS_OBJECTID, 17170b86a832SChris Mason BTRFS_DEV_ITEM_KEY); 17180b86a832SChris Mason if (ret) { 171953f10659SIlya Dryomov *devid_ret = 1; 17200b86a832SChris Mason } else { 17210b86a832SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &found_key, 17220b86a832SChris Mason path->slots[0]); 172353f10659SIlya Dryomov *devid_ret = found_key.offset + 1; 17240b86a832SChris Mason } 17250b86a832SChris Mason ret = 0; 17260b86a832SChris Mason error: 17272b82032cSYan Zheng btrfs_free_path(path); 17280b86a832SChris Mason return ret; 17290b86a832SChris Mason } 17300b86a832SChris Mason 17310b86a832SChris Mason /* 17320b86a832SChris Mason * the device information is stored in the chunk root 17330b86a832SChris Mason * the btrfs_device struct should be fully filled in 17340b86a832SChris Mason */ 1735c74a0b02SAnand Jain static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, 17365b4aacefSJeff Mahoney struct btrfs_fs_info *fs_info, 17370b86a832SChris Mason struct btrfs_device *device) 17380b86a832SChris Mason { 17395b4aacefSJeff Mahoney struct btrfs_root *root = fs_info->chunk_root; 17400b86a832SChris Mason int ret; 17410b86a832SChris Mason struct btrfs_path *path; 17420b86a832SChris Mason struct btrfs_dev_item *dev_item; 17430b86a832SChris Mason struct extent_buffer *leaf; 17440b86a832SChris Mason struct btrfs_key key; 17450b86a832SChris Mason unsigned long ptr; 17460b86a832SChris Mason 17470b86a832SChris Mason path = btrfs_alloc_path(); 17480b86a832SChris Mason if (!path) 17490b86a832SChris Mason return -ENOMEM; 17500b86a832SChris Mason 17510b86a832SChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 17520b86a832SChris Mason key.type = BTRFS_DEV_ITEM_KEY; 17532b82032cSYan Zheng key.offset = device->devid; 17540b86a832SChris Mason 17550b86a832SChris Mason ret = btrfs_insert_empty_item(trans, root, path, &key, 17560d81ba5dSChris Mason 
sizeof(*dev_item)); 17570b86a832SChris Mason if (ret) 17580b86a832SChris Mason goto out; 17590b86a832SChris Mason 17600b86a832SChris Mason leaf = path->nodes[0]; 17610b86a832SChris Mason dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 17620b86a832SChris Mason 17630b86a832SChris Mason btrfs_set_device_id(leaf, dev_item, device->devid); 17642b82032cSYan Zheng btrfs_set_device_generation(leaf, dev_item, 0); 17650b86a832SChris Mason btrfs_set_device_type(leaf, dev_item, device->type); 17660b86a832SChris Mason btrfs_set_device_io_align(leaf, dev_item, device->io_align); 17670b86a832SChris Mason btrfs_set_device_io_width(leaf, dev_item, device->io_width); 17680b86a832SChris Mason btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 17697cc8e58dSMiao Xie btrfs_set_device_total_bytes(leaf, dev_item, 17707cc8e58dSMiao Xie btrfs_device_get_disk_total_bytes(device)); 17717cc8e58dSMiao Xie btrfs_set_device_bytes_used(leaf, dev_item, 17727cc8e58dSMiao Xie btrfs_device_get_bytes_used(device)); 1773e17cade2SChris Mason btrfs_set_device_group(leaf, dev_item, 0); 1774e17cade2SChris Mason btrfs_set_device_seek_speed(leaf, dev_item, 0); 1775e17cade2SChris Mason btrfs_set_device_bandwidth(leaf, dev_item, 0); 1776c3027eb5SChris Mason btrfs_set_device_start_offset(leaf, dev_item, 0); 17770b86a832SChris Mason 1778410ba3a2SGeert Uytterhoeven ptr = btrfs_device_uuid(dev_item); 1779e17cade2SChris Mason write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 17801473b24eSGeert Uytterhoeven ptr = btrfs_device_fsid(dev_item); 178144880fdcSAnand Jain write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE); 17820b86a832SChris Mason btrfs_mark_buffer_dirty(leaf); 17830b86a832SChris Mason 17842b82032cSYan Zheng ret = 0; 17850b86a832SChris Mason out: 17860b86a832SChris Mason btrfs_free_path(path); 17870b86a832SChris Mason return ret; 17880b86a832SChris Mason } 17898f18cf13SChris Mason 17905a1972bdSQu Wenruo /* 17915a1972bdSQu Wenruo * Function to update ctime/mtime for a given device path. 17925a1972bdSQu Wenruo * Mainly used for ctime/mtime based probe like libblkid. 
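 * The path is only opened and its timestamps refreshed via
 * file_update_time(); no data is read or written.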
17935a1972bdSQu Wenruo */ 1794da353f6bSDavid Sterba static void update_dev_time(const char *path_name) 17955a1972bdSQu Wenruo { 17965a1972bdSQu Wenruo struct file *filp; 17975a1972bdSQu Wenruo 17985a1972bdSQu Wenruo filp = filp_open(path_name, O_RDWR, 0); 179998af592fSAl Viro if (IS_ERR(filp)) 18005a1972bdSQu Wenruo return; 18015a1972bdSQu Wenruo file_update_time(filp); 18025a1972bdSQu Wenruo filp_close(filp, NULL); 18035a1972bdSQu Wenruo } 18045a1972bdSQu Wenruo 18055b4aacefSJeff Mahoney static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info, 1806a061fc8dSChris Mason struct btrfs_device *device) 1807a061fc8dSChris Mason { 18085b4aacefSJeff Mahoney struct btrfs_root *root = fs_info->chunk_root; 1809a061fc8dSChris Mason int ret; 1810a061fc8dSChris Mason struct btrfs_path *path; 1811a061fc8dSChris Mason struct btrfs_key key; 1812a061fc8dSChris Mason struct btrfs_trans_handle *trans; 1813a061fc8dSChris Mason 1814a061fc8dSChris Mason path = btrfs_alloc_path(); 1815a061fc8dSChris Mason if (!path) 1816a061fc8dSChris Mason return -ENOMEM; 1817a061fc8dSChris Mason 1818a22285a6SYan, Zheng trans = btrfs_start_transaction(root, 0); 181998d5dc13STsutomu Itoh if (IS_ERR(trans)) { 182098d5dc13STsutomu Itoh btrfs_free_path(path); 182198d5dc13STsutomu Itoh return PTR_ERR(trans); 182298d5dc13STsutomu Itoh } 1823a061fc8dSChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1824a061fc8dSChris Mason key.type = BTRFS_DEV_ITEM_KEY; 1825a061fc8dSChris Mason key.offset = device->devid; 1826a061fc8dSChris Mason 1827a061fc8dSChris Mason ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 18285e9f2ad5SNikolay Borisov if (ret) { 18295e9f2ad5SNikolay Borisov if (ret > 0) 1830a061fc8dSChris Mason ret = -ENOENT; 18315e9f2ad5SNikolay Borisov btrfs_abort_transaction(trans, ret); 18325e9f2ad5SNikolay Borisov btrfs_end_transaction(trans); 1833a061fc8dSChris Mason goto out; 1834a061fc8dSChris Mason } 1835a061fc8dSChris Mason 1836a061fc8dSChris Mason ret = btrfs_del_item(trans, root, path); 18375e9f2ad5SNikolay Borisov if (ret) { 18385e9f2ad5SNikolay Borisov btrfs_abort_transaction(trans, ret); 18395e9f2ad5SNikolay Borisov btrfs_end_transaction(trans); 18405e9f2ad5SNikolay Borisov } 18415e9f2ad5SNikolay Borisov 1842a061fc8dSChris Mason out: 1843a061fc8dSChris Mason btrfs_free_path(path); 18445e9f2ad5SNikolay Borisov if (!ret) 18455e9f2ad5SNikolay Borisov ret = btrfs_commit_transaction(trans); 1846a061fc8dSChris Mason return ret; 1847a061fc8dSChris Mason } 1848a061fc8dSChris Mason 18493cc31a0dSDavid Sterba /* 18503cc31a0dSDavid Sterba * Verify that @num_devices satisfies the RAID profile constraints in the whole 18513cc31a0dSDavid Sterba * filesystem. It's up to the caller to adjust that number regarding eg. device 18523cc31a0dSDavid Sterba * replace. 
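 * For instance, a device removal on a two-device RAID1 filesystem calls this
 * with num_devices == 1 (the caller subtracts the device being removed),
 * which is below RAID1's devs_min of 2, so the removal is refused with that
 * profile's mindev error code.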
18533cc31a0dSDavid Sterba */ 18543cc31a0dSDavid Sterba static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, 18553cc31a0dSDavid Sterba u64 num_devices) 1856a061fc8dSChris Mason { 1857a061fc8dSChris Mason u64 all_avail; 1858de98ced9SMiao Xie unsigned seq; 1859418775a2SDavid Sterba int i; 1860a061fc8dSChris Mason 1861de98ced9SMiao Xie do { 1862bd45ffbcSAnand Jain seq = read_seqbegin(&fs_info->profiles_lock); 1863de98ced9SMiao Xie 1864bd45ffbcSAnand Jain all_avail = fs_info->avail_data_alloc_bits | 1865bd45ffbcSAnand Jain fs_info->avail_system_alloc_bits | 1866bd45ffbcSAnand Jain fs_info->avail_metadata_alloc_bits; 1867bd45ffbcSAnand Jain } while (read_seqretry(&fs_info->profiles_lock, seq)); 1868f1fa7f26SAnand Jain 1869418775a2SDavid Sterba for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 1870418775a2SDavid Sterba if (!(all_avail & btrfs_raid_group[i])) 1871418775a2SDavid Sterba continue; 1872a061fc8dSChris Mason 1873418775a2SDavid Sterba if (num_devices < btrfs_raid_array[i].devs_min) { 1874418775a2SDavid Sterba int ret = btrfs_raid_mindev_error[i]; 1875a061fc8dSChris Mason 1876418775a2SDavid Sterba if (ret) 1877418775a2SDavid Sterba return ret; 187853b381b3SDavid Woodhouse } 1879bd45ffbcSAnand Jain } 1880bd45ffbcSAnand Jain 1881bd45ffbcSAnand Jain return 0; 1882f1fa7f26SAnand Jain } 1883f1fa7f26SAnand Jain 1884c9162bdfSOmar Sandoval static struct btrfs_device * btrfs_find_next_active_device( 1885c9162bdfSOmar Sandoval struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) 188688acff64SAnand Jain { 188788acff64SAnand Jain struct btrfs_device *next_device; 188888acff64SAnand Jain 188988acff64SAnand Jain list_for_each_entry(next_device, &fs_devs->devices, dev_list) { 189088acff64SAnand Jain if (next_device != device && 1891e6e674bdSAnand Jain !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) 1892e6e674bdSAnand Jain && next_device->bdev) 189388acff64SAnand Jain return next_device; 189488acff64SAnand Jain } 189588acff64SAnand Jain 189688acff64SAnand Jain return NULL; 189788acff64SAnand Jain } 189888acff64SAnand Jain 189988acff64SAnand Jain /* 190088acff64SAnand Jain * Helper function to check if the given device is part of s_bdev / latest_bdev 190188acff64SAnand Jain * and replace it with the provided or the next active device. In the context 190288acff64SAnand Jain * where this function is called, there should always be another device (or 190388acff64SAnand Jain * this_dev) which is active.
190488acff64SAnand Jain */ 190588acff64SAnand Jain void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, 190688acff64SAnand Jain struct btrfs_device *device, struct btrfs_device *this_dev) 190788acff64SAnand Jain { 190888acff64SAnand Jain struct btrfs_device *next_device; 190988acff64SAnand Jain 191088acff64SAnand Jain if (this_dev) 191188acff64SAnand Jain next_device = this_dev; 191288acff64SAnand Jain else 191388acff64SAnand Jain next_device = btrfs_find_next_active_device(fs_info->fs_devices, 191488acff64SAnand Jain device); 191588acff64SAnand Jain ASSERT(next_device); 191688acff64SAnand Jain 191788acff64SAnand Jain if (fs_info->sb->s_bdev && 191888acff64SAnand Jain (fs_info->sb->s_bdev == device->bdev)) 191988acff64SAnand Jain fs_info->sb->s_bdev = next_device->bdev; 192088acff64SAnand Jain 192188acff64SAnand Jain if (fs_info->fs_devices->latest_bdev == device->bdev) 192288acff64SAnand Jain fs_info->fs_devices->latest_bdev = next_device->bdev; 192388acff64SAnand Jain } 192488acff64SAnand Jain 1925da353f6bSDavid Sterba int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, 1926da353f6bSDavid Sterba u64 devid) 1927f1fa7f26SAnand Jain { 1928f1fa7f26SAnand Jain struct btrfs_device *device; 1929f1fa7f26SAnand Jain struct btrfs_fs_devices *cur_devices; 1930f1fa7f26SAnand Jain u64 num_devices; 1931f1fa7f26SAnand Jain int ret = 0; 1932f1fa7f26SAnand Jain 19332c997384SAnand Jain mutex_lock(&fs_info->volume_mutex); 1934f1fa7f26SAnand Jain mutex_lock(&uuid_mutex); 1935a061fc8dSChris Mason 19360b246afaSJeff Mahoney num_devices = fs_info->fs_devices->num_devices; 19370b246afaSJeff Mahoney btrfs_dev_replace_lock(&fs_info->dev_replace, 0); 19380b246afaSJeff Mahoney if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 1939a061fc8dSChris Mason WARN_ON(num_devices < 1); 1940a061fc8dSChris Mason num_devices--; 1941a061fc8dSChris Mason } 19420b246afaSJeff Mahoney btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); 1943a061fc8dSChris Mason 19440b246afaSJeff Mahoney ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 1945beaf8ab3SStefan Behrens if (ret) 1946a061fc8dSChris Mason goto out; 1947f1fa7f26SAnand Jain 19482ff7e61eSJeff Mahoney ret = btrfs_find_device_by_devspec(fs_info, devid, device_path, 194924fc572fSAnand Jain &device); 1950a061fc8dSChris Mason if (ret) 1951a061fc8dSChris Mason goto out; 19522b82032cSYan Zheng 1953401e29c1SAnand Jain if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 1954183860f6SAnand Jain ret = BTRFS_ERROR_DEV_TGT_REPLACE; 195524fc572fSAnand Jain goto out; 195663a212abSStefan Behrens } 195763a212abSStefan Behrens 1958ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 1959ebbede42SAnand Jain fs_info->fs_devices->rw_devices == 1) { 1960183860f6SAnand Jain ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; 196124fc572fSAnand Jain goto out; 19622b82032cSYan Zheng } 19632b82032cSYan Zheng 1964ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 196534441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 19662b82032cSYan Zheng list_del_init(&device->dev_alloc_list); 1967c3929c36SMiao Xie device->fs_devices->rw_devices--; 196834441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 19692b82032cSYan Zheng } 1970a061fc8dSChris Mason 1971d7901554SCarey Underwood mutex_unlock(&uuid_mutex); 1972a061fc8dSChris Mason ret = btrfs_shrink_device(device, 0); 1973d7901554SCarey Underwood mutex_lock(&uuid_mutex); 1974a061fc8dSChris Mason if (ret) 19759b3517e9SIlya Dryomov goto 
error_undo; 1976a061fc8dSChris Mason 197763a212abSStefan Behrens /* 197863a212abSStefan Behrens * TODO: the superblock still includes this device in its num_devices 197963a212abSStefan Behrens * counter although write_all_supers() is not locked out. This 198063a212abSStefan Behrens * could give a filesystem state which requires a degraded mount. 198163a212abSStefan Behrens */ 19820b246afaSJeff Mahoney ret = btrfs_rm_dev_item(fs_info, device); 1983a061fc8dSChris Mason if (ret) 19849b3517e9SIlya Dryomov goto error_undo; 1985a061fc8dSChris Mason 1986e12c9621SAnand Jain clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 19870b246afaSJeff Mahoney btrfs_scrub_cancel_dev(fs_info, device); 1988e5e9a520SChris Mason 1989e5e9a520SChris Mason /* 1990e5e9a520SChris Mason * the device list mutex makes sure that we don't change 1991e5e9a520SChris Mason * the device list while someone else is writing out all 1992d7306801SFilipe David Borba Manana * the device supers. Whoever is writing all supers, should 1993d7306801SFilipe David Borba Manana * lock the device list mutex before getting the number of 1994d7306801SFilipe David Borba Manana * devices in the super block (super_copy). Conversely, 1995d7306801SFilipe David Borba Manana * whoever updates the number of devices in the super block 1996d7306801SFilipe David Borba Manana * (super_copy) should hold the device list mutex. 1997e5e9a520SChris Mason */ 19981f78160cSXiao Guangrong 19991f78160cSXiao Guangrong cur_devices = device->fs_devices; 20000b246afaSJeff Mahoney mutex_lock(&fs_info->fs_devices->device_list_mutex); 20011f78160cSXiao Guangrong list_del_rcu(&device->dev_list); 2002e5e9a520SChris Mason 2003e4404d6eSYan Zheng device->fs_devices->num_devices--; 200402db0844SJosef Bacik device->fs_devices->total_devices--; 20052b82032cSYan Zheng 2006e6e674bdSAnand Jain if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 20073a7d55c8SMiao Xie device->fs_devices->missing_devices--; 2008cd02dca5SChris Mason 20090b246afaSJeff Mahoney btrfs_assign_next_active_device(fs_info, device, NULL); 20102b82032cSYan Zheng 20110bfaa9c5SEric Sandeen if (device->bdev) { 20122b82032cSYan Zheng device->fs_devices->open_devices--; 201399994cdeSAnand Jain /* remove sysfs entry */ 20140b246afaSJeff Mahoney btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); 20150bfaa9c5SEric Sandeen } 201699994cdeSAnand Jain 20170b246afaSJeff Mahoney num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 20180b246afaSJeff Mahoney btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 20190b246afaSJeff Mahoney mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2020e4404d6eSYan Zheng 2021cea67ab9SJeff Mahoney /* 2022cea67ab9SJeff Mahoney * at this point, the device is zero sized and detached from 2023cea67ab9SJeff Mahoney * the devices list. All that's left is to zero out the old 2024cea67ab9SJeff Mahoney * supers and free the device. 
2025cea67ab9SJeff Mahoney */ 2026ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2027cea67ab9SJeff Mahoney btrfs_scratch_superblocks(device->bdev, device->name->str); 2028cea67ab9SJeff Mahoney 2029cea67ab9SJeff Mahoney btrfs_close_bdev(device); 2030f06c5965SDavid Sterba call_rcu(&device->rcu, free_device_rcu); 2031cea67ab9SJeff Mahoney 20321f78160cSXiao Guangrong if (cur_devices->open_devices == 0) { 20332b82032cSYan Zheng struct btrfs_fs_devices *fs_devices; 20340b246afaSJeff Mahoney fs_devices = fs_info->fs_devices; 20352b82032cSYan Zheng while (fs_devices) { 20368321cf25SRickard Strandqvist if (fs_devices->seed == cur_devices) { 20378321cf25SRickard Strandqvist fs_devices->seed = cur_devices->seed; 20382b82032cSYan Zheng break; 20398321cf25SRickard Strandqvist } 20402b82032cSYan Zheng fs_devices = fs_devices->seed; 20412b82032cSYan Zheng } 20421f78160cSXiao Guangrong cur_devices->seed = NULL; 20431f78160cSXiao Guangrong __btrfs_close_devices(cur_devices); 20441f78160cSXiao Guangrong free_fs_devices(cur_devices); 20452b82032cSYan Zheng } 20462b82032cSYan Zheng 2047a061fc8dSChris Mason out: 2048a061fc8dSChris Mason mutex_unlock(&uuid_mutex); 20492c997384SAnand Jain mutex_unlock(&fs_info->volume_mutex); 2050a061fc8dSChris Mason return ret; 205124fc572fSAnand Jain 20529b3517e9SIlya Dryomov error_undo: 2053ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 205434441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 20559b3517e9SIlya Dryomov list_add(&device->dev_alloc_list, 20560b246afaSJeff Mahoney &fs_info->fs_devices->alloc_list); 2057c3929c36SMiao Xie device->fs_devices->rw_devices++; 205834441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 20599b3517e9SIlya Dryomov } 206024fc572fSAnand Jain goto out; 2061a061fc8dSChris Mason } 2062a061fc8dSChris Mason 2063084b6e7cSQu Wenruo void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, 2064e93c89c1SStefan Behrens struct btrfs_device *srcdev) 2065e93c89c1SStefan Behrens { 2066d51908ceSAnand Jain struct btrfs_fs_devices *fs_devices; 2067d51908ceSAnand Jain 2068e93c89c1SStefan Behrens WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); 20691357272fSIlya Dryomov 207025e8e911SAnand Jain /* 207125e8e911SAnand Jain * in case of fs with no seed, srcdev->fs_devices will point 207225e8e911SAnand Jain * to fs_devices of fs_info. However when the dev being replaced is 207325e8e911SAnand Jain * a seed dev it will point to the seed's local fs_devices. In short 207425e8e911SAnand Jain * srcdev will have its correct fs_devices in both the cases. 
207525e8e911SAnand Jain */ 207625e8e911SAnand Jain fs_devices = srcdev->fs_devices; 2077d51908ceSAnand Jain 2078e93c89c1SStefan Behrens list_del_rcu(&srcdev->dev_list); 2079619c47f3SDavid Sterba list_del(&srcdev->dev_alloc_list); 2080d51908ceSAnand Jain fs_devices->num_devices--; 2081e6e674bdSAnand Jain if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) 2082d51908ceSAnand Jain fs_devices->missing_devices--; 2083e93c89c1SStefan Behrens 2084ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) 208582372bc8SMiao Xie fs_devices->rw_devices--; 20861357272fSIlya Dryomov 208782372bc8SMiao Xie if (srcdev->bdev) 208882372bc8SMiao Xie fs_devices->open_devices--; 2089084b6e7cSQu Wenruo } 2090084b6e7cSQu Wenruo 2091084b6e7cSQu Wenruo void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, 2092084b6e7cSQu Wenruo struct btrfs_device *srcdev) 2093084b6e7cSQu Wenruo { 2094084b6e7cSQu Wenruo struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 209582372bc8SMiao Xie 2096ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { 209748b3b9d4SAnand Jain /* zero out the old super if it is writable */ 209848b3b9d4SAnand Jain btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); 209948b3b9d4SAnand Jain } 210014238819SAnand Jain 210114238819SAnand Jain btrfs_close_bdev(srcdev); 2102f06c5965SDavid Sterba call_rcu(&srcdev->rcu, free_device_rcu); 210394d5f0c2SAnand Jain 210494d5f0c2SAnand Jain /* if there are no devs left we'd rather delete the fs_devices */ 210594d5f0c2SAnand Jain if (!fs_devices->num_devices) { 210694d5f0c2SAnand Jain struct btrfs_fs_devices *tmp_fs_devices; 210794d5f0c2SAnand Jain 21086dd38f81SAnand Jain /* 21096dd38f81SAnand Jain * On a mounted FS, num_devices can't be zero unless it's a 21106dd38f81SAnand Jain * seed. In case of a seed device being replaced, the replace 21116dd38f81SAnand Jain * target is added to the sprout FS, so there will be no more 21126dd38f81SAnand Jain * devices left under the seed FS.
21136dd38f81SAnand Jain */ 21146dd38f81SAnand Jain ASSERT(fs_devices->seeding); 21156dd38f81SAnand Jain 211694d5f0c2SAnand Jain tmp_fs_devices = fs_info->fs_devices; 211794d5f0c2SAnand Jain while (tmp_fs_devices) { 211894d5f0c2SAnand Jain if (tmp_fs_devices->seed == fs_devices) { 211994d5f0c2SAnand Jain tmp_fs_devices->seed = fs_devices->seed; 212094d5f0c2SAnand Jain break; 212194d5f0c2SAnand Jain } 212294d5f0c2SAnand Jain tmp_fs_devices = tmp_fs_devices->seed; 212394d5f0c2SAnand Jain } 212494d5f0c2SAnand Jain fs_devices->seed = NULL; 21258bef8401SAnand Jain __btrfs_close_devices(fs_devices); 21268bef8401SAnand Jain free_fs_devices(fs_devices); 212794d5f0c2SAnand Jain } 2128e93c89c1SStefan Behrens } 2129e93c89c1SStefan Behrens 2130e93c89c1SStefan Behrens void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 2131e93c89c1SStefan Behrens struct btrfs_device *tgtdev) 2132e93c89c1SStefan Behrens { 213367a2c45eSMiao Xie mutex_lock(&uuid_mutex); 2134e93c89c1SStefan Behrens WARN_ON(!tgtdev); 2135e93c89c1SStefan Behrens mutex_lock(&fs_info->fs_devices->device_list_mutex); 2136d2ff1b20SAnand Jain 213732576040SAnand Jain btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); 2138d2ff1b20SAnand Jain 2139779bf3feSAnand Jain if (tgtdev->bdev) 2140e93c89c1SStefan Behrens fs_info->fs_devices->open_devices--; 2141779bf3feSAnand Jain 2142e93c89c1SStefan Behrens fs_info->fs_devices->num_devices--; 2143e93c89c1SStefan Behrens 214488acff64SAnand Jain btrfs_assign_next_active_device(fs_info, tgtdev, NULL); 2145e93c89c1SStefan Behrens 2146e93c89c1SStefan Behrens list_del_rcu(&tgtdev->dev_list); 2147e93c89c1SStefan Behrens 2148e93c89c1SStefan Behrens mutex_unlock(&fs_info->fs_devices->device_list_mutex); 214967a2c45eSMiao Xie mutex_unlock(&uuid_mutex); 2150779bf3feSAnand Jain 2151779bf3feSAnand Jain /* 2152779bf3feSAnand Jain * The update_dev_time() within btrfs_scratch_superblocks() 2153779bf3feSAnand Jain * may lead to a call to btrfs_show_devname() which will try 2154779bf3feSAnand Jain * to hold device_list_mutex. And here this device 2155779bf3feSAnand Jain * is already out of the device list, so we don't have to hold 2156779bf3feSAnand Jain * the device_list_mutex lock.
2157779bf3feSAnand Jain */ 2158779bf3feSAnand Jain btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); 215914238819SAnand Jain 216014238819SAnand Jain btrfs_close_bdev(tgtdev); 2161f06c5965SDavid Sterba call_rcu(&tgtdev->rcu, free_device_rcu); 2162e93c89c1SStefan Behrens } 2163e93c89c1SStefan Behrens 21642ff7e61eSJeff Mahoney static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, 2165da353f6bSDavid Sterba const char *device_path, 21667ba15b7dSStefan Behrens struct btrfs_device **device) 21677ba15b7dSStefan Behrens { 21687ba15b7dSStefan Behrens int ret = 0; 21697ba15b7dSStefan Behrens struct btrfs_super_block *disk_super; 21707ba15b7dSStefan Behrens u64 devid; 21717ba15b7dSStefan Behrens u8 *dev_uuid; 21727ba15b7dSStefan Behrens struct block_device *bdev; 21737ba15b7dSStefan Behrens struct buffer_head *bh; 21747ba15b7dSStefan Behrens 21757ba15b7dSStefan Behrens *device = NULL; 21767ba15b7dSStefan Behrens ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 21770b246afaSJeff Mahoney fs_info->bdev_holder, 0, &bdev, &bh); 21787ba15b7dSStefan Behrens if (ret) 21797ba15b7dSStefan Behrens return ret; 21807ba15b7dSStefan Behrens disk_super = (struct btrfs_super_block *)bh->b_data; 21817ba15b7dSStefan Behrens devid = btrfs_stack_device_id(&disk_super->dev_item); 21827ba15b7dSStefan Behrens dev_uuid = disk_super->dev_item.uuid; 21830b246afaSJeff Mahoney *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); 21847ba15b7dSStefan Behrens brelse(bh); 21857ba15b7dSStefan Behrens if (!*device) 21867ba15b7dSStefan Behrens ret = -ENOENT; 21877ba15b7dSStefan Behrens blkdev_put(bdev, FMODE_READ); 21887ba15b7dSStefan Behrens return ret; 21897ba15b7dSStefan Behrens } 21907ba15b7dSStefan Behrens 21912ff7e61eSJeff Mahoney int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, 2192da353f6bSDavid Sterba const char *device_path, 21937ba15b7dSStefan Behrens struct btrfs_device **device) 21947ba15b7dSStefan Behrens { 21957ba15b7dSStefan Behrens *device = NULL; 21967ba15b7dSStefan Behrens if (strcmp(device_path, "missing") == 0) { 21977ba15b7dSStefan Behrens struct list_head *devices; 21987ba15b7dSStefan Behrens struct btrfs_device *tmp; 21997ba15b7dSStefan Behrens 22000b246afaSJeff Mahoney devices = &fs_info->fs_devices->devices; 22017ba15b7dSStefan Behrens /* 22027ba15b7dSStefan Behrens * It is safe to read the devices since the volume_mutex 22037ba15b7dSStefan Behrens * is held by the caller. 22047ba15b7dSStefan Behrens */ 22057ba15b7dSStefan Behrens list_for_each_entry(tmp, devices, dev_list) { 2206e12c9621SAnand Jain if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 2207e12c9621SAnand Jain &tmp->dev_state) && !tmp->bdev) { 22087ba15b7dSStefan Behrens *device = tmp; 22097ba15b7dSStefan Behrens break; 22107ba15b7dSStefan Behrens } 22117ba15b7dSStefan Behrens } 22127ba15b7dSStefan Behrens 2213d74a6259SAnand Jain if (!*device) 2214d74a6259SAnand Jain return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 22157ba15b7dSStefan Behrens 22167ba15b7dSStefan Behrens return 0; 22177ba15b7dSStefan Behrens } else { 22182ff7e61eSJeff Mahoney return btrfs_find_device_by_path(fs_info, device_path, device); 22197ba15b7dSStefan Behrens } 22207ba15b7dSStefan Behrens } 22217ba15b7dSStefan Behrens 22222b82032cSYan Zheng /* 22235c5c0df0SDavid Sterba * Lookup a device given by device id, or the path if the id is 0. 
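 * Callers that only know a path (including the special name "missing")
 * pass devid == 0; the device is then resolved through
 * btrfs_find_device_missing_or_by_path().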
22245c5c0df0SDavid Sterba */ 22252ff7e61eSJeff Mahoney int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, 2226da353f6bSDavid Sterba const char *devpath, 2227da353f6bSDavid Sterba struct btrfs_device **device) 222824e0474bSAnand Jain { 222924e0474bSAnand Jain int ret; 223024e0474bSAnand Jain 22315c5c0df0SDavid Sterba if (devid) { 223224e0474bSAnand Jain ret = 0; 22330b246afaSJeff Mahoney *device = btrfs_find_device(fs_info, devid, NULL, NULL); 223424e0474bSAnand Jain if (!*device) 223524e0474bSAnand Jain ret = -ENOENT; 223624e0474bSAnand Jain } else { 22375c5c0df0SDavid Sterba if (!devpath || !devpath[0]) 2238b3d1b153SAnand Jain return -EINVAL; 2239b3d1b153SAnand Jain 22402ff7e61eSJeff Mahoney ret = btrfs_find_device_missing_or_by_path(fs_info, devpath, 224124e0474bSAnand Jain device); 224224e0474bSAnand Jain } 224324e0474bSAnand Jain return ret; 224424e0474bSAnand Jain } 224524e0474bSAnand Jain 22462b82032cSYan Zheng /* 22472b82032cSYan Zheng * does all the dirty work required for changing file system's UUID. 22482b82032cSYan Zheng */ 22492ff7e61eSJeff Mahoney static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) 22502b82032cSYan Zheng { 22510b246afaSJeff Mahoney struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 22522b82032cSYan Zheng struct btrfs_fs_devices *old_devices; 2253e4404d6eSYan Zheng struct btrfs_fs_devices *seed_devices; 22540b246afaSJeff Mahoney struct btrfs_super_block *disk_super = fs_info->super_copy; 22552b82032cSYan Zheng struct btrfs_device *device; 22562b82032cSYan Zheng u64 super_flags; 22572b82032cSYan Zheng 22582b82032cSYan Zheng BUG_ON(!mutex_is_locked(&uuid_mutex)); 2259e4404d6eSYan Zheng if (!fs_devices->seeding) 22602b82032cSYan Zheng return -EINVAL; 22612b82032cSYan Zheng 22622dfeca9bSDavid Sterba seed_devices = alloc_fs_devices(NULL); 22632208a378SIlya Dryomov if (IS_ERR(seed_devices)) 22642208a378SIlya Dryomov return PTR_ERR(seed_devices); 22652b82032cSYan Zheng 2266e4404d6eSYan Zheng old_devices = clone_fs_devices(fs_devices); 2267e4404d6eSYan Zheng if (IS_ERR(old_devices)) { 2268e4404d6eSYan Zheng kfree(seed_devices); 2269e4404d6eSYan Zheng return PTR_ERR(old_devices); 22702b82032cSYan Zheng } 2271e4404d6eSYan Zheng 22722b82032cSYan Zheng list_add(&old_devices->list, &fs_uuids); 22732b82032cSYan Zheng 2274e4404d6eSYan Zheng memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2275e4404d6eSYan Zheng seed_devices->opened = 1; 2276e4404d6eSYan Zheng INIT_LIST_HEAD(&seed_devices->devices); 2277e4404d6eSYan Zheng INIT_LIST_HEAD(&seed_devices->alloc_list); 2278e5e9a520SChris Mason mutex_init(&seed_devices->device_list_mutex); 2279c9513edbSXiao Guangrong 22800b246afaSJeff Mahoney mutex_lock(&fs_info->fs_devices->device_list_mutex); 22811f78160cSXiao Guangrong list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 22821f78160cSXiao Guangrong synchronize_rcu); 22832196d6e8SMiao Xie list_for_each_entry(device, &seed_devices->devices, dev_list) 2284e4404d6eSYan Zheng device->fs_devices = seed_devices; 22852196d6e8SMiao Xie 228634441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 22872196d6e8SMiao Xie list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 228834441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 2289e4404d6eSYan Zheng 22902b82032cSYan Zheng fs_devices->seeding = 0; 22912b82032cSYan Zheng fs_devices->num_devices = 0; 22922b82032cSYan Zheng fs_devices->open_devices = 0; 229369611ac8SMiao Xie fs_devices->missing_devices = 0; 229469611ac8SMiao Xie fs_devices->rotating = 0; 
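	/*
	 * fs_devices now describes the new sprout filesystem: its device
	 * lists and counters were handed over to seed_devices above, and it
	 * is chained back to that seed below before receiving a freshly
	 * generated fsid.
	 */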
2295e4404d6eSYan Zheng fs_devices->seed = seed_devices; 22962b82032cSYan Zheng 22972b82032cSYan Zheng generate_random_uuid(fs_devices->fsid); 22980b246afaSJeff Mahoney memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 22992b82032cSYan Zheng memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 23000b246afaSJeff Mahoney mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2301f7171750SFilipe David Borba Manana 23022b82032cSYan Zheng super_flags = btrfs_super_flags(disk_super) & 23032b82032cSYan Zheng ~BTRFS_SUPER_FLAG_SEEDING; 23042b82032cSYan Zheng btrfs_set_super_flags(disk_super, super_flags); 23052b82032cSYan Zheng 23062b82032cSYan Zheng return 0; 23072b82032cSYan Zheng } 23082b82032cSYan Zheng 23092b82032cSYan Zheng /* 231001327610SNicholas D Steeves * Store the expected generation for seed devices in device items. 23112b82032cSYan Zheng */ 23122b82032cSYan Zheng static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 23135b4aacefSJeff Mahoney struct btrfs_fs_info *fs_info) 23142b82032cSYan Zheng { 23155b4aacefSJeff Mahoney struct btrfs_root *root = fs_info->chunk_root; 23162b82032cSYan Zheng struct btrfs_path *path; 23172b82032cSYan Zheng struct extent_buffer *leaf; 23182b82032cSYan Zheng struct btrfs_dev_item *dev_item; 23192b82032cSYan Zheng struct btrfs_device *device; 23202b82032cSYan Zheng struct btrfs_key key; 232144880fdcSAnand Jain u8 fs_uuid[BTRFS_FSID_SIZE]; 23222b82032cSYan Zheng u8 dev_uuid[BTRFS_UUID_SIZE]; 23232b82032cSYan Zheng u64 devid; 23242b82032cSYan Zheng int ret; 23252b82032cSYan Zheng 23262b82032cSYan Zheng path = btrfs_alloc_path(); 23272b82032cSYan Zheng if (!path) 23282b82032cSYan Zheng return -ENOMEM; 23292b82032cSYan Zheng 23302b82032cSYan Zheng key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 23312b82032cSYan Zheng key.offset = 0; 23322b82032cSYan Zheng key.type = BTRFS_DEV_ITEM_KEY; 23332b82032cSYan Zheng 23342b82032cSYan Zheng while (1) { 23352b82032cSYan Zheng ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 23362b82032cSYan Zheng if (ret < 0) 23372b82032cSYan Zheng goto error; 23382b82032cSYan Zheng 23392b82032cSYan Zheng leaf = path->nodes[0]; 23402b82032cSYan Zheng next_slot: 23412b82032cSYan Zheng if (path->slots[0] >= btrfs_header_nritems(leaf)) { 23422b82032cSYan Zheng ret = btrfs_next_leaf(root, path); 23432b82032cSYan Zheng if (ret > 0) 23442b82032cSYan Zheng break; 23452b82032cSYan Zheng if (ret < 0) 23462b82032cSYan Zheng goto error; 23472b82032cSYan Zheng leaf = path->nodes[0]; 23482b82032cSYan Zheng btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2349b3b4aa74SDavid Sterba btrfs_release_path(path); 23502b82032cSYan Zheng continue; 23512b82032cSYan Zheng } 23522b82032cSYan Zheng 23532b82032cSYan Zheng btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 23542b82032cSYan Zheng if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 23552b82032cSYan Zheng key.type != BTRFS_DEV_ITEM_KEY) 23562b82032cSYan Zheng break; 23572b82032cSYan Zheng 23582b82032cSYan Zheng dev_item = btrfs_item_ptr(leaf, path->slots[0], 23592b82032cSYan Zheng struct btrfs_dev_item); 23602b82032cSYan Zheng devid = btrfs_device_id(leaf, dev_item); 2361410ba3a2SGeert Uytterhoeven read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 23622b82032cSYan Zheng BTRFS_UUID_SIZE); 23631473b24eSGeert Uytterhoeven read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 236444880fdcSAnand Jain BTRFS_FSID_SIZE); 23650b246afaSJeff Mahoney device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 236679787eaaSJeff Mahoney BUG_ON(!device); /* Logic 
error */ 23672b82032cSYan Zheng 23682b82032cSYan Zheng if (device->fs_devices->seeding) { 23692b82032cSYan Zheng btrfs_set_device_generation(leaf, dev_item, 23702b82032cSYan Zheng device->generation); 23712b82032cSYan Zheng btrfs_mark_buffer_dirty(leaf); 23722b82032cSYan Zheng } 23732b82032cSYan Zheng 23742b82032cSYan Zheng path->slots[0]++; 23752b82032cSYan Zheng goto next_slot; 23762b82032cSYan Zheng } 23772b82032cSYan Zheng ret = 0; 23782b82032cSYan Zheng error: 23792b82032cSYan Zheng btrfs_free_path(path); 23802b82032cSYan Zheng return ret; 23812b82032cSYan Zheng } 23822b82032cSYan Zheng 2383da353f6bSDavid Sterba int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2384788f20ebSChris Mason { 23855112febbSJeff Mahoney struct btrfs_root *root = fs_info->dev_root; 2386d5e2003cSJosef Bacik struct request_queue *q; 2387788f20ebSChris Mason struct btrfs_trans_handle *trans; 2388788f20ebSChris Mason struct btrfs_device *device; 2389788f20ebSChris Mason struct block_device *bdev; 2390788f20ebSChris Mason struct list_head *devices; 23910b246afaSJeff Mahoney struct super_block *sb = fs_info->sb; 2392606686eeSJosef Bacik struct rcu_string *name; 23933c1dbdf5SAnand Jain u64 tmp; 23942b82032cSYan Zheng int seeding_dev = 0; 2395788f20ebSChris Mason int ret = 0; 23967132a262SAnand Jain bool unlocked = false; 2397788f20ebSChris Mason 2398bc98a42cSDavid Howells if (sb_rdonly(sb) && !fs_info->fs_devices->seeding) 2399f8c5d0b4SLiu Bo return -EROFS; 2400788f20ebSChris Mason 2401a5d16333SLi Zefan bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 24020b246afaSJeff Mahoney fs_info->bdev_holder); 24037f59203aSJosef Bacik if (IS_ERR(bdev)) 24047f59203aSJosef Bacik return PTR_ERR(bdev); 2405a2135011SChris Mason 24060b246afaSJeff Mahoney if (fs_info->fs_devices->seeding) { 24072b82032cSYan Zheng seeding_dev = 1; 24082b82032cSYan Zheng down_write(&sb->s_umount); 24092b82032cSYan Zheng mutex_lock(&uuid_mutex); 24102b82032cSYan Zheng } 24112b82032cSYan Zheng 24128c8bee1dSChris Mason filemap_write_and_wait(bdev->bd_inode->i_mapping); 2413a2135011SChris Mason 24140b246afaSJeff Mahoney devices = &fs_info->fs_devices->devices; 2415d25628bdSLiu Bo 24160b246afaSJeff Mahoney mutex_lock(&fs_info->fs_devices->device_list_mutex); 2417c6e30871SQinghuang Feng list_for_each_entry(device, devices, dev_list) { 2418788f20ebSChris Mason if (device->bdev == bdev) { 2419788f20ebSChris Mason ret = -EEXIST; 2420d25628bdSLiu Bo mutex_unlock( 24210b246afaSJeff Mahoney &fs_info->fs_devices->device_list_mutex); 24222b82032cSYan Zheng goto error; 2423788f20ebSChris Mason } 2424788f20ebSChris Mason } 24250b246afaSJeff Mahoney mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2426788f20ebSChris Mason 24270b246afaSJeff Mahoney device = btrfs_alloc_device(fs_info, NULL, NULL); 242812bd2fc0SIlya Dryomov if (IS_ERR(device)) { 2429788f20ebSChris Mason /* we can safely leave the fs_devices entry around */ 243012bd2fc0SIlya Dryomov ret = PTR_ERR(device); 24312b82032cSYan Zheng goto error; 2432788f20ebSChris Mason } 2433788f20ebSChris Mason 243478f2c9e6SDavid Sterba name = rcu_string_strdup(device_path, GFP_KERNEL); 2435606686eeSJosef Bacik if (!name) { 24362b82032cSYan Zheng ret = -ENOMEM; 24375c4cf6c9SDavid Sterba goto error_free_device; 2438788f20ebSChris Mason } 2439606686eeSJosef Bacik rcu_assign_pointer(device->name, name); 24402b82032cSYan Zheng 2441a22285a6SYan, Zheng trans = btrfs_start_transaction(root, 0); 244298d5dc13STsutomu Itoh if (IS_ERR(trans)) { 244398d5dc13STsutomu Itoh ret = 
PTR_ERR(trans); 24445c4cf6c9SDavid Sterba goto error_free_device; 244598d5dc13STsutomu Itoh } 244698d5dc13STsutomu Itoh 2447d5e2003cSJosef Bacik q = bdev_get_queue(bdev); 2448ebbede42SAnand Jain set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 24492b82032cSYan Zheng device->generation = trans->transid; 24500b246afaSJeff Mahoney device->io_width = fs_info->sectorsize; 24510b246afaSJeff Mahoney device->io_align = fs_info->sectorsize; 24520b246afaSJeff Mahoney device->sector_size = fs_info->sectorsize; 24537dfb8be1SNikolay Borisov device->total_bytes = round_down(i_size_read(bdev->bd_inode), 24547dfb8be1SNikolay Borisov fs_info->sectorsize); 24552cc3c559SYan Zheng device->disk_total_bytes = device->total_bytes; 2456935e5cc9SMiao Xie device->commit_total_bytes = device->total_bytes; 2457fb456252SJeff Mahoney device->fs_info = fs_info; 2458788f20ebSChris Mason device->bdev = bdev; 2459e12c9621SAnand Jain set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2460401e29c1SAnand Jain clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2461fb01aa85SIlya Dryomov device->mode = FMODE_EXCL; 246227087f37SStefan Behrens device->dev_stats_valid = 1; 24639f6d2510SDavid Sterba set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2464325cd4baSZheng Yan 24652b82032cSYan Zheng if (seeding_dev) { 24661751e8a6SLinus Torvalds sb->s_flags &= ~SB_RDONLY; 24672ff7e61eSJeff Mahoney ret = btrfs_prepare_sprout(fs_info); 2468d31c32f6SAnand Jain if (ret) { 2469d31c32f6SAnand Jain btrfs_abort_transaction(trans, ret); 2470d31c32f6SAnand Jain goto error_trans; 2471d31c32f6SAnand Jain } 24722b82032cSYan Zheng } 24732b82032cSYan Zheng 24740b246afaSJeff Mahoney device->fs_devices = fs_info->fs_devices; 2475e5e9a520SChris Mason 24760b246afaSJeff Mahoney mutex_lock(&fs_info->fs_devices->device_list_mutex); 247734441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 24780b246afaSJeff Mahoney list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices); 24792b82032cSYan Zheng list_add(&device->dev_alloc_list, 24800b246afaSJeff Mahoney &fs_info->fs_devices->alloc_list); 24810b246afaSJeff Mahoney fs_info->fs_devices->num_devices++; 24820b246afaSJeff Mahoney fs_info->fs_devices->open_devices++; 24830b246afaSJeff Mahoney fs_info->fs_devices->rw_devices++; 24840b246afaSJeff Mahoney fs_info->fs_devices->total_devices++; 24850b246afaSJeff Mahoney fs_info->fs_devices->total_rw_bytes += device->total_bytes; 24862b82032cSYan Zheng 2487a5ed45f8SNikolay Borisov atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 24882bf64758SJosef Bacik 2489e884f4f0SAnand Jain if (!blk_queue_nonrot(q)) 24900b246afaSJeff Mahoney fs_info->fs_devices->rotating = 1; 2491c289811cSChris Mason 24920b246afaSJeff Mahoney tmp = btrfs_super_total_bytes(fs_info->super_copy); 24930b246afaSJeff Mahoney btrfs_set_super_total_bytes(fs_info->super_copy, 24947dfb8be1SNikolay Borisov round_down(tmp + device->total_bytes, fs_info->sectorsize)); 2495788f20ebSChris Mason 24960b246afaSJeff Mahoney tmp = btrfs_super_num_devices(fs_info->super_copy); 24970b246afaSJeff Mahoney btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1); 24980d39376aSAnand Jain 24990d39376aSAnand Jain /* add sysfs device entry */ 25000b246afaSJeff Mahoney btrfs_sysfs_add_device_link(fs_info->fs_devices, device); 25010d39376aSAnand Jain 25022196d6e8SMiao Xie /* 25032196d6e8SMiao Xie * we've got more storage, clear any full flags on the space 25042196d6e8SMiao Xie * infos 25052196d6e8SMiao Xie */ 25060b246afaSJeff Mahoney btrfs_clear_space_info_full(fs_info); 
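	/*
	 * The unlocks just below mirror the nesting taken above: the device
	 * list mutex was acquired first with chunk_mutex nested inside it,
	 * so chunk_mutex is dropped first. The list insertion, the counter
	 * and free_chunk_space updates, the super block totals and the
	 * sysfs link were all done while both locks were held.
	 */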
25072196d6e8SMiao Xie 250834441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 25090b246afaSJeff Mahoney mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2510788f20ebSChris Mason 25112b82032cSYan Zheng if (seeding_dev) { 251234441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 2513e4a4dce7SDavid Sterba ret = init_first_rw_device(trans, fs_info); 251434441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 2515005d6427SDavid Sterba if (ret) { 251666642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 2517d31c32f6SAnand Jain goto error_sysfs; 2518005d6427SDavid Sterba } 25192196d6e8SMiao Xie } 25202196d6e8SMiao Xie 2521c74a0b02SAnand Jain ret = btrfs_add_dev_item(trans, fs_info, device); 25222196d6e8SMiao Xie if (ret) { 252366642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 2524d31c32f6SAnand Jain goto error_sysfs; 25252196d6e8SMiao Xie } 25262196d6e8SMiao Xie 25272196d6e8SMiao Xie if (seeding_dev) { 25282196d6e8SMiao Xie char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; 25292196d6e8SMiao Xie 25300b246afaSJeff Mahoney ret = btrfs_finish_sprout(trans, fs_info); 2531005d6427SDavid Sterba if (ret) { 253266642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 2533d31c32f6SAnand Jain goto error_sysfs; 2534005d6427SDavid Sterba } 2535b2373f25SAnand Jain 2536b2373f25SAnand Jain /* Sprouting would change fsid of the mounted root, 2537b2373f25SAnand Jain * so rename the fsid on the sysfs 2538b2373f25SAnand Jain */ 2539b2373f25SAnand Jain snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", 25400b246afaSJeff Mahoney fs_info->fsid); 25410b246afaSJeff Mahoney if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf)) 25420b246afaSJeff Mahoney btrfs_warn(fs_info, 2543f14d104dSDavid Sterba "sysfs: failed to create fsid for sprout"); 2544005d6427SDavid Sterba } 25452b82032cSYan Zheng 25463a45bb20SJeff Mahoney ret = btrfs_commit_transaction(trans); 25472b82032cSYan Zheng 25482b82032cSYan Zheng if (seeding_dev) { 25492b82032cSYan Zheng mutex_unlock(&uuid_mutex); 25502b82032cSYan Zheng up_write(&sb->s_umount); 25517132a262SAnand Jain unlocked = true; 25522b82032cSYan Zheng 255379787eaaSJeff Mahoney if (ret) /* transaction commit */ 255479787eaaSJeff Mahoney return ret; 255579787eaaSJeff Mahoney 25562ff7e61eSJeff Mahoney ret = btrfs_relocate_sys_chunks(fs_info); 255779787eaaSJeff Mahoney if (ret < 0) 25580b246afaSJeff Mahoney btrfs_handle_fs_error(fs_info, ret, 25595d163e0eSJeff Mahoney "Failed to relocate sys chunks after device initialization. 
This can be fixed using the \"btrfs balance\" command."); 2560671415b7SMiao Xie trans = btrfs_attach_transaction(root); 2561671415b7SMiao Xie if (IS_ERR(trans)) { 2562671415b7SMiao Xie if (PTR_ERR(trans) == -ENOENT) 2563671415b7SMiao Xie return 0; 25647132a262SAnand Jain ret = PTR_ERR(trans); 25657132a262SAnand Jain trans = NULL; 25667132a262SAnand Jain goto error_sysfs; 2567671415b7SMiao Xie } 25683a45bb20SJeff Mahoney ret = btrfs_commit_transaction(trans); 25692b82032cSYan Zheng } 2570c9e9f97bSIlya Dryomov 25715a1972bdSQu Wenruo /* Update ctime/mtime for libblkid */ 25725a1972bdSQu Wenruo update_dev_time(device_path); 2573788f20ebSChris Mason return ret; 257479787eaaSJeff Mahoney 2575d31c32f6SAnand Jain error_sysfs: 2576d31c32f6SAnand Jain btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); 257779787eaaSJeff Mahoney error_trans: 25780af2c4bfSAnand Jain if (seeding_dev) 25791751e8a6SLinus Torvalds sb->s_flags |= SB_RDONLY; 25807132a262SAnand Jain if (trans) 25813a45bb20SJeff Mahoney btrfs_end_transaction(trans); 25825c4cf6c9SDavid Sterba error_free_device: 258355de4803SDavid Sterba free_device(device); 25842b82032cSYan Zheng error: 2585e525fd89STejun Heo blkdev_put(bdev, FMODE_EXCL); 25867132a262SAnand Jain if (seeding_dev && !unlocked) { 25872b82032cSYan Zheng mutex_unlock(&uuid_mutex); 25882b82032cSYan Zheng up_write(&sb->s_umount); 25892b82032cSYan Zheng } 2590c9e9f97bSIlya Dryomov return ret; 2591788f20ebSChris Mason } 2592788f20ebSChris Mason 25932ff7e61eSJeff Mahoney int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 2594da353f6bSDavid Sterba const char *device_path, 25951c43366dSMiao Xie struct btrfs_device *srcdev, 2596e93c89c1SStefan Behrens struct btrfs_device **device_out) 2597e93c89c1SStefan Behrens { 2598e93c89c1SStefan Behrens struct btrfs_device *device; 2599e93c89c1SStefan Behrens struct block_device *bdev; 2600e93c89c1SStefan Behrens struct list_head *devices; 2601e93c89c1SStefan Behrens struct rcu_string *name; 260212bd2fc0SIlya Dryomov u64 devid = BTRFS_DEV_REPLACE_DEVID; 2603e93c89c1SStefan Behrens int ret = 0; 2604e93c89c1SStefan Behrens 2605e93c89c1SStefan Behrens *device_out = NULL; 26061c43366dSMiao Xie if (fs_info->fs_devices->seeding) { 26071c43366dSMiao Xie btrfs_err(fs_info, "the filesystem is a seed filesystem!"); 2608e93c89c1SStefan Behrens return -EINVAL; 26091c43366dSMiao Xie } 2610e93c89c1SStefan Behrens 2611e93c89c1SStefan Behrens bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2612e93c89c1SStefan Behrens fs_info->bdev_holder); 26131c43366dSMiao Xie if (IS_ERR(bdev)) { 26141c43366dSMiao Xie btrfs_err(fs_info, "target device %s is invalid!", device_path); 2615e93c89c1SStefan Behrens return PTR_ERR(bdev); 26161c43366dSMiao Xie } 2617e93c89c1SStefan Behrens 2618e93c89c1SStefan Behrens filemap_write_and_wait(bdev->bd_inode->i_mapping); 2619e93c89c1SStefan Behrens 2620e93c89c1SStefan Behrens devices = &fs_info->fs_devices->devices; 2621e93c89c1SStefan Behrens list_for_each_entry(device, devices, dev_list) { 2622e93c89c1SStefan Behrens if (device->bdev == bdev) { 26235d163e0eSJeff Mahoney btrfs_err(fs_info, 26245d163e0eSJeff Mahoney "target device is in the filesystem!"); 2625e93c89c1SStefan Behrens ret = -EEXIST; 2626e93c89c1SStefan Behrens goto error; 2627e93c89c1SStefan Behrens } 2628e93c89c1SStefan Behrens } 2629e93c89c1SStefan Behrens 26301c43366dSMiao Xie 26317cc8e58dSMiao Xie if (i_size_read(bdev->bd_inode) < 26327cc8e58dSMiao Xie btrfs_device_get_total_bytes(srcdev)) { 26335d163e0eSJeff Mahoney btrfs_err(fs_info, 
26345d163e0eSJeff Mahoney "target device is smaller than source device!"); 26351c43366dSMiao Xie ret = -EINVAL; 26361c43366dSMiao Xie goto error; 26371c43366dSMiao Xie } 26381c43366dSMiao Xie 26391c43366dSMiao Xie 264012bd2fc0SIlya Dryomov device = btrfs_alloc_device(NULL, &devid, NULL); 264112bd2fc0SIlya Dryomov if (IS_ERR(device)) { 264212bd2fc0SIlya Dryomov ret = PTR_ERR(device); 2643e93c89c1SStefan Behrens goto error; 2644e93c89c1SStefan Behrens } 2645e93c89c1SStefan Behrens 26466165572cSDavid Sterba name = rcu_string_strdup(device_path, GFP_KERNEL); 2647e93c89c1SStefan Behrens if (!name) { 264855de4803SDavid Sterba free_device(device); 2649e93c89c1SStefan Behrens ret = -ENOMEM; 2650e93c89c1SStefan Behrens goto error; 2651e93c89c1SStefan Behrens } 2652e93c89c1SStefan Behrens rcu_assign_pointer(device->name, name); 2653e93c89c1SStefan Behrens 26540b246afaSJeff Mahoney mutex_lock(&fs_info->fs_devices->device_list_mutex); 2655ebbede42SAnand Jain set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2656e93c89c1SStefan Behrens device->generation = 0; 26570b246afaSJeff Mahoney device->io_width = fs_info->sectorsize; 26580b246afaSJeff Mahoney device->io_align = fs_info->sectorsize; 26590b246afaSJeff Mahoney device->sector_size = fs_info->sectorsize; 26607cc8e58dSMiao Xie device->total_bytes = btrfs_device_get_total_bytes(srcdev); 26617cc8e58dSMiao Xie device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); 26627cc8e58dSMiao Xie device->bytes_used = btrfs_device_get_bytes_used(srcdev); 2663935e5cc9SMiao Xie ASSERT(list_empty(&srcdev->resized_list)); 2664935e5cc9SMiao Xie device->commit_total_bytes = srcdev->commit_total_bytes; 2665ce7213c7SMiao Xie device->commit_bytes_used = device->bytes_used; 2666fb456252SJeff Mahoney device->fs_info = fs_info; 2667e93c89c1SStefan Behrens device->bdev = bdev; 2668e12c9621SAnand Jain set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2669401e29c1SAnand Jain set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2670e93c89c1SStefan Behrens device->mode = FMODE_EXCL; 267127087f37SStefan Behrens device->dev_stats_valid = 1; 26729f6d2510SDavid Sterba set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2673e93c89c1SStefan Behrens device->fs_devices = fs_info->fs_devices; 2674e93c89c1SStefan Behrens list_add(&device->dev_list, &fs_info->fs_devices->devices); 2675e93c89c1SStefan Behrens fs_info->fs_devices->num_devices++; 2676e93c89c1SStefan Behrens fs_info->fs_devices->open_devices++; 26770b246afaSJeff Mahoney mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2678e93c89c1SStefan Behrens 2679e93c89c1SStefan Behrens *device_out = device; 2680e93c89c1SStefan Behrens return ret; 2681e93c89c1SStefan Behrens 2682e93c89c1SStefan Behrens error: 2683e93c89c1SStefan Behrens blkdev_put(bdev, FMODE_EXCL); 2684e93c89c1SStefan Behrens return ret; 2685e93c89c1SStefan Behrens } 2686e93c89c1SStefan Behrens 2687e93c89c1SStefan Behrens void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 2688e93c89c1SStefan Behrens struct btrfs_device *tgtdev) 2689e93c89c1SStefan Behrens { 2690da17066cSJeff Mahoney u32 sectorsize = fs_info->sectorsize; 2691da17066cSJeff Mahoney 2692e93c89c1SStefan Behrens WARN_ON(fs_info->fs_devices->rw_devices == 0); 2693da17066cSJeff Mahoney tgtdev->io_width = sectorsize; 2694da17066cSJeff Mahoney tgtdev->io_align = sectorsize; 2695da17066cSJeff Mahoney tgtdev->sector_size = sectorsize; 2696fb456252SJeff Mahoney tgtdev->fs_info = fs_info; 2697e12c9621SAnand Jain set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 
&tgtdev->dev_state); 2698e93c89c1SStefan Behrens } 2699e93c89c1SStefan Behrens 2700d397712bSChris Mason static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 27010b86a832SChris Mason struct btrfs_device *device) 27020b86a832SChris Mason { 27030b86a832SChris Mason int ret; 27040b86a832SChris Mason struct btrfs_path *path; 27050b246afaSJeff Mahoney struct btrfs_root *root = device->fs_info->chunk_root; 27060b86a832SChris Mason struct btrfs_dev_item *dev_item; 27070b86a832SChris Mason struct extent_buffer *leaf; 27080b86a832SChris Mason struct btrfs_key key; 27090b86a832SChris Mason 27100b86a832SChris Mason path = btrfs_alloc_path(); 27110b86a832SChris Mason if (!path) 27120b86a832SChris Mason return -ENOMEM; 27130b86a832SChris Mason 27140b86a832SChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 27150b86a832SChris Mason key.type = BTRFS_DEV_ITEM_KEY; 27160b86a832SChris Mason key.offset = device->devid; 27170b86a832SChris Mason 27180b86a832SChris Mason ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 27190b86a832SChris Mason if (ret < 0) 27200b86a832SChris Mason goto out; 27210b86a832SChris Mason 27220b86a832SChris Mason if (ret > 0) { 27230b86a832SChris Mason ret = -ENOENT; 27240b86a832SChris Mason goto out; 27250b86a832SChris Mason } 27260b86a832SChris Mason 27270b86a832SChris Mason leaf = path->nodes[0]; 27280b86a832SChris Mason dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 27290b86a832SChris Mason 27300b86a832SChris Mason btrfs_set_device_id(leaf, dev_item, device->devid); 27310b86a832SChris Mason btrfs_set_device_type(leaf, dev_item, device->type); 27320b86a832SChris Mason btrfs_set_device_io_align(leaf, dev_item, device->io_align); 27330b86a832SChris Mason btrfs_set_device_io_width(leaf, dev_item, device->io_width); 27340b86a832SChris Mason btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 27357cc8e58dSMiao Xie btrfs_set_device_total_bytes(leaf, dev_item, 27367cc8e58dSMiao Xie btrfs_device_get_disk_total_bytes(device)); 27377cc8e58dSMiao Xie btrfs_set_device_bytes_used(leaf, dev_item, 27387cc8e58dSMiao Xie btrfs_device_get_bytes_used(device)); 27390b86a832SChris Mason btrfs_mark_buffer_dirty(leaf); 27400b86a832SChris Mason 27410b86a832SChris Mason out: 27420b86a832SChris Mason btrfs_free_path(path); 27430b86a832SChris Mason return ret; 27440b86a832SChris Mason } 27450b86a832SChris Mason 27462196d6e8SMiao Xie int btrfs_grow_device(struct btrfs_trans_handle *trans, 27478f18cf13SChris Mason struct btrfs_device *device, u64 new_size) 27488f18cf13SChris Mason { 27490b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = device->fs_info; 27500b246afaSJeff Mahoney struct btrfs_super_block *super_copy = fs_info->super_copy; 2751935e5cc9SMiao Xie struct btrfs_fs_devices *fs_devices; 27522196d6e8SMiao Xie u64 old_total; 27532196d6e8SMiao Xie u64 diff; 27548f18cf13SChris Mason 2755ebbede42SAnand Jain if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 27562b82032cSYan Zheng return -EACCES; 27572196d6e8SMiao Xie 27587dfb8be1SNikolay Borisov new_size = round_down(new_size, fs_info->sectorsize); 27597dfb8be1SNikolay Borisov 276034441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 27612196d6e8SMiao Xie old_total = btrfs_super_total_bytes(super_copy); 27620e4324a4SNikolay Borisov diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 27632196d6e8SMiao Xie 276463a212abSStefan Behrens if (new_size <= device->total_bytes || 2765401e29c1SAnand Jain test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 
{ 276634441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 27672b82032cSYan Zheng return -EINVAL; 27682196d6e8SMiao Xie } 27692b82032cSYan Zheng 27700b246afaSJeff Mahoney fs_devices = fs_info->fs_devices; 27718f18cf13SChris Mason 27727dfb8be1SNikolay Borisov btrfs_set_super_total_bytes(super_copy, 27737dfb8be1SNikolay Borisov round_down(old_total + diff, fs_info->sectorsize)); 27742b82032cSYan Zheng device->fs_devices->total_rw_bytes += diff; 27752b82032cSYan Zheng 27767cc8e58dSMiao Xie btrfs_device_set_total_bytes(device, new_size); 27777cc8e58dSMiao Xie btrfs_device_set_disk_total_bytes(device, new_size); 2778fb456252SJeff Mahoney btrfs_clear_space_info_full(device->fs_info); 2779935e5cc9SMiao Xie if (list_empty(&device->resized_list)) 2780935e5cc9SMiao Xie list_add_tail(&device->resized_list, 2781935e5cc9SMiao Xie &fs_devices->resized_devices); 278234441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 27834184ea7fSChris Mason 27848f18cf13SChris Mason return btrfs_update_device(trans, device); 27858f18cf13SChris Mason } 27868f18cf13SChris Mason 27878f18cf13SChris Mason static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 2788408fbf19SNikolay Borisov struct btrfs_fs_info *fs_info, u64 chunk_offset) 27898f18cf13SChris Mason { 27905b4aacefSJeff Mahoney struct btrfs_root *root = fs_info->chunk_root; 27918f18cf13SChris Mason int ret; 27928f18cf13SChris Mason struct btrfs_path *path; 27938f18cf13SChris Mason struct btrfs_key key; 27948f18cf13SChris Mason 27958f18cf13SChris Mason path = btrfs_alloc_path(); 27968f18cf13SChris Mason if (!path) 27978f18cf13SChris Mason return -ENOMEM; 27988f18cf13SChris Mason 2799408fbf19SNikolay Borisov key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 28008f18cf13SChris Mason key.offset = chunk_offset; 28018f18cf13SChris Mason key.type = BTRFS_CHUNK_ITEM_KEY; 28028f18cf13SChris Mason 28038f18cf13SChris Mason ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 280479787eaaSJeff Mahoney if (ret < 0) 280579787eaaSJeff Mahoney goto out; 280679787eaaSJeff Mahoney else if (ret > 0) { /* Logic error or corruption */ 28070b246afaSJeff Mahoney btrfs_handle_fs_error(fs_info, -ENOENT, 280879787eaaSJeff Mahoney "Failed lookup while freeing chunk."); 280979787eaaSJeff Mahoney ret = -ENOENT; 281079787eaaSJeff Mahoney goto out; 281179787eaaSJeff Mahoney } 28128f18cf13SChris Mason 28138f18cf13SChris Mason ret = btrfs_del_item(trans, root, path); 281479787eaaSJeff Mahoney if (ret < 0) 28150b246afaSJeff Mahoney btrfs_handle_fs_error(fs_info, ret, 281679787eaaSJeff Mahoney "Failed to delete chunk item."); 281779787eaaSJeff Mahoney out: 28188f18cf13SChris Mason btrfs_free_path(path); 281965a246c5STsutomu Itoh return ret; 28208f18cf13SChris Mason } 28218f18cf13SChris Mason 2822408fbf19SNikolay Borisov static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 28238f18cf13SChris Mason { 28240b246afaSJeff Mahoney struct btrfs_super_block *super_copy = fs_info->super_copy; 28258f18cf13SChris Mason struct btrfs_disk_key *disk_key; 28268f18cf13SChris Mason struct btrfs_chunk *chunk; 28278f18cf13SChris Mason u8 *ptr; 28288f18cf13SChris Mason int ret = 0; 28298f18cf13SChris Mason u32 num_stripes; 28308f18cf13SChris Mason u32 array_size; 28318f18cf13SChris Mason u32 len = 0; 28328f18cf13SChris Mason u32 cur; 28338f18cf13SChris Mason struct btrfs_key key; 28348f18cf13SChris Mason 283534441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 28368f18cf13SChris Mason array_size = btrfs_super_sys_array_size(super_copy); 28378f18cf13SChris Mason 
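	/*
	 * The loop below walks sys_chunk_array, which is a packed sequence
	 * of [btrfs_disk_key][btrfs_chunk + stripes] pairs, so each step
	 * advances by sizeof(*disk_key) plus btrfs_chunk_item_size(). The
	 * entry matching chunk_offset is dropped by memmove()ing the tail
	 * over it and shrinking sys_array_size; all other entries are
	 * skipped unchanged.
	 */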
28388f18cf13SChris Mason ptr = super_copy->sys_chunk_array; 28398f18cf13SChris Mason cur = 0; 28408f18cf13SChris Mason 28418f18cf13SChris Mason while (cur < array_size) { 28428f18cf13SChris Mason disk_key = (struct btrfs_disk_key *)ptr; 28438f18cf13SChris Mason btrfs_disk_key_to_cpu(&key, disk_key); 28448f18cf13SChris Mason 28458f18cf13SChris Mason len = sizeof(*disk_key); 28468f18cf13SChris Mason 28478f18cf13SChris Mason if (key.type == BTRFS_CHUNK_ITEM_KEY) { 28488f18cf13SChris Mason chunk = (struct btrfs_chunk *)(ptr + len); 28498f18cf13SChris Mason num_stripes = btrfs_stack_chunk_num_stripes(chunk); 28508f18cf13SChris Mason len += btrfs_chunk_item_size(num_stripes); 28518f18cf13SChris Mason } else { 28528f18cf13SChris Mason ret = -EIO; 28538f18cf13SChris Mason break; 28548f18cf13SChris Mason } 2855408fbf19SNikolay Borisov if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 28568f18cf13SChris Mason key.offset == chunk_offset) { 28578f18cf13SChris Mason memmove(ptr, ptr + len, array_size - (cur + len)); 28588f18cf13SChris Mason array_size -= len; 28598f18cf13SChris Mason btrfs_set_super_sys_array_size(super_copy, array_size); 28608f18cf13SChris Mason } else { 28618f18cf13SChris Mason ptr += len; 28628f18cf13SChris Mason cur += len; 28638f18cf13SChris Mason } 28648f18cf13SChris Mason } 286534441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 28668f18cf13SChris Mason return ret; 28678f18cf13SChris Mason } 28688f18cf13SChris Mason 2869592d92eeSLiu Bo static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, 2870592d92eeSLiu Bo u64 logical, u64 length) 2871592d92eeSLiu Bo { 2872592d92eeSLiu Bo struct extent_map_tree *em_tree; 2873592d92eeSLiu Bo struct extent_map *em; 2874592d92eeSLiu Bo 2875592d92eeSLiu Bo em_tree = &fs_info->mapping_tree.map_tree; 2876592d92eeSLiu Bo read_lock(&em_tree->lock); 2877592d92eeSLiu Bo em = lookup_extent_mapping(em_tree, logical, length); 2878592d92eeSLiu Bo read_unlock(&em_tree->lock); 2879592d92eeSLiu Bo 2880592d92eeSLiu Bo if (!em) { 2881592d92eeSLiu Bo btrfs_crit(fs_info, "unable to find logical %llu length %llu", 2882592d92eeSLiu Bo logical, length); 2883592d92eeSLiu Bo return ERR_PTR(-EINVAL); 2884592d92eeSLiu Bo } 2885592d92eeSLiu Bo 2886592d92eeSLiu Bo if (em->start > logical || em->start + em->len < logical) { 2887592d92eeSLiu Bo btrfs_crit(fs_info, 2888592d92eeSLiu Bo "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 2889592d92eeSLiu Bo logical, length, em->start, em->start + em->len); 2890592d92eeSLiu Bo free_extent_map(em); 2891592d92eeSLiu Bo return ERR_PTR(-EINVAL); 2892592d92eeSLiu Bo } 2893592d92eeSLiu Bo 2894592d92eeSLiu Bo /* callers are responsible for dropping em's ref. 
*/ 2895592d92eeSLiu Bo return em; 2896592d92eeSLiu Bo } 2897592d92eeSLiu Bo 289847ab2a6cSJosef Bacik int btrfs_remove_chunk(struct btrfs_trans_handle *trans, 28995b4aacefSJeff Mahoney struct btrfs_fs_info *fs_info, u64 chunk_offset) 290047ab2a6cSJosef Bacik { 290147ab2a6cSJosef Bacik struct extent_map *em; 290247ab2a6cSJosef Bacik struct map_lookup *map; 290347ab2a6cSJosef Bacik u64 dev_extent_len = 0; 290447ab2a6cSJosef Bacik int i, ret = 0; 29050b246afaSJeff Mahoney struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 290647ab2a6cSJosef Bacik 2907592d92eeSLiu Bo em = get_chunk_map(fs_info, chunk_offset, 1); 2908592d92eeSLiu Bo if (IS_ERR(em)) { 290947ab2a6cSJosef Bacik /* 291047ab2a6cSJosef Bacik * This is a logic error, but we don't want to just rely on the 2911bb7ab3b9SAdam Buchbinder * user having built with ASSERT enabled, so if ASSERT doesn't 291247ab2a6cSJosef Bacik * do anything we still error out. 291347ab2a6cSJosef Bacik */ 291447ab2a6cSJosef Bacik ASSERT(0); 2915592d92eeSLiu Bo return PTR_ERR(em); 291647ab2a6cSJosef Bacik } 291795617d69SJeff Mahoney map = em->map_lookup; 291834441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 29192ff7e61eSJeff Mahoney check_system_chunk(trans, fs_info, map->type); 292034441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 292147ab2a6cSJosef Bacik 292257ba4cb8SFilipe Manana /* 292357ba4cb8SFilipe Manana * Take the device list mutex to prevent races with the final phase of 292457ba4cb8SFilipe Manana * a device replace operation that replaces the device object associated 292557ba4cb8SFilipe Manana * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). 292657ba4cb8SFilipe Manana */ 292757ba4cb8SFilipe Manana mutex_lock(&fs_devices->device_list_mutex); 292847ab2a6cSJosef Bacik for (i = 0; i < map->num_stripes; i++) { 292947ab2a6cSJosef Bacik struct btrfs_device *device = map->stripes[i].dev; 293047ab2a6cSJosef Bacik ret = btrfs_free_dev_extent(trans, device, 293147ab2a6cSJosef Bacik map->stripes[i].physical, 293247ab2a6cSJosef Bacik &dev_extent_len); 293347ab2a6cSJosef Bacik if (ret) { 293457ba4cb8SFilipe Manana mutex_unlock(&fs_devices->device_list_mutex); 293566642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 293647ab2a6cSJosef Bacik goto out; 293747ab2a6cSJosef Bacik } 293847ab2a6cSJosef Bacik 293947ab2a6cSJosef Bacik if (device->bytes_used > 0) { 294034441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 294147ab2a6cSJosef Bacik btrfs_device_set_bytes_used(device, 294247ab2a6cSJosef Bacik device->bytes_used - dev_extent_len); 2943a5ed45f8SNikolay Borisov atomic64_add(dev_extent_len, &fs_info->free_chunk_space); 29440b246afaSJeff Mahoney btrfs_clear_space_info_full(fs_info); 294534441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 294647ab2a6cSJosef Bacik } 294747ab2a6cSJosef Bacik 294847ab2a6cSJosef Bacik if (map->stripes[i].dev) { 294947ab2a6cSJosef Bacik ret = btrfs_update_device(trans, map->stripes[i].dev); 295047ab2a6cSJosef Bacik if (ret) { 295157ba4cb8SFilipe Manana mutex_unlock(&fs_devices->device_list_mutex); 295266642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 295347ab2a6cSJosef Bacik goto out; 295447ab2a6cSJosef Bacik } 295547ab2a6cSJosef Bacik } 295647ab2a6cSJosef Bacik } 295757ba4cb8SFilipe Manana mutex_unlock(&fs_devices->device_list_mutex); 295857ba4cb8SFilipe Manana 2959408fbf19SNikolay Borisov ret = btrfs_free_chunk(trans, fs_info, chunk_offset); 296047ab2a6cSJosef Bacik if (ret) { 296166642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 296247ab2a6cSJosef Bacik goto out; 
296347ab2a6cSJosef Bacik } 296447ab2a6cSJosef Bacik 29656bccf3abSJeff Mahoney trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 296647ab2a6cSJosef Bacik 296747ab2a6cSJosef Bacik if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2968408fbf19SNikolay Borisov ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 296947ab2a6cSJosef Bacik if (ret) { 297066642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 297147ab2a6cSJosef Bacik goto out; 297247ab2a6cSJosef Bacik } 297347ab2a6cSJosef Bacik } 297447ab2a6cSJosef Bacik 29756bccf3abSJeff Mahoney ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em); 297647ab2a6cSJosef Bacik if (ret) { 297766642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 297847ab2a6cSJosef Bacik goto out; 297947ab2a6cSJosef Bacik } 298047ab2a6cSJosef Bacik 298147ab2a6cSJosef Bacik out: 298247ab2a6cSJosef Bacik /* once for us */ 298347ab2a6cSJosef Bacik free_extent_map(em); 29848f18cf13SChris Mason return ret; 29858f18cf13SChris Mason } 29868f18cf13SChris Mason 29875b4aacefSJeff Mahoney static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 29888f18cf13SChris Mason { 29895b4aacefSJeff Mahoney struct btrfs_root *root = fs_info->chunk_root; 299019c4d2f9SChris Mason struct btrfs_trans_handle *trans; 29918f18cf13SChris Mason int ret; 29928f18cf13SChris Mason 299367c5e7d4SFilipe Manana /* 299467c5e7d4SFilipe Manana * Prevent races with automatic removal of unused block groups. 299567c5e7d4SFilipe Manana * After we relocate and before we remove the chunk with offset 299667c5e7d4SFilipe Manana * chunk_offset, automatic removal of the block group can kick in, 299767c5e7d4SFilipe Manana * resulting in a failure when calling btrfs_remove_chunk() below. 299867c5e7d4SFilipe Manana * 299967c5e7d4SFilipe Manana * Make sure to acquire this mutex before doing a tree search (dev 300067c5e7d4SFilipe Manana * or chunk trees) to find chunks. Otherwise the cleaner kthread might 300167c5e7d4SFilipe Manana * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 300267c5e7d4SFilipe Manana * we release the path used to search the chunk/dev tree and before 300367c5e7d4SFilipe Manana * the current task acquires this mutex and calls us. 
300467c5e7d4SFilipe Manana */ 30050b246afaSJeff Mahoney ASSERT(mutex_is_locked(&fs_info->delete_unused_bgs_mutex)); 300667c5e7d4SFilipe Manana 30070b246afaSJeff Mahoney ret = btrfs_can_relocate(fs_info, chunk_offset); 3008ba1bf481SJosef Bacik if (ret) 3009ba1bf481SJosef Bacik return -ENOSPC; 3010ba1bf481SJosef Bacik 30118f18cf13SChris Mason /* step one, relocate all the extents inside this chunk */ 30122ff7e61eSJeff Mahoney btrfs_scrub_pause(fs_info); 30130b246afaSJeff Mahoney ret = btrfs_relocate_block_group(fs_info, chunk_offset); 30142ff7e61eSJeff Mahoney btrfs_scrub_continue(fs_info); 3015a22285a6SYan, Zheng if (ret) 3016a22285a6SYan, Zheng return ret; 30178f18cf13SChris Mason 301819c4d2f9SChris Mason trans = btrfs_start_trans_remove_block_group(root->fs_info, 301919c4d2f9SChris Mason chunk_offset); 302019c4d2f9SChris Mason if (IS_ERR(trans)) { 302119c4d2f9SChris Mason ret = PTR_ERR(trans); 302219c4d2f9SChris Mason btrfs_handle_fs_error(root->fs_info, ret, NULL); 302319c4d2f9SChris Mason return ret; 302419c4d2f9SChris Mason } 30255d8eb6feSNaohiro Aota 302619c4d2f9SChris Mason /* 302719c4d2f9SChris Mason * step two, delete the device extents and the 302819c4d2f9SChris Mason * chunk tree entries 302919c4d2f9SChris Mason */ 30305b4aacefSJeff Mahoney ret = btrfs_remove_chunk(trans, fs_info, chunk_offset); 30313a45bb20SJeff Mahoney btrfs_end_transaction(trans); 303219c4d2f9SChris Mason return ret; 30338f18cf13SChris Mason } 30348f18cf13SChris Mason 30352ff7e61eSJeff Mahoney static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 30362b82032cSYan Zheng { 30370b246afaSJeff Mahoney struct btrfs_root *chunk_root = fs_info->chunk_root; 30382b82032cSYan Zheng struct btrfs_path *path; 30392b82032cSYan Zheng struct extent_buffer *leaf; 30402b82032cSYan Zheng struct btrfs_chunk *chunk; 30412b82032cSYan Zheng struct btrfs_key key; 30422b82032cSYan Zheng struct btrfs_key found_key; 30432b82032cSYan Zheng u64 chunk_type; 3044ba1bf481SJosef Bacik bool retried = false; 3045ba1bf481SJosef Bacik int failed = 0; 30462b82032cSYan Zheng int ret; 30472b82032cSYan Zheng 30482b82032cSYan Zheng path = btrfs_alloc_path(); 30492b82032cSYan Zheng if (!path) 30502b82032cSYan Zheng return -ENOMEM; 30512b82032cSYan Zheng 3052ba1bf481SJosef Bacik again: 30532b82032cSYan Zheng key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 30542b82032cSYan Zheng key.offset = (u64)-1; 30552b82032cSYan Zheng key.type = BTRFS_CHUNK_ITEM_KEY; 30562b82032cSYan Zheng 30572b82032cSYan Zheng while (1) { 30580b246afaSJeff Mahoney mutex_lock(&fs_info->delete_unused_bgs_mutex); 30592b82032cSYan Zheng ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 306067c5e7d4SFilipe Manana if (ret < 0) { 30610b246afaSJeff Mahoney mutex_unlock(&fs_info->delete_unused_bgs_mutex); 30622b82032cSYan Zheng goto error; 306367c5e7d4SFilipe Manana } 306479787eaaSJeff Mahoney BUG_ON(ret == 0); /* Corruption */ 30652b82032cSYan Zheng 30662b82032cSYan Zheng ret = btrfs_previous_item(chunk_root, path, key.objectid, 30672b82032cSYan Zheng key.type); 306867c5e7d4SFilipe Manana if (ret) 30690b246afaSJeff Mahoney mutex_unlock(&fs_info->delete_unused_bgs_mutex); 30702b82032cSYan Zheng if (ret < 0) 30712b82032cSYan Zheng goto error; 30722b82032cSYan Zheng if (ret > 0) 30732b82032cSYan Zheng break; 30742b82032cSYan Zheng 30752b82032cSYan Zheng leaf = path->nodes[0]; 30762b82032cSYan Zheng btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 30772b82032cSYan Zheng 30782b82032cSYan Zheng chunk = btrfs_item_ptr(leaf, path->slots[0], 30792b82032cSYan Zheng 
struct btrfs_chunk); 30802b82032cSYan Zheng chunk_type = btrfs_chunk_type(leaf, chunk); 3081b3b4aa74SDavid Sterba btrfs_release_path(path); 30822b82032cSYan Zheng 30832b82032cSYan Zheng if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 30840b246afaSJeff Mahoney ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3085ba1bf481SJosef Bacik if (ret == -ENOSPC) 3086ba1bf481SJosef Bacik failed++; 308714586651SHIMANGI SARAOGI else 308814586651SHIMANGI SARAOGI BUG_ON(ret); 30892b82032cSYan Zheng } 30900b246afaSJeff Mahoney mutex_unlock(&fs_info->delete_unused_bgs_mutex); 30912b82032cSYan Zheng 30922b82032cSYan Zheng if (found_key.offset == 0) 30932b82032cSYan Zheng break; 30942b82032cSYan Zheng key.offset = found_key.offset - 1; 30952b82032cSYan Zheng } 30962b82032cSYan Zheng ret = 0; 3097ba1bf481SJosef Bacik if (failed && !retried) { 3098ba1bf481SJosef Bacik failed = 0; 3099ba1bf481SJosef Bacik retried = true; 3100ba1bf481SJosef Bacik goto again; 3101fae7f21cSDulshani Gunawardhana } else if (WARN_ON(failed && retried)) { 3102ba1bf481SJosef Bacik ret = -ENOSPC; 3103ba1bf481SJosef Bacik } 31042b82032cSYan Zheng error: 31052b82032cSYan Zheng btrfs_free_path(path); 31062b82032cSYan Zheng return ret; 31072b82032cSYan Zheng } 31082b82032cSYan Zheng 3109*a6f93c71SLiu Bo /* 3110*a6f93c71SLiu Bo * return 1 : allocate a data chunk successfully, 3111*a6f93c71SLiu Bo * return <0: errors during allocating a data chunk, 3112*a6f93c71SLiu Bo * return 0 : no need to allocate a data chunk. 3113*a6f93c71SLiu Bo */ 3114*a6f93c71SLiu Bo static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 3115*a6f93c71SLiu Bo u64 chunk_offset) 3116*a6f93c71SLiu Bo { 3117*a6f93c71SLiu Bo struct btrfs_block_group_cache *cache; 3118*a6f93c71SLiu Bo u64 bytes_used; 3119*a6f93c71SLiu Bo u64 chunk_type; 3120*a6f93c71SLiu Bo 3121*a6f93c71SLiu Bo cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3122*a6f93c71SLiu Bo ASSERT(cache); 3123*a6f93c71SLiu Bo chunk_type = cache->flags; 3124*a6f93c71SLiu Bo btrfs_put_block_group(cache); 3125*a6f93c71SLiu Bo 3126*a6f93c71SLiu Bo if (chunk_type & BTRFS_BLOCK_GROUP_DATA) { 3127*a6f93c71SLiu Bo spin_lock(&fs_info->data_sinfo->lock); 3128*a6f93c71SLiu Bo bytes_used = fs_info->data_sinfo->bytes_used; 3129*a6f93c71SLiu Bo spin_unlock(&fs_info->data_sinfo->lock); 3130*a6f93c71SLiu Bo 3131*a6f93c71SLiu Bo if (!bytes_used) { 3132*a6f93c71SLiu Bo struct btrfs_trans_handle *trans; 3133*a6f93c71SLiu Bo int ret; 3134*a6f93c71SLiu Bo 3135*a6f93c71SLiu Bo trans = btrfs_join_transaction(fs_info->tree_root); 3136*a6f93c71SLiu Bo if (IS_ERR(trans)) 3137*a6f93c71SLiu Bo return PTR_ERR(trans); 3138*a6f93c71SLiu Bo 3139*a6f93c71SLiu Bo ret = btrfs_force_chunk_alloc(trans, fs_info, 3140*a6f93c71SLiu Bo BTRFS_BLOCK_GROUP_DATA); 3141*a6f93c71SLiu Bo btrfs_end_transaction(trans); 3142*a6f93c71SLiu Bo if (ret < 0) 3143*a6f93c71SLiu Bo return ret; 3144*a6f93c71SLiu Bo 3145*a6f93c71SLiu Bo return 1; 3146*a6f93c71SLiu Bo } 3147*a6f93c71SLiu Bo } 3148*a6f93c71SLiu Bo return 0; 3149*a6f93c71SLiu Bo } 3150*a6f93c71SLiu Bo 31516bccf3abSJeff Mahoney static int insert_balance_item(struct btrfs_fs_info *fs_info, 31520940ebf6SIlya Dryomov struct btrfs_balance_control *bctl) 31530940ebf6SIlya Dryomov { 31546bccf3abSJeff Mahoney struct btrfs_root *root = fs_info->tree_root; 31550940ebf6SIlya Dryomov struct btrfs_trans_handle *trans; 31560940ebf6SIlya Dryomov struct btrfs_balance_item *item; 31570940ebf6SIlya Dryomov struct btrfs_disk_balance_args disk_bargs; 31580940ebf6SIlya Dryomov struct btrfs_path *path; 
31590940ebf6SIlya Dryomov struct extent_buffer *leaf; 31600940ebf6SIlya Dryomov struct btrfs_key key; 31610940ebf6SIlya Dryomov int ret, err; 31620940ebf6SIlya Dryomov 31630940ebf6SIlya Dryomov path = btrfs_alloc_path(); 31640940ebf6SIlya Dryomov if (!path) 31650940ebf6SIlya Dryomov return -ENOMEM; 31660940ebf6SIlya Dryomov 31670940ebf6SIlya Dryomov trans = btrfs_start_transaction(root, 0); 31680940ebf6SIlya Dryomov if (IS_ERR(trans)) { 31690940ebf6SIlya Dryomov btrfs_free_path(path); 31700940ebf6SIlya Dryomov return PTR_ERR(trans); 31710940ebf6SIlya Dryomov } 31720940ebf6SIlya Dryomov 31730940ebf6SIlya Dryomov key.objectid = BTRFS_BALANCE_OBJECTID; 3174c479cb4fSDavid Sterba key.type = BTRFS_TEMPORARY_ITEM_KEY; 31750940ebf6SIlya Dryomov key.offset = 0; 31760940ebf6SIlya Dryomov 31770940ebf6SIlya Dryomov ret = btrfs_insert_empty_item(trans, root, path, &key, 31780940ebf6SIlya Dryomov sizeof(*item)); 31790940ebf6SIlya Dryomov if (ret) 31800940ebf6SIlya Dryomov goto out; 31810940ebf6SIlya Dryomov 31820940ebf6SIlya Dryomov leaf = path->nodes[0]; 31830940ebf6SIlya Dryomov item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 31840940ebf6SIlya Dryomov 3185b159fa28SDavid Sterba memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 31860940ebf6SIlya Dryomov 31870940ebf6SIlya Dryomov btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 31880940ebf6SIlya Dryomov btrfs_set_balance_data(leaf, item, &disk_bargs); 31890940ebf6SIlya Dryomov btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 31900940ebf6SIlya Dryomov btrfs_set_balance_meta(leaf, item, &disk_bargs); 31910940ebf6SIlya Dryomov btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 31920940ebf6SIlya Dryomov btrfs_set_balance_sys(leaf, item, &disk_bargs); 31930940ebf6SIlya Dryomov 31940940ebf6SIlya Dryomov btrfs_set_balance_flags(leaf, item, bctl->flags); 31950940ebf6SIlya Dryomov 31960940ebf6SIlya Dryomov btrfs_mark_buffer_dirty(leaf); 31970940ebf6SIlya Dryomov out: 31980940ebf6SIlya Dryomov btrfs_free_path(path); 31993a45bb20SJeff Mahoney err = btrfs_commit_transaction(trans); 32000940ebf6SIlya Dryomov if (err && !ret) 32010940ebf6SIlya Dryomov ret = err; 32020940ebf6SIlya Dryomov return ret; 32030940ebf6SIlya Dryomov } 32040940ebf6SIlya Dryomov 32056bccf3abSJeff Mahoney static int del_balance_item(struct btrfs_fs_info *fs_info) 32060940ebf6SIlya Dryomov { 32076bccf3abSJeff Mahoney struct btrfs_root *root = fs_info->tree_root; 32080940ebf6SIlya Dryomov struct btrfs_trans_handle *trans; 32090940ebf6SIlya Dryomov struct btrfs_path *path; 32100940ebf6SIlya Dryomov struct btrfs_key key; 32110940ebf6SIlya Dryomov int ret, err; 32120940ebf6SIlya Dryomov 32130940ebf6SIlya Dryomov path = btrfs_alloc_path(); 32140940ebf6SIlya Dryomov if (!path) 32150940ebf6SIlya Dryomov return -ENOMEM; 32160940ebf6SIlya Dryomov 32170940ebf6SIlya Dryomov trans = btrfs_start_transaction(root, 0); 32180940ebf6SIlya Dryomov if (IS_ERR(trans)) { 32190940ebf6SIlya Dryomov btrfs_free_path(path); 32200940ebf6SIlya Dryomov return PTR_ERR(trans); 32210940ebf6SIlya Dryomov } 32220940ebf6SIlya Dryomov 32230940ebf6SIlya Dryomov key.objectid = BTRFS_BALANCE_OBJECTID; 3224c479cb4fSDavid Sterba key.type = BTRFS_TEMPORARY_ITEM_KEY; 32250940ebf6SIlya Dryomov key.offset = 0; 32260940ebf6SIlya Dryomov 32270940ebf6SIlya Dryomov ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 32280940ebf6SIlya Dryomov if (ret < 0) 32290940ebf6SIlya Dryomov goto out; 32300940ebf6SIlya Dryomov if (ret > 0) { 32310940ebf6SIlya Dryomov ret = -ENOENT; 
32320940ebf6SIlya Dryomov goto out; 32330940ebf6SIlya Dryomov } 32340940ebf6SIlya Dryomov 32350940ebf6SIlya Dryomov ret = btrfs_del_item(trans, root, path); 32360940ebf6SIlya Dryomov out: 32370940ebf6SIlya Dryomov btrfs_free_path(path); 32383a45bb20SJeff Mahoney err = btrfs_commit_transaction(trans); 32390940ebf6SIlya Dryomov if (err && !ret) 32400940ebf6SIlya Dryomov ret = err; 32410940ebf6SIlya Dryomov return ret; 32420940ebf6SIlya Dryomov } 32430940ebf6SIlya Dryomov 3244c9e9f97bSIlya Dryomov /* 324559641015SIlya Dryomov * This is a heuristic used to reduce the number of chunks balanced on 324659641015SIlya Dryomov * resume after balance was interrupted. 324759641015SIlya Dryomov */ 324859641015SIlya Dryomov static void update_balance_args(struct btrfs_balance_control *bctl) 324959641015SIlya Dryomov { 325059641015SIlya Dryomov /* 325159641015SIlya Dryomov * Turn on soft mode for chunk types that were being converted. 325259641015SIlya Dryomov */ 325359641015SIlya Dryomov if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 325459641015SIlya Dryomov bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 325559641015SIlya Dryomov if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 325659641015SIlya Dryomov bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 325759641015SIlya Dryomov if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 325859641015SIlya Dryomov bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 325959641015SIlya Dryomov 326059641015SIlya Dryomov /* 326159641015SIlya Dryomov * Turn on the usage filter if it is not already used. The idea is 326259641015SIlya Dryomov * that chunks that we have already balanced should be 326359641015SIlya Dryomov * reasonably full. Don't do it for chunks that are being 326459641015SIlya Dryomov * converted - that will keep us from relocating unconverted 326559641015SIlya Dryomov * (albeit full) chunks. 326659641015SIlya Dryomov */ 326759641015SIlya Dryomov if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 3268bc309467SDavid Sterba !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 326959641015SIlya Dryomov !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 327059641015SIlya Dryomov bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 327159641015SIlya Dryomov bctl->data.usage = 90; 327259641015SIlya Dryomov } 327359641015SIlya Dryomov if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 3274bc309467SDavid Sterba !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 327559641015SIlya Dryomov !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 327659641015SIlya Dryomov bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 327759641015SIlya Dryomov bctl->sys.usage = 90; 327859641015SIlya Dryomov } 327959641015SIlya Dryomov if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 3280bc309467SDavid Sterba !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 328159641015SIlya Dryomov !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 328259641015SIlya Dryomov bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 328359641015SIlya Dryomov bctl->meta.usage = 90; 328459641015SIlya Dryomov } 328559641015SIlya Dryomov } 328659641015SIlya Dryomov 328759641015SIlya Dryomov /* 3288c9e9f97bSIlya Dryomov * Should be called with both balance and volume mutexes held to 3289c9e9f97bSIlya Dryomov * serialize other volume operations (add_dev/rm_dev/resize) with 3290c9e9f97bSIlya Dryomov * restriper. Same goes for unset_balance_control.
3291c9e9f97bSIlya Dryomov */ 3292c9e9f97bSIlya Dryomov static void set_balance_control(struct btrfs_balance_control *bctl) 3293c9e9f97bSIlya Dryomov { 3294c9e9f97bSIlya Dryomov struct btrfs_fs_info *fs_info = bctl->fs_info; 3295c9e9f97bSIlya Dryomov 3296c9e9f97bSIlya Dryomov BUG_ON(fs_info->balance_ctl); 3297c9e9f97bSIlya Dryomov 3298c9e9f97bSIlya Dryomov spin_lock(&fs_info->balance_lock); 3299c9e9f97bSIlya Dryomov fs_info->balance_ctl = bctl; 3300c9e9f97bSIlya Dryomov spin_unlock(&fs_info->balance_lock); 3301c9e9f97bSIlya Dryomov } 3302c9e9f97bSIlya Dryomov 3303c9e9f97bSIlya Dryomov static void unset_balance_control(struct btrfs_fs_info *fs_info) 3304c9e9f97bSIlya Dryomov { 3305c9e9f97bSIlya Dryomov struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3306c9e9f97bSIlya Dryomov 3307c9e9f97bSIlya Dryomov BUG_ON(!fs_info->balance_ctl); 3308c9e9f97bSIlya Dryomov 3309c9e9f97bSIlya Dryomov spin_lock(&fs_info->balance_lock); 3310c9e9f97bSIlya Dryomov fs_info->balance_ctl = NULL; 3311c9e9f97bSIlya Dryomov spin_unlock(&fs_info->balance_lock); 3312c9e9f97bSIlya Dryomov 3313c9e9f97bSIlya Dryomov kfree(bctl); 3314c9e9f97bSIlya Dryomov } 3315c9e9f97bSIlya Dryomov 3316ed25e9b2SIlya Dryomov /* 3317ed25e9b2SIlya Dryomov * Balance filters. Return 1 if chunk should be filtered out 3318ed25e9b2SIlya Dryomov * (should not be balanced). 3319ed25e9b2SIlya Dryomov */ 3320899c81eaSIlya Dryomov static int chunk_profiles_filter(u64 chunk_type, 3321ed25e9b2SIlya Dryomov struct btrfs_balance_args *bargs) 3322ed25e9b2SIlya Dryomov { 3323899c81eaSIlya Dryomov chunk_type = chunk_to_extended(chunk_type) & 3324899c81eaSIlya Dryomov BTRFS_EXTENDED_PROFILE_MASK; 3325ed25e9b2SIlya Dryomov 3326899c81eaSIlya Dryomov if (bargs->profiles & chunk_type) 3327ed25e9b2SIlya Dryomov return 0; 3328ed25e9b2SIlya Dryomov 3329ed25e9b2SIlya Dryomov return 1; 3330ed25e9b2SIlya Dryomov } 3331ed25e9b2SIlya Dryomov 3332dba72cb3SHolger Hoffstätte static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 33335ce5b3c0SIlya Dryomov struct btrfs_balance_args *bargs) 33345ce5b3c0SIlya Dryomov { 33355ce5b3c0SIlya Dryomov struct btrfs_block_group_cache *cache; 3336bc309467SDavid Sterba u64 chunk_used; 3337bc309467SDavid Sterba u64 user_thresh_min; 3338bc309467SDavid Sterba u64 user_thresh_max; 3339bc309467SDavid Sterba int ret = 1; 3340bc309467SDavid Sterba 3341bc309467SDavid Sterba cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3342bc309467SDavid Sterba chunk_used = btrfs_block_group_used(&cache->item); 3343bc309467SDavid Sterba 3344bc309467SDavid Sterba if (bargs->usage_min == 0) 3345bc309467SDavid Sterba user_thresh_min = 0; 3346bc309467SDavid Sterba else 3347bc309467SDavid Sterba user_thresh_min = div_factor_fine(cache->key.offset, 3348bc309467SDavid Sterba bargs->usage_min); 3349bc309467SDavid Sterba 3350bc309467SDavid Sterba if (bargs->usage_max == 0) 3351bc309467SDavid Sterba user_thresh_max = 1; 3352bc309467SDavid Sterba else if (bargs->usage_max > 100) 3353bc309467SDavid Sterba user_thresh_max = cache->key.offset; 3354bc309467SDavid Sterba else 3355bc309467SDavid Sterba user_thresh_max = div_factor_fine(cache->key.offset, 3356bc309467SDavid Sterba bargs->usage_max); 3357bc309467SDavid Sterba 3358bc309467SDavid Sterba if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3359bc309467SDavid Sterba ret = 0; 3360bc309467SDavid Sterba 3361bc309467SDavid Sterba btrfs_put_block_group(cache); 3362bc309467SDavid Sterba return ret; 3363bc309467SDavid Sterba } 3364bc309467SDavid Sterba 
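/*
 * Worked example (illustration): both usage filters turn their percentage
 * arguments into byte thresholds via div_factor_fine(), i.e. roughly
 * block-group-length * N / 100, aside from the 0 and over-100 special
 * cases. For a 1GiB data block group balanced with -dusage=30, user_thresh
 * in the filter below comes out near 322MB, and the chunk is selected for
 * balancing (filter returns 0) only while btrfs_block_group_used() stays
 * under that threshold.
 */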
3365dba72cb3SHolger Hoffstätte static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3366bc309467SDavid Sterba u64 chunk_offset, struct btrfs_balance_args *bargs) 3367bc309467SDavid Sterba { 3368bc309467SDavid Sterba struct btrfs_block_group_cache *cache; 33695ce5b3c0SIlya Dryomov u64 chunk_used, user_thresh; 33705ce5b3c0SIlya Dryomov int ret = 1; 33715ce5b3c0SIlya Dryomov 33725ce5b3c0SIlya Dryomov cache = btrfs_lookup_block_group(fs_info, chunk_offset); 33735ce5b3c0SIlya Dryomov chunk_used = btrfs_block_group_used(&cache->item); 33745ce5b3c0SIlya Dryomov 3375bc309467SDavid Sterba if (bargs->usage_min == 0) 33763e39cea6SIlya Dryomov user_thresh = 1; 3377a105bb88SIlya Dryomov else if (bargs->usage > 100) 3378a105bb88SIlya Dryomov user_thresh = cache->key.offset; 3379a105bb88SIlya Dryomov else 3380a105bb88SIlya Dryomov user_thresh = div_factor_fine(cache->key.offset, 3381a105bb88SIlya Dryomov bargs->usage); 3382a105bb88SIlya Dryomov 33835ce5b3c0SIlya Dryomov if (chunk_used < user_thresh) 33845ce5b3c0SIlya Dryomov ret = 0; 33855ce5b3c0SIlya Dryomov 33865ce5b3c0SIlya Dryomov btrfs_put_block_group(cache); 33875ce5b3c0SIlya Dryomov return ret; 33885ce5b3c0SIlya Dryomov } 33895ce5b3c0SIlya Dryomov 3390409d404bSIlya Dryomov static int chunk_devid_filter(struct extent_buffer *leaf, 3391409d404bSIlya Dryomov struct btrfs_chunk *chunk, 3392409d404bSIlya Dryomov struct btrfs_balance_args *bargs) 3393409d404bSIlya Dryomov { 3394409d404bSIlya Dryomov struct btrfs_stripe *stripe; 3395409d404bSIlya Dryomov int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3396409d404bSIlya Dryomov int i; 3397409d404bSIlya Dryomov 3398409d404bSIlya Dryomov for (i = 0; i < num_stripes; i++) { 3399409d404bSIlya Dryomov stripe = btrfs_stripe_nr(chunk, i); 3400409d404bSIlya Dryomov if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3401409d404bSIlya Dryomov return 0; 3402409d404bSIlya Dryomov } 3403409d404bSIlya Dryomov 3404409d404bSIlya Dryomov return 1; 3405409d404bSIlya Dryomov } 3406409d404bSIlya Dryomov 340794e60d5aSIlya Dryomov /* [pstart, pend) */ 340894e60d5aSIlya Dryomov static int chunk_drange_filter(struct extent_buffer *leaf, 340994e60d5aSIlya Dryomov struct btrfs_chunk *chunk, 341094e60d5aSIlya Dryomov struct btrfs_balance_args *bargs) 341194e60d5aSIlya Dryomov { 341294e60d5aSIlya Dryomov struct btrfs_stripe *stripe; 341394e60d5aSIlya Dryomov int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 341494e60d5aSIlya Dryomov u64 stripe_offset; 341594e60d5aSIlya Dryomov u64 stripe_length; 341694e60d5aSIlya Dryomov int factor; 341794e60d5aSIlya Dryomov int i; 341894e60d5aSIlya Dryomov 341994e60d5aSIlya Dryomov if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 342094e60d5aSIlya Dryomov return 0; 342194e60d5aSIlya Dryomov 342294e60d5aSIlya Dryomov if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 342353b381b3SDavid Woodhouse BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { 342453b381b3SDavid Woodhouse factor = num_stripes / 2; 342553b381b3SDavid Woodhouse } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { 342653b381b3SDavid Woodhouse factor = num_stripes - 1; 342753b381b3SDavid Woodhouse } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { 342853b381b3SDavid Woodhouse factor = num_stripes - 2; 342953b381b3SDavid Woodhouse } else { 343053b381b3SDavid Woodhouse factor = num_stripes; 343153b381b3SDavid Woodhouse } 343294e60d5aSIlya Dryomov 343394e60d5aSIlya Dryomov for (i = 0; i < num_stripes; i++) { 343494e60d5aSIlya Dryomov stripe = btrfs_stripe_nr(chunk, 
i); 343594e60d5aSIlya Dryomov if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 343694e60d5aSIlya Dryomov continue; 343794e60d5aSIlya Dryomov 343894e60d5aSIlya Dryomov stripe_offset = btrfs_stripe_offset(leaf, stripe); 343994e60d5aSIlya Dryomov stripe_length = btrfs_chunk_length(leaf, chunk); 3440b8b93addSDavid Sterba stripe_length = div_u64(stripe_length, factor); 344194e60d5aSIlya Dryomov 344294e60d5aSIlya Dryomov if (stripe_offset < bargs->pend && 344394e60d5aSIlya Dryomov stripe_offset + stripe_length > bargs->pstart) 344494e60d5aSIlya Dryomov return 0; 344594e60d5aSIlya Dryomov } 344694e60d5aSIlya Dryomov 344794e60d5aSIlya Dryomov return 1; 344894e60d5aSIlya Dryomov } 344994e60d5aSIlya Dryomov 3450ea67176aSIlya Dryomov /* [vstart, vend) */ 3451ea67176aSIlya Dryomov static int chunk_vrange_filter(struct extent_buffer *leaf, 3452ea67176aSIlya Dryomov struct btrfs_chunk *chunk, 3453ea67176aSIlya Dryomov u64 chunk_offset, 3454ea67176aSIlya Dryomov struct btrfs_balance_args *bargs) 3455ea67176aSIlya Dryomov { 3456ea67176aSIlya Dryomov if (chunk_offset < bargs->vend && 3457ea67176aSIlya Dryomov chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3458ea67176aSIlya Dryomov /* at least part of the chunk is inside this vrange */ 3459ea67176aSIlya Dryomov return 0; 3460ea67176aSIlya Dryomov 3461ea67176aSIlya Dryomov return 1; 3462ea67176aSIlya Dryomov } 3463ea67176aSIlya Dryomov 3464dee32d0aSGabríel Arthúr Pétursson static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3465dee32d0aSGabríel Arthúr Pétursson struct btrfs_chunk *chunk, 3466dee32d0aSGabríel Arthúr Pétursson struct btrfs_balance_args *bargs) 3467dee32d0aSGabríel Arthúr Pétursson { 3468dee32d0aSGabríel Arthúr Pétursson int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3469dee32d0aSGabríel Arthúr Pétursson 3470dee32d0aSGabríel Arthúr Pétursson if (bargs->stripes_min <= num_stripes 3471dee32d0aSGabríel Arthúr Pétursson && num_stripes <= bargs->stripes_max) 3472dee32d0aSGabríel Arthúr Pétursson return 0; 3473dee32d0aSGabríel Arthúr Pétursson 3474dee32d0aSGabríel Arthúr Pétursson return 1; 3475dee32d0aSGabríel Arthúr Pétursson } 3476dee32d0aSGabríel Arthúr Pétursson 3477899c81eaSIlya Dryomov static int chunk_soft_convert_filter(u64 chunk_type, 3478cfa4c961SIlya Dryomov struct btrfs_balance_args *bargs) 3479cfa4c961SIlya Dryomov { 3480cfa4c961SIlya Dryomov if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3481cfa4c961SIlya Dryomov return 0; 3482cfa4c961SIlya Dryomov 3483899c81eaSIlya Dryomov chunk_type = chunk_to_extended(chunk_type) & 3484899c81eaSIlya Dryomov BTRFS_EXTENDED_PROFILE_MASK; 3485cfa4c961SIlya Dryomov 3486899c81eaSIlya Dryomov if (bargs->target == chunk_type) 3487cfa4c961SIlya Dryomov return 1; 3488cfa4c961SIlya Dryomov 3489cfa4c961SIlya Dryomov return 0; 3490cfa4c961SIlya Dryomov } 3491cfa4c961SIlya Dryomov 34922ff7e61eSJeff Mahoney static int should_balance_chunk(struct btrfs_fs_info *fs_info, 3493f43ffb60SIlya Dryomov struct extent_buffer *leaf, 3494f43ffb60SIlya Dryomov struct btrfs_chunk *chunk, u64 chunk_offset) 3495f43ffb60SIlya Dryomov { 34960b246afaSJeff Mahoney struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3497f43ffb60SIlya Dryomov struct btrfs_balance_args *bargs = NULL; 3498f43ffb60SIlya Dryomov u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3499f43ffb60SIlya Dryomov 3500f43ffb60SIlya Dryomov /* type filter */ 3501f43ffb60SIlya Dryomov if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3502f43ffb60SIlya Dryomov (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 
3503f43ffb60SIlya Dryomov return 0; 3504f43ffb60SIlya Dryomov } 3505f43ffb60SIlya Dryomov 3506f43ffb60SIlya Dryomov if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3507f43ffb60SIlya Dryomov bargs = &bctl->data; 3508f43ffb60SIlya Dryomov else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3509f43ffb60SIlya Dryomov bargs = &bctl->sys; 3510f43ffb60SIlya Dryomov else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3511f43ffb60SIlya Dryomov bargs = &bctl->meta; 3512f43ffb60SIlya Dryomov 3513ed25e9b2SIlya Dryomov /* profiles filter */ 3514ed25e9b2SIlya Dryomov if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3515ed25e9b2SIlya Dryomov chunk_profiles_filter(chunk_type, bargs)) { 3516ed25e9b2SIlya Dryomov return 0; 3517ed25e9b2SIlya Dryomov } 3518ed25e9b2SIlya Dryomov 35195ce5b3c0SIlya Dryomov /* usage filter */ 35205ce5b3c0SIlya Dryomov if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 35210b246afaSJeff Mahoney chunk_usage_filter(fs_info, chunk_offset, bargs)) { 35225ce5b3c0SIlya Dryomov return 0; 3523bc309467SDavid Sterba } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 35240b246afaSJeff Mahoney chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3525bc309467SDavid Sterba return 0; 35265ce5b3c0SIlya Dryomov } 35275ce5b3c0SIlya Dryomov 3528409d404bSIlya Dryomov /* devid filter */ 3529409d404bSIlya Dryomov if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3530409d404bSIlya Dryomov chunk_devid_filter(leaf, chunk, bargs)) { 3531409d404bSIlya Dryomov return 0; 3532409d404bSIlya Dryomov } 3533409d404bSIlya Dryomov 353494e60d5aSIlya Dryomov /* drange filter, makes sense only with devid filter */ 353594e60d5aSIlya Dryomov if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3536e4ff5fb5SNikolay Borisov chunk_drange_filter(leaf, chunk, bargs)) { 353794e60d5aSIlya Dryomov return 0; 353894e60d5aSIlya Dryomov } 353994e60d5aSIlya Dryomov 3540ea67176aSIlya Dryomov /* vrange filter */ 3541ea67176aSIlya Dryomov if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3542ea67176aSIlya Dryomov chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3543ea67176aSIlya Dryomov return 0; 3544ea67176aSIlya Dryomov } 3545ea67176aSIlya Dryomov 3546dee32d0aSGabríel Arthúr Pétursson /* stripes filter */ 3547dee32d0aSGabríel Arthúr Pétursson if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3548dee32d0aSGabríel Arthúr Pétursson chunk_stripes_range_filter(leaf, chunk, bargs)) { 3549dee32d0aSGabríel Arthúr Pétursson return 0; 3550dee32d0aSGabríel Arthúr Pétursson } 3551dee32d0aSGabríel Arthúr Pétursson 3552cfa4c961SIlya Dryomov /* soft profile changing mode */ 3553cfa4c961SIlya Dryomov if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3554cfa4c961SIlya Dryomov chunk_soft_convert_filter(chunk_type, bargs)) { 3555cfa4c961SIlya Dryomov return 0; 3556cfa4c961SIlya Dryomov } 3557cfa4c961SIlya Dryomov 35587d824b6fSDavid Sterba /* 35597d824b6fSDavid Sterba * limited by count, must be the last filter 35607d824b6fSDavid Sterba */ 35617d824b6fSDavid Sterba if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 35627d824b6fSDavid Sterba if (bargs->limit == 0) 35637d824b6fSDavid Sterba return 0; 35647d824b6fSDavid Sterba else 35657d824b6fSDavid Sterba bargs->limit--; 356612907fc7SDavid Sterba } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 356712907fc7SDavid Sterba /* 356812907fc7SDavid Sterba * Same logic as the 'limit' filter; the minimum cannot be 356901327610SNicholas D Steeves * determined here because we do not have the global information 357012907fc7SDavid Sterba * about the count of all chunks that satisfy the filters. 
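	 * The minimum is enforced later in __btrfs_balance(), using the per-type
	 * chunk counts gathered during the counting pass.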
357112907fc7SDavid Sterba */ 357212907fc7SDavid Sterba if (bargs->limit_max == 0) 357312907fc7SDavid Sterba return 0; 357412907fc7SDavid Sterba else 357512907fc7SDavid Sterba bargs->limit_max--; 35767d824b6fSDavid Sterba } 35777d824b6fSDavid Sterba 3578f43ffb60SIlya Dryomov return 1; 3579f43ffb60SIlya Dryomov } 3580f43ffb60SIlya Dryomov 3581c9e9f97bSIlya Dryomov static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3582ec44a35cSChris Mason { 358319a39dceSIlya Dryomov struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3584c9e9f97bSIlya Dryomov struct btrfs_root *chunk_root = fs_info->chunk_root; 3585c9e9f97bSIlya Dryomov struct btrfs_root *dev_root = fs_info->dev_root; 3586c9e9f97bSIlya Dryomov struct list_head *devices; 3587ec44a35cSChris Mason struct btrfs_device *device; 3588ec44a35cSChris Mason u64 old_size; 3589ec44a35cSChris Mason u64 size_to_free; 359012907fc7SDavid Sterba u64 chunk_type; 3591f43ffb60SIlya Dryomov struct btrfs_chunk *chunk; 35925a488b9dSLiu Bo struct btrfs_path *path = NULL; 3593ec44a35cSChris Mason struct btrfs_key key; 3594ec44a35cSChris Mason struct btrfs_key found_key; 3595c9e9f97bSIlya Dryomov struct btrfs_trans_handle *trans; 3596f43ffb60SIlya Dryomov struct extent_buffer *leaf; 3597f43ffb60SIlya Dryomov int slot; 3598c9e9f97bSIlya Dryomov int ret; 3599c9e9f97bSIlya Dryomov int enospc_errors = 0; 360019a39dceSIlya Dryomov bool counting = true; 360112907fc7SDavid Sterba /* The single value limit and min/max limits use the same bytes in the */ 36027d824b6fSDavid Sterba u64 limit_data = bctl->data.limit; 36037d824b6fSDavid Sterba u64 limit_meta = bctl->meta.limit; 36047d824b6fSDavid Sterba u64 limit_sys = bctl->sys.limit; 360512907fc7SDavid Sterba u32 count_data = 0; 360612907fc7SDavid Sterba u32 count_meta = 0; 360712907fc7SDavid Sterba u32 count_sys = 0; 36082c9fe835SZhao Lei int chunk_reserved = 0; 3609ec44a35cSChris Mason 3610ec44a35cSChris Mason /* step one make some room on all the devices */ 3611c9e9f97bSIlya Dryomov devices = &fs_info->fs_devices->devices; 3612c6e30871SQinghuang Feng list_for_each_entry(device, devices, dev_list) { 36137cc8e58dSMiao Xie old_size = btrfs_device_get_total_bytes(device); 3614ec44a35cSChris Mason size_to_free = div_factor(old_size, 1); 3615ee22184bSByongho Lee size_to_free = min_t(u64, size_to_free, SZ_1M); 3616ebbede42SAnand Jain if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) || 36177cc8e58dSMiao Xie btrfs_device_get_total_bytes(device) - 36187cc8e58dSMiao Xie btrfs_device_get_bytes_used(device) > size_to_free || 3619401e29c1SAnand Jain test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 3620ec44a35cSChris Mason continue; 3621ec44a35cSChris Mason 3622ec44a35cSChris Mason ret = btrfs_shrink_device(device, old_size - size_to_free); 3623ba1bf481SJosef Bacik if (ret == -ENOSPC) 3624ba1bf481SJosef Bacik break; 36255a488b9dSLiu Bo if (ret) { 36265a488b9dSLiu Bo /* btrfs_shrink_device never returns ret > 0 */ 36275a488b9dSLiu Bo WARN_ON(ret > 0); 36285a488b9dSLiu Bo goto error; 36295a488b9dSLiu Bo } 3630ec44a35cSChris Mason 3631a22285a6SYan, Zheng trans = btrfs_start_transaction(dev_root, 0); 36325a488b9dSLiu Bo if (IS_ERR(trans)) { 36335a488b9dSLiu Bo ret = PTR_ERR(trans); 36345a488b9dSLiu Bo btrfs_info_in_rcu(fs_info, 36355a488b9dSLiu Bo "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", 36365a488b9dSLiu Bo rcu_str_deref(device->name), ret, 36375a488b9dSLiu Bo old_size, old_size - size_to_free); 36385a488b9dSLiu Bo goto error; 36395a488b9dSLiu 
Bo } 3640ec44a35cSChris Mason 3641ec44a35cSChris Mason ret = btrfs_grow_device(trans, device, old_size); 36425a488b9dSLiu Bo if (ret) { 36433a45bb20SJeff Mahoney btrfs_end_transaction(trans); 36445a488b9dSLiu Bo /* btrfs_grow_device never returns ret > 0 */ 36455a488b9dSLiu Bo WARN_ON(ret > 0); 36465a488b9dSLiu Bo btrfs_info_in_rcu(fs_info, 36475a488b9dSLiu Bo "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", 36485a488b9dSLiu Bo rcu_str_deref(device->name), ret, 36495a488b9dSLiu Bo old_size, old_size - size_to_free); 36505a488b9dSLiu Bo goto error; 36515a488b9dSLiu Bo } 3652ec44a35cSChris Mason 36533a45bb20SJeff Mahoney btrfs_end_transaction(trans); 3654ec44a35cSChris Mason } 3655ec44a35cSChris Mason 3656ec44a35cSChris Mason /* step two, relocate all the chunks */ 3657ec44a35cSChris Mason path = btrfs_alloc_path(); 365817e9f796SMark Fasheh if (!path) { 365917e9f796SMark Fasheh ret = -ENOMEM; 366017e9f796SMark Fasheh goto error; 366117e9f796SMark Fasheh } 366219a39dceSIlya Dryomov 366319a39dceSIlya Dryomov /* zero out stat counters */ 366419a39dceSIlya Dryomov spin_lock(&fs_info->balance_lock); 366519a39dceSIlya Dryomov memset(&bctl->stat, 0, sizeof(bctl->stat)); 366619a39dceSIlya Dryomov spin_unlock(&fs_info->balance_lock); 366719a39dceSIlya Dryomov again: 36687d824b6fSDavid Sterba if (!counting) { 366912907fc7SDavid Sterba /* 367012907fc7SDavid Sterba * The single value limit and min/max limits use the same bytes 367112907fc7SDavid Sterba * in the 367212907fc7SDavid Sterba */ 36737d824b6fSDavid Sterba bctl->data.limit = limit_data; 36747d824b6fSDavid Sterba bctl->meta.limit = limit_meta; 36757d824b6fSDavid Sterba bctl->sys.limit = limit_sys; 36767d824b6fSDavid Sterba } 3677ec44a35cSChris Mason key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3678ec44a35cSChris Mason key.offset = (u64)-1; 3679ec44a35cSChris Mason key.type = BTRFS_CHUNK_ITEM_KEY; 3680ec44a35cSChris Mason 3681ec44a35cSChris Mason while (1) { 368219a39dceSIlya Dryomov if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3683a7e99c69SIlya Dryomov atomic_read(&fs_info->balance_cancel_req)) { 3684837d5b6eSIlya Dryomov ret = -ECANCELED; 3685837d5b6eSIlya Dryomov goto error; 3686837d5b6eSIlya Dryomov } 3687837d5b6eSIlya Dryomov 368867c5e7d4SFilipe Manana mutex_lock(&fs_info->delete_unused_bgs_mutex); 3689ec44a35cSChris Mason ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 369067c5e7d4SFilipe Manana if (ret < 0) { 369167c5e7d4SFilipe Manana mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3692ec44a35cSChris Mason goto error; 369367c5e7d4SFilipe Manana } 3694ec44a35cSChris Mason 3695ec44a35cSChris Mason /* 3696ec44a35cSChris Mason * this shouldn't happen, it means the last relocate 3697ec44a35cSChris Mason * failed 3698ec44a35cSChris Mason */ 3699ec44a35cSChris Mason if (ret == 0) 3700c9e9f97bSIlya Dryomov BUG(); /* FIXME break ? 
*/ 3701ec44a35cSChris Mason 3702ec44a35cSChris Mason ret = btrfs_previous_item(chunk_root, path, 0, 3703ec44a35cSChris Mason BTRFS_CHUNK_ITEM_KEY); 3704c9e9f97bSIlya Dryomov if (ret) { 370567c5e7d4SFilipe Manana mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3706c9e9f97bSIlya Dryomov ret = 0; 3707ec44a35cSChris Mason break; 3708c9e9f97bSIlya Dryomov } 37097d9eb12cSChris Mason 3710f43ffb60SIlya Dryomov leaf = path->nodes[0]; 3711f43ffb60SIlya Dryomov slot = path->slots[0]; 3712f43ffb60SIlya Dryomov btrfs_item_key_to_cpu(leaf, &found_key, slot); 3713f43ffb60SIlya Dryomov 371467c5e7d4SFilipe Manana if (found_key.objectid != key.objectid) { 371567c5e7d4SFilipe Manana mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3716ec44a35cSChris Mason break; 371767c5e7d4SFilipe Manana } 37187d9eb12cSChris Mason 3719f43ffb60SIlya Dryomov chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 372012907fc7SDavid Sterba chunk_type = btrfs_chunk_type(leaf, chunk); 3721f43ffb60SIlya Dryomov 372219a39dceSIlya Dryomov if (!counting) { 372319a39dceSIlya Dryomov spin_lock(&fs_info->balance_lock); 372419a39dceSIlya Dryomov bctl->stat.considered++; 372519a39dceSIlya Dryomov spin_unlock(&fs_info->balance_lock); 372619a39dceSIlya Dryomov } 372719a39dceSIlya Dryomov 37282ff7e61eSJeff Mahoney ret = should_balance_chunk(fs_info, leaf, chunk, 3729f43ffb60SIlya Dryomov found_key.offset); 37302c9fe835SZhao Lei 3731b3b4aa74SDavid Sterba btrfs_release_path(path); 373267c5e7d4SFilipe Manana if (!ret) { 373367c5e7d4SFilipe Manana mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3734f43ffb60SIlya Dryomov goto loop; 373567c5e7d4SFilipe Manana } 3736f43ffb60SIlya Dryomov 373719a39dceSIlya Dryomov if (counting) { 373867c5e7d4SFilipe Manana mutex_unlock(&fs_info->delete_unused_bgs_mutex); 373919a39dceSIlya Dryomov spin_lock(&fs_info->balance_lock); 374019a39dceSIlya Dryomov bctl->stat.expected++; 374119a39dceSIlya Dryomov spin_unlock(&fs_info->balance_lock); 374212907fc7SDavid Sterba 374312907fc7SDavid Sterba if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 374412907fc7SDavid Sterba count_data++; 374512907fc7SDavid Sterba else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 374612907fc7SDavid Sterba count_sys++; 374712907fc7SDavid Sterba else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 374812907fc7SDavid Sterba count_meta++; 374912907fc7SDavid Sterba 375012907fc7SDavid Sterba goto loop; 375112907fc7SDavid Sterba } 375212907fc7SDavid Sterba 375312907fc7SDavid Sterba /* 375412907fc7SDavid Sterba * Apply limit_min filter, no need to check if the LIMITS 375512907fc7SDavid Sterba * filter is used, limit_min is 0 by default 375612907fc7SDavid Sterba */ 375712907fc7SDavid Sterba if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 375812907fc7SDavid Sterba count_data < bctl->data.limit_min) 375912907fc7SDavid Sterba || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 376012907fc7SDavid Sterba count_meta < bctl->meta.limit_min) 376112907fc7SDavid Sterba || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 376212907fc7SDavid Sterba count_sys < bctl->sys.limit_min)) { 376312907fc7SDavid Sterba mutex_unlock(&fs_info->delete_unused_bgs_mutex); 376419a39dceSIlya Dryomov goto loop; 376519a39dceSIlya Dryomov } 376619a39dceSIlya Dryomov 3767*a6f93c71SLiu Bo if (!chunk_reserved) { 3768*a6f93c71SLiu Bo /* 3769*a6f93c71SLiu Bo * We may be relocating the only data chunk we have, 3770*a6f93c71SLiu Bo * which could potentially end up with losing data's 3771*a6f93c71SLiu Bo * raid profile, so lets allocate an empty one in 3772*a6f93c71SLiu Bo * advance. 
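		 * btrfs_may_alloc_data_chunk() returns 1 when it had to allocate
		 * such a chunk and 0 when one was not needed.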
3773*a6f93c71SLiu Bo */ 3774*a6f93c71SLiu Bo ret = btrfs_may_alloc_data_chunk(fs_info, 3775*a6f93c71SLiu Bo found_key.offset); 37762c9fe835SZhao Lei if (ret < 0) { 37772c9fe835SZhao Lei mutex_unlock(&fs_info->delete_unused_bgs_mutex); 37782c9fe835SZhao Lei goto error; 3779*a6f93c71SLiu Bo } else if (ret == 1) { 37802c9fe835SZhao Lei chunk_reserved = 1; 37812c9fe835SZhao Lei } 3782*a6f93c71SLiu Bo } 37832c9fe835SZhao Lei 37845b4aacefSJeff Mahoney ret = btrfs_relocate_chunk(fs_info, found_key.offset); 378567c5e7d4SFilipe Manana mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3786508794ebSJosef Bacik if (ret && ret != -ENOSPC) 3787508794ebSJosef Bacik goto error; 378819a39dceSIlya Dryomov if (ret == -ENOSPC) { 3789c9e9f97bSIlya Dryomov enospc_errors++; 379019a39dceSIlya Dryomov } else { 379119a39dceSIlya Dryomov spin_lock(&fs_info->balance_lock); 379219a39dceSIlya Dryomov bctl->stat.completed++; 379319a39dceSIlya Dryomov spin_unlock(&fs_info->balance_lock); 379419a39dceSIlya Dryomov } 3795f43ffb60SIlya Dryomov loop: 3796795a3321SIlya Dryomov if (found_key.offset == 0) 3797795a3321SIlya Dryomov break; 3798ba1bf481SJosef Bacik key.offset = found_key.offset - 1; 3799ec44a35cSChris Mason } 3800c9e9f97bSIlya Dryomov 380119a39dceSIlya Dryomov if (counting) { 380219a39dceSIlya Dryomov btrfs_release_path(path); 380319a39dceSIlya Dryomov counting = false; 380419a39dceSIlya Dryomov goto again; 380519a39dceSIlya Dryomov } 3806ec44a35cSChris Mason error: 3807ec44a35cSChris Mason btrfs_free_path(path); 3808c9e9f97bSIlya Dryomov if (enospc_errors) { 3809efe120a0SFrank Holton btrfs_info(fs_info, "%d enospc errors during balance", 3810c9e9f97bSIlya Dryomov enospc_errors); 3811c9e9f97bSIlya Dryomov if (!ret) 3812c9e9f97bSIlya Dryomov ret = -ENOSPC; 3813c9e9f97bSIlya Dryomov } 3814c9e9f97bSIlya Dryomov 3815ec44a35cSChris Mason return ret; 3816ec44a35cSChris Mason } 3817ec44a35cSChris Mason 38180c460c0dSIlya Dryomov /** 38190c460c0dSIlya Dryomov * alloc_profile_is_valid - see if a given profile is valid and reduced 38200c460c0dSIlya Dryomov * @flags: profile to validate 38210c460c0dSIlya Dryomov * @extended: if true @flags is treated as an extended profile 38220c460c0dSIlya Dryomov */ 38230c460c0dSIlya Dryomov static int alloc_profile_is_valid(u64 flags, int extended) 38240c460c0dSIlya Dryomov { 38250c460c0dSIlya Dryomov u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 38260c460c0dSIlya Dryomov BTRFS_BLOCK_GROUP_PROFILE_MASK); 38270c460c0dSIlya Dryomov 38280c460c0dSIlya Dryomov flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 38290c460c0dSIlya Dryomov 38300c460c0dSIlya Dryomov /* 1) check that all other bits are zeroed */ 38310c460c0dSIlya Dryomov if (flags & ~mask) 38320c460c0dSIlya Dryomov return 0; 38330c460c0dSIlya Dryomov 38340c460c0dSIlya Dryomov /* 2) see if profile is reduced */ 38350c460c0dSIlya Dryomov if (flags == 0) 38360c460c0dSIlya Dryomov return !extended; /* "0" is valid for usual profiles */ 38370c460c0dSIlya Dryomov 38380c460c0dSIlya Dryomov /* true if exactly one bit set */ 38390c460c0dSIlya Dryomov return (flags & (flags - 1)) == 0; 38400c460c0dSIlya Dryomov } 38410c460c0dSIlya Dryomov 3842837d5b6eSIlya Dryomov static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3843837d5b6eSIlya Dryomov { 3844a7e99c69SIlya Dryomov /* cancel requested || normal exit path */ 3845a7e99c69SIlya Dryomov return atomic_read(&fs_info->balance_cancel_req) || 3846a7e99c69SIlya Dryomov (atomic_read(&fs_info->balance_pause_req) == 0 && 3847a7e99c69SIlya Dryomov atomic_read(&fs_info->balance_cancel_req) == 0); 3848837d5b6eSIlya Dryomov } 3849837d5b6eSIlya Dryomov 3850c9e9f97bSIlya Dryomov static void __cancel_balance(struct btrfs_fs_info *fs_info) 3851c9e9f97bSIlya Dryomov { 38520940ebf6SIlya Dryomov int ret; 38530940ebf6SIlya Dryomov 3854c9e9f97bSIlya Dryomov unset_balance_control(fs_info); 38556bccf3abSJeff Mahoney ret = del_balance_item(fs_info); 38560f788c58SLiu Bo if (ret) 385734d97007SAnand Jain btrfs_handle_fs_error(fs_info, ret, NULL); 3858ed0fb78fSIlya Dryomov 3859171938e5SDavid Sterba clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 3860c9e9f97bSIlya Dryomov } 3861c9e9f97bSIlya Dryomov 3862bdcd3c97SAlexandru Moise /* Non-zero return value signifies invalidity */ 3863bdcd3c97SAlexandru Moise static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, 3864bdcd3c97SAlexandru Moise u64 allowed) 3865bdcd3c97SAlexandru Moise { 3866bdcd3c97SAlexandru Moise return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && 3867bdcd3c97SAlexandru Moise (!alloc_profile_is_valid(bctl_arg->target, 1) || 3868bdcd3c97SAlexandru Moise (bctl_arg->target & ~allowed))); 3869bdcd3c97SAlexandru Moise } 3870bdcd3c97SAlexandru Moise 3871c9e9f97bSIlya Dryomov /* 3872c9e9f97bSIlya Dryomov * Should be called with both balance and volume mutexes held 3873c9e9f97bSIlya Dryomov */ 3874c9e9f97bSIlya Dryomov int btrfs_balance(struct btrfs_balance_control *bctl, 3875c9e9f97bSIlya Dryomov struct btrfs_ioctl_balance_args *bargs) 3876c9e9f97bSIlya Dryomov { 3877c9e9f97bSIlya Dryomov struct btrfs_fs_info *fs_info = bctl->fs_info; 387814506127SAdam Borowski u64 meta_target, data_target; 3879f43ffb60SIlya Dryomov u64 allowed; 3880e4837f8fSIlya Dryomov int mixed = 0; 3881c9e9f97bSIlya Dryomov int ret; 38828dabb742SStefan Behrens u64 num_devices; 3883de98ced9SMiao Xie unsigned seq; 3884c9e9f97bSIlya Dryomov 3885837d5b6eSIlya Dryomov if (btrfs_fs_closing(fs_info) || 3886a7e99c69SIlya Dryomov atomic_read(&fs_info->balance_pause_req) || 3887a7e99c69SIlya Dryomov atomic_read(&fs_info->balance_cancel_req)) { 3888c9e9f97bSIlya Dryomov ret = -EINVAL; 3889c9e9f97bSIlya Dryomov goto out; 3890c9e9f97bSIlya Dryomov } 3891c9e9f97bSIlya Dryomov 3892e4837f8fSIlya Dryomov allowed = btrfs_super_incompat_flags(fs_info->super_copy); 3893e4837f8fSIlya Dryomov if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 3894e4837f8fSIlya Dryomov mixed = 1; 3895e4837f8fSIlya 
Dryomov 3896f43ffb60SIlya Dryomov /* 3897f43ffb60SIlya Dryomov * In case of mixed groups both data and meta should be picked, 3898f43ffb60SIlya Dryomov * and identical options should be given for both of them. 3899f43ffb60SIlya Dryomov */ 3900e4837f8fSIlya Dryomov allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 3901e4837f8fSIlya Dryomov if (mixed && (bctl->flags & allowed)) { 3902f43ffb60SIlya Dryomov if (!(bctl->flags & BTRFS_BALANCE_DATA) || 3903f43ffb60SIlya Dryomov !(bctl->flags & BTRFS_BALANCE_METADATA) || 3904f43ffb60SIlya Dryomov memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 39055d163e0eSJeff Mahoney btrfs_err(fs_info, 39065d163e0eSJeff Mahoney "with mixed groups data and metadata balance options must be the same"); 3907f43ffb60SIlya Dryomov ret = -EINVAL; 3908f43ffb60SIlya Dryomov goto out; 3909f43ffb60SIlya Dryomov } 3910f43ffb60SIlya Dryomov } 3911f43ffb60SIlya Dryomov 39128dabb742SStefan Behrens num_devices = fs_info->fs_devices->num_devices; 391373beece9SLiu Bo btrfs_dev_replace_lock(&fs_info->dev_replace, 0); 39148dabb742SStefan Behrens if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 39158dabb742SStefan Behrens BUG_ON(num_devices < 1); 39168dabb742SStefan Behrens num_devices--; 39178dabb742SStefan Behrens } 391873beece9SLiu Bo btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); 391988be159cSAustin S. Hemmelgarn allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; 392088be159cSAustin S. Hemmelgarn if (num_devices > 1) 3921e4d8ec0fSIlya Dryomov allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 39228250dabeSAndreas Philipp if (num_devices > 2) 39238250dabeSAndreas Philipp allowed |= BTRFS_BLOCK_GROUP_RAID5; 39248250dabeSAndreas Philipp if (num_devices > 3) 39258250dabeSAndreas Philipp allowed |= (BTRFS_BLOCK_GROUP_RAID10 | 392653b381b3SDavid Woodhouse BTRFS_BLOCK_GROUP_RAID6); 3927bdcd3c97SAlexandru Moise if (validate_convert_profile(&bctl->data, allowed)) { 39285d163e0eSJeff Mahoney btrfs_err(fs_info, 39295d163e0eSJeff Mahoney "unable to start balance with target data profile %llu", 3930c1c9ff7cSGeert Uytterhoeven bctl->data.target); 3931e4d8ec0fSIlya Dryomov ret = -EINVAL; 3932e4d8ec0fSIlya Dryomov goto out; 3933e4d8ec0fSIlya Dryomov } 3934bdcd3c97SAlexandru Moise if (validate_convert_profile(&bctl->meta, allowed)) { 3935efe120a0SFrank Holton btrfs_err(fs_info, 3936efe120a0SFrank Holton "unable to start balance with target metadata profile %llu", 3937c1c9ff7cSGeert Uytterhoeven bctl->meta.target); 3938e4d8ec0fSIlya Dryomov ret = -EINVAL; 3939e4d8ec0fSIlya Dryomov goto out; 3940e4d8ec0fSIlya Dryomov } 3941bdcd3c97SAlexandru Moise if (validate_convert_profile(&bctl->sys, allowed)) { 3942efe120a0SFrank Holton btrfs_err(fs_info, 3943efe120a0SFrank Holton "unable to start balance with target system profile %llu", 3944c1c9ff7cSGeert Uytterhoeven bctl->sys.target); 3945e4d8ec0fSIlya Dryomov ret = -EINVAL; 3946e4d8ec0fSIlya Dryomov goto out; 3947e4d8ec0fSIlya Dryomov } 3948e4d8ec0fSIlya Dryomov 3949e4d8ec0fSIlya Dryomov /* allow to reduce meta or sys integrity only if force set */ 3950e4d8ec0fSIlya Dryomov allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 395153b381b3SDavid Woodhouse BTRFS_BLOCK_GROUP_RAID10 | 395253b381b3SDavid Woodhouse BTRFS_BLOCK_GROUP_RAID5 | 395353b381b3SDavid Woodhouse BTRFS_BLOCK_GROUP_RAID6; 3954de98ced9SMiao Xie do { 3955de98ced9SMiao Xie seq = read_seqbegin(&fs_info->profiles_lock); 3956de98ced9SMiao Xie 3957e4d8ec0fSIlya Dryomov if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 
3958e4d8ec0fSIlya Dryomov (fs_info->avail_system_alloc_bits & allowed) && 3959e4d8ec0fSIlya Dryomov !(bctl->sys.target & allowed)) || 3960e4d8ec0fSIlya Dryomov ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3961e4d8ec0fSIlya Dryomov (fs_info->avail_metadata_alloc_bits & allowed) && 3962e4d8ec0fSIlya Dryomov !(bctl->meta.target & allowed))) { 3963e4d8ec0fSIlya Dryomov if (bctl->flags & BTRFS_BALANCE_FORCE) { 39645d163e0eSJeff Mahoney btrfs_info(fs_info, 39655d163e0eSJeff Mahoney "force reducing metadata integrity"); 3966e4d8ec0fSIlya Dryomov } else { 39675d163e0eSJeff Mahoney btrfs_err(fs_info, 39685d163e0eSJeff Mahoney "balance will reduce metadata integrity, use force if you want this"); 3969e4d8ec0fSIlya Dryomov ret = -EINVAL; 3970e4d8ec0fSIlya Dryomov goto out; 3971e4d8ec0fSIlya Dryomov } 3972e4d8ec0fSIlya Dryomov } 3973de98ced9SMiao Xie } while (read_seqretry(&fs_info->profiles_lock, seq)); 3974e4d8ec0fSIlya Dryomov 397514506127SAdam Borowski /* if we're not converting, the target field is uninitialized */ 397614506127SAdam Borowski meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 397714506127SAdam Borowski bctl->meta.target : fs_info->avail_metadata_alloc_bits; 397814506127SAdam Borowski data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 397914506127SAdam Borowski bctl->data.target : fs_info->avail_data_alloc_bits; 398014506127SAdam Borowski if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 398114506127SAdam Borowski btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 3982ee592d07SSam Tygier btrfs_warn(fs_info, 3983fedc0045SFilipe Manana "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", 398414506127SAdam Borowski meta_target, data_target); 3985ee592d07SSam Tygier } 3986ee592d07SSam Tygier 39876bccf3abSJeff Mahoney ret = insert_balance_item(fs_info, bctl); 398859641015SIlya Dryomov if (ret && ret != -EEXIST) 39890940ebf6SIlya Dryomov goto out; 39900940ebf6SIlya Dryomov 399159641015SIlya Dryomov if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 399259641015SIlya Dryomov BUG_ON(ret == -EEXIST); 3993c9e9f97bSIlya Dryomov set_balance_control(bctl); 399459641015SIlya Dryomov } else { 399559641015SIlya Dryomov BUG_ON(ret != -EEXIST); 399659641015SIlya Dryomov spin_lock(&fs_info->balance_lock); 399759641015SIlya Dryomov update_balance_args(bctl); 399859641015SIlya Dryomov spin_unlock(&fs_info->balance_lock); 399959641015SIlya Dryomov } 4000c9e9f97bSIlya Dryomov 4001837d5b6eSIlya Dryomov atomic_inc(&fs_info->balance_running); 4002c9e9f97bSIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 4003c9e9f97bSIlya Dryomov 4004c9e9f97bSIlya Dryomov ret = __btrfs_balance(fs_info); 4005c9e9f97bSIlya Dryomov 4006c9e9f97bSIlya Dryomov mutex_lock(&fs_info->balance_mutex); 4007837d5b6eSIlya Dryomov atomic_dec(&fs_info->balance_running); 4008c9e9f97bSIlya Dryomov 4009c9e9f97bSIlya Dryomov if (bargs) { 4010c9e9f97bSIlya Dryomov memset(bargs, 0, sizeof(*bargs)); 401119a39dceSIlya Dryomov update_ioctl_balance_args(fs_info, 0, bargs); 4012c9e9f97bSIlya Dryomov } 4013c9e9f97bSIlya Dryomov 40143a01aa7aSIlya Dryomov if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 40153a01aa7aSIlya Dryomov balance_need_close(fs_info)) { 40163a01aa7aSIlya Dryomov __cancel_balance(fs_info); 40173a01aa7aSIlya Dryomov } 40183a01aa7aSIlya Dryomov 4019837d5b6eSIlya Dryomov wake_up(&fs_info->balance_wait_q); 4020c9e9f97bSIlya Dryomov 4021c9e9f97bSIlya Dryomov return ret; 4022c9e9f97bSIlya Dryomov out: 402359641015SIlya Dryomov if (bctl->flags & 
BTRFS_BALANCE_RESUME) 402459641015SIlya Dryomov __cancel_balance(fs_info); 4025ed0fb78fSIlya Dryomov else { 4026c9e9f97bSIlya Dryomov kfree(bctl); 4027171938e5SDavid Sterba clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 4028ed0fb78fSIlya Dryomov } 40298f18cf13SChris Mason return ret; 40308f18cf13SChris Mason } 40318f18cf13SChris Mason 403259641015SIlya Dryomov static int balance_kthread(void *data) 403359641015SIlya Dryomov { 40342b6ba629SIlya Dryomov struct btrfs_fs_info *fs_info = data; 40359555c6c1SIlya Dryomov int ret = 0; 403659641015SIlya Dryomov 403759641015SIlya Dryomov mutex_lock(&fs_info->volume_mutex); 403859641015SIlya Dryomov mutex_lock(&fs_info->balance_mutex); 403959641015SIlya Dryomov 40402b6ba629SIlya Dryomov if (fs_info->balance_ctl) { 4041efe120a0SFrank Holton btrfs_info(fs_info, "continuing balance"); 40422b6ba629SIlya Dryomov ret = btrfs_balance(fs_info->balance_ctl, NULL); 40439555c6c1SIlya Dryomov } 404459641015SIlya Dryomov 404559641015SIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 404659641015SIlya Dryomov mutex_unlock(&fs_info->volume_mutex); 40472b6ba629SIlya Dryomov 404859641015SIlya Dryomov return ret; 404959641015SIlya Dryomov } 405059641015SIlya Dryomov 40512b6ba629SIlya Dryomov int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 40522b6ba629SIlya Dryomov { 40532b6ba629SIlya Dryomov struct task_struct *tsk; 40542b6ba629SIlya Dryomov 40552b6ba629SIlya Dryomov spin_lock(&fs_info->balance_lock); 40562b6ba629SIlya Dryomov if (!fs_info->balance_ctl) { 40572b6ba629SIlya Dryomov spin_unlock(&fs_info->balance_lock); 40582b6ba629SIlya Dryomov return 0; 40592b6ba629SIlya Dryomov } 40602b6ba629SIlya Dryomov spin_unlock(&fs_info->balance_lock); 40612b6ba629SIlya Dryomov 40623cdde224SJeff Mahoney if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 4063efe120a0SFrank Holton btrfs_info(fs_info, "force skipping balance"); 40642b6ba629SIlya Dryomov return 0; 40652b6ba629SIlya Dryomov } 40662b6ba629SIlya Dryomov 40672b6ba629SIlya Dryomov tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 4068cd633972SSachin Kamat return PTR_ERR_OR_ZERO(tsk); 40692b6ba629SIlya Dryomov } 40702b6ba629SIlya Dryomov 407168310a5eSIlya Dryomov int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 407259641015SIlya Dryomov { 407359641015SIlya Dryomov struct btrfs_balance_control *bctl; 407459641015SIlya Dryomov struct btrfs_balance_item *item; 407559641015SIlya Dryomov struct btrfs_disk_balance_args disk_bargs; 407659641015SIlya Dryomov struct btrfs_path *path; 407759641015SIlya Dryomov struct extent_buffer *leaf; 407859641015SIlya Dryomov struct btrfs_key key; 407959641015SIlya Dryomov int ret; 408059641015SIlya Dryomov 408159641015SIlya Dryomov path = btrfs_alloc_path(); 408259641015SIlya Dryomov if (!path) 408359641015SIlya Dryomov return -ENOMEM; 408459641015SIlya Dryomov 408568310a5eSIlya Dryomov key.objectid = BTRFS_BALANCE_OBJECTID; 4086c479cb4fSDavid Sterba key.type = BTRFS_TEMPORARY_ITEM_KEY; 408768310a5eSIlya Dryomov key.offset = 0; 408868310a5eSIlya Dryomov 408968310a5eSIlya Dryomov ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 409068310a5eSIlya Dryomov if (ret < 0) 409168310a5eSIlya Dryomov goto out; 409268310a5eSIlya Dryomov if (ret > 0) { /* ret = -ENOENT; */ 409368310a5eSIlya Dryomov ret = 0; 409468310a5eSIlya Dryomov goto out; 409568310a5eSIlya Dryomov } 409668310a5eSIlya Dryomov 409759641015SIlya Dryomov bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 409859641015SIlya Dryomov if (!bctl) { 409959641015SIlya Dryomov ret = -ENOMEM; 410059641015SIlya 
Dryomov goto out; 410159641015SIlya Dryomov } 410259641015SIlya Dryomov 410359641015SIlya Dryomov leaf = path->nodes[0]; 410459641015SIlya Dryomov item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 410559641015SIlya Dryomov 410668310a5eSIlya Dryomov bctl->fs_info = fs_info; 410768310a5eSIlya Dryomov bctl->flags = btrfs_balance_flags(leaf, item); 410868310a5eSIlya Dryomov bctl->flags |= BTRFS_BALANCE_RESUME; 410959641015SIlya Dryomov 411059641015SIlya Dryomov btrfs_balance_data(leaf, item, &disk_bargs); 411159641015SIlya Dryomov btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 411259641015SIlya Dryomov btrfs_balance_meta(leaf, item, &disk_bargs); 411359641015SIlya Dryomov btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 411459641015SIlya Dryomov btrfs_balance_sys(leaf, item, &disk_bargs); 411559641015SIlya Dryomov btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 411659641015SIlya Dryomov 4117171938e5SDavid Sterba WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); 4118ed0fb78fSIlya Dryomov 411968310a5eSIlya Dryomov mutex_lock(&fs_info->volume_mutex); 412068310a5eSIlya Dryomov mutex_lock(&fs_info->balance_mutex); 412159641015SIlya Dryomov 412268310a5eSIlya Dryomov set_balance_control(bctl); 412368310a5eSIlya Dryomov 412468310a5eSIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 412568310a5eSIlya Dryomov mutex_unlock(&fs_info->volume_mutex); 412659641015SIlya Dryomov out: 412759641015SIlya Dryomov btrfs_free_path(path); 412859641015SIlya Dryomov return ret; 412959641015SIlya Dryomov } 413059641015SIlya Dryomov 4131837d5b6eSIlya Dryomov int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4132837d5b6eSIlya Dryomov { 4133837d5b6eSIlya Dryomov int ret = 0; 4134837d5b6eSIlya Dryomov 4135837d5b6eSIlya Dryomov mutex_lock(&fs_info->balance_mutex); 4136837d5b6eSIlya Dryomov if (!fs_info->balance_ctl) { 4137837d5b6eSIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 4138837d5b6eSIlya Dryomov return -ENOTCONN; 4139837d5b6eSIlya Dryomov } 4140837d5b6eSIlya Dryomov 4141837d5b6eSIlya Dryomov if (atomic_read(&fs_info->balance_running)) { 4142837d5b6eSIlya Dryomov atomic_inc(&fs_info->balance_pause_req); 4143837d5b6eSIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 4144837d5b6eSIlya Dryomov 4145837d5b6eSIlya Dryomov wait_event(fs_info->balance_wait_q, 4146837d5b6eSIlya Dryomov atomic_read(&fs_info->balance_running) == 0); 4147837d5b6eSIlya Dryomov 4148837d5b6eSIlya Dryomov mutex_lock(&fs_info->balance_mutex); 4149837d5b6eSIlya Dryomov /* we are good with balance_ctl ripped off from under us */ 4150837d5b6eSIlya Dryomov BUG_ON(atomic_read(&fs_info->balance_running)); 4151837d5b6eSIlya Dryomov atomic_dec(&fs_info->balance_pause_req); 4152837d5b6eSIlya Dryomov } else { 4153837d5b6eSIlya Dryomov ret = -ENOTCONN; 4154837d5b6eSIlya Dryomov } 4155837d5b6eSIlya Dryomov 4156837d5b6eSIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 4157837d5b6eSIlya Dryomov return ret; 4158837d5b6eSIlya Dryomov } 4159837d5b6eSIlya Dryomov 4160a7e99c69SIlya Dryomov int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4161a7e99c69SIlya Dryomov { 4162bc98a42cSDavid Howells if (sb_rdonly(fs_info->sb)) 4163e649e587SIlya Dryomov return -EROFS; 4164e649e587SIlya Dryomov 4165a7e99c69SIlya Dryomov mutex_lock(&fs_info->balance_mutex); 4166a7e99c69SIlya Dryomov if (!fs_info->balance_ctl) { 4167a7e99c69SIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 4168a7e99c69SIlya Dryomov return -ENOTCONN; 4169a7e99c69SIlya Dryomov } 4170a7e99c69SIlya Dryomov 4171a7e99c69SIlya Dryomov 
atomic_inc(&fs_info->balance_cancel_req); 4172a7e99c69SIlya Dryomov /* 4173a7e99c69SIlya Dryomov * if we are running just wait and return, balance item is 4174a7e99c69SIlya Dryomov * deleted in btrfs_balance in this case 4175a7e99c69SIlya Dryomov */ 4176a7e99c69SIlya Dryomov if (atomic_read(&fs_info->balance_running)) { 4177a7e99c69SIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 4178a7e99c69SIlya Dryomov wait_event(fs_info->balance_wait_q, 4179a7e99c69SIlya Dryomov atomic_read(&fs_info->balance_running) == 0); 4180a7e99c69SIlya Dryomov mutex_lock(&fs_info->balance_mutex); 4181a7e99c69SIlya Dryomov } else { 4182a7e99c69SIlya Dryomov /* __cancel_balance needs volume_mutex */ 4183a7e99c69SIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 4184a7e99c69SIlya Dryomov mutex_lock(&fs_info->volume_mutex); 4185a7e99c69SIlya Dryomov mutex_lock(&fs_info->balance_mutex); 4186a7e99c69SIlya Dryomov 4187a7e99c69SIlya Dryomov if (fs_info->balance_ctl) 4188a7e99c69SIlya Dryomov __cancel_balance(fs_info); 4189a7e99c69SIlya Dryomov 4190a7e99c69SIlya Dryomov mutex_unlock(&fs_info->volume_mutex); 4191a7e99c69SIlya Dryomov } 4192a7e99c69SIlya Dryomov 4193a7e99c69SIlya Dryomov BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); 4194a7e99c69SIlya Dryomov atomic_dec(&fs_info->balance_cancel_req); 4195a7e99c69SIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 4196a7e99c69SIlya Dryomov return 0; 4197a7e99c69SIlya Dryomov } 4198a7e99c69SIlya Dryomov 4199803b2f54SStefan Behrens static int btrfs_uuid_scan_kthread(void *data) 4200803b2f54SStefan Behrens { 4201803b2f54SStefan Behrens struct btrfs_fs_info *fs_info = data; 4202803b2f54SStefan Behrens struct btrfs_root *root = fs_info->tree_root; 4203803b2f54SStefan Behrens struct btrfs_key key; 4204803b2f54SStefan Behrens struct btrfs_path *path = NULL; 4205803b2f54SStefan Behrens int ret = 0; 4206803b2f54SStefan Behrens struct extent_buffer *eb; 4207803b2f54SStefan Behrens int slot; 4208803b2f54SStefan Behrens struct btrfs_root_item root_item; 4209803b2f54SStefan Behrens u32 item_size; 4210f45388f3SFilipe David Borba Manana struct btrfs_trans_handle *trans = NULL; 4211803b2f54SStefan Behrens 4212803b2f54SStefan Behrens path = btrfs_alloc_path(); 4213803b2f54SStefan Behrens if (!path) { 4214803b2f54SStefan Behrens ret = -ENOMEM; 4215803b2f54SStefan Behrens goto out; 4216803b2f54SStefan Behrens } 4217803b2f54SStefan Behrens 4218803b2f54SStefan Behrens key.objectid = 0; 4219803b2f54SStefan Behrens key.type = BTRFS_ROOT_ITEM_KEY; 4220803b2f54SStefan Behrens key.offset = 0; 4221803b2f54SStefan Behrens 4222803b2f54SStefan Behrens while (1) { 42236174d3cbSFilipe David Borba Manana ret = btrfs_search_forward(root, &key, path, 0); 4224803b2f54SStefan Behrens if (ret) { 4225803b2f54SStefan Behrens if (ret > 0) 4226803b2f54SStefan Behrens ret = 0; 4227803b2f54SStefan Behrens break; 4228803b2f54SStefan Behrens } 4229803b2f54SStefan Behrens 4230803b2f54SStefan Behrens if (key.type != BTRFS_ROOT_ITEM_KEY || 4231803b2f54SStefan Behrens (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4232803b2f54SStefan Behrens key.objectid != BTRFS_FS_TREE_OBJECTID) || 4233803b2f54SStefan Behrens key.objectid > BTRFS_LAST_FREE_OBJECTID) 4234803b2f54SStefan Behrens goto skip; 4235803b2f54SStefan Behrens 4236803b2f54SStefan Behrens eb = path->nodes[0]; 4237803b2f54SStefan Behrens slot = path->slots[0]; 4238803b2f54SStefan Behrens item_size = btrfs_item_size_nr(eb, slot); 4239803b2f54SStefan Behrens if (item_size < sizeof(root_item)) 4240803b2f54SStefan Behrens goto skip; 
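		/* smaller root items predate the uuid fields and have nothing to add here */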
4241803b2f54SStefan Behrens 4242803b2f54SStefan Behrens read_extent_buffer(eb, &root_item, 4243803b2f54SStefan Behrens btrfs_item_ptr_offset(eb, slot), 4244803b2f54SStefan Behrens (int)sizeof(root_item)); 4245803b2f54SStefan Behrens if (btrfs_root_refs(&root_item) == 0) 4246803b2f54SStefan Behrens goto skip; 4247f45388f3SFilipe David Borba Manana 4248f45388f3SFilipe David Borba Manana if (!btrfs_is_empty_uuid(root_item.uuid) || 4249f45388f3SFilipe David Borba Manana !btrfs_is_empty_uuid(root_item.received_uuid)) { 4250f45388f3SFilipe David Borba Manana if (trans) 4251f45388f3SFilipe David Borba Manana goto update_tree; 4252f45388f3SFilipe David Borba Manana 4253f45388f3SFilipe David Borba Manana btrfs_release_path(path); 4254803b2f54SStefan Behrens /* 4255803b2f54SStefan Behrens * 1 - subvol uuid item 4256803b2f54SStefan Behrens * 1 - received_subvol uuid item 4257803b2f54SStefan Behrens */ 4258803b2f54SStefan Behrens trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4259803b2f54SStefan Behrens if (IS_ERR(trans)) { 4260803b2f54SStefan Behrens ret = PTR_ERR(trans); 4261803b2f54SStefan Behrens break; 4262803b2f54SStefan Behrens } 4263f45388f3SFilipe David Borba Manana continue; 4264f45388f3SFilipe David Borba Manana } else { 4265f45388f3SFilipe David Borba Manana goto skip; 4266f45388f3SFilipe David Borba Manana } 4267f45388f3SFilipe David Borba Manana update_tree: 4268f45388f3SFilipe David Borba Manana if (!btrfs_is_empty_uuid(root_item.uuid)) { 42696bccf3abSJeff Mahoney ret = btrfs_uuid_tree_add(trans, fs_info, 4270803b2f54SStefan Behrens root_item.uuid, 4271803b2f54SStefan Behrens BTRFS_UUID_KEY_SUBVOL, 4272803b2f54SStefan Behrens key.objectid); 4273803b2f54SStefan Behrens if (ret < 0) { 4274efe120a0SFrank Holton btrfs_warn(fs_info, "uuid_tree_add failed %d", 4275803b2f54SStefan Behrens ret); 4276803b2f54SStefan Behrens break; 4277803b2f54SStefan Behrens } 4278803b2f54SStefan Behrens } 4279803b2f54SStefan Behrens 4280803b2f54SStefan Behrens if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 42816bccf3abSJeff Mahoney ret = btrfs_uuid_tree_add(trans, fs_info, 4282803b2f54SStefan Behrens root_item.received_uuid, 4283803b2f54SStefan Behrens BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4284803b2f54SStefan Behrens key.objectid); 4285803b2f54SStefan Behrens if (ret < 0) { 4286efe120a0SFrank Holton btrfs_warn(fs_info, "uuid_tree_add failed %d", 4287803b2f54SStefan Behrens ret); 4288803b2f54SStefan Behrens break; 4289803b2f54SStefan Behrens } 4290803b2f54SStefan Behrens } 4291803b2f54SStefan Behrens 4292f45388f3SFilipe David Borba Manana skip: 4293803b2f54SStefan Behrens if (trans) { 42943a45bb20SJeff Mahoney ret = btrfs_end_transaction(trans); 4295f45388f3SFilipe David Borba Manana trans = NULL; 4296803b2f54SStefan Behrens if (ret) 4297803b2f54SStefan Behrens break; 4298803b2f54SStefan Behrens } 4299803b2f54SStefan Behrens 4300803b2f54SStefan Behrens btrfs_release_path(path); 4301803b2f54SStefan Behrens if (key.offset < (u64)-1) { 4302803b2f54SStefan Behrens key.offset++; 4303803b2f54SStefan Behrens } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4304803b2f54SStefan Behrens key.offset = 0; 4305803b2f54SStefan Behrens key.type = BTRFS_ROOT_ITEM_KEY; 4306803b2f54SStefan Behrens } else if (key.objectid < (u64)-1) { 4307803b2f54SStefan Behrens key.offset = 0; 4308803b2f54SStefan Behrens key.type = BTRFS_ROOT_ITEM_KEY; 4309803b2f54SStefan Behrens key.objectid++; 4310803b2f54SStefan Behrens } else { 4311803b2f54SStefan Behrens break; 4312803b2f54SStefan Behrens } 4313803b2f54SStefan Behrens cond_resched(); 
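		/*
		 * the key now points past the last processed root item, so the
		 * next btrfs_search_forward() resumes where this pass left off
		 */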
4314803b2f54SStefan Behrens } 4315803b2f54SStefan Behrens 4316803b2f54SStefan Behrens out: 4317803b2f54SStefan Behrens btrfs_free_path(path); 4318f45388f3SFilipe David Borba Manana if (trans && !IS_ERR(trans)) 43193a45bb20SJeff Mahoney btrfs_end_transaction(trans); 4320803b2f54SStefan Behrens if (ret) 4321efe120a0SFrank Holton btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); 432270f80175SStefan Behrens else 4323afcdd129SJosef Bacik set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4324803b2f54SStefan Behrens up(&fs_info->uuid_tree_rescan_sem); 4325803b2f54SStefan Behrens return 0; 4326803b2f54SStefan Behrens } 4327803b2f54SStefan Behrens 432870f80175SStefan Behrens /* 432970f80175SStefan Behrens * Callback for btrfs_uuid_tree_iterate(). 433070f80175SStefan Behrens * returns: 433170f80175SStefan Behrens * 0 check succeeded, the entry is not outdated. 4332bb7ab3b9SAdam Buchbinder * < 0 if an error occurred. 433370f80175SStefan Behrens * > 0 if the check failed, which means the caller shall remove the entry. 433470f80175SStefan Behrens */ 433570f80175SStefan Behrens static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, 433670f80175SStefan Behrens u8 *uuid, u8 type, u64 subid) 433770f80175SStefan Behrens { 433870f80175SStefan Behrens struct btrfs_key key; 433970f80175SStefan Behrens int ret = 0; 434070f80175SStefan Behrens struct btrfs_root *subvol_root; 434170f80175SStefan Behrens 434270f80175SStefan Behrens if (type != BTRFS_UUID_KEY_SUBVOL && 434370f80175SStefan Behrens type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) 434470f80175SStefan Behrens goto out; 434570f80175SStefan Behrens 434670f80175SStefan Behrens key.objectid = subid; 434770f80175SStefan Behrens key.type = BTRFS_ROOT_ITEM_KEY; 434870f80175SStefan Behrens key.offset = (u64)-1; 434970f80175SStefan Behrens subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); 435070f80175SStefan Behrens if (IS_ERR(subvol_root)) { 435170f80175SStefan Behrens ret = PTR_ERR(subvol_root); 435270f80175SStefan Behrens if (ret == -ENOENT) 435370f80175SStefan Behrens ret = 1; 435470f80175SStefan Behrens goto out; 435570f80175SStefan Behrens } 435670f80175SStefan Behrens 435770f80175SStefan Behrens switch (type) { 435870f80175SStefan Behrens case BTRFS_UUID_KEY_SUBVOL: 435970f80175SStefan Behrens if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) 436070f80175SStefan Behrens ret = 1; 436170f80175SStefan Behrens break; 436270f80175SStefan Behrens case BTRFS_UUID_KEY_RECEIVED_SUBVOL: 436370f80175SStefan Behrens if (memcmp(uuid, subvol_root->root_item.received_uuid, 436470f80175SStefan Behrens BTRFS_UUID_SIZE)) 436570f80175SStefan Behrens ret = 1; 436670f80175SStefan Behrens break; 436770f80175SStefan Behrens } 436870f80175SStefan Behrens 436970f80175SStefan Behrens out: 437070f80175SStefan Behrens return ret; 437170f80175SStefan Behrens } 437270f80175SStefan Behrens 437370f80175SStefan Behrens static int btrfs_uuid_rescan_kthread(void *data) 437470f80175SStefan Behrens { 437570f80175SStefan Behrens struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; 437670f80175SStefan Behrens int ret; 437770f80175SStefan Behrens 437870f80175SStefan Behrens /* 437970f80175SStefan Behrens * 1st step is to iterate through the existing UUID tree and 438070f80175SStefan Behrens * to delete all entries that contain outdated data. 438170f80175SStefan Behrens * 2nd step is to add all missing entries to the UUID tree. 
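	 * The 2nd step is performed by chaining into btrfs_uuid_scan_kthread() below.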
438270f80175SStefan Behrens */ 438370f80175SStefan Behrens ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); 438470f80175SStefan Behrens if (ret < 0) { 4385efe120a0SFrank Holton btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); 438670f80175SStefan Behrens up(&fs_info->uuid_tree_rescan_sem); 438770f80175SStefan Behrens return ret; 438870f80175SStefan Behrens } 438970f80175SStefan Behrens return btrfs_uuid_scan_kthread(data); 439070f80175SStefan Behrens } 439170f80175SStefan Behrens 4392f7a81ea4SStefan Behrens int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4393f7a81ea4SStefan Behrens { 4394f7a81ea4SStefan Behrens struct btrfs_trans_handle *trans; 4395f7a81ea4SStefan Behrens struct btrfs_root *tree_root = fs_info->tree_root; 4396f7a81ea4SStefan Behrens struct btrfs_root *uuid_root; 4397803b2f54SStefan Behrens struct task_struct *task; 4398803b2f54SStefan Behrens int ret; 4399f7a81ea4SStefan Behrens 4400f7a81ea4SStefan Behrens /* 4401f7a81ea4SStefan Behrens * 1 - root node 4402f7a81ea4SStefan Behrens * 1 - root item 4403f7a81ea4SStefan Behrens */ 4404f7a81ea4SStefan Behrens trans = btrfs_start_transaction(tree_root, 2); 4405f7a81ea4SStefan Behrens if (IS_ERR(trans)) 4406f7a81ea4SStefan Behrens return PTR_ERR(trans); 4407f7a81ea4SStefan Behrens 4408f7a81ea4SStefan Behrens uuid_root = btrfs_create_tree(trans, fs_info, 4409f7a81ea4SStefan Behrens BTRFS_UUID_TREE_OBJECTID); 4410f7a81ea4SStefan Behrens if (IS_ERR(uuid_root)) { 44116d13f549SDavid Sterba ret = PTR_ERR(uuid_root); 441266642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 44133a45bb20SJeff Mahoney btrfs_end_transaction(trans); 44146d13f549SDavid Sterba return ret; 4415f7a81ea4SStefan Behrens } 4416f7a81ea4SStefan Behrens 4417f7a81ea4SStefan Behrens fs_info->uuid_root = uuid_root; 4418f7a81ea4SStefan Behrens 44193a45bb20SJeff Mahoney ret = btrfs_commit_transaction(trans); 4420803b2f54SStefan Behrens if (ret) 4421803b2f54SStefan Behrens return ret; 4422803b2f54SStefan Behrens 4423803b2f54SStefan Behrens down(&fs_info->uuid_tree_rescan_sem); 4424803b2f54SStefan Behrens task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4425803b2f54SStefan Behrens if (IS_ERR(task)) { 442670f80175SStefan Behrens /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4427efe120a0SFrank Holton btrfs_warn(fs_info, "failed to start uuid_scan task"); 4428803b2f54SStefan Behrens up(&fs_info->uuid_tree_rescan_sem); 4429803b2f54SStefan Behrens return PTR_ERR(task); 4430f7a81ea4SStefan Behrens } 4431803b2f54SStefan Behrens 4432803b2f54SStefan Behrens return 0; 4433803b2f54SStefan Behrens } 4434803b2f54SStefan Behrens 443570f80175SStefan Behrens int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) 443670f80175SStefan Behrens { 443770f80175SStefan Behrens struct task_struct *task; 443870f80175SStefan Behrens 443970f80175SStefan Behrens down(&fs_info->uuid_tree_rescan_sem); 444070f80175SStefan Behrens task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); 444170f80175SStefan Behrens if (IS_ERR(task)) { 444270f80175SStefan Behrens /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4443efe120a0SFrank Holton btrfs_warn(fs_info, "failed to start uuid_rescan task"); 444470f80175SStefan Behrens up(&fs_info->uuid_tree_rescan_sem); 444570f80175SStefan Behrens return PTR_ERR(task); 444670f80175SStefan Behrens } 444770f80175SStefan Behrens 444870f80175SStefan Behrens return 0; 444970f80175SStefan Behrens } 445070f80175SStefan Behrens 44518f18cf13SChris Mason /* 
44528f18cf13SChris Mason * shrinking a device means finding all of the device extents past 44538f18cf13SChris Mason * the new size, and then following the back refs to the chunks. 44548f18cf13SChris Mason * The chunk relocation code actually frees the device extent 44558f18cf13SChris Mason */ 44568f18cf13SChris Mason int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 44578f18cf13SChris Mason { 44580b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = device->fs_info; 44590b246afaSJeff Mahoney struct btrfs_root *root = fs_info->dev_root; 44608f18cf13SChris Mason struct btrfs_trans_handle *trans; 44618f18cf13SChris Mason struct btrfs_dev_extent *dev_extent = NULL; 44628f18cf13SChris Mason struct btrfs_path *path; 44638f18cf13SChris Mason u64 length; 44648f18cf13SChris Mason u64 chunk_offset; 44658f18cf13SChris Mason int ret; 44668f18cf13SChris Mason int slot; 4467ba1bf481SJosef Bacik int failed = 0; 4468ba1bf481SJosef Bacik bool retried = false; 446953e489bcSFilipe Manana bool checked_pending_chunks = false; 44708f18cf13SChris Mason struct extent_buffer *l; 44718f18cf13SChris Mason struct btrfs_key key; 44720b246afaSJeff Mahoney struct btrfs_super_block *super_copy = fs_info->super_copy; 44738f18cf13SChris Mason u64 old_total = btrfs_super_total_bytes(super_copy); 44747cc8e58dSMiao Xie u64 old_size = btrfs_device_get_total_bytes(device); 44757dfb8be1SNikolay Borisov u64 diff; 44767dfb8be1SNikolay Borisov 44777dfb8be1SNikolay Borisov new_size = round_down(new_size, fs_info->sectorsize); 44780e4324a4SNikolay Borisov diff = round_down(old_size - new_size, fs_info->sectorsize); 44798f18cf13SChris Mason 4480401e29c1SAnand Jain if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 448163a212abSStefan Behrens return -EINVAL; 448263a212abSStefan Behrens 44838f18cf13SChris Mason path = btrfs_alloc_path(); 44848f18cf13SChris Mason if (!path) 44858f18cf13SChris Mason return -ENOMEM; 44868f18cf13SChris Mason 4487e4058b54SDavid Sterba path->reada = READA_FORWARD; 44888f18cf13SChris Mason 448934441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 44907d9eb12cSChris Mason 44917cc8e58dSMiao Xie btrfs_device_set_total_bytes(device, new_size); 4492ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 44932b82032cSYan Zheng device->fs_devices->total_rw_bytes -= diff; 4494a5ed45f8SNikolay Borisov atomic64_sub(diff, &fs_info->free_chunk_space); 44952bf64758SJosef Bacik } 449634441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 44978f18cf13SChris Mason 4498ba1bf481SJosef Bacik again: 44998f18cf13SChris Mason key.objectid = device->devid; 45008f18cf13SChris Mason key.offset = (u64)-1; 45018f18cf13SChris Mason key.type = BTRFS_DEV_EXTENT_KEY; 45028f18cf13SChris Mason 4503213e64daSIlya Dryomov do { 45040b246afaSJeff Mahoney mutex_lock(&fs_info->delete_unused_bgs_mutex); 45058f18cf13SChris Mason ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 450667c5e7d4SFilipe Manana if (ret < 0) { 45070b246afaSJeff Mahoney mutex_unlock(&fs_info->delete_unused_bgs_mutex); 45088f18cf13SChris Mason goto done; 450967c5e7d4SFilipe Manana } 45108f18cf13SChris Mason 45118f18cf13SChris Mason ret = btrfs_previous_item(root, path, 0, key.type); 451267c5e7d4SFilipe Manana if (ret) 45130b246afaSJeff Mahoney mutex_unlock(&fs_info->delete_unused_bgs_mutex); 45148f18cf13SChris Mason if (ret < 0) 45158f18cf13SChris Mason goto done; 45168f18cf13SChris Mason if (ret) { 45178f18cf13SChris Mason ret = 0; 4518b3b4aa74SDavid Sterba btrfs_release_path(path); 4519bf1fb512SYan Zheng 
break; 45208f18cf13SChris Mason } 45218f18cf13SChris Mason 45228f18cf13SChris Mason l = path->nodes[0]; 45238f18cf13SChris Mason slot = path->slots[0]; 45248f18cf13SChris Mason btrfs_item_key_to_cpu(l, &key, path->slots[0]); 45258f18cf13SChris Mason 4526ba1bf481SJosef Bacik if (key.objectid != device->devid) { 45270b246afaSJeff Mahoney mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4528b3b4aa74SDavid Sterba btrfs_release_path(path); 4529bf1fb512SYan Zheng break; 4530ba1bf481SJosef Bacik } 45318f18cf13SChris Mason 45328f18cf13SChris Mason dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 45338f18cf13SChris Mason length = btrfs_dev_extent_length(l, dev_extent); 45348f18cf13SChris Mason 4535ba1bf481SJosef Bacik if (key.offset + length <= new_size) { 45360b246afaSJeff Mahoney mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4537b3b4aa74SDavid Sterba btrfs_release_path(path); 4538d6397baeSChris Ball break; 4539ba1bf481SJosef Bacik } 45408f18cf13SChris Mason 45418f18cf13SChris Mason chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4542b3b4aa74SDavid Sterba btrfs_release_path(path); 45438f18cf13SChris Mason 4544*a6f93c71SLiu Bo /* 4545*a6f93c71SLiu Bo * We may be relocating the only data chunk we have, 4546*a6f93c71SLiu Bo * which could potentially end up with losing data's 4547*a6f93c71SLiu Bo * raid profile, so lets allocate an empty one in 4548*a6f93c71SLiu Bo * advance. 4549*a6f93c71SLiu Bo */ 4550*a6f93c71SLiu Bo ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 4551*a6f93c71SLiu Bo if (ret < 0) { 4552*a6f93c71SLiu Bo mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4553*a6f93c71SLiu Bo goto done; 4554*a6f93c71SLiu Bo } 4555*a6f93c71SLiu Bo 45560b246afaSJeff Mahoney ret = btrfs_relocate_chunk(fs_info, chunk_offset); 45570b246afaSJeff Mahoney mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4558ba1bf481SJosef Bacik if (ret && ret != -ENOSPC) 4559ba1bf481SJosef Bacik goto done; 4560ba1bf481SJosef Bacik if (ret == -ENOSPC) 4561ba1bf481SJosef Bacik failed++; 4562213e64daSIlya Dryomov } while (key.offset-- > 0); 4563ba1bf481SJosef Bacik 4564ba1bf481SJosef Bacik if (failed && !retried) { 4565ba1bf481SJosef Bacik failed = 0; 4566ba1bf481SJosef Bacik retried = true; 4567ba1bf481SJosef Bacik goto again; 4568ba1bf481SJosef Bacik } else if (failed && retried) { 4569ba1bf481SJosef Bacik ret = -ENOSPC; 45708f18cf13SChris Mason goto done; 45718f18cf13SChris Mason } 45728f18cf13SChris Mason 4573d6397baeSChris Ball /* Shrinking succeeded, else we would be at "done". */ 4574a22285a6SYan, Zheng trans = btrfs_start_transaction(root, 0); 457598d5dc13STsutomu Itoh if (IS_ERR(trans)) { 457698d5dc13STsutomu Itoh ret = PTR_ERR(trans); 457798d5dc13STsutomu Itoh goto done; 457898d5dc13STsutomu Itoh } 457998d5dc13STsutomu Itoh 458034441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 458153e489bcSFilipe Manana 458253e489bcSFilipe Manana /* 458353e489bcSFilipe Manana * We checked in the above loop all device extents that were already in 458453e489bcSFilipe Manana * the device tree. However before we have updated the device's 458553e489bcSFilipe Manana * total_bytes to the new size, we might have had chunk allocations that 458653e489bcSFilipe Manana * have not complete yet (new block groups attached to transaction 458753e489bcSFilipe Manana * handles), and therefore their device extents were not yet in the 458853e489bcSFilipe Manana * device tree and we missed them in the loop above. 
So if we have any 458953e489bcSFilipe Manana * pending chunk using a device extent that overlaps the device range 459053e489bcSFilipe Manana * that we can not use anymore, commit the current transaction and 459153e489bcSFilipe Manana * repeat the search on the device tree - this way we guarantee we will 459253e489bcSFilipe Manana * not have chunks using device extents that end beyond 'new_size'. 459353e489bcSFilipe Manana */ 459453e489bcSFilipe Manana if (!checked_pending_chunks) { 459553e489bcSFilipe Manana u64 start = new_size; 459653e489bcSFilipe Manana u64 len = old_size - new_size; 459753e489bcSFilipe Manana 4598499f377fSJeff Mahoney if (contains_pending_extent(trans->transaction, device, 4599499f377fSJeff Mahoney &start, len)) { 460034441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 460153e489bcSFilipe Manana checked_pending_chunks = true; 460253e489bcSFilipe Manana failed = 0; 460353e489bcSFilipe Manana retried = false; 46043a45bb20SJeff Mahoney ret = btrfs_commit_transaction(trans); 460553e489bcSFilipe Manana if (ret) 460653e489bcSFilipe Manana goto done; 460753e489bcSFilipe Manana goto again; 460853e489bcSFilipe Manana } 460953e489bcSFilipe Manana } 461053e489bcSFilipe Manana 46117cc8e58dSMiao Xie btrfs_device_set_disk_total_bytes(device, new_size); 4612935e5cc9SMiao Xie if (list_empty(&device->resized_list)) 4613935e5cc9SMiao Xie list_add_tail(&device->resized_list, 46140b246afaSJeff Mahoney &fs_info->fs_devices->resized_devices); 4615d6397baeSChris Ball 4616d6397baeSChris Ball WARN_ON(diff > old_total); 46177dfb8be1SNikolay Borisov btrfs_set_super_total_bytes(super_copy, 46187dfb8be1SNikolay Borisov round_down(old_total - diff, fs_info->sectorsize)); 461934441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 46202196d6e8SMiao Xie 46212196d6e8SMiao Xie /* Now btrfs_update_device() will change the on-disk size. 
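	 * It writes the device item using the disk_total_bytes value set just above.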
*/ 46222196d6e8SMiao Xie ret = btrfs_update_device(trans, device); 46233a45bb20SJeff Mahoney btrfs_end_transaction(trans); 46248f18cf13SChris Mason done: 46258f18cf13SChris Mason btrfs_free_path(path); 462653e489bcSFilipe Manana if (ret) { 462734441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 462853e489bcSFilipe Manana btrfs_device_set_total_bytes(device, old_size); 4629ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 463053e489bcSFilipe Manana device->fs_devices->total_rw_bytes += diff; 4631a5ed45f8SNikolay Borisov atomic64_add(diff, &fs_info->free_chunk_space); 463234441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 463353e489bcSFilipe Manana } 46348f18cf13SChris Mason return ret; 46358f18cf13SChris Mason } 46368f18cf13SChris Mason 46372ff7e61eSJeff Mahoney static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 46380b86a832SChris Mason struct btrfs_key *key, 46390b86a832SChris Mason struct btrfs_chunk *chunk, int item_size) 46400b86a832SChris Mason { 46410b246afaSJeff Mahoney struct btrfs_super_block *super_copy = fs_info->super_copy; 46420b86a832SChris Mason struct btrfs_disk_key disk_key; 46430b86a832SChris Mason u32 array_size; 46440b86a832SChris Mason u8 *ptr; 46450b86a832SChris Mason 464634441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 46470b86a832SChris Mason array_size = btrfs_super_sys_array_size(super_copy); 46485f43f86eSGui Hecheng if (array_size + item_size + sizeof(disk_key) 4649fe48a5c0SMiao Xie > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { 465034441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 46510b86a832SChris Mason return -EFBIG; 4652fe48a5c0SMiao Xie } 46530b86a832SChris Mason 46540b86a832SChris Mason ptr = super_copy->sys_chunk_array + array_size; 46550b86a832SChris Mason btrfs_cpu_key_to_disk(&disk_key, key); 46560b86a832SChris Mason memcpy(ptr, &disk_key, sizeof(disk_key)); 46570b86a832SChris Mason ptr += sizeof(disk_key); 46580b86a832SChris Mason memcpy(ptr, chunk, item_size); 46590b86a832SChris Mason item_size += sizeof(disk_key); 46600b86a832SChris Mason btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 466134441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 4662fe48a5c0SMiao Xie 46630b86a832SChris Mason return 0; 46640b86a832SChris Mason } 46650b86a832SChris Mason 46669f680ce0SChris Mason /* 466773c5de00SArne Jansen * sort the devices in descending order by max_avail, total_avail 46689f680ce0SChris Mason */ 466973c5de00SArne Jansen static int btrfs_cmp_device_info(const void *a, const void *b) 46702b82032cSYan Zheng { 467173c5de00SArne Jansen const struct btrfs_device_info *di_a = a; 467273c5de00SArne Jansen const struct btrfs_device_info *di_b = b; 46732b82032cSYan Zheng 467473c5de00SArne Jansen if (di_a->max_avail > di_b->max_avail) 4675a40a90a0SChris Mason return -1; 467673c5de00SArne Jansen if (di_a->max_avail < di_b->max_avail) 46779b3f68b9SChris Mason return 1; 467873c5de00SArne Jansen if (di_a->total_avail > di_b->total_avail) 467973c5de00SArne Jansen return -1; 468073c5de00SArne Jansen if (di_a->total_avail < di_b->total_avail) 468173c5de00SArne Jansen return 1; 4682b2117a39SMiao Xie return 0; 4683b2117a39SMiao Xie } 4684b2117a39SMiao Xie 468553b381b3SDavid Woodhouse static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 468653b381b3SDavid Woodhouse { 4687ffe2d203SZhao Lei if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 468853b381b3SDavid Woodhouse return; 468953b381b3SDavid Woodhouse 4690ceda0864SMiao Xie btrfs_set_fs_incompat(info, RAID56); 
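	/* kernels that lack RAID5/6 support refuse to mount once this incompat bit is set */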
469153b381b3SDavid Woodhouse } 469253b381b3SDavid Woodhouse 4693da17066cSJeff Mahoney #define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r->fs_info) \ 469423f8f9b7SGui Hecheng - sizeof(struct btrfs_chunk)) \ 469523f8f9b7SGui Hecheng / sizeof(struct btrfs_stripe) + 1) 469623f8f9b7SGui Hecheng 469723f8f9b7SGui Hecheng #define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ 469823f8f9b7SGui Hecheng - 2 * sizeof(struct btrfs_disk_key) \ 469923f8f9b7SGui Hecheng - 2 * sizeof(struct btrfs_chunk)) \ 470023f8f9b7SGui Hecheng / sizeof(struct btrfs_stripe) + 1) 470123f8f9b7SGui Hecheng 4702b2117a39SMiao Xie static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 470372b468c8SDavid Sterba u64 start, u64 type) 4704b2117a39SMiao Xie { 47052ff7e61eSJeff Mahoney struct btrfs_fs_info *info = trans->fs_info; 4706b2117a39SMiao Xie struct btrfs_fs_devices *fs_devices = info->fs_devices; 4707ebcc9301SNikolay Borisov struct btrfs_device *device; 470873c5de00SArne Jansen struct map_lookup *map = NULL; 4709b2117a39SMiao Xie struct extent_map_tree *em_tree; 4710b2117a39SMiao Xie struct extent_map *em; 471173c5de00SArne Jansen struct btrfs_device_info *devices_info = NULL; 471273c5de00SArne Jansen u64 total_avail; 471373c5de00SArne Jansen int num_stripes; /* total number of stripes to allocate */ 471453b381b3SDavid Woodhouse int data_stripes; /* number of stripes that count for 471553b381b3SDavid Woodhouse block group size */ 471673c5de00SArne Jansen int sub_stripes; /* sub_stripes info for map */ 471773c5de00SArne Jansen int dev_stripes; /* stripes per dev */ 471873c5de00SArne Jansen int devs_max; /* max devs to use */ 471973c5de00SArne Jansen int devs_min; /* min devs needed */ 472073c5de00SArne Jansen int devs_increment; /* ndevs has to be a multiple of this */ 472173c5de00SArne Jansen int ncopies; /* how many copies to data has */ 4722b2117a39SMiao Xie int ret; 472373c5de00SArne Jansen u64 max_stripe_size; 472473c5de00SArne Jansen u64 max_chunk_size; 472573c5de00SArne Jansen u64 stripe_size; 472673c5de00SArne Jansen u64 num_bytes; 472773c5de00SArne Jansen int ndevs; 472873c5de00SArne Jansen int i; 472973c5de00SArne Jansen int j; 473031e50229SLiu Bo int index; 4731b2117a39SMiao Xie 47320c460c0dSIlya Dryomov BUG_ON(!alloc_profile_is_valid(type, 0)); 473373c5de00SArne Jansen 4734b2117a39SMiao Xie if (list_empty(&fs_devices->alloc_list)) 4735b2117a39SMiao Xie return -ENOSPC; 4736b2117a39SMiao Xie 473731e50229SLiu Bo index = __get_raid_index(type); 473873c5de00SArne Jansen 473931e50229SLiu Bo sub_stripes = btrfs_raid_array[index].sub_stripes; 474031e50229SLiu Bo dev_stripes = btrfs_raid_array[index].dev_stripes; 474131e50229SLiu Bo devs_max = btrfs_raid_array[index].devs_max; 474231e50229SLiu Bo devs_min = btrfs_raid_array[index].devs_min; 474331e50229SLiu Bo devs_increment = btrfs_raid_array[index].devs_increment; 474431e50229SLiu Bo ncopies = btrfs_raid_array[index].ncopies; 474573c5de00SArne Jansen 474673c5de00SArne Jansen if (type & BTRFS_BLOCK_GROUP_DATA) { 4747ee22184bSByongho Lee max_stripe_size = SZ_1G; 474873c5de00SArne Jansen max_chunk_size = 10 * max_stripe_size; 474923f8f9b7SGui Hecheng if (!devs_max) 475023f8f9b7SGui Hecheng devs_max = BTRFS_MAX_DEVS(info->chunk_root); 475173c5de00SArne Jansen } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 47521100373fSChris Mason /* for larger filesystems, use larger metadata chunks */ 4753ee22184bSByongho Lee if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) 4754ee22184bSByongho Lee max_stripe_size = SZ_1G; 47551100373fSChris Mason else 
4756ee22184bSByongho Lee max_stripe_size = SZ_256M; 475773c5de00SArne Jansen max_chunk_size = max_stripe_size; 475823f8f9b7SGui Hecheng if (!devs_max) 475923f8f9b7SGui Hecheng devs_max = BTRFS_MAX_DEVS(info->chunk_root); 476073c5de00SArne Jansen } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 4761ee22184bSByongho Lee max_stripe_size = SZ_32M; 476273c5de00SArne Jansen max_chunk_size = 2 * max_stripe_size; 476323f8f9b7SGui Hecheng if (!devs_max) 476423f8f9b7SGui Hecheng devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; 476573c5de00SArne Jansen } else { 4766351fd353SDavid Sterba btrfs_err(info, "invalid chunk type 0x%llx requested", 476773c5de00SArne Jansen type); 476873c5de00SArne Jansen BUG_ON(1); 476973c5de00SArne Jansen } 477073c5de00SArne Jansen 477173c5de00SArne Jansen /* we don't want a chunk larger than 10% of writeable space */ 477273c5de00SArne Jansen max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 477373c5de00SArne Jansen max_chunk_size); 4774b2117a39SMiao Xie 477531e818feSDavid Sterba devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 4776b2117a39SMiao Xie GFP_NOFS); 4777b2117a39SMiao Xie if (!devices_info) 4778b2117a39SMiao Xie return -ENOMEM; 4779b2117a39SMiao Xie 478073c5de00SArne Jansen /* 478173c5de00SArne Jansen * in the first pass through the devices list, we gather information 478273c5de00SArne Jansen * about the available holes on each device. 478373c5de00SArne Jansen */ 478473c5de00SArne Jansen ndevs = 0; 4785ebcc9301SNikolay Borisov list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 478673c5de00SArne Jansen u64 max_avail; 478773c5de00SArne Jansen u64 dev_offset; 478873c5de00SArne Jansen 4789ebbede42SAnand Jain if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 479031b1a2bdSJulia Lawall WARN(1, KERN_ERR 4791efe120a0SFrank Holton "BTRFS: read-only device in alloc_list\n"); 479273c5de00SArne Jansen continue; 479373c5de00SArne Jansen } 479473c5de00SArne Jansen 4795e12c9621SAnand Jain if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 4796e12c9621SAnand Jain &device->dev_state) || 4797401e29c1SAnand Jain test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 479873c5de00SArne Jansen continue; 479973c5de00SArne Jansen 480073c5de00SArne Jansen if (device->total_bytes > device->bytes_used) 480173c5de00SArne Jansen total_avail = device->total_bytes - device->bytes_used; 480273c5de00SArne Jansen else 480373c5de00SArne Jansen total_avail = 0; 480438c01b96Sliubo 480538c01b96Sliubo /* If there is no space on this device, skip it. 
*/ 480638c01b96Sliubo if (total_avail == 0) 480738c01b96Sliubo continue; 480873c5de00SArne Jansen 48096df9a95eSJosef Bacik ret = find_free_dev_extent(trans, device, 481073c5de00SArne Jansen max_stripe_size * dev_stripes, 481173c5de00SArne Jansen &dev_offset, &max_avail); 481273c5de00SArne Jansen if (ret && ret != -ENOSPC) 481373c5de00SArne Jansen goto error; 481473c5de00SArne Jansen 481573c5de00SArne Jansen if (ret == 0) 481673c5de00SArne Jansen max_avail = max_stripe_size * dev_stripes; 481773c5de00SArne Jansen 481873c5de00SArne Jansen if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) 481973c5de00SArne Jansen continue; 482073c5de00SArne Jansen 4821063d006fSEric Sandeen if (ndevs == fs_devices->rw_devices) { 4822063d006fSEric Sandeen WARN(1, "%s: found more than %llu devices\n", 4823063d006fSEric Sandeen __func__, fs_devices->rw_devices); 4824063d006fSEric Sandeen break; 4825063d006fSEric Sandeen } 482673c5de00SArne Jansen devices_info[ndevs].dev_offset = dev_offset; 482773c5de00SArne Jansen devices_info[ndevs].max_avail = max_avail; 482873c5de00SArne Jansen devices_info[ndevs].total_avail = total_avail; 482973c5de00SArne Jansen devices_info[ndevs].dev = device; 483073c5de00SArne Jansen ++ndevs; 483173c5de00SArne Jansen } 483273c5de00SArne Jansen 483373c5de00SArne Jansen /* 483473c5de00SArne Jansen * now sort the devices by hole size / available space 483573c5de00SArne Jansen */ 483673c5de00SArne Jansen sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 483773c5de00SArne Jansen btrfs_cmp_device_info, NULL); 483873c5de00SArne Jansen 483973c5de00SArne Jansen /* round down to number of usable stripes */ 4840e5600fd6SNikolay Borisov ndevs = round_down(ndevs, devs_increment); 484173c5de00SArne Jansen 484273c5de00SArne Jansen if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { 484373c5de00SArne Jansen ret = -ENOSPC; 484473c5de00SArne Jansen goto error; 484573c5de00SArne Jansen } 484673c5de00SArne Jansen 4847f148ef4dSNikolay Borisov ndevs = min(ndevs, devs_max); 4848f148ef4dSNikolay Borisov 484973c5de00SArne Jansen /* 485073c5de00SArne Jansen * the primary goal is to maximize the number of stripes, so use as many 485173c5de00SArne Jansen * devices as possible, even if the stripes are not maximum sized. 
485273c5de00SArne Jansen */ 485373c5de00SArne Jansen stripe_size = devices_info[ndevs-1].max_avail; 485473c5de00SArne Jansen num_stripes = ndevs * dev_stripes; 485573c5de00SArne Jansen 485653b381b3SDavid Woodhouse /* 485753b381b3SDavid Woodhouse * this will have to be fixed for RAID1 and RAID10 over 485853b381b3SDavid Woodhouse * more drives 485953b381b3SDavid Woodhouse */ 486053b381b3SDavid Woodhouse data_stripes = num_stripes / ncopies; 486153b381b3SDavid Woodhouse 4862500ceed8SNikolay Borisov if (type & BTRFS_BLOCK_GROUP_RAID5) 486353b381b3SDavid Woodhouse data_stripes = num_stripes - 1; 4864500ceed8SNikolay Borisov 4865500ceed8SNikolay Borisov if (type & BTRFS_BLOCK_GROUP_RAID6) 486653b381b3SDavid Woodhouse data_stripes = num_stripes - 2; 486786db2578SChris Mason 486886db2578SChris Mason /* 486986db2578SChris Mason * Use the number of data stripes to figure out how big this chunk 487086db2578SChris Mason * is really going to be in terms of logical address space, 487186db2578SChris Mason * and compare that answer with the max chunk size 487286db2578SChris Mason */ 487386db2578SChris Mason if (stripe_size * data_stripes > max_chunk_size) { 487486db2578SChris Mason u64 mask = (1ULL << 24) - 1; 4875b8b93addSDavid Sterba 4876b8b93addSDavid Sterba stripe_size = div_u64(max_chunk_size, data_stripes); 487786db2578SChris Mason 487886db2578SChris Mason /* bump the answer up to a 16MB boundary */ 487986db2578SChris Mason stripe_size = (stripe_size + mask) & ~mask; 488086db2578SChris Mason 488186db2578SChris Mason /* but don't go higher than the limits we found 488286db2578SChris Mason * while searching for free extents 488386db2578SChris Mason */ 488486db2578SChris Mason if (stripe_size > devices_info[ndevs-1].max_avail) 488586db2578SChris Mason stripe_size = devices_info[ndevs-1].max_avail; 488686db2578SChris Mason } 488786db2578SChris Mason 4888b8b93addSDavid Sterba stripe_size = div_u64(stripe_size, dev_stripes); 488937db63a4SIlya Dryomov 489037db63a4SIlya Dryomov /* align to BTRFS_STRIPE_LEN */ 4891500ceed8SNikolay Borisov stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); 489273c5de00SArne Jansen 4893b2117a39SMiao Xie map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 4894b2117a39SMiao Xie if (!map) { 4895b2117a39SMiao Xie ret = -ENOMEM; 4896b2117a39SMiao Xie goto error; 4897b2117a39SMiao Xie } 4898b2117a39SMiao Xie map->num_stripes = num_stripes; 48999b3f68b9SChris Mason 490073c5de00SArne Jansen for (i = 0; i < ndevs; ++i) { 490173c5de00SArne Jansen for (j = 0; j < dev_stripes; ++j) { 490273c5de00SArne Jansen int s = i * dev_stripes + j; 490373c5de00SArne Jansen map->stripes[s].dev = devices_info[i].dev; 490473c5de00SArne Jansen map->stripes[s].physical = devices_info[i].dev_offset + 490573c5de00SArne Jansen j * stripe_size; 4906a40a90a0SChris Mason } 49076324fbf3SChris Mason } 4908500ceed8SNikolay Borisov map->stripe_len = BTRFS_STRIPE_LEN; 4909500ceed8SNikolay Borisov map->io_align = BTRFS_STRIPE_LEN; 4910500ceed8SNikolay Borisov map->io_width = BTRFS_STRIPE_LEN; 4911593060d7SChris Mason map->type = type; 4912321aecc6SChris Mason map->sub_stripes = sub_stripes; 49130b86a832SChris Mason 491453b381b3SDavid Woodhouse num_bytes = stripe_size * data_stripes; 49150b86a832SChris Mason 49166bccf3abSJeff Mahoney trace_btrfs_chunk_alloc(info, map, start, num_bytes); 49171abe9b8aSliubo 4918172ddd60SDavid Sterba em = alloc_extent_map(); 49192b82032cSYan Zheng if (!em) { 4920298a8f9cSWang Shilong kfree(map); 4921b2117a39SMiao Xie ret = -ENOMEM; 4922b2117a39SMiao Xie goto error; 
49232b82032cSYan Zheng } 4924298a8f9cSWang Shilong set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 492595617d69SJeff Mahoney em->map_lookup = map; 49262b82032cSYan Zheng em->start = start; 492773c5de00SArne Jansen em->len = num_bytes; 49280b86a832SChris Mason em->block_start = 0; 4929c8b97818SChris Mason em->block_len = em->len; 49306df9a95eSJosef Bacik em->orig_block_len = stripe_size; 49310b86a832SChris Mason 49320b246afaSJeff Mahoney em_tree = &info->mapping_tree.map_tree; 4933890871beSChris Mason write_lock(&em_tree->lock); 493409a2a8f9SJosef Bacik ret = add_extent_mapping(em_tree, em, 0); 49350f5d42b2SJosef Bacik if (ret) { 49361efb72a3SNikolay Borisov write_unlock(&em_tree->lock); 49370b86a832SChris Mason free_extent_map(em); 49381dd4602fSMark Fasheh goto error; 49390f5d42b2SJosef Bacik } 49402b82032cSYan Zheng 49411efb72a3SNikolay Borisov list_add_tail(&em->list, &trans->transaction->pending_chunks); 49421efb72a3SNikolay Borisov refcount_inc(&em->refs); 49431efb72a3SNikolay Borisov write_unlock(&em_tree->lock); 49441efb72a3SNikolay Borisov 49450174484dSNikolay Borisov ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes); 49466df9a95eSJosef Bacik if (ret) 49476df9a95eSJosef Bacik goto error_del_extent; 49482b82032cSYan Zheng 49497cc8e58dSMiao Xie for (i = 0; i < map->num_stripes; i++) { 49507cc8e58dSMiao Xie num_bytes = map->stripes[i].dev->bytes_used + stripe_size; 49517cc8e58dSMiao Xie btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); 49527cc8e58dSMiao Xie } 495343530c46SMiao Xie 4954a5ed45f8SNikolay Borisov atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); 49551c116187SMiao Xie 49560f5d42b2SJosef Bacik free_extent_map(em); 49570b246afaSJeff Mahoney check_raid56_incompat_flag(info, type); 495853b381b3SDavid Woodhouse 4959b2117a39SMiao Xie kfree(devices_info); 49602b82032cSYan Zheng return 0; 4961b2117a39SMiao Xie 49626df9a95eSJosef Bacik error_del_extent: 49630f5d42b2SJosef Bacik write_lock(&em_tree->lock); 49640f5d42b2SJosef Bacik remove_extent_mapping(em_tree, em); 49650f5d42b2SJosef Bacik write_unlock(&em_tree->lock); 49660f5d42b2SJosef Bacik 49670f5d42b2SJosef Bacik /* One for our allocation */ 49680f5d42b2SJosef Bacik free_extent_map(em); 49690f5d42b2SJosef Bacik /* One for the tree reference */ 49700f5d42b2SJosef Bacik free_extent_map(em); 4971495e64f4SFilipe Manana /* One for the pending_chunks list reference */ 4972495e64f4SFilipe Manana free_extent_map(em); 4973b2117a39SMiao Xie error: 4974b2117a39SMiao Xie kfree(devices_info); 4975b2117a39SMiao Xie return ret; 49762b82032cSYan Zheng } 49772b82032cSYan Zheng 49786df9a95eSJosef Bacik int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, 49796bccf3abSJeff Mahoney struct btrfs_fs_info *fs_info, 49806df9a95eSJosef Bacik u64 chunk_offset, u64 chunk_size) 49812b82032cSYan Zheng { 49826bccf3abSJeff Mahoney struct btrfs_root *extent_root = fs_info->extent_root; 49836bccf3abSJeff Mahoney struct btrfs_root *chunk_root = fs_info->chunk_root; 49842b82032cSYan Zheng struct btrfs_key key; 49852b82032cSYan Zheng struct btrfs_device *device; 49862b82032cSYan Zheng struct btrfs_chunk *chunk; 49872b82032cSYan Zheng struct btrfs_stripe *stripe; 49886df9a95eSJosef Bacik struct extent_map *em; 49896df9a95eSJosef Bacik struct map_lookup *map; 49906df9a95eSJosef Bacik size_t item_size; 49916df9a95eSJosef Bacik u64 dev_offset; 49926df9a95eSJosef Bacik u64 stripe_size; 49936df9a95eSJosef Bacik int i = 0; 4994140e639fSChris Mason int ret = 0; 49952b82032cSYan Zheng 4996592d92eeSLiu Bo em = 
get_chunk_map(fs_info, chunk_offset, chunk_size); 4997592d92eeSLiu Bo if (IS_ERR(em)) 4998592d92eeSLiu Bo return PTR_ERR(em); 49996df9a95eSJosef Bacik 500095617d69SJeff Mahoney map = em->map_lookup; 50016df9a95eSJosef Bacik item_size = btrfs_chunk_item_size(map->num_stripes); 50026df9a95eSJosef Bacik stripe_size = em->orig_block_len; 50036df9a95eSJosef Bacik 50046df9a95eSJosef Bacik chunk = kzalloc(item_size, GFP_NOFS); 50056df9a95eSJosef Bacik if (!chunk) { 50066df9a95eSJosef Bacik ret = -ENOMEM; 50076df9a95eSJosef Bacik goto out; 50086df9a95eSJosef Bacik } 50096df9a95eSJosef Bacik 501050460e37SFilipe Manana /* 501150460e37SFilipe Manana * Take the device list mutex to prevent races with the final phase of 501250460e37SFilipe Manana * a device replace operation that replaces the device object associated 501350460e37SFilipe Manana * with the map's stripes, because the device object's id can change 501450460e37SFilipe Manana * at any time during that final phase of the device replace operation 501550460e37SFilipe Manana * (dev-replace.c:btrfs_dev_replace_finishing()). 501650460e37SFilipe Manana */ 50170b246afaSJeff Mahoney mutex_lock(&fs_info->fs_devices->device_list_mutex); 50186df9a95eSJosef Bacik for (i = 0; i < map->num_stripes; i++) { 50196df9a95eSJosef Bacik device = map->stripes[i].dev; 50206df9a95eSJosef Bacik dev_offset = map->stripes[i].physical; 50216df9a95eSJosef Bacik 50222b82032cSYan Zheng ret = btrfs_update_device(trans, device); 50233acd3953SMark Fasheh if (ret) 502450460e37SFilipe Manana break; 5025b5d9071cSNikolay Borisov ret = btrfs_alloc_dev_extent(trans, device, chunk_offset, 5026b5d9071cSNikolay Borisov dev_offset, stripe_size); 50276df9a95eSJosef Bacik if (ret) 502850460e37SFilipe Manana break; 502950460e37SFilipe Manana } 503050460e37SFilipe Manana if (ret) { 50310b246afaSJeff Mahoney mutex_unlock(&fs_info->fs_devices->device_list_mutex); 50326df9a95eSJosef Bacik goto out; 50332b82032cSYan Zheng } 50342b82032cSYan Zheng 50352b82032cSYan Zheng stripe = &chunk->stripe; 50366df9a95eSJosef Bacik for (i = 0; i < map->num_stripes; i++) { 50376df9a95eSJosef Bacik device = map->stripes[i].dev; 50386df9a95eSJosef Bacik dev_offset = map->stripes[i].physical; 50392b82032cSYan Zheng 50402b82032cSYan Zheng btrfs_set_stack_stripe_devid(stripe, device->devid); 50412b82032cSYan Zheng btrfs_set_stack_stripe_offset(stripe, dev_offset); 50422b82032cSYan Zheng memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 50432b82032cSYan Zheng stripe++; 50442b82032cSYan Zheng } 50450b246afaSJeff Mahoney mutex_unlock(&fs_info->fs_devices->device_list_mutex); 50462b82032cSYan Zheng 50472b82032cSYan Zheng btrfs_set_stack_chunk_length(chunk, chunk_size); 50482b82032cSYan Zheng btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 50492b82032cSYan Zheng btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 50502b82032cSYan Zheng btrfs_set_stack_chunk_type(chunk, map->type); 50512b82032cSYan Zheng btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 50522b82032cSYan Zheng btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 50532b82032cSYan Zheng btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 50540b246afaSJeff Mahoney btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 50552b82032cSYan Zheng btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 50562b82032cSYan Zheng 50572b82032cSYan Zheng key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 50582b82032cSYan Zheng key.type = BTRFS_CHUNK_ITEM_KEY; 50592b82032cSYan Zheng key.offset = chunk_offset; 
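	/*
	 * Chunk items are keyed by (BTRFS_FIRST_CHUNK_TREE_OBJECTID,
	 * BTRFS_CHUNK_ITEM_KEY, logical start of the chunk), so the insert
	 * below places the item in the chunk tree at the chunk's logical
	 * offset.  SYSTEM chunks are additionally mirrored into the
	 * superblock's sys_chunk_array right after.
	 */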
50602b82032cSYan Zheng 50612b82032cSYan Zheng ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 50624ed1d16eSMark Fasheh if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 50634ed1d16eSMark Fasheh /* 50644ed1d16eSMark Fasheh * TODO: Cleanup of inserted chunk root in case of 50654ed1d16eSMark Fasheh * failure. 50664ed1d16eSMark Fasheh */ 50672ff7e61eSJeff Mahoney ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 50682b82032cSYan Zheng } 50691abe9b8aSliubo 50706df9a95eSJosef Bacik out: 50712b82032cSYan Zheng kfree(chunk); 50726df9a95eSJosef Bacik free_extent_map(em); 50734ed1d16eSMark Fasheh return ret; 50742b82032cSYan Zheng } 50752b82032cSYan Zheng 50762b82032cSYan Zheng /* 50772b82032cSYan Zheng * Chunk allocation falls into two parts. The first part does works 50782b82032cSYan Zheng * that make the new allocated chunk useable, but not do any operation 50792b82032cSYan Zheng * that modifies the chunk tree. The second part does the works that 50802b82032cSYan Zheng * require modifying the chunk tree. This division is important for the 50812b82032cSYan Zheng * bootstrap process of adding storage to a seed btrfs. 50822b82032cSYan Zheng */ 50832b82032cSYan Zheng int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 50842ff7e61eSJeff Mahoney struct btrfs_fs_info *fs_info, u64 type) 50852b82032cSYan Zheng { 50862b82032cSYan Zheng u64 chunk_offset; 50872b82032cSYan Zheng 50880b246afaSJeff Mahoney ASSERT(mutex_is_locked(&fs_info->chunk_mutex)); 50890b246afaSJeff Mahoney chunk_offset = find_next_chunk(fs_info); 509072b468c8SDavid Sterba return __btrfs_alloc_chunk(trans, chunk_offset, type); 50912b82032cSYan Zheng } 50922b82032cSYan Zheng 5093d397712bSChris Mason static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 5094e4a4dce7SDavid Sterba struct btrfs_fs_info *fs_info) 50952b82032cSYan Zheng { 50962b82032cSYan Zheng u64 chunk_offset; 50972b82032cSYan Zheng u64 sys_chunk_offset; 50982b82032cSYan Zheng u64 alloc_profile; 50992b82032cSYan Zheng int ret; 51002b82032cSYan Zheng 51016df9a95eSJosef Bacik chunk_offset = find_next_chunk(fs_info); 51021b86826dSJeff Mahoney alloc_profile = btrfs_metadata_alloc_profile(fs_info); 510372b468c8SDavid Sterba ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); 510479787eaaSJeff Mahoney if (ret) 510579787eaaSJeff Mahoney return ret; 51062b82032cSYan Zheng 51070b246afaSJeff Mahoney sys_chunk_offset = find_next_chunk(fs_info); 51081b86826dSJeff Mahoney alloc_profile = btrfs_system_alloc_profile(fs_info); 510972b468c8SDavid Sterba ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); 511079787eaaSJeff Mahoney return ret; 51112b82032cSYan Zheng } 51122b82032cSYan Zheng 5113d20983b4SMiao Xie static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5114d20983b4SMiao Xie { 5115d20983b4SMiao Xie int max_errors; 5116d20983b4SMiao Xie 5117d20983b4SMiao Xie if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5118d20983b4SMiao Xie BTRFS_BLOCK_GROUP_RAID10 | 5119d20983b4SMiao Xie BTRFS_BLOCK_GROUP_RAID5 | 5120d20983b4SMiao Xie BTRFS_BLOCK_GROUP_DUP)) { 5121d20983b4SMiao Xie max_errors = 1; 5122d20983b4SMiao Xie } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { 5123d20983b4SMiao Xie max_errors = 2; 5124d20983b4SMiao Xie } else { 5125d20983b4SMiao Xie max_errors = 0; 5126d20983b4SMiao Xie } 5127d20983b4SMiao Xie 5128d20983b4SMiao Xie return max_errors; 51292b82032cSYan Zheng } 51302b82032cSYan Zheng 51312ff7e61eSJeff Mahoney int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) 
51322b82032cSYan Zheng { 51332b82032cSYan Zheng struct extent_map *em; 51342b82032cSYan Zheng struct map_lookup *map; 51352b82032cSYan Zheng int readonly = 0; 5136d20983b4SMiao Xie int miss_ndevs = 0; 51372b82032cSYan Zheng int i; 51382b82032cSYan Zheng 5139592d92eeSLiu Bo em = get_chunk_map(fs_info, chunk_offset, 1); 5140592d92eeSLiu Bo if (IS_ERR(em)) 51412b82032cSYan Zheng return 1; 51422b82032cSYan Zheng 514395617d69SJeff Mahoney map = em->map_lookup; 51442b82032cSYan Zheng for (i = 0; i < map->num_stripes; i++) { 5145e6e674bdSAnand Jain if (test_bit(BTRFS_DEV_STATE_MISSING, 5146e6e674bdSAnand Jain &map->stripes[i].dev->dev_state)) { 5147d20983b4SMiao Xie miss_ndevs++; 5148d20983b4SMiao Xie continue; 5149d20983b4SMiao Xie } 5150ebbede42SAnand Jain if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5151ebbede42SAnand Jain &map->stripes[i].dev->dev_state)) { 51522b82032cSYan Zheng readonly = 1; 5153d20983b4SMiao Xie goto end; 51542b82032cSYan Zheng } 51552b82032cSYan Zheng } 5156d20983b4SMiao Xie 5157d20983b4SMiao Xie /* 5158d20983b4SMiao Xie * If the number of missing devices is larger than max errors, 5159d20983b4SMiao Xie * we can not write the data into that chunk successfully, so 5160d20983b4SMiao Xie * set it readonly. 5161d20983b4SMiao Xie */ 5162d20983b4SMiao Xie if (miss_ndevs > btrfs_chunk_max_errors(map)) 5163d20983b4SMiao Xie readonly = 1; 5164d20983b4SMiao Xie end: 51652b82032cSYan Zheng free_extent_map(em); 51662b82032cSYan Zheng return readonly; 51670b86a832SChris Mason } 51680b86a832SChris Mason 51690b86a832SChris Mason void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 51700b86a832SChris Mason { 5171a8067e02SDavid Sterba extent_map_tree_init(&tree->map_tree); 51720b86a832SChris Mason } 51730b86a832SChris Mason 51740b86a832SChris Mason void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 51750b86a832SChris Mason { 51760b86a832SChris Mason struct extent_map *em; 51770b86a832SChris Mason 51780b86a832SChris Mason while (1) { 5179890871beSChris Mason write_lock(&tree->map_tree.lock); 51800b86a832SChris Mason em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 51810b86a832SChris Mason if (em) 51820b86a832SChris Mason remove_extent_mapping(&tree->map_tree, em); 5183890871beSChris Mason write_unlock(&tree->map_tree.lock); 51840b86a832SChris Mason if (!em) 51850b86a832SChris Mason break; 51860b86a832SChris Mason /* once for us */ 51870b86a832SChris Mason free_extent_map(em); 51880b86a832SChris Mason /* once for the tree */ 51890b86a832SChris Mason free_extent_map(em); 51900b86a832SChris Mason } 51910b86a832SChris Mason } 51920b86a832SChris Mason 51935d964051SStefan Behrens int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5194f188591eSChris Mason { 5195f188591eSChris Mason struct extent_map *em; 5196f188591eSChris Mason struct map_lookup *map; 5197f188591eSChris Mason int ret; 5198f188591eSChris Mason 5199592d92eeSLiu Bo em = get_chunk_map(fs_info, logical, len); 5200592d92eeSLiu Bo if (IS_ERR(em)) 5201fb7669b5SJosef Bacik /* 5202592d92eeSLiu Bo * We could return errors for these cases, but that could get 5203592d92eeSLiu Bo * ugly and we'd probably do the same thing which is just not do 5204592d92eeSLiu Bo * anything else and exit, so return 1 so the callers don't try 5205592d92eeSLiu Bo * to use other copies. 
5206fb7669b5SJosef Bacik */ 5207fb7669b5SJosef Bacik return 1; 5208fb7669b5SJosef Bacik 520995617d69SJeff Mahoney map = em->map_lookup; 5210f188591eSChris Mason if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 5211f188591eSChris Mason ret = map->num_stripes; 5212321aecc6SChris Mason else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5213321aecc6SChris Mason ret = map->sub_stripes; 521453b381b3SDavid Woodhouse else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 521553b381b3SDavid Woodhouse ret = 2; 521653b381b3SDavid Woodhouse else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 52178810f751SLiu Bo /* 52188810f751SLiu Bo * There could be two corrupted data stripes, we need 52198810f751SLiu Bo * to loop retry in order to rebuild the correct data. 52208810f751SLiu Bo * 52218810f751SLiu Bo * Fail a stripe at a time on every retry except the 52228810f751SLiu Bo * stripe under reconstruction. 52238810f751SLiu Bo */ 52248810f751SLiu Bo ret = map->num_stripes; 5225f188591eSChris Mason else 5226f188591eSChris Mason ret = 1; 5227f188591eSChris Mason free_extent_map(em); 5228ad6d620eSStefan Behrens 522973beece9SLiu Bo btrfs_dev_replace_lock(&fs_info->dev_replace, 0); 52306fad823fSLiu Bo if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 52316fad823fSLiu Bo fs_info->dev_replace.tgtdev) 5232ad6d620eSStefan Behrens ret++; 523373beece9SLiu Bo btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); 5234ad6d620eSStefan Behrens 5235f188591eSChris Mason return ret; 5236f188591eSChris Mason } 5237f188591eSChris Mason 52382ff7e61eSJeff Mahoney unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 523953b381b3SDavid Woodhouse u64 logical) 524053b381b3SDavid Woodhouse { 524153b381b3SDavid Woodhouse struct extent_map *em; 524253b381b3SDavid Woodhouse struct map_lookup *map; 52430b246afaSJeff Mahoney unsigned long len = fs_info->sectorsize; 524453b381b3SDavid Woodhouse 5245592d92eeSLiu Bo em = get_chunk_map(fs_info, logical, len); 524653b381b3SDavid Woodhouse 524769f03f13SNikolay Borisov if (!WARN_ON(IS_ERR(em))) { 524895617d69SJeff Mahoney map = em->map_lookup; 5249ffe2d203SZhao Lei if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 525053b381b3SDavid Woodhouse len = map->stripe_len * nr_data_stripes(map); 525153b381b3SDavid Woodhouse free_extent_map(em); 525269f03f13SNikolay Borisov } 525353b381b3SDavid Woodhouse return len; 525453b381b3SDavid Woodhouse } 525553b381b3SDavid Woodhouse 5256e4ff5fb5SNikolay Borisov int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 525753b381b3SDavid Woodhouse { 525853b381b3SDavid Woodhouse struct extent_map *em; 525953b381b3SDavid Woodhouse struct map_lookup *map; 526053b381b3SDavid Woodhouse int ret = 0; 526153b381b3SDavid Woodhouse 5262592d92eeSLiu Bo em = get_chunk_map(fs_info, logical, len); 526353b381b3SDavid Woodhouse 526469f03f13SNikolay Borisov if(!WARN_ON(IS_ERR(em))) { 526595617d69SJeff Mahoney map = em->map_lookup; 5266ffe2d203SZhao Lei if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 526753b381b3SDavid Woodhouse ret = 1; 526853b381b3SDavid Woodhouse free_extent_map(em); 526969f03f13SNikolay Borisov } 527053b381b3SDavid Woodhouse return ret; 527153b381b3SDavid Woodhouse } 527253b381b3SDavid Woodhouse 527330d9861fSStefan Behrens static int find_live_mirror(struct btrfs_fs_info *fs_info, 527430d9861fSStefan Behrens struct map_lookup *map, int first, int num, 527530d9861fSStefan Behrens int optimal, int dev_replace_is_ongoing) 5276dfe25020SChris Mason { 5277dfe25020SChris Mason int i; 527830d9861fSStefan Behrens int tolerance; 
527930d9861fSStefan Behrens struct btrfs_device *srcdev; 528030d9861fSStefan Behrens 528130d9861fSStefan Behrens if (dev_replace_is_ongoing && 528230d9861fSStefan Behrens fs_info->dev_replace.cont_reading_from_srcdev_mode == 528330d9861fSStefan Behrens BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 528430d9861fSStefan Behrens srcdev = fs_info->dev_replace.srcdev; 528530d9861fSStefan Behrens else 528630d9861fSStefan Behrens srcdev = NULL; 528730d9861fSStefan Behrens 528830d9861fSStefan Behrens /* 528930d9861fSStefan Behrens * try to avoid the drive that is the source drive for a 529030d9861fSStefan Behrens * dev-replace procedure, only choose it if no other non-missing 529130d9861fSStefan Behrens * mirror is available 529230d9861fSStefan Behrens */ 529330d9861fSStefan Behrens for (tolerance = 0; tolerance < 2; tolerance++) { 529430d9861fSStefan Behrens if (map->stripes[optimal].dev->bdev && 529530d9861fSStefan Behrens (tolerance || map->stripes[optimal].dev != srcdev)) 5296dfe25020SChris Mason return optimal; 5297dfe25020SChris Mason for (i = first; i < first + num; i++) { 529830d9861fSStefan Behrens if (map->stripes[i].dev->bdev && 529930d9861fSStefan Behrens (tolerance || map->stripes[i].dev != srcdev)) 5300dfe25020SChris Mason return i; 5301dfe25020SChris Mason } 530230d9861fSStefan Behrens } 530330d9861fSStefan Behrens 5304dfe25020SChris Mason /* we couldn't find one that doesn't fail. Just return something 5305dfe25020SChris Mason * and the io error handling code will clean up eventually 5306dfe25020SChris Mason */ 5307dfe25020SChris Mason return optimal; 5308dfe25020SChris Mason } 5309dfe25020SChris Mason 531053b381b3SDavid Woodhouse static inline int parity_smaller(u64 a, u64 b) 531153b381b3SDavid Woodhouse { 531253b381b3SDavid Woodhouse return a > b; 531353b381b3SDavid Woodhouse } 531453b381b3SDavid Woodhouse 531553b381b3SDavid Woodhouse /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 53168e5cfb55SZhao Lei static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) 531753b381b3SDavid Woodhouse { 531853b381b3SDavid Woodhouse struct btrfs_bio_stripe s; 531953b381b3SDavid Woodhouse int i; 532053b381b3SDavid Woodhouse u64 l; 532153b381b3SDavid Woodhouse int again = 1; 532253b381b3SDavid Woodhouse 532353b381b3SDavid Woodhouse while (again) { 532453b381b3SDavid Woodhouse again = 0; 5325cc7539edSZhao Lei for (i = 0; i < num_stripes - 1; i++) { 53268e5cfb55SZhao Lei if (parity_smaller(bbio->raid_map[i], 53278e5cfb55SZhao Lei bbio->raid_map[i+1])) { 532853b381b3SDavid Woodhouse s = bbio->stripes[i]; 53298e5cfb55SZhao Lei l = bbio->raid_map[i]; 533053b381b3SDavid Woodhouse bbio->stripes[i] = bbio->stripes[i+1]; 53318e5cfb55SZhao Lei bbio->raid_map[i] = bbio->raid_map[i+1]; 533253b381b3SDavid Woodhouse bbio->stripes[i+1] = s; 53338e5cfb55SZhao Lei bbio->raid_map[i+1] = l; 53342c8cdd6eSMiao Xie 533553b381b3SDavid Woodhouse again = 1; 533653b381b3SDavid Woodhouse } 533753b381b3SDavid Woodhouse } 533853b381b3SDavid Woodhouse } 533953b381b3SDavid Woodhouse } 534053b381b3SDavid Woodhouse 53416e9606d2SZhao Lei static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) 53426e9606d2SZhao Lei { 53436e9606d2SZhao Lei struct btrfs_bio *bbio = kzalloc( 5344e57cf21eSChris Mason /* the size of the btrfs_bio */ 53456e9606d2SZhao Lei sizeof(struct btrfs_bio) + 5346e57cf21eSChris Mason /* plus the variable array for the stripes */ 53476e9606d2SZhao Lei sizeof(struct btrfs_bio_stripe) * (total_stripes) + 5348e57cf21eSChris Mason /* plus the 
variable array for the tgt dev */ 53496e9606d2SZhao Lei sizeof(int) * (real_stripes) + 5350e57cf21eSChris Mason /* 5351e57cf21eSChris Mason * plus the raid_map, which includes both the tgt dev 5352e57cf21eSChris Mason * and the stripes 5353e57cf21eSChris Mason */ 5354e57cf21eSChris Mason sizeof(u64) * (total_stripes), 5355277fb5fcSMichal Hocko GFP_NOFS|__GFP_NOFAIL); 53566e9606d2SZhao Lei 53576e9606d2SZhao Lei atomic_set(&bbio->error, 0); 5358140475aeSElena Reshetova refcount_set(&bbio->refs, 1); 53596e9606d2SZhao Lei 53606e9606d2SZhao Lei return bbio; 53616e9606d2SZhao Lei } 53626e9606d2SZhao Lei 53636e9606d2SZhao Lei void btrfs_get_bbio(struct btrfs_bio *bbio) 53646e9606d2SZhao Lei { 5365140475aeSElena Reshetova WARN_ON(!refcount_read(&bbio->refs)); 5366140475aeSElena Reshetova refcount_inc(&bbio->refs); 53676e9606d2SZhao Lei } 53686e9606d2SZhao Lei 53696e9606d2SZhao Lei void btrfs_put_bbio(struct btrfs_bio *bbio) 53706e9606d2SZhao Lei { 53716e9606d2SZhao Lei if (!bbio) 53726e9606d2SZhao Lei return; 5373140475aeSElena Reshetova if (refcount_dec_and_test(&bbio->refs)) 53746e9606d2SZhao Lei kfree(bbio); 53756e9606d2SZhao Lei } 53766e9606d2SZhao Lei 53770b3d4cd3SLiu Bo /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ 53780b3d4cd3SLiu Bo /* 53790b3d4cd3SLiu Bo * Please note that, discard won't be sent to target device of device 53800b3d4cd3SLiu Bo * replace. 53810b3d4cd3SLiu Bo */ 53820b3d4cd3SLiu Bo static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, 53830b3d4cd3SLiu Bo u64 logical, u64 length, 53840b3d4cd3SLiu Bo struct btrfs_bio **bbio_ret) 53850b3d4cd3SLiu Bo { 53860b3d4cd3SLiu Bo struct extent_map *em; 53870b3d4cd3SLiu Bo struct map_lookup *map; 53880b3d4cd3SLiu Bo struct btrfs_bio *bbio; 53890b3d4cd3SLiu Bo u64 offset; 53900b3d4cd3SLiu Bo u64 stripe_nr; 53910b3d4cd3SLiu Bo u64 stripe_nr_end; 53920b3d4cd3SLiu Bo u64 stripe_end_offset; 53930b3d4cd3SLiu Bo u64 stripe_cnt; 53940b3d4cd3SLiu Bo u64 stripe_len; 53950b3d4cd3SLiu Bo u64 stripe_offset; 53960b3d4cd3SLiu Bo u64 num_stripes; 53970b3d4cd3SLiu Bo u32 stripe_index; 53980b3d4cd3SLiu Bo u32 factor = 0; 53990b3d4cd3SLiu Bo u32 sub_stripes = 0; 54000b3d4cd3SLiu Bo u64 stripes_per_dev = 0; 54010b3d4cd3SLiu Bo u32 remaining_stripes = 0; 54020b3d4cd3SLiu Bo u32 last_stripe = 0; 54030b3d4cd3SLiu Bo int ret = 0; 54040b3d4cd3SLiu Bo int i; 54050b3d4cd3SLiu Bo 54060b3d4cd3SLiu Bo /* discard always return a bbio */ 54070b3d4cd3SLiu Bo ASSERT(bbio_ret); 54080b3d4cd3SLiu Bo 54090b3d4cd3SLiu Bo em = get_chunk_map(fs_info, logical, length); 54100b3d4cd3SLiu Bo if (IS_ERR(em)) 54110b3d4cd3SLiu Bo return PTR_ERR(em); 54120b3d4cd3SLiu Bo 54130b3d4cd3SLiu Bo map = em->map_lookup; 54140b3d4cd3SLiu Bo /* we don't discard raid56 yet */ 54150b3d4cd3SLiu Bo if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 54160b3d4cd3SLiu Bo ret = -EOPNOTSUPP; 54170b3d4cd3SLiu Bo goto out; 54180b3d4cd3SLiu Bo } 54190b3d4cd3SLiu Bo 54200b3d4cd3SLiu Bo offset = logical - em->start; 54210b3d4cd3SLiu Bo length = min_t(u64, em->len - offset, length); 54220b3d4cd3SLiu Bo 54230b3d4cd3SLiu Bo stripe_len = map->stripe_len; 54240b3d4cd3SLiu Bo /* 54250b3d4cd3SLiu Bo * stripe_nr counts the total number of stripes we have to stride 54260b3d4cd3SLiu Bo * to get to this block 54270b3d4cd3SLiu Bo */ 54280b3d4cd3SLiu Bo stripe_nr = div64_u64(offset, stripe_len); 54290b3d4cd3SLiu Bo 54300b3d4cd3SLiu Bo /* stripe_offset is the offset of this block in its stripe */ 54310b3d4cd3SLiu Bo stripe_offset = offset - stripe_nr * stripe_len; 54320b3d4cd3SLiu Bo 
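	/*
	 * stripe_nr_end below is the first stripe past the end of the
	 * discard range, so stripe_cnt is how many stripes of this chunk the
	 * range touches and stripe_end_offset is the unused tail of the last
	 * stripe (bytes between the end of the range and the end of that
	 * stripe).
	 */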
54330b3d4cd3SLiu Bo stripe_nr_end = round_up(offset + length, map->stripe_len); 543442c61ab6SLiu Bo stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 54350b3d4cd3SLiu Bo stripe_cnt = stripe_nr_end - stripe_nr; 54360b3d4cd3SLiu Bo stripe_end_offset = stripe_nr_end * map->stripe_len - 54370b3d4cd3SLiu Bo (offset + length); 54380b3d4cd3SLiu Bo /* 54390b3d4cd3SLiu Bo * after this, stripe_nr is the number of stripes on this 54400b3d4cd3SLiu Bo * device we have to walk to find the data, and stripe_index is 54410b3d4cd3SLiu Bo * the number of our device in the stripe array 54420b3d4cd3SLiu Bo */ 54430b3d4cd3SLiu Bo num_stripes = 1; 54440b3d4cd3SLiu Bo stripe_index = 0; 54450b3d4cd3SLiu Bo if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 54460b3d4cd3SLiu Bo BTRFS_BLOCK_GROUP_RAID10)) { 54470b3d4cd3SLiu Bo if (map->type & BTRFS_BLOCK_GROUP_RAID0) 54480b3d4cd3SLiu Bo sub_stripes = 1; 54490b3d4cd3SLiu Bo else 54500b3d4cd3SLiu Bo sub_stripes = map->sub_stripes; 54510b3d4cd3SLiu Bo 54520b3d4cd3SLiu Bo factor = map->num_stripes / sub_stripes; 54530b3d4cd3SLiu Bo num_stripes = min_t(u64, map->num_stripes, 54540b3d4cd3SLiu Bo sub_stripes * stripe_cnt); 54550b3d4cd3SLiu Bo stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 54560b3d4cd3SLiu Bo stripe_index *= sub_stripes; 54570b3d4cd3SLiu Bo stripes_per_dev = div_u64_rem(stripe_cnt, factor, 54580b3d4cd3SLiu Bo &remaining_stripes); 54590b3d4cd3SLiu Bo div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 54600b3d4cd3SLiu Bo last_stripe *= sub_stripes; 54610b3d4cd3SLiu Bo } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 54620b3d4cd3SLiu Bo BTRFS_BLOCK_GROUP_DUP)) { 54630b3d4cd3SLiu Bo num_stripes = map->num_stripes; 54640b3d4cd3SLiu Bo } else { 54650b3d4cd3SLiu Bo stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 54660b3d4cd3SLiu Bo &stripe_index); 54670b3d4cd3SLiu Bo } 54680b3d4cd3SLiu Bo 54690b3d4cd3SLiu Bo bbio = alloc_btrfs_bio(num_stripes, 0); 54700b3d4cd3SLiu Bo if (!bbio) { 54710b3d4cd3SLiu Bo ret = -ENOMEM; 54720b3d4cd3SLiu Bo goto out; 54730b3d4cd3SLiu Bo } 54740b3d4cd3SLiu Bo 54750b3d4cd3SLiu Bo for (i = 0; i < num_stripes; i++) { 54760b3d4cd3SLiu Bo bbio->stripes[i].physical = 54770b3d4cd3SLiu Bo map->stripes[stripe_index].physical + 54780b3d4cd3SLiu Bo stripe_offset + stripe_nr * map->stripe_len; 54790b3d4cd3SLiu Bo bbio->stripes[i].dev = map->stripes[stripe_index].dev; 54800b3d4cd3SLiu Bo 54810b3d4cd3SLiu Bo if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 54820b3d4cd3SLiu Bo BTRFS_BLOCK_GROUP_RAID10)) { 54830b3d4cd3SLiu Bo bbio->stripes[i].length = stripes_per_dev * 54840b3d4cd3SLiu Bo map->stripe_len; 54850b3d4cd3SLiu Bo 54860b3d4cd3SLiu Bo if (i / sub_stripes < remaining_stripes) 54870b3d4cd3SLiu Bo bbio->stripes[i].length += 54880b3d4cd3SLiu Bo map->stripe_len; 54890b3d4cd3SLiu Bo 54900b3d4cd3SLiu Bo /* 54910b3d4cd3SLiu Bo * Special for the first stripe and 54920b3d4cd3SLiu Bo * the last stripe: 54930b3d4cd3SLiu Bo * 54940b3d4cd3SLiu Bo * |-------|...|-------| 54950b3d4cd3SLiu Bo * |----------| 54960b3d4cd3SLiu Bo * off end_off 54970b3d4cd3SLiu Bo */ 54980b3d4cd3SLiu Bo if (i < sub_stripes) 54990b3d4cd3SLiu Bo bbio->stripes[i].length -= 55000b3d4cd3SLiu Bo stripe_offset; 55010b3d4cd3SLiu Bo 55020b3d4cd3SLiu Bo if (stripe_index >= last_stripe && 55030b3d4cd3SLiu Bo stripe_index <= (last_stripe + 55040b3d4cd3SLiu Bo sub_stripes - 1)) 55050b3d4cd3SLiu Bo bbio->stripes[i].length -= 55060b3d4cd3SLiu Bo stripe_end_offset; 55070b3d4cd3SLiu Bo 55080b3d4cd3SLiu Bo if (i == sub_stripes - 1) 55090b3d4cd3SLiu Bo stripe_offset = 0; 
55100b3d4cd3SLiu Bo } else { 55110b3d4cd3SLiu Bo bbio->stripes[i].length = length; 55120b3d4cd3SLiu Bo } 55130b3d4cd3SLiu Bo 55140b3d4cd3SLiu Bo stripe_index++; 55150b3d4cd3SLiu Bo if (stripe_index == map->num_stripes) { 55160b3d4cd3SLiu Bo stripe_index = 0; 55170b3d4cd3SLiu Bo stripe_nr++; 55180b3d4cd3SLiu Bo } 55190b3d4cd3SLiu Bo } 55200b3d4cd3SLiu Bo 55210b3d4cd3SLiu Bo *bbio_ret = bbio; 55220b3d4cd3SLiu Bo bbio->map_type = map->type; 55230b3d4cd3SLiu Bo bbio->num_stripes = num_stripes; 55240b3d4cd3SLiu Bo out: 55250b3d4cd3SLiu Bo free_extent_map(em); 55260b3d4cd3SLiu Bo return ret; 55270b3d4cd3SLiu Bo } 55280b3d4cd3SLiu Bo 55295ab56090SLiu Bo /* 55305ab56090SLiu Bo * In dev-replace case, for repair case (that's the only case where the mirror 55315ab56090SLiu Bo * is selected explicitly when calling btrfs_map_block), blocks left of the 55325ab56090SLiu Bo * left cursor can also be read from the target drive. 55335ab56090SLiu Bo * 55345ab56090SLiu Bo * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the 55355ab56090SLiu Bo * array of stripes. 55365ab56090SLiu Bo * For READ, it also needs to be supported using the same mirror number. 55375ab56090SLiu Bo * 55385ab56090SLiu Bo * If the requested block is not left of the left cursor, EIO is returned. This 55395ab56090SLiu Bo * can happen because btrfs_num_copies() returns one more in the dev-replace 55405ab56090SLiu Bo * case. 55415ab56090SLiu Bo */ 55425ab56090SLiu Bo static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, 55435ab56090SLiu Bo u64 logical, u64 length, 55445ab56090SLiu Bo u64 srcdev_devid, int *mirror_num, 55455ab56090SLiu Bo u64 *physical) 55465ab56090SLiu Bo { 55475ab56090SLiu Bo struct btrfs_bio *bbio = NULL; 55485ab56090SLiu Bo int num_stripes; 55495ab56090SLiu Bo int index_srcdev = 0; 55505ab56090SLiu Bo int found = 0; 55515ab56090SLiu Bo u64 physical_of_found = 0; 55525ab56090SLiu Bo int i; 55535ab56090SLiu Bo int ret = 0; 55545ab56090SLiu Bo 55555ab56090SLiu Bo ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 55565ab56090SLiu Bo logical, &length, &bbio, 0, 0); 55575ab56090SLiu Bo if (ret) { 55585ab56090SLiu Bo ASSERT(bbio == NULL); 55595ab56090SLiu Bo return ret; 55605ab56090SLiu Bo } 55615ab56090SLiu Bo 55625ab56090SLiu Bo num_stripes = bbio->num_stripes; 55635ab56090SLiu Bo if (*mirror_num > num_stripes) { 55645ab56090SLiu Bo /* 55655ab56090SLiu Bo * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, 55665ab56090SLiu Bo * that means that the requested area is not left of the left 55675ab56090SLiu Bo * cursor 55685ab56090SLiu Bo */ 55695ab56090SLiu Bo btrfs_put_bbio(bbio); 55705ab56090SLiu Bo return -EIO; 55715ab56090SLiu Bo } 55725ab56090SLiu Bo 55735ab56090SLiu Bo /* 55745ab56090SLiu Bo * process the rest of the function using the mirror_num of the source 55755ab56090SLiu Bo * drive. Therefore look it up first. At the end, patch the device 55765ab56090SLiu Bo * pointer to the one of the target drive. 
55775ab56090SLiu Bo */ 55785ab56090SLiu Bo for (i = 0; i < num_stripes; i++) { 55795ab56090SLiu Bo if (bbio->stripes[i].dev->devid != srcdev_devid) 55805ab56090SLiu Bo continue; 55815ab56090SLiu Bo 55825ab56090SLiu Bo /* 55835ab56090SLiu Bo * In case of DUP, in order to keep it simple, only add the 55845ab56090SLiu Bo * mirror with the lowest physical address 55855ab56090SLiu Bo */ 55865ab56090SLiu Bo if (found && 55875ab56090SLiu Bo physical_of_found <= bbio->stripes[i].physical) 55885ab56090SLiu Bo continue; 55895ab56090SLiu Bo 55905ab56090SLiu Bo index_srcdev = i; 55915ab56090SLiu Bo found = 1; 55925ab56090SLiu Bo physical_of_found = bbio->stripes[i].physical; 55935ab56090SLiu Bo } 55945ab56090SLiu Bo 55955ab56090SLiu Bo btrfs_put_bbio(bbio); 55965ab56090SLiu Bo 55975ab56090SLiu Bo ASSERT(found); 55985ab56090SLiu Bo if (!found) 55995ab56090SLiu Bo return -EIO; 56005ab56090SLiu Bo 56015ab56090SLiu Bo *mirror_num = index_srcdev + 1; 56025ab56090SLiu Bo *physical = physical_of_found; 56035ab56090SLiu Bo return ret; 56045ab56090SLiu Bo } 56055ab56090SLiu Bo 560673c0f228SLiu Bo static void handle_ops_on_dev_replace(enum btrfs_map_op op, 560773c0f228SLiu Bo struct btrfs_bio **bbio_ret, 560873c0f228SLiu Bo struct btrfs_dev_replace *dev_replace, 560973c0f228SLiu Bo int *num_stripes_ret, int *max_errors_ret) 561073c0f228SLiu Bo { 561173c0f228SLiu Bo struct btrfs_bio *bbio = *bbio_ret; 561273c0f228SLiu Bo u64 srcdev_devid = dev_replace->srcdev->devid; 561373c0f228SLiu Bo int tgtdev_indexes = 0; 561473c0f228SLiu Bo int num_stripes = *num_stripes_ret; 561573c0f228SLiu Bo int max_errors = *max_errors_ret; 561673c0f228SLiu Bo int i; 561773c0f228SLiu Bo 561873c0f228SLiu Bo if (op == BTRFS_MAP_WRITE) { 561973c0f228SLiu Bo int index_where_to_add; 562073c0f228SLiu Bo 562173c0f228SLiu Bo /* 562273c0f228SLiu Bo * duplicate the write operations while the dev replace 562373c0f228SLiu Bo * procedure is running. Since the copying of the old disk to 562473c0f228SLiu Bo * the new disk takes place at run time while the filesystem is 562573c0f228SLiu Bo * mounted writable, the regular write operations to the old 562673c0f228SLiu Bo * disk have to be duplicated to go to the new disk as well. 562773c0f228SLiu Bo * 562873c0f228SLiu Bo * Note that device->missing is handled by the caller, and that 562973c0f228SLiu Bo * the write to the old disk is already set up in the stripes 563073c0f228SLiu Bo * array. 
563173c0f228SLiu Bo */ 563273c0f228SLiu Bo index_where_to_add = num_stripes; 563373c0f228SLiu Bo for (i = 0; i < num_stripes; i++) { 563473c0f228SLiu Bo if (bbio->stripes[i].dev->devid == srcdev_devid) { 563573c0f228SLiu Bo /* write to new disk, too */ 563673c0f228SLiu Bo struct btrfs_bio_stripe *new = 563773c0f228SLiu Bo bbio->stripes + index_where_to_add; 563873c0f228SLiu Bo struct btrfs_bio_stripe *old = 563973c0f228SLiu Bo bbio->stripes + i; 564073c0f228SLiu Bo 564173c0f228SLiu Bo new->physical = old->physical; 564273c0f228SLiu Bo new->length = old->length; 564373c0f228SLiu Bo new->dev = dev_replace->tgtdev; 564473c0f228SLiu Bo bbio->tgtdev_map[i] = index_where_to_add; 564573c0f228SLiu Bo index_where_to_add++; 564673c0f228SLiu Bo max_errors++; 564773c0f228SLiu Bo tgtdev_indexes++; 564873c0f228SLiu Bo } 564973c0f228SLiu Bo } 565073c0f228SLiu Bo num_stripes = index_where_to_add; 565173c0f228SLiu Bo } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 565273c0f228SLiu Bo int index_srcdev = 0; 565373c0f228SLiu Bo int found = 0; 565473c0f228SLiu Bo u64 physical_of_found = 0; 565573c0f228SLiu Bo 565673c0f228SLiu Bo /* 565773c0f228SLiu Bo * During the dev-replace procedure, the target drive can also 565873c0f228SLiu Bo * be used to read data in case it is needed to repair a corrupt 565973c0f228SLiu Bo * block elsewhere. This is possible if the requested area is 566073c0f228SLiu Bo * left of the left cursor. In this area, the target drive is a 566173c0f228SLiu Bo * full copy of the source drive. 566273c0f228SLiu Bo */ 566373c0f228SLiu Bo for (i = 0; i < num_stripes; i++) { 566473c0f228SLiu Bo if (bbio->stripes[i].dev->devid == srcdev_devid) { 566573c0f228SLiu Bo /* 566673c0f228SLiu Bo * In case of DUP, in order to keep it simple, 566773c0f228SLiu Bo * only add the mirror with the lowest physical 566873c0f228SLiu Bo * address 566973c0f228SLiu Bo */ 567073c0f228SLiu Bo if (found && 567173c0f228SLiu Bo physical_of_found <= 567273c0f228SLiu Bo bbio->stripes[i].physical) 567373c0f228SLiu Bo continue; 567473c0f228SLiu Bo index_srcdev = i; 567573c0f228SLiu Bo found = 1; 567673c0f228SLiu Bo physical_of_found = bbio->stripes[i].physical; 567773c0f228SLiu Bo } 567873c0f228SLiu Bo } 567973c0f228SLiu Bo if (found) { 568073c0f228SLiu Bo struct btrfs_bio_stripe *tgtdev_stripe = 568173c0f228SLiu Bo bbio->stripes + num_stripes; 568273c0f228SLiu Bo 568373c0f228SLiu Bo tgtdev_stripe->physical = physical_of_found; 568473c0f228SLiu Bo tgtdev_stripe->length = 568573c0f228SLiu Bo bbio->stripes[index_srcdev].length; 568673c0f228SLiu Bo tgtdev_stripe->dev = dev_replace->tgtdev; 568773c0f228SLiu Bo bbio->tgtdev_map[index_srcdev] = num_stripes; 568873c0f228SLiu Bo 568973c0f228SLiu Bo tgtdev_indexes++; 569073c0f228SLiu Bo num_stripes++; 569173c0f228SLiu Bo } 569273c0f228SLiu Bo } 569373c0f228SLiu Bo 569473c0f228SLiu Bo *num_stripes_ret = num_stripes; 569573c0f228SLiu Bo *max_errors_ret = max_errors; 569673c0f228SLiu Bo bbio->num_tgtdevs = tgtdev_indexes; 569773c0f228SLiu Bo *bbio_ret = bbio; 569873c0f228SLiu Bo } 569973c0f228SLiu Bo 57002b19a1feSLiu Bo static bool need_full_stripe(enum btrfs_map_op op) 57012b19a1feSLiu Bo { 57022b19a1feSLiu Bo return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 57032b19a1feSLiu Bo } 57042b19a1feSLiu Bo 5705cf8cddd3SChristoph Hellwig static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 5706cf8cddd3SChristoph Hellwig enum btrfs_map_op op, 5707cea9e445SChris Mason u64 logical, u64 *length, 5708a1d3c478SJan Schmidt struct btrfs_bio **bbio_ret, 57098e5cfb55SZhao Lei int 
mirror_num, int need_raid_map) 57100b86a832SChris Mason { 57110b86a832SChris Mason struct extent_map *em; 57120b86a832SChris Mason struct map_lookup *map; 57130b86a832SChris Mason u64 offset; 5714593060d7SChris Mason u64 stripe_offset; 5715593060d7SChris Mason u64 stripe_nr; 571653b381b3SDavid Woodhouse u64 stripe_len; 57179d644a62SDavid Sterba u32 stripe_index; 5718cea9e445SChris Mason int i; 5719de11cc12SLi Zefan int ret = 0; 5720f2d8d74dSChris Mason int num_stripes; 5721a236aed1SChris Mason int max_errors = 0; 57222c8cdd6eSMiao Xie int tgtdev_indexes = 0; 5723a1d3c478SJan Schmidt struct btrfs_bio *bbio = NULL; 5724472262f3SStefan Behrens struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 5725472262f3SStefan Behrens int dev_replace_is_ongoing = 0; 5726472262f3SStefan Behrens int num_alloc_stripes; 5727ad6d620eSStefan Behrens int patch_the_first_stripe_for_dev_replace = 0; 5728ad6d620eSStefan Behrens u64 physical_to_patch_in_first_stripe = 0; 572953b381b3SDavid Woodhouse u64 raid56_full_stripe_start = (u64)-1; 57300b86a832SChris Mason 57310b3d4cd3SLiu Bo if (op == BTRFS_MAP_DISCARD) 57320b3d4cd3SLiu Bo return __btrfs_map_block_for_discard(fs_info, logical, 57330b3d4cd3SLiu Bo *length, bbio_ret); 57340b3d4cd3SLiu Bo 5735592d92eeSLiu Bo em = get_chunk_map(fs_info, logical, *length); 5736592d92eeSLiu Bo if (IS_ERR(em)) 5737592d92eeSLiu Bo return PTR_ERR(em); 57389bb91873SJosef Bacik 573995617d69SJeff Mahoney map = em->map_lookup; 57400b86a832SChris Mason offset = logical - em->start; 5741593060d7SChris Mason 574253b381b3SDavid Woodhouse stripe_len = map->stripe_len; 5743593060d7SChris Mason stripe_nr = offset; 5744593060d7SChris Mason /* 5745593060d7SChris Mason * stripe_nr counts the total number of stripes we have to stride 5746593060d7SChris Mason * to get to this block 5747593060d7SChris Mason */ 574847c5713fSDavid Sterba stripe_nr = div64_u64(stripe_nr, stripe_len); 5749593060d7SChris Mason 575053b381b3SDavid Woodhouse stripe_offset = stripe_nr * stripe_len; 5751e042d1ecSJosef Bacik if (offset < stripe_offset) { 57525d163e0eSJeff Mahoney btrfs_crit(fs_info, 57535d163e0eSJeff Mahoney "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu", 5754e042d1ecSJosef Bacik stripe_offset, offset, em->start, logical, 5755e042d1ecSJosef Bacik stripe_len); 5756e042d1ecSJosef Bacik free_extent_map(em); 5757e042d1ecSJosef Bacik return -EINVAL; 5758e042d1ecSJosef Bacik } 5759593060d7SChris Mason 5760593060d7SChris Mason /* stripe_offset is the offset of this block in its stripe*/ 5761593060d7SChris Mason stripe_offset = offset - stripe_offset; 5762593060d7SChris Mason 576353b381b3SDavid Woodhouse /* if we're here for raid56, we need to know the stripe aligned start */ 5764ffe2d203SZhao Lei if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 576553b381b3SDavid Woodhouse unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 576653b381b3SDavid Woodhouse raid56_full_stripe_start = offset; 576753b381b3SDavid Woodhouse 576853b381b3SDavid Woodhouse /* allow a write of a full stripe, but make sure we don't 576953b381b3SDavid Woodhouse * allow straddling of stripes 577053b381b3SDavid Woodhouse */ 577147c5713fSDavid Sterba raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 577247c5713fSDavid Sterba full_stripe_len); 577353b381b3SDavid Woodhouse raid56_full_stripe_start *= full_stripe_len; 577453b381b3SDavid Woodhouse } 577553b381b3SDavid Woodhouse 57760b3d4cd3SLiu Bo if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 
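		/*
		 * Clamp the returned *length: for striped profiles a single
		 * mapping must not cross a stripe boundary on one device,
		 * except that RAID5/6 writes may cover the full data stripe
		 * set (see the comment below).
		 */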
577753b381b3SDavid Woodhouse u64 max_len; 577853b381b3SDavid Woodhouse /* For writes to RAID[56], allow a full stripeset across all disks. 577953b381b3SDavid Woodhouse For other RAID types and for RAID[56] reads, just allow a single 578053b381b3SDavid Woodhouse stripe (on a single disk). */ 5781ffe2d203SZhao Lei if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 5782cf8cddd3SChristoph Hellwig (op == BTRFS_MAP_WRITE)) { 578353b381b3SDavid Woodhouse max_len = stripe_len * nr_data_stripes(map) - 578453b381b3SDavid Woodhouse (offset - raid56_full_stripe_start); 578553b381b3SDavid Woodhouse } else { 5786cea9e445SChris Mason /* we limit the length of each bio to what fits in a stripe */ 578753b381b3SDavid Woodhouse max_len = stripe_len - stripe_offset; 578853b381b3SDavid Woodhouse } 578953b381b3SDavid Woodhouse *length = min_t(u64, em->len - offset, max_len); 5790cea9e445SChris Mason } else { 5791cea9e445SChris Mason *length = em->len - offset; 5792cea9e445SChris Mason } 5793f2d8d74dSChris Mason 579453b381b3SDavid Woodhouse /* This is for when we're called from btrfs_merge_bio_hook() and all 579553b381b3SDavid Woodhouse it cares about is the length */ 5796a1d3c478SJan Schmidt if (!bbio_ret) 5797cea9e445SChris Mason goto out; 5798cea9e445SChris Mason 579973beece9SLiu Bo btrfs_dev_replace_lock(dev_replace, 0); 5800472262f3SStefan Behrens dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 5801472262f3SStefan Behrens if (!dev_replace_is_ongoing) 580273beece9SLiu Bo btrfs_dev_replace_unlock(dev_replace, 0); 580373beece9SLiu Bo else 580473beece9SLiu Bo btrfs_dev_replace_set_lock_blocking(dev_replace); 5805472262f3SStefan Behrens 5806ad6d620eSStefan Behrens if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 58072b19a1feSLiu Bo !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 58085ab56090SLiu Bo ret = get_extra_mirror_from_replace(fs_info, logical, *length, 58095ab56090SLiu Bo dev_replace->srcdev->devid, 58105ab56090SLiu Bo &mirror_num, 58115ab56090SLiu Bo &physical_to_patch_in_first_stripe); 58125ab56090SLiu Bo if (ret) 5813ad6d620eSStefan Behrens goto out; 58145ab56090SLiu Bo else 581594a97dfeSZhao Lei patch_the_first_stripe_for_dev_replace = 1; 5816ad6d620eSStefan Behrens } else if (mirror_num > map->num_stripes) { 5817ad6d620eSStefan Behrens mirror_num = 0; 5818ad6d620eSStefan Behrens } 5819ad6d620eSStefan Behrens 5820f2d8d74dSChris Mason num_stripes = 1; 5821cea9e445SChris Mason stripe_index = 0; 5822fce3bb9aSLi Dongyang if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 582347c5713fSDavid Sterba stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 582447c5713fSDavid Sterba &stripe_index); 5825de483734SAnand Jain if (!need_full_stripe(op)) 582628e1cc7dSMiao Xie mirror_num = 1; 5827fce3bb9aSLi Dongyang } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 5828de483734SAnand Jain if (need_full_stripe(op)) 5829f2d8d74dSChris Mason num_stripes = map->num_stripes; 58302fff734fSChris Mason else if (mirror_num) 5831f188591eSChris Mason stripe_index = mirror_num - 1; 5832dfe25020SChris Mason else { 583330d9861fSStefan Behrens stripe_index = find_live_mirror(fs_info, map, 0, 5834dfe25020SChris Mason map->num_stripes, 583530d9861fSStefan Behrens current->pid % map->num_stripes, 583630d9861fSStefan Behrens dev_replace_is_ongoing); 5837a1d3c478SJan Schmidt mirror_num = stripe_index + 1; 5838dfe25020SChris Mason } 58392fff734fSChris Mason 5840611f0e00SChris Mason } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 5841de483734SAnand Jain if (need_full_stripe(op)) { 5842f2d8d74dSChris 
Mason num_stripes = map->num_stripes; 5843a1d3c478SJan Schmidt } else if (mirror_num) { 5844f188591eSChris Mason stripe_index = mirror_num - 1; 5845a1d3c478SJan Schmidt } else { 5846a1d3c478SJan Schmidt mirror_num = 1; 5847a1d3c478SJan Schmidt } 58482fff734fSChris Mason 5849321aecc6SChris Mason } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 58509d644a62SDavid Sterba u32 factor = map->num_stripes / map->sub_stripes; 5851321aecc6SChris Mason 585247c5713fSDavid Sterba stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5853321aecc6SChris Mason stripe_index *= map->sub_stripes; 5854321aecc6SChris Mason 5855de483734SAnand Jain if (need_full_stripe(op)) 5856f2d8d74dSChris Mason num_stripes = map->sub_stripes; 5857321aecc6SChris Mason else if (mirror_num) 5858321aecc6SChris Mason stripe_index += mirror_num - 1; 5859dfe25020SChris Mason else { 58603e74317aSJan Schmidt int old_stripe_index = stripe_index; 586130d9861fSStefan Behrens stripe_index = find_live_mirror(fs_info, map, 586230d9861fSStefan Behrens stripe_index, 5863dfe25020SChris Mason map->sub_stripes, stripe_index + 586430d9861fSStefan Behrens current->pid % map->sub_stripes, 586530d9861fSStefan Behrens dev_replace_is_ongoing); 58663e74317aSJan Schmidt mirror_num = stripe_index - old_stripe_index + 1; 5867dfe25020SChris Mason } 586853b381b3SDavid Woodhouse 5869ffe2d203SZhao Lei } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5870de483734SAnand Jain if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 587153b381b3SDavid Woodhouse /* push stripe_nr back to the start of the full stripe */ 587242c61ab6SLiu Bo stripe_nr = div64_u64(raid56_full_stripe_start, 5873b8b93addSDavid Sterba stripe_len * nr_data_stripes(map)); 587453b381b3SDavid Woodhouse 587553b381b3SDavid Woodhouse /* RAID[56] write or recovery. Return all stripes */ 587653b381b3SDavid Woodhouse num_stripes = map->num_stripes; 587753b381b3SDavid Woodhouse max_errors = nr_parity_stripes(map); 587853b381b3SDavid Woodhouse 587953b381b3SDavid Woodhouse *length = map->stripe_len; 588053b381b3SDavid Woodhouse stripe_index = 0; 588153b381b3SDavid Woodhouse stripe_offset = 0; 588253b381b3SDavid Woodhouse } else { 588353b381b3SDavid Woodhouse /* 588453b381b3SDavid Woodhouse * Mirror #0 or #1 means the original data block. 588553b381b3SDavid Woodhouse * Mirror #2 is RAID5 parity block. 588653b381b3SDavid Woodhouse * Mirror #3 is RAID6 Q block. 
588753b381b3SDavid Woodhouse */ 588847c5713fSDavid Sterba stripe_nr = div_u64_rem(stripe_nr, 588947c5713fSDavid Sterba nr_data_stripes(map), &stripe_index); 589053b381b3SDavid Woodhouse if (mirror_num > 1) 589153b381b3SDavid Woodhouse stripe_index = nr_data_stripes(map) + 589253b381b3SDavid Woodhouse mirror_num - 2; 589353b381b3SDavid Woodhouse 589453b381b3SDavid Woodhouse /* We distribute the parity blocks across stripes */ 589547c5713fSDavid Sterba div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 589647c5713fSDavid Sterba &stripe_index); 5897de483734SAnand Jain if (!need_full_stripe(op) && mirror_num <= 1) 589828e1cc7dSMiao Xie mirror_num = 1; 589953b381b3SDavid Woodhouse } 59008790d502SChris Mason } else { 5901593060d7SChris Mason /* 590247c5713fSDavid Sterba * after this, stripe_nr is the number of stripes on this 590347c5713fSDavid Sterba * device we have to walk to find the data, and stripe_index is 590447c5713fSDavid Sterba * the number of our device in the stripe array 5905593060d7SChris Mason */ 590647c5713fSDavid Sterba stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 590747c5713fSDavid Sterba &stripe_index); 5908a1d3c478SJan Schmidt mirror_num = stripe_index + 1; 59098790d502SChris Mason } 5910e042d1ecSJosef Bacik if (stripe_index >= map->num_stripes) { 59115d163e0eSJeff Mahoney btrfs_crit(fs_info, 59125d163e0eSJeff Mahoney "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 5913e042d1ecSJosef Bacik stripe_index, map->num_stripes); 5914e042d1ecSJosef Bacik ret = -EINVAL; 5915e042d1ecSJosef Bacik goto out; 5916e042d1ecSJosef Bacik } 5917593060d7SChris Mason 5918472262f3SStefan Behrens num_alloc_stripes = num_stripes; 59196fad823fSLiu Bo if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 59200b3d4cd3SLiu Bo if (op == BTRFS_MAP_WRITE) 5921472262f3SStefan Behrens num_alloc_stripes <<= 1; 5922cf8cddd3SChristoph Hellwig if (op == BTRFS_MAP_GET_READ_MIRRORS) 5923ad6d620eSStefan Behrens num_alloc_stripes++; 59242c8cdd6eSMiao Xie tgtdev_indexes = num_stripes; 5925ad6d620eSStefan Behrens } 59262c8cdd6eSMiao Xie 59276e9606d2SZhao Lei bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); 5928de11cc12SLi Zefan if (!bbio) { 5929de11cc12SLi Zefan ret = -ENOMEM; 5930de11cc12SLi Zefan goto out; 5931de11cc12SLi Zefan } 59326fad823fSLiu Bo if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) 59332c8cdd6eSMiao Xie bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); 5934de11cc12SLi Zefan 59358e5cfb55SZhao Lei /* build raid_map */ 59362b19a1feSLiu Bo if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 59372b19a1feSLiu Bo (need_full_stripe(op) || mirror_num > 1)) { 59388e5cfb55SZhao Lei u64 tmp; 59399d644a62SDavid Sterba unsigned rot; 59408e5cfb55SZhao Lei 59418e5cfb55SZhao Lei bbio->raid_map = (u64 *)((void *)bbio->stripes + 59428e5cfb55SZhao Lei sizeof(struct btrfs_bio_stripe) * 59438e5cfb55SZhao Lei num_alloc_stripes + 59448e5cfb55SZhao Lei sizeof(int) * tgtdev_indexes); 59458e5cfb55SZhao Lei 59468e5cfb55SZhao Lei /* Work out the disk rotation on this stripe-set */ 594747c5713fSDavid Sterba div_u64_rem(stripe_nr, num_stripes, &rot); 59488e5cfb55SZhao Lei 59498e5cfb55SZhao Lei /* Fill in the logical address of each stripe */ 59508e5cfb55SZhao Lei tmp = stripe_nr * nr_data_stripes(map); 59518e5cfb55SZhao Lei for (i = 0; i < nr_data_stripes(map); i++) 59528e5cfb55SZhao Lei bbio->raid_map[(i+rot) % num_stripes] = 59538e5cfb55SZhao Lei em->start + (tmp + i) * map->stripe_len; 59548e5cfb55SZhao Lei 59558e5cfb55SZhao Lei 
bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 59568e5cfb55SZhao Lei if (map->type & BTRFS_BLOCK_GROUP_RAID6) 59578e5cfb55SZhao Lei bbio->raid_map[(i+rot+1) % num_stripes] = 59588e5cfb55SZhao Lei RAID6_Q_STRIPE; 59598e5cfb55SZhao Lei } 59608e5cfb55SZhao Lei 5961ec9ef7a1SLi Zefan 5962f2d8d74dSChris Mason for (i = 0; i < num_stripes; i++) { 5963a1d3c478SJan Schmidt bbio->stripes[i].physical = 5964f2d8d74dSChris Mason map->stripes[stripe_index].physical + 5965fce3bb9aSLi Dongyang stripe_offset + 5966fce3bb9aSLi Dongyang stripe_nr * map->stripe_len; 5967a1d3c478SJan Schmidt bbio->stripes[i].dev = 5968fce3bb9aSLi Dongyang map->stripes[stripe_index].dev; 5969cea9e445SChris Mason stripe_index++; 5970593060d7SChris Mason } 5971de11cc12SLi Zefan 59722b19a1feSLiu Bo if (need_full_stripe(op)) 5973d20983b4SMiao Xie max_errors = btrfs_chunk_max_errors(map); 5974de11cc12SLi Zefan 59758e5cfb55SZhao Lei if (bbio->raid_map) 59768e5cfb55SZhao Lei sort_parity_stripes(bbio, num_stripes); 5977cc7539edSZhao Lei 597873c0f228SLiu Bo if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 59792b19a1feSLiu Bo need_full_stripe(op)) { 598073c0f228SLiu Bo handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, 598173c0f228SLiu Bo &max_errors); 5982ad6d620eSStefan Behrens } 5983472262f3SStefan Behrens 5984a1d3c478SJan Schmidt *bbio_ret = bbio; 598510f11900SZhao Lei bbio->map_type = map->type; 5986a1d3c478SJan Schmidt bbio->num_stripes = num_stripes; 5987a1d3c478SJan Schmidt bbio->max_errors = max_errors; 5988a1d3c478SJan Schmidt bbio->mirror_num = mirror_num; 5989ad6d620eSStefan Behrens 5990ad6d620eSStefan Behrens /* 5991ad6d620eSStefan Behrens * this is the case that REQ_READ && dev_replace_is_ongoing && 5992ad6d620eSStefan Behrens * mirror_num == num_stripes + 1 && dev_replace target drive is 5993ad6d620eSStefan Behrens * available as a mirror 5994ad6d620eSStefan Behrens */ 5995ad6d620eSStefan Behrens if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 5996ad6d620eSStefan Behrens WARN_ON(num_stripes > 1); 5997ad6d620eSStefan Behrens bbio->stripes[0].dev = dev_replace->tgtdev; 5998ad6d620eSStefan Behrens bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 5999ad6d620eSStefan Behrens bbio->mirror_num = map->num_stripes + 1; 6000ad6d620eSStefan Behrens } 6001cea9e445SChris Mason out: 600273beece9SLiu Bo if (dev_replace_is_ongoing) { 600373beece9SLiu Bo btrfs_dev_replace_clear_lock_blocking(dev_replace); 600473beece9SLiu Bo btrfs_dev_replace_unlock(dev_replace, 0); 600573beece9SLiu Bo } 60060b86a832SChris Mason free_extent_map(em); 6007de11cc12SLi Zefan return ret; 60080b86a832SChris Mason } 60090b86a832SChris Mason 6010cf8cddd3SChristoph Hellwig int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6011f2d8d74dSChris Mason u64 logical, u64 *length, 6012a1d3c478SJan Schmidt struct btrfs_bio **bbio_ret, int mirror_num) 6013f2d8d74dSChris Mason { 6014b3d3fa51SMike Christie return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 60158e5cfb55SZhao Lei mirror_num, 0); 6016f2d8d74dSChris Mason } 6017f2d8d74dSChris Mason 6018af8e2d1dSMiao Xie /* For Scrub/replace */ 6019cf8cddd3SChristoph Hellwig int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6020af8e2d1dSMiao Xie u64 logical, u64 *length, 6021825ad4c9SDavid Sterba struct btrfs_bio **bbio_ret) 6022af8e2d1dSMiao Xie { 6023825ad4c9SDavid Sterba return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); 6024af8e2d1dSMiao Xie } 6025af8e2d1dSMiao Xie 
6026ab8d0fc4SJeff Mahoney int btrfs_rmap_block(struct btrfs_fs_info *fs_info, 6027a512bbf8SYan Zheng u64 chunk_start, u64 physical, u64 devid, 6028a512bbf8SYan Zheng u64 **logical, int *naddrs, int *stripe_len) 6029a512bbf8SYan Zheng { 6030a512bbf8SYan Zheng struct extent_map *em; 6031a512bbf8SYan Zheng struct map_lookup *map; 6032a512bbf8SYan Zheng u64 *buf; 6033a512bbf8SYan Zheng u64 bytenr; 6034a512bbf8SYan Zheng u64 length; 6035a512bbf8SYan Zheng u64 stripe_nr; 603653b381b3SDavid Woodhouse u64 rmap_len; 6037a512bbf8SYan Zheng int i, j, nr = 0; 6038a512bbf8SYan Zheng 6039592d92eeSLiu Bo em = get_chunk_map(fs_info, chunk_start, 1); 6040592d92eeSLiu Bo if (IS_ERR(em)) 6041835d974fSJosef Bacik return -EIO; 6042835d974fSJosef Bacik 604395617d69SJeff Mahoney map = em->map_lookup; 6044a512bbf8SYan Zheng length = em->len; 604553b381b3SDavid Woodhouse rmap_len = map->stripe_len; 604653b381b3SDavid Woodhouse 6047a512bbf8SYan Zheng if (map->type & BTRFS_BLOCK_GROUP_RAID10) 6048b8b93addSDavid Sterba length = div_u64(length, map->num_stripes / map->sub_stripes); 6049a512bbf8SYan Zheng else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 6050b8b93addSDavid Sterba length = div_u64(length, map->num_stripes); 6051ffe2d203SZhao Lei else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6052b8b93addSDavid Sterba length = div_u64(length, nr_data_stripes(map)); 605353b381b3SDavid Woodhouse rmap_len = map->stripe_len * nr_data_stripes(map); 605453b381b3SDavid Woodhouse } 6055a512bbf8SYan Zheng 605631e818feSDavid Sterba buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); 605779787eaaSJeff Mahoney BUG_ON(!buf); /* -ENOMEM */ 6058a512bbf8SYan Zheng 6059a512bbf8SYan Zheng for (i = 0; i < map->num_stripes; i++) { 6060a512bbf8SYan Zheng if (devid && map->stripes[i].dev->devid != devid) 6061a512bbf8SYan Zheng continue; 6062a512bbf8SYan Zheng if (map->stripes[i].physical > physical || 6063a512bbf8SYan Zheng map->stripes[i].physical + length <= physical) 6064a512bbf8SYan Zheng continue; 6065a512bbf8SYan Zheng 6066a512bbf8SYan Zheng stripe_nr = physical - map->stripes[i].physical; 606742c61ab6SLiu Bo stripe_nr = div64_u64(stripe_nr, map->stripe_len); 6068a512bbf8SYan Zheng 6069a512bbf8SYan Zheng if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 6070a512bbf8SYan Zheng stripe_nr = stripe_nr * map->num_stripes + i; 6071b8b93addSDavid Sterba stripe_nr = div_u64(stripe_nr, map->sub_stripes); 6072a512bbf8SYan Zheng } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 6073a512bbf8SYan Zheng stripe_nr = stripe_nr * map->num_stripes + i; 607453b381b3SDavid Woodhouse } /* else if RAID[56], multiply by nr_data_stripes(). 
607553b381b3SDavid Woodhouse * Alternatively, just use rmap_len below instead of 607653b381b3SDavid Woodhouse * map->stripe_len */ 607753b381b3SDavid Woodhouse 607853b381b3SDavid Woodhouse bytenr = chunk_start + stripe_nr * rmap_len; 6079934d375bSChris Mason WARN_ON(nr >= map->num_stripes); 6080a512bbf8SYan Zheng for (j = 0; j < nr; j++) { 6081a512bbf8SYan Zheng if (buf[j] == bytenr) 6082a512bbf8SYan Zheng break; 6083a512bbf8SYan Zheng } 6084934d375bSChris Mason if (j == nr) { 6085934d375bSChris Mason WARN_ON(nr >= map->num_stripes); 6086a512bbf8SYan Zheng buf[nr++] = bytenr; 6087a512bbf8SYan Zheng } 6088934d375bSChris Mason } 6089a512bbf8SYan Zheng 6090a512bbf8SYan Zheng *logical = buf; 6091a512bbf8SYan Zheng *naddrs = nr; 609253b381b3SDavid Woodhouse *stripe_len = rmap_len; 6093a512bbf8SYan Zheng 6094a512bbf8SYan Zheng free_extent_map(em); 6095a512bbf8SYan Zheng return 0; 6096a512bbf8SYan Zheng } 6097a512bbf8SYan Zheng 60984246a0b6SChristoph Hellwig static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) 60998408c716SMiao Xie { 6100326e1dbbSMike Snitzer bio->bi_private = bbio->private; 6101326e1dbbSMike Snitzer bio->bi_end_io = bbio->end_io; 61024246a0b6SChristoph Hellwig bio_endio(bio); 6103326e1dbbSMike Snitzer 61046e9606d2SZhao Lei btrfs_put_bbio(bbio); 61058408c716SMiao Xie } 61068408c716SMiao Xie 61074246a0b6SChristoph Hellwig static void btrfs_end_bio(struct bio *bio) 61088790d502SChris Mason { 61099be3395bSChris Mason struct btrfs_bio *bbio = bio->bi_private; 61107d2b4daaSChris Mason int is_orig_bio = 0; 61118790d502SChris Mason 61124e4cbee9SChristoph Hellwig if (bio->bi_status) { 6113a1d3c478SJan Schmidt atomic_inc(&bbio->error); 61144e4cbee9SChristoph Hellwig if (bio->bi_status == BLK_STS_IOERR || 61154e4cbee9SChristoph Hellwig bio->bi_status == BLK_STS_TARGET) { 6116442a4f63SStefan Behrens unsigned int stripe_index = 61179be3395bSChris Mason btrfs_io_bio(bio)->stripe_index; 611865f53338SZhao Lei struct btrfs_device *dev; 6119442a4f63SStefan Behrens 6120442a4f63SStefan Behrens BUG_ON(stripe_index >= bbio->num_stripes); 6121442a4f63SStefan Behrens dev = bbio->stripes[stripe_index].dev; 6122597a60faSStefan Behrens if (dev->bdev) { 612337226b21SMike Christie if (bio_op(bio) == REQ_OP_WRITE) 61241cb34c8eSAnand Jain btrfs_dev_stat_inc_and_print(dev, 6125442a4f63SStefan Behrens BTRFS_DEV_STAT_WRITE_ERRS); 6126442a4f63SStefan Behrens else 61271cb34c8eSAnand Jain btrfs_dev_stat_inc_and_print(dev, 6128442a4f63SStefan Behrens BTRFS_DEV_STAT_READ_ERRS); 612970fd7614SChristoph Hellwig if (bio->bi_opf & REQ_PREFLUSH) 61301cb34c8eSAnand Jain btrfs_dev_stat_inc_and_print(dev, 6131442a4f63SStefan Behrens BTRFS_DEV_STAT_FLUSH_ERRS); 6132442a4f63SStefan Behrens } 6133442a4f63SStefan Behrens } 6134597a60faSStefan Behrens } 61358790d502SChris Mason 6136a1d3c478SJan Schmidt if (bio == bbio->orig_bio) 61377d2b4daaSChris Mason is_orig_bio = 1; 61387d2b4daaSChris Mason 6139c404e0dcSMiao Xie btrfs_bio_counter_dec(bbio->fs_info); 6140c404e0dcSMiao Xie 6141a1d3c478SJan Schmidt if (atomic_dec_and_test(&bbio->stripes_pending)) { 61427d2b4daaSChris Mason if (!is_orig_bio) { 61437d2b4daaSChris Mason bio_put(bio); 6144a1d3c478SJan Schmidt bio = bbio->orig_bio; 61457d2b4daaSChris Mason } 6146c7b22bb1SMuthu Kumar 61479be3395bSChris Mason btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6148a236aed1SChris Mason /* only send an error to the higher layers if it is 614953b381b3SDavid Woodhouse * beyond the tolerance of the btrfs bio 6150a236aed1SChris Mason */ 6151a1d3c478SJan Schmidt if 
(atomic_read(&bbio->error) > bbio->max_errors) { 61524e4cbee9SChristoph Hellwig bio->bi_status = BLK_STS_IOERR; 61535dbc8fcaSChris Mason } else { 61541259ab75SChris Mason /* 61551259ab75SChris Mason * this bio is actually up to date, we didn't 61561259ab75SChris Mason * go over the max number of errors 61571259ab75SChris Mason */ 61582dbe0c77SAnand Jain bio->bi_status = BLK_STS_OK; 61591259ab75SChris Mason } 6160c55f1396SMiao Xie 61614246a0b6SChristoph Hellwig btrfs_end_bbio(bbio, bio); 61627d2b4daaSChris Mason } else if (!is_orig_bio) { 61638790d502SChris Mason bio_put(bio); 61648790d502SChris Mason } 61658790d502SChris Mason } 61668790d502SChris Mason 61678b712842SChris Mason /* 61688b712842SChris Mason * see run_scheduled_bios for a description of why bios are collected for 61698b712842SChris Mason * async submit. 61708b712842SChris Mason * 61718b712842SChris Mason * This will add one bio to the pending list for a device and make sure 61728b712842SChris Mason * the work struct is scheduled. 61738b712842SChris Mason */ 61742ff7e61eSJeff Mahoney static noinline void btrfs_schedule_bio(struct btrfs_device *device, 61754e49ea4aSMike Christie struct bio *bio) 61768b712842SChris Mason { 61770b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = device->fs_info; 61788b712842SChris Mason int should_queue = 1; 6179ffbd517dSChris Mason struct btrfs_pending_bios *pending_bios; 61808b712842SChris Mason 6181e6e674bdSAnand Jain if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || 6182e6e674bdSAnand Jain !device->bdev) { 61834246a0b6SChristoph Hellwig bio_io_error(bio); 618453b381b3SDavid Woodhouse return; 618553b381b3SDavid Woodhouse } 618653b381b3SDavid Woodhouse 61878b712842SChris Mason /* don't bother with additional async steps for reads, right now */ 618837226b21SMike Christie if (bio_op(bio) == REQ_OP_READ) { 61894e49ea4aSMike Christie btrfsic_submit_bio(bio); 6190143bede5SJeff Mahoney return; 61918b712842SChris Mason } 61928b712842SChris Mason 6193492bb6deSChris Mason WARN_ON(bio->bi_next); 61948b712842SChris Mason bio->bi_next = NULL; 61958b712842SChris Mason 61968b712842SChris Mason spin_lock(&device->io_lock); 619767f055c7SChristoph Hellwig if (op_is_sync(bio->bi_opf)) 6198ffbd517dSChris Mason pending_bios = &device->pending_sync_bios; 6199ffbd517dSChris Mason else 6200ffbd517dSChris Mason pending_bios = &device->pending_bios; 62018b712842SChris Mason 6202ffbd517dSChris Mason if (pending_bios->tail) 6203ffbd517dSChris Mason pending_bios->tail->bi_next = bio; 62048b712842SChris Mason 6205ffbd517dSChris Mason pending_bios->tail = bio; 6206ffbd517dSChris Mason if (!pending_bios->head) 6207ffbd517dSChris Mason pending_bios->head = bio; 62088b712842SChris Mason if (device->running_pending) 62098b712842SChris Mason should_queue = 0; 62108b712842SChris Mason 62118b712842SChris Mason spin_unlock(&device->io_lock); 62128b712842SChris Mason 62138b712842SChris Mason if (should_queue) 62140b246afaSJeff Mahoney btrfs_queue_work(fs_info->submit_workers, &device->work); 62158b712842SChris Mason } 62168b712842SChris Mason 62172ff7e61eSJeff Mahoney static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, 62182ff7e61eSJeff Mahoney u64 physical, int dev_nr, int async) 6219de1ee92aSJosef Bacik { 6220de1ee92aSJosef Bacik struct btrfs_device *dev = bbio->stripes[dev_nr].dev; 62212ff7e61eSJeff Mahoney struct btrfs_fs_info *fs_info = bbio->fs_info; 6222de1ee92aSJosef Bacik 6223de1ee92aSJosef Bacik bio->bi_private = bbio; 62249be3395bSChris Mason btrfs_io_bio(bio)->stripe_index = dev_nr; 
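	/*
	 * btrfs_end_bio() will read the stripe_index recorded just above to
	 * attribute any write/read/flush error to the right device's stats,
	 * and it completes the original bio once all stripes have finished.
	 */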
6225de1ee92aSJosef Bacik bio->bi_end_io = btrfs_end_bio; 62264f024f37SKent Overstreet bio->bi_iter.bi_sector = physical >> 9; 6227de1ee92aSJosef Bacik #ifdef DEBUG 6228de1ee92aSJosef Bacik { 6229de1ee92aSJosef Bacik struct rcu_string *name; 6230de1ee92aSJosef Bacik 6231de1ee92aSJosef Bacik rcu_read_lock(); 6232de1ee92aSJosef Bacik name = rcu_dereference(dev->name); 6233ab8d0fc4SJeff Mahoney btrfs_debug(fs_info, 6234ab8d0fc4SJeff Mahoney "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6235ab8d0fc4SJeff Mahoney bio_op(bio), bio->bi_opf, 6236ab8d0fc4SJeff Mahoney (u64)bio->bi_iter.bi_sector, 62375d163e0eSJeff Mahoney (u_long)dev->bdev->bd_dev, name->str, dev->devid, 62385d163e0eSJeff Mahoney bio->bi_iter.bi_size); 6239de1ee92aSJosef Bacik rcu_read_unlock(); 6240de1ee92aSJosef Bacik } 6241de1ee92aSJosef Bacik #endif 624274d46992SChristoph Hellwig bio_set_dev(bio, dev->bdev); 6243c404e0dcSMiao Xie 62442ff7e61eSJeff Mahoney btrfs_bio_counter_inc_noblocked(fs_info); 6245c404e0dcSMiao Xie 6246de1ee92aSJosef Bacik if (async) 62472ff7e61eSJeff Mahoney btrfs_schedule_bio(dev, bio); 6248de1ee92aSJosef Bacik else 62494e49ea4aSMike Christie btrfsic_submit_bio(bio); 6250de1ee92aSJosef Bacik } 6251de1ee92aSJosef Bacik 6252de1ee92aSJosef Bacik static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 6253de1ee92aSJosef Bacik { 6254de1ee92aSJosef Bacik atomic_inc(&bbio->error); 6255de1ee92aSJosef Bacik if (atomic_dec_and_test(&bbio->stripes_pending)) { 625601327610SNicholas D Steeves /* Should be the original bio. */ 62578408c716SMiao Xie WARN_ON(bio != bbio->orig_bio); 62588408c716SMiao Xie 62599be3395bSChris Mason btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 62604f024f37SKent Overstreet bio->bi_iter.bi_sector = logical >> 9; 6261102ed2c5SAnand Jain if (atomic_read(&bbio->error) > bbio->max_errors) 62624e4cbee9SChristoph Hellwig bio->bi_status = BLK_STS_IOERR; 6263102ed2c5SAnand Jain else 6264102ed2c5SAnand Jain bio->bi_status = BLK_STS_OK; 62654246a0b6SChristoph Hellwig btrfs_end_bbio(bbio, bio); 6266de1ee92aSJosef Bacik } 6267de1ee92aSJosef Bacik } 6268de1ee92aSJosef Bacik 626958efbc9fSOmar Sandoval blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 62708b712842SChris Mason int mirror_num, int async_submit) 62710b86a832SChris Mason { 62720b86a832SChris Mason struct btrfs_device *dev; 62738790d502SChris Mason struct bio *first_bio = bio; 62744f024f37SKent Overstreet u64 logical = (u64)bio->bi_iter.bi_sector << 9; 62750b86a832SChris Mason u64 length = 0; 62760b86a832SChris Mason u64 map_length; 62770b86a832SChris Mason int ret; 627808da757dSZhao Lei int dev_nr; 627908da757dSZhao Lei int total_devs; 6280a1d3c478SJan Schmidt struct btrfs_bio *bbio = NULL; 62810b86a832SChris Mason 62824f024f37SKent Overstreet length = bio->bi_iter.bi_size; 62830b86a832SChris Mason map_length = length; 6284cea9e445SChris Mason 62850b246afaSJeff Mahoney btrfs_bio_counter_inc_blocked(fs_info); 6286bd7d63c2SLiu Bo ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, 628737226b21SMike Christie &map_length, &bbio, mirror_num, 1); 6288c404e0dcSMiao Xie if (ret) { 62890b246afaSJeff Mahoney btrfs_bio_counter_dec(fs_info); 629058efbc9fSOmar Sandoval return errno_to_blk_status(ret); 6291c404e0dcSMiao Xie } 6292cea9e445SChris Mason 6293a1d3c478SJan Schmidt total_devs = bbio->num_stripes; 629453b381b3SDavid Woodhouse bbio->orig_bio = first_bio; 629553b381b3SDavid Woodhouse bbio->private = first_bio->bi_private; 629653b381b3SDavid Woodhouse bbio->end_io = 
first_bio->bi_end_io; 62970b246afaSJeff Mahoney bbio->fs_info = fs_info; 629853b381b3SDavid Woodhouse atomic_set(&bbio->stripes_pending, bbio->num_stripes); 629953b381b3SDavid Woodhouse 6300ad1ba2a0SZhao Lei if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 630137226b21SMike Christie ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { 630253b381b3SDavid Woodhouse /* In this case, map_length has been set to the length of 630353b381b3SDavid Woodhouse a single stripe; not the whole write */ 630437226b21SMike Christie if (bio_op(bio) == REQ_OP_WRITE) { 63052ff7e61eSJeff Mahoney ret = raid56_parity_write(fs_info, bio, bbio, 63062ff7e61eSJeff Mahoney map_length); 630753b381b3SDavid Woodhouse } else { 63082ff7e61eSJeff Mahoney ret = raid56_parity_recover(fs_info, bio, bbio, 63092ff7e61eSJeff Mahoney map_length, mirror_num, 1); 631053b381b3SDavid Woodhouse } 63114245215dSMiao Xie 63120b246afaSJeff Mahoney btrfs_bio_counter_dec(fs_info); 631358efbc9fSOmar Sandoval return errno_to_blk_status(ret); 631453b381b3SDavid Woodhouse } 631553b381b3SDavid Woodhouse 6316239b14b3SChris Mason if (map_length < length) { 63170b246afaSJeff Mahoney btrfs_crit(fs_info, 63185d163e0eSJeff Mahoney "mapping failed logical %llu bio len %llu len %llu", 6319c1c9ff7cSGeert Uytterhoeven logical, length, map_length); 6320239b14b3SChris Mason BUG(); 6321239b14b3SChris Mason } 6322a1d3c478SJan Schmidt 632308da757dSZhao Lei for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { 6324de1ee92aSJosef Bacik dev = bbio->stripes[dev_nr].dev; 632537226b21SMike Christie if (!dev || !dev->bdev || 6326ebbede42SAnand Jain (bio_op(first_bio) == REQ_OP_WRITE && 6327ebbede42SAnand Jain !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { 6328de1ee92aSJosef Bacik bbio_error(bbio, first_bio, logical); 6329de1ee92aSJosef Bacik continue; 6330de1ee92aSJosef Bacik } 6331de1ee92aSJosef Bacik 63323aa8e074SDavid Sterba if (dev_nr < total_devs - 1) 63338b6c1d56SDavid Sterba bio = btrfs_bio_clone(first_bio); 63343aa8e074SDavid Sterba else 63358790d502SChris Mason bio = first_bio; 6336606686eeSJosef Bacik 63372ff7e61eSJeff Mahoney submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, 63382ff7e61eSJeff Mahoney dev_nr, async_submit); 63398790d502SChris Mason } 63400b246afaSJeff Mahoney btrfs_bio_counter_dec(fs_info); 634158efbc9fSOmar Sandoval return BLK_STS_OK; 63420b86a832SChris Mason } 63430b86a832SChris Mason 6344aa1b8cd4SStefan Behrens struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 63452b82032cSYan Zheng u8 *uuid, u8 *fsid) 63460b86a832SChris Mason { 63472b82032cSYan Zheng struct btrfs_device *device; 63482b82032cSYan Zheng struct btrfs_fs_devices *cur_devices; 63490b86a832SChris Mason 6350aa1b8cd4SStefan Behrens cur_devices = fs_info->fs_devices; 63512b82032cSYan Zheng while (cur_devices) { 63522b82032cSYan Zheng if (!fsid || 635344880fdcSAnand Jain !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) { 635435c70103SDavid Sterba device = find_device(cur_devices, devid, uuid); 63552b82032cSYan Zheng if (device) 63562b82032cSYan Zheng return device; 63572b82032cSYan Zheng } 63582b82032cSYan Zheng cur_devices = cur_devices->seed; 63592b82032cSYan Zheng } 63602b82032cSYan Zheng return NULL; 63610b86a832SChris Mason } 63620b86a832SChris Mason 63632ff7e61eSJeff Mahoney static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6364dfe25020SChris Mason u64 devid, u8 *dev_uuid) 6365dfe25020SChris Mason { 6366dfe25020SChris Mason struct btrfs_device *device; 6367dfe25020SChris Mason 
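	/*
	 * Create a stand-in for a device that is referenced by the metadata
	 * but could not be opened: it is linked into the fs_devices list and
	 * marked BTRFS_DEV_STATE_MISSING (no bdev) so that degraded mounts
	 * can carry on.
	 */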
636812bd2fc0SIlya Dryomov device = btrfs_alloc_device(NULL, &devid, dev_uuid); 636912bd2fc0SIlya Dryomov if (IS_ERR(device)) 6370adfb69afSAnand Jain return device; 637112bd2fc0SIlya Dryomov 637212bd2fc0SIlya Dryomov list_add(&device->dev_list, &fs_devices->devices); 6373e4404d6eSYan Zheng device->fs_devices = fs_devices; 6374dfe25020SChris Mason fs_devices->num_devices++; 637512bd2fc0SIlya Dryomov 6376e6e674bdSAnand Jain set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6377cd02dca5SChris Mason fs_devices->missing_devices++; 637812bd2fc0SIlya Dryomov 6379dfe25020SChris Mason return device; 6380dfe25020SChris Mason } 6381dfe25020SChris Mason 638212bd2fc0SIlya Dryomov /** 638312bd2fc0SIlya Dryomov * btrfs_alloc_device - allocate struct btrfs_device 638412bd2fc0SIlya Dryomov * @fs_info: used only for generating a new devid, can be NULL if 638512bd2fc0SIlya Dryomov * devid is provided (i.e. @devid != NULL). 638612bd2fc0SIlya Dryomov * @devid: a pointer to devid for this device. If NULL a new devid 638712bd2fc0SIlya Dryomov * is generated. 638812bd2fc0SIlya Dryomov * @uuid: a pointer to UUID for this device. If NULL a new UUID 638912bd2fc0SIlya Dryomov * is generated. 639012bd2fc0SIlya Dryomov * 639112bd2fc0SIlya Dryomov * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 639248dae9cfSDavid Sterba * on error. Returned struct is not linked onto any lists and must be 639348dae9cfSDavid Sterba * destroyed with free_device. 639412bd2fc0SIlya Dryomov */ 639512bd2fc0SIlya Dryomov struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 639612bd2fc0SIlya Dryomov const u64 *devid, 639712bd2fc0SIlya Dryomov const u8 *uuid) 639812bd2fc0SIlya Dryomov { 639912bd2fc0SIlya Dryomov struct btrfs_device *dev; 640012bd2fc0SIlya Dryomov u64 tmp; 640112bd2fc0SIlya Dryomov 6402fae7f21cSDulshani Gunawardhana if (WARN_ON(!devid && !fs_info)) 640312bd2fc0SIlya Dryomov return ERR_PTR(-EINVAL); 640412bd2fc0SIlya Dryomov 640512bd2fc0SIlya Dryomov dev = __alloc_device(); 640612bd2fc0SIlya Dryomov if (IS_ERR(dev)) 640712bd2fc0SIlya Dryomov return dev; 640812bd2fc0SIlya Dryomov 640912bd2fc0SIlya Dryomov if (devid) 641012bd2fc0SIlya Dryomov tmp = *devid; 641112bd2fc0SIlya Dryomov else { 641212bd2fc0SIlya Dryomov int ret; 641312bd2fc0SIlya Dryomov 641412bd2fc0SIlya Dryomov ret = find_next_devid(fs_info, &tmp); 641512bd2fc0SIlya Dryomov if (ret) { 641655de4803SDavid Sterba free_device(dev); 641712bd2fc0SIlya Dryomov return ERR_PTR(ret); 641812bd2fc0SIlya Dryomov } 641912bd2fc0SIlya Dryomov } 642012bd2fc0SIlya Dryomov dev->devid = tmp; 642112bd2fc0SIlya Dryomov 642212bd2fc0SIlya Dryomov if (uuid) 642312bd2fc0SIlya Dryomov memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 642412bd2fc0SIlya Dryomov else 642512bd2fc0SIlya Dryomov generate_random_uuid(dev->uuid); 642612bd2fc0SIlya Dryomov 64279e0af237SLiu Bo btrfs_init_work(&dev->work, btrfs_submit_helper, 64289e0af237SLiu Bo pending_bios_fn, NULL, NULL); 642912bd2fc0SIlya Dryomov 643012bd2fc0SIlya Dryomov return dev; 643112bd2fc0SIlya Dryomov } 643212bd2fc0SIlya Dryomov 6433e06cd3ddSLiu Bo /* Return -EIO if any error, otherwise return 0. 
*/ 64342ff7e61eSJeff Mahoney static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, 6435e06cd3ddSLiu Bo struct extent_buffer *leaf, 6436e06cd3ddSLiu Bo struct btrfs_chunk *chunk, u64 logical) 6437e06cd3ddSLiu Bo { 6438e06cd3ddSLiu Bo u64 length; 6439e06cd3ddSLiu Bo u64 stripe_len; 6440e06cd3ddSLiu Bo u16 num_stripes; 6441e06cd3ddSLiu Bo u16 sub_stripes; 6442e06cd3ddSLiu Bo u64 type; 6443e06cd3ddSLiu Bo 6444e06cd3ddSLiu Bo length = btrfs_chunk_length(leaf, chunk); 6445e06cd3ddSLiu Bo stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6446e06cd3ddSLiu Bo num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6447e06cd3ddSLiu Bo sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6448e06cd3ddSLiu Bo type = btrfs_chunk_type(leaf, chunk); 6449e06cd3ddSLiu Bo 6450e06cd3ddSLiu Bo if (!num_stripes) { 64510b246afaSJeff Mahoney btrfs_err(fs_info, "invalid chunk num_stripes: %u", 6452e06cd3ddSLiu Bo num_stripes); 6453e06cd3ddSLiu Bo return -EIO; 6454e06cd3ddSLiu Bo } 64550b246afaSJeff Mahoney if (!IS_ALIGNED(logical, fs_info->sectorsize)) { 64560b246afaSJeff Mahoney btrfs_err(fs_info, "invalid chunk logical %llu", logical); 6457e06cd3ddSLiu Bo return -EIO; 6458e06cd3ddSLiu Bo } 64590b246afaSJeff Mahoney if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { 64600b246afaSJeff Mahoney btrfs_err(fs_info, "invalid chunk sectorsize %u", 6461e06cd3ddSLiu Bo btrfs_chunk_sector_size(leaf, chunk)); 6462e06cd3ddSLiu Bo return -EIO; 6463e06cd3ddSLiu Bo } 64640b246afaSJeff Mahoney if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { 64650b246afaSJeff Mahoney btrfs_err(fs_info, "invalid chunk length %llu", length); 6466e06cd3ddSLiu Bo return -EIO; 6467e06cd3ddSLiu Bo } 6468e06cd3ddSLiu Bo if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { 64690b246afaSJeff Mahoney btrfs_err(fs_info, "invalid chunk stripe length: %llu", 6470e06cd3ddSLiu Bo stripe_len); 6471e06cd3ddSLiu Bo return -EIO; 6472e06cd3ddSLiu Bo } 6473e06cd3ddSLiu Bo if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & 6474e06cd3ddSLiu Bo type) { 64750b246afaSJeff Mahoney btrfs_err(fs_info, "unrecognized chunk type: %llu", 6476e06cd3ddSLiu Bo ~(BTRFS_BLOCK_GROUP_TYPE_MASK | 6477e06cd3ddSLiu Bo BTRFS_BLOCK_GROUP_PROFILE_MASK) & 6478e06cd3ddSLiu Bo btrfs_chunk_type(leaf, chunk)); 6479e06cd3ddSLiu Bo return -EIO; 6480e06cd3ddSLiu Bo } 6481e06cd3ddSLiu Bo if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || 6482e06cd3ddSLiu Bo (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || 6483e06cd3ddSLiu Bo (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || 6484e06cd3ddSLiu Bo (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || 6485e06cd3ddSLiu Bo (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || 6486e06cd3ddSLiu Bo ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && 6487e06cd3ddSLiu Bo num_stripes != 1)) { 64880b246afaSJeff Mahoney btrfs_err(fs_info, 6489e06cd3ddSLiu Bo "invalid num_stripes:sub_stripes %u:%u for profile %llu", 6490e06cd3ddSLiu Bo num_stripes, sub_stripes, 6491e06cd3ddSLiu Bo type & BTRFS_BLOCK_GROUP_PROFILE_MASK); 6492e06cd3ddSLiu Bo return -EIO; 6493e06cd3ddSLiu Bo } 6494e06cd3ddSLiu Bo 6495e06cd3ddSLiu Bo return 0; 6496e06cd3ddSLiu Bo } 6497e06cd3ddSLiu Bo 64985a2b8e60SAnand Jain static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 64992b902dfcSAnand Jain u64 devid, u8 *uuid, bool error) 65005a2b8e60SAnand Jain { 65012b902dfcSAnand Jain if (error) 65022b902dfcSAnand Jain btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 65032b902dfcSAnand Jain devid, 
uuid); 65042b902dfcSAnand Jain else 65052b902dfcSAnand Jain btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 65062b902dfcSAnand Jain devid, uuid); 65075a2b8e60SAnand Jain } 65085a2b8e60SAnand Jain 65092ff7e61eSJeff Mahoney static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, 65100b86a832SChris Mason struct extent_buffer *leaf, 65110b86a832SChris Mason struct btrfs_chunk *chunk) 65120b86a832SChris Mason { 65130b246afaSJeff Mahoney struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 65140b86a832SChris Mason struct map_lookup *map; 65150b86a832SChris Mason struct extent_map *em; 65160b86a832SChris Mason u64 logical; 65170b86a832SChris Mason u64 length; 65180b86a832SChris Mason u64 devid; 6519a443755fSChris Mason u8 uuid[BTRFS_UUID_SIZE]; 6520593060d7SChris Mason int num_stripes; 65210b86a832SChris Mason int ret; 6522593060d7SChris Mason int i; 65230b86a832SChris Mason 6524e17cade2SChris Mason logical = key->offset; 6525e17cade2SChris Mason length = btrfs_chunk_length(leaf, chunk); 6526f04b772bSQu Wenruo num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6527e06cd3ddSLiu Bo 65282ff7e61eSJeff Mahoney ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical); 6529e06cd3ddSLiu Bo if (ret) 6530e06cd3ddSLiu Bo return ret; 6531a061fc8dSChris Mason 6532890871beSChris Mason read_lock(&map_tree->map_tree.lock); 65330b86a832SChris Mason em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 6534890871beSChris Mason read_unlock(&map_tree->map_tree.lock); 65350b86a832SChris Mason 65360b86a832SChris Mason /* already mapped? */ 65370b86a832SChris Mason if (em && em->start <= logical && em->start + em->len > logical) { 65380b86a832SChris Mason free_extent_map(em); 65390b86a832SChris Mason return 0; 65400b86a832SChris Mason } else if (em) { 65410b86a832SChris Mason free_extent_map(em); 65420b86a832SChris Mason } 65430b86a832SChris Mason 6544172ddd60SDavid Sterba em = alloc_extent_map(); 65450b86a832SChris Mason if (!em) 65460b86a832SChris Mason return -ENOMEM; 6547593060d7SChris Mason map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 65480b86a832SChris Mason if (!map) { 65490b86a832SChris Mason free_extent_map(em); 65500b86a832SChris Mason return -ENOMEM; 65510b86a832SChris Mason } 65520b86a832SChris Mason 6553298a8f9cSWang Shilong set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 655495617d69SJeff Mahoney em->map_lookup = map; 65550b86a832SChris Mason em->start = logical; 65560b86a832SChris Mason em->len = length; 655770c8a91cSJosef Bacik em->orig_start = 0; 65580b86a832SChris Mason em->block_start = 0; 6559c8b97818SChris Mason em->block_len = em->len; 65600b86a832SChris Mason 6561593060d7SChris Mason map->num_stripes = num_stripes; 6562593060d7SChris Mason map->io_width = btrfs_chunk_io_width(leaf, chunk); 6563593060d7SChris Mason map->io_align = btrfs_chunk_io_align(leaf, chunk); 6564593060d7SChris Mason map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6565593060d7SChris Mason map->type = btrfs_chunk_type(leaf, chunk); 6566321aecc6SChris Mason map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6567593060d7SChris Mason for (i = 0; i < num_stripes; i++) { 6568593060d7SChris Mason map->stripes[i].physical = 6569593060d7SChris Mason btrfs_stripe_offset_nr(leaf, chunk, i); 6570593060d7SChris Mason devid = btrfs_stripe_devid_nr(leaf, chunk, i); 6571a443755fSChris Mason read_extent_buffer(leaf, uuid, (unsigned long) 6572a443755fSChris Mason btrfs_stripe_dev_uuid_nr(chunk, i), 6573a443755fSChris Mason BTRFS_UUID_SIZE); 65740b246afaSJeff Mahoney 
map->stripes[i].dev = btrfs_find_device(fs_info, devid, 6575aa1b8cd4SStefan Behrens uuid, NULL); 65763cdde224SJeff Mahoney if (!map->stripes[i].dev && 65770b246afaSJeff Mahoney !btrfs_test_opt(fs_info, DEGRADED)) { 6578dfe25020SChris Mason free_extent_map(em); 65792b902dfcSAnand Jain btrfs_report_missing_device(fs_info, devid, uuid, true); 658045dbdbc9SAnand Jain return -ENOENT; 6581dfe25020SChris Mason } 6582dfe25020SChris Mason if (!map->stripes[i].dev) { 6583dfe25020SChris Mason map->stripes[i].dev = 65842ff7e61eSJeff Mahoney add_missing_dev(fs_info->fs_devices, devid, 65852ff7e61eSJeff Mahoney uuid); 6586adfb69afSAnand Jain if (IS_ERR(map->stripes[i].dev)) { 65870b86a832SChris Mason free_extent_map(em); 6588adfb69afSAnand Jain btrfs_err(fs_info, 6589adfb69afSAnand Jain "failed to init missing dev %llu: %ld", 6590adfb69afSAnand Jain devid, PTR_ERR(map->stripes[i].dev)); 6591adfb69afSAnand Jain return PTR_ERR(map->stripes[i].dev); 65920b86a832SChris Mason } 65932b902dfcSAnand Jain btrfs_report_missing_device(fs_info, devid, uuid, false); 6594593060d7SChris Mason } 6595e12c9621SAnand Jain set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 6596e12c9621SAnand Jain &(map->stripes[i].dev->dev_state)); 6597e12c9621SAnand Jain 6598dfe25020SChris Mason } 65990b86a832SChris Mason 6600890871beSChris Mason write_lock(&map_tree->map_tree.lock); 660109a2a8f9SJosef Bacik ret = add_extent_mapping(&map_tree->map_tree, em, 0); 6602890871beSChris Mason write_unlock(&map_tree->map_tree.lock); 660379787eaaSJeff Mahoney BUG_ON(ret); /* Tree corruption */ 66040b86a832SChris Mason free_extent_map(em); 66050b86a832SChris Mason 66060b86a832SChris Mason return 0; 66070b86a832SChris Mason } 66080b86a832SChris Mason 6609143bede5SJeff Mahoney static void fill_device_from_item(struct extent_buffer *leaf, 66100b86a832SChris Mason struct btrfs_dev_item *dev_item, 66110b86a832SChris Mason struct btrfs_device *device) 66120b86a832SChris Mason { 66130b86a832SChris Mason unsigned long ptr; 66140b86a832SChris Mason 66150b86a832SChris Mason device->devid = btrfs_device_id(leaf, dev_item); 6616d6397baeSChris Ball device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 6617d6397baeSChris Ball device->total_bytes = device->disk_total_bytes; 6618935e5cc9SMiao Xie device->commit_total_bytes = device->disk_total_bytes; 66190b86a832SChris Mason device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 6620ce7213c7SMiao Xie device->commit_bytes_used = device->bytes_used; 66210b86a832SChris Mason device->type = btrfs_device_type(leaf, dev_item); 66220b86a832SChris Mason device->io_align = btrfs_device_io_align(leaf, dev_item); 66230b86a832SChris Mason device->io_width = btrfs_device_io_width(leaf, dev_item); 66240b86a832SChris Mason device->sector_size = btrfs_device_sector_size(leaf, dev_item); 66258dabb742SStefan Behrens WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 6626401e29c1SAnand Jain clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 66270b86a832SChris Mason 6628410ba3a2SGeert Uytterhoeven ptr = btrfs_device_uuid(dev_item); 6629e17cade2SChris Mason read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 66300b86a832SChris Mason } 66310b86a832SChris Mason 66322ff7e61eSJeff Mahoney static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 66335f375835SMiao Xie u8 *fsid) 66342b82032cSYan Zheng { 66352b82032cSYan Zheng struct btrfs_fs_devices *fs_devices; 66362b82032cSYan Zheng int ret; 66372b82032cSYan Zheng 6638b367e47fSLi Zefan BUG_ON(!mutex_is_locked(&uuid_mutex)); 
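	/*
	 * Look for an already-chained seed fs_devices matching @fsid first;
	 * otherwise clone and open the devices returned by find_fsid() (or
	 * allocate a bare, seeding fs_devices for DEGRADED mounts) and chain
	 * the result onto fs_info->fs_devices->seed below.
	 */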
66392dfeca9bSDavid Sterba ASSERT(fsid); 66402b82032cSYan Zheng 66410b246afaSJeff Mahoney fs_devices = fs_info->fs_devices->seed; 66422b82032cSYan Zheng while (fs_devices) { 664344880fdcSAnand Jain if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 66445f375835SMiao Xie return fs_devices; 66455f375835SMiao Xie 66462b82032cSYan Zheng fs_devices = fs_devices->seed; 66472b82032cSYan Zheng } 66482b82032cSYan Zheng 66492b82032cSYan Zheng fs_devices = find_fsid(fsid); 66502b82032cSYan Zheng if (!fs_devices) { 66510b246afaSJeff Mahoney if (!btrfs_test_opt(fs_info, DEGRADED)) 66525f375835SMiao Xie return ERR_PTR(-ENOENT); 66535f375835SMiao Xie 66545f375835SMiao Xie fs_devices = alloc_fs_devices(fsid); 66555f375835SMiao Xie if (IS_ERR(fs_devices)) 66565f375835SMiao Xie return fs_devices; 66575f375835SMiao Xie 66585f375835SMiao Xie fs_devices->seeding = 1; 66595f375835SMiao Xie fs_devices->opened = 1; 66605f375835SMiao Xie return fs_devices; 66612b82032cSYan Zheng } 6662e4404d6eSYan Zheng 6663e4404d6eSYan Zheng fs_devices = clone_fs_devices(fs_devices); 66645f375835SMiao Xie if (IS_ERR(fs_devices)) 66655f375835SMiao Xie return fs_devices; 66662b82032cSYan Zheng 666797288f2cSChristoph Hellwig ret = __btrfs_open_devices(fs_devices, FMODE_READ, 66680b246afaSJeff Mahoney fs_info->bdev_holder); 666948d28232SJulia Lawall if (ret) { 667048d28232SJulia Lawall free_fs_devices(fs_devices); 66715f375835SMiao Xie fs_devices = ERR_PTR(ret); 66722b82032cSYan Zheng goto out; 667348d28232SJulia Lawall } 66742b82032cSYan Zheng 66752b82032cSYan Zheng if (!fs_devices->seeding) { 66762b82032cSYan Zheng __btrfs_close_devices(fs_devices); 6677e4404d6eSYan Zheng free_fs_devices(fs_devices); 66785f375835SMiao Xie fs_devices = ERR_PTR(-EINVAL); 66792b82032cSYan Zheng goto out; 66802b82032cSYan Zheng } 66812b82032cSYan Zheng 66820b246afaSJeff Mahoney fs_devices->seed = fs_info->fs_devices->seed; 66830b246afaSJeff Mahoney fs_info->fs_devices->seed = fs_devices; 66842b82032cSYan Zheng out: 66855f375835SMiao Xie return fs_devices; 66862b82032cSYan Zheng } 66872b82032cSYan Zheng 66882ff7e61eSJeff Mahoney static int read_one_dev(struct btrfs_fs_info *fs_info, 66890b86a832SChris Mason struct extent_buffer *leaf, 66900b86a832SChris Mason struct btrfs_dev_item *dev_item) 66910b86a832SChris Mason { 66920b246afaSJeff Mahoney struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 66930b86a832SChris Mason struct btrfs_device *device; 66940b86a832SChris Mason u64 devid; 66950b86a832SChris Mason int ret; 669644880fdcSAnand Jain u8 fs_uuid[BTRFS_FSID_SIZE]; 6697a443755fSChris Mason u8 dev_uuid[BTRFS_UUID_SIZE]; 6698a443755fSChris Mason 66990b86a832SChris Mason devid = btrfs_device_id(leaf, dev_item); 6700410ba3a2SGeert Uytterhoeven read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 6701a443755fSChris Mason BTRFS_UUID_SIZE); 67021473b24eSGeert Uytterhoeven read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 670344880fdcSAnand Jain BTRFS_FSID_SIZE); 67042b82032cSYan Zheng 670544880fdcSAnand Jain if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) { 67062ff7e61eSJeff Mahoney fs_devices = open_seed_devices(fs_info, fs_uuid); 67075f375835SMiao Xie if (IS_ERR(fs_devices)) 67085f375835SMiao Xie return PTR_ERR(fs_devices); 67092b82032cSYan Zheng } 67102b82032cSYan Zheng 67110b246afaSJeff Mahoney device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 67125f375835SMiao Xie if (!device) { 6713c5502451SQu Wenruo if (!btrfs_test_opt(fs_info, DEGRADED)) { 67142b902dfcSAnand Jain btrfs_report_missing_device(fs_info, 
devid, 67152b902dfcSAnand Jain dev_uuid, true); 671645dbdbc9SAnand Jain return -ENOENT; 6717c5502451SQu Wenruo } 67182b82032cSYan Zheng 67192ff7e61eSJeff Mahoney device = add_missing_dev(fs_devices, devid, dev_uuid); 6720adfb69afSAnand Jain if (IS_ERR(device)) { 6721adfb69afSAnand Jain btrfs_err(fs_info, 6722adfb69afSAnand Jain "failed to add missing dev %llu: %ld", 6723adfb69afSAnand Jain devid, PTR_ERR(device)); 6724adfb69afSAnand Jain return PTR_ERR(device); 6725adfb69afSAnand Jain } 67262b902dfcSAnand Jain btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 67275f375835SMiao Xie } else { 6728c5502451SQu Wenruo if (!device->bdev) { 67292b902dfcSAnand Jain if (!btrfs_test_opt(fs_info, DEGRADED)) { 67302b902dfcSAnand Jain btrfs_report_missing_device(fs_info, 67312b902dfcSAnand Jain devid, dev_uuid, true); 673245dbdbc9SAnand Jain return -ENOENT; 6733c5502451SQu Wenruo } 67342b902dfcSAnand Jain btrfs_report_missing_device(fs_info, devid, 67352b902dfcSAnand Jain dev_uuid, false); 67362b902dfcSAnand Jain } 67375f375835SMiao Xie 6738e6e674bdSAnand Jain if (!device->bdev && 6739e6e674bdSAnand Jain !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 6740cd02dca5SChris Mason /* 6741cd02dca5SChris Mason * this happens when a device that was properly setup 6742cd02dca5SChris Mason * in the device info lists suddenly goes bad. 6743cd02dca5SChris Mason * device->bdev is NULL, and so we have to set 6744cd02dca5SChris Mason * device->missing to one here 6745cd02dca5SChris Mason */ 67465f375835SMiao Xie device->fs_devices->missing_devices++; 6747e6e674bdSAnand Jain set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 67486324fbf3SChris Mason } 67495f375835SMiao Xie 67505f375835SMiao Xie /* Move the device to its own fs_devices */ 67515f375835SMiao Xie if (device->fs_devices != fs_devices) { 6752e6e674bdSAnand Jain ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, 6753e6e674bdSAnand Jain &device->dev_state)); 67545f375835SMiao Xie 67555f375835SMiao Xie list_move(&device->dev_list, &fs_devices->devices); 67565f375835SMiao Xie device->fs_devices->num_devices--; 67575f375835SMiao Xie fs_devices->num_devices++; 67585f375835SMiao Xie 67595f375835SMiao Xie device->fs_devices->missing_devices--; 67605f375835SMiao Xie fs_devices->missing_devices++; 67615f375835SMiao Xie 67625f375835SMiao Xie device->fs_devices = fs_devices; 67635f375835SMiao Xie } 67642b82032cSYan Zheng } 67652b82032cSYan Zheng 67660b246afaSJeff Mahoney if (device->fs_devices != fs_info->fs_devices) { 6767ebbede42SAnand Jain BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); 67682b82032cSYan Zheng if (device->generation != 67692b82032cSYan Zheng btrfs_device_generation(leaf, dev_item)) 67702b82032cSYan Zheng return -EINVAL; 67712b82032cSYan Zheng } 67720b86a832SChris Mason 67730b86a832SChris Mason fill_device_from_item(leaf, dev_item, device); 6774e12c9621SAnand Jain set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 6775ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 6776401e29c1SAnand Jain !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 67772b82032cSYan Zheng device->fs_devices->total_rw_bytes += device->total_bytes; 6778a5ed45f8SNikolay Borisov atomic64_add(device->total_bytes - device->bytes_used, 6779a5ed45f8SNikolay Borisov &fs_info->free_chunk_space); 67802bf64758SJosef Bacik } 67810b86a832SChris Mason ret = 0; 67820b86a832SChris Mason return ret; 67830b86a832SChris Mason } 67840b86a832SChris Mason 67856bccf3abSJeff Mahoney int btrfs_read_sys_array(struct 
btrfs_fs_info *fs_info) 67860b86a832SChris Mason { 67876bccf3abSJeff Mahoney struct btrfs_root *root = fs_info->tree_root; 6788ab8d0fc4SJeff Mahoney struct btrfs_super_block *super_copy = fs_info->super_copy; 6789a061fc8dSChris Mason struct extent_buffer *sb; 67900b86a832SChris Mason struct btrfs_disk_key *disk_key; 67910b86a832SChris Mason struct btrfs_chunk *chunk; 67921ffb22cfSDavid Sterba u8 *array_ptr; 67931ffb22cfSDavid Sterba unsigned long sb_array_offset; 679484eed90fSChris Mason int ret = 0; 67950b86a832SChris Mason u32 num_stripes; 67960b86a832SChris Mason u32 array_size; 67970b86a832SChris Mason u32 len = 0; 67981ffb22cfSDavid Sterba u32 cur_offset; 6799e06cd3ddSLiu Bo u64 type; 680084eed90fSChris Mason struct btrfs_key key; 68010b86a832SChris Mason 68020b246afaSJeff Mahoney ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 6803a83fffb7SDavid Sterba /* 6804a83fffb7SDavid Sterba * This will create extent buffer of nodesize, superblock size is 6805a83fffb7SDavid Sterba * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will 6806a83fffb7SDavid Sterba * overallocate but we can keep it as-is, only the first page is used. 6807a83fffb7SDavid Sterba */ 68082ff7e61eSJeff Mahoney sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET); 6809c871b0f2SLiu Bo if (IS_ERR(sb)) 6810c871b0f2SLiu Bo return PTR_ERR(sb); 68114db8c528SDavid Sterba set_extent_buffer_uptodate(sb); 681285d4e461SChris Mason btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); 68138a334426SDavid Sterba /* 681401327610SNicholas D Steeves * The sb extent buffer is artificial and just used to read the system array. 68154db8c528SDavid Sterba * set_extent_buffer_uptodate() call does not properly mark all it's 68168a334426SDavid Sterba * pages up-to-date when the page is larger: extent does not cover the 68178a334426SDavid Sterba * whole page and consequently check_page_uptodate does not find all 68188a334426SDavid Sterba * the page's extents up-to-date (the hole beyond sb), 68198a334426SDavid Sterba * write_extent_buffer then triggers a WARN_ON. 68208a334426SDavid Sterba * 68218a334426SDavid Sterba * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, 68228a334426SDavid Sterba * but sb spans only this function. Add an explicit SetPageUptodate call 68238a334426SDavid Sterba * to silence the warning eg. on PowerPC 64. 68248a334426SDavid Sterba */ 682509cbfeafSKirill A. 
Shutemov if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 6826727011e0SChris Mason SetPageUptodate(sb->pages[0]); 68274008c04aSChris Mason 6828a061fc8dSChris Mason write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 68290b86a832SChris Mason array_size = btrfs_super_sys_array_size(super_copy); 68300b86a832SChris Mason 68311ffb22cfSDavid Sterba array_ptr = super_copy->sys_chunk_array; 68321ffb22cfSDavid Sterba sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 68331ffb22cfSDavid Sterba cur_offset = 0; 68340b86a832SChris Mason 68351ffb22cfSDavid Sterba while (cur_offset < array_size) { 68361ffb22cfSDavid Sterba disk_key = (struct btrfs_disk_key *)array_ptr; 6837e3540eabSDavid Sterba len = sizeof(*disk_key); 6838e3540eabSDavid Sterba if (cur_offset + len > array_size) 6839e3540eabSDavid Sterba goto out_short_read; 6840e3540eabSDavid Sterba 68410b86a832SChris Mason btrfs_disk_key_to_cpu(&key, disk_key); 68420b86a832SChris Mason 68431ffb22cfSDavid Sterba array_ptr += len; 68441ffb22cfSDavid Sterba sb_array_offset += len; 68451ffb22cfSDavid Sterba cur_offset += len; 68460b86a832SChris Mason 68470d81ba5dSChris Mason if (key.type == BTRFS_CHUNK_ITEM_KEY) { 68481ffb22cfSDavid Sterba chunk = (struct btrfs_chunk *)sb_array_offset; 6849e3540eabSDavid Sterba /* 6850e3540eabSDavid Sterba * At least one btrfs_chunk with one stripe must be 6851e3540eabSDavid Sterba * present, exact stripe count check comes afterwards 6852e3540eabSDavid Sterba */ 6853e3540eabSDavid Sterba len = btrfs_chunk_item_size(1); 6854e3540eabSDavid Sterba if (cur_offset + len > array_size) 6855e3540eabSDavid Sterba goto out_short_read; 6856e3540eabSDavid Sterba 6857e3540eabSDavid Sterba num_stripes = btrfs_chunk_num_stripes(sb, chunk); 6858f5cdedd7SDavid Sterba if (!num_stripes) { 6859ab8d0fc4SJeff Mahoney btrfs_err(fs_info, 6860ab8d0fc4SJeff Mahoney "invalid number of stripes %u in sys_array at offset %u", 6861f5cdedd7SDavid Sterba num_stripes, cur_offset); 6862f5cdedd7SDavid Sterba ret = -EIO; 6863f5cdedd7SDavid Sterba break; 6864f5cdedd7SDavid Sterba } 6865f5cdedd7SDavid Sterba 6866e06cd3ddSLiu Bo type = btrfs_chunk_type(sb, chunk); 6867e06cd3ddSLiu Bo if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 6868ab8d0fc4SJeff Mahoney btrfs_err(fs_info, 6869e06cd3ddSLiu Bo "invalid chunk type %llu in sys_array at offset %u", 6870e06cd3ddSLiu Bo type, cur_offset); 6871e06cd3ddSLiu Bo ret = -EIO; 6872e06cd3ddSLiu Bo break; 6873e06cd3ddSLiu Bo } 6874e06cd3ddSLiu Bo 6875e3540eabSDavid Sterba len = btrfs_chunk_item_size(num_stripes); 6876e3540eabSDavid Sterba if (cur_offset + len > array_size) 6877e3540eabSDavid Sterba goto out_short_read; 6878e3540eabSDavid Sterba 68792ff7e61eSJeff Mahoney ret = read_one_chunk(fs_info, &key, sb, chunk); 688084eed90fSChris Mason if (ret) 688184eed90fSChris Mason break; 68820b86a832SChris Mason } else { 6883ab8d0fc4SJeff Mahoney btrfs_err(fs_info, 6884ab8d0fc4SJeff Mahoney "unexpected item type %u in sys_array at offset %u", 688593a3d467SDavid Sterba (u32)key.type, cur_offset); 688684eed90fSChris Mason ret = -EIO; 688784eed90fSChris Mason break; 68880b86a832SChris Mason } 68891ffb22cfSDavid Sterba array_ptr += len; 68901ffb22cfSDavid Sterba sb_array_offset += len; 68911ffb22cfSDavid Sterba cur_offset += len; 68920b86a832SChris Mason } 6893d865177aSLiu Bo clear_extent_buffer_uptodate(sb); 68941c8b5b6eSLiu Bo free_extent_buffer_stale(sb); 689584eed90fSChris Mason return ret; 6896e3540eabSDavid Sterba 6897e3540eabSDavid Sterba out_short_read: 6898ab8d0fc4SJeff Mahoney btrfs_err(fs_info, "sys_array 
too short to read %u bytes at offset %u", 6899e3540eabSDavid Sterba len, cur_offset); 6900d865177aSLiu Bo clear_extent_buffer_uptodate(sb); 69011c8b5b6eSLiu Bo free_extent_buffer_stale(sb); 6902e3540eabSDavid Sterba return -EIO; 69030b86a832SChris Mason } 69040b86a832SChris Mason 690521634a19SQu Wenruo /* 690621634a19SQu Wenruo * Check if all chunks in the fs are OK for read-write degraded mount 690721634a19SQu Wenruo * 69086528b99dSAnand Jain * If the @failing_dev is specified, it's accounted as missing. 69096528b99dSAnand Jain * 691021634a19SQu Wenruo * Return true if all chunks meet the minimal RW mount requirements. 691121634a19SQu Wenruo * Return false if any chunk doesn't meet the minimal RW mount requirements. 691221634a19SQu Wenruo */ 69136528b99dSAnand Jain bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 69146528b99dSAnand Jain struct btrfs_device *failing_dev) 691521634a19SQu Wenruo { 691621634a19SQu Wenruo struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 691721634a19SQu Wenruo struct extent_map *em; 691821634a19SQu Wenruo u64 next_start = 0; 691921634a19SQu Wenruo bool ret = true; 692021634a19SQu Wenruo 692121634a19SQu Wenruo read_lock(&map_tree->map_tree.lock); 692221634a19SQu Wenruo em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1); 692321634a19SQu Wenruo read_unlock(&map_tree->map_tree.lock); 692421634a19SQu Wenruo /* No chunk at all? Return false anyway */ 692521634a19SQu Wenruo if (!em) { 692621634a19SQu Wenruo ret = false; 692721634a19SQu Wenruo goto out; 692821634a19SQu Wenruo } 692921634a19SQu Wenruo while (em) { 693021634a19SQu Wenruo struct map_lookup *map; 693121634a19SQu Wenruo int missing = 0; 693221634a19SQu Wenruo int max_tolerated; 693321634a19SQu Wenruo int i; 693421634a19SQu Wenruo 693521634a19SQu Wenruo map = em->map_lookup; 693621634a19SQu Wenruo max_tolerated = 693721634a19SQu Wenruo btrfs_get_num_tolerated_disk_barrier_failures( 693821634a19SQu Wenruo map->type); 693921634a19SQu Wenruo for (i = 0; i < map->num_stripes; i++) { 694021634a19SQu Wenruo struct btrfs_device *dev = map->stripes[i].dev; 694121634a19SQu Wenruo 6942e6e674bdSAnand Jain if (!dev || !dev->bdev || 6943e6e674bdSAnand Jain test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 694421634a19SQu Wenruo dev->last_flush_error) 694521634a19SQu Wenruo missing++; 69466528b99dSAnand Jain else if (failing_dev && failing_dev == dev) 69476528b99dSAnand Jain missing++; 694821634a19SQu Wenruo } 694921634a19SQu Wenruo if (missing > max_tolerated) { 69506528b99dSAnand Jain if (!failing_dev) 695121634a19SQu Wenruo btrfs_warn(fs_info, 695221634a19SQu Wenruo "chunk %llu missing %d devices, max tolerance is %d for writeable mount", 695321634a19SQu Wenruo em->start, missing, max_tolerated); 695421634a19SQu Wenruo free_extent_map(em); 695521634a19SQu Wenruo ret = false; 695621634a19SQu Wenruo goto out; 695721634a19SQu Wenruo } 695821634a19SQu Wenruo next_start = extent_map_end(em); 695921634a19SQu Wenruo free_extent_map(em); 696021634a19SQu Wenruo 696121634a19SQu Wenruo read_lock(&map_tree->map_tree.lock); 696221634a19SQu Wenruo em = lookup_extent_mapping(&map_tree->map_tree, next_start, 696321634a19SQu Wenruo (u64)(-1) - next_start); 696421634a19SQu Wenruo read_unlock(&map_tree->map_tree.lock); 696521634a19SQu Wenruo } 696621634a19SQu Wenruo out: 696721634a19SQu Wenruo return ret; 696821634a19SQu Wenruo } 696921634a19SQu Wenruo 69705b4aacefSJeff Mahoney int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 69710b86a832SChris Mason { 69725b4aacefSJeff Mahoney struct 
btrfs_root *root = fs_info->chunk_root; 69730b86a832SChris Mason struct btrfs_path *path; 69740b86a832SChris Mason struct extent_buffer *leaf; 69750b86a832SChris Mason struct btrfs_key key; 69760b86a832SChris Mason struct btrfs_key found_key; 69770b86a832SChris Mason int ret; 69780b86a832SChris Mason int slot; 697999e3ecfcSLiu Bo u64 total_dev = 0; 69800b86a832SChris Mason 69810b86a832SChris Mason path = btrfs_alloc_path(); 69820b86a832SChris Mason if (!path) 69830b86a832SChris Mason return -ENOMEM; 69840b86a832SChris Mason 6985b367e47fSLi Zefan mutex_lock(&uuid_mutex); 698634441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 6987b367e47fSLi Zefan 6988395927a9SFilipe David Borba Manana /* 6989395927a9SFilipe David Borba Manana * Read all device items, and then all the chunk items. All 6990395927a9SFilipe David Borba Manana * device items are found before any chunk item (their object id 6991395927a9SFilipe David Borba Manana * is smaller than the lowest possible object id for a chunk 6992395927a9SFilipe David Borba Manana * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 69930b86a832SChris Mason */ 69940b86a832SChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 69950b86a832SChris Mason key.offset = 0; 69960b86a832SChris Mason key.type = 0; 69970b86a832SChris Mason ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6998ab59381eSZhao Lei if (ret < 0) 6999ab59381eSZhao Lei goto error; 70000b86a832SChris Mason while (1) { 70010b86a832SChris Mason leaf = path->nodes[0]; 70020b86a832SChris Mason slot = path->slots[0]; 70030b86a832SChris Mason if (slot >= btrfs_header_nritems(leaf)) { 70040b86a832SChris Mason ret = btrfs_next_leaf(root, path); 70050b86a832SChris Mason if (ret == 0) 70060b86a832SChris Mason continue; 70070b86a832SChris Mason if (ret < 0) 70080b86a832SChris Mason goto error; 70090b86a832SChris Mason break; 70100b86a832SChris Mason } 70110b86a832SChris Mason btrfs_item_key_to_cpu(leaf, &found_key, slot); 70120b86a832SChris Mason if (found_key.type == BTRFS_DEV_ITEM_KEY) { 70130b86a832SChris Mason struct btrfs_dev_item *dev_item; 70140b86a832SChris Mason dev_item = btrfs_item_ptr(leaf, slot, 70150b86a832SChris Mason struct btrfs_dev_item); 70162ff7e61eSJeff Mahoney ret = read_one_dev(fs_info, leaf, dev_item); 70172b82032cSYan Zheng if (ret) 70182b82032cSYan Zheng goto error; 701999e3ecfcSLiu Bo total_dev++; 70200b86a832SChris Mason } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 70210b86a832SChris Mason struct btrfs_chunk *chunk; 70220b86a832SChris Mason chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 70232ff7e61eSJeff Mahoney ret = read_one_chunk(fs_info, &found_key, leaf, chunk); 70242b82032cSYan Zheng if (ret) 70252b82032cSYan Zheng goto error; 70260b86a832SChris Mason } 70270b86a832SChris Mason path->slots[0]++; 70280b86a832SChris Mason } 702999e3ecfcSLiu Bo 703099e3ecfcSLiu Bo /* 703199e3ecfcSLiu Bo * After loading chunk tree, we've got all device information, 703299e3ecfcSLiu Bo * do another round of validation checks. 
	/*
	 * After loading chunk tree, we've got all device information,
	 * do another round of validation checks.
	 */
	if (total_dev != fs_info->fs_devices->total_devices) {
		btrfs_err(fs_info,
	   "super_num_devices %llu mismatch with num_devices %llu found here",
			  btrfs_super_num_devices(fs_info->super_copy),
			  total_dev);
		ret = -EINVAL;
		goto error;
	}
	if (btrfs_super_total_bytes(fs_info->super_copy) <
	    fs_info->fs_devices->total_rw_bytes) {
		btrfs_err(fs_info,
	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
			  btrfs_super_total_bytes(fs_info->super_copy),
			  fs_info->fs_devices->total_rw_bytes);
		ret = -EINVAL;
		goto error;
	}
	ret = 0;
error:
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}
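/*
 * Editorial note (added by the editor, not part of the original source): the
 * single btrfs_search_slot() in btrfs_read_chunk_tree() can walk the device
 * items and then the chunk items in one forward scan because chunk tree keys
 * sort by (objectid, type, offset) and BTRFS_DEV_ITEMS_OBJECTID (1) is
 * smaller than BTRFS_FIRST_CHUNK_TREE_OBJECTID (256).  Roughly, the key space
 * the loop walks looks like:
 *
 *	(BTRFS_DEV_ITEMS_OBJECTID,        BTRFS_DEV_ITEM_KEY,   devid)   ...
 *	(BTRFS_FIRST_CHUNK_TREE_OBJECTID, BTRFS_CHUNK_ITEM_KEY, logical) ...
 */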
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;

	while (fs_devices) {
		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry(device, &fs_devices->devices, dev_list)
			device->fs_info = fs_info;
		mutex_unlock(&fs_devices->device_list_mutex);

		fs_devices = fs_devices->seed;
	}
}

static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_dev_stat_reset(dev, i);
}

int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct extent_buffer *eb;
	int slot;
	int ret = 0;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int i;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		int item_size;
		struct btrfs_dev_stats_item *ptr;

		key.objectid = BTRFS_DEV_STATS_OBJECTID;
		key.type = BTRFS_PERSISTENT_ITEM_KEY;
		key.offset = device->devid;
		ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
		if (ret) {
			__btrfs_reset_dev_stats(device);
			device->dev_stats_valid = 1;
			btrfs_release_path(path);
			continue;
		}
		slot = path->slots[0];
		eb = path->nodes[0];
		btrfs_item_key_to_cpu(eb, &found_key, slot);
		item_size = btrfs_item_size_nr(eb, slot);

		ptr = btrfs_item_ptr(eb, slot,
				     struct btrfs_dev_stats_item);

		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (item_size >= (1 + i) * sizeof(__le64))
				btrfs_dev_stat_set(device, i,
					btrfs_dev_stats_value(eb, ptr, i));
			else
				btrfs_dev_stat_reset(device, i);
		}

		device->dev_stats_valid = 1;
		btrfs_dev_stat_print_on_load(device);
		btrfs_release_path(path);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}
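/*
 * Editorial note (added by the editor, not part of the original source): the
 * persistent item read in btrfs_init_dev_stats() lives in the device tree
 * under the key (BTRFS_DEV_STATS_OBJECTID, BTRFS_PERSISTENT_ITEM_KEY, devid)
 * and its payload is, roughly, just an array of little-endian 64-bit
 * counters, one per BTRFS_DEV_STAT_* index:
 *
 *	struct btrfs_dev_stats_item {
 *		__le64 values[BTRFS_DEV_STAT_VALUES_MAX];
 *	} __attribute__ ((__packed__));
 *
 * The item_size check above is what lets a newer kernel read an older,
 * shorter item: counters that were never stored are simply reset to zero.
 */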
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info,
				struct btrfs_device *device)
{
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}
/*
 * Called from commit_transaction. Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
			struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;

		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values, which
		 * requires reading the in-memory counters. Such control
		 * dependencies require explicit read memory barriers.
		 *
		 * This memory barrier pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stats_read_and_reset.
		 */
		smp_rmb();

		ret = update_dev_stat_item(trans, fs_info, device);
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}
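/*
 * Editorial sketch (added by the editor, not part of the original source):
 * the smp_rmb() in btrfs_run_dev_stats() pairs with the write side in the
 * dev-stat helpers.  Very roughly, assuming the inline helpers in volumes.h
 * look like the sketch below, the counter update is ordered before the ccnt
 * bump, so a reader that sees a non-zero ccnt also sees the new counter:
 *
 *	// write side, e.g. btrfs_dev_stat_inc() (sketch)
 *	atomic_inc(dev->dev_stat_values + index);	// update the counter
 *	smp_mb__before_atomic();			// order it before...
 *	atomic_inc(&dev->dev_stats_ccnt);		// ...marking stats dirty
 *
 *	// read side, btrfs_run_dev_stats() above
 *	stats_cnt = atomic_read(&device->dev_stats_ccnt);	// dirty?
 *	smp_rmb();						// then the
 *	ret = update_dev_stat_item(trans, fs_info, device);	// reads follow
 */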
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
		rcu_str_deref(dev->name),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_reset(dev, i);
		}
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}
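/*
 * Editorial sketch (added by the editor, not part of the original source):
 * btrfs_get_dev_stats() backs the BTRFS_IOC_GET_DEV_STATS ioctl, which is
 * what "btrfs device stats" uses.  A minimal userspace caller might look
 * roughly like this (error handling omitted; the fd is any file or directory
 * on the mounted filesystem):
 *
 *	#include <fcntl.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/btrfs.h>
 *
 *	struct btrfs_ioctl_get_dev_stats args = { 0 };
 *	int fd = open("/mnt", O_RDONLY);
 *
 *	args.devid = 1;			// device to query
 *	args.nr_items = BTRFS_DEV_STAT_VALUES_MAX;
 *	args.flags = 0;			// or BTRFS_DEV_STATS_RESET to clear
 *	ioctl(fd, BTRFS_IOC_GET_DEV_STATS, &args);
 *	// args.values[BTRFS_DEV_STAT_WRITE_ERRS] etc. now hold the counters
 */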
void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
{
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	int copy_num;

	if (!bdev)
		return;

	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
	     copy_num++) {

		if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
			continue;

		disk_super = (struct btrfs_super_block *)bh->b_data;

		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
		set_buffer_dirty(bh);
		sync_dirty_buffer(bh);
		brelse(bh);
	}

	/* Notify udev that device has changed */
	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);

	/* Update ctime/mtime for device path for libblkid */
	update_dev_time(device_path);
}

/*
 * Update the size of all devices, which is used for writing out the
 * super blocks.
 */
void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *curr, *next;

	if (list_empty(&fs_devices->resized_devices))
		return;

	mutex_lock(&fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
				 resized_list) {
		list_del_init(&curr->resized_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
	}
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_devices->device_list_mutex);
}

/* Must be invoked during the transaction commit */
void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
					   struct btrfs_transaction *transaction)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	int i;

	if (list_empty(&transaction->pending_chunks))
		return;

	/* In order to kick the device replace finish process */
	mutex_lock(&fs_info->chunk_mutex);
	list_for_each_entry(em, &transaction->pending_chunks, list) {
		map = em->map_lookup;

		for (i = 0; i < map->num_stripes; i++) {
			dev = map->stripes[i].dev;
			dev->commit_bytes_used = dev->bytes_used;
		}
	}
	mutex_unlock(&fs_info->chunk_mutex);
}

void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	while (fs_devices) {
		fs_devices->fs_info = fs_info;
		fs_devices = fs_devices->seed;
	}
}
void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	while (fs_devices) {
		fs_devices->fs_info = NULL;
		fs_devices = fs_devices->seed;
	}
}