// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "raid10",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "raid1",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 3,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 3,
		.ncopies	= 3,
		.nparity	= 0,
		.raid_name	= "raid1c3",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 4,
		.devs_min	= 4,
		.tolerated_failures = 3,
		.devs_increment	= 4,
		.ncopies	= 4,
		.nparity	= 0,
		.raid_name	= "raid1c4",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "dup",
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 0,
		.raid_name	= "raid0",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 0,
		.raid_name	= "single",
		.bg_flag	= 0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 1,
		.raid_name	= "raid5",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 2,
		.raid_name	= "raid6",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};

const char *btrfs_bg_type_to_raid_name(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	if (index >= BTRFS_NR_RAID_TYPES)
		return NULL;

	return btrfs_raid_array[index].raid_name;
}

/*
 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 * bytes including terminating null byte.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
	int i;
	int ret;
	char *bp = buf;
	u64 flags = bg_flags;
	u32 size_bp = size_buf;

	if (!flags) {
		strcpy(bp, "NONE");
		return;
	}

#define DESCRIBE_FLAG(flag, desc)					\
	do {								\
		if (flags & (flag)) {					\
			ret = snprintf(bp, size_bp, "%s|", (desc));	\
			if (ret < 0 || ret >= size_bp)			\
				goto out_overflow;			\
			size_bp -= ret;					\
			bp += ret;					\
			flags &= ~(flag);				\
		}							\
	} while (0)

	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
			      btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

	if (flags) {
		ret = snprintf(bp, size_bp, "0x%llx|", flags);
		size_bp -= ret;
	}

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

	/*
	 * The text is trimmed, it's up to the caller to provide sufficiently
	 * large buffer
	 */
out_overflow:;
}

static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list in general
 * but in mount context it could be used to exclude list modifications by eg.
 * scan ioctl
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * Is not required at mount and close times, because our device list is
 * protected by the uuid_mutex at that point.
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   device_list_mutex
 *     chunk_mutex
 *   balance_mutex
 *
 *
 * Exclusive operations
 * ====================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The status of exclusive operation is set and cleared atomically.
 * During the course of Paused state, fs_info::exclusive_operation remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * The exclusive status is cleared when the device operation is canceled or
 * completed.
 */

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
						 const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	INIT_LIST_HEAD(&fs_devs->seed_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	if (metadata_fsid)
		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
	else if (fsid)
		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

void btrfs_free_device(struct btrfs_device *device)
{
	WARN_ON(!list_empty(&device->post_commit_list));
	rcu_string_free(device->name);
	extent_io_tree_release(&device->alloc_state);
	bio_put(device->flush_bio);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
{
	struct btrfs_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate a bio that's always going to be used for flushing device
	 * barriers and matches the device lifespan
	 */
	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
	if (!dev->flush_bio) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->post_commit_list);

	atomic_set(&dev->reada_in_flight, 0);
	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev, fs_info);
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	extent_io_tree_init(fs_info, &dev->alloc_state,
			    IO_TREE_DEVICE_ALLOC_STATE, NULL);

	return dev;
}

static noinline struct btrfs_fs_devices *find_fsid(
		const u8 *fsid, const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devices;

	ASSERT(fsid);

	/* Handle non-split brain cases */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (metadata_fsid) {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
				      BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		} else {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		}
	}
	return NULL;
}

static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
				struct btrfs_super_block *disk_super)
{

	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by first scanning
	 * a device which didn't have its fsid/metadata_uuid changed
	 * at all and the CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}
	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by a device that
	 * has an outdated pair of fsid/metadata_uuid and
	 * CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(fs_devices->metadata_uuid,
			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}


static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct btrfs_super_block **disk_super)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*disk_super = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*disk_super)) {
		ret = PTR_ERR(*disk_super);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	return ret;
}

static bool device_path_matched(const char *path, struct btrfs_device *device)
{
	int found;

	rcu_read_lock();
	found = strcmp(rcu_str_deref(device->name), path);
	rcu_read_unlock();

	return found == 0;
}

/*
 * Search and remove all stale (devices which are not mounted) devices.
 * When both inputs are NULL, it will search and release all stale devices.
 * path:	Optional. When provided, it will release all unmounted devices
 *		matching this path only.
 * skip_dev:	Optional. Will skip this device when searching for the stale
 *		devices.
 * Return:	0 for success or if @path is NULL.
 *		-EBUSY if @path is a mounted device.
 *		-ENOENT if @path does not match any device in the list.
 */
static int btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
	int ret = 0;

	if (path)
		ret = -ENOENT;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {

		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
				continue;
			if (path && !device->name)
				continue;
			if (path && !device_path_matched(path, device))
				continue;
			if (fs_devices->opened) {
				/* for an already deleted device return 0 */
				if (path && ret != 0)
					ret = -EBUSY;
				break;
			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			ret = 0;
		}
		mutex_unlock(&fs_devices->device_list_mutex);

		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}

	return ret;
}

/*
 * This is only used on mount, and we are protected from competing things
 * messing with our fs_devices by the uuid_mutex, thus we do not need the
 * fs_devices->device_list_mutex here.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &disk_super);
	if (ret)
		return ret;

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_free_page;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_free_page;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		if (btrfs_super_incompat_flags(disk_super) &
		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
			pr_err(
		"BTRFS: Invalid seeding and uuid-changed device detected\n");
			goto error_free_page;
		}

		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = true;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = true;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	btrfs_release_disk_super(disk_super);

	return 0;

error_free_page:
	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, flags);

	return -EINVAL;
}

/*
 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
 * being created with a disk that has already completed its fsid change. Such
 * disk can belong to an fs which has its FSID changed or to one which doesn't.
 * Handle both cases here.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, NULL);
}


static struct btrfs_fs_devices *find_fsid_changed(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handles the case where the scanned device is part of an fs that had
	 * multiple successful changes of FSID but the current device didn't
	 * observe it. Meaning our fsid will be different than theirs. We need
	 * to handle two subcases:
	 *  1 - The fs still continues to have different METADATA/FSID uuids.
	 *  2 - The fs is switched back to its original FSID (METADATA/FSID
	 *  are equal).
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		/* Changed UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->fsid,
			   BTRFS_FSID_SIZE) != 0)
			return fs_devices;

		/* Unchanged UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}

	return NULL;
}

static struct btrfs_fs_devices *find_fsid_reverted_metadata(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle the case where the scanned device is part of an fs whose last
	 * metadata UUID change reverted it to the original FSID. At the same
	 * time fs_devices was first created by another constituent device
	 * which didn't fully observe the operation. This results in a
	 * btrfs_fs_devices created with metadata/fsid different AND
	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
	 * fs_devices equal to the FSID of the disk.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    fs_devices->fsid_change)
			return fs_devices;
	}

	return NULL;
}
/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

	if (fsid_change_in_progress) {
		if (!has_metadata_uuid)
			fs_devices = find_fsid_inprogress(disk_super);
		else
			fs_devices = find_fsid_changed(disk_super);
	} else if (has_metadata_uuid) {
		fs_devices = find_fsid_with_metadata_uuid(disk_super);
	} else {
		fs_devices = find_fsid_reverted_metadata(disk_super);
		if (!fs_devices)
			fs_devices = find_fsid(disk_super->fsid, NULL);
	}


	if (!fs_devices) {
		if (has_metadata_uuid)
			fs_devices = alloc_fs_devices(disk_super->fsid,
						      disk_super->metadata_uuid);
		else
			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		mutex_lock(&fs_devices->device_list_mutex);
		device = btrfs_find_device(fs_devices, devid,
				disk_super->dev_item.uuid, NULL, false);

		/*
		 * If this disk has been pulled into an fs devices created by
		 * a device which had the CHANGING_FSID_V2 flag then replace the
		 * metadata_uuid/fsid values of the fs_devices.
		 */
		if (fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
					BTRFS_FSID_SIZE);

			if (has_metadata_uuid)
				memcpy(fs_devices->metadata_uuid,
				       disk_super->metadata_uuid,
				       BTRFS_FSID_SIZE);
			else
				memcpy(fs_devices->metadata_uuid,
				       disk_super->fsid, BTRFS_FSID_SIZE);

			fs_devices->fsid_change = false;
		}
	}

	if (!device) {
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info(
	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
		else
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         different name. or
		 *      b. The missing-disk-which-was-replaced, has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at all time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted.  We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid. We keep the one
			 * with larger generation number or the last-in if
			 * generation are equal.
			 */
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EEXIST);
		}

		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted
		 */
		if (device->bdev) {
			struct block_device *path_bdev;

			path_bdev = lookup_bdev(path);
			if (IS_ERR(path_bdev)) {
				mutex_unlock(&fs_devices->device_list_mutex);
				return ERR_CAST(path_bdev);
			}

			if (device->bdev != path_bdev) {
				bdput(path_bdev);
				mutex_unlock(&fs_devices->device_list_mutex);
				/*
				 * device->fs_info may not be reliable here, so
				 * pass in a NULL instead. This avoids a
				 * possible use-after-free when the fs_info and
				 * fs_info->sb are already torn down.
				 */
				btrfs_warn_in_rcu(NULL,
	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
					path, devid, found_transid,
					current->comm,
					task_pid_nr(current));
				return ERR_PTR(-EEXIST);
			}
			bdput(path_bdev);
			btrfs_info_in_rcu(device->fs_info,
	"devid %llu device path %s changed to %s scanned by %s (%d)",
					  devid, rcu_str_deref(device->name),
					  path, current->comm,
					  task_pid_nr(current));
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened) {
		device->generation = found_transid;
		fs_devices->latest_generation = max_t(u64, found_transid,
						fs_devices->latest_generation);
	}

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	mutex_unlock(&fs_devices->device_list_mutex);
	return device;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
	int ret = 0;

	fs_devices = alloc_fs_devices(orig->fsid, NULL);
	if (IS_ERR(fs_devices))
		return fs_devices;

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device)) {
			ret = PTR_ERR(device);
			goto error;
		}

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
			if (!name) {
				btrfs_free_device(device);
				ret = -ENOMEM;
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(ret);
}

static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
				      struct btrfs_device **latest_dev)
{
	struct btrfs_device *device, *next;

	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    !test_bit(BTRFS_DEV_STATE_MISSING,
				      &device->dev_state) &&
			    (!*latest_dev ||
			     device->generation > (*latest_dev)->generation)) {
				*latest_dev = device;
			}
			continue;
		}

		/*
		 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID
		 * in btrfs_init_dev_replace(), so just continue.
		 */
		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
			continue;

		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}

}

/*
 * After we have read the system tree and know devids belonging to this
 * filesystem, remove the device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_fs_devices *seed_dev;

	mutex_lock(&uuid_mutex);
	__btrfs_free_extra_devids(fs_devices, &latest_dev);

	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
		__btrfs_free_extra_devids(seed_dev, &latest_dev);

	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

static void btrfs_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		fs_devices->missing_devices--;

	btrfs_close_bdev(device);
	if (device->bdev) {
		fs_devices->open_devices--;
		device->bdev = NULL;
	}
	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);

	device->fs_info = NULL;
	atomic_set(&device->dev_stats_ccnt, 0);
	extent_io_tree_release(&device->alloc_state);

	/* Verify the device is back in a pristine state */
	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	ASSERT(list_empty(&device->dev_alloc_list));
	ASSERT(list_empty(&device->post_commit_list));
	ASSERT(atomic_read(&device->reada_in_flight) == 0);
}

static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;

	lockdep_assert_held(&uuid_mutex);

	if (--fs_devices->opened > 0)
		return;

	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
		btrfs_close_one_device(device);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = false;
	fs_devices->fs_info = NULL;
}

void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	LIST_HEAD(list);
	struct btrfs_fs_devices *tmp;

	mutex_lock(&uuid_mutex);
	close_fs_devices(fs_devices);
	if (!fs_devices->opened)
		list_splice_init(&fs_devices->seed_list, &list);

	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
		close_fs_devices(fs_devices);
		list_del(&fs_devices->seed_list);
		free_fs_devices(fs_devices);
	}
	mutex_unlock(&uuid_mutex);
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_device *tmp_device;

	flags |= FMODE_EXCL;

	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
				 dev_list) {
		int ret;

		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
		if (ret == 0 &&
		    (!latest_dev || device->generation > latest_dev->generation)) {
			latest_dev = device;
		} else if (ret == -ENODATA) {
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);
		}
	}
	if (fs_devices->open_devices == 0)
		return -EINVAL;

	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
	fs_devices->read_policy = BTRFS_READ_POLICY_PID;

	return 0;
}

static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	lockdep_assert_held(&uuid_mutex);
	/*
	 * The device_list_mutex cannot be taken here in case opening the
	 * underlying device takes further locks like bd_mutex.
124818c850fdSJosef Bacik * 124918c850fdSJosef Bacik * We also don't need the lock here as this is called during mount and 125018c850fdSJosef Bacik * exclusion is provided by uuid_mutex 125118c850fdSJosef Bacik */ 1252f5194e34SDavid Sterba 12532b82032cSYan Zheng if (fs_devices->opened) { 12542b82032cSYan Zheng fs_devices->opened++; 12552b82032cSYan Zheng ret = 0; 12562b82032cSYan Zheng } else { 1257f8e10cd3SAnand Jain list_sort(NULL, &fs_devices->devices, devid_cmp); 1258897fb573SAnand Jain ret = open_fs_devices(fs_devices, flags, holder); 12592b82032cSYan Zheng } 1260542c5908SAnand Jain 12618a4b83ccSChris Mason return ret; 12628a4b83ccSChris Mason } 12638a4b83ccSChris Mason 12648f32380dSJohannes Thumshirn void btrfs_release_disk_super(struct btrfs_super_block *super) 12656cf86a00SAnand Jain { 12668f32380dSJohannes Thumshirn struct page *page = virt_to_page(super); 12678f32380dSJohannes Thumshirn 12686cf86a00SAnand Jain put_page(page); 12696cf86a00SAnand Jain } 12706cf86a00SAnand Jain 1271b335eab8SNikolay Borisov static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, 1272b335eab8SNikolay Borisov u64 bytenr) 12736cf86a00SAnand Jain { 1274b335eab8SNikolay Borisov struct btrfs_super_block *disk_super; 1275b335eab8SNikolay Borisov struct page *page; 12766cf86a00SAnand Jain void *p; 12776cf86a00SAnand Jain pgoff_t index; 12786cf86a00SAnand Jain 12796cf86a00SAnand Jain /* make sure our super fits in the device */ 12806cf86a00SAnand Jain if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) 1281b335eab8SNikolay Borisov return ERR_PTR(-EINVAL); 12826cf86a00SAnand Jain 12836cf86a00SAnand Jain /* make sure our super fits in the page */ 1284b335eab8SNikolay Borisov if (sizeof(*disk_super) > PAGE_SIZE) 1285b335eab8SNikolay Borisov return ERR_PTR(-EINVAL); 12866cf86a00SAnand Jain 12876cf86a00SAnand Jain /* make sure our super doesn't straddle pages on disk */ 12886cf86a00SAnand Jain index = bytenr >> PAGE_SHIFT; 1289b335eab8SNikolay Borisov if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) 1290b335eab8SNikolay Borisov return ERR_PTR(-EINVAL); 12916cf86a00SAnand Jain 12926cf86a00SAnand Jain /* pull in the page with our super */ 1293b335eab8SNikolay Borisov page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL); 12946cf86a00SAnand Jain 1295b335eab8SNikolay Borisov if (IS_ERR(page)) 1296b335eab8SNikolay Borisov return ERR_CAST(page); 12976cf86a00SAnand Jain 1298b335eab8SNikolay Borisov p = page_address(page); 12996cf86a00SAnand Jain 13006cf86a00SAnand Jain /* align our pointer to the offset of the super block */ 1301b335eab8SNikolay Borisov disk_super = p + offset_in_page(bytenr); 13026cf86a00SAnand Jain 1303b335eab8SNikolay Borisov if (btrfs_super_bytenr(disk_super) != bytenr || 1304b335eab8SNikolay Borisov btrfs_super_magic(disk_super) != BTRFS_MAGIC) { 13058f32380dSJohannes Thumshirn btrfs_release_disk_super(p); 1306b335eab8SNikolay Borisov return ERR_PTR(-EINVAL); 13076cf86a00SAnand Jain } 13086cf86a00SAnand Jain 1309b335eab8SNikolay Borisov if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) 1310b335eab8SNikolay Borisov disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; 13116cf86a00SAnand Jain 1312b335eab8SNikolay Borisov return disk_super; 13136cf86a00SAnand Jain } 13146cf86a00SAnand Jain 1315228a73abSAnand Jain int btrfs_forget_devices(const char *path) 1316228a73abSAnand Jain { 1317228a73abSAnand Jain int ret; 1318228a73abSAnand Jain 1319228a73abSAnand Jain mutex_lock(&uuid_mutex); 1320228a73abSAnand Jain ret = 
btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL); 1321228a73abSAnand Jain mutex_unlock(&uuid_mutex); 1322228a73abSAnand Jain 1323228a73abSAnand Jain return ret; 1324228a73abSAnand Jain } 1325228a73abSAnand Jain 13266f60cbd3SDavid Sterba /* 13276f60cbd3SDavid Sterba * Look for a btrfs signature on a device. This may be called out of the mount path 13286f60cbd3SDavid Sterba * and we are not allowed to call set_blocksize during the scan. The superblock 13296f60cbd3SDavid Sterba * is read via pagecache 13306f60cbd3SDavid Sterba */ 133136350e95SGu Jinxiang struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, 133236350e95SGu Jinxiang void *holder) 13338a4b83ccSChris Mason { 13348a4b83ccSChris Mason struct btrfs_super_block *disk_super; 13354306a974SAnand Jain bool new_device_added = false; 133636350e95SGu Jinxiang struct btrfs_device *device = NULL; 13378a4b83ccSChris Mason struct block_device *bdev; 13386f60cbd3SDavid Sterba u64 bytenr; 13398a4b83ccSChris Mason 1340899f9307SDavid Sterba lockdep_assert_held(&uuid_mutex); 1341899f9307SDavid Sterba 13426f60cbd3SDavid Sterba /* 13436f60cbd3SDavid Sterba * we would like to check all the supers, but that would make 13446f60cbd3SDavid Sterba * a btrfs mount succeed after a mkfs from a different FS. 13456f60cbd3SDavid Sterba * So, we need to add a special mount option to scan for 13466f60cbd3SDavid Sterba * later supers, using BTRFS_SUPER_MIRROR_MAX instead 13476f60cbd3SDavid Sterba */ 13486f60cbd3SDavid Sterba bytenr = btrfs_sb_offset(0); 1349d4d77629STejun Heo flags |= FMODE_EXCL; 13506f60cbd3SDavid Sterba 13516f60cbd3SDavid Sterba bdev = blkdev_get_by_path(path, flags, holder); 1352b6ed73bcSAnand Jain if (IS_ERR(bdev)) 135336350e95SGu Jinxiang return ERR_CAST(bdev); 13546f60cbd3SDavid Sterba 1355b335eab8SNikolay Borisov disk_super = btrfs_read_disk_super(bdev, bytenr); 1356b335eab8SNikolay Borisov if (IS_ERR(disk_super)) { 1357b335eab8SNikolay Borisov device = ERR_CAST(disk_super); 13586f60cbd3SDavid Sterba goto error_bdev_put; 135905a5c55dSAnand Jain } 13606f60cbd3SDavid Sterba 13614306a974SAnand Jain device = device_list_add(path, disk_super, &new_device_added); 136236350e95SGu Jinxiang if (!IS_ERR(device)) { 13634306a974SAnand Jain if (new_device_added) 13644306a974SAnand Jain btrfs_free_stale_devices(path, device); 13654306a974SAnand Jain } 13666f60cbd3SDavid Sterba 13678f32380dSJohannes Thumshirn btrfs_release_disk_super(disk_super); 13686f60cbd3SDavid Sterba 13696f60cbd3SDavid Sterba error_bdev_put: 1370d4d77629STejun Heo blkdev_put(bdev, flags); 1371b6ed73bcSAnand Jain 137236350e95SGu Jinxiang return device; 13738a4b83ccSChris Mason } 13740b86a832SChris Mason 1375c152b63eSFilipe Manana /* 13761c11b63eSJeff Mahoney * Try to find a chunk that intersects [start, start + len] range and when one 13771c11b63eSJeff Mahoney * such is found, record the end of it in *start 1378c152b63eSFilipe Manana */ 13791c11b63eSJeff Mahoney static bool contains_pending_extent(struct btrfs_device *device, u64 *start, 13801c11b63eSJeff Mahoney u64 len) 13811c11b63eSJeff Mahoney { 13821c11b63eSJeff Mahoney u64 physical_start, physical_end; 13836df9a95eSJosef Bacik 13841c11b63eSJeff Mahoney lockdep_assert_held(&device->fs_info->chunk_mutex); 13851c11b63eSJeff Mahoney 13861c11b63eSJeff Mahoney if (!find_first_extent_bit(&device->alloc_state, *start, 13871c11b63eSJeff Mahoney &physical_start, &physical_end, 13881c11b63eSJeff Mahoney CHUNK_ALLOCATED, NULL)) { 13891c11b63eSJeff Mahoney 13901c11b63eSJeff Mahoney if (in_range(physical_start, 
*start, len) || 13911c11b63eSJeff Mahoney in_range(*start, physical_start, 13921c11b63eSJeff Mahoney physical_end - physical_start)) { 13931c11b63eSJeff Mahoney *start = physical_end + 1; 13941c11b63eSJeff Mahoney return true; 13951c11b63eSJeff Mahoney } 13961c11b63eSJeff Mahoney } 13971c11b63eSJeff Mahoney return false; 13986df9a95eSJosef Bacik } 13996df9a95eSJosef Bacik 14003b4ffa40SNaohiro Aota static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) 14013b4ffa40SNaohiro Aota { 14023b4ffa40SNaohiro Aota switch (device->fs_devices->chunk_alloc_policy) { 14033b4ffa40SNaohiro Aota case BTRFS_CHUNK_ALLOC_REGULAR: 14043b4ffa40SNaohiro Aota /* 14053b4ffa40SNaohiro Aota * We don't want to overwrite the superblock on the drive nor 14063b4ffa40SNaohiro Aota * any area used by the boot loader (grub for example), so we 14073b4ffa40SNaohiro Aota * make sure to start at an offset of at least 1MB. 14083b4ffa40SNaohiro Aota */ 14093b4ffa40SNaohiro Aota return max_t(u64, start, SZ_1M); 14103b4ffa40SNaohiro Aota default: 14113b4ffa40SNaohiro Aota BUG(); 14123b4ffa40SNaohiro Aota } 14133b4ffa40SNaohiro Aota } 14143b4ffa40SNaohiro Aota 14153b4ffa40SNaohiro Aota /** 14163b4ffa40SNaohiro Aota * dev_extent_hole_check - check if specified hole is suitable for allocation 14173b4ffa40SNaohiro Aota * @device: the device which we have the hole 14183b4ffa40SNaohiro Aota * @hole_start: starting position of the hole 14193b4ffa40SNaohiro Aota * @hole_size: the size of the hole 14203b4ffa40SNaohiro Aota * @num_bytes: the size of the free space that we need 14213b4ffa40SNaohiro Aota * 14223b4ffa40SNaohiro Aota * This function may modify @hole_start and @hole_end to reflect the suitable 14233b4ffa40SNaohiro Aota * position for allocation. Returns 1 if hole position is updated, 0 otherwise. 14243b4ffa40SNaohiro Aota */ 14253b4ffa40SNaohiro Aota static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, 14263b4ffa40SNaohiro Aota u64 *hole_size, u64 num_bytes) 14273b4ffa40SNaohiro Aota { 14283b4ffa40SNaohiro Aota bool changed = false; 14293b4ffa40SNaohiro Aota u64 hole_end = *hole_start + *hole_size; 14303b4ffa40SNaohiro Aota 14313b4ffa40SNaohiro Aota /* 14323b4ffa40SNaohiro Aota * Check before we set max_hole_start, otherwise we could end up 14333b4ffa40SNaohiro Aota * sending back this offset anyway. 
14343b4ffa40SNaohiro Aota */ 14353b4ffa40SNaohiro Aota if (contains_pending_extent(device, hole_start, *hole_size)) { 14363b4ffa40SNaohiro Aota if (hole_end >= *hole_start) 14373b4ffa40SNaohiro Aota *hole_size = hole_end - *hole_start; 14383b4ffa40SNaohiro Aota else 14393b4ffa40SNaohiro Aota *hole_size = 0; 14403b4ffa40SNaohiro Aota changed = true; 14413b4ffa40SNaohiro Aota } 14423b4ffa40SNaohiro Aota 14433b4ffa40SNaohiro Aota switch (device->fs_devices->chunk_alloc_policy) { 14443b4ffa40SNaohiro Aota case BTRFS_CHUNK_ALLOC_REGULAR: 14453b4ffa40SNaohiro Aota /* No extra check */ 14463b4ffa40SNaohiro Aota break; 14473b4ffa40SNaohiro Aota default: 14483b4ffa40SNaohiro Aota BUG(); 14493b4ffa40SNaohiro Aota } 14503b4ffa40SNaohiro Aota 14513b4ffa40SNaohiro Aota return changed; 14523b4ffa40SNaohiro Aota } 14536df9a95eSJosef Bacik 14540b86a832SChris Mason /* 1455499f377fSJeff Mahoney * find_free_dev_extent_start - find free space in the specified device 14567bfc837dSMiao Xie * @device: the device which we search the free space in 14577bfc837dSMiao Xie * @num_bytes: the size of the free space that we need 1458499f377fSJeff Mahoney * @search_start: the position from which to begin the search 14597bfc837dSMiao Xie * @start: store the start of the free space. 1460499f377fSJeff Mahoney * @len: the size of the free space that we find, or the size 1461499f377fSJeff Mahoney * of the max free space if we don't find suitable free space 14627bfc837dSMiao Xie * 14630b86a832SChris Mason * this uses a pretty simple search, the expectation is that it is 14640b86a832SChris Mason * called very infrequently and that a given device has a small number 14650b86a832SChris Mason * of extents 14667bfc837dSMiao Xie * 14677bfc837dSMiao Xie * @start is used to store the start of the free space if we find. But if we 14687bfc837dSMiao Xie * don't find suitable free space, it will be used to store the start position 14697bfc837dSMiao Xie * of the max free space. 14707bfc837dSMiao Xie * 14717bfc837dSMiao Xie * @len is used to store the size of the free space that we find. 14727bfc837dSMiao Xie * But if we don't find suitable free space, it is used to store the size of 14737bfc837dSMiao Xie * the max free space. 1474135da976SQu Wenruo * 1475135da976SQu Wenruo * NOTE: This function will search *commit* root of device tree, and does extra 1476135da976SQu Wenruo * check to ensure dev extents are not double allocated. 1477135da976SQu Wenruo * This makes the function safe to allocate dev extents but may not report 1478135da976SQu Wenruo * correct usable device space, as a device extent freed in the current transaction 1479135da976SQu Wenruo * is not reported as available.
14800b86a832SChris Mason */ 14819e3246a5SQu Wenruo static int find_free_dev_extent_start(struct btrfs_device *device, 14829e3246a5SQu Wenruo u64 num_bytes, u64 search_start, u64 *start, 14839e3246a5SQu Wenruo u64 *len) 14840b86a832SChris Mason { 14850b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = device->fs_info; 14860b246afaSJeff Mahoney struct btrfs_root *root = fs_info->dev_root; 14870b86a832SChris Mason struct btrfs_key key; 14887bfc837dSMiao Xie struct btrfs_dev_extent *dev_extent; 14892b82032cSYan Zheng struct btrfs_path *path; 14907bfc837dSMiao Xie u64 hole_size; 14917bfc837dSMiao Xie u64 max_hole_start; 14927bfc837dSMiao Xie u64 max_hole_size; 14937bfc837dSMiao Xie u64 extent_end; 14940b86a832SChris Mason u64 search_end = device->total_bytes; 14950b86a832SChris Mason int ret; 14967bfc837dSMiao Xie int slot; 14970b86a832SChris Mason struct extent_buffer *l; 14988cdc7c5bSFilipe Manana 14993b4ffa40SNaohiro Aota search_start = dev_extent_search_start(device, search_start); 15000b86a832SChris Mason 15016df9a95eSJosef Bacik path = btrfs_alloc_path(); 15026df9a95eSJosef Bacik if (!path) 15036df9a95eSJosef Bacik return -ENOMEM; 1504f2ab7618SZhao Lei 15057bfc837dSMiao Xie max_hole_start = search_start; 15067bfc837dSMiao Xie max_hole_size = 0; 15077bfc837dSMiao Xie 1508f2ab7618SZhao Lei again: 1509401e29c1SAnand Jain if (search_start >= search_end || 1510401e29c1SAnand Jain test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 15117bfc837dSMiao Xie ret = -ENOSPC; 15126df9a95eSJosef Bacik goto out; 15137bfc837dSMiao Xie } 15147bfc837dSMiao Xie 1515e4058b54SDavid Sterba path->reada = READA_FORWARD; 15166df9a95eSJosef Bacik path->search_commit_root = 1; 15176df9a95eSJosef Bacik path->skip_locking = 1; 15187bfc837dSMiao Xie 15190b86a832SChris Mason key.objectid = device->devid; 15200b86a832SChris Mason key.offset = search_start; 15210b86a832SChris Mason key.type = BTRFS_DEV_EXTENT_KEY; 15227bfc837dSMiao Xie 1523125ccb0aSLi Zefan ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 15240b86a832SChris Mason if (ret < 0) 15257bfc837dSMiao Xie goto out; 15261fcbac58SYan Zheng if (ret > 0) { 15271fcbac58SYan Zheng ret = btrfs_previous_item(root, path, key.objectid, key.type); 15280b86a832SChris Mason if (ret < 0) 15297bfc837dSMiao Xie goto out; 15301fcbac58SYan Zheng } 15317bfc837dSMiao Xie 15320b86a832SChris Mason while (1) { 15330b86a832SChris Mason l = path->nodes[0]; 15340b86a832SChris Mason slot = path->slots[0]; 15350b86a832SChris Mason if (slot >= btrfs_header_nritems(l)) { 15360b86a832SChris Mason ret = btrfs_next_leaf(root, path); 15370b86a832SChris Mason if (ret == 0) 15380b86a832SChris Mason continue; 15390b86a832SChris Mason if (ret < 0) 15407bfc837dSMiao Xie goto out; 15417bfc837dSMiao Xie 15427bfc837dSMiao Xie break; 15430b86a832SChris Mason } 15440b86a832SChris Mason btrfs_item_key_to_cpu(l, &key, slot); 15450b86a832SChris Mason 15460b86a832SChris Mason if (key.objectid < device->devid) 15470b86a832SChris Mason goto next; 15480b86a832SChris Mason 15490b86a832SChris Mason if (key.objectid > device->devid) 15507bfc837dSMiao Xie break; 15510b86a832SChris Mason 1552962a298fSDavid Sterba if (key.type != BTRFS_DEV_EXTENT_KEY) 15530b86a832SChris Mason goto next; 15540b86a832SChris Mason 15557bfc837dSMiao Xie if (key.offset > search_start) { 15567bfc837dSMiao Xie hole_size = key.offset - search_start; 15573b4ffa40SNaohiro Aota dev_extent_hole_check(device, &search_start, &hole_size, 15583b4ffa40SNaohiro Aota num_bytes); 15596df9a95eSJosef Bacik 15607bfc837dSMiao Xie if 
(hole_size > max_hole_size) { 15617bfc837dSMiao Xie max_hole_start = search_start; 15627bfc837dSMiao Xie max_hole_size = hole_size; 15637bfc837dSMiao Xie } 15647bfc837dSMiao Xie 15657bfc837dSMiao Xie /* 15667bfc837dSMiao Xie * If this free space is greater than which we need, 15677bfc837dSMiao Xie * it must be the max free space that we have found 15687bfc837dSMiao Xie * until now, so max_hole_start must point to the start 15697bfc837dSMiao Xie * of this free space and the length of this free space 15707bfc837dSMiao Xie * is stored in max_hole_size. Thus, we return 15717bfc837dSMiao Xie * max_hole_start and max_hole_size and go back to the 15727bfc837dSMiao Xie * caller. 15737bfc837dSMiao Xie */ 15747bfc837dSMiao Xie if (hole_size >= num_bytes) { 15757bfc837dSMiao Xie ret = 0; 15767bfc837dSMiao Xie goto out; 15777bfc837dSMiao Xie } 15787bfc837dSMiao Xie } 15797bfc837dSMiao Xie 15800b86a832SChris Mason dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 15817bfc837dSMiao Xie extent_end = key.offset + btrfs_dev_extent_length(l, 15827bfc837dSMiao Xie dev_extent); 15837bfc837dSMiao Xie if (extent_end > search_start) 15847bfc837dSMiao Xie search_start = extent_end; 15850b86a832SChris Mason next: 15860b86a832SChris Mason path->slots[0]++; 15870b86a832SChris Mason cond_resched(); 15880b86a832SChris Mason } 15890b86a832SChris Mason 159038c01b96Sliubo /* 159138c01b96Sliubo * At this point, search_start should be the end of 159238c01b96Sliubo * allocated dev extents, and when shrinking the device, 159338c01b96Sliubo * search_end may be smaller than search_start. 159438c01b96Sliubo */ 1595f2ab7618SZhao Lei if (search_end > search_start) { 15967bfc837dSMiao Xie hole_size = search_end - search_start; 15973b4ffa40SNaohiro Aota if (dev_extent_hole_check(device, &search_start, &hole_size, 15983b4ffa40SNaohiro Aota num_bytes)) { 1599f2ab7618SZhao Lei btrfs_release_path(path); 1600f2ab7618SZhao Lei goto again; 1601f2ab7618SZhao Lei } 1602f2ab7618SZhao Lei 16037bfc837dSMiao Xie if (hole_size > max_hole_size) { 16047bfc837dSMiao Xie max_hole_start = search_start; 16057bfc837dSMiao Xie max_hole_size = hole_size; 16060b86a832SChris Mason } 16076df9a95eSJosef Bacik } 16086df9a95eSJosef Bacik 16097bfc837dSMiao Xie /* See above. 
*/ 1610f2ab7618SZhao Lei if (max_hole_size < num_bytes) 16117bfc837dSMiao Xie ret = -ENOSPC; 16127bfc837dSMiao Xie else 16132b82032cSYan Zheng ret = 0; 16140b86a832SChris Mason 16157bfc837dSMiao Xie out: 16162b82032cSYan Zheng btrfs_free_path(path); 16177bfc837dSMiao Xie *start = max_hole_start; 1618b2117a39SMiao Xie if (len) 16197bfc837dSMiao Xie *len = max_hole_size; 16200b86a832SChris Mason return ret; 16210b86a832SChris Mason } 16220b86a832SChris Mason 162360dfdf25SNikolay Borisov int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 1624499f377fSJeff Mahoney u64 *start, u64 *len) 1625499f377fSJeff Mahoney { 1626499f377fSJeff Mahoney /* FIXME use last free of some kind */ 162760dfdf25SNikolay Borisov return find_free_dev_extent_start(device, num_bytes, 0, start, len); 1628499f377fSJeff Mahoney } 1629499f377fSJeff Mahoney 1630b2950863SChristoph Hellwig static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 16318f18cf13SChris Mason struct btrfs_device *device, 16322196d6e8SMiao Xie u64 start, u64 *dev_extent_len) 16338f18cf13SChris Mason { 16340b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = device->fs_info; 16350b246afaSJeff Mahoney struct btrfs_root *root = fs_info->dev_root; 16368f18cf13SChris Mason int ret; 16378f18cf13SChris Mason struct btrfs_path *path; 16388f18cf13SChris Mason struct btrfs_key key; 1639a061fc8dSChris Mason struct btrfs_key found_key; 1640a061fc8dSChris Mason struct extent_buffer *leaf = NULL; 1641a061fc8dSChris Mason struct btrfs_dev_extent *extent = NULL; 16428f18cf13SChris Mason 16438f18cf13SChris Mason path = btrfs_alloc_path(); 16448f18cf13SChris Mason if (!path) 16458f18cf13SChris Mason return -ENOMEM; 16468f18cf13SChris Mason 16478f18cf13SChris Mason key.objectid = device->devid; 16488f18cf13SChris Mason key.offset = start; 16498f18cf13SChris Mason key.type = BTRFS_DEV_EXTENT_KEY; 1650924cd8fbSMiao Xie again: 16518f18cf13SChris Mason ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1652a061fc8dSChris Mason if (ret > 0) { 1653a061fc8dSChris Mason ret = btrfs_previous_item(root, path, key.objectid, 1654a061fc8dSChris Mason BTRFS_DEV_EXTENT_KEY); 1655b0b802d7STsutomu Itoh if (ret) 1656b0b802d7STsutomu Itoh goto out; 1657a061fc8dSChris Mason leaf = path->nodes[0]; 1658a061fc8dSChris Mason btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1659a061fc8dSChris Mason extent = btrfs_item_ptr(leaf, path->slots[0], 1660a061fc8dSChris Mason struct btrfs_dev_extent); 1661a061fc8dSChris Mason BUG_ON(found_key.offset > start || found_key.offset + 1662a061fc8dSChris Mason btrfs_dev_extent_length(leaf, extent) < start); 1663924cd8fbSMiao Xie key = found_key; 1664924cd8fbSMiao Xie btrfs_release_path(path); 1665924cd8fbSMiao Xie goto again; 1666a061fc8dSChris Mason } else if (ret == 0) { 1667a061fc8dSChris Mason leaf = path->nodes[0]; 1668a061fc8dSChris Mason extent = btrfs_item_ptr(leaf, path->slots[0], 1669a061fc8dSChris Mason struct btrfs_dev_extent); 167079787eaaSJeff Mahoney } else { 16710b246afaSJeff Mahoney btrfs_handle_fs_error(fs_info, ret, "Slot search failed"); 167279787eaaSJeff Mahoney goto out; 1673a061fc8dSChris Mason } 16748f18cf13SChris Mason 16752196d6e8SMiao Xie *dev_extent_len = btrfs_dev_extent_length(leaf, extent); 16762196d6e8SMiao Xie 16778f18cf13SChris Mason ret = btrfs_del_item(trans, root, path); 167879787eaaSJeff Mahoney if (ret) { 16790b246afaSJeff Mahoney btrfs_handle_fs_error(fs_info, ret, 168079787eaaSJeff Mahoney "Failed to remove dev extent item"); 168113212b54SZhao Lei } else { 16823204d33cSJosef 
Bacik set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); 168379787eaaSJeff Mahoney } 1684b0b802d7STsutomu Itoh out: 16858f18cf13SChris Mason btrfs_free_path(path); 16868f18cf13SChris Mason return ret; 16878f18cf13SChris Mason } 16888f18cf13SChris Mason 168948a3b636SEric Sandeen static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 16900b86a832SChris Mason struct btrfs_device *device, 16912b82032cSYan Zheng u64 chunk_offset, u64 start, u64 num_bytes) 16920b86a832SChris Mason { 16930b86a832SChris Mason int ret; 16940b86a832SChris Mason struct btrfs_path *path; 16950b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = device->fs_info; 16960b246afaSJeff Mahoney struct btrfs_root *root = fs_info->dev_root; 16970b86a832SChris Mason struct btrfs_dev_extent *extent; 16980b86a832SChris Mason struct extent_buffer *leaf; 16990b86a832SChris Mason struct btrfs_key key; 17000b86a832SChris Mason 1701e12c9621SAnand Jain WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); 1702401e29c1SAnand Jain WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); 17030b86a832SChris Mason path = btrfs_alloc_path(); 17040b86a832SChris Mason if (!path) 17050b86a832SChris Mason return -ENOMEM; 17060b86a832SChris Mason 17070b86a832SChris Mason key.objectid = device->devid; 17082b82032cSYan Zheng key.offset = start; 17090b86a832SChris Mason key.type = BTRFS_DEV_EXTENT_KEY; 17100b86a832SChris Mason ret = btrfs_insert_empty_item(trans, root, path, &key, 17110b86a832SChris Mason sizeof(*extent)); 17122cdcecbcSMark Fasheh if (ret) 17132cdcecbcSMark Fasheh goto out; 17140b86a832SChris Mason 17150b86a832SChris Mason leaf = path->nodes[0]; 17160b86a832SChris Mason extent = btrfs_item_ptr(leaf, path->slots[0], 17170b86a832SChris Mason struct btrfs_dev_extent); 1718b5d9071cSNikolay Borisov btrfs_set_dev_extent_chunk_tree(leaf, extent, 1719b5d9071cSNikolay Borisov BTRFS_CHUNK_TREE_OBJECTID); 17200ca00afbSNikolay Borisov btrfs_set_dev_extent_chunk_objectid(leaf, extent, 17210ca00afbSNikolay Borisov BTRFS_FIRST_CHUNK_TREE_OBJECTID); 1722e17cade2SChris Mason btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 1723e17cade2SChris Mason 17240b86a832SChris Mason btrfs_set_dev_extent_length(leaf, extent, num_bytes); 17250b86a832SChris Mason btrfs_mark_buffer_dirty(leaf); 17262cdcecbcSMark Fasheh out: 17270b86a832SChris Mason btrfs_free_path(path); 17280b86a832SChris Mason return ret; 17290b86a832SChris Mason } 17300b86a832SChris Mason 17316df9a95eSJosef Bacik static u64 find_next_chunk(struct btrfs_fs_info *fs_info) 17320b86a832SChris Mason { 17336df9a95eSJosef Bacik struct extent_map_tree *em_tree; 17346df9a95eSJosef Bacik struct extent_map *em; 17356df9a95eSJosef Bacik struct rb_node *n; 17366df9a95eSJosef Bacik u64 ret = 0; 17370b86a832SChris Mason 1738c8bf1b67SDavid Sterba em_tree = &fs_info->mapping_tree; 17396df9a95eSJosef Bacik read_lock(&em_tree->lock); 174007e1ce09SLiu Bo n = rb_last(&em_tree->map.rb_root); 17416df9a95eSJosef Bacik if (n) { 17426df9a95eSJosef Bacik em = rb_entry(n, struct extent_map, rb_node); 17436df9a95eSJosef Bacik ret = em->start + em->len; 1744e17cade2SChris Mason } 17456df9a95eSJosef Bacik read_unlock(&em_tree->lock); 17466df9a95eSJosef Bacik 17470b86a832SChris Mason return ret; 17480b86a832SChris Mason } 17490b86a832SChris Mason 175053f10659SIlya Dryomov static noinline int find_next_devid(struct btrfs_fs_info *fs_info, 175153f10659SIlya Dryomov u64 *devid_ret) 17520b86a832SChris Mason { 17530b86a832SChris Mason int ret; 17540b86a832SChris 
Mason struct btrfs_key key; 17550b86a832SChris Mason struct btrfs_key found_key; 17562b82032cSYan Zheng struct btrfs_path *path; 17572b82032cSYan Zheng 17582b82032cSYan Zheng path = btrfs_alloc_path(); 17592b82032cSYan Zheng if (!path) 17602b82032cSYan Zheng return -ENOMEM; 17610b86a832SChris Mason 17620b86a832SChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 17630b86a832SChris Mason key.type = BTRFS_DEV_ITEM_KEY; 17640b86a832SChris Mason key.offset = (u64)-1; 17650b86a832SChris Mason 176653f10659SIlya Dryomov ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); 17670b86a832SChris Mason if (ret < 0) 17680b86a832SChris Mason goto error; 17690b86a832SChris Mason 1770a06dee4dSAnand Jain if (ret == 0) { 1771a06dee4dSAnand Jain /* Corruption */ 1772a06dee4dSAnand Jain btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); 1773a06dee4dSAnand Jain ret = -EUCLEAN; 1774a06dee4dSAnand Jain goto error; 1775a06dee4dSAnand Jain } 17760b86a832SChris Mason 177753f10659SIlya Dryomov ret = btrfs_previous_item(fs_info->chunk_root, path, 177853f10659SIlya Dryomov BTRFS_DEV_ITEMS_OBJECTID, 17790b86a832SChris Mason BTRFS_DEV_ITEM_KEY); 17800b86a832SChris Mason if (ret) { 178153f10659SIlya Dryomov *devid_ret = 1; 17820b86a832SChris Mason } else { 17830b86a832SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &found_key, 17840b86a832SChris Mason path->slots[0]); 178553f10659SIlya Dryomov *devid_ret = found_key.offset + 1; 17860b86a832SChris Mason } 17870b86a832SChris Mason ret = 0; 17880b86a832SChris Mason error: 17892b82032cSYan Zheng btrfs_free_path(path); 17900b86a832SChris Mason return ret; 17910b86a832SChris Mason } 17920b86a832SChris Mason 17930b86a832SChris Mason /* 17940b86a832SChris Mason * the device information is stored in the chunk root 17950b86a832SChris Mason * the btrfs_device struct should be fully filled in 17960b86a832SChris Mason */ 1797c74a0b02SAnand Jain static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, 17980b86a832SChris Mason struct btrfs_device *device) 17990b86a832SChris Mason { 18000b86a832SChris Mason int ret; 18010b86a832SChris Mason struct btrfs_path *path; 18020b86a832SChris Mason struct btrfs_dev_item *dev_item; 18030b86a832SChris Mason struct extent_buffer *leaf; 18040b86a832SChris Mason struct btrfs_key key; 18050b86a832SChris Mason unsigned long ptr; 18060b86a832SChris Mason 18070b86a832SChris Mason path = btrfs_alloc_path(); 18080b86a832SChris Mason if (!path) 18090b86a832SChris Mason return -ENOMEM; 18100b86a832SChris Mason 18110b86a832SChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 18120b86a832SChris Mason key.type = BTRFS_DEV_ITEM_KEY; 18132b82032cSYan Zheng key.offset = device->devid; 18140b86a832SChris Mason 18158e87e856SNikolay Borisov ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, 18168e87e856SNikolay Borisov &key, sizeof(*dev_item)); 18170b86a832SChris Mason if (ret) 18180b86a832SChris Mason goto out; 18190b86a832SChris Mason 18200b86a832SChris Mason leaf = path->nodes[0]; 18210b86a832SChris Mason dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 18220b86a832SChris Mason 18230b86a832SChris Mason btrfs_set_device_id(leaf, dev_item, device->devid); 18242b82032cSYan Zheng btrfs_set_device_generation(leaf, dev_item, 0); 18250b86a832SChris Mason btrfs_set_device_type(leaf, dev_item, device->type); 18260b86a832SChris Mason btrfs_set_device_io_align(leaf, dev_item, device->io_align); 18270b86a832SChris Mason btrfs_set_device_io_width(leaf, dev_item, device->io_width); 
18280b86a832SChris Mason btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 18297cc8e58dSMiao Xie btrfs_set_device_total_bytes(leaf, dev_item, 18307cc8e58dSMiao Xie btrfs_device_get_disk_total_bytes(device)); 18317cc8e58dSMiao Xie btrfs_set_device_bytes_used(leaf, dev_item, 18327cc8e58dSMiao Xie btrfs_device_get_bytes_used(device)); 1833e17cade2SChris Mason btrfs_set_device_group(leaf, dev_item, 0); 1834e17cade2SChris Mason btrfs_set_device_seek_speed(leaf, dev_item, 0); 1835e17cade2SChris Mason btrfs_set_device_bandwidth(leaf, dev_item, 0); 1836c3027eb5SChris Mason btrfs_set_device_start_offset(leaf, dev_item, 0); 18370b86a832SChris Mason 1838410ba3a2SGeert Uytterhoeven ptr = btrfs_device_uuid(dev_item); 1839e17cade2SChris Mason write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 18401473b24eSGeert Uytterhoeven ptr = btrfs_device_fsid(dev_item); 1841de37aa51SNikolay Borisov write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, 1842de37aa51SNikolay Borisov ptr, BTRFS_FSID_SIZE); 18430b86a832SChris Mason btrfs_mark_buffer_dirty(leaf); 18440b86a832SChris Mason 18452b82032cSYan Zheng ret = 0; 18460b86a832SChris Mason out: 18470b86a832SChris Mason btrfs_free_path(path); 18480b86a832SChris Mason return ret; 18490b86a832SChris Mason } 18508f18cf13SChris Mason 18515a1972bdSQu Wenruo /* 18525a1972bdSQu Wenruo * Function to update ctime/mtime for a given device path. 18535a1972bdSQu Wenruo * Mainly used for ctime/mtime based probe like libblkid. 18545a1972bdSQu Wenruo */ 1855da353f6bSDavid Sterba static void update_dev_time(const char *path_name) 18565a1972bdSQu Wenruo { 18575a1972bdSQu Wenruo struct file *filp; 18585a1972bdSQu Wenruo 18595a1972bdSQu Wenruo filp = filp_open(path_name, O_RDWR, 0); 186098af592fSAl Viro if (IS_ERR(filp)) 18615a1972bdSQu Wenruo return; 18625a1972bdSQu Wenruo file_update_time(filp); 18635a1972bdSQu Wenruo filp_close(filp, NULL); 18645a1972bdSQu Wenruo } 18655a1972bdSQu Wenruo 1866f331a952SDavid Sterba static int btrfs_rm_dev_item(struct btrfs_device *device) 1867a061fc8dSChris Mason { 1868f331a952SDavid Sterba struct btrfs_root *root = device->fs_info->chunk_root; 1869a061fc8dSChris Mason int ret; 1870a061fc8dSChris Mason struct btrfs_path *path; 1871a061fc8dSChris Mason struct btrfs_key key; 1872a061fc8dSChris Mason struct btrfs_trans_handle *trans; 1873a061fc8dSChris Mason 1874a061fc8dSChris Mason path = btrfs_alloc_path(); 1875a061fc8dSChris Mason if (!path) 1876a061fc8dSChris Mason return -ENOMEM; 1877a061fc8dSChris Mason 1878a22285a6SYan, Zheng trans = btrfs_start_transaction(root, 0); 187998d5dc13STsutomu Itoh if (IS_ERR(trans)) { 188098d5dc13STsutomu Itoh btrfs_free_path(path); 188198d5dc13STsutomu Itoh return PTR_ERR(trans); 188298d5dc13STsutomu Itoh } 1883a061fc8dSChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1884a061fc8dSChris Mason key.type = BTRFS_DEV_ITEM_KEY; 1885a061fc8dSChris Mason key.offset = device->devid; 1886a061fc8dSChris Mason 1887a061fc8dSChris Mason ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 18885e9f2ad5SNikolay Borisov if (ret) { 18895e9f2ad5SNikolay Borisov if (ret > 0) 1890a061fc8dSChris Mason ret = -ENOENT; 18915e9f2ad5SNikolay Borisov btrfs_abort_transaction(trans, ret); 18925e9f2ad5SNikolay Borisov btrfs_end_transaction(trans); 1893a061fc8dSChris Mason goto out; 1894a061fc8dSChris Mason } 1895a061fc8dSChris Mason 1896a061fc8dSChris Mason ret = btrfs_del_item(trans, root, path); 18975e9f2ad5SNikolay Borisov if (ret) { 18985e9f2ad5SNikolay Borisov btrfs_abort_transaction(trans, 
ret); 18995e9f2ad5SNikolay Borisov btrfs_end_transaction(trans); 19005e9f2ad5SNikolay Borisov } 19015e9f2ad5SNikolay Borisov 1902a061fc8dSChris Mason out: 1903a061fc8dSChris Mason btrfs_free_path(path); 19045e9f2ad5SNikolay Borisov if (!ret) 19055e9f2ad5SNikolay Borisov ret = btrfs_commit_transaction(trans); 1906a061fc8dSChris Mason return ret; 1907a061fc8dSChris Mason } 1908a061fc8dSChris Mason 19093cc31a0dSDavid Sterba /* 19103cc31a0dSDavid Sterba * Verify that @num_devices satisfies the RAID profile constraints in the whole 19113cc31a0dSDavid Sterba * filesystem. It's up to the caller to adjust that number regarding e.g. device 19123cc31a0dSDavid Sterba * replace. 19133cc31a0dSDavid Sterba */ 19143cc31a0dSDavid Sterba static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, 19153cc31a0dSDavid Sterba u64 num_devices) 1916a061fc8dSChris Mason { 1917a061fc8dSChris Mason u64 all_avail; 1918de98ced9SMiao Xie unsigned seq; 1919418775a2SDavid Sterba int i; 1920a061fc8dSChris Mason 1921de98ced9SMiao Xie do { 1922bd45ffbcSAnand Jain seq = read_seqbegin(&fs_info->profiles_lock); 1923de98ced9SMiao Xie 1924bd45ffbcSAnand Jain all_avail = fs_info->avail_data_alloc_bits | 1925bd45ffbcSAnand Jain fs_info->avail_system_alloc_bits | 1926bd45ffbcSAnand Jain fs_info->avail_metadata_alloc_bits; 1927bd45ffbcSAnand Jain } while (read_seqretry(&fs_info->profiles_lock, seq)); 1928f1fa7f26SAnand Jain 1929418775a2SDavid Sterba for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 193041a6e891SAnand Jain if (!(all_avail & btrfs_raid_array[i].bg_flag)) 1931418775a2SDavid Sterba continue; 1932a061fc8dSChris Mason 1933418775a2SDavid Sterba if (num_devices < btrfs_raid_array[i].devs_min) { 1934f9fbcaa2SAnand Jain int ret = btrfs_raid_array[i].mindev_error; 1935a061fc8dSChris Mason 1936418775a2SDavid Sterba if (ret) 1937418775a2SDavid Sterba return ret; 193853b381b3SDavid Woodhouse } 1939bd45ffbcSAnand Jain } 1940bd45ffbcSAnand Jain 1941bd45ffbcSAnand Jain return 0; 1942f1fa7f26SAnand Jain } 1943f1fa7f26SAnand Jain 1944c9162bdfSOmar Sandoval static struct btrfs_device * btrfs_find_next_active_device( 1945c9162bdfSOmar Sandoval struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) 194688acff64SAnand Jain { 194788acff64SAnand Jain struct btrfs_device *next_device; 194888acff64SAnand Jain 194988acff64SAnand Jain list_for_each_entry(next_device, &fs_devs->devices, dev_list) { 195088acff64SAnand Jain if (next_device != device && 1951e6e674bdSAnand Jain !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) 1952e6e674bdSAnand Jain && next_device->bdev) 195388acff64SAnand Jain return next_device; 195488acff64SAnand Jain } 195588acff64SAnand Jain 195688acff64SAnand Jain return NULL; 195788acff64SAnand Jain } 195888acff64SAnand Jain 195988acff64SAnand Jain /* 196088acff64SAnand Jain * Helper function to check if the given device is part of s_bdev / latest_bdev 196188acff64SAnand Jain * and replace it with the provided or the next active device. In the context 196288acff64SAnand Jain * where this function is called, there should always be another active device 196388acff64SAnand Jain * (or this_dev).
196488acff64SAnand Jain */ 1965b105e927SDavid Sterba void __cold btrfs_assign_next_active_device(struct btrfs_device *device, 1966e493e8f9SAnand Jain struct btrfs_device *next_device) 196788acff64SAnand Jain { 1968d6507cf1SNikolay Borisov struct btrfs_fs_info *fs_info = device->fs_info; 196988acff64SAnand Jain 1970e493e8f9SAnand Jain if (!next_device) 197188acff64SAnand Jain next_device = btrfs_find_next_active_device(fs_info->fs_devices, 197288acff64SAnand Jain device); 197388acff64SAnand Jain ASSERT(next_device); 197488acff64SAnand Jain 197588acff64SAnand Jain if (fs_info->sb->s_bdev && 197688acff64SAnand Jain (fs_info->sb->s_bdev == device->bdev)) 197788acff64SAnand Jain fs_info->sb->s_bdev = next_device->bdev; 197888acff64SAnand Jain 197988acff64SAnand Jain if (fs_info->fs_devices->latest_bdev == device->bdev) 198088acff64SAnand Jain fs_info->fs_devices->latest_bdev = next_device->bdev; 198188acff64SAnand Jain } 198288acff64SAnand Jain 19831da73967SAnand Jain /* 19841da73967SAnand Jain * Return btrfs_fs_devices::num_devices excluding the device that's being 19851da73967SAnand Jain * currently replaced. 19861da73967SAnand Jain */ 19871da73967SAnand Jain static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) 19881da73967SAnand Jain { 19891da73967SAnand Jain u64 num_devices = fs_info->fs_devices->num_devices; 19901da73967SAnand Jain 1991cb5583ddSDavid Sterba down_read(&fs_info->dev_replace.rwsem); 19921da73967SAnand Jain if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 19931da73967SAnand Jain ASSERT(num_devices > 1); 19941da73967SAnand Jain num_devices--; 19951da73967SAnand Jain } 1996cb5583ddSDavid Sterba up_read(&fs_info->dev_replace.rwsem); 19971da73967SAnand Jain 19981da73967SAnand Jain return num_devices; 19991da73967SAnand Jain } 20001da73967SAnand Jain 2001313b0858SJosef Bacik void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, 20028f32380dSJohannes Thumshirn struct block_device *bdev, 20036fbceb9fSJohannes Thumshirn const char *device_path) 20046fbceb9fSJohannes Thumshirn { 20056fbceb9fSJohannes Thumshirn struct btrfs_super_block *disk_super; 20066fbceb9fSJohannes Thumshirn int copy_num; 20076fbceb9fSJohannes Thumshirn 20086fbceb9fSJohannes Thumshirn if (!bdev) 20096fbceb9fSJohannes Thumshirn return; 20106fbceb9fSJohannes Thumshirn 20116fbceb9fSJohannes Thumshirn for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { 20128f32380dSJohannes Thumshirn struct page *page; 20138f32380dSJohannes Thumshirn int ret; 20148f32380dSJohannes Thumshirn 20158f32380dSJohannes Thumshirn disk_super = btrfs_read_dev_one_super(bdev, copy_num); 20168f32380dSJohannes Thumshirn if (IS_ERR(disk_super)) 20176fbceb9fSJohannes Thumshirn continue; 20186fbceb9fSJohannes Thumshirn 20196fbceb9fSJohannes Thumshirn memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 20208f32380dSJohannes Thumshirn 20218f32380dSJohannes Thumshirn page = virt_to_page(disk_super); 20228f32380dSJohannes Thumshirn set_page_dirty(page); 20238f32380dSJohannes Thumshirn lock_page(page); 20248f32380dSJohannes Thumshirn /* write_one_page() unlocks the page */ 20258f32380dSJohannes Thumshirn ret = write_one_page(page); 20268f32380dSJohannes Thumshirn if (ret) 20278f32380dSJohannes Thumshirn btrfs_warn(fs_info, 20288f32380dSJohannes Thumshirn "error clearing superblock number %d (%d)", 20298f32380dSJohannes Thumshirn copy_num, ret); 20308f32380dSJohannes Thumshirn btrfs_release_disk_super(disk_super); 20318f32380dSJohannes Thumshirn 20326fbceb9fSJohannes Thumshirn } 20336fbceb9fSJohannes Thumshirn
20346fbceb9fSJohannes Thumshirn /* Notify udev that device has changed */ 20356fbceb9fSJohannes Thumshirn btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 20366fbceb9fSJohannes Thumshirn 20376fbceb9fSJohannes Thumshirn /* Update ctime/mtime for device path for libblkid */ 20386fbceb9fSJohannes Thumshirn update_dev_time(device_path); 20396fbceb9fSJohannes Thumshirn } 20406fbceb9fSJohannes Thumshirn 2041da353f6bSDavid Sterba int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, 2042da353f6bSDavid Sterba u64 devid) 2043f1fa7f26SAnand Jain { 2044f1fa7f26SAnand Jain struct btrfs_device *device; 2045f1fa7f26SAnand Jain struct btrfs_fs_devices *cur_devices; 2046b5185197SAnand Jain struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2047f1fa7f26SAnand Jain u64 num_devices; 2048f1fa7f26SAnand Jain int ret = 0; 2049f1fa7f26SAnand Jain 2050f1fa7f26SAnand Jain mutex_lock(&uuid_mutex); 2051a061fc8dSChris Mason 20521da73967SAnand Jain num_devices = btrfs_num_devices(fs_info); 2053a061fc8dSChris Mason 20540b246afaSJeff Mahoney ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 2055beaf8ab3SStefan Behrens if (ret) 2056a061fc8dSChris Mason goto out; 2057f1fa7f26SAnand Jain 2058a27a94c2SNikolay Borisov device = btrfs_find_device_by_devspec(fs_info, devid, device_path); 2059a27a94c2SNikolay Borisov 2060a27a94c2SNikolay Borisov if (IS_ERR(device)) { 2061a27a94c2SNikolay Borisov if (PTR_ERR(device) == -ENOENT && 2062a27a94c2SNikolay Borisov strcmp(device_path, "missing") == 0) 2063a27a94c2SNikolay Borisov ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2064a27a94c2SNikolay Borisov else 2065a27a94c2SNikolay Borisov ret = PTR_ERR(device); 2066a061fc8dSChris Mason goto out; 2067a27a94c2SNikolay Borisov } 20682b82032cSYan Zheng 2069eede2bf3SOmar Sandoval if (btrfs_pinned_by_swapfile(fs_info, device)) { 2070eede2bf3SOmar Sandoval btrfs_warn_in_rcu(fs_info, 2071eede2bf3SOmar Sandoval "cannot remove device %s (devid %llu) due to active swapfile", 2072eede2bf3SOmar Sandoval rcu_str_deref(device->name), device->devid); 2073eede2bf3SOmar Sandoval ret = -ETXTBSY; 2074eede2bf3SOmar Sandoval goto out; 2075eede2bf3SOmar Sandoval } 2076eede2bf3SOmar Sandoval 2077401e29c1SAnand Jain if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2078183860f6SAnand Jain ret = BTRFS_ERROR_DEV_TGT_REPLACE; 207924fc572fSAnand Jain goto out; 208063a212abSStefan Behrens } 208163a212abSStefan Behrens 2082ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 2083ebbede42SAnand Jain fs_info->fs_devices->rw_devices == 1) { 2084183860f6SAnand Jain ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; 208524fc572fSAnand Jain goto out; 20862b82032cSYan Zheng } 20872b82032cSYan Zheng 2088ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 208934441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 20902b82032cSYan Zheng list_del_init(&device->dev_alloc_list); 2091c3929c36SMiao Xie device->fs_devices->rw_devices--; 209234441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 20932b82032cSYan Zheng } 2094a061fc8dSChris Mason 2095d7901554SCarey Underwood mutex_unlock(&uuid_mutex); 2096a061fc8dSChris Mason ret = btrfs_shrink_device(device, 0); 209766d204a1SFilipe Manana if (!ret) 209866d204a1SFilipe Manana btrfs_reada_remove_dev(device); 2099d7901554SCarey Underwood mutex_lock(&uuid_mutex); 2100a061fc8dSChris Mason if (ret) 21019b3517e9SIlya Dryomov goto error_undo; 2102a061fc8dSChris Mason 210363a212abSStefan Behrens /* 210463a212abSStefan Behrens * TODO: the superblock 
still includes this device in its num_devices 210563a212abSStefan Behrens * counter although write_all_supers() is not locked out. This 210663a212abSStefan Behrens * could give a filesystem state which requires a degraded mount. 210763a212abSStefan Behrens */ 2108f331a952SDavid Sterba ret = btrfs_rm_dev_item(device); 2109a061fc8dSChris Mason if (ret) 21109b3517e9SIlya Dryomov goto error_undo; 2111a061fc8dSChris Mason 2112e12c9621SAnand Jain clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2113163e97eeSDavid Sterba btrfs_scrub_cancel_dev(device); 2114e5e9a520SChris Mason 2115e5e9a520SChris Mason /* 2116e5e9a520SChris Mason * the device list mutex makes sure that we don't change 2117e5e9a520SChris Mason * the device list while someone else is writing out all 2118d7306801SFilipe David Borba Manana * the device supers. Whoever is writing all supers, should 2119d7306801SFilipe David Borba Manana * lock the device list mutex before getting the number of 2120d7306801SFilipe David Borba Manana * devices in the super block (super_copy). Conversely, 2121d7306801SFilipe David Borba Manana * whoever updates the number of devices in the super block 2122d7306801SFilipe David Borba Manana * (super_copy) should hold the device list mutex. 2123e5e9a520SChris Mason */ 21241f78160cSXiao Guangrong 212541a52a0fSAnand Jain /* 212641a52a0fSAnand Jain * In normal cases the cur_devices == fs_devices. But in case 212741a52a0fSAnand Jain * of deleting a seed device, the cur_devices should point to 212841a52a0fSAnand Jain * its own fs_devices listed under the fs_devices->seed. 212941a52a0fSAnand Jain */ 21301f78160cSXiao Guangrong cur_devices = device->fs_devices; 2131b5185197SAnand Jain mutex_lock(&fs_devices->device_list_mutex); 21321f78160cSXiao Guangrong list_del_rcu(&device->dev_list); 2133e5e9a520SChris Mason 213441a52a0fSAnand Jain cur_devices->num_devices--; 213541a52a0fSAnand Jain cur_devices->total_devices--; 2136b4993e64SAnand Jain /* Update total_devices of the parent fs_devices if it's seed */ 2137b4993e64SAnand Jain if (cur_devices != fs_devices) 2138b4993e64SAnand Jain fs_devices->total_devices--; 21392b82032cSYan Zheng 2140e6e674bdSAnand Jain if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 214141a52a0fSAnand Jain cur_devices->missing_devices--; 2142cd02dca5SChris Mason 2143d6507cf1SNikolay Borisov btrfs_assign_next_active_device(device, NULL); 21442b82032cSYan Zheng 21450bfaa9c5SEric Sandeen if (device->bdev) { 214641a52a0fSAnand Jain cur_devices->open_devices--; 214799994cdeSAnand Jain /* remove sysfs entry */ 214853f8a74cSAnand Jain btrfs_sysfs_remove_device(device); 21490bfaa9c5SEric Sandeen } 215099994cdeSAnand Jain 21510b246afaSJeff Mahoney num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 21520b246afaSJeff Mahoney btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 2153b5185197SAnand Jain mutex_unlock(&fs_devices->device_list_mutex); 2154e4404d6eSYan Zheng 2155cea67ab9SJeff Mahoney /* 2156cea67ab9SJeff Mahoney * at this point, the device is zero sized and detached from 2157cea67ab9SJeff Mahoney * the devices list. All that's left is to zero out the old 2158cea67ab9SJeff Mahoney * supers and free the device. 
2159cea67ab9SJeff Mahoney */ 2160ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 21618f32380dSJohannes Thumshirn btrfs_scratch_superblocks(fs_info, device->bdev, 21628f32380dSJohannes Thumshirn device->name->str); 2163cea67ab9SJeff Mahoney 2164cea67ab9SJeff Mahoney btrfs_close_bdev(device); 21658e75fd89SNikolay Borisov synchronize_rcu(); 21668e75fd89SNikolay Borisov btrfs_free_device(device); 2167cea67ab9SJeff Mahoney 21681f78160cSXiao Guangrong if (cur_devices->open_devices == 0) { 2169944d3f9fSNikolay Borisov list_del_init(&cur_devices->seed_list); 21700226e0ebSAnand Jain close_fs_devices(cur_devices); 21711f78160cSXiao Guangrong free_fs_devices(cur_devices); 21722b82032cSYan Zheng } 21732b82032cSYan Zheng 2174a061fc8dSChris Mason out: 2175a061fc8dSChris Mason mutex_unlock(&uuid_mutex); 2176a061fc8dSChris Mason return ret; 217724fc572fSAnand Jain 21789b3517e9SIlya Dryomov error_undo: 217966d204a1SFilipe Manana btrfs_reada_undo_remove_dev(device); 2180ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 218134441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 21829b3517e9SIlya Dryomov list_add(&device->dev_alloc_list, 2183b5185197SAnand Jain &fs_devices->alloc_list); 2184c3929c36SMiao Xie device->fs_devices->rw_devices++; 218534441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 21869b3517e9SIlya Dryomov } 218724fc572fSAnand Jain goto out; 2188a061fc8dSChris Mason } 2189a061fc8dSChris Mason 219068a9db5fSNikolay Borisov void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) 2191e93c89c1SStefan Behrens { 2192d51908ceSAnand Jain struct btrfs_fs_devices *fs_devices; 2193d51908ceSAnand Jain 219468a9db5fSNikolay Borisov lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex); 21951357272fSIlya Dryomov 219625e8e911SAnand Jain /* 219725e8e911SAnand Jain * in case of fs with no seed, srcdev->fs_devices will point 219825e8e911SAnand Jain * to fs_devices of fs_info. However when the dev being replaced is 219925e8e911SAnand Jain * a seed dev it will point to the seed's local fs_devices. In short 220025e8e911SAnand Jain * srcdev will have its correct fs_devices in both the cases. 
220125e8e911SAnand Jain */ 220225e8e911SAnand Jain fs_devices = srcdev->fs_devices; 2203d51908ceSAnand Jain 2204e93c89c1SStefan Behrens list_del_rcu(&srcdev->dev_list); 2205619c47f3SDavid Sterba list_del(&srcdev->dev_alloc_list); 2206d51908ceSAnand Jain fs_devices->num_devices--; 2207e6e674bdSAnand Jain if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) 2208d51908ceSAnand Jain fs_devices->missing_devices--; 2209e93c89c1SStefan Behrens 2210ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) 221182372bc8SMiao Xie fs_devices->rw_devices--; 22121357272fSIlya Dryomov 221382372bc8SMiao Xie if (srcdev->bdev) 221482372bc8SMiao Xie fs_devices->open_devices--; 2215084b6e7cSQu Wenruo } 2216084b6e7cSQu Wenruo 221765237ee3SDavid Sterba void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) 2218084b6e7cSQu Wenruo { 2219084b6e7cSQu Wenruo struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 222082372bc8SMiao Xie 2221a466c85eSJosef Bacik mutex_lock(&uuid_mutex); 2222a466c85eSJosef Bacik 222314238819SAnand Jain btrfs_close_bdev(srcdev); 22248e75fd89SNikolay Borisov synchronize_rcu(); 22258e75fd89SNikolay Borisov btrfs_free_device(srcdev); 222694d5f0c2SAnand Jain 222794d5f0c2SAnand Jain /* if this is no devs we rather delete the fs_devices */ 222894d5f0c2SAnand Jain if (!fs_devices->num_devices) { 22296dd38f81SAnand Jain /* 22306dd38f81SAnand Jain * On a mounted FS, num_devices can't be zero unless it's a 22316dd38f81SAnand Jain * seed. In case of a seed device being replaced, the replace 22326dd38f81SAnand Jain * target added to the sprout FS, so there will be no more 22336dd38f81SAnand Jain * device left under the seed FS. 22346dd38f81SAnand Jain */ 22356dd38f81SAnand Jain ASSERT(fs_devices->seeding); 22366dd38f81SAnand Jain 2237944d3f9fSNikolay Borisov list_del_init(&fs_devices->seed_list); 22380226e0ebSAnand Jain close_fs_devices(fs_devices); 22398bef8401SAnand Jain free_fs_devices(fs_devices); 224094d5f0c2SAnand Jain } 2241a466c85eSJosef Bacik mutex_unlock(&uuid_mutex); 2242e93c89c1SStefan Behrens } 2243e93c89c1SStefan Behrens 22444f5ad7bdSNikolay Borisov void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) 2245e93c89c1SStefan Behrens { 22464f5ad7bdSNikolay Borisov struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; 2247d2ff1b20SAnand Jain 2248d9a071f0SAnand Jain mutex_lock(&fs_devices->device_list_mutex); 2249d9a071f0SAnand Jain 225053f8a74cSAnand Jain btrfs_sysfs_remove_device(tgtdev); 2251d2ff1b20SAnand Jain 2252779bf3feSAnand Jain if (tgtdev->bdev) 2253d9a071f0SAnand Jain fs_devices->open_devices--; 2254779bf3feSAnand Jain 2255d9a071f0SAnand Jain fs_devices->num_devices--; 2256e93c89c1SStefan Behrens 2257d6507cf1SNikolay Borisov btrfs_assign_next_active_device(tgtdev, NULL); 2258e93c89c1SStefan Behrens 2259e93c89c1SStefan Behrens list_del_rcu(&tgtdev->dev_list); 2260e93c89c1SStefan Behrens 2261d9a071f0SAnand Jain mutex_unlock(&fs_devices->device_list_mutex); 2262779bf3feSAnand Jain 2263779bf3feSAnand Jain /* 2264779bf3feSAnand Jain * The update_dev_time() with in btrfs_scratch_superblocks() 2265779bf3feSAnand Jain * may lead to a call to btrfs_show_devname() which will try 2266779bf3feSAnand Jain * to hold device_list_mutex. And here this device 2267779bf3feSAnand Jain * is already out of device list, so we don't have to hold 2268779bf3feSAnand Jain * the device_list_mutex lock. 
2269779bf3feSAnand Jain */ 22708f32380dSJohannes Thumshirn btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, 22718f32380dSJohannes Thumshirn tgtdev->name->str); 227214238819SAnand Jain 227314238819SAnand Jain btrfs_close_bdev(tgtdev); 22748e75fd89SNikolay Borisov synchronize_rcu(); 22758e75fd89SNikolay Borisov btrfs_free_device(tgtdev); 2276e93c89c1SStefan Behrens } 2277e93c89c1SStefan Behrens 2278b444ad46SNikolay Borisov static struct btrfs_device *btrfs_find_device_by_path( 2279b444ad46SNikolay Borisov struct btrfs_fs_info *fs_info, const char *device_path) 22807ba15b7dSStefan Behrens { 22817ba15b7dSStefan Behrens int ret = 0; 22827ba15b7dSStefan Behrens struct btrfs_super_block *disk_super; 22837ba15b7dSStefan Behrens u64 devid; 22847ba15b7dSStefan Behrens u8 *dev_uuid; 22857ba15b7dSStefan Behrens struct block_device *bdev; 2286b444ad46SNikolay Borisov struct btrfs_device *device; 22877ba15b7dSStefan Behrens 22887ba15b7dSStefan Behrens ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 22898f32380dSJohannes Thumshirn fs_info->bdev_holder, 0, &bdev, &disk_super); 22907ba15b7dSStefan Behrens if (ret) 2291b444ad46SNikolay Borisov return ERR_PTR(ret); 22928f32380dSJohannes Thumshirn 22937ba15b7dSStefan Behrens devid = btrfs_stack_device_id(&disk_super->dev_item); 22947ba15b7dSStefan Behrens dev_uuid = disk_super->dev_item.uuid; 22957239ff4bSNikolay Borisov if (btrfs_fs_incompat(fs_info, METADATA_UUID)) 2296e4319cd9SAnand Jain device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 229709ba3bc9SAnand Jain disk_super->metadata_uuid, true); 22987239ff4bSNikolay Borisov else 2299e4319cd9SAnand Jain device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 230009ba3bc9SAnand Jain disk_super->fsid, true); 23017239ff4bSNikolay Borisov 23028f32380dSJohannes Thumshirn btrfs_release_disk_super(disk_super); 2303b444ad46SNikolay Borisov if (!device) 2304b444ad46SNikolay Borisov device = ERR_PTR(-ENOENT); 23057ba15b7dSStefan Behrens blkdev_put(bdev, FMODE_READ); 2306b444ad46SNikolay Borisov return device; 23077ba15b7dSStefan Behrens } 23087ba15b7dSStefan Behrens 23092b82032cSYan Zheng /* 23105c5c0df0SDavid Sterba * Lookup a device given by device id, or the path if the id is 0. 
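 * The special path "missing" selects the first device that is present in the filesystem metadata but has no block device attached.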
23115c5c0df0SDavid Sterba */ 2312a27a94c2SNikolay Borisov struct btrfs_device *btrfs_find_device_by_devspec( 23136e927cebSAnand Jain struct btrfs_fs_info *fs_info, u64 devid, 23146e927cebSAnand Jain const char *device_path) 231524e0474bSAnand Jain { 2316a27a94c2SNikolay Borisov struct btrfs_device *device; 231724e0474bSAnand Jain 23185c5c0df0SDavid Sterba if (devid) { 2319e4319cd9SAnand Jain device = btrfs_find_device(fs_info->fs_devices, devid, NULL, 232009ba3bc9SAnand Jain NULL, true); 2321a27a94c2SNikolay Borisov if (!device) 2322a27a94c2SNikolay Borisov return ERR_PTR(-ENOENT); 23236e927cebSAnand Jain return device; 23246e927cebSAnand Jain } 23256e927cebSAnand Jain 23266e927cebSAnand Jain if (!device_path || !device_path[0]) 2327a27a94c2SNikolay Borisov return ERR_PTR(-EINVAL); 2328d95a830cSAnand Jain 23296e927cebSAnand Jain if (strcmp(device_path, "missing") == 0) { 23306e927cebSAnand Jain /* Find first missing device */ 2331d95a830cSAnand Jain list_for_each_entry(device, &fs_info->fs_devices->devices, 2332d95a830cSAnand Jain dev_list) { 2333d95a830cSAnand Jain if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 23346e927cebSAnand Jain &device->dev_state) && !device->bdev) 2335d95a830cSAnand Jain return device; 2336d95a830cSAnand Jain } 2337d95a830cSAnand Jain return ERR_PTR(-ENOENT); 2338d95a830cSAnand Jain } 23396e927cebSAnand Jain 23406e927cebSAnand Jain return btrfs_find_device_by_path(fs_info, device_path); 234124e0474bSAnand Jain } 234224e0474bSAnand Jain 23432b82032cSYan Zheng /* 23442b82032cSYan Zheng * does all the dirty work required for changing file system's UUID. 23452b82032cSYan Zheng */ 23462ff7e61eSJeff Mahoney static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) 23472b82032cSYan Zheng { 23480b246afaSJeff Mahoney struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 23492b82032cSYan Zheng struct btrfs_fs_devices *old_devices; 2350e4404d6eSYan Zheng struct btrfs_fs_devices *seed_devices; 23510b246afaSJeff Mahoney struct btrfs_super_block *disk_super = fs_info->super_copy; 23522b82032cSYan Zheng struct btrfs_device *device; 23532b82032cSYan Zheng u64 super_flags; 23542b82032cSYan Zheng 2355a32bf9a3SDavid Sterba lockdep_assert_held(&uuid_mutex); 2356e4404d6eSYan Zheng if (!fs_devices->seeding) 23572b82032cSYan Zheng return -EINVAL; 23582b82032cSYan Zheng 2359427c8fddSNikolay Borisov /* 2360427c8fddSNikolay Borisov * Private copy of the seed devices, anchored at 2361427c8fddSNikolay Borisov * fs_info->fs_devices->seed_list 2362427c8fddSNikolay Borisov */ 23637239ff4bSNikolay Borisov seed_devices = alloc_fs_devices(NULL, NULL); 23642208a378SIlya Dryomov if (IS_ERR(seed_devices)) 23652208a378SIlya Dryomov return PTR_ERR(seed_devices); 23662b82032cSYan Zheng 2367427c8fddSNikolay Borisov /* 2368427c8fddSNikolay Borisov * It's necessary to retain a copy of the original seed fs_devices in 2369427c8fddSNikolay Borisov * fs_uuids so that filesystems which have been seeded can successfully 2370427c8fddSNikolay Borisov * reference the seed device from open_seed_devices. This also supports 2371427c8fddSNikolay Borisov * multiple fs seed. 
2372427c8fddSNikolay Borisov */ 2373e4404d6eSYan Zheng old_devices = clone_fs_devices(fs_devices); 2374e4404d6eSYan Zheng if (IS_ERR(old_devices)) { 2375e4404d6eSYan Zheng kfree(seed_devices); 2376e4404d6eSYan Zheng return PTR_ERR(old_devices); 23772b82032cSYan Zheng } 2378e4404d6eSYan Zheng 2379c4babc5eSAnand Jain list_add(&old_devices->fs_list, &fs_uuids); 23802b82032cSYan Zheng 2381e4404d6eSYan Zheng memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2382e4404d6eSYan Zheng seed_devices->opened = 1; 2383e4404d6eSYan Zheng INIT_LIST_HEAD(&seed_devices->devices); 2384e4404d6eSYan Zheng INIT_LIST_HEAD(&seed_devices->alloc_list); 2385e5e9a520SChris Mason mutex_init(&seed_devices->device_list_mutex); 2386c9513edbSXiao Guangrong 2387321a4bf7SAnand Jain mutex_lock(&fs_devices->device_list_mutex); 23881f78160cSXiao Guangrong list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 23891f78160cSXiao Guangrong synchronize_rcu); 23902196d6e8SMiao Xie list_for_each_entry(device, &seed_devices->devices, dev_list) 2391e4404d6eSYan Zheng device->fs_devices = seed_devices; 23922196d6e8SMiao Xie 23930395d84fSJohannes Thumshirn fs_devices->seeding = false; 23942b82032cSYan Zheng fs_devices->num_devices = 0; 23952b82032cSYan Zheng fs_devices->open_devices = 0; 239669611ac8SMiao Xie fs_devices->missing_devices = 0; 23977f0432d0SJohannes Thumshirn fs_devices->rotating = false; 2398944d3f9fSNikolay Borisov list_add(&seed_devices->seed_list, &fs_devices->seed_list); 23992b82032cSYan Zheng 24002b82032cSYan Zheng generate_random_uuid(fs_devices->fsid); 24017239ff4bSNikolay Borisov memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); 24022b82032cSYan Zheng memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2403321a4bf7SAnand Jain mutex_unlock(&fs_devices->device_list_mutex); 2404f7171750SFilipe David Borba Manana 24052b82032cSYan Zheng super_flags = btrfs_super_flags(disk_super) & 24062b82032cSYan Zheng ~BTRFS_SUPER_FLAG_SEEDING; 24072b82032cSYan Zheng btrfs_set_super_flags(disk_super, super_flags); 24082b82032cSYan Zheng 24092b82032cSYan Zheng return 0; 24102b82032cSYan Zheng } 24112b82032cSYan Zheng 24122b82032cSYan Zheng /* 241301327610SNicholas D Steeves * Store the expected generation for seed devices in device items. 
24142b82032cSYan Zheng */ 24155c466629SDavid Sterba static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) 24162b82032cSYan Zheng { 24175c466629SDavid Sterba struct btrfs_fs_info *fs_info = trans->fs_info; 24185b4aacefSJeff Mahoney struct btrfs_root *root = fs_info->chunk_root; 24192b82032cSYan Zheng struct btrfs_path *path; 24202b82032cSYan Zheng struct extent_buffer *leaf; 24212b82032cSYan Zheng struct btrfs_dev_item *dev_item; 24222b82032cSYan Zheng struct btrfs_device *device; 24232b82032cSYan Zheng struct btrfs_key key; 242444880fdcSAnand Jain u8 fs_uuid[BTRFS_FSID_SIZE]; 24252b82032cSYan Zheng u8 dev_uuid[BTRFS_UUID_SIZE]; 24262b82032cSYan Zheng u64 devid; 24272b82032cSYan Zheng int ret; 24282b82032cSYan Zheng 24292b82032cSYan Zheng path = btrfs_alloc_path(); 24302b82032cSYan Zheng if (!path) 24312b82032cSYan Zheng return -ENOMEM; 24322b82032cSYan Zheng 24332b82032cSYan Zheng key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 24342b82032cSYan Zheng key.offset = 0; 24352b82032cSYan Zheng key.type = BTRFS_DEV_ITEM_KEY; 24362b82032cSYan Zheng 24372b82032cSYan Zheng while (1) { 24382b82032cSYan Zheng ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 24392b82032cSYan Zheng if (ret < 0) 24402b82032cSYan Zheng goto error; 24412b82032cSYan Zheng 24422b82032cSYan Zheng leaf = path->nodes[0]; 24432b82032cSYan Zheng next_slot: 24442b82032cSYan Zheng if (path->slots[0] >= btrfs_header_nritems(leaf)) { 24452b82032cSYan Zheng ret = btrfs_next_leaf(root, path); 24462b82032cSYan Zheng if (ret > 0) 24472b82032cSYan Zheng break; 24482b82032cSYan Zheng if (ret < 0) 24492b82032cSYan Zheng goto error; 24502b82032cSYan Zheng leaf = path->nodes[0]; 24512b82032cSYan Zheng btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2452b3b4aa74SDavid Sterba btrfs_release_path(path); 24532b82032cSYan Zheng continue; 24542b82032cSYan Zheng } 24552b82032cSYan Zheng 24562b82032cSYan Zheng btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 24572b82032cSYan Zheng if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 24582b82032cSYan Zheng key.type != BTRFS_DEV_ITEM_KEY) 24592b82032cSYan Zheng break; 24602b82032cSYan Zheng 24612b82032cSYan Zheng dev_item = btrfs_item_ptr(leaf, path->slots[0], 24622b82032cSYan Zheng struct btrfs_dev_item); 24632b82032cSYan Zheng devid = btrfs_device_id(leaf, dev_item); 2464410ba3a2SGeert Uytterhoeven read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 24652b82032cSYan Zheng BTRFS_UUID_SIZE); 24661473b24eSGeert Uytterhoeven read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 246744880fdcSAnand Jain BTRFS_FSID_SIZE); 2468e4319cd9SAnand Jain device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 246909ba3bc9SAnand Jain fs_uuid, true); 247079787eaaSJeff Mahoney BUG_ON(!device); /* Logic error */ 24712b82032cSYan Zheng 24722b82032cSYan Zheng if (device->fs_devices->seeding) { 24732b82032cSYan Zheng btrfs_set_device_generation(leaf, dev_item, 24742b82032cSYan Zheng device->generation); 24752b82032cSYan Zheng btrfs_mark_buffer_dirty(leaf); 24762b82032cSYan Zheng } 24772b82032cSYan Zheng 24782b82032cSYan Zheng path->slots[0]++; 24792b82032cSYan Zheng goto next_slot; 24802b82032cSYan Zheng } 24812b82032cSYan Zheng ret = 0; 24822b82032cSYan Zheng error: 24832b82032cSYan Zheng btrfs_free_path(path); 24842b82032cSYan Zheng return ret; 24852b82032cSYan Zheng } 24862b82032cSYan Zheng 2487da353f6bSDavid Sterba int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2488788f20ebSChris Mason { 24895112febbSJeff Mahoney struct btrfs_root *root = 
fs_info->dev_root; 2490d5e2003cSJosef Bacik struct request_queue *q; 2491788f20ebSChris Mason struct btrfs_trans_handle *trans; 2492788f20ebSChris Mason struct btrfs_device *device; 2493788f20ebSChris Mason struct block_device *bdev; 24940b246afaSJeff Mahoney struct super_block *sb = fs_info->sb; 2495606686eeSJosef Bacik struct rcu_string *name; 24965da54bc1SAnand Jain struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 249739379faaSNaohiro Aota u64 orig_super_total_bytes; 249839379faaSNaohiro Aota u64 orig_super_num_devices; 24992b82032cSYan Zheng int seeding_dev = 0; 2500788f20ebSChris Mason int ret = 0; 250144cab9baSNikolay Borisov bool locked = false; 2502788f20ebSChris Mason 25035da54bc1SAnand Jain if (sb_rdonly(sb) && !fs_devices->seeding) 2504f8c5d0b4SLiu Bo return -EROFS; 2505788f20ebSChris Mason 2506a5d16333SLi Zefan bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 25070b246afaSJeff Mahoney fs_info->bdev_holder); 25087f59203aSJosef Bacik if (IS_ERR(bdev)) 25097f59203aSJosef Bacik return PTR_ERR(bdev); 2510a2135011SChris Mason 25115da54bc1SAnand Jain if (fs_devices->seeding) { 25122b82032cSYan Zheng seeding_dev = 1; 25132b82032cSYan Zheng down_write(&sb->s_umount); 25142b82032cSYan Zheng mutex_lock(&uuid_mutex); 251544cab9baSNikolay Borisov locked = true; 25162b82032cSYan Zheng } 25172b82032cSYan Zheng 2518b9ba017fSNikolay Borisov sync_blockdev(bdev); 2519a2135011SChris Mason 2520f4cfa9bdSNikolay Borisov rcu_read_lock(); 2521f4cfa9bdSNikolay Borisov list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { 2522788f20ebSChris Mason if (device->bdev == bdev) { 2523788f20ebSChris Mason ret = -EEXIST; 2524f4cfa9bdSNikolay Borisov rcu_read_unlock(); 25252b82032cSYan Zheng goto error; 2526788f20ebSChris Mason } 2527788f20ebSChris Mason } 2528f4cfa9bdSNikolay Borisov rcu_read_unlock(); 2529788f20ebSChris Mason 25300b246afaSJeff Mahoney device = btrfs_alloc_device(fs_info, NULL, NULL); 253112bd2fc0SIlya Dryomov if (IS_ERR(device)) { 2532788f20ebSChris Mason /* we can safely leave the fs_devices entry around */ 253312bd2fc0SIlya Dryomov ret = PTR_ERR(device); 25342b82032cSYan Zheng goto error; 2535788f20ebSChris Mason } 2536788f20ebSChris Mason 253778f2c9e6SDavid Sterba name = rcu_string_strdup(device_path, GFP_KERNEL); 2538606686eeSJosef Bacik if (!name) { 25392b82032cSYan Zheng ret = -ENOMEM; 25405c4cf6c9SDavid Sterba goto error_free_device; 2541788f20ebSChris Mason } 2542606686eeSJosef Bacik rcu_assign_pointer(device->name, name); 25432b82032cSYan Zheng 2544a22285a6SYan, Zheng trans = btrfs_start_transaction(root, 0); 254598d5dc13STsutomu Itoh if (IS_ERR(trans)) { 254698d5dc13STsutomu Itoh ret = PTR_ERR(trans); 25475c4cf6c9SDavid Sterba goto error_free_device; 254898d5dc13STsutomu Itoh } 254998d5dc13STsutomu Itoh 2550d5e2003cSJosef Bacik q = bdev_get_queue(bdev); 2551ebbede42SAnand Jain set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 25522b82032cSYan Zheng device->generation = trans->transid; 25530b246afaSJeff Mahoney device->io_width = fs_info->sectorsize; 25540b246afaSJeff Mahoney device->io_align = fs_info->sectorsize; 25550b246afaSJeff Mahoney device->sector_size = fs_info->sectorsize; 25567dfb8be1SNikolay Borisov device->total_bytes = round_down(i_size_read(bdev->bd_inode), 25577dfb8be1SNikolay Borisov fs_info->sectorsize); 25582cc3c559SYan Zheng device->disk_total_bytes = device->total_bytes; 2559935e5cc9SMiao Xie device->commit_total_bytes = device->total_bytes; 2560fb456252SJeff Mahoney device->fs_info = fs_info; 2561788f20ebSChris Mason 
device->bdev = bdev; 2562e12c9621SAnand Jain set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2563401e29c1SAnand Jain clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2564fb01aa85SIlya Dryomov device->mode = FMODE_EXCL; 256527087f37SStefan Behrens device->dev_stats_valid = 1; 25669f6d2510SDavid Sterba set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2567325cd4baSZheng Yan 25682b82032cSYan Zheng if (seeding_dev) { 25691751e8a6SLinus Torvalds sb->s_flags &= ~SB_RDONLY; 25702ff7e61eSJeff Mahoney ret = btrfs_prepare_sprout(fs_info); 2571d31c32f6SAnand Jain if (ret) { 2572d31c32f6SAnand Jain btrfs_abort_transaction(trans, ret); 2573d31c32f6SAnand Jain goto error_trans; 2574d31c32f6SAnand Jain } 25752b82032cSYan Zheng } 25762b82032cSYan Zheng 25775da54bc1SAnand Jain device->fs_devices = fs_devices; 2578e5e9a520SChris Mason 25795da54bc1SAnand Jain mutex_lock(&fs_devices->device_list_mutex); 258034441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 25815da54bc1SAnand Jain list_add_rcu(&device->dev_list, &fs_devices->devices); 25825da54bc1SAnand Jain list_add(&device->dev_alloc_list, &fs_devices->alloc_list); 25835da54bc1SAnand Jain fs_devices->num_devices++; 25845da54bc1SAnand Jain fs_devices->open_devices++; 25855da54bc1SAnand Jain fs_devices->rw_devices++; 25865da54bc1SAnand Jain fs_devices->total_devices++; 25875da54bc1SAnand Jain fs_devices->total_rw_bytes += device->total_bytes; 25882b82032cSYan Zheng 2589a5ed45f8SNikolay Borisov atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 25902bf64758SJosef Bacik 2591e884f4f0SAnand Jain if (!blk_queue_nonrot(q)) 25927f0432d0SJohannes Thumshirn fs_devices->rotating = true; 2593c289811cSChris Mason 259439379faaSNaohiro Aota orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 25950b246afaSJeff Mahoney btrfs_set_super_total_bytes(fs_info->super_copy, 259639379faaSNaohiro Aota round_down(orig_super_total_bytes + device->total_bytes, 259739379faaSNaohiro Aota fs_info->sectorsize)); 2598788f20ebSChris Mason 259939379faaSNaohiro Aota orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); 260039379faaSNaohiro Aota btrfs_set_super_num_devices(fs_info->super_copy, 260139379faaSNaohiro Aota orig_super_num_devices + 1); 26020d39376aSAnand Jain 26032196d6e8SMiao Xie /* 26042196d6e8SMiao Xie * we've got more storage, clear any full flags on the space 26052196d6e8SMiao Xie * infos 26062196d6e8SMiao Xie */ 26070b246afaSJeff Mahoney btrfs_clear_space_info_full(fs_info); 26082196d6e8SMiao Xie 260934441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 2610ca10845aSJosef Bacik 2611ca10845aSJosef Bacik /* Add sysfs device entry */ 2612cd36da2eSAnand Jain btrfs_sysfs_add_device(device); 2613ca10845aSJosef Bacik 26145da54bc1SAnand Jain mutex_unlock(&fs_devices->device_list_mutex); 2615788f20ebSChris Mason 26162b82032cSYan Zheng if (seeding_dev) { 261734441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 26186f8e0fc7SDavid Sterba ret = init_first_rw_device(trans); 261934441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 2620005d6427SDavid Sterba if (ret) { 262166642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 2622d31c32f6SAnand Jain goto error_sysfs; 2623005d6427SDavid Sterba } 26242196d6e8SMiao Xie } 26252196d6e8SMiao Xie 26268e87e856SNikolay Borisov ret = btrfs_add_dev_item(trans, device); 26272196d6e8SMiao Xie if (ret) { 262866642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 2629d31c32f6SAnand Jain goto error_sysfs; 26302196d6e8SMiao Xie } 26312196d6e8SMiao Xie 
26322196d6e8SMiao Xie if (seeding_dev) { 26335c466629SDavid Sterba ret = btrfs_finish_sprout(trans); 2634005d6427SDavid Sterba if (ret) { 263566642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 2636d31c32f6SAnand Jain goto error_sysfs; 2637005d6427SDavid Sterba } 2638b2373f25SAnand Jain 26398e560081SNikolay Borisov /* 26408e560081SNikolay Borisov * fs_devices now represents the newly sprouted filesystem and 26418e560081SNikolay Borisov * its fsid has been changed by btrfs_prepare_sprout 26428e560081SNikolay Borisov */ 26438e560081SNikolay Borisov btrfs_sysfs_update_sprout_fsid(fs_devices); 2644005d6427SDavid Sterba } 26452b82032cSYan Zheng 26463a45bb20SJeff Mahoney ret = btrfs_commit_transaction(trans); 26472b82032cSYan Zheng 26482b82032cSYan Zheng if (seeding_dev) { 26492b82032cSYan Zheng mutex_unlock(&uuid_mutex); 26502b82032cSYan Zheng up_write(&sb->s_umount); 265144cab9baSNikolay Borisov locked = false; 26522b82032cSYan Zheng 265379787eaaSJeff Mahoney if (ret) /* transaction commit */ 265479787eaaSJeff Mahoney return ret; 265579787eaaSJeff Mahoney 26562ff7e61eSJeff Mahoney ret = btrfs_relocate_sys_chunks(fs_info); 265779787eaaSJeff Mahoney if (ret < 0) 26580b246afaSJeff Mahoney btrfs_handle_fs_error(fs_info, ret, 26595d163e0eSJeff Mahoney "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2660671415b7SMiao Xie trans = btrfs_attach_transaction(root); 2661671415b7SMiao Xie if (IS_ERR(trans)) { 2662671415b7SMiao Xie if (PTR_ERR(trans) == -ENOENT) 2663671415b7SMiao Xie return 0; 26647132a262SAnand Jain ret = PTR_ERR(trans); 26657132a262SAnand Jain trans = NULL; 26667132a262SAnand Jain goto error_sysfs; 2667671415b7SMiao Xie } 26683a45bb20SJeff Mahoney ret = btrfs_commit_transaction(trans); 26692b82032cSYan Zheng } 2670c9e9f97bSIlya Dryomov 26717f551d96SAnand Jain /* 26727f551d96SAnand Jain * Now that we have written a new super block to this device, check all 26737f551d96SAnand Jain * other fs_devices list if device_path alienates any other scanned 26747f551d96SAnand Jain * device. 26757f551d96SAnand Jain * We can ignore the return value as it typically returns -EINVAL and 26767f551d96SAnand Jain * only succeeds if the device was an alien. 
26777f551d96SAnand Jain */ 26787f551d96SAnand Jain btrfs_forget_devices(device_path); 26797f551d96SAnand Jain 26807f551d96SAnand Jain /* Update ctime/mtime for blkid or udev */ 26815a1972bdSQu Wenruo update_dev_time(device_path); 26827f551d96SAnand Jain 2683788f20ebSChris Mason return ret; 268479787eaaSJeff Mahoney 2685d31c32f6SAnand Jain error_sysfs: 268653f8a74cSAnand Jain btrfs_sysfs_remove_device(device); 268739379faaSNaohiro Aota mutex_lock(&fs_info->fs_devices->device_list_mutex); 268839379faaSNaohiro Aota mutex_lock(&fs_info->chunk_mutex); 268939379faaSNaohiro Aota list_del_rcu(&device->dev_list); 269039379faaSNaohiro Aota list_del(&device->dev_alloc_list); 269139379faaSNaohiro Aota fs_info->fs_devices->num_devices--; 269239379faaSNaohiro Aota fs_info->fs_devices->open_devices--; 269339379faaSNaohiro Aota fs_info->fs_devices->rw_devices--; 269439379faaSNaohiro Aota fs_info->fs_devices->total_devices--; 269539379faaSNaohiro Aota fs_info->fs_devices->total_rw_bytes -= device->total_bytes; 269639379faaSNaohiro Aota atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); 269739379faaSNaohiro Aota btrfs_set_super_total_bytes(fs_info->super_copy, 269839379faaSNaohiro Aota orig_super_total_bytes); 269939379faaSNaohiro Aota btrfs_set_super_num_devices(fs_info->super_copy, 270039379faaSNaohiro Aota orig_super_num_devices); 270139379faaSNaohiro Aota mutex_unlock(&fs_info->chunk_mutex); 270239379faaSNaohiro Aota mutex_unlock(&fs_info->fs_devices->device_list_mutex); 270379787eaaSJeff Mahoney error_trans: 27040af2c4bfSAnand Jain if (seeding_dev) 27051751e8a6SLinus Torvalds sb->s_flags |= SB_RDONLY; 27067132a262SAnand Jain if (trans) 27073a45bb20SJeff Mahoney btrfs_end_transaction(trans); 27085c4cf6c9SDavid Sterba error_free_device: 2709a425f9d4SDavid Sterba btrfs_free_device(device); 27102b82032cSYan Zheng error: 2711e525fd89STejun Heo blkdev_put(bdev, FMODE_EXCL); 271244cab9baSNikolay Borisov if (locked) { 27132b82032cSYan Zheng mutex_unlock(&uuid_mutex); 27142b82032cSYan Zheng up_write(&sb->s_umount); 27152b82032cSYan Zheng } 2716c9e9f97bSIlya Dryomov return ret; 2717788f20ebSChris Mason } 2718788f20ebSChris Mason 2719d397712bSChris Mason static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 27200b86a832SChris Mason struct btrfs_device *device) 27210b86a832SChris Mason { 27220b86a832SChris Mason int ret; 27230b86a832SChris Mason struct btrfs_path *path; 27240b246afaSJeff Mahoney struct btrfs_root *root = device->fs_info->chunk_root; 27250b86a832SChris Mason struct btrfs_dev_item *dev_item; 27260b86a832SChris Mason struct extent_buffer *leaf; 27270b86a832SChris Mason struct btrfs_key key; 27280b86a832SChris Mason 27290b86a832SChris Mason path = btrfs_alloc_path(); 27300b86a832SChris Mason if (!path) 27310b86a832SChris Mason return -ENOMEM; 27320b86a832SChris Mason 27330b86a832SChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 27340b86a832SChris Mason key.type = BTRFS_DEV_ITEM_KEY; 27350b86a832SChris Mason key.offset = device->devid; 27360b86a832SChris Mason 27370b86a832SChris Mason ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 27380b86a832SChris Mason if (ret < 0) 27390b86a832SChris Mason goto out; 27400b86a832SChris Mason 27410b86a832SChris Mason if (ret > 0) { 27420b86a832SChris Mason ret = -ENOENT; 27430b86a832SChris Mason goto out; 27440b86a832SChris Mason } 27450b86a832SChris Mason 27460b86a832SChris Mason leaf = path->nodes[0]; 27470b86a832SChris Mason dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 27480b86a832SChris Mason 
27490b86a832SChris Mason btrfs_set_device_id(leaf, dev_item, device->devid); 27500b86a832SChris Mason btrfs_set_device_type(leaf, dev_item, device->type); 27510b86a832SChris Mason btrfs_set_device_io_align(leaf, dev_item, device->io_align); 27520b86a832SChris Mason btrfs_set_device_io_width(leaf, dev_item, device->io_width); 27530b86a832SChris Mason btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 27547cc8e58dSMiao Xie btrfs_set_device_total_bytes(leaf, dev_item, 27557cc8e58dSMiao Xie btrfs_device_get_disk_total_bytes(device)); 27567cc8e58dSMiao Xie btrfs_set_device_bytes_used(leaf, dev_item, 27577cc8e58dSMiao Xie btrfs_device_get_bytes_used(device)); 27580b86a832SChris Mason btrfs_mark_buffer_dirty(leaf); 27590b86a832SChris Mason 27600b86a832SChris Mason out: 27610b86a832SChris Mason btrfs_free_path(path); 27620b86a832SChris Mason return ret; 27630b86a832SChris Mason } 27640b86a832SChris Mason 27652196d6e8SMiao Xie int btrfs_grow_device(struct btrfs_trans_handle *trans, 27668f18cf13SChris Mason struct btrfs_device *device, u64 new_size) 27678f18cf13SChris Mason { 27680b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = device->fs_info; 27690b246afaSJeff Mahoney struct btrfs_super_block *super_copy = fs_info->super_copy; 27702196d6e8SMiao Xie u64 old_total; 27712196d6e8SMiao Xie u64 diff; 27728f18cf13SChris Mason 2773ebbede42SAnand Jain if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 27742b82032cSYan Zheng return -EACCES; 27752196d6e8SMiao Xie 27767dfb8be1SNikolay Borisov new_size = round_down(new_size, fs_info->sectorsize); 27777dfb8be1SNikolay Borisov 277834441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 27792196d6e8SMiao Xie old_total = btrfs_super_total_bytes(super_copy); 27800e4324a4SNikolay Borisov diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 27812196d6e8SMiao Xie 278263a212abSStefan Behrens if (new_size <= device->total_bytes || 2783401e29c1SAnand Jain test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 278434441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 27852b82032cSYan Zheng return -EINVAL; 27862196d6e8SMiao Xie } 27872b82032cSYan Zheng 27887dfb8be1SNikolay Borisov btrfs_set_super_total_bytes(super_copy, 27897dfb8be1SNikolay Borisov round_down(old_total + diff, fs_info->sectorsize)); 27902b82032cSYan Zheng device->fs_devices->total_rw_bytes += diff; 27912b82032cSYan Zheng 27927cc8e58dSMiao Xie btrfs_device_set_total_bytes(device, new_size); 27937cc8e58dSMiao Xie btrfs_device_set_disk_total_bytes(device, new_size); 2794fb456252SJeff Mahoney btrfs_clear_space_info_full(device->fs_info); 2795bbbf7243SNikolay Borisov if (list_empty(&device->post_commit_list)) 2796bbbf7243SNikolay Borisov list_add_tail(&device->post_commit_list, 2797bbbf7243SNikolay Borisov &trans->transaction->dev_update_list); 279834441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 27994184ea7fSChris Mason 28008f18cf13SChris Mason return btrfs_update_device(trans, device); 28018f18cf13SChris Mason } 28028f18cf13SChris Mason 2803f4208794SNikolay Borisov static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 28048f18cf13SChris Mason { 2805f4208794SNikolay Borisov struct btrfs_fs_info *fs_info = trans->fs_info; 28065b4aacefSJeff Mahoney struct btrfs_root *root = fs_info->chunk_root; 28078f18cf13SChris Mason int ret; 28088f18cf13SChris Mason struct btrfs_path *path; 28098f18cf13SChris Mason struct btrfs_key key; 28108f18cf13SChris Mason 28118f18cf13SChris Mason path = btrfs_alloc_path(); 
28128f18cf13SChris Mason if (!path) 28138f18cf13SChris Mason return -ENOMEM; 28148f18cf13SChris Mason 2815408fbf19SNikolay Borisov key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 28168f18cf13SChris Mason key.offset = chunk_offset; 28178f18cf13SChris Mason key.type = BTRFS_CHUNK_ITEM_KEY; 28188f18cf13SChris Mason 28198f18cf13SChris Mason ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 282079787eaaSJeff Mahoney if (ret < 0) 282179787eaaSJeff Mahoney goto out; 282279787eaaSJeff Mahoney else if (ret > 0) { /* Logic error or corruption */ 28230b246afaSJeff Mahoney btrfs_handle_fs_error(fs_info, -ENOENT, 282479787eaaSJeff Mahoney "Failed lookup while freeing chunk."); 282579787eaaSJeff Mahoney ret = -ENOENT; 282679787eaaSJeff Mahoney goto out; 282779787eaaSJeff Mahoney } 28288f18cf13SChris Mason 28298f18cf13SChris Mason ret = btrfs_del_item(trans, root, path); 283079787eaaSJeff Mahoney if (ret < 0) 28310b246afaSJeff Mahoney btrfs_handle_fs_error(fs_info, ret, 283279787eaaSJeff Mahoney "Failed to delete chunk item."); 283379787eaaSJeff Mahoney out: 28348f18cf13SChris Mason btrfs_free_path(path); 283565a246c5STsutomu Itoh return ret; 28368f18cf13SChris Mason } 28378f18cf13SChris Mason 2838408fbf19SNikolay Borisov static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 28398f18cf13SChris Mason { 28400b246afaSJeff Mahoney struct btrfs_super_block *super_copy = fs_info->super_copy; 28418f18cf13SChris Mason struct btrfs_disk_key *disk_key; 28428f18cf13SChris Mason struct btrfs_chunk *chunk; 28438f18cf13SChris Mason u8 *ptr; 28448f18cf13SChris Mason int ret = 0; 28458f18cf13SChris Mason u32 num_stripes; 28468f18cf13SChris Mason u32 array_size; 28478f18cf13SChris Mason u32 len = 0; 28488f18cf13SChris Mason u32 cur; 28498f18cf13SChris Mason struct btrfs_key key; 28508f18cf13SChris Mason 285134441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 28528f18cf13SChris Mason array_size = btrfs_super_sys_array_size(super_copy); 28538f18cf13SChris Mason 28548f18cf13SChris Mason ptr = super_copy->sys_chunk_array; 28558f18cf13SChris Mason cur = 0; 28568f18cf13SChris Mason 28578f18cf13SChris Mason while (cur < array_size) { 28588f18cf13SChris Mason disk_key = (struct btrfs_disk_key *)ptr; 28598f18cf13SChris Mason btrfs_disk_key_to_cpu(&key, disk_key); 28608f18cf13SChris Mason 28618f18cf13SChris Mason len = sizeof(*disk_key); 28628f18cf13SChris Mason 28638f18cf13SChris Mason if (key.type == BTRFS_CHUNK_ITEM_KEY) { 28648f18cf13SChris Mason chunk = (struct btrfs_chunk *)(ptr + len); 28658f18cf13SChris Mason num_stripes = btrfs_stack_chunk_num_stripes(chunk); 28668f18cf13SChris Mason len += btrfs_chunk_item_size(num_stripes); 28678f18cf13SChris Mason } else { 28688f18cf13SChris Mason ret = -EIO; 28698f18cf13SChris Mason break; 28708f18cf13SChris Mason } 2871408fbf19SNikolay Borisov if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 28728f18cf13SChris Mason key.offset == chunk_offset) { 28738f18cf13SChris Mason memmove(ptr, ptr + len, array_size - (cur + len)); 28748f18cf13SChris Mason array_size -= len; 28758f18cf13SChris Mason btrfs_set_super_sys_array_size(super_copy, array_size); 28768f18cf13SChris Mason } else { 28778f18cf13SChris Mason ptr += len; 28788f18cf13SChris Mason cur += len; 28798f18cf13SChris Mason } 28808f18cf13SChris Mason } 288134441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 28828f18cf13SChris Mason return ret; 28838f18cf13SChris Mason } 28848f18cf13SChris Mason 288560ca842eSOmar Sandoval /* 288660ca842eSOmar Sandoval * btrfs_get_chunk_map() - Find the 
mapping containing the given logical extent. 288760ca842eSOmar Sandoval * @logical: Logical block offset in bytes. 288860ca842eSOmar Sandoval * @length: Length of extent in bytes. 288960ca842eSOmar Sandoval * 289060ca842eSOmar Sandoval * Return: Chunk mapping or ERR_PTR. 289160ca842eSOmar Sandoval */ 289260ca842eSOmar Sandoval struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, 2893592d92eeSLiu Bo u64 logical, u64 length) 2894592d92eeSLiu Bo { 2895592d92eeSLiu Bo struct extent_map_tree *em_tree; 2896592d92eeSLiu Bo struct extent_map *em; 2897592d92eeSLiu Bo 2898c8bf1b67SDavid Sterba em_tree = &fs_info->mapping_tree; 2899592d92eeSLiu Bo read_lock(&em_tree->lock); 2900592d92eeSLiu Bo em = lookup_extent_mapping(em_tree, logical, length); 2901592d92eeSLiu Bo read_unlock(&em_tree->lock); 2902592d92eeSLiu Bo 2903592d92eeSLiu Bo if (!em) { 2904592d92eeSLiu Bo btrfs_crit(fs_info, "unable to find logical %llu length %llu", 2905592d92eeSLiu Bo logical, length); 2906592d92eeSLiu Bo return ERR_PTR(-EINVAL); 2907592d92eeSLiu Bo } 2908592d92eeSLiu Bo 2909592d92eeSLiu Bo if (em->start > logical || em->start + em->len < logical) { 2910592d92eeSLiu Bo btrfs_crit(fs_info, 2911592d92eeSLiu Bo "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 2912592d92eeSLiu Bo logical, length, em->start, em->start + em->len); 2913592d92eeSLiu Bo free_extent_map(em); 2914592d92eeSLiu Bo return ERR_PTR(-EINVAL); 2915592d92eeSLiu Bo } 2916592d92eeSLiu Bo 2917592d92eeSLiu Bo /* callers are responsible for dropping em's ref. */ 2918592d92eeSLiu Bo return em; 2919592d92eeSLiu Bo } 2920592d92eeSLiu Bo 292197aff912SNikolay Borisov int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 292247ab2a6cSJosef Bacik { 292397aff912SNikolay Borisov struct btrfs_fs_info *fs_info = trans->fs_info; 292447ab2a6cSJosef Bacik struct extent_map *em; 292547ab2a6cSJosef Bacik struct map_lookup *map; 292647ab2a6cSJosef Bacik u64 dev_extent_len = 0; 292747ab2a6cSJosef Bacik int i, ret = 0; 29280b246afaSJeff Mahoney struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 292947ab2a6cSJosef Bacik 293060ca842eSOmar Sandoval em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 2931592d92eeSLiu Bo if (IS_ERR(em)) { 293247ab2a6cSJosef Bacik /* 293347ab2a6cSJosef Bacik * This is a logic error, but we don't want to just rely on the 2934bb7ab3b9SAdam Buchbinder * user having built with ASSERT enabled, so if ASSERT doesn't 293547ab2a6cSJosef Bacik * do anything we still error out. 293647ab2a6cSJosef Bacik */ 293747ab2a6cSJosef Bacik ASSERT(0); 2938592d92eeSLiu Bo return PTR_ERR(em); 293947ab2a6cSJosef Bacik } 294095617d69SJeff Mahoney map = em->map_lookup; 294134441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 2942451a2c13SNikolay Borisov check_system_chunk(trans, map->type); 294334441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 294447ab2a6cSJosef Bacik 294557ba4cb8SFilipe Manana /* 294657ba4cb8SFilipe Manana * Take the device list mutex to prevent races with the final phase of 294757ba4cb8SFilipe Manana * a device replace operation that replaces the device object associated 294857ba4cb8SFilipe Manana * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). 
294957ba4cb8SFilipe Manana */ 295057ba4cb8SFilipe Manana mutex_lock(&fs_devices->device_list_mutex); 295147ab2a6cSJosef Bacik for (i = 0; i < map->num_stripes; i++) { 295247ab2a6cSJosef Bacik struct btrfs_device *device = map->stripes[i].dev; 295347ab2a6cSJosef Bacik ret = btrfs_free_dev_extent(trans, device, 295447ab2a6cSJosef Bacik map->stripes[i].physical, 295547ab2a6cSJosef Bacik &dev_extent_len); 295647ab2a6cSJosef Bacik if (ret) { 295757ba4cb8SFilipe Manana mutex_unlock(&fs_devices->device_list_mutex); 295866642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 295947ab2a6cSJosef Bacik goto out; 296047ab2a6cSJosef Bacik } 296147ab2a6cSJosef Bacik 296247ab2a6cSJosef Bacik if (device->bytes_used > 0) { 296334441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 296447ab2a6cSJosef Bacik btrfs_device_set_bytes_used(device, 296547ab2a6cSJosef Bacik device->bytes_used - dev_extent_len); 2966a5ed45f8SNikolay Borisov atomic64_add(dev_extent_len, &fs_info->free_chunk_space); 29670b246afaSJeff Mahoney btrfs_clear_space_info_full(fs_info); 296834441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 296947ab2a6cSJosef Bacik } 297047ab2a6cSJosef Bacik 297164bc6c2aSNikolay Borisov ret = btrfs_update_device(trans, device); 297247ab2a6cSJosef Bacik if (ret) { 297357ba4cb8SFilipe Manana mutex_unlock(&fs_devices->device_list_mutex); 297466642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 297547ab2a6cSJosef Bacik goto out; 297647ab2a6cSJosef Bacik } 297747ab2a6cSJosef Bacik } 297857ba4cb8SFilipe Manana mutex_unlock(&fs_devices->device_list_mutex); 297957ba4cb8SFilipe Manana 2980f4208794SNikolay Borisov ret = btrfs_free_chunk(trans, chunk_offset); 298147ab2a6cSJosef Bacik if (ret) { 298266642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 298347ab2a6cSJosef Bacik goto out; 298447ab2a6cSJosef Bacik } 298547ab2a6cSJosef Bacik 29866bccf3abSJeff Mahoney trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 298747ab2a6cSJosef Bacik 298847ab2a6cSJosef Bacik if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2989408fbf19SNikolay Borisov ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 299047ab2a6cSJosef Bacik if (ret) { 299166642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 299247ab2a6cSJosef Bacik goto out; 299347ab2a6cSJosef Bacik } 299447ab2a6cSJosef Bacik } 299547ab2a6cSJosef Bacik 29965a98ec01SNikolay Borisov ret = btrfs_remove_block_group(trans, chunk_offset, em); 299747ab2a6cSJosef Bacik if (ret) { 299866642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 299947ab2a6cSJosef Bacik goto out; 300047ab2a6cSJosef Bacik } 300147ab2a6cSJosef Bacik 300247ab2a6cSJosef Bacik out: 300347ab2a6cSJosef Bacik /* once for us */ 300447ab2a6cSJosef Bacik free_extent_map(em); 30058f18cf13SChris Mason return ret; 30068f18cf13SChris Mason } 30078f18cf13SChris Mason 30085b4aacefSJeff Mahoney static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 30098f18cf13SChris Mason { 30105b4aacefSJeff Mahoney struct btrfs_root *root = fs_info->chunk_root; 301119c4d2f9SChris Mason struct btrfs_trans_handle *trans; 3012b0643e59SDennis Zhou struct btrfs_block_group *block_group; 30138f18cf13SChris Mason int ret; 30148f18cf13SChris Mason 301567c5e7d4SFilipe Manana /* 301667c5e7d4SFilipe Manana * Prevent races with automatic removal of unused block groups. 
301767c5e7d4SFilipe Manana * After we relocate and before we remove the chunk with offset 301867c5e7d4SFilipe Manana * chunk_offset, automatic removal of the block group can kick in, 301967c5e7d4SFilipe Manana * resulting in a failure when calling btrfs_remove_chunk() below. 302067c5e7d4SFilipe Manana * 302167c5e7d4SFilipe Manana * Make sure to acquire this mutex before doing a tree search (dev 302267c5e7d4SFilipe Manana * or chunk trees) to find chunks. Otherwise the cleaner kthread might 302367c5e7d4SFilipe Manana * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 302467c5e7d4SFilipe Manana * we release the path used to search the chunk/dev tree and before 302567c5e7d4SFilipe Manana * the current task acquires this mutex and calls us. 302667c5e7d4SFilipe Manana */ 3027a32bf9a3SDavid Sterba lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); 302867c5e7d4SFilipe Manana 30298f18cf13SChris Mason /* step one, relocate all the extents inside this chunk */ 30302ff7e61eSJeff Mahoney btrfs_scrub_pause(fs_info); 30310b246afaSJeff Mahoney ret = btrfs_relocate_block_group(fs_info, chunk_offset); 30322ff7e61eSJeff Mahoney btrfs_scrub_continue(fs_info); 3033a22285a6SYan, Zheng if (ret) 3034a22285a6SYan, Zheng return ret; 30358f18cf13SChris Mason 3036b0643e59SDennis Zhou block_group = btrfs_lookup_block_group(fs_info, chunk_offset); 3037b0643e59SDennis Zhou if (!block_group) 3038b0643e59SDennis Zhou return -ENOENT; 3039b0643e59SDennis Zhou btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); 3040b0643e59SDennis Zhou btrfs_put_block_group(block_group); 3041b0643e59SDennis Zhou 304219c4d2f9SChris Mason trans = btrfs_start_trans_remove_block_group(root->fs_info, 304319c4d2f9SChris Mason chunk_offset); 304419c4d2f9SChris Mason if (IS_ERR(trans)) { 304519c4d2f9SChris Mason ret = PTR_ERR(trans); 304619c4d2f9SChris Mason btrfs_handle_fs_error(root->fs_info, ret, NULL); 304719c4d2f9SChris Mason return ret; 304819c4d2f9SChris Mason } 30495d8eb6feSNaohiro Aota 305019c4d2f9SChris Mason /* 305119c4d2f9SChris Mason * step two, delete the device extents and the 305219c4d2f9SChris Mason * chunk tree entries 305319c4d2f9SChris Mason */ 305497aff912SNikolay Borisov ret = btrfs_remove_chunk(trans, chunk_offset); 30553a45bb20SJeff Mahoney btrfs_end_transaction(trans); 305619c4d2f9SChris Mason return ret; 30578f18cf13SChris Mason } 30588f18cf13SChris Mason 30592ff7e61eSJeff Mahoney static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 30602b82032cSYan Zheng { 30610b246afaSJeff Mahoney struct btrfs_root *chunk_root = fs_info->chunk_root; 30622b82032cSYan Zheng struct btrfs_path *path; 30632b82032cSYan Zheng struct extent_buffer *leaf; 30642b82032cSYan Zheng struct btrfs_chunk *chunk; 30652b82032cSYan Zheng struct btrfs_key key; 30662b82032cSYan Zheng struct btrfs_key found_key; 30672b82032cSYan Zheng u64 chunk_type; 3068ba1bf481SJosef Bacik bool retried = false; 3069ba1bf481SJosef Bacik int failed = 0; 30702b82032cSYan Zheng int ret; 30712b82032cSYan Zheng 30722b82032cSYan Zheng path = btrfs_alloc_path(); 30732b82032cSYan Zheng if (!path) 30742b82032cSYan Zheng return -ENOMEM; 30752b82032cSYan Zheng 3076ba1bf481SJosef Bacik again: 30772b82032cSYan Zheng key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 30782b82032cSYan Zheng key.offset = (u64)-1; 30792b82032cSYan Zheng key.type = BTRFS_CHUNK_ITEM_KEY; 30802b82032cSYan Zheng 30812b82032cSYan Zheng while (1) { 30820b246afaSJeff Mahoney mutex_lock(&fs_info->delete_unused_bgs_mutex); 30832b82032cSYan Zheng ret = 
btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 308467c5e7d4SFilipe Manana if (ret < 0) { 30850b246afaSJeff Mahoney mutex_unlock(&fs_info->delete_unused_bgs_mutex); 30862b82032cSYan Zheng goto error; 308767c5e7d4SFilipe Manana } 308879787eaaSJeff Mahoney BUG_ON(ret == 0); /* Corruption */ 30892b82032cSYan Zheng 30902b82032cSYan Zheng ret = btrfs_previous_item(chunk_root, path, key.objectid, 30912b82032cSYan Zheng key.type); 309267c5e7d4SFilipe Manana if (ret) 30930b246afaSJeff Mahoney mutex_unlock(&fs_info->delete_unused_bgs_mutex); 30942b82032cSYan Zheng if (ret < 0) 30952b82032cSYan Zheng goto error; 30962b82032cSYan Zheng if (ret > 0) 30972b82032cSYan Zheng break; 30982b82032cSYan Zheng 30992b82032cSYan Zheng leaf = path->nodes[0]; 31002b82032cSYan Zheng btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 31012b82032cSYan Zheng 31022b82032cSYan Zheng chunk = btrfs_item_ptr(leaf, path->slots[0], 31032b82032cSYan Zheng struct btrfs_chunk); 31042b82032cSYan Zheng chunk_type = btrfs_chunk_type(leaf, chunk); 3105b3b4aa74SDavid Sterba btrfs_release_path(path); 31062b82032cSYan Zheng 31072b82032cSYan Zheng if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 31080b246afaSJeff Mahoney ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3109ba1bf481SJosef Bacik if (ret == -ENOSPC) 3110ba1bf481SJosef Bacik failed++; 311114586651SHIMANGI SARAOGI else 311214586651SHIMANGI SARAOGI BUG_ON(ret); 31132b82032cSYan Zheng } 31140b246afaSJeff Mahoney mutex_unlock(&fs_info->delete_unused_bgs_mutex); 31152b82032cSYan Zheng 31162b82032cSYan Zheng if (found_key.offset == 0) 31172b82032cSYan Zheng break; 31182b82032cSYan Zheng key.offset = found_key.offset - 1; 31192b82032cSYan Zheng } 31202b82032cSYan Zheng ret = 0; 3121ba1bf481SJosef Bacik if (failed && !retried) { 3122ba1bf481SJosef Bacik failed = 0; 3123ba1bf481SJosef Bacik retried = true; 3124ba1bf481SJosef Bacik goto again; 3125fae7f21cSDulshani Gunawardhana } else if (WARN_ON(failed && retried)) { 3126ba1bf481SJosef Bacik ret = -ENOSPC; 3127ba1bf481SJosef Bacik } 31282b82032cSYan Zheng error: 31292b82032cSYan Zheng btrfs_free_path(path); 31302b82032cSYan Zheng return ret; 31312b82032cSYan Zheng } 31322b82032cSYan Zheng 3133a6f93c71SLiu Bo /* 3134a6f93c71SLiu Bo * return 1 : allocate a data chunk successfully, 3135a6f93c71SLiu Bo * return <0: errors during allocating a data chunk, 3136a6f93c71SLiu Bo * return 0 : no need to allocate a data chunk. 
3137a6f93c71SLiu Bo */ 3138a6f93c71SLiu Bo static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 3139a6f93c71SLiu Bo u64 chunk_offset) 3140a6f93c71SLiu Bo { 314132da5386SDavid Sterba struct btrfs_block_group *cache; 3142a6f93c71SLiu Bo u64 bytes_used; 3143a6f93c71SLiu Bo u64 chunk_type; 3144a6f93c71SLiu Bo 3145a6f93c71SLiu Bo cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3146a6f93c71SLiu Bo ASSERT(cache); 3147a6f93c71SLiu Bo chunk_type = cache->flags; 3148a6f93c71SLiu Bo btrfs_put_block_group(cache); 3149a6f93c71SLiu Bo 31505ae21692SJohannes Thumshirn if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) 31515ae21692SJohannes Thumshirn return 0; 31525ae21692SJohannes Thumshirn 3153a6f93c71SLiu Bo spin_lock(&fs_info->data_sinfo->lock); 3154a6f93c71SLiu Bo bytes_used = fs_info->data_sinfo->bytes_used; 3155a6f93c71SLiu Bo spin_unlock(&fs_info->data_sinfo->lock); 3156a6f93c71SLiu Bo 3157a6f93c71SLiu Bo if (!bytes_used) { 3158a6f93c71SLiu Bo struct btrfs_trans_handle *trans; 3159a6f93c71SLiu Bo int ret; 3160a6f93c71SLiu Bo 3161a6f93c71SLiu Bo trans = btrfs_join_transaction(fs_info->tree_root); 3162a6f93c71SLiu Bo if (IS_ERR(trans)) 3163a6f93c71SLiu Bo return PTR_ERR(trans); 3164a6f93c71SLiu Bo 31655ae21692SJohannes Thumshirn ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); 3166a6f93c71SLiu Bo btrfs_end_transaction(trans); 3167a6f93c71SLiu Bo if (ret < 0) 3168a6f93c71SLiu Bo return ret; 3169a6f93c71SLiu Bo return 1; 3170a6f93c71SLiu Bo } 31715ae21692SJohannes Thumshirn 3172a6f93c71SLiu Bo return 0; 3173a6f93c71SLiu Bo } 3174a6f93c71SLiu Bo 31756bccf3abSJeff Mahoney static int insert_balance_item(struct btrfs_fs_info *fs_info, 31760940ebf6SIlya Dryomov struct btrfs_balance_control *bctl) 31770940ebf6SIlya Dryomov { 31786bccf3abSJeff Mahoney struct btrfs_root *root = fs_info->tree_root; 31790940ebf6SIlya Dryomov struct btrfs_trans_handle *trans; 31800940ebf6SIlya Dryomov struct btrfs_balance_item *item; 31810940ebf6SIlya Dryomov struct btrfs_disk_balance_args disk_bargs; 31820940ebf6SIlya Dryomov struct btrfs_path *path; 31830940ebf6SIlya Dryomov struct extent_buffer *leaf; 31840940ebf6SIlya Dryomov struct btrfs_key key; 31850940ebf6SIlya Dryomov int ret, err; 31860940ebf6SIlya Dryomov 31870940ebf6SIlya Dryomov path = btrfs_alloc_path(); 31880940ebf6SIlya Dryomov if (!path) 31890940ebf6SIlya Dryomov return -ENOMEM; 31900940ebf6SIlya Dryomov 31910940ebf6SIlya Dryomov trans = btrfs_start_transaction(root, 0); 31920940ebf6SIlya Dryomov if (IS_ERR(trans)) { 31930940ebf6SIlya Dryomov btrfs_free_path(path); 31940940ebf6SIlya Dryomov return PTR_ERR(trans); 31950940ebf6SIlya Dryomov } 31960940ebf6SIlya Dryomov 31970940ebf6SIlya Dryomov key.objectid = BTRFS_BALANCE_OBJECTID; 3198c479cb4fSDavid Sterba key.type = BTRFS_TEMPORARY_ITEM_KEY; 31990940ebf6SIlya Dryomov key.offset = 0; 32000940ebf6SIlya Dryomov 32010940ebf6SIlya Dryomov ret = btrfs_insert_empty_item(trans, root, path, &key, 32020940ebf6SIlya Dryomov sizeof(*item)); 32030940ebf6SIlya Dryomov if (ret) 32040940ebf6SIlya Dryomov goto out; 32050940ebf6SIlya Dryomov 32060940ebf6SIlya Dryomov leaf = path->nodes[0]; 32070940ebf6SIlya Dryomov item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 32080940ebf6SIlya Dryomov 3209b159fa28SDavid Sterba memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 32100940ebf6SIlya Dryomov 32110940ebf6SIlya Dryomov btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 32120940ebf6SIlya Dryomov btrfs_set_balance_data(leaf, item, &disk_bargs); 32130940ebf6SIlya 
Dryomov btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 32140940ebf6SIlya Dryomov btrfs_set_balance_meta(leaf, item, &disk_bargs); 32150940ebf6SIlya Dryomov btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 32160940ebf6SIlya Dryomov btrfs_set_balance_sys(leaf, item, &disk_bargs); 32170940ebf6SIlya Dryomov 32180940ebf6SIlya Dryomov btrfs_set_balance_flags(leaf, item, bctl->flags); 32190940ebf6SIlya Dryomov 32200940ebf6SIlya Dryomov btrfs_mark_buffer_dirty(leaf); 32210940ebf6SIlya Dryomov out: 32220940ebf6SIlya Dryomov btrfs_free_path(path); 32233a45bb20SJeff Mahoney err = btrfs_commit_transaction(trans); 32240940ebf6SIlya Dryomov if (err && !ret) 32250940ebf6SIlya Dryomov ret = err; 32260940ebf6SIlya Dryomov return ret; 32270940ebf6SIlya Dryomov } 32280940ebf6SIlya Dryomov 32296bccf3abSJeff Mahoney static int del_balance_item(struct btrfs_fs_info *fs_info) 32300940ebf6SIlya Dryomov { 32316bccf3abSJeff Mahoney struct btrfs_root *root = fs_info->tree_root; 32320940ebf6SIlya Dryomov struct btrfs_trans_handle *trans; 32330940ebf6SIlya Dryomov struct btrfs_path *path; 32340940ebf6SIlya Dryomov struct btrfs_key key; 32350940ebf6SIlya Dryomov int ret, err; 32360940ebf6SIlya Dryomov 32370940ebf6SIlya Dryomov path = btrfs_alloc_path(); 32380940ebf6SIlya Dryomov if (!path) 32390940ebf6SIlya Dryomov return -ENOMEM; 32400940ebf6SIlya Dryomov 32413502a8c0SDavid Sterba trans = btrfs_start_transaction_fallback_global_rsv(root, 0); 32420940ebf6SIlya Dryomov if (IS_ERR(trans)) { 32430940ebf6SIlya Dryomov btrfs_free_path(path); 32440940ebf6SIlya Dryomov return PTR_ERR(trans); 32450940ebf6SIlya Dryomov } 32460940ebf6SIlya Dryomov 32470940ebf6SIlya Dryomov key.objectid = BTRFS_BALANCE_OBJECTID; 3248c479cb4fSDavid Sterba key.type = BTRFS_TEMPORARY_ITEM_KEY; 32490940ebf6SIlya Dryomov key.offset = 0; 32500940ebf6SIlya Dryomov 32510940ebf6SIlya Dryomov ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 32520940ebf6SIlya Dryomov if (ret < 0) 32530940ebf6SIlya Dryomov goto out; 32540940ebf6SIlya Dryomov if (ret > 0) { 32550940ebf6SIlya Dryomov ret = -ENOENT; 32560940ebf6SIlya Dryomov goto out; 32570940ebf6SIlya Dryomov } 32580940ebf6SIlya Dryomov 32590940ebf6SIlya Dryomov ret = btrfs_del_item(trans, root, path); 32600940ebf6SIlya Dryomov out: 32610940ebf6SIlya Dryomov btrfs_free_path(path); 32623a45bb20SJeff Mahoney err = btrfs_commit_transaction(trans); 32630940ebf6SIlya Dryomov if (err && !ret) 32640940ebf6SIlya Dryomov ret = err; 32650940ebf6SIlya Dryomov return ret; 32660940ebf6SIlya Dryomov } 32670940ebf6SIlya Dryomov 3268c9e9f97bSIlya Dryomov /* 326959641015SIlya Dryomov * This is a heuristic used to reduce the number of chunks balanced on 327059641015SIlya Dryomov * resume after balance was interrupted. 327159641015SIlya Dryomov */ 327259641015SIlya Dryomov static void update_balance_args(struct btrfs_balance_control *bctl) 327359641015SIlya Dryomov { 327459641015SIlya Dryomov /* 327559641015SIlya Dryomov * Turn on soft mode for chunk types that were being converted. 
327659641015SIlya Dryomov */ 327759641015SIlya Dryomov if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 327859641015SIlya Dryomov bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 327959641015SIlya Dryomov if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 328059641015SIlya Dryomov bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 328159641015SIlya Dryomov if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 328259641015SIlya Dryomov bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 328359641015SIlya Dryomov 328459641015SIlya Dryomov /* 328559641015SIlya Dryomov * Turn on usage filter if is not already used. The idea is 328659641015SIlya Dryomov * that chunks that we have already balanced should be 328759641015SIlya Dryomov * reasonably full. Don't do it for chunks that are being 328859641015SIlya Dryomov * converted - that will keep us from relocating unconverted 328959641015SIlya Dryomov * (albeit full) chunks. 329059641015SIlya Dryomov */ 329159641015SIlya Dryomov if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 3292bc309467SDavid Sterba !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 329359641015SIlya Dryomov !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 329459641015SIlya Dryomov bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 329559641015SIlya Dryomov bctl->data.usage = 90; 329659641015SIlya Dryomov } 329759641015SIlya Dryomov if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 3298bc309467SDavid Sterba !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 329959641015SIlya Dryomov !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 330059641015SIlya Dryomov bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 330159641015SIlya Dryomov bctl->sys.usage = 90; 330259641015SIlya Dryomov } 330359641015SIlya Dryomov if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 3304bc309467SDavid Sterba !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 330559641015SIlya Dryomov !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 330659641015SIlya Dryomov bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 330759641015SIlya Dryomov bctl->meta.usage = 90; 330859641015SIlya Dryomov } 330959641015SIlya Dryomov } 331059641015SIlya Dryomov 331159641015SIlya Dryomov /* 3312149196a2SDavid Sterba * Clear the balance status in fs_info and delete the balance item from disk. 3313149196a2SDavid Sterba */ 3314149196a2SDavid Sterba static void reset_balance_state(struct btrfs_fs_info *fs_info) 3315c9e9f97bSIlya Dryomov { 3316c9e9f97bSIlya Dryomov struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3317149196a2SDavid Sterba int ret; 3318c9e9f97bSIlya Dryomov 3319c9e9f97bSIlya Dryomov BUG_ON(!fs_info->balance_ctl); 3320c9e9f97bSIlya Dryomov 3321c9e9f97bSIlya Dryomov spin_lock(&fs_info->balance_lock); 3322c9e9f97bSIlya Dryomov fs_info->balance_ctl = NULL; 3323c9e9f97bSIlya Dryomov spin_unlock(&fs_info->balance_lock); 3324c9e9f97bSIlya Dryomov 3325c9e9f97bSIlya Dryomov kfree(bctl); 3326149196a2SDavid Sterba ret = del_balance_item(fs_info); 3327149196a2SDavid Sterba if (ret) 3328149196a2SDavid Sterba btrfs_handle_fs_error(fs_info, ret, NULL); 3329c9e9f97bSIlya Dryomov } 3330c9e9f97bSIlya Dryomov 3331ed25e9b2SIlya Dryomov /* 3332ed25e9b2SIlya Dryomov * Balance filters. Return 1 if chunk should be filtered out 3333ed25e9b2SIlya Dryomov * (should not be balanced). 
3334ed25e9b2SIlya Dryomov */ 3335899c81eaSIlya Dryomov static int chunk_profiles_filter(u64 chunk_type, 3336ed25e9b2SIlya Dryomov struct btrfs_balance_args *bargs) 3337ed25e9b2SIlya Dryomov { 3338899c81eaSIlya Dryomov chunk_type = chunk_to_extended(chunk_type) & 3339899c81eaSIlya Dryomov BTRFS_EXTENDED_PROFILE_MASK; 3340ed25e9b2SIlya Dryomov 3341899c81eaSIlya Dryomov if (bargs->profiles & chunk_type) 3342ed25e9b2SIlya Dryomov return 0; 3343ed25e9b2SIlya Dryomov 3344ed25e9b2SIlya Dryomov return 1; 3345ed25e9b2SIlya Dryomov } 3346ed25e9b2SIlya Dryomov 3347dba72cb3SHolger Hoffstätte static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 33485ce5b3c0SIlya Dryomov struct btrfs_balance_args *bargs) 33495ce5b3c0SIlya Dryomov { 335032da5386SDavid Sterba struct btrfs_block_group *cache; 3351bc309467SDavid Sterba u64 chunk_used; 3352bc309467SDavid Sterba u64 user_thresh_min; 3353bc309467SDavid Sterba u64 user_thresh_max; 3354bc309467SDavid Sterba int ret = 1; 3355bc309467SDavid Sterba 3356bc309467SDavid Sterba cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3357bf38be65SDavid Sterba chunk_used = cache->used; 3358bc309467SDavid Sterba 3359bc309467SDavid Sterba if (bargs->usage_min == 0) 3360bc309467SDavid Sterba user_thresh_min = 0; 3361bc309467SDavid Sterba else 3362b3470b5dSDavid Sterba user_thresh_min = div_factor_fine(cache->length, 3363bc309467SDavid Sterba bargs->usage_min); 3364bc309467SDavid Sterba 3365bc309467SDavid Sterba if (bargs->usage_max == 0) 3366bc309467SDavid Sterba user_thresh_max = 1; 3367bc309467SDavid Sterba else if (bargs->usage_max > 100) 3368b3470b5dSDavid Sterba user_thresh_max = cache->length; 3369bc309467SDavid Sterba else 3370b3470b5dSDavid Sterba user_thresh_max = div_factor_fine(cache->length, 3371bc309467SDavid Sterba bargs->usage_max); 3372bc309467SDavid Sterba 3373bc309467SDavid Sterba if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3374bc309467SDavid Sterba ret = 0; 3375bc309467SDavid Sterba 3376bc309467SDavid Sterba btrfs_put_block_group(cache); 3377bc309467SDavid Sterba return ret; 3378bc309467SDavid Sterba } 3379bc309467SDavid Sterba 3380dba72cb3SHolger Hoffstätte static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3381bc309467SDavid Sterba u64 chunk_offset, struct btrfs_balance_args *bargs) 3382bc309467SDavid Sterba { 338332da5386SDavid Sterba struct btrfs_block_group *cache; 33845ce5b3c0SIlya Dryomov u64 chunk_used, user_thresh; 33855ce5b3c0SIlya Dryomov int ret = 1; 33865ce5b3c0SIlya Dryomov 33875ce5b3c0SIlya Dryomov cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3388bf38be65SDavid Sterba chunk_used = cache->used; 33895ce5b3c0SIlya Dryomov 3390bc309467SDavid Sterba if (bargs->usage_min == 0) 33913e39cea6SIlya Dryomov user_thresh = 1; 3392a105bb88SIlya Dryomov else if (bargs->usage > 100) 3393b3470b5dSDavid Sterba user_thresh = cache->length; 3394a105bb88SIlya Dryomov else 3395b3470b5dSDavid Sterba user_thresh = div_factor_fine(cache->length, bargs->usage); 3396a105bb88SIlya Dryomov 33975ce5b3c0SIlya Dryomov if (chunk_used < user_thresh) 33985ce5b3c0SIlya Dryomov ret = 0; 33995ce5b3c0SIlya Dryomov 34005ce5b3c0SIlya Dryomov btrfs_put_block_group(cache); 34015ce5b3c0SIlya Dryomov return ret; 34025ce5b3c0SIlya Dryomov } 34035ce5b3c0SIlya Dryomov 3404409d404bSIlya Dryomov static int chunk_devid_filter(struct extent_buffer *leaf, 3405409d404bSIlya Dryomov struct btrfs_chunk *chunk, 3406409d404bSIlya Dryomov struct btrfs_balance_args *bargs) 3407409d404bSIlya Dryomov { 3408409d404bSIlya 
Dryomov struct btrfs_stripe *stripe; 3409409d404bSIlya Dryomov int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3410409d404bSIlya Dryomov int i; 3411409d404bSIlya Dryomov 3412409d404bSIlya Dryomov for (i = 0; i < num_stripes; i++) { 3413409d404bSIlya Dryomov stripe = btrfs_stripe_nr(chunk, i); 3414409d404bSIlya Dryomov if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3415409d404bSIlya Dryomov return 0; 3416409d404bSIlya Dryomov } 3417409d404bSIlya Dryomov 3418409d404bSIlya Dryomov return 1; 3419409d404bSIlya Dryomov } 3420409d404bSIlya Dryomov 3421946c9256SDavid Sterba static u64 calc_data_stripes(u64 type, int num_stripes) 3422946c9256SDavid Sterba { 3423946c9256SDavid Sterba const int index = btrfs_bg_flags_to_raid_index(type); 3424946c9256SDavid Sterba const int ncopies = btrfs_raid_array[index].ncopies; 3425946c9256SDavid Sterba const int nparity = btrfs_raid_array[index].nparity; 3426946c9256SDavid Sterba 3427946c9256SDavid Sterba if (nparity) 3428946c9256SDavid Sterba return num_stripes - nparity; 3429946c9256SDavid Sterba else 3430946c9256SDavid Sterba return num_stripes / ncopies; 3431946c9256SDavid Sterba } 3432946c9256SDavid Sterba 343394e60d5aSIlya Dryomov /* [pstart, pend) */ 343494e60d5aSIlya Dryomov static int chunk_drange_filter(struct extent_buffer *leaf, 343594e60d5aSIlya Dryomov struct btrfs_chunk *chunk, 343694e60d5aSIlya Dryomov struct btrfs_balance_args *bargs) 343794e60d5aSIlya Dryomov { 343894e60d5aSIlya Dryomov struct btrfs_stripe *stripe; 343994e60d5aSIlya Dryomov int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 344094e60d5aSIlya Dryomov u64 stripe_offset; 344194e60d5aSIlya Dryomov u64 stripe_length; 3442946c9256SDavid Sterba u64 type; 344394e60d5aSIlya Dryomov int factor; 344494e60d5aSIlya Dryomov int i; 344594e60d5aSIlya Dryomov 344694e60d5aSIlya Dryomov if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 344794e60d5aSIlya Dryomov return 0; 344894e60d5aSIlya Dryomov 3449946c9256SDavid Sterba type = btrfs_chunk_type(leaf, chunk); 3450946c9256SDavid Sterba factor = calc_data_stripes(type, num_stripes); 345194e60d5aSIlya Dryomov 345294e60d5aSIlya Dryomov for (i = 0; i < num_stripes; i++) { 345394e60d5aSIlya Dryomov stripe = btrfs_stripe_nr(chunk, i); 345494e60d5aSIlya Dryomov if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 345594e60d5aSIlya Dryomov continue; 345694e60d5aSIlya Dryomov 345794e60d5aSIlya Dryomov stripe_offset = btrfs_stripe_offset(leaf, stripe); 345894e60d5aSIlya Dryomov stripe_length = btrfs_chunk_length(leaf, chunk); 3459b8b93addSDavid Sterba stripe_length = div_u64(stripe_length, factor); 346094e60d5aSIlya Dryomov 346194e60d5aSIlya Dryomov if (stripe_offset < bargs->pend && 346294e60d5aSIlya Dryomov stripe_offset + stripe_length > bargs->pstart) 346394e60d5aSIlya Dryomov return 0; 346494e60d5aSIlya Dryomov } 346594e60d5aSIlya Dryomov 346694e60d5aSIlya Dryomov return 1; 346794e60d5aSIlya Dryomov } 346894e60d5aSIlya Dryomov 3469ea67176aSIlya Dryomov /* [vstart, vend) */ 3470ea67176aSIlya Dryomov static int chunk_vrange_filter(struct extent_buffer *leaf, 3471ea67176aSIlya Dryomov struct btrfs_chunk *chunk, 3472ea67176aSIlya Dryomov u64 chunk_offset, 3473ea67176aSIlya Dryomov struct btrfs_balance_args *bargs) 3474ea67176aSIlya Dryomov { 3475ea67176aSIlya Dryomov if (chunk_offset < bargs->vend && 3476ea67176aSIlya Dryomov chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3477ea67176aSIlya Dryomov /* at least part of the chunk is inside this vrange */ 3478ea67176aSIlya Dryomov return 0; 3479ea67176aSIlya Dryomov 
3480ea67176aSIlya Dryomov return 1; 3481ea67176aSIlya Dryomov } 3482ea67176aSIlya Dryomov 3483dee32d0aSGabríel Arthúr Pétursson static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3484dee32d0aSGabríel Arthúr Pétursson struct btrfs_chunk *chunk, 3485dee32d0aSGabríel Arthúr Pétursson struct btrfs_balance_args *bargs) 3486dee32d0aSGabríel Arthúr Pétursson { 3487dee32d0aSGabríel Arthúr Pétursson int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3488dee32d0aSGabríel Arthúr Pétursson 3489dee32d0aSGabríel Arthúr Pétursson if (bargs->stripes_min <= num_stripes 3490dee32d0aSGabríel Arthúr Pétursson && num_stripes <= bargs->stripes_max) 3491dee32d0aSGabríel Arthúr Pétursson return 0; 3492dee32d0aSGabríel Arthúr Pétursson 3493dee32d0aSGabríel Arthúr Pétursson return 1; 3494dee32d0aSGabríel Arthúr Pétursson } 3495dee32d0aSGabríel Arthúr Pétursson 3496899c81eaSIlya Dryomov static int chunk_soft_convert_filter(u64 chunk_type, 3497cfa4c961SIlya Dryomov struct btrfs_balance_args *bargs) 3498cfa4c961SIlya Dryomov { 3499cfa4c961SIlya Dryomov if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3500cfa4c961SIlya Dryomov return 0; 3501cfa4c961SIlya Dryomov 3502899c81eaSIlya Dryomov chunk_type = chunk_to_extended(chunk_type) & 3503899c81eaSIlya Dryomov BTRFS_EXTENDED_PROFILE_MASK; 3504cfa4c961SIlya Dryomov 3505899c81eaSIlya Dryomov if (bargs->target == chunk_type) 3506cfa4c961SIlya Dryomov return 1; 3507cfa4c961SIlya Dryomov 3508cfa4c961SIlya Dryomov return 0; 3509cfa4c961SIlya Dryomov } 3510cfa4c961SIlya Dryomov 35116ec0896cSDavid Sterba static int should_balance_chunk(struct extent_buffer *leaf, 3512f43ffb60SIlya Dryomov struct btrfs_chunk *chunk, u64 chunk_offset) 3513f43ffb60SIlya Dryomov { 35146ec0896cSDavid Sterba struct btrfs_fs_info *fs_info = leaf->fs_info; 35150b246afaSJeff Mahoney struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3516f43ffb60SIlya Dryomov struct btrfs_balance_args *bargs = NULL; 3517f43ffb60SIlya Dryomov u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3518f43ffb60SIlya Dryomov 3519f43ffb60SIlya Dryomov /* type filter */ 3520f43ffb60SIlya Dryomov if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3521f43ffb60SIlya Dryomov (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3522f43ffb60SIlya Dryomov return 0; 3523f43ffb60SIlya Dryomov } 3524f43ffb60SIlya Dryomov 3525f43ffb60SIlya Dryomov if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3526f43ffb60SIlya Dryomov bargs = &bctl->data; 3527f43ffb60SIlya Dryomov else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3528f43ffb60SIlya Dryomov bargs = &bctl->sys; 3529f43ffb60SIlya Dryomov else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3530f43ffb60SIlya Dryomov bargs = &bctl->meta; 3531f43ffb60SIlya Dryomov 3532ed25e9b2SIlya Dryomov /* profiles filter */ 3533ed25e9b2SIlya Dryomov if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3534ed25e9b2SIlya Dryomov chunk_profiles_filter(chunk_type, bargs)) { 3535ed25e9b2SIlya Dryomov return 0; 3536ed25e9b2SIlya Dryomov } 3537ed25e9b2SIlya Dryomov 35385ce5b3c0SIlya Dryomov /* usage filter */ 35395ce5b3c0SIlya Dryomov if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 35400b246afaSJeff Mahoney chunk_usage_filter(fs_info, chunk_offset, bargs)) { 35415ce5b3c0SIlya Dryomov return 0; 3542bc309467SDavid Sterba } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 35430b246afaSJeff Mahoney chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3544bc309467SDavid Sterba return 0; 35455ce5b3c0SIlya Dryomov } 35465ce5b3c0SIlya Dryomov 3547409d404bSIlya Dryomov /* devid filter */ 3548409d404bSIlya 
Dryomov if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3549409d404bSIlya Dryomov chunk_devid_filter(leaf, chunk, bargs)) { 3550409d404bSIlya Dryomov return 0; 3551409d404bSIlya Dryomov } 3552409d404bSIlya Dryomov 355394e60d5aSIlya Dryomov /* drange filter, makes sense only with devid filter */ 355494e60d5aSIlya Dryomov if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3555e4ff5fb5SNikolay Borisov chunk_drange_filter(leaf, chunk, bargs)) { 355694e60d5aSIlya Dryomov return 0; 355794e60d5aSIlya Dryomov } 355894e60d5aSIlya Dryomov 3559ea67176aSIlya Dryomov /* vrange filter */ 3560ea67176aSIlya Dryomov if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3561ea67176aSIlya Dryomov chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3562ea67176aSIlya Dryomov return 0; 3563ea67176aSIlya Dryomov } 3564ea67176aSIlya Dryomov 3565dee32d0aSGabríel Arthúr Pétursson /* stripes filter */ 3566dee32d0aSGabríel Arthúr Pétursson if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3567dee32d0aSGabríel Arthúr Pétursson chunk_stripes_range_filter(leaf, chunk, bargs)) { 3568dee32d0aSGabríel Arthúr Pétursson return 0; 3569dee32d0aSGabríel Arthúr Pétursson } 3570dee32d0aSGabríel Arthúr Pétursson 3571cfa4c961SIlya Dryomov /* soft profile changing mode */ 3572cfa4c961SIlya Dryomov if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3573cfa4c961SIlya Dryomov chunk_soft_convert_filter(chunk_type, bargs)) { 3574cfa4c961SIlya Dryomov return 0; 3575cfa4c961SIlya Dryomov } 3576cfa4c961SIlya Dryomov 35777d824b6fSDavid Sterba /* 35787d824b6fSDavid Sterba * limited by count, must be the last filter 35797d824b6fSDavid Sterba */ 35807d824b6fSDavid Sterba if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 35817d824b6fSDavid Sterba if (bargs->limit == 0) 35827d824b6fSDavid Sterba return 0; 35837d824b6fSDavid Sterba else 35847d824b6fSDavid Sterba bargs->limit--; 358512907fc7SDavid Sterba } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 358612907fc7SDavid Sterba /* 358712907fc7SDavid Sterba * Same logic as the 'limit' filter; the minimum cannot be 358801327610SNicholas D Steeves * determined here because we do not have the global information 358912907fc7SDavid Sterba * about the count of all chunks that satisfy the filters. 
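 * The limit_min side is applied later, in __btrfs_balance(), once the
 * counting pass has filled count_data/count_meta/count_sys.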
359012907fc7SDavid Sterba */ 359112907fc7SDavid Sterba if (bargs->limit_max == 0) 359212907fc7SDavid Sterba return 0; 359312907fc7SDavid Sterba else 359412907fc7SDavid Sterba bargs->limit_max--; 35957d824b6fSDavid Sterba } 35967d824b6fSDavid Sterba 3597f43ffb60SIlya Dryomov return 1; 3598f43ffb60SIlya Dryomov } 3599f43ffb60SIlya Dryomov 3600c9e9f97bSIlya Dryomov static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3601ec44a35cSChris Mason { 360219a39dceSIlya Dryomov struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3603c9e9f97bSIlya Dryomov struct btrfs_root *chunk_root = fs_info->chunk_root; 360412907fc7SDavid Sterba u64 chunk_type; 3605f43ffb60SIlya Dryomov struct btrfs_chunk *chunk; 36065a488b9dSLiu Bo struct btrfs_path *path = NULL; 3607ec44a35cSChris Mason struct btrfs_key key; 3608ec44a35cSChris Mason struct btrfs_key found_key; 3609f43ffb60SIlya Dryomov struct extent_buffer *leaf; 3610f43ffb60SIlya Dryomov int slot; 3611c9e9f97bSIlya Dryomov int ret; 3612c9e9f97bSIlya Dryomov int enospc_errors = 0; 361319a39dceSIlya Dryomov bool counting = true; 361412907fc7SDavid Sterba /* The single value limit and min/max limits use the same bytes in the */ 36157d824b6fSDavid Sterba u64 limit_data = bctl->data.limit; 36167d824b6fSDavid Sterba u64 limit_meta = bctl->meta.limit; 36177d824b6fSDavid Sterba u64 limit_sys = bctl->sys.limit; 361812907fc7SDavid Sterba u32 count_data = 0; 361912907fc7SDavid Sterba u32 count_meta = 0; 362012907fc7SDavid Sterba u32 count_sys = 0; 36212c9fe835SZhao Lei int chunk_reserved = 0; 3622ec44a35cSChris Mason 3623ec44a35cSChris Mason path = btrfs_alloc_path(); 362417e9f796SMark Fasheh if (!path) { 362517e9f796SMark Fasheh ret = -ENOMEM; 362617e9f796SMark Fasheh goto error; 362717e9f796SMark Fasheh } 362819a39dceSIlya Dryomov 362919a39dceSIlya Dryomov /* zero out stat counters */ 363019a39dceSIlya Dryomov spin_lock(&fs_info->balance_lock); 363119a39dceSIlya Dryomov memset(&bctl->stat, 0, sizeof(bctl->stat)); 363219a39dceSIlya Dryomov spin_unlock(&fs_info->balance_lock); 363319a39dceSIlya Dryomov again: 36347d824b6fSDavid Sterba if (!counting) { 363512907fc7SDavid Sterba /* 363612907fc7SDavid Sterba * The single value limit and min/max limits use the same bytes 363712907fc7SDavid Sterba * in the 363812907fc7SDavid Sterba */ 36397d824b6fSDavid Sterba bctl->data.limit = limit_data; 36407d824b6fSDavid Sterba bctl->meta.limit = limit_meta; 36417d824b6fSDavid Sterba bctl->sys.limit = limit_sys; 36427d824b6fSDavid Sterba } 3643ec44a35cSChris Mason key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3644ec44a35cSChris Mason key.offset = (u64)-1; 3645ec44a35cSChris Mason key.type = BTRFS_CHUNK_ITEM_KEY; 3646ec44a35cSChris Mason 3647ec44a35cSChris Mason while (1) { 364819a39dceSIlya Dryomov if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3649a7e99c69SIlya Dryomov atomic_read(&fs_info->balance_cancel_req)) { 3650837d5b6eSIlya Dryomov ret = -ECANCELED; 3651837d5b6eSIlya Dryomov goto error; 3652837d5b6eSIlya Dryomov } 3653837d5b6eSIlya Dryomov 365467c5e7d4SFilipe Manana mutex_lock(&fs_info->delete_unused_bgs_mutex); 3655ec44a35cSChris Mason ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 365667c5e7d4SFilipe Manana if (ret < 0) { 365767c5e7d4SFilipe Manana mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3658ec44a35cSChris Mason goto error; 365967c5e7d4SFilipe Manana } 3660ec44a35cSChris Mason 3661ec44a35cSChris Mason /* 3662ec44a35cSChris Mason * this shouldn't happen, it means the last relocate 3663ec44a35cSChris Mason * failed 
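 * (the search key offset is (u64)-1, which no real chunk item should
 * match exactly)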
3664ec44a35cSChris Mason */ 3665ec44a35cSChris Mason if (ret == 0) 3666c9e9f97bSIlya Dryomov BUG(); /* FIXME break ? */ 3667ec44a35cSChris Mason 3668ec44a35cSChris Mason ret = btrfs_previous_item(chunk_root, path, 0, 3669ec44a35cSChris Mason BTRFS_CHUNK_ITEM_KEY); 3670c9e9f97bSIlya Dryomov if (ret) { 367167c5e7d4SFilipe Manana mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3672c9e9f97bSIlya Dryomov ret = 0; 3673ec44a35cSChris Mason break; 3674c9e9f97bSIlya Dryomov } 36757d9eb12cSChris Mason 3676f43ffb60SIlya Dryomov leaf = path->nodes[0]; 3677f43ffb60SIlya Dryomov slot = path->slots[0]; 3678f43ffb60SIlya Dryomov btrfs_item_key_to_cpu(leaf, &found_key, slot); 3679f43ffb60SIlya Dryomov 368067c5e7d4SFilipe Manana if (found_key.objectid != key.objectid) { 368167c5e7d4SFilipe Manana mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3682ec44a35cSChris Mason break; 368367c5e7d4SFilipe Manana } 36847d9eb12cSChris Mason 3685f43ffb60SIlya Dryomov chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 368612907fc7SDavid Sterba chunk_type = btrfs_chunk_type(leaf, chunk); 3687f43ffb60SIlya Dryomov 368819a39dceSIlya Dryomov if (!counting) { 368919a39dceSIlya Dryomov spin_lock(&fs_info->balance_lock); 369019a39dceSIlya Dryomov bctl->stat.considered++; 369119a39dceSIlya Dryomov spin_unlock(&fs_info->balance_lock); 369219a39dceSIlya Dryomov } 369319a39dceSIlya Dryomov 36946ec0896cSDavid Sterba ret = should_balance_chunk(leaf, chunk, found_key.offset); 36952c9fe835SZhao Lei 3696b3b4aa74SDavid Sterba btrfs_release_path(path); 369767c5e7d4SFilipe Manana if (!ret) { 369867c5e7d4SFilipe Manana mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3699f43ffb60SIlya Dryomov goto loop; 370067c5e7d4SFilipe Manana } 3701f43ffb60SIlya Dryomov 370219a39dceSIlya Dryomov if (counting) { 370367c5e7d4SFilipe Manana mutex_unlock(&fs_info->delete_unused_bgs_mutex); 370419a39dceSIlya Dryomov spin_lock(&fs_info->balance_lock); 370519a39dceSIlya Dryomov bctl->stat.expected++; 370619a39dceSIlya Dryomov spin_unlock(&fs_info->balance_lock); 370712907fc7SDavid Sterba 370812907fc7SDavid Sterba if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 370912907fc7SDavid Sterba count_data++; 371012907fc7SDavid Sterba else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 371112907fc7SDavid Sterba count_sys++; 371212907fc7SDavid Sterba else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 371312907fc7SDavid Sterba count_meta++; 371412907fc7SDavid Sterba 371512907fc7SDavid Sterba goto loop; 371612907fc7SDavid Sterba } 371712907fc7SDavid Sterba 371812907fc7SDavid Sterba /* 371912907fc7SDavid Sterba * Apply limit_min filter, no need to check if the LIMITS 372012907fc7SDavid Sterba * filter is used, limit_min is 0 by default 372112907fc7SDavid Sterba */ 372212907fc7SDavid Sterba if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 372312907fc7SDavid Sterba count_data < bctl->data.limit_min) 372412907fc7SDavid Sterba || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 372512907fc7SDavid Sterba count_meta < bctl->meta.limit_min) 372612907fc7SDavid Sterba || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 372712907fc7SDavid Sterba count_sys < bctl->sys.limit_min)) { 372812907fc7SDavid Sterba mutex_unlock(&fs_info->delete_unused_bgs_mutex); 372919a39dceSIlya Dryomov goto loop; 373019a39dceSIlya Dryomov } 373119a39dceSIlya Dryomov 3732a6f93c71SLiu Bo if (!chunk_reserved) { 3733a6f93c71SLiu Bo /* 3734a6f93c71SLiu Bo * We may be relocating the only data chunk we have, 3735a6f93c71SLiu Bo * which could potentially end up with losing data's 3736a6f93c71SLiu Bo * raid profile, so lets 
allocate an empty one in 3737a6f93c71SLiu Bo * advance. 3738a6f93c71SLiu Bo */ 3739a6f93c71SLiu Bo ret = btrfs_may_alloc_data_chunk(fs_info, 3740a6f93c71SLiu Bo found_key.offset); 37412c9fe835SZhao Lei if (ret < 0) { 37422c9fe835SZhao Lei mutex_unlock(&fs_info->delete_unused_bgs_mutex); 37432c9fe835SZhao Lei goto error; 3744a6f93c71SLiu Bo } else if (ret == 1) { 37452c9fe835SZhao Lei chunk_reserved = 1; 37462c9fe835SZhao Lei } 3747a6f93c71SLiu Bo } 37482c9fe835SZhao Lei 37495b4aacefSJeff Mahoney ret = btrfs_relocate_chunk(fs_info, found_key.offset); 375067c5e7d4SFilipe Manana mutex_unlock(&fs_info->delete_unused_bgs_mutex); 375119a39dceSIlya Dryomov if (ret == -ENOSPC) { 3752c9e9f97bSIlya Dryomov enospc_errors++; 3753eede2bf3SOmar Sandoval } else if (ret == -ETXTBSY) { 3754eede2bf3SOmar Sandoval btrfs_info(fs_info, 3755eede2bf3SOmar Sandoval "skipping relocation of block group %llu due to active swapfile", 3756eede2bf3SOmar Sandoval found_key.offset); 3757eede2bf3SOmar Sandoval ret = 0; 3758eede2bf3SOmar Sandoval } else if (ret) { 3759eede2bf3SOmar Sandoval goto error; 376019a39dceSIlya Dryomov } else { 376119a39dceSIlya Dryomov spin_lock(&fs_info->balance_lock); 376219a39dceSIlya Dryomov bctl->stat.completed++; 376319a39dceSIlya Dryomov spin_unlock(&fs_info->balance_lock); 376419a39dceSIlya Dryomov } 3765f43ffb60SIlya Dryomov loop: 3766795a3321SIlya Dryomov if (found_key.offset == 0) 3767795a3321SIlya Dryomov break; 3768ba1bf481SJosef Bacik key.offset = found_key.offset - 1; 3769ec44a35cSChris Mason } 3770c9e9f97bSIlya Dryomov 377119a39dceSIlya Dryomov if (counting) { 377219a39dceSIlya Dryomov btrfs_release_path(path); 377319a39dceSIlya Dryomov counting = false; 377419a39dceSIlya Dryomov goto again; 377519a39dceSIlya Dryomov } 3776ec44a35cSChris Mason error: 3777ec44a35cSChris Mason btrfs_free_path(path); 3778c9e9f97bSIlya Dryomov if (enospc_errors) { 3779efe120a0SFrank Holton btrfs_info(fs_info, "%d enospc errors during balance", 3780c9e9f97bSIlya Dryomov enospc_errors); 3781c9e9f97bSIlya Dryomov if (!ret) 3782c9e9f97bSIlya Dryomov ret = -ENOSPC; 3783c9e9f97bSIlya Dryomov } 3784c9e9f97bSIlya Dryomov 3785ec44a35cSChris Mason return ret; 3786ec44a35cSChris Mason } 3787ec44a35cSChris Mason 37880c460c0dSIlya Dryomov /** 37890c460c0dSIlya Dryomov * alloc_profile_is_valid - see if a given profile is valid and reduced 37900c460c0dSIlya Dryomov * @flags: profile to validate 37910c460c0dSIlya Dryomov * @extended: if true @flags is treated as an extended profile 37920c460c0dSIlya Dryomov */ 37930c460c0dSIlya Dryomov static int alloc_profile_is_valid(u64 flags, int extended) 37940c460c0dSIlya Dryomov { 37950c460c0dSIlya Dryomov u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 37960c460c0dSIlya Dryomov BTRFS_BLOCK_GROUP_PROFILE_MASK); 37970c460c0dSIlya Dryomov 37980c460c0dSIlya Dryomov flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 37990c460c0dSIlya Dryomov 38000c460c0dSIlya Dryomov /* 1) check that all other bits are zeroed */ 38010c460c0dSIlya Dryomov if (flags & ~mask) 38020c460c0dSIlya Dryomov return 0; 38030c460c0dSIlya Dryomov 38040c460c0dSIlya Dryomov /* 2) see if profile is reduced */ 38050c460c0dSIlya Dryomov if (flags == 0) 38060c460c0dSIlya Dryomov return !extended; /* "0" is valid for usual profiles */ 38070c460c0dSIlya Dryomov 3808c1499166SDavid Sterba return has_single_bit_set(flags); 38090c460c0dSIlya Dryomov } 38100c460c0dSIlya Dryomov 3811837d5b6eSIlya Dryomov static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3812837d5b6eSIlya Dryomov { 3813a7e99c69SIlya Dryomov /* cancel requested || normal exit path */ 3814a7e99c69SIlya Dryomov return atomic_read(&fs_info->balance_cancel_req) || 3815a7e99c69SIlya Dryomov (atomic_read(&fs_info->balance_pause_req) == 0 && 3816a7e99c69SIlya Dryomov atomic_read(&fs_info->balance_cancel_req) == 0); 3817837d5b6eSIlya Dryomov } 3818837d5b6eSIlya Dryomov 38195ba366c3SDavid Sterba /* 38205ba366c3SDavid Sterba * Validate target profile against allowed profiles and return true if it's OK. 38215ba366c3SDavid Sterba * Otherwise print the error message and return false. 38225ba366c3SDavid Sterba */ 38235ba366c3SDavid Sterba static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, 38245ba366c3SDavid Sterba const struct btrfs_balance_args *bargs, 38255ba366c3SDavid Sterba u64 allowed, const char *type) 3826bdcd3c97SAlexandru Moise { 38275ba366c3SDavid Sterba if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 38285ba366c3SDavid Sterba return true; 38295ba366c3SDavid Sterba 38305ba366c3SDavid Sterba /* Profile is valid and does not have bits outside of the allowed set */ 38315ba366c3SDavid Sterba if (alloc_profile_is_valid(bargs->target, 1) && 38325ba366c3SDavid Sterba (bargs->target & ~allowed) == 0) 38335ba366c3SDavid Sterba return true; 38345ba366c3SDavid Sterba 38355ba366c3SDavid Sterba btrfs_err(fs_info, "balance: invalid convert %s profile %s", 38365ba366c3SDavid Sterba type, btrfs_bg_type_to_raid_name(bargs->target)); 38375ba366c3SDavid Sterba return false; 3838bdcd3c97SAlexandru Moise } 3839bdcd3c97SAlexandru Moise 3840c9e9f97bSIlya Dryomov /* 384156fc37d9SAnand Jain * Fill @buf with textual description of balance filter flags @bargs, up to 384256fc37d9SAnand Jain * @size_buf including the terminating null. The output may be trimmed if it 384356fc37d9SAnand Jain * does not fit into the provided buffer. 
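 * For example, a filter with convert, soft and usage set might be rendered
 * as "convert=raid1,soft,usage=75" (the trailing comma appended by the last
 * argument is trimmed at out_overflow); the exact contents depend on which
 * flags are present.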
384456fc37d9SAnand Jain */ 384556fc37d9SAnand Jain static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, 384656fc37d9SAnand Jain u32 size_buf) 384756fc37d9SAnand Jain { 384856fc37d9SAnand Jain int ret; 384956fc37d9SAnand Jain u32 size_bp = size_buf; 385056fc37d9SAnand Jain char *bp = buf; 385156fc37d9SAnand Jain u64 flags = bargs->flags; 385256fc37d9SAnand Jain char tmp_buf[128] = {'\0'}; 385356fc37d9SAnand Jain 385456fc37d9SAnand Jain if (!flags) 385556fc37d9SAnand Jain return; 385656fc37d9SAnand Jain 385756fc37d9SAnand Jain #define CHECK_APPEND_NOARG(a) \ 385856fc37d9SAnand Jain do { \ 385956fc37d9SAnand Jain ret = snprintf(bp, size_bp, (a)); \ 386056fc37d9SAnand Jain if (ret < 0 || ret >= size_bp) \ 386156fc37d9SAnand Jain goto out_overflow; \ 386256fc37d9SAnand Jain size_bp -= ret; \ 386356fc37d9SAnand Jain bp += ret; \ 386456fc37d9SAnand Jain } while (0) 386556fc37d9SAnand Jain 386656fc37d9SAnand Jain #define CHECK_APPEND_1ARG(a, v1) \ 386756fc37d9SAnand Jain do { \ 386856fc37d9SAnand Jain ret = snprintf(bp, size_bp, (a), (v1)); \ 386956fc37d9SAnand Jain if (ret < 0 || ret >= size_bp) \ 387056fc37d9SAnand Jain goto out_overflow; \ 387156fc37d9SAnand Jain size_bp -= ret; \ 387256fc37d9SAnand Jain bp += ret; \ 387356fc37d9SAnand Jain } while (0) 387456fc37d9SAnand Jain 387556fc37d9SAnand Jain #define CHECK_APPEND_2ARG(a, v1, v2) \ 387656fc37d9SAnand Jain do { \ 387756fc37d9SAnand Jain ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ 387856fc37d9SAnand Jain if (ret < 0 || ret >= size_bp) \ 387956fc37d9SAnand Jain goto out_overflow; \ 388056fc37d9SAnand Jain size_bp -= ret; \ 388156fc37d9SAnand Jain bp += ret; \ 388256fc37d9SAnand Jain } while (0) 388356fc37d9SAnand Jain 3884158da513SDavid Sterba if (flags & BTRFS_BALANCE_ARGS_CONVERT) 3885158da513SDavid Sterba CHECK_APPEND_1ARG("convert=%s,", 3886158da513SDavid Sterba btrfs_bg_type_to_raid_name(bargs->target)); 388756fc37d9SAnand Jain 388856fc37d9SAnand Jain if (flags & BTRFS_BALANCE_ARGS_SOFT) 388956fc37d9SAnand Jain CHECK_APPEND_NOARG("soft,"); 389056fc37d9SAnand Jain 389156fc37d9SAnand Jain if (flags & BTRFS_BALANCE_ARGS_PROFILES) { 389256fc37d9SAnand Jain btrfs_describe_block_groups(bargs->profiles, tmp_buf, 389356fc37d9SAnand Jain sizeof(tmp_buf)); 389456fc37d9SAnand Jain CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); 389556fc37d9SAnand Jain } 389656fc37d9SAnand Jain 389756fc37d9SAnand Jain if (flags & BTRFS_BALANCE_ARGS_USAGE) 389856fc37d9SAnand Jain CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); 389956fc37d9SAnand Jain 390056fc37d9SAnand Jain if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) 390156fc37d9SAnand Jain CHECK_APPEND_2ARG("usage=%u..%u,", 390256fc37d9SAnand Jain bargs->usage_min, bargs->usage_max); 390356fc37d9SAnand Jain 390456fc37d9SAnand Jain if (flags & BTRFS_BALANCE_ARGS_DEVID) 390556fc37d9SAnand Jain CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); 390656fc37d9SAnand Jain 390756fc37d9SAnand Jain if (flags & BTRFS_BALANCE_ARGS_DRANGE) 390856fc37d9SAnand Jain CHECK_APPEND_2ARG("drange=%llu..%llu,", 390956fc37d9SAnand Jain bargs->pstart, bargs->pend); 391056fc37d9SAnand Jain 391156fc37d9SAnand Jain if (flags & BTRFS_BALANCE_ARGS_VRANGE) 391256fc37d9SAnand Jain CHECK_APPEND_2ARG("vrange=%llu..%llu,", 391356fc37d9SAnand Jain bargs->vstart, bargs->vend); 391456fc37d9SAnand Jain 391556fc37d9SAnand Jain if (flags & BTRFS_BALANCE_ARGS_LIMIT) 391656fc37d9SAnand Jain CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); 391756fc37d9SAnand Jain 391856fc37d9SAnand Jain if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) 
391956fc37d9SAnand Jain CHECK_APPEND_2ARG("limit=%u..%u,", 392056fc37d9SAnand Jain bargs->limit_min, bargs->limit_max); 392156fc37d9SAnand Jain 392256fc37d9SAnand Jain if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) 392356fc37d9SAnand Jain CHECK_APPEND_2ARG("stripes=%u..%u,", 392456fc37d9SAnand Jain bargs->stripes_min, bargs->stripes_max); 392556fc37d9SAnand Jain 392656fc37d9SAnand Jain #undef CHECK_APPEND_2ARG 392756fc37d9SAnand Jain #undef CHECK_APPEND_1ARG 392856fc37d9SAnand Jain #undef CHECK_APPEND_NOARG 392956fc37d9SAnand Jain 393056fc37d9SAnand Jain out_overflow: 393156fc37d9SAnand Jain 393256fc37d9SAnand Jain if (size_bp < size_buf) 393356fc37d9SAnand Jain buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ 393456fc37d9SAnand Jain else 393556fc37d9SAnand Jain buf[0] = '\0'; 393656fc37d9SAnand Jain } 393756fc37d9SAnand Jain 393856fc37d9SAnand Jain static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) 393956fc37d9SAnand Jain { 394056fc37d9SAnand Jain u32 size_buf = 1024; 394156fc37d9SAnand Jain char tmp_buf[192] = {'\0'}; 394256fc37d9SAnand Jain char *buf; 394356fc37d9SAnand Jain char *bp; 394456fc37d9SAnand Jain u32 size_bp = size_buf; 394556fc37d9SAnand Jain int ret; 394656fc37d9SAnand Jain struct btrfs_balance_control *bctl = fs_info->balance_ctl; 394756fc37d9SAnand Jain 394856fc37d9SAnand Jain buf = kzalloc(size_buf, GFP_KERNEL); 394956fc37d9SAnand Jain if (!buf) 395056fc37d9SAnand Jain return; 395156fc37d9SAnand Jain 395256fc37d9SAnand Jain bp = buf; 395356fc37d9SAnand Jain 395456fc37d9SAnand Jain #define CHECK_APPEND_1ARG(a, v1) \ 395556fc37d9SAnand Jain do { \ 395656fc37d9SAnand Jain ret = snprintf(bp, size_bp, (a), (v1)); \ 395756fc37d9SAnand Jain if (ret < 0 || ret >= size_bp) \ 395856fc37d9SAnand Jain goto out_overflow; \ 395956fc37d9SAnand Jain size_bp -= ret; \ 396056fc37d9SAnand Jain bp += ret; \ 396156fc37d9SAnand Jain } while (0) 396256fc37d9SAnand Jain 396356fc37d9SAnand Jain if (bctl->flags & BTRFS_BALANCE_FORCE) 396456fc37d9SAnand Jain CHECK_APPEND_1ARG("%s", "-f "); 396556fc37d9SAnand Jain 396656fc37d9SAnand Jain if (bctl->flags & BTRFS_BALANCE_DATA) { 396756fc37d9SAnand Jain describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); 396856fc37d9SAnand Jain CHECK_APPEND_1ARG("-d%s ", tmp_buf); 396956fc37d9SAnand Jain } 397056fc37d9SAnand Jain 397156fc37d9SAnand Jain if (bctl->flags & BTRFS_BALANCE_METADATA) { 397256fc37d9SAnand Jain describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf)); 397356fc37d9SAnand Jain CHECK_APPEND_1ARG("-m%s ", tmp_buf); 397456fc37d9SAnand Jain } 397556fc37d9SAnand Jain 397656fc37d9SAnand Jain if (bctl->flags & BTRFS_BALANCE_SYSTEM) { 397756fc37d9SAnand Jain describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); 397856fc37d9SAnand Jain CHECK_APPEND_1ARG("-s%s ", tmp_buf); 397956fc37d9SAnand Jain } 398056fc37d9SAnand Jain 398156fc37d9SAnand Jain #undef CHECK_APPEND_1ARG 398256fc37d9SAnand Jain 398356fc37d9SAnand Jain out_overflow: 398456fc37d9SAnand Jain 398556fc37d9SAnand Jain if (size_bp < size_buf) 398656fc37d9SAnand Jain buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ 398756fc37d9SAnand Jain btrfs_info(fs_info, "balance: %s %s", 398856fc37d9SAnand Jain (bctl->flags & BTRFS_BALANCE_RESUME) ? 
398956fc37d9SAnand Jain "resume" : "start", buf); 399056fc37d9SAnand Jain 399156fc37d9SAnand Jain kfree(buf); 399256fc37d9SAnand Jain } 399356fc37d9SAnand Jain 399456fc37d9SAnand Jain /* 3995dccdb07bSDavid Sterba * Should be called with balance mutexe held 3996c9e9f97bSIlya Dryomov */ 39976fcf6e2bSDavid Sterba int btrfs_balance(struct btrfs_fs_info *fs_info, 39986fcf6e2bSDavid Sterba struct btrfs_balance_control *bctl, 3999c9e9f97bSIlya Dryomov struct btrfs_ioctl_balance_args *bargs) 4000c9e9f97bSIlya Dryomov { 400114506127SAdam Borowski u64 meta_target, data_target; 4002f43ffb60SIlya Dryomov u64 allowed; 4003e4837f8fSIlya Dryomov int mixed = 0; 4004c9e9f97bSIlya Dryomov int ret; 40058dabb742SStefan Behrens u64 num_devices; 4006de98ced9SMiao Xie unsigned seq; 4007e62869beSAnand Jain bool reducing_redundancy; 4008081db89bSDavid Sterba int i; 4009c9e9f97bSIlya Dryomov 4010837d5b6eSIlya Dryomov if (btrfs_fs_closing(fs_info) || 4011a7e99c69SIlya Dryomov atomic_read(&fs_info->balance_pause_req) || 4012726a3421SQu Wenruo btrfs_should_cancel_balance(fs_info)) { 4013c9e9f97bSIlya Dryomov ret = -EINVAL; 4014c9e9f97bSIlya Dryomov goto out; 4015c9e9f97bSIlya Dryomov } 4016c9e9f97bSIlya Dryomov 4017e4837f8fSIlya Dryomov allowed = btrfs_super_incompat_flags(fs_info->super_copy); 4018e4837f8fSIlya Dryomov if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 4019e4837f8fSIlya Dryomov mixed = 1; 4020e4837f8fSIlya Dryomov 4021f43ffb60SIlya Dryomov /* 4022f43ffb60SIlya Dryomov * In case of mixed groups both data and meta should be picked, 4023f43ffb60SIlya Dryomov * and identical options should be given for both of them. 4024f43ffb60SIlya Dryomov */ 4025e4837f8fSIlya Dryomov allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 4026e4837f8fSIlya Dryomov if (mixed && (bctl->flags & allowed)) { 4027f43ffb60SIlya Dryomov if (!(bctl->flags & BTRFS_BALANCE_DATA) || 4028f43ffb60SIlya Dryomov !(bctl->flags & BTRFS_BALANCE_METADATA) || 4029f43ffb60SIlya Dryomov memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 40305d163e0eSJeff Mahoney btrfs_err(fs_info, 40316dac13f8SAnand Jain "balance: mixed groups data and metadata options must be the same"); 4032f43ffb60SIlya Dryomov ret = -EINVAL; 4033f43ffb60SIlya Dryomov goto out; 4034f43ffb60SIlya Dryomov } 4035f43ffb60SIlya Dryomov } 4036f43ffb60SIlya Dryomov 4037b35cf1f0SJosef Bacik /* 4038b35cf1f0SJosef Bacik * rw_devices will not change at the moment, device add/delete/replace 4039c3e1f96cSGoldwyn Rodrigues * are exclusive 4040b35cf1f0SJosef Bacik */ 4041b35cf1f0SJosef Bacik num_devices = fs_info->fs_devices->rw_devices; 4042fab27359SQu Wenruo 4043fab27359SQu Wenruo /* 4044fab27359SQu Wenruo * SINGLE profile on-disk has no profile bit, but in-memory we have a 4045fab27359SQu Wenruo * special bit for it, to make it easier to distinguish. Thus we need 4046fab27359SQu Wenruo * to set it manually, or balance would refuse the profile. 
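 * (BTRFS_AVAIL_ALLOC_BIT_SINGLE, used to seed 'allowed' below, is that
 * in-memory bit)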
4047fab27359SQu Wenruo */ 4048fab27359SQu Wenruo allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 4049081db89bSDavid Sterba for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) 4050081db89bSDavid Sterba if (num_devices >= btrfs_raid_array[i].devs_min) 4051081db89bSDavid Sterba allowed |= btrfs_raid_array[i].bg_flag; 40521da73967SAnand Jain 40535ba366c3SDavid Sterba if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || 40545ba366c3SDavid Sterba !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || 40555ba366c3SDavid Sterba !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { 4056e4d8ec0fSIlya Dryomov ret = -EINVAL; 4057e4d8ec0fSIlya Dryomov goto out; 4058e4d8ec0fSIlya Dryomov } 4059e4d8ec0fSIlya Dryomov 40606079e12cSDavid Sterba /* 40616079e12cSDavid Sterba * Allow to reduce metadata or system integrity only if force set for 40626079e12cSDavid Sterba * profiles with redundancy (copies, parity) 40636079e12cSDavid Sterba */ 40646079e12cSDavid Sterba allowed = 0; 40656079e12cSDavid Sterba for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { 40666079e12cSDavid Sterba if (btrfs_raid_array[i].ncopies >= 2 || 40676079e12cSDavid Sterba btrfs_raid_array[i].tolerated_failures >= 1) 40686079e12cSDavid Sterba allowed |= btrfs_raid_array[i].bg_flag; 40696079e12cSDavid Sterba } 4070de98ced9SMiao Xie do { 4071de98ced9SMiao Xie seq = read_seqbegin(&fs_info->profiles_lock); 4072de98ced9SMiao Xie 4073e4d8ec0fSIlya Dryomov if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4074e4d8ec0fSIlya Dryomov (fs_info->avail_system_alloc_bits & allowed) && 4075e4d8ec0fSIlya Dryomov !(bctl->sys.target & allowed)) || 4076e4d8ec0fSIlya Dryomov ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4077e4d8ec0fSIlya Dryomov (fs_info->avail_metadata_alloc_bits & allowed) && 40785a8067c0SFilipe Manana !(bctl->meta.target & allowed))) 4079e62869beSAnand Jain reducing_redundancy = true; 40805a8067c0SFilipe Manana else 4081e62869beSAnand Jain reducing_redundancy = false; 40825a8067c0SFilipe Manana 40835a8067c0SFilipe Manana /* if we're not converting, the target field is uninitialized */ 40845a8067c0SFilipe Manana meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 40855a8067c0SFilipe Manana bctl->meta.target : fs_info->avail_metadata_alloc_bits; 40865a8067c0SFilipe Manana data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 
40875a8067c0SFilipe Manana bctl->data.target : fs_info->avail_data_alloc_bits; 40885a8067c0SFilipe Manana } while (read_seqretry(&fs_info->profiles_lock, seq)); 40895a8067c0SFilipe Manana 4090e62869beSAnand Jain if (reducing_redundancy) { 4091e4d8ec0fSIlya Dryomov if (bctl->flags & BTRFS_BALANCE_FORCE) { 40925d163e0eSJeff Mahoney btrfs_info(fs_info, 4093e62869beSAnand Jain "balance: force reducing metadata redundancy"); 4094e4d8ec0fSIlya Dryomov } else { 40955d163e0eSJeff Mahoney btrfs_err(fs_info, 4096e62869beSAnand Jain "balance: reduces metadata redundancy, use --force if you want this"); 4097e4d8ec0fSIlya Dryomov ret = -EINVAL; 4098e4d8ec0fSIlya Dryomov goto out; 4099e4d8ec0fSIlya Dryomov } 4100e4d8ec0fSIlya Dryomov } 4101e4d8ec0fSIlya Dryomov 410214506127SAdam Borowski if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 410314506127SAdam Borowski btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 4104ee592d07SSam Tygier btrfs_warn(fs_info, 41056dac13f8SAnand Jain "balance: metadata profile %s has lower redundancy than data profile %s", 4106158da513SDavid Sterba btrfs_bg_type_to_raid_name(meta_target), 4107158da513SDavid Sterba btrfs_bg_type_to_raid_name(data_target)); 4108ee592d07SSam Tygier } 4109ee592d07SSam Tygier 41109e967495SFilipe Manana if (fs_info->send_in_progress) { 41119e967495SFilipe Manana btrfs_warn_rl(fs_info, 41129e967495SFilipe Manana "cannot run balance while send operations are in progress (%d in progress)", 41139e967495SFilipe Manana fs_info->send_in_progress); 41149e967495SFilipe Manana ret = -EAGAIN; 41159e967495SFilipe Manana goto out; 41169e967495SFilipe Manana } 41179e967495SFilipe Manana 41186bccf3abSJeff Mahoney ret = insert_balance_item(fs_info, bctl); 411959641015SIlya Dryomov if (ret && ret != -EEXIST) 41200940ebf6SIlya Dryomov goto out; 41210940ebf6SIlya Dryomov 412259641015SIlya Dryomov if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 412359641015SIlya Dryomov BUG_ON(ret == -EEXIST); 4124833aae18SDavid Sterba BUG_ON(fs_info->balance_ctl); 4125833aae18SDavid Sterba spin_lock(&fs_info->balance_lock); 4126833aae18SDavid Sterba fs_info->balance_ctl = bctl; 4127833aae18SDavid Sterba spin_unlock(&fs_info->balance_lock); 412859641015SIlya Dryomov } else { 412959641015SIlya Dryomov BUG_ON(ret != -EEXIST); 413059641015SIlya Dryomov spin_lock(&fs_info->balance_lock); 413159641015SIlya Dryomov update_balance_args(bctl); 413259641015SIlya Dryomov spin_unlock(&fs_info->balance_lock); 413359641015SIlya Dryomov } 4134c9e9f97bSIlya Dryomov 41353009a62fSDavid Sterba ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 41363009a62fSDavid Sterba set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 413756fc37d9SAnand Jain describe_balance_start_or_resume(fs_info); 4138c9e9f97bSIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 4139c9e9f97bSIlya Dryomov 4140c9e9f97bSIlya Dryomov ret = __btrfs_balance(fs_info); 4141c9e9f97bSIlya Dryomov 4142c9e9f97bSIlya Dryomov mutex_lock(&fs_info->balance_mutex); 41437333bd02SAnand Jain if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) 41447333bd02SAnand Jain btrfs_info(fs_info, "balance: paused"); 414544d354abSQu Wenruo /* 414644d354abSQu Wenruo * Balance can be canceled by: 414744d354abSQu Wenruo * 414844d354abSQu Wenruo * - Regular cancel request 414944d354abSQu Wenruo * Then ret == -ECANCELED and balance_cancel_req > 0 415044d354abSQu Wenruo * 415144d354abSQu Wenruo * - Fatal signal to "btrfs" process 415244d354abSQu Wenruo * Either the signal caught by wait_reserve_ticket() and callers 
415344d354abSQu Wenruo * got -EINTR, or caught by btrfs_should_cancel_balance() and 415444d354abSQu Wenruo * got -ECANCELED. 415544d354abSQu Wenruo * Either way, in this case balance_cancel_req = 0, and 415644d354abSQu Wenruo * ret == -EINTR or ret == -ECANCELED. 415744d354abSQu Wenruo * 415844d354abSQu Wenruo * So here we only check the return value to catch canceled balance. 415944d354abSQu Wenruo */ 416044d354abSQu Wenruo else if (ret == -ECANCELED || ret == -EINTR) 41617333bd02SAnand Jain btrfs_info(fs_info, "balance: canceled"); 41627333bd02SAnand Jain else 41637333bd02SAnand Jain btrfs_info(fs_info, "balance: ended with status: %d", ret); 41647333bd02SAnand Jain 41653009a62fSDavid Sterba clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4166c9e9f97bSIlya Dryomov 4167c9e9f97bSIlya Dryomov if (bargs) { 4168c9e9f97bSIlya Dryomov memset(bargs, 0, sizeof(*bargs)); 4169008ef096SDavid Sterba btrfs_update_ioctl_balance_args(fs_info, bargs); 4170c9e9f97bSIlya Dryomov } 4171c9e9f97bSIlya Dryomov 41723a01aa7aSIlya Dryomov if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 41733a01aa7aSIlya Dryomov balance_need_close(fs_info)) { 4174149196a2SDavid Sterba reset_balance_state(fs_info); 4175c3e1f96cSGoldwyn Rodrigues btrfs_exclop_finish(fs_info); 41763a01aa7aSIlya Dryomov } 41773a01aa7aSIlya Dryomov 4178837d5b6eSIlya Dryomov wake_up(&fs_info->balance_wait_q); 4179c9e9f97bSIlya Dryomov 4180c9e9f97bSIlya Dryomov return ret; 4181c9e9f97bSIlya Dryomov out: 418259641015SIlya Dryomov if (bctl->flags & BTRFS_BALANCE_RESUME) 4183149196a2SDavid Sterba reset_balance_state(fs_info); 4184a17c95dfSDavid Sterba else 4185c9e9f97bSIlya Dryomov kfree(bctl); 4186c3e1f96cSGoldwyn Rodrigues btrfs_exclop_finish(fs_info); 4187a17c95dfSDavid Sterba 41888f18cf13SChris Mason return ret; 41898f18cf13SChris Mason } 41908f18cf13SChris Mason 419159641015SIlya Dryomov static int balance_kthread(void *data) 419259641015SIlya Dryomov { 41932b6ba629SIlya Dryomov struct btrfs_fs_info *fs_info = data; 41949555c6c1SIlya Dryomov int ret = 0; 419559641015SIlya Dryomov 419659641015SIlya Dryomov mutex_lock(&fs_info->balance_mutex); 419756fc37d9SAnand Jain if (fs_info->balance_ctl) 41986fcf6e2bSDavid Sterba ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); 419959641015SIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 42002b6ba629SIlya Dryomov 420159641015SIlya Dryomov return ret; 420259641015SIlya Dryomov } 420359641015SIlya Dryomov 42042b6ba629SIlya Dryomov int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 42052b6ba629SIlya Dryomov { 42062b6ba629SIlya Dryomov struct task_struct *tsk; 42072b6ba629SIlya Dryomov 42081354e1a1SDavid Sterba mutex_lock(&fs_info->balance_mutex); 42092b6ba629SIlya Dryomov if (!fs_info->balance_ctl) { 42101354e1a1SDavid Sterba mutex_unlock(&fs_info->balance_mutex); 42112b6ba629SIlya Dryomov return 0; 42122b6ba629SIlya Dryomov } 42131354e1a1SDavid Sterba mutex_unlock(&fs_info->balance_mutex); 42142b6ba629SIlya Dryomov 42153cdde224SJeff Mahoney if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 42166dac13f8SAnand Jain btrfs_info(fs_info, "balance: resume skipped"); 42172b6ba629SIlya Dryomov return 0; 42182b6ba629SIlya Dryomov } 42192b6ba629SIlya Dryomov 422002ee654dSAnand Jain /* 422102ee654dSAnand Jain * A ro->rw remount sequence should continue with the paused balance 422202ee654dSAnand Jain * regardless of who pauses it, system or the user as of now, so set 422302ee654dSAnand Jain * the resume flag. 
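 * With BTRFS_BALANCE_RESUME set, btrfs_balance() expects insert_balance_item()
 * to return -EEXIST and only refreshes the in-memory args via
 * update_balance_args() instead of setting up a new balance.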
422402ee654dSAnand Jain */ 422502ee654dSAnand Jain spin_lock(&fs_info->balance_lock); 422602ee654dSAnand Jain fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; 422702ee654dSAnand Jain spin_unlock(&fs_info->balance_lock); 422802ee654dSAnand Jain 42292b6ba629SIlya Dryomov tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 4230cd633972SSachin Kamat return PTR_ERR_OR_ZERO(tsk); 42312b6ba629SIlya Dryomov } 42322b6ba629SIlya Dryomov 423368310a5eSIlya Dryomov int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 423459641015SIlya Dryomov { 423559641015SIlya Dryomov struct btrfs_balance_control *bctl; 423659641015SIlya Dryomov struct btrfs_balance_item *item; 423759641015SIlya Dryomov struct btrfs_disk_balance_args disk_bargs; 423859641015SIlya Dryomov struct btrfs_path *path; 423959641015SIlya Dryomov struct extent_buffer *leaf; 424059641015SIlya Dryomov struct btrfs_key key; 424159641015SIlya Dryomov int ret; 424259641015SIlya Dryomov 424359641015SIlya Dryomov path = btrfs_alloc_path(); 424459641015SIlya Dryomov if (!path) 424559641015SIlya Dryomov return -ENOMEM; 424659641015SIlya Dryomov 424768310a5eSIlya Dryomov key.objectid = BTRFS_BALANCE_OBJECTID; 4248c479cb4fSDavid Sterba key.type = BTRFS_TEMPORARY_ITEM_KEY; 424968310a5eSIlya Dryomov key.offset = 0; 425068310a5eSIlya Dryomov 425168310a5eSIlya Dryomov ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 425268310a5eSIlya Dryomov if (ret < 0) 425368310a5eSIlya Dryomov goto out; 425468310a5eSIlya Dryomov if (ret > 0) { /* ret = -ENOENT; */ 425568310a5eSIlya Dryomov ret = 0; 425668310a5eSIlya Dryomov goto out; 425768310a5eSIlya Dryomov } 425868310a5eSIlya Dryomov 425959641015SIlya Dryomov bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 426059641015SIlya Dryomov if (!bctl) { 426159641015SIlya Dryomov ret = -ENOMEM; 426259641015SIlya Dryomov goto out; 426359641015SIlya Dryomov } 426459641015SIlya Dryomov 426559641015SIlya Dryomov leaf = path->nodes[0]; 426659641015SIlya Dryomov item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 426759641015SIlya Dryomov 426868310a5eSIlya Dryomov bctl->flags = btrfs_balance_flags(leaf, item); 426968310a5eSIlya Dryomov bctl->flags |= BTRFS_BALANCE_RESUME; 427059641015SIlya Dryomov 427159641015SIlya Dryomov btrfs_balance_data(leaf, item, &disk_bargs); 427259641015SIlya Dryomov btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 427359641015SIlya Dryomov btrfs_balance_meta(leaf, item, &disk_bargs); 427459641015SIlya Dryomov btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 427559641015SIlya Dryomov btrfs_balance_sys(leaf, item, &disk_bargs); 427659641015SIlya Dryomov btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 427759641015SIlya Dryomov 4278eee95e3fSDavid Sterba /* 4279eee95e3fSDavid Sterba * This should never happen, as the paused balance state is recovered 4280eee95e3fSDavid Sterba * during mount without any chance of other exclusive ops to collide. 4281eee95e3fSDavid Sterba * 4282eee95e3fSDavid Sterba * This gives the exclusive op status to balance and keeps in paused 4283eee95e3fSDavid Sterba * state until user intervention (cancel or umount). If the ownership 4284eee95e3fSDavid Sterba * cannot be assigned, show a message but do not fail. The balance 4285eee95e3fSDavid Sterba * is in a paused state and must have fs_info::balance_ctl properly 4286eee95e3fSDavid Sterba * set up. 
4287eee95e3fSDavid Sterba */ 4288c3e1f96cSGoldwyn Rodrigues if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) 4289eee95e3fSDavid Sterba btrfs_warn(fs_info, 42906dac13f8SAnand Jain "balance: cannot set exclusive op status, resume manually"); 4291ed0fb78fSIlya Dryomov 429268310a5eSIlya Dryomov mutex_lock(&fs_info->balance_mutex); 4293833aae18SDavid Sterba BUG_ON(fs_info->balance_ctl); 4294833aae18SDavid Sterba spin_lock(&fs_info->balance_lock); 4295833aae18SDavid Sterba fs_info->balance_ctl = bctl; 4296833aae18SDavid Sterba spin_unlock(&fs_info->balance_lock); 429768310a5eSIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 429859641015SIlya Dryomov out: 429959641015SIlya Dryomov btrfs_free_path(path); 430059641015SIlya Dryomov return ret; 430159641015SIlya Dryomov } 430259641015SIlya Dryomov 4303837d5b6eSIlya Dryomov int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4304837d5b6eSIlya Dryomov { 4305837d5b6eSIlya Dryomov int ret = 0; 4306837d5b6eSIlya Dryomov 4307837d5b6eSIlya Dryomov mutex_lock(&fs_info->balance_mutex); 4308837d5b6eSIlya Dryomov if (!fs_info->balance_ctl) { 4309837d5b6eSIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 4310837d5b6eSIlya Dryomov return -ENOTCONN; 4311837d5b6eSIlya Dryomov } 4312837d5b6eSIlya Dryomov 43133009a62fSDavid Sterba if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4314837d5b6eSIlya Dryomov atomic_inc(&fs_info->balance_pause_req); 4315837d5b6eSIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 4316837d5b6eSIlya Dryomov 4317837d5b6eSIlya Dryomov wait_event(fs_info->balance_wait_q, 43183009a62fSDavid Sterba !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4319837d5b6eSIlya Dryomov 4320837d5b6eSIlya Dryomov mutex_lock(&fs_info->balance_mutex); 4321837d5b6eSIlya Dryomov /* we are good with balance_ctl ripped off from under us */ 43223009a62fSDavid Sterba BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4323837d5b6eSIlya Dryomov atomic_dec(&fs_info->balance_pause_req); 4324837d5b6eSIlya Dryomov } else { 4325837d5b6eSIlya Dryomov ret = -ENOTCONN; 4326837d5b6eSIlya Dryomov } 4327837d5b6eSIlya Dryomov 4328837d5b6eSIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 4329837d5b6eSIlya Dryomov return ret; 4330837d5b6eSIlya Dryomov } 4331837d5b6eSIlya Dryomov 4332a7e99c69SIlya Dryomov int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4333a7e99c69SIlya Dryomov { 4334a7e99c69SIlya Dryomov mutex_lock(&fs_info->balance_mutex); 4335a7e99c69SIlya Dryomov if (!fs_info->balance_ctl) { 4336a7e99c69SIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 4337a7e99c69SIlya Dryomov return -ENOTCONN; 4338a7e99c69SIlya Dryomov } 4339a7e99c69SIlya Dryomov 4340cf7d20f4SDavid Sterba /* 4341cf7d20f4SDavid Sterba * A paused balance with the item stored on disk can be resumed at 4342cf7d20f4SDavid Sterba * mount time if the mount is read-write. Otherwise it's still paused 4343cf7d20f4SDavid Sterba * and we must not allow cancelling as it deletes the item. 
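 * (this is why a read-only mount gets -EROFS just below)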
4344cf7d20f4SDavid Sterba */ 4345cf7d20f4SDavid Sterba if (sb_rdonly(fs_info->sb)) { 4346cf7d20f4SDavid Sterba mutex_unlock(&fs_info->balance_mutex); 4347cf7d20f4SDavid Sterba return -EROFS; 4348cf7d20f4SDavid Sterba } 4349cf7d20f4SDavid Sterba 4350a7e99c69SIlya Dryomov atomic_inc(&fs_info->balance_cancel_req); 4351a7e99c69SIlya Dryomov /* 4352a7e99c69SIlya Dryomov * if we are running just wait and return, balance item is 4353a7e99c69SIlya Dryomov * deleted in btrfs_balance in this case 4354a7e99c69SIlya Dryomov */ 43553009a62fSDavid Sterba if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4356a7e99c69SIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 4357a7e99c69SIlya Dryomov wait_event(fs_info->balance_wait_q, 43583009a62fSDavid Sterba !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4359a7e99c69SIlya Dryomov mutex_lock(&fs_info->balance_mutex); 4360a7e99c69SIlya Dryomov } else { 4361a7e99c69SIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 4362dccdb07bSDavid Sterba /* 4363dccdb07bSDavid Sterba * Lock released to allow other waiters to continue, we'll 4364dccdb07bSDavid Sterba * reexamine the status again. 4365dccdb07bSDavid Sterba */ 4366a7e99c69SIlya Dryomov mutex_lock(&fs_info->balance_mutex); 4367a7e99c69SIlya Dryomov 4368a17c95dfSDavid Sterba if (fs_info->balance_ctl) { 4369149196a2SDavid Sterba reset_balance_state(fs_info); 4370c3e1f96cSGoldwyn Rodrigues btrfs_exclop_finish(fs_info); 43716dac13f8SAnand Jain btrfs_info(fs_info, "balance: canceled"); 4372a17c95dfSDavid Sterba } 4373a7e99c69SIlya Dryomov } 4374a7e99c69SIlya Dryomov 43753009a62fSDavid Sterba BUG_ON(fs_info->balance_ctl || 43763009a62fSDavid Sterba test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4377a7e99c69SIlya Dryomov atomic_dec(&fs_info->balance_cancel_req); 4378a7e99c69SIlya Dryomov mutex_unlock(&fs_info->balance_mutex); 4379a7e99c69SIlya Dryomov return 0; 4380a7e99c69SIlya Dryomov } 4381a7e99c69SIlya Dryomov 438297f4dd09SNikolay Borisov int btrfs_uuid_scan_kthread(void *data) 4383803b2f54SStefan Behrens { 4384803b2f54SStefan Behrens struct btrfs_fs_info *fs_info = data; 4385803b2f54SStefan Behrens struct btrfs_root *root = fs_info->tree_root; 4386803b2f54SStefan Behrens struct btrfs_key key; 4387803b2f54SStefan Behrens struct btrfs_path *path = NULL; 4388803b2f54SStefan Behrens int ret = 0; 4389803b2f54SStefan Behrens struct extent_buffer *eb; 4390803b2f54SStefan Behrens int slot; 4391803b2f54SStefan Behrens struct btrfs_root_item root_item; 4392803b2f54SStefan Behrens u32 item_size; 4393f45388f3SFilipe David Borba Manana struct btrfs_trans_handle *trans = NULL; 4394c94bec2cSJosef Bacik bool closing = false; 4395803b2f54SStefan Behrens 4396803b2f54SStefan Behrens path = btrfs_alloc_path(); 4397803b2f54SStefan Behrens if (!path) { 4398803b2f54SStefan Behrens ret = -ENOMEM; 4399803b2f54SStefan Behrens goto out; 4400803b2f54SStefan Behrens } 4401803b2f54SStefan Behrens 4402803b2f54SStefan Behrens key.objectid = 0; 4403803b2f54SStefan Behrens key.type = BTRFS_ROOT_ITEM_KEY; 4404803b2f54SStefan Behrens key.offset = 0; 4405803b2f54SStefan Behrens 4406803b2f54SStefan Behrens while (1) { 4407c94bec2cSJosef Bacik if (btrfs_fs_closing(fs_info)) { 4408c94bec2cSJosef Bacik closing = true; 4409c94bec2cSJosef Bacik break; 4410c94bec2cSJosef Bacik } 44117c829b72SAnand Jain ret = btrfs_search_forward(root, &key, path, 44127c829b72SAnand Jain BTRFS_OLDEST_GENERATION); 4413803b2f54SStefan Behrens if (ret) { 4414803b2f54SStefan Behrens if (ret > 0) 4415803b2f54SStefan Behrens ret = 0; 4416803b2f54SStefan 
Behrens break; 4417803b2f54SStefan Behrens } 4418803b2f54SStefan Behrens 4419803b2f54SStefan Behrens if (key.type != BTRFS_ROOT_ITEM_KEY || 4420803b2f54SStefan Behrens (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4421803b2f54SStefan Behrens key.objectid != BTRFS_FS_TREE_OBJECTID) || 4422803b2f54SStefan Behrens key.objectid > BTRFS_LAST_FREE_OBJECTID) 4423803b2f54SStefan Behrens goto skip; 4424803b2f54SStefan Behrens 4425803b2f54SStefan Behrens eb = path->nodes[0]; 4426803b2f54SStefan Behrens slot = path->slots[0]; 4427803b2f54SStefan Behrens item_size = btrfs_item_size_nr(eb, slot); 4428803b2f54SStefan Behrens if (item_size < sizeof(root_item)) 4429803b2f54SStefan Behrens goto skip; 4430803b2f54SStefan Behrens 4431803b2f54SStefan Behrens read_extent_buffer(eb, &root_item, 4432803b2f54SStefan Behrens btrfs_item_ptr_offset(eb, slot), 4433803b2f54SStefan Behrens (int)sizeof(root_item)); 4434803b2f54SStefan Behrens if (btrfs_root_refs(&root_item) == 0) 4435803b2f54SStefan Behrens goto skip; 4436f45388f3SFilipe David Borba Manana 4437f45388f3SFilipe David Borba Manana if (!btrfs_is_empty_uuid(root_item.uuid) || 4438f45388f3SFilipe David Borba Manana !btrfs_is_empty_uuid(root_item.received_uuid)) { 4439f45388f3SFilipe David Borba Manana if (trans) 4440f45388f3SFilipe David Borba Manana goto update_tree; 4441f45388f3SFilipe David Borba Manana 4442f45388f3SFilipe David Borba Manana btrfs_release_path(path); 4443803b2f54SStefan Behrens /* 4444803b2f54SStefan Behrens * 1 - subvol uuid item 4445803b2f54SStefan Behrens * 1 - received_subvol uuid item 4446803b2f54SStefan Behrens */ 4447803b2f54SStefan Behrens trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4448803b2f54SStefan Behrens if (IS_ERR(trans)) { 4449803b2f54SStefan Behrens ret = PTR_ERR(trans); 4450803b2f54SStefan Behrens break; 4451803b2f54SStefan Behrens } 4452f45388f3SFilipe David Borba Manana continue; 4453f45388f3SFilipe David Borba Manana } else { 4454f45388f3SFilipe David Borba Manana goto skip; 4455f45388f3SFilipe David Borba Manana } 4456f45388f3SFilipe David Borba Manana update_tree: 44579771a5cfSJosef Bacik btrfs_release_path(path); 4458f45388f3SFilipe David Borba Manana if (!btrfs_is_empty_uuid(root_item.uuid)) { 4459cdb345a8SLu Fengqi ret = btrfs_uuid_tree_add(trans, root_item.uuid, 4460803b2f54SStefan Behrens BTRFS_UUID_KEY_SUBVOL, 4461803b2f54SStefan Behrens key.objectid); 4462803b2f54SStefan Behrens if (ret < 0) { 4463efe120a0SFrank Holton btrfs_warn(fs_info, "uuid_tree_add failed %d", 4464803b2f54SStefan Behrens ret); 4465803b2f54SStefan Behrens break; 4466803b2f54SStefan Behrens } 4467803b2f54SStefan Behrens } 4468803b2f54SStefan Behrens 4469803b2f54SStefan Behrens if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4470cdb345a8SLu Fengqi ret = btrfs_uuid_tree_add(trans, 4471803b2f54SStefan Behrens root_item.received_uuid, 4472803b2f54SStefan Behrens BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4473803b2f54SStefan Behrens key.objectid); 4474803b2f54SStefan Behrens if (ret < 0) { 4475efe120a0SFrank Holton btrfs_warn(fs_info, "uuid_tree_add failed %d", 4476803b2f54SStefan Behrens ret); 4477803b2f54SStefan Behrens break; 4478803b2f54SStefan Behrens } 4479803b2f54SStefan Behrens } 4480803b2f54SStefan Behrens 4481f45388f3SFilipe David Borba Manana skip: 44829771a5cfSJosef Bacik btrfs_release_path(path); 4483803b2f54SStefan Behrens if (trans) { 44843a45bb20SJeff Mahoney ret = btrfs_end_transaction(trans); 4485f45388f3SFilipe David Borba Manana trans = NULL; 4486803b2f54SStefan Behrens if (ret) 4487803b2f54SStefan Behrens break; 
4488803b2f54SStefan Behrens } 4489803b2f54SStefan Behrens 4490803b2f54SStefan Behrens if (key.offset < (u64)-1) { 4491803b2f54SStefan Behrens key.offset++; 4492803b2f54SStefan Behrens } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4493803b2f54SStefan Behrens key.offset = 0; 4494803b2f54SStefan Behrens key.type = BTRFS_ROOT_ITEM_KEY; 4495803b2f54SStefan Behrens } else if (key.objectid < (u64)-1) { 4496803b2f54SStefan Behrens key.offset = 0; 4497803b2f54SStefan Behrens key.type = BTRFS_ROOT_ITEM_KEY; 4498803b2f54SStefan Behrens key.objectid++; 4499803b2f54SStefan Behrens } else { 4500803b2f54SStefan Behrens break; 4501803b2f54SStefan Behrens } 4502803b2f54SStefan Behrens cond_resched(); 4503803b2f54SStefan Behrens } 4504803b2f54SStefan Behrens 4505803b2f54SStefan Behrens out: 4506803b2f54SStefan Behrens btrfs_free_path(path); 4507f45388f3SFilipe David Borba Manana if (trans && !IS_ERR(trans)) 45083a45bb20SJeff Mahoney btrfs_end_transaction(trans); 4509803b2f54SStefan Behrens if (ret) 4510efe120a0SFrank Holton btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); 4511c94bec2cSJosef Bacik else if (!closing) 4512afcdd129SJosef Bacik set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4513803b2f54SStefan Behrens up(&fs_info->uuid_tree_rescan_sem); 4514803b2f54SStefan Behrens return 0; 4515803b2f54SStefan Behrens } 4516803b2f54SStefan Behrens 4517f7a81ea4SStefan Behrens int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4518f7a81ea4SStefan Behrens { 4519f7a81ea4SStefan Behrens struct btrfs_trans_handle *trans; 4520f7a81ea4SStefan Behrens struct btrfs_root *tree_root = fs_info->tree_root; 4521f7a81ea4SStefan Behrens struct btrfs_root *uuid_root; 4522803b2f54SStefan Behrens struct task_struct *task; 4523803b2f54SStefan Behrens int ret; 4524f7a81ea4SStefan Behrens 4525f7a81ea4SStefan Behrens /* 4526f7a81ea4SStefan Behrens * 1 - root node 4527f7a81ea4SStefan Behrens * 1 - root item 4528f7a81ea4SStefan Behrens */ 4529f7a81ea4SStefan Behrens trans = btrfs_start_transaction(tree_root, 2); 4530f7a81ea4SStefan Behrens if (IS_ERR(trans)) 4531f7a81ea4SStefan Behrens return PTR_ERR(trans); 4532f7a81ea4SStefan Behrens 45339b7a2440SDavid Sterba uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); 4534f7a81ea4SStefan Behrens if (IS_ERR(uuid_root)) { 45356d13f549SDavid Sterba ret = PTR_ERR(uuid_root); 453666642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 45373a45bb20SJeff Mahoney btrfs_end_transaction(trans); 45386d13f549SDavid Sterba return ret; 4539f7a81ea4SStefan Behrens } 4540f7a81ea4SStefan Behrens 4541f7a81ea4SStefan Behrens fs_info->uuid_root = uuid_root; 4542f7a81ea4SStefan Behrens 45433a45bb20SJeff Mahoney ret = btrfs_commit_transaction(trans); 4544803b2f54SStefan Behrens if (ret) 4545803b2f54SStefan Behrens return ret; 4546803b2f54SStefan Behrens 4547803b2f54SStefan Behrens down(&fs_info->uuid_tree_rescan_sem); 4548803b2f54SStefan Behrens task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4549803b2f54SStefan Behrens if (IS_ERR(task)) { 455070f80175SStefan Behrens /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4551efe120a0SFrank Holton btrfs_warn(fs_info, "failed to start uuid_scan task"); 4552803b2f54SStefan Behrens up(&fs_info->uuid_tree_rescan_sem); 4553803b2f54SStefan Behrens return PTR_ERR(task); 4554f7a81ea4SStefan Behrens } 4555803b2f54SStefan Behrens 4556803b2f54SStefan Behrens return 0; 4557803b2f54SStefan Behrens } 4558803b2f54SStefan Behrens 45598f18cf13SChris Mason /* 45608f18cf13SChris Mason * shrinking a device means 
finding all of the device extents past 45618f18cf13SChris Mason * the new size, and then following the back refs to the chunks. 45628f18cf13SChris Mason * The chunk relocation code actually frees the device extent 45638f18cf13SChris Mason */ 45648f18cf13SChris Mason int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 45658f18cf13SChris Mason { 45660b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = device->fs_info; 45670b246afaSJeff Mahoney struct btrfs_root *root = fs_info->dev_root; 45688f18cf13SChris Mason struct btrfs_trans_handle *trans; 45698f18cf13SChris Mason struct btrfs_dev_extent *dev_extent = NULL; 45708f18cf13SChris Mason struct btrfs_path *path; 45718f18cf13SChris Mason u64 length; 45728f18cf13SChris Mason u64 chunk_offset; 45738f18cf13SChris Mason int ret; 45748f18cf13SChris Mason int slot; 4575ba1bf481SJosef Bacik int failed = 0; 4576ba1bf481SJosef Bacik bool retried = false; 45778f18cf13SChris Mason struct extent_buffer *l; 45788f18cf13SChris Mason struct btrfs_key key; 45790b246afaSJeff Mahoney struct btrfs_super_block *super_copy = fs_info->super_copy; 45808f18cf13SChris Mason u64 old_total = btrfs_super_total_bytes(super_copy); 45817cc8e58dSMiao Xie u64 old_size = btrfs_device_get_total_bytes(device); 45827dfb8be1SNikolay Borisov u64 diff; 458361d0d0d2SNikolay Borisov u64 start; 45847dfb8be1SNikolay Borisov 45857dfb8be1SNikolay Borisov new_size = round_down(new_size, fs_info->sectorsize); 458661d0d0d2SNikolay Borisov start = new_size; 45870e4324a4SNikolay Borisov diff = round_down(old_size - new_size, fs_info->sectorsize); 45888f18cf13SChris Mason 4589401e29c1SAnand Jain if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 459063a212abSStefan Behrens return -EINVAL; 459163a212abSStefan Behrens 45928f18cf13SChris Mason path = btrfs_alloc_path(); 45938f18cf13SChris Mason if (!path) 45948f18cf13SChris Mason return -ENOMEM; 45958f18cf13SChris Mason 45960338dff6SGu Jinxiang path->reada = READA_BACK; 45978f18cf13SChris Mason 459861d0d0d2SNikolay Borisov trans = btrfs_start_transaction(root, 0); 459961d0d0d2SNikolay Borisov if (IS_ERR(trans)) { 460061d0d0d2SNikolay Borisov btrfs_free_path(path); 460161d0d0d2SNikolay Borisov return PTR_ERR(trans); 460261d0d0d2SNikolay Borisov } 460361d0d0d2SNikolay Borisov 460434441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 46057d9eb12cSChris Mason 46067cc8e58dSMiao Xie btrfs_device_set_total_bytes(device, new_size); 4607ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 46082b82032cSYan Zheng device->fs_devices->total_rw_bytes -= diff; 4609a5ed45f8SNikolay Borisov atomic64_sub(diff, &fs_info->free_chunk_space); 46102bf64758SJosef Bacik } 461161d0d0d2SNikolay Borisov 461261d0d0d2SNikolay Borisov /* 461361d0d0d2SNikolay Borisov * Once the device's size has been set to the new size, ensure all 461461d0d0d2SNikolay Borisov * in-memory chunks are synced to disk so that the loop below sees them 461561d0d0d2SNikolay Borisov * and relocates them accordingly. 
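 * (if contains_pending_extent() reports an allocation overlapping the range
 * being removed, the transaction is committed so the dev extent search below
 * can see it; otherwise it is simply ended)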
461661d0d0d2SNikolay Borisov */ 46171c11b63eSJeff Mahoney if (contains_pending_extent(device, &start, diff)) { 461834441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 461961d0d0d2SNikolay Borisov ret = btrfs_commit_transaction(trans); 462061d0d0d2SNikolay Borisov if (ret) 462161d0d0d2SNikolay Borisov goto done; 462261d0d0d2SNikolay Borisov } else { 462361d0d0d2SNikolay Borisov mutex_unlock(&fs_info->chunk_mutex); 462461d0d0d2SNikolay Borisov btrfs_end_transaction(trans); 462561d0d0d2SNikolay Borisov } 46268f18cf13SChris Mason 4627ba1bf481SJosef Bacik again: 46288f18cf13SChris Mason key.objectid = device->devid; 46298f18cf13SChris Mason key.offset = (u64)-1; 46308f18cf13SChris Mason key.type = BTRFS_DEV_EXTENT_KEY; 46318f18cf13SChris Mason 4632213e64daSIlya Dryomov do { 46330b246afaSJeff Mahoney mutex_lock(&fs_info->delete_unused_bgs_mutex); 46348f18cf13SChris Mason ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 463567c5e7d4SFilipe Manana if (ret < 0) { 46360b246afaSJeff Mahoney mutex_unlock(&fs_info->delete_unused_bgs_mutex); 46378f18cf13SChris Mason goto done; 463867c5e7d4SFilipe Manana } 46398f18cf13SChris Mason 46408f18cf13SChris Mason ret = btrfs_previous_item(root, path, 0, key.type); 464167c5e7d4SFilipe Manana if (ret) 46420b246afaSJeff Mahoney mutex_unlock(&fs_info->delete_unused_bgs_mutex); 46438f18cf13SChris Mason if (ret < 0) 46448f18cf13SChris Mason goto done; 46458f18cf13SChris Mason if (ret) { 46468f18cf13SChris Mason ret = 0; 4647b3b4aa74SDavid Sterba btrfs_release_path(path); 4648bf1fb512SYan Zheng break; 46498f18cf13SChris Mason } 46508f18cf13SChris Mason 46518f18cf13SChris Mason l = path->nodes[0]; 46528f18cf13SChris Mason slot = path->slots[0]; 46538f18cf13SChris Mason btrfs_item_key_to_cpu(l, &key, path->slots[0]); 46548f18cf13SChris Mason 4655ba1bf481SJosef Bacik if (key.objectid != device->devid) { 46560b246afaSJeff Mahoney mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4657b3b4aa74SDavid Sterba btrfs_release_path(path); 4658bf1fb512SYan Zheng break; 4659ba1bf481SJosef Bacik } 46608f18cf13SChris Mason 46618f18cf13SChris Mason dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 46628f18cf13SChris Mason length = btrfs_dev_extent_length(l, dev_extent); 46638f18cf13SChris Mason 4664ba1bf481SJosef Bacik if (key.offset + length <= new_size) { 46650b246afaSJeff Mahoney mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4666b3b4aa74SDavid Sterba btrfs_release_path(path); 4667d6397baeSChris Ball break; 4668ba1bf481SJosef Bacik } 46698f18cf13SChris Mason 46708f18cf13SChris Mason chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4671b3b4aa74SDavid Sterba btrfs_release_path(path); 46728f18cf13SChris Mason 4673a6f93c71SLiu Bo /* 4674a6f93c71SLiu Bo * We may be relocating the only data chunk we have, 4675a6f93c71SLiu Bo * which could potentially end up with losing data's 4676a6f93c71SLiu Bo * raid profile, so lets allocate an empty one in 4677a6f93c71SLiu Bo * advance. 
4678a6f93c71SLiu Bo */ 4679a6f93c71SLiu Bo ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 4680a6f93c71SLiu Bo if (ret < 0) { 4681a6f93c71SLiu Bo mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4682a6f93c71SLiu Bo goto done; 4683a6f93c71SLiu Bo } 4684a6f93c71SLiu Bo 46850b246afaSJeff Mahoney ret = btrfs_relocate_chunk(fs_info, chunk_offset); 46860b246afaSJeff Mahoney mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4687eede2bf3SOmar Sandoval if (ret == -ENOSPC) { 4688ba1bf481SJosef Bacik failed++; 4689eede2bf3SOmar Sandoval } else if (ret) { 4690eede2bf3SOmar Sandoval if (ret == -ETXTBSY) { 4691eede2bf3SOmar Sandoval btrfs_warn(fs_info, 4692eede2bf3SOmar Sandoval "could not shrink block group %llu due to active swapfile", 4693eede2bf3SOmar Sandoval chunk_offset); 4694eede2bf3SOmar Sandoval } 4695eede2bf3SOmar Sandoval goto done; 4696eede2bf3SOmar Sandoval } 4697213e64daSIlya Dryomov } while (key.offset-- > 0); 4698ba1bf481SJosef Bacik 4699ba1bf481SJosef Bacik if (failed && !retried) { 4700ba1bf481SJosef Bacik failed = 0; 4701ba1bf481SJosef Bacik retried = true; 4702ba1bf481SJosef Bacik goto again; 4703ba1bf481SJosef Bacik } else if (failed && retried) { 4704ba1bf481SJosef Bacik ret = -ENOSPC; 47058f18cf13SChris Mason goto done; 47068f18cf13SChris Mason } 47078f18cf13SChris Mason 4708d6397baeSChris Ball /* Shrinking succeeded, else we would be at "done". */ 4709a22285a6SYan, Zheng trans = btrfs_start_transaction(root, 0); 471098d5dc13STsutomu Itoh if (IS_ERR(trans)) { 471198d5dc13STsutomu Itoh ret = PTR_ERR(trans); 471298d5dc13STsutomu Itoh goto done; 471398d5dc13STsutomu Itoh } 471498d5dc13STsutomu Itoh 471534441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 4716c57dd1f2SQu Wenruo /* Clear all state bits beyond the shrunk device size */ 4717c57dd1f2SQu Wenruo clear_extent_bits(&device->alloc_state, new_size, (u64)-1, 4718c57dd1f2SQu Wenruo CHUNK_STATE_MASK); 4719c57dd1f2SQu Wenruo 47207cc8e58dSMiao Xie btrfs_device_set_disk_total_bytes(device, new_size); 4721bbbf7243SNikolay Borisov if (list_empty(&device->post_commit_list)) 4722bbbf7243SNikolay Borisov list_add_tail(&device->post_commit_list, 4723bbbf7243SNikolay Borisov &trans->transaction->dev_update_list); 4724d6397baeSChris Ball 4725d6397baeSChris Ball WARN_ON(diff > old_total); 47267dfb8be1SNikolay Borisov btrfs_set_super_total_bytes(super_copy, 47277dfb8be1SNikolay Borisov round_down(old_total - diff, fs_info->sectorsize)); 472834441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 47292196d6e8SMiao Xie 47302196d6e8SMiao Xie /* Now btrfs_update_device() will change the on-disk size. 
*/ 47312196d6e8SMiao Xie ret = btrfs_update_device(trans, device); 4732801660b0SAnand Jain if (ret < 0) { 4733801660b0SAnand Jain btrfs_abort_transaction(trans, ret); 47343a45bb20SJeff Mahoney btrfs_end_transaction(trans); 4735801660b0SAnand Jain } else { 4736801660b0SAnand Jain ret = btrfs_commit_transaction(trans); 4737801660b0SAnand Jain } 47388f18cf13SChris Mason done: 47398f18cf13SChris Mason btrfs_free_path(path); 474053e489bcSFilipe Manana if (ret) { 474134441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 474253e489bcSFilipe Manana btrfs_device_set_total_bytes(device, old_size); 4743ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 474453e489bcSFilipe Manana device->fs_devices->total_rw_bytes += diff; 4745a5ed45f8SNikolay Borisov atomic64_add(diff, &fs_info->free_chunk_space); 474634441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 474753e489bcSFilipe Manana } 47488f18cf13SChris Mason return ret; 47498f18cf13SChris Mason } 47508f18cf13SChris Mason 47512ff7e61eSJeff Mahoney static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 47520b86a832SChris Mason struct btrfs_key *key, 47530b86a832SChris Mason struct btrfs_chunk *chunk, int item_size) 47540b86a832SChris Mason { 47550b246afaSJeff Mahoney struct btrfs_super_block *super_copy = fs_info->super_copy; 47560b86a832SChris Mason struct btrfs_disk_key disk_key; 47570b86a832SChris Mason u32 array_size; 47580b86a832SChris Mason u8 *ptr; 47590b86a832SChris Mason 476034441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 47610b86a832SChris Mason array_size = btrfs_super_sys_array_size(super_copy); 47625f43f86eSGui Hecheng if (array_size + item_size + sizeof(disk_key) 4763fe48a5c0SMiao Xie > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { 476434441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 47650b86a832SChris Mason return -EFBIG; 4766fe48a5c0SMiao Xie } 47670b86a832SChris Mason 47680b86a832SChris Mason ptr = super_copy->sys_chunk_array + array_size; 47690b86a832SChris Mason btrfs_cpu_key_to_disk(&disk_key, key); 47700b86a832SChris Mason memcpy(ptr, &disk_key, sizeof(disk_key)); 47710b86a832SChris Mason ptr += sizeof(disk_key); 47720b86a832SChris Mason memcpy(ptr, chunk, item_size); 47730b86a832SChris Mason item_size += sizeof(disk_key); 47740b86a832SChris Mason btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 477534441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 4776fe48a5c0SMiao Xie 47770b86a832SChris Mason return 0; 47780b86a832SChris Mason } 47790b86a832SChris Mason 47809f680ce0SChris Mason /* 478173c5de00SArne Jansen * sort the devices in descending order by max_avail, total_avail 47829f680ce0SChris Mason */ 478373c5de00SArne Jansen static int btrfs_cmp_device_info(const void *a, const void *b) 47842b82032cSYan Zheng { 478573c5de00SArne Jansen const struct btrfs_device_info *di_a = a; 478673c5de00SArne Jansen const struct btrfs_device_info *di_b = b; 47872b82032cSYan Zheng 478873c5de00SArne Jansen if (di_a->max_avail > di_b->max_avail) 4789a40a90a0SChris Mason return -1; 479073c5de00SArne Jansen if (di_a->max_avail < di_b->max_avail) 47919b3f68b9SChris Mason return 1; 479273c5de00SArne Jansen if (di_a->total_avail > di_b->total_avail) 479373c5de00SArne Jansen return -1; 479473c5de00SArne Jansen if (di_a->total_avail < di_b->total_avail) 479573c5de00SArne Jansen return 1; 4796b2117a39SMiao Xie return 0; 4797b2117a39SMiao Xie } 4798b2117a39SMiao Xie 479953b381b3SDavid Woodhouse static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 
480053b381b3SDavid Woodhouse { 4801ffe2d203SZhao Lei if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 480253b381b3SDavid Woodhouse return; 480353b381b3SDavid Woodhouse 4804ceda0864SMiao Xie btrfs_set_fs_incompat(info, RAID56); 480553b381b3SDavid Woodhouse } 480653b381b3SDavid Woodhouse 4807cfbb825cSDavid Sterba static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) 4808cfbb825cSDavid Sterba { 4809cfbb825cSDavid Sterba if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) 4810cfbb825cSDavid Sterba return; 4811cfbb825cSDavid Sterba 4812cfbb825cSDavid Sterba btrfs_set_fs_incompat(info, RAID1C34); 4813cfbb825cSDavid Sterba } 4814cfbb825cSDavid Sterba 48154f2bafe8SNaohiro Aota /* 48164f2bafe8SNaohiro Aota * Structure used internally for __btrfs_alloc_chunk() function. 48174f2bafe8SNaohiro Aota * Wraps needed parameters. 48184f2bafe8SNaohiro Aota */ 48194f2bafe8SNaohiro Aota struct alloc_chunk_ctl { 48204f2bafe8SNaohiro Aota u64 start; 48214f2bafe8SNaohiro Aota u64 type; 48224f2bafe8SNaohiro Aota /* Total number of stripes to allocate */ 48234f2bafe8SNaohiro Aota int num_stripes; 48244f2bafe8SNaohiro Aota /* sub_stripes info for map */ 48254f2bafe8SNaohiro Aota int sub_stripes; 48264f2bafe8SNaohiro Aota /* Stripes per device */ 48274f2bafe8SNaohiro Aota int dev_stripes; 48284f2bafe8SNaohiro Aota /* Maximum number of devices to use */ 48294f2bafe8SNaohiro Aota int devs_max; 48304f2bafe8SNaohiro Aota /* Minimum number of devices to use */ 48314f2bafe8SNaohiro Aota int devs_min; 48324f2bafe8SNaohiro Aota /* ndevs has to be a multiple of this */ 48334f2bafe8SNaohiro Aota int devs_increment; 48344f2bafe8SNaohiro Aota /* Number of copies */ 48354f2bafe8SNaohiro Aota int ncopies; 48364f2bafe8SNaohiro Aota /* Number of stripes worth of bytes to store parity information */ 48374f2bafe8SNaohiro Aota int nparity; 48384f2bafe8SNaohiro Aota u64 max_stripe_size; 48394f2bafe8SNaohiro Aota u64 max_chunk_size; 48406aafb303SNaohiro Aota u64 dev_extent_min; 48414f2bafe8SNaohiro Aota u64 stripe_size; 48424f2bafe8SNaohiro Aota u64 chunk_size; 48434f2bafe8SNaohiro Aota int ndevs; 48444f2bafe8SNaohiro Aota }; 48454f2bafe8SNaohiro Aota 484627c314d5SNaohiro Aota static void init_alloc_chunk_ctl_policy_regular( 484727c314d5SNaohiro Aota struct btrfs_fs_devices *fs_devices, 484827c314d5SNaohiro Aota struct alloc_chunk_ctl *ctl) 484927c314d5SNaohiro Aota { 485027c314d5SNaohiro Aota u64 type = ctl->type; 485127c314d5SNaohiro Aota 485227c314d5SNaohiro Aota if (type & BTRFS_BLOCK_GROUP_DATA) { 485327c314d5SNaohiro Aota ctl->max_stripe_size = SZ_1G; 485427c314d5SNaohiro Aota ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; 485527c314d5SNaohiro Aota } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 485627c314d5SNaohiro Aota /* For larger filesystems, use larger metadata chunks */ 485727c314d5SNaohiro Aota if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) 485827c314d5SNaohiro Aota ctl->max_stripe_size = SZ_1G; 485927c314d5SNaohiro Aota else 486027c314d5SNaohiro Aota ctl->max_stripe_size = SZ_256M; 486127c314d5SNaohiro Aota ctl->max_chunk_size = ctl->max_stripe_size; 486227c314d5SNaohiro Aota } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 486327c314d5SNaohiro Aota ctl->max_stripe_size = SZ_32M; 486427c314d5SNaohiro Aota ctl->max_chunk_size = 2 * ctl->max_stripe_size; 486527c314d5SNaohiro Aota ctl->devs_max = min_t(int, ctl->devs_max, 486627c314d5SNaohiro Aota BTRFS_MAX_DEVS_SYS_CHUNK); 486727c314d5SNaohiro Aota } else { 486827c314d5SNaohiro Aota BUG(); 486927c314d5SNaohiro Aota } 
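	/*
	 * To illustrate the limits chosen above: data chunks use up to 1GiB
	 * stripes and at most BTRFS_MAX_DATA_CHUNK_SIZE in total, metadata
	 * uses 256MiB or 1GiB stripes depending on the filesystem size, and
	 * system chunks stay small (32MiB stripes, at most 64MiB per chunk)
	 * since they only carry the chunk tree.
	 */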
487027c314d5SNaohiro Aota 487127c314d5SNaohiro Aota /* We don't want a chunk larger than 10% of writable space */ 487227c314d5SNaohiro Aota ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 487327c314d5SNaohiro Aota ctl->max_chunk_size); 48746aafb303SNaohiro Aota ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; 487527c314d5SNaohiro Aota } 487627c314d5SNaohiro Aota 487727c314d5SNaohiro Aota static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, 487827c314d5SNaohiro Aota struct alloc_chunk_ctl *ctl) 487927c314d5SNaohiro Aota { 488027c314d5SNaohiro Aota int index = btrfs_bg_flags_to_raid_index(ctl->type); 488127c314d5SNaohiro Aota 488227c314d5SNaohiro Aota ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; 488327c314d5SNaohiro Aota ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; 488427c314d5SNaohiro Aota ctl->devs_max = btrfs_raid_array[index].devs_max; 488527c314d5SNaohiro Aota if (!ctl->devs_max) 488627c314d5SNaohiro Aota ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); 488727c314d5SNaohiro Aota ctl->devs_min = btrfs_raid_array[index].devs_min; 488827c314d5SNaohiro Aota ctl->devs_increment = btrfs_raid_array[index].devs_increment; 488927c314d5SNaohiro Aota ctl->ncopies = btrfs_raid_array[index].ncopies; 489027c314d5SNaohiro Aota ctl->nparity = btrfs_raid_array[index].nparity; 489127c314d5SNaohiro Aota ctl->ndevs = 0; 489227c314d5SNaohiro Aota 489327c314d5SNaohiro Aota switch (fs_devices->chunk_alloc_policy) { 489427c314d5SNaohiro Aota case BTRFS_CHUNK_ALLOC_REGULAR: 489527c314d5SNaohiro Aota init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); 489627c314d5SNaohiro Aota break; 489727c314d5SNaohiro Aota default: 489827c314d5SNaohiro Aota BUG(); 489927c314d5SNaohiro Aota } 490027c314d5SNaohiro Aota } 490127c314d5SNaohiro Aota 4902560156cbSNaohiro Aota static int gather_device_info(struct btrfs_fs_devices *fs_devices, 4903560156cbSNaohiro Aota struct alloc_chunk_ctl *ctl, 4904560156cbSNaohiro Aota struct btrfs_device_info *devices_info) 4905560156cbSNaohiro Aota { 4906560156cbSNaohiro Aota struct btrfs_fs_info *info = fs_devices->fs_info; 4907560156cbSNaohiro Aota struct btrfs_device *device; 4908560156cbSNaohiro Aota u64 total_avail; 4909560156cbSNaohiro Aota u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; 4910560156cbSNaohiro Aota int ret; 4911560156cbSNaohiro Aota int ndevs = 0; 4912560156cbSNaohiro Aota u64 max_avail; 4913560156cbSNaohiro Aota u64 dev_offset; 4914560156cbSNaohiro Aota 4915560156cbSNaohiro Aota /* 4916560156cbSNaohiro Aota * in the first pass through the devices list, we gather information 4917560156cbSNaohiro Aota * about the available holes on each device. 
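	 *
	 * A hole is a span of the device not covered by any device extent;
	 * find_free_dev_extent() reports the largest usable one per device
	 * so the sizing code can later pick a stripe size that every chosen
	 * device can actually hold.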
4918560156cbSNaohiro Aota */ 4919560156cbSNaohiro Aota list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 4920560156cbSNaohiro Aota if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4921560156cbSNaohiro Aota WARN(1, KERN_ERR 4922560156cbSNaohiro Aota "BTRFS: read-only device in alloc_list\n"); 4923560156cbSNaohiro Aota continue; 4924560156cbSNaohiro Aota } 4925560156cbSNaohiro Aota 4926560156cbSNaohiro Aota if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 4927560156cbSNaohiro Aota &device->dev_state) || 4928560156cbSNaohiro Aota test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4929560156cbSNaohiro Aota continue; 4930560156cbSNaohiro Aota 4931560156cbSNaohiro Aota if (device->total_bytes > device->bytes_used) 4932560156cbSNaohiro Aota total_avail = device->total_bytes - device->bytes_used; 4933560156cbSNaohiro Aota else 4934560156cbSNaohiro Aota total_avail = 0; 4935560156cbSNaohiro Aota 4936560156cbSNaohiro Aota /* If there is no space on this device, skip it. */ 49376aafb303SNaohiro Aota if (total_avail < ctl->dev_extent_min) 4938560156cbSNaohiro Aota continue; 4939560156cbSNaohiro Aota 4940560156cbSNaohiro Aota ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, 4941560156cbSNaohiro Aota &max_avail); 4942560156cbSNaohiro Aota if (ret && ret != -ENOSPC) 4943560156cbSNaohiro Aota return ret; 4944560156cbSNaohiro Aota 4945560156cbSNaohiro Aota if (ret == 0) 4946560156cbSNaohiro Aota max_avail = dev_extent_want; 4947560156cbSNaohiro Aota 49486aafb303SNaohiro Aota if (max_avail < ctl->dev_extent_min) { 4949560156cbSNaohiro Aota if (btrfs_test_opt(info, ENOSPC_DEBUG)) 4950560156cbSNaohiro Aota btrfs_debug(info, 4951560156cbSNaohiro Aota "%s: devid %llu has no free space, have=%llu want=%llu", 4952560156cbSNaohiro Aota __func__, device->devid, max_avail, 49536aafb303SNaohiro Aota ctl->dev_extent_min); 4954560156cbSNaohiro Aota continue; 4955560156cbSNaohiro Aota } 4956560156cbSNaohiro Aota 4957560156cbSNaohiro Aota if (ndevs == fs_devices->rw_devices) { 4958560156cbSNaohiro Aota WARN(1, "%s: found more than %llu devices\n", 4959560156cbSNaohiro Aota __func__, fs_devices->rw_devices); 4960560156cbSNaohiro Aota break; 4961560156cbSNaohiro Aota } 4962560156cbSNaohiro Aota devices_info[ndevs].dev_offset = dev_offset; 4963560156cbSNaohiro Aota devices_info[ndevs].max_avail = max_avail; 4964560156cbSNaohiro Aota devices_info[ndevs].total_avail = total_avail; 4965560156cbSNaohiro Aota devices_info[ndevs].dev = device; 4966560156cbSNaohiro Aota ++ndevs; 4967560156cbSNaohiro Aota } 4968560156cbSNaohiro Aota ctl->ndevs = ndevs; 4969560156cbSNaohiro Aota 4970560156cbSNaohiro Aota /* 4971560156cbSNaohiro Aota * now sort the devices by hole size / available space 4972560156cbSNaohiro Aota */ 4973560156cbSNaohiro Aota sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 4974560156cbSNaohiro Aota btrfs_cmp_device_info, NULL); 4975560156cbSNaohiro Aota 4976560156cbSNaohiro Aota return 0; 4977560156cbSNaohiro Aota } 4978560156cbSNaohiro Aota 49795badf512SNaohiro Aota static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, 49805badf512SNaohiro Aota struct btrfs_device_info *devices_info) 49815badf512SNaohiro Aota { 49825badf512SNaohiro Aota /* Number of stripes that count for block group size */ 49835badf512SNaohiro Aota int data_stripes; 49845badf512SNaohiro Aota 49855badf512SNaohiro Aota /* 49865badf512SNaohiro Aota * The primary goal is to maximize the number of stripes, so use as 49875badf512SNaohiro Aota * many devices as possible, even 
if the stripes are not maximum sized. 49885badf512SNaohiro Aota * 49895badf512SNaohiro Aota * The DUP profile stores more than one stripe per device, the 49905badf512SNaohiro Aota * max_avail is the total size so we have to adjust. 49915badf512SNaohiro Aota */ 49925badf512SNaohiro Aota ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, 49935badf512SNaohiro Aota ctl->dev_stripes); 49945badf512SNaohiro Aota ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 49955badf512SNaohiro Aota 49965badf512SNaohiro Aota /* This will have to be fixed for RAID1 and RAID10 over more drives */ 49975badf512SNaohiro Aota data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 49985badf512SNaohiro Aota 49995badf512SNaohiro Aota /* 50005badf512SNaohiro Aota * Use the number of data stripes to figure out how big this chunk is 50015badf512SNaohiro Aota * really going to be in terms of logical address space, and compare 50025badf512SNaohiro Aota * that answer with the max chunk size. If it's higher, we try to 50035badf512SNaohiro Aota * reduce stripe_size. 50045badf512SNaohiro Aota */ 50055badf512SNaohiro Aota if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 50065badf512SNaohiro Aota /* 50075badf512SNaohiro Aota * Reduce stripe_size, round it up to a 16MB boundary again and 50085badf512SNaohiro Aota * then use it, unless it ends up being even bigger than the 50095badf512SNaohiro Aota * previous value we had already. 50105badf512SNaohiro Aota */ 50115badf512SNaohiro Aota ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, 50125badf512SNaohiro Aota data_stripes), SZ_16M), 50135badf512SNaohiro Aota ctl->stripe_size); 50145badf512SNaohiro Aota } 50155badf512SNaohiro Aota 50165badf512SNaohiro Aota /* Align to BTRFS_STRIPE_LEN */ 50175badf512SNaohiro Aota ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); 50185badf512SNaohiro Aota ctl->chunk_size = ctl->stripe_size * data_stripes; 50195badf512SNaohiro Aota 50205badf512SNaohiro Aota return 0; 50215badf512SNaohiro Aota } 50225badf512SNaohiro Aota 50235badf512SNaohiro Aota static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, 50245badf512SNaohiro Aota struct alloc_chunk_ctl *ctl, 50255badf512SNaohiro Aota struct btrfs_device_info *devices_info) 50265badf512SNaohiro Aota { 50275badf512SNaohiro Aota struct btrfs_fs_info *info = fs_devices->fs_info; 50285badf512SNaohiro Aota 50295badf512SNaohiro Aota /* 50305badf512SNaohiro Aota * Round down to number of usable stripes, devs_increment can be any 50315badf512SNaohiro Aota * number so we can't use round_down() that requires power of 2, while 50325badf512SNaohiro Aota * rounddown is safe. 
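	 *
	 * For example, RAID10 has a devs_increment of 2, so 5 usable devices
	 * are rounded down to 4 here.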
50335badf512SNaohiro Aota */ 50345badf512SNaohiro Aota ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); 50355badf512SNaohiro Aota 50365badf512SNaohiro Aota if (ctl->ndevs < ctl->devs_min) { 50375badf512SNaohiro Aota if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 50385badf512SNaohiro Aota btrfs_debug(info, 50395badf512SNaohiro Aota "%s: not enough devices with free space: have=%d minimum required=%d", 50405badf512SNaohiro Aota __func__, ctl->ndevs, ctl->devs_min); 50415badf512SNaohiro Aota } 50425badf512SNaohiro Aota return -ENOSPC; 50435badf512SNaohiro Aota } 50445badf512SNaohiro Aota 50455badf512SNaohiro Aota ctl->ndevs = min(ctl->ndevs, ctl->devs_max); 50465badf512SNaohiro Aota 50475badf512SNaohiro Aota switch (fs_devices->chunk_alloc_policy) { 50485badf512SNaohiro Aota case BTRFS_CHUNK_ALLOC_REGULAR: 50495badf512SNaohiro Aota return decide_stripe_size_regular(ctl, devices_info); 50505badf512SNaohiro Aota default: 50515badf512SNaohiro Aota BUG(); 50525badf512SNaohiro Aota } 50535badf512SNaohiro Aota } 50545badf512SNaohiro Aota 5055dce580caSNaohiro Aota static int create_chunk(struct btrfs_trans_handle *trans, 5056dce580caSNaohiro Aota struct alloc_chunk_ctl *ctl, 5057dce580caSNaohiro Aota struct btrfs_device_info *devices_info) 5058dce580caSNaohiro Aota { 5059dce580caSNaohiro Aota struct btrfs_fs_info *info = trans->fs_info; 5060dce580caSNaohiro Aota struct map_lookup *map = NULL; 5061dce580caSNaohiro Aota struct extent_map_tree *em_tree; 5062dce580caSNaohiro Aota struct extent_map *em; 5063dce580caSNaohiro Aota u64 start = ctl->start; 5064dce580caSNaohiro Aota u64 type = ctl->type; 5065dce580caSNaohiro Aota int ret; 5066dce580caSNaohiro Aota int i; 5067dce580caSNaohiro Aota int j; 5068dce580caSNaohiro Aota 5069dce580caSNaohiro Aota map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); 5070dce580caSNaohiro Aota if (!map) 5071dce580caSNaohiro Aota return -ENOMEM; 5072dce580caSNaohiro Aota map->num_stripes = ctl->num_stripes; 5073dce580caSNaohiro Aota 5074dce580caSNaohiro Aota for (i = 0; i < ctl->ndevs; ++i) { 5075dce580caSNaohiro Aota for (j = 0; j < ctl->dev_stripes; ++j) { 5076dce580caSNaohiro Aota int s = i * ctl->dev_stripes + j; 5077dce580caSNaohiro Aota map->stripes[s].dev = devices_info[i].dev; 5078dce580caSNaohiro Aota map->stripes[s].physical = devices_info[i].dev_offset + 5079dce580caSNaohiro Aota j * ctl->stripe_size; 5080dce580caSNaohiro Aota } 5081dce580caSNaohiro Aota } 5082dce580caSNaohiro Aota map->stripe_len = BTRFS_STRIPE_LEN; 5083dce580caSNaohiro Aota map->io_align = BTRFS_STRIPE_LEN; 5084dce580caSNaohiro Aota map->io_width = BTRFS_STRIPE_LEN; 5085dce580caSNaohiro Aota map->type = type; 5086dce580caSNaohiro Aota map->sub_stripes = ctl->sub_stripes; 5087dce580caSNaohiro Aota 5088dce580caSNaohiro Aota trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); 5089dce580caSNaohiro Aota 5090dce580caSNaohiro Aota em = alloc_extent_map(); 5091dce580caSNaohiro Aota if (!em) { 5092dce580caSNaohiro Aota kfree(map); 5093dce580caSNaohiro Aota return -ENOMEM; 5094dce580caSNaohiro Aota } 5095dce580caSNaohiro Aota set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 5096dce580caSNaohiro Aota em->map_lookup = map; 5097dce580caSNaohiro Aota em->start = start; 5098dce580caSNaohiro Aota em->len = ctl->chunk_size; 5099dce580caSNaohiro Aota em->block_start = 0; 5100dce580caSNaohiro Aota em->block_len = em->len; 5101dce580caSNaohiro Aota em->orig_block_len = ctl->stripe_size; 5102dce580caSNaohiro Aota 5103dce580caSNaohiro Aota em_tree = &info->mapping_tree; 5104dce580caSNaohiro Aota 
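	/*
	 * Publish the new chunk in the in-memory logical->physical mapping
	 * tree first; the chunk item itself is only written to the chunk
	 * tree later, in btrfs_finish_chunk_alloc().
	 */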
write_lock(&em_tree->lock); 5105dce580caSNaohiro Aota ret = add_extent_mapping(em_tree, em, 0); 5106dce580caSNaohiro Aota if (ret) { 5107dce580caSNaohiro Aota write_unlock(&em_tree->lock); 5108dce580caSNaohiro Aota free_extent_map(em); 5109dce580caSNaohiro Aota return ret; 5110dce580caSNaohiro Aota } 5111dce580caSNaohiro Aota write_unlock(&em_tree->lock); 5112dce580caSNaohiro Aota 5113dce580caSNaohiro Aota ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); 5114dce580caSNaohiro Aota if (ret) 5115dce580caSNaohiro Aota goto error_del_extent; 5116dce580caSNaohiro Aota 5117dce580caSNaohiro Aota for (i = 0; i < map->num_stripes; i++) { 5118dce580caSNaohiro Aota struct btrfs_device *dev = map->stripes[i].dev; 5119dce580caSNaohiro Aota 5120dce580caSNaohiro Aota btrfs_device_set_bytes_used(dev, 5121dce580caSNaohiro Aota dev->bytes_used + ctl->stripe_size); 5122dce580caSNaohiro Aota if (list_empty(&dev->post_commit_list)) 5123dce580caSNaohiro Aota list_add_tail(&dev->post_commit_list, 5124dce580caSNaohiro Aota &trans->transaction->dev_update_list); 5125dce580caSNaohiro Aota } 5126dce580caSNaohiro Aota 5127dce580caSNaohiro Aota atomic64_sub(ctl->stripe_size * map->num_stripes, 5128dce580caSNaohiro Aota &info->free_chunk_space); 5129dce580caSNaohiro Aota 5130dce580caSNaohiro Aota free_extent_map(em); 5131dce580caSNaohiro Aota check_raid56_incompat_flag(info, type); 5132dce580caSNaohiro Aota check_raid1c34_incompat_flag(info, type); 5133dce580caSNaohiro Aota 5134dce580caSNaohiro Aota return 0; 5135dce580caSNaohiro Aota 5136dce580caSNaohiro Aota error_del_extent: 5137dce580caSNaohiro Aota write_lock(&em_tree->lock); 5138dce580caSNaohiro Aota remove_extent_mapping(em_tree, em); 5139dce580caSNaohiro Aota write_unlock(&em_tree->lock); 5140dce580caSNaohiro Aota 5141dce580caSNaohiro Aota /* One for our allocation */ 5142dce580caSNaohiro Aota free_extent_map(em); 5143dce580caSNaohiro Aota /* One for the tree reference */ 5144dce580caSNaohiro Aota free_extent_map(em); 5145dce580caSNaohiro Aota 5146dce580caSNaohiro Aota return ret; 5147dce580caSNaohiro Aota } 5148dce580caSNaohiro Aota 514911c67b1aSNikolay Borisov int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) 5150b2117a39SMiao Xie { 51512ff7e61eSJeff Mahoney struct btrfs_fs_info *info = trans->fs_info; 5152b2117a39SMiao Xie struct btrfs_fs_devices *fs_devices = info->fs_devices; 515373c5de00SArne Jansen struct btrfs_device_info *devices_info = NULL; 51544f2bafe8SNaohiro Aota struct alloc_chunk_ctl ctl; 5155b2117a39SMiao Xie int ret; 5156b2117a39SMiao Xie 515711c67b1aSNikolay Borisov lockdep_assert_held(&info->chunk_mutex); 515811c67b1aSNikolay Borisov 5159b25c19f4SNaohiro Aota if (!alloc_profile_is_valid(type, 0)) { 5160b25c19f4SNaohiro Aota ASSERT(0); 5161b25c19f4SNaohiro Aota return -EINVAL; 5162b25c19f4SNaohiro Aota } 516373c5de00SArne Jansen 51644117f207SQu Wenruo if (list_empty(&fs_devices->alloc_list)) { 51654117f207SQu Wenruo if (btrfs_test_opt(info, ENOSPC_DEBUG)) 51664117f207SQu Wenruo btrfs_debug(info, "%s: no writable device", __func__); 5167b2117a39SMiao Xie return -ENOSPC; 51684117f207SQu Wenruo } 5169b2117a39SMiao Xie 517027c314d5SNaohiro Aota if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 517127c314d5SNaohiro Aota btrfs_err(info, "invalid chunk type 0x%llx requested", type); 517227c314d5SNaohiro Aota ASSERT(0); 517327c314d5SNaohiro Aota return -EINVAL; 517473c5de00SArne Jansen } 517573c5de00SArne Jansen 517611c67b1aSNikolay Borisov ctl.start = find_next_chunk(info); 517727c314d5SNaohiro Aota ctl.type = type; 
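	/*
	 * ctl is filled in from the raid attributes and the chunk allocation
	 * policy, then the allocation proceeds in three steps:
	 * gather_device_info(), decide_stripe_size() and create_chunk().
	 */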
517827c314d5SNaohiro Aota init_alloc_chunk_ctl(fs_devices, &ctl); 5179b2117a39SMiao Xie 518031e818feSDavid Sterba devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 5181b2117a39SMiao Xie GFP_NOFS); 5182b2117a39SMiao Xie if (!devices_info) 5183b2117a39SMiao Xie return -ENOMEM; 5184b2117a39SMiao Xie 5185560156cbSNaohiro Aota ret = gather_device_info(fs_devices, &ctl, devices_info); 5186560156cbSNaohiro Aota if (ret < 0) 5187dce580caSNaohiro Aota goto out; 518873c5de00SArne Jansen 51895badf512SNaohiro Aota ret = decide_stripe_size(fs_devices, &ctl, devices_info); 51905badf512SNaohiro Aota if (ret < 0) 5191dce580caSNaohiro Aota goto out; 519273c5de00SArne Jansen 5193dce580caSNaohiro Aota ret = create_chunk(trans, &ctl, devices_info); 51949b3f68b9SChris Mason 5195dce580caSNaohiro Aota out: 5196b2117a39SMiao Xie kfree(devices_info); 5197b2117a39SMiao Xie return ret; 51982b82032cSYan Zheng } 51992b82032cSYan Zheng 520011c67b1aSNikolay Borisov /* 520111c67b1aSNikolay Borisov * Chunk allocation falls into two parts. The first part does work 520211c67b1aSNikolay Borisov * that makes the new allocated chunk usable, but does not do any operation 520311c67b1aSNikolay Borisov * that modifies the chunk tree. The second part does the work that 520411c67b1aSNikolay Borisov * requires modifying the chunk tree. This division is important for the 520511c67b1aSNikolay Borisov * bootstrap process of adding storage to a seed btrfs. 520611c67b1aSNikolay Borisov */ 52076df9a95eSJosef Bacik int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, 52086df9a95eSJosef Bacik u64 chunk_offset, u64 chunk_size) 52092b82032cSYan Zheng { 521097aff912SNikolay Borisov struct btrfs_fs_info *fs_info = trans->fs_info; 52116bccf3abSJeff Mahoney struct btrfs_root *extent_root = fs_info->extent_root; 52126bccf3abSJeff Mahoney struct btrfs_root *chunk_root = fs_info->chunk_root; 52132b82032cSYan Zheng struct btrfs_key key; 52142b82032cSYan Zheng struct btrfs_device *device; 52152b82032cSYan Zheng struct btrfs_chunk *chunk; 52162b82032cSYan Zheng struct btrfs_stripe *stripe; 52176df9a95eSJosef Bacik struct extent_map *em; 52186df9a95eSJosef Bacik struct map_lookup *map; 52196df9a95eSJosef Bacik size_t item_size; 52206df9a95eSJosef Bacik u64 dev_offset; 52216df9a95eSJosef Bacik u64 stripe_size; 52226df9a95eSJosef Bacik int i = 0; 5223140e639fSChris Mason int ret = 0; 52242b82032cSYan Zheng 522560ca842eSOmar Sandoval em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); 5226592d92eeSLiu Bo if (IS_ERR(em)) 5227592d92eeSLiu Bo return PTR_ERR(em); 52286df9a95eSJosef Bacik 522995617d69SJeff Mahoney map = em->map_lookup; 52306df9a95eSJosef Bacik item_size = btrfs_chunk_item_size(map->num_stripes); 52316df9a95eSJosef Bacik stripe_size = em->orig_block_len; 52326df9a95eSJosef Bacik 52336df9a95eSJosef Bacik chunk = kzalloc(item_size, GFP_NOFS); 52346df9a95eSJosef Bacik if (!chunk) { 52356df9a95eSJosef Bacik ret = -ENOMEM; 52366df9a95eSJosef Bacik goto out; 52376df9a95eSJosef Bacik } 52386df9a95eSJosef Bacik 523950460e37SFilipe Manana /* 524050460e37SFilipe Manana * Take the device list mutex to prevent races with the final phase of 524150460e37SFilipe Manana * a device replace operation that replaces the device object associated 524250460e37SFilipe Manana * with the map's stripes, because the device object's id can change 524350460e37SFilipe Manana * at any time during that final phase of the device replace operation 524450460e37SFilipe Manana * (dev-replace.c:btrfs_dev_replace_finishing()). 
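	 *
	 * The mutex is held across both loops below, so the device items are
	 * updated and every stripe's devid and offset are copied into the
	 * chunk item against a stable device list.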
524550460e37SFilipe Manana */ 52460b246afaSJeff Mahoney mutex_lock(&fs_info->fs_devices->device_list_mutex); 52476df9a95eSJosef Bacik for (i = 0; i < map->num_stripes; i++) { 52486df9a95eSJosef Bacik device = map->stripes[i].dev; 52496df9a95eSJosef Bacik dev_offset = map->stripes[i].physical; 52506df9a95eSJosef Bacik 52512b82032cSYan Zheng ret = btrfs_update_device(trans, device); 52523acd3953SMark Fasheh if (ret) 525350460e37SFilipe Manana break; 5254b5d9071cSNikolay Borisov ret = btrfs_alloc_dev_extent(trans, device, chunk_offset, 5255b5d9071cSNikolay Borisov dev_offset, stripe_size); 52566df9a95eSJosef Bacik if (ret) 525750460e37SFilipe Manana break; 525850460e37SFilipe Manana } 525950460e37SFilipe Manana if (ret) { 52600b246afaSJeff Mahoney mutex_unlock(&fs_info->fs_devices->device_list_mutex); 52616df9a95eSJosef Bacik goto out; 52622b82032cSYan Zheng } 52632b82032cSYan Zheng 52642b82032cSYan Zheng stripe = &chunk->stripe; 52656df9a95eSJosef Bacik for (i = 0; i < map->num_stripes; i++) { 52666df9a95eSJosef Bacik device = map->stripes[i].dev; 52676df9a95eSJosef Bacik dev_offset = map->stripes[i].physical; 52682b82032cSYan Zheng 52692b82032cSYan Zheng btrfs_set_stack_stripe_devid(stripe, device->devid); 52702b82032cSYan Zheng btrfs_set_stack_stripe_offset(stripe, dev_offset); 52712b82032cSYan Zheng memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 52722b82032cSYan Zheng stripe++; 52732b82032cSYan Zheng } 52740b246afaSJeff Mahoney mutex_unlock(&fs_info->fs_devices->device_list_mutex); 52752b82032cSYan Zheng 52762b82032cSYan Zheng btrfs_set_stack_chunk_length(chunk, chunk_size); 52772b82032cSYan Zheng btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 52782b82032cSYan Zheng btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 52792b82032cSYan Zheng btrfs_set_stack_chunk_type(chunk, map->type); 52802b82032cSYan Zheng btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 52812b82032cSYan Zheng btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 52822b82032cSYan Zheng btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 52830b246afaSJeff Mahoney btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 52842b82032cSYan Zheng btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 52852b82032cSYan Zheng 52862b82032cSYan Zheng key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 52872b82032cSYan Zheng key.type = BTRFS_CHUNK_ITEM_KEY; 52882b82032cSYan Zheng key.offset = chunk_offset; 52892b82032cSYan Zheng 52902b82032cSYan Zheng ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 52914ed1d16eSMark Fasheh if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 52924ed1d16eSMark Fasheh /* 52934ed1d16eSMark Fasheh * TODO: Cleanup of inserted chunk root in case of 52944ed1d16eSMark Fasheh * failure. 
52954ed1d16eSMark Fasheh */ 52962ff7e61eSJeff Mahoney ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 52972b82032cSYan Zheng } 52981abe9b8aSliubo 52996df9a95eSJosef Bacik out: 53002b82032cSYan Zheng kfree(chunk); 53016df9a95eSJosef Bacik free_extent_map(em); 53024ed1d16eSMark Fasheh return ret; 53032b82032cSYan Zheng } 53042b82032cSYan Zheng 53056f8e0fc7SDavid Sterba static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) 53062b82032cSYan Zheng { 53076f8e0fc7SDavid Sterba struct btrfs_fs_info *fs_info = trans->fs_info; 53082b82032cSYan Zheng u64 alloc_profile; 53092b82032cSYan Zheng int ret; 53102b82032cSYan Zheng 53111b86826dSJeff Mahoney alloc_profile = btrfs_metadata_alloc_profile(fs_info); 531211c67b1aSNikolay Borisov ret = btrfs_alloc_chunk(trans, alloc_profile); 531379787eaaSJeff Mahoney if (ret) 531479787eaaSJeff Mahoney return ret; 53152b82032cSYan Zheng 53161b86826dSJeff Mahoney alloc_profile = btrfs_system_alloc_profile(fs_info); 531711c67b1aSNikolay Borisov ret = btrfs_alloc_chunk(trans, alloc_profile); 531879787eaaSJeff Mahoney return ret; 53192b82032cSYan Zheng } 53202b82032cSYan Zheng 5321d20983b4SMiao Xie static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5322d20983b4SMiao Xie { 5323fc9a2ac7SDavid Sterba const int index = btrfs_bg_flags_to_raid_index(map->type); 5324d20983b4SMiao Xie 5325fc9a2ac7SDavid Sterba return btrfs_raid_array[index].tolerated_failures; 53262b82032cSYan Zheng } 53272b82032cSYan Zheng 53282ff7e61eSJeff Mahoney int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) 53292b82032cSYan Zheng { 53302b82032cSYan Zheng struct extent_map *em; 53312b82032cSYan Zheng struct map_lookup *map; 53322b82032cSYan Zheng int readonly = 0; 5333d20983b4SMiao Xie int miss_ndevs = 0; 53342b82032cSYan Zheng int i; 53352b82032cSYan Zheng 533660ca842eSOmar Sandoval em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 5337592d92eeSLiu Bo if (IS_ERR(em)) 53382b82032cSYan Zheng return 1; 53392b82032cSYan Zheng 534095617d69SJeff Mahoney map = em->map_lookup; 53412b82032cSYan Zheng for (i = 0; i < map->num_stripes; i++) { 5342e6e674bdSAnand Jain if (test_bit(BTRFS_DEV_STATE_MISSING, 5343e6e674bdSAnand Jain &map->stripes[i].dev->dev_state)) { 5344d20983b4SMiao Xie miss_ndevs++; 5345d20983b4SMiao Xie continue; 5346d20983b4SMiao Xie } 5347ebbede42SAnand Jain if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5348ebbede42SAnand Jain &map->stripes[i].dev->dev_state)) { 53492b82032cSYan Zheng readonly = 1; 5350d20983b4SMiao Xie goto end; 53512b82032cSYan Zheng } 53522b82032cSYan Zheng } 5353d20983b4SMiao Xie 5354d20983b4SMiao Xie /* 5355d20983b4SMiao Xie * If the number of missing devices is larger than max errors, 5356d20983b4SMiao Xie * we can not write the data into that chunk successfully, so 5357d20983b4SMiao Xie * set it readonly. 
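	 *
	 * For example, a RAID1 chunk tolerates a single missing device, so
	 * if both of its stripes sit on missing devices the chunk is
	 * reported read-only.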
5358d20983b4SMiao Xie */ 5359d20983b4SMiao Xie if (miss_ndevs > btrfs_chunk_max_errors(map)) 5360d20983b4SMiao Xie readonly = 1; 5361d20983b4SMiao Xie end: 53622b82032cSYan Zheng free_extent_map(em); 53632b82032cSYan Zheng return readonly; 53640b86a832SChris Mason } 53650b86a832SChris Mason 5366c8bf1b67SDavid Sterba void btrfs_mapping_tree_free(struct extent_map_tree *tree) 53670b86a832SChris Mason { 53680b86a832SChris Mason struct extent_map *em; 53690b86a832SChris Mason 53700b86a832SChris Mason while (1) { 5371c8bf1b67SDavid Sterba write_lock(&tree->lock); 5372c8bf1b67SDavid Sterba em = lookup_extent_mapping(tree, 0, (u64)-1); 53730b86a832SChris Mason if (em) 5374c8bf1b67SDavid Sterba remove_extent_mapping(tree, em); 5375c8bf1b67SDavid Sterba write_unlock(&tree->lock); 53760b86a832SChris Mason if (!em) 53770b86a832SChris Mason break; 53780b86a832SChris Mason /* once for us */ 53790b86a832SChris Mason free_extent_map(em); 53800b86a832SChris Mason /* once for the tree */ 53810b86a832SChris Mason free_extent_map(em); 53820b86a832SChris Mason } 53830b86a832SChris Mason } 53840b86a832SChris Mason 53855d964051SStefan Behrens int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5386f188591eSChris Mason { 5387f188591eSChris Mason struct extent_map *em; 5388f188591eSChris Mason struct map_lookup *map; 5389f188591eSChris Mason int ret; 5390f188591eSChris Mason 539160ca842eSOmar Sandoval em = btrfs_get_chunk_map(fs_info, logical, len); 5392592d92eeSLiu Bo if (IS_ERR(em)) 5393fb7669b5SJosef Bacik /* 5394592d92eeSLiu Bo * We could return errors for these cases, but that could get 5395592d92eeSLiu Bo * ugly and we'd probably do the same thing which is just not do 5396592d92eeSLiu Bo * anything else and exit, so return 1 so the callers don't try 5397592d92eeSLiu Bo * to use other copies. 5398fb7669b5SJosef Bacik */ 5399fb7669b5SJosef Bacik return 1; 5400fb7669b5SJosef Bacik 540195617d69SJeff Mahoney map = em->map_lookup; 5402c7369b3fSDavid Sterba if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) 5403f188591eSChris Mason ret = map->num_stripes; 5404321aecc6SChris Mason else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5405321aecc6SChris Mason ret = map->sub_stripes; 540653b381b3SDavid Woodhouse else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 540753b381b3SDavid Woodhouse ret = 2; 540853b381b3SDavid Woodhouse else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 54098810f751SLiu Bo /* 54108810f751SLiu Bo * There could be two corrupted data stripes, we need 54118810f751SLiu Bo * to loop retry in order to rebuild the correct data. 54128810f751SLiu Bo * 54138810f751SLiu Bo * Fail a stripe at a time on every retry except the 54148810f751SLiu Bo * stripe under reconstruction. 
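	 *
	 * Returning num_stripes gives the retry loop one mirror number per
	 * stripe it may need to treat as failed.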
54158810f751SLiu Bo */ 54168810f751SLiu Bo ret = map->num_stripes; 5417f188591eSChris Mason else 5418f188591eSChris Mason ret = 1; 5419f188591eSChris Mason free_extent_map(em); 5420ad6d620eSStefan Behrens 5421cb5583ddSDavid Sterba down_read(&fs_info->dev_replace.rwsem); 54226fad823fSLiu Bo if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 54236fad823fSLiu Bo fs_info->dev_replace.tgtdev) 5424ad6d620eSStefan Behrens ret++; 5425cb5583ddSDavid Sterba up_read(&fs_info->dev_replace.rwsem); 5426ad6d620eSStefan Behrens 5427f188591eSChris Mason return ret; 5428f188591eSChris Mason } 5429f188591eSChris Mason 54302ff7e61eSJeff Mahoney unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 543153b381b3SDavid Woodhouse u64 logical) 543253b381b3SDavid Woodhouse { 543353b381b3SDavid Woodhouse struct extent_map *em; 543453b381b3SDavid Woodhouse struct map_lookup *map; 54350b246afaSJeff Mahoney unsigned long len = fs_info->sectorsize; 543653b381b3SDavid Woodhouse 543760ca842eSOmar Sandoval em = btrfs_get_chunk_map(fs_info, logical, len); 543853b381b3SDavid Woodhouse 543969f03f13SNikolay Borisov if (!WARN_ON(IS_ERR(em))) { 544095617d69SJeff Mahoney map = em->map_lookup; 5441ffe2d203SZhao Lei if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 544253b381b3SDavid Woodhouse len = map->stripe_len * nr_data_stripes(map); 544353b381b3SDavid Woodhouse free_extent_map(em); 544469f03f13SNikolay Borisov } 544553b381b3SDavid Woodhouse return len; 544653b381b3SDavid Woodhouse } 544753b381b3SDavid Woodhouse 5448e4ff5fb5SNikolay Borisov int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 544953b381b3SDavid Woodhouse { 545053b381b3SDavid Woodhouse struct extent_map *em; 545153b381b3SDavid Woodhouse struct map_lookup *map; 545253b381b3SDavid Woodhouse int ret = 0; 545353b381b3SDavid Woodhouse 545460ca842eSOmar Sandoval em = btrfs_get_chunk_map(fs_info, logical, len); 545553b381b3SDavid Woodhouse 545669f03f13SNikolay Borisov if(!WARN_ON(IS_ERR(em))) { 545795617d69SJeff Mahoney map = em->map_lookup; 5458ffe2d203SZhao Lei if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 545953b381b3SDavid Woodhouse ret = 1; 546053b381b3SDavid Woodhouse free_extent_map(em); 546169f03f13SNikolay Borisov } 546253b381b3SDavid Woodhouse return ret; 546353b381b3SDavid Woodhouse } 546453b381b3SDavid Woodhouse 546530d9861fSStefan Behrens static int find_live_mirror(struct btrfs_fs_info *fs_info, 546699f92a7cSAnand Jain struct map_lookup *map, int first, 54678ba0ae78SAnand Jain int dev_replace_is_ongoing) 5468dfe25020SChris Mason { 5469dfe25020SChris Mason int i; 547099f92a7cSAnand Jain int num_stripes; 54718ba0ae78SAnand Jain int preferred_mirror; 547230d9861fSStefan Behrens int tolerance; 547330d9861fSStefan Behrens struct btrfs_device *srcdev; 547430d9861fSStefan Behrens 547599f92a7cSAnand Jain ASSERT((map->type & 5476c7369b3fSDavid Sterba (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); 547799f92a7cSAnand Jain 547899f92a7cSAnand Jain if (map->type & BTRFS_BLOCK_GROUP_RAID10) 547999f92a7cSAnand Jain num_stripes = map->sub_stripes; 548099f92a7cSAnand Jain else 548199f92a7cSAnand Jain num_stripes = map->num_stripes; 548299f92a7cSAnand Jain 548333fd2f71SAnand Jain switch (fs_info->fs_devices->read_policy) { 548433fd2f71SAnand Jain default: 548533fd2f71SAnand Jain /* Shouldn't happen, just warn and use pid instead of failing */ 548633fd2f71SAnand Jain btrfs_warn_rl(fs_info, 548733fd2f71SAnand Jain "unknown read_policy type %u, reset to pid", 548833fd2f71SAnand Jain fs_info->fs_devices->read_policy); 
548933fd2f71SAnand Jain fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; 549033fd2f71SAnand Jain fallthrough; 549133fd2f71SAnand Jain case BTRFS_READ_POLICY_PID: 549233fd2f71SAnand Jain preferred_mirror = first + (current->pid % num_stripes); 549333fd2f71SAnand Jain break; 549433fd2f71SAnand Jain } 54958ba0ae78SAnand Jain 549630d9861fSStefan Behrens if (dev_replace_is_ongoing && 549730d9861fSStefan Behrens fs_info->dev_replace.cont_reading_from_srcdev_mode == 549830d9861fSStefan Behrens BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 549930d9861fSStefan Behrens srcdev = fs_info->dev_replace.srcdev; 550030d9861fSStefan Behrens else 550130d9861fSStefan Behrens srcdev = NULL; 550230d9861fSStefan Behrens 550330d9861fSStefan Behrens /* 550430d9861fSStefan Behrens * try to avoid the drive that is the source drive for a 550530d9861fSStefan Behrens * dev-replace procedure, only choose it if no other non-missing 550630d9861fSStefan Behrens * mirror is available 550730d9861fSStefan Behrens */ 550830d9861fSStefan Behrens for (tolerance = 0; tolerance < 2; tolerance++) { 55098ba0ae78SAnand Jain if (map->stripes[preferred_mirror].dev->bdev && 55108ba0ae78SAnand Jain (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 55118ba0ae78SAnand Jain return preferred_mirror; 551299f92a7cSAnand Jain for (i = first; i < first + num_stripes; i++) { 551330d9861fSStefan Behrens if (map->stripes[i].dev->bdev && 551430d9861fSStefan Behrens (tolerance || map->stripes[i].dev != srcdev)) 5515dfe25020SChris Mason return i; 5516dfe25020SChris Mason } 551730d9861fSStefan Behrens } 551830d9861fSStefan Behrens 5519dfe25020SChris Mason /* we couldn't find one that doesn't fail. Just return something 5520dfe25020SChris Mason * and the io error handling code will clean up eventually 5521dfe25020SChris Mason */ 55228ba0ae78SAnand Jain return preferred_mirror; 5523dfe25020SChris Mason } 5524dfe25020SChris Mason 552553b381b3SDavid Woodhouse /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 55268e5cfb55SZhao Lei static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) 552753b381b3SDavid Woodhouse { 552853b381b3SDavid Woodhouse int i; 552953b381b3SDavid Woodhouse int again = 1; 553053b381b3SDavid Woodhouse 553153b381b3SDavid Woodhouse while (again) { 553253b381b3SDavid Woodhouse again = 0; 5533cc7539edSZhao Lei for (i = 0; i < num_stripes - 1; i++) { 5534eeb6f172SDavid Sterba /* Swap if parity is on a smaller index */ 5535eeb6f172SDavid Sterba if (bbio->raid_map[i] > bbio->raid_map[i + 1]) { 5536eeb6f172SDavid Sterba swap(bbio->stripes[i], bbio->stripes[i + 1]); 5537eeb6f172SDavid Sterba swap(bbio->raid_map[i], bbio->raid_map[i + 1]); 553853b381b3SDavid Woodhouse again = 1; 553953b381b3SDavid Woodhouse } 554053b381b3SDavid Woodhouse } 554153b381b3SDavid Woodhouse } 554253b381b3SDavid Woodhouse } 554353b381b3SDavid Woodhouse 55446e9606d2SZhao Lei static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) 55456e9606d2SZhao Lei { 55466e9606d2SZhao Lei struct btrfs_bio *bbio = kzalloc( 5547e57cf21eSChris Mason /* the size of the btrfs_bio */ 55486e9606d2SZhao Lei sizeof(struct btrfs_bio) + 5549e57cf21eSChris Mason /* plus the variable array for the stripes */ 55506e9606d2SZhao Lei sizeof(struct btrfs_bio_stripe) * (total_stripes) + 5551e57cf21eSChris Mason /* plus the variable array for the tgt dev */ 55526e9606d2SZhao Lei sizeof(int) * (real_stripes) + 5553e57cf21eSChris Mason /* 5554e57cf21eSChris Mason * plus the raid_map, which includes both the 
tgt dev 5555e57cf21eSChris Mason * and the stripes 5556e57cf21eSChris Mason */ 5557e57cf21eSChris Mason sizeof(u64) * (total_stripes), 5558277fb5fcSMichal Hocko GFP_NOFS|__GFP_NOFAIL); 55596e9606d2SZhao Lei 55606e9606d2SZhao Lei atomic_set(&bbio->error, 0); 5561140475aeSElena Reshetova refcount_set(&bbio->refs, 1); 55626e9606d2SZhao Lei 5563608769a4SNikolay Borisov bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes); 5564608769a4SNikolay Borisov bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes); 5565608769a4SNikolay Borisov 55666e9606d2SZhao Lei return bbio; 55676e9606d2SZhao Lei } 55686e9606d2SZhao Lei 55696e9606d2SZhao Lei void btrfs_get_bbio(struct btrfs_bio *bbio) 55706e9606d2SZhao Lei { 5571140475aeSElena Reshetova WARN_ON(!refcount_read(&bbio->refs)); 5572140475aeSElena Reshetova refcount_inc(&bbio->refs); 55736e9606d2SZhao Lei } 55746e9606d2SZhao Lei 55756e9606d2SZhao Lei void btrfs_put_bbio(struct btrfs_bio *bbio) 55766e9606d2SZhao Lei { 55776e9606d2SZhao Lei if (!bbio) 55786e9606d2SZhao Lei return; 5579140475aeSElena Reshetova if (refcount_dec_and_test(&bbio->refs)) 55806e9606d2SZhao Lei kfree(bbio); 55816e9606d2SZhao Lei } 55826e9606d2SZhao Lei 55830b3d4cd3SLiu Bo /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ 55840b3d4cd3SLiu Bo /* 55850b3d4cd3SLiu Bo * Please note that, discard won't be sent to target device of device 55860b3d4cd3SLiu Bo * replace. 55870b3d4cd3SLiu Bo */ 55880b3d4cd3SLiu Bo static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, 55896b7faaddSQu Wenruo u64 logical, u64 *length_ret, 55900b3d4cd3SLiu Bo struct btrfs_bio **bbio_ret) 55910b3d4cd3SLiu Bo { 55920b3d4cd3SLiu Bo struct extent_map *em; 55930b3d4cd3SLiu Bo struct map_lookup *map; 55940b3d4cd3SLiu Bo struct btrfs_bio *bbio; 55956b7faaddSQu Wenruo u64 length = *length_ret; 55960b3d4cd3SLiu Bo u64 offset; 55970b3d4cd3SLiu Bo u64 stripe_nr; 55980b3d4cd3SLiu Bo u64 stripe_nr_end; 55990b3d4cd3SLiu Bo u64 stripe_end_offset; 56000b3d4cd3SLiu Bo u64 stripe_cnt; 56010b3d4cd3SLiu Bo u64 stripe_len; 56020b3d4cd3SLiu Bo u64 stripe_offset; 56030b3d4cd3SLiu Bo u64 num_stripes; 56040b3d4cd3SLiu Bo u32 stripe_index; 56050b3d4cd3SLiu Bo u32 factor = 0; 56060b3d4cd3SLiu Bo u32 sub_stripes = 0; 56070b3d4cd3SLiu Bo u64 stripes_per_dev = 0; 56080b3d4cd3SLiu Bo u32 remaining_stripes = 0; 56090b3d4cd3SLiu Bo u32 last_stripe = 0; 56100b3d4cd3SLiu Bo int ret = 0; 56110b3d4cd3SLiu Bo int i; 56120b3d4cd3SLiu Bo 56130b3d4cd3SLiu Bo /* discard always return a bbio */ 56140b3d4cd3SLiu Bo ASSERT(bbio_ret); 56150b3d4cd3SLiu Bo 561660ca842eSOmar Sandoval em = btrfs_get_chunk_map(fs_info, logical, length); 56170b3d4cd3SLiu Bo if (IS_ERR(em)) 56180b3d4cd3SLiu Bo return PTR_ERR(em); 56190b3d4cd3SLiu Bo 56200b3d4cd3SLiu Bo map = em->map_lookup; 56210b3d4cd3SLiu Bo /* we don't discard raid56 yet */ 56220b3d4cd3SLiu Bo if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 56230b3d4cd3SLiu Bo ret = -EOPNOTSUPP; 56240b3d4cd3SLiu Bo goto out; 56250b3d4cd3SLiu Bo } 56260b3d4cd3SLiu Bo 56270b3d4cd3SLiu Bo offset = logical - em->start; 56282d974619SQu Wenruo length = min_t(u64, em->start + em->len - logical, length); 56296b7faaddSQu Wenruo *length_ret = length; 56300b3d4cd3SLiu Bo 56310b3d4cd3SLiu Bo stripe_len = map->stripe_len; 56320b3d4cd3SLiu Bo /* 56330b3d4cd3SLiu Bo * stripe_nr counts the total number of stripes we have to stride 56340b3d4cd3SLiu Bo * to get to this block 56350b3d4cd3SLiu Bo */ 56360b3d4cd3SLiu Bo stripe_nr = div64_u64(offset, stripe_len); 56370b3d4cd3SLiu Bo 56380b3d4cd3SLiu Bo /* 
stripe_offset is the offset of this block in its stripe */ 56390b3d4cd3SLiu Bo stripe_offset = offset - stripe_nr * stripe_len; 56400b3d4cd3SLiu Bo 56410b3d4cd3SLiu Bo stripe_nr_end = round_up(offset + length, map->stripe_len); 564242c61ab6SLiu Bo stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 56430b3d4cd3SLiu Bo stripe_cnt = stripe_nr_end - stripe_nr; 56440b3d4cd3SLiu Bo stripe_end_offset = stripe_nr_end * map->stripe_len - 56450b3d4cd3SLiu Bo (offset + length); 56460b3d4cd3SLiu Bo /* 56470b3d4cd3SLiu Bo * after this, stripe_nr is the number of stripes on this 56480b3d4cd3SLiu Bo * device we have to walk to find the data, and stripe_index is 56490b3d4cd3SLiu Bo * the number of our device in the stripe array 56500b3d4cd3SLiu Bo */ 56510b3d4cd3SLiu Bo num_stripes = 1; 56520b3d4cd3SLiu Bo stripe_index = 0; 56530b3d4cd3SLiu Bo if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 56540b3d4cd3SLiu Bo BTRFS_BLOCK_GROUP_RAID10)) { 56550b3d4cd3SLiu Bo if (map->type & BTRFS_BLOCK_GROUP_RAID0) 56560b3d4cd3SLiu Bo sub_stripes = 1; 56570b3d4cd3SLiu Bo else 56580b3d4cd3SLiu Bo sub_stripes = map->sub_stripes; 56590b3d4cd3SLiu Bo 56600b3d4cd3SLiu Bo factor = map->num_stripes / sub_stripes; 56610b3d4cd3SLiu Bo num_stripes = min_t(u64, map->num_stripes, 56620b3d4cd3SLiu Bo sub_stripes * stripe_cnt); 56630b3d4cd3SLiu Bo stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 56640b3d4cd3SLiu Bo stripe_index *= sub_stripes; 56650b3d4cd3SLiu Bo stripes_per_dev = div_u64_rem(stripe_cnt, factor, 56660b3d4cd3SLiu Bo &remaining_stripes); 56670b3d4cd3SLiu Bo div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 56680b3d4cd3SLiu Bo last_stripe *= sub_stripes; 5669c7369b3fSDavid Sterba } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | 56700b3d4cd3SLiu Bo BTRFS_BLOCK_GROUP_DUP)) { 56710b3d4cd3SLiu Bo num_stripes = map->num_stripes; 56720b3d4cd3SLiu Bo } else { 56730b3d4cd3SLiu Bo stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 56740b3d4cd3SLiu Bo &stripe_index); 56750b3d4cd3SLiu Bo } 56760b3d4cd3SLiu Bo 56770b3d4cd3SLiu Bo bbio = alloc_btrfs_bio(num_stripes, 0); 56780b3d4cd3SLiu Bo if (!bbio) { 56790b3d4cd3SLiu Bo ret = -ENOMEM; 56800b3d4cd3SLiu Bo goto out; 56810b3d4cd3SLiu Bo } 56820b3d4cd3SLiu Bo 56830b3d4cd3SLiu Bo for (i = 0; i < num_stripes; i++) { 56840b3d4cd3SLiu Bo bbio->stripes[i].physical = 56850b3d4cd3SLiu Bo map->stripes[stripe_index].physical + 56860b3d4cd3SLiu Bo stripe_offset + stripe_nr * map->stripe_len; 56870b3d4cd3SLiu Bo bbio->stripes[i].dev = map->stripes[stripe_index].dev; 56880b3d4cd3SLiu Bo 56890b3d4cd3SLiu Bo if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 56900b3d4cd3SLiu Bo BTRFS_BLOCK_GROUP_RAID10)) { 56910b3d4cd3SLiu Bo bbio->stripes[i].length = stripes_per_dev * 56920b3d4cd3SLiu Bo map->stripe_len; 56930b3d4cd3SLiu Bo 56940b3d4cd3SLiu Bo if (i / sub_stripes < remaining_stripes) 56950b3d4cd3SLiu Bo bbio->stripes[i].length += 56960b3d4cd3SLiu Bo map->stripe_len; 56970b3d4cd3SLiu Bo 56980b3d4cd3SLiu Bo /* 56990b3d4cd3SLiu Bo * Special for the first stripe and 57000b3d4cd3SLiu Bo * the last stripe: 57010b3d4cd3SLiu Bo * 57020b3d4cd3SLiu Bo * |-------|...|-------| 57030b3d4cd3SLiu Bo * |----------| 57040b3d4cd3SLiu Bo * off end_off 57050b3d4cd3SLiu Bo */ 57060b3d4cd3SLiu Bo if (i < sub_stripes) 57070b3d4cd3SLiu Bo bbio->stripes[i].length -= 57080b3d4cd3SLiu Bo stripe_offset; 57090b3d4cd3SLiu Bo 57100b3d4cd3SLiu Bo if (stripe_index >= last_stripe && 57110b3d4cd3SLiu Bo stripe_index <= (last_stripe + 57120b3d4cd3SLiu Bo sub_stripes - 1)) 57130b3d4cd3SLiu Bo bbio->stripes[i].length -= 
57140b3d4cd3SLiu Bo stripe_end_offset; 57150b3d4cd3SLiu Bo 57160b3d4cd3SLiu Bo if (i == sub_stripes - 1) 57170b3d4cd3SLiu Bo stripe_offset = 0; 57180b3d4cd3SLiu Bo } else { 57190b3d4cd3SLiu Bo bbio->stripes[i].length = length; 57200b3d4cd3SLiu Bo } 57210b3d4cd3SLiu Bo 57220b3d4cd3SLiu Bo stripe_index++; 57230b3d4cd3SLiu Bo if (stripe_index == map->num_stripes) { 57240b3d4cd3SLiu Bo stripe_index = 0; 57250b3d4cd3SLiu Bo stripe_nr++; 57260b3d4cd3SLiu Bo } 57270b3d4cd3SLiu Bo } 57280b3d4cd3SLiu Bo 57290b3d4cd3SLiu Bo *bbio_ret = bbio; 57300b3d4cd3SLiu Bo bbio->map_type = map->type; 57310b3d4cd3SLiu Bo bbio->num_stripes = num_stripes; 57320b3d4cd3SLiu Bo out: 57330b3d4cd3SLiu Bo free_extent_map(em); 57340b3d4cd3SLiu Bo return ret; 57350b3d4cd3SLiu Bo } 57360b3d4cd3SLiu Bo 57375ab56090SLiu Bo /* 57385ab56090SLiu Bo * In dev-replace case, for repair case (that's the only case where the mirror 57395ab56090SLiu Bo * is selected explicitly when calling btrfs_map_block), blocks left of the 57405ab56090SLiu Bo * left cursor can also be read from the target drive. 57415ab56090SLiu Bo * 57425ab56090SLiu Bo * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the 57435ab56090SLiu Bo * array of stripes. 57445ab56090SLiu Bo * For READ, it also needs to be supported using the same mirror number. 57455ab56090SLiu Bo * 57465ab56090SLiu Bo * If the requested block is not left of the left cursor, EIO is returned. This 57475ab56090SLiu Bo * can happen because btrfs_num_copies() returns one more in the dev-replace 57485ab56090SLiu Bo * case. 57495ab56090SLiu Bo */ 57505ab56090SLiu Bo static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, 57515ab56090SLiu Bo u64 logical, u64 length, 57525ab56090SLiu Bo u64 srcdev_devid, int *mirror_num, 57535ab56090SLiu Bo u64 *physical) 57545ab56090SLiu Bo { 57555ab56090SLiu Bo struct btrfs_bio *bbio = NULL; 57565ab56090SLiu Bo int num_stripes; 57575ab56090SLiu Bo int index_srcdev = 0; 57585ab56090SLiu Bo int found = 0; 57595ab56090SLiu Bo u64 physical_of_found = 0; 57605ab56090SLiu Bo int i; 57615ab56090SLiu Bo int ret = 0; 57625ab56090SLiu Bo 57635ab56090SLiu Bo ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 57645ab56090SLiu Bo logical, &length, &bbio, 0, 0); 57655ab56090SLiu Bo if (ret) { 57665ab56090SLiu Bo ASSERT(bbio == NULL); 57675ab56090SLiu Bo return ret; 57685ab56090SLiu Bo } 57695ab56090SLiu Bo 57705ab56090SLiu Bo num_stripes = bbio->num_stripes; 57715ab56090SLiu Bo if (*mirror_num > num_stripes) { 57725ab56090SLiu Bo /* 57735ab56090SLiu Bo * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, 57745ab56090SLiu Bo * that means that the requested area is not left of the left 57755ab56090SLiu Bo * cursor 57765ab56090SLiu Bo */ 57775ab56090SLiu Bo btrfs_put_bbio(bbio); 57785ab56090SLiu Bo return -EIO; 57795ab56090SLiu Bo } 57805ab56090SLiu Bo 57815ab56090SLiu Bo /* 57825ab56090SLiu Bo * process the rest of the function using the mirror_num of the source 57835ab56090SLiu Bo * drive. Therefore look it up first. At the end, patch the device 57845ab56090SLiu Bo * pointer to the one of the target drive. 
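	 *
	 * As in handle_ops_on_dev_replace(), only the source-device stripe
	 * with the lowest physical offset is considered, which keeps the DUP
	 * case simple.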
57855ab56090SLiu Bo */ 57865ab56090SLiu Bo for (i = 0; i < num_stripes; i++) { 57875ab56090SLiu Bo if (bbio->stripes[i].dev->devid != srcdev_devid) 57885ab56090SLiu Bo continue; 57895ab56090SLiu Bo 57905ab56090SLiu Bo /* 57915ab56090SLiu Bo * In case of DUP, in order to keep it simple, only add the 57925ab56090SLiu Bo * mirror with the lowest physical address 57935ab56090SLiu Bo */ 57945ab56090SLiu Bo if (found && 57955ab56090SLiu Bo physical_of_found <= bbio->stripes[i].physical) 57965ab56090SLiu Bo continue; 57975ab56090SLiu Bo 57985ab56090SLiu Bo index_srcdev = i; 57995ab56090SLiu Bo found = 1; 58005ab56090SLiu Bo physical_of_found = bbio->stripes[i].physical; 58015ab56090SLiu Bo } 58025ab56090SLiu Bo 58035ab56090SLiu Bo btrfs_put_bbio(bbio); 58045ab56090SLiu Bo 58055ab56090SLiu Bo ASSERT(found); 58065ab56090SLiu Bo if (!found) 58075ab56090SLiu Bo return -EIO; 58085ab56090SLiu Bo 58095ab56090SLiu Bo *mirror_num = index_srcdev + 1; 58105ab56090SLiu Bo *physical = physical_of_found; 58115ab56090SLiu Bo return ret; 58125ab56090SLiu Bo } 58135ab56090SLiu Bo 581473c0f228SLiu Bo static void handle_ops_on_dev_replace(enum btrfs_map_op op, 581573c0f228SLiu Bo struct btrfs_bio **bbio_ret, 581673c0f228SLiu Bo struct btrfs_dev_replace *dev_replace, 581773c0f228SLiu Bo int *num_stripes_ret, int *max_errors_ret) 581873c0f228SLiu Bo { 581973c0f228SLiu Bo struct btrfs_bio *bbio = *bbio_ret; 582073c0f228SLiu Bo u64 srcdev_devid = dev_replace->srcdev->devid; 582173c0f228SLiu Bo int tgtdev_indexes = 0; 582273c0f228SLiu Bo int num_stripes = *num_stripes_ret; 582373c0f228SLiu Bo int max_errors = *max_errors_ret; 582473c0f228SLiu Bo int i; 582573c0f228SLiu Bo 582673c0f228SLiu Bo if (op == BTRFS_MAP_WRITE) { 582773c0f228SLiu Bo int index_where_to_add; 582873c0f228SLiu Bo 582973c0f228SLiu Bo /* 583073c0f228SLiu Bo * duplicate the write operations while the dev replace 583173c0f228SLiu Bo * procedure is running. Since the copying of the old disk to 583273c0f228SLiu Bo * the new disk takes place at run time while the filesystem is 583373c0f228SLiu Bo * mounted writable, the regular write operations to the old 583473c0f228SLiu Bo * disk have to be duplicated to go to the new disk as well. 583573c0f228SLiu Bo * 583673c0f228SLiu Bo * Note that device->missing is handled by the caller, and that 583773c0f228SLiu Bo * the write to the old disk is already set up in the stripes 583873c0f228SLiu Bo * array. 
583973c0f228SLiu Bo */ 584073c0f228SLiu Bo index_where_to_add = num_stripes; 584173c0f228SLiu Bo for (i = 0; i < num_stripes; i++) { 584273c0f228SLiu Bo if (bbio->stripes[i].dev->devid == srcdev_devid) { 584373c0f228SLiu Bo /* write to new disk, too */ 584473c0f228SLiu Bo struct btrfs_bio_stripe *new = 584573c0f228SLiu Bo bbio->stripes + index_where_to_add; 584673c0f228SLiu Bo struct btrfs_bio_stripe *old = 584773c0f228SLiu Bo bbio->stripes + i; 584873c0f228SLiu Bo 584973c0f228SLiu Bo new->physical = old->physical; 585073c0f228SLiu Bo new->length = old->length; 585173c0f228SLiu Bo new->dev = dev_replace->tgtdev; 585273c0f228SLiu Bo bbio->tgtdev_map[i] = index_where_to_add; 585373c0f228SLiu Bo index_where_to_add++; 585473c0f228SLiu Bo max_errors++; 585573c0f228SLiu Bo tgtdev_indexes++; 585673c0f228SLiu Bo } 585773c0f228SLiu Bo } 585873c0f228SLiu Bo num_stripes = index_where_to_add; 585973c0f228SLiu Bo } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 586073c0f228SLiu Bo int index_srcdev = 0; 586173c0f228SLiu Bo int found = 0; 586273c0f228SLiu Bo u64 physical_of_found = 0; 586373c0f228SLiu Bo 586473c0f228SLiu Bo /* 586573c0f228SLiu Bo * During the dev-replace procedure, the target drive can also 586673c0f228SLiu Bo * be used to read data in case it is needed to repair a corrupt 586773c0f228SLiu Bo * block elsewhere. This is possible if the requested area is 586873c0f228SLiu Bo * left of the left cursor. In this area, the target drive is a 586973c0f228SLiu Bo * full copy of the source drive. 587073c0f228SLiu Bo */ 587173c0f228SLiu Bo for (i = 0; i < num_stripes; i++) { 587273c0f228SLiu Bo if (bbio->stripes[i].dev->devid == srcdev_devid) { 587373c0f228SLiu Bo /* 587473c0f228SLiu Bo * In case of DUP, in order to keep it simple, 587573c0f228SLiu Bo * only add the mirror with the lowest physical 587673c0f228SLiu Bo * address 587773c0f228SLiu Bo */ 587873c0f228SLiu Bo if (found && 587973c0f228SLiu Bo physical_of_found <= 588073c0f228SLiu Bo bbio->stripes[i].physical) 588173c0f228SLiu Bo continue; 588273c0f228SLiu Bo index_srcdev = i; 588373c0f228SLiu Bo found = 1; 588473c0f228SLiu Bo physical_of_found = bbio->stripes[i].physical; 588573c0f228SLiu Bo } 588673c0f228SLiu Bo } 588773c0f228SLiu Bo if (found) { 588873c0f228SLiu Bo struct btrfs_bio_stripe *tgtdev_stripe = 588973c0f228SLiu Bo bbio->stripes + num_stripes; 589073c0f228SLiu Bo 589173c0f228SLiu Bo tgtdev_stripe->physical = physical_of_found; 589273c0f228SLiu Bo tgtdev_stripe->length = 589373c0f228SLiu Bo bbio->stripes[index_srcdev].length; 589473c0f228SLiu Bo tgtdev_stripe->dev = dev_replace->tgtdev; 589573c0f228SLiu Bo bbio->tgtdev_map[index_srcdev] = num_stripes; 589673c0f228SLiu Bo 589773c0f228SLiu Bo tgtdev_indexes++; 589873c0f228SLiu Bo num_stripes++; 589973c0f228SLiu Bo } 590073c0f228SLiu Bo } 590173c0f228SLiu Bo 590273c0f228SLiu Bo *num_stripes_ret = num_stripes; 590373c0f228SLiu Bo *max_errors_ret = max_errors; 590473c0f228SLiu Bo bbio->num_tgtdevs = tgtdev_indexes; 590573c0f228SLiu Bo *bbio_ret = bbio; 590673c0f228SLiu Bo } 590773c0f228SLiu Bo 59082b19a1feSLiu Bo static bool need_full_stripe(enum btrfs_map_op op) 59092b19a1feSLiu Bo { 59102b19a1feSLiu Bo return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 59112b19a1feSLiu Bo } 59122b19a1feSLiu Bo 59135f141126SNikolay Borisov /* 59145f141126SNikolay Borisov * btrfs_get_io_geometry - calculates the geomery of a particular (address, len) 59155f141126SNikolay Borisov * tuple. 
This information is used to calculate how big a 59165f141126SNikolay Borisov * particular bio can get before it straddles a stripe. 59175f141126SNikolay Borisov * 59185f141126SNikolay Borisov * @fs_info - the filesystem 59195f141126SNikolay Borisov * @logical - address that we want to figure out the geometry of 59205f141126SNikolay Borisov * @len - the length of IO we are going to perform, starting at @logical 59215f141126SNikolay Borisov * @op - type of operation - write or read 59225f141126SNikolay Borisov * @io_geom - pointer used to return values 59235f141126SNikolay Borisov * 59245f141126SNikolay Borisov * Returns < 0 in case a chunk for the given logical address cannot be found, 59255f141126SNikolay Borisov * usually shouldn't happen unless @logical is corrupted, 0 otherwise. 59265f141126SNikolay Borisov */ 59275f141126SNikolay Borisov int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 59285f141126SNikolay Borisov u64 logical, u64 len, struct btrfs_io_geometry *io_geom) 59295f141126SNikolay Borisov { 59305f141126SNikolay Borisov struct extent_map *em; 59315f141126SNikolay Borisov struct map_lookup *map; 59325f141126SNikolay Borisov u64 offset; 59335f141126SNikolay Borisov u64 stripe_offset; 59345f141126SNikolay Borisov u64 stripe_nr; 59355f141126SNikolay Borisov u64 stripe_len; 59365f141126SNikolay Borisov u64 raid56_full_stripe_start = (u64)-1; 59375f141126SNikolay Borisov int data_stripes; 5938373c3b80SJohannes Thumshirn int ret = 0; 59395f141126SNikolay Borisov 59405f141126SNikolay Borisov ASSERT(op != BTRFS_MAP_DISCARD); 59415f141126SNikolay Borisov 59425f141126SNikolay Borisov em = btrfs_get_chunk_map(fs_info, logical, len); 59435f141126SNikolay Borisov if (IS_ERR(em)) 59445f141126SNikolay Borisov return PTR_ERR(em); 59455f141126SNikolay Borisov 59465f141126SNikolay Borisov map = em->map_lookup; 59475f141126SNikolay Borisov /* Offset of this logical address in the chunk */ 59485f141126SNikolay Borisov offset = logical - em->start; 59495f141126SNikolay Borisov /* Len of a stripe in a chunk */ 59505f141126SNikolay Borisov stripe_len = map->stripe_len; 59515f141126SNikolay Borisov /* Stripe wher this block falls in */ 59525f141126SNikolay Borisov stripe_nr = div64_u64(offset, stripe_len); 59535f141126SNikolay Borisov /* Offset of stripe in the chunk */ 59545f141126SNikolay Borisov stripe_offset = stripe_nr * stripe_len; 59555f141126SNikolay Borisov if (offset < stripe_offset) { 59565f141126SNikolay Borisov btrfs_crit(fs_info, 59575f141126SNikolay Borisov "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", 59585f141126SNikolay Borisov stripe_offset, offset, em->start, logical, stripe_len); 5959373c3b80SJohannes Thumshirn ret = -EINVAL; 5960373c3b80SJohannes Thumshirn goto out; 59615f141126SNikolay Borisov } 59625f141126SNikolay Borisov 59635f141126SNikolay Borisov /* stripe_offset is the offset of this block in its stripe */ 59645f141126SNikolay Borisov stripe_offset = offset - stripe_offset; 59655f141126SNikolay Borisov data_stripes = nr_data_stripes(map); 59665f141126SNikolay Borisov 59675f141126SNikolay Borisov if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 59685f141126SNikolay Borisov u64 max_len = stripe_len - stripe_offset; 59695f141126SNikolay Borisov 59705f141126SNikolay Borisov /* 59715f141126SNikolay Borisov * In case of raid56, we need to know the stripe aligned start 59725f141126SNikolay Borisov */ 59735f141126SNikolay Borisov if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 
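/*
 * Annotation (illustrative numbers, not from the source): the full stripe
 * start below is offset rounded down to a multiple of
 * stripe_len * data_stripes. Assuming a 64K stripe_len and 3 data stripes,
 * full_stripe_len is 192K, so an offset of 200K falls in the full stripe
 * starting at 192K, and a write there may extend to the end of that full
 * stripe at 384K, i.e. max_len = 192K - 8K = 184K.
 */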
59745f141126SNikolay Borisov unsigned long full_stripe_len = stripe_len * data_stripes; 59755f141126SNikolay Borisov raid56_full_stripe_start = offset; 59765f141126SNikolay Borisov 59775f141126SNikolay Borisov /* 59785f141126SNikolay Borisov * Allow a write of a full stripe, but make sure we 59795f141126SNikolay Borisov * don't allow straddling of stripes 59805f141126SNikolay Borisov */ 59815f141126SNikolay Borisov raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 59825f141126SNikolay Borisov full_stripe_len); 59835f141126SNikolay Borisov raid56_full_stripe_start *= full_stripe_len; 59845f141126SNikolay Borisov 59855f141126SNikolay Borisov /* 59865f141126SNikolay Borisov * For writes to RAID[56], allow a full stripeset across 59875f141126SNikolay Borisov * all disks. For other RAID types and for RAID[56] 59885f141126SNikolay Borisov * reads, just allow a single stripe (on a single disk). 59895f141126SNikolay Borisov */ 59905f141126SNikolay Borisov if (op == BTRFS_MAP_WRITE) { 59915f141126SNikolay Borisov max_len = stripe_len * data_stripes - 59925f141126SNikolay Borisov (offset - raid56_full_stripe_start); 59935f141126SNikolay Borisov } 59945f141126SNikolay Borisov } 59955f141126SNikolay Borisov len = min_t(u64, em->len - offset, max_len); 59965f141126SNikolay Borisov } else { 59975f141126SNikolay Borisov len = em->len - offset; 59985f141126SNikolay Borisov } 59995f141126SNikolay Borisov 60005f141126SNikolay Borisov io_geom->len = len; 60015f141126SNikolay Borisov io_geom->offset = offset; 60025f141126SNikolay Borisov io_geom->stripe_len = stripe_len; 60035f141126SNikolay Borisov io_geom->stripe_nr = stripe_nr; 60045f141126SNikolay Borisov io_geom->stripe_offset = stripe_offset; 60055f141126SNikolay Borisov io_geom->raid56_stripe_offset = raid56_full_stripe_start; 60065f141126SNikolay Borisov 6007373c3b80SJohannes Thumshirn out: 6008373c3b80SJohannes Thumshirn /* once for us */ 6009373c3b80SJohannes Thumshirn free_extent_map(em); 6010373c3b80SJohannes Thumshirn return ret; 60115f141126SNikolay Borisov } 60125f141126SNikolay Borisov 6013cf8cddd3SChristoph Hellwig static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 6014cf8cddd3SChristoph Hellwig enum btrfs_map_op op, 6015cea9e445SChris Mason u64 logical, u64 *length, 6016a1d3c478SJan Schmidt struct btrfs_bio **bbio_ret, 60178e5cfb55SZhao Lei int mirror_num, int need_raid_map) 60180b86a832SChris Mason { 60190b86a832SChris Mason struct extent_map *em; 60200b86a832SChris Mason struct map_lookup *map; 6021593060d7SChris Mason u64 stripe_offset; 6022593060d7SChris Mason u64 stripe_nr; 602353b381b3SDavid Woodhouse u64 stripe_len; 60249d644a62SDavid Sterba u32 stripe_index; 6025cff82672SDavid Sterba int data_stripes; 6026cea9e445SChris Mason int i; 6027de11cc12SLi Zefan int ret = 0; 6028f2d8d74dSChris Mason int num_stripes; 6029a236aed1SChris Mason int max_errors = 0; 60302c8cdd6eSMiao Xie int tgtdev_indexes = 0; 6031a1d3c478SJan Schmidt struct btrfs_bio *bbio = NULL; 6032472262f3SStefan Behrens struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 6033472262f3SStefan Behrens int dev_replace_is_ongoing = 0; 6034472262f3SStefan Behrens int num_alloc_stripes; 6035ad6d620eSStefan Behrens int patch_the_first_stripe_for_dev_replace = 0; 6036ad6d620eSStefan Behrens u64 physical_to_patch_in_first_stripe = 0; 603753b381b3SDavid Woodhouse u64 raid56_full_stripe_start = (u64)-1; 603889b798adSNikolay Borisov struct btrfs_io_geometry geom; 603989b798adSNikolay Borisov 604089b798adSNikolay Borisov ASSERT(bbio_ret); 
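/*
 * Annotation: discard requests never reach this function; btrfs_map_block()
 * routes BTRFS_MAP_DISCARD to __btrfs_map_block_for_discard() instead, and
 * every remaining caller passes a bbio to fill in, hence the two assertions
 * around this point.
 */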
604175fb2e9eSDavid Sterba ASSERT(op != BTRFS_MAP_DISCARD); 60420b3d4cd3SLiu Bo 604389b798adSNikolay Borisov ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); 604489b798adSNikolay Borisov if (ret < 0) 604589b798adSNikolay Borisov return ret; 604689b798adSNikolay Borisov 604760ca842eSOmar Sandoval em = btrfs_get_chunk_map(fs_info, logical, *length); 6048f1136989SDan Carpenter ASSERT(!IS_ERR(em)); 604995617d69SJeff Mahoney map = em->map_lookup; 6050593060d7SChris Mason 605189b798adSNikolay Borisov *length = geom.len; 605289b798adSNikolay Borisov stripe_len = geom.stripe_len; 605389b798adSNikolay Borisov stripe_nr = geom.stripe_nr; 605489b798adSNikolay Borisov stripe_offset = geom.stripe_offset; 605589b798adSNikolay Borisov raid56_full_stripe_start = geom.raid56_stripe_offset; 6056cff82672SDavid Sterba data_stripes = nr_data_stripes(map); 6057593060d7SChris Mason 6058cb5583ddSDavid Sterba down_read(&dev_replace->rwsem); 6059472262f3SStefan Behrens dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 606053176ddeSDavid Sterba /* 606153176ddeSDavid Sterba * Hold the semaphore for read during the whole operation, write is 606253176ddeSDavid Sterba * requested at commit time but must wait. 606353176ddeSDavid Sterba */ 6064472262f3SStefan Behrens if (!dev_replace_is_ongoing) 6065cb5583ddSDavid Sterba up_read(&dev_replace->rwsem); 6066472262f3SStefan Behrens 6067ad6d620eSStefan Behrens if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 60682b19a1feSLiu Bo !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 60695ab56090SLiu Bo ret = get_extra_mirror_from_replace(fs_info, logical, *length, 60705ab56090SLiu Bo dev_replace->srcdev->devid, 60715ab56090SLiu Bo &mirror_num, 60725ab56090SLiu Bo &physical_to_patch_in_first_stripe); 60735ab56090SLiu Bo if (ret) 6074ad6d620eSStefan Behrens goto out; 60755ab56090SLiu Bo else 607694a97dfeSZhao Lei patch_the_first_stripe_for_dev_replace = 1; 6077ad6d620eSStefan Behrens } else if (mirror_num > map->num_stripes) { 6078ad6d620eSStefan Behrens mirror_num = 0; 6079ad6d620eSStefan Behrens } 6080ad6d620eSStefan Behrens 6081f2d8d74dSChris Mason num_stripes = 1; 6082cea9e445SChris Mason stripe_index = 0; 6083fce3bb9aSLi Dongyang if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 608447c5713fSDavid Sterba stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 608547c5713fSDavid Sterba &stripe_index); 6086de483734SAnand Jain if (!need_full_stripe(op)) 608728e1cc7dSMiao Xie mirror_num = 1; 6088c7369b3fSDavid Sterba } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { 6089de483734SAnand Jain if (need_full_stripe(op)) 6090f2d8d74dSChris Mason num_stripes = map->num_stripes; 60912fff734fSChris Mason else if (mirror_num) 6092f188591eSChris Mason stripe_index = mirror_num - 1; 6093dfe25020SChris Mason else { 609430d9861fSStefan Behrens stripe_index = find_live_mirror(fs_info, map, 0, 609530d9861fSStefan Behrens dev_replace_is_ongoing); 6096a1d3c478SJan Schmidt mirror_num = stripe_index + 1; 6097dfe25020SChris Mason } 60982fff734fSChris Mason 6099611f0e00SChris Mason } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 6100de483734SAnand Jain if (need_full_stripe(op)) { 6101f2d8d74dSChris Mason num_stripes = map->num_stripes; 6102a1d3c478SJan Schmidt } else if (mirror_num) { 6103f188591eSChris Mason stripe_index = mirror_num - 1; 6104a1d3c478SJan Schmidt } else { 6105a1d3c478SJan Schmidt mirror_num = 1; 6106a1d3c478SJan Schmidt } 61072fff734fSChris Mason 6108321aecc6SChris Mason } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 
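/*
 * Annotation (illustrative values): RAID10 splits the devices into
 * num_stripes / sub_stripes mirror groups and rotates logical stripes across
 * those groups. With 4 devices and sub_stripes == 2, factor is 2, so even
 * logical stripes map to the pair starting at device 0 and odd ones to the
 * pair starting at device 2.
 */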
61099d644a62SDavid Sterba u32 factor = map->num_stripes / map->sub_stripes; 6110321aecc6SChris Mason 611147c5713fSDavid Sterba stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6112321aecc6SChris Mason stripe_index *= map->sub_stripes; 6113321aecc6SChris Mason 6114de483734SAnand Jain if (need_full_stripe(op)) 6115f2d8d74dSChris Mason num_stripes = map->sub_stripes; 6116321aecc6SChris Mason else if (mirror_num) 6117321aecc6SChris Mason stripe_index += mirror_num - 1; 6118dfe25020SChris Mason else { 61193e74317aSJan Schmidt int old_stripe_index = stripe_index; 612030d9861fSStefan Behrens stripe_index = find_live_mirror(fs_info, map, 612130d9861fSStefan Behrens stripe_index, 612230d9861fSStefan Behrens dev_replace_is_ongoing); 61233e74317aSJan Schmidt mirror_num = stripe_index - old_stripe_index + 1; 6124dfe25020SChris Mason } 612553b381b3SDavid Woodhouse 6126ffe2d203SZhao Lei } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6127de483734SAnand Jain if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 612853b381b3SDavid Woodhouse /* push stripe_nr back to the start of the full stripe */ 612942c61ab6SLiu Bo stripe_nr = div64_u64(raid56_full_stripe_start, 6130cff82672SDavid Sterba stripe_len * data_stripes); 613153b381b3SDavid Woodhouse 613253b381b3SDavid Woodhouse /* RAID[56] write or recovery. Return all stripes */ 613353b381b3SDavid Woodhouse num_stripes = map->num_stripes; 613453b381b3SDavid Woodhouse max_errors = nr_parity_stripes(map); 613553b381b3SDavid Woodhouse 613653b381b3SDavid Woodhouse *length = map->stripe_len; 613753b381b3SDavid Woodhouse stripe_index = 0; 613853b381b3SDavid Woodhouse stripe_offset = 0; 613953b381b3SDavid Woodhouse } else { 614053b381b3SDavid Woodhouse /* 614153b381b3SDavid Woodhouse * Mirror #0 or #1 means the original data block. 614253b381b3SDavid Woodhouse * Mirror #2 is RAID5 parity block. 614353b381b3SDavid Woodhouse * Mirror #3 is RAID6 Q block. 
614453b381b3SDavid Woodhouse */ 614547c5713fSDavid Sterba stripe_nr = div_u64_rem(stripe_nr, 6146cff82672SDavid Sterba data_stripes, &stripe_index); 614753b381b3SDavid Woodhouse if (mirror_num > 1) 6148cff82672SDavid Sterba stripe_index = data_stripes + mirror_num - 2; 614953b381b3SDavid Woodhouse 615053b381b3SDavid Woodhouse /* We distribute the parity blocks across stripes */ 615147c5713fSDavid Sterba div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 615247c5713fSDavid Sterba &stripe_index); 6153de483734SAnand Jain if (!need_full_stripe(op) && mirror_num <= 1) 615428e1cc7dSMiao Xie mirror_num = 1; 615553b381b3SDavid Woodhouse } 61568790d502SChris Mason } else { 6157593060d7SChris Mason /* 615847c5713fSDavid Sterba * after this, stripe_nr is the number of stripes on this 615947c5713fSDavid Sterba * device we have to walk to find the data, and stripe_index is 616047c5713fSDavid Sterba * the number of our device in the stripe array 6161593060d7SChris Mason */ 616247c5713fSDavid Sterba stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 616347c5713fSDavid Sterba &stripe_index); 6164a1d3c478SJan Schmidt mirror_num = stripe_index + 1; 61658790d502SChris Mason } 6166e042d1ecSJosef Bacik if (stripe_index >= map->num_stripes) { 61675d163e0eSJeff Mahoney btrfs_crit(fs_info, 61685d163e0eSJeff Mahoney "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 6169e042d1ecSJosef Bacik stripe_index, map->num_stripes); 6170e042d1ecSJosef Bacik ret = -EINVAL; 6171e042d1ecSJosef Bacik goto out; 6172e042d1ecSJosef Bacik } 6173593060d7SChris Mason 6174472262f3SStefan Behrens num_alloc_stripes = num_stripes; 61756fad823fSLiu Bo if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 61760b3d4cd3SLiu Bo if (op == BTRFS_MAP_WRITE) 6177472262f3SStefan Behrens num_alloc_stripes <<= 1; 6178cf8cddd3SChristoph Hellwig if (op == BTRFS_MAP_GET_READ_MIRRORS) 6179ad6d620eSStefan Behrens num_alloc_stripes++; 61802c8cdd6eSMiao Xie tgtdev_indexes = num_stripes; 6181ad6d620eSStefan Behrens } 61822c8cdd6eSMiao Xie 61836e9606d2SZhao Lei bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); 6184de11cc12SLi Zefan if (!bbio) { 6185de11cc12SLi Zefan ret = -ENOMEM; 6186de11cc12SLi Zefan goto out; 6187de11cc12SLi Zefan } 6188608769a4SNikolay Borisov 6189608769a4SNikolay Borisov for (i = 0; i < num_stripes; i++) { 6190608769a4SNikolay Borisov bbio->stripes[i].physical = map->stripes[stripe_index].physical + 6191608769a4SNikolay Borisov stripe_offset + stripe_nr * map->stripe_len; 6192608769a4SNikolay Borisov bbio->stripes[i].dev = map->stripes[stripe_index].dev; 6193608769a4SNikolay Borisov stripe_index++; 6194608769a4SNikolay Borisov } 6195de11cc12SLi Zefan 61968e5cfb55SZhao Lei /* build raid_map */ 61972b19a1feSLiu Bo if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 61982b19a1feSLiu Bo (need_full_stripe(op) || mirror_num > 1)) { 61998e5cfb55SZhao Lei u64 tmp; 62009d644a62SDavid Sterba unsigned rot; 62018e5cfb55SZhao Lei 62028e5cfb55SZhao Lei /* Work out the disk rotation on this stripe-set */ 620347c5713fSDavid Sterba div_u64_rem(stripe_nr, num_stripes, &rot); 62048e5cfb55SZhao Lei 62058e5cfb55SZhao Lei /* Fill in the logical address of each stripe */ 6206cff82672SDavid Sterba tmp = stripe_nr * data_stripes; 6207cff82672SDavid Sterba for (i = 0; i < data_stripes; i++) 62088e5cfb55SZhao Lei bbio->raid_map[(i+rot) % num_stripes] = 62098e5cfb55SZhao Lei em->start + (tmp + i) * map->stripe_len; 62108e5cfb55SZhao Lei 62118e5cfb55SZhao Lei bbio->raid_map[(i+rot) % map->num_stripes] 
= RAID5_P_STRIPE; 62128e5cfb55SZhao Lei if (map->type & BTRFS_BLOCK_GROUP_RAID6) 62138e5cfb55SZhao Lei bbio->raid_map[(i+rot+1) % num_stripes] = 62148e5cfb55SZhao Lei RAID6_Q_STRIPE; 62158e5cfb55SZhao Lei 6216608769a4SNikolay Borisov sort_parity_stripes(bbio, num_stripes); 6217593060d7SChris Mason } 6218de11cc12SLi Zefan 62192b19a1feSLiu Bo if (need_full_stripe(op)) 6220d20983b4SMiao Xie max_errors = btrfs_chunk_max_errors(map); 6221de11cc12SLi Zefan 622273c0f228SLiu Bo if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 62232b19a1feSLiu Bo need_full_stripe(op)) { 622473c0f228SLiu Bo handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, 622573c0f228SLiu Bo &max_errors); 6226ad6d620eSStefan Behrens } 6227472262f3SStefan Behrens 6228a1d3c478SJan Schmidt *bbio_ret = bbio; 622910f11900SZhao Lei bbio->map_type = map->type; 6230a1d3c478SJan Schmidt bbio->num_stripes = num_stripes; 6231a1d3c478SJan Schmidt bbio->max_errors = max_errors; 6232a1d3c478SJan Schmidt bbio->mirror_num = mirror_num; 6233ad6d620eSStefan Behrens 6234ad6d620eSStefan Behrens /* 6235ad6d620eSStefan Behrens * this is the case that REQ_READ && dev_replace_is_ongoing && 6236ad6d620eSStefan Behrens * mirror_num == num_stripes + 1 && dev_replace target drive is 6237ad6d620eSStefan Behrens * available as a mirror 6238ad6d620eSStefan Behrens */ 6239ad6d620eSStefan Behrens if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 6240ad6d620eSStefan Behrens WARN_ON(num_stripes > 1); 6241ad6d620eSStefan Behrens bbio->stripes[0].dev = dev_replace->tgtdev; 6242ad6d620eSStefan Behrens bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 6243ad6d620eSStefan Behrens bbio->mirror_num = map->num_stripes + 1; 6244ad6d620eSStefan Behrens } 6245cea9e445SChris Mason out: 624673beece9SLiu Bo if (dev_replace_is_ongoing) { 624753176ddeSDavid Sterba lockdep_assert_held(&dev_replace->rwsem); 624853176ddeSDavid Sterba /* Unlock and let waiting writers proceed */ 6249cb5583ddSDavid Sterba up_read(&dev_replace->rwsem); 625073beece9SLiu Bo } 62510b86a832SChris Mason free_extent_map(em); 6252de11cc12SLi Zefan return ret; 62530b86a832SChris Mason } 62540b86a832SChris Mason 6255cf8cddd3SChristoph Hellwig int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6256f2d8d74dSChris Mason u64 logical, u64 *length, 6257a1d3c478SJan Schmidt struct btrfs_bio **bbio_ret, int mirror_num) 6258f2d8d74dSChris Mason { 625975fb2e9eSDavid Sterba if (op == BTRFS_MAP_DISCARD) 626075fb2e9eSDavid Sterba return __btrfs_map_block_for_discard(fs_info, logical, 626175fb2e9eSDavid Sterba length, bbio_ret); 626275fb2e9eSDavid Sterba 6263b3d3fa51SMike Christie return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 62648e5cfb55SZhao Lei mirror_num, 0); 6265f2d8d74dSChris Mason } 6266f2d8d74dSChris Mason 6267af8e2d1dSMiao Xie /* For Scrub/replace */ 6268cf8cddd3SChristoph Hellwig int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6269af8e2d1dSMiao Xie u64 logical, u64 *length, 6270825ad4c9SDavid Sterba struct btrfs_bio **bbio_ret) 6271af8e2d1dSMiao Xie { 6272825ad4c9SDavid Sterba return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); 6273af8e2d1dSMiao Xie } 6274af8e2d1dSMiao Xie 62754246a0b6SChristoph Hellwig static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) 62768408c716SMiao Xie { 6277326e1dbbSMike Snitzer bio->bi_private = bbio->private; 6278326e1dbbSMike Snitzer bio->bi_end_io = bbio->end_io; 62794246a0b6SChristoph Hellwig bio_endio(bio); 6280326e1dbbSMike 
Snitzer 62816e9606d2SZhao Lei btrfs_put_bbio(bbio); 62828408c716SMiao Xie } 62838408c716SMiao Xie 62844246a0b6SChristoph Hellwig static void btrfs_end_bio(struct bio *bio) 62858790d502SChris Mason { 62869be3395bSChris Mason struct btrfs_bio *bbio = bio->bi_private; 62877d2b4daaSChris Mason int is_orig_bio = 0; 62888790d502SChris Mason 62894e4cbee9SChristoph Hellwig if (bio->bi_status) { 6290a1d3c478SJan Schmidt atomic_inc(&bbio->error); 62914e4cbee9SChristoph Hellwig if (bio->bi_status == BLK_STS_IOERR || 62924e4cbee9SChristoph Hellwig bio->bi_status == BLK_STS_TARGET) { 6293c31efbdfSNikolay Borisov struct btrfs_device *dev = btrfs_io_bio(bio)->device; 6294442a4f63SStefan Behrens 62953eee86c8SNikolay Borisov ASSERT(dev->bdev); 629637226b21SMike Christie if (bio_op(bio) == REQ_OP_WRITE) 62971cb34c8eSAnand Jain btrfs_dev_stat_inc_and_print(dev, 6298442a4f63SStefan Behrens BTRFS_DEV_STAT_WRITE_ERRS); 62990cc068e6SDavid Sterba else if (!(bio->bi_opf & REQ_RAHEAD)) 63001cb34c8eSAnand Jain btrfs_dev_stat_inc_and_print(dev, 6301442a4f63SStefan Behrens BTRFS_DEV_STAT_READ_ERRS); 630270fd7614SChristoph Hellwig if (bio->bi_opf & REQ_PREFLUSH) 63031cb34c8eSAnand Jain btrfs_dev_stat_inc_and_print(dev, 6304442a4f63SStefan Behrens BTRFS_DEV_STAT_FLUSH_ERRS); 6305442a4f63SStefan Behrens } 6306442a4f63SStefan Behrens } 63078790d502SChris Mason 6308a1d3c478SJan Schmidt if (bio == bbio->orig_bio) 63097d2b4daaSChris Mason is_orig_bio = 1; 63107d2b4daaSChris Mason 6311c404e0dcSMiao Xie btrfs_bio_counter_dec(bbio->fs_info); 6312c404e0dcSMiao Xie 6313a1d3c478SJan Schmidt if (atomic_dec_and_test(&bbio->stripes_pending)) { 63147d2b4daaSChris Mason if (!is_orig_bio) { 63157d2b4daaSChris Mason bio_put(bio); 6316a1d3c478SJan Schmidt bio = bbio->orig_bio; 63177d2b4daaSChris Mason } 6318c7b22bb1SMuthu Kumar 63199be3395bSChris Mason btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6320a236aed1SChris Mason /* only send an error to the higher layers if it is 632153b381b3SDavid Woodhouse * beyond the tolerance of the btrfs bio 6322a236aed1SChris Mason */ 6323a1d3c478SJan Schmidt if (atomic_read(&bbio->error) > bbio->max_errors) { 63244e4cbee9SChristoph Hellwig bio->bi_status = BLK_STS_IOERR; 63255dbc8fcaSChris Mason } else { 63261259ab75SChris Mason /* 63271259ab75SChris Mason * this bio is actually up to date, we didn't 63281259ab75SChris Mason * go over the max number of errors 63291259ab75SChris Mason */ 63302dbe0c77SAnand Jain bio->bi_status = BLK_STS_OK; 63311259ab75SChris Mason } 6332c55f1396SMiao Xie 63334246a0b6SChristoph Hellwig btrfs_end_bbio(bbio, bio); 63347d2b4daaSChris Mason } else if (!is_orig_bio) { 63358790d502SChris Mason bio_put(bio); 63368790d502SChris Mason } 63378790d502SChris Mason } 63388790d502SChris Mason 63392ff7e61eSJeff Mahoney static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, 6340c31efbdfSNikolay Borisov u64 physical, struct btrfs_device *dev) 6341de1ee92aSJosef Bacik { 63422ff7e61eSJeff Mahoney struct btrfs_fs_info *fs_info = bbio->fs_info; 6343de1ee92aSJosef Bacik 6344de1ee92aSJosef Bacik bio->bi_private = bbio; 6345c31efbdfSNikolay Borisov btrfs_io_bio(bio)->device = dev; 6346de1ee92aSJosef Bacik bio->bi_end_io = btrfs_end_bio; 63474f024f37SKent Overstreet bio->bi_iter.bi_sector = physical >> 9; 6348672d5990SMisono Tomohiro btrfs_debug_in_rcu(fs_info, 6349ab8d0fc4SJeff Mahoney "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6350672d5990SMisono Tomohiro bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector, 63511db45a35SDavid Sterba (unsigned 
long)dev->bdev->bd_dev, rcu_str_deref(dev->name), 63521db45a35SDavid Sterba dev->devid, bio->bi_iter.bi_size); 635374d46992SChristoph Hellwig bio_set_dev(bio, dev->bdev); 6354c404e0dcSMiao Xie 63552ff7e61eSJeff Mahoney btrfs_bio_counter_inc_noblocked(fs_info); 6356c404e0dcSMiao Xie 63574e49ea4aSMike Christie btrfsic_submit_bio(bio); 6358de1ee92aSJosef Bacik } 6359de1ee92aSJosef Bacik 6360de1ee92aSJosef Bacik static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 6361de1ee92aSJosef Bacik { 6362de1ee92aSJosef Bacik atomic_inc(&bbio->error); 6363de1ee92aSJosef Bacik if (atomic_dec_and_test(&bbio->stripes_pending)) { 636401327610SNicholas D Steeves /* Should be the original bio. */ 63658408c716SMiao Xie WARN_ON(bio != bbio->orig_bio); 63668408c716SMiao Xie 63679be3395bSChris Mason btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 63684f024f37SKent Overstreet bio->bi_iter.bi_sector = logical >> 9; 6369102ed2c5SAnand Jain if (atomic_read(&bbio->error) > bbio->max_errors) 63704e4cbee9SChristoph Hellwig bio->bi_status = BLK_STS_IOERR; 6371102ed2c5SAnand Jain else 6372102ed2c5SAnand Jain bio->bi_status = BLK_STS_OK; 63734246a0b6SChristoph Hellwig btrfs_end_bbio(bbio, bio); 6374de1ee92aSJosef Bacik } 6375de1ee92aSJosef Bacik } 6376de1ee92aSJosef Bacik 637758efbc9fSOmar Sandoval blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 637808635baeSChris Mason int mirror_num) 63790b86a832SChris Mason { 63800b86a832SChris Mason struct btrfs_device *dev; 63818790d502SChris Mason struct bio *first_bio = bio; 63824f024f37SKent Overstreet u64 logical = (u64)bio->bi_iter.bi_sector << 9; 63830b86a832SChris Mason u64 length = 0; 63840b86a832SChris Mason u64 map_length; 63850b86a832SChris Mason int ret; 638608da757dSZhao Lei int dev_nr; 638708da757dSZhao Lei int total_devs; 6388a1d3c478SJan Schmidt struct btrfs_bio *bbio = NULL; 63890b86a832SChris Mason 63904f024f37SKent Overstreet length = bio->bi_iter.bi_size; 63910b86a832SChris Mason map_length = length; 6392cea9e445SChris Mason 63930b246afaSJeff Mahoney btrfs_bio_counter_inc_blocked(fs_info); 6394bd7d63c2SLiu Bo ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, 639537226b21SMike Christie &map_length, &bbio, mirror_num, 1); 6396c404e0dcSMiao Xie if (ret) { 63970b246afaSJeff Mahoney btrfs_bio_counter_dec(fs_info); 639858efbc9fSOmar Sandoval return errno_to_blk_status(ret); 6399c404e0dcSMiao Xie } 6400cea9e445SChris Mason 6401a1d3c478SJan Schmidt total_devs = bbio->num_stripes; 640253b381b3SDavid Woodhouse bbio->orig_bio = first_bio; 640353b381b3SDavid Woodhouse bbio->private = first_bio->bi_private; 640453b381b3SDavid Woodhouse bbio->end_io = first_bio->bi_end_io; 64050b246afaSJeff Mahoney bbio->fs_info = fs_info; 640653b381b3SDavid Woodhouse atomic_set(&bbio->stripes_pending, bbio->num_stripes); 640753b381b3SDavid Woodhouse 6408ad1ba2a0SZhao Lei if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 640937226b21SMike Christie ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { 641053b381b3SDavid Woodhouse /* In this case, map_length has been set to the length of 641153b381b3SDavid Woodhouse a single stripe; not the whole write */ 641237226b21SMike Christie if (bio_op(bio) == REQ_OP_WRITE) { 64132ff7e61eSJeff Mahoney ret = raid56_parity_write(fs_info, bio, bbio, 64142ff7e61eSJeff Mahoney map_length); 641553b381b3SDavid Woodhouse } else { 64162ff7e61eSJeff Mahoney ret = raid56_parity_recover(fs_info, bio, bbio, 64172ff7e61eSJeff Mahoney map_length, mirror_num, 1); 641853b381b3SDavid Woodhouse } 
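/*
 * Annotation: raid56_parity_write() and raid56_parity_recover() take over
 * submission of the bio through the RAID5/6 machinery, so all that is left
 * here is to drop the bio counter reference taken at the start of this
 * function and propagate the return value.
 */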
64194245215dSMiao Xie 64200b246afaSJeff Mahoney btrfs_bio_counter_dec(fs_info); 642158efbc9fSOmar Sandoval return errno_to_blk_status(ret); 642253b381b3SDavid Woodhouse } 642353b381b3SDavid Woodhouse 6424239b14b3SChris Mason if (map_length < length) { 64250b246afaSJeff Mahoney btrfs_crit(fs_info, 64265d163e0eSJeff Mahoney "mapping failed logical %llu bio len %llu len %llu", 6427c1c9ff7cSGeert Uytterhoeven logical, length, map_length); 6428239b14b3SChris Mason BUG(); 6429239b14b3SChris Mason } 6430a1d3c478SJan Schmidt 643108da757dSZhao Lei for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { 6432de1ee92aSJosef Bacik dev = bbio->stripes[dev_nr].dev; 6433fc8a168aSNikolay Borisov if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, 6434fc8a168aSNikolay Borisov &dev->dev_state) || 6435ebbede42SAnand Jain (bio_op(first_bio) == REQ_OP_WRITE && 6436ebbede42SAnand Jain !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { 6437de1ee92aSJosef Bacik bbio_error(bbio, first_bio, logical); 6438de1ee92aSJosef Bacik continue; 6439de1ee92aSJosef Bacik } 6440de1ee92aSJosef Bacik 64413aa8e074SDavid Sterba if (dev_nr < total_devs - 1) 64428b6c1d56SDavid Sterba bio = btrfs_bio_clone(first_bio); 64433aa8e074SDavid Sterba else 64448790d502SChris Mason bio = first_bio; 6445606686eeSJosef Bacik 6446c31efbdfSNikolay Borisov submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev); 64478790d502SChris Mason } 64480b246afaSJeff Mahoney btrfs_bio_counter_dec(fs_info); 644958efbc9fSOmar Sandoval return BLK_STS_OK; 64500b86a832SChris Mason } 64510b86a832SChris Mason 645209ba3bc9SAnand Jain /* 645309ba3bc9SAnand Jain * Find a device specified by @devid or @uuid in the list of @fs_devices, or 645409ba3bc9SAnand Jain * return NULL. 645509ba3bc9SAnand Jain * 645609ba3bc9SAnand Jain * If devid and uuid are both specified, the match must be exact, otherwise 645709ba3bc9SAnand Jain * only devid is used. 645809ba3bc9SAnand Jain * 645909ba3bc9SAnand Jain * If @seed is true, traverse through the seed devices. 
646009ba3bc9SAnand Jain */ 6461e4319cd9SAnand Jain struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, 646209ba3bc9SAnand Jain u64 devid, u8 *uuid, u8 *fsid, 646309ba3bc9SAnand Jain bool seed) 64640b86a832SChris Mason { 64652b82032cSYan Zheng struct btrfs_device *device; 6466944d3f9fSNikolay Borisov struct btrfs_fs_devices *seed_devs; 64670b86a832SChris Mason 6468944d3f9fSNikolay Borisov if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { 6469944d3f9fSNikolay Borisov list_for_each_entry(device, &fs_devices->devices, dev_list) { 6470944d3f9fSNikolay Borisov if (device->devid == devid && 6471944d3f9fSNikolay Borisov (!uuid || memcmp(device->uuid, uuid, 6472944d3f9fSNikolay Borisov BTRFS_UUID_SIZE) == 0)) 6473944d3f9fSNikolay Borisov return device; 6474944d3f9fSNikolay Borisov } 6475944d3f9fSNikolay Borisov } 6476944d3f9fSNikolay Borisov 6477944d3f9fSNikolay Borisov list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 64782b82032cSYan Zheng if (!fsid || 6479944d3f9fSNikolay Borisov !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { 6480944d3f9fSNikolay Borisov list_for_each_entry(device, &seed_devs->devices, 648109ba3bc9SAnand Jain dev_list) { 648209ba3bc9SAnand Jain if (device->devid == devid && 648309ba3bc9SAnand Jain (!uuid || memcmp(device->uuid, uuid, 648409ba3bc9SAnand Jain BTRFS_UUID_SIZE) == 0)) 64852b82032cSYan Zheng return device; 64862b82032cSYan Zheng } 648709ba3bc9SAnand Jain } 64882b82032cSYan Zheng } 6489944d3f9fSNikolay Borisov 64902b82032cSYan Zheng return NULL; 64910b86a832SChris Mason } 64920b86a832SChris Mason 64932ff7e61eSJeff Mahoney static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6494dfe25020SChris Mason u64 devid, u8 *dev_uuid) 6495dfe25020SChris Mason { 6496dfe25020SChris Mason struct btrfs_device *device; 6497fccc0007SJosef Bacik unsigned int nofs_flag; 6498dfe25020SChris Mason 6499fccc0007SJosef Bacik /* 6500fccc0007SJosef Bacik * We call this under the chunk_mutex, so we want to use NOFS for this 6501fccc0007SJosef Bacik * allocation, however we don't want to change btrfs_alloc_device() to 6502fccc0007SJosef Bacik * always do NOFS because we use it in a lot of other GFP_KERNEL safe 6503fccc0007SJosef Bacik * places. 6504fccc0007SJosef Bacik */ 6505fccc0007SJosef Bacik nofs_flag = memalloc_nofs_save(); 650612bd2fc0SIlya Dryomov device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6507fccc0007SJosef Bacik memalloc_nofs_restore(nofs_flag); 650812bd2fc0SIlya Dryomov if (IS_ERR(device)) 6509adfb69afSAnand Jain return device; 651012bd2fc0SIlya Dryomov 651112bd2fc0SIlya Dryomov list_add(&device->dev_list, &fs_devices->devices); 6512e4404d6eSYan Zheng device->fs_devices = fs_devices; 6513dfe25020SChris Mason fs_devices->num_devices++; 651412bd2fc0SIlya Dryomov 6515e6e674bdSAnand Jain set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6516cd02dca5SChris Mason fs_devices->missing_devices++; 651712bd2fc0SIlya Dryomov 6518dfe25020SChris Mason return device; 6519dfe25020SChris Mason } 6520dfe25020SChris Mason 652112bd2fc0SIlya Dryomov /** 652212bd2fc0SIlya Dryomov * btrfs_alloc_device - allocate struct btrfs_device 652312bd2fc0SIlya Dryomov * @fs_info: used only for generating a new devid, can be NULL if 652412bd2fc0SIlya Dryomov * devid is provided (i.e. @devid != NULL). 652512bd2fc0SIlya Dryomov * @devid: a pointer to devid for this device. If NULL a new devid 652612bd2fc0SIlya Dryomov * is generated. 652712bd2fc0SIlya Dryomov * @uuid: a pointer to UUID for this device. 
If NULL a new UUID 652812bd2fc0SIlya Dryomov * is generated. 652912bd2fc0SIlya Dryomov * 653012bd2fc0SIlya Dryomov * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 653148dae9cfSDavid Sterba * on error. Returned struct is not linked onto any lists and must be 6532a425f9d4SDavid Sterba * destroyed with btrfs_free_device. 653312bd2fc0SIlya Dryomov */ 653412bd2fc0SIlya Dryomov struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 653512bd2fc0SIlya Dryomov const u64 *devid, 653612bd2fc0SIlya Dryomov const u8 *uuid) 653712bd2fc0SIlya Dryomov { 653812bd2fc0SIlya Dryomov struct btrfs_device *dev; 653912bd2fc0SIlya Dryomov u64 tmp; 654012bd2fc0SIlya Dryomov 6541fae7f21cSDulshani Gunawardhana if (WARN_ON(!devid && !fs_info)) 654212bd2fc0SIlya Dryomov return ERR_PTR(-EINVAL); 654312bd2fc0SIlya Dryomov 6544154f7cb8SQu Wenruo dev = __alloc_device(fs_info); 654512bd2fc0SIlya Dryomov if (IS_ERR(dev)) 654612bd2fc0SIlya Dryomov return dev; 654712bd2fc0SIlya Dryomov 654812bd2fc0SIlya Dryomov if (devid) 654912bd2fc0SIlya Dryomov tmp = *devid; 655012bd2fc0SIlya Dryomov else { 655112bd2fc0SIlya Dryomov int ret; 655212bd2fc0SIlya Dryomov 655312bd2fc0SIlya Dryomov ret = find_next_devid(fs_info, &tmp); 655412bd2fc0SIlya Dryomov if (ret) { 6555a425f9d4SDavid Sterba btrfs_free_device(dev); 655612bd2fc0SIlya Dryomov return ERR_PTR(ret); 655712bd2fc0SIlya Dryomov } 655812bd2fc0SIlya Dryomov } 655912bd2fc0SIlya Dryomov dev->devid = tmp; 656012bd2fc0SIlya Dryomov 656112bd2fc0SIlya Dryomov if (uuid) 656212bd2fc0SIlya Dryomov memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 656312bd2fc0SIlya Dryomov else 656412bd2fc0SIlya Dryomov generate_random_uuid(dev->uuid); 656512bd2fc0SIlya Dryomov 656612bd2fc0SIlya Dryomov return dev; 656712bd2fc0SIlya Dryomov } 656812bd2fc0SIlya Dryomov 65695a2b8e60SAnand Jain static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 65702b902dfcSAnand Jain u64 devid, u8 *uuid, bool error) 65715a2b8e60SAnand Jain { 65722b902dfcSAnand Jain if (error) 65732b902dfcSAnand Jain btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 65742b902dfcSAnand Jain devid, uuid); 65752b902dfcSAnand Jain else 65762b902dfcSAnand Jain btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 65772b902dfcSAnand Jain devid, uuid); 65785a2b8e60SAnand Jain } 65795a2b8e60SAnand Jain 658039e264a4SNikolay Borisov static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) 658139e264a4SNikolay Borisov { 658239e264a4SNikolay Borisov int index = btrfs_bg_flags_to_raid_index(type); 658339e264a4SNikolay Borisov int ncopies = btrfs_raid_array[index].ncopies; 6584e4f6c6beSDavid Sterba const int nparity = btrfs_raid_array[index].nparity; 658539e264a4SNikolay Borisov int data_stripes; 658639e264a4SNikolay Borisov 6587e4f6c6beSDavid Sterba if (nparity) 6588e4f6c6beSDavid Sterba data_stripes = num_stripes - nparity; 6589e4f6c6beSDavid Sterba else 659039e264a4SNikolay Borisov data_stripes = num_stripes / ncopies; 6591e4f6c6beSDavid Sterba 659239e264a4SNikolay Borisov return div_u64(chunk_len, data_stripes); 659339e264a4SNikolay Borisov } 659439e264a4SNikolay Borisov 65959690ac09SDavid Sterba static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, 65960b86a832SChris Mason struct btrfs_chunk *chunk) 65970b86a832SChris Mason { 65989690ac09SDavid Sterba struct btrfs_fs_info *fs_info = leaf->fs_info; 6599c8bf1b67SDavid Sterba struct extent_map_tree *map_tree = &fs_info->mapping_tree; 66000b86a832SChris Mason struct map_lookup *map; 66010b86a832SChris Mason 
struct extent_map *em; 66020b86a832SChris Mason u64 logical; 66030b86a832SChris Mason u64 length; 66040b86a832SChris Mason u64 devid; 6605a443755fSChris Mason u8 uuid[BTRFS_UUID_SIZE]; 6606593060d7SChris Mason int num_stripes; 66070b86a832SChris Mason int ret; 6608593060d7SChris Mason int i; 66090b86a832SChris Mason 6610e17cade2SChris Mason logical = key->offset; 6611e17cade2SChris Mason length = btrfs_chunk_length(leaf, chunk); 6612f04b772bSQu Wenruo num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6613e06cd3ddSLiu Bo 6614075cb3c7SQu Wenruo /* 6615075cb3c7SQu Wenruo * Only need to verify chunk item if we're reading from sys chunk array, 6616075cb3c7SQu Wenruo * as chunk item in tree block is already verified by tree-checker. 6617075cb3c7SQu Wenruo */ 6618075cb3c7SQu Wenruo if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { 6619ddaf1d5aSDavid Sterba ret = btrfs_check_chunk_valid(leaf, chunk, logical); 6620e06cd3ddSLiu Bo if (ret) 6621e06cd3ddSLiu Bo return ret; 6622075cb3c7SQu Wenruo } 6623a061fc8dSChris Mason 6624c8bf1b67SDavid Sterba read_lock(&map_tree->lock); 6625c8bf1b67SDavid Sterba em = lookup_extent_mapping(map_tree, logical, 1); 6626c8bf1b67SDavid Sterba read_unlock(&map_tree->lock); 66270b86a832SChris Mason 66280b86a832SChris Mason /* already mapped? */ 66290b86a832SChris Mason if (em && em->start <= logical && em->start + em->len > logical) { 66300b86a832SChris Mason free_extent_map(em); 66310b86a832SChris Mason return 0; 66320b86a832SChris Mason } else if (em) { 66330b86a832SChris Mason free_extent_map(em); 66340b86a832SChris Mason } 66350b86a832SChris Mason 6636172ddd60SDavid Sterba em = alloc_extent_map(); 66370b86a832SChris Mason if (!em) 66380b86a832SChris Mason return -ENOMEM; 6639593060d7SChris Mason map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 66400b86a832SChris Mason if (!map) { 66410b86a832SChris Mason free_extent_map(em); 66420b86a832SChris Mason return -ENOMEM; 66430b86a832SChris Mason } 66440b86a832SChris Mason 6645298a8f9cSWang Shilong set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 664695617d69SJeff Mahoney em->map_lookup = map; 66470b86a832SChris Mason em->start = logical; 66480b86a832SChris Mason em->len = length; 664970c8a91cSJosef Bacik em->orig_start = 0; 66500b86a832SChris Mason em->block_start = 0; 6651c8b97818SChris Mason em->block_len = em->len; 66520b86a832SChris Mason 6653593060d7SChris Mason map->num_stripes = num_stripes; 6654593060d7SChris Mason map->io_width = btrfs_chunk_io_width(leaf, chunk); 6655593060d7SChris Mason map->io_align = btrfs_chunk_io_align(leaf, chunk); 6656593060d7SChris Mason map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6657593060d7SChris Mason map->type = btrfs_chunk_type(leaf, chunk); 6658321aecc6SChris Mason map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6659cf90d884SQu Wenruo map->verified_stripes = 0; 666039e264a4SNikolay Borisov em->orig_block_len = calc_stripe_length(map->type, em->len, 666139e264a4SNikolay Borisov map->num_stripes); 6662593060d7SChris Mason for (i = 0; i < num_stripes; i++) { 6663593060d7SChris Mason map->stripes[i].physical = 6664593060d7SChris Mason btrfs_stripe_offset_nr(leaf, chunk, i); 6665593060d7SChris Mason devid = btrfs_stripe_devid_nr(leaf, chunk, i); 6666a443755fSChris Mason read_extent_buffer(leaf, uuid, (unsigned long) 6667a443755fSChris Mason btrfs_stripe_dev_uuid_nr(chunk, i), 6668a443755fSChris Mason BTRFS_UUID_SIZE); 6669e4319cd9SAnand Jain map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, 667009ba3bc9SAnand Jain devid, uuid, NULL, true); 
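/*
 * Annotation: a stripe whose device cannot be found is only tolerated when
 * the filesystem is mounted with -o degraded; otherwise the failed lookup
 * just below makes read_one_chunk() return -ENOENT.
 */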
66713cdde224SJeff Mahoney if (!map->stripes[i].dev && 66720b246afaSJeff Mahoney !btrfs_test_opt(fs_info, DEGRADED)) { 6673dfe25020SChris Mason free_extent_map(em); 66742b902dfcSAnand Jain btrfs_report_missing_device(fs_info, devid, uuid, true); 667545dbdbc9SAnand Jain return -ENOENT; 6676dfe25020SChris Mason } 6677dfe25020SChris Mason if (!map->stripes[i].dev) { 6678dfe25020SChris Mason map->stripes[i].dev = 66792ff7e61eSJeff Mahoney add_missing_dev(fs_info->fs_devices, devid, 66802ff7e61eSJeff Mahoney uuid); 6681adfb69afSAnand Jain if (IS_ERR(map->stripes[i].dev)) { 66820b86a832SChris Mason free_extent_map(em); 6683adfb69afSAnand Jain btrfs_err(fs_info, 6684adfb69afSAnand Jain "failed to init missing dev %llu: %ld", 6685adfb69afSAnand Jain devid, PTR_ERR(map->stripes[i].dev)); 6686adfb69afSAnand Jain return PTR_ERR(map->stripes[i].dev); 66870b86a832SChris Mason } 66882b902dfcSAnand Jain btrfs_report_missing_device(fs_info, devid, uuid, false); 6689593060d7SChris Mason } 6690e12c9621SAnand Jain set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 6691e12c9621SAnand Jain &(map->stripes[i].dev->dev_state)); 6692e12c9621SAnand Jain 6693dfe25020SChris Mason } 66940b86a832SChris Mason 6695c8bf1b67SDavid Sterba write_lock(&map_tree->lock); 6696c8bf1b67SDavid Sterba ret = add_extent_mapping(map_tree, em, 0); 6697c8bf1b67SDavid Sterba write_unlock(&map_tree->lock); 669864f64f43SQu Wenruo if (ret < 0) { 669964f64f43SQu Wenruo btrfs_err(fs_info, 670064f64f43SQu Wenruo "failed to add chunk map, start=%llu len=%llu: %d", 670164f64f43SQu Wenruo em->start, em->len, ret); 670264f64f43SQu Wenruo } 67030b86a832SChris Mason free_extent_map(em); 67040b86a832SChris Mason 670564f64f43SQu Wenruo return ret; 67060b86a832SChris Mason } 67070b86a832SChris Mason 6708143bede5SJeff Mahoney static void fill_device_from_item(struct extent_buffer *leaf, 67090b86a832SChris Mason struct btrfs_dev_item *dev_item, 67100b86a832SChris Mason struct btrfs_device *device) 67110b86a832SChris Mason { 67120b86a832SChris Mason unsigned long ptr; 67130b86a832SChris Mason 67140b86a832SChris Mason device->devid = btrfs_device_id(leaf, dev_item); 6715d6397baeSChris Ball device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 6716d6397baeSChris Ball device->total_bytes = device->disk_total_bytes; 6717935e5cc9SMiao Xie device->commit_total_bytes = device->disk_total_bytes; 67180b86a832SChris Mason device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 6719ce7213c7SMiao Xie device->commit_bytes_used = device->bytes_used; 67200b86a832SChris Mason device->type = btrfs_device_type(leaf, dev_item); 67210b86a832SChris Mason device->io_align = btrfs_device_io_align(leaf, dev_item); 67220b86a832SChris Mason device->io_width = btrfs_device_io_width(leaf, dev_item); 67230b86a832SChris Mason device->sector_size = btrfs_device_sector_size(leaf, dev_item); 67248dabb742SStefan Behrens WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 6725401e29c1SAnand Jain clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 67260b86a832SChris Mason 6727410ba3a2SGeert Uytterhoeven ptr = btrfs_device_uuid(dev_item); 6728e17cade2SChris Mason read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 67290b86a832SChris Mason } 67300b86a832SChris Mason 67312ff7e61eSJeff Mahoney static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 67325f375835SMiao Xie u8 *fsid) 67332b82032cSYan Zheng { 67342b82032cSYan Zheng struct btrfs_fs_devices *fs_devices; 67352b82032cSYan Zheng int ret; 67362b82032cSYan Zheng 6737a32bf9a3SDavid Sterba 
lockdep_assert_held(&uuid_mutex); 67382dfeca9bSDavid Sterba ASSERT(fsid); 67392b82032cSYan Zheng 6740427c8fddSNikolay Borisov /* This will match only for multi-device seed fs */ 6741944d3f9fSNikolay Borisov list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) 674244880fdcSAnand Jain if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 67435f375835SMiao Xie return fs_devices; 67445f375835SMiao Xie 67452b82032cSYan Zheng 67467239ff4bSNikolay Borisov fs_devices = find_fsid(fsid, NULL); 67472b82032cSYan Zheng if (!fs_devices) { 67480b246afaSJeff Mahoney if (!btrfs_test_opt(fs_info, DEGRADED)) 67495f375835SMiao Xie return ERR_PTR(-ENOENT); 67505f375835SMiao Xie 67517239ff4bSNikolay Borisov fs_devices = alloc_fs_devices(fsid, NULL); 67525f375835SMiao Xie if (IS_ERR(fs_devices)) 67535f375835SMiao Xie return fs_devices; 67545f375835SMiao Xie 67550395d84fSJohannes Thumshirn fs_devices->seeding = true; 67565f375835SMiao Xie fs_devices->opened = 1; 67575f375835SMiao Xie return fs_devices; 67582b82032cSYan Zheng } 6759e4404d6eSYan Zheng 6760427c8fddSNikolay Borisov /* 6761427c8fddSNikolay Borisov * Upon first call for a seed fs fsid, just create a private copy of the 6762427c8fddSNikolay Borisov * respective fs_devices and anchor it at fs_info->fs_devices->seed_list 6763427c8fddSNikolay Borisov */ 6764e4404d6eSYan Zheng fs_devices = clone_fs_devices(fs_devices); 67655f375835SMiao Xie if (IS_ERR(fs_devices)) 67665f375835SMiao Xie return fs_devices; 67672b82032cSYan Zheng 6768897fb573SAnand Jain ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); 676948d28232SJulia Lawall if (ret) { 677048d28232SJulia Lawall free_fs_devices(fs_devices); 6771c83b60c0SAnand Jain return ERR_PTR(ret); 677248d28232SJulia Lawall } 67732b82032cSYan Zheng 67742b82032cSYan Zheng if (!fs_devices->seeding) { 67750226e0ebSAnand Jain close_fs_devices(fs_devices); 6776e4404d6eSYan Zheng free_fs_devices(fs_devices); 6777c83b60c0SAnand Jain return ERR_PTR(-EINVAL); 67782b82032cSYan Zheng } 67792b82032cSYan Zheng 6780944d3f9fSNikolay Borisov list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); 6781c83b60c0SAnand Jain 67825f375835SMiao Xie return fs_devices; 67832b82032cSYan Zheng } 67842b82032cSYan Zheng 678517850759SDavid Sterba static int read_one_dev(struct extent_buffer *leaf, 67860b86a832SChris Mason struct btrfs_dev_item *dev_item) 67870b86a832SChris Mason { 678817850759SDavid Sterba struct btrfs_fs_info *fs_info = leaf->fs_info; 67890b246afaSJeff Mahoney struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 67900b86a832SChris Mason struct btrfs_device *device; 67910b86a832SChris Mason u64 devid; 67920b86a832SChris Mason int ret; 679344880fdcSAnand Jain u8 fs_uuid[BTRFS_FSID_SIZE]; 6794a443755fSChris Mason u8 dev_uuid[BTRFS_UUID_SIZE]; 6795a443755fSChris Mason 67960b86a832SChris Mason devid = btrfs_device_id(leaf, dev_item); 6797410ba3a2SGeert Uytterhoeven read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 6798a443755fSChris Mason BTRFS_UUID_SIZE); 67991473b24eSGeert Uytterhoeven read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 680044880fdcSAnand Jain BTRFS_FSID_SIZE); 68012b82032cSYan Zheng 6802de37aa51SNikolay Borisov if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { 68032ff7e61eSJeff Mahoney fs_devices = open_seed_devices(fs_info, fs_uuid); 68045f375835SMiao Xie if (IS_ERR(fs_devices)) 68055f375835SMiao Xie return PTR_ERR(fs_devices); 68062b82032cSYan Zheng } 68072b82032cSYan Zheng 6808e4319cd9SAnand Jain device = 
btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 680909ba3bc9SAnand Jain fs_uuid, true); 68105f375835SMiao Xie if (!device) { 6811c5502451SQu Wenruo if (!btrfs_test_opt(fs_info, DEGRADED)) { 68122b902dfcSAnand Jain btrfs_report_missing_device(fs_info, devid, 68132b902dfcSAnand Jain dev_uuid, true); 681445dbdbc9SAnand Jain return -ENOENT; 6815c5502451SQu Wenruo } 68162b82032cSYan Zheng 68172ff7e61eSJeff Mahoney device = add_missing_dev(fs_devices, devid, dev_uuid); 6818adfb69afSAnand Jain if (IS_ERR(device)) { 6819adfb69afSAnand Jain btrfs_err(fs_info, 6820adfb69afSAnand Jain "failed to add missing dev %llu: %ld", 6821adfb69afSAnand Jain devid, PTR_ERR(device)); 6822adfb69afSAnand Jain return PTR_ERR(device); 6823adfb69afSAnand Jain } 68242b902dfcSAnand Jain btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 68255f375835SMiao Xie } else { 6826c5502451SQu Wenruo if (!device->bdev) { 68272b902dfcSAnand Jain if (!btrfs_test_opt(fs_info, DEGRADED)) { 68282b902dfcSAnand Jain btrfs_report_missing_device(fs_info, 68292b902dfcSAnand Jain devid, dev_uuid, true); 683045dbdbc9SAnand Jain return -ENOENT; 6831c5502451SQu Wenruo } 68322b902dfcSAnand Jain btrfs_report_missing_device(fs_info, devid, 68332b902dfcSAnand Jain dev_uuid, false); 68342b902dfcSAnand Jain } 68355f375835SMiao Xie 6836e6e674bdSAnand Jain if (!device->bdev && 6837e6e674bdSAnand Jain !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 6838cd02dca5SChris Mason /* 6839cd02dca5SChris Mason * this happens when a device that was properly setup 6840cd02dca5SChris Mason * in the device info lists suddenly goes bad. 6841cd02dca5SChris Mason * device->bdev is NULL, and so we have to set 6842cd02dca5SChris Mason * device->missing to one here 6843cd02dca5SChris Mason */ 68445f375835SMiao Xie device->fs_devices->missing_devices++; 6845e6e674bdSAnand Jain set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 68466324fbf3SChris Mason } 68475f375835SMiao Xie 68485f375835SMiao Xie /* Move the device to its own fs_devices */ 68495f375835SMiao Xie if (device->fs_devices != fs_devices) { 6850e6e674bdSAnand Jain ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, 6851e6e674bdSAnand Jain &device->dev_state)); 68525f375835SMiao Xie 68535f375835SMiao Xie list_move(&device->dev_list, &fs_devices->devices); 68545f375835SMiao Xie device->fs_devices->num_devices--; 68555f375835SMiao Xie fs_devices->num_devices++; 68565f375835SMiao Xie 68575f375835SMiao Xie device->fs_devices->missing_devices--; 68585f375835SMiao Xie fs_devices->missing_devices++; 68595f375835SMiao Xie 68605f375835SMiao Xie device->fs_devices = fs_devices; 68615f375835SMiao Xie } 68622b82032cSYan Zheng } 68632b82032cSYan Zheng 68640b246afaSJeff Mahoney if (device->fs_devices != fs_info->fs_devices) { 6865ebbede42SAnand Jain BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); 68662b82032cSYan Zheng if (device->generation != 68672b82032cSYan Zheng btrfs_device_generation(leaf, dev_item)) 68682b82032cSYan Zheng return -EINVAL; 68692b82032cSYan Zheng } 68700b86a832SChris Mason 68710b86a832SChris Mason fill_device_from_item(leaf, dev_item, device); 6872e12c9621SAnand Jain set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 6873ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 6874401e29c1SAnand Jain !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 68752b82032cSYan Zheng device->fs_devices->total_rw_bytes += device->total_bytes; 6876a5ed45f8SNikolay Borisov atomic64_add(device->total_bytes - device->bytes_used, 
6877a5ed45f8SNikolay Borisov &fs_info->free_chunk_space); 68782bf64758SJosef Bacik } 68790b86a832SChris Mason ret = 0; 68800b86a832SChris Mason return ret; 68810b86a832SChris Mason } 68820b86a832SChris Mason 68836bccf3abSJeff Mahoney int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 68840b86a832SChris Mason { 68856bccf3abSJeff Mahoney struct btrfs_root *root = fs_info->tree_root; 6886ab8d0fc4SJeff Mahoney struct btrfs_super_block *super_copy = fs_info->super_copy; 6887a061fc8dSChris Mason struct extent_buffer *sb; 68880b86a832SChris Mason struct btrfs_disk_key *disk_key; 68890b86a832SChris Mason struct btrfs_chunk *chunk; 68901ffb22cfSDavid Sterba u8 *array_ptr; 68911ffb22cfSDavid Sterba unsigned long sb_array_offset; 689284eed90fSChris Mason int ret = 0; 68930b86a832SChris Mason u32 num_stripes; 68940b86a832SChris Mason u32 array_size; 68950b86a832SChris Mason u32 len = 0; 68961ffb22cfSDavid Sterba u32 cur_offset; 6897e06cd3ddSLiu Bo u64 type; 689884eed90fSChris Mason struct btrfs_key key; 68990b86a832SChris Mason 69000b246afaSJeff Mahoney ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 6901a83fffb7SDavid Sterba /* 6902a83fffb7SDavid Sterba * This will create extent buffer of nodesize, superblock size is 6903a83fffb7SDavid Sterba * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will 6904a83fffb7SDavid Sterba * overallocate but we can keep it as-is, only the first page is used. 6905a83fffb7SDavid Sterba */ 69063fbaf258SJosef Bacik sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET, 69073fbaf258SJosef Bacik root->root_key.objectid, 0); 6908c871b0f2SLiu Bo if (IS_ERR(sb)) 6909c871b0f2SLiu Bo return PTR_ERR(sb); 69104db8c528SDavid Sterba set_extent_buffer_uptodate(sb); 69118a334426SDavid Sterba /* 691201327610SNicholas D Steeves * The sb extent buffer is artificial and just used to read the system array. 69134db8c528SDavid Sterba * set_extent_buffer_uptodate() call does not properly mark all it's 69148a334426SDavid Sterba * pages up-to-date when the page is larger: extent does not cover the 69158a334426SDavid Sterba * whole page and consequently check_page_uptodate does not find all 69168a334426SDavid Sterba * the page's extents up-to-date (the hole beyond sb), 69178a334426SDavid Sterba * write_extent_buffer then triggers a WARN_ON. 69188a334426SDavid Sterba * 69198a334426SDavid Sterba * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, 69208a334426SDavid Sterba * but sb spans only this function. Add an explicit SetPageUptodate call 69218a334426SDavid Sterba * to silence the warning eg. on PowerPC 64. 69228a334426SDavid Sterba */ 692309cbfeafSKirill A. 
Shutemov if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 6924727011e0SChris Mason SetPageUptodate(sb->pages[0]); 69254008c04aSChris Mason 6926a061fc8dSChris Mason write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 69270b86a832SChris Mason array_size = btrfs_super_sys_array_size(super_copy); 69280b86a832SChris Mason 69291ffb22cfSDavid Sterba array_ptr = super_copy->sys_chunk_array; 69301ffb22cfSDavid Sterba sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 69311ffb22cfSDavid Sterba cur_offset = 0; 69320b86a832SChris Mason 69331ffb22cfSDavid Sterba while (cur_offset < array_size) { 69341ffb22cfSDavid Sterba disk_key = (struct btrfs_disk_key *)array_ptr; 6935e3540eabSDavid Sterba len = sizeof(*disk_key); 6936e3540eabSDavid Sterba if (cur_offset + len > array_size) 6937e3540eabSDavid Sterba goto out_short_read; 6938e3540eabSDavid Sterba 69390b86a832SChris Mason btrfs_disk_key_to_cpu(&key, disk_key); 69400b86a832SChris Mason 69411ffb22cfSDavid Sterba array_ptr += len; 69421ffb22cfSDavid Sterba sb_array_offset += len; 69431ffb22cfSDavid Sterba cur_offset += len; 69440b86a832SChris Mason 694532ab3d1bSJohannes Thumshirn if (key.type != BTRFS_CHUNK_ITEM_KEY) { 694632ab3d1bSJohannes Thumshirn btrfs_err(fs_info, 694732ab3d1bSJohannes Thumshirn "unexpected item type %u in sys_array at offset %u", 694832ab3d1bSJohannes Thumshirn (u32)key.type, cur_offset); 694932ab3d1bSJohannes Thumshirn ret = -EIO; 695032ab3d1bSJohannes Thumshirn break; 695132ab3d1bSJohannes Thumshirn } 695232ab3d1bSJohannes Thumshirn 69531ffb22cfSDavid Sterba chunk = (struct btrfs_chunk *)sb_array_offset; 6954e3540eabSDavid Sterba /* 695532ab3d1bSJohannes Thumshirn * At least one btrfs_chunk with one stripe must be present, 695632ab3d1bSJohannes Thumshirn * exact stripe count check comes afterwards 6957e3540eabSDavid Sterba */ 6958e3540eabSDavid Sterba len = btrfs_chunk_item_size(1); 6959e3540eabSDavid Sterba if (cur_offset + len > array_size) 6960e3540eabSDavid Sterba goto out_short_read; 6961e3540eabSDavid Sterba 6962e3540eabSDavid Sterba num_stripes = btrfs_chunk_num_stripes(sb, chunk); 6963f5cdedd7SDavid Sterba if (!num_stripes) { 6964ab8d0fc4SJeff Mahoney btrfs_err(fs_info, 6965ab8d0fc4SJeff Mahoney "invalid number of stripes %u in sys_array at offset %u", 6966f5cdedd7SDavid Sterba num_stripes, cur_offset); 6967f5cdedd7SDavid Sterba ret = -EIO; 6968f5cdedd7SDavid Sterba break; 6969f5cdedd7SDavid Sterba } 6970f5cdedd7SDavid Sterba 6971e06cd3ddSLiu Bo type = btrfs_chunk_type(sb, chunk); 6972e06cd3ddSLiu Bo if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 6973ab8d0fc4SJeff Mahoney btrfs_err(fs_info, 6974e06cd3ddSLiu Bo "invalid chunk type %llu in sys_array at offset %u", 6975e06cd3ddSLiu Bo type, cur_offset); 6976e06cd3ddSLiu Bo ret = -EIO; 6977e06cd3ddSLiu Bo break; 6978e06cd3ddSLiu Bo } 6979e06cd3ddSLiu Bo 6980e3540eabSDavid Sterba len = btrfs_chunk_item_size(num_stripes); 6981e3540eabSDavid Sterba if (cur_offset + len > array_size) 6982e3540eabSDavid Sterba goto out_short_read; 6983e3540eabSDavid Sterba 69849690ac09SDavid Sterba ret = read_one_chunk(&key, sb, chunk); 698584eed90fSChris Mason if (ret) 698684eed90fSChris Mason break; 698732ab3d1bSJohannes Thumshirn 69881ffb22cfSDavid Sterba array_ptr += len; 69891ffb22cfSDavid Sterba sb_array_offset += len; 69901ffb22cfSDavid Sterba cur_offset += len; 69910b86a832SChris Mason } 6992d865177aSLiu Bo clear_extent_buffer_uptodate(sb); 69931c8b5b6eSLiu Bo free_extent_buffer_stale(sb); 699484eed90fSChris Mason return ret; 6995e3540eabSDavid Sterba 6996e3540eabSDavid 
Sterba out_short_read: 6997ab8d0fc4SJeff Mahoney btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 6998e3540eabSDavid Sterba len, cur_offset); 6999d865177aSLiu Bo clear_extent_buffer_uptodate(sb); 70001c8b5b6eSLiu Bo free_extent_buffer_stale(sb); 7001e3540eabSDavid Sterba return -EIO; 70020b86a832SChris Mason } 70030b86a832SChris Mason 700421634a19SQu Wenruo /* 700521634a19SQu Wenruo * Check if all chunks in the fs are OK for read-write degraded mount 700621634a19SQu Wenruo * 70076528b99dSAnand Jain * If the @failing_dev is specified, it's accounted as missing. 70086528b99dSAnand Jain * 700921634a19SQu Wenruo * Return true if all chunks meet the minimal RW mount requirements. 701021634a19SQu Wenruo * Return false if any chunk doesn't meet the minimal RW mount requirements. 701121634a19SQu Wenruo */ 70126528b99dSAnand Jain bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 70136528b99dSAnand Jain struct btrfs_device *failing_dev) 701421634a19SQu Wenruo { 7015c8bf1b67SDavid Sterba struct extent_map_tree *map_tree = &fs_info->mapping_tree; 701621634a19SQu Wenruo struct extent_map *em; 701721634a19SQu Wenruo u64 next_start = 0; 701821634a19SQu Wenruo bool ret = true; 701921634a19SQu Wenruo 7020c8bf1b67SDavid Sterba read_lock(&map_tree->lock); 7021c8bf1b67SDavid Sterba em = lookup_extent_mapping(map_tree, 0, (u64)-1); 7022c8bf1b67SDavid Sterba read_unlock(&map_tree->lock); 702321634a19SQu Wenruo /* No chunk at all? Return false anyway */ 702421634a19SQu Wenruo if (!em) { 702521634a19SQu Wenruo ret = false; 702621634a19SQu Wenruo goto out; 702721634a19SQu Wenruo } 702821634a19SQu Wenruo while (em) { 702921634a19SQu Wenruo struct map_lookup *map; 703021634a19SQu Wenruo int missing = 0; 703121634a19SQu Wenruo int max_tolerated; 703221634a19SQu Wenruo int i; 703321634a19SQu Wenruo 703421634a19SQu Wenruo map = em->map_lookup; 703521634a19SQu Wenruo max_tolerated = 703621634a19SQu Wenruo btrfs_get_num_tolerated_disk_barrier_failures( 703721634a19SQu Wenruo map->type); 703821634a19SQu Wenruo for (i = 0; i < map->num_stripes; i++) { 703921634a19SQu Wenruo struct btrfs_device *dev = map->stripes[i].dev; 704021634a19SQu Wenruo 7041e6e674bdSAnand Jain if (!dev || !dev->bdev || 7042e6e674bdSAnand Jain test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 704321634a19SQu Wenruo dev->last_flush_error) 704421634a19SQu Wenruo missing++; 70456528b99dSAnand Jain else if (failing_dev && failing_dev == dev) 70466528b99dSAnand Jain missing++; 704721634a19SQu Wenruo } 704821634a19SQu Wenruo if (missing > max_tolerated) { 70496528b99dSAnand Jain if (!failing_dev) 705021634a19SQu Wenruo btrfs_warn(fs_info, 705152042d8eSAndrea Gelmini "chunk %llu missing %d devices, max tolerance is %d for writable mount", 705221634a19SQu Wenruo em->start, missing, max_tolerated); 705321634a19SQu Wenruo free_extent_map(em); 705421634a19SQu Wenruo ret = false; 705521634a19SQu Wenruo goto out; 705621634a19SQu Wenruo } 705721634a19SQu Wenruo next_start = extent_map_end(em); 705821634a19SQu Wenruo free_extent_map(em); 705921634a19SQu Wenruo 7060c8bf1b67SDavid Sterba read_lock(&map_tree->lock); 7061c8bf1b67SDavid Sterba em = lookup_extent_mapping(map_tree, next_start, 706221634a19SQu Wenruo (u64)(-1) - next_start); 7063c8bf1b67SDavid Sterba read_unlock(&map_tree->lock); 706421634a19SQu Wenruo } 706521634a19SQu Wenruo out: 706621634a19SQu Wenruo return ret; 706721634a19SQu Wenruo } 706821634a19SQu Wenruo 7069d85327b1SDavid Sterba static void readahead_tree_node_children(struct extent_buffer *node) 
7070d85327b1SDavid Sterba { 7071d85327b1SDavid Sterba int i; 7072d85327b1SDavid Sterba const int nr_items = btrfs_header_nritems(node); 7073d85327b1SDavid Sterba 7074bfb484d9SJosef Bacik for (i = 0; i < nr_items; i++) 7075bfb484d9SJosef Bacik btrfs_readahead_node_child(node, i); 7076d85327b1SDavid Sterba } 7077d85327b1SDavid Sterba 70785b4aacefSJeff Mahoney int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 70790b86a832SChris Mason { 70805b4aacefSJeff Mahoney struct btrfs_root *root = fs_info->chunk_root; 70810b86a832SChris Mason struct btrfs_path *path; 70820b86a832SChris Mason struct extent_buffer *leaf; 70830b86a832SChris Mason struct btrfs_key key; 70840b86a832SChris Mason struct btrfs_key found_key; 70850b86a832SChris Mason int ret; 70860b86a832SChris Mason int slot; 708799e3ecfcSLiu Bo u64 total_dev = 0; 7088d85327b1SDavid Sterba u64 last_ra_node = 0; 70890b86a832SChris Mason 70900b86a832SChris Mason path = btrfs_alloc_path(); 70910b86a832SChris Mason if (!path) 70920b86a832SChris Mason return -ENOMEM; 70930b86a832SChris Mason 70943dd0f7a3SAnand Jain /* 70953dd0f7a3SAnand Jain * uuid_mutex is needed only if we are mounting a sprout FS 70963dd0f7a3SAnand Jain * otherwise we don't need it. 70973dd0f7a3SAnand Jain */ 7098b367e47fSLi Zefan mutex_lock(&uuid_mutex); 7099b367e47fSLi Zefan 7100395927a9SFilipe David Borba Manana /* 710148cfa61bSBoris Burkov * It is possible for mount and umount to race in such a way that 710248cfa61bSBoris Burkov * we execute this code path, but open_fs_devices failed to clear 710348cfa61bSBoris Burkov * total_rw_bytes. We certainly want it cleared before reading the 710448cfa61bSBoris Burkov * device items, so clear it here. 710548cfa61bSBoris Burkov */ 710648cfa61bSBoris Burkov fs_info->fs_devices->total_rw_bytes = 0; 710748cfa61bSBoris Burkov 710848cfa61bSBoris Burkov /* 7109395927a9SFilipe David Borba Manana * Read all device items, and then all the chunk items. All 7110395927a9SFilipe David Borba Manana * device items are found before any chunk item (their object id 7111395927a9SFilipe David Borba Manana * is smaller than the lowest possible object id for a chunk 7112395927a9SFilipe David Borba Manana * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 
71130b86a832SChris Mason */ 71140b86a832SChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 71150b86a832SChris Mason key.offset = 0; 71160b86a832SChris Mason key.type = 0; 71170b86a832SChris Mason ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7118ab59381eSZhao Lei if (ret < 0) 7119ab59381eSZhao Lei goto error; 71200b86a832SChris Mason while (1) { 7121d85327b1SDavid Sterba struct extent_buffer *node; 7122d85327b1SDavid Sterba 71230b86a832SChris Mason leaf = path->nodes[0]; 71240b86a832SChris Mason slot = path->slots[0]; 71250b86a832SChris Mason if (slot >= btrfs_header_nritems(leaf)) { 71260b86a832SChris Mason ret = btrfs_next_leaf(root, path); 71270b86a832SChris Mason if (ret == 0) 71280b86a832SChris Mason continue; 71290b86a832SChris Mason if (ret < 0) 71300b86a832SChris Mason goto error; 71310b86a832SChris Mason break; 71320b86a832SChris Mason } 7133d85327b1SDavid Sterba /* 7134d85327b1SDavid Sterba * The nodes on level 1 are not locked but we don't need to do 7135d85327b1SDavid Sterba * that during mount time as nothing else can access the tree 7136d85327b1SDavid Sterba */ 7137d85327b1SDavid Sterba node = path->nodes[1]; 7138d85327b1SDavid Sterba if (node) { 7139d85327b1SDavid Sterba if (last_ra_node != node->start) { 7140d85327b1SDavid Sterba readahead_tree_node_children(node); 7141d85327b1SDavid Sterba last_ra_node = node->start; 7142d85327b1SDavid Sterba } 7143d85327b1SDavid Sterba } 71440b86a832SChris Mason btrfs_item_key_to_cpu(leaf, &found_key, slot); 71450b86a832SChris Mason if (found_key.type == BTRFS_DEV_ITEM_KEY) { 71460b86a832SChris Mason struct btrfs_dev_item *dev_item; 71470b86a832SChris Mason dev_item = btrfs_item_ptr(leaf, slot, 71480b86a832SChris Mason struct btrfs_dev_item); 714917850759SDavid Sterba ret = read_one_dev(leaf, dev_item); 71502b82032cSYan Zheng if (ret) 71512b82032cSYan Zheng goto error; 715299e3ecfcSLiu Bo total_dev++; 71530b86a832SChris Mason } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 71540b86a832SChris Mason struct btrfs_chunk *chunk; 71550b86a832SChris Mason chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 715601d01cafSJosef Bacik mutex_lock(&fs_info->chunk_mutex); 71579690ac09SDavid Sterba ret = read_one_chunk(&found_key, leaf, chunk); 715801d01cafSJosef Bacik mutex_unlock(&fs_info->chunk_mutex); 71592b82032cSYan Zheng if (ret) 71602b82032cSYan Zheng goto error; 71610b86a832SChris Mason } 71620b86a832SChris Mason path->slots[0]++; 71630b86a832SChris Mason } 716499e3ecfcSLiu Bo 716599e3ecfcSLiu Bo /* 716699e3ecfcSLiu Bo * After loading chunk tree, we've got all device information, 716799e3ecfcSLiu Bo * do another round of validation checks. 
716899e3ecfcSLiu Bo */ 71690b246afaSJeff Mahoney if (total_dev != fs_info->fs_devices->total_devices) { 71700b246afaSJeff Mahoney btrfs_err(fs_info, 717199e3ecfcSLiu Bo "super_num_devices %llu mismatch with num_devices %llu found here", 71720b246afaSJeff Mahoney btrfs_super_num_devices(fs_info->super_copy), 717399e3ecfcSLiu Bo total_dev); 717499e3ecfcSLiu Bo ret = -EINVAL; 717599e3ecfcSLiu Bo goto error; 717699e3ecfcSLiu Bo } 71770b246afaSJeff Mahoney if (btrfs_super_total_bytes(fs_info->super_copy) < 71780b246afaSJeff Mahoney fs_info->fs_devices->total_rw_bytes) { 71790b246afaSJeff Mahoney btrfs_err(fs_info, 718099e3ecfcSLiu Bo "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", 71810b246afaSJeff Mahoney btrfs_super_total_bytes(fs_info->super_copy), 71820b246afaSJeff Mahoney fs_info->fs_devices->total_rw_bytes); 718399e3ecfcSLiu Bo ret = -EINVAL; 718499e3ecfcSLiu Bo goto error; 718599e3ecfcSLiu Bo } 71860b86a832SChris Mason ret = 0; 71870b86a832SChris Mason error: 7188b367e47fSLi Zefan mutex_unlock(&uuid_mutex); 7189b367e47fSLi Zefan 71902b82032cSYan Zheng btrfs_free_path(path); 71910b86a832SChris Mason return ret; 71920b86a832SChris Mason } 7193442a4f63SStefan Behrens 7194cb517eabSMiao Xie void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 7195cb517eabSMiao Xie { 7196944d3f9fSNikolay Borisov struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 7197cb517eabSMiao Xie struct btrfs_device *device; 7198cb517eabSMiao Xie 7199944d3f9fSNikolay Borisov fs_devices->fs_info = fs_info; 7200944d3f9fSNikolay Borisov 7201cb517eabSMiao Xie mutex_lock(&fs_devices->device_list_mutex); 7202cb517eabSMiao Xie list_for_each_entry(device, &fs_devices->devices, dev_list) 7203fb456252SJeff Mahoney device->fs_info = fs_info; 720429cc83f6SLiu Bo 7205944d3f9fSNikolay Borisov list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 7206944d3f9fSNikolay Borisov list_for_each_entry(device, &seed_devs->devices, dev_list) 7207944d3f9fSNikolay Borisov device->fs_info = fs_info; 7208944d3f9fSNikolay Borisov 7209944d3f9fSNikolay Borisov seed_devs->fs_info = fs_info; 721029cc83f6SLiu Bo } 7211e17125b5SAnand Jain mutex_unlock(&fs_devices->device_list_mutex); 7212cb517eabSMiao Xie } 7213cb517eabSMiao Xie 72141dc990dfSDavid Sterba static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, 72151dc990dfSDavid Sterba const struct btrfs_dev_stats_item *ptr, 72161dc990dfSDavid Sterba int index) 72171dc990dfSDavid Sterba { 72181dc990dfSDavid Sterba u64 val; 72191dc990dfSDavid Sterba 72201dc990dfSDavid Sterba read_extent_buffer(eb, &val, 72211dc990dfSDavid Sterba offsetof(struct btrfs_dev_stats_item, values) + 72221dc990dfSDavid Sterba ((unsigned long)ptr) + (index * sizeof(u64)), 72231dc990dfSDavid Sterba sizeof(val)); 72241dc990dfSDavid Sterba return val; 72251dc990dfSDavid Sterba } 72261dc990dfSDavid Sterba 72271dc990dfSDavid Sterba static void btrfs_set_dev_stats_value(struct extent_buffer *eb, 72281dc990dfSDavid Sterba struct btrfs_dev_stats_item *ptr, 72291dc990dfSDavid Sterba int index, u64 val) 72301dc990dfSDavid Sterba { 72311dc990dfSDavid Sterba write_extent_buffer(eb, &val, 72321dc990dfSDavid Sterba offsetof(struct btrfs_dev_stats_item, values) + 72331dc990dfSDavid Sterba ((unsigned long)ptr) + (index * sizeof(u64)), 72341dc990dfSDavid Sterba sizeof(val)); 72351dc990dfSDavid Sterba } 72361dc990dfSDavid Sterba 723792e26df4SJosef Bacik static int btrfs_device_init_dev_stats(struct btrfs_device *device, 7238124604ebSJosef Bacik struct btrfs_path *path) 
7239733f4fbbSStefan Behrens { 7240733f4fbbSStefan Behrens struct btrfs_dev_stats_item *ptr; 7241124604ebSJosef Bacik struct extent_buffer *eb; 7242124604ebSJosef Bacik struct btrfs_key key; 7243124604ebSJosef Bacik int item_size; 7244124604ebSJosef Bacik int i, ret, slot; 7245733f4fbbSStefan Behrens 7246242e2956SDavid Sterba key.objectid = BTRFS_DEV_STATS_OBJECTID; 7247242e2956SDavid Sterba key.type = BTRFS_PERSISTENT_ITEM_KEY; 7248733f4fbbSStefan Behrens key.offset = device->devid; 7249124604ebSJosef Bacik ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); 7250733f4fbbSStefan Behrens if (ret) { 7251ae4b9b4cSAnand Jain for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7252ae4b9b4cSAnand Jain btrfs_dev_stat_set(device, i, 0); 7253733f4fbbSStefan Behrens device->dev_stats_valid = 1; 7254733f4fbbSStefan Behrens btrfs_release_path(path); 725592e26df4SJosef Bacik return ret < 0 ? ret : 0; 7256733f4fbbSStefan Behrens } 7257733f4fbbSStefan Behrens slot = path->slots[0]; 7258733f4fbbSStefan Behrens eb = path->nodes[0]; 7259733f4fbbSStefan Behrens item_size = btrfs_item_size_nr(eb, slot); 7260733f4fbbSStefan Behrens 7261124604ebSJosef Bacik ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); 7262733f4fbbSStefan Behrens 7263733f4fbbSStefan Behrens for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7264733f4fbbSStefan Behrens if (item_size >= (1 + i) * sizeof(__le64)) 7265733f4fbbSStefan Behrens btrfs_dev_stat_set(device, i, 7266733f4fbbSStefan Behrens btrfs_dev_stats_value(eb, ptr, i)); 7267733f4fbbSStefan Behrens else 72684e411a7dSAnand Jain btrfs_dev_stat_set(device, i, 0); 7269733f4fbbSStefan Behrens } 7270733f4fbbSStefan Behrens 7271733f4fbbSStefan Behrens device->dev_stats_valid = 1; 7272733f4fbbSStefan Behrens btrfs_dev_stat_print_on_load(device); 7273733f4fbbSStefan Behrens btrfs_release_path(path); 727492e26df4SJosef Bacik 727592e26df4SJosef Bacik return 0; 7276733f4fbbSStefan Behrens } 7277124604ebSJosef Bacik 7278124604ebSJosef Bacik int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 7279124604ebSJosef Bacik { 7280124604ebSJosef Bacik struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 7281124604ebSJosef Bacik struct btrfs_device *device; 7282124604ebSJosef Bacik struct btrfs_path *path = NULL; 728392e26df4SJosef Bacik int ret = 0; 7284124604ebSJosef Bacik 7285124604ebSJosef Bacik path = btrfs_alloc_path(); 7286124604ebSJosef Bacik if (!path) 7287124604ebSJosef Bacik return -ENOMEM; 7288124604ebSJosef Bacik 7289124604ebSJosef Bacik mutex_lock(&fs_devices->device_list_mutex); 729092e26df4SJosef Bacik list_for_each_entry(device, &fs_devices->devices, dev_list) { 729192e26df4SJosef Bacik ret = btrfs_device_init_dev_stats(device, path); 729292e26df4SJosef Bacik if (ret) 729392e26df4SJosef Bacik goto out; 7294124604ebSJosef Bacik } 729592e26df4SJosef Bacik list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 729692e26df4SJosef Bacik list_for_each_entry(device, &seed_devs->devices, dev_list) { 729792e26df4SJosef Bacik ret = btrfs_device_init_dev_stats(device, path); 729892e26df4SJosef Bacik if (ret) 729992e26df4SJosef Bacik goto out; 730092e26df4SJosef Bacik } 730192e26df4SJosef Bacik } 730292e26df4SJosef Bacik out: 7303733f4fbbSStefan Behrens mutex_unlock(&fs_devices->device_list_mutex); 7304733f4fbbSStefan Behrens 7305733f4fbbSStefan Behrens btrfs_free_path(path); 730692e26df4SJosef Bacik return ret; 7307733f4fbbSStefan Behrens } 7308733f4fbbSStefan Behrens 7309733f4fbbSStefan Behrens static int update_dev_stat_item(struct 
btrfs_trans_handle *trans, 7310733f4fbbSStefan Behrens struct btrfs_device *device) 7311733f4fbbSStefan Behrens { 73125495f195SNikolay Borisov struct btrfs_fs_info *fs_info = trans->fs_info; 73136bccf3abSJeff Mahoney struct btrfs_root *dev_root = fs_info->dev_root; 7314733f4fbbSStefan Behrens struct btrfs_path *path; 7315733f4fbbSStefan Behrens struct btrfs_key key; 7316733f4fbbSStefan Behrens struct extent_buffer *eb; 7317733f4fbbSStefan Behrens struct btrfs_dev_stats_item *ptr; 7318733f4fbbSStefan Behrens int ret; 7319733f4fbbSStefan Behrens int i; 7320733f4fbbSStefan Behrens 7321242e2956SDavid Sterba key.objectid = BTRFS_DEV_STATS_OBJECTID; 7322242e2956SDavid Sterba key.type = BTRFS_PERSISTENT_ITEM_KEY; 7323733f4fbbSStefan Behrens key.offset = device->devid; 7324733f4fbbSStefan Behrens 7325733f4fbbSStefan Behrens path = btrfs_alloc_path(); 7326fa252992SDavid Sterba if (!path) 7327fa252992SDavid Sterba return -ENOMEM; 7328733f4fbbSStefan Behrens ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 7329733f4fbbSStefan Behrens if (ret < 0) { 73300b246afaSJeff Mahoney btrfs_warn_in_rcu(fs_info, 7331ecaeb14bSDavid Sterba "error %d while searching for dev_stats item for device %s", 7332606686eeSJosef Bacik ret, rcu_str_deref(device->name)); 7333733f4fbbSStefan Behrens goto out; 7334733f4fbbSStefan Behrens } 7335733f4fbbSStefan Behrens 7336733f4fbbSStefan Behrens if (ret == 0 && 7337733f4fbbSStefan Behrens btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 7338733f4fbbSStefan Behrens /* need to delete old one and insert a new one */ 7339733f4fbbSStefan Behrens ret = btrfs_del_item(trans, dev_root, path); 7340733f4fbbSStefan Behrens if (ret != 0) { 73410b246afaSJeff Mahoney btrfs_warn_in_rcu(fs_info, 7342ecaeb14bSDavid Sterba "delete too small dev_stats item for device %s failed %d", 7343606686eeSJosef Bacik rcu_str_deref(device->name), ret); 7344733f4fbbSStefan Behrens goto out; 7345733f4fbbSStefan Behrens } 7346733f4fbbSStefan Behrens ret = 1; 7347733f4fbbSStefan Behrens } 7348733f4fbbSStefan Behrens 7349733f4fbbSStefan Behrens if (ret == 1) { 7350733f4fbbSStefan Behrens /* need to insert a new item */ 7351733f4fbbSStefan Behrens btrfs_release_path(path); 7352733f4fbbSStefan Behrens ret = btrfs_insert_empty_item(trans, dev_root, path, 7353733f4fbbSStefan Behrens &key, sizeof(*ptr)); 7354733f4fbbSStefan Behrens if (ret < 0) { 73550b246afaSJeff Mahoney btrfs_warn_in_rcu(fs_info, 7356ecaeb14bSDavid Sterba "insert dev_stats item for device %s failed %d", 7357606686eeSJosef Bacik rcu_str_deref(device->name), ret); 7358733f4fbbSStefan Behrens goto out; 7359733f4fbbSStefan Behrens } 7360733f4fbbSStefan Behrens } 7361733f4fbbSStefan Behrens 7362733f4fbbSStefan Behrens eb = path->nodes[0]; 7363733f4fbbSStefan Behrens ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 7364733f4fbbSStefan Behrens for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7365733f4fbbSStefan Behrens btrfs_set_dev_stats_value(eb, ptr, i, 7366733f4fbbSStefan Behrens btrfs_dev_stat_read(device, i)); 7367733f4fbbSStefan Behrens btrfs_mark_buffer_dirty(eb); 7368733f4fbbSStefan Behrens 7369733f4fbbSStefan Behrens out: 7370733f4fbbSStefan Behrens btrfs_free_path(path); 7371733f4fbbSStefan Behrens return ret; 7372733f4fbbSStefan Behrens } 7373733f4fbbSStefan Behrens 7374733f4fbbSStefan Behrens /* 7375733f4fbbSStefan Behrens * called from commit_transaction. Writes all changed device stats to disk. 
7376733f4fbbSStefan Behrens */ 7377196c9d8dSDavid Sterba int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) 7378733f4fbbSStefan Behrens { 7379196c9d8dSDavid Sterba struct btrfs_fs_info *fs_info = trans->fs_info; 7380733f4fbbSStefan Behrens struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7381733f4fbbSStefan Behrens struct btrfs_device *device; 7382addc3fa7SMiao Xie int stats_cnt; 7383733f4fbbSStefan Behrens int ret = 0; 7384733f4fbbSStefan Behrens 7385733f4fbbSStefan Behrens mutex_lock(&fs_devices->device_list_mutex); 7386733f4fbbSStefan Behrens list_for_each_entry(device, &fs_devices->devices, dev_list) { 73879deae968SNikolay Borisov stats_cnt = atomic_read(&device->dev_stats_ccnt); 73889deae968SNikolay Borisov if (!device->dev_stats_valid || stats_cnt == 0) 7389733f4fbbSStefan Behrens continue; 7390733f4fbbSStefan Behrens 73919deae968SNikolay Borisov 73929deae968SNikolay Borisov /* 73939deae968SNikolay Borisov * There is a LOAD-LOAD control dependency between the value of 73949deae968SNikolay Borisov * dev_stats_ccnt and updating the on-disk values which requires 73959deae968SNikolay Borisov * reading the in-memory counters. Such control dependencies 73969deae968SNikolay Borisov * require explicit read memory barriers. 73979deae968SNikolay Borisov * 73989deae968SNikolay Borisov * This memory barrier pairs with smp_mb__before_atomic in 73999deae968SNikolay Borisov * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full 74009deae968SNikolay Borisov * barrier implied by atomic_xchg in 74019deae968SNikolay Borisov * btrfs_dev_stats_read_and_reset 74029deae968SNikolay Borisov */ 74039deae968SNikolay Borisov smp_rmb(); 74049deae968SNikolay Borisov 74055495f195SNikolay Borisov ret = update_dev_stat_item(trans, device); 7406733f4fbbSStefan Behrens if (!ret) 7407addc3fa7SMiao Xie atomic_sub(stats_cnt, &device->dev_stats_ccnt); 7408733f4fbbSStefan Behrens } 7409733f4fbbSStefan Behrens mutex_unlock(&fs_devices->device_list_mutex); 7410733f4fbbSStefan Behrens 7411733f4fbbSStefan Behrens return ret; 7412733f4fbbSStefan Behrens } 7413733f4fbbSStefan Behrens 7414442a4f63SStefan Behrens void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) 7415442a4f63SStefan Behrens { 7416442a4f63SStefan Behrens btrfs_dev_stat_inc(dev, index); 7417442a4f63SStefan Behrens btrfs_dev_stat_print_on_error(dev); 7418442a4f63SStefan Behrens } 7419442a4f63SStefan Behrens 742048a3b636SEric Sandeen static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) 7421442a4f63SStefan Behrens { 7422733f4fbbSStefan Behrens if (!dev->dev_stats_valid) 7423733f4fbbSStefan Behrens return; 7424fb456252SJeff Mahoney btrfs_err_rl_in_rcu(dev->fs_info, 7425b14af3b4SDavid Sterba "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7426606686eeSJosef Bacik rcu_str_deref(dev->name), 7427442a4f63SStefan Behrens btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7428442a4f63SStefan Behrens btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7429442a4f63SStefan Behrens btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7430efe120a0SFrank Holton btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7431efe120a0SFrank Holton btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7432442a4f63SStefan Behrens } 7433c11d2c23SStefan Behrens 7434733f4fbbSStefan Behrens static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 7435733f4fbbSStefan Behrens { 7436a98cdb85SStefan Behrens int i; 7437a98cdb85SStefan Behrens 7438a98cdb85SStefan Behrens for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX;
i++) 7439a98cdb85SStefan Behrens if (btrfs_dev_stat_read(dev, i) != 0) 7440a98cdb85SStefan Behrens break; 7441a98cdb85SStefan Behrens if (i == BTRFS_DEV_STAT_VALUES_MAX) 7442a98cdb85SStefan Behrens return; /* all values == 0, suppress message */ 7443a98cdb85SStefan Behrens 7444fb456252SJeff Mahoney btrfs_info_in_rcu(dev->fs_info, 7445ecaeb14bSDavid Sterba "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7446606686eeSJosef Bacik rcu_str_deref(dev->name), 7447733f4fbbSStefan Behrens btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7448733f4fbbSStefan Behrens btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7449733f4fbbSStefan Behrens btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7450733f4fbbSStefan Behrens btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7451733f4fbbSStefan Behrens btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7452733f4fbbSStefan Behrens } 7453733f4fbbSStefan Behrens 74542ff7e61eSJeff Mahoney int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, 7455b27f7c0cSDavid Sterba struct btrfs_ioctl_get_dev_stats *stats) 7456c11d2c23SStefan Behrens { 7457c11d2c23SStefan Behrens struct btrfs_device *dev; 74580b246afaSJeff Mahoney struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7459c11d2c23SStefan Behrens int i; 7460c11d2c23SStefan Behrens 7461c11d2c23SStefan Behrens mutex_lock(&fs_devices->device_list_mutex); 746209ba3bc9SAnand Jain dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL, 746309ba3bc9SAnand Jain true); 7464c11d2c23SStefan Behrens mutex_unlock(&fs_devices->device_list_mutex); 7465c11d2c23SStefan Behrens 7466c11d2c23SStefan Behrens if (!dev) { 74670b246afaSJeff Mahoney btrfs_warn(fs_info, "get dev_stats failed, device not found"); 7468c11d2c23SStefan Behrens return -ENODEV; 7469733f4fbbSStefan Behrens } else if (!dev->dev_stats_valid) { 74700b246afaSJeff Mahoney btrfs_warn(fs_info, "get dev_stats failed, not yet valid"); 7471733f4fbbSStefan Behrens return -ENODEV; 7472b27f7c0cSDavid Sterba } else if (stats->flags & BTRFS_DEV_STATS_RESET) { 7473c11d2c23SStefan Behrens for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7474c11d2c23SStefan Behrens if (stats->nr_items > i) 7475c11d2c23SStefan Behrens stats->values[i] = 7476c11d2c23SStefan Behrens btrfs_dev_stat_read_and_reset(dev, i); 7477c11d2c23SStefan Behrens else 74784e411a7dSAnand Jain btrfs_dev_stat_set(dev, i, 0); 7479c11d2c23SStefan Behrens } 7480a69976bcSAnand Jain btrfs_info(fs_info, "device stats zeroed by %s (%d)", 7481a69976bcSAnand Jain current->comm, task_pid_nr(current)); 7482c11d2c23SStefan Behrens } else { 7483c11d2c23SStefan Behrens for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7484c11d2c23SStefan Behrens if (stats->nr_items > i) 7485c11d2c23SStefan Behrens stats->values[i] = btrfs_dev_stat_read(dev, i); 7486c11d2c23SStefan Behrens } 7487c11d2c23SStefan Behrens if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) 7488c11d2c23SStefan Behrens stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 7489c11d2c23SStefan Behrens return 0; 7490c11d2c23SStefan Behrens } 7491a8a6dab7SStefan Behrens 7492935e5cc9SMiao Xie /* 7493bbbf7243SNikolay Borisov * Update the size and bytes used for each device where it changed. This is 7494bbbf7243SNikolay Borisov * delayed since we would otherwise get errors while writing out the 7495935e5cc9SMiao Xie * superblocks. 7496bbbf7243SNikolay Borisov * 7497bbbf7243SNikolay Borisov * Must be invoked during transaction commit. 
7498935e5cc9SMiao Xie */ 7499bbbf7243SNikolay Borisov void btrfs_commit_device_sizes(struct btrfs_transaction *trans) 7500935e5cc9SMiao Xie { 7501935e5cc9SMiao Xie struct btrfs_device *curr, *next; 7502935e5cc9SMiao Xie 7503bbbf7243SNikolay Borisov ASSERT(trans->state == TRANS_STATE_COMMIT_DOING); 7504bbbf7243SNikolay Borisov 7505bbbf7243SNikolay Borisov if (list_empty(&trans->dev_update_list)) 7506935e5cc9SMiao Xie return; 7507935e5cc9SMiao Xie 7508bbbf7243SNikolay Borisov /* 7509bbbf7243SNikolay Borisov * We don't need the device_list_mutex here. This list is owned by the 7510bbbf7243SNikolay Borisov * transaction and the transaction must complete before the device is 7511bbbf7243SNikolay Borisov * released. 7512bbbf7243SNikolay Borisov */ 7513bbbf7243SNikolay Borisov mutex_lock(&trans->fs_info->chunk_mutex); 7514bbbf7243SNikolay Borisov list_for_each_entry_safe(curr, next, &trans->dev_update_list, 7515bbbf7243SNikolay Borisov post_commit_list) { 7516bbbf7243SNikolay Borisov list_del_init(&curr->post_commit_list); 7517935e5cc9SMiao Xie curr->commit_total_bytes = curr->disk_total_bytes; 7518bbbf7243SNikolay Borisov curr->commit_bytes_used = curr->bytes_used; 7519935e5cc9SMiao Xie } 7520bbbf7243SNikolay Borisov mutex_unlock(&trans->fs_info->chunk_mutex); 7521ce7213c7SMiao Xie } 75225a13f430SAnand Jain 752346df06b8SDavid Sterba /* 752446df06b8SDavid Sterba * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. 752546df06b8SDavid Sterba */ 752646df06b8SDavid Sterba int btrfs_bg_type_to_factor(u64 flags) 752746df06b8SDavid Sterba { 752844b28adaSDavid Sterba const int index = btrfs_bg_flags_to_raid_index(flags); 752944b28adaSDavid Sterba 753044b28adaSDavid Sterba return btrfs_raid_array[index].ncopies; 753146df06b8SDavid Sterba } 7532cf90d884SQu Wenruo 7533cf90d884SQu Wenruo 7534cf90d884SQu Wenruo 7535cf90d884SQu Wenruo static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, 7536cf90d884SQu Wenruo u64 chunk_offset, u64 devid, 7537cf90d884SQu Wenruo u64 physical_offset, u64 physical_len) 7538cf90d884SQu Wenruo { 7539c8bf1b67SDavid Sterba struct extent_map_tree *em_tree = &fs_info->mapping_tree; 7540cf90d884SQu Wenruo struct extent_map *em; 7541cf90d884SQu Wenruo struct map_lookup *map; 754205a37c48SQu Wenruo struct btrfs_device *dev; 7543cf90d884SQu Wenruo u64 stripe_len; 7544cf90d884SQu Wenruo bool found = false; 7545cf90d884SQu Wenruo int ret = 0; 7546cf90d884SQu Wenruo int i; 7547cf90d884SQu Wenruo 7548cf90d884SQu Wenruo read_lock(&em_tree->lock); 7549cf90d884SQu Wenruo em = lookup_extent_mapping(em_tree, chunk_offset, 1); 7550cf90d884SQu Wenruo read_unlock(&em_tree->lock); 7551cf90d884SQu Wenruo 7552cf90d884SQu Wenruo if (!em) { 7553cf90d884SQu Wenruo btrfs_err(fs_info, 7554cf90d884SQu Wenruo "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", 7555cf90d884SQu Wenruo physical_offset, devid); 7556cf90d884SQu Wenruo ret = -EUCLEAN; 7557cf90d884SQu Wenruo goto out; 7558cf90d884SQu Wenruo } 7559cf90d884SQu Wenruo 7560cf90d884SQu Wenruo map = em->map_lookup; 7561cf90d884SQu Wenruo stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes); 7562cf90d884SQu Wenruo if (physical_len != stripe_len) { 7563cf90d884SQu Wenruo btrfs_err(fs_info, 7564cf90d884SQu Wenruo "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", 7565cf90d884SQu Wenruo physical_offset, devid, em->start, physical_len, 7566cf90d884SQu Wenruo stripe_len); 7567cf90d884SQu Wenruo ret = -EUCLEAN; 7568cf90d884SQu Wenruo goto 
out; 7569cf90d884SQu Wenruo } 7570cf90d884SQu Wenruo 7571cf90d884SQu Wenruo for (i = 0; i < map->num_stripes; i++) { 7572cf90d884SQu Wenruo if (map->stripes[i].dev->devid == devid && 7573cf90d884SQu Wenruo map->stripes[i].physical == physical_offset) { 7574cf90d884SQu Wenruo found = true; 7575cf90d884SQu Wenruo if (map->verified_stripes >= map->num_stripes) { 7576cf90d884SQu Wenruo btrfs_err(fs_info, 7577cf90d884SQu Wenruo "too many dev extents for chunk %llu found", 7578cf90d884SQu Wenruo em->start); 7579cf90d884SQu Wenruo ret = -EUCLEAN; 7580cf90d884SQu Wenruo goto out; 7581cf90d884SQu Wenruo } 7582cf90d884SQu Wenruo map->verified_stripes++; 7583cf90d884SQu Wenruo break; 7584cf90d884SQu Wenruo } 7585cf90d884SQu Wenruo } 7586cf90d884SQu Wenruo if (!found) { 7587cf90d884SQu Wenruo btrfs_err(fs_info, 7588cf90d884SQu Wenruo "dev extent physical offset %llu devid %llu has no corresponding chunk", 7589cf90d884SQu Wenruo physical_offset, devid); 7590cf90d884SQu Wenruo ret = -EUCLEAN; 7591cf90d884SQu Wenruo } 759205a37c48SQu Wenruo 759305a37c48SQu Wenruo /* Make sure no dev extent is beyond device boundary */ 759409ba3bc9SAnand Jain dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); 759505a37c48SQu Wenruo if (!dev) { 759605a37c48SQu Wenruo btrfs_err(fs_info, "failed to find devid %llu", devid); 759705a37c48SQu Wenruo ret = -EUCLEAN; 759805a37c48SQu Wenruo goto out; 759905a37c48SQu Wenruo } 76001b3922a8SQu Wenruo 76011b3922a8SQu Wenruo /* It's possible this device is a dummy for a seed device */ 76021b3922a8SQu Wenruo if (dev->disk_total_bytes == 0) { 7603944d3f9fSNikolay Borisov struct btrfs_fs_devices *devs; 7604944d3f9fSNikolay Borisov 7605944d3f9fSNikolay Borisov devs = list_first_entry(&fs_info->fs_devices->seed_list, 7606944d3f9fSNikolay Borisov struct btrfs_fs_devices, seed_list); 7607944d3f9fSNikolay Borisov dev = btrfs_find_device(devs, devid, NULL, NULL, false); 76081b3922a8SQu Wenruo if (!dev) { 76091b3922a8SQu Wenruo btrfs_err(fs_info, "failed to find seed devid %llu", 76101b3922a8SQu Wenruo devid); 76111b3922a8SQu Wenruo ret = -EUCLEAN; 76121b3922a8SQu Wenruo goto out; 76131b3922a8SQu Wenruo } 76141b3922a8SQu Wenruo } 76151b3922a8SQu Wenruo 761605a37c48SQu Wenruo if (physical_offset + physical_len > dev->disk_total_bytes) { 761705a37c48SQu Wenruo btrfs_err(fs_info, 761805a37c48SQu Wenruo "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", 761905a37c48SQu Wenruo devid, physical_offset, physical_len, 762005a37c48SQu Wenruo dev->disk_total_bytes); 762105a37c48SQu Wenruo ret = -EUCLEAN; 762205a37c48SQu Wenruo goto out; 762305a37c48SQu Wenruo } 7624cf90d884SQu Wenruo out: 7625cf90d884SQu Wenruo free_extent_map(em); 7626cf90d884SQu Wenruo return ret; 7627cf90d884SQu Wenruo } 7628cf90d884SQu Wenruo 7629cf90d884SQu Wenruo static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) 7630cf90d884SQu Wenruo { 7631c8bf1b67SDavid Sterba struct extent_map_tree *em_tree = &fs_info->mapping_tree; 7632cf90d884SQu Wenruo struct extent_map *em; 7633cf90d884SQu Wenruo struct rb_node *node; 7634cf90d884SQu Wenruo int ret = 0; 7635cf90d884SQu Wenruo 7636cf90d884SQu Wenruo read_lock(&em_tree->lock); 763707e1ce09SLiu Bo for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { 7638cf90d884SQu Wenruo em = rb_entry(node, struct extent_map, rb_node); 7639cf90d884SQu Wenruo if (em->map_lookup->num_stripes != 7640cf90d884SQu Wenruo em->map_lookup->verified_stripes) { 7641cf90d884SQu Wenruo btrfs_err(fs_info, 7642cf90d884SQu Wenruo
"chunk %llu has missing dev extent, have %d expect %d", 7643cf90d884SQu Wenruo em->start, em->map_lookup->verified_stripes, 7644cf90d884SQu Wenruo em->map_lookup->num_stripes); 7645cf90d884SQu Wenruo ret = -EUCLEAN; 7646cf90d884SQu Wenruo goto out; 7647cf90d884SQu Wenruo } 7648cf90d884SQu Wenruo } 7649cf90d884SQu Wenruo out: 7650cf90d884SQu Wenruo read_unlock(&em_tree->lock); 7651cf90d884SQu Wenruo return ret; 7652cf90d884SQu Wenruo } 7653cf90d884SQu Wenruo 7654cf90d884SQu Wenruo /* 7655cf90d884SQu Wenruo * Ensure that all dev extents are mapped to correct chunk, otherwise 7656cf90d884SQu Wenruo * later chunk allocation/free would cause unexpected behavior. 7657cf90d884SQu Wenruo * 7658cf90d884SQu Wenruo * NOTE: This will iterate through the whole device tree, which should be of 7659cf90d884SQu Wenruo * the same size level as the chunk tree. This slightly increases mount time. 7660cf90d884SQu Wenruo */ 7661cf90d884SQu Wenruo int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) 7662cf90d884SQu Wenruo { 7663cf90d884SQu Wenruo struct btrfs_path *path; 7664cf90d884SQu Wenruo struct btrfs_root *root = fs_info->dev_root; 7665cf90d884SQu Wenruo struct btrfs_key key; 76665eb19381SQu Wenruo u64 prev_devid = 0; 76675eb19381SQu Wenruo u64 prev_dev_ext_end = 0; 7668cf90d884SQu Wenruo int ret = 0; 7669cf90d884SQu Wenruo 767042437a63SJosef Bacik /* 767142437a63SJosef Bacik * We don't have a dev_root because we mounted with ignorebadroots and 767242437a63SJosef Bacik * failed to load the root, so we want to skip the verification in this 767342437a63SJosef Bacik * case for sure. 767442437a63SJosef Bacik * 767542437a63SJosef Bacik * However if the dev root is fine, but the tree itself is corrupted 767642437a63SJosef Bacik * we'd still fail to mount. This verification is only to make sure 767742437a63SJosef Bacik * writes can happen safely, so instead just bypass this check 767842437a63SJosef Bacik * completely in the case of IGNOREBADROOTS. 767942437a63SJosef Bacik */ 768042437a63SJosef Bacik if (btrfs_test_opt(fs_info, IGNOREBADROOTS)) 768142437a63SJosef Bacik return 0; 768242437a63SJosef Bacik 7683cf90d884SQu Wenruo key.objectid = 1; 7684cf90d884SQu Wenruo key.type = BTRFS_DEV_EXTENT_KEY; 7685cf90d884SQu Wenruo key.offset = 0; 7686cf90d884SQu Wenruo 7687cf90d884SQu Wenruo path = btrfs_alloc_path(); 7688cf90d884SQu Wenruo if (!path) 7689cf90d884SQu Wenruo return -ENOMEM; 7690cf90d884SQu Wenruo 7691cf90d884SQu Wenruo path->reada = READA_FORWARD; 7692cf90d884SQu Wenruo ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7693cf90d884SQu Wenruo if (ret < 0) 7694cf90d884SQu Wenruo goto out; 7695cf90d884SQu Wenruo 7696cf90d884SQu Wenruo if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 7697cf90d884SQu Wenruo ret = btrfs_next_item(root, path); 7698cf90d884SQu Wenruo if (ret < 0) 7699cf90d884SQu Wenruo goto out; 7700cf90d884SQu Wenruo /* No dev extents at all? 
Not good */ 7701cf90d884SQu Wenruo if (ret > 0) { 7702cf90d884SQu Wenruo ret = -EUCLEAN; 7703cf90d884SQu Wenruo goto out; 7704cf90d884SQu Wenruo } 7705cf90d884SQu Wenruo } 7706cf90d884SQu Wenruo while (1) { 7707cf90d884SQu Wenruo struct extent_buffer *leaf = path->nodes[0]; 7708cf90d884SQu Wenruo struct btrfs_dev_extent *dext; 7709cf90d884SQu Wenruo int slot = path->slots[0]; 7710cf90d884SQu Wenruo u64 chunk_offset; 7711cf90d884SQu Wenruo u64 physical_offset; 7712cf90d884SQu Wenruo u64 physical_len; 7713cf90d884SQu Wenruo u64 devid; 7714cf90d884SQu Wenruo 7715cf90d884SQu Wenruo btrfs_item_key_to_cpu(leaf, &key, slot); 7716cf90d884SQu Wenruo if (key.type != BTRFS_DEV_EXTENT_KEY) 7717cf90d884SQu Wenruo break; 7718cf90d884SQu Wenruo devid = key.objectid; 7719cf90d884SQu Wenruo physical_offset = key.offset; 7720cf90d884SQu Wenruo 7721cf90d884SQu Wenruo dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); 7722cf90d884SQu Wenruo chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); 7723cf90d884SQu Wenruo physical_len = btrfs_dev_extent_length(leaf, dext); 7724cf90d884SQu Wenruo 77255eb19381SQu Wenruo /* Check if this dev extent overlaps with the previous one */ 77265eb19381SQu Wenruo if (devid == prev_devid && physical_offset < prev_dev_ext_end) { 77275eb19381SQu Wenruo btrfs_err(fs_info, 77285eb19381SQu Wenruo "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", 77295eb19381SQu Wenruo devid, physical_offset, prev_dev_ext_end); 77305eb19381SQu Wenruo ret = -EUCLEAN; 77315eb19381SQu Wenruo goto out; 77325eb19381SQu Wenruo } 77335eb19381SQu Wenruo 7734cf90d884SQu Wenruo ret = verify_one_dev_extent(fs_info, chunk_offset, devid, 7735cf90d884SQu Wenruo physical_offset, physical_len); 7736cf90d884SQu Wenruo if (ret < 0) 7737cf90d884SQu Wenruo goto out; 77385eb19381SQu Wenruo prev_devid = devid; 77395eb19381SQu Wenruo prev_dev_ext_end = physical_offset + physical_len; 77405eb19381SQu Wenruo 7741cf90d884SQu Wenruo ret = btrfs_next_item(root, path); 7742cf90d884SQu Wenruo if (ret < 0) 7743cf90d884SQu Wenruo goto out; 7744cf90d884SQu Wenruo if (ret > 0) { 7745cf90d884SQu Wenruo ret = 0; 7746cf90d884SQu Wenruo break; 7747cf90d884SQu Wenruo } 7748cf90d884SQu Wenruo } 7749cf90d884SQu Wenruo 7750cf90d884SQu Wenruo /* Ensure all chunks have corresponding dev extents */ 7751cf90d884SQu Wenruo ret = verify_chunk_dev_extent_mapping(fs_info); 7752cf90d884SQu Wenruo out: 7753cf90d884SQu Wenruo btrfs_free_path(path); 7754cf90d884SQu Wenruo return ret; 7755cf90d884SQu Wenruo } 7756eede2bf3SOmar Sandoval 7757eede2bf3SOmar Sandoval /* 7758eede2bf3SOmar Sandoval * Check whether the given block group or device is pinned by any inode being 7759eede2bf3SOmar Sandoval * used as a swapfile. 
7760eede2bf3SOmar Sandoval */ 7761eede2bf3SOmar Sandoval bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) 7762eede2bf3SOmar Sandoval { 7763eede2bf3SOmar Sandoval struct btrfs_swapfile_pin *sp; 7764eede2bf3SOmar Sandoval struct rb_node *node; 7765eede2bf3SOmar Sandoval 7766eede2bf3SOmar Sandoval spin_lock(&fs_info->swapfile_pins_lock); 7767eede2bf3SOmar Sandoval node = fs_info->swapfile_pins.rb_node; 7768eede2bf3SOmar Sandoval while (node) { 7769eede2bf3SOmar Sandoval sp = rb_entry(node, struct btrfs_swapfile_pin, node); 7770eede2bf3SOmar Sandoval if (ptr < sp->ptr) 7771eede2bf3SOmar Sandoval node = node->rb_left; 7772eede2bf3SOmar Sandoval else if (ptr > sp->ptr) 7773eede2bf3SOmar Sandoval node = node->rb_right; 7774eede2bf3SOmar Sandoval else 7775eede2bf3SOmar Sandoval break; 7776eede2bf3SOmar Sandoval } 7777eede2bf3SOmar Sandoval spin_unlock(&fs_info->swapfile_pins_lock); 7778eede2bf3SOmar Sandoval return node != NULL; 7779eede2bf3SOmar Sandoval } 7780
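The per-device error counters that btrfs_dev_stat_inc_and_print() maintains, btrfs_run_dev_stats() persists and btrfs_get_dev_stats() reports are reachable from user space through the BTRFS_IOC_GET_DEV_STATS ioctl. The sketch below is illustrative only and not part of volumes.c: it relies on the uapi definitions from <linux/btrfs.h> (struct btrfs_ioctl_get_dev_stats, BTRFS_IOC_GET_DEV_STATS, the BTRFS_DEV_STAT_* indices), and the mount point "/mnt" and devid 1 are placeholder example values.

/*
 * Minimal user-space sketch: query the device statistics for one device of
 * a mounted btrfs filesystem. "/mnt" and devid 1 are examples only; any fd
 * that lives inside the filesystem can be used for the ioctl.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(void)
{
	struct btrfs_ioctl_get_dev_stats stats;
	int fd = open("/mnt", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&stats, 0, sizeof(stats));
	stats.devid = 1;				/* example device id */
	stats.nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	stats.flags = 0;				/* BTRFS_DEV_STATS_RESET would zero the counters */

	if (ioctl(fd, BTRFS_IOC_GET_DEV_STATS, &stats) < 0) {
		perror("BTRFS_IOC_GET_DEV_STATS");
		close(fd);
		return 1;
	}

	/* Same order as the "wr, rd, flush, corrupt, gen" messages above */
	printf("wr %llu rd %llu flush %llu corrupt %llu gen %llu\n",
	       (unsigned long long)stats.values[BTRFS_DEV_STAT_WRITE_ERRS],
	       (unsigned long long)stats.values[BTRFS_DEV_STAT_READ_ERRS],
	       (unsigned long long)stats.values[BTRFS_DEV_STAT_FLUSH_ERRS],
	       (unsigned long long)stats.values[BTRFS_DEV_STAT_CORRUPTION_ERRS],
	       (unsigned long long)stats.values[BTRFS_DEV_STAT_GENERATION_ERRS]);

	close(fd);
	return 0;
}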