// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
#include "zoned.h"

/*
 * Attributes of each supported RAID profile (stripe layout, redundancy,
 * device count constraints, user-visible name and error codes), indexed
 * by enum btrfs_raid_types.
 */
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "raid10",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "raid1",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 3,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 3,
		.ncopies	= 3,
		.nparity        = 0,
		.raid_name	= "raid1c3",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 4,
		.devs_min	= 4,
		.tolerated_failures = 3,
		.devs_increment	= 4,
		.ncopies	= 4,
		.nparity        = 0,
		.raid_name	= "raid1c4",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,	/* two copies on the same device */
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "dup",
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 0,
		.raid_name	= "raid0",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 0,
		.raid_name	= "single",
		.bg_flag	= 0,	/* single has no dedicated bg flag bit */
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 1,
		.raid_name	= "raid5",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 2,
		.raid_name	= "raid6",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};

/*
 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
 * can be used as index to access btrfs_raid_array[].
159500a44c9SDavid Sterba */ 160500a44c9SDavid Sterba enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) 161500a44c9SDavid Sterba { 162500a44c9SDavid Sterba if (flags & BTRFS_BLOCK_GROUP_RAID10) 163500a44c9SDavid Sterba return BTRFS_RAID_RAID10; 164500a44c9SDavid Sterba else if (flags & BTRFS_BLOCK_GROUP_RAID1) 165500a44c9SDavid Sterba return BTRFS_RAID_RAID1; 166500a44c9SDavid Sterba else if (flags & BTRFS_BLOCK_GROUP_RAID1C3) 167500a44c9SDavid Sterba return BTRFS_RAID_RAID1C3; 168500a44c9SDavid Sterba else if (flags & BTRFS_BLOCK_GROUP_RAID1C4) 169500a44c9SDavid Sterba return BTRFS_RAID_RAID1C4; 170500a44c9SDavid Sterba else if (flags & BTRFS_BLOCK_GROUP_DUP) 171500a44c9SDavid Sterba return BTRFS_RAID_DUP; 172500a44c9SDavid Sterba else if (flags & BTRFS_BLOCK_GROUP_RAID0) 173500a44c9SDavid Sterba return BTRFS_RAID_RAID0; 174500a44c9SDavid Sterba else if (flags & BTRFS_BLOCK_GROUP_RAID5) 175500a44c9SDavid Sterba return BTRFS_RAID_RAID5; 176500a44c9SDavid Sterba else if (flags & BTRFS_BLOCK_GROUP_RAID6) 177500a44c9SDavid Sterba return BTRFS_RAID_RAID6; 178500a44c9SDavid Sterba 179500a44c9SDavid Sterba return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 180500a44c9SDavid Sterba } 181500a44c9SDavid Sterba 182158da513SDavid Sterba const char *btrfs_bg_type_to_raid_name(u64 flags) 183ed23467bSAnand Jain { 184158da513SDavid Sterba const int index = btrfs_bg_flags_to_raid_index(flags); 185158da513SDavid Sterba 186158da513SDavid Sterba if (index >= BTRFS_NR_RAID_TYPES) 187ed23467bSAnand Jain return NULL; 188ed23467bSAnand Jain 189158da513SDavid Sterba return btrfs_raid_array[index].raid_name; 190ed23467bSAnand Jain } 191ed23467bSAnand Jain 192f89e09cfSAnand Jain /* 193f89e09cfSAnand Jain * Fill @buf with textual description of @bg_flags, no more than @size_buf 194f89e09cfSAnand Jain * bytes including terminating null byte. 
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
	int i;
	int ret;
	char *bp = buf;		/* current write position */
	u64 flags = bg_flags;	/* bits not yet described */
	u32 size_bp = size_buf;	/* space remaining at bp */

	if (!flags) {
		strcpy(bp, "NONE");
		return;
	}

/*
 * Append "<desc>|" for each set flag; advances bp/size_bp and clears the
 * flag bit so leftovers can be printed in hex below. Bails out on
 * truncation or snprintf error.
 */
#define DESCRIBE_FLAG(flag, desc)						\
	do {								\
		if (flags & (flag)) {					\
			ret = snprintf(bp, size_bp, "%s|", (desc));	\
			if (ret < 0 || ret >= size_bp)			\
				goto out_overflow;			\
			size_bp -= ret;					\
			bp += ret;					\
			flags &= ~(flag);				\
		}							\
	} while (0)

	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
			      btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

	/* Any bits left over are unknown flags; print them raw in hex. */
	if (flags) {
		ret = snprintf(bp, size_bp, "0x%llx|", flags);
		size_bp -= ret;
	}

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

	/*
	 * The text is trimmed, it's up to the caller to provide sufficiently
	 * large buffer
	 */
out_overflow:;
}

static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_io_context **bioc_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
2739c6b1c4dSDavid Sterba * 2749c6b1c4dSDavid Sterba * global::fs_devs - add, remove, updates to the global list 2759c6b1c4dSDavid Sterba * 27618c850fdSJosef Bacik * does not protect: manipulation of the fs_devices::devices list in general 27718c850fdSJosef Bacik * but in mount context it could be used to exclude list modifications by eg. 27818c850fdSJosef Bacik * scan ioctl 2799c6b1c4dSDavid Sterba * 2809c6b1c4dSDavid Sterba * btrfs_device::name - renames (write side), read is RCU 2819c6b1c4dSDavid Sterba * 2829c6b1c4dSDavid Sterba * fs_devices::device_list_mutex (per-fs, with RCU) 2839c6b1c4dSDavid Sterba * ------------------------------------------------ 2849c6b1c4dSDavid Sterba * protects updates to fs_devices::devices, ie. adding and deleting 2859c6b1c4dSDavid Sterba * 2869c6b1c4dSDavid Sterba * simple list traversal with read-only actions can be done with RCU protection 2879c6b1c4dSDavid Sterba * 2889c6b1c4dSDavid Sterba * may be used to exclude some operations from running concurrently without any 2899c6b1c4dSDavid Sterba * modifications to the list (see write_all_supers) 2909c6b1c4dSDavid Sterba * 29118c850fdSJosef Bacik * Is not required at mount and close times, because our device list is 29218c850fdSJosef Bacik * protected by the uuid_mutex at that point. 29318c850fdSJosef Bacik * 2949c6b1c4dSDavid Sterba * balance_mutex 2959c6b1c4dSDavid Sterba * ------------- 2969c6b1c4dSDavid Sterba * protects balance structures (status, state) and context accessed from 2979c6b1c4dSDavid Sterba * several places (internally, ioctl) 2989c6b1c4dSDavid Sterba * 2999c6b1c4dSDavid Sterba * chunk_mutex 3009c6b1c4dSDavid Sterba * ----------- 3019c6b1c4dSDavid Sterba * protects chunks, adding or removing during allocation, trim or when a new 3020b6f5d40SNikolay Borisov * device is added/removed. 
Additionally it also protects post_commit_list of 3030b6f5d40SNikolay Borisov * individual devices, since they can be added to the transaction's 3040b6f5d40SNikolay Borisov * post_commit_list only with chunk_mutex held. 3059c6b1c4dSDavid Sterba * 3069c6b1c4dSDavid Sterba * cleaner_mutex 3079c6b1c4dSDavid Sterba * ------------- 3089c6b1c4dSDavid Sterba * a big lock that is held by the cleaner thread and prevents running subvolume 3099c6b1c4dSDavid Sterba * cleaning together with relocation or delayed iputs 3109c6b1c4dSDavid Sterba * 3119c6b1c4dSDavid Sterba * 3129c6b1c4dSDavid Sterba * Lock nesting 3139c6b1c4dSDavid Sterba * ============ 3149c6b1c4dSDavid Sterba * 3159c6b1c4dSDavid Sterba * uuid_mutex 3169c6b1c4dSDavid Sterba * device_list_mutex 3179c6b1c4dSDavid Sterba * chunk_mutex 3189c6b1c4dSDavid Sterba * balance_mutex 31989595e80SAnand Jain * 32089595e80SAnand Jain * 321c3e1f96cSGoldwyn Rodrigues * Exclusive operations 322c3e1f96cSGoldwyn Rodrigues * ==================== 32389595e80SAnand Jain * 32489595e80SAnand Jain * Maintains the exclusivity of the following operations that apply to the 32589595e80SAnand Jain * whole filesystem and cannot run in parallel. 
32689595e80SAnand Jain * 32789595e80SAnand Jain * - Balance (*) 32889595e80SAnand Jain * - Device add 32989595e80SAnand Jain * - Device remove 33089595e80SAnand Jain * - Device replace (*) 33189595e80SAnand Jain * - Resize 33289595e80SAnand Jain * 33389595e80SAnand Jain * The device operations (as above) can be in one of the following states: 33489595e80SAnand Jain * 33589595e80SAnand Jain * - Running state 33689595e80SAnand Jain * - Paused state 33789595e80SAnand Jain * - Completed state 33889595e80SAnand Jain * 33989595e80SAnand Jain * Only device operations marked with (*) can go into the Paused state for the 34089595e80SAnand Jain * following reasons: 34189595e80SAnand Jain * 34289595e80SAnand Jain * - ioctl (only Balance can be Paused through ioctl) 34389595e80SAnand Jain * - filesystem remounted as read-only 34489595e80SAnand Jain * - filesystem unmounted and mounted as read-only 34589595e80SAnand Jain * - system power-cycle and filesystem mounted as read-only 34689595e80SAnand Jain * - filesystem or device errors leading to forced read-only 34789595e80SAnand Jain * 348c3e1f96cSGoldwyn Rodrigues * The status of exclusive operation is set and cleared atomically. 349c3e1f96cSGoldwyn Rodrigues * During the course of Paused state, fs_info::exclusive_operation remains set. 35089595e80SAnand Jain * A device operation in Paused or Running state can be canceled or resumed 35189595e80SAnand Jain * either by ioctl (Balance only) or when remounted as read-write. 352c3e1f96cSGoldwyn Rodrigues * The exclusive status is cleared when the device operation is canceled or 35389595e80SAnand Jain * completed. 
 */

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

/* Accessor for the global fs_uuids list (used e.g. by sysfs code). */
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid: if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
						 const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	INIT_LIST_HEAD(&fs_devs->seed_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	/* Without a separate metadata fsid, metadata_uuid mirrors fsid. */
	if (metadata_fsid)
		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
	else if (fsid)
		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

/* Free a btrfs_device and the resources hanging off it. */
void btrfs_free_device(struct btrfs_device *device)
{
	WARN_ON(!list_empty(&device->post_commit_list));
	rcu_string_free(device->name);
	extent_io_tree_release(&device->alloc_state);
	bio_put(device->flush_bio);
	btrfs_destroy_dev_zone_info(device);
	kfree(device);
}

/* Free an fs_devices and every device still linked on its list. */
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	/* Must not be freed while still opened. */
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}

/* Module-exit cleanup: release every registered fs_devices. */
void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

static noinline struct btrfs_fs_devices
*find_fsid(
		const u8 *fsid, const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devices;

	ASSERT(fsid);

	/*
	 * Handle non-split brain cases: match on fsid and, when given,
	 * also on metadata_fsid.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (metadata_fsid) {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
				      BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		} else {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		}
	}
	return NULL;
}

static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
				struct btrfs_super_block *disk_super)
{

	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by first scanning
	 * a device which didn't have its fsid/metadata_uuid changed
	 * at all and the CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}
	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by a device that
	 * has an outdated pair of fsid/metadata_uuid and
	 * CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(fs_devices->metadata_uuid,
			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}

	/* Fall back to a plain fsid + metadata_uuid match. */
	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}


/*
 * Open @device_path, set the btrfs block size, and read its super block.
 * On success *bdev and *disk_super are valid; on failure *bdev is NULL and
 * a negative errno is returned.
 */
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct btrfs_super_block **disk_super)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	/* Optionally flush pending page-cache writes before reading the sb. */
	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*disk_super = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*disk_super)) {
		ret = PTR_ERR(*disk_super);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	return ret;
}

/* Return true if @device's name equals @path (name read under RCU). */
static bool device_path_matched(const char *path, struct btrfs_device *device)
{
	int found;

	rcu_read_lock();
	found = strcmp(rcu_str_deref(device->name), path);
	rcu_read_unlock();

	return found == 0;
}

/*
 * Search and remove all stale devices (devices which are not mounted).
 * When both inputs are NULL, it will search and release all stale devices.
 *
 * path:	Optional. When provided, it will release all unmounted devices
 *		matching this path only.
 * skip_dev:	Optional. Will skip this device when searching for the stale
 *		devices.
 *
 * Return:	0 for success or if @path is NULL.
 *		-EBUSY if @path is a mounted device.
 *		-ENOENT if @path does not match any device in the list.
 */
static int btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
	int ret = 0;

	lockdep_assert_held(&uuid_mutex);

	/* With a path given, not finding any match is an error. */
	if (path)
		ret = -ENOENT;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {

		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
				continue;
			if (path && !device->name)
				continue;
			if (path && !device_path_matched(path, device))
				continue;
			if (fs_devices->opened) {
				/* for an already deleted device return 0 */
				if (path && ret != 0)
					ret = -EBUSY;
				break;
			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			ret = 0;
		}
		mutex_unlock(&fs_devices->device_list_mutex);

		/* Drop the fs_devices itself once its last device is gone. */
		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}

	return ret;
}

/*
 * This is only used on mount, and we are protected from competing things
 * messing with our fs_devices by the uuid_mutex, thus we do not need the
 * fs_devices->device_list_mutex here.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	/* Already open, or no path to open it by. */
	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &disk_super);
	if (ret)
		return ret;

	/* The on-disk super block must match this device's identity. */
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_free_page;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_free_page;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		if (btrfs_super_incompat_flags(disk_super) &
		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
			pr_err(
		"BTRFS: Invalid seeding and uuid-changed device detected\n");
			goto error_free_page;
		}

		/* Seed devices are never writable. */
		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = true;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = true;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	/* The replace target does not count as an allocatable rw device. */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
Thumshirn btrfs_release_disk_super(disk_super); 6690fb08bccSAnand Jain 6700fb08bccSAnand Jain return 0; 6710fb08bccSAnand Jain 6728f32380dSJohannes Thumshirn error_free_page: 6738f32380dSJohannes Thumshirn btrfs_release_disk_super(disk_super); 6740fb08bccSAnand Jain blkdev_put(bdev, flags); 6750fb08bccSAnand Jain 6760fb08bccSAnand Jain return -EINVAL; 6770fb08bccSAnand Jain } 6780fb08bccSAnand Jain 67960999ca4SDavid Sterba /* 6807a62d0f0SNikolay Borisov * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices 681c0d81c7cSSu Yue * being created with a disk that has already completed its fsid change. Such 682c0d81c7cSSu Yue * disk can belong to an fs which has its FSID changed or to one which doesn't. 683c0d81c7cSSu Yue * Handle both cases here. 6847a62d0f0SNikolay Borisov */ 6857a62d0f0SNikolay Borisov static struct btrfs_fs_devices *find_fsid_inprogress( 6867a62d0f0SNikolay Borisov struct btrfs_super_block *disk_super) 6877a62d0f0SNikolay Borisov { 6887a62d0f0SNikolay Borisov struct btrfs_fs_devices *fs_devices; 6897a62d0f0SNikolay Borisov 6907a62d0f0SNikolay Borisov list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 6917a62d0f0SNikolay Borisov if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, 6927a62d0f0SNikolay Borisov BTRFS_FSID_SIZE) != 0 && 6937a62d0f0SNikolay Borisov memcmp(fs_devices->metadata_uuid, disk_super->fsid, 6947a62d0f0SNikolay Borisov BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) { 6957a62d0f0SNikolay Borisov return fs_devices; 6967a62d0f0SNikolay Borisov } 6977a62d0f0SNikolay Borisov } 6987a62d0f0SNikolay Borisov 699c0d81c7cSSu Yue return find_fsid(disk_super->fsid, NULL); 7007a62d0f0SNikolay Borisov } 7017a62d0f0SNikolay Borisov 702cc5de4e7SNikolay Borisov 703cc5de4e7SNikolay Borisov static struct btrfs_fs_devices *find_fsid_changed( 704cc5de4e7SNikolay Borisov struct btrfs_super_block *disk_super) 705cc5de4e7SNikolay Borisov { 706cc5de4e7SNikolay Borisov struct btrfs_fs_devices *fs_devices; 
707cc5de4e7SNikolay Borisov 708cc5de4e7SNikolay Borisov /* 709cc5de4e7SNikolay Borisov * Handles the case where scanned device is part of an fs that had 7101a9fd417SDavid Sterba * multiple successful changes of FSID but currently device didn't 71105840710SNikolay Borisov * observe it. Meaning our fsid will be different than theirs. We need 71205840710SNikolay Borisov * to handle two subcases : 71305840710SNikolay Borisov * 1 - The fs still continues to have different METADATA/FSID uuids. 71405840710SNikolay Borisov * 2 - The fs is switched back to its original FSID (METADATA/FSID 71505840710SNikolay Borisov * are equal). 716cc5de4e7SNikolay Borisov */ 717cc5de4e7SNikolay Borisov list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 71805840710SNikolay Borisov /* Changed UUIDs */ 719cc5de4e7SNikolay Borisov if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, 720cc5de4e7SNikolay Borisov BTRFS_FSID_SIZE) != 0 && 721cc5de4e7SNikolay Borisov memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid, 722cc5de4e7SNikolay Borisov BTRFS_FSID_SIZE) == 0 && 723cc5de4e7SNikolay Borisov memcmp(fs_devices->fsid, disk_super->fsid, 72405840710SNikolay Borisov BTRFS_FSID_SIZE) != 0) 725cc5de4e7SNikolay Borisov return fs_devices; 72605840710SNikolay Borisov 72705840710SNikolay Borisov /* Unchanged UUIDs */ 72805840710SNikolay Borisov if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, 72905840710SNikolay Borisov BTRFS_FSID_SIZE) == 0 && 73005840710SNikolay Borisov memcmp(fs_devices->fsid, disk_super->metadata_uuid, 73105840710SNikolay Borisov BTRFS_FSID_SIZE) == 0) 73205840710SNikolay Borisov return fs_devices; 733cc5de4e7SNikolay Borisov } 734cc5de4e7SNikolay Borisov 735cc5de4e7SNikolay Borisov return NULL; 736cc5de4e7SNikolay Borisov } 7371362089dSNikolay Borisov 7381362089dSNikolay Borisov static struct btrfs_fs_devices *find_fsid_reverted_metadata( 7391362089dSNikolay Borisov struct btrfs_super_block *disk_super) 7401362089dSNikolay Borisov { 7411362089dSNikolay 
Borisov struct btrfs_fs_devices *fs_devices; 7421362089dSNikolay Borisov 7431362089dSNikolay Borisov /* 7441362089dSNikolay Borisov * Handle the case where the scanned device is part of an fs whose last 7451362089dSNikolay Borisov * metadata UUID change reverted it to the original FSID. At the same 7461362089dSNikolay Borisov * time * fs_devices was first created by another constitutent device 7471362089dSNikolay Borisov * which didn't fully observe the operation. This results in an 7481362089dSNikolay Borisov * btrfs_fs_devices created with metadata/fsid different AND 7491362089dSNikolay Borisov * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the 7501362089dSNikolay Borisov * fs_devices equal to the FSID of the disk. 7511362089dSNikolay Borisov */ 7521362089dSNikolay Borisov list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 7531362089dSNikolay Borisov if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid, 7541362089dSNikolay Borisov BTRFS_FSID_SIZE) != 0 && 7551362089dSNikolay Borisov memcmp(fs_devices->metadata_uuid, disk_super->fsid, 7561362089dSNikolay Borisov BTRFS_FSID_SIZE) == 0 && 7571362089dSNikolay Borisov fs_devices->fsid_change) 7581362089dSNikolay Borisov return fs_devices; 7591362089dSNikolay Borisov } 7601362089dSNikolay Borisov 7611362089dSNikolay Borisov return NULL; 7621362089dSNikolay Borisov } 7637a62d0f0SNikolay Borisov /* 76460999ca4SDavid Sterba * Add new device to list of registered devices 76560999ca4SDavid Sterba * 76660999ca4SDavid Sterba * Returns: 767e124ece5SAnand Jain * device pointer which was just added or updated when successful 768e124ece5SAnand Jain * error pointer when failed 76960999ca4SDavid Sterba */ 770e124ece5SAnand Jain static noinline struct btrfs_device *device_list_add(const char *path, 7714306a974SAnand Jain struct btrfs_super_block *disk_super, 7724306a974SAnand Jain bool *new_device_added) 7738a4b83ccSChris Mason { 7748a4b83ccSChris Mason struct btrfs_device *device; 7757a62d0f0SNikolay Borisov 
struct btrfs_fs_devices *fs_devices = NULL; 776606686eeSJosef Bacik struct rcu_string *name; 7778a4b83ccSChris Mason u64 found_transid = btrfs_super_generation(disk_super); 7783acbcbfcSAnand Jain u64 devid = btrfs_stack_device_id(&disk_super->dev_item); 7797239ff4bSNikolay Borisov bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & 7807239ff4bSNikolay Borisov BTRFS_FEATURE_INCOMPAT_METADATA_UUID); 781d1a63002SNikolay Borisov bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & 782d1a63002SNikolay Borisov BTRFS_SUPER_FLAG_CHANGING_FSID_V2); 7838a4b83ccSChris Mason 784cc5de4e7SNikolay Borisov if (fsid_change_in_progress) { 785c0d81c7cSSu Yue if (!has_metadata_uuid) 7867a62d0f0SNikolay Borisov fs_devices = find_fsid_inprogress(disk_super); 787c0d81c7cSSu Yue else 788cc5de4e7SNikolay Borisov fs_devices = find_fsid_changed(disk_super); 7897a62d0f0SNikolay Borisov } else if (has_metadata_uuid) { 790c6730a0eSSu Yue fs_devices = find_fsid_with_metadata_uuid(disk_super); 7917a62d0f0SNikolay Borisov } else { 7921362089dSNikolay Borisov fs_devices = find_fsid_reverted_metadata(disk_super); 7931362089dSNikolay Borisov if (!fs_devices) 7947a62d0f0SNikolay Borisov fs_devices = find_fsid(disk_super->fsid, NULL); 7957a62d0f0SNikolay Borisov } 7967a62d0f0SNikolay Borisov 7977a62d0f0SNikolay Borisov 7988a4b83ccSChris Mason if (!fs_devices) { 7997239ff4bSNikolay Borisov if (has_metadata_uuid) 8007239ff4bSNikolay Borisov fs_devices = alloc_fs_devices(disk_super->fsid, 8017239ff4bSNikolay Borisov disk_super->metadata_uuid); 8027239ff4bSNikolay Borisov else 8037239ff4bSNikolay Borisov fs_devices = alloc_fs_devices(disk_super->fsid, NULL); 8047239ff4bSNikolay Borisov 8052208a378SIlya Dryomov if (IS_ERR(fs_devices)) 806e124ece5SAnand Jain return ERR_CAST(fs_devices); 8072208a378SIlya Dryomov 80892900e51SAl Viro fs_devices->fsid_change = fsid_change_in_progress; 80992900e51SAl Viro 8109c6d173eSAnand Jain mutex_lock(&fs_devices->device_list_mutex); 
811c4babc5eSAnand Jain list_add(&fs_devices->fs_list, &fs_uuids); 8122208a378SIlya Dryomov 8138a4b83ccSChris Mason device = NULL; 8148a4b83ccSChris Mason } else { 8159c6d173eSAnand Jain mutex_lock(&fs_devices->device_list_mutex); 81609ba3bc9SAnand Jain device = btrfs_find_device(fs_devices, devid, 817b2598edfSAnand Jain disk_super->dev_item.uuid, NULL); 8187a62d0f0SNikolay Borisov 8197a62d0f0SNikolay Borisov /* 8207a62d0f0SNikolay Borisov * If this disk has been pulled into an fs devices created by 8217a62d0f0SNikolay Borisov * a device which had the CHANGING_FSID_V2 flag then replace the 8227a62d0f0SNikolay Borisov * metadata_uuid/fsid values of the fs_devices. 8237a62d0f0SNikolay Borisov */ 8241362089dSNikolay Borisov if (fs_devices->fsid_change && 8257a62d0f0SNikolay Borisov found_transid > fs_devices->latest_generation) { 8267a62d0f0SNikolay Borisov memcpy(fs_devices->fsid, disk_super->fsid, 8277a62d0f0SNikolay Borisov BTRFS_FSID_SIZE); 8281362089dSNikolay Borisov 8291362089dSNikolay Borisov if (has_metadata_uuid) 8307a62d0f0SNikolay Borisov memcpy(fs_devices->metadata_uuid, 8311362089dSNikolay Borisov disk_super->metadata_uuid, 8321362089dSNikolay Borisov BTRFS_FSID_SIZE); 8331362089dSNikolay Borisov else 8341362089dSNikolay Borisov memcpy(fs_devices->metadata_uuid, 8351362089dSNikolay Borisov disk_super->fsid, BTRFS_FSID_SIZE); 8367a62d0f0SNikolay Borisov 8377a62d0f0SNikolay Borisov fs_devices->fsid_change = false; 8387a62d0f0SNikolay Borisov } 8398a4b83ccSChris Mason } 840443f24feSMiao Xie 8418a4b83ccSChris Mason if (!device) { 8429c6d173eSAnand Jain if (fs_devices->opened) { 8439c6d173eSAnand Jain mutex_unlock(&fs_devices->device_list_mutex); 844e124ece5SAnand Jain return ERR_PTR(-EBUSY); 8459c6d173eSAnand Jain } 8462b82032cSYan Zheng 84712bd2fc0SIlya Dryomov device = btrfs_alloc_device(NULL, &devid, 84812bd2fc0SIlya Dryomov disk_super->dev_item.uuid); 84912bd2fc0SIlya Dryomov if (IS_ERR(device)) { 8509c6d173eSAnand Jain 
mutex_unlock(&fs_devices->device_list_mutex); 8518a4b83ccSChris Mason /* we can safely leave the fs_devices entry around */ 852e124ece5SAnand Jain return device; 8538a4b83ccSChris Mason } 854606686eeSJosef Bacik 855606686eeSJosef Bacik name = rcu_string_strdup(path, GFP_NOFS); 856606686eeSJosef Bacik if (!name) { 857a425f9d4SDavid Sterba btrfs_free_device(device); 8589c6d173eSAnand Jain mutex_unlock(&fs_devices->device_list_mutex); 859e124ece5SAnand Jain return ERR_PTR(-ENOMEM); 8608a4b83ccSChris Mason } 861606686eeSJosef Bacik rcu_assign_pointer(device->name, name); 86290519d66SArne Jansen 8631f78160cSXiao Guangrong list_add_rcu(&device->dev_list, &fs_devices->devices); 864f7171750SFilipe David Borba Manana fs_devices->num_devices++; 865e5e9a520SChris Mason 8662b82032cSYan Zheng device->fs_devices = fs_devices; 8674306a974SAnand Jain *new_device_added = true; 868327f18ccSAnand Jain 869327f18ccSAnand Jain if (disk_super->label[0]) 870aa6c0df7SAnand Jain pr_info( 871aa6c0df7SAnand Jain "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n", 872aa6c0df7SAnand Jain disk_super->label, devid, found_transid, path, 873aa6c0df7SAnand Jain current->comm, task_pid_nr(current)); 874327f18ccSAnand Jain else 875aa6c0df7SAnand Jain pr_info( 876aa6c0df7SAnand Jain "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n", 877aa6c0df7SAnand Jain disk_super->fsid, devid, found_transid, path, 878aa6c0df7SAnand Jain current->comm, task_pid_nr(current)); 879327f18ccSAnand Jain 880606686eeSJosef Bacik } else if (!device->name || strcmp(device->name->str, path)) { 881b96de000SAnand Jain /* 882b96de000SAnand Jain * When FS is already mounted. 883b96de000SAnand Jain * 1. If you are here and if the device->name is NULL that 884b96de000SAnand Jain * means this device was missing at time of FS mount. 885b96de000SAnand Jain * 2. 
If you are here and if the device->name is different 886b96de000SAnand Jain * from 'path' that means either 887b96de000SAnand Jain * a. The same device disappeared and reappeared with 888b96de000SAnand Jain * different name. or 889b96de000SAnand Jain * b. The missing-disk-which-was-replaced, has 890b96de000SAnand Jain * reappeared now. 891b96de000SAnand Jain * 892b96de000SAnand Jain * We must allow 1 and 2a above. But 2b would be a spurious 893b96de000SAnand Jain * and unintentional. 894b96de000SAnand Jain * 895b96de000SAnand Jain * Further in case of 1 and 2a above, the disk at 'path' 896b96de000SAnand Jain * would have missed some transaction when it was away and 897b96de000SAnand Jain * in case of 2a the stale bdev has to be updated as well. 898b96de000SAnand Jain * 2b must not be allowed at all time. 899b96de000SAnand Jain */ 900b96de000SAnand Jain 901b96de000SAnand Jain /* 9020f23ae74SChris Mason * For now, we do allow update to btrfs_fs_device through the 9030f23ae74SChris Mason * btrfs dev scan cli after FS has been mounted. We're still 9040f23ae74SChris Mason * tracking a problem where systems fail mount by subvolume id 9050f23ae74SChris Mason * when we reject replacement on a mounted FS. 906b96de000SAnand Jain */ 9070f23ae74SChris Mason if (!fs_devices->opened && found_transid < device->generation) { 90877bdae4dSAnand Jain /* 90977bdae4dSAnand Jain * That is if the FS is _not_ mounted and if you 91077bdae4dSAnand Jain * are here, that means there is more than one 91177bdae4dSAnand Jain * disk with same uuid and devid.We keep the one 91277bdae4dSAnand Jain * with larger generation number or the last-in if 91377bdae4dSAnand Jain * generation are equal. 
91477bdae4dSAnand Jain */ 9159c6d173eSAnand Jain mutex_unlock(&fs_devices->device_list_mutex); 916e124ece5SAnand Jain return ERR_PTR(-EEXIST); 91777bdae4dSAnand Jain } 918b96de000SAnand Jain 919a9261d41SAnand Jain /* 920a9261d41SAnand Jain * We are going to replace the device path for a given devid, 921a9261d41SAnand Jain * make sure it's the same device if the device is mounted 922a9261d41SAnand Jain */ 923a9261d41SAnand Jain if (device->bdev) { 9244e7b5671SChristoph Hellwig int error; 9254e7b5671SChristoph Hellwig dev_t path_dev; 926a9261d41SAnand Jain 9274e7b5671SChristoph Hellwig error = lookup_bdev(path, &path_dev); 9284e7b5671SChristoph Hellwig if (error) { 929a9261d41SAnand Jain mutex_unlock(&fs_devices->device_list_mutex); 9304e7b5671SChristoph Hellwig return ERR_PTR(error); 931a9261d41SAnand Jain } 932a9261d41SAnand Jain 9334e7b5671SChristoph Hellwig if (device->bdev->bd_dev != path_dev) { 934a9261d41SAnand Jain mutex_unlock(&fs_devices->device_list_mutex); 9350697d9a6SJohannes Thumshirn /* 9360697d9a6SJohannes Thumshirn * device->fs_info may not be reliable here, so 9370697d9a6SJohannes Thumshirn * pass in a NULL instead. This avoids a 9380697d9a6SJohannes Thumshirn * possible use-after-free when the fs_info and 9390697d9a6SJohannes Thumshirn * fs_info->sb are already torn down. 
9400697d9a6SJohannes Thumshirn */ 9410697d9a6SJohannes Thumshirn btrfs_warn_in_rcu(NULL, 94279dae17dSAnand Jain "duplicate device %s devid %llu generation %llu scanned by %s (%d)", 94379dae17dSAnand Jain path, devid, found_transid, 94479dae17dSAnand Jain current->comm, 94579dae17dSAnand Jain task_pid_nr(current)); 946a9261d41SAnand Jain return ERR_PTR(-EEXIST); 947a9261d41SAnand Jain } 948a9261d41SAnand Jain btrfs_info_in_rcu(device->fs_info, 94979dae17dSAnand Jain "devid %llu device path %s changed to %s scanned by %s (%d)", 95079dae17dSAnand Jain devid, rcu_str_deref(device->name), 95179dae17dSAnand Jain path, current->comm, 95279dae17dSAnand Jain task_pid_nr(current)); 953a9261d41SAnand Jain } 954a9261d41SAnand Jain 955606686eeSJosef Bacik name = rcu_string_strdup(path, GFP_NOFS); 9569c6d173eSAnand Jain if (!name) { 9579c6d173eSAnand Jain mutex_unlock(&fs_devices->device_list_mutex); 958e124ece5SAnand Jain return ERR_PTR(-ENOMEM); 9599c6d173eSAnand Jain } 960606686eeSJosef Bacik rcu_string_free(device->name); 961606686eeSJosef Bacik rcu_assign_pointer(device->name, name); 962e6e674bdSAnand Jain if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 963cd02dca5SChris Mason fs_devices->missing_devices--; 964e6e674bdSAnand Jain clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 965cd02dca5SChris Mason } 9668a4b83ccSChris Mason } 9678a4b83ccSChris Mason 96877bdae4dSAnand Jain /* 96977bdae4dSAnand Jain * Unmount does not free the btrfs_device struct but would zero 97077bdae4dSAnand Jain * generation along with most of the other members. So just update 97177bdae4dSAnand Jain * it back. We need it to pick the disk with largest generation 97277bdae4dSAnand Jain * (as above). 
97377bdae4dSAnand Jain */ 974d1a63002SNikolay Borisov if (!fs_devices->opened) { 97577bdae4dSAnand Jain device->generation = found_transid; 976d1a63002SNikolay Borisov fs_devices->latest_generation = max_t(u64, found_transid, 977d1a63002SNikolay Borisov fs_devices->latest_generation); 978d1a63002SNikolay Borisov } 97977bdae4dSAnand Jain 980f2788d2fSAnand Jain fs_devices->total_devices = btrfs_super_num_devices(disk_super); 981f2788d2fSAnand Jain 9829c6d173eSAnand Jain mutex_unlock(&fs_devices->device_list_mutex); 983e124ece5SAnand Jain return device; 9848a4b83ccSChris Mason } 9858a4b83ccSChris Mason 986e4404d6eSYan Zheng static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 987e4404d6eSYan Zheng { 988e4404d6eSYan Zheng struct btrfs_fs_devices *fs_devices; 989e4404d6eSYan Zheng struct btrfs_device *device; 990e4404d6eSYan Zheng struct btrfs_device *orig_dev; 991d2979aa2SAnand Jain int ret = 0; 992e4404d6eSYan Zheng 993c1247069SAnand Jain lockdep_assert_held(&uuid_mutex); 994c1247069SAnand Jain 9957239ff4bSNikolay Borisov fs_devices = alloc_fs_devices(orig->fsid, NULL); 9962208a378SIlya Dryomov if (IS_ERR(fs_devices)) 9972208a378SIlya Dryomov return fs_devices; 998e4404d6eSYan Zheng 99902db0844SJosef Bacik fs_devices->total_devices = orig->total_devices; 1000e4404d6eSYan Zheng 1001e4404d6eSYan Zheng list_for_each_entry(orig_dev, &orig->devices, dev_list) { 1002606686eeSJosef Bacik struct rcu_string *name; 1003606686eeSJosef Bacik 100412bd2fc0SIlya Dryomov device = btrfs_alloc_device(NULL, &orig_dev->devid, 100512bd2fc0SIlya Dryomov orig_dev->uuid); 1006d2979aa2SAnand Jain if (IS_ERR(device)) { 1007d2979aa2SAnand Jain ret = PTR_ERR(device); 1008e4404d6eSYan Zheng goto error; 1009d2979aa2SAnand Jain } 1010e4404d6eSYan Zheng 1011606686eeSJosef Bacik /* 1012606686eeSJosef Bacik * This is ok to do without rcu read locked because we hold the 1013606686eeSJosef Bacik * uuid mutex so nothing we touch in here is going to disappear. 
1014606686eeSJosef Bacik */ 1015e755f780SAnand Jain if (orig_dev->name) { 101678f2c9e6SDavid Sterba name = rcu_string_strdup(orig_dev->name->str, 101778f2c9e6SDavid Sterba GFP_KERNEL); 1018606686eeSJosef Bacik if (!name) { 1019a425f9d4SDavid Sterba btrfs_free_device(device); 1020d2979aa2SAnand Jain ret = -ENOMEM; 1021e4404d6eSYan Zheng goto error; 1022fd2696f3SJulia Lawall } 1023606686eeSJosef Bacik rcu_assign_pointer(device->name, name); 1024e755f780SAnand Jain } 1025e4404d6eSYan Zheng 1026e4404d6eSYan Zheng list_add(&device->dev_list, &fs_devices->devices); 1027e4404d6eSYan Zheng device->fs_devices = fs_devices; 1028e4404d6eSYan Zheng fs_devices->num_devices++; 1029e4404d6eSYan Zheng } 1030e4404d6eSYan Zheng return fs_devices; 1031e4404d6eSYan Zheng error: 1032e4404d6eSYan Zheng free_fs_devices(fs_devices); 1033d2979aa2SAnand Jain return ERR_PTR(ret); 1034e4404d6eSYan Zheng } 1035e4404d6eSYan Zheng 10363712ccb7SNikolay Borisov static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, 1037bacce86aSAnand Jain struct btrfs_device **latest_dev) 1038dfe25020SChris Mason { 1039c6e30871SQinghuang Feng struct btrfs_device *device, *next; 1040a6b0d5c8SChris Mason 104146224705SXiao Guangrong /* This is the initialized path, it is safe to release the devices. 
*/ 1042c6e30871SQinghuang Feng list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 10433712ccb7SNikolay Borisov if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { 1044401e29c1SAnand Jain if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 1045401e29c1SAnand Jain &device->dev_state) && 1046998a0671SAnand Jain !test_bit(BTRFS_DEV_STATE_MISSING, 1047998a0671SAnand Jain &device->dev_state) && 10483712ccb7SNikolay Borisov (!*latest_dev || 10493712ccb7SNikolay Borisov device->generation > (*latest_dev)->generation)) { 10503712ccb7SNikolay Borisov *latest_dev = device; 1051a6b0d5c8SChris Mason } 10522b82032cSYan Zheng continue; 1053a6b0d5c8SChris Mason } 10542b82032cSYan Zheng 10558dabb742SStefan Behrens /* 1056cf89af14SAnand Jain * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID, 1057cf89af14SAnand Jain * in btrfs_init_dev_replace() so just continue. 10588dabb742SStefan Behrens */ 1059cf89af14SAnand Jain if (device->devid == BTRFS_DEV_REPLACE_DEVID) 10608dabb742SStefan Behrens continue; 1061cf89af14SAnand Jain 1062a74a4b97SChris Mason if (device->bdev) { 1063d4d77629STejun Heo blkdev_put(device->bdev, device->mode); 10642b82032cSYan Zheng device->bdev = NULL; 1065a74a4b97SChris Mason fs_devices->open_devices--; 1066a74a4b97SChris Mason } 1067ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 10682b82032cSYan Zheng list_del_init(&device->dev_alloc_list); 1069ebbede42SAnand Jain clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 1070b2a61667SDesmond Cheong Zhi Xi fs_devices->rw_devices--; 10712b82032cSYan Zheng } 10722b82032cSYan Zheng list_del_init(&device->dev_list); 10732b82032cSYan Zheng fs_devices->num_devices--; 1074a425f9d4SDavid Sterba btrfs_free_device(device); 10752b82032cSYan Zheng } 10762b82032cSYan Zheng 10773712ccb7SNikolay Borisov } 10783712ccb7SNikolay Borisov 10793712ccb7SNikolay Borisov /* 10803712ccb7SNikolay Borisov * After we have read the system tree and know devids 
belonging to this 10813712ccb7SNikolay Borisov * filesystem, remove the device which does not belong there. 10823712ccb7SNikolay Borisov */ 1083bacce86aSAnand Jain void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices) 10843712ccb7SNikolay Borisov { 10853712ccb7SNikolay Borisov struct btrfs_device *latest_dev = NULL; 1086944d3f9fSNikolay Borisov struct btrfs_fs_devices *seed_dev; 10873712ccb7SNikolay Borisov 10883712ccb7SNikolay Borisov mutex_lock(&uuid_mutex); 1089bacce86aSAnand Jain __btrfs_free_extra_devids(fs_devices, &latest_dev); 1090944d3f9fSNikolay Borisov 1091944d3f9fSNikolay Borisov list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) 1092bacce86aSAnand Jain __btrfs_free_extra_devids(seed_dev, &latest_dev); 10932b82032cSYan Zheng 1094d24fa5c1SAnand Jain fs_devices->latest_dev = latest_dev; 1095a6b0d5c8SChris Mason 1096dfe25020SChris Mason mutex_unlock(&uuid_mutex); 1097dfe25020SChris Mason } 1098a0af469bSChris Mason 109914238819SAnand Jain static void btrfs_close_bdev(struct btrfs_device *device) 110014238819SAnand Jain { 110108ffcae8SDavid Sterba if (!device->bdev) 110208ffcae8SDavid Sterba return; 110308ffcae8SDavid Sterba 1104ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 110514238819SAnand Jain sync_blockdev(device->bdev); 110614238819SAnand Jain invalidate_bdev(device->bdev); 110714238819SAnand Jain } 110814238819SAnand Jain 110914238819SAnand Jain blkdev_put(device->bdev, device->mode); 111014238819SAnand Jain } 111114238819SAnand Jain 1112959b1c04SNikolay Borisov static void btrfs_close_one_device(struct btrfs_device *device) 1113f448341aSAnand Jain { 1114f448341aSAnand Jain struct btrfs_fs_devices *fs_devices = device->fs_devices; 1115f448341aSAnand Jain 1116ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 1117f448341aSAnand Jain device->devid != BTRFS_DEV_REPLACE_DEVID) { 1118f448341aSAnand Jain list_del_init(&device->dev_alloc_list); 1119f448341aSAnand 
Jain fs_devices->rw_devices--; 1120f448341aSAnand Jain } 1121f448341aSAnand Jain 11220d977e0eSDesmond Cheong Zhi Xi if (device->devid == BTRFS_DEV_REPLACE_DEVID) 11230d977e0eSDesmond Cheong Zhi Xi clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 11240d977e0eSDesmond Cheong Zhi Xi 1125e6e674bdSAnand Jain if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 1126f448341aSAnand Jain fs_devices->missing_devices--; 1127f448341aSAnand Jain 1128959b1c04SNikolay Borisov btrfs_close_bdev(device); 1129321f69f8SJohannes Thumshirn if (device->bdev) { 11303fff3975SJohannes Thumshirn fs_devices->open_devices--; 1131321f69f8SJohannes Thumshirn device->bdev = NULL; 1132f448341aSAnand Jain } 1133321f69f8SJohannes Thumshirn clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 11345b316468SNaohiro Aota btrfs_destroy_dev_zone_info(device); 1135f448341aSAnand Jain 1136321f69f8SJohannes Thumshirn device->fs_info = NULL; 1137321f69f8SJohannes Thumshirn atomic_set(&device->dev_stats_ccnt, 0); 1138321f69f8SJohannes Thumshirn extent_io_tree_release(&device->alloc_state); 1139959b1c04SNikolay Borisov 11406b225baaSFilipe Manana /* 11416b225baaSFilipe Manana * Reset the flush error record. We might have a transient flush error 11426b225baaSFilipe Manana * in this mount, and if so we aborted the current transaction and set 11436b225baaSFilipe Manana * the fs to an error state, guaranteeing no super blocks can be further 11446b225baaSFilipe Manana * committed. However that error might be transient and if we unmount the 11456b225baaSFilipe Manana * filesystem and mount it again, we should allow the mount to succeed 11466b225baaSFilipe Manana * (btrfs_check_rw_degradable() should not fail) - if after mounting the 11476b225baaSFilipe Manana * filesystem again we still get flush errors, then we will again abort 11486b225baaSFilipe Manana * any transaction and set the error state, guaranteeing no commits of 11496b225baaSFilipe Manana * unsafe super blocks. 
11506b225baaSFilipe Manana */ 11516b225baaSFilipe Manana device->last_flush_error = 0; 11526b225baaSFilipe Manana 1153321f69f8SJohannes Thumshirn /* Verify the device is back in a pristine state */ 1154321f69f8SJohannes Thumshirn ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); 1155321f69f8SJohannes Thumshirn ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); 1156321f69f8SJohannes Thumshirn ASSERT(list_empty(&device->dev_alloc_list)); 1157321f69f8SJohannes Thumshirn ASSERT(list_empty(&device->post_commit_list)); 1158321f69f8SJohannes Thumshirn ASSERT(atomic_read(&device->reada_in_flight) == 0); 1159f448341aSAnand Jain } 1160f448341aSAnand Jain 116154eed6aeSNikolay Borisov static void close_fs_devices(struct btrfs_fs_devices *fs_devices) 11628a4b83ccSChris Mason { 11632037a093SSasha Levin struct btrfs_device *device, *tmp; 1164e4404d6eSYan Zheng 1165425c6ed6SJosef Bacik lockdep_assert_held(&uuid_mutex); 1166425c6ed6SJosef Bacik 11672b82032cSYan Zheng if (--fs_devices->opened > 0) 116854eed6aeSNikolay Borisov return; 11698a4b83ccSChris Mason 1170425c6ed6SJosef Bacik list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) 1171959b1c04SNikolay Borisov btrfs_close_one_device(device); 1172c9513edbSXiao Guangrong 1173e4404d6eSYan Zheng WARN_ON(fs_devices->open_devices); 1174e4404d6eSYan Zheng WARN_ON(fs_devices->rw_devices); 11752b82032cSYan Zheng fs_devices->opened = 0; 11760395d84fSJohannes Thumshirn fs_devices->seeding = false; 1177c4989c2fSNikolay Borisov fs_devices->fs_info = NULL; 11788a4b83ccSChris Mason } 11798a4b83ccSChris Mason 118054eed6aeSNikolay Borisov void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 11812b82032cSYan Zheng { 1182944d3f9fSNikolay Borisov LIST_HEAD(list); 1183944d3f9fSNikolay Borisov struct btrfs_fs_devices *tmp; 11842b82032cSYan Zheng 11852b82032cSYan Zheng mutex_lock(&uuid_mutex); 118654eed6aeSNikolay Borisov close_fs_devices(fs_devices); 1187944d3f9fSNikolay Borisov if 
(!fs_devices->opened) 1188944d3f9fSNikolay Borisov list_splice_init(&fs_devices->seed_list, &list); 1189e4404d6eSYan Zheng 1190944d3f9fSNikolay Borisov list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { 11910226e0ebSAnand Jain close_fs_devices(fs_devices); 1192944d3f9fSNikolay Borisov list_del(&fs_devices->seed_list); 1193e4404d6eSYan Zheng free_fs_devices(fs_devices); 1194e4404d6eSYan Zheng } 1195425c6ed6SJosef Bacik mutex_unlock(&uuid_mutex); 11962b82032cSYan Zheng } 11972b82032cSYan Zheng 1198897fb573SAnand Jain static int open_fs_devices(struct btrfs_fs_devices *fs_devices, 119997288f2cSChristoph Hellwig fmode_t flags, void *holder) 12008a4b83ccSChris Mason { 12018a4b83ccSChris Mason struct btrfs_device *device; 1202443f24feSMiao Xie struct btrfs_device *latest_dev = NULL; 120396c2e067SAnand Jain struct btrfs_device *tmp_device; 12048a4b83ccSChris Mason 1205d4d77629STejun Heo flags |= FMODE_EXCL; 1206d4d77629STejun Heo 120796c2e067SAnand Jain list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, 120896c2e067SAnand Jain dev_list) { 120996c2e067SAnand Jain int ret; 1210a0af469bSChris Mason 121196c2e067SAnand Jain ret = btrfs_open_one_device(fs_devices, device, flags, holder); 121296c2e067SAnand Jain if (ret == 0 && 121396c2e067SAnand Jain (!latest_dev || device->generation > latest_dev->generation)) { 12149f050db4SAnand Jain latest_dev = device; 121596c2e067SAnand Jain } else if (ret == -ENODATA) { 121696c2e067SAnand Jain fs_devices->num_devices--; 121796c2e067SAnand Jain list_del(&device->dev_list); 121896c2e067SAnand Jain btrfs_free_device(device); 121996c2e067SAnand Jain } 12208a4b83ccSChris Mason } 12211ed802c9SAnand Jain if (fs_devices->open_devices == 0) 12221ed802c9SAnand Jain return -EINVAL; 12231ed802c9SAnand Jain 12242b82032cSYan Zheng fs_devices->opened = 1; 1225d24fa5c1SAnand Jain fs_devices->latest_dev = latest_dev; 12262b82032cSYan Zheng fs_devices->total_rw_bytes = 0; 1227c4a816c6SNaohiro Aota fs_devices->chunk_alloc_policy 
= BTRFS_CHUNK_ALLOC_REGULAR; 122833fd2f71SAnand Jain fs_devices->read_policy = BTRFS_READ_POLICY_PID; 12291ed802c9SAnand Jain 12301ed802c9SAnand Jain return 0; 12312b82032cSYan Zheng } 12322b82032cSYan Zheng 12334f0f586bSSami Tolvanen static int devid_cmp(void *priv, const struct list_head *a, 12344f0f586bSSami Tolvanen const struct list_head *b) 1235f8e10cd3SAnand Jain { 1236214cc184SDavid Sterba const struct btrfs_device *dev1, *dev2; 1237f8e10cd3SAnand Jain 1238f8e10cd3SAnand Jain dev1 = list_entry(a, struct btrfs_device, dev_list); 1239f8e10cd3SAnand Jain dev2 = list_entry(b, struct btrfs_device, dev_list); 1240f8e10cd3SAnand Jain 1241f8e10cd3SAnand Jain if (dev1->devid < dev2->devid) 1242f8e10cd3SAnand Jain return -1; 1243f8e10cd3SAnand Jain else if (dev1->devid > dev2->devid) 1244f8e10cd3SAnand Jain return 1; 1245f8e10cd3SAnand Jain return 0; 1246f8e10cd3SAnand Jain } 1247f8e10cd3SAnand Jain 12482b82032cSYan Zheng int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 124997288f2cSChristoph Hellwig fmode_t flags, void *holder) 12502b82032cSYan Zheng { 12512b82032cSYan Zheng int ret; 12522b82032cSYan Zheng 1253f5194e34SDavid Sterba lockdep_assert_held(&uuid_mutex); 125418c850fdSJosef Bacik /* 125518c850fdSJosef Bacik * The device_list_mutex cannot be taken here in case opening the 1256a8698707SChristoph Hellwig * underlying device takes further locks like open_mutex. 
125718c850fdSJosef Bacik * 125818c850fdSJosef Bacik * We also don't need the lock here as this is called during mount and 125918c850fdSJosef Bacik * exclusion is provided by uuid_mutex 126018c850fdSJosef Bacik */ 1261f5194e34SDavid Sterba 12622b82032cSYan Zheng if (fs_devices->opened) { 12632b82032cSYan Zheng fs_devices->opened++; 12642b82032cSYan Zheng ret = 0; 12652b82032cSYan Zheng } else { 1266f8e10cd3SAnand Jain list_sort(NULL, &fs_devices->devices, devid_cmp); 1267897fb573SAnand Jain ret = open_fs_devices(fs_devices, flags, holder); 12682b82032cSYan Zheng } 1269542c5908SAnand Jain 12708a4b83ccSChris Mason return ret; 12718a4b83ccSChris Mason } 12728a4b83ccSChris Mason 12738f32380dSJohannes Thumshirn void btrfs_release_disk_super(struct btrfs_super_block *super) 12746cf86a00SAnand Jain { 12758f32380dSJohannes Thumshirn struct page *page = virt_to_page(super); 12768f32380dSJohannes Thumshirn 12776cf86a00SAnand Jain put_page(page); 12786cf86a00SAnand Jain } 12796cf86a00SAnand Jain 1280b335eab8SNikolay Borisov static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, 128112659251SNaohiro Aota u64 bytenr, u64 bytenr_orig) 12826cf86a00SAnand Jain { 1283b335eab8SNikolay Borisov struct btrfs_super_block *disk_super; 1284b335eab8SNikolay Borisov struct page *page; 12856cf86a00SAnand Jain void *p; 12866cf86a00SAnand Jain pgoff_t index; 12876cf86a00SAnand Jain 12886cf86a00SAnand Jain /* make sure our super fits in the device */ 12896cf86a00SAnand Jain if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) 1290b335eab8SNikolay Borisov return ERR_PTR(-EINVAL); 12916cf86a00SAnand Jain 12926cf86a00SAnand Jain /* make sure our super fits in the page */ 1293b335eab8SNikolay Borisov if (sizeof(*disk_super) > PAGE_SIZE) 1294b335eab8SNikolay Borisov return ERR_PTR(-EINVAL); 12956cf86a00SAnand Jain 12966cf86a00SAnand Jain /* make sure our super doesn't straddle pages on disk */ 12976cf86a00SAnand Jain index = bytenr >> PAGE_SHIFT; 1298b335eab8SNikolay 
Borisov if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) 1299b335eab8SNikolay Borisov return ERR_PTR(-EINVAL); 13006cf86a00SAnand Jain 13016cf86a00SAnand Jain /* pull in the page with our super */ 1302b335eab8SNikolay Borisov page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL); 13036cf86a00SAnand Jain 1304b335eab8SNikolay Borisov if (IS_ERR(page)) 1305b335eab8SNikolay Borisov return ERR_CAST(page); 13066cf86a00SAnand Jain 1307b335eab8SNikolay Borisov p = page_address(page); 13086cf86a00SAnand Jain 13096cf86a00SAnand Jain /* align our pointer to the offset of the super block */ 1310b335eab8SNikolay Borisov disk_super = p + offset_in_page(bytenr); 13116cf86a00SAnand Jain 131212659251SNaohiro Aota if (btrfs_super_bytenr(disk_super) != bytenr_orig || 1313b335eab8SNikolay Borisov btrfs_super_magic(disk_super) != BTRFS_MAGIC) { 13148f32380dSJohannes Thumshirn btrfs_release_disk_super(p); 1315b335eab8SNikolay Borisov return ERR_PTR(-EINVAL); 13166cf86a00SAnand Jain } 13176cf86a00SAnand Jain 1318b335eab8SNikolay Borisov if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) 1319b335eab8SNikolay Borisov disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; 13206cf86a00SAnand Jain 1321b335eab8SNikolay Borisov return disk_super; 13226cf86a00SAnand Jain } 13236cf86a00SAnand Jain 1324228a73abSAnand Jain int btrfs_forget_devices(const char *path) 1325228a73abSAnand Jain { 1326228a73abSAnand Jain int ret; 1327228a73abSAnand Jain 1328228a73abSAnand Jain mutex_lock(&uuid_mutex); 1329228a73abSAnand Jain ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL); 1330228a73abSAnand Jain mutex_unlock(&uuid_mutex); 1331228a73abSAnand Jain 1332228a73abSAnand Jain return ret; 1333228a73abSAnand Jain } 1334228a73abSAnand Jain 13356f60cbd3SDavid Sterba /* 13366f60cbd3SDavid Sterba * Look for a btrfs signature on a device. 
This may be called out of the mount path 13376f60cbd3SDavid Sterba * and we are not allowed to call set_blocksize during the scan. The superblock 13386f60cbd3SDavid Sterba * is read via pagecache 13396f60cbd3SDavid Sterba */ 134036350e95SGu Jinxiang struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, 134136350e95SGu Jinxiang void *holder) 13428a4b83ccSChris Mason { 13438a4b83ccSChris Mason struct btrfs_super_block *disk_super; 13444306a974SAnand Jain bool new_device_added = false; 134536350e95SGu Jinxiang struct btrfs_device *device = NULL; 13468a4b83ccSChris Mason struct block_device *bdev; 134712659251SNaohiro Aota u64 bytenr, bytenr_orig; 134812659251SNaohiro Aota int ret; 13498a4b83ccSChris Mason 1350899f9307SDavid Sterba lockdep_assert_held(&uuid_mutex); 1351899f9307SDavid Sterba 13526f60cbd3SDavid Sterba /* 13536f60cbd3SDavid Sterba * we would like to check all the supers, but that would make 13546f60cbd3SDavid Sterba * a btrfs mount succeed after a mkfs from a different FS. 
13556f60cbd3SDavid Sterba * So, we need to add a special mount option to scan for 13566f60cbd3SDavid Sterba * later supers, using BTRFS_SUPER_MIRROR_MAX instead 13576f60cbd3SDavid Sterba */ 1358d4d77629STejun Heo flags |= FMODE_EXCL; 13596f60cbd3SDavid Sterba 13606f60cbd3SDavid Sterba bdev = blkdev_get_by_path(path, flags, holder); 1361b6ed73bcSAnand Jain if (IS_ERR(bdev)) 136236350e95SGu Jinxiang return ERR_CAST(bdev); 13636f60cbd3SDavid Sterba 136412659251SNaohiro Aota bytenr_orig = btrfs_sb_offset(0); 136512659251SNaohiro Aota ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr); 136612659251SNaohiro Aota if (ret) 136712659251SNaohiro Aota return ERR_PTR(ret); 136812659251SNaohiro Aota 136912659251SNaohiro Aota disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig); 1370b335eab8SNikolay Borisov if (IS_ERR(disk_super)) { 1371b335eab8SNikolay Borisov device = ERR_CAST(disk_super); 13726f60cbd3SDavid Sterba goto error_bdev_put; 137305a5c55dSAnand Jain } 13746f60cbd3SDavid Sterba 13754306a974SAnand Jain device = device_list_add(path, disk_super, &new_device_added); 137636350e95SGu Jinxiang if (!IS_ERR(device)) { 13774306a974SAnand Jain if (new_device_added) 13784306a974SAnand Jain btrfs_free_stale_devices(path, device); 13794306a974SAnand Jain } 13806f60cbd3SDavid Sterba 13818f32380dSJohannes Thumshirn btrfs_release_disk_super(disk_super); 13826f60cbd3SDavid Sterba 13836f60cbd3SDavid Sterba error_bdev_put: 1384d4d77629STejun Heo blkdev_put(bdev, flags); 1385b6ed73bcSAnand Jain 138636350e95SGu Jinxiang return device; 13878a4b83ccSChris Mason } 13880b86a832SChris Mason 1389c152b63eSFilipe Manana /* 13901c11b63eSJeff Mahoney * Try to find a chunk that intersects [start, start + len] range and when one 13911c11b63eSJeff Mahoney * such is found, record the end of it in *start 1392c152b63eSFilipe Manana */ 13931c11b63eSJeff Mahoney static bool contains_pending_extent(struct btrfs_device *device, u64 *start, 13941c11b63eSJeff Mahoney u64 len) 13951c11b63eSJeff 
Mahoney { 13961c11b63eSJeff Mahoney u64 physical_start, physical_end; 13976df9a95eSJosef Bacik 13981c11b63eSJeff Mahoney lockdep_assert_held(&device->fs_info->chunk_mutex); 13991c11b63eSJeff Mahoney 14001c11b63eSJeff Mahoney if (!find_first_extent_bit(&device->alloc_state, *start, 14011c11b63eSJeff Mahoney &physical_start, &physical_end, 14021c11b63eSJeff Mahoney CHUNK_ALLOCATED, NULL)) { 14031c11b63eSJeff Mahoney 14041c11b63eSJeff Mahoney if (in_range(physical_start, *start, len) || 14051c11b63eSJeff Mahoney in_range(*start, physical_start, 14061c11b63eSJeff Mahoney physical_end - physical_start)) { 14071c11b63eSJeff Mahoney *start = physical_end + 1; 14081c11b63eSJeff Mahoney return true; 14091c11b63eSJeff Mahoney } 14101c11b63eSJeff Mahoney } 14111c11b63eSJeff Mahoney return false; 14126df9a95eSJosef Bacik } 14136df9a95eSJosef Bacik 14143b4ffa40SNaohiro Aota static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) 14153b4ffa40SNaohiro Aota { 14163b4ffa40SNaohiro Aota switch (device->fs_devices->chunk_alloc_policy) { 14173b4ffa40SNaohiro Aota case BTRFS_CHUNK_ALLOC_REGULAR: 14183b4ffa40SNaohiro Aota /* 14193b4ffa40SNaohiro Aota * We don't want to overwrite the superblock on the drive nor 14203b4ffa40SNaohiro Aota * any area used by the boot loader (grub for example), so we 14213b4ffa40SNaohiro Aota * make sure to start at an offset of at least 1MB. 14223b4ffa40SNaohiro Aota */ 14233b4ffa40SNaohiro Aota return max_t(u64, start, SZ_1M); 14241cd6121fSNaohiro Aota case BTRFS_CHUNK_ALLOC_ZONED: 14251cd6121fSNaohiro Aota /* 14261cd6121fSNaohiro Aota * We don't care about the starting region like regular 14271cd6121fSNaohiro Aota * allocator, because we anyway use/reserve the first two zones 14281cd6121fSNaohiro Aota * for superblock logging. 
14291cd6121fSNaohiro Aota */ 14301cd6121fSNaohiro Aota return ALIGN(start, device->zone_info->zone_size); 14313b4ffa40SNaohiro Aota default: 14323b4ffa40SNaohiro Aota BUG(); 14333b4ffa40SNaohiro Aota } 14343b4ffa40SNaohiro Aota } 14353b4ffa40SNaohiro Aota 14361cd6121fSNaohiro Aota static bool dev_extent_hole_check_zoned(struct btrfs_device *device, 14371cd6121fSNaohiro Aota u64 *hole_start, u64 *hole_size, 14381cd6121fSNaohiro Aota u64 num_bytes) 14391cd6121fSNaohiro Aota { 14401cd6121fSNaohiro Aota u64 zone_size = device->zone_info->zone_size; 14411cd6121fSNaohiro Aota u64 pos; 14421cd6121fSNaohiro Aota int ret; 14431cd6121fSNaohiro Aota bool changed = false; 14441cd6121fSNaohiro Aota 14451cd6121fSNaohiro Aota ASSERT(IS_ALIGNED(*hole_start, zone_size)); 14461cd6121fSNaohiro Aota 14471cd6121fSNaohiro Aota while (*hole_size > 0) { 14481cd6121fSNaohiro Aota pos = btrfs_find_allocatable_zones(device, *hole_start, 14491cd6121fSNaohiro Aota *hole_start + *hole_size, 14501cd6121fSNaohiro Aota num_bytes); 14511cd6121fSNaohiro Aota if (pos != *hole_start) { 14521cd6121fSNaohiro Aota *hole_size = *hole_start + *hole_size - pos; 14531cd6121fSNaohiro Aota *hole_start = pos; 14541cd6121fSNaohiro Aota changed = true; 14551cd6121fSNaohiro Aota if (*hole_size < num_bytes) 14561cd6121fSNaohiro Aota break; 14571cd6121fSNaohiro Aota } 14581cd6121fSNaohiro Aota 14591cd6121fSNaohiro Aota ret = btrfs_ensure_empty_zones(device, pos, num_bytes); 14601cd6121fSNaohiro Aota 14611cd6121fSNaohiro Aota /* Range is ensured to be empty */ 14621cd6121fSNaohiro Aota if (!ret) 14631cd6121fSNaohiro Aota return changed; 14641cd6121fSNaohiro Aota 14651cd6121fSNaohiro Aota /* Given hole range was invalid (outside of device) */ 14661cd6121fSNaohiro Aota if (ret == -ERANGE) { 14671cd6121fSNaohiro Aota *hole_start += *hole_size; 1468d6f67afbSJohannes Thumshirn *hole_size = 0; 14697000babdSJiapeng Chong return true; 14701cd6121fSNaohiro Aota } 14711cd6121fSNaohiro Aota 14721cd6121fSNaohiro Aota *hole_start 
+= zone_size; 14731cd6121fSNaohiro Aota *hole_size -= zone_size; 14741cd6121fSNaohiro Aota changed = true; 14751cd6121fSNaohiro Aota } 14761cd6121fSNaohiro Aota 14771cd6121fSNaohiro Aota return changed; 14781cd6121fSNaohiro Aota } 14791cd6121fSNaohiro Aota 14803b4ffa40SNaohiro Aota /** 14813b4ffa40SNaohiro Aota * dev_extent_hole_check - check if specified hole is suitable for allocation 14823b4ffa40SNaohiro Aota * @device: the device which we have the hole 14833b4ffa40SNaohiro Aota * @hole_start: starting position of the hole 14843b4ffa40SNaohiro Aota * @hole_size: the size of the hole 14853b4ffa40SNaohiro Aota * @num_bytes: the size of the free space that we need 14863b4ffa40SNaohiro Aota * 14871cd6121fSNaohiro Aota * This function may modify @hole_start and @hole_size to reflect the suitable 14883b4ffa40SNaohiro Aota * position for allocation. Returns 1 if hole position is updated, 0 otherwise. 14893b4ffa40SNaohiro Aota */ 14903b4ffa40SNaohiro Aota static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, 14913b4ffa40SNaohiro Aota u64 *hole_size, u64 num_bytes) 14923b4ffa40SNaohiro Aota { 14933b4ffa40SNaohiro Aota bool changed = false; 14943b4ffa40SNaohiro Aota u64 hole_end = *hole_start + *hole_size; 14953b4ffa40SNaohiro Aota 14961cd6121fSNaohiro Aota for (;;) { 14973b4ffa40SNaohiro Aota /* 14983b4ffa40SNaohiro Aota * Check before we set max_hole_start, otherwise we could end up 14993b4ffa40SNaohiro Aota * sending back this offset anyway. 
15003b4ffa40SNaohiro Aota */ 15013b4ffa40SNaohiro Aota if (contains_pending_extent(device, hole_start, *hole_size)) { 15023b4ffa40SNaohiro Aota if (hole_end >= *hole_start) 15033b4ffa40SNaohiro Aota *hole_size = hole_end - *hole_start; 15043b4ffa40SNaohiro Aota else 15053b4ffa40SNaohiro Aota *hole_size = 0; 15063b4ffa40SNaohiro Aota changed = true; 15073b4ffa40SNaohiro Aota } 15083b4ffa40SNaohiro Aota 15093b4ffa40SNaohiro Aota switch (device->fs_devices->chunk_alloc_policy) { 15103b4ffa40SNaohiro Aota case BTRFS_CHUNK_ALLOC_REGULAR: 15113b4ffa40SNaohiro Aota /* No extra check */ 15123b4ffa40SNaohiro Aota break; 15131cd6121fSNaohiro Aota case BTRFS_CHUNK_ALLOC_ZONED: 15141cd6121fSNaohiro Aota if (dev_extent_hole_check_zoned(device, hole_start, 15151cd6121fSNaohiro Aota hole_size, num_bytes)) { 15161cd6121fSNaohiro Aota changed = true; 15171cd6121fSNaohiro Aota /* 15181cd6121fSNaohiro Aota * The changed hole can contain pending extent. 15191cd6121fSNaohiro Aota * Loop again to check that. 15201cd6121fSNaohiro Aota */ 15211cd6121fSNaohiro Aota continue; 15221cd6121fSNaohiro Aota } 15231cd6121fSNaohiro Aota break; 15243b4ffa40SNaohiro Aota default: 15253b4ffa40SNaohiro Aota BUG(); 15263b4ffa40SNaohiro Aota } 15273b4ffa40SNaohiro Aota 15281cd6121fSNaohiro Aota break; 15291cd6121fSNaohiro Aota } 15301cd6121fSNaohiro Aota 15313b4ffa40SNaohiro Aota return changed; 15323b4ffa40SNaohiro Aota } 15336df9a95eSJosef Bacik 15340b86a832SChris Mason /* 1535499f377fSJeff Mahoney * find_free_dev_extent_start - find free space in the specified device 15367bfc837dSMiao Xie * @device: the device which we search the free space in 15377bfc837dSMiao Xie * @num_bytes: the size of the free space that we need 1538499f377fSJeff Mahoney * @search_start: the position from which to begin the search 15397bfc837dSMiao Xie * @start: store the start of the free space. 1540499f377fSJeff Mahoney * @len: the size of the free space. 
that we find, or the size 1541499f377fSJeff Mahoney * of the max free space if we don't find suitable free space 15427bfc837dSMiao Xie * 15430b86a832SChris Mason * this uses a pretty simple search, the expectation is that it is 15440b86a832SChris Mason * called very infrequently and that a given device has a small number 15450b86a832SChris Mason * of extents 15467bfc837dSMiao Xie * 15477bfc837dSMiao Xie * @start is used to store the start of the free space if we find. But if we 15487bfc837dSMiao Xie * don't find suitable free space, it will be used to store the start position 15497bfc837dSMiao Xie * of the max free space. 15507bfc837dSMiao Xie * 15517bfc837dSMiao Xie * @len is used to store the size of the free space that we find. 15527bfc837dSMiao Xie * But if we don't find suitable free space, it is used to store the size of 15537bfc837dSMiao Xie * the max free space. 1554135da976SQu Wenruo * 1555135da976SQu Wenruo * NOTE: This function will search *commit* root of device tree, and does extra 1556135da976SQu Wenruo * check to ensure dev extents are not double allocated. 1557135da976SQu Wenruo * This makes the function safe to allocate dev extents but may not report 1558135da976SQu Wenruo * correct usable device space, as device extent freed in current transaction 15591a9fd417SDavid Sterba * is not reported as available. 
15600b86a832SChris Mason */ 15619e3246a5SQu Wenruo static int find_free_dev_extent_start(struct btrfs_device *device, 15629e3246a5SQu Wenruo u64 num_bytes, u64 search_start, u64 *start, 15639e3246a5SQu Wenruo u64 *len) 15640b86a832SChris Mason { 15650b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = device->fs_info; 15660b246afaSJeff Mahoney struct btrfs_root *root = fs_info->dev_root; 15670b86a832SChris Mason struct btrfs_key key; 15687bfc837dSMiao Xie struct btrfs_dev_extent *dev_extent; 15692b82032cSYan Zheng struct btrfs_path *path; 15707bfc837dSMiao Xie u64 hole_size; 15717bfc837dSMiao Xie u64 max_hole_start; 15727bfc837dSMiao Xie u64 max_hole_size; 15737bfc837dSMiao Xie u64 extent_end; 15740b86a832SChris Mason u64 search_end = device->total_bytes; 15750b86a832SChris Mason int ret; 15767bfc837dSMiao Xie int slot; 15770b86a832SChris Mason struct extent_buffer *l; 15788cdc7c5bSFilipe Manana 15793b4ffa40SNaohiro Aota search_start = dev_extent_search_start(device, search_start); 15800b86a832SChris Mason 15811cd6121fSNaohiro Aota WARN_ON(device->zone_info && 15821cd6121fSNaohiro Aota !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); 15831cd6121fSNaohiro Aota 15846df9a95eSJosef Bacik path = btrfs_alloc_path(); 15856df9a95eSJosef Bacik if (!path) 15866df9a95eSJosef Bacik return -ENOMEM; 1587f2ab7618SZhao Lei 15887bfc837dSMiao Xie max_hole_start = search_start; 15897bfc837dSMiao Xie max_hole_size = 0; 15907bfc837dSMiao Xie 1591f2ab7618SZhao Lei again: 1592401e29c1SAnand Jain if (search_start >= search_end || 1593401e29c1SAnand Jain test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 15947bfc837dSMiao Xie ret = -ENOSPC; 15956df9a95eSJosef Bacik goto out; 15967bfc837dSMiao Xie } 15977bfc837dSMiao Xie 1598e4058b54SDavid Sterba path->reada = READA_FORWARD; 15996df9a95eSJosef Bacik path->search_commit_root = 1; 16006df9a95eSJosef Bacik path->skip_locking = 1; 16017bfc837dSMiao Xie 16020b86a832SChris Mason key.objectid = device->devid; 16030b86a832SChris 
Mason key.offset = search_start; 16040b86a832SChris Mason key.type = BTRFS_DEV_EXTENT_KEY; 16057bfc837dSMiao Xie 16060ff40a91SMarcos Paulo de Souza ret = btrfs_search_backwards(root, &key, path); 16070b86a832SChris Mason if (ret < 0) 16087bfc837dSMiao Xie goto out; 16097bfc837dSMiao Xie 16100b86a832SChris Mason while (1) { 16110b86a832SChris Mason l = path->nodes[0]; 16120b86a832SChris Mason slot = path->slots[0]; 16130b86a832SChris Mason if (slot >= btrfs_header_nritems(l)) { 16140b86a832SChris Mason ret = btrfs_next_leaf(root, path); 16150b86a832SChris Mason if (ret == 0) 16160b86a832SChris Mason continue; 16170b86a832SChris Mason if (ret < 0) 16187bfc837dSMiao Xie goto out; 16197bfc837dSMiao Xie 16207bfc837dSMiao Xie break; 16210b86a832SChris Mason } 16220b86a832SChris Mason btrfs_item_key_to_cpu(l, &key, slot); 16230b86a832SChris Mason 16240b86a832SChris Mason if (key.objectid < device->devid) 16250b86a832SChris Mason goto next; 16260b86a832SChris Mason 16270b86a832SChris Mason if (key.objectid > device->devid) 16287bfc837dSMiao Xie break; 16290b86a832SChris Mason 1630962a298fSDavid Sterba if (key.type != BTRFS_DEV_EXTENT_KEY) 16310b86a832SChris Mason goto next; 16320b86a832SChris Mason 16337bfc837dSMiao Xie if (key.offset > search_start) { 16347bfc837dSMiao Xie hole_size = key.offset - search_start; 16353b4ffa40SNaohiro Aota dev_extent_hole_check(device, &search_start, &hole_size, 16363b4ffa40SNaohiro Aota num_bytes); 16376df9a95eSJosef Bacik 16387bfc837dSMiao Xie if (hole_size > max_hole_size) { 16397bfc837dSMiao Xie max_hole_start = search_start; 16407bfc837dSMiao Xie max_hole_size = hole_size; 16417bfc837dSMiao Xie } 16427bfc837dSMiao Xie 16437bfc837dSMiao Xie /* 16447bfc837dSMiao Xie * If this free space is greater than which we need, 16457bfc837dSMiao Xie * it must be the max free space that we have found 16467bfc837dSMiao Xie * until now, so max_hole_start must point to the start 16477bfc837dSMiao Xie * of this free space and the length of this free 
space 16487bfc837dSMiao Xie * is stored in max_hole_size. Thus, we return 16497bfc837dSMiao Xie * max_hole_start and max_hole_size and go back to the 16507bfc837dSMiao Xie * caller. 16517bfc837dSMiao Xie */ 16527bfc837dSMiao Xie if (hole_size >= num_bytes) { 16537bfc837dSMiao Xie ret = 0; 16547bfc837dSMiao Xie goto out; 16557bfc837dSMiao Xie } 16567bfc837dSMiao Xie } 16577bfc837dSMiao Xie 16580b86a832SChris Mason dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 16597bfc837dSMiao Xie extent_end = key.offset + btrfs_dev_extent_length(l, 16607bfc837dSMiao Xie dev_extent); 16617bfc837dSMiao Xie if (extent_end > search_start) 16627bfc837dSMiao Xie search_start = extent_end; 16630b86a832SChris Mason next: 16640b86a832SChris Mason path->slots[0]++; 16650b86a832SChris Mason cond_resched(); 16660b86a832SChris Mason } 16670b86a832SChris Mason 166838c01b96Sliubo /* 166938c01b96Sliubo * At this point, search_start should be the end of 167038c01b96Sliubo * allocated dev extents, and when shrinking the device, 167138c01b96Sliubo * search_end may be smaller than search_start. 167238c01b96Sliubo */ 1673f2ab7618SZhao Lei if (search_end > search_start) { 16747bfc837dSMiao Xie hole_size = search_end - search_start; 16753b4ffa40SNaohiro Aota if (dev_extent_hole_check(device, &search_start, &hole_size, 16763b4ffa40SNaohiro Aota num_bytes)) { 1677f2ab7618SZhao Lei btrfs_release_path(path); 1678f2ab7618SZhao Lei goto again; 1679f2ab7618SZhao Lei } 1680f2ab7618SZhao Lei 16817bfc837dSMiao Xie if (hole_size > max_hole_size) { 16827bfc837dSMiao Xie max_hole_start = search_start; 16837bfc837dSMiao Xie max_hole_size = hole_size; 16840b86a832SChris Mason } 16856df9a95eSJosef Bacik } 16866df9a95eSJosef Bacik 16877bfc837dSMiao Xie /* See above. 
*/ 1688f2ab7618SZhao Lei if (max_hole_size < num_bytes) 16897bfc837dSMiao Xie ret = -ENOSPC; 16907bfc837dSMiao Xie else 16912b82032cSYan Zheng ret = 0; 16920b86a832SChris Mason 16937bfc837dSMiao Xie out: 16942b82032cSYan Zheng btrfs_free_path(path); 16957bfc837dSMiao Xie *start = max_hole_start; 1696b2117a39SMiao Xie if (len) 16977bfc837dSMiao Xie *len = max_hole_size; 16980b86a832SChris Mason return ret; 16990b86a832SChris Mason } 17000b86a832SChris Mason 170160dfdf25SNikolay Borisov int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 1702499f377fSJeff Mahoney u64 *start, u64 *len) 1703499f377fSJeff Mahoney { 1704499f377fSJeff Mahoney /* FIXME use last free of some kind */ 170560dfdf25SNikolay Borisov return find_free_dev_extent_start(device, num_bytes, 0, start, len); 1706499f377fSJeff Mahoney } 1707499f377fSJeff Mahoney 1708b2950863SChristoph Hellwig static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 17098f18cf13SChris Mason struct btrfs_device *device, 17102196d6e8SMiao Xie u64 start, u64 *dev_extent_len) 17118f18cf13SChris Mason { 17120b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = device->fs_info; 17130b246afaSJeff Mahoney struct btrfs_root *root = fs_info->dev_root; 17148f18cf13SChris Mason int ret; 17158f18cf13SChris Mason struct btrfs_path *path; 17168f18cf13SChris Mason struct btrfs_key key; 1717a061fc8dSChris Mason struct btrfs_key found_key; 1718a061fc8dSChris Mason struct extent_buffer *leaf = NULL; 1719a061fc8dSChris Mason struct btrfs_dev_extent *extent = NULL; 17208f18cf13SChris Mason 17218f18cf13SChris Mason path = btrfs_alloc_path(); 17228f18cf13SChris Mason if (!path) 17238f18cf13SChris Mason return -ENOMEM; 17248f18cf13SChris Mason 17258f18cf13SChris Mason key.objectid = device->devid; 17268f18cf13SChris Mason key.offset = start; 17278f18cf13SChris Mason key.type = BTRFS_DEV_EXTENT_KEY; 1728924cd8fbSMiao Xie again: 17298f18cf13SChris Mason ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 
1730a061fc8dSChris Mason if (ret > 0) { 1731a061fc8dSChris Mason ret = btrfs_previous_item(root, path, key.objectid, 1732a061fc8dSChris Mason BTRFS_DEV_EXTENT_KEY); 1733b0b802d7STsutomu Itoh if (ret) 1734b0b802d7STsutomu Itoh goto out; 1735a061fc8dSChris Mason leaf = path->nodes[0]; 1736a061fc8dSChris Mason btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1737a061fc8dSChris Mason extent = btrfs_item_ptr(leaf, path->slots[0], 1738a061fc8dSChris Mason struct btrfs_dev_extent); 1739a061fc8dSChris Mason BUG_ON(found_key.offset > start || found_key.offset + 1740a061fc8dSChris Mason btrfs_dev_extent_length(leaf, extent) < start); 1741924cd8fbSMiao Xie key = found_key; 1742924cd8fbSMiao Xie btrfs_release_path(path); 1743924cd8fbSMiao Xie goto again; 1744a061fc8dSChris Mason } else if (ret == 0) { 1745a061fc8dSChris Mason leaf = path->nodes[0]; 1746a061fc8dSChris Mason extent = btrfs_item_ptr(leaf, path->slots[0], 1747a061fc8dSChris Mason struct btrfs_dev_extent); 174879787eaaSJeff Mahoney } else { 174979787eaaSJeff Mahoney goto out; 1750a061fc8dSChris Mason } 17518f18cf13SChris Mason 17522196d6e8SMiao Xie *dev_extent_len = btrfs_dev_extent_length(leaf, extent); 17532196d6e8SMiao Xie 17548f18cf13SChris Mason ret = btrfs_del_item(trans, root, path); 175579bd3712SFilipe Manana if (ret == 0) 17563204d33cSJosef Bacik set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); 1757b0b802d7STsutomu Itoh out: 17588f18cf13SChris Mason btrfs_free_path(path); 17598f18cf13SChris Mason return ret; 17608f18cf13SChris Mason } 17618f18cf13SChris Mason 17626df9a95eSJosef Bacik static u64 find_next_chunk(struct btrfs_fs_info *fs_info) 17630b86a832SChris Mason { 17646df9a95eSJosef Bacik struct extent_map_tree *em_tree; 17656df9a95eSJosef Bacik struct extent_map *em; 17666df9a95eSJosef Bacik struct rb_node *n; 17676df9a95eSJosef Bacik u64 ret = 0; 17680b86a832SChris Mason 1769c8bf1b67SDavid Sterba em_tree = &fs_info->mapping_tree; 17706df9a95eSJosef Bacik 
read_lock(&em_tree->lock); 177107e1ce09SLiu Bo n = rb_last(&em_tree->map.rb_root); 17726df9a95eSJosef Bacik if (n) { 17736df9a95eSJosef Bacik em = rb_entry(n, struct extent_map, rb_node); 17746df9a95eSJosef Bacik ret = em->start + em->len; 1775e17cade2SChris Mason } 17766df9a95eSJosef Bacik read_unlock(&em_tree->lock); 17776df9a95eSJosef Bacik 17780b86a832SChris Mason return ret; 17790b86a832SChris Mason } 17800b86a832SChris Mason 178153f10659SIlya Dryomov static noinline int find_next_devid(struct btrfs_fs_info *fs_info, 178253f10659SIlya Dryomov u64 *devid_ret) 17830b86a832SChris Mason { 17840b86a832SChris Mason int ret; 17850b86a832SChris Mason struct btrfs_key key; 17860b86a832SChris Mason struct btrfs_key found_key; 17872b82032cSYan Zheng struct btrfs_path *path; 17882b82032cSYan Zheng 17892b82032cSYan Zheng path = btrfs_alloc_path(); 17902b82032cSYan Zheng if (!path) 17912b82032cSYan Zheng return -ENOMEM; 17920b86a832SChris Mason 17930b86a832SChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 17940b86a832SChris Mason key.type = BTRFS_DEV_ITEM_KEY; 17950b86a832SChris Mason key.offset = (u64)-1; 17960b86a832SChris Mason 179753f10659SIlya Dryomov ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); 17980b86a832SChris Mason if (ret < 0) 17990b86a832SChris Mason goto error; 18000b86a832SChris Mason 1801a06dee4dSAnand Jain if (ret == 0) { 1802a06dee4dSAnand Jain /* Corruption */ 1803a06dee4dSAnand Jain btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); 1804a06dee4dSAnand Jain ret = -EUCLEAN; 1805a06dee4dSAnand Jain goto error; 1806a06dee4dSAnand Jain } 18070b86a832SChris Mason 180853f10659SIlya Dryomov ret = btrfs_previous_item(fs_info->chunk_root, path, 180953f10659SIlya Dryomov BTRFS_DEV_ITEMS_OBJECTID, 18100b86a832SChris Mason BTRFS_DEV_ITEM_KEY); 18110b86a832SChris Mason if (ret) { 181253f10659SIlya Dryomov *devid_ret = 1; 18130b86a832SChris Mason } else { 18140b86a832SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &found_key, 
18150b86a832SChris Mason path->slots[0]); 181653f10659SIlya Dryomov *devid_ret = found_key.offset + 1; 18170b86a832SChris Mason } 18180b86a832SChris Mason ret = 0; 18190b86a832SChris Mason error: 18202b82032cSYan Zheng btrfs_free_path(path); 18210b86a832SChris Mason return ret; 18220b86a832SChris Mason } 18230b86a832SChris Mason 18240b86a832SChris Mason /* 18250b86a832SChris Mason * the device information is stored in the chunk root 18260b86a832SChris Mason * the btrfs_device struct should be fully filled in 18270b86a832SChris Mason */ 1828c74a0b02SAnand Jain static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, 18290b86a832SChris Mason struct btrfs_device *device) 18300b86a832SChris Mason { 18310b86a832SChris Mason int ret; 18320b86a832SChris Mason struct btrfs_path *path; 18330b86a832SChris Mason struct btrfs_dev_item *dev_item; 18340b86a832SChris Mason struct extent_buffer *leaf; 18350b86a832SChris Mason struct btrfs_key key; 18360b86a832SChris Mason unsigned long ptr; 18370b86a832SChris Mason 18380b86a832SChris Mason path = btrfs_alloc_path(); 18390b86a832SChris Mason if (!path) 18400b86a832SChris Mason return -ENOMEM; 18410b86a832SChris Mason 18420b86a832SChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 18430b86a832SChris Mason key.type = BTRFS_DEV_ITEM_KEY; 18442b82032cSYan Zheng key.offset = device->devid; 18450b86a832SChris Mason 18468e87e856SNikolay Borisov ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, 18478e87e856SNikolay Borisov &key, sizeof(*dev_item)); 18480b86a832SChris Mason if (ret) 18490b86a832SChris Mason goto out; 18500b86a832SChris Mason 18510b86a832SChris Mason leaf = path->nodes[0]; 18520b86a832SChris Mason dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 18530b86a832SChris Mason 18540b86a832SChris Mason btrfs_set_device_id(leaf, dev_item, device->devid); 18552b82032cSYan Zheng btrfs_set_device_generation(leaf, dev_item, 0); 18560b86a832SChris Mason btrfs_set_device_type(leaf, 
dev_item, device->type); 18570b86a832SChris Mason btrfs_set_device_io_align(leaf, dev_item, device->io_align); 18580b86a832SChris Mason btrfs_set_device_io_width(leaf, dev_item, device->io_width); 18590b86a832SChris Mason btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 18607cc8e58dSMiao Xie btrfs_set_device_total_bytes(leaf, dev_item, 18617cc8e58dSMiao Xie btrfs_device_get_disk_total_bytes(device)); 18627cc8e58dSMiao Xie btrfs_set_device_bytes_used(leaf, dev_item, 18637cc8e58dSMiao Xie btrfs_device_get_bytes_used(device)); 1864e17cade2SChris Mason btrfs_set_device_group(leaf, dev_item, 0); 1865e17cade2SChris Mason btrfs_set_device_seek_speed(leaf, dev_item, 0); 1866e17cade2SChris Mason btrfs_set_device_bandwidth(leaf, dev_item, 0); 1867c3027eb5SChris Mason btrfs_set_device_start_offset(leaf, dev_item, 0); 18680b86a832SChris Mason 1869410ba3a2SGeert Uytterhoeven ptr = btrfs_device_uuid(dev_item); 1870e17cade2SChris Mason write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 18711473b24eSGeert Uytterhoeven ptr = btrfs_device_fsid(dev_item); 1872de37aa51SNikolay Borisov write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, 1873de37aa51SNikolay Borisov ptr, BTRFS_FSID_SIZE); 18740b86a832SChris Mason btrfs_mark_buffer_dirty(leaf); 18750b86a832SChris Mason 18762b82032cSYan Zheng ret = 0; 18770b86a832SChris Mason out: 18780b86a832SChris Mason btrfs_free_path(path); 18790b86a832SChris Mason return ret; 18800b86a832SChris Mason } 18818f18cf13SChris Mason 18825a1972bdSQu Wenruo /* 18835a1972bdSQu Wenruo * Function to update ctime/mtime for a given device path. 18845a1972bdSQu Wenruo * Mainly used for ctime/mtime based probe like libblkid. 
18855a1972bdSQu Wenruo */ 18868f96a5bfSJosef Bacik static void update_dev_time(struct block_device *bdev) 18875a1972bdSQu Wenruo { 18888f96a5bfSJosef Bacik struct inode *inode = bdev->bd_inode; 18898f96a5bfSJosef Bacik struct timespec64 now; 18905a1972bdSQu Wenruo 18918f96a5bfSJosef Bacik /* Shouldn't happen but just in case. */ 18928f96a5bfSJosef Bacik if (!inode) 18935a1972bdSQu Wenruo return; 18948f96a5bfSJosef Bacik 18958f96a5bfSJosef Bacik now = current_time(inode); 18968f96a5bfSJosef Bacik generic_update_time(inode, &now, S_MTIME | S_CTIME); 18975a1972bdSQu Wenruo } 18985a1972bdSQu Wenruo 1899f331a952SDavid Sterba static int btrfs_rm_dev_item(struct btrfs_device *device) 1900a061fc8dSChris Mason { 1901f331a952SDavid Sterba struct btrfs_root *root = device->fs_info->chunk_root; 1902a061fc8dSChris Mason int ret; 1903a061fc8dSChris Mason struct btrfs_path *path; 1904a061fc8dSChris Mason struct btrfs_key key; 1905a061fc8dSChris Mason struct btrfs_trans_handle *trans; 1906a061fc8dSChris Mason 1907a061fc8dSChris Mason path = btrfs_alloc_path(); 1908a061fc8dSChris Mason if (!path) 1909a061fc8dSChris Mason return -ENOMEM; 1910a061fc8dSChris Mason 1911a22285a6SYan, Zheng trans = btrfs_start_transaction(root, 0); 191298d5dc13STsutomu Itoh if (IS_ERR(trans)) { 191398d5dc13STsutomu Itoh btrfs_free_path(path); 191498d5dc13STsutomu Itoh return PTR_ERR(trans); 191598d5dc13STsutomu Itoh } 1916a061fc8dSChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1917a061fc8dSChris Mason key.type = BTRFS_DEV_ITEM_KEY; 1918a061fc8dSChris Mason key.offset = device->devid; 1919a061fc8dSChris Mason 1920a061fc8dSChris Mason ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 19215e9f2ad5SNikolay Borisov if (ret) { 19225e9f2ad5SNikolay Borisov if (ret > 0) 1923a061fc8dSChris Mason ret = -ENOENT; 19245e9f2ad5SNikolay Borisov btrfs_abort_transaction(trans, ret); 19255e9f2ad5SNikolay Borisov btrfs_end_transaction(trans); 1926a061fc8dSChris Mason goto out; 1927a061fc8dSChris Mason } 
1928a061fc8dSChris Mason 1929a061fc8dSChris Mason ret = btrfs_del_item(trans, root, path); 19305e9f2ad5SNikolay Borisov if (ret) { 19315e9f2ad5SNikolay Borisov btrfs_abort_transaction(trans, ret); 19325e9f2ad5SNikolay Borisov btrfs_end_transaction(trans); 19335e9f2ad5SNikolay Borisov } 19345e9f2ad5SNikolay Borisov 1935a061fc8dSChris Mason out: 1936a061fc8dSChris Mason btrfs_free_path(path); 19375e9f2ad5SNikolay Borisov if (!ret) 19385e9f2ad5SNikolay Borisov ret = btrfs_commit_transaction(trans); 1939a061fc8dSChris Mason return ret; 1940a061fc8dSChris Mason } 1941a061fc8dSChris Mason 19423cc31a0dSDavid Sterba /* 19433cc31a0dSDavid Sterba * Verify that @num_devices satisfies the RAID profile constraints in the whole 19443cc31a0dSDavid Sterba * filesystem. It's up to the caller to adjust that number regarding eg. device 19453cc31a0dSDavid Sterba * replace. 19463cc31a0dSDavid Sterba */ 19473cc31a0dSDavid Sterba static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, 19483cc31a0dSDavid Sterba u64 num_devices) 1949a061fc8dSChris Mason { 1950a061fc8dSChris Mason u64 all_avail; 1951de98ced9SMiao Xie unsigned seq; 1952418775a2SDavid Sterba int i; 1953a061fc8dSChris Mason 1954de98ced9SMiao Xie do { 1955bd45ffbcSAnand Jain seq = read_seqbegin(&fs_info->profiles_lock); 1956de98ced9SMiao Xie 1957bd45ffbcSAnand Jain all_avail = fs_info->avail_data_alloc_bits | 1958bd45ffbcSAnand Jain fs_info->avail_system_alloc_bits | 1959bd45ffbcSAnand Jain fs_info->avail_metadata_alloc_bits; 1960bd45ffbcSAnand Jain } while (read_seqretry(&fs_info->profiles_lock, seq)); 1961f1fa7f26SAnand Jain 1962418775a2SDavid Sterba for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 196341a6e891SAnand Jain if (!(all_avail & btrfs_raid_array[i].bg_flag)) 1964418775a2SDavid Sterba continue; 1965a061fc8dSChris Mason 1966efc222f8SAnand Jain if (num_devices < btrfs_raid_array[i].devs_min) 1967efc222f8SAnand Jain return btrfs_raid_array[i].mindev_error; 1968bd45ffbcSAnand Jain } 1969bd45ffbcSAnand Jain 
1970bd45ffbcSAnand Jain return 0; 1971f1fa7f26SAnand Jain } 1972f1fa7f26SAnand Jain 1973c9162bdfSOmar Sandoval static struct btrfs_device * btrfs_find_next_active_device( 1974c9162bdfSOmar Sandoval struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) 197588acff64SAnand Jain { 197688acff64SAnand Jain struct btrfs_device *next_device; 197788acff64SAnand Jain 197888acff64SAnand Jain list_for_each_entry(next_device, &fs_devs->devices, dev_list) { 197988acff64SAnand Jain if (next_device != device && 1980e6e674bdSAnand Jain !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) 1981e6e674bdSAnand Jain && next_device->bdev) 198288acff64SAnand Jain return next_device; 198388acff64SAnand Jain } 198488acff64SAnand Jain 198588acff64SAnand Jain return NULL; 198688acff64SAnand Jain } 198788acff64SAnand Jain 198888acff64SAnand Jain /* 1989d24fa5c1SAnand Jain * Helper function to check if the given device is part of s_bdev / latest_dev 199088acff64SAnand Jain * and replace it with the provided or the next active device, in the context 199188acff64SAnand Jain * where this function called, there should be always be another device (or 199288acff64SAnand Jain * this_dev) which is active. 
199388acff64SAnand Jain */ 1994b105e927SDavid Sterba void __cold btrfs_assign_next_active_device(struct btrfs_device *device, 1995e493e8f9SAnand Jain struct btrfs_device *next_device) 199688acff64SAnand Jain { 1997d6507cf1SNikolay Borisov struct btrfs_fs_info *fs_info = device->fs_info; 199888acff64SAnand Jain 1999e493e8f9SAnand Jain if (!next_device) 200088acff64SAnand Jain next_device = btrfs_find_next_active_device(fs_info->fs_devices, 200188acff64SAnand Jain device); 200288acff64SAnand Jain ASSERT(next_device); 200388acff64SAnand Jain 200488acff64SAnand Jain if (fs_info->sb->s_bdev && 200588acff64SAnand Jain (fs_info->sb->s_bdev == device->bdev)) 200688acff64SAnand Jain fs_info->sb->s_bdev = next_device->bdev; 200788acff64SAnand Jain 2008d24fa5c1SAnand Jain if (fs_info->fs_devices->latest_dev->bdev == device->bdev) 2009d24fa5c1SAnand Jain fs_info->fs_devices->latest_dev = next_device; 201088acff64SAnand Jain } 201188acff64SAnand Jain 20121da73967SAnand Jain /* 20131da73967SAnand Jain * Return btrfs_fs_devices::num_devices excluding the device that's being 20141da73967SAnand Jain * currently replaced. 
20151da73967SAnand Jain */ 20161da73967SAnand Jain static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) 20171da73967SAnand Jain { 20181da73967SAnand Jain u64 num_devices = fs_info->fs_devices->num_devices; 20191da73967SAnand Jain 2020cb5583ddSDavid Sterba down_read(&fs_info->dev_replace.rwsem); 20211da73967SAnand Jain if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 20221da73967SAnand Jain ASSERT(num_devices > 1); 20231da73967SAnand Jain num_devices--; 20241da73967SAnand Jain } 2025cb5583ddSDavid Sterba up_read(&fs_info->dev_replace.rwsem); 20261da73967SAnand Jain 20271da73967SAnand Jain return num_devices; 20281da73967SAnand Jain } 20291da73967SAnand Jain 2030313b0858SJosef Bacik void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, 20318f32380dSJohannes Thumshirn struct block_device *bdev, 20326fbceb9fSJohannes Thumshirn const char *device_path) 20336fbceb9fSJohannes Thumshirn { 20346fbceb9fSJohannes Thumshirn struct btrfs_super_block *disk_super; 20356fbceb9fSJohannes Thumshirn int copy_num; 20366fbceb9fSJohannes Thumshirn 20376fbceb9fSJohannes Thumshirn if (!bdev) 20386fbceb9fSJohannes Thumshirn return; 20396fbceb9fSJohannes Thumshirn 20406fbceb9fSJohannes Thumshirn for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { 20418f32380dSJohannes Thumshirn struct page *page; 20428f32380dSJohannes Thumshirn int ret; 20438f32380dSJohannes Thumshirn 20448f32380dSJohannes Thumshirn disk_super = btrfs_read_dev_one_super(bdev, copy_num); 20458f32380dSJohannes Thumshirn if (IS_ERR(disk_super)) 20466fbceb9fSJohannes Thumshirn continue; 20476fbceb9fSJohannes Thumshirn 204812659251SNaohiro Aota if (bdev_is_zoned(bdev)) { 204912659251SNaohiro Aota btrfs_reset_sb_log_zones(bdev, copy_num); 205012659251SNaohiro Aota continue; 205112659251SNaohiro Aota } 205212659251SNaohiro Aota 20536fbceb9fSJohannes Thumshirn memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 20548f32380dSJohannes Thumshirn 20558f32380dSJohannes Thumshirn page = 
virt_to_page(disk_super); 20568f32380dSJohannes Thumshirn set_page_dirty(page); 20578f32380dSJohannes Thumshirn lock_page(page); 20588f32380dSJohannes Thumshirn /* write_on_page() unlocks the page */ 20598f32380dSJohannes Thumshirn ret = write_one_page(page); 20608f32380dSJohannes Thumshirn if (ret) 20618f32380dSJohannes Thumshirn btrfs_warn(fs_info, 20628f32380dSJohannes Thumshirn "error clearing superblock number %d (%d)", 20638f32380dSJohannes Thumshirn copy_num, ret); 20648f32380dSJohannes Thumshirn btrfs_release_disk_super(disk_super); 20658f32380dSJohannes Thumshirn 20666fbceb9fSJohannes Thumshirn } 20676fbceb9fSJohannes Thumshirn 20686fbceb9fSJohannes Thumshirn /* Notify udev that device has changed */ 20696fbceb9fSJohannes Thumshirn btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 20706fbceb9fSJohannes Thumshirn 20716fbceb9fSJohannes Thumshirn /* Update ctime/mtime for device path for libblkid */ 20728f96a5bfSJosef Bacik update_dev_time(bdev); 20736fbceb9fSJohannes Thumshirn } 20746fbceb9fSJohannes Thumshirn 2075da353f6bSDavid Sterba int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, 20763fa421deSJosef Bacik u64 devid, struct block_device **bdev, fmode_t *mode) 2077f1fa7f26SAnand Jain { 2078f1fa7f26SAnand Jain struct btrfs_device *device; 2079f1fa7f26SAnand Jain struct btrfs_fs_devices *cur_devices; 2080b5185197SAnand Jain struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2081f1fa7f26SAnand Jain u64 num_devices; 2082f1fa7f26SAnand Jain int ret = 0; 2083f1fa7f26SAnand Jain 20848ef9dc0fSJosef Bacik /* 20858ef9dc0fSJosef Bacik * The device list in fs_devices is accessed without locks (neither 20868ef9dc0fSJosef Bacik * uuid_mutex nor device_list_mutex) as it won't change on a mounted 20878ef9dc0fSJosef Bacik * filesystem and another device rm cannot run. 
20888ef9dc0fSJosef Bacik */ 20891da73967SAnand Jain num_devices = btrfs_num_devices(fs_info); 2090a061fc8dSChris Mason 20910b246afaSJeff Mahoney ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 2092beaf8ab3SStefan Behrens if (ret) 2093a061fc8dSChris Mason goto out; 2094f1fa7f26SAnand Jain 2095a27a94c2SNikolay Borisov device = btrfs_find_device_by_devspec(fs_info, devid, device_path); 2096a27a94c2SNikolay Borisov 2097a27a94c2SNikolay Borisov if (IS_ERR(device)) { 2098a27a94c2SNikolay Borisov if (PTR_ERR(device) == -ENOENT && 2099e4571b8cSQu Wenruo device_path && strcmp(device_path, "missing") == 0) 2100a27a94c2SNikolay Borisov ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2101a27a94c2SNikolay Borisov else 2102a27a94c2SNikolay Borisov ret = PTR_ERR(device); 2103a061fc8dSChris Mason goto out; 2104a27a94c2SNikolay Borisov } 21052b82032cSYan Zheng 2106eede2bf3SOmar Sandoval if (btrfs_pinned_by_swapfile(fs_info, device)) { 2107eede2bf3SOmar Sandoval btrfs_warn_in_rcu(fs_info, 2108eede2bf3SOmar Sandoval "cannot remove device %s (devid %llu) due to active swapfile", 2109eede2bf3SOmar Sandoval rcu_str_deref(device->name), device->devid); 2110eede2bf3SOmar Sandoval ret = -ETXTBSY; 2111eede2bf3SOmar Sandoval goto out; 2112eede2bf3SOmar Sandoval } 2113eede2bf3SOmar Sandoval 2114401e29c1SAnand Jain if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2115183860f6SAnand Jain ret = BTRFS_ERROR_DEV_TGT_REPLACE; 211624fc572fSAnand Jain goto out; 211763a212abSStefan Behrens } 211863a212abSStefan Behrens 2119ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 2120ebbede42SAnand Jain fs_info->fs_devices->rw_devices == 1) { 2121183860f6SAnand Jain ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; 212224fc572fSAnand Jain goto out; 21232b82032cSYan Zheng } 21242b82032cSYan Zheng 2125ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 212634441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 21272b82032cSYan Zheng 
list_del_init(&device->dev_alloc_list); 2128c3929c36SMiao Xie device->fs_devices->rw_devices--; 212934441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 21302b82032cSYan Zheng } 2131a061fc8dSChris Mason 2132a061fc8dSChris Mason ret = btrfs_shrink_device(device, 0); 213366d204a1SFilipe Manana if (!ret) 213466d204a1SFilipe Manana btrfs_reada_remove_dev(device); 2135a061fc8dSChris Mason if (ret) 21369b3517e9SIlya Dryomov goto error_undo; 2137a061fc8dSChris Mason 213863a212abSStefan Behrens /* 213963a212abSStefan Behrens * TODO: the superblock still includes this device in its num_devices 214063a212abSStefan Behrens * counter although write_all_supers() is not locked out. This 214163a212abSStefan Behrens * could give a filesystem state which requires a degraded mount. 214263a212abSStefan Behrens */ 2143f331a952SDavid Sterba ret = btrfs_rm_dev_item(device); 2144a061fc8dSChris Mason if (ret) 21459b3517e9SIlya Dryomov goto error_undo; 2146a061fc8dSChris Mason 2147e12c9621SAnand Jain clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2148163e97eeSDavid Sterba btrfs_scrub_cancel_dev(device); 2149e5e9a520SChris Mason 2150e5e9a520SChris Mason /* 2151e5e9a520SChris Mason * the device list mutex makes sure that we don't change 2152e5e9a520SChris Mason * the device list while someone else is writing out all 2153d7306801SFilipe David Borba Manana * the device supers. Whoever is writing all supers, should 2154d7306801SFilipe David Borba Manana * lock the device list mutex before getting the number of 2155d7306801SFilipe David Borba Manana * devices in the super block (super_copy). Conversely, 2156d7306801SFilipe David Borba Manana * whoever updates the number of devices in the super block 2157d7306801SFilipe David Borba Manana * (super_copy) should hold the device list mutex. 2158e5e9a520SChris Mason */ 21591f78160cSXiao Guangrong 216041a52a0fSAnand Jain /* 216141a52a0fSAnand Jain * In normal cases the cur_devices == fs_devices. 
But in case 216241a52a0fSAnand Jain * of deleting a seed device, the cur_devices should point to 21639675ea8cSSu Yue * its own fs_devices listed under the fs_devices->seed_list. 216441a52a0fSAnand Jain */ 21651f78160cSXiao Guangrong cur_devices = device->fs_devices; 2166b5185197SAnand Jain mutex_lock(&fs_devices->device_list_mutex); 21671f78160cSXiao Guangrong list_del_rcu(&device->dev_list); 2168e5e9a520SChris Mason 216941a52a0fSAnand Jain cur_devices->num_devices--; 217041a52a0fSAnand Jain cur_devices->total_devices--; 2171b4993e64SAnand Jain /* Update total_devices of the parent fs_devices if it's seed */ 2172b4993e64SAnand Jain if (cur_devices != fs_devices) 2173b4993e64SAnand Jain fs_devices->total_devices--; 21742b82032cSYan Zheng 2175e6e674bdSAnand Jain if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 217641a52a0fSAnand Jain cur_devices->missing_devices--; 2177cd02dca5SChris Mason 2178d6507cf1SNikolay Borisov btrfs_assign_next_active_device(device, NULL); 21792b82032cSYan Zheng 21800bfaa9c5SEric Sandeen if (device->bdev) { 218141a52a0fSAnand Jain cur_devices->open_devices--; 218299994cdeSAnand Jain /* remove sysfs entry */ 218353f8a74cSAnand Jain btrfs_sysfs_remove_device(device); 21840bfaa9c5SEric Sandeen } 218599994cdeSAnand Jain 21860b246afaSJeff Mahoney num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 21870b246afaSJeff Mahoney btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 2188b5185197SAnand Jain mutex_unlock(&fs_devices->device_list_mutex); 2189e4404d6eSYan Zheng 2190cea67ab9SJeff Mahoney /* 21913fa421deSJosef Bacik * At this point, the device is zero sized and detached from the 21923fa421deSJosef Bacik * devices list. All that's left is to zero out the old supers and 21933fa421deSJosef Bacik * free the device. 
21943fa421deSJosef Bacik * 21953fa421deSJosef Bacik * We cannot call btrfs_close_bdev() here because we're holding the sb 21963fa421deSJosef Bacik * write lock, and blkdev_put() will pull in the ->open_mutex on the 21973fa421deSJosef Bacik * block device and it's dependencies. Instead just flush the device 21983fa421deSJosef Bacik * and let the caller do the final blkdev_put. 2199cea67ab9SJeff Mahoney */ 22003fa421deSJosef Bacik if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 22018f32380dSJohannes Thumshirn btrfs_scratch_superblocks(fs_info, device->bdev, 22028f32380dSJohannes Thumshirn device->name->str); 22033fa421deSJosef Bacik if (device->bdev) { 22043fa421deSJosef Bacik sync_blockdev(device->bdev); 22053fa421deSJosef Bacik invalidate_bdev(device->bdev); 22063fa421deSJosef Bacik } 22073fa421deSJosef Bacik } 2208cea67ab9SJeff Mahoney 22093fa421deSJosef Bacik *bdev = device->bdev; 22103fa421deSJosef Bacik *mode = device->mode; 22118e75fd89SNikolay Borisov synchronize_rcu(); 22128e75fd89SNikolay Borisov btrfs_free_device(device); 2213cea67ab9SJeff Mahoney 2214*8b41393fSJosef Bacik /* 2215*8b41393fSJosef Bacik * This can happen if cur_devices is the private seed devices list. We 2216*8b41393fSJosef Bacik * cannot call close_fs_devices() here because it expects the uuid_mutex 2217*8b41393fSJosef Bacik * to be held, but in fact we don't need that for the private 2218*8b41393fSJosef Bacik * seed_devices, we can simply decrement cur_devices->opened and then 2219*8b41393fSJosef Bacik * remove it from our list and free the fs_devices. 
2220*8b41393fSJosef Bacik */ 22218e906945SAnand Jain if (cur_devices->num_devices == 0) { 2222944d3f9fSNikolay Borisov list_del_init(&cur_devices->seed_list); 2223*8b41393fSJosef Bacik ASSERT(cur_devices->opened == 1); 2224*8b41393fSJosef Bacik cur_devices->opened--; 22251f78160cSXiao Guangrong free_fs_devices(cur_devices); 22262b82032cSYan Zheng } 22272b82032cSYan Zheng 2228a061fc8dSChris Mason out: 2229a061fc8dSChris Mason return ret; 223024fc572fSAnand Jain 22319b3517e9SIlya Dryomov error_undo: 223266d204a1SFilipe Manana btrfs_reada_undo_remove_dev(device); 2233ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 223434441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 22359b3517e9SIlya Dryomov list_add(&device->dev_alloc_list, 2236b5185197SAnand Jain &fs_devices->alloc_list); 2237c3929c36SMiao Xie device->fs_devices->rw_devices++; 223834441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 22399b3517e9SIlya Dryomov } 224024fc572fSAnand Jain goto out; 2241a061fc8dSChris Mason } 2242a061fc8dSChris Mason 224368a9db5fSNikolay Borisov void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) 2244e93c89c1SStefan Behrens { 2245d51908ceSAnand Jain struct btrfs_fs_devices *fs_devices; 2246d51908ceSAnand Jain 224768a9db5fSNikolay Borisov lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex); 22481357272fSIlya Dryomov 224925e8e911SAnand Jain /* 225025e8e911SAnand Jain * in case of fs with no seed, srcdev->fs_devices will point 225125e8e911SAnand Jain * to fs_devices of fs_info. However when the dev being replaced is 225225e8e911SAnand Jain * a seed dev it will point to the seed's local fs_devices. In short 225325e8e911SAnand Jain * srcdev will have its correct fs_devices in both the cases. 
225425e8e911SAnand Jain */ 225525e8e911SAnand Jain fs_devices = srcdev->fs_devices; 2256d51908ceSAnand Jain 2257e93c89c1SStefan Behrens list_del_rcu(&srcdev->dev_list); 2258619c47f3SDavid Sterba list_del(&srcdev->dev_alloc_list); 2259d51908ceSAnand Jain fs_devices->num_devices--; 2260e6e674bdSAnand Jain if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) 2261d51908ceSAnand Jain fs_devices->missing_devices--; 2262e93c89c1SStefan Behrens 2263ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) 226482372bc8SMiao Xie fs_devices->rw_devices--; 22651357272fSIlya Dryomov 226682372bc8SMiao Xie if (srcdev->bdev) 226782372bc8SMiao Xie fs_devices->open_devices--; 2268084b6e7cSQu Wenruo } 2269084b6e7cSQu Wenruo 227065237ee3SDavid Sterba void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) 2271084b6e7cSQu Wenruo { 2272084b6e7cSQu Wenruo struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 227382372bc8SMiao Xie 2274a466c85eSJosef Bacik mutex_lock(&uuid_mutex); 2275a466c85eSJosef Bacik 227614238819SAnand Jain btrfs_close_bdev(srcdev); 22778e75fd89SNikolay Borisov synchronize_rcu(); 22788e75fd89SNikolay Borisov btrfs_free_device(srcdev); 227994d5f0c2SAnand Jain 228094d5f0c2SAnand Jain /* if this is no devs we rather delete the fs_devices */ 228194d5f0c2SAnand Jain if (!fs_devices->num_devices) { 22826dd38f81SAnand Jain /* 22836dd38f81SAnand Jain * On a mounted FS, num_devices can't be zero unless it's a 22846dd38f81SAnand Jain * seed. In case of a seed device being replaced, the replace 22856dd38f81SAnand Jain * target added to the sprout FS, so there will be no more 22866dd38f81SAnand Jain * device left under the seed FS. 
22876dd38f81SAnand Jain */ 22886dd38f81SAnand Jain ASSERT(fs_devices->seeding); 22896dd38f81SAnand Jain 2290944d3f9fSNikolay Borisov list_del_init(&fs_devices->seed_list); 22910226e0ebSAnand Jain close_fs_devices(fs_devices); 22928bef8401SAnand Jain free_fs_devices(fs_devices); 229394d5f0c2SAnand Jain } 2294a466c85eSJosef Bacik mutex_unlock(&uuid_mutex); 2295e93c89c1SStefan Behrens } 2296e93c89c1SStefan Behrens 22974f5ad7bdSNikolay Borisov void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) 2298e93c89c1SStefan Behrens { 22994f5ad7bdSNikolay Borisov struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; 2300d2ff1b20SAnand Jain 2301d9a071f0SAnand Jain mutex_lock(&fs_devices->device_list_mutex); 2302d9a071f0SAnand Jain 230353f8a74cSAnand Jain btrfs_sysfs_remove_device(tgtdev); 2304d2ff1b20SAnand Jain 2305779bf3feSAnand Jain if (tgtdev->bdev) 2306d9a071f0SAnand Jain fs_devices->open_devices--; 2307779bf3feSAnand Jain 2308d9a071f0SAnand Jain fs_devices->num_devices--; 2309e93c89c1SStefan Behrens 2310d6507cf1SNikolay Borisov btrfs_assign_next_active_device(tgtdev, NULL); 2311e93c89c1SStefan Behrens 2312e93c89c1SStefan Behrens list_del_rcu(&tgtdev->dev_list); 2313e93c89c1SStefan Behrens 2314d9a071f0SAnand Jain mutex_unlock(&fs_devices->device_list_mutex); 2315779bf3feSAnand Jain 23168f32380dSJohannes Thumshirn btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, 23178f32380dSJohannes Thumshirn tgtdev->name->str); 231814238819SAnand Jain 231914238819SAnand Jain btrfs_close_bdev(tgtdev); 23208e75fd89SNikolay Borisov synchronize_rcu(); 23218e75fd89SNikolay Borisov btrfs_free_device(tgtdev); 2322e93c89c1SStefan Behrens } 2323e93c89c1SStefan Behrens 2324b444ad46SNikolay Borisov static struct btrfs_device *btrfs_find_device_by_path( 2325b444ad46SNikolay Borisov struct btrfs_fs_info *fs_info, const char *device_path) 23267ba15b7dSStefan Behrens { 23277ba15b7dSStefan Behrens int ret = 0; 23287ba15b7dSStefan Behrens struct btrfs_super_block 
*disk_super; 23297ba15b7dSStefan Behrens u64 devid; 23307ba15b7dSStefan Behrens u8 *dev_uuid; 23317ba15b7dSStefan Behrens struct block_device *bdev; 2332b444ad46SNikolay Borisov struct btrfs_device *device; 23337ba15b7dSStefan Behrens 23347ba15b7dSStefan Behrens ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 23358f32380dSJohannes Thumshirn fs_info->bdev_holder, 0, &bdev, &disk_super); 23367ba15b7dSStefan Behrens if (ret) 2337b444ad46SNikolay Borisov return ERR_PTR(ret); 23388f32380dSJohannes Thumshirn 23397ba15b7dSStefan Behrens devid = btrfs_stack_device_id(&disk_super->dev_item); 23407ba15b7dSStefan Behrens dev_uuid = disk_super->dev_item.uuid; 23417239ff4bSNikolay Borisov if (btrfs_fs_incompat(fs_info, METADATA_UUID)) 2342e4319cd9SAnand Jain device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 2343b2598edfSAnand Jain disk_super->metadata_uuid); 23447239ff4bSNikolay Borisov else 2345e4319cd9SAnand Jain device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 2346b2598edfSAnand Jain disk_super->fsid); 23477239ff4bSNikolay Borisov 23488f32380dSJohannes Thumshirn btrfs_release_disk_super(disk_super); 2349b444ad46SNikolay Borisov if (!device) 2350b444ad46SNikolay Borisov device = ERR_PTR(-ENOENT); 23517ba15b7dSStefan Behrens blkdev_put(bdev, FMODE_READ); 2352b444ad46SNikolay Borisov return device; 23537ba15b7dSStefan Behrens } 23547ba15b7dSStefan Behrens 23552b82032cSYan Zheng /* 23565c5c0df0SDavid Sterba * Lookup a device given by device id, or the path if the id is 0. 
23575c5c0df0SDavid Sterba */ 2358a27a94c2SNikolay Borisov struct btrfs_device *btrfs_find_device_by_devspec( 23596e927cebSAnand Jain struct btrfs_fs_info *fs_info, u64 devid, 23606e927cebSAnand Jain const char *device_path) 236124e0474bSAnand Jain { 2362a27a94c2SNikolay Borisov struct btrfs_device *device; 236324e0474bSAnand Jain 23645c5c0df0SDavid Sterba if (devid) { 2365e4319cd9SAnand Jain device = btrfs_find_device(fs_info->fs_devices, devid, NULL, 2366b2598edfSAnand Jain NULL); 2367a27a94c2SNikolay Borisov if (!device) 2368a27a94c2SNikolay Borisov return ERR_PTR(-ENOENT); 23696e927cebSAnand Jain return device; 23706e927cebSAnand Jain } 23716e927cebSAnand Jain 23726e927cebSAnand Jain if (!device_path || !device_path[0]) 2373a27a94c2SNikolay Borisov return ERR_PTR(-EINVAL); 2374d95a830cSAnand Jain 23756e927cebSAnand Jain if (strcmp(device_path, "missing") == 0) { 23766e927cebSAnand Jain /* Find first missing device */ 2377d95a830cSAnand Jain list_for_each_entry(device, &fs_info->fs_devices->devices, 2378d95a830cSAnand Jain dev_list) { 2379d95a830cSAnand Jain if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 23806e927cebSAnand Jain &device->dev_state) && !device->bdev) 2381d95a830cSAnand Jain return device; 2382d95a830cSAnand Jain } 2383d95a830cSAnand Jain return ERR_PTR(-ENOENT); 2384d95a830cSAnand Jain } 23856e927cebSAnand Jain 23866e927cebSAnand Jain return btrfs_find_device_by_path(fs_info, device_path); 238724e0474bSAnand Jain } 238824e0474bSAnand Jain 23892b82032cSYan Zheng /* 23902b82032cSYan Zheng * does all the dirty work required for changing file system's UUID. 
23912b82032cSYan Zheng */ 23922ff7e61eSJeff Mahoney static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) 23932b82032cSYan Zheng { 23940b246afaSJeff Mahoney struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 23952b82032cSYan Zheng struct btrfs_fs_devices *old_devices; 2396e4404d6eSYan Zheng struct btrfs_fs_devices *seed_devices; 23970b246afaSJeff Mahoney struct btrfs_super_block *disk_super = fs_info->super_copy; 23982b82032cSYan Zheng struct btrfs_device *device; 23992b82032cSYan Zheng u64 super_flags; 24002b82032cSYan Zheng 2401a32bf9a3SDavid Sterba lockdep_assert_held(&uuid_mutex); 2402e4404d6eSYan Zheng if (!fs_devices->seeding) 24032b82032cSYan Zheng return -EINVAL; 24042b82032cSYan Zheng 2405427c8fddSNikolay Borisov /* 2406427c8fddSNikolay Borisov * Private copy of the seed devices, anchored at 2407427c8fddSNikolay Borisov * fs_info->fs_devices->seed_list 2408427c8fddSNikolay Borisov */ 24097239ff4bSNikolay Borisov seed_devices = alloc_fs_devices(NULL, NULL); 24102208a378SIlya Dryomov if (IS_ERR(seed_devices)) 24112208a378SIlya Dryomov return PTR_ERR(seed_devices); 24122b82032cSYan Zheng 2413427c8fddSNikolay Borisov /* 2414427c8fddSNikolay Borisov * It's necessary to retain a copy of the original seed fs_devices in 2415427c8fddSNikolay Borisov * fs_uuids so that filesystems which have been seeded can successfully 2416427c8fddSNikolay Borisov * reference the seed device from open_seed_devices. This also supports 2417427c8fddSNikolay Borisov * multiple fs seed. 
2418427c8fddSNikolay Borisov */ 2419e4404d6eSYan Zheng old_devices = clone_fs_devices(fs_devices); 2420e4404d6eSYan Zheng if (IS_ERR(old_devices)) { 2421e4404d6eSYan Zheng kfree(seed_devices); 2422e4404d6eSYan Zheng return PTR_ERR(old_devices); 24232b82032cSYan Zheng } 2424e4404d6eSYan Zheng 2425c4babc5eSAnand Jain list_add(&old_devices->fs_list, &fs_uuids); 24262b82032cSYan Zheng 2427e4404d6eSYan Zheng memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2428e4404d6eSYan Zheng seed_devices->opened = 1; 2429e4404d6eSYan Zheng INIT_LIST_HEAD(&seed_devices->devices); 2430e4404d6eSYan Zheng INIT_LIST_HEAD(&seed_devices->alloc_list); 2431e5e9a520SChris Mason mutex_init(&seed_devices->device_list_mutex); 2432c9513edbSXiao Guangrong 2433321a4bf7SAnand Jain mutex_lock(&fs_devices->device_list_mutex); 24341f78160cSXiao Guangrong list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 24351f78160cSXiao Guangrong synchronize_rcu); 24362196d6e8SMiao Xie list_for_each_entry(device, &seed_devices->devices, dev_list) 2437e4404d6eSYan Zheng device->fs_devices = seed_devices; 24382196d6e8SMiao Xie 24390395d84fSJohannes Thumshirn fs_devices->seeding = false; 24402b82032cSYan Zheng fs_devices->num_devices = 0; 24412b82032cSYan Zheng fs_devices->open_devices = 0; 244269611ac8SMiao Xie fs_devices->missing_devices = 0; 24437f0432d0SJohannes Thumshirn fs_devices->rotating = false; 2444944d3f9fSNikolay Borisov list_add(&seed_devices->seed_list, &fs_devices->seed_list); 24452b82032cSYan Zheng 24462b82032cSYan Zheng generate_random_uuid(fs_devices->fsid); 24477239ff4bSNikolay Borisov memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); 24482b82032cSYan Zheng memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2449321a4bf7SAnand Jain mutex_unlock(&fs_devices->device_list_mutex); 2450f7171750SFilipe David Borba Manana 24512b82032cSYan Zheng super_flags = btrfs_super_flags(disk_super) & 24522b82032cSYan Zheng ~BTRFS_SUPER_FLAG_SEEDING; 24532b82032cSYan 
Zheng btrfs_set_super_flags(disk_super, super_flags); 24542b82032cSYan Zheng 24552b82032cSYan Zheng return 0; 24562b82032cSYan Zheng } 24572b82032cSYan Zheng 24582b82032cSYan Zheng /* 245901327610SNicholas D Steeves * Store the expected generation for seed devices in device items. 24602b82032cSYan Zheng */ 24615c466629SDavid Sterba static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) 24622b82032cSYan Zheng { 24635c466629SDavid Sterba struct btrfs_fs_info *fs_info = trans->fs_info; 24645b4aacefSJeff Mahoney struct btrfs_root *root = fs_info->chunk_root; 24652b82032cSYan Zheng struct btrfs_path *path; 24662b82032cSYan Zheng struct extent_buffer *leaf; 24672b82032cSYan Zheng struct btrfs_dev_item *dev_item; 24682b82032cSYan Zheng struct btrfs_device *device; 24692b82032cSYan Zheng struct btrfs_key key; 247044880fdcSAnand Jain u8 fs_uuid[BTRFS_FSID_SIZE]; 24712b82032cSYan Zheng u8 dev_uuid[BTRFS_UUID_SIZE]; 24722b82032cSYan Zheng u64 devid; 24732b82032cSYan Zheng int ret; 24742b82032cSYan Zheng 24752b82032cSYan Zheng path = btrfs_alloc_path(); 24762b82032cSYan Zheng if (!path) 24772b82032cSYan Zheng return -ENOMEM; 24782b82032cSYan Zheng 24792b82032cSYan Zheng key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 24802b82032cSYan Zheng key.offset = 0; 24812b82032cSYan Zheng key.type = BTRFS_DEV_ITEM_KEY; 24822b82032cSYan Zheng 24832b82032cSYan Zheng while (1) { 24842b82032cSYan Zheng ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 24852b82032cSYan Zheng if (ret < 0) 24862b82032cSYan Zheng goto error; 24872b82032cSYan Zheng 24882b82032cSYan Zheng leaf = path->nodes[0]; 24892b82032cSYan Zheng next_slot: 24902b82032cSYan Zheng if (path->slots[0] >= btrfs_header_nritems(leaf)) { 24912b82032cSYan Zheng ret = btrfs_next_leaf(root, path); 24922b82032cSYan Zheng if (ret > 0) 24932b82032cSYan Zheng break; 24942b82032cSYan Zheng if (ret < 0) 24952b82032cSYan Zheng goto error; 24962b82032cSYan Zheng leaf = path->nodes[0]; 24972b82032cSYan Zheng btrfs_item_key_to_cpu(leaf, 
&key, path->slots[0]); 2498b3b4aa74SDavid Sterba btrfs_release_path(path); 24992b82032cSYan Zheng continue; 25002b82032cSYan Zheng } 25012b82032cSYan Zheng 25022b82032cSYan Zheng btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 25032b82032cSYan Zheng if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 25042b82032cSYan Zheng key.type != BTRFS_DEV_ITEM_KEY) 25052b82032cSYan Zheng break; 25062b82032cSYan Zheng 25072b82032cSYan Zheng dev_item = btrfs_item_ptr(leaf, path->slots[0], 25082b82032cSYan Zheng struct btrfs_dev_item); 25092b82032cSYan Zheng devid = btrfs_device_id(leaf, dev_item); 2510410ba3a2SGeert Uytterhoeven read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 25112b82032cSYan Zheng BTRFS_UUID_SIZE); 25121473b24eSGeert Uytterhoeven read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 251344880fdcSAnand Jain BTRFS_FSID_SIZE); 2514e4319cd9SAnand Jain device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 2515b2598edfSAnand Jain fs_uuid); 251679787eaaSJeff Mahoney BUG_ON(!device); /* Logic error */ 25172b82032cSYan Zheng 25182b82032cSYan Zheng if (device->fs_devices->seeding) { 25192b82032cSYan Zheng btrfs_set_device_generation(leaf, dev_item, 25202b82032cSYan Zheng device->generation); 25212b82032cSYan Zheng btrfs_mark_buffer_dirty(leaf); 25222b82032cSYan Zheng } 25232b82032cSYan Zheng 25242b82032cSYan Zheng path->slots[0]++; 25252b82032cSYan Zheng goto next_slot; 25262b82032cSYan Zheng } 25272b82032cSYan Zheng ret = 0; 25282b82032cSYan Zheng error: 25292b82032cSYan Zheng btrfs_free_path(path); 25302b82032cSYan Zheng return ret; 25312b82032cSYan Zheng } 25322b82032cSYan Zheng 2533da353f6bSDavid Sterba int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2534788f20ebSChris Mason { 25355112febbSJeff Mahoney struct btrfs_root *root = fs_info->dev_root; 2536d5e2003cSJosef Bacik struct request_queue *q; 2537788f20ebSChris Mason struct btrfs_trans_handle *trans; 2538788f20ebSChris Mason struct btrfs_device 
*device; 2539788f20ebSChris Mason struct block_device *bdev; 25400b246afaSJeff Mahoney struct super_block *sb = fs_info->sb; 2541606686eeSJosef Bacik struct rcu_string *name; 25425da54bc1SAnand Jain struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 254339379faaSNaohiro Aota u64 orig_super_total_bytes; 254439379faaSNaohiro Aota u64 orig_super_num_devices; 25452b82032cSYan Zheng int seeding_dev = 0; 2546788f20ebSChris Mason int ret = 0; 254744cab9baSNikolay Borisov bool locked = false; 2548788f20ebSChris Mason 25495da54bc1SAnand Jain if (sb_rdonly(sb) && !fs_devices->seeding) 2550f8c5d0b4SLiu Bo return -EROFS; 2551788f20ebSChris Mason 2552a5d16333SLi Zefan bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 25530b246afaSJeff Mahoney fs_info->bdev_holder); 25547f59203aSJosef Bacik if (IS_ERR(bdev)) 25557f59203aSJosef Bacik return PTR_ERR(bdev); 2556a2135011SChris Mason 2557b70f5097SNaohiro Aota if (!btrfs_check_device_zone_type(fs_info, bdev)) { 2558b70f5097SNaohiro Aota ret = -EINVAL; 2559b70f5097SNaohiro Aota goto error; 2560b70f5097SNaohiro Aota } 2561b70f5097SNaohiro Aota 25625da54bc1SAnand Jain if (fs_devices->seeding) { 25632b82032cSYan Zheng seeding_dev = 1; 25642b82032cSYan Zheng down_write(&sb->s_umount); 25652b82032cSYan Zheng mutex_lock(&uuid_mutex); 256644cab9baSNikolay Borisov locked = true; 25672b82032cSYan Zheng } 25682b82032cSYan Zheng 2569b9ba017fSNikolay Borisov sync_blockdev(bdev); 2570a2135011SChris Mason 2571f4cfa9bdSNikolay Borisov rcu_read_lock(); 2572f4cfa9bdSNikolay Borisov list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { 2573788f20ebSChris Mason if (device->bdev == bdev) { 2574788f20ebSChris Mason ret = -EEXIST; 2575f4cfa9bdSNikolay Borisov rcu_read_unlock(); 25762b82032cSYan Zheng goto error; 2577788f20ebSChris Mason } 2578788f20ebSChris Mason } 2579f4cfa9bdSNikolay Borisov rcu_read_unlock(); 2580788f20ebSChris Mason 25810b246afaSJeff Mahoney device = btrfs_alloc_device(fs_info, NULL, NULL); 
258212bd2fc0SIlya Dryomov if (IS_ERR(device)) { 2583788f20ebSChris Mason /* we can safely leave the fs_devices entry around */ 258412bd2fc0SIlya Dryomov ret = PTR_ERR(device); 25852b82032cSYan Zheng goto error; 2586788f20ebSChris Mason } 2587788f20ebSChris Mason 258878f2c9e6SDavid Sterba name = rcu_string_strdup(device_path, GFP_KERNEL); 2589606686eeSJosef Bacik if (!name) { 25902b82032cSYan Zheng ret = -ENOMEM; 25915c4cf6c9SDavid Sterba goto error_free_device; 2592788f20ebSChris Mason } 2593606686eeSJosef Bacik rcu_assign_pointer(device->name, name); 25942b82032cSYan Zheng 25955b316468SNaohiro Aota device->fs_info = fs_info; 25965b316468SNaohiro Aota device->bdev = bdev; 25975b316468SNaohiro Aota 25985b316468SNaohiro Aota ret = btrfs_get_dev_zone_info(device); 25995b316468SNaohiro Aota if (ret) 26005b316468SNaohiro Aota goto error_free_device; 26015b316468SNaohiro Aota 2602a22285a6SYan, Zheng trans = btrfs_start_transaction(root, 0); 260398d5dc13STsutomu Itoh if (IS_ERR(trans)) { 260498d5dc13STsutomu Itoh ret = PTR_ERR(trans); 26055b316468SNaohiro Aota goto error_free_zone; 260698d5dc13STsutomu Itoh } 260798d5dc13STsutomu Itoh 2608d5e2003cSJosef Bacik q = bdev_get_queue(bdev); 2609ebbede42SAnand Jain set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 26102b82032cSYan Zheng device->generation = trans->transid; 26110b246afaSJeff Mahoney device->io_width = fs_info->sectorsize; 26120b246afaSJeff Mahoney device->io_align = fs_info->sectorsize; 26130b246afaSJeff Mahoney device->sector_size = fs_info->sectorsize; 26147dfb8be1SNikolay Borisov device->total_bytes = round_down(i_size_read(bdev->bd_inode), 26157dfb8be1SNikolay Borisov fs_info->sectorsize); 26162cc3c559SYan Zheng device->disk_total_bytes = device->total_bytes; 2617935e5cc9SMiao Xie device->commit_total_bytes = device->total_bytes; 2618e12c9621SAnand Jain set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2619401e29c1SAnand Jain clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 
2620fb01aa85SIlya Dryomov device->mode = FMODE_EXCL; 262127087f37SStefan Behrens device->dev_stats_valid = 1; 26229f6d2510SDavid Sterba set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2623325cd4baSZheng Yan 26242b82032cSYan Zheng if (seeding_dev) { 2625a0a1db70SFilipe Manana btrfs_clear_sb_rdonly(sb); 26262ff7e61eSJeff Mahoney ret = btrfs_prepare_sprout(fs_info); 2627d31c32f6SAnand Jain if (ret) { 2628d31c32f6SAnand Jain btrfs_abort_transaction(trans, ret); 2629d31c32f6SAnand Jain goto error_trans; 2630d31c32f6SAnand Jain } 2631b7cb29e6SAnand Jain btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev, 2632b7cb29e6SAnand Jain device); 26332b82032cSYan Zheng } 26342b82032cSYan Zheng 26355da54bc1SAnand Jain device->fs_devices = fs_devices; 2636e5e9a520SChris Mason 26375da54bc1SAnand Jain mutex_lock(&fs_devices->device_list_mutex); 263834441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 26395da54bc1SAnand Jain list_add_rcu(&device->dev_list, &fs_devices->devices); 26405da54bc1SAnand Jain list_add(&device->dev_alloc_list, &fs_devices->alloc_list); 26415da54bc1SAnand Jain fs_devices->num_devices++; 26425da54bc1SAnand Jain fs_devices->open_devices++; 26435da54bc1SAnand Jain fs_devices->rw_devices++; 26445da54bc1SAnand Jain fs_devices->total_devices++; 26455da54bc1SAnand Jain fs_devices->total_rw_bytes += device->total_bytes; 26462b82032cSYan Zheng 2647a5ed45f8SNikolay Borisov atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 26482bf64758SJosef Bacik 2649e884f4f0SAnand Jain if (!blk_queue_nonrot(q)) 26507f0432d0SJohannes Thumshirn fs_devices->rotating = true; 2651c289811cSChris Mason 265239379faaSNaohiro Aota orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 26530b246afaSJeff Mahoney btrfs_set_super_total_bytes(fs_info->super_copy, 265439379faaSNaohiro Aota round_down(orig_super_total_bytes + device->total_bytes, 265539379faaSNaohiro Aota fs_info->sectorsize)); 2656788f20ebSChris Mason 265739379faaSNaohiro Aota 
orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); 265839379faaSNaohiro Aota btrfs_set_super_num_devices(fs_info->super_copy, 265939379faaSNaohiro Aota orig_super_num_devices + 1); 26600d39376aSAnand Jain 26612196d6e8SMiao Xie /* 26622196d6e8SMiao Xie * we've got more storage, clear any full flags on the space 26632196d6e8SMiao Xie * infos 26642196d6e8SMiao Xie */ 26650b246afaSJeff Mahoney btrfs_clear_space_info_full(fs_info); 26662196d6e8SMiao Xie 266734441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 2668ca10845aSJosef Bacik 2669ca10845aSJosef Bacik /* Add sysfs device entry */ 2670cd36da2eSAnand Jain btrfs_sysfs_add_device(device); 2671ca10845aSJosef Bacik 26725da54bc1SAnand Jain mutex_unlock(&fs_devices->device_list_mutex); 2673788f20ebSChris Mason 26742b82032cSYan Zheng if (seeding_dev) { 267534441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex); 26766f8e0fc7SDavid Sterba ret = init_first_rw_device(trans); 267734441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 2678005d6427SDavid Sterba if (ret) { 267966642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 2680d31c32f6SAnand Jain goto error_sysfs; 2681005d6427SDavid Sterba } 26822196d6e8SMiao Xie } 26832196d6e8SMiao Xie 26848e87e856SNikolay Borisov ret = btrfs_add_dev_item(trans, device); 26852196d6e8SMiao Xie if (ret) { 268666642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 2687d31c32f6SAnand Jain goto error_sysfs; 26882196d6e8SMiao Xie } 26892196d6e8SMiao Xie 26902196d6e8SMiao Xie if (seeding_dev) { 26915c466629SDavid Sterba ret = btrfs_finish_sprout(trans); 2692005d6427SDavid Sterba if (ret) { 269366642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 2694d31c32f6SAnand Jain goto error_sysfs; 2695005d6427SDavid Sterba } 2696b2373f25SAnand Jain 26978e560081SNikolay Borisov /* 26988e560081SNikolay Borisov * fs_devices now represents the newly sprouted filesystem and 26998e560081SNikolay Borisov * its fsid has been changed by btrfs_prepare_sprout 
27008e560081SNikolay Borisov */ 27018e560081SNikolay Borisov btrfs_sysfs_update_sprout_fsid(fs_devices); 2702005d6427SDavid Sterba } 27032b82032cSYan Zheng 27043a45bb20SJeff Mahoney ret = btrfs_commit_transaction(trans); 27052b82032cSYan Zheng 27062b82032cSYan Zheng if (seeding_dev) { 27072b82032cSYan Zheng mutex_unlock(&uuid_mutex); 27082b82032cSYan Zheng up_write(&sb->s_umount); 270944cab9baSNikolay Borisov locked = false; 27102b82032cSYan Zheng 271179787eaaSJeff Mahoney if (ret) /* transaction commit */ 271279787eaaSJeff Mahoney return ret; 271379787eaaSJeff Mahoney 27142ff7e61eSJeff Mahoney ret = btrfs_relocate_sys_chunks(fs_info); 271579787eaaSJeff Mahoney if (ret < 0) 27160b246afaSJeff Mahoney btrfs_handle_fs_error(fs_info, ret, 27175d163e0eSJeff Mahoney "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2718671415b7SMiao Xie trans = btrfs_attach_transaction(root); 2719671415b7SMiao Xie if (IS_ERR(trans)) { 2720671415b7SMiao Xie if (PTR_ERR(trans) == -ENOENT) 2721671415b7SMiao Xie return 0; 27227132a262SAnand Jain ret = PTR_ERR(trans); 27237132a262SAnand Jain trans = NULL; 27247132a262SAnand Jain goto error_sysfs; 2725671415b7SMiao Xie } 27263a45bb20SJeff Mahoney ret = btrfs_commit_transaction(trans); 27272b82032cSYan Zheng } 2728c9e9f97bSIlya Dryomov 27297f551d96SAnand Jain /* 27307f551d96SAnand Jain * Now that we have written a new super block to this device, check all 27317f551d96SAnand Jain * other fs_devices list if device_path alienates any other scanned 27327f551d96SAnand Jain * device. 27337f551d96SAnand Jain * We can ignore the return value as it typically returns -EINVAL and 27347f551d96SAnand Jain * only succeeds if the device was an alien. 
27357f551d96SAnand Jain */ 27367f551d96SAnand Jain btrfs_forget_devices(device_path); 27377f551d96SAnand Jain 27387f551d96SAnand Jain /* Update ctime/mtime for blkid or udev */ 27398f96a5bfSJosef Bacik update_dev_time(bdev); 27407f551d96SAnand Jain 2741788f20ebSChris Mason return ret; 274279787eaaSJeff Mahoney 2743d31c32f6SAnand Jain error_sysfs: 274453f8a74cSAnand Jain btrfs_sysfs_remove_device(device); 274539379faaSNaohiro Aota mutex_lock(&fs_info->fs_devices->device_list_mutex); 274639379faaSNaohiro Aota mutex_lock(&fs_info->chunk_mutex); 274739379faaSNaohiro Aota list_del_rcu(&device->dev_list); 274839379faaSNaohiro Aota list_del(&device->dev_alloc_list); 274939379faaSNaohiro Aota fs_info->fs_devices->num_devices--; 275039379faaSNaohiro Aota fs_info->fs_devices->open_devices--; 275139379faaSNaohiro Aota fs_info->fs_devices->rw_devices--; 275239379faaSNaohiro Aota fs_info->fs_devices->total_devices--; 275339379faaSNaohiro Aota fs_info->fs_devices->total_rw_bytes -= device->total_bytes; 275439379faaSNaohiro Aota atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); 275539379faaSNaohiro Aota btrfs_set_super_total_bytes(fs_info->super_copy, 275639379faaSNaohiro Aota orig_super_total_bytes); 275739379faaSNaohiro Aota btrfs_set_super_num_devices(fs_info->super_copy, 275839379faaSNaohiro Aota orig_super_num_devices); 275939379faaSNaohiro Aota mutex_unlock(&fs_info->chunk_mutex); 276039379faaSNaohiro Aota mutex_unlock(&fs_info->fs_devices->device_list_mutex); 276179787eaaSJeff Mahoney error_trans: 27620af2c4bfSAnand Jain if (seeding_dev) 2763a0a1db70SFilipe Manana btrfs_set_sb_rdonly(sb); 27647132a262SAnand Jain if (trans) 27653a45bb20SJeff Mahoney btrfs_end_transaction(trans); 27665b316468SNaohiro Aota error_free_zone: 27675b316468SNaohiro Aota btrfs_destroy_dev_zone_info(device); 27685c4cf6c9SDavid Sterba error_free_device: 2769a425f9d4SDavid Sterba btrfs_free_device(device); 27702b82032cSYan Zheng error: 2771e525fd89STejun Heo blkdev_put(bdev, FMODE_EXCL); 
277244cab9baSNikolay Borisov if (locked) { 27732b82032cSYan Zheng mutex_unlock(&uuid_mutex); 27742b82032cSYan Zheng up_write(&sb->s_umount); 27752b82032cSYan Zheng } 2776c9e9f97bSIlya Dryomov return ret; 2777788f20ebSChris Mason } 2778788f20ebSChris Mason 2779d397712bSChris Mason static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 27800b86a832SChris Mason struct btrfs_device *device) 27810b86a832SChris Mason { 27820b86a832SChris Mason int ret; 27830b86a832SChris Mason struct btrfs_path *path; 27840b246afaSJeff Mahoney struct btrfs_root *root = device->fs_info->chunk_root; 27850b86a832SChris Mason struct btrfs_dev_item *dev_item; 27860b86a832SChris Mason struct extent_buffer *leaf; 27870b86a832SChris Mason struct btrfs_key key; 27880b86a832SChris Mason 27890b86a832SChris Mason path = btrfs_alloc_path(); 27900b86a832SChris Mason if (!path) 27910b86a832SChris Mason return -ENOMEM; 27920b86a832SChris Mason 27930b86a832SChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 27940b86a832SChris Mason key.type = BTRFS_DEV_ITEM_KEY; 27950b86a832SChris Mason key.offset = device->devid; 27960b86a832SChris Mason 27970b86a832SChris Mason ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 27980b86a832SChris Mason if (ret < 0) 27990b86a832SChris Mason goto out; 28000b86a832SChris Mason 28010b86a832SChris Mason if (ret > 0) { 28020b86a832SChris Mason ret = -ENOENT; 28030b86a832SChris Mason goto out; 28040b86a832SChris Mason } 28050b86a832SChris Mason 28060b86a832SChris Mason leaf = path->nodes[0]; 28070b86a832SChris Mason dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 28080b86a832SChris Mason 28090b86a832SChris Mason btrfs_set_device_id(leaf, dev_item, device->devid); 28100b86a832SChris Mason btrfs_set_device_type(leaf, dev_item, device->type); 28110b86a832SChris Mason btrfs_set_device_io_align(leaf, dev_item, device->io_align); 28120b86a832SChris Mason btrfs_set_device_io_width(leaf, dev_item, device->io_width); 28130b86a832SChris 
/*
 * Grow @device to @new_size (rounded down to a sector boundary) and bump
 * the superblock's total byte count by the difference.
 *
 * Returns -EACCES for a non-writeable device, -EINVAL if @new_size does
 * not actually grow the device or the device is a replace target,
 * otherwise the result of writing the updated device item.
 */
int btrfs_grow_device(struct btrfs_trans_handle *trans,
		      struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	u64 old_total;
	u64 diff;

	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return -EACCES;

	new_size = round_down(new_size, fs_info->sectorsize);

	/* Superblock totals and device sizes are updated under chunk_mutex. */
	mutex_lock(&fs_info->chunk_mutex);
	old_total = btrfs_super_total_bytes(super_copy);
	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);

	if (new_size <= device->total_bytes ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		mutex_unlock(&fs_info->chunk_mutex);
		return -EINVAL;
	}

	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total + diff, fs_info->sectorsize));
	device->fs_devices->total_rw_bytes += diff;

	btrfs_device_set_total_bytes(device, new_size);
	btrfs_device_set_disk_total_bytes(device, new_size);
	btrfs_clear_space_info_full(device->fs_info);
	/* Queue the device so its item is also synced at commit time. */
	if (list_empty(&device->post_commit_list))
		list_add_tail(&device->post_commit_list,
			      &trans->transaction->dev_update_list);
	mutex_unlock(&fs_info->chunk_mutex);

	return btrfs_update_device(trans, device);
}
device->fs_devices->total_rw_bytes += diff; 28512b82032cSYan Zheng 28527cc8e58dSMiao Xie btrfs_device_set_total_bytes(device, new_size); 28537cc8e58dSMiao Xie btrfs_device_set_disk_total_bytes(device, new_size); 2854fb456252SJeff Mahoney btrfs_clear_space_info_full(device->fs_info); 2855bbbf7243SNikolay Borisov if (list_empty(&device->post_commit_list)) 2856bbbf7243SNikolay Borisov list_add_tail(&device->post_commit_list, 2857bbbf7243SNikolay Borisov &trans->transaction->dev_update_list); 285834441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex); 28594184ea7fSChris Mason 28608f18cf13SChris Mason return btrfs_update_device(trans, device); 28618f18cf13SChris Mason } 28628f18cf13SChris Mason 2863f4208794SNikolay Borisov static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 28648f18cf13SChris Mason { 2865f4208794SNikolay Borisov struct btrfs_fs_info *fs_info = trans->fs_info; 28665b4aacefSJeff Mahoney struct btrfs_root *root = fs_info->chunk_root; 28678f18cf13SChris Mason int ret; 28688f18cf13SChris Mason struct btrfs_path *path; 28698f18cf13SChris Mason struct btrfs_key key; 28708f18cf13SChris Mason 28718f18cf13SChris Mason path = btrfs_alloc_path(); 28728f18cf13SChris Mason if (!path) 28738f18cf13SChris Mason return -ENOMEM; 28748f18cf13SChris Mason 2875408fbf19SNikolay Borisov key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 28768f18cf13SChris Mason key.offset = chunk_offset; 28778f18cf13SChris Mason key.type = BTRFS_CHUNK_ITEM_KEY; 28788f18cf13SChris Mason 28798f18cf13SChris Mason ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 288079787eaaSJeff Mahoney if (ret < 0) 288179787eaaSJeff Mahoney goto out; 288279787eaaSJeff Mahoney else if (ret > 0) { /* Logic error or corruption */ 28830b246afaSJeff Mahoney btrfs_handle_fs_error(fs_info, -ENOENT, 288479787eaaSJeff Mahoney "Failed lookup while freeing chunk."); 288579787eaaSJeff Mahoney ret = -ENOENT; 288679787eaaSJeff Mahoney goto out; 288779787eaaSJeff Mahoney } 28888f18cf13SChris 
/*
 * Remove the entry for @chunk_offset from the superblock's in-memory
 * sys_chunk_array, shrinking the array in place.
 *
 * Caller must hold fs_info->chunk_mutex (asserted below).
 * Returns 0 on success, -EIO if an entry with a non-CHUNK_ITEM key is
 * found in the array.
 */
static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	lockdep_assert_held(&fs_info->chunk_mutex);
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	cur = 0;

	/* Each array entry is a disk_key immediately followed by a chunk. */
	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)(ptr + len);
			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			len += btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
		    key.offset == chunk_offset) {
			/*
			 * Shift the rest of the array down over this entry.
			 * ptr/cur are deliberately not advanced so the entry
			 * shifted into this slot is examined next.
			 */
			memmove(ptr, ptr + len, array_size - (cur + len));
			array_size -= len;
			btrfs_set_super_sys_array_size(super_copy, array_size);
		} else {
			ptr += len;
			cur += len;
		}
	}
	return ret;
}
29308f18cf13SChris Mason } 2931408fbf19SNikolay Borisov if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 29328f18cf13SChris Mason key.offset == chunk_offset) { 29338f18cf13SChris Mason memmove(ptr, ptr + len, array_size - (cur + len)); 29348f18cf13SChris Mason array_size -= len; 29358f18cf13SChris Mason btrfs_set_super_sys_array_size(super_copy, array_size); 29368f18cf13SChris Mason } else { 29378f18cf13SChris Mason ptr += len; 29388f18cf13SChris Mason cur += len; 29398f18cf13SChris Mason } 29408f18cf13SChris Mason } 29418f18cf13SChris Mason return ret; 29428f18cf13SChris Mason } 29438f18cf13SChris Mason 294460ca842eSOmar Sandoval /* 294560ca842eSOmar Sandoval * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. 294660ca842eSOmar Sandoval * @logical: Logical block offset in bytes. 294760ca842eSOmar Sandoval * @length: Length of extent in bytes. 294860ca842eSOmar Sandoval * 294960ca842eSOmar Sandoval * Return: Chunk mapping or ERR_PTR. 295060ca842eSOmar Sandoval */ 295160ca842eSOmar Sandoval struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, 2952592d92eeSLiu Bo u64 logical, u64 length) 2953592d92eeSLiu Bo { 2954592d92eeSLiu Bo struct extent_map_tree *em_tree; 2955592d92eeSLiu Bo struct extent_map *em; 2956592d92eeSLiu Bo 2957c8bf1b67SDavid Sterba em_tree = &fs_info->mapping_tree; 2958592d92eeSLiu Bo read_lock(&em_tree->lock); 2959592d92eeSLiu Bo em = lookup_extent_mapping(em_tree, logical, length); 2960592d92eeSLiu Bo read_unlock(&em_tree->lock); 2961592d92eeSLiu Bo 2962592d92eeSLiu Bo if (!em) { 2963592d92eeSLiu Bo btrfs_crit(fs_info, "unable to find logical %llu length %llu", 2964592d92eeSLiu Bo logical, length); 2965592d92eeSLiu Bo return ERR_PTR(-EINVAL); 2966592d92eeSLiu Bo } 2967592d92eeSLiu Bo 2968592d92eeSLiu Bo if (em->start > logical || em->start + em->len < logical) { 2969592d92eeSLiu Bo btrfs_crit(fs_info, 2970592d92eeSLiu Bo "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 2971592d92eeSLiu Bo 
logical, length, em->start, em->start + em->len); 2972592d92eeSLiu Bo free_extent_map(em); 2973592d92eeSLiu Bo return ERR_PTR(-EINVAL); 2974592d92eeSLiu Bo } 2975592d92eeSLiu Bo 2976592d92eeSLiu Bo /* callers are responsible for dropping em's ref. */ 2977592d92eeSLiu Bo return em; 2978592d92eeSLiu Bo } 2979592d92eeSLiu Bo 298079bd3712SFilipe Manana static int remove_chunk_item(struct btrfs_trans_handle *trans, 298179bd3712SFilipe Manana struct map_lookup *map, u64 chunk_offset) 298279bd3712SFilipe Manana { 298379bd3712SFilipe Manana int i; 298479bd3712SFilipe Manana 298579bd3712SFilipe Manana /* 298679bd3712SFilipe Manana * Removing chunk items and updating the device items in the chunks btree 298779bd3712SFilipe Manana * requires holding the chunk_mutex. 298879bd3712SFilipe Manana * See the comment at btrfs_chunk_alloc() for the details. 298979bd3712SFilipe Manana */ 299079bd3712SFilipe Manana lockdep_assert_held(&trans->fs_info->chunk_mutex); 299179bd3712SFilipe Manana 299279bd3712SFilipe Manana for (i = 0; i < map->num_stripes; i++) { 299379bd3712SFilipe Manana int ret; 299479bd3712SFilipe Manana 299579bd3712SFilipe Manana ret = btrfs_update_device(trans, map->stripes[i].dev); 299679bd3712SFilipe Manana if (ret) 299779bd3712SFilipe Manana return ret; 299879bd3712SFilipe Manana } 299979bd3712SFilipe Manana 300079bd3712SFilipe Manana return btrfs_free_chunk(trans, chunk_offset); 300179bd3712SFilipe Manana } 300279bd3712SFilipe Manana 300397aff912SNikolay Borisov int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 300447ab2a6cSJosef Bacik { 300597aff912SNikolay Borisov struct btrfs_fs_info *fs_info = trans->fs_info; 300647ab2a6cSJosef Bacik struct extent_map *em; 300747ab2a6cSJosef Bacik struct map_lookup *map; 300847ab2a6cSJosef Bacik u64 dev_extent_len = 0; 300947ab2a6cSJosef Bacik int i, ret = 0; 30100b246afaSJeff Mahoney struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 301147ab2a6cSJosef Bacik 301260ca842eSOmar Sandoval em = 
/*
 * Remove the chunk at @chunk_offset: free its device extents, delete the
 * chunk item (and the sys_chunk_array entry for SYSTEM chunks), then
 * remove the block group itself.
 */
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct extent_map *em;
	struct map_lookup *map;
	u64 dev_extent_len = 0;
	int i, ret = 0;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em)) {
		/*
		 * This is a logic error, but we don't want to just rely on the
		 * user having built with ASSERT enabled, so if ASSERT doesn't
		 * do anything we still error out.
		 */
		ASSERT(0);
		return PTR_ERR(em);
	}
	map = em->map_lookup;

	/*
	 * First delete the device extent items from the devices btree.
	 * We take the device_list_mutex to avoid racing with the finishing phase
	 * of a device replace operation. See the comment below before acquiring
	 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
	 * because that can result in a deadlock when deleting the device extent
	 * items from the devices btree - COWing an extent buffer from the btree
	 * may result in allocating a new metadata chunk, which would attempt to
	 * lock again fs_info->chunk_mutex.
	 */
	mutex_lock(&fs_devices->device_list_mutex);
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		ret = btrfs_free_dev_extent(trans, device,
					    map->stripes[i].physical,
					    &dev_extent_len);
		if (ret) {
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		if (device->bytes_used > 0) {
			mutex_lock(&fs_info->chunk_mutex);
			btrfs_device_set_bytes_used(device,
					device->bytes_used - dev_extent_len);
			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
			btrfs_clear_space_info_full(fs_info);
			mutex_unlock(&fs_info->chunk_mutex);
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * We acquire fs_info->chunk_mutex for 2 reasons:
	 *
	 * 1) Just like with the first phase of the chunk allocation, we must
	 *    reserve system space, do all chunk btree updates and deletions, and
	 *    update the system chunk array in the superblock while holding this
	 *    mutex. This is for similar reasons as explained on the comment at
	 *    the top of btrfs_chunk_alloc();
	 *
	 * 2) Prevent races with the final phase of a device replace operation
	 *    that replaces the device object associated with the map's stripes,
	 *    because the device object's id can change at any time during that
	 *    final phase of the device replace operation
	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
	 *    replaced device and then see it with an ID of
	 *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
	 *    the device item, which does not exists on the chunk btree.
	 *    The finishing phase of device replace acquires both the
	 *    device_list_mutex and the chunk_mutex, in that order, so we are
	 *    safe by just acquiring the chunk_mutex.
	 */
	trans->removing_chunk = true;
	mutex_lock(&fs_info->chunk_mutex);

	check_system_chunk(trans, map->type);

	ret = remove_chunk_item(trans, map, chunk_offset);
	/*
	 * Normally we should not get -ENOSPC since we reserved space before
	 * through the call to check_system_chunk().
	 *
	 * Despite our system space_info having enough free space, we may not
	 * be able to allocate extents from its block groups, because all have
	 * an incompatible profile, which will force us to allocate a new system
	 * block group with the right profile, or right after we called
	 * check_system_space() above, a scrub turned the only system block group
	 * with enough free space into RO mode.
	 * This is explained with more detail at do_chunk_alloc().
	 *
	 * So if we get -ENOSPC, allocate a new system chunk and retry once.
	 */
	if (ret == -ENOSPC) {
		const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
		struct btrfs_block_group *sys_bg;

		sys_bg = btrfs_create_chunk(trans, sys_flags);
		if (IS_ERR(sys_bg)) {
			ret = PTR_ERR(sys_bg);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = remove_chunk_item(trans, map, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	} else if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}

	mutex_unlock(&fs_info->chunk_mutex);
	trans->removing_chunk = false;

	/*
	 * We are done with chunk btree updates and deletions, so release the
	 * system space we previously reserved (with check_system_chunk()).
	 */
	btrfs_trans_release_chunk_metadata(trans);

	ret = btrfs_remove_block_group(trans, chunk_offset, em);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

out:
	/* Error paths may arrive here still holding the chunk_mutex. */
	if (trans->removing_chunk) {
		mutex_unlock(&fs_info->chunk_mutex);
		trans->removing_chunk = false;
	}
	/* once for us */
	free_extent_map(em);
	return ret;
}
/*
 * Relocate all extents of the chunk at @chunk_offset and then remove the
 * chunk. Caller must hold fs_info->reclaim_bgs_lock (asserted below).
 */
int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_block_group *block_group;
	u64 length;
	int ret;

	/*
	 * Prevent races with automatic removal of unused block groups.
	 * After we relocate and before we remove the chunk with offset
	 * chunk_offset, automatic removal of the block group can kick in,
	 * resulting in a failure when calling btrfs_remove_chunk() below.
	 *
	 * Make sure to acquire this mutex before doing a tree search (dev
	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
	 * we release the path used to search the chunk/dev tree and before
	 * the current task acquires this mutex and calls us.
	 */
	lockdep_assert_held(&fs_info->reclaim_bgs_lock);

	/* step one, relocate all the extents inside this chunk */
	btrfs_scrub_pause(fs_info);
	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
	btrfs_scrub_continue(fs_info);
	if (ret)
		return ret;

	block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
	if (!block_group)
		return -ENOENT;
	btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
	length = block_group->length;
	btrfs_put_block_group(block_group);

	/*
	 * On a zoned file system, discard the whole block group, this will
	 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
	 * resetting the zone fails, don't treat it as a fatal problem from the
	 * filesystem's point of view.
	 */
	if (btrfs_is_zoned(fs_info)) {
		ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
		if (ret)
			btrfs_info(fs_info,
				   "failed to reset zone %llu after relocation",
				   chunk_offset);
	}

	trans = btrfs_start_trans_remove_block_group(root->fs_info,
						     chunk_offset);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_handle_fs_error(root->fs_info, ret, NULL);
		return ret;
	}

	/*
	 * step two, delete the device extents and the
	 * chunk tree entries
	 */
	ret = btrfs_remove_chunk(trans, chunk_offset);
	btrfs_end_transaction(trans);
	return ret;
}
/*
 * Walk the chunk tree backwards and relocate every SYSTEM chunk.
 * Chunks that fail with -ENOSPC are retried once after the others were
 * moved; a second failure returns -ENOSPC.
 */
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_type;
	bool retried = false;
	int failed = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

again:
	/* Start from the highest possible chunk key and iterate backwards. */
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		/*
		 * reclaim_bgs_lock must be taken before the tree search; see
		 * the comment at btrfs_relocate_chunk().
		 */
		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto error;
		}
		BUG_ON(ret == 0); /* Corruption */

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
		/* Only drop the lock here when the loop is about to exit. */
		if (ret)
			mutex_unlock(&fs_info->reclaim_bgs_lock);
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
		btrfs_release_path(path);

		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
			if (ret == -ENOSPC)
				failed++;
			else
				BUG_ON(ret);
		}
		mutex_unlock(&fs_info->reclaim_bgs_lock);

		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	ret = 0;
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (WARN_ON(failed && retried)) {
		ret = -ENOSPC;
	}
error:
	btrfs_free_path(path);
	return ret;
}
Zheng break; 32862b82032cSYan Zheng key.offset = found_key.offset - 1; 32872b82032cSYan Zheng } 32882b82032cSYan Zheng ret = 0; 3289ba1bf481SJosef Bacik if (failed && !retried) { 3290ba1bf481SJosef Bacik failed = 0; 3291ba1bf481SJosef Bacik retried = true; 3292ba1bf481SJosef Bacik goto again; 3293fae7f21cSDulshani Gunawardhana } else if (WARN_ON(failed && retried)) { 3294ba1bf481SJosef Bacik ret = -ENOSPC; 3295ba1bf481SJosef Bacik } 32962b82032cSYan Zheng error: 32972b82032cSYan Zheng btrfs_free_path(path); 32982b82032cSYan Zheng return ret; 32992b82032cSYan Zheng } 33002b82032cSYan Zheng 3301a6f93c71SLiu Bo /* 3302a6f93c71SLiu Bo * return 1 : allocate a data chunk successfully, 3303a6f93c71SLiu Bo * return <0: errors during allocating a data chunk, 3304a6f93c71SLiu Bo * return 0 : no need to allocate a data chunk. 3305a6f93c71SLiu Bo */ 3306a6f93c71SLiu Bo static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 3307a6f93c71SLiu Bo u64 chunk_offset) 3308a6f93c71SLiu Bo { 330932da5386SDavid Sterba struct btrfs_block_group *cache; 3310a6f93c71SLiu Bo u64 bytes_used; 3311a6f93c71SLiu Bo u64 chunk_type; 3312a6f93c71SLiu Bo 3313a6f93c71SLiu Bo cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3314a6f93c71SLiu Bo ASSERT(cache); 3315a6f93c71SLiu Bo chunk_type = cache->flags; 3316a6f93c71SLiu Bo btrfs_put_block_group(cache); 3317a6f93c71SLiu Bo 33185ae21692SJohannes Thumshirn if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) 33195ae21692SJohannes Thumshirn return 0; 33205ae21692SJohannes Thumshirn 3321a6f93c71SLiu Bo spin_lock(&fs_info->data_sinfo->lock); 3322a6f93c71SLiu Bo bytes_used = fs_info->data_sinfo->bytes_used; 3323a6f93c71SLiu Bo spin_unlock(&fs_info->data_sinfo->lock); 3324a6f93c71SLiu Bo 3325a6f93c71SLiu Bo if (!bytes_used) { 3326a6f93c71SLiu Bo struct btrfs_trans_handle *trans; 3327a6f93c71SLiu Bo int ret; 3328a6f93c71SLiu Bo 3329a6f93c71SLiu Bo trans = btrfs_join_transaction(fs_info->tree_root); 3330a6f93c71SLiu Bo if (IS_ERR(trans)) 
3331a6f93c71SLiu Bo return PTR_ERR(trans); 3332a6f93c71SLiu Bo 33335ae21692SJohannes Thumshirn ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); 3334a6f93c71SLiu Bo btrfs_end_transaction(trans); 3335a6f93c71SLiu Bo if (ret < 0) 3336a6f93c71SLiu Bo return ret; 3337a6f93c71SLiu Bo return 1; 3338a6f93c71SLiu Bo } 33395ae21692SJohannes Thumshirn 3340a6f93c71SLiu Bo return 0; 3341a6f93c71SLiu Bo } 3342a6f93c71SLiu Bo 33436bccf3abSJeff Mahoney static int insert_balance_item(struct btrfs_fs_info *fs_info, 33440940ebf6SIlya Dryomov struct btrfs_balance_control *bctl) 33450940ebf6SIlya Dryomov { 33466bccf3abSJeff Mahoney struct btrfs_root *root = fs_info->tree_root; 33470940ebf6SIlya Dryomov struct btrfs_trans_handle *trans; 33480940ebf6SIlya Dryomov struct btrfs_balance_item *item; 33490940ebf6SIlya Dryomov struct btrfs_disk_balance_args disk_bargs; 33500940ebf6SIlya Dryomov struct btrfs_path *path; 33510940ebf6SIlya Dryomov struct extent_buffer *leaf; 33520940ebf6SIlya Dryomov struct btrfs_key key; 33530940ebf6SIlya Dryomov int ret, err; 33540940ebf6SIlya Dryomov 33550940ebf6SIlya Dryomov path = btrfs_alloc_path(); 33560940ebf6SIlya Dryomov if (!path) 33570940ebf6SIlya Dryomov return -ENOMEM; 33580940ebf6SIlya Dryomov 33590940ebf6SIlya Dryomov trans = btrfs_start_transaction(root, 0); 33600940ebf6SIlya Dryomov if (IS_ERR(trans)) { 33610940ebf6SIlya Dryomov btrfs_free_path(path); 33620940ebf6SIlya Dryomov return PTR_ERR(trans); 33630940ebf6SIlya Dryomov } 33640940ebf6SIlya Dryomov 33650940ebf6SIlya Dryomov key.objectid = BTRFS_BALANCE_OBJECTID; 3366c479cb4fSDavid Sterba key.type = BTRFS_TEMPORARY_ITEM_KEY; 33670940ebf6SIlya Dryomov key.offset = 0; 33680940ebf6SIlya Dryomov 33690940ebf6SIlya Dryomov ret = btrfs_insert_empty_item(trans, root, path, &key, 33700940ebf6SIlya Dryomov sizeof(*item)); 33710940ebf6SIlya Dryomov if (ret) 33720940ebf6SIlya Dryomov goto out; 33730940ebf6SIlya Dryomov 33740940ebf6SIlya Dryomov leaf = path->nodes[0]; 33750940ebf6SIlya 
Dryomov item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 33760940ebf6SIlya Dryomov 3377b159fa28SDavid Sterba memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 33780940ebf6SIlya Dryomov 33790940ebf6SIlya Dryomov btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 33800940ebf6SIlya Dryomov btrfs_set_balance_data(leaf, item, &disk_bargs); 33810940ebf6SIlya Dryomov btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 33820940ebf6SIlya Dryomov btrfs_set_balance_meta(leaf, item, &disk_bargs); 33830940ebf6SIlya Dryomov btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 33840940ebf6SIlya Dryomov btrfs_set_balance_sys(leaf, item, &disk_bargs); 33850940ebf6SIlya Dryomov 33860940ebf6SIlya Dryomov btrfs_set_balance_flags(leaf, item, bctl->flags); 33870940ebf6SIlya Dryomov 33880940ebf6SIlya Dryomov btrfs_mark_buffer_dirty(leaf); 33890940ebf6SIlya Dryomov out: 33900940ebf6SIlya Dryomov btrfs_free_path(path); 33913a45bb20SJeff Mahoney err = btrfs_commit_transaction(trans); 33920940ebf6SIlya Dryomov if (err && !ret) 33930940ebf6SIlya Dryomov ret = err; 33940940ebf6SIlya Dryomov return ret; 33950940ebf6SIlya Dryomov } 33960940ebf6SIlya Dryomov 33976bccf3abSJeff Mahoney static int del_balance_item(struct btrfs_fs_info *fs_info) 33980940ebf6SIlya Dryomov { 33996bccf3abSJeff Mahoney struct btrfs_root *root = fs_info->tree_root; 34000940ebf6SIlya Dryomov struct btrfs_trans_handle *trans; 34010940ebf6SIlya Dryomov struct btrfs_path *path; 34020940ebf6SIlya Dryomov struct btrfs_key key; 34030940ebf6SIlya Dryomov int ret, err; 34040940ebf6SIlya Dryomov 34050940ebf6SIlya Dryomov path = btrfs_alloc_path(); 34060940ebf6SIlya Dryomov if (!path) 34070940ebf6SIlya Dryomov return -ENOMEM; 34080940ebf6SIlya Dryomov 34093502a8c0SDavid Sterba trans = btrfs_start_transaction_fallback_global_rsv(root, 0); 34100940ebf6SIlya Dryomov if (IS_ERR(trans)) { 34110940ebf6SIlya Dryomov btrfs_free_path(path); 34120940ebf6SIlya Dryomov return 
PTR_ERR(trans); 34130940ebf6SIlya Dryomov } 34140940ebf6SIlya Dryomov 34150940ebf6SIlya Dryomov key.objectid = BTRFS_BALANCE_OBJECTID; 3416c479cb4fSDavid Sterba key.type = BTRFS_TEMPORARY_ITEM_KEY; 34170940ebf6SIlya Dryomov key.offset = 0; 34180940ebf6SIlya Dryomov 34190940ebf6SIlya Dryomov ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 34200940ebf6SIlya Dryomov if (ret < 0) 34210940ebf6SIlya Dryomov goto out; 34220940ebf6SIlya Dryomov if (ret > 0) { 34230940ebf6SIlya Dryomov ret = -ENOENT; 34240940ebf6SIlya Dryomov goto out; 34250940ebf6SIlya Dryomov } 34260940ebf6SIlya Dryomov 34270940ebf6SIlya Dryomov ret = btrfs_del_item(trans, root, path); 34280940ebf6SIlya Dryomov out: 34290940ebf6SIlya Dryomov btrfs_free_path(path); 34303a45bb20SJeff Mahoney err = btrfs_commit_transaction(trans); 34310940ebf6SIlya Dryomov if (err && !ret) 34320940ebf6SIlya Dryomov ret = err; 34330940ebf6SIlya Dryomov return ret; 34340940ebf6SIlya Dryomov } 34350940ebf6SIlya Dryomov 3436c9e9f97bSIlya Dryomov /* 343759641015SIlya Dryomov * This is a heuristic used to reduce the number of chunks balanced on 343859641015SIlya Dryomov * resume after balance was interrupted. 343959641015SIlya Dryomov */ 344059641015SIlya Dryomov static void update_balance_args(struct btrfs_balance_control *bctl) 344159641015SIlya Dryomov { 344259641015SIlya Dryomov /* 344359641015SIlya Dryomov * Turn on soft mode for chunk types that were being converted. 
344459641015SIlya Dryomov */ 344559641015SIlya Dryomov if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 344659641015SIlya Dryomov bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 344759641015SIlya Dryomov if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 344859641015SIlya Dryomov bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 344959641015SIlya Dryomov if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 345059641015SIlya Dryomov bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 345159641015SIlya Dryomov 345259641015SIlya Dryomov /* 345359641015SIlya Dryomov * Turn on usage filter if is not already used. The idea is 345459641015SIlya Dryomov * that chunks that we have already balanced should be 345559641015SIlya Dryomov * reasonably full. Don't do it for chunks that are being 345659641015SIlya Dryomov * converted - that will keep us from relocating unconverted 345759641015SIlya Dryomov * (albeit full) chunks. 345859641015SIlya Dryomov */ 345959641015SIlya Dryomov if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 3460bc309467SDavid Sterba !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 346159641015SIlya Dryomov !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 346259641015SIlya Dryomov bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 346359641015SIlya Dryomov bctl->data.usage = 90; 346459641015SIlya Dryomov } 346559641015SIlya Dryomov if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 3466bc309467SDavid Sterba !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 346759641015SIlya Dryomov !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 346859641015SIlya Dryomov bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 346959641015SIlya Dryomov bctl->sys.usage = 90; 347059641015SIlya Dryomov } 347159641015SIlya Dryomov if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 3472bc309467SDavid Sterba !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 347359641015SIlya Dryomov !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 347459641015SIlya Dryomov bctl->meta.flags |= 
BTRFS_BALANCE_ARGS_USAGE; 347559641015SIlya Dryomov bctl->meta.usage = 90; 347659641015SIlya Dryomov } 347759641015SIlya Dryomov } 347859641015SIlya Dryomov 347959641015SIlya Dryomov /* 3480149196a2SDavid Sterba * Clear the balance status in fs_info and delete the balance item from disk. 3481149196a2SDavid Sterba */ 3482149196a2SDavid Sterba static void reset_balance_state(struct btrfs_fs_info *fs_info) 3483c9e9f97bSIlya Dryomov { 3484c9e9f97bSIlya Dryomov struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3485149196a2SDavid Sterba int ret; 3486c9e9f97bSIlya Dryomov 3487c9e9f97bSIlya Dryomov BUG_ON(!fs_info->balance_ctl); 3488c9e9f97bSIlya Dryomov 3489c9e9f97bSIlya Dryomov spin_lock(&fs_info->balance_lock); 3490c9e9f97bSIlya Dryomov fs_info->balance_ctl = NULL; 3491c9e9f97bSIlya Dryomov spin_unlock(&fs_info->balance_lock); 3492c9e9f97bSIlya Dryomov 3493c9e9f97bSIlya Dryomov kfree(bctl); 3494149196a2SDavid Sterba ret = del_balance_item(fs_info); 3495149196a2SDavid Sterba if (ret) 3496149196a2SDavid Sterba btrfs_handle_fs_error(fs_info, ret, NULL); 3497c9e9f97bSIlya Dryomov } 3498c9e9f97bSIlya Dryomov 3499ed25e9b2SIlya Dryomov /* 3500ed25e9b2SIlya Dryomov * Balance filters. Return 1 if chunk should be filtered out 3501ed25e9b2SIlya Dryomov * (should not be balanced). 
3502ed25e9b2SIlya Dryomov */ 3503899c81eaSIlya Dryomov static int chunk_profiles_filter(u64 chunk_type, 3504ed25e9b2SIlya Dryomov struct btrfs_balance_args *bargs) 3505ed25e9b2SIlya Dryomov { 3506899c81eaSIlya Dryomov chunk_type = chunk_to_extended(chunk_type) & 3507899c81eaSIlya Dryomov BTRFS_EXTENDED_PROFILE_MASK; 3508ed25e9b2SIlya Dryomov 3509899c81eaSIlya Dryomov if (bargs->profiles & chunk_type) 3510ed25e9b2SIlya Dryomov return 0; 3511ed25e9b2SIlya Dryomov 3512ed25e9b2SIlya Dryomov return 1; 3513ed25e9b2SIlya Dryomov } 3514ed25e9b2SIlya Dryomov 3515dba72cb3SHolger Hoffstätte static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 35165ce5b3c0SIlya Dryomov struct btrfs_balance_args *bargs) 35175ce5b3c0SIlya Dryomov { 351832da5386SDavid Sterba struct btrfs_block_group *cache; 3519bc309467SDavid Sterba u64 chunk_used; 3520bc309467SDavid Sterba u64 user_thresh_min; 3521bc309467SDavid Sterba u64 user_thresh_max; 3522bc309467SDavid Sterba int ret = 1; 3523bc309467SDavid Sterba 3524bc309467SDavid Sterba cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3525bf38be65SDavid Sterba chunk_used = cache->used; 3526bc309467SDavid Sterba 3527bc309467SDavid Sterba if (bargs->usage_min == 0) 3528bc309467SDavid Sterba user_thresh_min = 0; 3529bc309467SDavid Sterba else 3530b3470b5dSDavid Sterba user_thresh_min = div_factor_fine(cache->length, 3531bc309467SDavid Sterba bargs->usage_min); 3532bc309467SDavid Sterba 3533bc309467SDavid Sterba if (bargs->usage_max == 0) 3534bc309467SDavid Sterba user_thresh_max = 1; 3535bc309467SDavid Sterba else if (bargs->usage_max > 100) 3536b3470b5dSDavid Sterba user_thresh_max = cache->length; 3537bc309467SDavid Sterba else 3538b3470b5dSDavid Sterba user_thresh_max = div_factor_fine(cache->length, 3539bc309467SDavid Sterba bargs->usage_max); 3540bc309467SDavid Sterba 3541bc309467SDavid Sterba if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3542bc309467SDavid Sterba ret = 0; 3543bc309467SDavid 
Sterba 3544bc309467SDavid Sterba btrfs_put_block_group(cache); 3545bc309467SDavid Sterba return ret; 3546bc309467SDavid Sterba } 3547bc309467SDavid Sterba 3548dba72cb3SHolger Hoffstätte static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3549bc309467SDavid Sterba u64 chunk_offset, struct btrfs_balance_args *bargs) 3550bc309467SDavid Sterba { 355132da5386SDavid Sterba struct btrfs_block_group *cache; 35525ce5b3c0SIlya Dryomov u64 chunk_used, user_thresh; 35535ce5b3c0SIlya Dryomov int ret = 1; 35545ce5b3c0SIlya Dryomov 35555ce5b3c0SIlya Dryomov cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3556bf38be65SDavid Sterba chunk_used = cache->used; 35575ce5b3c0SIlya Dryomov 3558bc309467SDavid Sterba if (bargs->usage_min == 0) 35593e39cea6SIlya Dryomov user_thresh = 1; 3560a105bb88SIlya Dryomov else if (bargs->usage > 100) 3561b3470b5dSDavid Sterba user_thresh = cache->length; 3562a105bb88SIlya Dryomov else 3563b3470b5dSDavid Sterba user_thresh = div_factor_fine(cache->length, bargs->usage); 3564a105bb88SIlya Dryomov 35655ce5b3c0SIlya Dryomov if (chunk_used < user_thresh) 35665ce5b3c0SIlya Dryomov ret = 0; 35675ce5b3c0SIlya Dryomov 35685ce5b3c0SIlya Dryomov btrfs_put_block_group(cache); 35695ce5b3c0SIlya Dryomov return ret; 35705ce5b3c0SIlya Dryomov } 35715ce5b3c0SIlya Dryomov 3572409d404bSIlya Dryomov static int chunk_devid_filter(struct extent_buffer *leaf, 3573409d404bSIlya Dryomov struct btrfs_chunk *chunk, 3574409d404bSIlya Dryomov struct btrfs_balance_args *bargs) 3575409d404bSIlya Dryomov { 3576409d404bSIlya Dryomov struct btrfs_stripe *stripe; 3577409d404bSIlya Dryomov int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3578409d404bSIlya Dryomov int i; 3579409d404bSIlya Dryomov 3580409d404bSIlya Dryomov for (i = 0; i < num_stripes; i++) { 3581409d404bSIlya Dryomov stripe = btrfs_stripe_nr(chunk, i); 3582409d404bSIlya Dryomov if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3583409d404bSIlya Dryomov return 0; 3584409d404bSIlya Dryomov } 
3585409d404bSIlya Dryomov 3586409d404bSIlya Dryomov return 1; 3587409d404bSIlya Dryomov } 3588409d404bSIlya Dryomov 3589946c9256SDavid Sterba static u64 calc_data_stripes(u64 type, int num_stripes) 3590946c9256SDavid Sterba { 3591946c9256SDavid Sterba const int index = btrfs_bg_flags_to_raid_index(type); 3592946c9256SDavid Sterba const int ncopies = btrfs_raid_array[index].ncopies; 3593946c9256SDavid Sterba const int nparity = btrfs_raid_array[index].nparity; 3594946c9256SDavid Sterba 3595d58ede8dSDavid Sterba return (num_stripes - nparity) / ncopies; 3596946c9256SDavid Sterba } 3597946c9256SDavid Sterba 359894e60d5aSIlya Dryomov /* [pstart, pend) */ 359994e60d5aSIlya Dryomov static int chunk_drange_filter(struct extent_buffer *leaf, 360094e60d5aSIlya Dryomov struct btrfs_chunk *chunk, 360194e60d5aSIlya Dryomov struct btrfs_balance_args *bargs) 360294e60d5aSIlya Dryomov { 360394e60d5aSIlya Dryomov struct btrfs_stripe *stripe; 360494e60d5aSIlya Dryomov int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 360594e60d5aSIlya Dryomov u64 stripe_offset; 360694e60d5aSIlya Dryomov u64 stripe_length; 3607946c9256SDavid Sterba u64 type; 360894e60d5aSIlya Dryomov int factor; 360994e60d5aSIlya Dryomov int i; 361094e60d5aSIlya Dryomov 361194e60d5aSIlya Dryomov if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 361294e60d5aSIlya Dryomov return 0; 361394e60d5aSIlya Dryomov 3614946c9256SDavid Sterba type = btrfs_chunk_type(leaf, chunk); 3615946c9256SDavid Sterba factor = calc_data_stripes(type, num_stripes); 361694e60d5aSIlya Dryomov 361794e60d5aSIlya Dryomov for (i = 0; i < num_stripes; i++) { 361894e60d5aSIlya Dryomov stripe = btrfs_stripe_nr(chunk, i); 361994e60d5aSIlya Dryomov if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 362094e60d5aSIlya Dryomov continue; 362194e60d5aSIlya Dryomov 362294e60d5aSIlya Dryomov stripe_offset = btrfs_stripe_offset(leaf, stripe); 362394e60d5aSIlya Dryomov stripe_length = btrfs_chunk_length(leaf, chunk); 3624b8b93addSDavid Sterba 
stripe_length = div_u64(stripe_length, factor); 362594e60d5aSIlya Dryomov 362694e60d5aSIlya Dryomov if (stripe_offset < bargs->pend && 362794e60d5aSIlya Dryomov stripe_offset + stripe_length > bargs->pstart) 362894e60d5aSIlya Dryomov return 0; 362994e60d5aSIlya Dryomov } 363094e60d5aSIlya Dryomov 363194e60d5aSIlya Dryomov return 1; 363294e60d5aSIlya Dryomov } 363394e60d5aSIlya Dryomov 3634ea67176aSIlya Dryomov /* [vstart, vend) */ 3635ea67176aSIlya Dryomov static int chunk_vrange_filter(struct extent_buffer *leaf, 3636ea67176aSIlya Dryomov struct btrfs_chunk *chunk, 3637ea67176aSIlya Dryomov u64 chunk_offset, 3638ea67176aSIlya Dryomov struct btrfs_balance_args *bargs) 3639ea67176aSIlya Dryomov { 3640ea67176aSIlya Dryomov if (chunk_offset < bargs->vend && 3641ea67176aSIlya Dryomov chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3642ea67176aSIlya Dryomov /* at least part of the chunk is inside this vrange */ 3643ea67176aSIlya Dryomov return 0; 3644ea67176aSIlya Dryomov 3645ea67176aSIlya Dryomov return 1; 3646ea67176aSIlya Dryomov } 3647ea67176aSIlya Dryomov 3648dee32d0aSGabríel Arthúr Pétursson static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3649dee32d0aSGabríel Arthúr Pétursson struct btrfs_chunk *chunk, 3650dee32d0aSGabríel Arthúr Pétursson struct btrfs_balance_args *bargs) 3651dee32d0aSGabríel Arthúr Pétursson { 3652dee32d0aSGabríel Arthúr Pétursson int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3653dee32d0aSGabríel Arthúr Pétursson 3654dee32d0aSGabríel Arthúr Pétursson if (bargs->stripes_min <= num_stripes 3655dee32d0aSGabríel Arthúr Pétursson && num_stripes <= bargs->stripes_max) 3656dee32d0aSGabríel Arthúr Pétursson return 0; 3657dee32d0aSGabríel Arthúr Pétursson 3658dee32d0aSGabríel Arthúr Pétursson return 1; 3659dee32d0aSGabríel Arthúr Pétursson } 3660dee32d0aSGabríel Arthúr Pétursson 3661899c81eaSIlya Dryomov static int chunk_soft_convert_filter(u64 chunk_type, 3662cfa4c961SIlya Dryomov struct btrfs_balance_args 
*bargs) 3663cfa4c961SIlya Dryomov { 3664cfa4c961SIlya Dryomov if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3665cfa4c961SIlya Dryomov return 0; 3666cfa4c961SIlya Dryomov 3667899c81eaSIlya Dryomov chunk_type = chunk_to_extended(chunk_type) & 3668899c81eaSIlya Dryomov BTRFS_EXTENDED_PROFILE_MASK; 3669cfa4c961SIlya Dryomov 3670899c81eaSIlya Dryomov if (bargs->target == chunk_type) 3671cfa4c961SIlya Dryomov return 1; 3672cfa4c961SIlya Dryomov 3673cfa4c961SIlya Dryomov return 0; 3674cfa4c961SIlya Dryomov } 3675cfa4c961SIlya Dryomov 36766ec0896cSDavid Sterba static int should_balance_chunk(struct extent_buffer *leaf, 3677f43ffb60SIlya Dryomov struct btrfs_chunk *chunk, u64 chunk_offset) 3678f43ffb60SIlya Dryomov { 36796ec0896cSDavid Sterba struct btrfs_fs_info *fs_info = leaf->fs_info; 36800b246afaSJeff Mahoney struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3681f43ffb60SIlya Dryomov struct btrfs_balance_args *bargs = NULL; 3682f43ffb60SIlya Dryomov u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3683f43ffb60SIlya Dryomov 3684f43ffb60SIlya Dryomov /* type filter */ 3685f43ffb60SIlya Dryomov if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3686f43ffb60SIlya Dryomov (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3687f43ffb60SIlya Dryomov return 0; 3688f43ffb60SIlya Dryomov } 3689f43ffb60SIlya Dryomov 3690f43ffb60SIlya Dryomov if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3691f43ffb60SIlya Dryomov bargs = &bctl->data; 3692f43ffb60SIlya Dryomov else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3693f43ffb60SIlya Dryomov bargs = &bctl->sys; 3694f43ffb60SIlya Dryomov else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3695f43ffb60SIlya Dryomov bargs = &bctl->meta; 3696f43ffb60SIlya Dryomov 3697ed25e9b2SIlya Dryomov /* profiles filter */ 3698ed25e9b2SIlya Dryomov if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3699ed25e9b2SIlya Dryomov chunk_profiles_filter(chunk_type, bargs)) { 3700ed25e9b2SIlya Dryomov return 0; 3701ed25e9b2SIlya Dryomov } 3702ed25e9b2SIlya Dryomov 
37035ce5b3c0SIlya Dryomov /* usage filter */ 37045ce5b3c0SIlya Dryomov if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 37050b246afaSJeff Mahoney chunk_usage_filter(fs_info, chunk_offset, bargs)) { 37065ce5b3c0SIlya Dryomov return 0; 3707bc309467SDavid Sterba } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 37080b246afaSJeff Mahoney chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3709bc309467SDavid Sterba return 0; 37105ce5b3c0SIlya Dryomov } 37115ce5b3c0SIlya Dryomov 3712409d404bSIlya Dryomov /* devid filter */ 3713409d404bSIlya Dryomov if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3714409d404bSIlya Dryomov chunk_devid_filter(leaf, chunk, bargs)) { 3715409d404bSIlya Dryomov return 0; 3716409d404bSIlya Dryomov } 3717409d404bSIlya Dryomov 371894e60d5aSIlya Dryomov /* drange filter, makes sense only with devid filter */ 371994e60d5aSIlya Dryomov if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3720e4ff5fb5SNikolay Borisov chunk_drange_filter(leaf, chunk, bargs)) { 372194e60d5aSIlya Dryomov return 0; 372294e60d5aSIlya Dryomov } 372394e60d5aSIlya Dryomov 3724ea67176aSIlya Dryomov /* vrange filter */ 3725ea67176aSIlya Dryomov if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3726ea67176aSIlya Dryomov chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3727ea67176aSIlya Dryomov return 0; 3728ea67176aSIlya Dryomov } 3729ea67176aSIlya Dryomov 3730dee32d0aSGabríel Arthúr Pétursson /* stripes filter */ 3731dee32d0aSGabríel Arthúr Pétursson if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3732dee32d0aSGabríel Arthúr Pétursson chunk_stripes_range_filter(leaf, chunk, bargs)) { 3733dee32d0aSGabríel Arthúr Pétursson return 0; 3734dee32d0aSGabríel Arthúr Pétursson } 3735dee32d0aSGabríel Arthúr Pétursson 3736cfa4c961SIlya Dryomov /* soft profile changing mode */ 3737cfa4c961SIlya Dryomov if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3738cfa4c961SIlya Dryomov chunk_soft_convert_filter(chunk_type, bargs)) { 3739cfa4c961SIlya Dryomov return 0; 
3740cfa4c961SIlya Dryomov } 3741cfa4c961SIlya Dryomov 37427d824b6fSDavid Sterba /* 37437d824b6fSDavid Sterba * limited by count, must be the last filter 37447d824b6fSDavid Sterba */ 37457d824b6fSDavid Sterba if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 37467d824b6fSDavid Sterba if (bargs->limit == 0) 37477d824b6fSDavid Sterba return 0; 37487d824b6fSDavid Sterba else 37497d824b6fSDavid Sterba bargs->limit--; 375012907fc7SDavid Sterba } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 375112907fc7SDavid Sterba /* 375212907fc7SDavid Sterba * Same logic as the 'limit' filter; the minimum cannot be 375301327610SNicholas D Steeves * determined here because we do not have the global information 375412907fc7SDavid Sterba * about the count of all chunks that satisfy the filters. 375512907fc7SDavid Sterba */ 375612907fc7SDavid Sterba if (bargs->limit_max == 0) 375712907fc7SDavid Sterba return 0; 375812907fc7SDavid Sterba else 375912907fc7SDavid Sterba bargs->limit_max--; 37607d824b6fSDavid Sterba } 37617d824b6fSDavid Sterba 3762f43ffb60SIlya Dryomov return 1; 3763f43ffb60SIlya Dryomov } 3764f43ffb60SIlya Dryomov 3765c9e9f97bSIlya Dryomov static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3766ec44a35cSChris Mason { 376719a39dceSIlya Dryomov struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3768c9e9f97bSIlya Dryomov struct btrfs_root *chunk_root = fs_info->chunk_root; 376912907fc7SDavid Sterba u64 chunk_type; 3770f43ffb60SIlya Dryomov struct btrfs_chunk *chunk; 37715a488b9dSLiu Bo struct btrfs_path *path = NULL; 3772ec44a35cSChris Mason struct btrfs_key key; 3773ec44a35cSChris Mason struct btrfs_key found_key; 3774f43ffb60SIlya Dryomov struct extent_buffer *leaf; 3775f43ffb60SIlya Dryomov int slot; 3776c9e9f97bSIlya Dryomov int ret; 3777c9e9f97bSIlya Dryomov int enospc_errors = 0; 377819a39dceSIlya Dryomov bool counting = true; 377912907fc7SDavid Sterba /* The single value limit and min/max limits use the same bytes in the */ 37807d824b6fSDavid 
Sterba u64 limit_data = bctl->data.limit; 37817d824b6fSDavid Sterba u64 limit_meta = bctl->meta.limit; 37827d824b6fSDavid Sterba u64 limit_sys = bctl->sys.limit; 378312907fc7SDavid Sterba u32 count_data = 0; 378412907fc7SDavid Sterba u32 count_meta = 0; 378512907fc7SDavid Sterba u32 count_sys = 0; 37862c9fe835SZhao Lei int chunk_reserved = 0; 3787ec44a35cSChris Mason 3788ec44a35cSChris Mason path = btrfs_alloc_path(); 378917e9f796SMark Fasheh if (!path) { 379017e9f796SMark Fasheh ret = -ENOMEM; 379117e9f796SMark Fasheh goto error; 379217e9f796SMark Fasheh } 379319a39dceSIlya Dryomov 379419a39dceSIlya Dryomov /* zero out stat counters */ 379519a39dceSIlya Dryomov spin_lock(&fs_info->balance_lock); 379619a39dceSIlya Dryomov memset(&bctl->stat, 0, sizeof(bctl->stat)); 379719a39dceSIlya Dryomov spin_unlock(&fs_info->balance_lock); 379819a39dceSIlya Dryomov again: 37997d824b6fSDavid Sterba if (!counting) { 380012907fc7SDavid Sterba /* 380112907fc7SDavid Sterba * The single value limit and min/max limits use the same bytes 380212907fc7SDavid Sterba * in the 380312907fc7SDavid Sterba */ 38047d824b6fSDavid Sterba bctl->data.limit = limit_data; 38057d824b6fSDavid Sterba bctl->meta.limit = limit_meta; 38067d824b6fSDavid Sterba bctl->sys.limit = limit_sys; 38077d824b6fSDavid Sterba } 3808ec44a35cSChris Mason key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3809ec44a35cSChris Mason key.offset = (u64)-1; 3810ec44a35cSChris Mason key.type = BTRFS_CHUNK_ITEM_KEY; 3811ec44a35cSChris Mason 3812ec44a35cSChris Mason while (1) { 381319a39dceSIlya Dryomov if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3814a7e99c69SIlya Dryomov atomic_read(&fs_info->balance_cancel_req)) { 3815837d5b6eSIlya Dryomov ret = -ECANCELED; 3816837d5b6eSIlya Dryomov goto error; 3817837d5b6eSIlya Dryomov } 3818837d5b6eSIlya Dryomov 3819f3372065SJohannes Thumshirn mutex_lock(&fs_info->reclaim_bgs_lock); 3820ec44a35cSChris Mason ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 
382167c5e7d4SFilipe Manana if (ret < 0) { 3822f3372065SJohannes Thumshirn mutex_unlock(&fs_info->reclaim_bgs_lock); 3823ec44a35cSChris Mason goto error; 382467c5e7d4SFilipe Manana } 3825ec44a35cSChris Mason 3826ec44a35cSChris Mason /* 3827ec44a35cSChris Mason * this shouldn't happen, it means the last relocate 3828ec44a35cSChris Mason * failed 3829ec44a35cSChris Mason */ 3830ec44a35cSChris Mason if (ret == 0) 3831c9e9f97bSIlya Dryomov BUG(); /* FIXME break ? */ 3832ec44a35cSChris Mason 3833ec44a35cSChris Mason ret = btrfs_previous_item(chunk_root, path, 0, 3834ec44a35cSChris Mason BTRFS_CHUNK_ITEM_KEY); 3835c9e9f97bSIlya Dryomov if (ret) { 3836f3372065SJohannes Thumshirn mutex_unlock(&fs_info->reclaim_bgs_lock); 3837c9e9f97bSIlya Dryomov ret = 0; 3838ec44a35cSChris Mason break; 3839c9e9f97bSIlya Dryomov } 38407d9eb12cSChris Mason 3841f43ffb60SIlya Dryomov leaf = path->nodes[0]; 3842f43ffb60SIlya Dryomov slot = path->slots[0]; 3843f43ffb60SIlya Dryomov btrfs_item_key_to_cpu(leaf, &found_key, slot); 3844f43ffb60SIlya Dryomov 384567c5e7d4SFilipe Manana if (found_key.objectid != key.objectid) { 3846f3372065SJohannes Thumshirn mutex_unlock(&fs_info->reclaim_bgs_lock); 3847ec44a35cSChris Mason break; 384867c5e7d4SFilipe Manana } 38497d9eb12cSChris Mason 3850f43ffb60SIlya Dryomov chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 385112907fc7SDavid Sterba chunk_type = btrfs_chunk_type(leaf, chunk); 3852f43ffb60SIlya Dryomov 385319a39dceSIlya Dryomov if (!counting) { 385419a39dceSIlya Dryomov spin_lock(&fs_info->balance_lock); 385519a39dceSIlya Dryomov bctl->stat.considered++; 385619a39dceSIlya Dryomov spin_unlock(&fs_info->balance_lock); 385719a39dceSIlya Dryomov } 385819a39dceSIlya Dryomov 38596ec0896cSDavid Sterba ret = should_balance_chunk(leaf, chunk, found_key.offset); 38602c9fe835SZhao Lei 3861b3b4aa74SDavid Sterba btrfs_release_path(path); 386267c5e7d4SFilipe Manana if (!ret) { 3863f3372065SJohannes Thumshirn mutex_unlock(&fs_info->reclaim_bgs_lock); 
3864f43ffb60SIlya Dryomov goto loop; 386567c5e7d4SFilipe Manana } 3866f43ffb60SIlya Dryomov 386719a39dceSIlya Dryomov if (counting) { 3868f3372065SJohannes Thumshirn mutex_unlock(&fs_info->reclaim_bgs_lock); 386919a39dceSIlya Dryomov spin_lock(&fs_info->balance_lock); 387019a39dceSIlya Dryomov bctl->stat.expected++; 387119a39dceSIlya Dryomov spin_unlock(&fs_info->balance_lock); 387212907fc7SDavid Sterba 387312907fc7SDavid Sterba if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 387412907fc7SDavid Sterba count_data++; 387512907fc7SDavid Sterba else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 387612907fc7SDavid Sterba count_sys++; 387712907fc7SDavid Sterba else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 387812907fc7SDavid Sterba count_meta++; 387912907fc7SDavid Sterba 388012907fc7SDavid Sterba goto loop; 388112907fc7SDavid Sterba } 388212907fc7SDavid Sterba 388312907fc7SDavid Sterba /* 388412907fc7SDavid Sterba * Apply limit_min filter, no need to check if the LIMITS 388512907fc7SDavid Sterba * filter is used, limit_min is 0 by default 388612907fc7SDavid Sterba */ 388712907fc7SDavid Sterba if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 388812907fc7SDavid Sterba count_data < bctl->data.limit_min) 388912907fc7SDavid Sterba || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 389012907fc7SDavid Sterba count_meta < bctl->meta.limit_min) 389112907fc7SDavid Sterba || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 389212907fc7SDavid Sterba count_sys < bctl->sys.limit_min)) { 3893f3372065SJohannes Thumshirn mutex_unlock(&fs_info->reclaim_bgs_lock); 389419a39dceSIlya Dryomov goto loop; 389519a39dceSIlya Dryomov } 389619a39dceSIlya Dryomov 3897a6f93c71SLiu Bo if (!chunk_reserved) { 3898a6f93c71SLiu Bo /* 3899a6f93c71SLiu Bo * We may be relocating the only data chunk we have, 3900a6f93c71SLiu Bo * which could potentially end up with losing data's 3901a6f93c71SLiu Bo * raid profile, so lets allocate an empty one in 3902a6f93c71SLiu Bo * advance. 
3903a6f93c71SLiu Bo */ 3904a6f93c71SLiu Bo ret = btrfs_may_alloc_data_chunk(fs_info, 3905a6f93c71SLiu Bo found_key.offset); 39062c9fe835SZhao Lei if (ret < 0) { 3907f3372065SJohannes Thumshirn mutex_unlock(&fs_info->reclaim_bgs_lock); 39082c9fe835SZhao Lei goto error; 3909a6f93c71SLiu Bo } else if (ret == 1) { 39102c9fe835SZhao Lei chunk_reserved = 1; 39112c9fe835SZhao Lei } 3912a6f93c71SLiu Bo } 39132c9fe835SZhao Lei 39145b4aacefSJeff Mahoney ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3915f3372065SJohannes Thumshirn mutex_unlock(&fs_info->reclaim_bgs_lock); 391619a39dceSIlya Dryomov if (ret == -ENOSPC) { 3917c9e9f97bSIlya Dryomov enospc_errors++; 3918eede2bf3SOmar Sandoval } else if (ret == -ETXTBSY) { 3919eede2bf3SOmar Sandoval btrfs_info(fs_info, 3920eede2bf3SOmar Sandoval "skipping relocation of block group %llu due to active swapfile", 3921eede2bf3SOmar Sandoval found_key.offset); 3922eede2bf3SOmar Sandoval ret = 0; 3923eede2bf3SOmar Sandoval } else if (ret) { 3924eede2bf3SOmar Sandoval goto error; 392519a39dceSIlya Dryomov } else { 392619a39dceSIlya Dryomov spin_lock(&fs_info->balance_lock); 392719a39dceSIlya Dryomov bctl->stat.completed++; 392819a39dceSIlya Dryomov spin_unlock(&fs_info->balance_lock); 392919a39dceSIlya Dryomov } 3930f43ffb60SIlya Dryomov loop: 3931795a3321SIlya Dryomov if (found_key.offset == 0) 3932795a3321SIlya Dryomov break; 3933ba1bf481SJosef Bacik key.offset = found_key.offset - 1; 3934ec44a35cSChris Mason } 3935c9e9f97bSIlya Dryomov 393619a39dceSIlya Dryomov if (counting) { 393719a39dceSIlya Dryomov btrfs_release_path(path); 393819a39dceSIlya Dryomov counting = false; 393919a39dceSIlya Dryomov goto again; 394019a39dceSIlya Dryomov } 3941ec44a35cSChris Mason error: 3942ec44a35cSChris Mason btrfs_free_path(path); 3943c9e9f97bSIlya Dryomov if (enospc_errors) { 3944efe120a0SFrank Holton btrfs_info(fs_info, "%d enospc errors during balance", 3945c9e9f97bSIlya Dryomov enospc_errors); 3946c9e9f97bSIlya Dryomov if (!ret) 
3947c9e9f97bSIlya Dryomov ret = -ENOSPC; 3948c9e9f97bSIlya Dryomov } 3949c9e9f97bSIlya Dryomov 3950ec44a35cSChris Mason return ret; 3951ec44a35cSChris Mason } 3952ec44a35cSChris Mason 39530c460c0dSIlya Dryomov /** 39540c460c0dSIlya Dryomov * alloc_profile_is_valid - see if a given profile is valid and reduced 39550c460c0dSIlya Dryomov * @flags: profile to validate 39560c460c0dSIlya Dryomov * @extended: if true @flags is treated as an extended profile 39570c460c0dSIlya Dryomov */ 39580c460c0dSIlya Dryomov static int alloc_profile_is_valid(u64 flags, int extended) 39590c460c0dSIlya Dryomov { 39600c460c0dSIlya Dryomov u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : 39610c460c0dSIlya Dryomov BTRFS_BLOCK_GROUP_PROFILE_MASK); 39620c460c0dSIlya Dryomov 39630c460c0dSIlya Dryomov flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 39640c460c0dSIlya Dryomov 39650c460c0dSIlya Dryomov /* 1) check that all other bits are zeroed */ 39660c460c0dSIlya Dryomov if (flags & ~mask) 39670c460c0dSIlya Dryomov return 0; 39680c460c0dSIlya Dryomov 39690c460c0dSIlya Dryomov /* 2) see if profile is reduced */ 39700c460c0dSIlya Dryomov if (flags == 0) 39710c460c0dSIlya Dryomov return !extended; /* "0" is valid for usual profiles */ 39720c460c0dSIlya Dryomov 3973c1499166SDavid Sterba return has_single_bit_set(flags); 39740c460c0dSIlya Dryomov } 39750c460c0dSIlya Dryomov 3976837d5b6eSIlya Dryomov static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3977837d5b6eSIlya Dryomov { 3978a7e99c69SIlya Dryomov /* cancel requested || normal exit path */ 3979a7e99c69SIlya Dryomov return atomic_read(&fs_info->balance_cancel_req) || 3980a7e99c69SIlya Dryomov (atomic_read(&fs_info->balance_pause_req) == 0 && 3981a7e99c69SIlya Dryomov atomic_read(&fs_info->balance_cancel_req) == 0); 3982837d5b6eSIlya Dryomov } 3983837d5b6eSIlya Dryomov 39845ba366c3SDavid Sterba /* 39855ba366c3SDavid Sterba * Validate target profile against allowed profiles and return true if it's OK. 
39865ba366c3SDavid Sterba * Otherwise print the error message and return false. 39875ba366c3SDavid Sterba */ 39885ba366c3SDavid Sterba static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, 39895ba366c3SDavid Sterba const struct btrfs_balance_args *bargs, 39905ba366c3SDavid Sterba u64 allowed, const char *type) 3991bdcd3c97SAlexandru Moise { 39925ba366c3SDavid Sterba if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 39935ba366c3SDavid Sterba return true; 39945ba366c3SDavid Sterba 3995c8050b3bSQu Wenruo if (fs_info->sectorsize < PAGE_SIZE && 3996c8050b3bSQu Wenruo bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3997c8050b3bSQu Wenruo btrfs_err(fs_info, 3998c8050b3bSQu Wenruo "RAID56 is not yet supported for sectorsize %u with page size %lu", 3999c8050b3bSQu Wenruo fs_info->sectorsize, PAGE_SIZE); 4000c8050b3bSQu Wenruo return false; 4001c8050b3bSQu Wenruo } 40025ba366c3SDavid Sterba /* Profile is valid and does not have bits outside of the allowed set */ 40035ba366c3SDavid Sterba if (alloc_profile_is_valid(bargs->target, 1) && 40045ba366c3SDavid Sterba (bargs->target & ~allowed) == 0) 40055ba366c3SDavid Sterba return true; 40065ba366c3SDavid Sterba 40075ba366c3SDavid Sterba btrfs_err(fs_info, "balance: invalid convert %s profile %s", 40085ba366c3SDavid Sterba type, btrfs_bg_type_to_raid_name(bargs->target)); 40095ba366c3SDavid Sterba return false; 4010bdcd3c97SAlexandru Moise } 4011bdcd3c97SAlexandru Moise 4012c9e9f97bSIlya Dryomov /* 401356fc37d9SAnand Jain * Fill @buf with textual description of balance filter flags @bargs, up to 401456fc37d9SAnand Jain * @size_buf including the terminating null. The output may be trimmed if it 401556fc37d9SAnand Jain * does not fit into the provided buffer. 
 */
static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
				  u32 size_buf)
{
	int ret;
	/* Remaining space in @buf, counted down as text is appended */
	u32 size_bp = size_buf;
	/* Write cursor into @buf */
	char *bp = buf;
	u64 flags = bargs->flags;
	char tmp_buf[128] = {'\0'};

	if (!flags)
		return;

/*
 * Append a formatted fragment at the write cursor; on snprintf error or
 * truncation bail out to out_overflow, otherwise advance cursor/remaining.
 */
#define CHECK_APPEND_NOARG(a)						\
	do {								\
		ret = snprintf(bp, size_bp, (a));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

#define CHECK_APPEND_1ARG(a, v1)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

#define CHECK_APPEND_2ARG(a, v1, v2)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1), (v2));		\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

	/* Each recognized filter appends one "name=value," fragment. */
	if (flags & BTRFS_BALANCE_ARGS_CONVERT)
		CHECK_APPEND_1ARG("convert=%s,",
				  btrfs_bg_type_to_raid_name(bargs->target));

	if (flags & BTRFS_BALANCE_ARGS_SOFT)
		CHECK_APPEND_NOARG("soft,");

	if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
		btrfs_describe_block_groups(bargs->profiles, tmp_buf,
					    sizeof(tmp_buf));
		CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
	}

	if (flags & BTRFS_BALANCE_ARGS_USAGE)
		CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);

	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
		CHECK_APPEND_2ARG("usage=%u..%u,",
				  bargs->usage_min, bargs->usage_max);

	if (flags & BTRFS_BALANCE_ARGS_DEVID)
		CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);

	if (flags & BTRFS_BALANCE_ARGS_DRANGE)
		CHECK_APPEND_2ARG("drange=%llu..%llu,",
				  bargs->pstart, bargs->pend);

	if (flags & BTRFS_BALANCE_ARGS_VRANGE)
		CHECK_APPEND_2ARG("vrange=%llu..%llu,",
				  bargs->vstart, bargs->vend);

	if (flags & BTRFS_BALANCE_ARGS_LIMIT)
		CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);

	if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
		CHECK_APPEND_2ARG("limit=%u..%u,",
				  bargs->limit_min, bargs->limit_max);

	if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
		CHECK_APPEND_2ARG("stripes=%u..%u,",
				  bargs->stripes_min, bargs->stripes_max);

#undef CHECK_APPEND_2ARG
#undef CHECK_APPEND_1ARG
#undef CHECK_APPEND_NOARG

out_overflow:

	/*
	 * If anything was appended, drop the trailing separator; otherwise
	 * make sure the buffer reads as an empty string.
	 */
	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
	else
		buf[0] = '\0';
}

/*
 * Log a one-line, command-line-like description of the balance operation
 * that is being started or resumed, built from fs_info::balance_ctl.
 */
static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
{
	u32 size_buf = 1024;
	char tmp_buf[192] = {'\0'};
	char *buf;
	char *bp;
	u32 size_bp = size_buf;
	int ret;
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;

	/* Best effort only: silently skip the message if allocation fails */
	buf = kzalloc(size_buf, GFP_KERNEL);
	if (!buf)
		return;

	bp = buf;

/* Same append-or-bail helper as in describe_balance_args() */
#define CHECK_APPEND_1ARG(a, v1)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

	if (bctl->flags & BTRFS_BALANCE_FORCE)
		CHECK_APPEND_1ARG("%s", "-f ");

	if (bctl->flags & BTRFS_BALANCE_DATA) {
		describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-d%s ", tmp_buf);
	}

	if (bctl->flags & BTRFS_BALANCE_METADATA) {
		describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-m%s ", tmp_buf);
	}

	if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
		describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-s%s ", tmp_buf);
	}

#undef CHECK_APPEND_1ARG

out_overflow:

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
	btrfs_info(fs_info, "balance: %s %s",
		   (bctl->flags & BTRFS_BALANCE_RESUME) ?
		   "resume" : "start", buf);

	kfree(buf);
}

/*
 * Validate the requested balance args and run the balance operation.
 *
 * On success takes ownership of @bctl (stored in fs_info->balance_ctl);
 * on the error path @bctl is freed unless this is a resume of an existing
 * balance. Should be called with balance mutex held; it is temporarily
 * dropped around the actual relocation work in __btrfs_balance().
 */
int btrfs_balance(struct btrfs_fs_info *fs_info,
		  struct btrfs_balance_control *bctl,
		  struct btrfs_ioctl_balance_args *bargs)
{
	u64 meta_target, data_target;
	u64 allowed;
	int mixed = 0;
	int ret;
	u64 num_devices;
	unsigned seq;
	bool reducing_redundancy;
	int i;

	/* Refuse to start while unmounting, pausing or cancelling */
	if (btrfs_fs_closing(fs_info) ||
	    atomic_read(&fs_info->balance_pause_req) ||
	    btrfs_should_cancel_balance(fs_info)) {
		ret = -EINVAL;
		goto out;
	}

	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	/*
	 * In case of mixed groups both data and meta should be picked,
	 * and identical options should be given for both of them.
	 */
	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
	if (mixed && (bctl->flags & allowed)) {
		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
			btrfs_err(fs_info,
	"balance: mixed groups data and metadata options must be the same");
			ret = -EINVAL;
			goto out;
		}
	}

	/*
	 * rw_devices will not change at the moment, device add/delete/replace
	 * are exclusive
	 */
	num_devices = fs_info->fs_devices->rw_devices;

	/*
	 * SINGLE profile on-disk has no profile bit, but in-memory we have a
	 * special bit for it, to make it easier to distinguish. Thus we need
	 * to set it manually, or balance would refuse the profile.
	 */
	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
	/* Only allow conversion to profiles we have enough devices for */
	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
		if (num_devices >= btrfs_raid_array[i].devs_min)
			allowed |= btrfs_raid_array[i].bg_flag;

	if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
	    !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
	    !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * Allow to reduce metadata or system integrity only if force set for
	 * profiles with redundancy (copies, parity)
	 */
	allowed = 0;
	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
		if (btrfs_raid_array[i].ncopies >= 2 ||
		    btrfs_raid_array[i].tolerated_failures >= 1)
			allowed |= btrfs_raid_array[i].bg_flag;
	}
	/* Seqlock retry loop: get a consistent view of the avail_*_alloc_bits */
	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_system_alloc_bits & allowed) &&
		     !(bctl->sys.target & allowed)) ||
		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_metadata_alloc_bits & allowed) &&
		     !(bctl->meta.target & allowed)))
			reducing_redundancy = true;
		else
			reducing_redundancy = false;

		/* if we're not converting, the target field is uninitialized */
		meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
			bctl->meta.target : fs_info->avail_metadata_alloc_bits;
		data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
			bctl->data.target : fs_info->avail_data_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	if (reducing_redundancy) {
		if (bctl->flags & BTRFS_BALANCE_FORCE) {
			btrfs_info(fs_info,
			   "balance: force reducing metadata redundancy");
		} else {
			btrfs_err(fs_info,
	"balance: reduces metadata redundancy, use --force if you want this");
			ret = -EINVAL;
			goto out;
		}
	}

	/* Warn (but allow) metadata being less redundant than data */
	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
		btrfs_warn(fs_info,
	"balance: metadata profile %s has lower redundancy than data profile %s",
				btrfs_bg_type_to_raid_name(meta_target),
				btrfs_bg_type_to_raid_name(data_target));
	}

	/* Persist the balance item so a paused balance survives a remount */
	ret = insert_balance_item(fs_info, bctl);
	if (ret && ret != -EEXIST)
		goto out;

	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
		/* Fresh balance: the item must not have existed yet */
		BUG_ON(ret == -EEXIST);
		BUG_ON(fs_info->balance_ctl);
		spin_lock(&fs_info->balance_lock);
		fs_info->balance_ctl = bctl;
		spin_unlock(&fs_info->balance_lock);
	} else {
		/* Resume: the on-disk item must already exist */
		BUG_ON(ret != -EEXIST);
		spin_lock(&fs_info->balance_lock);
		update_balance_args(bctl);
		spin_unlock(&fs_info->balance_lock);
	}

	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
	set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
	describe_balance_start_or_resume(fs_info);
	/* Drop the mutex for the long-running part; pause/cancel need it */
	mutex_unlock(&fs_info->balance_mutex);

	ret = __btrfs_balance(fs_info);

	mutex_lock(&fs_info->balance_mutex);
	if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
		btrfs_info(fs_info, "balance: paused");
	/*
	 * Balance can be canceled by:
	 *
	 * - Regular cancel request
	 *   Then ret == -ECANCELED and balance_cancel_req > 0
	 *
	 * - Fatal signal to "btrfs" process
	 *   Either the signal caught by wait_reserve_ticket() and callers
	 *   got -EINTR, or caught by btrfs_should_cancel_balance() and
	 *   got -ECANCELED.
	 *   Either way, in this case balance_cancel_req = 0, and
	 *   ret == -EINTR or ret == -ECANCELED.
	 *
	 * So here we only check the return value to catch canceled balance.
	 */
	else if (ret == -ECANCELED || ret == -EINTR)
		btrfs_info(fs_info, "balance: canceled");
	else
		btrfs_info(fs_info, "balance: ended with status: %d", ret);

	clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);

	if (bargs) {
		memset(bargs, 0, sizeof(*bargs));
		btrfs_update_ioctl_balance_args(fs_info, bargs);
	}

	/* Tear down balance state unless it was merely paused */
	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
	    balance_need_close(fs_info)) {
		reset_balance_state(fs_info);
		btrfs_exclop_finish(fs_info);
	}

	wake_up(&fs_info->balance_wait_q);

	return ret;
out:
	if (bctl->flags & BTRFS_BALANCE_RESUME)
		reset_balance_state(fs_info);
	else
		kfree(bctl);
	btrfs_exclop_finish(fs_info);

	return ret;
}

/* Kthread entry point used to resume a paused balance asynchronously */
static int balance_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	int ret = 0;

	mutex_lock(&fs_info->balance_mutex);
	if (fs_info->balance_ctl)
		ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
	mutex_unlock(&fs_info->balance_mutex);

	return ret;
}

/*
 * Spawn a kthread to resume a previously paused balance, unless the
 * skip_balance mount option is set. Returns 0 or a kthread_run() error.
 */
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
	struct task_struct *tsk;

	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return 0;
	}
	mutex_unlock(&fs_info->balance_mutex);

	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
		btrfs_info(fs_info, "balance: resume skipped");
		return 0;
	}

	/*
	 * A ro->rw remount sequence should continue with the paused balance
	 * regardless of who pauses it, system or the user as of now, so set
	 * the resume flag.
	 */
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
	spin_unlock(&fs_info->balance_lock);

	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
	return PTR_ERR_OR_ZERO(tsk);
}

/*
 * At mount time, read the balance item from the tree root (if any) and
 * rebuild fs_info->balance_ctl so the paused balance can be resumed or
 * cancelled later. Returns 0 if there is no balance item.
 */
int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) { /* ret = -ENOENT; */
		/* No balance item: nothing to recover */
		ret = 0;
		goto out;
	}

	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
	if (!bctl) {
		ret = -ENOMEM;
		goto out;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	bctl->flags = btrfs_balance_flags(leaf, item);
	bctl->flags |= BTRFS_BALANCE_RESUME;

	/* Convert the on-disk per-type args to their in-memory form */
	btrfs_balance_data(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
	btrfs_balance_meta(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
	btrfs_balance_sys(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);

	/*
	 * This should never happen, as the paused balance state is recovered
	 * during mount without any chance of other exclusive ops to collide.
	 *
	 * This gives the exclusive op status to balance and keeps in paused
	 * state until user intervention (cancel or umount). If the ownership
	 * cannot be assigned, show a message but do not fail. The balance
	 * is in a paused state and must have fs_info::balance_ctl properly
	 * set up.
	 */
	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
		btrfs_warn(fs_info,
	"balance: cannot set exclusive op status, resume manually");

	btrfs_release_path(path);

	mutex_lock(&fs_info->balance_mutex);
	BUG_ON(fs_info->balance_ctl);
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = bctl;
	spin_unlock(&fs_info->balance_lock);
	mutex_unlock(&fs_info->balance_mutex);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Request a pause of a running balance and wait until it stops.
 * Returns -ENOTCONN if no balance is in progress.
 */
int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
{
	int ret = 0;

	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
		atomic_inc(&fs_info->balance_pause_req);
		/* Drop the mutex so the balance thread can make progress */
		mutex_unlock(&fs_info->balance_mutex);

		wait_event(fs_info->balance_wait_q,
			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));

		mutex_lock(&fs_info->balance_mutex);
		/* we are good with balance_ctl ripped off from under us */
		BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
		atomic_dec(&fs_info->balance_pause_req);
	} else {
		ret = -ENOTCONN;
	}

	mutex_unlock(&fs_info->balance_mutex);
	return ret;
}

/*
 * Cancel a running or paused balance, deleting the on-disk balance item.
 * Returns -ENOTCONN if no balance exists, -EROFS on a read-only mount.
 */
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	/*
	 * A paused balance with the item stored on disk can be resumed at
	 * mount time if the mount is read-write. Otherwise it's still paused
	 * and we must not allow cancelling as it deletes the item.
	 */
	if (sb_rdonly(fs_info->sb)) {
		mutex_unlock(&fs_info->balance_mutex);
		return -EROFS;
	}

	atomic_inc(&fs_info->balance_cancel_req);
	/*
	 * if we are running just wait and return, balance item is
	 * deleted in btrfs_balance in this case
	 */
	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
		mutex_unlock(&fs_info->balance_mutex);
		wait_event(fs_info->balance_wait_q,
			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
		mutex_lock(&fs_info->balance_mutex);
	} else {
		mutex_unlock(&fs_info->balance_mutex);
		/*
		 * Lock released to allow other waiters to continue, we'll
		 * reexamine the status again.
		 */
		mutex_lock(&fs_info->balance_mutex);

		/* Paused balance: tear down the state ourselves */
		if (fs_info->balance_ctl) {
			reset_balance_state(fs_info);
			btrfs_exclop_finish(fs_info);
			btrfs_info(fs_info, "balance: canceled");
		}
	}

	BUG_ON(fs_info->balance_ctl ||
	       test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
	atomic_dec(&fs_info->balance_cancel_req);
	mutex_unlock(&fs_info->balance_mutex);
	return 0;
}

/*
 * Kthread that walks all subvolume root items and makes sure each one's
 * uuid/received_uuid is registered in the uuid tree.
 */
int btrfs_uuid_scan_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_key key;
	struct btrfs_path *path = NULL;
	int ret = 0;
	struct extent_buffer *eb;
	int slot;
	struct btrfs_root_item root_item;
	u32 item_size;
	struct btrfs_trans_handle *trans = NULL;
	bool closing = false;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	/* Start the forward search at the very first possible root item */
	key.objectid = 0;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = 0;

	while (1) {
4573c94bec2cSJosef Bacik if (btrfs_fs_closing(fs_info)) { 4574c94bec2cSJosef Bacik closing = true; 4575c94bec2cSJosef Bacik break; 4576c94bec2cSJosef Bacik } 45777c829b72SAnand Jain ret = btrfs_search_forward(root, &key, path, 45787c829b72SAnand Jain BTRFS_OLDEST_GENERATION); 4579803b2f54SStefan Behrens if (ret) { 4580803b2f54SStefan Behrens if (ret > 0) 4581803b2f54SStefan Behrens ret = 0; 4582803b2f54SStefan Behrens break; 4583803b2f54SStefan Behrens } 4584803b2f54SStefan Behrens 4585803b2f54SStefan Behrens if (key.type != BTRFS_ROOT_ITEM_KEY || 4586803b2f54SStefan Behrens (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4587803b2f54SStefan Behrens key.objectid != BTRFS_FS_TREE_OBJECTID) || 4588803b2f54SStefan Behrens key.objectid > BTRFS_LAST_FREE_OBJECTID) 4589803b2f54SStefan Behrens goto skip; 4590803b2f54SStefan Behrens 4591803b2f54SStefan Behrens eb = path->nodes[0]; 4592803b2f54SStefan Behrens slot = path->slots[0]; 4593803b2f54SStefan Behrens item_size = btrfs_item_size_nr(eb, slot); 4594803b2f54SStefan Behrens if (item_size < sizeof(root_item)) 4595803b2f54SStefan Behrens goto skip; 4596803b2f54SStefan Behrens 4597803b2f54SStefan Behrens read_extent_buffer(eb, &root_item, 4598803b2f54SStefan Behrens btrfs_item_ptr_offset(eb, slot), 4599803b2f54SStefan Behrens (int)sizeof(root_item)); 4600803b2f54SStefan Behrens if (btrfs_root_refs(&root_item) == 0) 4601803b2f54SStefan Behrens goto skip; 4602f45388f3SFilipe David Borba Manana 4603f45388f3SFilipe David Borba Manana if (!btrfs_is_empty_uuid(root_item.uuid) || 4604f45388f3SFilipe David Borba Manana !btrfs_is_empty_uuid(root_item.received_uuid)) { 4605f45388f3SFilipe David Borba Manana if (trans) 4606f45388f3SFilipe David Borba Manana goto update_tree; 4607f45388f3SFilipe David Borba Manana 4608f45388f3SFilipe David Borba Manana btrfs_release_path(path); 4609803b2f54SStefan Behrens /* 4610803b2f54SStefan Behrens * 1 - subvol uuid item 4611803b2f54SStefan Behrens * 1 - received_subvol uuid item 4612803b2f54SStefan 
Behrens */ 4613803b2f54SStefan Behrens trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4614803b2f54SStefan Behrens if (IS_ERR(trans)) { 4615803b2f54SStefan Behrens ret = PTR_ERR(trans); 4616803b2f54SStefan Behrens break; 4617803b2f54SStefan Behrens } 4618f45388f3SFilipe David Borba Manana continue; 4619f45388f3SFilipe David Borba Manana } else { 4620f45388f3SFilipe David Borba Manana goto skip; 4621f45388f3SFilipe David Borba Manana } 4622f45388f3SFilipe David Borba Manana update_tree: 46239771a5cfSJosef Bacik btrfs_release_path(path); 4624f45388f3SFilipe David Borba Manana if (!btrfs_is_empty_uuid(root_item.uuid)) { 4625cdb345a8SLu Fengqi ret = btrfs_uuid_tree_add(trans, root_item.uuid, 4626803b2f54SStefan Behrens BTRFS_UUID_KEY_SUBVOL, 4627803b2f54SStefan Behrens key.objectid); 4628803b2f54SStefan Behrens if (ret < 0) { 4629efe120a0SFrank Holton btrfs_warn(fs_info, "uuid_tree_add failed %d", 4630803b2f54SStefan Behrens ret); 4631803b2f54SStefan Behrens break; 4632803b2f54SStefan Behrens } 4633803b2f54SStefan Behrens } 4634803b2f54SStefan Behrens 4635803b2f54SStefan Behrens if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4636cdb345a8SLu Fengqi ret = btrfs_uuid_tree_add(trans, 4637803b2f54SStefan Behrens root_item.received_uuid, 4638803b2f54SStefan Behrens BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4639803b2f54SStefan Behrens key.objectid); 4640803b2f54SStefan Behrens if (ret < 0) { 4641efe120a0SFrank Holton btrfs_warn(fs_info, "uuid_tree_add failed %d", 4642803b2f54SStefan Behrens ret); 4643803b2f54SStefan Behrens break; 4644803b2f54SStefan Behrens } 4645803b2f54SStefan Behrens } 4646803b2f54SStefan Behrens 4647f45388f3SFilipe David Borba Manana skip: 46489771a5cfSJosef Bacik btrfs_release_path(path); 4649803b2f54SStefan Behrens if (trans) { 46503a45bb20SJeff Mahoney ret = btrfs_end_transaction(trans); 4651f45388f3SFilipe David Borba Manana trans = NULL; 4652803b2f54SStefan Behrens if (ret) 4653803b2f54SStefan Behrens break; 4654803b2f54SStefan Behrens } 
4655803b2f54SStefan Behrens 4656803b2f54SStefan Behrens if (key.offset < (u64)-1) { 4657803b2f54SStefan Behrens key.offset++; 4658803b2f54SStefan Behrens } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4659803b2f54SStefan Behrens key.offset = 0; 4660803b2f54SStefan Behrens key.type = BTRFS_ROOT_ITEM_KEY; 4661803b2f54SStefan Behrens } else if (key.objectid < (u64)-1) { 4662803b2f54SStefan Behrens key.offset = 0; 4663803b2f54SStefan Behrens key.type = BTRFS_ROOT_ITEM_KEY; 4664803b2f54SStefan Behrens key.objectid++; 4665803b2f54SStefan Behrens } else { 4666803b2f54SStefan Behrens break; 4667803b2f54SStefan Behrens } 4668803b2f54SStefan Behrens cond_resched(); 4669803b2f54SStefan Behrens } 4670803b2f54SStefan Behrens 4671803b2f54SStefan Behrens out: 4672803b2f54SStefan Behrens btrfs_free_path(path); 4673f45388f3SFilipe David Borba Manana if (trans && !IS_ERR(trans)) 46743a45bb20SJeff Mahoney btrfs_end_transaction(trans); 4675803b2f54SStefan Behrens if (ret) 4676efe120a0SFrank Holton btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); 4677c94bec2cSJosef Bacik else if (!closing) 4678afcdd129SJosef Bacik set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4679803b2f54SStefan Behrens up(&fs_info->uuid_tree_rescan_sem); 4680803b2f54SStefan Behrens return 0; 4681803b2f54SStefan Behrens } 4682803b2f54SStefan Behrens 4683f7a81ea4SStefan Behrens int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4684f7a81ea4SStefan Behrens { 4685f7a81ea4SStefan Behrens struct btrfs_trans_handle *trans; 4686f7a81ea4SStefan Behrens struct btrfs_root *tree_root = fs_info->tree_root; 4687f7a81ea4SStefan Behrens struct btrfs_root *uuid_root; 4688803b2f54SStefan Behrens struct task_struct *task; 4689803b2f54SStefan Behrens int ret; 4690f7a81ea4SStefan Behrens 4691f7a81ea4SStefan Behrens /* 4692f7a81ea4SStefan Behrens * 1 - root node 4693f7a81ea4SStefan Behrens * 1 - root item 4694f7a81ea4SStefan Behrens */ 4695f7a81ea4SStefan Behrens trans = btrfs_start_transaction(tree_root, 2); 
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
	if (IS_ERR(uuid_root)) {
		ret = PTR_ERR(uuid_root);
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}

	fs_info->uuid_root = uuid_root;

	/* Commit so the new root is persistent before the scan starts. */
	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	down(&fs_info->uuid_tree_rescan_sem);
	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
	if (IS_ERR(task)) {
		/* fs_info->update_uuid_tree_gen remains 0 in all error case */
		btrfs_warn(fs_info, "failed to start uuid_scan task");
		up(&fs_info->uuid_tree_rescan_sem);
		return PTR_ERR(task);
	}

	return 0;
}

/*
 * shrinking a device means finding all of the device extents past
 * the new size, and then following the back refs to the chunks.
 * The chunk relocation code actually frees the device extent
 */
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 length;
	u64 chunk_offset;
	int ret;
	int slot;
	int failed = 0;
	bool retried = false;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	u64 old_total = btrfs_super_total_bytes(super_copy);
	u64 old_size = btrfs_device_get_total_bytes(device);
	u64 diff;
	u64 start;

	/* Both the new size and the shrunk-away delta are sector aligned. */
	new_size = round_down(new_size, fs_info->sectorsize);
	start = new_size;
	diff = round_down(old_size - new_size, fs_info->sectorsize);

	/* A replace target device must not be resized. */
	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return -EINVAL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_BACK;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	mutex_lock(&fs_info->chunk_mutex);

	/*
	 * Shrink the in-memory size first so no new allocations land beyond
	 * new_size; on failure this is rolled back at "done".
	 */
	btrfs_device_set_total_bytes(device, new_size);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		device->fs_devices->total_rw_bytes -= diff;
		atomic64_sub(diff, &fs_info->free_chunk_space);
	}

	/*
	 * Once the device's size has been set to the new size, ensure all
	 * in-memory chunks are synced to disk so that the loop below sees them
	 * and relocates them accordingly.
	 */
	if (contains_pending_extent(device, &start, diff)) {
		mutex_unlock(&fs_info->chunk_mutex);
		ret = btrfs_commit_transaction(trans);
		if (ret)
			goto done;
	} else {
		mutex_unlock(&fs_info->chunk_mutex);
		btrfs_end_transaction(trans);
	}

again:
	/* Walk device extents backwards from the end of the device. */
	key.objectid = device->devid;
	key.offset = (u64)-1;
	key.type = BTRFS_DEV_EXTENT_KEY;

	do {
		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto done;
		}

		ret = btrfs_previous_item(root, path, 0, key.type);
		if (ret) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			if (ret < 0)
				goto done;
			/* No previous item: nothing left on this device. */
			ret = 0;
			btrfs_release_path(path);
			break;
		}

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, path->slots[0]);

		if (key.objectid != device->devid) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			btrfs_release_path(path);
			break;
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		/* Extent ends before the new size: nothing more to move. */
		if (key.offset + length <= new_size) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			btrfs_release_path(path);
			break;
		}

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
		btrfs_release_path(path);

		/*
		 * We may be relocating the only data chunk we have,
		 * which could potentially end up with losing data's
		 * raid profile, so lets allocate an empty one in
		 * advance.
		 */
		ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto done;
		}

		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
		mutex_unlock(&fs_info->reclaim_bgs_lock);
		if (ret == -ENOSPC) {
			/* Count it and retry one more full pass below. */
			failed++;
		} else if (ret) {
			if (ret == -ETXTBSY) {
				btrfs_warn(fs_info,
		   "could not shrink block group %llu due to active swapfile",
					   chunk_offset);
			}
			goto done;
		}
	} while (key.offset-- > 0);

	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (failed && retried) {
		ret = -ENOSPC;
		goto done;
	}

	/* Shrinking succeeded, else we would be at "done". */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto done;
	}

	mutex_lock(&fs_info->chunk_mutex);
	/* Clear all state bits beyond the shrunk device size */
	clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
			  CHUNK_STATE_MASK);

	btrfs_device_set_disk_total_bytes(device, new_size);
	if (list_empty(&device->post_commit_list))
		list_add_tail(&device->post_commit_list,
			      &trans->transaction->dev_update_list);

	WARN_ON(diff > old_total);
	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total - diff, fs_info->sectorsize));
	mutex_unlock(&fs_info->chunk_mutex);

	/* Now btrfs_update_device() will change the on-disk size. */
	ret = btrfs_update_device(trans, device);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	} else {
		ret = btrfs_commit_transaction(trans);
	}
done:
	btrfs_free_path(path);
	if (ret) {
		/* Roll back the in-memory size and space accounting. */
		mutex_lock(&fs_info->chunk_mutex);
		btrfs_device_set_total_bytes(device, old_size);
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
			device->fs_devices->total_rw_bytes += diff;
		atomic64_add(diff, &fs_info->free_chunk_space);
		mutex_unlock(&fs_info->chunk_mutex);
	}
	return ret;
}

/*
 * Append a chunk item, preceded by its disk key, to the superblock's
 * sys_chunk_array.  Caller must hold fs_info->chunk_mutex (asserted).
 * Returns 0 on success or -EFBIG if the array would overflow.
 */
static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
			   struct btrfs_key *key,
			   struct btrfs_chunk *chunk, int item_size)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_disk_key disk_key;
	u32 array_size;
	u8 *ptr;

	lockdep_assert_held(&fs_info->chunk_mutex);

	array_size = btrfs_super_sys_array_size(super_copy);
	if (array_size + item_size + sizeof(disk_key)
			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
		return -EFBIG;

	ptr = super_copy->sys_chunk_array + array_size;
	btrfs_cpu_key_to_disk(&disk_key, key);
	memcpy(ptr, &disk_key, sizeof(disk_key));
	ptr += sizeof(disk_key);
	memcpy(ptr, chunk, item_size);
	item_size += sizeof(disk_key);
	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);

	return 0;
}

/*
 * sort the devices in descending order by max_avail, total_avail
 */
static int btrfs_cmp_device_info(const void *a, const void *b)
{
	const struct btrfs_device_info *di_a = a;
	const struct btrfs_device_info *di_b = b;

	if (di_a->max_avail > di_b->max_avail)
		return -1;
	if (di_a->max_avail < di_b->max_avail)
		return 1;
	if (di_a->total_avail > di_b->total_avail)
		return -1;
	if (di_a->total_avail < di_b->total_avail)
		return 1;
	return 0;
}

/* Set the RAID56 incompat bit if the chunk type uses raid5/raid6. */
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
		return;

	btrfs_set_fs_incompat(info, RAID56);
}

/* Set the RAID1C34 incompat bit if the chunk type uses raid1c3/raid1c4. */
static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
	if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
		return;

	btrfs_set_fs_incompat(info, RAID1C34);
}

/*
 * Structure used internally for btrfs_create_chunk() function.
 * Wraps needed parameters.
 */
struct alloc_chunk_ctl {
	u64 start;
	u64 type;
	/* Total number of stripes to allocate */
	int num_stripes;
	/* sub_stripes info for map */
	int sub_stripes;
	/* Stripes per device */
	int dev_stripes;
	/* Maximum number of devices to use */
	int devs_max;
	/* Minimum number of devices to use */
	int devs_min;
	/* ndevs has to be a multiple of this */
	int devs_increment;
	/* Number of copies */
	int ncopies;
	/* Number of stripes worth of bytes to store parity information */
	int nparity;
	u64 max_stripe_size;
	u64 max_chunk_size;
	/* Minimum usable hole size on a device to be considered at all */
	u64 dev_extent_min;
	u64 stripe_size;
	u64 chunk_size;
	int ndevs;
};

/*
 * Fill in the size limits for chunk allocation on regular (non-zoned)
 * devices, based on the block group type (data/metadata/system).
 */
static void init_alloc_chunk_ctl_policy_regular(
				struct btrfs_fs_devices *fs_devices,
				struct alloc_chunk_ctl *ctl)
{
	u64 type = ctl->type;

	if (type & BTRFS_BLOCK_GROUP_DATA) {
		ctl->max_stripe_size = SZ_1G;
		ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
		/* For larger filesystems, use larger metadata chunks */
		if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
			ctl->max_stripe_size = SZ_1G;
		else
			ctl->max_stripe_size = SZ_256M;
		ctl->max_chunk_size = ctl->max_stripe_size;
	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ctl->max_stripe_size = SZ_32M;
		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
		ctl->devs_max = min_t(int, ctl->devs_max,
				      BTRFS_MAX_DEVS_SYS_CHUNK);
	} else {
		/* Callers always pass exactly one of the three type bits. */
		BUG();
	}

	/* We don't want a chunk larger than 10% of writable space */
	ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
				  ctl->max_chunk_size);
	ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
}

/*
 * Fill in the size limits for chunk allocation on zoned devices; stripes
 * are fixed to the zone size, only the chunk size limit varies by type.
 */
static void init_alloc_chunk_ctl_policy_zoned(
				      struct btrfs_fs_devices *fs_devices,
				      struct alloc_chunk_ctl *ctl)
{
	u64 zone_size = fs_devices->fs_info->zone_size;
	u64 limit;
	int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
	int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
	u64 min_chunk_size = min_data_stripes * zone_size;
	u64 type = ctl->type;

	ctl->max_stripe_size = zone_size;
	if (type & BTRFS_BLOCK_GROUP_DATA) {
		ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
						 zone_size);
	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
		ctl->max_chunk_size = ctl->max_stripe_size;
	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
		ctl->devs_max = min_t(int, ctl->devs_max,
				      BTRFS_MAX_DEVS_SYS_CHUNK);
	} else {
		BUG();
	}

	/* We don't want a chunk larger than 10% of writable space */
	limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
			       zone_size),
		    min_chunk_size);
	ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
	ctl->dev_extent_min = zone_size * ctl->dev_stripes;
}

/*
 * Initialize the allocation control from the raid attribute table for the
 * requested chunk type, then apply the policy-specific size limits.
 */
static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
				 struct alloc_chunk_ctl *ctl)
{
	int index = btrfs_bg_flags_to_raid_index(ctl->type);

	ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
	ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
	ctl->devs_max = btrfs_raid_array[index].devs_max;
	/* devs_max == 0 in the raid table means "as many as possible". */
	if (!ctl->devs_max)
		ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
	ctl->devs_min = btrfs_raid_array[index].devs_min;
	ctl->devs_increment = btrfs_raid_array[index].devs_increment;
	ctl->ncopies = btrfs_raid_array[index].ncopies;
	ctl->nparity = btrfs_raid_array[index].nparity;
	ctl->ndevs = 0;

	switch (fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
		break;
	case BTRFS_CHUNK_ALLOC_ZONED:
		init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
		break;
	default:
		BUG();
	}
}

/*
 * Collect per-device free-space information for chunk allocation into
 * devices_info[], skipping devices that are read-only, missing from the
 * metadata, replace targets, or too small.  On success ctl->ndevs is set and
 * devices_info is sorted by descending hole size / available space.
 * Returns 0 or a negative errno from the free-extent search.
 */
static int gather_device_info(struct btrfs_fs_devices *fs_devices,
			      struct alloc_chunk_ctl *ctl,
			      struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = fs_devices->fs_info;
	struct btrfs_device *device;
	u64 total_avail;
	u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
	int ret;
	int ndevs = 0;
	u64 max_avail;
	u64 dev_offset;

	/*
	 * in the first pass through the devices list, we gather information
	 * about the available holes on each device.
	 */
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			WARN(1, KERN_ERR
			       "BTRFS: read-only device in alloc_list\n");
			continue;
		}

		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
					&device->dev_state) ||
		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
			continue;

		if (device->total_bytes > device->bytes_used)
			total_avail = device->total_bytes - device->bytes_used;
		else
			total_avail = 0;

		/* If there is no space on this device, skip it. */
		if (total_avail < ctl->dev_extent_min)
			continue;

		ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
					   &max_avail);
		if (ret && ret != -ENOSPC)
			return ret;

		/* ret == 0: a hole of the full wanted size was found. */
		if (ret == 0)
			max_avail = dev_extent_want;

		if (max_avail < ctl->dev_extent_min) {
			if (btrfs_test_opt(info, ENOSPC_DEBUG))
				btrfs_debug(info,
			"%s: devid %llu has no free space, have=%llu want=%llu",
					    __func__, device->devid, max_avail,
					    ctl->dev_extent_min);
			continue;
		}

		if (ndevs == fs_devices->rw_devices) {
			WARN(1, "%s: found more than %llu devices\n",
			     __func__, fs_devices->rw_devices);
			break;
		}
		devices_info[ndevs].dev_offset = dev_offset;
		devices_info[ndevs].max_avail = max_avail;
		devices_info[ndevs].total_avail = total_avail;
		devices_info[ndevs].dev = device;
		++ndevs;
	}
	ctl->ndevs = ndevs;

	/*
	 * now sort the devices by hole size / available space
	 */
	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
	     btrfs_cmp_device_info, NULL);

	return 0;
}

/*
 * Compute stripe_size, num_stripes and chunk_size for a regular (non-zoned)
 * allocation from the sorted devices_info.  Always returns 0.
 */
static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
				      struct btrfs_device_info *devices_info)
{
	/* Number of stripes that count for block group size */
	int data_stripes;

	/*
	 * The primary goal is to maximize the number of stripes, so use as
	 * many devices as possible, even if the stripes are not maximum sized.
	 *
	 * The DUP profile stores more than one stripe per device, the
	 * max_avail is the total size so we have to adjust.
	 */
	ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
				   ctl->dev_stripes);
	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;

	/* This will have to be fixed for RAID1 and RAID10 over more drives */
	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;

	/*
	 * Use the number of data stripes to figure out how big this chunk is
	 * really going to be in terms of logical address space, and compare
	 * that answer with the max chunk size. If it's higher, we try to
	 * reduce stripe_size.
	 */
	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
		/*
		 * Reduce stripe_size, round it up to a 16MB boundary again and
		 * then use it, unless it ends up being even bigger than the
		 * previous value we had already.
		 */
		ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
							data_stripes), SZ_16M),
				       ctl->stripe_size);
	}

	/* Align to BTRFS_STRIPE_LEN */
	ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
	ctl->chunk_size = ctl->stripe_size * data_stripes;

	return 0;
}

/*
 * Compute num_stripes and chunk_size for a zoned allocation; the stripe size
 * is fixed to the zone size, so the device count is reduced instead when the
 * chunk would exceed max_chunk_size.  Always returns 0.
 */
static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
				    struct btrfs_device_info *devices_info)
{
	u64 zone_size = devices_info[0].dev->zone_info->zone_size;
	/* Number of stripes that count for block group size */
	int data_stripes;

	/*
	 * It should hold because:
	 * dev_extent_min == dev_extent_want == zone_size * dev_stripes
	 */
	ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);

	ctl->stripe_size = zone_size;
	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;

	/* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */
	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
		ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
					     ctl->stripe_size) + ctl->nparity,
				     ctl->dev_stripes);
		ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
		data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
		ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
	}

	ctl->chunk_size = ctl->stripe_size * data_stripes;

	return 0;
}

static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
			      struct alloc_chunk_ctl *ctl,
			      struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = fs_devices->fs_info;

	/*
	 * Round down to number of usable stripes, devs_increment can be any
	 * number so we can't use round_down() that requires power of 2, while
	 * rounddown is safe.
52645badf512SNaohiro Aota */ 52655badf512SNaohiro Aota ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); 52665badf512SNaohiro Aota 52675badf512SNaohiro Aota if (ctl->ndevs < ctl->devs_min) { 52685badf512SNaohiro Aota if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 52695badf512SNaohiro Aota btrfs_debug(info, 52705badf512SNaohiro Aota "%s: not enough devices with free space: have=%d minimum required=%d", 52715badf512SNaohiro Aota __func__, ctl->ndevs, ctl->devs_min); 52725badf512SNaohiro Aota } 52735badf512SNaohiro Aota return -ENOSPC; 52745badf512SNaohiro Aota } 52755badf512SNaohiro Aota 52765badf512SNaohiro Aota ctl->ndevs = min(ctl->ndevs, ctl->devs_max); 52775badf512SNaohiro Aota 52785badf512SNaohiro Aota switch (fs_devices->chunk_alloc_policy) { 52795badf512SNaohiro Aota case BTRFS_CHUNK_ALLOC_REGULAR: 52805badf512SNaohiro Aota return decide_stripe_size_regular(ctl, devices_info); 52811cd6121fSNaohiro Aota case BTRFS_CHUNK_ALLOC_ZONED: 52821cd6121fSNaohiro Aota return decide_stripe_size_zoned(ctl, devices_info); 52835badf512SNaohiro Aota default: 52845badf512SNaohiro Aota BUG(); 52855badf512SNaohiro Aota } 52865badf512SNaohiro Aota } 52875badf512SNaohiro Aota 528879bd3712SFilipe Manana static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, 5289dce580caSNaohiro Aota struct alloc_chunk_ctl *ctl, 5290dce580caSNaohiro Aota struct btrfs_device_info *devices_info) 5291dce580caSNaohiro Aota { 5292dce580caSNaohiro Aota struct btrfs_fs_info *info = trans->fs_info; 5293dce580caSNaohiro Aota struct map_lookup *map = NULL; 5294dce580caSNaohiro Aota struct extent_map_tree *em_tree; 529579bd3712SFilipe Manana struct btrfs_block_group *block_group; 5296dce580caSNaohiro Aota struct extent_map *em; 5297dce580caSNaohiro Aota u64 start = ctl->start; 5298dce580caSNaohiro Aota u64 type = ctl->type; 5299dce580caSNaohiro Aota int ret; 5300dce580caSNaohiro Aota int i; 5301dce580caSNaohiro Aota int j; 5302dce580caSNaohiro Aota 5303dce580caSNaohiro Aota map = 
kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); 5304dce580caSNaohiro Aota if (!map) 530579bd3712SFilipe Manana return ERR_PTR(-ENOMEM); 5306dce580caSNaohiro Aota map->num_stripes = ctl->num_stripes; 5307dce580caSNaohiro Aota 5308dce580caSNaohiro Aota for (i = 0; i < ctl->ndevs; ++i) { 5309dce580caSNaohiro Aota for (j = 0; j < ctl->dev_stripes; ++j) { 5310dce580caSNaohiro Aota int s = i * ctl->dev_stripes + j; 5311dce580caSNaohiro Aota map->stripes[s].dev = devices_info[i].dev; 5312dce580caSNaohiro Aota map->stripes[s].physical = devices_info[i].dev_offset + 5313dce580caSNaohiro Aota j * ctl->stripe_size; 5314dce580caSNaohiro Aota } 5315dce580caSNaohiro Aota } 5316dce580caSNaohiro Aota map->stripe_len = BTRFS_STRIPE_LEN; 5317dce580caSNaohiro Aota map->io_align = BTRFS_STRIPE_LEN; 5318dce580caSNaohiro Aota map->io_width = BTRFS_STRIPE_LEN; 5319dce580caSNaohiro Aota map->type = type; 5320dce580caSNaohiro Aota map->sub_stripes = ctl->sub_stripes; 5321dce580caSNaohiro Aota 5322dce580caSNaohiro Aota trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); 5323dce580caSNaohiro Aota 5324dce580caSNaohiro Aota em = alloc_extent_map(); 5325dce580caSNaohiro Aota if (!em) { 5326dce580caSNaohiro Aota kfree(map); 532779bd3712SFilipe Manana return ERR_PTR(-ENOMEM); 5328dce580caSNaohiro Aota } 5329dce580caSNaohiro Aota set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 5330dce580caSNaohiro Aota em->map_lookup = map; 5331dce580caSNaohiro Aota em->start = start; 5332dce580caSNaohiro Aota em->len = ctl->chunk_size; 5333dce580caSNaohiro Aota em->block_start = 0; 5334dce580caSNaohiro Aota em->block_len = em->len; 5335dce580caSNaohiro Aota em->orig_block_len = ctl->stripe_size; 5336dce580caSNaohiro Aota 5337dce580caSNaohiro Aota em_tree = &info->mapping_tree; 5338dce580caSNaohiro Aota write_lock(&em_tree->lock); 5339dce580caSNaohiro Aota ret = add_extent_mapping(em_tree, em, 0); 5340dce580caSNaohiro Aota if (ret) { 5341dce580caSNaohiro Aota write_unlock(&em_tree->lock); 
5342dce580caSNaohiro Aota free_extent_map(em); 534379bd3712SFilipe Manana return ERR_PTR(ret); 5344dce580caSNaohiro Aota } 5345dce580caSNaohiro Aota write_unlock(&em_tree->lock); 5346dce580caSNaohiro Aota 534779bd3712SFilipe Manana block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); 534879bd3712SFilipe Manana if (IS_ERR(block_group)) 5349dce580caSNaohiro Aota goto error_del_extent; 5350dce580caSNaohiro Aota 5351dce580caSNaohiro Aota for (i = 0; i < map->num_stripes; i++) { 5352dce580caSNaohiro Aota struct btrfs_device *dev = map->stripes[i].dev; 5353dce580caSNaohiro Aota 5354dce580caSNaohiro Aota btrfs_device_set_bytes_used(dev, 5355dce580caSNaohiro Aota dev->bytes_used + ctl->stripe_size); 5356dce580caSNaohiro Aota if (list_empty(&dev->post_commit_list)) 5357dce580caSNaohiro Aota list_add_tail(&dev->post_commit_list, 5358dce580caSNaohiro Aota &trans->transaction->dev_update_list); 5359dce580caSNaohiro Aota } 5360dce580caSNaohiro Aota 5361dce580caSNaohiro Aota atomic64_sub(ctl->stripe_size * map->num_stripes, 5362dce580caSNaohiro Aota &info->free_chunk_space); 5363dce580caSNaohiro Aota 5364dce580caSNaohiro Aota free_extent_map(em); 5365dce580caSNaohiro Aota check_raid56_incompat_flag(info, type); 5366dce580caSNaohiro Aota check_raid1c34_incompat_flag(info, type); 5367dce580caSNaohiro Aota 536879bd3712SFilipe Manana return block_group; 5369dce580caSNaohiro Aota 5370dce580caSNaohiro Aota error_del_extent: 5371dce580caSNaohiro Aota write_lock(&em_tree->lock); 5372dce580caSNaohiro Aota remove_extent_mapping(em_tree, em); 5373dce580caSNaohiro Aota write_unlock(&em_tree->lock); 5374dce580caSNaohiro Aota 5375dce580caSNaohiro Aota /* One for our allocation */ 5376dce580caSNaohiro Aota free_extent_map(em); 5377dce580caSNaohiro Aota /* One for the tree reference */ 5378dce580caSNaohiro Aota free_extent_map(em); 5379dce580caSNaohiro Aota 538079bd3712SFilipe Manana return block_group; 5381dce580caSNaohiro Aota } 5382dce580caSNaohiro Aota 
5383f6f39f7aSNikolay Borisov struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, 538479bd3712SFilipe Manana u64 type) 5385b2117a39SMiao Xie { 53862ff7e61eSJeff Mahoney struct btrfs_fs_info *info = trans->fs_info; 5387b2117a39SMiao Xie struct btrfs_fs_devices *fs_devices = info->fs_devices; 538873c5de00SArne Jansen struct btrfs_device_info *devices_info = NULL; 53894f2bafe8SNaohiro Aota struct alloc_chunk_ctl ctl; 539079bd3712SFilipe Manana struct btrfs_block_group *block_group; 5391b2117a39SMiao Xie int ret; 5392b2117a39SMiao Xie 539311c67b1aSNikolay Borisov lockdep_assert_held(&info->chunk_mutex); 539411c67b1aSNikolay Borisov 5395b25c19f4SNaohiro Aota if (!alloc_profile_is_valid(type, 0)) { 5396b25c19f4SNaohiro Aota ASSERT(0); 539779bd3712SFilipe Manana return ERR_PTR(-EINVAL); 5398b25c19f4SNaohiro Aota } 539973c5de00SArne Jansen 54004117f207SQu Wenruo if (list_empty(&fs_devices->alloc_list)) { 54014117f207SQu Wenruo if (btrfs_test_opt(info, ENOSPC_DEBUG)) 54024117f207SQu Wenruo btrfs_debug(info, "%s: no writable device", __func__); 540379bd3712SFilipe Manana return ERR_PTR(-ENOSPC); 54044117f207SQu Wenruo } 5405b2117a39SMiao Xie 540627c314d5SNaohiro Aota if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 540727c314d5SNaohiro Aota btrfs_err(info, "invalid chunk type 0x%llx requested", type); 540827c314d5SNaohiro Aota ASSERT(0); 540979bd3712SFilipe Manana return ERR_PTR(-EINVAL); 541073c5de00SArne Jansen } 541173c5de00SArne Jansen 541211c67b1aSNikolay Borisov ctl.start = find_next_chunk(info); 541327c314d5SNaohiro Aota ctl.type = type; 541427c314d5SNaohiro Aota init_alloc_chunk_ctl(fs_devices, &ctl); 5415b2117a39SMiao Xie 541631e818feSDavid Sterba devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 5417b2117a39SMiao Xie GFP_NOFS); 5418b2117a39SMiao Xie if (!devices_info) 541979bd3712SFilipe Manana return ERR_PTR(-ENOMEM); 5420b2117a39SMiao Xie 5421560156cbSNaohiro Aota ret = gather_device_info(fs_devices, &ctl, devices_info); 
542279bd3712SFilipe Manana if (ret < 0) { 542379bd3712SFilipe Manana block_group = ERR_PTR(ret); 5424dce580caSNaohiro Aota goto out; 542579bd3712SFilipe Manana } 542673c5de00SArne Jansen 54275badf512SNaohiro Aota ret = decide_stripe_size(fs_devices, &ctl, devices_info); 542879bd3712SFilipe Manana if (ret < 0) { 542979bd3712SFilipe Manana block_group = ERR_PTR(ret); 5430dce580caSNaohiro Aota goto out; 543179bd3712SFilipe Manana } 543273c5de00SArne Jansen 543379bd3712SFilipe Manana block_group = create_chunk(trans, &ctl, devices_info); 54349b3f68b9SChris Mason 5435dce580caSNaohiro Aota out: 5436b2117a39SMiao Xie kfree(devices_info); 543779bd3712SFilipe Manana return block_group; 54382b82032cSYan Zheng } 54392b82032cSYan Zheng 544011c67b1aSNikolay Borisov /* 544179bd3712SFilipe Manana * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the 544279bd3712SFilipe Manana * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system 544379bd3712SFilipe Manana * chunks. 544479bd3712SFilipe Manana * 544579bd3712SFilipe Manana * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 544679bd3712SFilipe Manana * phases. 
544779bd3712SFilipe Manana */ 544879bd3712SFilipe Manana int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, 544979bd3712SFilipe Manana struct btrfs_block_group *bg) 545079bd3712SFilipe Manana { 545179bd3712SFilipe Manana struct btrfs_fs_info *fs_info = trans->fs_info; 545279bd3712SFilipe Manana struct btrfs_root *extent_root = fs_info->extent_root; 545379bd3712SFilipe Manana struct btrfs_root *chunk_root = fs_info->chunk_root; 545479bd3712SFilipe Manana struct btrfs_key key; 545579bd3712SFilipe Manana struct btrfs_chunk *chunk; 545679bd3712SFilipe Manana struct btrfs_stripe *stripe; 545779bd3712SFilipe Manana struct extent_map *em; 545879bd3712SFilipe Manana struct map_lookup *map; 545979bd3712SFilipe Manana size_t item_size; 546079bd3712SFilipe Manana int i; 546179bd3712SFilipe Manana int ret; 546279bd3712SFilipe Manana 546379bd3712SFilipe Manana /* 546479bd3712SFilipe Manana * We take the chunk_mutex for 2 reasons: 546579bd3712SFilipe Manana * 546679bd3712SFilipe Manana * 1) Updates and insertions in the chunk btree must be done while holding 546779bd3712SFilipe Manana * the chunk_mutex, as well as updating the system chunk array in the 546879bd3712SFilipe Manana * superblock. 
See the comment on top of btrfs_chunk_alloc() for the 546979bd3712SFilipe Manana * details; 547079bd3712SFilipe Manana * 547179bd3712SFilipe Manana * 2) To prevent races with the final phase of a device replace operation 547279bd3712SFilipe Manana * that replaces the device object associated with the map's stripes, 547379bd3712SFilipe Manana * because the device object's id can change at any time during that 547479bd3712SFilipe Manana * final phase of the device replace operation 547579bd3712SFilipe Manana * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 547679bd3712SFilipe Manana * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, 547779bd3712SFilipe Manana * which would cause a failure when updating the device item, which does 547879bd3712SFilipe Manana * not exists, or persisting a stripe of the chunk item with such ID. 547979bd3712SFilipe Manana * Here we can't use the device_list_mutex because our caller already 548079bd3712SFilipe Manana * has locked the chunk_mutex, and the final phase of device replace 548179bd3712SFilipe Manana * acquires both mutexes - first the device_list_mutex and then the 548279bd3712SFilipe Manana * chunk_mutex. Using any of those two mutexes protects us from a 548379bd3712SFilipe Manana * concurrent device replace. 
548479bd3712SFilipe Manana */ 548579bd3712SFilipe Manana lockdep_assert_held(&fs_info->chunk_mutex); 548679bd3712SFilipe Manana 548779bd3712SFilipe Manana em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); 548879bd3712SFilipe Manana if (IS_ERR(em)) { 548979bd3712SFilipe Manana ret = PTR_ERR(em); 549079bd3712SFilipe Manana btrfs_abort_transaction(trans, ret); 549179bd3712SFilipe Manana return ret; 549279bd3712SFilipe Manana } 549379bd3712SFilipe Manana 549479bd3712SFilipe Manana map = em->map_lookup; 549579bd3712SFilipe Manana item_size = btrfs_chunk_item_size(map->num_stripes); 549679bd3712SFilipe Manana 549779bd3712SFilipe Manana chunk = kzalloc(item_size, GFP_NOFS); 549879bd3712SFilipe Manana if (!chunk) { 549979bd3712SFilipe Manana ret = -ENOMEM; 550079bd3712SFilipe Manana btrfs_abort_transaction(trans, ret); 550179bd3712SFilipe Manana goto out; 550279bd3712SFilipe Manana } 550379bd3712SFilipe Manana 550479bd3712SFilipe Manana for (i = 0; i < map->num_stripes; i++) { 550579bd3712SFilipe Manana struct btrfs_device *device = map->stripes[i].dev; 550679bd3712SFilipe Manana 550779bd3712SFilipe Manana ret = btrfs_update_device(trans, device); 550879bd3712SFilipe Manana if (ret) 55096df9a95eSJosef Bacik goto out; 55102b82032cSYan Zheng } 55112b82032cSYan Zheng 55122b82032cSYan Zheng stripe = &chunk->stripe; 55136df9a95eSJosef Bacik for (i = 0; i < map->num_stripes; i++) { 551479bd3712SFilipe Manana struct btrfs_device *device = map->stripes[i].dev; 551579bd3712SFilipe Manana const u64 dev_offset = map->stripes[i].physical; 55162b82032cSYan Zheng 55172b82032cSYan Zheng btrfs_set_stack_stripe_devid(stripe, device->devid); 55182b82032cSYan Zheng btrfs_set_stack_stripe_offset(stripe, dev_offset); 55192b82032cSYan Zheng memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 55202b82032cSYan Zheng stripe++; 55212b82032cSYan Zheng } 55222b82032cSYan Zheng 552379bd3712SFilipe Manana btrfs_set_stack_chunk_length(chunk, bg->length); 55242b82032cSYan Zheng 
btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 55252b82032cSYan Zheng btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 55262b82032cSYan Zheng btrfs_set_stack_chunk_type(chunk, map->type); 55272b82032cSYan Zheng btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 55282b82032cSYan Zheng btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 55292b82032cSYan Zheng btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 55300b246afaSJeff Mahoney btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 55312b82032cSYan Zheng btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 55322b82032cSYan Zheng 55332b82032cSYan Zheng key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 55342b82032cSYan Zheng key.type = BTRFS_CHUNK_ITEM_KEY; 553579bd3712SFilipe Manana key.offset = bg->start; 55362b82032cSYan Zheng 55372b82032cSYan Zheng ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 553879bd3712SFilipe Manana if (ret) 553979bd3712SFilipe Manana goto out; 554079bd3712SFilipe Manana 554179bd3712SFilipe Manana bg->chunk_item_inserted = 1; 554279bd3712SFilipe Manana 554379bd3712SFilipe Manana if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 55442ff7e61eSJeff Mahoney ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 554579bd3712SFilipe Manana if (ret) 554679bd3712SFilipe Manana goto out; 55472b82032cSYan Zheng } 55481abe9b8aSliubo 55496df9a95eSJosef Bacik out: 55502b82032cSYan Zheng kfree(chunk); 55516df9a95eSJosef Bacik free_extent_map(em); 55524ed1d16eSMark Fasheh return ret; 55532b82032cSYan Zheng } 55542b82032cSYan Zheng 55556f8e0fc7SDavid Sterba static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) 55562b82032cSYan Zheng { 55576f8e0fc7SDavid Sterba struct btrfs_fs_info *fs_info = trans->fs_info; 55582b82032cSYan Zheng u64 alloc_profile; 555979bd3712SFilipe Manana struct btrfs_block_group *meta_bg; 556079bd3712SFilipe Manana struct btrfs_block_group *sys_bg; 556179bd3712SFilipe Manana 
556279bd3712SFilipe Manana /* 556379bd3712SFilipe Manana * When adding a new device for sprouting, the seed device is read-only 556479bd3712SFilipe Manana * so we must first allocate a metadata and a system chunk. But before 556579bd3712SFilipe Manana * adding the block group items to the extent, device and chunk btrees, 556679bd3712SFilipe Manana * we must first: 556779bd3712SFilipe Manana * 556879bd3712SFilipe Manana * 1) Create both chunks without doing any changes to the btrees, as 556979bd3712SFilipe Manana * otherwise we would get -ENOSPC since the block groups from the 557079bd3712SFilipe Manana * seed device are read-only; 557179bd3712SFilipe Manana * 557279bd3712SFilipe Manana * 2) Add the device item for the new sprout device - finishing the setup 557379bd3712SFilipe Manana * of a new block group requires updating the device item in the chunk 557479bd3712SFilipe Manana * btree, so it must exist when we attempt to do it. The previous step 557579bd3712SFilipe Manana * ensures this does not fail with -ENOSPC. 557679bd3712SFilipe Manana * 557779bd3712SFilipe Manana * After that we can add the block group items to their btrees: 557879bd3712SFilipe Manana * update existing device item in the chunk btree, add a new block group 557979bd3712SFilipe Manana * item to the extent btree, add a new chunk item to the chunk btree and 558079bd3712SFilipe Manana * finally add the new device extent items to the devices btree. 
558179bd3712SFilipe Manana */ 55822b82032cSYan Zheng 55831b86826dSJeff Mahoney alloc_profile = btrfs_metadata_alloc_profile(fs_info); 5584f6f39f7aSNikolay Borisov meta_bg = btrfs_create_chunk(trans, alloc_profile); 558579bd3712SFilipe Manana if (IS_ERR(meta_bg)) 558679bd3712SFilipe Manana return PTR_ERR(meta_bg); 55872b82032cSYan Zheng 55881b86826dSJeff Mahoney alloc_profile = btrfs_system_alloc_profile(fs_info); 5589f6f39f7aSNikolay Borisov sys_bg = btrfs_create_chunk(trans, alloc_profile); 559079bd3712SFilipe Manana if (IS_ERR(sys_bg)) 559179bd3712SFilipe Manana return PTR_ERR(sys_bg); 559279bd3712SFilipe Manana 559379bd3712SFilipe Manana return 0; 5594005d6427SDavid Sterba } 55952b82032cSYan Zheng 5596d20983b4SMiao Xie static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5597d20983b4SMiao Xie { 5598fc9a2ac7SDavid Sterba const int index = btrfs_bg_flags_to_raid_index(map->type); 5599d20983b4SMiao Xie 5600fc9a2ac7SDavid Sterba return btrfs_raid_array[index].tolerated_failures; 56012b82032cSYan Zheng } 56022b82032cSYan Zheng 5603a09f23c3SAnand Jain bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) 56042b82032cSYan Zheng { 56052b82032cSYan Zheng struct extent_map *em; 56062b82032cSYan Zheng struct map_lookup *map; 5607d20983b4SMiao Xie int miss_ndevs = 0; 56082b82032cSYan Zheng int i; 5609a09f23c3SAnand Jain bool ret = true; 56102b82032cSYan Zheng 561160ca842eSOmar Sandoval em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 5612592d92eeSLiu Bo if (IS_ERR(em)) 5613a09f23c3SAnand Jain return false; 56142b82032cSYan Zheng 561595617d69SJeff Mahoney map = em->map_lookup; 56162b82032cSYan Zheng for (i = 0; i < map->num_stripes; i++) { 5617e6e674bdSAnand Jain if (test_bit(BTRFS_DEV_STATE_MISSING, 5618e6e674bdSAnand Jain &map->stripes[i].dev->dev_state)) { 5619d20983b4SMiao Xie miss_ndevs++; 5620d20983b4SMiao Xie continue; 5621d20983b4SMiao Xie } 5622ebbede42SAnand Jain if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5623ebbede42SAnand Jain 
&map->stripes[i].dev->dev_state)) { 5624a09f23c3SAnand Jain ret = false; 5625d20983b4SMiao Xie goto end; 56262b82032cSYan Zheng } 56272b82032cSYan Zheng } 5628d20983b4SMiao Xie 5629d20983b4SMiao Xie /* 5630a09f23c3SAnand Jain * If the number of missing devices is larger than max errors, we can 5631a09f23c3SAnand Jain * not write the data into that chunk successfully. 5632d20983b4SMiao Xie */ 5633d20983b4SMiao Xie if (miss_ndevs > btrfs_chunk_max_errors(map)) 5634a09f23c3SAnand Jain ret = false; 5635d20983b4SMiao Xie end: 56362b82032cSYan Zheng free_extent_map(em); 5637a09f23c3SAnand Jain return ret; 56380b86a832SChris Mason } 56390b86a832SChris Mason 5640c8bf1b67SDavid Sterba void btrfs_mapping_tree_free(struct extent_map_tree *tree) 56410b86a832SChris Mason { 56420b86a832SChris Mason struct extent_map *em; 56430b86a832SChris Mason 56440b86a832SChris Mason while (1) { 5645c8bf1b67SDavid Sterba write_lock(&tree->lock); 5646c8bf1b67SDavid Sterba em = lookup_extent_mapping(tree, 0, (u64)-1); 56470b86a832SChris Mason if (em) 5648c8bf1b67SDavid Sterba remove_extent_mapping(tree, em); 5649c8bf1b67SDavid Sterba write_unlock(&tree->lock); 56500b86a832SChris Mason if (!em) 56510b86a832SChris Mason break; 56520b86a832SChris Mason /* once for us */ 56530b86a832SChris Mason free_extent_map(em); 56540b86a832SChris Mason /* once for the tree */ 56550b86a832SChris Mason free_extent_map(em); 56560b86a832SChris Mason } 56570b86a832SChris Mason } 56580b86a832SChris Mason 56595d964051SStefan Behrens int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5660f188591eSChris Mason { 5661f188591eSChris Mason struct extent_map *em; 5662f188591eSChris Mason struct map_lookup *map; 5663f188591eSChris Mason int ret; 5664f188591eSChris Mason 566560ca842eSOmar Sandoval em = btrfs_get_chunk_map(fs_info, logical, len); 5666592d92eeSLiu Bo if (IS_ERR(em)) 5667fb7669b5SJosef Bacik /* 5668592d92eeSLiu Bo * We could return errors for these cases, but that could get 
5669592d92eeSLiu Bo * ugly and we'd probably do the same thing which is just not do 5670592d92eeSLiu Bo * anything else and exit, so return 1 so the callers don't try 5671592d92eeSLiu Bo * to use other copies. 5672fb7669b5SJosef Bacik */ 5673fb7669b5SJosef Bacik return 1; 5674fb7669b5SJosef Bacik 567595617d69SJeff Mahoney map = em->map_lookup; 5676c7369b3fSDavid Sterba if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) 5677f188591eSChris Mason ret = map->num_stripes; 5678321aecc6SChris Mason else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5679321aecc6SChris Mason ret = map->sub_stripes; 568053b381b3SDavid Woodhouse else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 568153b381b3SDavid Woodhouse ret = 2; 568253b381b3SDavid Woodhouse else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 56838810f751SLiu Bo /* 56848810f751SLiu Bo * There could be two corrupted data stripes, we need 56858810f751SLiu Bo * to loop retry in order to rebuild the correct data. 56868810f751SLiu Bo * 56878810f751SLiu Bo * Fail a stripe at a time on every retry except the 56888810f751SLiu Bo * stripe under reconstruction. 
56898810f751SLiu Bo */ 56908810f751SLiu Bo ret = map->num_stripes; 5691f188591eSChris Mason else 5692f188591eSChris Mason ret = 1; 5693f188591eSChris Mason free_extent_map(em); 5694ad6d620eSStefan Behrens 5695cb5583ddSDavid Sterba down_read(&fs_info->dev_replace.rwsem); 56966fad823fSLiu Bo if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 56976fad823fSLiu Bo fs_info->dev_replace.tgtdev) 5698ad6d620eSStefan Behrens ret++; 5699cb5583ddSDavid Sterba up_read(&fs_info->dev_replace.rwsem); 5700ad6d620eSStefan Behrens 5701f188591eSChris Mason return ret; 5702f188591eSChris Mason } 5703f188591eSChris Mason 57042ff7e61eSJeff Mahoney unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 570553b381b3SDavid Woodhouse u64 logical) 570653b381b3SDavid Woodhouse { 570753b381b3SDavid Woodhouse struct extent_map *em; 570853b381b3SDavid Woodhouse struct map_lookup *map; 57090b246afaSJeff Mahoney unsigned long len = fs_info->sectorsize; 571053b381b3SDavid Woodhouse 571160ca842eSOmar Sandoval em = btrfs_get_chunk_map(fs_info, logical, len); 571253b381b3SDavid Woodhouse 571369f03f13SNikolay Borisov if (!WARN_ON(IS_ERR(em))) { 571495617d69SJeff Mahoney map = em->map_lookup; 5715ffe2d203SZhao Lei if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 571653b381b3SDavid Woodhouse len = map->stripe_len * nr_data_stripes(map); 571753b381b3SDavid Woodhouse free_extent_map(em); 571869f03f13SNikolay Borisov } 571953b381b3SDavid Woodhouse return len; 572053b381b3SDavid Woodhouse } 572153b381b3SDavid Woodhouse 5722e4ff5fb5SNikolay Borisov int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 572353b381b3SDavid Woodhouse { 572453b381b3SDavid Woodhouse struct extent_map *em; 572553b381b3SDavid Woodhouse struct map_lookup *map; 572653b381b3SDavid Woodhouse int ret = 0; 572753b381b3SDavid Woodhouse 572860ca842eSOmar Sandoval em = btrfs_get_chunk_map(fs_info, logical, len); 572953b381b3SDavid Woodhouse 573069f03f13SNikolay Borisov if(!WARN_ON(IS_ERR(em))) { 
573195617d69SJeff Mahoney map = em->map_lookup; 5732ffe2d203SZhao Lei if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 573353b381b3SDavid Woodhouse ret = 1; 573453b381b3SDavid Woodhouse free_extent_map(em); 573569f03f13SNikolay Borisov } 573653b381b3SDavid Woodhouse return ret; 573753b381b3SDavid Woodhouse } 573853b381b3SDavid Woodhouse 573930d9861fSStefan Behrens static int find_live_mirror(struct btrfs_fs_info *fs_info, 574099f92a7cSAnand Jain struct map_lookup *map, int first, 57418ba0ae78SAnand Jain int dev_replace_is_ongoing) 5742dfe25020SChris Mason { 5743dfe25020SChris Mason int i; 574499f92a7cSAnand Jain int num_stripes; 57458ba0ae78SAnand Jain int preferred_mirror; 574630d9861fSStefan Behrens int tolerance; 574730d9861fSStefan Behrens struct btrfs_device *srcdev; 574830d9861fSStefan Behrens 574999f92a7cSAnand Jain ASSERT((map->type & 5750c7369b3fSDavid Sterba (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); 575199f92a7cSAnand Jain 575299f92a7cSAnand Jain if (map->type & BTRFS_BLOCK_GROUP_RAID10) 575399f92a7cSAnand Jain num_stripes = map->sub_stripes; 575499f92a7cSAnand Jain else 575599f92a7cSAnand Jain num_stripes = map->num_stripes; 575699f92a7cSAnand Jain 575733fd2f71SAnand Jain switch (fs_info->fs_devices->read_policy) { 575833fd2f71SAnand Jain default: 575933fd2f71SAnand Jain /* Shouldn't happen, just warn and use pid instead of failing */ 576033fd2f71SAnand Jain btrfs_warn_rl(fs_info, 576133fd2f71SAnand Jain "unknown read_policy type %u, reset to pid", 576233fd2f71SAnand Jain fs_info->fs_devices->read_policy); 576333fd2f71SAnand Jain fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; 576433fd2f71SAnand Jain fallthrough; 576533fd2f71SAnand Jain case BTRFS_READ_POLICY_PID: 576633fd2f71SAnand Jain preferred_mirror = first + (current->pid % num_stripes); 576733fd2f71SAnand Jain break; 576833fd2f71SAnand Jain } 57698ba0ae78SAnand Jain 577030d9861fSStefan Behrens if (dev_replace_is_ongoing && 577130d9861fSStefan Behrens 
fs_info->dev_replace.cont_reading_from_srcdev_mode == 577230d9861fSStefan Behrens BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 577330d9861fSStefan Behrens srcdev = fs_info->dev_replace.srcdev; 577430d9861fSStefan Behrens else 577530d9861fSStefan Behrens srcdev = NULL; 577630d9861fSStefan Behrens 577730d9861fSStefan Behrens /* 577830d9861fSStefan Behrens * try to avoid the drive that is the source drive for a 577930d9861fSStefan Behrens * dev-replace procedure, only choose it if no other non-missing 578030d9861fSStefan Behrens * mirror is available 578130d9861fSStefan Behrens */ 578230d9861fSStefan Behrens for (tolerance = 0; tolerance < 2; tolerance++) { 57838ba0ae78SAnand Jain if (map->stripes[preferred_mirror].dev->bdev && 57848ba0ae78SAnand Jain (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 57858ba0ae78SAnand Jain return preferred_mirror; 578699f92a7cSAnand Jain for (i = first; i < first + num_stripes; i++) { 578730d9861fSStefan Behrens if (map->stripes[i].dev->bdev && 578830d9861fSStefan Behrens (tolerance || map->stripes[i].dev != srcdev)) 5789dfe25020SChris Mason return i; 5790dfe25020SChris Mason } 579130d9861fSStefan Behrens } 579230d9861fSStefan Behrens 5793dfe25020SChris Mason /* we couldn't find one that doesn't fail. 
Just return something 5794dfe25020SChris Mason * and the io error handling code will clean up eventually 5795dfe25020SChris Mason */ 57968ba0ae78SAnand Jain return preferred_mirror; 5797dfe25020SChris Mason } 5798dfe25020SChris Mason 579953b381b3SDavid Woodhouse /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 58004c664611SQu Wenruo static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes) 580153b381b3SDavid Woodhouse { 580253b381b3SDavid Woodhouse int i; 580353b381b3SDavid Woodhouse int again = 1; 580453b381b3SDavid Woodhouse 580553b381b3SDavid Woodhouse while (again) { 580653b381b3SDavid Woodhouse again = 0; 5807cc7539edSZhao Lei for (i = 0; i < num_stripes - 1; i++) { 5808eeb6f172SDavid Sterba /* Swap if parity is on a smaller index */ 58094c664611SQu Wenruo if (bioc->raid_map[i] > bioc->raid_map[i + 1]) { 58104c664611SQu Wenruo swap(bioc->stripes[i], bioc->stripes[i + 1]); 58114c664611SQu Wenruo swap(bioc->raid_map[i], bioc->raid_map[i + 1]); 581253b381b3SDavid Woodhouse again = 1; 581353b381b3SDavid Woodhouse } 581453b381b3SDavid Woodhouse } 581553b381b3SDavid Woodhouse } 581653b381b3SDavid Woodhouse } 581753b381b3SDavid Woodhouse 5818731ccf15SQu Wenruo static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, 5819731ccf15SQu Wenruo int total_stripes, 58204c664611SQu Wenruo int real_stripes) 58216e9606d2SZhao Lei { 58224c664611SQu Wenruo struct btrfs_io_context *bioc = kzalloc( 58234c664611SQu Wenruo /* The size of btrfs_io_context */ 58244c664611SQu Wenruo sizeof(struct btrfs_io_context) + 58254c664611SQu Wenruo /* Plus the variable array for the stripes */ 58264c664611SQu Wenruo sizeof(struct btrfs_io_stripe) * (total_stripes) + 58274c664611SQu Wenruo /* Plus the variable array for the tgt dev */ 58286e9606d2SZhao Lei sizeof(int) * (real_stripes) + 5829e57cf21eSChris Mason /* 58304c664611SQu Wenruo * Plus the raid_map, which includes both the tgt dev 58314c664611SQu Wenruo * and the 
stripes. 5832e57cf21eSChris Mason */ 5833e57cf21eSChris Mason sizeof(u64) * (total_stripes), 5834277fb5fcSMichal Hocko GFP_NOFS|__GFP_NOFAIL); 58356e9606d2SZhao Lei 58364c664611SQu Wenruo atomic_set(&bioc->error, 0); 58374c664611SQu Wenruo refcount_set(&bioc->refs, 1); 58386e9606d2SZhao Lei 5839731ccf15SQu Wenruo bioc->fs_info = fs_info; 58404c664611SQu Wenruo bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes); 58414c664611SQu Wenruo bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes); 5842608769a4SNikolay Borisov 58434c664611SQu Wenruo return bioc; 58446e9606d2SZhao Lei } 58456e9606d2SZhao Lei 58464c664611SQu Wenruo void btrfs_get_bioc(struct btrfs_io_context *bioc) 58476e9606d2SZhao Lei { 58484c664611SQu Wenruo WARN_ON(!refcount_read(&bioc->refs)); 58494c664611SQu Wenruo refcount_inc(&bioc->refs); 58506e9606d2SZhao Lei } 58516e9606d2SZhao Lei 58524c664611SQu Wenruo void btrfs_put_bioc(struct btrfs_io_context *bioc) 58536e9606d2SZhao Lei { 58544c664611SQu Wenruo if (!bioc) 58556e9606d2SZhao Lei return; 58564c664611SQu Wenruo if (refcount_dec_and_test(&bioc->refs)) 58574c664611SQu Wenruo kfree(bioc); 58586e9606d2SZhao Lei } 58596e9606d2SZhao Lei 58600b3d4cd3SLiu Bo /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ 58610b3d4cd3SLiu Bo /* 58620b3d4cd3SLiu Bo * Please note that, discard won't be sent to target device of device 58630b3d4cd3SLiu Bo * replace. 
58640b3d4cd3SLiu Bo */ 58650b3d4cd3SLiu Bo static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, 58666b7faaddSQu Wenruo u64 logical, u64 *length_ret, 58674c664611SQu Wenruo struct btrfs_io_context **bioc_ret) 58680b3d4cd3SLiu Bo { 58690b3d4cd3SLiu Bo struct extent_map *em; 58700b3d4cd3SLiu Bo struct map_lookup *map; 58714c664611SQu Wenruo struct btrfs_io_context *bioc; 58726b7faaddSQu Wenruo u64 length = *length_ret; 58730b3d4cd3SLiu Bo u64 offset; 58740b3d4cd3SLiu Bo u64 stripe_nr; 58750b3d4cd3SLiu Bo u64 stripe_nr_end; 58760b3d4cd3SLiu Bo u64 stripe_end_offset; 58770b3d4cd3SLiu Bo u64 stripe_cnt; 58780b3d4cd3SLiu Bo u64 stripe_len; 58790b3d4cd3SLiu Bo u64 stripe_offset; 58800b3d4cd3SLiu Bo u64 num_stripes; 58810b3d4cd3SLiu Bo u32 stripe_index; 58820b3d4cd3SLiu Bo u32 factor = 0; 58830b3d4cd3SLiu Bo u32 sub_stripes = 0; 58840b3d4cd3SLiu Bo u64 stripes_per_dev = 0; 58850b3d4cd3SLiu Bo u32 remaining_stripes = 0; 58860b3d4cd3SLiu Bo u32 last_stripe = 0; 58870b3d4cd3SLiu Bo int ret = 0; 58880b3d4cd3SLiu Bo int i; 58890b3d4cd3SLiu Bo 58904c664611SQu Wenruo /* Discard always returns a bioc. 
*/ 58914c664611SQu Wenruo ASSERT(bioc_ret); 58920b3d4cd3SLiu Bo 589360ca842eSOmar Sandoval em = btrfs_get_chunk_map(fs_info, logical, length); 58940b3d4cd3SLiu Bo if (IS_ERR(em)) 58950b3d4cd3SLiu Bo return PTR_ERR(em); 58960b3d4cd3SLiu Bo 58970b3d4cd3SLiu Bo map = em->map_lookup; 58980b3d4cd3SLiu Bo /* we don't discard raid56 yet */ 58990b3d4cd3SLiu Bo if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 59000b3d4cd3SLiu Bo ret = -EOPNOTSUPP; 59010b3d4cd3SLiu Bo goto out; 59020b3d4cd3SLiu Bo } 59030b3d4cd3SLiu Bo 59040b3d4cd3SLiu Bo offset = logical - em->start; 59052d974619SQu Wenruo length = min_t(u64, em->start + em->len - logical, length); 59066b7faaddSQu Wenruo *length_ret = length; 59070b3d4cd3SLiu Bo 59080b3d4cd3SLiu Bo stripe_len = map->stripe_len; 59090b3d4cd3SLiu Bo /* 59100b3d4cd3SLiu Bo * stripe_nr counts the total number of stripes we have to stride 59110b3d4cd3SLiu Bo * to get to this block 59120b3d4cd3SLiu Bo */ 59130b3d4cd3SLiu Bo stripe_nr = div64_u64(offset, stripe_len); 59140b3d4cd3SLiu Bo 59150b3d4cd3SLiu Bo /* stripe_offset is the offset of this block in its stripe */ 59160b3d4cd3SLiu Bo stripe_offset = offset - stripe_nr * stripe_len; 59170b3d4cd3SLiu Bo 59180b3d4cd3SLiu Bo stripe_nr_end = round_up(offset + length, map->stripe_len); 591942c61ab6SLiu Bo stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 59200b3d4cd3SLiu Bo stripe_cnt = stripe_nr_end - stripe_nr; 59210b3d4cd3SLiu Bo stripe_end_offset = stripe_nr_end * map->stripe_len - 59220b3d4cd3SLiu Bo (offset + length); 59230b3d4cd3SLiu Bo /* 59240b3d4cd3SLiu Bo * after this, stripe_nr is the number of stripes on this 59250b3d4cd3SLiu Bo * device we have to walk to find the data, and stripe_index is 59260b3d4cd3SLiu Bo * the number of our device in the stripe array 59270b3d4cd3SLiu Bo */ 59280b3d4cd3SLiu Bo num_stripes = 1; 59290b3d4cd3SLiu Bo stripe_index = 0; 59300b3d4cd3SLiu Bo if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 59310b3d4cd3SLiu Bo BTRFS_BLOCK_GROUP_RAID10)) { 59320b3d4cd3SLiu 
Bo if (map->type & BTRFS_BLOCK_GROUP_RAID0) 59330b3d4cd3SLiu Bo sub_stripes = 1; 59340b3d4cd3SLiu Bo else 59350b3d4cd3SLiu Bo sub_stripes = map->sub_stripes; 59360b3d4cd3SLiu Bo 59370b3d4cd3SLiu Bo factor = map->num_stripes / sub_stripes; 59380b3d4cd3SLiu Bo num_stripes = min_t(u64, map->num_stripes, 59390b3d4cd3SLiu Bo sub_stripes * stripe_cnt); 59400b3d4cd3SLiu Bo stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 59410b3d4cd3SLiu Bo stripe_index *= sub_stripes; 59420b3d4cd3SLiu Bo stripes_per_dev = div_u64_rem(stripe_cnt, factor, 59430b3d4cd3SLiu Bo &remaining_stripes); 59440b3d4cd3SLiu Bo div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 59450b3d4cd3SLiu Bo last_stripe *= sub_stripes; 5946c7369b3fSDavid Sterba } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | 59470b3d4cd3SLiu Bo BTRFS_BLOCK_GROUP_DUP)) { 59480b3d4cd3SLiu Bo num_stripes = map->num_stripes; 59490b3d4cd3SLiu Bo } else { 59500b3d4cd3SLiu Bo stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 59510b3d4cd3SLiu Bo &stripe_index); 59520b3d4cd3SLiu Bo } 59530b3d4cd3SLiu Bo 5954731ccf15SQu Wenruo bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0); 59554c664611SQu Wenruo if (!bioc) { 59560b3d4cd3SLiu Bo ret = -ENOMEM; 59570b3d4cd3SLiu Bo goto out; 59580b3d4cd3SLiu Bo } 59590b3d4cd3SLiu Bo 59600b3d4cd3SLiu Bo for (i = 0; i < num_stripes; i++) { 59614c664611SQu Wenruo bioc->stripes[i].physical = 59620b3d4cd3SLiu Bo map->stripes[stripe_index].physical + 59630b3d4cd3SLiu Bo stripe_offset + stripe_nr * map->stripe_len; 59644c664611SQu Wenruo bioc->stripes[i].dev = map->stripes[stripe_index].dev; 59650b3d4cd3SLiu Bo 59660b3d4cd3SLiu Bo if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 59670b3d4cd3SLiu Bo BTRFS_BLOCK_GROUP_RAID10)) { 59684c664611SQu Wenruo bioc->stripes[i].length = stripes_per_dev * 59690b3d4cd3SLiu Bo map->stripe_len; 59700b3d4cd3SLiu Bo 59710b3d4cd3SLiu Bo if (i / sub_stripes < remaining_stripes) 59724c664611SQu Wenruo bioc->stripes[i].length += map->stripe_len; 59730b3d4cd3SLiu 
Bo 59740b3d4cd3SLiu Bo /* 59750b3d4cd3SLiu Bo * Special for the first stripe and 59760b3d4cd3SLiu Bo * the last stripe: 59770b3d4cd3SLiu Bo * 59780b3d4cd3SLiu Bo * |-------|...|-------| 59790b3d4cd3SLiu Bo * |----------| 59800b3d4cd3SLiu Bo * off end_off 59810b3d4cd3SLiu Bo */ 59820b3d4cd3SLiu Bo if (i < sub_stripes) 59834c664611SQu Wenruo bioc->stripes[i].length -= stripe_offset; 59840b3d4cd3SLiu Bo 59850b3d4cd3SLiu Bo if (stripe_index >= last_stripe && 59860b3d4cd3SLiu Bo stripe_index <= (last_stripe + 59870b3d4cd3SLiu Bo sub_stripes - 1)) 59884c664611SQu Wenruo bioc->stripes[i].length -= stripe_end_offset; 59890b3d4cd3SLiu Bo 59900b3d4cd3SLiu Bo if (i == sub_stripes - 1) 59910b3d4cd3SLiu Bo stripe_offset = 0; 59920b3d4cd3SLiu Bo } else { 59934c664611SQu Wenruo bioc->stripes[i].length = length; 59940b3d4cd3SLiu Bo } 59950b3d4cd3SLiu Bo 59960b3d4cd3SLiu Bo stripe_index++; 59970b3d4cd3SLiu Bo if (stripe_index == map->num_stripes) { 59980b3d4cd3SLiu Bo stripe_index = 0; 59990b3d4cd3SLiu Bo stripe_nr++; 60000b3d4cd3SLiu Bo } 60010b3d4cd3SLiu Bo } 60020b3d4cd3SLiu Bo 60034c664611SQu Wenruo *bioc_ret = bioc; 60044c664611SQu Wenruo bioc->map_type = map->type; 60054c664611SQu Wenruo bioc->num_stripes = num_stripes; 60060b3d4cd3SLiu Bo out: 60070b3d4cd3SLiu Bo free_extent_map(em); 60080b3d4cd3SLiu Bo return ret; 60090b3d4cd3SLiu Bo } 60100b3d4cd3SLiu Bo 60115ab56090SLiu Bo /* 60125ab56090SLiu Bo * In dev-replace case, for repair case (that's the only case where the mirror 60135ab56090SLiu Bo * is selected explicitly when calling btrfs_map_block), blocks left of the 60145ab56090SLiu Bo * left cursor can also be read from the target drive. 60155ab56090SLiu Bo * 60165ab56090SLiu Bo * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the 60175ab56090SLiu Bo * array of stripes. 60185ab56090SLiu Bo * For READ, it also needs to be supported using the same mirror number. 
60195ab56090SLiu Bo * 60205ab56090SLiu Bo * If the requested block is not left of the left cursor, EIO is returned. This 60215ab56090SLiu Bo * can happen because btrfs_num_copies() returns one more in the dev-replace 60225ab56090SLiu Bo * case. 60235ab56090SLiu Bo */ 60245ab56090SLiu Bo static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, 60255ab56090SLiu Bo u64 logical, u64 length, 60265ab56090SLiu Bo u64 srcdev_devid, int *mirror_num, 60275ab56090SLiu Bo u64 *physical) 60285ab56090SLiu Bo { 60294c664611SQu Wenruo struct btrfs_io_context *bioc = NULL; 60305ab56090SLiu Bo int num_stripes; 60315ab56090SLiu Bo int index_srcdev = 0; 60325ab56090SLiu Bo int found = 0; 60335ab56090SLiu Bo u64 physical_of_found = 0; 60345ab56090SLiu Bo int i; 60355ab56090SLiu Bo int ret = 0; 60365ab56090SLiu Bo 60375ab56090SLiu Bo ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 60384c664611SQu Wenruo logical, &length, &bioc, 0, 0); 60395ab56090SLiu Bo if (ret) { 60404c664611SQu Wenruo ASSERT(bioc == NULL); 60415ab56090SLiu Bo return ret; 60425ab56090SLiu Bo } 60435ab56090SLiu Bo 60444c664611SQu Wenruo num_stripes = bioc->num_stripes; 60455ab56090SLiu Bo if (*mirror_num > num_stripes) { 60465ab56090SLiu Bo /* 60475ab56090SLiu Bo * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, 60485ab56090SLiu Bo * that means that the requested area is not left of the left 60495ab56090SLiu Bo * cursor 60505ab56090SLiu Bo */ 60514c664611SQu Wenruo btrfs_put_bioc(bioc); 60525ab56090SLiu Bo return -EIO; 60535ab56090SLiu Bo } 60545ab56090SLiu Bo 60555ab56090SLiu Bo /* 60565ab56090SLiu Bo * process the rest of the function using the mirror_num of the source 60575ab56090SLiu Bo * drive. Therefore look it up first. At the end, patch the device 60585ab56090SLiu Bo * pointer to the one of the target drive. 
60595ab56090SLiu Bo */ 60605ab56090SLiu Bo for (i = 0; i < num_stripes; i++) { 60614c664611SQu Wenruo if (bioc->stripes[i].dev->devid != srcdev_devid) 60625ab56090SLiu Bo continue; 60635ab56090SLiu Bo 60645ab56090SLiu Bo /* 60655ab56090SLiu Bo * In case of DUP, in order to keep it simple, only add the 60665ab56090SLiu Bo * mirror with the lowest physical address 60675ab56090SLiu Bo */ 60685ab56090SLiu Bo if (found && 60694c664611SQu Wenruo physical_of_found <= bioc->stripes[i].physical) 60705ab56090SLiu Bo continue; 60715ab56090SLiu Bo 60725ab56090SLiu Bo index_srcdev = i; 60735ab56090SLiu Bo found = 1; 60744c664611SQu Wenruo physical_of_found = bioc->stripes[i].physical; 60755ab56090SLiu Bo } 60765ab56090SLiu Bo 60774c664611SQu Wenruo btrfs_put_bioc(bioc); 60785ab56090SLiu Bo 60795ab56090SLiu Bo ASSERT(found); 60805ab56090SLiu Bo if (!found) 60815ab56090SLiu Bo return -EIO; 60825ab56090SLiu Bo 60835ab56090SLiu Bo *mirror_num = index_srcdev + 1; 60845ab56090SLiu Bo *physical = physical_of_found; 60855ab56090SLiu Bo return ret; 60865ab56090SLiu Bo } 60875ab56090SLiu Bo 60886143c23cSNaohiro Aota static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) 60896143c23cSNaohiro Aota { 60906143c23cSNaohiro Aota struct btrfs_block_group *cache; 60916143c23cSNaohiro Aota bool ret; 60926143c23cSNaohiro Aota 6093de17addcSNaohiro Aota /* Non zoned filesystem does not use "to_copy" flag */ 60946143c23cSNaohiro Aota if (!btrfs_is_zoned(fs_info)) 60956143c23cSNaohiro Aota return false; 60966143c23cSNaohiro Aota 60976143c23cSNaohiro Aota cache = btrfs_lookup_block_group(fs_info, logical); 60986143c23cSNaohiro Aota 60996143c23cSNaohiro Aota spin_lock(&cache->lock); 61006143c23cSNaohiro Aota ret = cache->to_copy; 61016143c23cSNaohiro Aota spin_unlock(&cache->lock); 61026143c23cSNaohiro Aota 61036143c23cSNaohiro Aota btrfs_put_block_group(cache); 61046143c23cSNaohiro Aota return ret; 61056143c23cSNaohiro Aota } 61066143c23cSNaohiro Aota 610773c0f228SLiu Bo static 
void handle_ops_on_dev_replace(enum btrfs_map_op op, 61084c664611SQu Wenruo struct btrfs_io_context **bioc_ret, 610973c0f228SLiu Bo struct btrfs_dev_replace *dev_replace, 61106143c23cSNaohiro Aota u64 logical, 611173c0f228SLiu Bo int *num_stripes_ret, int *max_errors_ret) 611273c0f228SLiu Bo { 61134c664611SQu Wenruo struct btrfs_io_context *bioc = *bioc_ret; 611473c0f228SLiu Bo u64 srcdev_devid = dev_replace->srcdev->devid; 611573c0f228SLiu Bo int tgtdev_indexes = 0; 611673c0f228SLiu Bo int num_stripes = *num_stripes_ret; 611773c0f228SLiu Bo int max_errors = *max_errors_ret; 611873c0f228SLiu Bo int i; 611973c0f228SLiu Bo 612073c0f228SLiu Bo if (op == BTRFS_MAP_WRITE) { 612173c0f228SLiu Bo int index_where_to_add; 612273c0f228SLiu Bo 612373c0f228SLiu Bo /* 61246143c23cSNaohiro Aota * A block group which have "to_copy" set will eventually 61256143c23cSNaohiro Aota * copied by dev-replace process. We can avoid cloning IO here. 61266143c23cSNaohiro Aota */ 61276143c23cSNaohiro Aota if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) 61286143c23cSNaohiro Aota return; 61296143c23cSNaohiro Aota 61306143c23cSNaohiro Aota /* 613173c0f228SLiu Bo * duplicate the write operations while the dev replace 613273c0f228SLiu Bo * procedure is running. Since the copying of the old disk to 613373c0f228SLiu Bo * the new disk takes place at run time while the filesystem is 613473c0f228SLiu Bo * mounted writable, the regular write operations to the old 613573c0f228SLiu Bo * disk have to be duplicated to go to the new disk as well. 613673c0f228SLiu Bo * 613773c0f228SLiu Bo * Note that device->missing is handled by the caller, and that 613873c0f228SLiu Bo * the write to the old disk is already set up in the stripes 613973c0f228SLiu Bo * array. 
614073c0f228SLiu Bo */ 614173c0f228SLiu Bo index_where_to_add = num_stripes; 614273c0f228SLiu Bo for (i = 0; i < num_stripes; i++) { 61434c664611SQu Wenruo if (bioc->stripes[i].dev->devid == srcdev_devid) { 614473c0f228SLiu Bo /* write to new disk, too */ 61454c664611SQu Wenruo struct btrfs_io_stripe *new = 61464c664611SQu Wenruo bioc->stripes + index_where_to_add; 61474c664611SQu Wenruo struct btrfs_io_stripe *old = 61484c664611SQu Wenruo bioc->stripes + i; 614973c0f228SLiu Bo 615073c0f228SLiu Bo new->physical = old->physical; 615173c0f228SLiu Bo new->length = old->length; 615273c0f228SLiu Bo new->dev = dev_replace->tgtdev; 61534c664611SQu Wenruo bioc->tgtdev_map[i] = index_where_to_add; 615473c0f228SLiu Bo index_where_to_add++; 615573c0f228SLiu Bo max_errors++; 615673c0f228SLiu Bo tgtdev_indexes++; 615773c0f228SLiu Bo } 615873c0f228SLiu Bo } 615973c0f228SLiu Bo num_stripes = index_where_to_add; 616073c0f228SLiu Bo } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 616173c0f228SLiu Bo int index_srcdev = 0; 616273c0f228SLiu Bo int found = 0; 616373c0f228SLiu Bo u64 physical_of_found = 0; 616473c0f228SLiu Bo 616573c0f228SLiu Bo /* 616673c0f228SLiu Bo * During the dev-replace procedure, the target drive can also 616773c0f228SLiu Bo * be used to read data in case it is needed to repair a corrupt 616873c0f228SLiu Bo * block elsewhere. This is possible if the requested area is 616973c0f228SLiu Bo * left of the left cursor. In this area, the target drive is a 617073c0f228SLiu Bo * full copy of the source drive. 
617173c0f228SLiu Bo */ 617273c0f228SLiu Bo for (i = 0; i < num_stripes; i++) { 61734c664611SQu Wenruo if (bioc->stripes[i].dev->devid == srcdev_devid) { 617473c0f228SLiu Bo /* 617573c0f228SLiu Bo * In case of DUP, in order to keep it simple, 617673c0f228SLiu Bo * only add the mirror with the lowest physical 617773c0f228SLiu Bo * address 617873c0f228SLiu Bo */ 617973c0f228SLiu Bo if (found && 61804c664611SQu Wenruo physical_of_found <= bioc->stripes[i].physical) 618173c0f228SLiu Bo continue; 618273c0f228SLiu Bo index_srcdev = i; 618373c0f228SLiu Bo found = 1; 61844c664611SQu Wenruo physical_of_found = bioc->stripes[i].physical; 618573c0f228SLiu Bo } 618673c0f228SLiu Bo } 618773c0f228SLiu Bo if (found) { 61884c664611SQu Wenruo struct btrfs_io_stripe *tgtdev_stripe = 61894c664611SQu Wenruo bioc->stripes + num_stripes; 619073c0f228SLiu Bo 619173c0f228SLiu Bo tgtdev_stripe->physical = physical_of_found; 619273c0f228SLiu Bo tgtdev_stripe->length = 61934c664611SQu Wenruo bioc->stripes[index_srcdev].length; 619473c0f228SLiu Bo tgtdev_stripe->dev = dev_replace->tgtdev; 61954c664611SQu Wenruo bioc->tgtdev_map[index_srcdev] = num_stripes; 619673c0f228SLiu Bo 619773c0f228SLiu Bo tgtdev_indexes++; 619873c0f228SLiu Bo num_stripes++; 619973c0f228SLiu Bo } 620073c0f228SLiu Bo } 620173c0f228SLiu Bo 620273c0f228SLiu Bo *num_stripes_ret = num_stripes; 620373c0f228SLiu Bo *max_errors_ret = max_errors; 62044c664611SQu Wenruo bioc->num_tgtdevs = tgtdev_indexes; 62054c664611SQu Wenruo *bioc_ret = bioc; 620673c0f228SLiu Bo } 620773c0f228SLiu Bo 62082b19a1feSLiu Bo static bool need_full_stripe(enum btrfs_map_op op) 62092b19a1feSLiu Bo { 62102b19a1feSLiu Bo return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 62112b19a1feSLiu Bo } 62122b19a1feSLiu Bo 62135f141126SNikolay Borisov /* 621442034313SMichal Rostecki * Calculate the geometry of a particular (address, len) tuple. 
This 621542034313SMichal Rostecki * information is used to calculate how big a particular bio can get before it 621642034313SMichal Rostecki * straddles a stripe. 62175f141126SNikolay Borisov * 621842034313SMichal Rostecki * @fs_info: the filesystem 621942034313SMichal Rostecki * @em: mapping containing the logical extent 622042034313SMichal Rostecki * @op: type of operation - write or read 622142034313SMichal Rostecki * @logical: address that we want to figure out the geometry of 622242034313SMichal Rostecki * @io_geom: pointer used to return values 62235f141126SNikolay Borisov * 62245f141126SNikolay Borisov * Returns < 0 in case a chunk for the given logical address cannot be found, 62255f141126SNikolay Borisov * usually shouldn't happen unless @logical is corrupted, 0 otherwise. 62265f141126SNikolay Borisov */ 622742034313SMichal Rostecki int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, 622843c0d1a5SQu Wenruo enum btrfs_map_op op, u64 logical, 622942034313SMichal Rostecki struct btrfs_io_geometry *io_geom) 62305f141126SNikolay Borisov { 62315f141126SNikolay Borisov struct map_lookup *map; 623243c0d1a5SQu Wenruo u64 len; 62335f141126SNikolay Borisov u64 offset; 62345f141126SNikolay Borisov u64 stripe_offset; 62355f141126SNikolay Borisov u64 stripe_nr; 62365f141126SNikolay Borisov u64 stripe_len; 62375f141126SNikolay Borisov u64 raid56_full_stripe_start = (u64)-1; 62385f141126SNikolay Borisov int data_stripes; 62395f141126SNikolay Borisov 62405f141126SNikolay Borisov ASSERT(op != BTRFS_MAP_DISCARD); 62415f141126SNikolay Borisov 62425f141126SNikolay Borisov map = em->map_lookup; 62435f141126SNikolay Borisov /* Offset of this logical address in the chunk */ 62445f141126SNikolay Borisov offset = logical - em->start; 62455f141126SNikolay Borisov /* Len of a stripe in a chunk */ 62465f141126SNikolay Borisov stripe_len = map->stripe_len; 62471a9fd417SDavid Sterba /* Stripe where this block falls in */ 62485f141126SNikolay Borisov stripe_nr 
= div64_u64(offset, stripe_len); 62495f141126SNikolay Borisov /* Offset of stripe in the chunk */ 62505f141126SNikolay Borisov stripe_offset = stripe_nr * stripe_len; 62515f141126SNikolay Borisov if (offset < stripe_offset) { 62525f141126SNikolay Borisov btrfs_crit(fs_info, 62535f141126SNikolay Borisov "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", 62545f141126SNikolay Borisov stripe_offset, offset, em->start, logical, stripe_len); 625542034313SMichal Rostecki return -EINVAL; 62565f141126SNikolay Borisov } 62575f141126SNikolay Borisov 62585f141126SNikolay Borisov /* stripe_offset is the offset of this block in its stripe */ 62595f141126SNikolay Borisov stripe_offset = offset - stripe_offset; 62605f141126SNikolay Borisov data_stripes = nr_data_stripes(map); 62615f141126SNikolay Borisov 62625f141126SNikolay Borisov if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 62635f141126SNikolay Borisov u64 max_len = stripe_len - stripe_offset; 62645f141126SNikolay Borisov 62655f141126SNikolay Borisov /* 62665f141126SNikolay Borisov * In case of raid56, we need to know the stripe aligned start 62675f141126SNikolay Borisov */ 62685f141126SNikolay Borisov if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 62695f141126SNikolay Borisov unsigned long full_stripe_len = stripe_len * data_stripes; 62705f141126SNikolay Borisov raid56_full_stripe_start = offset; 62715f141126SNikolay Borisov 62725f141126SNikolay Borisov /* 62735f141126SNikolay Borisov * Allow a write of a full stripe, but make sure we 62745f141126SNikolay Borisov * don't allow straddling of stripes 62755f141126SNikolay Borisov */ 62765f141126SNikolay Borisov raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 62775f141126SNikolay Borisov full_stripe_len); 62785f141126SNikolay Borisov raid56_full_stripe_start *= full_stripe_len; 62795f141126SNikolay Borisov 62805f141126SNikolay Borisov /* 62815f141126SNikolay Borisov * For writes to RAID[56], allow a full 
stripeset across 62825f141126SNikolay Borisov * all disks. For other RAID types and for RAID[56] 62835f141126SNikolay Borisov * reads, just allow a single stripe (on a single disk). 62845f141126SNikolay Borisov */ 62855f141126SNikolay Borisov if (op == BTRFS_MAP_WRITE) { 62865f141126SNikolay Borisov max_len = stripe_len * data_stripes - 62875f141126SNikolay Borisov (offset - raid56_full_stripe_start); 62885f141126SNikolay Borisov } 62895f141126SNikolay Borisov } 62905f141126SNikolay Borisov len = min_t(u64, em->len - offset, max_len); 62915f141126SNikolay Borisov } else { 62925f141126SNikolay Borisov len = em->len - offset; 62935f141126SNikolay Borisov } 62945f141126SNikolay Borisov 62955f141126SNikolay Borisov io_geom->len = len; 62965f141126SNikolay Borisov io_geom->offset = offset; 62975f141126SNikolay Borisov io_geom->stripe_len = stripe_len; 62985f141126SNikolay Borisov io_geom->stripe_nr = stripe_nr; 62995f141126SNikolay Borisov io_geom->stripe_offset = stripe_offset; 63005f141126SNikolay Borisov io_geom->raid56_stripe_offset = raid56_full_stripe_start; 63015f141126SNikolay Borisov 630242034313SMichal Rostecki return 0; 63035f141126SNikolay Borisov } 63045f141126SNikolay Borisov 6305cf8cddd3SChristoph Hellwig static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 6306cf8cddd3SChristoph Hellwig enum btrfs_map_op op, 6307cea9e445SChris Mason u64 logical, u64 *length, 63084c664611SQu Wenruo struct btrfs_io_context **bioc_ret, 63098e5cfb55SZhao Lei int mirror_num, int need_raid_map) 63100b86a832SChris Mason { 63110b86a832SChris Mason struct extent_map *em; 63120b86a832SChris Mason struct map_lookup *map; 6313593060d7SChris Mason u64 stripe_offset; 6314593060d7SChris Mason u64 stripe_nr; 631553b381b3SDavid Woodhouse u64 stripe_len; 63169d644a62SDavid Sterba u32 stripe_index; 6317cff82672SDavid Sterba int data_stripes; 6318cea9e445SChris Mason int i; 6319de11cc12SLi Zefan int ret = 0; 6320f2d8d74dSChris Mason int num_stripes; 6321a236aed1SChris Mason int 
max_errors = 0; 63222c8cdd6eSMiao Xie int tgtdev_indexes = 0; 63234c664611SQu Wenruo struct btrfs_io_context *bioc = NULL; 6324472262f3SStefan Behrens struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 6325472262f3SStefan Behrens int dev_replace_is_ongoing = 0; 6326472262f3SStefan Behrens int num_alloc_stripes; 6327ad6d620eSStefan Behrens int patch_the_first_stripe_for_dev_replace = 0; 6328ad6d620eSStefan Behrens u64 physical_to_patch_in_first_stripe = 0; 632953b381b3SDavid Woodhouse u64 raid56_full_stripe_start = (u64)-1; 633089b798adSNikolay Borisov struct btrfs_io_geometry geom; 633189b798adSNikolay Borisov 63324c664611SQu Wenruo ASSERT(bioc_ret); 633375fb2e9eSDavid Sterba ASSERT(op != BTRFS_MAP_DISCARD); 63340b3d4cd3SLiu Bo 633542034313SMichal Rostecki em = btrfs_get_chunk_map(fs_info, logical, *length); 633642034313SMichal Rostecki ASSERT(!IS_ERR(em)); 633742034313SMichal Rostecki 633843c0d1a5SQu Wenruo ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); 633989b798adSNikolay Borisov if (ret < 0) 634089b798adSNikolay Borisov return ret; 634189b798adSNikolay Borisov 634295617d69SJeff Mahoney map = em->map_lookup; 6343593060d7SChris Mason 634489b798adSNikolay Borisov *length = geom.len; 634589b798adSNikolay Borisov stripe_len = geom.stripe_len; 634689b798adSNikolay Borisov stripe_nr = geom.stripe_nr; 634789b798adSNikolay Borisov stripe_offset = geom.stripe_offset; 634889b798adSNikolay Borisov raid56_full_stripe_start = geom.raid56_stripe_offset; 6349cff82672SDavid Sterba data_stripes = nr_data_stripes(map); 6350593060d7SChris Mason 6351cb5583ddSDavid Sterba down_read(&dev_replace->rwsem); 6352472262f3SStefan Behrens dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 635353176ddeSDavid Sterba /* 635453176ddeSDavid Sterba * Hold the semaphore for read during the whole operation, write is 635553176ddeSDavid Sterba * requested at commit time but must wait. 
635653176ddeSDavid Sterba */ 6357472262f3SStefan Behrens if (!dev_replace_is_ongoing) 6358cb5583ddSDavid Sterba up_read(&dev_replace->rwsem); 6359472262f3SStefan Behrens 6360ad6d620eSStefan Behrens if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 63612b19a1feSLiu Bo !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 63625ab56090SLiu Bo ret = get_extra_mirror_from_replace(fs_info, logical, *length, 63635ab56090SLiu Bo dev_replace->srcdev->devid, 63645ab56090SLiu Bo &mirror_num, 63655ab56090SLiu Bo &physical_to_patch_in_first_stripe); 63665ab56090SLiu Bo if (ret) 6367ad6d620eSStefan Behrens goto out; 63685ab56090SLiu Bo else 636994a97dfeSZhao Lei patch_the_first_stripe_for_dev_replace = 1; 6370ad6d620eSStefan Behrens } else if (mirror_num > map->num_stripes) { 6371ad6d620eSStefan Behrens mirror_num = 0; 6372ad6d620eSStefan Behrens } 6373ad6d620eSStefan Behrens 6374f2d8d74dSChris Mason num_stripes = 1; 6375cea9e445SChris Mason stripe_index = 0; 6376fce3bb9aSLi Dongyang if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 637747c5713fSDavid Sterba stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 637847c5713fSDavid Sterba &stripe_index); 6379de483734SAnand Jain if (!need_full_stripe(op)) 638028e1cc7dSMiao Xie mirror_num = 1; 6381c7369b3fSDavid Sterba } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { 6382de483734SAnand Jain if (need_full_stripe(op)) 6383f2d8d74dSChris Mason num_stripes = map->num_stripes; 63842fff734fSChris Mason else if (mirror_num) 6385f188591eSChris Mason stripe_index = mirror_num - 1; 6386dfe25020SChris Mason else { 638730d9861fSStefan Behrens stripe_index = find_live_mirror(fs_info, map, 0, 638830d9861fSStefan Behrens dev_replace_is_ongoing); 6389a1d3c478SJan Schmidt mirror_num = stripe_index + 1; 6390dfe25020SChris Mason } 63912fff734fSChris Mason 6392611f0e00SChris Mason } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 6393de483734SAnand Jain if (need_full_stripe(op)) { 6394f2d8d74dSChris Mason num_stripes = map->num_stripes; 
6395a1d3c478SJan Schmidt } else if (mirror_num) { 6396f188591eSChris Mason stripe_index = mirror_num - 1; 6397a1d3c478SJan Schmidt } else { 6398a1d3c478SJan Schmidt mirror_num = 1; 6399a1d3c478SJan Schmidt } 64002fff734fSChris Mason 6401321aecc6SChris Mason } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 64029d644a62SDavid Sterba u32 factor = map->num_stripes / map->sub_stripes; 6403321aecc6SChris Mason 640447c5713fSDavid Sterba stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6405321aecc6SChris Mason stripe_index *= map->sub_stripes; 6406321aecc6SChris Mason 6407de483734SAnand Jain if (need_full_stripe(op)) 6408f2d8d74dSChris Mason num_stripes = map->sub_stripes; 6409321aecc6SChris Mason else if (mirror_num) 6410321aecc6SChris Mason stripe_index += mirror_num - 1; 6411dfe25020SChris Mason else { 64123e74317aSJan Schmidt int old_stripe_index = stripe_index; 641330d9861fSStefan Behrens stripe_index = find_live_mirror(fs_info, map, 641430d9861fSStefan Behrens stripe_index, 641530d9861fSStefan Behrens dev_replace_is_ongoing); 64163e74317aSJan Schmidt mirror_num = stripe_index - old_stripe_index + 1; 6417dfe25020SChris Mason } 641853b381b3SDavid Woodhouse 6419ffe2d203SZhao Lei } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6420de483734SAnand Jain if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 642153b381b3SDavid Woodhouse /* push stripe_nr back to the start of the full stripe */ 642242c61ab6SLiu Bo stripe_nr = div64_u64(raid56_full_stripe_start, 6423cff82672SDavid Sterba stripe_len * data_stripes); 642453b381b3SDavid Woodhouse 642553b381b3SDavid Woodhouse /* RAID[56] write or recovery. 
Return all stripes */ 642653b381b3SDavid Woodhouse num_stripes = map->num_stripes; 642753b381b3SDavid Woodhouse max_errors = nr_parity_stripes(map); 642853b381b3SDavid Woodhouse 642953b381b3SDavid Woodhouse *length = map->stripe_len; 643053b381b3SDavid Woodhouse stripe_index = 0; 643153b381b3SDavid Woodhouse stripe_offset = 0; 643253b381b3SDavid Woodhouse } else { 643353b381b3SDavid Woodhouse /* 643453b381b3SDavid Woodhouse * Mirror #0 or #1 means the original data block. 643553b381b3SDavid Woodhouse * Mirror #2 is RAID5 parity block. 643653b381b3SDavid Woodhouse * Mirror #3 is RAID6 Q block. 643753b381b3SDavid Woodhouse */ 643847c5713fSDavid Sterba stripe_nr = div_u64_rem(stripe_nr, 6439cff82672SDavid Sterba data_stripes, &stripe_index); 644053b381b3SDavid Woodhouse if (mirror_num > 1) 6441cff82672SDavid Sterba stripe_index = data_stripes + mirror_num - 2; 644253b381b3SDavid Woodhouse 644353b381b3SDavid Woodhouse /* We distribute the parity blocks across stripes */ 644447c5713fSDavid Sterba div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 644547c5713fSDavid Sterba &stripe_index); 6446de483734SAnand Jain if (!need_full_stripe(op) && mirror_num <= 1) 644728e1cc7dSMiao Xie mirror_num = 1; 644853b381b3SDavid Woodhouse } 64498790d502SChris Mason } else { 6450593060d7SChris Mason /* 645147c5713fSDavid Sterba * after this, stripe_nr is the number of stripes on this 645247c5713fSDavid Sterba * device we have to walk to find the data, and stripe_index is 645347c5713fSDavid Sterba * the number of our device in the stripe array 6454593060d7SChris Mason */ 645547c5713fSDavid Sterba stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 645647c5713fSDavid Sterba &stripe_index); 6457a1d3c478SJan Schmidt mirror_num = stripe_index + 1; 64588790d502SChris Mason } 6459e042d1ecSJosef Bacik if (stripe_index >= map->num_stripes) { 64605d163e0eSJeff Mahoney btrfs_crit(fs_info, 64615d163e0eSJeff Mahoney "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 
6462e042d1ecSJosef Bacik stripe_index, map->num_stripes); 6463e042d1ecSJosef Bacik ret = -EINVAL; 6464e042d1ecSJosef Bacik goto out; 6465e042d1ecSJosef Bacik } 6466593060d7SChris Mason 6467472262f3SStefan Behrens num_alloc_stripes = num_stripes; 64686fad823fSLiu Bo if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 64690b3d4cd3SLiu Bo if (op == BTRFS_MAP_WRITE) 6470472262f3SStefan Behrens num_alloc_stripes <<= 1; 6471cf8cddd3SChristoph Hellwig if (op == BTRFS_MAP_GET_READ_MIRRORS) 6472ad6d620eSStefan Behrens num_alloc_stripes++; 64732c8cdd6eSMiao Xie tgtdev_indexes = num_stripes; 6474ad6d620eSStefan Behrens } 64752c8cdd6eSMiao Xie 6476731ccf15SQu Wenruo bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); 64774c664611SQu Wenruo if (!bioc) { 6478de11cc12SLi Zefan ret = -ENOMEM; 6479de11cc12SLi Zefan goto out; 6480de11cc12SLi Zefan } 6481608769a4SNikolay Borisov 6482608769a4SNikolay Borisov for (i = 0; i < num_stripes; i++) { 64834c664611SQu Wenruo bioc->stripes[i].physical = map->stripes[stripe_index].physical + 6484608769a4SNikolay Borisov stripe_offset + stripe_nr * map->stripe_len; 64854c664611SQu Wenruo bioc->stripes[i].dev = map->stripes[stripe_index].dev; 6486608769a4SNikolay Borisov stripe_index++; 6487608769a4SNikolay Borisov } 6488de11cc12SLi Zefan 64894c664611SQu Wenruo /* Build raid_map */ 64902b19a1feSLiu Bo if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 64912b19a1feSLiu Bo (need_full_stripe(op) || mirror_num > 1)) { 64928e5cfb55SZhao Lei u64 tmp; 64939d644a62SDavid Sterba unsigned rot; 64948e5cfb55SZhao Lei 64958e5cfb55SZhao Lei /* Work out the disk rotation on this stripe-set */ 649647c5713fSDavid Sterba div_u64_rem(stripe_nr, num_stripes, &rot); 64978e5cfb55SZhao Lei 64988e5cfb55SZhao Lei /* Fill in the logical address of each stripe */ 6499cff82672SDavid Sterba tmp = stripe_nr * data_stripes; 6500cff82672SDavid Sterba for (i = 0; i < data_stripes; i++) 65014c664611SQu Wenruo bioc->raid_map[(i + rot) 
% num_stripes] = 65028e5cfb55SZhao Lei em->start + (tmp + i) * map->stripe_len; 65038e5cfb55SZhao Lei 65044c664611SQu Wenruo bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE; 65058e5cfb55SZhao Lei if (map->type & BTRFS_BLOCK_GROUP_RAID6) 65064c664611SQu Wenruo bioc->raid_map[(i + rot + 1) % num_stripes] = 65078e5cfb55SZhao Lei RAID6_Q_STRIPE; 65088e5cfb55SZhao Lei 65094c664611SQu Wenruo sort_parity_stripes(bioc, num_stripes); 6510593060d7SChris Mason } 6511de11cc12SLi Zefan 65122b19a1feSLiu Bo if (need_full_stripe(op)) 6513d20983b4SMiao Xie max_errors = btrfs_chunk_max_errors(map); 6514de11cc12SLi Zefan 651573c0f228SLiu Bo if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 65162b19a1feSLiu Bo need_full_stripe(op)) { 65174c664611SQu Wenruo handle_ops_on_dev_replace(op, &bioc, dev_replace, logical, 65186143c23cSNaohiro Aota &num_stripes, &max_errors); 6519ad6d620eSStefan Behrens } 6520472262f3SStefan Behrens 65214c664611SQu Wenruo *bioc_ret = bioc; 65224c664611SQu Wenruo bioc->map_type = map->type; 65234c664611SQu Wenruo bioc->num_stripes = num_stripes; 65244c664611SQu Wenruo bioc->max_errors = max_errors; 65254c664611SQu Wenruo bioc->mirror_num = mirror_num; 6526ad6d620eSStefan Behrens 6527ad6d620eSStefan Behrens /* 6528ad6d620eSStefan Behrens * this is the case that REQ_READ && dev_replace_is_ongoing && 6529ad6d620eSStefan Behrens * mirror_num == num_stripes + 1 && dev_replace target drive is 6530ad6d620eSStefan Behrens * available as a mirror 6531ad6d620eSStefan Behrens */ 6532ad6d620eSStefan Behrens if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 6533ad6d620eSStefan Behrens WARN_ON(num_stripes > 1); 65344c664611SQu Wenruo bioc->stripes[0].dev = dev_replace->tgtdev; 65354c664611SQu Wenruo bioc->stripes[0].physical = physical_to_patch_in_first_stripe; 65364c664611SQu Wenruo bioc->mirror_num = map->num_stripes + 1; 6537ad6d620eSStefan Behrens } 6538cea9e445SChris Mason out: 653973beece9SLiu Bo if (dev_replace_is_ongoing) { 
654053176ddeSDavid Sterba lockdep_assert_held(&dev_replace->rwsem);
654153176ddeSDavid Sterba /* Unlock and let waiting writers proceed */
6542cb5583ddSDavid Sterba up_read(&dev_replace->rwsem);
654373beece9SLiu Bo }
65440b86a832SChris Mason free_extent_map(em);
6545de11cc12SLi Zefan return ret;
65460b86a832SChris Mason }
65470b86a832SChris Mason
/*
 * btrfs_map_block - map a logical range to the physical stripes storing it.
 *
 * Discard requests are routed to the discard-specific mapper; everything
 * else goes through __btrfs_map_block() (with need_raid_map == 0).  The
 * resulting btrfs_io_context is returned via @bioc_ret.
 */
6548cf8cddd3SChristoph Hellwig int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6549f2d8d74dSChris Mason u64 logical, u64 *length,
65504c664611SQu Wenruo struct btrfs_io_context **bioc_ret, int mirror_num)
6551f2d8d74dSChris Mason {
655275fb2e9eSDavid Sterba if (op == BTRFS_MAP_DISCARD)
655375fb2e9eSDavid Sterba return __btrfs_map_block_for_discard(fs_info, logical,
65544c664611SQu Wenruo length, bioc_ret);
655575fb2e9eSDavid Sterba
65564c664611SQu Wenruo return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
65578e5cfb55SZhao Lei mirror_num, 0);
6558f2d8d74dSChris Mason }
6559f2d8d74dSChris Mason
6560af8e2d1dSMiao Xie /* For Scrub/replace */
/* Like btrfs_map_block() but with mirror_num == 0 and need_raid_map == 1. */
6561cf8cddd3SChristoph Hellwig int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6562af8e2d1dSMiao Xie u64 logical, u64 *length,
65634c664611SQu Wenruo struct btrfs_io_context **bioc_ret)
6564af8e2d1dSMiao Xie {
65654c664611SQu Wenruo return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
6566af8e2d1dSMiao Xie }
6567af8e2d1dSMiao Xie
/*
 * Restore the bi_private/bi_end_io that were saved in the io_context,
 * complete the bio towards the upper layer and drop the io_context ref.
 */
65684c664611SQu Wenruo static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
65698408c716SMiao Xie {
65704c664611SQu Wenruo bio->bi_private = bioc->private;
65714c664611SQu Wenruo bio->bi_end_io = bioc->end_io;
65724246a0b6SChristoph Hellwig bio_endio(bio);
6573326e1dbbSMike Snitzer
65744c664611SQu Wenruo btrfs_put_bioc(bioc);
65758408c716SMiao Xie }
65768408c716SMiao Xie
/*
 * Completion handler for the per-stripe bios submitted via
 * submit_stripe_bio().  On error it bumps the io_context error count and the
 * per-device stat counters (write/read/flush); once the last pending stripe
 * finishes, the original bio is completed with BLK_STS_OK or BLK_STS_IOERR
 * depending on whether more stripes failed than bioc->max_errors allows.
 */
65774246a0b6SChristoph Hellwig static void btrfs_end_bio(struct bio *bio)
65788790d502SChris Mason {
65794c664611SQu Wenruo struct btrfs_io_context *bioc = bio->bi_private;
65807d2b4daaSChris Mason int is_orig_bio = 0;
65818790d502SChris Mason
65824e4cbee9SChristoph Hellwig if (bio->bi_status) {
65834c664611SQu Wenruo atomic_inc(&bioc->error);
65844e4cbee9SChristoph Hellwig if (bio->bi_status == BLK_STS_IOERR ||
65854e4cbee9SChristoph Hellwig bio->bi_status == BLK_STS_TARGET) {
6586c3a3b19bSQu Wenruo struct btrfs_device *dev = btrfs_bio(bio)->device;
6587442a4f63SStefan Behrens
65883eee86c8SNikolay Borisov ASSERT(dev->bdev);
6589cfe94440SNaohiro Aota if (btrfs_op(bio) == BTRFS_MAP_WRITE)
65901cb34c8eSAnand Jain btrfs_dev_stat_inc_and_print(dev,
6591442a4f63SStefan Behrens BTRFS_DEV_STAT_WRITE_ERRS);
65920cc068e6SDavid Sterba else if (!(bio->bi_opf & REQ_RAHEAD))
65931cb34c8eSAnand Jain btrfs_dev_stat_inc_and_print(dev,
6594442a4f63SStefan Behrens BTRFS_DEV_STAT_READ_ERRS);
659570fd7614SChristoph Hellwig if (bio->bi_opf & REQ_PREFLUSH)
65961cb34c8eSAnand Jain btrfs_dev_stat_inc_and_print(dev,
6597442a4f63SStefan Behrens BTRFS_DEV_STAT_FLUSH_ERRS);
6598442a4f63SStefan Behrens }
6599442a4f63SStefan Behrens }
66008790d502SChris Mason
66014c664611SQu Wenruo if (bio == bioc->orig_bio)
66027d2b4daaSChris Mason is_orig_bio = 1;
66037d2b4daaSChris Mason
66044c664611SQu Wenruo btrfs_bio_counter_dec(bioc->fs_info);
6605c404e0dcSMiao Xie
66064c664611SQu Wenruo if (atomic_dec_and_test(&bioc->stripes_pending)) {
66077d2b4daaSChris Mason if (!is_orig_bio) {
66087d2b4daaSChris Mason bio_put(bio);
66094c664611SQu Wenruo bio = bioc->orig_bio;
66107d2b4daaSChris Mason }
6611c7b22bb1SMuthu Kumar
6612c3a3b19bSQu Wenruo btrfs_bio(bio)->mirror_num = bioc->mirror_num;
6613a236aed1SChris Mason /* only send an error to the higher layers if it is
661453b381b3SDavid Woodhouse * beyond the tolerance of the btrfs bio
6615a236aed1SChris Mason */
66164c664611SQu Wenruo if (atomic_read(&bioc->error) > bioc->max_errors) {
66174e4cbee9SChristoph Hellwig bio->bi_status = BLK_STS_IOERR;
66185dbc8fcaSChris Mason } else {
66191259ab75SChris Mason /*
66201259ab75SChris Mason * this bio is actually up to date, we didn't
66211259ab75SChris Mason * go over the max number of errors
66221259ab75SChris Mason */
66232dbe0c77SAnand Jain bio->bi_status = BLK_STS_OK;
66241259ab75SChris Mason }
6625c55f1396SMiao Xie
66264c664611SQu Wenruo btrfs_end_bioc(bioc, bio);
66277d2b4daaSChris Mason } else if (!is_orig_bio) {
66288790d502SChris Mason bio_put(bio);
66298790d502SChris Mason }
66308790d502SChris Mason }
66318790d502SChris Mason
/*
 * Point one cloned (or the original) bio at a single stripe's device and
 * physical offset and submit it.  For zone-append bios the sector is
 * redirected to the start of the target zone, or the op is downgraded to a
 * plain write on conventional zones.
 */
66324c664611SQu Wenruo static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
6633c31efbdfSNikolay Borisov u64 physical, struct btrfs_device *dev)
6634de1ee92aSJosef Bacik {
66354c664611SQu Wenruo struct btrfs_fs_info *fs_info = bioc->fs_info;
6636de1ee92aSJosef Bacik
66374c664611SQu Wenruo bio->bi_private = bioc;
6638c3a3b19bSQu Wenruo btrfs_bio(bio)->device = dev;
6639de1ee92aSJosef Bacik bio->bi_end_io = btrfs_end_bio;
66404f024f37SKent Overstreet bio->bi_iter.bi_sector = physical >> 9;
6641d8e3fb10SNaohiro Aota /*
6642d8e3fb10SNaohiro Aota * For zone append writing, bi_sector must point the beginning of the
6643d8e3fb10SNaohiro Aota * zone
6644d8e3fb10SNaohiro Aota */
6645d8e3fb10SNaohiro Aota if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
6646d8e3fb10SNaohiro Aota if (btrfs_dev_is_sequential(dev, physical)) {
6647d8e3fb10SNaohiro Aota u64 zone_start = round_down(physical, fs_info->zone_size);
6648d8e3fb10SNaohiro Aota
6649d8e3fb10SNaohiro Aota bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
6650d8e3fb10SNaohiro Aota } else {
6651d8e3fb10SNaohiro Aota bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
6652d8e3fb10SNaohiro Aota bio->bi_opf |= REQ_OP_WRITE;
6653d8e3fb10SNaohiro Aota }
6654d8e3fb10SNaohiro Aota }
6655672d5990SMisono Tomohiro btrfs_debug_in_rcu(fs_info,
6656ab8d0fc4SJeff Mahoney "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
66571201b58bSDavid Sterba bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
66581db45a35SDavid Sterba (unsigned long)dev->bdev->bd_dev,
rcu_str_deref(dev->name),
66591db45a35SDavid Sterba dev->devid, bio->bi_iter.bi_size);
666074d46992SChristoph Hellwig bio_set_dev(bio, dev->bdev);
6661c404e0dcSMiao Xie
66622ff7e61eSJeff Mahoney btrfs_bio_counter_inc_noblocked(fs_info);
6663c404e0dcSMiao Xie
66644e49ea4aSMike Christie btrfsic_submit_bio(bio);
6665de1ee92aSJosef Bacik }
6666de1ee92aSJosef Bacik
/*
 * Account a stripe that could not be submitted at all.  If it was the last
 * pending stripe, complete the original bio; the final status depends on
 * whether the accumulated errors exceed bioc->max_errors.
 */
66674c664611SQu Wenruo static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
6668de1ee92aSJosef Bacik {
66694c664611SQu Wenruo atomic_inc(&bioc->error);
66704c664611SQu Wenruo if (atomic_dec_and_test(&bioc->stripes_pending)) {
667101327610SNicholas D Steeves /* Should be the original bio. */
66724c664611SQu Wenruo WARN_ON(bio != bioc->orig_bio);
66738408c716SMiao Xie
6674c3a3b19bSQu Wenruo btrfs_bio(bio)->mirror_num = bioc->mirror_num;
66754f024f37SKent Overstreet bio->bi_iter.bi_sector = logical >> 9;
66764c664611SQu Wenruo if (atomic_read(&bioc->error) > bioc->max_errors)
66774e4cbee9SChristoph Hellwig bio->bi_status = BLK_STS_IOERR;
6678102ed2c5SAnand Jain else
6679102ed2c5SAnand Jain bio->bi_status = BLK_STS_OK;
66804c664611SQu Wenruo btrfs_end_bioc(bioc, bio);
6681de1ee92aSJosef Bacik }
6682de1ee92aSJosef Bacik }
6683de1ee92aSJosef Bacik
/*
 * btrfs_map_bio - submit a bio to all devices backing its logical range.
 *
 * Maps the bio's logical address, then either hands it to the RAID5/6 code
 * (for writes, or reads with mirror_num > 1, on RAID56 chunks) or clones it
 * once per stripe and submits each clone via submit_stripe_bio().  Unusable
 * devices (missing, no bdev, or not writeable for a write) are accounted
 * through bioc_error() instead of being submitted to.
 */
668458efbc9fSOmar Sandoval blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
668508635baeSChris Mason int mirror_num)
66860b86a832SChris Mason {
66870b86a832SChris Mason struct btrfs_device *dev;
66888790d502SChris Mason struct bio *first_bio = bio;
66891201b58bSDavid Sterba u64 logical = bio->bi_iter.bi_sector << 9;
66900b86a832SChris Mason u64 length = 0;
66910b86a832SChris Mason u64 map_length;
66920b86a832SChris Mason int ret;
669308da757dSZhao Lei int dev_nr;
669408da757dSZhao Lei int total_devs;
66954c664611SQu Wenruo struct btrfs_io_context *bioc = NULL;
66960b86a832SChris Mason
66974f024f37SKent Overstreet length = bio->bi_iter.bi_size;
66980b86a832SChris Mason map_length = length;
6699cea9e445SChris Mason
67000b246afaSJeff Mahoney btrfs_bio_counter_inc_blocked(fs_info);
6701bd7d63c2SLiu Bo ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
67024c664611SQu Wenruo &map_length, &bioc, mirror_num, 1);
6703c404e0dcSMiao Xie if (ret) {
67040b246afaSJeff Mahoney btrfs_bio_counter_dec(fs_info);
670558efbc9fSOmar Sandoval return errno_to_blk_status(ret);
6706c404e0dcSMiao Xie }
6707cea9e445SChris Mason
67084c664611SQu Wenruo total_devs = bioc->num_stripes;
67094c664611SQu Wenruo bioc->orig_bio = first_bio;
67104c664611SQu Wenruo bioc->private = first_bio->bi_private;
67114c664611SQu Wenruo bioc->end_io = first_bio->bi_end_io;
67124c664611SQu Wenruo atomic_set(&bioc->stripes_pending, bioc->num_stripes);
671353b381b3SDavid Woodhouse
67144c664611SQu Wenruo if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6715cfe94440SNaohiro Aota ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
671653b381b3SDavid Woodhouse /* In this case, map_length has been set to the length of
671753b381b3SDavid Woodhouse a single stripe; not the whole write */
6718cfe94440SNaohiro Aota if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
67196a258d72SQu Wenruo ret = raid56_parity_write(bio, bioc, map_length);
672053b381b3SDavid Woodhouse } else {
67216a258d72SQu Wenruo ret = raid56_parity_recover(bio, bioc, map_length,
67226a258d72SQu Wenruo mirror_num, 1);
672353b381b3SDavid Woodhouse }
67244245215dSMiao Xie
67250b246afaSJeff Mahoney btrfs_bio_counter_dec(fs_info);
672658efbc9fSOmar Sandoval return errno_to_blk_status(ret);
672753b381b3SDavid Woodhouse }
672853b381b3SDavid Woodhouse
6729239b14b3SChris Mason if (map_length < length) {
67300b246afaSJeff Mahoney btrfs_crit(fs_info,
67315d163e0eSJeff Mahoney "mapping failed logical %llu bio len %llu len %llu",
6732c1c9ff7cSGeert Uytterhoeven logical, length, map_length);
6733239b14b3SChris Mason BUG();
6734239b14b3SChris Mason }
6735a1d3c478SJan Schmidt
673608da757dSZhao Lei for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
67374c664611SQu Wenruo dev = bioc->stripes[dev_nr].dev;
6738fc8a168aSNikolay Borisov if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
6739fc8a168aSNikolay Borisov &dev->dev_state) ||
6740cfe94440SNaohiro Aota (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
6741ebbede42SAnand Jain !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
67424c664611SQu Wenruo bioc_error(bioc, first_bio, logical);
6743de1ee92aSJosef Bacik continue;
6744de1ee92aSJosef Bacik }
6745de1ee92aSJosef Bacik
67463aa8e074SDavid Sterba if (dev_nr < total_devs - 1)
67478b6c1d56SDavid Sterba bio = btrfs_bio_clone(first_bio);
67483aa8e074SDavid Sterba else
67498790d502SChris Mason bio = first_bio;
6750606686eeSJosef Bacik
67514c664611SQu Wenruo submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
67528790d502SChris Mason }
67530b246afaSJeff Mahoney btrfs_bio_counter_dec(fs_info);
675458efbc9fSOmar Sandoval return BLK_STS_OK;
67550b86a832SChris Mason }
67560b86a832SChris Mason
675709ba3bc9SAnand Jain /*
675809ba3bc9SAnand Jain * Find a device specified by @devid or @uuid in the list of @fs_devices, or
675909ba3bc9SAnand Jain * return NULL.
676009ba3bc9SAnand Jain *
676109ba3bc9SAnand Jain * If devid and uuid are both specified, the match must be exact, otherwise
676209ba3bc9SAnand Jain * only devid is used.
676309ba3bc9SAnand Jain */
/*
 * NOTE(review): @fsid, when non-NULL, restricts the search to the
 * fs_devices (current or a seed) whose metadata_uuid matches it.
 */
6764e4319cd9SAnand Jain struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6765b2598edfSAnand Jain u64 devid, u8 *uuid, u8 *fsid)
67660b86a832SChris Mason {
67672b82032cSYan Zheng struct btrfs_device *device;
6768944d3f9fSNikolay Borisov struct btrfs_fs_devices *seed_devs;
67690b86a832SChris Mason
6770944d3f9fSNikolay Borisov if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6771944d3f9fSNikolay Borisov list_for_each_entry(device, &fs_devices->devices, dev_list) {
6772944d3f9fSNikolay Borisov if (device->devid == devid &&
6773944d3f9fSNikolay Borisov (!uuid || memcmp(device->uuid, uuid,
6774944d3f9fSNikolay Borisov BTRFS_UUID_SIZE) == 0))
6775944d3f9fSNikolay Borisov return device;
6776944d3f9fSNikolay Borisov }
6777944d3f9fSNikolay Borisov }
6778944d3f9fSNikolay Borisov
6779944d3f9fSNikolay Borisov list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
67802b82032cSYan Zheng if (!fsid ||
6781944d3f9fSNikolay Borisov !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6782944d3f9fSNikolay Borisov list_for_each_entry(device, &seed_devs->devices,
678309ba3bc9SAnand Jain dev_list) {
678409ba3bc9SAnand Jain if (device->devid == devid &&
678509ba3bc9SAnand Jain (!uuid || memcmp(device->uuid, uuid,
678609ba3bc9SAnand Jain BTRFS_UUID_SIZE) == 0))
67872b82032cSYan Zheng return device;
67882b82032cSYan Zheng }
678909ba3bc9SAnand Jain }
67902b82032cSYan Zheng }
6791944d3f9fSNikolay Borisov
67922b82032cSYan Zheng return NULL;
67930b86a832SChris Mason }
67940b86a832SChris Mason
/*
 * Create a placeholder btrfs_device for a devid that is referenced by
 * metadata but not currently present, link it into @fs_devices and mark it
 * MISSING.  Returns the new device or an ERR_PTR() from allocation.
 */
67952ff7e61eSJeff Mahoney static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6796dfe25020SChris Mason u64 devid, u8 *dev_uuid)
6797dfe25020SChris Mason {
6798dfe25020SChris Mason struct btrfs_device *device;
6799fccc0007SJosef Bacik unsigned int nofs_flag;
6800dfe25020SChris Mason
6801fccc0007SJosef Bacik /*
6802fccc0007SJosef Bacik * We call this under the chunk_mutex, so we want to use NOFS for this
6803fccc0007SJosef Bacik * allocation, however we don't want to change btrfs_alloc_device() to
6804fccc0007SJosef Bacik * always do NOFS because we use it in a lot of other GFP_KERNEL safe
6805fccc0007SJosef Bacik * places.
6806fccc0007SJosef Bacik */
6807fccc0007SJosef Bacik nofs_flag = memalloc_nofs_save();
680812bd2fc0SIlya Dryomov device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6809fccc0007SJosef Bacik memalloc_nofs_restore(nofs_flag);
681012bd2fc0SIlya Dryomov if (IS_ERR(device))
6811adfb69afSAnand Jain return device;
681212bd2fc0SIlya Dryomov
681312bd2fc0SIlya Dryomov list_add(&device->dev_list, &fs_devices->devices);
6814e4404d6eSYan Zheng device->fs_devices = fs_devices;
6815dfe25020SChris Mason fs_devices->num_devices++;
681612bd2fc0SIlya Dryomov
6817e6e674bdSAnand Jain set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6818cd02dca5SChris Mason fs_devices->missing_devices++;
681912bd2fc0SIlya Dryomov
6820dfe25020SChris Mason return device;
6821dfe25020SChris Mason }
6822dfe25020SChris Mason
682312bd2fc0SIlya Dryomov /**
682412bd2fc0SIlya Dryomov * btrfs_alloc_device - allocate struct btrfs_device
682512bd2fc0SIlya Dryomov * @fs_info: used only for generating a new devid, can be NULL if
682612bd2fc0SIlya Dryomov * devid is provided (i.e. @devid != NULL).
682712bd2fc0SIlya Dryomov * @devid: a pointer to devid for this device. If NULL a new devid
682812bd2fc0SIlya Dryomov * is generated.
682912bd2fc0SIlya Dryomov * @uuid: a pointer to UUID for this device. If NULL a new UUID
683012bd2fc0SIlya Dryomov * is generated.
683112bd2fc0SIlya Dryomov *
683212bd2fc0SIlya Dryomov * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
683348dae9cfSDavid Sterba * on error. Returned struct is not linked onto any lists and must be
6834a425f9d4SDavid Sterba * destroyed with btrfs_free_device.
*devid;
687412bd2fc0SIlya Dryomov else {
687512bd2fc0SIlya Dryomov int ret;
687612bd2fc0SIlya Dryomov
687712bd2fc0SIlya Dryomov ret = find_next_devid(fs_info, &tmp);
687812bd2fc0SIlya Dryomov if (ret) {
6879a425f9d4SDavid Sterba btrfs_free_device(dev);
688012bd2fc0SIlya Dryomov return ERR_PTR(ret);
688112bd2fc0SIlya Dryomov }
688212bd2fc0SIlya Dryomov }
688312bd2fc0SIlya Dryomov dev->devid = tmp;
688412bd2fc0SIlya Dryomov
688512bd2fc0SIlya Dryomov if (uuid)
688612bd2fc0SIlya Dryomov memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
688712bd2fc0SIlya Dryomov else
688812bd2fc0SIlya Dryomov generate_random_uuid(dev->uuid);
688912bd2fc0SIlya Dryomov
689012bd2fc0SIlya Dryomov return dev;
689112bd2fc0SIlya Dryomov }
689212bd2fc0SIlya Dryomov
/* Ratelimited report of a missing device; error or warning per @error. */
68935a2b8e60SAnand Jain static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
68942b902dfcSAnand Jain u64 devid, u8 *uuid, bool error)
68955a2b8e60SAnand Jain {
68962b902dfcSAnand Jain if (error)
68972b902dfcSAnand Jain btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
68982b902dfcSAnand Jain devid, uuid);
68992b902dfcSAnand Jain else
69002b902dfcSAnand Jain btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
69012b902dfcSAnand Jain devid, uuid);
69025a2b8e60SAnand Jain }
69035a2b8e60SAnand Jain
/* Length of one stripe: chunk length divided by the number of data stripes. */
690439e264a4SNikolay Borisov static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
690539e264a4SNikolay Borisov {
6906d58ede8dSDavid Sterba const int data_stripes = calc_data_stripes(type, num_stripes);
6907e4f6c6beSDavid Sterba
690839e264a4SNikolay Borisov return div_u64(chunk_len, data_stripes);
690939e264a4SNikolay Borisov }
691039e264a4SNikolay Borisov
6911e9306ad4SQu Wenruo #if BITS_PER_LONG == 32
6912e9306ad4SQu Wenruo /*
6913e9306ad4SQu Wenruo * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
6914e9306ad4SQu Wenruo * can't be accessed on 32bit systems.
6915e9306ad4SQu Wenruo *
6916e9306ad4SQu Wenruo * This function do mount time check to reject the fs if it already has
6917e9306ad4SQu Wenruo * metadata chunk beyond that limit.
6918e9306ad4SQu Wenruo */
6919e9306ad4SQu Wenruo static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
6920e9306ad4SQu Wenruo u64 logical, u64 length, u64 type)
6921e9306ad4SQu Wenruo {
6922e9306ad4SQu Wenruo if (!(type & BTRFS_BLOCK_GROUP_METADATA))
6923e9306ad4SQu Wenruo return 0;
6924e9306ad4SQu Wenruo
6925e9306ad4SQu Wenruo if (logical + length < MAX_LFS_FILESIZE)
6926e9306ad4SQu Wenruo return 0;
6927e9306ad4SQu Wenruo
6928e9306ad4SQu Wenruo btrfs_err_32bit_limit(fs_info);
6929e9306ad4SQu Wenruo return -EOVERFLOW;
6930e9306ad4SQu Wenruo }
6931e9306ad4SQu Wenruo
6932e9306ad4SQu Wenruo /*
6933e9306ad4SQu Wenruo * This is to give early warning for any metadata chunk reaching
6934e9306ad4SQu Wenruo * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
6935e9306ad4SQu Wenruo * Although we can still access the metadata, it's not going to be possible
6936e9306ad4SQu Wenruo * once the limit is reached.
6937e9306ad4SQu Wenruo */
6938e9306ad4SQu Wenruo static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
6939e9306ad4SQu Wenruo u64 logical, u64 length, u64 type)
6940e9306ad4SQu Wenruo {
6941e9306ad4SQu Wenruo if (!(type & BTRFS_BLOCK_GROUP_METADATA))
6942e9306ad4SQu Wenruo return;
6943e9306ad4SQu Wenruo
6944e9306ad4SQu Wenruo if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
6945e9306ad4SQu Wenruo return;
6946e9306ad4SQu Wenruo
6947e9306ad4SQu Wenruo btrfs_warn_32bit_limit(fs_info);
6948e9306ad4SQu Wenruo }
6949e9306ad4SQu Wenruo #endif
6950e9306ad4SQu Wenruo
/*
 * read_one_chunk - build the in-memory mapping for one chunk item.
 *
 * Translates the chunk item at @key/@chunk into a map_lookup embedded in an
 * extent_map and inserts it into fs_info->mapping_tree.  Stripe devices not
 * currently present are an error unless mounted with -o degraded, in which
 * case MISSING placeholders are created via add_missing_dev().
 */
69519690ac09SDavid Sterba static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
69520b86a832SChris Mason struct btrfs_chunk *chunk)
69530b86a832SChris Mason {
69549690ac09SDavid Sterba struct btrfs_fs_info *fs_info = leaf->fs_info;
6955c8bf1b67SDavid Sterba struct extent_map_tree *map_tree = &fs_info->mapping_tree;
69560b86a832SChris Mason struct map_lookup *map;
69570b86a832SChris Mason struct extent_map *em;
69580b86a832SChris Mason u64 logical;
69590b86a832SChris Mason u64 length;
69600b86a832SChris Mason u64 devid;
6961e9306ad4SQu Wenruo u64 type;
6962a443755fSChris Mason u8 uuid[BTRFS_UUID_SIZE];
6963593060d7SChris Mason int num_stripes;
69640b86a832SChris Mason int ret;
6965593060d7SChris Mason int i;
69660b86a832SChris Mason
6967e17cade2SChris Mason logical = key->offset;
6968e17cade2SChris Mason length = btrfs_chunk_length(leaf, chunk);
6969e9306ad4SQu Wenruo type = btrfs_chunk_type(leaf, chunk);
6970f04b772bSQu Wenruo num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6971e06cd3ddSLiu Bo
6972e9306ad4SQu Wenruo #if BITS_PER_LONG == 32
6973e9306ad4SQu Wenruo ret = check_32bit_meta_chunk(fs_info, logical, length, type);
6974e9306ad4SQu Wenruo if (ret < 0)
6975e9306ad4SQu Wenruo return ret;
6976e9306ad4SQu Wenruo warn_32bit_meta_chunk(fs_info, logical, length, type);
6977e9306ad4SQu Wenruo #endif
6978e9306ad4SQu Wenruo
6979075cb3c7SQu Wenruo /*
6980075cb3c7SQu
Wenruo * Only need to verify chunk item if we're reading from sys chunk array,
6981075cb3c7SQu Wenruo * as chunk item in tree block is already verified by tree-checker.
6982075cb3c7SQu Wenruo */
6983075cb3c7SQu Wenruo if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6984ddaf1d5aSDavid Sterba ret = btrfs_check_chunk_valid(leaf, chunk, logical);
6985e06cd3ddSLiu Bo if (ret)
6986e06cd3ddSLiu Bo return ret;
6987075cb3c7SQu Wenruo }
6988a061fc8dSChris Mason
6989c8bf1b67SDavid Sterba read_lock(&map_tree->lock);
6990c8bf1b67SDavid Sterba em = lookup_extent_mapping(map_tree, logical, 1);
6991c8bf1b67SDavid Sterba read_unlock(&map_tree->lock);
69920b86a832SChris Mason
69930b86a832SChris Mason /* already mapped? */
69940b86a832SChris Mason if (em && em->start <= logical && em->start + em->len > logical) {
69950b86a832SChris Mason free_extent_map(em);
69960b86a832SChris Mason return 0;
69970b86a832SChris Mason } else if (em) {
69980b86a832SChris Mason free_extent_map(em);
69990b86a832SChris Mason }
70000b86a832SChris Mason
7001172ddd60SDavid Sterba em = alloc_extent_map();
70020b86a832SChris Mason if (!em)
70030b86a832SChris Mason return -ENOMEM;
7004593060d7SChris Mason map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
70050b86a832SChris Mason if (!map) {
70060b86a832SChris Mason free_extent_map(em);
70070b86a832SChris Mason return -ENOMEM;
70080b86a832SChris Mason }
70090b86a832SChris Mason
7010298a8f9cSWang Shilong set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
701195617d69SJeff Mahoney em->map_lookup = map;
70120b86a832SChris Mason em->start = logical;
70130b86a832SChris Mason em->len = length;
701470c8a91cSJosef Bacik em->orig_start = 0;
70150b86a832SChris Mason em->block_start = 0;
7016c8b97818SChris Mason em->block_len = em->len;
70170b86a832SChris Mason
7018593060d7SChris Mason map->num_stripes = num_stripes;
7019593060d7SChris Mason map->io_width = btrfs_chunk_io_width(leaf, chunk);
7020593060d7SChris Mason map->io_align = btrfs_chunk_io_align(leaf, chunk);
7021593060d7SChris Mason map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
7022e9306ad4SQu Wenruo map->type = type;
7023321aecc6SChris Mason map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
7024cf90d884SQu Wenruo map->verified_stripes = 0;
7025e9306ad4SQu Wenruo em->orig_block_len = calc_stripe_length(type, em->len,
702639e264a4SNikolay Borisov map->num_stripes);
7027593060d7SChris Mason for (i = 0; i < num_stripes; i++) {
7028593060d7SChris Mason map->stripes[i].physical =
7029593060d7SChris Mason btrfs_stripe_offset_nr(leaf, chunk, i);
7030593060d7SChris Mason devid = btrfs_stripe_devid_nr(leaf, chunk, i);
7031a443755fSChris Mason read_extent_buffer(leaf, uuid, (unsigned long)
7032a443755fSChris Mason btrfs_stripe_dev_uuid_nr(chunk, i),
7033a443755fSChris Mason BTRFS_UUID_SIZE);
7034e4319cd9SAnand Jain map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
7035b2598edfSAnand Jain devid, uuid, NULL);
70363cdde224SJeff Mahoney if (!map->stripes[i].dev &&
70370b246afaSJeff Mahoney !btrfs_test_opt(fs_info, DEGRADED)) {
7038dfe25020SChris Mason free_extent_map(em);
70392b902dfcSAnand Jain btrfs_report_missing_device(fs_info, devid, uuid, true);
704045dbdbc9SAnand Jain return -ENOENT;
7041dfe25020SChris Mason }
7042dfe25020SChris Mason if (!map->stripes[i].dev) {
7043dfe25020SChris Mason map->stripes[i].dev =
70442ff7e61eSJeff Mahoney add_missing_dev(fs_info->fs_devices, devid,
70452ff7e61eSJeff Mahoney uuid);
7046adfb69afSAnand Jain if (IS_ERR(map->stripes[i].dev)) {
70470b86a832SChris Mason free_extent_map(em);
7048adfb69afSAnand Jain btrfs_err(fs_info,
7049adfb69afSAnand Jain "failed to init missing dev %llu: %ld",
7050adfb69afSAnand Jain devid, PTR_ERR(map->stripes[i].dev));
7051adfb69afSAnand Jain return PTR_ERR(map->stripes[i].dev);
70520b86a832SChris Mason }
70532b902dfcSAnand Jain btrfs_report_missing_device(fs_info, devid, uuid, false);
7054593060d7SChris Mason }
7055e12c9621SAnand Jain set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
7056e12c9621SAnand Jain &(map->stripes[i].dev->dev_state));
7057e12c9621SAnand Jain
7058dfe25020SChris Mason }
70590b86a832SChris Mason
7060c8bf1b67SDavid Sterba write_lock(&map_tree->lock);
7061c8bf1b67SDavid Sterba ret = add_extent_mapping(map_tree, em, 0);
7062c8bf1b67SDavid Sterba write_unlock(&map_tree->lock);
706364f64f43SQu Wenruo if (ret < 0) {
706464f64f43SQu Wenruo btrfs_err(fs_info,
706564f64f43SQu Wenruo "failed to add chunk map, start=%llu len=%llu: %d",
706664f64f43SQu Wenruo em->start, em->len, ret);
706764f64f43SQu Wenruo }
70680b86a832SChris Mason free_extent_map(em);
70690b86a832SChris Mason
707064f64f43SQu Wenruo return ret;
70710b86a832SChris Mason }
70720b86a832SChris Mason
/*
 * Copy the on-disk fields of @dev_item into the in-memory btrfs_device,
 * and clear any stale replace-target state.
 */
7073143bede5SJeff Mahoney static void fill_device_from_item(struct extent_buffer *leaf,
70740b86a832SChris Mason struct btrfs_dev_item *dev_item,
70750b86a832SChris Mason struct btrfs_device *device)
70760b86a832SChris Mason {
70770b86a832SChris Mason unsigned long ptr;
70780b86a832SChris Mason
70790b86a832SChris Mason device->devid = btrfs_device_id(leaf, dev_item);
7080d6397baeSChris Ball device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
7081d6397baeSChris Ball device->total_bytes = device->disk_total_bytes;
7082935e5cc9SMiao Xie device->commit_total_bytes = device->disk_total_bytes;
70830b86a832SChris Mason device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
7084ce7213c7SMiao Xie device->commit_bytes_used = device->bytes_used;
70850b86a832SChris Mason device->type = btrfs_device_type(leaf, dev_item);
70860b86a832SChris Mason device->io_align = btrfs_device_io_align(leaf, dev_item);
70870b86a832SChris Mason device->io_width = btrfs_device_io_width(leaf, dev_item);
70880b86a832SChris Mason device->sector_size = btrfs_device_sector_size(leaf, dev_item);
70898dabb742SStefan Behrens WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
7090401e29c1SAnand Jain clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
70910b86a832SChris Mason
7092410ba3a2SGeert Uytterhoeven ptr = btrfs_device_uuid(dev_item);
7093e17cade2SChris Mason read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
70940b86a832SChris Mason }
70950b86a832SChris Mason
/*
 * open_seed_devices - find or create the fs_devices for a seed filesystem.
 *
 * Looks up @fsid on the current fs_devices' seed_list first; on first use it
 * clones and opens the registered fs_devices for that fsid and anchors the
 * copy on the seed_list.  Without -o degraded an unknown fsid is an error
 * (ERR_PTR(-ENOENT)).  Caller must hold uuid_mutex.
 */
70962ff7e61eSJeff Mahoney static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
70975f375835SMiao Xie u8 *fsid)
70982b82032cSYan Zheng {
70992b82032cSYan Zheng struct btrfs_fs_devices *fs_devices;
71002b82032cSYan Zheng int ret;
71012b82032cSYan Zheng
7102a32bf9a3SDavid Sterba lockdep_assert_held(&uuid_mutex);
71032dfeca9bSDavid Sterba ASSERT(fsid);
71042b82032cSYan Zheng
7105427c8fddSNikolay Borisov /* This will match only for multi-device seed fs */
7106944d3f9fSNikolay Borisov list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
710744880fdcSAnand Jain if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
71085f375835SMiao Xie return fs_devices;
71095f375835SMiao Xie
71102b82032cSYan Zheng
71117239ff4bSNikolay Borisov fs_devices = find_fsid(fsid, NULL);
71122b82032cSYan Zheng if (!fs_devices) {
71130b246afaSJeff Mahoney if (!btrfs_test_opt(fs_info, DEGRADED))
71145f375835SMiao Xie return ERR_PTR(-ENOENT);
71155f375835SMiao Xie
71167239ff4bSNikolay Borisov fs_devices = alloc_fs_devices(fsid, NULL);
71175f375835SMiao Xie if (IS_ERR(fs_devices))
71185f375835SMiao Xie return fs_devices;
71195f375835SMiao Xie
71200395d84fSJohannes Thumshirn fs_devices->seeding = true;
71215f375835SMiao Xie fs_devices->opened = 1;
71225f375835SMiao Xie return fs_devices;
71232b82032cSYan Zheng }
7124e4404d6eSYan Zheng
7125427c8fddSNikolay Borisov /*
7126427c8fddSNikolay Borisov * Upon first call for a seed fs fsid, just create a private copy of the
7127427c8fddSNikolay Borisov * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
7128427c8fddSNikolay Borisov */
7129e4404d6eSYan Zheng fs_devices = clone_fs_devices(fs_devices);
71305f375835SMiao Xie if (IS_ERR(fs_devices))
71315f375835SMiao Xie return fs_devices;
71322b82032cSYan Zheng
7133897fb573SAnand Jain ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
713448d28232SJulia Lawall if (ret) {
713548d28232SJulia Lawall free_fs_devices(fs_devices);
7136c83b60c0SAnand Jain return ERR_PTR(ret);
713748d28232SJulia Lawall }
71382b82032cSYan Zheng
71392b82032cSYan Zheng if (!fs_devices->seeding) {
71400226e0ebSAnand Jain close_fs_devices(fs_devices);
7141e4404d6eSYan Zheng free_fs_devices(fs_devices);
7142c83b60c0SAnand Jain return ERR_PTR(-EINVAL);
71432b82032cSYan Zheng }
71442b82032cSYan Zheng
7145944d3f9fSNikolay Borisov list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
7146c83b60c0SAnand Jain
71475f375835SMiao Xie return fs_devices;
71482b82032cSYan Zheng }
71492b82032cSYan Zheng
715017850759SDavid Sterba static int read_one_dev(struct extent_buffer *leaf,
71510b86a832SChris Mason struct btrfs_dev_item *dev_item)
71520b86a832SChris Mason {
715317850759SDavid Sterba struct btrfs_fs_info *fs_info = leaf->fs_info;
71540b246afaSJeff Mahoney struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
71550b86a832SChris Mason struct btrfs_device *device;
71560b86a832SChris Mason u64 devid;
71570b86a832SChris Mason int ret;
715844880fdcSAnand Jain u8 fs_uuid[BTRFS_FSID_SIZE];
7159a443755fSChris Mason u8 dev_uuid[BTRFS_UUID_SIZE];
7160a443755fSChris Mason
71610b86a832SChris Mason devid = btrfs_device_id(leaf, dev_item);
7162410ba3a2SGeert Uytterhoeven read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
7163a443755fSChris Mason BTRFS_UUID_SIZE);
71641473b24eSGeert Uytterhoeven read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
716544880fdcSAnand Jain BTRFS_FSID_SIZE);
71662b82032cSYan Zheng
7167de37aa51SNikolay Borisov if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
71682ff7e61eSJeff Mahoney fs_devices = open_seed_devices(fs_info, fs_uuid);
71695f375835SMiao Xie if (IS_ERR(fs_devices))
71705f375835SMiao Xie return PTR_ERR(fs_devices);
71712b82032cSYan
Zheng } 71722b82032cSYan Zheng 7173e4319cd9SAnand Jain device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 7174b2598edfSAnand Jain fs_uuid); 71755f375835SMiao Xie if (!device) { 7176c5502451SQu Wenruo if (!btrfs_test_opt(fs_info, DEGRADED)) { 71772b902dfcSAnand Jain btrfs_report_missing_device(fs_info, devid, 71782b902dfcSAnand Jain dev_uuid, true); 717945dbdbc9SAnand Jain return -ENOENT; 7180c5502451SQu Wenruo } 71812b82032cSYan Zheng 71822ff7e61eSJeff Mahoney device = add_missing_dev(fs_devices, devid, dev_uuid); 7183adfb69afSAnand Jain if (IS_ERR(device)) { 7184adfb69afSAnand Jain btrfs_err(fs_info, 7185adfb69afSAnand Jain "failed to add missing dev %llu: %ld", 7186adfb69afSAnand Jain devid, PTR_ERR(device)); 7187adfb69afSAnand Jain return PTR_ERR(device); 7188adfb69afSAnand Jain } 71892b902dfcSAnand Jain btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 71905f375835SMiao Xie } else { 7191c5502451SQu Wenruo if (!device->bdev) { 71922b902dfcSAnand Jain if (!btrfs_test_opt(fs_info, DEGRADED)) { 71932b902dfcSAnand Jain btrfs_report_missing_device(fs_info, 71942b902dfcSAnand Jain devid, dev_uuid, true); 719545dbdbc9SAnand Jain return -ENOENT; 7196c5502451SQu Wenruo } 71972b902dfcSAnand Jain btrfs_report_missing_device(fs_info, devid, 71982b902dfcSAnand Jain dev_uuid, false); 71992b902dfcSAnand Jain } 72005f375835SMiao Xie 7201e6e674bdSAnand Jain if (!device->bdev && 7202e6e674bdSAnand Jain !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 7203cd02dca5SChris Mason /* 7204cd02dca5SChris Mason * this happens when a device that was properly setup 7205cd02dca5SChris Mason * in the device info lists suddenly goes bad. 
7206cd02dca5SChris Mason * device->bdev is NULL, and so we have to set 7207cd02dca5SChris Mason * device->missing to one here 7208cd02dca5SChris Mason */ 72095f375835SMiao Xie device->fs_devices->missing_devices++; 7210e6e674bdSAnand Jain set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 72116324fbf3SChris Mason } 72125f375835SMiao Xie 72135f375835SMiao Xie /* Move the device to its own fs_devices */ 72145f375835SMiao Xie if (device->fs_devices != fs_devices) { 7215e6e674bdSAnand Jain ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, 7216e6e674bdSAnand Jain &device->dev_state)); 72175f375835SMiao Xie 72185f375835SMiao Xie list_move(&device->dev_list, &fs_devices->devices); 72195f375835SMiao Xie device->fs_devices->num_devices--; 72205f375835SMiao Xie fs_devices->num_devices++; 72215f375835SMiao Xie 72225f375835SMiao Xie device->fs_devices->missing_devices--; 72235f375835SMiao Xie fs_devices->missing_devices++; 72245f375835SMiao Xie 72255f375835SMiao Xie device->fs_devices = fs_devices; 72265f375835SMiao Xie } 72272b82032cSYan Zheng } 72282b82032cSYan Zheng 72290b246afaSJeff Mahoney if (device->fs_devices != fs_info->fs_devices) { 7230ebbede42SAnand Jain BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); 72312b82032cSYan Zheng if (device->generation != 72322b82032cSYan Zheng btrfs_device_generation(leaf, dev_item)) 72332b82032cSYan Zheng return -EINVAL; 72342b82032cSYan Zheng } 72350b86a832SChris Mason 72360b86a832SChris Mason fill_device_from_item(leaf, dev_item, device); 72373a160a93SAnand Jain if (device->bdev) { 72383a160a93SAnand Jain u64 max_total_bytes = i_size_read(device->bdev->bd_inode); 72393a160a93SAnand Jain 72403a160a93SAnand Jain if (device->total_bytes > max_total_bytes) { 72413a160a93SAnand Jain btrfs_err(fs_info, 72423a160a93SAnand Jain "device total_bytes should be at most %llu but found %llu", 72433a160a93SAnand Jain max_total_bytes, device->total_bytes); 72443a160a93SAnand Jain return -EINVAL; 72453a160a93SAnand Jain } 
72463a160a93SAnand Jain } 7247e12c9621SAnand Jain set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 7248ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 7249401e29c1SAnand Jain !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 72502b82032cSYan Zheng device->fs_devices->total_rw_bytes += device->total_bytes; 7251a5ed45f8SNikolay Borisov atomic64_add(device->total_bytes - device->bytes_used, 7252a5ed45f8SNikolay Borisov &fs_info->free_chunk_space); 72532bf64758SJosef Bacik } 72540b86a832SChris Mason ret = 0; 72550b86a832SChris Mason return ret; 72560b86a832SChris Mason } 72570b86a832SChris Mason 72586bccf3abSJeff Mahoney int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 72590b86a832SChris Mason { 72606bccf3abSJeff Mahoney struct btrfs_root *root = fs_info->tree_root; 7261ab8d0fc4SJeff Mahoney struct btrfs_super_block *super_copy = fs_info->super_copy; 7262a061fc8dSChris Mason struct extent_buffer *sb; 72630b86a832SChris Mason struct btrfs_disk_key *disk_key; 72640b86a832SChris Mason struct btrfs_chunk *chunk; 72651ffb22cfSDavid Sterba u8 *array_ptr; 72661ffb22cfSDavid Sterba unsigned long sb_array_offset; 726784eed90fSChris Mason int ret = 0; 72680b86a832SChris Mason u32 num_stripes; 72690b86a832SChris Mason u32 array_size; 72700b86a832SChris Mason u32 len = 0; 72711ffb22cfSDavid Sterba u32 cur_offset; 7272e06cd3ddSLiu Bo u64 type; 727384eed90fSChris Mason struct btrfs_key key; 72740b86a832SChris Mason 72750b246afaSJeff Mahoney ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 7276a83fffb7SDavid Sterba /* 7277a83fffb7SDavid Sterba * This will create extent buffer of nodesize, superblock size is 7278a83fffb7SDavid Sterba * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will 7279a83fffb7SDavid Sterba * overallocate but we can keep it as-is, only the first page is used. 
7280a83fffb7SDavid Sterba */ 72813fbaf258SJosef Bacik sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET, 72823fbaf258SJosef Bacik root->root_key.objectid, 0); 7283c871b0f2SLiu Bo if (IS_ERR(sb)) 7284c871b0f2SLiu Bo return PTR_ERR(sb); 72854db8c528SDavid Sterba set_extent_buffer_uptodate(sb); 72868a334426SDavid Sterba /* 728701327610SNicholas D Steeves * The sb extent buffer is artificial and just used to read the system array. 72884db8c528SDavid Sterba * set_extent_buffer_uptodate() call does not properly mark all it's 72898a334426SDavid Sterba * pages up-to-date when the page is larger: extent does not cover the 72908a334426SDavid Sterba * whole page and consequently check_page_uptodate does not find all 72918a334426SDavid Sterba * the page's extents up-to-date (the hole beyond sb), 72928a334426SDavid Sterba * write_extent_buffer then triggers a WARN_ON. 72938a334426SDavid Sterba * 72948a334426SDavid Sterba * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, 72958a334426SDavid Sterba * but sb spans only this function. Add an explicit SetPageUptodate call 72968a334426SDavid Sterba * to silence the warning eg. on PowerPC 64. 72978a334426SDavid Sterba */ 729809cbfeafSKirill A. 
Shutemov if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 7299727011e0SChris Mason SetPageUptodate(sb->pages[0]); 73004008c04aSChris Mason 7301a061fc8dSChris Mason write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 73020b86a832SChris Mason array_size = btrfs_super_sys_array_size(super_copy); 73030b86a832SChris Mason 73041ffb22cfSDavid Sterba array_ptr = super_copy->sys_chunk_array; 73051ffb22cfSDavid Sterba sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 73061ffb22cfSDavid Sterba cur_offset = 0; 73070b86a832SChris Mason 73081ffb22cfSDavid Sterba while (cur_offset < array_size) { 73091ffb22cfSDavid Sterba disk_key = (struct btrfs_disk_key *)array_ptr; 7310e3540eabSDavid Sterba len = sizeof(*disk_key); 7311e3540eabSDavid Sterba if (cur_offset + len > array_size) 7312e3540eabSDavid Sterba goto out_short_read; 7313e3540eabSDavid Sterba 73140b86a832SChris Mason btrfs_disk_key_to_cpu(&key, disk_key); 73150b86a832SChris Mason 73161ffb22cfSDavid Sterba array_ptr += len; 73171ffb22cfSDavid Sterba sb_array_offset += len; 73181ffb22cfSDavid Sterba cur_offset += len; 73190b86a832SChris Mason 732032ab3d1bSJohannes Thumshirn if (key.type != BTRFS_CHUNK_ITEM_KEY) { 732132ab3d1bSJohannes Thumshirn btrfs_err(fs_info, 732232ab3d1bSJohannes Thumshirn "unexpected item type %u in sys_array at offset %u", 732332ab3d1bSJohannes Thumshirn (u32)key.type, cur_offset); 732432ab3d1bSJohannes Thumshirn ret = -EIO; 732532ab3d1bSJohannes Thumshirn break; 732632ab3d1bSJohannes Thumshirn } 732732ab3d1bSJohannes Thumshirn 73281ffb22cfSDavid Sterba chunk = (struct btrfs_chunk *)sb_array_offset; 7329e3540eabSDavid Sterba /* 733032ab3d1bSJohannes Thumshirn * At least one btrfs_chunk with one stripe must be present, 733132ab3d1bSJohannes Thumshirn * exact stripe count check comes afterwards 7332e3540eabSDavid Sterba */ 7333e3540eabSDavid Sterba len = btrfs_chunk_item_size(1); 7334e3540eabSDavid Sterba if (cur_offset + len > array_size) 7335e3540eabSDavid Sterba goto 
out_short_read; 7336e3540eabSDavid Sterba 7337e3540eabSDavid Sterba num_stripes = btrfs_chunk_num_stripes(sb, chunk); 7338f5cdedd7SDavid Sterba if (!num_stripes) { 7339ab8d0fc4SJeff Mahoney btrfs_err(fs_info, 7340ab8d0fc4SJeff Mahoney "invalid number of stripes %u in sys_array at offset %u", 7341f5cdedd7SDavid Sterba num_stripes, cur_offset); 7342f5cdedd7SDavid Sterba ret = -EIO; 7343f5cdedd7SDavid Sterba break; 7344f5cdedd7SDavid Sterba } 7345f5cdedd7SDavid Sterba 7346e06cd3ddSLiu Bo type = btrfs_chunk_type(sb, chunk); 7347e06cd3ddSLiu Bo if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 7348ab8d0fc4SJeff Mahoney btrfs_err(fs_info, 7349e06cd3ddSLiu Bo "invalid chunk type %llu in sys_array at offset %u", 7350e06cd3ddSLiu Bo type, cur_offset); 7351e06cd3ddSLiu Bo ret = -EIO; 7352e06cd3ddSLiu Bo break; 7353e06cd3ddSLiu Bo } 7354e06cd3ddSLiu Bo 7355e3540eabSDavid Sterba len = btrfs_chunk_item_size(num_stripes); 7356e3540eabSDavid Sterba if (cur_offset + len > array_size) 7357e3540eabSDavid Sterba goto out_short_read; 7358e3540eabSDavid Sterba 73599690ac09SDavid Sterba ret = read_one_chunk(&key, sb, chunk); 736084eed90fSChris Mason if (ret) 736184eed90fSChris Mason break; 736232ab3d1bSJohannes Thumshirn 73631ffb22cfSDavid Sterba array_ptr += len; 73641ffb22cfSDavid Sterba sb_array_offset += len; 73651ffb22cfSDavid Sterba cur_offset += len; 73660b86a832SChris Mason } 7367d865177aSLiu Bo clear_extent_buffer_uptodate(sb); 73681c8b5b6eSLiu Bo free_extent_buffer_stale(sb); 736984eed90fSChris Mason return ret; 7370e3540eabSDavid Sterba 7371e3540eabSDavid Sterba out_short_read: 7372ab8d0fc4SJeff Mahoney btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 7373e3540eabSDavid Sterba len, cur_offset); 7374d865177aSLiu Bo clear_extent_buffer_uptodate(sb); 73751c8b5b6eSLiu Bo free_extent_buffer_stale(sb); 7376e3540eabSDavid Sterba return -EIO; 73770b86a832SChris Mason } 73780b86a832SChris Mason 737921634a19SQu Wenruo /* 738021634a19SQu Wenruo * Check if all chunks 
in the fs are OK for read-write degraded mount 738121634a19SQu Wenruo * 73826528b99dSAnand Jain * If the @failing_dev is specified, it's accounted as missing. 73836528b99dSAnand Jain * 738421634a19SQu Wenruo * Return true if all chunks meet the minimal RW mount requirements. 738521634a19SQu Wenruo * Return false if any chunk doesn't meet the minimal RW mount requirements. 738621634a19SQu Wenruo */ 73876528b99dSAnand Jain bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 73886528b99dSAnand Jain struct btrfs_device *failing_dev) 738921634a19SQu Wenruo { 7390c8bf1b67SDavid Sterba struct extent_map_tree *map_tree = &fs_info->mapping_tree; 739121634a19SQu Wenruo struct extent_map *em; 739221634a19SQu Wenruo u64 next_start = 0; 739321634a19SQu Wenruo bool ret = true; 739421634a19SQu Wenruo 7395c8bf1b67SDavid Sterba read_lock(&map_tree->lock); 7396c8bf1b67SDavid Sterba em = lookup_extent_mapping(map_tree, 0, (u64)-1); 7397c8bf1b67SDavid Sterba read_unlock(&map_tree->lock); 739821634a19SQu Wenruo /* No chunk at all? 
Return false anyway */ 739921634a19SQu Wenruo if (!em) { 740021634a19SQu Wenruo ret = false; 740121634a19SQu Wenruo goto out; 740221634a19SQu Wenruo } 740321634a19SQu Wenruo while (em) { 740421634a19SQu Wenruo struct map_lookup *map; 740521634a19SQu Wenruo int missing = 0; 740621634a19SQu Wenruo int max_tolerated; 740721634a19SQu Wenruo int i; 740821634a19SQu Wenruo 740921634a19SQu Wenruo map = em->map_lookup; 741021634a19SQu Wenruo max_tolerated = 741121634a19SQu Wenruo btrfs_get_num_tolerated_disk_barrier_failures( 741221634a19SQu Wenruo map->type); 741321634a19SQu Wenruo for (i = 0; i < map->num_stripes; i++) { 741421634a19SQu Wenruo struct btrfs_device *dev = map->stripes[i].dev; 741521634a19SQu Wenruo 7416e6e674bdSAnand Jain if (!dev || !dev->bdev || 7417e6e674bdSAnand Jain test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 741821634a19SQu Wenruo dev->last_flush_error) 741921634a19SQu Wenruo missing++; 74206528b99dSAnand Jain else if (failing_dev && failing_dev == dev) 74216528b99dSAnand Jain missing++; 742221634a19SQu Wenruo } 742321634a19SQu Wenruo if (missing > max_tolerated) { 74246528b99dSAnand Jain if (!failing_dev) 742521634a19SQu Wenruo btrfs_warn(fs_info, 742652042d8eSAndrea Gelmini "chunk %llu missing %d devices, max tolerance is %d for writable mount", 742721634a19SQu Wenruo em->start, missing, max_tolerated); 742821634a19SQu Wenruo free_extent_map(em); 742921634a19SQu Wenruo ret = false; 743021634a19SQu Wenruo goto out; 743121634a19SQu Wenruo } 743221634a19SQu Wenruo next_start = extent_map_end(em); 743321634a19SQu Wenruo free_extent_map(em); 743421634a19SQu Wenruo 7435c8bf1b67SDavid Sterba read_lock(&map_tree->lock); 7436c8bf1b67SDavid Sterba em = lookup_extent_mapping(map_tree, next_start, 743721634a19SQu Wenruo (u64)(-1) - next_start); 7438c8bf1b67SDavid Sterba read_unlock(&map_tree->lock); 743921634a19SQu Wenruo } 744021634a19SQu Wenruo out: 744121634a19SQu Wenruo return ret; 744221634a19SQu Wenruo } 744321634a19SQu Wenruo 7444d85327b1SDavid 
Sterba static void readahead_tree_node_children(struct extent_buffer *node) 7445d85327b1SDavid Sterba { 7446d85327b1SDavid Sterba int i; 7447d85327b1SDavid Sterba const int nr_items = btrfs_header_nritems(node); 7448d85327b1SDavid Sterba 7449bfb484d9SJosef Bacik for (i = 0; i < nr_items; i++) 7450bfb484d9SJosef Bacik btrfs_readahead_node_child(node, i); 7451d85327b1SDavid Sterba } 7452d85327b1SDavid Sterba 74535b4aacefSJeff Mahoney int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 74540b86a832SChris Mason { 74555b4aacefSJeff Mahoney struct btrfs_root *root = fs_info->chunk_root; 74560b86a832SChris Mason struct btrfs_path *path; 74570b86a832SChris Mason struct extent_buffer *leaf; 74580b86a832SChris Mason struct btrfs_key key; 74590b86a832SChris Mason struct btrfs_key found_key; 74600b86a832SChris Mason int ret; 74610b86a832SChris Mason int slot; 746299e3ecfcSLiu Bo u64 total_dev = 0; 7463d85327b1SDavid Sterba u64 last_ra_node = 0; 74640b86a832SChris Mason 74650b86a832SChris Mason path = btrfs_alloc_path(); 74660b86a832SChris Mason if (!path) 74670b86a832SChris Mason return -ENOMEM; 74680b86a832SChris Mason 74693dd0f7a3SAnand Jain /* 74703dd0f7a3SAnand Jain * uuid_mutex is needed only if we are mounting a sprout FS 74713dd0f7a3SAnand Jain * otherwise we don't need it. 74723dd0f7a3SAnand Jain */ 7473b367e47fSLi Zefan mutex_lock(&uuid_mutex); 7474b367e47fSLi Zefan 7475395927a9SFilipe David Borba Manana /* 747648cfa61bSBoris Burkov * It is possible for mount and umount to race in such a way that 747748cfa61bSBoris Burkov * we execute this code path, but open_fs_devices failed to clear 747848cfa61bSBoris Burkov * total_rw_bytes. We certainly want it cleared before reading the 747948cfa61bSBoris Burkov * device items, so clear it here. 
748048cfa61bSBoris Burkov */ 748148cfa61bSBoris Burkov fs_info->fs_devices->total_rw_bytes = 0; 748248cfa61bSBoris Burkov 748348cfa61bSBoris Burkov /* 7484395927a9SFilipe David Borba Manana * Read all device items, and then all the chunk items. All 7485395927a9SFilipe David Borba Manana * device items are found before any chunk item (their object id 7486395927a9SFilipe David Borba Manana * is smaller than the lowest possible object id for a chunk 7487395927a9SFilipe David Borba Manana * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 74880b86a832SChris Mason */ 74890b86a832SChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 74900b86a832SChris Mason key.offset = 0; 74910b86a832SChris Mason key.type = 0; 74920b86a832SChris Mason ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7493ab59381eSZhao Lei if (ret < 0) 7494ab59381eSZhao Lei goto error; 74950b86a832SChris Mason while (1) { 7496d85327b1SDavid Sterba struct extent_buffer *node; 7497d85327b1SDavid Sterba 74980b86a832SChris Mason leaf = path->nodes[0]; 74990b86a832SChris Mason slot = path->slots[0]; 75000b86a832SChris Mason if (slot >= btrfs_header_nritems(leaf)) { 75010b86a832SChris Mason ret = btrfs_next_leaf(root, path); 75020b86a832SChris Mason if (ret == 0) 75030b86a832SChris Mason continue; 75040b86a832SChris Mason if (ret < 0) 75050b86a832SChris Mason goto error; 75060b86a832SChris Mason break; 75070b86a832SChris Mason } 7508d85327b1SDavid Sterba /* 7509d85327b1SDavid Sterba * The nodes on level 1 are not locked but we don't need to do 7510d85327b1SDavid Sterba * that during mount time as nothing else can access the tree 7511d85327b1SDavid Sterba */ 7512d85327b1SDavid Sterba node = path->nodes[1]; 7513d85327b1SDavid Sterba if (node) { 7514d85327b1SDavid Sterba if (last_ra_node != node->start) { 7515d85327b1SDavid Sterba readahead_tree_node_children(node); 7516d85327b1SDavid Sterba last_ra_node = node->start; 7517d85327b1SDavid Sterba } 7518d85327b1SDavid Sterba } 75190b86a832SChris Mason 
btrfs_item_key_to_cpu(leaf, &found_key, slot); 75200b86a832SChris Mason if (found_key.type == BTRFS_DEV_ITEM_KEY) { 75210b86a832SChris Mason struct btrfs_dev_item *dev_item; 75220b86a832SChris Mason dev_item = btrfs_item_ptr(leaf, slot, 75230b86a832SChris Mason struct btrfs_dev_item); 752417850759SDavid Sterba ret = read_one_dev(leaf, dev_item); 75252b82032cSYan Zheng if (ret) 75262b82032cSYan Zheng goto error; 752799e3ecfcSLiu Bo total_dev++; 75280b86a832SChris Mason } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 75290b86a832SChris Mason struct btrfs_chunk *chunk; 753079bd3712SFilipe Manana 753179bd3712SFilipe Manana /* 753279bd3712SFilipe Manana * We are only called at mount time, so no need to take 753379bd3712SFilipe Manana * fs_info->chunk_mutex. Plus, to avoid lockdep warnings, 753479bd3712SFilipe Manana * we always lock first fs_info->chunk_mutex before 753579bd3712SFilipe Manana * acquiring any locks on the chunk tree. This is a 753679bd3712SFilipe Manana * requirement for chunk allocation, see the comment on 753779bd3712SFilipe Manana * top of btrfs_chunk_alloc() for details. 753879bd3712SFilipe Manana */ 753979bd3712SFilipe Manana ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); 75400b86a832SChris Mason chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 75419690ac09SDavid Sterba ret = read_one_chunk(&found_key, leaf, chunk); 75422b82032cSYan Zheng if (ret) 75432b82032cSYan Zheng goto error; 75440b86a832SChris Mason } 75450b86a832SChris Mason path->slots[0]++; 75460b86a832SChris Mason } 754799e3ecfcSLiu Bo 754899e3ecfcSLiu Bo /* 754999e3ecfcSLiu Bo * After loading chunk tree, we've got all device information, 755099e3ecfcSLiu Bo * do another round of validation checks. 
755199e3ecfcSLiu Bo */ 75520b246afaSJeff Mahoney if (total_dev != fs_info->fs_devices->total_devices) { 75530b246afaSJeff Mahoney btrfs_err(fs_info, 755499e3ecfcSLiu Bo "super_num_devices %llu mismatch with num_devices %llu found here", 75550b246afaSJeff Mahoney btrfs_super_num_devices(fs_info->super_copy), 755699e3ecfcSLiu Bo total_dev); 755799e3ecfcSLiu Bo ret = -EINVAL; 755899e3ecfcSLiu Bo goto error; 755999e3ecfcSLiu Bo } 75600b246afaSJeff Mahoney if (btrfs_super_total_bytes(fs_info->super_copy) < 75610b246afaSJeff Mahoney fs_info->fs_devices->total_rw_bytes) { 75620b246afaSJeff Mahoney btrfs_err(fs_info, 756399e3ecfcSLiu Bo "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", 75640b246afaSJeff Mahoney btrfs_super_total_bytes(fs_info->super_copy), 75650b246afaSJeff Mahoney fs_info->fs_devices->total_rw_bytes); 756699e3ecfcSLiu Bo ret = -EINVAL; 756799e3ecfcSLiu Bo goto error; 756899e3ecfcSLiu Bo } 75690b86a832SChris Mason ret = 0; 75700b86a832SChris Mason error: 7571b367e47fSLi Zefan mutex_unlock(&uuid_mutex); 7572b367e47fSLi Zefan 75732b82032cSYan Zheng btrfs_free_path(path); 75740b86a832SChris Mason return ret; 75750b86a832SChris Mason } 7576442a4f63SStefan Behrens 7577cb517eabSMiao Xie void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 7578cb517eabSMiao Xie { 7579944d3f9fSNikolay Borisov struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 7580cb517eabSMiao Xie struct btrfs_device *device; 7581cb517eabSMiao Xie 7582944d3f9fSNikolay Borisov fs_devices->fs_info = fs_info; 7583944d3f9fSNikolay Borisov 7584cb517eabSMiao Xie mutex_lock(&fs_devices->device_list_mutex); 7585cb517eabSMiao Xie list_for_each_entry(device, &fs_devices->devices, dev_list) 7586fb456252SJeff Mahoney device->fs_info = fs_info; 758729cc83f6SLiu Bo 7588944d3f9fSNikolay Borisov list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 7589944d3f9fSNikolay Borisov list_for_each_entry(device, &seed_devs->devices, dev_list) 
7590944d3f9fSNikolay Borisov device->fs_info = fs_info; 7591944d3f9fSNikolay Borisov 7592944d3f9fSNikolay Borisov seed_devs->fs_info = fs_info; 759329cc83f6SLiu Bo } 7594e17125b5SAnand Jain mutex_unlock(&fs_devices->device_list_mutex); 7595cb517eabSMiao Xie } 7596cb517eabSMiao Xie 75971dc990dfSDavid Sterba static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, 75981dc990dfSDavid Sterba const struct btrfs_dev_stats_item *ptr, 75991dc990dfSDavid Sterba int index) 76001dc990dfSDavid Sterba { 76011dc990dfSDavid Sterba u64 val; 76021dc990dfSDavid Sterba 76031dc990dfSDavid Sterba read_extent_buffer(eb, &val, 76041dc990dfSDavid Sterba offsetof(struct btrfs_dev_stats_item, values) + 76051dc990dfSDavid Sterba ((unsigned long)ptr) + (index * sizeof(u64)), 76061dc990dfSDavid Sterba sizeof(val)); 76071dc990dfSDavid Sterba return val; 76081dc990dfSDavid Sterba } 76091dc990dfSDavid Sterba 76101dc990dfSDavid Sterba static void btrfs_set_dev_stats_value(struct extent_buffer *eb, 76111dc990dfSDavid Sterba struct btrfs_dev_stats_item *ptr, 76121dc990dfSDavid Sterba int index, u64 val) 76131dc990dfSDavid Sterba { 76141dc990dfSDavid Sterba write_extent_buffer(eb, &val, 76151dc990dfSDavid Sterba offsetof(struct btrfs_dev_stats_item, values) + 76161dc990dfSDavid Sterba ((unsigned long)ptr) + (index * sizeof(u64)), 76171dc990dfSDavid Sterba sizeof(val)); 76181dc990dfSDavid Sterba } 76191dc990dfSDavid Sterba 762092e26df4SJosef Bacik static int btrfs_device_init_dev_stats(struct btrfs_device *device, 7621124604ebSJosef Bacik struct btrfs_path *path) 7622733f4fbbSStefan Behrens { 7623733f4fbbSStefan Behrens struct btrfs_dev_stats_item *ptr; 7624124604ebSJosef Bacik struct extent_buffer *eb; 7625124604ebSJosef Bacik struct btrfs_key key; 7626124604ebSJosef Bacik int item_size; 7627124604ebSJosef Bacik int i, ret, slot; 7628733f4fbbSStefan Behrens 762982d62d06SJosef Bacik if (!device->fs_info->dev_root) 763082d62d06SJosef Bacik return 0; 763182d62d06SJosef Bacik 7632242e2956SDavid 
Sterba key.objectid = BTRFS_DEV_STATS_OBJECTID; 7633242e2956SDavid Sterba key.type = BTRFS_PERSISTENT_ITEM_KEY; 7634733f4fbbSStefan Behrens key.offset = device->devid; 7635124604ebSJosef Bacik ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); 7636733f4fbbSStefan Behrens if (ret) { 7637ae4b9b4cSAnand Jain for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7638ae4b9b4cSAnand Jain btrfs_dev_stat_set(device, i, 0); 7639733f4fbbSStefan Behrens device->dev_stats_valid = 1; 7640733f4fbbSStefan Behrens btrfs_release_path(path); 764192e26df4SJosef Bacik return ret < 0 ? ret : 0; 7642733f4fbbSStefan Behrens } 7643733f4fbbSStefan Behrens slot = path->slots[0]; 7644733f4fbbSStefan Behrens eb = path->nodes[0]; 7645733f4fbbSStefan Behrens item_size = btrfs_item_size_nr(eb, slot); 7646733f4fbbSStefan Behrens 7647124604ebSJosef Bacik ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); 7648733f4fbbSStefan Behrens 7649733f4fbbSStefan Behrens for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7650733f4fbbSStefan Behrens if (item_size >= (1 + i) * sizeof(__le64)) 7651733f4fbbSStefan Behrens btrfs_dev_stat_set(device, i, 7652733f4fbbSStefan Behrens btrfs_dev_stats_value(eb, ptr, i)); 7653733f4fbbSStefan Behrens else 76544e411a7dSAnand Jain btrfs_dev_stat_set(device, i, 0); 7655733f4fbbSStefan Behrens } 7656733f4fbbSStefan Behrens 7657733f4fbbSStefan Behrens device->dev_stats_valid = 1; 7658733f4fbbSStefan Behrens btrfs_dev_stat_print_on_load(device); 7659733f4fbbSStefan Behrens btrfs_release_path(path); 766092e26df4SJosef Bacik 766192e26df4SJosef Bacik return 0; 7662733f4fbbSStefan Behrens } 7663124604ebSJosef Bacik 7664124604ebSJosef Bacik int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 7665124604ebSJosef Bacik { 7666124604ebSJosef Bacik struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 7667124604ebSJosef Bacik struct btrfs_device *device; 7668124604ebSJosef Bacik struct btrfs_path *path = NULL; 766992e26df4SJosef Bacik int 
ret = 0; 7670124604ebSJosef Bacik 7671124604ebSJosef Bacik path = btrfs_alloc_path(); 7672124604ebSJosef Bacik if (!path) 7673124604ebSJosef Bacik return -ENOMEM; 7674124604ebSJosef Bacik 7675124604ebSJosef Bacik mutex_lock(&fs_devices->device_list_mutex); 767692e26df4SJosef Bacik list_for_each_entry(device, &fs_devices->devices, dev_list) { 767792e26df4SJosef Bacik ret = btrfs_device_init_dev_stats(device, path); 767892e26df4SJosef Bacik if (ret) 767992e26df4SJosef Bacik goto out; 7680124604ebSJosef Bacik } 768192e26df4SJosef Bacik list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 768292e26df4SJosef Bacik list_for_each_entry(device, &seed_devs->devices, dev_list) { 768392e26df4SJosef Bacik ret = btrfs_device_init_dev_stats(device, path); 768492e26df4SJosef Bacik if (ret) 768592e26df4SJosef Bacik goto out; 768692e26df4SJosef Bacik } 768792e26df4SJosef Bacik } 768892e26df4SJosef Bacik out: 7689733f4fbbSStefan Behrens mutex_unlock(&fs_devices->device_list_mutex); 7690733f4fbbSStefan Behrens 7691733f4fbbSStefan Behrens btrfs_free_path(path); 769292e26df4SJosef Bacik return ret; 7693733f4fbbSStefan Behrens } 7694733f4fbbSStefan Behrens 7695733f4fbbSStefan Behrens static int update_dev_stat_item(struct btrfs_trans_handle *trans, 7696733f4fbbSStefan Behrens struct btrfs_device *device) 7697733f4fbbSStefan Behrens { 76985495f195SNikolay Borisov struct btrfs_fs_info *fs_info = trans->fs_info; 76996bccf3abSJeff Mahoney struct btrfs_root *dev_root = fs_info->dev_root; 7700733f4fbbSStefan Behrens struct btrfs_path *path; 7701733f4fbbSStefan Behrens struct btrfs_key key; 7702733f4fbbSStefan Behrens struct extent_buffer *eb; 7703733f4fbbSStefan Behrens struct btrfs_dev_stats_item *ptr; 7704733f4fbbSStefan Behrens int ret; 7705733f4fbbSStefan Behrens int i; 7706733f4fbbSStefan Behrens 7707242e2956SDavid Sterba key.objectid = BTRFS_DEV_STATS_OBJECTID; 7708242e2956SDavid Sterba key.type = BTRFS_PERSISTENT_ITEM_KEY; 7709733f4fbbSStefan Behrens key.offset = 
device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	/* COW-search the dev tree for this device's dev_stats item (ins_len = -1, cow = 1). */
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			      ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
		/* Fall through to the insert path below, as if the item was not found. */
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	/* Path now points at an item of the right size; write out all counters. */
	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * called from commit_transaction. Writes all changed device stats to disk.
 *
 * Returns 0 on success, or the last error from update_dev_stat_item().
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;	/* snapshot of dev_stats_ccnt taken before writing */
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		/* Skip devices with no loaded stats or no changes since last commit. */
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;


		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values which requires
		 * reading the in-memory counters. Such control dependencies
		 * require explicit read memory barriers.
		 *
		 * This memory barrier pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stats_read_and_reset
		 */
		smp_rmb();

		ret = update_dev_stat_item(trans, device);
		if (!ret)
			/*
			 * Subtract only the snapshot count: increments that
			 * raced in after the snapshot stay pending for the
			 * next commit.
			 */
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

/* Bump one error counter on @dev and log the per-device error summary. */
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

/*
 * Log the current error counters of @dev.  Uses the _rl (rate-limited)
 * variant since this runs on every recorded error.
 */
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	/* Stats were never loaded from disk for this device; nothing sane to print. */
	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

/* Print the stats summary once at load time, unless every counter is zero. */
static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
	       rcu_str_deref(dev->name),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

/*
 * Ioctl backend: copy up to stats->nr_items counters for the device given by
 * stats->devid into stats->values.  With BTRFS_DEV_STATS_RESET set, counters
 * are read-and-reset atomically (counters not copied out are just zeroed).
 * Caps stats->nr_items to BTRFS_DEV_STAT_VALUES_MAX on return.
 *
 * Returns 0 on success, -ENODEV if the device is unknown or its stats were
 * never loaded.
 */
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_set(dev, i, 0);
		}
		/* Leave an audit trail of who zeroed the stats. */
		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
			   current->comm, task_pid_nr(current));
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}

/*
 * Update the size and bytes used for each device where it changed. This is
 * delayed since we would otherwise get errors while writing out the
 * superblocks.
 *
 * Must be invoked during transaction commit.
 */
void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
	struct btrfs_device *curr, *next;

	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);

	if (list_empty(&trans->dev_update_list))
		return;

	/*
	 * We don't need the device_list_mutex here. This list is owned by the
	 * transaction and the transaction must complete before the device is
	 * released.
	 */
	mutex_lock(&trans->fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
				 post_commit_list) {
		list_del_init(&curr->post_commit_list);
		/* Promote the in-memory sizes to the committed values. */
		curr->commit_total_bytes = curr->disk_total_bytes;
		curr->commit_bytes_used = curr->bytes_used;
	}
	mutex_unlock(&trans->fs_info->chunk_mutex);
}

/*
 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
 */
int btrfs_bg_type_to_factor(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	return btrfs_raid_array[index].ncopies;
}



/*
 * Cross-check one dev extent item (devid/physical_offset/physical_len) against
 * the in-memory chunk mapping at @chunk_offset: the chunk must exist, the
 * extent length must match the per-device stripe length, a matching stripe
 * must exist, the extent must fit inside the device, and on zoned devices it
 * must be zone aligned.  Increments map->verified_stripes for the match.
 *
 * Returns 0 if consistent, -EUCLEAN on any mismatch.
 */
static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
				 u64 chunk_offset, u64 devid,
				 u64 physical_offset, u64 physical_len)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	u64 stripe_len;
	bool found = false;
	int ret = 0;
	int i;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
		goto out;
	}

	map = em->map_lookup;
	stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
	if (physical_len != stripe_len) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
			  physical_offset, devid, em->start, physical_len,
			  stripe_len);
		ret = -EUCLEAN;
		goto out;
	}

	/* Find the stripe of this chunk that lives at exactly this dev extent. */
	for (i = 0; i < map->num_stripes; i++) {
		if (map->stripes[i].dev->devid == devid &&
		    map->stripes[i].physical == physical_offset) {
			found = true;
			if (map->verified_stripes >= map->num_stripes) {
				btrfs_err(fs_info,
				"too many dev extents for chunk %llu found",
					  em->start);
				ret = -EUCLEAN;
				goto out;
			}
			map->verified_stripes++;
			break;
		}
	}
	if (!found) {
		btrfs_err(fs_info,
	"dev extent physical offset %llu devid %llu has no corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
	}

	/* Make sure no dev extent is beyond device boundary */
	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
	if (!dev) {
		btrfs_err(fs_info, "failed to find devid %llu", devid);
		ret = -EUCLEAN;
		goto out;
	}

	if (physical_offset + physical_len > dev->disk_total_bytes) {
		btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
			  devid, physical_offset, physical_len,
			  dev->disk_total_bytes);
		ret = -EUCLEAN;
		goto out;
	}

	if (dev->zone_info) {
		u64 zone_size = dev->zone_info->zone_size;

		/* Zoned devices only allow zone-granular dev extents. */
		if (!IS_ALIGNED(physical_offset, zone_size) ||
		    !IS_ALIGNED(physical_len, zone_size)) {
			btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
				  devid, physical_offset, physical_len);
			ret = -EUCLEAN;
			goto out;
		}
	}

out:
	/* Drops the lookup reference; free_extent_map(NULL) is a no-op. */
	free_extent_map(em);
	return ret;
}

/*
 * Walk every chunk mapping and verify that verify_one_dev_extent() accounted
 * for all of its stripes, i.e. no chunk is missing a dev extent.
 *
 * Returns 0 if all chunks are fully covered, -EUCLEAN otherwise.
 */
static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct rb_node *node;
	int ret = 0;

	read_lock(&em_tree->lock);
	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
		em = rb_entry(node, struct extent_map, rb_node);
		if (em->map_lookup->num_stripes !=
		    em->map_lookup->verified_stripes) {
			btrfs_err(fs_info,
			"chunk %llu has missing dev extent, have %d expect %d",
				  em->start, em->map_lookup->verified_stripes,
				  em->map_lookup->num_stripes);
			ret = -EUCLEAN;
			goto out;
		}
	}
out:
	read_unlock(&em_tree->lock);
	return ret;
}

/*
 * Ensure that all dev extents are mapped to correct chunk, otherwise
 * later chunk allocation/free would cause unexpected behavior.
 *
 * NOTE: This will iterate through the whole device tree, which should be of
 * the same size level as the chunk tree.  This slightly increases mount time.
 */
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	u64 prev_devid = 0;
	u64 prev_dev_ext_end = 0;	/* end of previous extent, for overlap check */
	int ret = 0;

	/*
	 * We don't have a dev_root because we mounted with ignorebadroots and
	 * failed to load the root, so we want to skip the verification in this
	 * case for sure.
	 *
	 * However if the dev root is fine, but the tree itself is corrupted
	 * we'd still fail to mount. This verification is only to make sure
	 * writes can happen safely, so instead just bypass this check
	 * completely in the case of IGNOREBADROOTS.
	 */
	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
		return 0;

	/* Start at the first possible dev extent: devid 1, offset 0. */
	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	while (1) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_dev_extent *dext;
		int slot = path->slots[0];
		u64 chunk_offset;
		u64 physical_offset;
		u64 physical_len;
		u64 devid;

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type != BTRFS_DEV_EXTENT_KEY)
			break;
		devid = key.objectid;
		physical_offset = key.offset;

		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
		physical_len = btrfs_dev_extent_length(leaf, dext);

		/* Check if this dev extent overlaps with the previous one */
		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
			btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
				  devid, physical_offset, prev_dev_ext_end);
			ret = -EUCLEAN;
			goto out;
		}

		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
					    physical_offset, physical_len);
		if (ret < 0)
			goto out;
		prev_devid = devid;
		prev_dev_ext_end = physical_offset + physical_len;

		ret = btrfs_next_item(root, path);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = 0;
			break;
		}
	}

	/* Ensure all chunks have corresponding dev extents */
	ret = verify_chunk_dev_extent_mapping(fs_info);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Check whether the given block group or device is pinned by any inode being
 * used as a swapfile.
 */
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
	struct btrfs_swapfile_pin *sp;
	struct rb_node *node;

	/* Binary search of the rbtree keyed by the raw pointer value. */
	spin_lock(&fs_info->swapfile_pins_lock);
	node = fs_info->swapfile_pins.rb_node;
	while (node) {
		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (ptr < sp->ptr)
			node = node->rb_left;
		else if (ptr > sp->ptr)
			node = node->rb_right;
		else
			break;	/* exact match: ptr is pinned */
	}
	spin_unlock(&fs_info->swapfile_pins_lock);
	return node != NULL;
}

/*
 * Worker for btrfs_repair_one_zone(): relocates the block group whose start
 * was captured in @data to repair an I/O failure.  Takes over the block group
 * reference held by the caller and drops it before re-looking it up under
 * reclaim_bgs_lock.
 */
static int relocating_repair_kthread(void *data)
{
	struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 target;
	int ret = 0;

	/* Remember the start offset, then drop the caller's reference. */
	target = cache->start;
	btrfs_put_block_group(cache);

	/* Relocation is exclusive with balance and other exclusive ops. */
	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		btrfs_info(fs_info,
			   "zoned: skip relocating block group %llu to repair: EBUSY",
			   target);
		return -EBUSY;
	}

	mutex_lock(&fs_info->reclaim_bgs_lock);

	/* Ensure block group still exists */
	cache = btrfs_lookup_block_group(fs_info, target);
	if (!cache)
		goto out;

	/* Someone cleared the flag; the repair was cancelled or already done. */
	if (!cache->relocating_repair)
		goto out;

	ret = btrfs_may_alloc_data_chunk(fs_info, target);
	if (ret < 0)
		goto out;

	btrfs_info(fs_info,
		   "zoned: relocating block group %llu to repair IO failure",
		   target);
	ret = btrfs_relocate_chunk(fs_info, target);

out:
	if (cache)
		btrfs_put_block_group(cache);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_exclop_finish(fs_info);

	return ret;
}

/*
 * Kick off asynchronous relocation of the block group containing @logical to
 * repair an I/O failure (zoned mode).  The kthread inherits the block group
 * reference taken here.  Always returns 0; the repair itself is best-effort.
 */
int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
{
	struct btrfs_block_group *cache;

	/* Do not attempt to repair in degraded state */
	if (btrfs_test_opt(fs_info, DEGRADED))
		return 0;

	cache = btrfs_lookup_block_group(fs_info, logical);
	if (!cache)
		return 0;

	spin_lock(&cache->lock);
	if (cache->relocating_repair) {
		/* A repair of this block group is already in flight. */
		spin_unlock(&cache->lock);
		btrfs_put_block_group(cache);
		return 0;
	}
	cache->relocating_repair = 1;
	spin_unlock(&cache->lock);

	/* Reference on @cache is transferred to the kthread. */
	kthread_run(relocating_repair_kthread, cache,
		    "btrfs-relocating-repair");

	return 0;
}