xref: /openbmc/linux/fs/btrfs/volumes.c (revision 8b41393fe7c3b180abadc26856fb653014733bb9)
1c1d7c514SDavid Sterba // SPDX-License-Identifier: GPL-2.0
20b86a832SChris Mason /*
30b86a832SChris Mason  * Copyright (C) 2007 Oracle.  All rights reserved.
40b86a832SChris Mason  */
5c1d7c514SDavid Sterba 
60b86a832SChris Mason #include <linux/sched.h>
7fccc0007SJosef Bacik #include <linux/sched/mm.h>
80b86a832SChris Mason #include <linux/bio.h>
95a0e3ad6STejun Heo #include <linux/slab.h>
10f2d8d74dSChris Mason #include <linux/blkdev.h>
11442a4f63SStefan Behrens #include <linux/ratelimit.h>
1259641015SIlya Dryomov #include <linux/kthread.h>
1353b381b3SDavid Woodhouse #include <linux/raid/pq.h>
14803b2f54SStefan Behrens #include <linux/semaphore.h>
158da4b8c4SAndy Shevchenko #include <linux/uuid.h>
16f8e10cd3SAnand Jain #include <linux/list_sort.h>
17784352feSDavid Sterba #include "misc.h"
180b86a832SChris Mason #include "ctree.h"
190b86a832SChris Mason #include "extent_map.h"
200b86a832SChris Mason #include "disk-io.h"
210b86a832SChris Mason #include "transaction.h"
220b86a832SChris Mason #include "print-tree.h"
230b86a832SChris Mason #include "volumes.h"
2453b381b3SDavid Woodhouse #include "raid56.h"
258b712842SChris Mason #include "async-thread.h"
2621adbd5cSStefan Behrens #include "check-integrity.h"
27606686eeSJosef Bacik #include "rcu-string.h"
288dabb742SStefan Behrens #include "dev-replace.h"
2999994cdeSAnand Jain #include "sysfs.h"
3082fc28fbSQu Wenruo #include "tree-checker.h"
318719aaaeSJosef Bacik #include "space-info.h"
32aac0023cSJosef Bacik #include "block-group.h"
33b0643e59SDennis Zhou #include "discard.h"
345b316468SNaohiro Aota #include "zoned.h"
350b86a832SChris Mason 
/*
 * Table of properties and constraints for each RAID profile, indexed by
 * enum btrfs_raid_types.  Consumers look profiles up either by index
 * (btrfs_bg_flags_to_raid_index()) or by scanning for .bg_flag.
 *
 * Field meanings as used by the entries below:
 *   sub_stripes:        stripe sub-division (2 only for RAID10)
 *   dev_stripes:        stripes written per device (2 only for DUP)
 *   devs_max:           max devices used by a chunk, 0 == no limit
 *   devs_min:           min devices required to allocate a chunk
 *   tolerated_failures: devices that may be lost without losing data
 *   devs_increment:     device count granularity for the profile
 *   ncopies:            how many times the data is replicated
 *   nparity:            parity stripes (RAID5/6 only)
 *   raid_name:          user-visible name, also printed in messages
 *   bg_flag:            matching BTRFS_BLOCK_GROUP_* bit, 0 for SINGLE
 *   mindev_error:       error reported when devs_min is not met, 0 if
 *                       the profile can always be satisfied
 */
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "raid10",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "raid1",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 3,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 3,
		.ncopies	= 3,
		.nparity        = 0,
		.raid_name	= "raid1c3",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 4,
		.devs_min	= 4,
		.tolerated_failures = 3,
		.devs_increment	= 4,
		.ncopies	= 4,
		.nparity        = 0,
		.raid_name	= "raid1c4",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,	/* both copies live on one device */
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "dup",
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 0,
		.raid_name	= "raid0",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 0,
		.raid_name	= "single",
		.bg_flag	= 0,	/* SINGLE has no dedicated bg flag bit */
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 1,
		.raid_name	= "raid5",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 2,
		.raid_name	= "raid6",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};
155af902047SZhao Lei 
156500a44c9SDavid Sterba /*
157500a44c9SDavid Sterba  * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
158500a44c9SDavid Sterba  * can be used as index to access btrfs_raid_array[].
159500a44c9SDavid Sterba  */
160500a44c9SDavid Sterba enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
161500a44c9SDavid Sterba {
162500a44c9SDavid Sterba 	if (flags & BTRFS_BLOCK_GROUP_RAID10)
163500a44c9SDavid Sterba 		return BTRFS_RAID_RAID10;
164500a44c9SDavid Sterba 	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
165500a44c9SDavid Sterba 		return BTRFS_RAID_RAID1;
166500a44c9SDavid Sterba 	else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
167500a44c9SDavid Sterba 		return BTRFS_RAID_RAID1C3;
168500a44c9SDavid Sterba 	else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
169500a44c9SDavid Sterba 		return BTRFS_RAID_RAID1C4;
170500a44c9SDavid Sterba 	else if (flags & BTRFS_BLOCK_GROUP_DUP)
171500a44c9SDavid Sterba 		return BTRFS_RAID_DUP;
172500a44c9SDavid Sterba 	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
173500a44c9SDavid Sterba 		return BTRFS_RAID_RAID0;
174500a44c9SDavid Sterba 	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
175500a44c9SDavid Sterba 		return BTRFS_RAID_RAID5;
176500a44c9SDavid Sterba 	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
177500a44c9SDavid Sterba 		return BTRFS_RAID_RAID6;
178500a44c9SDavid Sterba 
179500a44c9SDavid Sterba 	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
180500a44c9SDavid Sterba }
181500a44c9SDavid Sterba 
182158da513SDavid Sterba const char *btrfs_bg_type_to_raid_name(u64 flags)
183ed23467bSAnand Jain {
184158da513SDavid Sterba 	const int index = btrfs_bg_flags_to_raid_index(flags);
185158da513SDavid Sterba 
186158da513SDavid Sterba 	if (index >= BTRFS_NR_RAID_TYPES)
187ed23467bSAnand Jain 		return NULL;
188ed23467bSAnand Jain 
189158da513SDavid Sterba 	return btrfs_raid_array[index].raid_name;
190ed23467bSAnand Jain }
191ed23467bSAnand Jain 
192f89e09cfSAnand Jain /*
193f89e09cfSAnand Jain  * Fill @buf with textual description of @bg_flags, no more than @size_buf
194f89e09cfSAnand Jain  * bytes including terminating null byte.
195f89e09cfSAnand Jain  */
196f89e09cfSAnand Jain void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
197f89e09cfSAnand Jain {
198f89e09cfSAnand Jain 	int i;
199f89e09cfSAnand Jain 	int ret;
200f89e09cfSAnand Jain 	char *bp = buf;
201f89e09cfSAnand Jain 	u64 flags = bg_flags;
202f89e09cfSAnand Jain 	u32 size_bp = size_buf;
203f89e09cfSAnand Jain 
204f89e09cfSAnand Jain 	if (!flags) {
205f89e09cfSAnand Jain 		strcpy(bp, "NONE");
206f89e09cfSAnand Jain 		return;
207f89e09cfSAnand Jain 	}
208f89e09cfSAnand Jain 
209f89e09cfSAnand Jain #define DESCRIBE_FLAG(flag, desc)						\
210f89e09cfSAnand Jain 	do {								\
211f89e09cfSAnand Jain 		if (flags & (flag)) {					\
212f89e09cfSAnand Jain 			ret = snprintf(bp, size_bp, "%s|", (desc));	\
213f89e09cfSAnand Jain 			if (ret < 0 || ret >= size_bp)			\
214f89e09cfSAnand Jain 				goto out_overflow;			\
215f89e09cfSAnand Jain 			size_bp -= ret;					\
216f89e09cfSAnand Jain 			bp += ret;					\
217f89e09cfSAnand Jain 			flags &= ~(flag);				\
218f89e09cfSAnand Jain 		}							\
219f89e09cfSAnand Jain 	} while (0)
220f89e09cfSAnand Jain 
221f89e09cfSAnand Jain 	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
222f89e09cfSAnand Jain 	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
223f89e09cfSAnand Jain 	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
224f89e09cfSAnand Jain 
225f89e09cfSAnand Jain 	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
226f89e09cfSAnand Jain 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
227f89e09cfSAnand Jain 		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
228f89e09cfSAnand Jain 			      btrfs_raid_array[i].raid_name);
229f89e09cfSAnand Jain #undef DESCRIBE_FLAG
230f89e09cfSAnand Jain 
231f89e09cfSAnand Jain 	if (flags) {
232f89e09cfSAnand Jain 		ret = snprintf(bp, size_bp, "0x%llx|", flags);
233f89e09cfSAnand Jain 		size_bp -= ret;
234f89e09cfSAnand Jain 	}
235f89e09cfSAnand Jain 
236f89e09cfSAnand Jain 	if (size_bp < size_buf)
237f89e09cfSAnand Jain 		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
238f89e09cfSAnand Jain 
239f89e09cfSAnand Jain 	/*
240f89e09cfSAnand Jain 	 * The text is trimmed, it's up to the caller to provide sufficiently
241f89e09cfSAnand Jain 	 * large buffer
242f89e09cfSAnand Jain 	 */
243f89e09cfSAnand Jain out_overflow:;
244f89e09cfSAnand Jain }
245f89e09cfSAnand Jain 
2466f8e0fc7SDavid Sterba static int init_first_rw_device(struct btrfs_trans_handle *trans);
2472ff7e61eSJeff Mahoney static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
24848a3b636SEric Sandeen static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
249733f4fbbSStefan Behrens static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
2505ab56090SLiu Bo static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
2515ab56090SLiu Bo 			     enum btrfs_map_op op,
2525ab56090SLiu Bo 			     u64 logical, u64 *length,
2534c664611SQu Wenruo 			     struct btrfs_io_context **bioc_ret,
2545ab56090SLiu Bo 			     int mirror_num, int need_raid_map);
2552b82032cSYan Zheng 
2569c6b1c4dSDavid Sterba /*
2579c6b1c4dSDavid Sterba  * Device locking
2589c6b1c4dSDavid Sterba  * ==============
2599c6b1c4dSDavid Sterba  *
2609c6b1c4dSDavid Sterba  * There are several mutexes that protect manipulation of devices and low-level
2619c6b1c4dSDavid Sterba  * structures like chunks but not block groups, extents or files
2629c6b1c4dSDavid Sterba  *
2639c6b1c4dSDavid Sterba  * uuid_mutex (global lock)
2649c6b1c4dSDavid Sterba  * ------------------------
2659c6b1c4dSDavid Sterba  * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
2669c6b1c4dSDavid Sterba  * the SCAN_DEV ioctl registration or from mount either implicitly (the first
2679c6b1c4dSDavid Sterba  * device) or requested by the device= mount option
2689c6b1c4dSDavid Sterba  *
2699c6b1c4dSDavid Sterba  * the mutex can be very coarse and can cover long-running operations
2709c6b1c4dSDavid Sterba  *
2719c6b1c4dSDavid Sterba  * protects: updates to fs_devices counters like missing devices, rw devices,
27252042d8eSAndrea Gelmini  * seeding, structure cloning, opening/closing devices at mount/umount time
2739c6b1c4dSDavid Sterba  *
2749c6b1c4dSDavid Sterba  * global::fs_devs - add, remove, updates to the global list
2759c6b1c4dSDavid Sterba  *
27618c850fdSJosef Bacik  * does not protect: manipulation of the fs_devices::devices list in general
27718c850fdSJosef Bacik  * but in mount context it could be used to exclude list modifications by eg.
27818c850fdSJosef Bacik  * scan ioctl
2799c6b1c4dSDavid Sterba  *
2809c6b1c4dSDavid Sterba  * btrfs_device::name - renames (write side), read is RCU
2819c6b1c4dSDavid Sterba  *
2829c6b1c4dSDavid Sterba  * fs_devices::device_list_mutex (per-fs, with RCU)
2839c6b1c4dSDavid Sterba  * ------------------------------------------------
2849c6b1c4dSDavid Sterba  * protects updates to fs_devices::devices, ie. adding and deleting
2859c6b1c4dSDavid Sterba  *
2869c6b1c4dSDavid Sterba  * simple list traversal with read-only actions can be done with RCU protection
2879c6b1c4dSDavid Sterba  *
2889c6b1c4dSDavid Sterba  * may be used to exclude some operations from running concurrently without any
2899c6b1c4dSDavid Sterba  * modifications to the list (see write_all_supers)
2909c6b1c4dSDavid Sterba  *
29118c850fdSJosef Bacik  * Is not required at mount and close times, because our device list is
29218c850fdSJosef Bacik  * protected by the uuid_mutex at that point.
29318c850fdSJosef Bacik  *
2949c6b1c4dSDavid Sterba  * balance_mutex
2959c6b1c4dSDavid Sterba  * -------------
2969c6b1c4dSDavid Sterba  * protects balance structures (status, state) and context accessed from
2979c6b1c4dSDavid Sterba  * several places (internally, ioctl)
2989c6b1c4dSDavid Sterba  *
2999c6b1c4dSDavid Sterba  * chunk_mutex
3009c6b1c4dSDavid Sterba  * -----------
3019c6b1c4dSDavid Sterba  * protects chunks, adding or removing during allocation, trim or when a new
3020b6f5d40SNikolay Borisov  * device is added/removed. Additionally it also protects post_commit_list of
3030b6f5d40SNikolay Borisov  * individual devices, since they can be added to the transaction's
3040b6f5d40SNikolay Borisov  * post_commit_list only with chunk_mutex held.
3059c6b1c4dSDavid Sterba  *
3069c6b1c4dSDavid Sterba  * cleaner_mutex
3079c6b1c4dSDavid Sterba  * -------------
3089c6b1c4dSDavid Sterba  * a big lock that is held by the cleaner thread and prevents running subvolume
3099c6b1c4dSDavid Sterba  * cleaning together with relocation or delayed iputs
3109c6b1c4dSDavid Sterba  *
3119c6b1c4dSDavid Sterba  *
3129c6b1c4dSDavid Sterba  * Lock nesting
3139c6b1c4dSDavid Sterba  * ============
3149c6b1c4dSDavid Sterba  *
3159c6b1c4dSDavid Sterba  * uuid_mutex
3169c6b1c4dSDavid Sterba  *   device_list_mutex
3179c6b1c4dSDavid Sterba  *     chunk_mutex
3189c6b1c4dSDavid Sterba  *   balance_mutex
31989595e80SAnand Jain  *
32089595e80SAnand Jain  *
321c3e1f96cSGoldwyn Rodrigues  * Exclusive operations
322c3e1f96cSGoldwyn Rodrigues  * ====================
32389595e80SAnand Jain  *
32489595e80SAnand Jain  * Maintains the exclusivity of the following operations that apply to the
32589595e80SAnand Jain  * whole filesystem and cannot run in parallel.
32689595e80SAnand Jain  *
32789595e80SAnand Jain  * - Balance (*)
32889595e80SAnand Jain  * - Device add
32989595e80SAnand Jain  * - Device remove
33089595e80SAnand Jain  * - Device replace (*)
33189595e80SAnand Jain  * - Resize
33289595e80SAnand Jain  *
33389595e80SAnand Jain  * The device operations (as above) can be in one of the following states:
33489595e80SAnand Jain  *
33589595e80SAnand Jain  * - Running state
33689595e80SAnand Jain  * - Paused state
33789595e80SAnand Jain  * - Completed state
33889595e80SAnand Jain  *
33989595e80SAnand Jain  * Only device operations marked with (*) can go into the Paused state for the
34089595e80SAnand Jain  * following reasons:
34189595e80SAnand Jain  *
34289595e80SAnand Jain  * - ioctl (only Balance can be Paused through ioctl)
34389595e80SAnand Jain  * - filesystem remounted as read-only
34489595e80SAnand Jain  * - filesystem unmounted and mounted as read-only
34589595e80SAnand Jain  * - system power-cycle and filesystem mounted as read-only
34689595e80SAnand Jain  * - filesystem or device errors leading to forced read-only
34789595e80SAnand Jain  *
348c3e1f96cSGoldwyn Rodrigues  * The status of exclusive operation is set and cleared atomically.
349c3e1f96cSGoldwyn Rodrigues  * During the course of Paused state, fs_info::exclusive_operation remains set.
35089595e80SAnand Jain  * A device operation in Paused or Running state can be canceled or resumed
35189595e80SAnand Jain  * either by ioctl (Balance only) or when remounted as read-write.
352c3e1f96cSGoldwyn Rodrigues  * The exclusive status is cleared when the device operation is canceled or
35389595e80SAnand Jain  * completed.
3549c6b1c4dSDavid Sterba  */
3559c6b1c4dSDavid Sterba 
35667a2c45eSMiao Xie DEFINE_MUTEX(uuid_mutex);
3578a4b83ccSChris Mason static LIST_HEAD(fs_uuids);
/* Return the global list of all scanned fs_devices, protected by uuid_mutex. */
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}
3628a4b83ccSChris Mason 
3632dfeca9bSDavid Sterba /*
3642dfeca9bSDavid Sterba  * alloc_fs_devices - allocate struct btrfs_fs_devices
3657239ff4bSNikolay Borisov  * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
3667239ff4bSNikolay Borisov  * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
3672dfeca9bSDavid Sterba  *
3682dfeca9bSDavid Sterba  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
3692dfeca9bSDavid Sterba  * The returned struct is not linked onto any lists and can be destroyed with
3702dfeca9bSDavid Sterba  * kfree() right away.
3712dfeca9bSDavid Sterba  */
3727239ff4bSNikolay Borisov static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
3737239ff4bSNikolay Borisov 						 const u8 *metadata_fsid)
3742208a378SIlya Dryomov {
3752208a378SIlya Dryomov 	struct btrfs_fs_devices *fs_devs;
3762208a378SIlya Dryomov 
37778f2c9e6SDavid Sterba 	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
3782208a378SIlya Dryomov 	if (!fs_devs)
3792208a378SIlya Dryomov 		return ERR_PTR(-ENOMEM);
3802208a378SIlya Dryomov 
3812208a378SIlya Dryomov 	mutex_init(&fs_devs->device_list_mutex);
3822208a378SIlya Dryomov 
3832208a378SIlya Dryomov 	INIT_LIST_HEAD(&fs_devs->devices);
3842208a378SIlya Dryomov 	INIT_LIST_HEAD(&fs_devs->alloc_list);
385c4babc5eSAnand Jain 	INIT_LIST_HEAD(&fs_devs->fs_list);
386944d3f9fSNikolay Borisov 	INIT_LIST_HEAD(&fs_devs->seed_list);
3872208a378SIlya Dryomov 	if (fsid)
3882208a378SIlya Dryomov 		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
3892208a378SIlya Dryomov 
3907239ff4bSNikolay Borisov 	if (metadata_fsid)
3917239ff4bSNikolay Borisov 		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
3927239ff4bSNikolay Borisov 	else if (fsid)
3937239ff4bSNikolay Borisov 		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
3947239ff4bSNikolay Borisov 
3952208a378SIlya Dryomov 	return fs_devs;
3962208a378SIlya Dryomov }
3972208a378SIlya Dryomov 
/*
 * Release all memory held by @device.  The device must already be unlinked
 * from all fs_devices lists; a non-empty post_commit_list here would mean a
 * transaction still references it, hence the WARN_ON.
 */
void btrfs_free_device(struct btrfs_device *device)
{
	WARN_ON(!list_empty(&device->post_commit_list));
	rcu_string_free(device->name);
	extent_io_tree_release(&device->alloc_state);
	bio_put(device->flush_bio);
	btrfs_destroy_dev_zone_info(device);
	kfree(device);
}
40748dae9cfSDavid Sterba 
408e4404d6eSYan Zheng static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
409e4404d6eSYan Zheng {
410e4404d6eSYan Zheng 	struct btrfs_device *device;
411e4404d6eSYan Zheng 	WARN_ON(fs_devices->opened);
412e4404d6eSYan Zheng 	while (!list_empty(&fs_devices->devices)) {
413e4404d6eSYan Zheng 		device = list_entry(fs_devices->devices.next,
414e4404d6eSYan Zheng 				    struct btrfs_device, dev_list);
415e4404d6eSYan Zheng 		list_del(&device->dev_list);
416a425f9d4SDavid Sterba 		btrfs_free_device(device);
417e4404d6eSYan Zheng 	}
418e4404d6eSYan Zheng 	kfree(fs_devices);
419e4404d6eSYan Zheng }
420e4404d6eSYan Zheng 
421ffc5a379SDavid Sterba void __exit btrfs_cleanup_fs_uuids(void)
4228a4b83ccSChris Mason {
4238a4b83ccSChris Mason 	struct btrfs_fs_devices *fs_devices;
4248a4b83ccSChris Mason 
4252b82032cSYan Zheng 	while (!list_empty(&fs_uuids)) {
4262b82032cSYan Zheng 		fs_devices = list_entry(fs_uuids.next,
427c4babc5eSAnand Jain 					struct btrfs_fs_devices, fs_list);
428c4babc5eSAnand Jain 		list_del(&fs_devices->fs_list);
429e4404d6eSYan Zheng 		free_fs_devices(fs_devices);
4308a4b83ccSChris Mason 	}
4318a4b83ccSChris Mason }
4328a4b83ccSChris Mason 
4337239ff4bSNikolay Borisov static noinline struct btrfs_fs_devices *find_fsid(
4347239ff4bSNikolay Borisov 		const u8 *fsid, const u8 *metadata_fsid)
4358a4b83ccSChris Mason {
4368a4b83ccSChris Mason 	struct btrfs_fs_devices *fs_devices;
4378a4b83ccSChris Mason 
4387239ff4bSNikolay Borisov 	ASSERT(fsid);
4397239ff4bSNikolay Borisov 
440c6730a0eSSu Yue 	/* Handle non-split brain cases */
441c6730a0eSSu Yue 	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
4427a62d0f0SNikolay Borisov 		if (metadata_fsid) {
443c6730a0eSSu Yue 			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
444c6730a0eSSu Yue 			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
445c6730a0eSSu Yue 				      BTRFS_FSID_SIZE) == 0)
446c6730a0eSSu Yue 				return fs_devices;
447c6730a0eSSu Yue 		} else {
448c6730a0eSSu Yue 			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
449c6730a0eSSu Yue 				return fs_devices;
450c6730a0eSSu Yue 		}
451c6730a0eSSu Yue 	}
452c6730a0eSSu Yue 	return NULL;
453c6730a0eSSu Yue }
454c6730a0eSSu Yue 
/*
 * Look up the fs_devices a scanned device with a distinct metadata_uuid
 * belongs to.  Besides an exact match this must also recognize two
 * split-brain situations where an in-progress fsid change
 * (CHANGING_FSID_V2) left the existing fs_devices with ids that differ
 * from what @disk_super carries now.
 */
static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
				struct btrfs_super_block *disk_super)
{

	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by first scanning
	 * a device which didn't have its fsid/metadata_uuid changed
	 * at all and the CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}
	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by a device that
	 * has an outdated pair of fsid/metadata_uuid and
	 * CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(fs_devices->metadata_uuid,
			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}

	/* No split-brain candidate: fall back to an exact fsid lookup */
	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}
4947a62d0f0SNikolay Borisov 
4958a4b83ccSChris Mason 
496beaf8ab3SStefan Behrens static int
497beaf8ab3SStefan Behrens btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
498beaf8ab3SStefan Behrens 		      int flush, struct block_device **bdev,
4998f32380dSJohannes Thumshirn 		      struct btrfs_super_block **disk_super)
500beaf8ab3SStefan Behrens {
501beaf8ab3SStefan Behrens 	int ret;
502beaf8ab3SStefan Behrens 
503beaf8ab3SStefan Behrens 	*bdev = blkdev_get_by_path(device_path, flags, holder);
504beaf8ab3SStefan Behrens 
505beaf8ab3SStefan Behrens 	if (IS_ERR(*bdev)) {
506beaf8ab3SStefan Behrens 		ret = PTR_ERR(*bdev);
507beaf8ab3SStefan Behrens 		goto error;
508beaf8ab3SStefan Behrens 	}
509beaf8ab3SStefan Behrens 
510beaf8ab3SStefan Behrens 	if (flush)
511beaf8ab3SStefan Behrens 		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
5129f6d2510SDavid Sterba 	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
513beaf8ab3SStefan Behrens 	if (ret) {
514beaf8ab3SStefan Behrens 		blkdev_put(*bdev, flags);
515beaf8ab3SStefan Behrens 		goto error;
516beaf8ab3SStefan Behrens 	}
517beaf8ab3SStefan Behrens 	invalidate_bdev(*bdev);
5188f32380dSJohannes Thumshirn 	*disk_super = btrfs_read_dev_super(*bdev);
5198f32380dSJohannes Thumshirn 	if (IS_ERR(*disk_super)) {
5208f32380dSJohannes Thumshirn 		ret = PTR_ERR(*disk_super);
521beaf8ab3SStefan Behrens 		blkdev_put(*bdev, flags);
522beaf8ab3SStefan Behrens 		goto error;
523beaf8ab3SStefan Behrens 	}
524beaf8ab3SStefan Behrens 
525beaf8ab3SStefan Behrens 	return 0;
526beaf8ab3SStefan Behrens 
527beaf8ab3SStefan Behrens error:
528beaf8ab3SStefan Behrens 	*bdev = NULL;
529beaf8ab3SStefan Behrens 	return ret;
530beaf8ab3SStefan Behrens }
531beaf8ab3SStefan Behrens 
53270bc7088SAnand Jain static bool device_path_matched(const char *path, struct btrfs_device *device)
53370bc7088SAnand Jain {
53470bc7088SAnand Jain 	int found;
53570bc7088SAnand Jain 
53670bc7088SAnand Jain 	rcu_read_lock();
53770bc7088SAnand Jain 	found = strcmp(rcu_str_deref(device->name), path);
53870bc7088SAnand Jain 	rcu_read_unlock();
53970bc7088SAnand Jain 
54070bc7088SAnand Jain 	return found == 0;
54170bc7088SAnand Jain }
54270bc7088SAnand Jain 
543d8367db3SAnand Jain /*
544d8367db3SAnand Jain  *  Search and remove all stale (devices which are not mounted) devices.
545d8367db3SAnand Jain  *  When both inputs are NULL, it will search and release all stale devices.
 *  path:	Optional. When provided, it will release all unmounted devices
547d8367db3SAnand Jain  *		matching this path only.
548d8367db3SAnand Jain  *  skip_dev:	Optional. Will skip this device when searching for the stale
549d8367db3SAnand Jain  *		devices.
55070bc7088SAnand Jain  *  Return:	0 for success or if @path is NULL.
55170bc7088SAnand Jain  * 		-EBUSY if @path is a mounted device.
55270bc7088SAnand Jain  * 		-ENOENT if @path does not match any device in the list.
553d8367db3SAnand Jain  */
static int btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
	int ret = 0;

	lockdep_assert_held(&uuid_mutex);

	/*
	 * With a path given, start pessimistic: -ENOENT until the path
	 * matches a device below (then 0, or -EBUSY if it is mounted).
	 */
	if (path)
		ret = -ENOENT;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {

		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
				continue;
			/* a nameless device can never match a path */
			if (path && !device->name)
				continue;
			if (path && !device_path_matched(path, device))
				continue;
			if (fs_devices->opened) {
				/* for an already deleted device return 0 */
				if (path && ret != 0)
					ret = -EBUSY;
				break;
			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			ret = 0;
		}
		mutex_unlock(&fs_devices->device_list_mutex);

		/* drop the fs_devices once its last device is gone */
		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}

	return ret;
}
6024fde46f0SAnand Jain 
60318c850fdSJosef Bacik /*
60418c850fdSJosef Bacik  * This is only used on mount, and we are protected from competing things
60518c850fdSJosef Bacik  * messing with our fs_devices by the uuid_mutex, thus we do not need the
60618c850fdSJosef Bacik  * fs_devices->device_list_mutex here.
 *
 * Open the block device behind @device, verify that the superblock read from
 * it matches the devid/uuid already recorded for @device, and account the
 * device in @fs_devices (open_devices, rw_devices, seeding/rotating state).
 *
 * Returns 0 on success, the error from btrfs_get_bdev_and_sb() if the
 * superblock cannot be read, or -EINVAL if the device is already open, has
 * no name, or the on-disk identity does not match.
60718c850fdSJosef Bacik  */
6080fb08bccSAnand Jain static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
6090fb08bccSAnand Jain 			struct btrfs_device *device, fmode_t flags,
6100fb08bccSAnand Jain 			void *holder)
6110fb08bccSAnand Jain {
6120fb08bccSAnand Jain 	struct request_queue *q;
6130fb08bccSAnand Jain 	struct block_device *bdev;
6140fb08bccSAnand Jain 	struct btrfs_super_block *disk_super;
6150fb08bccSAnand Jain 	u64 devid;
6160fb08bccSAnand Jain 	int ret;
6170fb08bccSAnand Jain 
6180fb08bccSAnand Jain 	if (device->bdev)
6190fb08bccSAnand Jain 		return -EINVAL;
6200fb08bccSAnand Jain 	if (!device->name)
6210fb08bccSAnand Jain 		return -EINVAL;
6220fb08bccSAnand Jain 
6230fb08bccSAnand Jain 	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
6248f32380dSJohannes Thumshirn 				    &bdev, &disk_super);
6250fb08bccSAnand Jain 	if (ret)
6260fb08bccSAnand Jain 		return ret;
6270fb08bccSAnand Jain 
	/* The superblock on disk must describe this exact device. */
6280fb08bccSAnand Jain 	devid = btrfs_stack_device_id(&disk_super->dev_item);
6290fb08bccSAnand Jain 	if (devid != device->devid)
6308f32380dSJohannes Thumshirn 		goto error_free_page;
6310fb08bccSAnand Jain 
6320fb08bccSAnand Jain 	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
6338f32380dSJohannes Thumshirn 		goto error_free_page;
6340fb08bccSAnand Jain 
6350fb08bccSAnand Jain 	device->generation = btrfs_super_generation(disk_super);
6360fb08bccSAnand Jain 
6370fb08bccSAnand Jain 	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
6387239ff4bSNikolay Borisov 		if (btrfs_super_incompat_flags(disk_super) &
6397239ff4bSNikolay Borisov 		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
6407239ff4bSNikolay Borisov 			pr_err(
6417239ff4bSNikolay Borisov 		"BTRFS: Invalid seeding and uuid-changed device detected\n");
6428f32380dSJohannes Thumshirn 			goto error_free_page;
6437239ff4bSNikolay Borisov 		}
6447239ff4bSNikolay Borisov 
		/* Seed devices are never writeable through this fs. */
645ebbede42SAnand Jain 		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
6460395d84fSJohannes Thumshirn 		fs_devices->seeding = true;
6470fb08bccSAnand Jain 	} else {
648ebbede42SAnand Jain 		if (bdev_read_only(bdev))
649ebbede42SAnand Jain 			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
650ebbede42SAnand Jain 		else
651ebbede42SAnand Jain 			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
6520fb08bccSAnand Jain 	}
6530fb08bccSAnand Jain 
	/* One rotational member marks the whole filesystem as rotating. */
6540fb08bccSAnand Jain 	q = bdev_get_queue(bdev);
6550fb08bccSAnand Jain 	if (!blk_queue_nonrot(q))
6567f0432d0SJohannes Thumshirn 		fs_devices->rotating = true;
6570fb08bccSAnand Jain 
6580fb08bccSAnand Jain 	device->bdev = bdev;
659e12c9621SAnand Jain 	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
6600fb08bccSAnand Jain 	device->mode = flags;
6610fb08bccSAnand Jain 
6620fb08bccSAnand Jain 	fs_devices->open_devices++;
663ebbede42SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
664ebbede42SAnand Jain 	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
6650fb08bccSAnand Jain 		fs_devices->rw_devices++;
666b1b8e386SAnand Jain 		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
6670fb08bccSAnand Jain 	}
6688f32380dSJohannes Thumshirn 	btrfs_release_disk_super(disk_super);
6690fb08bccSAnand Jain 
6700fb08bccSAnand Jain 	return 0;
6710fb08bccSAnand Jain 
6728f32380dSJohannes Thumshirn error_free_page:
6738f32380dSJohannes Thumshirn 	btrfs_release_disk_super(disk_super);
6740fb08bccSAnand Jain 	blkdev_put(bdev, flags);
6750fb08bccSAnand Jain 
6760fb08bccSAnand Jain 	return -EINVAL;
6770fb08bccSAnand Jain }
6780fb08bccSAnand Jain 
67960999ca4SDavid Sterba /*
6807a62d0f0SNikolay Borisov  * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
681c0d81c7cSSu Yue  * being created with a disk that has already completed its fsid change. Such
682c0d81c7cSSu Yue  * disk can belong to an fs which has its FSID changed or to one which doesn't.
683c0d81c7cSSu Yue  * Handle both cases here.
6847a62d0f0SNikolay Borisov  */
6857a62d0f0SNikolay Borisov static struct btrfs_fs_devices *find_fsid_inprogress(
6867a62d0f0SNikolay Borisov 					struct btrfs_super_block *disk_super)
6877a62d0f0SNikolay Borisov {
6887a62d0f0SNikolay Borisov 	struct btrfs_fs_devices *fs_devices;
6897a62d0f0SNikolay Borisov 
	/*
	 * First look for a filesystem whose metadata_uuid differs from its
	 * fsid, whose metadata_uuid matches the scanned disk's fsid, and
	 * which has no change flagged on the fs_devices itself.
	 */
6907a62d0f0SNikolay Borisov 	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
6917a62d0f0SNikolay Borisov 		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
6927a62d0f0SNikolay Borisov 			   BTRFS_FSID_SIZE) != 0 &&
6937a62d0f0SNikolay Borisov 		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
6947a62d0f0SNikolay Borisov 			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
6957a62d0f0SNikolay Borisov 			return fs_devices;
6967a62d0f0SNikolay Borisov 		}
6977a62d0f0SNikolay Borisov 	}
6987a62d0f0SNikolay Borisov 
	/* No in-progress match; fall back to a plain fsid lookup. */
699c0d81c7cSSu Yue 	return find_fsid(disk_super->fsid, NULL);
7007a62d0f0SNikolay Borisov }
7017a62d0f0SNikolay Borisov 
702cc5de4e7SNikolay Borisov 
/*
 * Find the fs_devices matching a scanned device that missed one or more
 * completed fsid changes of its filesystem.  Returns the matching
 * fs_devices or NULL if none is registered.
 */
703cc5de4e7SNikolay Borisov static struct btrfs_fs_devices *find_fsid_changed(
704cc5de4e7SNikolay Borisov 					struct btrfs_super_block *disk_super)
705cc5de4e7SNikolay Borisov {
706cc5de4e7SNikolay Borisov 	struct btrfs_fs_devices *fs_devices;
707cc5de4e7SNikolay Borisov 
708cc5de4e7SNikolay Borisov 	/*
709cc5de4e7SNikolay Borisov 	 * Handles the case where scanned device is part of an fs that had
7101a9fd417SDavid Sterba 	 * multiple successful changes of FSID but currently device didn't
71105840710SNikolay Borisov 	 * observe it. Meaning our fsid will be different than theirs. We need
71205840710SNikolay Borisov 	 * to handle two subcases :
71305840710SNikolay Borisov 	 *  1 - The fs still continues to have different METADATA/FSID uuids.
71405840710SNikolay Borisov 	 *  2 - The fs is switched back to its original FSID (METADATA/FSID
71505840710SNikolay Borisov 	 *  are equal).
716cc5de4e7SNikolay Borisov 	 */
717cc5de4e7SNikolay Borisov 	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
71805840710SNikolay Borisov 		/* Changed UUIDs */
719cc5de4e7SNikolay Borisov 		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
720cc5de4e7SNikolay Borisov 			   BTRFS_FSID_SIZE) != 0 &&
721cc5de4e7SNikolay Borisov 		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
722cc5de4e7SNikolay Borisov 			   BTRFS_FSID_SIZE) == 0 &&
723cc5de4e7SNikolay Borisov 		    memcmp(fs_devices->fsid, disk_super->fsid,
72405840710SNikolay Borisov 			   BTRFS_FSID_SIZE) != 0)
725cc5de4e7SNikolay Borisov 			return fs_devices;
72605840710SNikolay Borisov 
72705840710SNikolay Borisov 		/* Unchanged UUIDs */
72805840710SNikolay Borisov 		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
72905840710SNikolay Borisov 			   BTRFS_FSID_SIZE) == 0 &&
73005840710SNikolay Borisov 		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
73105840710SNikolay Borisov 			   BTRFS_FSID_SIZE) == 0)
73205840710SNikolay Borisov 			return fs_devices;
733cc5de4e7SNikolay Borisov 	}
734cc5de4e7SNikolay Borisov 
735cc5de4e7SNikolay Borisov 	return NULL;
736cc5de4e7SNikolay Borisov }
7371362089dSNikolay Borisov 
/*
 * Find the fs_devices for a scanned device whose filesystem reverted its
 * metadata UUID back to the original FSID while this fs_devices still has
 * fsid_change pending.  Returns the matching fs_devices or NULL.
 */
7381362089dSNikolay Borisov static struct btrfs_fs_devices *find_fsid_reverted_metadata(
7391362089dSNikolay Borisov 				struct btrfs_super_block *disk_super)
7401362089dSNikolay Borisov {
7411362089dSNikolay Borisov 	struct btrfs_fs_devices *fs_devices;
7421362089dSNikolay Borisov 
7431362089dSNikolay Borisov 	/*
7441362089dSNikolay Borisov 	 * Handle the case where the scanned device is part of an fs whose last
7451362089dSNikolay Borisov 	 * metadata UUID change reverted it to the original FSID. At the same
7461362089dSNikolay Borisov 	 * time fs_devices was first created by another constituent device
7471362089dSNikolay Borisov 	 * which didn't fully observe the operation. This results in an
7481362089dSNikolay Borisov 	 * btrfs_fs_devices created with metadata/fsid different AND
7491362089dSNikolay Borisov 	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
7501362089dSNikolay Borisov 	 * fs_devices equal to the FSID of the disk.
7511362089dSNikolay Borisov 	 */
7521362089dSNikolay Borisov 	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
7531362089dSNikolay Borisov 		if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
7541362089dSNikolay Borisov 			   BTRFS_FSID_SIZE) != 0 &&
7551362089dSNikolay Borisov 		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
7561362089dSNikolay Borisov 			   BTRFS_FSID_SIZE) == 0 &&
7571362089dSNikolay Borisov 		    fs_devices->fsid_change)
7581362089dSNikolay Borisov 			return fs_devices;
7591362089dSNikolay Borisov 	}
7601362089dSNikolay Borisov 
7611362089dSNikolay Borisov 	return NULL;
7621362089dSNikolay Borisov }
7637a62d0f0SNikolay Borisov /*
76460999ca4SDavid Sterba  * Add new device to list of registered devices
76560999ca4SDavid Sterba  *
 * Looks up (or creates) the fs_devices this superblock belongs to, taking
 * the CHANGING_FSID_V2 / metadata_uuid states into account, then either
 * registers @path as a new btrfs_device or updates the recorded path of an
 * existing one.  Sets *@new_device_added when a device was allocated.
 * Takes and releases fs_devices->device_list_mutex internally.
 *
76660999ca4SDavid Sterba  * Returns:
767e124ece5SAnand Jain  * device pointer which was just added or updated when successful
768e124ece5SAnand Jain  * error pointer when failed
76960999ca4SDavid Sterba  */
770e124ece5SAnand Jain static noinline struct btrfs_device *device_list_add(const char *path,
7714306a974SAnand Jain 			   struct btrfs_super_block *disk_super,
7724306a974SAnand Jain 			   bool *new_device_added)
7738a4b83ccSChris Mason {
7748a4b83ccSChris Mason 	struct btrfs_device *device;
7757a62d0f0SNikolay Borisov 	struct btrfs_fs_devices *fs_devices = NULL;
776606686eeSJosef Bacik 	struct rcu_string *name;
7778a4b83ccSChris Mason 	u64 found_transid = btrfs_super_generation(disk_super);
7783acbcbfcSAnand Jain 	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
7797239ff4bSNikolay Borisov 	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
7807239ff4bSNikolay Borisov 		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
781d1a63002SNikolay Borisov 	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
782d1a63002SNikolay Borisov 					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
7838a4b83ccSChris Mason 
	/* Pick the lookup strategy matching the device's fsid-change state. */
784cc5de4e7SNikolay Borisov 	if (fsid_change_in_progress) {
785c0d81c7cSSu Yue 		if (!has_metadata_uuid)
7867a62d0f0SNikolay Borisov 			fs_devices = find_fsid_inprogress(disk_super);
787c0d81c7cSSu Yue 		else
788cc5de4e7SNikolay Borisov 			fs_devices = find_fsid_changed(disk_super);
7897a62d0f0SNikolay Borisov 	} else if (has_metadata_uuid) {
790c6730a0eSSu Yue 		fs_devices = find_fsid_with_metadata_uuid(disk_super);
7917a62d0f0SNikolay Borisov 	} else {
7921362089dSNikolay Borisov 		fs_devices = find_fsid_reverted_metadata(disk_super);
7931362089dSNikolay Borisov 		if (!fs_devices)
7947a62d0f0SNikolay Borisov 			fs_devices = find_fsid(disk_super->fsid, NULL);
7957a62d0f0SNikolay Borisov 	}
7967a62d0f0SNikolay Borisov 
7977a62d0f0SNikolay Borisov 
	/* First device scanned for this fsid: create a new fs_devices. */
7988a4b83ccSChris Mason 	if (!fs_devices) {
7997239ff4bSNikolay Borisov 		if (has_metadata_uuid)
8007239ff4bSNikolay Borisov 			fs_devices = alloc_fs_devices(disk_super->fsid,
8017239ff4bSNikolay Borisov 						      disk_super->metadata_uuid);
8027239ff4bSNikolay Borisov 		else
8037239ff4bSNikolay Borisov 			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
8047239ff4bSNikolay Borisov 
8052208a378SIlya Dryomov 		if (IS_ERR(fs_devices))
806e124ece5SAnand Jain 			return ERR_CAST(fs_devices);
8072208a378SIlya Dryomov 
80892900e51SAl Viro 		fs_devices->fsid_change = fsid_change_in_progress;
80992900e51SAl Viro 
8109c6d173eSAnand Jain 		mutex_lock(&fs_devices->device_list_mutex);
811c4babc5eSAnand Jain 		list_add(&fs_devices->fs_list, &fs_uuids);
8122208a378SIlya Dryomov 
8138a4b83ccSChris Mason 		device = NULL;
8148a4b83ccSChris Mason 	} else {
8159c6d173eSAnand Jain 		mutex_lock(&fs_devices->device_list_mutex);
81609ba3bc9SAnand Jain 		device = btrfs_find_device(fs_devices, devid,
817b2598edfSAnand Jain 				disk_super->dev_item.uuid, NULL);
8187a62d0f0SNikolay Borisov 
8197a62d0f0SNikolay Borisov 		/*
8207a62d0f0SNikolay Borisov 		 * If this disk has been pulled into an fs devices created by
8217a62d0f0SNikolay Borisov 		 * a device which had the CHANGING_FSID_V2 flag then replace the
8227a62d0f0SNikolay Borisov 		 * metadata_uuid/fsid values of the fs_devices.
8237a62d0f0SNikolay Borisov 		 */
8241362089dSNikolay Borisov 		if (fs_devices->fsid_change &&
8257a62d0f0SNikolay Borisov 		    found_transid > fs_devices->latest_generation) {
8267a62d0f0SNikolay Borisov 			memcpy(fs_devices->fsid, disk_super->fsid,
8277a62d0f0SNikolay Borisov 					BTRFS_FSID_SIZE);
8281362089dSNikolay Borisov 
8291362089dSNikolay Borisov 			if (has_metadata_uuid)
8307a62d0f0SNikolay Borisov 				memcpy(fs_devices->metadata_uuid,
8311362089dSNikolay Borisov 				       disk_super->metadata_uuid,
8321362089dSNikolay Borisov 				       BTRFS_FSID_SIZE);
8331362089dSNikolay Borisov 			else
8341362089dSNikolay Borisov 				memcpy(fs_devices->metadata_uuid,
8351362089dSNikolay Borisov 				       disk_super->fsid, BTRFS_FSID_SIZE);
8367a62d0f0SNikolay Borisov 
8377a62d0f0SNikolay Borisov 			fs_devices->fsid_change = false;
8387a62d0f0SNikolay Borisov 		}
8398a4b83ccSChris Mason 	}
840443f24feSMiao Xie 
	/* Unknown devid/uuid in this fs_devices: register a new device. */
8418a4b83ccSChris Mason 	if (!device) {
8429c6d173eSAnand Jain 		if (fs_devices->opened) {
8439c6d173eSAnand Jain 			mutex_unlock(&fs_devices->device_list_mutex);
844e124ece5SAnand Jain 			return ERR_PTR(-EBUSY);
8459c6d173eSAnand Jain 		}
8462b82032cSYan Zheng 
84712bd2fc0SIlya Dryomov 		device = btrfs_alloc_device(NULL, &devid,
84812bd2fc0SIlya Dryomov 					    disk_super->dev_item.uuid);
84912bd2fc0SIlya Dryomov 		if (IS_ERR(device)) {
8509c6d173eSAnand Jain 			mutex_unlock(&fs_devices->device_list_mutex);
8518a4b83ccSChris Mason 			/* we can safely leave the fs_devices entry around */
852e124ece5SAnand Jain 			return device;
8538a4b83ccSChris Mason 		}
854606686eeSJosef Bacik 
855606686eeSJosef Bacik 		name = rcu_string_strdup(path, GFP_NOFS);
856606686eeSJosef Bacik 		if (!name) {
857a425f9d4SDavid Sterba 			btrfs_free_device(device);
8589c6d173eSAnand Jain 			mutex_unlock(&fs_devices->device_list_mutex);
859e124ece5SAnand Jain 			return ERR_PTR(-ENOMEM);
8608a4b83ccSChris Mason 		}
861606686eeSJosef Bacik 		rcu_assign_pointer(device->name, name);
86290519d66SArne Jansen 
8631f78160cSXiao Guangrong 		list_add_rcu(&device->dev_list, &fs_devices->devices);
864f7171750SFilipe David Borba Manana 		fs_devices->num_devices++;
865e5e9a520SChris Mason 
8662b82032cSYan Zheng 		device->fs_devices = fs_devices;
8674306a974SAnand Jain 		*new_device_added = true;
868327f18ccSAnand Jain 
869327f18ccSAnand Jain 		if (disk_super->label[0])
870aa6c0df7SAnand Jain 			pr_info(
871aa6c0df7SAnand Jain 	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
872aa6c0df7SAnand Jain 				disk_super->label, devid, found_transid, path,
873aa6c0df7SAnand Jain 				current->comm, task_pid_nr(current));
874327f18ccSAnand Jain 		else
875aa6c0df7SAnand Jain 			pr_info(
876aa6c0df7SAnand Jain 	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
877aa6c0df7SAnand Jain 				disk_super->fsid, devid, found_transid, path,
878aa6c0df7SAnand Jain 				current->comm, task_pid_nr(current));
879327f18ccSAnand Jain 
880606686eeSJosef Bacik 	} else if (!device->name || strcmp(device->name->str, path)) {
881b96de000SAnand Jain 		/*
882b96de000SAnand Jain 		 * When FS is already mounted.
883b96de000SAnand Jain 		 * 1. If you are here and if the device->name is NULL that
884b96de000SAnand Jain 		 *    means this device was missing at time of FS mount.
885b96de000SAnand Jain 		 * 2. If you are here and if the device->name is different
886b96de000SAnand Jain 		 *    from 'path' that means either
887b96de000SAnand Jain 		 *      a. The same device disappeared and reappeared with
888b96de000SAnand Jain 		 *         different name. or
889b96de000SAnand Jain 		 *      b. The missing-disk-which-was-replaced, has
890b96de000SAnand Jain 		 *         reappeared now.
891b96de000SAnand Jain 		 *
892b96de000SAnand Jain 		 * We must allow 1 and 2a above. But 2b would be a spurious
893b96de000SAnand Jain 		 * and unintentional.
894b96de000SAnand Jain 		 *
895b96de000SAnand Jain 		 * Further in case of 1 and 2a above, the disk at 'path'
896b96de000SAnand Jain 		 * would have missed some transaction when it was away and
897b96de000SAnand Jain 		 * in case of 2a the stale bdev has to be updated as well.
898b96de000SAnand Jain 		 * 2b must not be allowed at all time.
899b96de000SAnand Jain 		 */
900b96de000SAnand Jain 
901b96de000SAnand Jain 		/*
9020f23ae74SChris Mason 		 * For now, we do allow update to btrfs_fs_device through the
9030f23ae74SChris Mason 		 * btrfs dev scan cli after FS has been mounted.  We're still
9040f23ae74SChris Mason 		 * tracking a problem where systems fail mount by subvolume id
9050f23ae74SChris Mason 		 * when we reject replacement on a mounted FS.
906b96de000SAnand Jain 		 */
9070f23ae74SChris Mason 		if (!fs_devices->opened && found_transid < device->generation) {
90877bdae4dSAnand Jain 			/*
90977bdae4dSAnand Jain 			 * That is if the FS is _not_ mounted and if you
91077bdae4dSAnand Jain 			 * are here, that means there is more than one
91177bdae4dSAnand Jain 			 * disk with same uuid and devid.We keep the one
91277bdae4dSAnand Jain 			 * with larger generation number or the last-in if
91377bdae4dSAnand Jain 			 * generation are equal.
91477bdae4dSAnand Jain 			 */
9159c6d173eSAnand Jain 			mutex_unlock(&fs_devices->device_list_mutex);
916e124ece5SAnand Jain 			return ERR_PTR(-EEXIST);
91777bdae4dSAnand Jain 		}
918b96de000SAnand Jain 
919a9261d41SAnand Jain 		/*
920a9261d41SAnand Jain 		 * We are going to replace the device path for a given devid,
921a9261d41SAnand Jain 		 * make sure it's the same device if the device is mounted
922a9261d41SAnand Jain 		 */
923a9261d41SAnand Jain 		if (device->bdev) {
9244e7b5671SChristoph Hellwig 			int error;
9254e7b5671SChristoph Hellwig 			dev_t path_dev;
926a9261d41SAnand Jain 
9274e7b5671SChristoph Hellwig 			error = lookup_bdev(path, &path_dev);
9284e7b5671SChristoph Hellwig 			if (error) {
929a9261d41SAnand Jain 				mutex_unlock(&fs_devices->device_list_mutex);
9304e7b5671SChristoph Hellwig 				return ERR_PTR(error);
931a9261d41SAnand Jain 			}
932a9261d41SAnand Jain 
9334e7b5671SChristoph Hellwig 			if (device->bdev->bd_dev != path_dev) {
934a9261d41SAnand Jain 				mutex_unlock(&fs_devices->device_list_mutex);
9350697d9a6SJohannes Thumshirn 				/*
9360697d9a6SJohannes Thumshirn 				 * device->fs_info may not be reliable here, so
9370697d9a6SJohannes Thumshirn 				 * pass in a NULL instead. This avoids a
9380697d9a6SJohannes Thumshirn 				 * possible use-after-free when the fs_info and
9390697d9a6SJohannes Thumshirn 				 * fs_info->sb are already torn down.
9400697d9a6SJohannes Thumshirn 				 */
9410697d9a6SJohannes Thumshirn 				btrfs_warn_in_rcu(NULL,
94279dae17dSAnand Jain 	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
94379dae17dSAnand Jain 						  path, devid, found_transid,
94479dae17dSAnand Jain 						  current->comm,
94579dae17dSAnand Jain 						  task_pid_nr(current));
946a9261d41SAnand Jain 				return ERR_PTR(-EEXIST);
947a9261d41SAnand Jain 			}
948a9261d41SAnand Jain 			btrfs_info_in_rcu(device->fs_info,
94979dae17dSAnand Jain 	"devid %llu device path %s changed to %s scanned by %s (%d)",
95079dae17dSAnand Jain 					  devid, rcu_str_deref(device->name),
95179dae17dSAnand Jain 					  path, current->comm,
95279dae17dSAnand Jain 					  task_pid_nr(current));
953a9261d41SAnand Jain 		}
954a9261d41SAnand Jain 
955606686eeSJosef Bacik 		name = rcu_string_strdup(path, GFP_NOFS);
9569c6d173eSAnand Jain 		if (!name) {
9579c6d173eSAnand Jain 			mutex_unlock(&fs_devices->device_list_mutex);
958e124ece5SAnand Jain 			return ERR_PTR(-ENOMEM);
9599c6d173eSAnand Jain 		}
960606686eeSJosef Bacik 		rcu_string_free(device->name);
961606686eeSJosef Bacik 		rcu_assign_pointer(device->name, name);
962e6e674bdSAnand Jain 		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
963cd02dca5SChris Mason 			fs_devices->missing_devices--;
964e6e674bdSAnand Jain 			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
965cd02dca5SChris Mason 		}
9668a4b83ccSChris Mason 	}
9678a4b83ccSChris Mason 
96877bdae4dSAnand Jain 	/*
96977bdae4dSAnand Jain 	 * Unmount does not free the btrfs_device struct but would zero
97077bdae4dSAnand Jain 	 * generation along with most of the other members. So just update
97177bdae4dSAnand Jain 	 * it back. We need it to pick the disk with largest generation
97277bdae4dSAnand Jain 	 * (as above).
97377bdae4dSAnand Jain 	 */
974d1a63002SNikolay Borisov 	if (!fs_devices->opened) {
97577bdae4dSAnand Jain 		device->generation = found_transid;
976d1a63002SNikolay Borisov 		fs_devices->latest_generation = max_t(u64, found_transid,
977d1a63002SNikolay Borisov 						fs_devices->latest_generation);
978d1a63002SNikolay Borisov 	}
97977bdae4dSAnand Jain 
980f2788d2fSAnand Jain 	fs_devices->total_devices = btrfs_super_num_devices(disk_super);
981f2788d2fSAnand Jain 
9829c6d173eSAnand Jain 	mutex_unlock(&fs_devices->device_list_mutex);
983e124ece5SAnand Jain 	return device;
9848a4b83ccSChris Mason }
9858a4b83ccSChris Mason 
/*
 * Duplicate @orig into a freshly allocated btrfs_fs_devices: same fsid and
 * total_devices, one new btrfs_device (with a copied name) per original
 * device.  The clones carry no open bdev state.  Caller must hold
 * uuid_mutex.  Returns the clone or an ERR_PTR on allocation failure.
 */
986e4404d6eSYan Zheng static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
987e4404d6eSYan Zheng {
988e4404d6eSYan Zheng 	struct btrfs_fs_devices *fs_devices;
989e4404d6eSYan Zheng 	struct btrfs_device *device;
990e4404d6eSYan Zheng 	struct btrfs_device *orig_dev;
991d2979aa2SAnand Jain 	int ret = 0;
992e4404d6eSYan Zheng 
993c1247069SAnand Jain 	lockdep_assert_held(&uuid_mutex);
994c1247069SAnand Jain 
9957239ff4bSNikolay Borisov 	fs_devices = alloc_fs_devices(orig->fsid, NULL);
9962208a378SIlya Dryomov 	if (IS_ERR(fs_devices))
9972208a378SIlya Dryomov 		return fs_devices;
998e4404d6eSYan Zheng 
99902db0844SJosef Bacik 	fs_devices->total_devices = orig->total_devices;
1000e4404d6eSYan Zheng 
1001e4404d6eSYan Zheng 	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
1002606686eeSJosef Bacik 		struct rcu_string *name;
1003606686eeSJosef Bacik 
100412bd2fc0SIlya Dryomov 		device = btrfs_alloc_device(NULL, &orig_dev->devid,
100512bd2fc0SIlya Dryomov 					    orig_dev->uuid);
1006d2979aa2SAnand Jain 		if (IS_ERR(device)) {
1007d2979aa2SAnand Jain 			ret = PTR_ERR(device);
1008e4404d6eSYan Zheng 			goto error;
1009d2979aa2SAnand Jain 		}
1010e4404d6eSYan Zheng 
1011606686eeSJosef Bacik 		/*
1012606686eeSJosef Bacik 		 * This is ok to do without rcu read locked because we hold the
1013606686eeSJosef Bacik 		 * uuid mutex so nothing we touch in here is going to disappear.
1014606686eeSJosef Bacik 		 */
1015e755f780SAnand Jain 		if (orig_dev->name) {
101678f2c9e6SDavid Sterba 			name = rcu_string_strdup(orig_dev->name->str,
101778f2c9e6SDavid Sterba 					GFP_KERNEL);
1018606686eeSJosef Bacik 			if (!name) {
1019a425f9d4SDavid Sterba 				btrfs_free_device(device);
1020d2979aa2SAnand Jain 				ret = -ENOMEM;
1021e4404d6eSYan Zheng 				goto error;
1022fd2696f3SJulia Lawall 			}
1023606686eeSJosef Bacik 			rcu_assign_pointer(device->name, name);
1024e755f780SAnand Jain 		}
1025e4404d6eSYan Zheng 
1026e4404d6eSYan Zheng 		list_add(&device->dev_list, &fs_devices->devices);
1027e4404d6eSYan Zheng 		device->fs_devices = fs_devices;
1028e4404d6eSYan Zheng 		fs_devices->num_devices++;
1029e4404d6eSYan Zheng 	}
1030e4404d6eSYan Zheng 	return fs_devices;
1031e4404d6eSYan Zheng error:
	/* Frees the partially built clone, including devices added so far. */
1032e4404d6eSYan Zheng 	free_fs_devices(fs_devices);
1033d2979aa2SAnand Jain 	return ERR_PTR(ret);
1034e4404d6eSYan Zheng }
1035e4404d6eSYan Zheng 
/*
 * Worker for btrfs_free_extra_devids(): close and free every device in
 * @fs_devices that is not marked IN_FS_METADATA (the dev-replace target
 * device is kept regardless).  Among the surviving, non-missing,
 * non-replace-target devices, track the one with the highest generation
 * in *@latest_dev.
 */
10363712ccb7SNikolay Borisov static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
1037bacce86aSAnand Jain 				      struct btrfs_device **latest_dev)
1038dfe25020SChris Mason {
1039c6e30871SQinghuang Feng 	struct btrfs_device *device, *next;
1040a6b0d5c8SChris Mason 
104146224705SXiao Guangrong 	/* This is the initialized path, it is safe to release the devices. */
1042c6e30871SQinghuang Feng 	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
10433712ccb7SNikolay Borisov 		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
1044401e29c1SAnand Jain 			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
1045401e29c1SAnand Jain 				      &device->dev_state) &&
1046998a0671SAnand Jain 			    !test_bit(BTRFS_DEV_STATE_MISSING,
1047998a0671SAnand Jain 				      &device->dev_state) &&
10483712ccb7SNikolay Borisov 			    (!*latest_dev ||
10493712ccb7SNikolay Borisov 			     device->generation > (*latest_dev)->generation)) {
10503712ccb7SNikolay Borisov 				*latest_dev = device;
1051a6b0d5c8SChris Mason 			}
10522b82032cSYan Zheng 			continue;
1053a6b0d5c8SChris Mason 		}
10542b82032cSYan Zheng 
10558dabb742SStefan Behrens 		/*
1056cf89af14SAnand Jain 		 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
1057cf89af14SAnand Jain 		 * in btrfs_init_dev_replace() so just continue.
10588dabb742SStefan Behrens 		 */
1059cf89af14SAnand Jain 		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
10608dabb742SStefan Behrens 			continue;
1061cf89af14SAnand Jain 
		/* Not part of this fs: close its bdev and drop it. */
1062a74a4b97SChris Mason 		if (device->bdev) {
1063d4d77629STejun Heo 			blkdev_put(device->bdev, device->mode);
10642b82032cSYan Zheng 			device->bdev = NULL;
1065a74a4b97SChris Mason 			fs_devices->open_devices--;
1066a74a4b97SChris Mason 		}
1067ebbede42SAnand Jain 		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
10682b82032cSYan Zheng 			list_del_init(&device->dev_alloc_list);
1069ebbede42SAnand Jain 			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1070b2a61667SDesmond Cheong Zhi Xi 			fs_devices->rw_devices--;
10712b82032cSYan Zheng 		}
10722b82032cSYan Zheng 		list_del_init(&device->dev_list);
10732b82032cSYan Zheng 		fs_devices->num_devices--;
1074a425f9d4SDavid Sterba 		btrfs_free_device(device);
10752b82032cSYan Zheng 	}
10762b82032cSYan Zheng 
10773712ccb7SNikolay Borisov }
10783712ccb7SNikolay Borisov 
10793712ccb7SNikolay Borisov /*
10803712ccb7SNikolay Borisov  * After we have read the system tree and know devids belonging to this
10813712ccb7SNikolay Borisov  * filesystem, remove the device which does not belong there.
 *
 * Both the main device list and every fs_devices on the seed list are
 * pruned, and fs_devices->latest_dev is set to the highest-generation
 * device found.  Takes uuid_mutex for the whole operation.
10823712ccb7SNikolay Borisov  */
1083bacce86aSAnand Jain void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
10843712ccb7SNikolay Borisov {
10853712ccb7SNikolay Borisov 	struct btrfs_device *latest_dev = NULL;
1086944d3f9fSNikolay Borisov 	struct btrfs_fs_devices *seed_dev;
10873712ccb7SNikolay Borisov 
10883712ccb7SNikolay Borisov 	mutex_lock(&uuid_mutex);
1089bacce86aSAnand Jain 	__btrfs_free_extra_devids(fs_devices, &latest_dev);
1090944d3f9fSNikolay Borisov 
1091944d3f9fSNikolay Borisov 	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
1092bacce86aSAnand Jain 		__btrfs_free_extra_devids(seed_dev, &latest_dev);
10932b82032cSYan Zheng 
1094d24fa5c1SAnand Jain 	fs_devices->latest_dev = latest_dev;
1095a6b0d5c8SChris Mason 
1096dfe25020SChris Mason 	mutex_unlock(&uuid_mutex);
1097dfe25020SChris Mason }
1098a0af469bSChris Mason 
/* Flush and release @device's block device; no-op if it was never opened. */
109914238819SAnand Jain static void btrfs_close_bdev(struct btrfs_device *device)
110014238819SAnand Jain {
110108ffcae8SDavid Sterba 	if (!device->bdev)
110208ffcae8SDavid Sterba 		return;
110308ffcae8SDavid Sterba 
	/* Only a writeable device can have dirty pages worth syncing. */
1104ebbede42SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
110514238819SAnand Jain 		sync_blockdev(device->bdev);
110614238819SAnand Jain 		invalidate_bdev(device->bdev);
110714238819SAnand Jain 	}
110814238819SAnand Jain 
110914238819SAnand Jain 	blkdev_put(device->bdev, device->mode);
111014238819SAnand Jain }
111114238819SAnand Jain 
/*
 * Close @device and reset it to the state it had right after allocation so
 * the btrfs_device can be reused on a subsequent mount (the trailing
 * ASSERTs verify that pristine state).
 */
1112959b1c04SNikolay Borisov static void btrfs_close_one_device(struct btrfs_device *device)
1113f448341aSAnand Jain {
1114f448341aSAnand Jain 	struct btrfs_fs_devices *fs_devices = device->fs_devices;
1115f448341aSAnand Jain 
1116ebbede42SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1117f448341aSAnand Jain 	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
1118f448341aSAnand Jain 		list_del_init(&device->dev_alloc_list);
1119f448341aSAnand Jain 		fs_devices->rw_devices--;
1120f448341aSAnand Jain 	}
1121f448341aSAnand Jain 
11220d977e0eSDesmond Cheong Zhi Xi 	if (device->devid == BTRFS_DEV_REPLACE_DEVID)
11230d977e0eSDesmond Cheong Zhi Xi 		clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
11240d977e0eSDesmond Cheong Zhi Xi 
1125e6e674bdSAnand Jain 	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
1126f448341aSAnand Jain 		fs_devices->missing_devices--;
1127f448341aSAnand Jain 
1128959b1c04SNikolay Borisov 	btrfs_close_bdev(device);
1129321f69f8SJohannes Thumshirn 	if (device->bdev) {
11303fff3975SJohannes Thumshirn 		fs_devices->open_devices--;
1131321f69f8SJohannes Thumshirn 		device->bdev = NULL;
1132f448341aSAnand Jain 	}
1133321f69f8SJohannes Thumshirn 	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
11345b316468SNaohiro Aota 	btrfs_destroy_dev_zone_info(device);
1135f448341aSAnand Jain 
1136321f69f8SJohannes Thumshirn 	device->fs_info = NULL;
1137321f69f8SJohannes Thumshirn 	atomic_set(&device->dev_stats_ccnt, 0);
1138321f69f8SJohannes Thumshirn 	extent_io_tree_release(&device->alloc_state);
1139959b1c04SNikolay Borisov 
11406b225baaSFilipe Manana 	/*
11416b225baaSFilipe Manana 	 * Reset the flush error record. We might have a transient flush error
11426b225baaSFilipe Manana 	 * in this mount, and if so we aborted the current transaction and set
11436b225baaSFilipe Manana 	 * the fs to an error state, guaranteeing no super blocks can be further
11446b225baaSFilipe Manana 	 * committed. However that error might be transient and if we unmount the
11456b225baaSFilipe Manana 	 * filesystem and mount it again, we should allow the mount to succeed
11466b225baaSFilipe Manana 	 * (btrfs_check_rw_degradable() should not fail) - if after mounting the
11476b225baaSFilipe Manana 	 * filesystem again we still get flush errors, then we will again abort
11486b225baaSFilipe Manana 	 * any transaction and set the error state, guaranteeing no commits of
11496b225baaSFilipe Manana 	 * unsafe super blocks.
11506b225baaSFilipe Manana 	 */
11516b225baaSFilipe Manana 	device->last_flush_error = 0;
11526b225baaSFilipe Manana 
1153321f69f8SJohannes Thumshirn 	/* Verify the device is back in a pristine state  */
1154321f69f8SJohannes Thumshirn 	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
1155321f69f8SJohannes Thumshirn 	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1156321f69f8SJohannes Thumshirn 	ASSERT(list_empty(&device->dev_alloc_list));
1157321f69f8SJohannes Thumshirn 	ASSERT(list_empty(&device->post_commit_list));
1158321f69f8SJohannes Thumshirn 	ASSERT(atomic_read(&device->reada_in_flight) == 0);
1159f448341aSAnand Jain }
1160f448341aSAnand Jain 
/*
 * Drop one open reference on @fs_devices; on the last reference close all
 * member devices and clear the open/seeding state.  Caller must hold
 * uuid_mutex.
 */
116154eed6aeSNikolay Borisov static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
11628a4b83ccSChris Mason {
11632037a093SSasha Levin 	struct btrfs_device *device, *tmp;
1164e4404d6eSYan Zheng 
1165425c6ed6SJosef Bacik 	lockdep_assert_held(&uuid_mutex);
1166425c6ed6SJosef Bacik 
11672b82032cSYan Zheng 	if (--fs_devices->opened > 0)
116854eed6aeSNikolay Borisov 		return;
11698a4b83ccSChris Mason 
1170425c6ed6SJosef Bacik 	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
1171959b1c04SNikolay Borisov 		btrfs_close_one_device(device);
1172c9513edbSXiao Guangrong 
1173e4404d6eSYan Zheng 	WARN_ON(fs_devices->open_devices);
1174e4404d6eSYan Zheng 	WARN_ON(fs_devices->rw_devices);
11752b82032cSYan Zheng 	fs_devices->opened = 0;
11760395d84fSJohannes Thumshirn 	fs_devices->seeding = false;
1177c4989c2fSNikolay Borisov 	fs_devices->fs_info = NULL;
11788a4b83ccSChris Mason }
11798a4b83ccSChris Mason 
/*
 * Close @fs_devices under uuid_mutex and, if that dropped the last open
 * reference, also close and free every fs_devices on its seed list.
 */
118054eed6aeSNikolay Borisov void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
11812b82032cSYan Zheng {
1182944d3f9fSNikolay Borisov 	LIST_HEAD(list);
1183944d3f9fSNikolay Borisov 	struct btrfs_fs_devices *tmp;
11842b82032cSYan Zheng 
11852b82032cSYan Zheng 	mutex_lock(&uuid_mutex);
118654eed6aeSNikolay Borisov 	close_fs_devices(fs_devices);
1187944d3f9fSNikolay Borisov 	if (!fs_devices->opened)
1188944d3f9fSNikolay Borisov 		list_splice_init(&fs_devices->seed_list, &list);
1189e4404d6eSYan Zheng 
1190944d3f9fSNikolay Borisov 	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
11910226e0ebSAnand Jain 		close_fs_devices(fs_devices);
1192944d3f9fSNikolay Borisov 		list_del(&fs_devices->seed_list);
1193e4404d6eSYan Zheng 		free_fs_devices(fs_devices);
1194e4404d6eSYan Zheng 	}
1195425c6ed6SJosef Bacik 	mutex_unlock(&uuid_mutex);
11962b82032cSYan Zheng }
11972b82032cSYan Zheng 
/*
 * Open every registered device of @fs_devices with exclusive (FMODE_EXCL)
 * semantics, remembering the highest-generation device as latest_dev.
 * Devices whose superblock can no longer be read (-ENODATA from
 * btrfs_open_one_device()) are dropped from the list.
 *
 * Returns 0 if at least one device could be opened, -EINVAL otherwise.
 */
1198897fb573SAnand Jain static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
119997288f2cSChristoph Hellwig 				fmode_t flags, void *holder)
12008a4b83ccSChris Mason {
12018a4b83ccSChris Mason 	struct btrfs_device *device;
1202443f24feSMiao Xie 	struct btrfs_device *latest_dev = NULL;
120396c2e067SAnand Jain 	struct btrfs_device *tmp_device;
12048a4b83ccSChris Mason 
1205d4d77629STejun Heo 	flags |= FMODE_EXCL;
1206d4d77629STejun Heo 
120796c2e067SAnand Jain 	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
120896c2e067SAnand Jain 				 dev_list) {
120996c2e067SAnand Jain 		int ret;
1210a0af469bSChris Mason 
121196c2e067SAnand Jain 		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
121296c2e067SAnand Jain 		if (ret == 0 &&
121396c2e067SAnand Jain 		    (!latest_dev || device->generation > latest_dev->generation)) {
12149f050db4SAnand Jain 			latest_dev = device;
121596c2e067SAnand Jain 		} else if (ret == -ENODATA) {
121696c2e067SAnand Jain 			fs_devices->num_devices--;
121796c2e067SAnand Jain 			list_del(&device->dev_list);
121896c2e067SAnand Jain 			btrfs_free_device(device);
121996c2e067SAnand Jain 		}
12208a4b83ccSChris Mason 	}
12211ed802c9SAnand Jain 	if (fs_devices->open_devices == 0)
12221ed802c9SAnand Jain 		return -EINVAL;
12231ed802c9SAnand Jain 
12242b82032cSYan Zheng 	fs_devices->opened = 1;
1225d24fa5c1SAnand Jain 	fs_devices->latest_dev = latest_dev;
12262b82032cSYan Zheng 	fs_devices->total_rw_bytes = 0;
1227c4a816c6SNaohiro Aota 	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
122833fd2f71SAnand Jain 	fs_devices->read_policy = BTRFS_READ_POLICY_PID;
12291ed802c9SAnand Jain 
12301ed802c9SAnand Jain 	return 0;
12312b82032cSYan Zheng }
12322b82032cSYan Zheng 
12334f0f586bSSami Tolvanen static int devid_cmp(void *priv, const struct list_head *a,
12344f0f586bSSami Tolvanen 		     const struct list_head *b)
1235f8e10cd3SAnand Jain {
1236214cc184SDavid Sterba 	const struct btrfs_device *dev1, *dev2;
1237f8e10cd3SAnand Jain 
1238f8e10cd3SAnand Jain 	dev1 = list_entry(a, struct btrfs_device, dev_list);
1239f8e10cd3SAnand Jain 	dev2 = list_entry(b, struct btrfs_device, dev_list);
1240f8e10cd3SAnand Jain 
1241f8e10cd3SAnand Jain 	if (dev1->devid < dev2->devid)
1242f8e10cd3SAnand Jain 		return -1;
1243f8e10cd3SAnand Jain 	else if (dev1->devid > dev2->devid)
1244f8e10cd3SAnand Jain 		return 1;
1245f8e10cd3SAnand Jain 	return 0;
1246f8e10cd3SAnand Jain }
1247f8e10cd3SAnand Jain 
/*
 * Open all devices of @fs_devices, or just bump the open count when they are
 * already open.  Returns 0 on success or a negative errno propagated from
 * open_fs_devices().
 */
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	lockdep_assert_held(&uuid_mutex);
	/*
	 * The device_list_mutex cannot be taken here in case opening the
	 * underlying device takes further locks like open_mutex.
	 *
	 * We also don't need the lock here as this is called during mount and
	 * exclusion is provided by uuid_mutex
	 */

	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		/* Sort by devid so the open pass walks devices in id order. */
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = open_fs_devices(fs_devices, flags, holder);
	}

	return ret;
}
12728a4b83ccSChris Mason 
/*
 * Drop the page cache reference pinning @super, taken when the superblock
 * was read via btrfs_read_disk_super().
 */
void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	put_page(virt_to_page(super));
}
12796cf86a00SAnand Jain 
/*
 * Read the superblock at @bytenr of @bdev via the page cache and perform
 * basic sanity checks (magic, recorded bytenr).  On success the returned
 * pointer is backed by a page cache page and must be released with
 * btrfs_release_disk_super().  @bytenr_orig is the nominal superblock offset
 * used for the bytenr check (may differ from @bytenr, e.g. for zoned
 * superblock log locations -- see the caller).
 */
static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
						       u64 bytenr, u64 bytenr_orig)
{
	struct btrfs_super_block *disk_super;
	struct page *page;
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return ERR_PTR(-EINVAL);

	/* make sure our super fits in the page */
	if (sizeof(*disk_super) > PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
		return ERR_PTR(-EINVAL);

	/* pull in the page with our super */
	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);

	if (IS_ERR(page))
		return ERR_CAST(page);

	p = page_address(page);

	/* align our pointer to the offset of the super block */
	disk_super = p + offset_in_page(bytenr);

	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(p);
		return ERR_PTR(-EINVAL);
	}

	/* Defensively NUL-terminate the label if it fills the whole buffer. */
	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;

	return disk_super;
}
13236cf86a00SAnand Jain 
/*
 * Drop cached stale device state.  With a non-empty @path only devices
 * matching that path are forgotten, otherwise all stale devices are.
 */
int btrfs_forget_devices(const char *path)
{
	int ret;

	mutex_lock(&uuid_mutex);
	ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
	mutex_unlock(&uuid_mutex);

	return ret;
}
1334228a73abSAnand Jain 
13356f60cbd3SDavid Sterba /*
13366f60cbd3SDavid Sterba  * Look for a btrfs signature on a device. This may be called out of the mount path
13376f60cbd3SDavid Sterba  * and we are not allowed to call set_blocksize during the scan. The superblock
13386f60cbd3SDavid Sterba  * is read via pagecache
13396f60cbd3SDavid Sterba  */
134036350e95SGu Jinxiang struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
134136350e95SGu Jinxiang 					   void *holder)
13428a4b83ccSChris Mason {
13438a4b83ccSChris Mason 	struct btrfs_super_block *disk_super;
13444306a974SAnand Jain 	bool new_device_added = false;
134536350e95SGu Jinxiang 	struct btrfs_device *device = NULL;
13468a4b83ccSChris Mason 	struct block_device *bdev;
134712659251SNaohiro Aota 	u64 bytenr, bytenr_orig;
134812659251SNaohiro Aota 	int ret;
13498a4b83ccSChris Mason 
1350899f9307SDavid Sterba 	lockdep_assert_held(&uuid_mutex);
1351899f9307SDavid Sterba 
13526f60cbd3SDavid Sterba 	/*
13536f60cbd3SDavid Sterba 	 * we would like to check all the supers, but that would make
13546f60cbd3SDavid Sterba 	 * a btrfs mount succeed after a mkfs from a different FS.
13556f60cbd3SDavid Sterba 	 * So, we need to add a special mount option to scan for
13566f60cbd3SDavid Sterba 	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
13576f60cbd3SDavid Sterba 	 */
1358d4d77629STejun Heo 	flags |= FMODE_EXCL;
13596f60cbd3SDavid Sterba 
13606f60cbd3SDavid Sterba 	bdev = blkdev_get_by_path(path, flags, holder);
1361b6ed73bcSAnand Jain 	if (IS_ERR(bdev))
136236350e95SGu Jinxiang 		return ERR_CAST(bdev);
13636f60cbd3SDavid Sterba 
136412659251SNaohiro Aota 	bytenr_orig = btrfs_sb_offset(0);
136512659251SNaohiro Aota 	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
136612659251SNaohiro Aota 	if (ret)
136712659251SNaohiro Aota 		return ERR_PTR(ret);
136812659251SNaohiro Aota 
136912659251SNaohiro Aota 	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
1370b335eab8SNikolay Borisov 	if (IS_ERR(disk_super)) {
1371b335eab8SNikolay Borisov 		device = ERR_CAST(disk_super);
13726f60cbd3SDavid Sterba 		goto error_bdev_put;
137305a5c55dSAnand Jain 	}
13746f60cbd3SDavid Sterba 
13754306a974SAnand Jain 	device = device_list_add(path, disk_super, &new_device_added);
137636350e95SGu Jinxiang 	if (!IS_ERR(device)) {
13774306a974SAnand Jain 		if (new_device_added)
13784306a974SAnand Jain 			btrfs_free_stale_devices(path, device);
13794306a974SAnand Jain 	}
13806f60cbd3SDavid Sterba 
13818f32380dSJohannes Thumshirn 	btrfs_release_disk_super(disk_super);
13826f60cbd3SDavid Sterba 
13836f60cbd3SDavid Sterba error_bdev_put:
1384d4d77629STejun Heo 	blkdev_put(bdev, flags);
1385b6ed73bcSAnand Jain 
138636350e95SGu Jinxiang 	return device;
13878a4b83ccSChris Mason }
13880b86a832SChris Mason 
/*
 * Try to find a chunk that intersects [start, start + len] range and when one
 * such is found, record the end of it in *start.
 *
 * Returns true (and advances *start just past the allocated range) when the
 * range overlaps an already-allocated extent, false otherwise.
 */
static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
				    u64 len)
{
	u64 physical_start, physical_end;

	lockdep_assert_held(&device->fs_info->chunk_mutex);

	if (!find_first_extent_bit(&device->alloc_state, *start,
				   &physical_start, &physical_end,
				   CHUNK_ALLOCATED, NULL)) {

		/* Overlap in either direction counts as an intersection. */
		if (in_range(physical_start, *start, len) ||
		    in_range(*start, physical_start,
			     physical_end - physical_start)) {
			*start = physical_end + 1;
			return true;
		}
	}
	return false;
}
14136df9a95eSJosef Bacik 
14143b4ffa40SNaohiro Aota static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
14153b4ffa40SNaohiro Aota {
14163b4ffa40SNaohiro Aota 	switch (device->fs_devices->chunk_alloc_policy) {
14173b4ffa40SNaohiro Aota 	case BTRFS_CHUNK_ALLOC_REGULAR:
14183b4ffa40SNaohiro Aota 		/*
14193b4ffa40SNaohiro Aota 		 * We don't want to overwrite the superblock on the drive nor
14203b4ffa40SNaohiro Aota 		 * any area used by the boot loader (grub for example), so we
14213b4ffa40SNaohiro Aota 		 * make sure to start at an offset of at least 1MB.
14223b4ffa40SNaohiro Aota 		 */
14233b4ffa40SNaohiro Aota 		return max_t(u64, start, SZ_1M);
14241cd6121fSNaohiro Aota 	case BTRFS_CHUNK_ALLOC_ZONED:
14251cd6121fSNaohiro Aota 		/*
14261cd6121fSNaohiro Aota 		 * We don't care about the starting region like regular
14271cd6121fSNaohiro Aota 		 * allocator, because we anyway use/reserve the first two zones
14281cd6121fSNaohiro Aota 		 * for superblock logging.
14291cd6121fSNaohiro Aota 		 */
14301cd6121fSNaohiro Aota 		return ALIGN(start, device->zone_info->zone_size);
14313b4ffa40SNaohiro Aota 	default:
14323b4ffa40SNaohiro Aota 		BUG();
14333b4ffa40SNaohiro Aota 	}
14343b4ffa40SNaohiro Aota }
14353b4ffa40SNaohiro Aota 
/*
 * Adjust the hole [*hole_start, *hole_start + *hole_size) so that it starts
 * at a position where @num_bytes can be allocated on a zoned device, or
 * shrink it to nothing when no such position exists.  Returns true when
 * *hole_start / *hole_size were modified.
 */
static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
					u64 *hole_start, u64 *hole_size,
					u64 num_bytes)
{
	u64 zone_size = device->zone_info->zone_size;
	u64 pos;
	int ret;
	bool changed = false;

	ASSERT(IS_ALIGNED(*hole_start, zone_size));

	while (*hole_size > 0) {
		pos = btrfs_find_allocatable_zones(device, *hole_start,
						   *hole_start + *hole_size,
						   num_bytes);
		if (pos != *hole_start) {
			/* Trim the hole to begin at the allocatable position. */
			*hole_size = *hole_start + *hole_size - pos;
			*hole_start = pos;
			changed = true;
			if (*hole_size < num_bytes)
				break;
		}

		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);

		/* Range is ensured to be empty */
		if (!ret)
			return changed;

		/* Given hole range was invalid (outside of device) */
		if (ret == -ERANGE) {
			*hole_start += *hole_size;
			*hole_size = 0;
			return true;
		}

		/* Zones were not empty: skip one zone and retry. */
		*hole_start += zone_size;
		*hole_size -= zone_size;
		changed = true;
	}

	return changed;
}
14791cd6121fSNaohiro Aota 
/**
 * dev_extent_hole_check - check if specified hole is suitable for allocation
 * @device:	the device which we have the hole
 * @hole_start: starting position of the hole
 * @hole_size:	the size of the hole
 * @num_bytes:	the size of the free space that we need
 *
 * This function may modify @hole_start and @hole_size to reflect the suitable
 * position for allocation. Returns true if the hole position was updated,
 * false otherwise (the return type is bool, not 0/1 as an int).
 */
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
				  u64 *hole_size, u64 num_bytes)
{
	bool changed = false;
	u64 hole_end = *hole_start + *hole_size;

	for (;;) {
		/*
		 * Check before we set max_hole_start, otherwise we could end up
		 * sending back this offset anyway.
		 */
		if (contains_pending_extent(device, hole_start, *hole_size)) {
			if (hole_end >= *hole_start)
				*hole_size = hole_end - *hole_start;
			else
				*hole_size = 0;
			changed = true;
		}

		switch (device->fs_devices->chunk_alloc_policy) {
		case BTRFS_CHUNK_ALLOC_REGULAR:
			/* No extra check */
			break;
		case BTRFS_CHUNK_ALLOC_ZONED:
			if (dev_extent_hole_check_zoned(device, hole_start,
							hole_size, num_bytes)) {
				changed = true;
				/*
				 * The changed hole can contain pending extent.
				 * Loop again to check that.
				 */
				continue;
			}
			break;
		default:
			BUG();
		}

		break;
	}

	return changed;
}
15336df9a95eSJosef Bacik 
/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space. that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find. But if we
 * don't find suitable free space, it will be used to store the start position
 * of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 *
 * NOTE: This function will search *commit* root of device tree, and does extra
 * check to ensure dev extents are not double allocated.
 * This makes the function safe to allocate dev extents but may not report
 * correct usable device space, as device extent freed in current transaction
 * is not reported as available.
 */
static int find_free_dev_extent_start(struct btrfs_device *device,
				u64 num_bytes, u64 search_start, u64 *start,
				u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	/* Apply the policy-specific minimum/alignment to the start offset. */
	search_start = dev_extent_search_start(device, search_start);

	WARN_ON(device->zone_info &&
		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	/* A replace target device must not receive new allocations. */
	if (search_start >= search_end ||
		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	/* Lock-free read of the commit root (see NOTE above). */
	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_backwards(root, &key, path);
	if (ret < 0)
		goto out;

	/* Walk the device's extent items, measuring the gaps between them. */
	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;
			dev_extent_hole_check(device, &search_start, &hole_size,
					      num_bytes);

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than which we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;
		if (dev_extent_hole_check(device, &search_start, &hole_size,
					  num_bytes)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}
17000b86a832SChris Mason 
/*
 * Convenience wrapper around find_free_dev_extent_start() that always
 * begins the search at offset 0 of @device.
 */
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
}
1707499f377fSJeff Mahoney 
/*
 * Remove the dev extent item covering @start on @device from the device
 * tree and return its on-disk length in @dev_extent_len.
 *
 * Returns 0 on success, a positive value when no covering dev extent item
 * exists, or a negative errno on search/delete failure.
 */
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_device *device,
			  u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		/*
		 * No exact match: the extent covering @start must be the
		 * previous DEV_EXTENT item.  Re-search at its key so the
		 * delete below removes the correct item.
		 */
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		/* Invariant: the found extent must actually cover @start. */
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret == 0)
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
out:
	btrfs_free_path(path);
	return ret;
}
17618f18cf13SChris Mason 
17626df9a95eSJosef Bacik static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
17630b86a832SChris Mason {
17646df9a95eSJosef Bacik 	struct extent_map_tree *em_tree;
17656df9a95eSJosef Bacik 	struct extent_map *em;
17666df9a95eSJosef Bacik 	struct rb_node *n;
17676df9a95eSJosef Bacik 	u64 ret = 0;
17680b86a832SChris Mason 
1769c8bf1b67SDavid Sterba 	em_tree = &fs_info->mapping_tree;
17706df9a95eSJosef Bacik 	read_lock(&em_tree->lock);
177107e1ce09SLiu Bo 	n = rb_last(&em_tree->map.rb_root);
17726df9a95eSJosef Bacik 	if (n) {
17736df9a95eSJosef Bacik 		em = rb_entry(n, struct extent_map, rb_node);
17746df9a95eSJosef Bacik 		ret = em->start + em->len;
1775e17cade2SChris Mason 	}
17766df9a95eSJosef Bacik 	read_unlock(&em_tree->lock);
17776df9a95eSJosef Bacik 
17780b86a832SChris Mason 	return ret;
17790b86a832SChris Mason }
17800b86a832SChris Mason 
/*
 * Compute the next available device id by looking at the last DEV_ITEM key
 * in the chunk tree.  Stores the result in @devid_ret; returns 0 on success,
 * -EUCLEAN when the tree is detectably corrupted, or a negative errno on
 * search failure.
 */
static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Search for the largest possible devid; an exact hit is corruption. */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	if (ret == 0) {
		/* Corruption */
		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
		ret = -EUCLEAN;
		goto error;
	}

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		/* No device items at all: device ids start at 1. */
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}
18230b86a832SChris Mason 
/*
 * The device information is stored in the chunk root.
 * The btrfs_device struct should be fully filled in before calling this.
 */
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
			    struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* DEV_ITEMs are keyed (BTRFS_DEV_ITEMS_OBJECTID, DEV_ITEM, devid) */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
				      &key, sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	/* Copy the in-memory device state into the on-disk item */
	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	/* These fields are not used, always written as 0 */
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	/* The fsid stored in a dev item is the filesystem's metadata_uuid */
	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
			    ptr, BTRFS_FSID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
18818f18cf13SChris Mason 
18825a1972bdSQu Wenruo /*
18835a1972bdSQu Wenruo  * Function to update ctime/mtime for a given device path.
18845a1972bdSQu Wenruo  * Mainly used for ctime/mtime based probe like libblkid.
18855a1972bdSQu Wenruo  */
18868f96a5bfSJosef Bacik static void update_dev_time(struct block_device *bdev)
18875a1972bdSQu Wenruo {
18888f96a5bfSJosef Bacik 	struct inode *inode = bdev->bd_inode;
18898f96a5bfSJosef Bacik 	struct timespec64 now;
18905a1972bdSQu Wenruo 
18918f96a5bfSJosef Bacik 	/* Shouldn't happen but just in case. */
18928f96a5bfSJosef Bacik 	if (!inode)
18935a1972bdSQu Wenruo 		return;
18948f96a5bfSJosef Bacik 
18958f96a5bfSJosef Bacik 	now = current_time(inode);
18968f96a5bfSJosef Bacik 	generic_update_time(inode, &now, S_MTIME | S_CTIME);
18975a1972bdSQu Wenruo }
18985a1972bdSQu Wenruo 
/*
 * Delete the DEV_ITEM of @device from the chunk tree.
 *
 * Runs in its own transaction which is committed on success and aborted on
 * any failure.  Returns 0 or a negative errno.
 */
static int btrfs_rm_dev_item(struct btrfs_device *device)
{
	struct btrfs_root *root = device->fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		/* ret > 0 means the key was not found */
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);
	/* Commit only if the item was deleted successfully */
	if (!ret)
		ret = btrfs_commit_transaction(trans);
	return ret;
}
1941a061fc8dSChris Mason 
19423cc31a0dSDavid Sterba /*
19433cc31a0dSDavid Sterba  * Verify that @num_devices satisfies the RAID profile constraints in the whole
19443cc31a0dSDavid Sterba  * filesystem. It's up to the caller to adjust that number regarding eg. device
19453cc31a0dSDavid Sterba  * replace.
19463cc31a0dSDavid Sterba  */
19473cc31a0dSDavid Sterba static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
19483cc31a0dSDavid Sterba 		u64 num_devices)
1949a061fc8dSChris Mason {
1950a061fc8dSChris Mason 	u64 all_avail;
1951de98ced9SMiao Xie 	unsigned seq;
1952418775a2SDavid Sterba 	int i;
1953a061fc8dSChris Mason 
1954de98ced9SMiao Xie 	do {
1955bd45ffbcSAnand Jain 		seq = read_seqbegin(&fs_info->profiles_lock);
1956de98ced9SMiao Xie 
1957bd45ffbcSAnand Jain 		all_avail = fs_info->avail_data_alloc_bits |
1958bd45ffbcSAnand Jain 			    fs_info->avail_system_alloc_bits |
1959bd45ffbcSAnand Jain 			    fs_info->avail_metadata_alloc_bits;
1960bd45ffbcSAnand Jain 	} while (read_seqretry(&fs_info->profiles_lock, seq));
1961f1fa7f26SAnand Jain 
1962418775a2SDavid Sterba 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
196341a6e891SAnand Jain 		if (!(all_avail & btrfs_raid_array[i].bg_flag))
1964418775a2SDavid Sterba 			continue;
1965a061fc8dSChris Mason 
1966efc222f8SAnand Jain 		if (num_devices < btrfs_raid_array[i].devs_min)
1967efc222f8SAnand Jain 			return btrfs_raid_array[i].mindev_error;
1968bd45ffbcSAnand Jain 	}
1969bd45ffbcSAnand Jain 
1970bd45ffbcSAnand Jain 	return 0;
1971f1fa7f26SAnand Jain }
1972f1fa7f26SAnand Jain 
1973c9162bdfSOmar Sandoval static struct btrfs_device * btrfs_find_next_active_device(
1974c9162bdfSOmar Sandoval 		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
197588acff64SAnand Jain {
197688acff64SAnand Jain 	struct btrfs_device *next_device;
197788acff64SAnand Jain 
197888acff64SAnand Jain 	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
197988acff64SAnand Jain 		if (next_device != device &&
1980e6e674bdSAnand Jain 		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1981e6e674bdSAnand Jain 		    && next_device->bdev)
198288acff64SAnand Jain 			return next_device;
198388acff64SAnand Jain 	}
198488acff64SAnand Jain 
198588acff64SAnand Jain 	return NULL;
198688acff64SAnand Jain }
198788acff64SAnand Jain 
198888acff64SAnand Jain /*
1989d24fa5c1SAnand Jain  * Helper function to check if the given device is part of s_bdev / latest_dev
199088acff64SAnand Jain  * and replace it with the provided or the next active device, in the context
199188acff64SAnand Jain  * where this function called, there should be always be another device (or
199288acff64SAnand Jain  * this_dev) which is active.
199388acff64SAnand Jain  */
1994b105e927SDavid Sterba void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
1995e493e8f9SAnand Jain 					    struct btrfs_device *next_device)
199688acff64SAnand Jain {
1997d6507cf1SNikolay Borisov 	struct btrfs_fs_info *fs_info = device->fs_info;
199888acff64SAnand Jain 
1999e493e8f9SAnand Jain 	if (!next_device)
200088acff64SAnand Jain 		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
200188acff64SAnand Jain 							    device);
200288acff64SAnand Jain 	ASSERT(next_device);
200388acff64SAnand Jain 
200488acff64SAnand Jain 	if (fs_info->sb->s_bdev &&
200588acff64SAnand Jain 			(fs_info->sb->s_bdev == device->bdev))
200688acff64SAnand Jain 		fs_info->sb->s_bdev = next_device->bdev;
200788acff64SAnand Jain 
2008d24fa5c1SAnand Jain 	if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
2009d24fa5c1SAnand Jain 		fs_info->fs_devices->latest_dev = next_device;
201088acff64SAnand Jain }
201188acff64SAnand Jain 
20121da73967SAnand Jain /*
20131da73967SAnand Jain  * Return btrfs_fs_devices::num_devices excluding the device that's being
20141da73967SAnand Jain  * currently replaced.
20151da73967SAnand Jain  */
20161da73967SAnand Jain static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
20171da73967SAnand Jain {
20181da73967SAnand Jain 	u64 num_devices = fs_info->fs_devices->num_devices;
20191da73967SAnand Jain 
2020cb5583ddSDavid Sterba 	down_read(&fs_info->dev_replace.rwsem);
20211da73967SAnand Jain 	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
20221da73967SAnand Jain 		ASSERT(num_devices > 1);
20231da73967SAnand Jain 		num_devices--;
20241da73967SAnand Jain 	}
2025cb5583ddSDavid Sterba 	up_read(&fs_info->dev_replace.rwsem);
20261da73967SAnand Jain 
20271da73967SAnand Jain 	return num_devices;
20281da73967SAnand Jain }
20291da73967SAnand Jain 
2030313b0858SJosef Bacik void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
20318f32380dSJohannes Thumshirn 			       struct block_device *bdev,
20326fbceb9fSJohannes Thumshirn 			       const char *device_path)
20336fbceb9fSJohannes Thumshirn {
20346fbceb9fSJohannes Thumshirn 	struct btrfs_super_block *disk_super;
20356fbceb9fSJohannes Thumshirn 	int copy_num;
20366fbceb9fSJohannes Thumshirn 
20376fbceb9fSJohannes Thumshirn 	if (!bdev)
20386fbceb9fSJohannes Thumshirn 		return;
20396fbceb9fSJohannes Thumshirn 
20406fbceb9fSJohannes Thumshirn 	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
20418f32380dSJohannes Thumshirn 		struct page *page;
20428f32380dSJohannes Thumshirn 		int ret;
20438f32380dSJohannes Thumshirn 
20448f32380dSJohannes Thumshirn 		disk_super = btrfs_read_dev_one_super(bdev, copy_num);
20458f32380dSJohannes Thumshirn 		if (IS_ERR(disk_super))
20466fbceb9fSJohannes Thumshirn 			continue;
20476fbceb9fSJohannes Thumshirn 
204812659251SNaohiro Aota 		if (bdev_is_zoned(bdev)) {
204912659251SNaohiro Aota 			btrfs_reset_sb_log_zones(bdev, copy_num);
205012659251SNaohiro Aota 			continue;
205112659251SNaohiro Aota 		}
205212659251SNaohiro Aota 
20536fbceb9fSJohannes Thumshirn 		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
20548f32380dSJohannes Thumshirn 
20558f32380dSJohannes Thumshirn 		page = virt_to_page(disk_super);
20568f32380dSJohannes Thumshirn 		set_page_dirty(page);
20578f32380dSJohannes Thumshirn 		lock_page(page);
20588f32380dSJohannes Thumshirn 		/* write_on_page() unlocks the page */
20598f32380dSJohannes Thumshirn 		ret = write_one_page(page);
20608f32380dSJohannes Thumshirn 		if (ret)
20618f32380dSJohannes Thumshirn 			btrfs_warn(fs_info,
20628f32380dSJohannes Thumshirn 				"error clearing superblock number %d (%d)",
20638f32380dSJohannes Thumshirn 				copy_num, ret);
20648f32380dSJohannes Thumshirn 		btrfs_release_disk_super(disk_super);
20658f32380dSJohannes Thumshirn 
20666fbceb9fSJohannes Thumshirn 	}
20676fbceb9fSJohannes Thumshirn 
20686fbceb9fSJohannes Thumshirn 	/* Notify udev that device has changed */
20696fbceb9fSJohannes Thumshirn 	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
20706fbceb9fSJohannes Thumshirn 
20716fbceb9fSJohannes Thumshirn 	/* Update ctime/mtime for device path for libblkid */
20728f96a5bfSJosef Bacik 	update_dev_time(bdev);
20736fbceb9fSJohannes Thumshirn }
20746fbceb9fSJohannes Thumshirn 
/*
 * Remove a device from a mounted filesystem.
 *
 * The device is identified by @devid, or by @device_path when @devid is 0
 * (see btrfs_find_device_by_devspec()).  On success *@bdev and *@mode are
 * set and the caller is responsible for the final blkdev_put() (see the
 * comment before the scratch step for why it cannot happen here).
 *
 * Returns 0, a negative errno, or a BTRFS_ERROR_DEV_* status code.
 */
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
		    u64 devid, struct block_device **bdev, fmode_t *mode)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 num_devices;
	int ret = 0;

	/*
	 * The device list in fs_devices is accessed without locks (neither
	 * uuid_mutex nor device_list_mutex) as it won't change on a mounted
	 * filesystem and another device rm cannot run.
	 */
	num_devices = btrfs_num_devices(fs_info);

	/* Make sure all RAID profiles still have enough devices afterwards */
	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
	if (ret)
		goto out;

	device = btrfs_find_device_by_devspec(fs_info, devid, device_path);

	if (IS_ERR(device)) {
		if (PTR_ERR(device) == -ENOENT &&
		    device_path && strcmp(device_path, "missing") == 0)
			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
		else
			ret = PTR_ERR(device);
		goto out;
	}

	if (btrfs_pinned_by_swapfile(fs_info, device)) {
		btrfs_warn_in_rcu(fs_info,
		  "cannot remove device %s (devid %llu) due to active swapfile",
				  rcu_str_deref(device->name), device->devid);
		ret = -ETXTBSY;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
		goto out;
	}

	/* Cannot remove the last writable device */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    fs_info->fs_devices->rw_devices == 1) {
		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
		goto out;
	}

	/* Stop new allocations on the device before relocating its chunks */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_del_init(&device->dev_alloc_list);
		device->fs_devices->rw_devices--;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	/* Shrinking to 0 relocates all chunks off the device */
	ret = btrfs_shrink_device(device, 0);
	if (!ret)
		btrfs_reada_remove_dev(device);
	if (ret)
		goto error_undo;

	/*
	 * TODO: the superblock still includes this device in its num_devices
	 * counter although write_all_supers() is not locked out. This
	 * could give a filesystem state which requires a degraded mount.
	 */
	ret = btrfs_rm_dev_item(device);
	if (ret)
		goto error_undo;

	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	btrfs_scrub_cancel_dev(device);

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
	 * the device supers. Whoever is writing all supers, should
	 * lock the device list mutex before getting the number of
	 * devices in the super block (super_copy). Conversely,
	 * whoever updates the number of devices in the super block
	 * (super_copy) should hold the device list mutex.
	 */

	/*
	 * In normal cases the cur_devices == fs_devices. But in case
	 * of deleting a seed device, the cur_devices should point to
	 * its own fs_devices listed under the fs_devices->seed_list.
	 */
	cur_devices = device->fs_devices;
	mutex_lock(&fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	cur_devices->num_devices--;
	cur_devices->total_devices--;
	/* Update total_devices of the parent fs_devices if it's seed */
	if (cur_devices != fs_devices)
		fs_devices->total_devices--;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		cur_devices->missing_devices--;

	btrfs_assign_next_active_device(device, NULL);

	if (device->bdev) {
		cur_devices->open_devices--;
		/* remove sysfs entry */
		btrfs_sysfs_remove_device(device);
	}

	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * At this point, the device is zero sized and detached from the
	 * devices list.  All that's left is to zero out the old supers and
	 * free the device.
	 *
	 * We cannot call btrfs_close_bdev() here because we're holding the sb
	 * write lock, and blkdev_put() will pull in the ->open_mutex on the
	 * block device and it's dependencies.  Instead just flush the device
	 * and let the caller do the final blkdev_put.
	 */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		btrfs_scratch_superblocks(fs_info, device->bdev,
					  device->name->str);
		if (device->bdev) {
			sync_blockdev(device->bdev);
			invalidate_bdev(device->bdev);
		}
	}

	*bdev = device->bdev;
	*mode = device->mode;
	/* Wait for RCU readers of the dev_list before freeing the device */
	synchronize_rcu();
	btrfs_free_device(device);

	/*
	 * This can happen if cur_devices is the private seed devices list.  We
	 * cannot call close_fs_devices() here because it expects the uuid_mutex
	 * to be held, but in fact we don't need that for the private
	 * seed_devices, we can simply decrement cur_devices->opened and then
	 * remove it from our list and free the fs_devices.
	 */
	if (cur_devices->num_devices == 0) {
		list_del_init(&cur_devices->seed_list);
		ASSERT(cur_devices->opened == 1);
		cur_devices->opened--;
		free_fs_devices(cur_devices);
	}

out:
	return ret;

error_undo:
	/* Put the device back on the allocation list and restore counters */
	btrfs_reada_undo_remove_dev(device);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_add(&device->dev_alloc_list,
			 &fs_devices->alloc_list);
		device->fs_devices->rw_devices++;
		mutex_unlock(&fs_info->chunk_mutex);
	}
	goto out;
}
2242a061fc8dSChris Mason 
/*
 * Unlink the replace source device @srcdev from its fs_devices and adjust
 * the device counters.  The device itself is closed and freed later by
 * btrfs_rm_dev_replace_free_srcdev().
 */
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices;

	lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);

	/*
	 * in case of fs with no seed, srcdev->fs_devices will point
	 * to fs_devices of fs_info. However when the dev being replaced is
	 * a seed dev it will point to the seed's local fs_devices. In short
	 * srcdev will have its correct fs_devices in both the cases.
	 */
	fs_devices = srcdev->fs_devices;

	list_del_rcu(&srcdev->dev_list);
	list_del(&srcdev->dev_alloc_list);
	fs_devices->num_devices--;
	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
		fs_devices->missing_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
		fs_devices->rw_devices--;

	if (srcdev->bdev)
		fs_devices->open_devices--;
}
2269084b6e7cSQu Wenruo 
/*
 * Close and free the replace source device, and free its fs_devices if it
 * was the last device of a seed filesystem.
 */
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;

	mutex_lock(&uuid_mutex);

	btrfs_close_bdev(srcdev);
	/* Wait for RCU readers of the device before freeing it */
	synchronize_rcu();
	btrfs_free_device(srcdev);

	/* if this is no devs we rather delete the fs_devices */
	if (!fs_devices->num_devices) {
		/*
		 * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target added to the sprout FS, so there will be no more
		 * device left under the seed FS.
		 */
		ASSERT(fs_devices->seeding);

		list_del_init(&fs_devices->seed_list);
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	mutex_unlock(&uuid_mutex);
}
2296e93c89c1SStefan Behrens 
/*
 * Remove and free the replace target device: unlink it from the device
 * list, scratch its superblocks and close the underlying block device.
 */
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
{
	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;

	mutex_lock(&fs_devices->device_list_mutex);

	btrfs_sysfs_remove_device(tgtdev);

	if (tgtdev->bdev)
		fs_devices->open_devices--;

	fs_devices->num_devices--;

	btrfs_assign_next_active_device(tgtdev, NULL);

	list_del_rcu(&tgtdev->dev_list);

	mutex_unlock(&fs_devices->device_list_mutex);

	/* Scratch + close happen outside the mutex, the device is unlinked */
	btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
				  tgtdev->name->str);

	btrfs_close_bdev(tgtdev);
	/* Wait for RCU readers of the dev_list before freeing the device */
	synchronize_rcu();
	btrfs_free_device(tgtdev);
}
2323e93c89c1SStefan Behrens 
2324b444ad46SNikolay Borisov static struct btrfs_device *btrfs_find_device_by_path(
2325b444ad46SNikolay Borisov 		struct btrfs_fs_info *fs_info, const char *device_path)
23267ba15b7dSStefan Behrens {
23277ba15b7dSStefan Behrens 	int ret = 0;
23287ba15b7dSStefan Behrens 	struct btrfs_super_block *disk_super;
23297ba15b7dSStefan Behrens 	u64 devid;
23307ba15b7dSStefan Behrens 	u8 *dev_uuid;
23317ba15b7dSStefan Behrens 	struct block_device *bdev;
2332b444ad46SNikolay Borisov 	struct btrfs_device *device;
23337ba15b7dSStefan Behrens 
23347ba15b7dSStefan Behrens 	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
23358f32380dSJohannes Thumshirn 				    fs_info->bdev_holder, 0, &bdev, &disk_super);
23367ba15b7dSStefan Behrens 	if (ret)
2337b444ad46SNikolay Borisov 		return ERR_PTR(ret);
23388f32380dSJohannes Thumshirn 
23397ba15b7dSStefan Behrens 	devid = btrfs_stack_device_id(&disk_super->dev_item);
23407ba15b7dSStefan Behrens 	dev_uuid = disk_super->dev_item.uuid;
23417239ff4bSNikolay Borisov 	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2342e4319cd9SAnand Jain 		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2343b2598edfSAnand Jain 					   disk_super->metadata_uuid);
23447239ff4bSNikolay Borisov 	else
2345e4319cd9SAnand Jain 		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2346b2598edfSAnand Jain 					   disk_super->fsid);
23477239ff4bSNikolay Borisov 
23488f32380dSJohannes Thumshirn 	btrfs_release_disk_super(disk_super);
2349b444ad46SNikolay Borisov 	if (!device)
2350b444ad46SNikolay Borisov 		device = ERR_PTR(-ENOENT);
23517ba15b7dSStefan Behrens 	blkdev_put(bdev, FMODE_READ);
2352b444ad46SNikolay Borisov 	return device;
23537ba15b7dSStefan Behrens }
23547ba15b7dSStefan Behrens 
23552b82032cSYan Zheng /*
23565c5c0df0SDavid Sterba  * Lookup a device given by device id, or the path if the id is 0.
23575c5c0df0SDavid Sterba  */
2358a27a94c2SNikolay Borisov struct btrfs_device *btrfs_find_device_by_devspec(
23596e927cebSAnand Jain 		struct btrfs_fs_info *fs_info, u64 devid,
23606e927cebSAnand Jain 		const char *device_path)
236124e0474bSAnand Jain {
2362a27a94c2SNikolay Borisov 	struct btrfs_device *device;
236324e0474bSAnand Jain 
23645c5c0df0SDavid Sterba 	if (devid) {
2365e4319cd9SAnand Jain 		device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
2366b2598edfSAnand Jain 					   NULL);
2367a27a94c2SNikolay Borisov 		if (!device)
2368a27a94c2SNikolay Borisov 			return ERR_PTR(-ENOENT);
23696e927cebSAnand Jain 		return device;
23706e927cebSAnand Jain 	}
23716e927cebSAnand Jain 
23726e927cebSAnand Jain 	if (!device_path || !device_path[0])
2373a27a94c2SNikolay Borisov 		return ERR_PTR(-EINVAL);
2374d95a830cSAnand Jain 
23756e927cebSAnand Jain 	if (strcmp(device_path, "missing") == 0) {
23766e927cebSAnand Jain 		/* Find first missing device */
2377d95a830cSAnand Jain 		list_for_each_entry(device, &fs_info->fs_devices->devices,
2378d95a830cSAnand Jain 				    dev_list) {
2379d95a830cSAnand Jain 			if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
23806e927cebSAnand Jain 				     &device->dev_state) && !device->bdev)
2381d95a830cSAnand Jain 				return device;
2382d95a830cSAnand Jain 		}
2383d95a830cSAnand Jain 		return ERR_PTR(-ENOENT);
2384d95a830cSAnand Jain 	}
23856e927cebSAnand Jain 
23866e927cebSAnand Jain 	return btrfs_find_device_by_path(fs_info, device_path);
238724e0474bSAnand Jain }
238824e0474bSAnand Jain 
/*
 * Do all the dirty work required for changing file system's UUID when
 * sprouting from a seed filesystem: move the current devices to a private
 * seed fs_devices list, reset fs_devices as the empty writable sprout and
 * give it a fresh random fsid.
 */
static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	lockdep_assert_held(&uuid_mutex);
	/* Only a filesystem marked as seed can be sprouted */
	if (!fs_devices->seeding)
		return -EINVAL;

	/*
	 * Private copy of the seed devices, anchored at
	 * fs_info->fs_devices->seed_list
	 */
	seed_devices = alloc_fs_devices(NULL, NULL);
	if (IS_ERR(seed_devices))
		return PTR_ERR(seed_devices);

	/*
	 * It's necessary to retain a copy of the original seed fs_devices in
	 * fs_uuids so that filesystems which have been seeded can successfully
	 * reference the seed device from open_seed_devices. This also supports
	 * multiple fs seed.
	 */
	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->fs_list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	mutex_lock(&fs_devices->device_list_mutex);
	/* Move all devices over to the private seed list */
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			      synchronize_rcu);
	list_for_each_entry(device, &seed_devices->devices, dev_list)
		device->fs_devices = seed_devices;

	/* fs_devices now represents the empty, writable sprout */
	fs_devices->seeding = false;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->missing_devices = 0;
	fs_devices->rotating = false;
	list_add(&seed_devices->seed_list, &fs_devices->seed_list);

	/* The sprout gets a new, random fsid */
	generate_random_uuid(fs_devices->fsid);
	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	mutex_unlock(&fs_devices->device_list_mutex);

	/* The sprout is writable, clear the seed flag in the superblock */
	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}
24572b82032cSYan Zheng 
24582b82032cSYan Zheng /*
245901327610SNicholas D Steeves  * Store the expected generation for seed devices in device items.
 *
 * Called from btrfs_init_new_device() after a seed filesystem has been
 * sprouted: walk every DEV_ITEM in the chunk tree and, for devices that
 * still belong to a seeding fs_devices, record the in-memory device
 * generation in the item so the seed can be validated when it is opened
 * again.  Returns 0 on success or a negative errno from the btree search.
24602b82032cSYan Zheng  */
24615c466629SDavid Sterba static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
24622b82032cSYan Zheng {
24635c466629SDavid Sterba 	struct btrfs_fs_info *fs_info = trans->fs_info;
24645b4aacefSJeff Mahoney 	struct btrfs_root *root = fs_info->chunk_root;
24652b82032cSYan Zheng 	struct btrfs_path *path;
24662b82032cSYan Zheng 	struct extent_buffer *leaf;
24672b82032cSYan Zheng 	struct btrfs_dev_item *dev_item;
24682b82032cSYan Zheng 	struct btrfs_device *device;
24692b82032cSYan Zheng 	struct btrfs_key key;
247044880fdcSAnand Jain 	u8 fs_uuid[BTRFS_FSID_SIZE];
24712b82032cSYan Zheng 	u8 dev_uuid[BTRFS_UUID_SIZE];
24722b82032cSYan Zheng 	u64 devid;
24732b82032cSYan Zheng 	int ret;
24742b82032cSYan Zheng 
24752b82032cSYan Zheng 	path = btrfs_alloc_path();
24762b82032cSYan Zheng 	if (!path)
24772b82032cSYan Zheng 		return -ENOMEM;
24782b82032cSYan Zheng 
	/* Start at the first possible DEV_ITEM key. */
24792b82032cSYan Zheng 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
24802b82032cSYan Zheng 	key.offset = 0;
24812b82032cSYan Zheng 	key.type = BTRFS_DEV_ITEM_KEY;
24822b82032cSYan Zheng 
24832b82032cSYan Zheng 	while (1) {
24842b82032cSYan Zheng 		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
24852b82032cSYan Zheng 		if (ret < 0)
24862b82032cSYan Zheng 			goto error;
24872b82032cSYan Zheng 
24882b82032cSYan Zheng 		leaf = path->nodes[0];
/* Walk the items of the current leaf one slot at a time. */
24892b82032cSYan Zheng next_slot:
24902b82032cSYan Zheng 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
24912b82032cSYan Zheng 			ret = btrfs_next_leaf(root, path);
24922b82032cSYan Zheng 			if (ret > 0)
24932b82032cSYan Zheng 				break;
24942b82032cSYan Zheng 			if (ret < 0)
24952b82032cSYan Zheng 				goto error;
24962b82032cSYan Zheng 			leaf = path->nodes[0];
24972b82032cSYan Zheng 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			/* Re-search from the new key with a fresh path. */
2498b3b4aa74SDavid Sterba 			btrfs_release_path(path);
24992b82032cSYan Zheng 			continue;
25002b82032cSYan Zheng 		}
25012b82032cSYan Zheng 
25022b82032cSYan Zheng 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
25032b82032cSYan Zheng 		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
25042b82032cSYan Zheng 		    key.type != BTRFS_DEV_ITEM_KEY)
25052b82032cSYan Zheng 			break;
25062b82032cSYan Zheng 
25072b82032cSYan Zheng 		dev_item = btrfs_item_ptr(leaf, path->slots[0],
25082b82032cSYan Zheng 					  struct btrfs_dev_item);
25092b82032cSYan Zheng 		devid = btrfs_device_id(leaf, dev_item);
2510410ba3a2SGeert Uytterhoeven 		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
25112b82032cSYan Zheng 				   BTRFS_UUID_SIZE);
25121473b24eSGeert Uytterhoeven 		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
251344880fdcSAnand Jain 				   BTRFS_FSID_SIZE);
2514e4319cd9SAnand Jain 		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2515b2598edfSAnand Jain 					   fs_uuid);
		/*
		 * Every on-disk DEV_ITEM must have a matching in-memory
		 * device at this point; a miss indicates corrupt metadata.
		 */
251679787eaaSJeff Mahoney 		BUG_ON(!device); /* Logic error */
25172b82032cSYan Zheng 
		/* Only devices of a seeding fs get their generation stored. */
25182b82032cSYan Zheng 		if (device->fs_devices->seeding) {
25192b82032cSYan Zheng 			btrfs_set_device_generation(leaf, dev_item,
25202b82032cSYan Zheng 						    device->generation);
25212b82032cSYan Zheng 			btrfs_mark_buffer_dirty(leaf);
25222b82032cSYan Zheng 		}
25232b82032cSYan Zheng 
25242b82032cSYan Zheng 		path->slots[0]++;
25252b82032cSYan Zheng 		goto next_slot;
25262b82032cSYan Zheng 	}
25272b82032cSYan Zheng 	ret = 0;
25282b82032cSYan Zheng error:
25292b82032cSYan Zheng 	btrfs_free_path(path);
25302b82032cSYan Zheng 	return ret;
25312b82032cSYan Zheng }
25322b82032cSYan Zheng 
/*
 * Add a new device at @device_path to a mounted filesystem.
 *
 * Opens the block device exclusively, allocates a btrfs_device for it,
 * links it into fs_info->fs_devices, accounts its size in the superblock
 * and persists a DEV_ITEM via btrfs_add_dev_item().  If the filesystem is
 * currently a seed, this additionally sprouts a new writable filesystem
 * (btrfs_prepare_sprout()/btrfs_finish_sprout()) while holding
 * sb->s_umount and uuid_mutex, and relocates SYSTEM chunks afterwards.
 *
 * Returns 0 on success or a negative errno; on failure the error labels
 * below roll back the in-memory and superblock accounting in reverse
 * order of the setup above.
 */
2533da353f6bSDavid Sterba int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2534788f20ebSChris Mason {
25355112febbSJeff Mahoney 	struct btrfs_root *root = fs_info->dev_root;
2536d5e2003cSJosef Bacik 	struct request_queue *q;
2537788f20ebSChris Mason 	struct btrfs_trans_handle *trans;
2538788f20ebSChris Mason 	struct btrfs_device *device;
2539788f20ebSChris Mason 	struct block_device *bdev;
25400b246afaSJeff Mahoney 	struct super_block *sb = fs_info->sb;
2541606686eeSJosef Bacik 	struct rcu_string *name;
25425da54bc1SAnand Jain 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
254339379faaSNaohiro Aota 	u64 orig_super_total_bytes;
254439379faaSNaohiro Aota 	u64 orig_super_num_devices;
25452b82032cSYan Zheng 	int seeding_dev = 0;
2546788f20ebSChris Mason 	int ret = 0;
254744cab9baSNikolay Borisov 	bool locked = false;
2548788f20ebSChris Mason 
	/* A read-only mount can only gain a device if we are sprouting a seed. */
25495da54bc1SAnand Jain 	if (sb_rdonly(sb) && !fs_devices->seeding)
2550f8c5d0b4SLiu Bo 		return -EROFS;
2551788f20ebSChris Mason 
2552a5d16333SLi Zefan 	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
25530b246afaSJeff Mahoney 				  fs_info->bdev_holder);
25547f59203aSJosef Bacik 	if (IS_ERR(bdev))
25557f59203aSJosef Bacik 		return PTR_ERR(bdev);
2556a2135011SChris Mason 
2557b70f5097SNaohiro Aota 	if (!btrfs_check_device_zone_type(fs_info, bdev)) {
2558b70f5097SNaohiro Aota 		ret = -EINVAL;
2559b70f5097SNaohiro Aota 		goto error;
2560b70f5097SNaohiro Aota 	}
2561b70f5097SNaohiro Aota 
	/*
	 * Sprouting changes the fsid and remounts writable; serialize
	 * against umount and device scanning for the whole operation.
	 */
25625da54bc1SAnand Jain 	if (fs_devices->seeding) {
25632b82032cSYan Zheng 		seeding_dev = 1;
25642b82032cSYan Zheng 		down_write(&sb->s_umount);
25652b82032cSYan Zheng 		mutex_lock(&uuid_mutex);
256644cab9baSNikolay Borisov 		locked = true;
25672b82032cSYan Zheng 	}
25682b82032cSYan Zheng 
2569b9ba017fSNikolay Borisov 	sync_blockdev(bdev);
2570a2135011SChris Mason 
	/* Reject the device if it is already a member of this filesystem. */
2571f4cfa9bdSNikolay Borisov 	rcu_read_lock();
2572f4cfa9bdSNikolay Borisov 	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
2573788f20ebSChris Mason 		if (device->bdev == bdev) {
2574788f20ebSChris Mason 			ret = -EEXIST;
2575f4cfa9bdSNikolay Borisov 			rcu_read_unlock();
25762b82032cSYan Zheng 			goto error;
2577788f20ebSChris Mason 		}
2578788f20ebSChris Mason 	}
2579f4cfa9bdSNikolay Borisov 	rcu_read_unlock();
2580788f20ebSChris Mason 
25810b246afaSJeff Mahoney 	device = btrfs_alloc_device(fs_info, NULL, NULL);
258212bd2fc0SIlya Dryomov 	if (IS_ERR(device)) {
2583788f20ebSChris Mason 		/* we can safely leave the fs_devices entry around */
258412bd2fc0SIlya Dryomov 		ret = PTR_ERR(device);
25852b82032cSYan Zheng 		goto error;
2586788f20ebSChris Mason 	}
2587788f20ebSChris Mason 
258878f2c9e6SDavid Sterba 	name = rcu_string_strdup(device_path, GFP_KERNEL);
2589606686eeSJosef Bacik 	if (!name) {
25902b82032cSYan Zheng 		ret = -ENOMEM;
25915c4cf6c9SDavid Sterba 		goto error_free_device;
2592788f20ebSChris Mason 	}
2593606686eeSJosef Bacik 	rcu_assign_pointer(device->name, name);
25942b82032cSYan Zheng 
25955b316468SNaohiro Aota 	device->fs_info = fs_info;
25965b316468SNaohiro Aota 	device->bdev = bdev;
25975b316468SNaohiro Aota 
25985b316468SNaohiro Aota 	ret = btrfs_get_dev_zone_info(device);
25995b316468SNaohiro Aota 	if (ret)
26005b316468SNaohiro Aota 		goto error_free_device;
26015b316468SNaohiro Aota 
2602a22285a6SYan, Zheng 	trans = btrfs_start_transaction(root, 0);
260398d5dc13STsutomu Itoh 	if (IS_ERR(trans)) {
260498d5dc13STsutomu Itoh 		ret = PTR_ERR(trans);
26055b316468SNaohiro Aota 		goto error_free_zone;
260698d5dc13STsutomu Itoh 	}
260798d5dc13STsutomu Itoh 
	/* Initialize the in-memory device; sizes are sector aligned. */
2608d5e2003cSJosef Bacik 	q = bdev_get_queue(bdev);
2609ebbede42SAnand Jain 	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
26102b82032cSYan Zheng 	device->generation = trans->transid;
26110b246afaSJeff Mahoney 	device->io_width = fs_info->sectorsize;
26120b246afaSJeff Mahoney 	device->io_align = fs_info->sectorsize;
26130b246afaSJeff Mahoney 	device->sector_size = fs_info->sectorsize;
26147dfb8be1SNikolay Borisov 	device->total_bytes = round_down(i_size_read(bdev->bd_inode),
26157dfb8be1SNikolay Borisov 					 fs_info->sectorsize);
26162cc3c559SYan Zheng 	device->disk_total_bytes = device->total_bytes;
2617935e5cc9SMiao Xie 	device->commit_total_bytes = device->total_bytes;
2618e12c9621SAnand Jain 	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2619401e29c1SAnand Jain 	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2620fb01aa85SIlya Dryomov 	device->mode = FMODE_EXCL;
262127087f37SStefan Behrens 	device->dev_stats_valid = 1;
26229f6d2510SDavid Sterba 	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2623325cd4baSZheng Yan 
	/* Sprout: turn the read-only seed into a new writable filesystem. */
26242b82032cSYan Zheng 	if (seeding_dev) {
2625a0a1db70SFilipe Manana 		btrfs_clear_sb_rdonly(sb);
26262ff7e61eSJeff Mahoney 		ret = btrfs_prepare_sprout(fs_info);
2627d31c32f6SAnand Jain 		if (ret) {
2628d31c32f6SAnand Jain 			btrfs_abort_transaction(trans, ret);
2629d31c32f6SAnand Jain 			goto error_trans;
2630d31c32f6SAnand Jain 		}
2631b7cb29e6SAnand Jain 		btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
2632b7cb29e6SAnand Jain 						device);
26332b82032cSYan Zheng 	}
26342b82032cSYan Zheng 
26355da54bc1SAnand Jain 	device->fs_devices = fs_devices;
2636e5e9a520SChris Mason 
	/*
	 * Publish the device and update all accounting; lock order is
	 * device_list_mutex then chunk_mutex.
	 */
26375da54bc1SAnand Jain 	mutex_lock(&fs_devices->device_list_mutex);
263834441361SDavid Sterba 	mutex_lock(&fs_info->chunk_mutex);
26395da54bc1SAnand Jain 	list_add_rcu(&device->dev_list, &fs_devices->devices);
26405da54bc1SAnand Jain 	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
26415da54bc1SAnand Jain 	fs_devices->num_devices++;
26425da54bc1SAnand Jain 	fs_devices->open_devices++;
26435da54bc1SAnand Jain 	fs_devices->rw_devices++;
26445da54bc1SAnand Jain 	fs_devices->total_devices++;
26455da54bc1SAnand Jain 	fs_devices->total_rw_bytes += device->total_bytes;
26462b82032cSYan Zheng 
2647a5ed45f8SNikolay Borisov 	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
26482bf64758SJosef Bacik 
2649e884f4f0SAnand Jain 	if (!blk_queue_nonrot(q))
26507f0432d0SJohannes Thumshirn 		fs_devices->rotating = true;
2651c289811cSChris Mason 
	/* Remember the old superblock values for rollback on error. */
265239379faaSNaohiro Aota 	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
26530b246afaSJeff Mahoney 	btrfs_set_super_total_bytes(fs_info->super_copy,
265439379faaSNaohiro Aota 		round_down(orig_super_total_bytes + device->total_bytes,
265539379faaSNaohiro Aota 			   fs_info->sectorsize));
2656788f20ebSChris Mason 
265739379faaSNaohiro Aota 	orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
265839379faaSNaohiro Aota 	btrfs_set_super_num_devices(fs_info->super_copy,
265939379faaSNaohiro Aota 				    orig_super_num_devices + 1);
26600d39376aSAnand Jain 
26612196d6e8SMiao Xie 	/*
26622196d6e8SMiao Xie 	 * we've got more storage, clear any full flags on the space
26632196d6e8SMiao Xie 	 * infos
26642196d6e8SMiao Xie 	 */
26650b246afaSJeff Mahoney 	btrfs_clear_space_info_full(fs_info);
26662196d6e8SMiao Xie 
266734441361SDavid Sterba 	mutex_unlock(&fs_info->chunk_mutex);
2668ca10845aSJosef Bacik 
2669ca10845aSJosef Bacik 	/* Add sysfs device entry */
2670cd36da2eSAnand Jain 	btrfs_sysfs_add_device(device);
2671ca10845aSJosef Bacik 
26725da54bc1SAnand Jain 	mutex_unlock(&fs_devices->device_list_mutex);
2673788f20ebSChris Mason 
26742b82032cSYan Zheng 	if (seeding_dev) {
267534441361SDavid Sterba 		mutex_lock(&fs_info->chunk_mutex);
26766f8e0fc7SDavid Sterba 		ret = init_first_rw_device(trans);
267734441361SDavid Sterba 		mutex_unlock(&fs_info->chunk_mutex);
2678005d6427SDavid Sterba 		if (ret) {
267966642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
2680d31c32f6SAnand Jain 			goto error_sysfs;
2681005d6427SDavid Sterba 		}
26822196d6e8SMiao Xie 	}
26832196d6e8SMiao Xie 
26848e87e856SNikolay Borisov 	ret = btrfs_add_dev_item(trans, device);
26852196d6e8SMiao Xie 	if (ret) {
268666642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
2687d31c32f6SAnand Jain 		goto error_sysfs;
26882196d6e8SMiao Xie 	}
26892196d6e8SMiao Xie 
26902196d6e8SMiao Xie 	if (seeding_dev) {
26915c466629SDavid Sterba 		ret = btrfs_finish_sprout(trans);
2692005d6427SDavid Sterba 		if (ret) {
269366642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
2694d31c32f6SAnand Jain 			goto error_sysfs;
2695005d6427SDavid Sterba 		}
2696b2373f25SAnand Jain 
26978e560081SNikolay Borisov 		/*
26988e560081SNikolay Borisov 		 * fs_devices now represents the newly sprouted filesystem and
26998e560081SNikolay Borisov 		 * its fsid has been changed by btrfs_prepare_sprout
27008e560081SNikolay Borisov 		 */
27018e560081SNikolay Borisov 		btrfs_sysfs_update_sprout_fsid(fs_devices);
2702005d6427SDavid Sterba 	}
27032b82032cSYan Zheng 
27043a45bb20SJeff Mahoney 	ret = btrfs_commit_transaction(trans);
27052b82032cSYan Zheng 
27062b82032cSYan Zheng 	if (seeding_dev) {
27072b82032cSYan Zheng 		mutex_unlock(&uuid_mutex);
27082b82032cSYan Zheng 		up_write(&sb->s_umount);
270944cab9baSNikolay Borisov 		locked = false;
27102b82032cSYan Zheng 
271179787eaaSJeff Mahoney 		if (ret) /* transaction commit */
271279787eaaSJeff Mahoney 			return ret;
271379787eaaSJeff Mahoney 
		/* SYSTEM chunks still live on the seed; move them over. */
27142ff7e61eSJeff Mahoney 		ret = btrfs_relocate_sys_chunks(fs_info);
271579787eaaSJeff Mahoney 		if (ret < 0)
27160b246afaSJeff Mahoney 			btrfs_handle_fs_error(fs_info, ret,
27175d163e0eSJeff Mahoney 				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2718671415b7SMiao Xie 		trans = btrfs_attach_transaction(root);
2719671415b7SMiao Xie 		if (IS_ERR(trans)) {
			/* -ENOENT: nothing to commit, relocation did it all. */
2720671415b7SMiao Xie 			if (PTR_ERR(trans) == -ENOENT)
2721671415b7SMiao Xie 				return 0;
27227132a262SAnand Jain 			ret = PTR_ERR(trans);
27237132a262SAnand Jain 			trans = NULL;
27247132a262SAnand Jain 			goto error_sysfs;
2725671415b7SMiao Xie 		}
27263a45bb20SJeff Mahoney 		ret = btrfs_commit_transaction(trans);
27272b82032cSYan Zheng 	}
2728c9e9f97bSIlya Dryomov 
27297f551d96SAnand Jain 	/*
27307f551d96SAnand Jain 	 * Now that we have written a new super block to this device, check all
27317f551d96SAnand Jain 	 * other fs_devices list if device_path alienates any other scanned
27327f551d96SAnand Jain 	 * device.
27337f551d96SAnand Jain 	 * We can ignore the return value as it typically returns -EINVAL and
27347f551d96SAnand Jain 	 * only succeeds if the device was an alien.
27357f551d96SAnand Jain 	 */
27367f551d96SAnand Jain 	btrfs_forget_devices(device_path);
27377f551d96SAnand Jain 
27387f551d96SAnand Jain 	/* Update ctime/mtime for blkid or udev */
27398f96a5bfSJosef Bacik 	update_dev_time(bdev);
27407f551d96SAnand Jain 
2741788f20ebSChris Mason 	return ret;
274279787eaaSJeff Mahoney 
	/* Roll back the in-memory and superblock accounting done above. */
2743d31c32f6SAnand Jain error_sysfs:
274453f8a74cSAnand Jain 	btrfs_sysfs_remove_device(device);
274539379faaSNaohiro Aota 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
274639379faaSNaohiro Aota 	mutex_lock(&fs_info->chunk_mutex);
274739379faaSNaohiro Aota 	list_del_rcu(&device->dev_list);
274839379faaSNaohiro Aota 	list_del(&device->dev_alloc_list);
274939379faaSNaohiro Aota 	fs_info->fs_devices->num_devices--;
275039379faaSNaohiro Aota 	fs_info->fs_devices->open_devices--;
275139379faaSNaohiro Aota 	fs_info->fs_devices->rw_devices--;
275239379faaSNaohiro Aota 	fs_info->fs_devices->total_devices--;
275339379faaSNaohiro Aota 	fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
275439379faaSNaohiro Aota 	atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
275539379faaSNaohiro Aota 	btrfs_set_super_total_bytes(fs_info->super_copy,
275639379faaSNaohiro Aota 				    orig_super_total_bytes);
275739379faaSNaohiro Aota 	btrfs_set_super_num_devices(fs_info->super_copy,
275839379faaSNaohiro Aota 				    orig_super_num_devices);
275939379faaSNaohiro Aota 	mutex_unlock(&fs_info->chunk_mutex);
276039379faaSNaohiro Aota 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
276179787eaaSJeff Mahoney error_trans:
27620af2c4bfSAnand Jain 	if (seeding_dev)
2763a0a1db70SFilipe Manana 		btrfs_set_sb_rdonly(sb);
27647132a262SAnand Jain 	if (trans)
27653a45bb20SJeff Mahoney 		btrfs_end_transaction(trans);
27665b316468SNaohiro Aota error_free_zone:
27675b316468SNaohiro Aota 	btrfs_destroy_dev_zone_info(device);
27685c4cf6c9SDavid Sterba error_free_device:
2769a425f9d4SDavid Sterba 	btrfs_free_device(device);
27702b82032cSYan Zheng error:
2771e525fd89STejun Heo 	blkdev_put(bdev, FMODE_EXCL);
277244cab9baSNikolay Borisov 	if (locked) {
27732b82032cSYan Zheng 		mutex_unlock(&uuid_mutex);
27742b82032cSYan Zheng 		up_write(&sb->s_umount);
27752b82032cSYan Zheng 	}
2776c9e9f97bSIlya Dryomov 	return ret;
2777788f20ebSChris Mason }
2778788f20ebSChris Mason 
/*
 * Sync the in-memory state of @device into its DEV_ITEM in the chunk
 * tree.  Returns 0 on success, -ENOENT if the item does not exist, or
 * another negative errno from the btree search.
 */
2779d397712bSChris Mason static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
27800b86a832SChris Mason 					struct btrfs_device *device)
27810b86a832SChris Mason {
27820b86a832SChris Mason 	int ret;
27830b86a832SChris Mason 	struct btrfs_path *path;
27840b246afaSJeff Mahoney 	struct btrfs_root *root = device->fs_info->chunk_root;
27850b86a832SChris Mason 	struct btrfs_dev_item *dev_item;
27860b86a832SChris Mason 	struct extent_buffer *leaf;
27870b86a832SChris Mason 	struct btrfs_key key;
27880b86a832SChris Mason 
27890b86a832SChris Mason 	path = btrfs_alloc_path();
27900b86a832SChris Mason 	if (!path)
27910b86a832SChris Mason 		return -ENOMEM;
27920b86a832SChris Mason 
27930b86a832SChris Mason 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
27940b86a832SChris Mason 	key.type = BTRFS_DEV_ITEM_KEY;
27950b86a832SChris Mason 	key.offset = device->devid;
27960b86a832SChris Mason 
27970b86a832SChris Mason 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
27980b86a832SChris Mason 	if (ret < 0)
27990b86a832SChris Mason 		goto out;
28000b86a832SChris Mason 
	/* ret > 0 means the exact key was not found. */
28010b86a832SChris Mason 	if (ret > 0) {
28020b86a832SChris Mason 		ret = -ENOENT;
28030b86a832SChris Mason 		goto out;
28040b86a832SChris Mason 	}
28050b86a832SChris Mason 
28060b86a832SChris Mason 	leaf = path->nodes[0];
28070b86a832SChris Mason 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
28080b86a832SChris Mason 
	/* Copy the current in-memory values into the on-disk item. */
28090b86a832SChris Mason 	btrfs_set_device_id(leaf, dev_item, device->devid);
28100b86a832SChris Mason 	btrfs_set_device_type(leaf, dev_item, device->type);
28110b86a832SChris Mason 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
28120b86a832SChris Mason 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
28130b86a832SChris Mason 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
28147cc8e58dSMiao Xie 	btrfs_set_device_total_bytes(leaf, dev_item,
28157cc8e58dSMiao Xie 				     btrfs_device_get_disk_total_bytes(device));
28167cc8e58dSMiao Xie 	btrfs_set_device_bytes_used(leaf, dev_item,
28177cc8e58dSMiao Xie 				    btrfs_device_get_bytes_used(device));
28180b86a832SChris Mason 	btrfs_mark_buffer_dirty(leaf);
28190b86a832SChris Mason 
28200b86a832SChris Mason out:
28210b86a832SChris Mason 	btrfs_free_path(path);
28220b86a832SChris Mason 	return ret;
28230b86a832SChris Mason }
28240b86a832SChris Mason 
/*
 * Grow @device to @new_size (rounded down to the fs sector size) and
 * bump the superblock's total_bytes by the difference.
 *
 * The device must be writeable and must not be a replace target, and
 * @new_size must be strictly larger than the current size.  The device
 * is queued on the transaction's dev_update_list so the new sizes reach
 * disk with the commit.  Returns 0, -EACCES, -EINVAL, or an errno from
 * btrfs_update_device().
 */
28252196d6e8SMiao Xie int btrfs_grow_device(struct btrfs_trans_handle *trans,
28268f18cf13SChris Mason 		      struct btrfs_device *device, u64 new_size)
28278f18cf13SChris Mason {
28280b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = device->fs_info;
28290b246afaSJeff Mahoney 	struct btrfs_super_block *super_copy = fs_info->super_copy;
28302196d6e8SMiao Xie 	u64 old_total;
28312196d6e8SMiao Xie 	u64 diff;
28328f18cf13SChris Mason 
2833ebbede42SAnand Jain 	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
28342b82032cSYan Zheng 		return -EACCES;
28352196d6e8SMiao Xie 
28367dfb8be1SNikolay Borisov 	new_size = round_down(new_size, fs_info->sectorsize);
28377dfb8be1SNikolay Borisov 
	/* chunk_mutex protects the superblock copy and device sizes. */
283834441361SDavid Sterba 	mutex_lock(&fs_info->chunk_mutex);
28392196d6e8SMiao Xie 	old_total = btrfs_super_total_bytes(super_copy);
28400e4324a4SNikolay Borisov 	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
28412196d6e8SMiao Xie 
284263a212abSStefan Behrens 	if (new_size <= device->total_bytes ||
2843401e29c1SAnand Jain 	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
284434441361SDavid Sterba 		mutex_unlock(&fs_info->chunk_mutex);
28452b82032cSYan Zheng 		return -EINVAL;
28462196d6e8SMiao Xie 	}
28472b82032cSYan Zheng 
28487dfb8be1SNikolay Borisov 	btrfs_set_super_total_bytes(super_copy,
28497dfb8be1SNikolay Borisov 			round_down(old_total + diff, fs_info->sectorsize));
28502b82032cSYan Zheng 	device->fs_devices->total_rw_bytes += diff;
28512b82032cSYan Zheng 
28527cc8e58dSMiao Xie 	btrfs_device_set_total_bytes(device, new_size);
28537cc8e58dSMiao Xie 	btrfs_device_set_disk_total_bytes(device, new_size);
2854fb456252SJeff Mahoney 	btrfs_clear_space_info_full(device->fs_info);
	/* Queue the device so the commit writes out the new sizes. */
2855bbbf7243SNikolay Borisov 	if (list_empty(&device->post_commit_list))
2856bbbf7243SNikolay Borisov 		list_add_tail(&device->post_commit_list,
2857bbbf7243SNikolay Borisov 			      &trans->transaction->dev_update_list);
285834441361SDavid Sterba 	mutex_unlock(&fs_info->chunk_mutex);
28594184ea7fSChris Mason 
28608f18cf13SChris Mason 	return btrfs_update_device(trans, device);
28618f18cf13SChris Mason }
28628f18cf13SChris Mason 
/*
 * Delete the CHUNK_ITEM at @chunk_offset from the chunk tree.  A missing
 * item is escalated as a filesystem error and returned as -ENOENT.
 */
2863f4208794SNikolay Borisov static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
28648f18cf13SChris Mason {
2865f4208794SNikolay Borisov 	struct btrfs_fs_info *fs_info = trans->fs_info;
28665b4aacefSJeff Mahoney 	struct btrfs_root *root = fs_info->chunk_root;
28678f18cf13SChris Mason 	int ret;
28688f18cf13SChris Mason 	struct btrfs_path *path;
28698f18cf13SChris Mason 	struct btrfs_key key;
28708f18cf13SChris Mason 
28718f18cf13SChris Mason 	path = btrfs_alloc_path();
28728f18cf13SChris Mason 	if (!path)
28738f18cf13SChris Mason 		return -ENOMEM;
28748f18cf13SChris Mason 
2875408fbf19SNikolay Borisov 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
28768f18cf13SChris Mason 	key.offset = chunk_offset;
28778f18cf13SChris Mason 	key.type = BTRFS_CHUNK_ITEM_KEY;
28788f18cf13SChris Mason 
28798f18cf13SChris Mason 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
288079787eaaSJeff Mahoney 	if (ret < 0)
288179787eaaSJeff Mahoney 		goto out;
288279787eaaSJeff Mahoney 	else if (ret > 0) { /* Logic error or corruption */
28830b246afaSJeff Mahoney 		btrfs_handle_fs_error(fs_info, -ENOENT,
288479787eaaSJeff Mahoney 				      "Failed lookup while freeing chunk.");
288579787eaaSJeff Mahoney 		ret = -ENOENT;
288679787eaaSJeff Mahoney 		goto out;
288779787eaaSJeff Mahoney 	}
28888f18cf13SChris Mason 
28898f18cf13SChris Mason 	ret = btrfs_del_item(trans, root, path);
289079787eaaSJeff Mahoney 	if (ret < 0)
28910b246afaSJeff Mahoney 		btrfs_handle_fs_error(fs_info, ret,
289279787eaaSJeff Mahoney 				      "Failed to delete chunk item.");
289379787eaaSJeff Mahoney out:
28948f18cf13SChris Mason 	btrfs_free_path(path);
289565a246c5STsutomu Itoh 	return ret;
28968f18cf13SChris Mason }
28978f18cf13SChris Mason 
/*
 * Remove the chunk at @chunk_offset from the superblock's in-memory
 * sys_chunk_array (the array of SYSTEM chunks embedded in the super
 * block).  Caller must hold fs_info->chunk_mutex (asserted).  Returns 0
 * on success or -EIO if a key other than a CHUNK_ITEM is found in the
 * array, which indicates corruption.
 */
2898408fbf19SNikolay Borisov static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
28998f18cf13SChris Mason {
29000b246afaSJeff Mahoney 	struct btrfs_super_block *super_copy = fs_info->super_copy;
29018f18cf13SChris Mason 	struct btrfs_disk_key *disk_key;
29028f18cf13SChris Mason 	struct btrfs_chunk *chunk;
29038f18cf13SChris Mason 	u8 *ptr;
29048f18cf13SChris Mason 	int ret = 0;
29058f18cf13SChris Mason 	u32 num_stripes;
29068f18cf13SChris Mason 	u32 array_size;
29078f18cf13SChris Mason 	u32 len = 0;
29088f18cf13SChris Mason 	u32 cur;
29098f18cf13SChris Mason 	struct btrfs_key key;
29108f18cf13SChris Mason 
291179bd3712SFilipe Manana 	lockdep_assert_held(&fs_info->chunk_mutex);
29128f18cf13SChris Mason 	array_size = btrfs_super_sys_array_size(super_copy);
29138f18cf13SChris Mason 
29148f18cf13SChris Mason 	ptr = super_copy->sys_chunk_array;
29158f18cf13SChris Mason 	cur = 0;
29168f18cf13SChris Mason 
	/*
	 * The array is a packed sequence of (disk_key, chunk) pairs;
	 * chunk size varies with its stripe count.
	 */
29178f18cf13SChris Mason 	while (cur < array_size) {
29188f18cf13SChris Mason 		disk_key = (struct btrfs_disk_key *)ptr;
29198f18cf13SChris Mason 		btrfs_disk_key_to_cpu(&key, disk_key);
29208f18cf13SChris Mason 
29218f18cf13SChris Mason 		len = sizeof(*disk_key);
29228f18cf13SChris Mason 
29238f18cf13SChris Mason 		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
29248f18cf13SChris Mason 			chunk = (struct btrfs_chunk *)(ptr + len);
29258f18cf13SChris Mason 			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
29268f18cf13SChris Mason 			len += btrfs_chunk_item_size(num_stripes);
29278f18cf13SChris Mason 		} else {
29288f18cf13SChris Mason 			ret = -EIO;
29298f18cf13SChris Mason 			break;
29308f18cf13SChris Mason 		}
		/* Found the target chunk: compact the array over it. */
2931408fbf19SNikolay Borisov 		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
29328f18cf13SChris Mason 		    key.offset == chunk_offset) {
29338f18cf13SChris Mason 			memmove(ptr, ptr + len, array_size - (cur + len));
29348f18cf13SChris Mason 			array_size -= len;
29358f18cf13SChris Mason 			btrfs_set_super_sys_array_size(super_copy, array_size);
29368f18cf13SChris Mason 		} else {
29378f18cf13SChris Mason 			ptr += len;
29388f18cf13SChris Mason 			cur += len;
29398f18cf13SChris Mason 		}
29408f18cf13SChris Mason 	}
29418f18cf13SChris Mason 	return ret;
29428f18cf13SChris Mason }
29438f18cf13SChris Mason 
294560ca842eSOmar Sandoval /*
294660ca842eSOmar Sandoval  * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
 * @fs_info: Filesystem whose mapping tree is searched.
294760ca842eSOmar Sandoval  * @logical: Logical block offset in bytes.
294860ca842eSOmar Sandoval  * @length: Length of extent in bytes.
294960ca842eSOmar Sandoval  *
295060ca842eSOmar Sandoval  * Return: Chunk mapping or ERR_PTR.
295160ca842eSOmar Sandoval  */
295260ca842eSOmar Sandoval struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2952592d92eeSLiu Bo 				       u64 logical, u64 length)
2953592d92eeSLiu Bo {
2954592d92eeSLiu Bo 	struct extent_map_tree *em_tree;
2955592d92eeSLiu Bo 	struct extent_map *em;
2956592d92eeSLiu Bo 
2957c8bf1b67SDavid Sterba 	em_tree = &fs_info->mapping_tree;
2958592d92eeSLiu Bo 	read_lock(&em_tree->lock);
2959592d92eeSLiu Bo 	em = lookup_extent_mapping(em_tree, logical, length);
2960592d92eeSLiu Bo 	read_unlock(&em_tree->lock);
2961592d92eeSLiu Bo 
	/* No mapping at all for this range: metadata is inconsistent. */
2962592d92eeSLiu Bo 	if (!em) {
2963592d92eeSLiu Bo 		btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2964592d92eeSLiu Bo 			   logical, length);
2965592d92eeSLiu Bo 		return ERR_PTR(-EINVAL);
2966592d92eeSLiu Bo 	}
2967592d92eeSLiu Bo 
	/* The mapping must fully contain the requested start offset. */
2968592d92eeSLiu Bo 	if (em->start > logical || em->start + em->len < logical) {
2969592d92eeSLiu Bo 		btrfs_crit(fs_info,
2970592d92eeSLiu Bo 			   "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2971592d92eeSLiu Bo 			   logical, length, em->start, em->start + em->len);
2972592d92eeSLiu Bo 		free_extent_map(em);
2973592d92eeSLiu Bo 		return ERR_PTR(-EINVAL);
2974592d92eeSLiu Bo 	}
2975592d92eeSLiu Bo 
2976592d92eeSLiu Bo 	/* callers are responsible for dropping em's ref. */
2977592d92eeSLiu Bo 	return em;
2978592d92eeSLiu Bo }
2979592d92eeSLiu Bo 
/*
 * Write back the device items for all stripes of @map, then delete the
 * chunk item at @chunk_offset from the chunk tree.  Returns 0 or the
 * first error from btrfs_update_device()/btrfs_free_chunk().
 */
298079bd3712SFilipe Manana static int remove_chunk_item(struct btrfs_trans_handle *trans,
298179bd3712SFilipe Manana 			     struct map_lookup *map, u64 chunk_offset)
298279bd3712SFilipe Manana {
298379bd3712SFilipe Manana 	int i;
298479bd3712SFilipe Manana 
298579bd3712SFilipe Manana 	/*
298679bd3712SFilipe Manana 	 * Removing chunk items and updating the device items in the chunks btree
298779bd3712SFilipe Manana 	 * requires holding the chunk_mutex.
298879bd3712SFilipe Manana 	 * See the comment at btrfs_chunk_alloc() for the details.
298979bd3712SFilipe Manana 	 */
299079bd3712SFilipe Manana 	lockdep_assert_held(&trans->fs_info->chunk_mutex);
299179bd3712SFilipe Manana 
299279bd3712SFilipe Manana 	for (i = 0; i < map->num_stripes; i++) {
299379bd3712SFilipe Manana 		int ret;
299479bd3712SFilipe Manana 
299579bd3712SFilipe Manana 		ret = btrfs_update_device(trans, map->stripes[i].dev);
299679bd3712SFilipe Manana 		if (ret)
299779bd3712SFilipe Manana 			return ret;
299879bd3712SFilipe Manana 	}
299979bd3712SFilipe Manana 
300079bd3712SFilipe Manana 	return btrfs_free_chunk(trans, chunk_offset);
300179bd3712SFilipe Manana }
300279bd3712SFilipe Manana 
300397aff912SNikolay Borisov int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
300447ab2a6cSJosef Bacik {
300597aff912SNikolay Borisov 	struct btrfs_fs_info *fs_info = trans->fs_info;
300647ab2a6cSJosef Bacik 	struct extent_map *em;
300747ab2a6cSJosef Bacik 	struct map_lookup *map;
300847ab2a6cSJosef Bacik 	u64 dev_extent_len = 0;
300947ab2a6cSJosef Bacik 	int i, ret = 0;
30100b246afaSJeff Mahoney 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
301147ab2a6cSJosef Bacik 
301260ca842eSOmar Sandoval 	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
3013592d92eeSLiu Bo 	if (IS_ERR(em)) {
301447ab2a6cSJosef Bacik 		/*
301547ab2a6cSJosef Bacik 		 * This is a logic error, but we don't want to just rely on the
3016bb7ab3b9SAdam Buchbinder 		 * user having built with ASSERT enabled, so if ASSERT doesn't
301747ab2a6cSJosef Bacik 		 * do anything we still error out.
301847ab2a6cSJosef Bacik 		 */
301947ab2a6cSJosef Bacik 		ASSERT(0);
3020592d92eeSLiu Bo 		return PTR_ERR(em);
302147ab2a6cSJosef Bacik 	}
302295617d69SJeff Mahoney 	map = em->map_lookup;
302347ab2a6cSJosef Bacik 
302457ba4cb8SFilipe Manana 	/*
302579bd3712SFilipe Manana 	 * First delete the device extent items from the devices btree.
302679bd3712SFilipe Manana 	 * We take the device_list_mutex to avoid racing with the finishing phase
302779bd3712SFilipe Manana 	 * of a device replace operation. See the comment below before acquiring
302879bd3712SFilipe Manana 	 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
302979bd3712SFilipe Manana 	 * because that can result in a deadlock when deleting the device extent
303079bd3712SFilipe Manana 	 * items from the devices btree - COWing an extent buffer from the btree
303179bd3712SFilipe Manana 	 * may result in allocating a new metadata chunk, which would attempt to
303279bd3712SFilipe Manana 	 * lock again fs_info->chunk_mutex.
303357ba4cb8SFilipe Manana 	 */
303457ba4cb8SFilipe Manana 	mutex_lock(&fs_devices->device_list_mutex);
303547ab2a6cSJosef Bacik 	for (i = 0; i < map->num_stripes; i++) {
303647ab2a6cSJosef Bacik 		struct btrfs_device *device = map->stripes[i].dev;
303747ab2a6cSJosef Bacik 		ret = btrfs_free_dev_extent(trans, device,
303847ab2a6cSJosef Bacik 					    map->stripes[i].physical,
303947ab2a6cSJosef Bacik 					    &dev_extent_len);
304047ab2a6cSJosef Bacik 		if (ret) {
304157ba4cb8SFilipe Manana 			mutex_unlock(&fs_devices->device_list_mutex);
304266642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
304347ab2a6cSJosef Bacik 			goto out;
304447ab2a6cSJosef Bacik 		}
304547ab2a6cSJosef Bacik 
304647ab2a6cSJosef Bacik 		if (device->bytes_used > 0) {
304734441361SDavid Sterba 			mutex_lock(&fs_info->chunk_mutex);
304847ab2a6cSJosef Bacik 			btrfs_device_set_bytes_used(device,
304947ab2a6cSJosef Bacik 					device->bytes_used - dev_extent_len);
3050a5ed45f8SNikolay Borisov 			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
30510b246afaSJeff Mahoney 			btrfs_clear_space_info_full(fs_info);
305234441361SDavid Sterba 			mutex_unlock(&fs_info->chunk_mutex);
305347ab2a6cSJosef Bacik 		}
305479bd3712SFilipe Manana 	}
305557ba4cb8SFilipe Manana 	mutex_unlock(&fs_devices->device_list_mutex);
305679bd3712SFilipe Manana 
305779bd3712SFilipe Manana 	/*
305879bd3712SFilipe Manana 	 * We acquire fs_info->chunk_mutex for 2 reasons:
305979bd3712SFilipe Manana 	 *
306079bd3712SFilipe Manana 	 * 1) Just like with the first phase of the chunk allocation, we must
306179bd3712SFilipe Manana 	 *    reserve system space, do all chunk btree updates and deletions, and
306279bd3712SFilipe Manana 	 *    update the system chunk array in the superblock while holding this
306379bd3712SFilipe Manana 	 *    mutex. This is for similar reasons as explained on the comment at
306479bd3712SFilipe Manana 	 *    the top of btrfs_chunk_alloc();
306579bd3712SFilipe Manana 	 *
306679bd3712SFilipe Manana 	 * 2) Prevent races with the final phase of a device replace operation
306779bd3712SFilipe Manana 	 *    that replaces the device object associated with the map's stripes,
306879bd3712SFilipe Manana 	 *    because the device object's id can change at any time during that
306979bd3712SFilipe Manana 	 *    final phase of the device replace operation
307079bd3712SFilipe Manana 	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
307179bd3712SFilipe Manana 	 *    replaced device and then see it with an ID of
307279bd3712SFilipe Manana 	 *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
307379bd3712SFilipe Manana 	 *    the device item, which does not exists on the chunk btree.
307479bd3712SFilipe Manana 	 *    The finishing phase of device replace acquires both the
307579bd3712SFilipe Manana 	 *    device_list_mutex and the chunk_mutex, in that order, so we are
307679bd3712SFilipe Manana 	 *    safe by just acquiring the chunk_mutex.
307779bd3712SFilipe Manana 	 */
307879bd3712SFilipe Manana 	trans->removing_chunk = true;
307979bd3712SFilipe Manana 	mutex_lock(&fs_info->chunk_mutex);
308079bd3712SFilipe Manana 
308179bd3712SFilipe Manana 	check_system_chunk(trans, map->type);
308279bd3712SFilipe Manana 
308379bd3712SFilipe Manana 	ret = remove_chunk_item(trans, map, chunk_offset);
308479bd3712SFilipe Manana 	/*
308579bd3712SFilipe Manana 	 * Normally we should not get -ENOSPC since we reserved space before
308679bd3712SFilipe Manana 	 * through the call to check_system_chunk().
308779bd3712SFilipe Manana 	 *
308879bd3712SFilipe Manana 	 * Despite our system space_info having enough free space, we may not
308979bd3712SFilipe Manana 	 * be able to allocate extents from its block groups, because all have
309079bd3712SFilipe Manana 	 * an incompatible profile, which will force us to allocate a new system
309179bd3712SFilipe Manana 	 * block group with the right profile, or right after we called
309279bd3712SFilipe Manana 	 * check_system_space() above, a scrub turned the only system block group
309379bd3712SFilipe Manana 	 * with enough free space into RO mode.
309479bd3712SFilipe Manana 	 * This is explained with more detail at do_chunk_alloc().
309579bd3712SFilipe Manana 	 *
309679bd3712SFilipe Manana 	 * So if we get -ENOSPC, allocate a new system chunk and retry once.
309779bd3712SFilipe Manana 	 */
309879bd3712SFilipe Manana 	if (ret == -ENOSPC) {
309979bd3712SFilipe Manana 		const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
310079bd3712SFilipe Manana 		struct btrfs_block_group *sys_bg;
310179bd3712SFilipe Manana 
3102f6f39f7aSNikolay Borisov 		sys_bg = btrfs_create_chunk(trans, sys_flags);
310379bd3712SFilipe Manana 		if (IS_ERR(sys_bg)) {
310479bd3712SFilipe Manana 			ret = PTR_ERR(sys_bg);
310566642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
310647ab2a6cSJosef Bacik 			goto out;
310747ab2a6cSJosef Bacik 		}
310857ba4cb8SFilipe Manana 
310979bd3712SFilipe Manana 		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
311047ab2a6cSJosef Bacik 		if (ret) {
311166642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
311247ab2a6cSJosef Bacik 			goto out;
311347ab2a6cSJosef Bacik 		}
311447ab2a6cSJosef Bacik 
311579bd3712SFilipe Manana 		ret = remove_chunk_item(trans, map, chunk_offset);
311679bd3712SFilipe Manana 		if (ret) {
311779bd3712SFilipe Manana 			btrfs_abort_transaction(trans, ret);
311879bd3712SFilipe Manana 			goto out;
311979bd3712SFilipe Manana 		}
312079bd3712SFilipe Manana 	} else if (ret) {
312179bd3712SFilipe Manana 		btrfs_abort_transaction(trans, ret);
312279bd3712SFilipe Manana 		goto out;
312379bd3712SFilipe Manana 	}
312479bd3712SFilipe Manana 
31256bccf3abSJeff Mahoney 	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
312647ab2a6cSJosef Bacik 
312747ab2a6cSJosef Bacik 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3128408fbf19SNikolay Borisov 		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
312947ab2a6cSJosef Bacik 		if (ret) {
313066642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
313147ab2a6cSJosef Bacik 			goto out;
313247ab2a6cSJosef Bacik 		}
313347ab2a6cSJosef Bacik 	}
313447ab2a6cSJosef Bacik 
313579bd3712SFilipe Manana 	mutex_unlock(&fs_info->chunk_mutex);
313679bd3712SFilipe Manana 	trans->removing_chunk = false;
313779bd3712SFilipe Manana 
313879bd3712SFilipe Manana 	/*
313979bd3712SFilipe Manana 	 * We are done with chunk btree updates and deletions, so release the
314079bd3712SFilipe Manana 	 * system space we previously reserved (with check_system_chunk()).
314179bd3712SFilipe Manana 	 */
314279bd3712SFilipe Manana 	btrfs_trans_release_chunk_metadata(trans);
314379bd3712SFilipe Manana 
31445a98ec01SNikolay Borisov 	ret = btrfs_remove_block_group(trans, chunk_offset, em);
314547ab2a6cSJosef Bacik 	if (ret) {
314666642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
314747ab2a6cSJosef Bacik 		goto out;
314847ab2a6cSJosef Bacik 	}
314947ab2a6cSJosef Bacik 
315047ab2a6cSJosef Bacik out:
315179bd3712SFilipe Manana 	if (trans->removing_chunk) {
315279bd3712SFilipe Manana 		mutex_unlock(&fs_info->chunk_mutex);
315379bd3712SFilipe Manana 		trans->removing_chunk = false;
315479bd3712SFilipe Manana 	}
315547ab2a6cSJosef Bacik 	/* once for us */
315647ab2a6cSJosef Bacik 	free_extent_map(em);
31578f18cf13SChris Mason 	return ret;
31588f18cf13SChris Mason }
31598f18cf13SChris Mason 
/*
 * Relocate all data out of the chunk at @chunk_offset and then remove the
 * chunk and its block group.
 *
 * Returns 0 on success or a negative errno.  The caller must already hold
 * fs_info->reclaim_bgs_lock (asserted below).
 */
int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_block_group *block_group;
	u64 length;
	int ret;

	/*
	 * Prevent races with automatic removal of unused block groups.
	 * After we relocate and before we remove the chunk with offset
	 * chunk_offset, automatic removal of the block group can kick in,
	 * resulting in a failure when calling btrfs_remove_chunk() below.
	 *
	 * Make sure to acquire this mutex before doing a tree search (dev
	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
	 * we release the path used to search the chunk/dev tree and before
	 * the current task acquires this mutex and calls us.
	 */
	lockdep_assert_held(&fs_info->reclaim_bgs_lock);

	/* step one, relocate all the extents inside this chunk */
	btrfs_scrub_pause(fs_info);
	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
	btrfs_scrub_continue(fs_info);
	if (ret)
		return ret;

	/*
	 * Cancel any pending discard work for the block group and remember
	 * its length before dropping our reference.
	 */
	block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
	if (!block_group)
		return -ENOENT;
	btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
	length = block_group->length;
	btrfs_put_block_group(block_group);

	/*
	 * On a zoned file system, discard the whole block group, this will
	 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
	 * resetting the zone fails, don't treat it as a fatal problem from the
	 * filesystem's point of view.
	 */
	if (btrfs_is_zoned(fs_info)) {
		ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
		if (ret)
			btrfs_info(fs_info,
				"failed to reset zone %llu after relocation",
				chunk_offset);
	}

	trans = btrfs_start_trans_remove_block_group(root->fs_info,
						     chunk_offset);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_handle_fs_error(root->fs_info, ret, NULL);
		return ret;
	}

	/*
	 * step two, delete the device extents and the
	 * chunk tree entries
	 */
	ret = btrfs_remove_chunk(trans, chunk_offset);
	btrfs_end_transaction(trans);
	return ret;
}
32268f18cf13SChris Mason 
/*
 * Relocate every SYSTEM chunk in the filesystem, walking the chunk tree
 * from the highest chunk offset downwards.
 *
 * Chunks that fail with -ENOSPC are counted and the whole scan is retried
 * once; if any still fail on the second pass, -ENOSPC is returned.  Any
 * other relocation failure is treated as fatal (BUG_ON).
 */
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_type;
	bool retried = false;
	int failed = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

again:
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		/*
		 * Hold reclaim_bgs_lock across both the tree search and the
		 * relocation, so the cleaner kthread cannot remove the block
		 * group underneath us (see comment at btrfs_relocate_chunk()).
		 */
		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto error;
		}
		BUG_ON(ret == 0); /* Corruption */

		/* Step back to the chunk item preceding the search key. */
		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
		if (ret)
			mutex_unlock(&fs_info->reclaim_bgs_lock);
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;	/* no more chunk items */

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
		/* Drop the path before relocating, which joins transactions. */
		btrfs_release_path(path);

		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
			if (ret == -ENOSPC)
				failed++;
			else
				BUG_ON(ret);
		}
		mutex_unlock(&fs_info->reclaim_bgs_lock);

		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	ret = 0;
	/* Retry -ENOSPC failures once; space may have been freed meanwhile. */
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (WARN_ON(failed && retried)) {
		ret = -ENOSPC;
	}
error:
	btrfs_free_path(path);
	return ret;
}
33002b82032cSYan Zheng 
3301a6f93c71SLiu Bo /*
3302a6f93c71SLiu Bo  * return 1 : allocate a data chunk successfully,
3303a6f93c71SLiu Bo  * return <0: errors during allocating a data chunk,
3304a6f93c71SLiu Bo  * return 0 : no need to allocate a data chunk.
3305a6f93c71SLiu Bo  */
3306a6f93c71SLiu Bo static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3307a6f93c71SLiu Bo 				      u64 chunk_offset)
3308a6f93c71SLiu Bo {
330932da5386SDavid Sterba 	struct btrfs_block_group *cache;
3310a6f93c71SLiu Bo 	u64 bytes_used;
3311a6f93c71SLiu Bo 	u64 chunk_type;
3312a6f93c71SLiu Bo 
3313a6f93c71SLiu Bo 	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3314a6f93c71SLiu Bo 	ASSERT(cache);
3315a6f93c71SLiu Bo 	chunk_type = cache->flags;
3316a6f93c71SLiu Bo 	btrfs_put_block_group(cache);
3317a6f93c71SLiu Bo 
33185ae21692SJohannes Thumshirn 	if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
33195ae21692SJohannes Thumshirn 		return 0;
33205ae21692SJohannes Thumshirn 
3321a6f93c71SLiu Bo 	spin_lock(&fs_info->data_sinfo->lock);
3322a6f93c71SLiu Bo 	bytes_used = fs_info->data_sinfo->bytes_used;
3323a6f93c71SLiu Bo 	spin_unlock(&fs_info->data_sinfo->lock);
3324a6f93c71SLiu Bo 
3325a6f93c71SLiu Bo 	if (!bytes_used) {
3326a6f93c71SLiu Bo 		struct btrfs_trans_handle *trans;
3327a6f93c71SLiu Bo 		int ret;
3328a6f93c71SLiu Bo 
3329a6f93c71SLiu Bo 		trans =	btrfs_join_transaction(fs_info->tree_root);
3330a6f93c71SLiu Bo 		if (IS_ERR(trans))
3331a6f93c71SLiu Bo 			return PTR_ERR(trans);
3332a6f93c71SLiu Bo 
33335ae21692SJohannes Thumshirn 		ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3334a6f93c71SLiu Bo 		btrfs_end_transaction(trans);
3335a6f93c71SLiu Bo 		if (ret < 0)
3336a6f93c71SLiu Bo 			return ret;
3337a6f93c71SLiu Bo 		return 1;
3338a6f93c71SLiu Bo 	}
33395ae21692SJohannes Thumshirn 
3340a6f93c71SLiu Bo 	return 0;
3341a6f93c71SLiu Bo }
3342a6f93c71SLiu Bo 
33436bccf3abSJeff Mahoney static int insert_balance_item(struct btrfs_fs_info *fs_info,
33440940ebf6SIlya Dryomov 			       struct btrfs_balance_control *bctl)
33450940ebf6SIlya Dryomov {
33466bccf3abSJeff Mahoney 	struct btrfs_root *root = fs_info->tree_root;
33470940ebf6SIlya Dryomov 	struct btrfs_trans_handle *trans;
33480940ebf6SIlya Dryomov 	struct btrfs_balance_item *item;
33490940ebf6SIlya Dryomov 	struct btrfs_disk_balance_args disk_bargs;
33500940ebf6SIlya Dryomov 	struct btrfs_path *path;
33510940ebf6SIlya Dryomov 	struct extent_buffer *leaf;
33520940ebf6SIlya Dryomov 	struct btrfs_key key;
33530940ebf6SIlya Dryomov 	int ret, err;
33540940ebf6SIlya Dryomov 
33550940ebf6SIlya Dryomov 	path = btrfs_alloc_path();
33560940ebf6SIlya Dryomov 	if (!path)
33570940ebf6SIlya Dryomov 		return -ENOMEM;
33580940ebf6SIlya Dryomov 
33590940ebf6SIlya Dryomov 	trans = btrfs_start_transaction(root, 0);
33600940ebf6SIlya Dryomov 	if (IS_ERR(trans)) {
33610940ebf6SIlya Dryomov 		btrfs_free_path(path);
33620940ebf6SIlya Dryomov 		return PTR_ERR(trans);
33630940ebf6SIlya Dryomov 	}
33640940ebf6SIlya Dryomov 
33650940ebf6SIlya Dryomov 	key.objectid = BTRFS_BALANCE_OBJECTID;
3366c479cb4fSDavid Sterba 	key.type = BTRFS_TEMPORARY_ITEM_KEY;
33670940ebf6SIlya Dryomov 	key.offset = 0;
33680940ebf6SIlya Dryomov 
33690940ebf6SIlya Dryomov 	ret = btrfs_insert_empty_item(trans, root, path, &key,
33700940ebf6SIlya Dryomov 				      sizeof(*item));
33710940ebf6SIlya Dryomov 	if (ret)
33720940ebf6SIlya Dryomov 		goto out;
33730940ebf6SIlya Dryomov 
33740940ebf6SIlya Dryomov 	leaf = path->nodes[0];
33750940ebf6SIlya Dryomov 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
33760940ebf6SIlya Dryomov 
3377b159fa28SDavid Sterba 	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
33780940ebf6SIlya Dryomov 
33790940ebf6SIlya Dryomov 	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
33800940ebf6SIlya Dryomov 	btrfs_set_balance_data(leaf, item, &disk_bargs);
33810940ebf6SIlya Dryomov 	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
33820940ebf6SIlya Dryomov 	btrfs_set_balance_meta(leaf, item, &disk_bargs);
33830940ebf6SIlya Dryomov 	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
33840940ebf6SIlya Dryomov 	btrfs_set_balance_sys(leaf, item, &disk_bargs);
33850940ebf6SIlya Dryomov 
33860940ebf6SIlya Dryomov 	btrfs_set_balance_flags(leaf, item, bctl->flags);
33870940ebf6SIlya Dryomov 
33880940ebf6SIlya Dryomov 	btrfs_mark_buffer_dirty(leaf);
33890940ebf6SIlya Dryomov out:
33900940ebf6SIlya Dryomov 	btrfs_free_path(path);
33913a45bb20SJeff Mahoney 	err = btrfs_commit_transaction(trans);
33920940ebf6SIlya Dryomov 	if (err && !ret)
33930940ebf6SIlya Dryomov 		ret = err;
33940940ebf6SIlya Dryomov 	return ret;
33950940ebf6SIlya Dryomov }
33960940ebf6SIlya Dryomov 
33976bccf3abSJeff Mahoney static int del_balance_item(struct btrfs_fs_info *fs_info)
33980940ebf6SIlya Dryomov {
33996bccf3abSJeff Mahoney 	struct btrfs_root *root = fs_info->tree_root;
34000940ebf6SIlya Dryomov 	struct btrfs_trans_handle *trans;
34010940ebf6SIlya Dryomov 	struct btrfs_path *path;
34020940ebf6SIlya Dryomov 	struct btrfs_key key;
34030940ebf6SIlya Dryomov 	int ret, err;
34040940ebf6SIlya Dryomov 
34050940ebf6SIlya Dryomov 	path = btrfs_alloc_path();
34060940ebf6SIlya Dryomov 	if (!path)
34070940ebf6SIlya Dryomov 		return -ENOMEM;
34080940ebf6SIlya Dryomov 
34093502a8c0SDavid Sterba 	trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
34100940ebf6SIlya Dryomov 	if (IS_ERR(trans)) {
34110940ebf6SIlya Dryomov 		btrfs_free_path(path);
34120940ebf6SIlya Dryomov 		return PTR_ERR(trans);
34130940ebf6SIlya Dryomov 	}
34140940ebf6SIlya Dryomov 
34150940ebf6SIlya Dryomov 	key.objectid = BTRFS_BALANCE_OBJECTID;
3416c479cb4fSDavid Sterba 	key.type = BTRFS_TEMPORARY_ITEM_KEY;
34170940ebf6SIlya Dryomov 	key.offset = 0;
34180940ebf6SIlya Dryomov 
34190940ebf6SIlya Dryomov 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
34200940ebf6SIlya Dryomov 	if (ret < 0)
34210940ebf6SIlya Dryomov 		goto out;
34220940ebf6SIlya Dryomov 	if (ret > 0) {
34230940ebf6SIlya Dryomov 		ret = -ENOENT;
34240940ebf6SIlya Dryomov 		goto out;
34250940ebf6SIlya Dryomov 	}
34260940ebf6SIlya Dryomov 
34270940ebf6SIlya Dryomov 	ret = btrfs_del_item(trans, root, path);
34280940ebf6SIlya Dryomov out:
34290940ebf6SIlya Dryomov 	btrfs_free_path(path);
34303a45bb20SJeff Mahoney 	err = btrfs_commit_transaction(trans);
34310940ebf6SIlya Dryomov 	if (err && !ret)
34320940ebf6SIlya Dryomov 		ret = err;
34330940ebf6SIlya Dryomov 	return ret;
34340940ebf6SIlya Dryomov }
34350940ebf6SIlya Dryomov 
3436c9e9f97bSIlya Dryomov /*
343759641015SIlya Dryomov  * This is a heuristic used to reduce the number of chunks balanced on
343859641015SIlya Dryomov  * resume after balance was interrupted.
343959641015SIlya Dryomov  */
344059641015SIlya Dryomov static void update_balance_args(struct btrfs_balance_control *bctl)
344159641015SIlya Dryomov {
344259641015SIlya Dryomov 	/*
344359641015SIlya Dryomov 	 * Turn on soft mode for chunk types that were being converted.
344459641015SIlya Dryomov 	 */
344559641015SIlya Dryomov 	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
344659641015SIlya Dryomov 		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
344759641015SIlya Dryomov 	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
344859641015SIlya Dryomov 		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
344959641015SIlya Dryomov 	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
345059641015SIlya Dryomov 		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
345159641015SIlya Dryomov 
345259641015SIlya Dryomov 	/*
345359641015SIlya Dryomov 	 * Turn on usage filter if is not already used.  The idea is
345459641015SIlya Dryomov 	 * that chunks that we have already balanced should be
345559641015SIlya Dryomov 	 * reasonably full.  Don't do it for chunks that are being
345659641015SIlya Dryomov 	 * converted - that will keep us from relocating unconverted
345759641015SIlya Dryomov 	 * (albeit full) chunks.
345859641015SIlya Dryomov 	 */
345959641015SIlya Dryomov 	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3460bc309467SDavid Sterba 	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
346159641015SIlya Dryomov 	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
346259641015SIlya Dryomov 		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
346359641015SIlya Dryomov 		bctl->data.usage = 90;
346459641015SIlya Dryomov 	}
346559641015SIlya Dryomov 	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3466bc309467SDavid Sterba 	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
346759641015SIlya Dryomov 	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
346859641015SIlya Dryomov 		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
346959641015SIlya Dryomov 		bctl->sys.usage = 90;
347059641015SIlya Dryomov 	}
347159641015SIlya Dryomov 	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3472bc309467SDavid Sterba 	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
347359641015SIlya Dryomov 	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
347459641015SIlya Dryomov 		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
347559641015SIlya Dryomov 		bctl->meta.usage = 90;
347659641015SIlya Dryomov 	}
347759641015SIlya Dryomov }
347859641015SIlya Dryomov 
347959641015SIlya Dryomov /*
3480149196a2SDavid Sterba  * Clear the balance status in fs_info and delete the balance item from disk.
3481149196a2SDavid Sterba  */
3482149196a2SDavid Sterba static void reset_balance_state(struct btrfs_fs_info *fs_info)
3483c9e9f97bSIlya Dryomov {
3484c9e9f97bSIlya Dryomov 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3485149196a2SDavid Sterba 	int ret;
3486c9e9f97bSIlya Dryomov 
3487c9e9f97bSIlya Dryomov 	BUG_ON(!fs_info->balance_ctl);
3488c9e9f97bSIlya Dryomov 
3489c9e9f97bSIlya Dryomov 	spin_lock(&fs_info->balance_lock);
3490c9e9f97bSIlya Dryomov 	fs_info->balance_ctl = NULL;
3491c9e9f97bSIlya Dryomov 	spin_unlock(&fs_info->balance_lock);
3492c9e9f97bSIlya Dryomov 
3493c9e9f97bSIlya Dryomov 	kfree(bctl);
3494149196a2SDavid Sterba 	ret = del_balance_item(fs_info);
3495149196a2SDavid Sterba 	if (ret)
3496149196a2SDavid Sterba 		btrfs_handle_fs_error(fs_info, ret, NULL);
3497c9e9f97bSIlya Dryomov }
3498c9e9f97bSIlya Dryomov 
3499ed25e9b2SIlya Dryomov /*
3500ed25e9b2SIlya Dryomov  * Balance filters.  Return 1 if chunk should be filtered out
3501ed25e9b2SIlya Dryomov  * (should not be balanced).
3502ed25e9b2SIlya Dryomov  */
3503899c81eaSIlya Dryomov static int chunk_profiles_filter(u64 chunk_type,
3504ed25e9b2SIlya Dryomov 				 struct btrfs_balance_args *bargs)
3505ed25e9b2SIlya Dryomov {
3506899c81eaSIlya Dryomov 	chunk_type = chunk_to_extended(chunk_type) &
3507899c81eaSIlya Dryomov 				BTRFS_EXTENDED_PROFILE_MASK;
3508ed25e9b2SIlya Dryomov 
3509899c81eaSIlya Dryomov 	if (bargs->profiles & chunk_type)
3510ed25e9b2SIlya Dryomov 		return 0;
3511ed25e9b2SIlya Dryomov 
3512ed25e9b2SIlya Dryomov 	return 1;
3513ed25e9b2SIlya Dryomov }
3514ed25e9b2SIlya Dryomov 
3515dba72cb3SHolger Hoffstätte static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
35165ce5b3c0SIlya Dryomov 			      struct btrfs_balance_args *bargs)
35175ce5b3c0SIlya Dryomov {
351832da5386SDavid Sterba 	struct btrfs_block_group *cache;
3519bc309467SDavid Sterba 	u64 chunk_used;
3520bc309467SDavid Sterba 	u64 user_thresh_min;
3521bc309467SDavid Sterba 	u64 user_thresh_max;
3522bc309467SDavid Sterba 	int ret = 1;
3523bc309467SDavid Sterba 
3524bc309467SDavid Sterba 	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3525bf38be65SDavid Sterba 	chunk_used = cache->used;
3526bc309467SDavid Sterba 
3527bc309467SDavid Sterba 	if (bargs->usage_min == 0)
3528bc309467SDavid Sterba 		user_thresh_min = 0;
3529bc309467SDavid Sterba 	else
3530b3470b5dSDavid Sterba 		user_thresh_min = div_factor_fine(cache->length,
3531bc309467SDavid Sterba 						  bargs->usage_min);
3532bc309467SDavid Sterba 
3533bc309467SDavid Sterba 	if (bargs->usage_max == 0)
3534bc309467SDavid Sterba 		user_thresh_max = 1;
3535bc309467SDavid Sterba 	else if (bargs->usage_max > 100)
3536b3470b5dSDavid Sterba 		user_thresh_max = cache->length;
3537bc309467SDavid Sterba 	else
3538b3470b5dSDavid Sterba 		user_thresh_max = div_factor_fine(cache->length,
3539bc309467SDavid Sterba 						  bargs->usage_max);
3540bc309467SDavid Sterba 
3541bc309467SDavid Sterba 	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3542bc309467SDavid Sterba 		ret = 0;
3543bc309467SDavid Sterba 
3544bc309467SDavid Sterba 	btrfs_put_block_group(cache);
3545bc309467SDavid Sterba 	return ret;
3546bc309467SDavid Sterba }
3547bc309467SDavid Sterba 
/*
 * Legacy usage filter: keep the chunk (return 0) only when its used byte
 * count is below bargs->usage percent of the block group length.
 */
static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
		u64 chunk_offset, struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group *cache;
	u64 chunk_used, user_thresh;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = cache->used;

	/*
	 * NOTE(review): the test below reads usage_min while the thresholds
	 * use usage — presumably these fields alias in a union inside
	 * struct btrfs_balance_args, making the mix intentional.  Confirm
	 * against the uapi definition before changing either field name.
	 */
	if (bargs->usage_min == 0)
		user_thresh = 1;
	else if (bargs->usage > 100)
		user_thresh = cache->length;
	else
		user_thresh = div_factor_fine(cache->length, bargs->usage);

	if (chunk_used < user_thresh)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}
35715ce5b3c0SIlya Dryomov 
3572409d404bSIlya Dryomov static int chunk_devid_filter(struct extent_buffer *leaf,
3573409d404bSIlya Dryomov 			      struct btrfs_chunk *chunk,
3574409d404bSIlya Dryomov 			      struct btrfs_balance_args *bargs)
3575409d404bSIlya Dryomov {
3576409d404bSIlya Dryomov 	struct btrfs_stripe *stripe;
3577409d404bSIlya Dryomov 	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3578409d404bSIlya Dryomov 	int i;
3579409d404bSIlya Dryomov 
3580409d404bSIlya Dryomov 	for (i = 0; i < num_stripes; i++) {
3581409d404bSIlya Dryomov 		stripe = btrfs_stripe_nr(chunk, i);
3582409d404bSIlya Dryomov 		if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3583409d404bSIlya Dryomov 			return 0;
3584409d404bSIlya Dryomov 	}
3585409d404bSIlya Dryomov 
3586409d404bSIlya Dryomov 	return 1;
3587409d404bSIlya Dryomov }
3588409d404bSIlya Dryomov 
3589946c9256SDavid Sterba static u64 calc_data_stripes(u64 type, int num_stripes)
3590946c9256SDavid Sterba {
3591946c9256SDavid Sterba 	const int index = btrfs_bg_flags_to_raid_index(type);
3592946c9256SDavid Sterba 	const int ncopies = btrfs_raid_array[index].ncopies;
3593946c9256SDavid Sterba 	const int nparity = btrfs_raid_array[index].nparity;
3594946c9256SDavid Sterba 
3595d58ede8dSDavid Sterba 	return (num_stripes - nparity) / ncopies;
3596946c9256SDavid Sterba }
3597946c9256SDavid Sterba 
359894e60d5aSIlya Dryomov /* [pstart, pend) */
359994e60d5aSIlya Dryomov static int chunk_drange_filter(struct extent_buffer *leaf,
360094e60d5aSIlya Dryomov 			       struct btrfs_chunk *chunk,
360194e60d5aSIlya Dryomov 			       struct btrfs_balance_args *bargs)
360294e60d5aSIlya Dryomov {
360394e60d5aSIlya Dryomov 	struct btrfs_stripe *stripe;
360494e60d5aSIlya Dryomov 	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
360594e60d5aSIlya Dryomov 	u64 stripe_offset;
360694e60d5aSIlya Dryomov 	u64 stripe_length;
3607946c9256SDavid Sterba 	u64 type;
360894e60d5aSIlya Dryomov 	int factor;
360994e60d5aSIlya Dryomov 	int i;
361094e60d5aSIlya Dryomov 
361194e60d5aSIlya Dryomov 	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
361294e60d5aSIlya Dryomov 		return 0;
361394e60d5aSIlya Dryomov 
3614946c9256SDavid Sterba 	type = btrfs_chunk_type(leaf, chunk);
3615946c9256SDavid Sterba 	factor = calc_data_stripes(type, num_stripes);
361694e60d5aSIlya Dryomov 
361794e60d5aSIlya Dryomov 	for (i = 0; i < num_stripes; i++) {
361894e60d5aSIlya Dryomov 		stripe = btrfs_stripe_nr(chunk, i);
361994e60d5aSIlya Dryomov 		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
362094e60d5aSIlya Dryomov 			continue;
362194e60d5aSIlya Dryomov 
362294e60d5aSIlya Dryomov 		stripe_offset = btrfs_stripe_offset(leaf, stripe);
362394e60d5aSIlya Dryomov 		stripe_length = btrfs_chunk_length(leaf, chunk);
3624b8b93addSDavid Sterba 		stripe_length = div_u64(stripe_length, factor);
362594e60d5aSIlya Dryomov 
362694e60d5aSIlya Dryomov 		if (stripe_offset < bargs->pend &&
362794e60d5aSIlya Dryomov 		    stripe_offset + stripe_length > bargs->pstart)
362894e60d5aSIlya Dryomov 			return 0;
362994e60d5aSIlya Dryomov 	}
363094e60d5aSIlya Dryomov 
363194e60d5aSIlya Dryomov 	return 1;
363294e60d5aSIlya Dryomov }
363394e60d5aSIlya Dryomov 
3634ea67176aSIlya Dryomov /* [vstart, vend) */
3635ea67176aSIlya Dryomov static int chunk_vrange_filter(struct extent_buffer *leaf,
3636ea67176aSIlya Dryomov 			       struct btrfs_chunk *chunk,
3637ea67176aSIlya Dryomov 			       u64 chunk_offset,
3638ea67176aSIlya Dryomov 			       struct btrfs_balance_args *bargs)
3639ea67176aSIlya Dryomov {
3640ea67176aSIlya Dryomov 	if (chunk_offset < bargs->vend &&
3641ea67176aSIlya Dryomov 	    chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3642ea67176aSIlya Dryomov 		/* at least part of the chunk is inside this vrange */
3643ea67176aSIlya Dryomov 		return 0;
3644ea67176aSIlya Dryomov 
3645ea67176aSIlya Dryomov 	return 1;
3646ea67176aSIlya Dryomov }
3647ea67176aSIlya Dryomov 
3648dee32d0aSGabríel Arthúr Pétursson static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3649dee32d0aSGabríel Arthúr Pétursson 			       struct btrfs_chunk *chunk,
3650dee32d0aSGabríel Arthúr Pétursson 			       struct btrfs_balance_args *bargs)
3651dee32d0aSGabríel Arthúr Pétursson {
3652dee32d0aSGabríel Arthúr Pétursson 	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3653dee32d0aSGabríel Arthúr Pétursson 
3654dee32d0aSGabríel Arthúr Pétursson 	if (bargs->stripes_min <= num_stripes
3655dee32d0aSGabríel Arthúr Pétursson 			&& num_stripes <= bargs->stripes_max)
3656dee32d0aSGabríel Arthúr Pétursson 		return 0;
3657dee32d0aSGabríel Arthúr Pétursson 
3658dee32d0aSGabríel Arthúr Pétursson 	return 1;
3659dee32d0aSGabríel Arthúr Pétursson }
3660dee32d0aSGabríel Arthúr Pétursson 
3661899c81eaSIlya Dryomov static int chunk_soft_convert_filter(u64 chunk_type,
3662cfa4c961SIlya Dryomov 				     struct btrfs_balance_args *bargs)
3663cfa4c961SIlya Dryomov {
3664cfa4c961SIlya Dryomov 	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3665cfa4c961SIlya Dryomov 		return 0;
3666cfa4c961SIlya Dryomov 
3667899c81eaSIlya Dryomov 	chunk_type = chunk_to_extended(chunk_type) &
3668899c81eaSIlya Dryomov 				BTRFS_EXTENDED_PROFILE_MASK;
3669cfa4c961SIlya Dryomov 
3670899c81eaSIlya Dryomov 	if (bargs->target == chunk_type)
3671cfa4c961SIlya Dryomov 		return 1;
3672cfa4c961SIlya Dryomov 
3673cfa4c961SIlya Dryomov 	return 0;
3674cfa4c961SIlya Dryomov }
3675cfa4c961SIlya Dryomov 
/*
 * Decide whether a chunk takes part in the running balance operation.
 *
 * @leaf:         leaf of the chunk tree holding the chunk item
 * @chunk:        the chunk item to examine
 * @chunk_offset: logical start offset of the chunk (its key offset)
 *
 * Applies the per-type filter chain configured in fs_info->balance_ctl.
 * Returns 1 if the chunk passes all configured filters and should be
 * relocated, 0 if any filter rejects it.
 *
 * NOTE: the limit filters at the end decrement counters stored in the
 * balance args, so calling this function has side effects on bctl.
 */
static int should_balance_chunk(struct extent_buffer *leaf,
				struct btrfs_chunk *chunk, u64 chunk_offset)
{
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_balance_args *bargs = NULL;
	u64 chunk_type = btrfs_chunk_type(leaf, chunk);

	/* type filter: skip chunks whose block group type was not requested */
	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
		return 0;
	}

	/* pick the filter arguments matching the chunk's type */
	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
		bargs = &bctl->data;
	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
		bargs = &bctl->sys;
	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
		bargs = &bctl->meta;

	/* profiles filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
	    chunk_profiles_filter(chunk_type, bargs)) {
		return 0;
	}

	/* usage filter (single threshold or min..max range, mutually exclusive) */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	}

	/* devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
	    chunk_devid_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* drange filter, makes sense only with devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
	    chunk_drange_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* vrange filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
		return 0;
	}

	/* stripes filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* soft profile changing mode */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
	    chunk_soft_convert_filter(chunk_type, bargs)) {
		return 0;
	}

	/*
	 * limited by count, must be the last filter (it consumes the
	 * remaining budget only for chunks all other filters accepted)
	 */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
		if (bargs->limit == 0)
			return 0;
		else
			bargs->limit--;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
		/*
		 * Same logic as the 'limit' filter; the minimum cannot be
		 * determined here because we do not have the global information
		 * about the count of all chunks that satisfy the filters.
		 */
		if (bargs->limit_max == 0)
			return 0;
		else
			bargs->limit_max--;
	}

	return 1;
}
3764f43ffb60SIlya Dryomov 
/*
 * Main balance work loop.
 *
 * Walks the chunk tree backwards (highest key first) twice: the first
 * pass ("counting") only counts chunks matching the filters so that the
 * limit_min filters can be applied, the second pass actually relocates
 * the matching chunks.  Honors pause/cancel requests between chunks.
 *
 * Returns 0 on success, -ECANCELED when paused/cancelled, -ENOSPC if any
 * relocation hit ENOSPC, or another negative errno on failure.
 */
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	u64 chunk_type;
	struct btrfs_chunk *chunk;
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int slot;
	int ret;
	int enospc_errors = 0;
	bool counting = true;
	/*
	 * The single value limit and the min/max range limits share the same
	 * bytes in the balance args (presumably a union — confirm against
	 * struct btrfs_balance_args); save the values so they can be
	 * restored after the counting pass consumes them.
	 */
	u64 limit_data = bctl->data.limit;
	u64 limit_meta = bctl->meta.limit;
	u64 limit_sys = bctl->sys.limit;
	u32 count_data = 0;
	u32 count_meta = 0;
	u32 count_sys = 0;
	int chunk_reserved = 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}

	/* zero out stat counters */
	spin_lock(&fs_info->balance_lock);
	memset(&bctl->stat, 0, sizeof(bctl->stat));
	spin_unlock(&fs_info->balance_lock);
again:
	if (!counting) {
		/*
		 * Second pass: restore the limit values that the counting
		 * pass decremented (single value and min/max limits share
		 * the same bytes, see above).
		 */
		bctl->data.limit = limit_data;
		bctl->meta.limit = limit_meta;
		bctl->sys.limit = limit_sys;
	}
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		/* pause only interrupts the relocation pass, cancel both */
		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
		    atomic_read(&fs_info->balance_cancel_req)) {
			ret = -ECANCELED;
			goto error;
		}

		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto error;
		}

		/*
		 * this shouldn't happen, it means the last relocate
		 * failed
		 */
		if (ret == 0)
			BUG(); /* FIXME break ? */

		ret = btrfs_previous_item(chunk_root, path, 0,
					  BTRFS_CHUNK_ITEM_KEY);
		if (ret) {
			/* no more chunk items, pass is complete */
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			ret = 0;
			break;
		}

		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid != key.objectid) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			break;
		}

		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);

		if (!counting) {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.considered++;
			spin_unlock(&fs_info->balance_lock);
		}

		ret = should_balance_chunk(leaf, chunk, found_key.offset);

		btrfs_release_path(path);
		if (!ret) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto loop;
		}

		if (counting) {
			/* first pass: only tally matching chunks per type */
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			spin_lock(&fs_info->balance_lock);
			bctl->stat.expected++;
			spin_unlock(&fs_info->balance_lock);

			if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				count_data++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				count_sys++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				count_meta++;

			goto loop;
		}

		/*
		 * Apply limit_min filter, no need to check if the LIMITS
		 * filter is used, limit_min is 0 by default
		 */
		if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
					count_data < bctl->data.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
					count_meta < bctl->meta.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
					count_sys < bctl->sys.limit_min)) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto loop;
		}

		if (!chunk_reserved) {
			/*
			 * We may be relocating the only data chunk we have,
			 * which could potentially end up with losing data's
			 * raid profile, so lets allocate an empty one in
			 * advance.
			 */
			ret = btrfs_may_alloc_data_chunk(fs_info,
							 found_key.offset);
			if (ret < 0) {
				mutex_unlock(&fs_info->reclaim_bgs_lock);
				goto error;
			} else if (ret == 1) {
				chunk_reserved = 1;
			}
		}

		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
		mutex_unlock(&fs_info->reclaim_bgs_lock);
		if (ret == -ENOSPC) {
			/* keep going, report accumulated errors at the end */
			enospc_errors++;
		} else if (ret == -ETXTBSY) {
			btrfs_info(fs_info,
	   "skipping relocation of block group %llu due to active swapfile",
				   found_key.offset);
			ret = 0;
		} else if (ret) {
			goto error;
		} else {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.completed++;
			spin_unlock(&fs_info->balance_lock);
		}
loop:
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}

	if (counting) {
		/* counting pass done, rerun the loop to relocate */
		btrfs_release_path(path);
		counting = false;
		goto again;
	}
error:
	btrfs_free_path(path);
	if (enospc_errors) {
		btrfs_info(fs_info, "%d enospc errors during balance",
			   enospc_errors);
		if (!ret)
			ret = -ENOSPC;
	}

	return ret;
}
3952ec44a35cSChris Mason 
39530c460c0dSIlya Dryomov /**
39540c460c0dSIlya Dryomov  * alloc_profile_is_valid - see if a given profile is valid and reduced
39550c460c0dSIlya Dryomov  * @flags: profile to validate
39560c460c0dSIlya Dryomov  * @extended: if true @flags is treated as an extended profile
39570c460c0dSIlya Dryomov  */
39580c460c0dSIlya Dryomov static int alloc_profile_is_valid(u64 flags, int extended)
39590c460c0dSIlya Dryomov {
39600c460c0dSIlya Dryomov 	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
39610c460c0dSIlya Dryomov 			       BTRFS_BLOCK_GROUP_PROFILE_MASK);
39620c460c0dSIlya Dryomov 
39630c460c0dSIlya Dryomov 	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
39640c460c0dSIlya Dryomov 
39650c460c0dSIlya Dryomov 	/* 1) check that all other bits are zeroed */
39660c460c0dSIlya Dryomov 	if (flags & ~mask)
39670c460c0dSIlya Dryomov 		return 0;
39680c460c0dSIlya Dryomov 
39690c460c0dSIlya Dryomov 	/* 2) see if profile is reduced */
39700c460c0dSIlya Dryomov 	if (flags == 0)
39710c460c0dSIlya Dryomov 		return !extended; /* "0" is valid for usual profiles */
39720c460c0dSIlya Dryomov 
3973c1499166SDavid Sterba 	return has_single_bit_set(flags);
39740c460c0dSIlya Dryomov }
39750c460c0dSIlya Dryomov 
3976837d5b6eSIlya Dryomov static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3977837d5b6eSIlya Dryomov {
3978a7e99c69SIlya Dryomov 	/* cancel requested || normal exit path */
3979a7e99c69SIlya Dryomov 	return atomic_read(&fs_info->balance_cancel_req) ||
3980a7e99c69SIlya Dryomov 		(atomic_read(&fs_info->balance_pause_req) == 0 &&
3981a7e99c69SIlya Dryomov 		 atomic_read(&fs_info->balance_cancel_req) == 0);
3982837d5b6eSIlya Dryomov }
3983837d5b6eSIlya Dryomov 
39845ba366c3SDavid Sterba /*
39855ba366c3SDavid Sterba  * Validate target profile against allowed profiles and return true if it's OK.
39865ba366c3SDavid Sterba  * Otherwise print the error message and return false.
39875ba366c3SDavid Sterba  */
39885ba366c3SDavid Sterba static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
39895ba366c3SDavid Sterba 		const struct btrfs_balance_args *bargs,
39905ba366c3SDavid Sterba 		u64 allowed, const char *type)
3991bdcd3c97SAlexandru Moise {
39925ba366c3SDavid Sterba 	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
39935ba366c3SDavid Sterba 		return true;
39945ba366c3SDavid Sterba 
3995c8050b3bSQu Wenruo 	if (fs_info->sectorsize < PAGE_SIZE &&
3996c8050b3bSQu Wenruo 		bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3997c8050b3bSQu Wenruo 		btrfs_err(fs_info,
3998c8050b3bSQu Wenruo 		"RAID56 is not yet supported for sectorsize %u with page size %lu",
3999c8050b3bSQu Wenruo 			  fs_info->sectorsize, PAGE_SIZE);
4000c8050b3bSQu Wenruo 		return false;
4001c8050b3bSQu Wenruo 	}
40025ba366c3SDavid Sterba 	/* Profile is valid and does not have bits outside of the allowed set */
40035ba366c3SDavid Sterba 	if (alloc_profile_is_valid(bargs->target, 1) &&
40045ba366c3SDavid Sterba 	    (bargs->target & ~allowed) == 0)
40055ba366c3SDavid Sterba 		return true;
40065ba366c3SDavid Sterba 
40075ba366c3SDavid Sterba 	btrfs_err(fs_info, "balance: invalid convert %s profile %s",
40085ba366c3SDavid Sterba 			type, btrfs_bg_type_to_raid_name(bargs->target));
40095ba366c3SDavid Sterba 	return false;
4010bdcd3c97SAlexandru Moise }
4011bdcd3c97SAlexandru Moise 
/*
 * Fill @buf with textual description of balance filter flags @bargs, up to
 * @size_buf including the terminating null. The output may be trimmed if it
 * does not fit into the provided buffer.
 */
static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
				 u32 size_buf)
{
	int ret;
	u32 size_bp = size_buf;	/* bytes remaining in buf */
	char *bp = buf;		/* current write position */
	u64 flags = bargs->flags;
	char tmp_buf[128] = {'\0'};

	if (!flags)
		return;

/*
 * Append helpers: snprintf returns the would-be length, so a return
 * value >= size_bp means the output was truncated and we bail out.
 */
#define CHECK_APPEND_NOARG(a)						\
	do {								\
		ret = snprintf(bp, size_bp, (a));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

#define CHECK_APPEND_1ARG(a, v1)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

#define CHECK_APPEND_2ARG(a, v1, v2)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1), (v2));		\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

	if (flags & BTRFS_BALANCE_ARGS_CONVERT)
		CHECK_APPEND_1ARG("convert=%s,",
				  btrfs_bg_type_to_raid_name(bargs->target));

	if (flags & BTRFS_BALANCE_ARGS_SOFT)
		CHECK_APPEND_NOARG("soft,");

	if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
		btrfs_describe_block_groups(bargs->profiles, tmp_buf,
					    sizeof(tmp_buf));
		CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
	}

	if (flags & BTRFS_BALANCE_ARGS_USAGE)
		CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);

	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
		CHECK_APPEND_2ARG("usage=%u..%u,",
				  bargs->usage_min, bargs->usage_max);

	if (flags & BTRFS_BALANCE_ARGS_DEVID)
		CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);

	if (flags & BTRFS_BALANCE_ARGS_DRANGE)
		CHECK_APPEND_2ARG("drange=%llu..%llu,",
				  bargs->pstart, bargs->pend);

	if (flags & BTRFS_BALANCE_ARGS_VRANGE)
		CHECK_APPEND_2ARG("vrange=%llu..%llu,",
				  bargs->vstart, bargs->vend);

	if (flags & BTRFS_BALANCE_ARGS_LIMIT)
		CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);

	if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
		CHECK_APPEND_2ARG("limit=%u..%u,",
				bargs->limit_min, bargs->limit_max);

	if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
		CHECK_APPEND_2ARG("stripes=%u..%u,",
				  bargs->stripes_min, bargs->stripes_max);

#undef CHECK_APPEND_2ARG
#undef CHECK_APPEND_1ARG
#undef CHECK_APPEND_NOARG

out_overflow:

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
	else
		buf[0] = '\0'; /* nothing was appended */
}
410956fc37d9SAnand Jain 
/*
 * Log a one-line, command-line-like summary of the balance operation that
 * is being started or resumed, e.g. "balance: start -dusage=50 -m".
 * Silently returns if the temporary buffer cannot be allocated.
 */
static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
{
	u32 size_buf = 1024;
	char tmp_buf[192] = {'\0'};	/* per-type filter description */
	char *buf;
	char *bp;
	u32 size_bp = size_buf;
	int ret;
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;

	buf = kzalloc(size_buf, GFP_KERNEL);
	if (!buf)
		return;

	bp = buf;

/* Append helper, same truncation handling as in describe_balance_args(). */
#define CHECK_APPEND_1ARG(a, v1)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

	if (bctl->flags & BTRFS_BALANCE_FORCE)
		CHECK_APPEND_1ARG("%s", "-f ");

	if (bctl->flags & BTRFS_BALANCE_DATA) {
		describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-d%s ", tmp_buf);
	}

	if (bctl->flags & BTRFS_BALANCE_METADATA) {
		describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-m%s ", tmp_buf);
	}

	if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
		describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-s%s ", tmp_buf);
	}

#undef CHECK_APPEND_1ARG

out_overflow:

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
	btrfs_info(fs_info, "balance: %s %s",
		   (bctl->flags & BTRFS_BALANCE_RESUME) ?
		   "resume" : "start", buf);

	kfree(buf);
}
416556fc37d9SAnand Jain 
416656fc37d9SAnand Jain /*
 * Should be called with balance mutex held
4168c9e9f97bSIlya Dryomov  */
41696fcf6e2bSDavid Sterba int btrfs_balance(struct btrfs_fs_info *fs_info,
41706fcf6e2bSDavid Sterba 		  struct btrfs_balance_control *bctl,
4171c9e9f97bSIlya Dryomov 		  struct btrfs_ioctl_balance_args *bargs)
4172c9e9f97bSIlya Dryomov {
417314506127SAdam Borowski 	u64 meta_target, data_target;
4174f43ffb60SIlya Dryomov 	u64 allowed;
4175e4837f8fSIlya Dryomov 	int mixed = 0;
4176c9e9f97bSIlya Dryomov 	int ret;
41778dabb742SStefan Behrens 	u64 num_devices;
4178de98ced9SMiao Xie 	unsigned seq;
4179e62869beSAnand Jain 	bool reducing_redundancy;
4180081db89bSDavid Sterba 	int i;
4181c9e9f97bSIlya Dryomov 
4182837d5b6eSIlya Dryomov 	if (btrfs_fs_closing(fs_info) ||
4183a7e99c69SIlya Dryomov 	    atomic_read(&fs_info->balance_pause_req) ||
4184726a3421SQu Wenruo 	    btrfs_should_cancel_balance(fs_info)) {
4185c9e9f97bSIlya Dryomov 		ret = -EINVAL;
4186c9e9f97bSIlya Dryomov 		goto out;
4187c9e9f97bSIlya Dryomov 	}
4188c9e9f97bSIlya Dryomov 
4189e4837f8fSIlya Dryomov 	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4190e4837f8fSIlya Dryomov 	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4191e4837f8fSIlya Dryomov 		mixed = 1;
4192e4837f8fSIlya Dryomov 
4193f43ffb60SIlya Dryomov 	/*
4194f43ffb60SIlya Dryomov 	 * In case of mixed groups both data and meta should be picked,
4195f43ffb60SIlya Dryomov 	 * and identical options should be given for both of them.
4196f43ffb60SIlya Dryomov 	 */
4197e4837f8fSIlya Dryomov 	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4198e4837f8fSIlya Dryomov 	if (mixed && (bctl->flags & allowed)) {
4199f43ffb60SIlya Dryomov 		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4200f43ffb60SIlya Dryomov 		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4201f43ffb60SIlya Dryomov 		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
42025d163e0eSJeff Mahoney 			btrfs_err(fs_info,
42036dac13f8SAnand Jain 	  "balance: mixed groups data and metadata options must be the same");
4204f43ffb60SIlya Dryomov 			ret = -EINVAL;
4205f43ffb60SIlya Dryomov 			goto out;
4206f43ffb60SIlya Dryomov 		}
4207f43ffb60SIlya Dryomov 	}
4208f43ffb60SIlya Dryomov 
4209b35cf1f0SJosef Bacik 	/*
4210b35cf1f0SJosef Bacik 	 * rw_devices will not change at the moment, device add/delete/replace
4211c3e1f96cSGoldwyn Rodrigues 	 * are exclusive
4212b35cf1f0SJosef Bacik 	 */
4213b35cf1f0SJosef Bacik 	num_devices = fs_info->fs_devices->rw_devices;
4214fab27359SQu Wenruo 
4215fab27359SQu Wenruo 	/*
4216fab27359SQu Wenruo 	 * SINGLE profile on-disk has no profile bit, but in-memory we have a
4217fab27359SQu Wenruo 	 * special bit for it, to make it easier to distinguish.  Thus we need
4218fab27359SQu Wenruo 	 * to set it manually, or balance would refuse the profile.
4219fab27359SQu Wenruo 	 */
4220fab27359SQu Wenruo 	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4221081db89bSDavid Sterba 	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4222081db89bSDavid Sterba 		if (num_devices >= btrfs_raid_array[i].devs_min)
4223081db89bSDavid Sterba 			allowed |= btrfs_raid_array[i].bg_flag;
42241da73967SAnand Jain 
42255ba366c3SDavid Sterba 	if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
42265ba366c3SDavid Sterba 	    !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
42275ba366c3SDavid Sterba 	    !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
4228e4d8ec0fSIlya Dryomov 		ret = -EINVAL;
4229e4d8ec0fSIlya Dryomov 		goto out;
4230e4d8ec0fSIlya Dryomov 	}
4231e4d8ec0fSIlya Dryomov 
42326079e12cSDavid Sterba 	/*
42336079e12cSDavid Sterba 	 * Allow to reduce metadata or system integrity only if force set for
42346079e12cSDavid Sterba 	 * profiles with redundancy (copies, parity)
42356079e12cSDavid Sterba 	 */
42366079e12cSDavid Sterba 	allowed = 0;
42376079e12cSDavid Sterba 	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
42386079e12cSDavid Sterba 		if (btrfs_raid_array[i].ncopies >= 2 ||
42396079e12cSDavid Sterba 		    btrfs_raid_array[i].tolerated_failures >= 1)
42406079e12cSDavid Sterba 			allowed |= btrfs_raid_array[i].bg_flag;
42416079e12cSDavid Sterba 	}
4242de98ced9SMiao Xie 	do {
4243de98ced9SMiao Xie 		seq = read_seqbegin(&fs_info->profiles_lock);
4244de98ced9SMiao Xie 
4245e4d8ec0fSIlya Dryomov 		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4246e4d8ec0fSIlya Dryomov 		     (fs_info->avail_system_alloc_bits & allowed) &&
4247e4d8ec0fSIlya Dryomov 		     !(bctl->sys.target & allowed)) ||
4248e4d8ec0fSIlya Dryomov 		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4249e4d8ec0fSIlya Dryomov 		     (fs_info->avail_metadata_alloc_bits & allowed) &&
42505a8067c0SFilipe Manana 		     !(bctl->meta.target & allowed)))
4251e62869beSAnand Jain 			reducing_redundancy = true;
42525a8067c0SFilipe Manana 		else
4253e62869beSAnand Jain 			reducing_redundancy = false;
42545a8067c0SFilipe Manana 
42555a8067c0SFilipe Manana 		/* if we're not converting, the target field is uninitialized */
42565a8067c0SFilipe Manana 		meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
42575a8067c0SFilipe Manana 			bctl->meta.target : fs_info->avail_metadata_alloc_bits;
42585a8067c0SFilipe Manana 		data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
42595a8067c0SFilipe Manana 			bctl->data.target : fs_info->avail_data_alloc_bits;
42605a8067c0SFilipe Manana 	} while (read_seqretry(&fs_info->profiles_lock, seq));
42615a8067c0SFilipe Manana 
4262e62869beSAnand Jain 	if (reducing_redundancy) {
4263e4d8ec0fSIlya Dryomov 		if (bctl->flags & BTRFS_BALANCE_FORCE) {
42645d163e0eSJeff Mahoney 			btrfs_info(fs_info,
4265e62869beSAnand Jain 			   "balance: force reducing metadata redundancy");
4266e4d8ec0fSIlya Dryomov 		} else {
42675d163e0eSJeff Mahoney 			btrfs_err(fs_info,
4268e62869beSAnand Jain 	"balance: reduces metadata redundancy, use --force if you want this");
4269e4d8ec0fSIlya Dryomov 			ret = -EINVAL;
4270e4d8ec0fSIlya Dryomov 			goto out;
4271e4d8ec0fSIlya Dryomov 		}
4272e4d8ec0fSIlya Dryomov 	}
4273e4d8ec0fSIlya Dryomov 
427414506127SAdam Borowski 	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
427514506127SAdam Borowski 		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4276ee592d07SSam Tygier 		btrfs_warn(fs_info,
42776dac13f8SAnand Jain 	"balance: metadata profile %s has lower redundancy than data profile %s",
4278158da513SDavid Sterba 				btrfs_bg_type_to_raid_name(meta_target),
4279158da513SDavid Sterba 				btrfs_bg_type_to_raid_name(data_target));
4280ee592d07SSam Tygier 	}
4281ee592d07SSam Tygier 
42826bccf3abSJeff Mahoney 	ret = insert_balance_item(fs_info, bctl);
428359641015SIlya Dryomov 	if (ret && ret != -EEXIST)
42840940ebf6SIlya Dryomov 		goto out;
42850940ebf6SIlya Dryomov 
428659641015SIlya Dryomov 	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
428759641015SIlya Dryomov 		BUG_ON(ret == -EEXIST);
4288833aae18SDavid Sterba 		BUG_ON(fs_info->balance_ctl);
4289833aae18SDavid Sterba 		spin_lock(&fs_info->balance_lock);
4290833aae18SDavid Sterba 		fs_info->balance_ctl = bctl;
4291833aae18SDavid Sterba 		spin_unlock(&fs_info->balance_lock);
429259641015SIlya Dryomov 	} else {
429359641015SIlya Dryomov 		BUG_ON(ret != -EEXIST);
429459641015SIlya Dryomov 		spin_lock(&fs_info->balance_lock);
429559641015SIlya Dryomov 		update_balance_args(bctl);
429659641015SIlya Dryomov 		spin_unlock(&fs_info->balance_lock);
429759641015SIlya Dryomov 	}
4298c9e9f97bSIlya Dryomov 
42993009a62fSDavid Sterba 	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
43003009a62fSDavid Sterba 	set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
430156fc37d9SAnand Jain 	describe_balance_start_or_resume(fs_info);
4302c9e9f97bSIlya Dryomov 	mutex_unlock(&fs_info->balance_mutex);
4303c9e9f97bSIlya Dryomov 
4304c9e9f97bSIlya Dryomov 	ret = __btrfs_balance(fs_info);
4305c9e9f97bSIlya Dryomov 
4306c9e9f97bSIlya Dryomov 	mutex_lock(&fs_info->balance_mutex);
43077333bd02SAnand Jain 	if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
43087333bd02SAnand Jain 		btrfs_info(fs_info, "balance: paused");
430944d354abSQu Wenruo 	/*
431044d354abSQu Wenruo 	 * Balance can be canceled by:
431144d354abSQu Wenruo 	 *
431244d354abSQu Wenruo 	 * - Regular cancel request
431344d354abSQu Wenruo 	 *   Then ret == -ECANCELED and balance_cancel_req > 0
431444d354abSQu Wenruo 	 *
431544d354abSQu Wenruo 	 * - Fatal signal to "btrfs" process
431644d354abSQu Wenruo 	 *   Either the signal caught by wait_reserve_ticket() and callers
431744d354abSQu Wenruo 	 *   got -EINTR, or caught by btrfs_should_cancel_balance() and
431844d354abSQu Wenruo 	 *   got -ECANCELED.
431944d354abSQu Wenruo 	 *   Either way, in this case balance_cancel_req = 0, and
432044d354abSQu Wenruo 	 *   ret == -EINTR or ret == -ECANCELED.
432144d354abSQu Wenruo 	 *
432244d354abSQu Wenruo 	 * So here we only check the return value to catch canceled balance.
432344d354abSQu Wenruo 	 */
432444d354abSQu Wenruo 	else if (ret == -ECANCELED || ret == -EINTR)
43257333bd02SAnand Jain 		btrfs_info(fs_info, "balance: canceled");
43267333bd02SAnand Jain 	else
43277333bd02SAnand Jain 		btrfs_info(fs_info, "balance: ended with status: %d", ret);
43287333bd02SAnand Jain 
43293009a62fSDavid Sterba 	clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4330c9e9f97bSIlya Dryomov 
4331c9e9f97bSIlya Dryomov 	if (bargs) {
4332c9e9f97bSIlya Dryomov 		memset(bargs, 0, sizeof(*bargs));
4333008ef096SDavid Sterba 		btrfs_update_ioctl_balance_args(fs_info, bargs);
4334c9e9f97bSIlya Dryomov 	}
4335c9e9f97bSIlya Dryomov 
43363a01aa7aSIlya Dryomov 	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
43373a01aa7aSIlya Dryomov 	    balance_need_close(fs_info)) {
4338149196a2SDavid Sterba 		reset_balance_state(fs_info);
4339c3e1f96cSGoldwyn Rodrigues 		btrfs_exclop_finish(fs_info);
43403a01aa7aSIlya Dryomov 	}
43413a01aa7aSIlya Dryomov 
4342837d5b6eSIlya Dryomov 	wake_up(&fs_info->balance_wait_q);
4343c9e9f97bSIlya Dryomov 
4344c9e9f97bSIlya Dryomov 	return ret;
4345c9e9f97bSIlya Dryomov out:
434659641015SIlya Dryomov 	if (bctl->flags & BTRFS_BALANCE_RESUME)
4347149196a2SDavid Sterba 		reset_balance_state(fs_info);
4348a17c95dfSDavid Sterba 	else
4349c9e9f97bSIlya Dryomov 		kfree(bctl);
4350c3e1f96cSGoldwyn Rodrigues 	btrfs_exclop_finish(fs_info);
4351a17c95dfSDavid Sterba 
43528f18cf13SChris Mason 	return ret;
43538f18cf13SChris Mason }
43548f18cf13SChris Mason 
435559641015SIlya Dryomov static int balance_kthread(void *data)
435659641015SIlya Dryomov {
43572b6ba629SIlya Dryomov 	struct btrfs_fs_info *fs_info = data;
43589555c6c1SIlya Dryomov 	int ret = 0;
435959641015SIlya Dryomov 
436059641015SIlya Dryomov 	mutex_lock(&fs_info->balance_mutex);
436156fc37d9SAnand Jain 	if (fs_info->balance_ctl)
43626fcf6e2bSDavid Sterba 		ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
436359641015SIlya Dryomov 	mutex_unlock(&fs_info->balance_mutex);
43642b6ba629SIlya Dryomov 
436559641015SIlya Dryomov 	return ret;
436659641015SIlya Dryomov }
436759641015SIlya Dryomov 
/*
 * Resume a previously recovered (paused) balance in a background kthread.
 *
 * Returns 0 when there is nothing to resume or the resume is skipped via
 * the skip_balance mount option, otherwise the kthread_run() error code.
 */
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
	struct task_struct *tsk;

	/* Sample balance_ctl under the mutex; nothing to do if no balance
	 * state was recovered from disk. */
	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return 0;
	}
	mutex_unlock(&fs_info->balance_mutex);

	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
		btrfs_info(fs_info, "balance: resume skipped");
		return 0;
	}

	/*
	 * A ro->rw remount sequence should continue with the paused balance
	 * regardless of who pauses it, system or the user as of now, so set
	 * the resume flag.
	 */
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
	spin_unlock(&fs_info->balance_lock);

	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
	return PTR_ERR_OR_ZERO(tsk);
}
43962b6ba629SIlya Dryomov 
/*
 * Recover a paused balance at mount time from the balance item stored in
 * the tree root, and re-establish fs_info->balance_ctl.  The balance stays
 * paused until it is resumed or canceled.
 *
 * Returns 0 when there is no balance item (nothing to recover) or on
 * success, otherwise a negative errno.
 */
int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* The balance item lives at a fixed key in the tree root */
	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) { /* ret = -ENOENT; */
		/* No balance item: no interrupted balance, not an error */
		ret = 0;
		goto out;
	}

	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
	if (!bctl) {
		ret = -ENOMEM;
		goto out;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	bctl->flags = btrfs_balance_flags(leaf, item);
	/* Mark it resumable so a later rw mount can continue it */
	bctl->flags |= BTRFS_BALANCE_RESUME;

	/* Convert the on-disk args for each chunk type to in-memory form */
	btrfs_balance_data(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
	btrfs_balance_meta(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
	btrfs_balance_sys(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);

	/*
	 * This should never happen, as the paused balance state is recovered
	 * during mount without any chance of other exclusive ops to collide.
	 *
	 * This gives the exclusive op status to balance and keeps in paused
	 * state until user intervention (cancel or umount). If the ownership
	 * cannot be assigned, show a message but do not fail. The balance
	 * is in a paused state and must have fs_info::balance_ctl properly
	 * set up.
	 */
	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
		btrfs_warn(fs_info,
	"balance: cannot set exclusive op status, resume manually");

	btrfs_release_path(path);

	/* Publish the recovered control under both locks */
	mutex_lock(&fs_info->balance_mutex);
	BUG_ON(fs_info->balance_ctl);
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = bctl;
	spin_unlock(&fs_info->balance_lock);
	mutex_unlock(&fs_info->balance_mutex);
out:
	btrfs_free_path(path);
	return ret;
}
446859641015SIlya Dryomov 
/*
 * Pause a running balance and wait until it actually stops.
 *
 * Returns 0 if a running balance was paused, -ENOTCONN if there is no
 * balance in progress (either none at all, or one already paused).
 */
int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
{
	int ret = 0;

	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
		atomic_inc(&fs_info->balance_pause_req);
		/* Drop the mutex so the balance thread can observe the
		 * pause request and clear the RUNNING bit. */
		mutex_unlock(&fs_info->balance_mutex);

		wait_event(fs_info->balance_wait_q,
			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));

		mutex_lock(&fs_info->balance_mutex);
		/* we are good with balance_ctl ripped off from under us */
		BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
		atomic_dec(&fs_info->balance_pause_req);
	} else {
		/* balance_ctl exists but nothing is running: already paused */
		ret = -ENOTCONN;
	}

	mutex_unlock(&fs_info->balance_mutex);
	return ret;
}
4497837d5b6eSIlya Dryomov 
/*
 * Cancel a running or paused balance and remove its on-disk item.
 *
 * Returns 0 on success, -ENOTCONN when no balance is in progress, or
 * -EROFS on a read-only mount (the item must be kept so a later rw
 * mount can resume the balance).
 */
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	/*
	 * A paused balance with the item stored on disk can be resumed at
	 * mount time if the mount is read-write. Otherwise it's still paused
	 * and we must not allow cancelling as it deletes the item.
	 */
	if (sb_rdonly(fs_info->sb)) {
		mutex_unlock(&fs_info->balance_mutex);
		return -EROFS;
	}

	atomic_inc(&fs_info->balance_cancel_req);
	/*
	 * if we are running just wait and return, balance item is
	 * deleted in btrfs_balance in this case
	 */
	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
		mutex_unlock(&fs_info->balance_mutex);
		wait_event(fs_info->balance_wait_q,
			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
		mutex_lock(&fs_info->balance_mutex);
	} else {
		mutex_unlock(&fs_info->balance_mutex);
		/*
		 * Lock released to allow other waiters to continue, we'll
		 * reexamine the status again.
		 */
		mutex_lock(&fs_info->balance_mutex);

		/* Paused balance: tear down the state and delete the item */
		if (fs_info->balance_ctl) {
			reset_balance_state(fs_info);
			btrfs_exclop_finish(fs_info);
			btrfs_info(fs_info, "balance: canceled");
		}
	}

	BUG_ON(fs_info->balance_ctl ||
		test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
	atomic_dec(&fs_info->balance_cancel_req);
	mutex_unlock(&fs_info->balance_mutex);
	return 0;
}
4547a7e99c69SIlya Dryomov 
/*
 * Background thread that walks all root items in the tree root and adds
 * missing subvolume uuid / received_uuid entries to the uuid tree.
 * Always returns 0; failures are logged.  Releases
 * fs_info->uuid_tree_rescan_sem when done.
 */
int btrfs_uuid_scan_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_key key;
	struct btrfs_path *path = NULL;
	int ret = 0;
	struct extent_buffer *eb;
	int slot;
	struct btrfs_root_item root_item;
	u32 item_size;
	struct btrfs_trans_handle *trans = NULL;
	bool closing = false;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	/* Start the scan from the smallest possible key */
	key.objectid = 0;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = 0;

	while (1) {
		/* Stop scanning if the filesystem is going down */
		if (btrfs_fs_closing(fs_info)) {
			closing = true;
			break;
		}
		ret = btrfs_search_forward(root, &key, path,
				BTRFS_OLDEST_GENERATION);
		if (ret) {
			if (ret > 0)
				ret = 0;	/* no more items: clean exit */
			break;
		}

		/* Only subvolume root items are of interest */
		if (key.type != BTRFS_ROOT_ITEM_KEY ||
		    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
		     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
		    key.objectid > BTRFS_LAST_FREE_OBJECTID)
			goto skip;

		eb = path->nodes[0];
		slot = path->slots[0];
		item_size = btrfs_item_size_nr(eb, slot);
		if (item_size < sizeof(root_item))
			goto skip;

		read_extent_buffer(eb, &root_item,
				   btrfs_item_ptr_offset(eb, slot),
				   (int)sizeof(root_item));
		/* Skip roots with zero references */
		if (btrfs_root_refs(&root_item) == 0)
			goto skip;

		if (!btrfs_is_empty_uuid(root_item.uuid) ||
		    !btrfs_is_empty_uuid(root_item.received_uuid)) {
			if (trans)
				goto update_tree;

			btrfs_release_path(path);
			/*
			 * 1 - subvol uuid item
			 * 1 - received_subvol uuid item
			 */
			trans = btrfs_start_transaction(fs_info->uuid_root, 2);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				break;
			}
			/* Re-search the same key with the transaction held */
			continue;
		} else {
			goto skip;
		}
update_tree:
		btrfs_release_path(path);
		if (!btrfs_is_empty_uuid(root_item.uuid)) {
			ret = btrfs_uuid_tree_add(trans, root_item.uuid,
						  BTRFS_UUID_KEY_SUBVOL,
						  key.objectid);
			if (ret < 0) {
				btrfs_warn(fs_info, "uuid_tree_add failed %d",
					ret);
				break;
			}
		}

		if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
			ret = btrfs_uuid_tree_add(trans,
						  root_item.received_uuid,
						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
						  key.objectid);
			if (ret < 0) {
				btrfs_warn(fs_info, "uuid_tree_add failed %d",
					ret);
				break;
			}
		}

skip:
		btrfs_release_path(path);
		/* End any open transaction before moving to the next item */
		if (trans) {
			ret = btrfs_end_transaction(trans);
			trans = NULL;
			if (ret)
				break;
		}

		/* Advance to the next possible key in (objectid, type, offset)
		 * order, terminating once everything is exhausted. */
		if (key.offset < (u64)-1) {
			key.offset++;
		} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
			key.offset = 0;
			key.type = BTRFS_ROOT_ITEM_KEY;
		} else if (key.objectid < (u64)-1) {
			key.offset = 0;
			key.type = BTRFS_ROOT_ITEM_KEY;
			key.objectid++;
		} else {
			break;
		}
		cond_resched();
	}

out:
	btrfs_free_path(path);
	if (trans && !IS_ERR(trans))
		btrfs_end_transaction(trans);
	if (ret)
		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
	else if (!closing)
		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
	up(&fs_info->uuid_tree_rescan_sem);
	return 0;
}
4682803b2f54SStefan Behrens 
4683f7a81ea4SStefan Behrens int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4684f7a81ea4SStefan Behrens {
4685f7a81ea4SStefan Behrens 	struct btrfs_trans_handle *trans;
4686f7a81ea4SStefan Behrens 	struct btrfs_root *tree_root = fs_info->tree_root;
4687f7a81ea4SStefan Behrens 	struct btrfs_root *uuid_root;
4688803b2f54SStefan Behrens 	struct task_struct *task;
4689803b2f54SStefan Behrens 	int ret;
4690f7a81ea4SStefan Behrens 
4691f7a81ea4SStefan Behrens 	/*
4692f7a81ea4SStefan Behrens 	 * 1 - root node
4693f7a81ea4SStefan Behrens 	 * 1 - root item
4694f7a81ea4SStefan Behrens 	 */
4695f7a81ea4SStefan Behrens 	trans = btrfs_start_transaction(tree_root, 2);
4696f7a81ea4SStefan Behrens 	if (IS_ERR(trans))
4697f7a81ea4SStefan Behrens 		return PTR_ERR(trans);
4698f7a81ea4SStefan Behrens 
46999b7a2440SDavid Sterba 	uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4700f7a81ea4SStefan Behrens 	if (IS_ERR(uuid_root)) {
47016d13f549SDavid Sterba 		ret = PTR_ERR(uuid_root);
470266642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
47033a45bb20SJeff Mahoney 		btrfs_end_transaction(trans);
47046d13f549SDavid Sterba 		return ret;
4705f7a81ea4SStefan Behrens 	}
4706f7a81ea4SStefan Behrens 
4707f7a81ea4SStefan Behrens 	fs_info->uuid_root = uuid_root;
4708f7a81ea4SStefan Behrens 
47093a45bb20SJeff Mahoney 	ret = btrfs_commit_transaction(trans);
4710803b2f54SStefan Behrens 	if (ret)
4711803b2f54SStefan Behrens 		return ret;
4712803b2f54SStefan Behrens 
4713803b2f54SStefan Behrens 	down(&fs_info->uuid_tree_rescan_sem);
4714803b2f54SStefan Behrens 	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4715803b2f54SStefan Behrens 	if (IS_ERR(task)) {
471670f80175SStefan Behrens 		/* fs_info->update_uuid_tree_gen remains 0 in all error case */
4717efe120a0SFrank Holton 		btrfs_warn(fs_info, "failed to start uuid_scan task");
4718803b2f54SStefan Behrens 		up(&fs_info->uuid_tree_rescan_sem);
4719803b2f54SStefan Behrens 		return PTR_ERR(task);
4720f7a81ea4SStefan Behrens 	}
4721803b2f54SStefan Behrens 
4722803b2f54SStefan Behrens 	return 0;
4723803b2f54SStefan Behrens }
4724803b2f54SStefan Behrens 
/*
 * shrinking a device means finding all of the device extents past
 * the new size, and then following the back refs to the chunks.
 * The chunk relocation code actually frees the device extent
 *
 * On success the in-memory and on-disk device sizes and the superblock
 * total are updated; on failure the in-memory sizes are rolled back.
 */
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 length;
	u64 chunk_offset;
	int ret;
	int slot;
	int failed = 0;
	bool retried = false;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	u64 old_total = btrfs_super_total_bytes(super_copy);
	u64 old_size = btrfs_device_get_total_bytes(device);
	u64 diff;
	u64 start;

	/* Work in sector-aligned sizes */
	new_size = round_down(new_size, fs_info->sectorsize);
	start = new_size;
	diff = round_down(old_size - new_size, fs_info->sectorsize);

	/* A replace target device must not be resized */
	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return -EINVAL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_BACK;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	mutex_lock(&fs_info->chunk_mutex);

	/* Shrink the in-memory size first so no new allocations land in
	 * the region being removed. */
	btrfs_device_set_total_bytes(device, new_size);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		device->fs_devices->total_rw_bytes -= diff;
		atomic64_sub(diff, &fs_info->free_chunk_space);
	}

	/*
	 * Once the device's size has been set to the new size, ensure all
	 * in-memory chunks are synced to disk so that the loop below sees them
	 * and relocates them accordingly.
	 */
	if (contains_pending_extent(device, &start, diff)) {
		mutex_unlock(&fs_info->chunk_mutex);
		ret = btrfs_commit_transaction(trans);
		if (ret)
			goto done;
	} else {
		mutex_unlock(&fs_info->chunk_mutex);
		btrfs_end_transaction(trans);
	}

again:
	/* Walk this device's extents backwards from the end */
	key.objectid = device->devid;
	key.offset = (u64)-1;
	key.type = BTRFS_DEV_EXTENT_KEY;

	do {
		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto done;
		}

		ret = btrfs_previous_item(root, path, 0, key.type);
		if (ret) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			if (ret < 0)
				goto done;
			ret = 0;
			btrfs_release_path(path);
			break;
		}

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, path->slots[0]);

		/* Crossed into another device's extents: all done */
		if (key.objectid != device->devid) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			btrfs_release_path(path);
			break;
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		/* Extent fits entirely below the new size: nothing more to
		 * relocate (we walk from the end of the device). */
		if (key.offset + length <= new_size) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			btrfs_release_path(path);
			break;
		}

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
		btrfs_release_path(path);

		/*
		 * We may be relocating the only data chunk we have,
		 * which could potentially end up with losing data's
		 * raid profile, so lets allocate an empty one in
		 * advance.
		 */
		ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto done;
		}

		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
		mutex_unlock(&fs_info->reclaim_bgs_lock);
		if (ret == -ENOSPC) {
			/* Remember the failure and keep going; retried once
			 * below after other chunks have been moved. */
			failed++;
		} else if (ret) {
			if (ret == -ETXTBSY) {
				btrfs_warn(fs_info,
		   "could not shrink block group %llu due to active swapfile",
					   chunk_offset);
			}
			goto done;
		}
	} while (key.offset-- > 0);

	/* One full retry pass for chunks that failed with ENOSPC */
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (failed && retried) {
		ret = -ENOSPC;
		goto done;
	}

	/* Shrinking succeeded, else we would be at "done". */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto done;
	}

	mutex_lock(&fs_info->chunk_mutex);
	/* Clear all state bits beyond the shrunk device size */
	clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
			  CHUNK_STATE_MASK);

	btrfs_device_set_disk_total_bytes(device, new_size);
	if (list_empty(&device->post_commit_list))
		list_add_tail(&device->post_commit_list,
			      &trans->transaction->dev_update_list);

	WARN_ON(diff > old_total);
	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total - diff, fs_info->sectorsize));
	mutex_unlock(&fs_info->chunk_mutex);

	/* Now btrfs_update_device() will change the on-disk size. */
	ret = btrfs_update_device(trans, device);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	} else {
		ret = btrfs_commit_transaction(trans);
	}
done:
	btrfs_free_path(path);
	if (ret) {
		/* Roll back the in-memory shrink performed above */
		mutex_lock(&fs_info->chunk_mutex);
		btrfs_device_set_total_bytes(device, old_size);
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
			device->fs_devices->total_rw_bytes += diff;
		atomic64_add(diff, &fs_info->free_chunk_space);
		mutex_unlock(&fs_info->chunk_mutex);
	}
	return ret;
}
49158f18cf13SChris Mason 
49162ff7e61eSJeff Mahoney static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
49170b86a832SChris Mason 			   struct btrfs_key *key,
49180b86a832SChris Mason 			   struct btrfs_chunk *chunk, int item_size)
49190b86a832SChris Mason {
49200b246afaSJeff Mahoney 	struct btrfs_super_block *super_copy = fs_info->super_copy;
49210b86a832SChris Mason 	struct btrfs_disk_key disk_key;
49220b86a832SChris Mason 	u32 array_size;
49230b86a832SChris Mason 	u8 *ptr;
49240b86a832SChris Mason 
492579bd3712SFilipe Manana 	lockdep_assert_held(&fs_info->chunk_mutex);
492679bd3712SFilipe Manana 
49270b86a832SChris Mason 	array_size = btrfs_super_sys_array_size(super_copy);
49285f43f86eSGui Hecheng 	if (array_size + item_size + sizeof(disk_key)
492979bd3712SFilipe Manana 			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
49300b86a832SChris Mason 		return -EFBIG;
49310b86a832SChris Mason 
49320b86a832SChris Mason 	ptr = super_copy->sys_chunk_array + array_size;
49330b86a832SChris Mason 	btrfs_cpu_key_to_disk(&disk_key, key);
49340b86a832SChris Mason 	memcpy(ptr, &disk_key, sizeof(disk_key));
49350b86a832SChris Mason 	ptr += sizeof(disk_key);
49360b86a832SChris Mason 	memcpy(ptr, chunk, item_size);
49370b86a832SChris Mason 	item_size += sizeof(disk_key);
49380b86a832SChris Mason 	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4939fe48a5c0SMiao Xie 
49400b86a832SChris Mason 	return 0;
49410b86a832SChris Mason }
49420b86a832SChris Mason 
49439f680ce0SChris Mason /*
494473c5de00SArne Jansen  * sort the devices in descending order by max_avail, total_avail
49459f680ce0SChris Mason  */
494673c5de00SArne Jansen static int btrfs_cmp_device_info(const void *a, const void *b)
49472b82032cSYan Zheng {
494873c5de00SArne Jansen 	const struct btrfs_device_info *di_a = a;
494973c5de00SArne Jansen 	const struct btrfs_device_info *di_b = b;
49502b82032cSYan Zheng 
495173c5de00SArne Jansen 	if (di_a->max_avail > di_b->max_avail)
4952a40a90a0SChris Mason 		return -1;
495373c5de00SArne Jansen 	if (di_a->max_avail < di_b->max_avail)
49549b3f68b9SChris Mason 		return 1;
495573c5de00SArne Jansen 	if (di_a->total_avail > di_b->total_avail)
495673c5de00SArne Jansen 		return -1;
495773c5de00SArne Jansen 	if (di_a->total_avail < di_b->total_avail)
495873c5de00SArne Jansen 		return 1;
4959b2117a39SMiao Xie 	return 0;
4960b2117a39SMiao Xie }
4961b2117a39SMiao Xie 
496253b381b3SDavid Woodhouse static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
496353b381b3SDavid Woodhouse {
4964ffe2d203SZhao Lei 	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
496553b381b3SDavid Woodhouse 		return;
496653b381b3SDavid Woodhouse 
4967ceda0864SMiao Xie 	btrfs_set_fs_incompat(info, RAID56);
496853b381b3SDavid Woodhouse }
496953b381b3SDavid Woodhouse 
4970cfbb825cSDavid Sterba static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
4971cfbb825cSDavid Sterba {
4972cfbb825cSDavid Sterba 	if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
4973cfbb825cSDavid Sterba 		return;
4974cfbb825cSDavid Sterba 
4975cfbb825cSDavid Sterba 	btrfs_set_fs_incompat(info, RAID1C34);
4976cfbb825cSDavid Sterba }
4977cfbb825cSDavid Sterba 
/*
 * Structure used internally for btrfs_create_chunk() function.
 * Wraps needed parameters.
 */
struct alloc_chunk_ctl {
	/* Logical address where the new chunk starts (from find_next_chunk()) */
	u64 start;
	/* Block group type flags of the chunk (BTRFS_BLOCK_GROUP_*) */
	u64 type;
	/* Total number of stripes to allocate */
	int num_stripes;
	/* sub_stripes info for map */
	int sub_stripes;
	/* Stripes per device */
	int dev_stripes;
	/* Maximum number of devices to use */
	int devs_max;
	/* Minimum number of devices to use */
	int devs_min;
	/* ndevs has to be a multiple of this */
	int devs_increment;
	/* Number of copies */
	int ncopies;
	/* Number of stripes worth of bytes to store parity information */
	int nparity;
	/* Upper bound on the size of one stripe on a single device */
	u64 max_stripe_size;
	/* Upper bound on the logical size of the whole chunk */
	u64 max_chunk_size;
	/* Minimum free space a device must offer to be considered */
	u64 dev_extent_min;
	/* Resulting stripe size, set by decide_stripe_size() */
	u64 stripe_size;
	/* Resulting logical chunk size, set by decide_stripe_size() */
	u64 chunk_size;
	/* Number of devices actually selected for the allocation */
	int ndevs;
};
50084f2bafe8SNaohiro Aota 
500927c314d5SNaohiro Aota static void init_alloc_chunk_ctl_policy_regular(
501027c314d5SNaohiro Aota 				struct btrfs_fs_devices *fs_devices,
501127c314d5SNaohiro Aota 				struct alloc_chunk_ctl *ctl)
501227c314d5SNaohiro Aota {
501327c314d5SNaohiro Aota 	u64 type = ctl->type;
501427c314d5SNaohiro Aota 
501527c314d5SNaohiro Aota 	if (type & BTRFS_BLOCK_GROUP_DATA) {
501627c314d5SNaohiro Aota 		ctl->max_stripe_size = SZ_1G;
501727c314d5SNaohiro Aota 		ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
501827c314d5SNaohiro Aota 	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
501927c314d5SNaohiro Aota 		/* For larger filesystems, use larger metadata chunks */
502027c314d5SNaohiro Aota 		if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
502127c314d5SNaohiro Aota 			ctl->max_stripe_size = SZ_1G;
502227c314d5SNaohiro Aota 		else
502327c314d5SNaohiro Aota 			ctl->max_stripe_size = SZ_256M;
502427c314d5SNaohiro Aota 		ctl->max_chunk_size = ctl->max_stripe_size;
502527c314d5SNaohiro Aota 	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
502627c314d5SNaohiro Aota 		ctl->max_stripe_size = SZ_32M;
502727c314d5SNaohiro Aota 		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
502827c314d5SNaohiro Aota 		ctl->devs_max = min_t(int, ctl->devs_max,
502927c314d5SNaohiro Aota 				      BTRFS_MAX_DEVS_SYS_CHUNK);
503027c314d5SNaohiro Aota 	} else {
503127c314d5SNaohiro Aota 		BUG();
503227c314d5SNaohiro Aota 	}
503327c314d5SNaohiro Aota 
503427c314d5SNaohiro Aota 	/* We don't want a chunk larger than 10% of writable space */
503527c314d5SNaohiro Aota 	ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
503627c314d5SNaohiro Aota 				  ctl->max_chunk_size);
50376aafb303SNaohiro Aota 	ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
503827c314d5SNaohiro Aota }
503927c314d5SNaohiro Aota 
/*
 * Fill in the size limits of @ctl for the zoned chunk allocation policy.
 * On zoned filesystems every device extent is exactly one zone, so all
 * limits are expressed in multiples of the zone size.
 */
static void init_alloc_chunk_ctl_policy_zoned(
				      struct btrfs_fs_devices *fs_devices,
				      struct alloc_chunk_ctl *ctl)
{
	u64 zone_size = fs_devices->fs_info->zone_size;
	u64 limit;
	int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
	int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
	/* Smallest chunk that still satisfies devs_min whole zones */
	u64 min_chunk_size = min_data_stripes * zone_size;
	u64 type = ctl->type;

	/* A stripe can never span more than a single zone */
	ctl->max_stripe_size = zone_size;
	if (type & BTRFS_BLOCK_GROUP_DATA) {
		ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
						 zone_size);
	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
		ctl->max_chunk_size = ctl->max_stripe_size;
	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
		ctl->devs_max = min_t(int, ctl->devs_max,
				      BTRFS_MAX_DEVS_SYS_CHUNK);
	} else {
		BUG();
	}

	/* We don't want a chunk larger than 10% of writable space */
	limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
			       zone_size),
		    min_chunk_size);
	ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
	ctl->dev_extent_min = zone_size * ctl->dev_stripes;
}
50721cd6121fSNaohiro Aota 
507327c314d5SNaohiro Aota static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
507427c314d5SNaohiro Aota 				 struct alloc_chunk_ctl *ctl)
507527c314d5SNaohiro Aota {
507627c314d5SNaohiro Aota 	int index = btrfs_bg_flags_to_raid_index(ctl->type);
507727c314d5SNaohiro Aota 
507827c314d5SNaohiro Aota 	ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
507927c314d5SNaohiro Aota 	ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
508027c314d5SNaohiro Aota 	ctl->devs_max = btrfs_raid_array[index].devs_max;
508127c314d5SNaohiro Aota 	if (!ctl->devs_max)
508227c314d5SNaohiro Aota 		ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
508327c314d5SNaohiro Aota 	ctl->devs_min = btrfs_raid_array[index].devs_min;
508427c314d5SNaohiro Aota 	ctl->devs_increment = btrfs_raid_array[index].devs_increment;
508527c314d5SNaohiro Aota 	ctl->ncopies = btrfs_raid_array[index].ncopies;
508627c314d5SNaohiro Aota 	ctl->nparity = btrfs_raid_array[index].nparity;
508727c314d5SNaohiro Aota 	ctl->ndevs = 0;
508827c314d5SNaohiro Aota 
508927c314d5SNaohiro Aota 	switch (fs_devices->chunk_alloc_policy) {
509027c314d5SNaohiro Aota 	case BTRFS_CHUNK_ALLOC_REGULAR:
509127c314d5SNaohiro Aota 		init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
509227c314d5SNaohiro Aota 		break;
50931cd6121fSNaohiro Aota 	case BTRFS_CHUNK_ALLOC_ZONED:
50941cd6121fSNaohiro Aota 		init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
50951cd6121fSNaohiro Aota 		break;
509627c314d5SNaohiro Aota 	default:
509727c314d5SNaohiro Aota 		BUG();
509827c314d5SNaohiro Aota 	}
509927c314d5SNaohiro Aota }
510027c314d5SNaohiro Aota 
/*
 * Collect per-device free space information for chunk allocation.
 *
 * Walks fs_devices->alloc_list, skips devices that are read-only, not in
 * the FS metadata, replace targets, or too small, and records usable hole
 * information into @devices_info.  Sets ctl->ndevs to the number of entries
 * filled and sorts them by btrfs_cmp_device_info().
 *
 * Returns 0 on success or a negative errno from find_free_dev_extent()
 * (other than -ENOSPC, which is handled by skipping the device).
 */
static int gather_device_info(struct btrfs_fs_devices *fs_devices,
			      struct alloc_chunk_ctl *ctl,
			      struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = fs_devices->fs_info;
	struct btrfs_device *device;
	u64 total_avail;
	/* Ideal extent size: one maximum stripe per dev_stripe */
	u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
	int ret;
	int ndevs = 0;
	u64 max_avail;
	u64 dev_offset;

	/*
	 * in the first pass through the devices list, we gather information
	 * about the available holes on each device.
	 */
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			WARN(1, KERN_ERR
			       "BTRFS: read-only device in alloc_list\n");
			continue;
		}

		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
					&device->dev_state) ||
		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
			continue;

		if (device->total_bytes > device->bytes_used)
			total_avail = device->total_bytes - device->bytes_used;
		else
			total_avail = 0;

		/* If there is no space on this device, skip it. */
		if (total_avail < ctl->dev_extent_min)
			continue;

		ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
					   &max_avail);
		if (ret && ret != -ENOSPC)
			return ret;

		/* ret == 0: a hole of the full wanted size was found */
		if (ret == 0)
			max_avail = dev_extent_want;

		if (max_avail < ctl->dev_extent_min) {
			if (btrfs_test_opt(info, ENOSPC_DEBUG))
				btrfs_debug(info,
			"%s: devid %llu has no free space, have=%llu want=%llu",
					    __func__, device->devid, max_avail,
					    ctl->dev_extent_min);
			continue;
		}

		/* devices_info has only rw_devices slots; don't overrun it */
		if (ndevs == fs_devices->rw_devices) {
			WARN(1, "%s: found more than %llu devices\n",
			     __func__, fs_devices->rw_devices);
			break;
		}
		devices_info[ndevs].dev_offset = dev_offset;
		devices_info[ndevs].max_avail = max_avail;
		devices_info[ndevs].total_avail = total_avail;
		devices_info[ndevs].dev = device;
		++ndevs;
	}
	ctl->ndevs = ndevs;

	/*
	 * now sort the devices by hole size / available space
	 */
	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
	     btrfs_cmp_device_info, NULL);

	return 0;
}
5177560156cbSNaohiro Aota 
51785badf512SNaohiro Aota static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
51795badf512SNaohiro Aota 				      struct btrfs_device_info *devices_info)
51805badf512SNaohiro Aota {
51815badf512SNaohiro Aota 	/* Number of stripes that count for block group size */
51825badf512SNaohiro Aota 	int data_stripes;
51835badf512SNaohiro Aota 
51845badf512SNaohiro Aota 	/*
51855badf512SNaohiro Aota 	 * The primary goal is to maximize the number of stripes, so use as
51865badf512SNaohiro Aota 	 * many devices as possible, even if the stripes are not maximum sized.
51875badf512SNaohiro Aota 	 *
51885badf512SNaohiro Aota 	 * The DUP profile stores more than one stripe per device, the
51895badf512SNaohiro Aota 	 * max_avail is the total size so we have to adjust.
51905badf512SNaohiro Aota 	 */
51915badf512SNaohiro Aota 	ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
51925badf512SNaohiro Aota 				   ctl->dev_stripes);
51935badf512SNaohiro Aota 	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
51945badf512SNaohiro Aota 
51955badf512SNaohiro Aota 	/* This will have to be fixed for RAID1 and RAID10 over more drives */
51965badf512SNaohiro Aota 	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
51975badf512SNaohiro Aota 
51985badf512SNaohiro Aota 	/*
51995badf512SNaohiro Aota 	 * Use the number of data stripes to figure out how big this chunk is
52005badf512SNaohiro Aota 	 * really going to be in terms of logical address space, and compare
52015badf512SNaohiro Aota 	 * that answer with the max chunk size. If it's higher, we try to
52025badf512SNaohiro Aota 	 * reduce stripe_size.
52035badf512SNaohiro Aota 	 */
52045badf512SNaohiro Aota 	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
52055badf512SNaohiro Aota 		/*
52065badf512SNaohiro Aota 		 * Reduce stripe_size, round it up to a 16MB boundary again and
52075badf512SNaohiro Aota 		 * then use it, unless it ends up being even bigger than the
52085badf512SNaohiro Aota 		 * previous value we had already.
52095badf512SNaohiro Aota 		 */
52105badf512SNaohiro Aota 		ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
52115badf512SNaohiro Aota 							data_stripes), SZ_16M),
52125badf512SNaohiro Aota 				       ctl->stripe_size);
52135badf512SNaohiro Aota 	}
52145badf512SNaohiro Aota 
52155badf512SNaohiro Aota 	/* Align to BTRFS_STRIPE_LEN */
52165badf512SNaohiro Aota 	ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
52175badf512SNaohiro Aota 	ctl->chunk_size = ctl->stripe_size * data_stripes;
52185badf512SNaohiro Aota 
52195badf512SNaohiro Aota 	return 0;
52205badf512SNaohiro Aota }
52215badf512SNaohiro Aota 
/*
 * Decide stripe_size, num_stripes and chunk_size for the zoned allocation
 * policy.  The stripe size is pinned to the zone size, so only the number
 * of devices can be reduced to respect max_chunk_size.
 *
 * Always returns 0.
 */
static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
				    struct btrfs_device_info *devices_info)
{
	u64 zone_size = devices_info[0].dev->zone_info->zone_size;
	/* Number of stripes that count for block group size */
	int data_stripes;

	/*
	 * It should hold because:
	 *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
	 */
	ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);

	ctl->stripe_size = zone_size;
	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;

	/* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */
	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
		/*
		 * Recompute ndevs so that the resulting chunk fits within
		 * max_chunk_size; the ASSERT below checks the invariant.
		 */
		ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
					     ctl->stripe_size) + ctl->nparity,
				     ctl->dev_stripes);
		ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
		data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
		ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
	}

	ctl->chunk_size = ctl->stripe_size * data_stripes;

	return 0;
}
52531cd6121fSNaohiro Aota 
52545badf512SNaohiro Aota static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
52555badf512SNaohiro Aota 			      struct alloc_chunk_ctl *ctl,
52565badf512SNaohiro Aota 			      struct btrfs_device_info *devices_info)
52575badf512SNaohiro Aota {
52585badf512SNaohiro Aota 	struct btrfs_fs_info *info = fs_devices->fs_info;
52595badf512SNaohiro Aota 
52605badf512SNaohiro Aota 	/*
52615badf512SNaohiro Aota 	 * Round down to number of usable stripes, devs_increment can be any
52625badf512SNaohiro Aota 	 * number so we can't use round_down() that requires power of 2, while
52635badf512SNaohiro Aota 	 * rounddown is safe.
52645badf512SNaohiro Aota 	 */
52655badf512SNaohiro Aota 	ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
52665badf512SNaohiro Aota 
52675badf512SNaohiro Aota 	if (ctl->ndevs < ctl->devs_min) {
52685badf512SNaohiro Aota 		if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
52695badf512SNaohiro Aota 			btrfs_debug(info,
52705badf512SNaohiro Aota 	"%s: not enough devices with free space: have=%d minimum required=%d",
52715badf512SNaohiro Aota 				    __func__, ctl->ndevs, ctl->devs_min);
52725badf512SNaohiro Aota 		}
52735badf512SNaohiro Aota 		return -ENOSPC;
52745badf512SNaohiro Aota 	}
52755badf512SNaohiro Aota 
52765badf512SNaohiro Aota 	ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
52775badf512SNaohiro Aota 
52785badf512SNaohiro Aota 	switch (fs_devices->chunk_alloc_policy) {
52795badf512SNaohiro Aota 	case BTRFS_CHUNK_ALLOC_REGULAR:
52805badf512SNaohiro Aota 		return decide_stripe_size_regular(ctl, devices_info);
52811cd6121fSNaohiro Aota 	case BTRFS_CHUNK_ALLOC_ZONED:
52821cd6121fSNaohiro Aota 		return decide_stripe_size_zoned(ctl, devices_info);
52835badf512SNaohiro Aota 	default:
52845badf512SNaohiro Aota 		BUG();
52855badf512SNaohiro Aota 	}
52865badf512SNaohiro Aota }
52875badf512SNaohiro Aota 
/*
 * Create the in-memory representation of a new chunk: build the stripe map,
 * insert the chunk extent map into the mapping tree, create the block group
 * and account the used bytes on every participating device.
 *
 * Returns the new block group or an ERR_PTR() on failure.  On failure after
 * the extent map was inserted, the map is removed again and both extent map
 * references are dropped.
 */
static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
			struct alloc_chunk_ctl *ctl,
			struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct map_lookup *map = NULL;
	struct extent_map_tree *em_tree;
	struct btrfs_block_group *block_group;
	struct extent_map *em;
	u64 start = ctl->start;
	u64 type = ctl->type;
	int ret;
	int i;
	int j;

	map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
	if (!map)
		return ERR_PTR(-ENOMEM);
	map->num_stripes = ctl->num_stripes;

	/* Lay out dev_stripes consecutive stripes on each chosen device */
	for (i = 0; i < ctl->ndevs; ++i) {
		for (j = 0; j < ctl->dev_stripes; ++j) {
			int s = i * ctl->dev_stripes + j;
			map->stripes[s].dev = devices_info[i].dev;
			map->stripes[s].physical = devices_info[i].dev_offset +
						   j * ctl->stripe_size;
		}
	}
	map->stripe_len = BTRFS_STRIPE_LEN;
	map->io_align = BTRFS_STRIPE_LEN;
	map->io_width = BTRFS_STRIPE_LEN;
	map->type = type;
	map->sub_stripes = ctl->sub_stripes;

	trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);

	em = alloc_extent_map();
	if (!em) {
		kfree(map);
		return ERR_PTR(-ENOMEM);
	}
	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	em->map_lookup = map;
	em->start = start;
	em->len = ctl->chunk_size;
	em->block_start = 0;
	em->block_len = em->len;
	em->orig_block_len = ctl->stripe_size;

	em_tree = &info->mapping_tree;
	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em, 0);
	if (ret) {
		write_unlock(&em_tree->lock);
		free_extent_map(em);
		return ERR_PTR(ret);
	}
	write_unlock(&em_tree->lock);

	block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
	if (IS_ERR(block_group))
		goto error_del_extent;

	/* Account the new stripes against each device's used bytes */
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *dev = map->stripes[i].dev;

		btrfs_device_set_bytes_used(dev,
					    dev->bytes_used + ctl->stripe_size);
		if (list_empty(&dev->post_commit_list))
			list_add_tail(&dev->post_commit_list,
				      &trans->transaction->dev_update_list);
	}

	atomic64_sub(ctl->stripe_size * map->num_stripes,
		     &info->free_chunk_space);

	/* Drop our local reference; the tree holds its own */
	free_extent_map(em);
	check_raid56_incompat_flag(info, type);
	check_raid1c34_incompat_flag(info, type);

	return block_group;

error_del_extent:
	write_lock(&em_tree->lock);
	remove_extent_mapping(em_tree, em);
	write_unlock(&em_tree->lock);

	/* One for our allocation */
	free_extent_map(em);
	/* One for the tree reference */
	free_extent_map(em);

	return block_group;
}
5382dce580caSNaohiro Aota 
/*
 * Allocate a new chunk of @type and create its block group: validate the
 * profile, gather free space information from all writable devices, decide
 * the stripe layout and hand off to create_chunk().
 *
 * Must be called with info->chunk_mutex held.  Returns the new block group
 * or an ERR_PTR() on failure.
 */
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
					    u64 type)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = info->fs_devices;
	struct btrfs_device_info *devices_info = NULL;
	struct alloc_chunk_ctl ctl;
	struct btrfs_block_group *block_group;
	int ret;

	lockdep_assert_held(&info->chunk_mutex);

	if (!alloc_profile_is_valid(type, 0)) {
		ASSERT(0);
		return ERR_PTR(-EINVAL);
	}

	/* No writable devices means nothing can be allocated */
	if (list_empty(&fs_devices->alloc_list)) {
		if (btrfs_test_opt(info, ENOSPC_DEBUG))
			btrfs_debug(info, "%s: no writable device", __func__);
		return ERR_PTR(-ENOSPC);
	}

	if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
		btrfs_err(info, "invalid chunk type 0x%llx requested", type);
		ASSERT(0);
		return ERR_PTR(-EINVAL);
	}

	ctl.start = find_next_chunk(info);
	ctl.type = type;
	init_alloc_chunk_ctl(fs_devices, &ctl);

	/* One slot per writable device; gather_device_info() fills them */
	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
			       GFP_NOFS);
	if (!devices_info)
		return ERR_PTR(-ENOMEM);

	ret = gather_device_info(fs_devices, &ctl, devices_info);
	if (ret < 0) {
		block_group = ERR_PTR(ret);
		goto out;
	}

	ret = decide_stripe_size(fs_devices, &ctl, devices_info);
	if (ret < 0) {
		block_group = ERR_PTR(ret);
		goto out;
	}

	block_group = create_chunk(trans, &ctl, devices_info);

out:
	kfree(devices_info);
	return block_group;
}
54392b82032cSYan Zheng 
544011c67b1aSNikolay Borisov /*
544179bd3712SFilipe Manana  * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
544279bd3712SFilipe Manana  * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
544379bd3712SFilipe Manana  * chunks.
544479bd3712SFilipe Manana  *
544579bd3712SFilipe Manana  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
544679bd3712SFilipe Manana  * phases.
544779bd3712SFilipe Manana  */
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
				     struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_key key;
	struct btrfs_chunk *chunk;
	struct btrfs_stripe *stripe;
	struct extent_map *em;
	struct map_lookup *map;
	size_t item_size;
	int i;
	int ret;

	/*
	 * We take the chunk_mutex for 2 reasons:
	 *
	 * 1) Updates and insertions in the chunk btree must be done while holding
	 *    the chunk_mutex, as well as updating the system chunk array in the
	 *    superblock. See the comment on top of btrfs_chunk_alloc() for the
	 *    details;
	 *
	 * 2) To prevent races with the final phase of a device replace operation
	 *    that replaces the device object associated with the map's stripes,
	 *    because the device object's id can change at any time during that
	 *    final phase of the device replace operation
	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
	 *    replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
	 *    which would cause a failure when updating the device item, which does
	 *    not exists, or persisting a stripe of the chunk item with such ID.
	 *    Here we can't use the device_list_mutex because our caller already
	 *    has locked the chunk_mutex, and the final phase of device replace
	 *    acquires both mutexes - first the device_list_mutex and then the
	 *    chunk_mutex. Using any of those two mutexes protects us from a
	 *    concurrent device replace.
	 */
	lockdep_assert_held(&fs_info->chunk_mutex);

	em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	map = em->map_lookup;
	/* The chunk item size depends on the number of stripes. */
	item_size = btrfs_chunk_item_size(map->num_stripes);

	chunk = kzalloc(item_size, GFP_NOFS);
	if (!chunk) {
		ret = -ENOMEM;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	/*
	 * Update the device items first, since each stripe of the new chunk
	 * lives on one of these devices.
	 *
	 * NOTE(review): unlike the earlier failures above, an error from
	 * btrfs_update_device() does not abort the transaction here —
	 * presumably the callee or caller handles that; confirm.
	 */
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;

		ret = btrfs_update_device(trans, device);
		if (ret)
			goto out;
	}

	/* Fill the chunk item's stripe array from the in-memory map. */
	stripe = &chunk->stripe;
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		const u64 dev_offset = map->stripes[i].physical;

		btrfs_set_stack_stripe_devid(stripe, device->devid);
		btrfs_set_stack_stripe_offset(stripe, dev_offset);
		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
		stripe++;
	}

	btrfs_set_stack_chunk_length(chunk, bg->length);
	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
	btrfs_set_stack_chunk_type(chunk, map->type);
	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);

	/* Chunk items are keyed by their logical start offset. */
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = bg->start;

	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
	if (ret)
		goto out;

	bg->chunk_item_inserted = 1;

	/*
	 * System chunks are also mirrored into the superblock's system chunk
	 * array (see reason 1 in the locking comment above).
	 */
	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
		if (ret)
			goto out;
	}

out:
	kfree(chunk);
	free_extent_map(em);
	return ret;
}
55542b82032cSYan Zheng 
55556f8e0fc7SDavid Sterba static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
55562b82032cSYan Zheng {
55576f8e0fc7SDavid Sterba 	struct btrfs_fs_info *fs_info = trans->fs_info;
55582b82032cSYan Zheng 	u64 alloc_profile;
555979bd3712SFilipe Manana 	struct btrfs_block_group *meta_bg;
556079bd3712SFilipe Manana 	struct btrfs_block_group *sys_bg;
556179bd3712SFilipe Manana 
556279bd3712SFilipe Manana 	/*
556379bd3712SFilipe Manana 	 * When adding a new device for sprouting, the seed device is read-only
556479bd3712SFilipe Manana 	 * so we must first allocate a metadata and a system chunk. But before
556579bd3712SFilipe Manana 	 * adding the block group items to the extent, device and chunk btrees,
556679bd3712SFilipe Manana 	 * we must first:
556779bd3712SFilipe Manana 	 *
556879bd3712SFilipe Manana 	 * 1) Create both chunks without doing any changes to the btrees, as
556979bd3712SFilipe Manana 	 *    otherwise we would get -ENOSPC since the block groups from the
557079bd3712SFilipe Manana 	 *    seed device are read-only;
557179bd3712SFilipe Manana 	 *
557279bd3712SFilipe Manana 	 * 2) Add the device item for the new sprout device - finishing the setup
557379bd3712SFilipe Manana 	 *    of a new block group requires updating the device item in the chunk
557479bd3712SFilipe Manana 	 *    btree, so it must exist when we attempt to do it. The previous step
557579bd3712SFilipe Manana 	 *    ensures this does not fail with -ENOSPC.
557679bd3712SFilipe Manana 	 *
557779bd3712SFilipe Manana 	 * After that we can add the block group items to their btrees:
557879bd3712SFilipe Manana 	 * update existing device item in the chunk btree, add a new block group
557979bd3712SFilipe Manana 	 * item to the extent btree, add a new chunk item to the chunk btree and
558079bd3712SFilipe Manana 	 * finally add the new device extent items to the devices btree.
558179bd3712SFilipe Manana 	 */
55822b82032cSYan Zheng 
55831b86826dSJeff Mahoney 	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5584f6f39f7aSNikolay Borisov 	meta_bg = btrfs_create_chunk(trans, alloc_profile);
558579bd3712SFilipe Manana 	if (IS_ERR(meta_bg))
558679bd3712SFilipe Manana 		return PTR_ERR(meta_bg);
55872b82032cSYan Zheng 
55881b86826dSJeff Mahoney 	alloc_profile = btrfs_system_alloc_profile(fs_info);
5589f6f39f7aSNikolay Borisov 	sys_bg = btrfs_create_chunk(trans, alloc_profile);
559079bd3712SFilipe Manana 	if (IS_ERR(sys_bg))
559179bd3712SFilipe Manana 		return PTR_ERR(sys_bg);
559279bd3712SFilipe Manana 
559379bd3712SFilipe Manana 	return 0;
5594005d6427SDavid Sterba }
55952b82032cSYan Zheng 
5596d20983b4SMiao Xie static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5597d20983b4SMiao Xie {
5598fc9a2ac7SDavid Sterba 	const int index = btrfs_bg_flags_to_raid_index(map->type);
5599d20983b4SMiao Xie 
5600fc9a2ac7SDavid Sterba 	return btrfs_raid_array[index].tolerated_failures;
56012b82032cSYan Zheng }
56022b82032cSYan Zheng 
5603a09f23c3SAnand Jain bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
56042b82032cSYan Zheng {
56052b82032cSYan Zheng 	struct extent_map *em;
56062b82032cSYan Zheng 	struct map_lookup *map;
5607d20983b4SMiao Xie 	int miss_ndevs = 0;
56082b82032cSYan Zheng 	int i;
5609a09f23c3SAnand Jain 	bool ret = true;
56102b82032cSYan Zheng 
561160ca842eSOmar Sandoval 	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5612592d92eeSLiu Bo 	if (IS_ERR(em))
5613a09f23c3SAnand Jain 		return false;
56142b82032cSYan Zheng 
561595617d69SJeff Mahoney 	map = em->map_lookup;
56162b82032cSYan Zheng 	for (i = 0; i < map->num_stripes; i++) {
5617e6e674bdSAnand Jain 		if (test_bit(BTRFS_DEV_STATE_MISSING,
5618e6e674bdSAnand Jain 					&map->stripes[i].dev->dev_state)) {
5619d20983b4SMiao Xie 			miss_ndevs++;
5620d20983b4SMiao Xie 			continue;
5621d20983b4SMiao Xie 		}
5622ebbede42SAnand Jain 		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5623ebbede42SAnand Jain 					&map->stripes[i].dev->dev_state)) {
5624a09f23c3SAnand Jain 			ret = false;
5625d20983b4SMiao Xie 			goto end;
56262b82032cSYan Zheng 		}
56272b82032cSYan Zheng 	}
5628d20983b4SMiao Xie 
5629d20983b4SMiao Xie 	/*
5630a09f23c3SAnand Jain 	 * If the number of missing devices is larger than max errors, we can
5631a09f23c3SAnand Jain 	 * not write the data into that chunk successfully.
5632d20983b4SMiao Xie 	 */
5633d20983b4SMiao Xie 	if (miss_ndevs > btrfs_chunk_max_errors(map))
5634a09f23c3SAnand Jain 		ret = false;
5635d20983b4SMiao Xie end:
56362b82032cSYan Zheng 	free_extent_map(em);
5637a09f23c3SAnand Jain 	return ret;
56380b86a832SChris Mason }
56390b86a832SChris Mason 
5640c8bf1b67SDavid Sterba void btrfs_mapping_tree_free(struct extent_map_tree *tree)
56410b86a832SChris Mason {
56420b86a832SChris Mason 	struct extent_map *em;
56430b86a832SChris Mason 
56440b86a832SChris Mason 	while (1) {
5645c8bf1b67SDavid Sterba 		write_lock(&tree->lock);
5646c8bf1b67SDavid Sterba 		em = lookup_extent_mapping(tree, 0, (u64)-1);
56470b86a832SChris Mason 		if (em)
5648c8bf1b67SDavid Sterba 			remove_extent_mapping(tree, em);
5649c8bf1b67SDavid Sterba 		write_unlock(&tree->lock);
56500b86a832SChris Mason 		if (!em)
56510b86a832SChris Mason 			break;
56520b86a832SChris Mason 		/* once for us */
56530b86a832SChris Mason 		free_extent_map(em);
56540b86a832SChris Mason 		/* once for the tree */
56550b86a832SChris Mason 		free_extent_map(em);
56560b86a832SChris Mason 	}
56570b86a832SChris Mason }
56580b86a832SChris Mason 
/*
 * Return the number of copies (read mirrors) available for the given
 * logical range.  Callers use this as a retry bound, so lookup failures
 * report 1 copy rather than an error.
 */
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct extent_map *em;
	struct map_lookup *map;
	int ret;

	em = btrfs_get_chunk_map(fs_info, logical, len);
	if (IS_ERR(em))
		/*
		 * We could return errors for these cases, but that could get
		 * ugly and we'd probably do the same thing which is just not do
		 * anything else and exit, so return 1 so the callers don't try
		 * to use other copies.
		 */
		return 1;

	map = em->map_lookup;
	/* DUP and the RAID1 variants keep one full copy per stripe. */
	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
		ret = map->num_stripes;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		ret = map->sub_stripes;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
		ret = 2;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
		/*
		 * There could be two corrupted data stripes, we need
		 * to loop retry in order to rebuild the correct data.
		 *
		 * Fail a stripe at a time on every retry except the
		 * stripe under reconstruction.
		 */
		ret = map->num_stripes;
	else
		ret = 1;
	free_extent_map(em);

	/* An ongoing device replace adds its target as one more read source. */
	down_read(&fs_info->dev_replace.rwsem);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
	    fs_info->dev_replace.tgtdev)
		ret++;
	up_read(&fs_info->dev_replace.rwsem);

	return ret;
}
5703f188591eSChris Mason 
57042ff7e61eSJeff Mahoney unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
570553b381b3SDavid Woodhouse 				    u64 logical)
570653b381b3SDavid Woodhouse {
570753b381b3SDavid Woodhouse 	struct extent_map *em;
570853b381b3SDavid Woodhouse 	struct map_lookup *map;
57090b246afaSJeff Mahoney 	unsigned long len = fs_info->sectorsize;
571053b381b3SDavid Woodhouse 
571160ca842eSOmar Sandoval 	em = btrfs_get_chunk_map(fs_info, logical, len);
571253b381b3SDavid Woodhouse 
571369f03f13SNikolay Borisov 	if (!WARN_ON(IS_ERR(em))) {
571495617d69SJeff Mahoney 		map = em->map_lookup;
5715ffe2d203SZhao Lei 		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
571653b381b3SDavid Woodhouse 			len = map->stripe_len * nr_data_stripes(map);
571753b381b3SDavid Woodhouse 		free_extent_map(em);
571869f03f13SNikolay Borisov 	}
571953b381b3SDavid Woodhouse 	return len;
572053b381b3SDavid Woodhouse }
572153b381b3SDavid Woodhouse 
5722e4ff5fb5SNikolay Borisov int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
572353b381b3SDavid Woodhouse {
572453b381b3SDavid Woodhouse 	struct extent_map *em;
572553b381b3SDavid Woodhouse 	struct map_lookup *map;
572653b381b3SDavid Woodhouse 	int ret = 0;
572753b381b3SDavid Woodhouse 
572860ca842eSOmar Sandoval 	em = btrfs_get_chunk_map(fs_info, logical, len);
572953b381b3SDavid Woodhouse 
573069f03f13SNikolay Borisov 	if(!WARN_ON(IS_ERR(em))) {
573195617d69SJeff Mahoney 		map = em->map_lookup;
5732ffe2d203SZhao Lei 		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
573353b381b3SDavid Woodhouse 			ret = 1;
573453b381b3SDavid Woodhouse 		free_extent_map(em);
573569f03f13SNikolay Borisov 	}
573653b381b3SDavid Woodhouse 	return ret;
573753b381b3SDavid Woodhouse }
573853b381b3SDavid Woodhouse 
/*
 * Pick a live mirror (stripe index) to read from, for RAID1 variants and
 * RAID10.  The preferred mirror is chosen by the configured read policy;
 * devices without a bdev (missing) are skipped, and the dev-replace source
 * device is avoided unless it is the only usable mirror.
 */
static int find_live_mirror(struct btrfs_fs_info *fs_info,
			    struct map_lookup *map, int first,
			    int dev_replace_is_ongoing)
{
	int i;
	int num_stripes;
	int preferred_mirror;
	int tolerance;
	struct btrfs_device *srcdev;

	ASSERT((map->type &
		 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));

	/* RAID10: only sub_stripes of each group are distinct copies. */
	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		num_stripes = map->sub_stripes;
	else
		num_stripes = map->num_stripes;

	switch (fs_info->fs_devices->read_policy) {
	default:
		/* Shouldn't happen, just warn and use pid instead of failing */
		btrfs_warn_rl(fs_info,
			      "unknown read_policy type %u, reset to pid",
			      fs_info->fs_devices->read_policy);
		fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
		fallthrough;
	case BTRFS_READ_POLICY_PID:
		/* Spread readers across mirrors by process id. */
		preferred_mirror = first + (current->pid % num_stripes);
		break;
	}

	if (dev_replace_is_ongoing &&
	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
		srcdev = fs_info->dev_replace.srcdev;
	else
		srcdev = NULL;

	/*
	 * try to avoid the drive that is the source drive for a
	 * dev-replace procedure, only choose it if no other non-missing
	 * mirror is available
	 */
	for (tolerance = 0; tolerance < 2; tolerance++) {
		/* tolerance == 1 relaxes the srcdev-avoidance constraint. */
		if (map->stripes[preferred_mirror].dev->bdev &&
		    (tolerance || map->stripes[preferred_mirror].dev != srcdev))
			return preferred_mirror;
		for (i = first; i < first + num_stripes; i++) {
			if (map->stripes[i].dev->bdev &&
			    (tolerance || map->stripes[i].dev != srcdev))
				return i;
		}
	}

	/* we couldn't find one that doesn't fail.  Just return something
	 * and the io error handling code will clean up eventually
	 */
	return preferred_mirror;
}
5798dfe25020SChris Mason 
579953b381b3SDavid Woodhouse /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
58004c664611SQu Wenruo static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
580153b381b3SDavid Woodhouse {
580253b381b3SDavid Woodhouse 	int i;
580353b381b3SDavid Woodhouse 	int again = 1;
580453b381b3SDavid Woodhouse 
580553b381b3SDavid Woodhouse 	while (again) {
580653b381b3SDavid Woodhouse 		again = 0;
5807cc7539edSZhao Lei 		for (i = 0; i < num_stripes - 1; i++) {
5808eeb6f172SDavid Sterba 			/* Swap if parity is on a smaller index */
58094c664611SQu Wenruo 			if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
58104c664611SQu Wenruo 				swap(bioc->stripes[i], bioc->stripes[i + 1]);
58114c664611SQu Wenruo 				swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
581253b381b3SDavid Woodhouse 				again = 1;
581353b381b3SDavid Woodhouse 			}
581453b381b3SDavid Woodhouse 		}
581553b381b3SDavid Woodhouse 	}
581653b381b3SDavid Woodhouse }
581753b381b3SDavid Woodhouse 
/*
 * Allocate a btrfs_io_context plus its trailing variable-size arrays in a
 * single allocation: the stripes array (@total_stripes entries), the
 * tgtdev_map int array (@real_stripes entries) and the raid_map u64 array
 * (@total_stripes entries).  Because of __GFP_NOFAIL this never returns
 * NULL.
 */
static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
						       int total_stripes,
						       int real_stripes)
{
	struct btrfs_io_context *bioc = kzalloc(
		 /* The size of btrfs_io_context */
		sizeof(struct btrfs_io_context) +
		/* Plus the variable array for the stripes */
		sizeof(struct btrfs_io_stripe) * (total_stripes) +
		/* Plus the variable array for the tgt dev */
		sizeof(int) * (real_stripes) +
		/*
		 * Plus the raid_map, which includes both the tgt dev
		 * and the stripes.
		 */
		sizeof(u64) * (total_stripes),
		GFP_NOFS|__GFP_NOFAIL);

	atomic_set(&bioc->error, 0);
	/* The caller owns the initial reference; dropped via btrfs_put_bioc(). */
	refcount_set(&bioc->refs, 1);

	bioc->fs_info = fs_info;
	/* Point the auxiliary arrays into the tail of the single allocation. */
	bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
	bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);

	return bioc;
}
58456e9606d2SZhao Lei 
/*
 * Take an extra reference on @bioc.  Warns if the refcount already hit
 * zero, since that would mean reviving an object being freed.
 */
void btrfs_get_bioc(struct btrfs_io_context *bioc)
{
	WARN_ON(!refcount_read(&bioc->refs));
	refcount_inc(&bioc->refs);
}
58516e9606d2SZhao Lei 
58524c664611SQu Wenruo void btrfs_put_bioc(struct btrfs_io_context *bioc)
58536e9606d2SZhao Lei {
58544c664611SQu Wenruo 	if (!bioc)
58556e9606d2SZhao Lei 		return;
58564c664611SQu Wenruo 	if (refcount_dec_and_test(&bioc->refs))
58574c664611SQu Wenruo 		kfree(bioc);
58586e9606d2SZhao Lei }
58596e9606d2SZhao Lei 
/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
/*
 * Please note that, discard won't be sent to target device of device
 * replace.
 */
/*
 * Map a discard at @logical for up to *@length_ret bytes onto the physical
 * stripes it covers, returned in a freshly allocated btrfs_io_context.
 * *@length_ret is trimmed so the request does not cross the end of the
 * chunk containing @logical.
 */
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
					 u64 logical, u64 *length_ret,
					 struct btrfs_io_context **bioc_ret)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_io_context *bioc;
	u64 length = *length_ret;
	u64 offset;
	u64 stripe_nr;
	u64 stripe_nr_end;
	u64 stripe_end_offset;
	u64 stripe_cnt;
	u64 stripe_len;
	u64 stripe_offset;
	u64 num_stripes;
	u32 stripe_index;
	u32 factor = 0;
	u32 sub_stripes = 0;
	u64 stripes_per_dev = 0;
	u32 remaining_stripes = 0;
	u32 last_stripe = 0;
	int ret = 0;
	int i;

	/* Discard always returns a bioc. */
	ASSERT(bioc_ret);

	em = btrfs_get_chunk_map(fs_info, logical, length);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	/* we don't discard raid56 yet */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	offset = logical - em->start;
	/* Clamp the request to the end of this chunk. */
	length = min_t(u64, em->start + em->len - logical, length);
	*length_ret = length;

	stripe_len = map->stripe_len;
	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	stripe_nr = div64_u64(offset, stripe_len);

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_nr * stripe_len;

	/* One past the last stripe touched by the discard. */
	stripe_nr_end = round_up(offset + length, map->stripe_len);
	stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
	stripe_cnt = stripe_nr_end - stripe_nr;
	/* How far the discard's end falls short of the last stripe's end. */
	stripe_end_offset = stripe_nr_end * map->stripe_len -
			    (offset + length);
	/*
	 * after this, stripe_nr is the number of stripes on this
	 * device we have to walk to find the data, and stripe_index is
	 * the number of our device in the stripe array
	 */
	num_stripes = 1;
	stripe_index = 0;
	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			 BTRFS_BLOCK_GROUP_RAID10)) {
		if (map->type & BTRFS_BLOCK_GROUP_RAID0)
			sub_stripes = 1;
		else
			sub_stripes = map->sub_stripes;

		/* factor: distinct logical-stripe positions across devices. */
		factor = map->num_stripes / sub_stripes;
		num_stripes = min_t(u64, map->num_stripes,
				    sub_stripes * stripe_cnt);
		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
		stripe_index *= sub_stripes;
		stripes_per_dev = div_u64_rem(stripe_cnt, factor,
					      &remaining_stripes);
		div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
		last_stripe *= sub_stripes;
	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
				BTRFS_BLOCK_GROUP_DUP)) {
		/* Mirrored profiles: every copy receives the full discard. */
		num_stripes = map->num_stripes;
	} else {
		/* SINGLE: one stripe on one device. */
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
					&stripe_index);
	}

	/* Effectively cannot fail: the allocator uses __GFP_NOFAIL. */
	bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0);
	if (!bioc) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < num_stripes; i++) {
		bioc->stripes[i].physical =
			map->stripes[stripe_index].physical +
			stripe_offset + stripe_nr * map->stripe_len;
		bioc->stripes[i].dev = map->stripes[stripe_index].dev;

		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
				 BTRFS_BLOCK_GROUP_RAID10)) {
			/* Base length, then adjust for partial edges below. */
			bioc->stripes[i].length = stripes_per_dev *
				map->stripe_len;

			if (i / sub_stripes < remaining_stripes)
				bioc->stripes[i].length += map->stripe_len;

			/*
			 * Special for the first stripe and
			 * the last stripe:
			 *
			 * |-------|...|-------|
			 *     |----------|
			 *    off     end_off
			 */
			if (i < sub_stripes)
				bioc->stripes[i].length -= stripe_offset;

			if (stripe_index >= last_stripe &&
			    stripe_index <= (last_stripe +
					     sub_stripes - 1))
				bioc->stripes[i].length -= stripe_end_offset;

			if (i == sub_stripes - 1)
				stripe_offset = 0;
		} else {
			bioc->stripes[i].length = length;
		}

		/* Walk the devices round-robin, advancing stripe_nr on wrap. */
		stripe_index++;
		if (stripe_index == map->num_stripes) {
			stripe_index = 0;
			stripe_nr++;
		}
	}

	*bioc_ret = bioc;
	bioc->map_type = map->type;
	bioc->num_stripes = num_stripes;
out:
	free_extent_map(em);
	return ret;
}
60100b3d4cd3SLiu Bo 
/*
 * In dev-replace case, for repair case (that's the only case where the mirror
 * is selected explicitly when calling btrfs_map_block), blocks left of the
 * left cursor can also be read from the target drive.
 *
 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
 * array of stripes.
 * For READ, it also needs to be supported using the same mirror number.
 *
 * If the requested block is not left of the left cursor, EIO is returned. This
 * can happen because btrfs_num_copies() returns one more in the dev-replace
 * case.
 */
static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
					 u64 logical, u64 length,
					 u64 srcdev_devid, int *mirror_num,
					 u64 *physical)
{
	struct btrfs_io_context *bioc = NULL;
	int num_stripes;
	int index_srcdev = 0;
	int found = 0;
	u64 physical_of_found = 0;
	int i;
	int ret = 0;

	/* Map the range once to learn the full set of read mirrors. */
	ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				logical, &length, &bioc, 0, 0);
	if (ret) {
		ASSERT(bioc == NULL);
		return ret;
	}

	num_stripes = bioc->num_stripes;
	if (*mirror_num > num_stripes) {
		/*
		 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
		 * that means that the requested area is not left of the left
		 * cursor
		 */
		btrfs_put_bioc(bioc);
		return -EIO;
	}

	/*
	 * process the rest of the function using the mirror_num of the source
	 * drive. Therefore look it up first.  At the end, patch the device
	 * pointer to the one of the target drive.
	 */
	for (i = 0; i < num_stripes; i++) {
		if (bioc->stripes[i].dev->devid != srcdev_devid)
			continue;

		/*
		 * In case of DUP, in order to keep it simple, only add the
		 * mirror with the lowest physical address
		 */
		if (found &&
		    physical_of_found <= bioc->stripes[i].physical)
			continue;

		index_srcdev = i;
		found = 1;
		physical_of_found = bioc->stripes[i].physical;
	}

	btrfs_put_bioc(bioc);

	/* The source device is expected among the mirrors we just mapped. */
	ASSERT(found);
	if (!found)
		return -EIO;

	/* Mirror numbers are 1-based. */
	*mirror_num = index_srcdev + 1;
	*physical = physical_of_found;
	return ret;
}
60875ab56090SLiu Bo 
60886143c23cSNaohiro Aota static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
60896143c23cSNaohiro Aota {
60906143c23cSNaohiro Aota 	struct btrfs_block_group *cache;
60916143c23cSNaohiro Aota 	bool ret;
60926143c23cSNaohiro Aota 
6093de17addcSNaohiro Aota 	/* Non zoned filesystem does not use "to_copy" flag */
60946143c23cSNaohiro Aota 	if (!btrfs_is_zoned(fs_info))
60956143c23cSNaohiro Aota 		return false;
60966143c23cSNaohiro Aota 
60976143c23cSNaohiro Aota 	cache = btrfs_lookup_block_group(fs_info, logical);
60986143c23cSNaohiro Aota 
60996143c23cSNaohiro Aota 	spin_lock(&cache->lock);
61006143c23cSNaohiro Aota 	ret = cache->to_copy;
61016143c23cSNaohiro Aota 	spin_unlock(&cache->lock);
61026143c23cSNaohiro Aota 
61036143c23cSNaohiro Aota 	btrfs_put_block_group(cache);
61046143c23cSNaohiro Aota 	return ret;
61056143c23cSNaohiro Aota }
61066143c23cSNaohiro Aota 
/*
 * Adjust a freshly built I/O mapping for an ongoing device replace.
 *
 * For BTRFS_MAP_WRITE, every stripe aimed at the source device gets a
 * duplicate stripe appended that targets the replace target device (the
 * io_context was allocated with room for these extras by the caller).
 * For BTRFS_MAP_GET_READ_MIRRORS, the target device is appended as one
 * additional read mirror of the source device's stripe.
 *
 * @op:              mapping operation; only WRITE and GET_READ_MIRRORS
 *                   modify the mapping here
 * @bioc_ret:        in/out io_context whose stripes/tgtdev_map are extended
 * @dev_replace:     the running replace operation (provides srcdev/tgtdev)
 * @logical:         logical address of the mapped extent
 * @num_stripes_ret: in/out stripe count, raised by the stripes added
 * @max_errors_ret:  in/out tolerated error count, raised once per added
 *                   write copy
 */
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
				      struct btrfs_io_context **bioc_ret,
				      struct btrfs_dev_replace *dev_replace,
				      u64 logical,
				      int *num_stripes_ret, int *max_errors_ret)
{
	struct btrfs_io_context *bioc = *bioc_ret;
	u64 srcdev_devid = dev_replace->srcdev->devid;
	int tgtdev_indexes = 0;
	int num_stripes = *num_stripes_ret;
	int max_errors = *max_errors_ret;
	int i;

	if (op == BTRFS_MAP_WRITE) {
		int index_where_to_add;

		/*
		 * A block group which has "to_copy" set will eventually be
		 * copied by the dev-replace process. We can avoid cloning IO
		 * here.
		 */
		if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
			return;

		/*
		 * duplicate the write operations while the dev replace
		 * procedure is running. Since the copying of the old disk to
		 * the new disk takes place at run time while the filesystem is
		 * mounted writable, the regular write operations to the old
		 * disk have to be duplicated to go to the new disk as well.
		 *
		 * Note that device->missing is handled by the caller, and that
		 * the write to the old disk is already set up in the stripes
		 * array.
		 */
		index_where_to_add = num_stripes;
		for (i = 0; i < num_stripes; i++) {
			if (bioc->stripes[i].dev->devid == srcdev_devid) {
				/* write to new disk, too */
				struct btrfs_io_stripe *new =
					bioc->stripes + index_where_to_add;
				struct btrfs_io_stripe *old =
					bioc->stripes + i;

				new->physical = old->physical;
				new->length = old->length;
				new->dev = dev_replace->tgtdev;
				bioc->tgtdev_map[i] = index_where_to_add;
				index_where_to_add++;
				max_errors++;
				tgtdev_indexes++;
			}
		}
		num_stripes = index_where_to_add;
	} else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
		int index_srcdev = 0;
		int found = 0;
		u64 physical_of_found = 0;

		/*
		 * During the dev-replace procedure, the target drive can also
		 * be used to read data in case it is needed to repair a corrupt
		 * block elsewhere. This is possible if the requested area is
		 * left of the left cursor. In this area, the target drive is a
		 * full copy of the source drive.
		 */
		for (i = 0; i < num_stripes; i++) {
			if (bioc->stripes[i].dev->devid == srcdev_devid) {
				/*
				 * In case of DUP, in order to keep it simple,
				 * only add the mirror with the lowest physical
				 * address
				 */
				if (found &&
				    physical_of_found <= bioc->stripes[i].physical)
					continue;
				index_srcdev = i;
				found = 1;
				physical_of_found = bioc->stripes[i].physical;
			}
		}
		if (found) {
			/* Append the target device as an extra read mirror. */
			struct btrfs_io_stripe *tgtdev_stripe =
				bioc->stripes + num_stripes;

			tgtdev_stripe->physical = physical_of_found;
			tgtdev_stripe->length =
				bioc->stripes[index_srcdev].length;
			tgtdev_stripe->dev = dev_replace->tgtdev;
			bioc->tgtdev_map[index_srcdev] = num_stripes;

			tgtdev_indexes++;
			num_stripes++;
		}
	}

	*num_stripes_ret = num_stripes;
	*max_errors_ret = max_errors;
	bioc->num_tgtdevs = tgtdev_indexes;
	*bioc_ret = bioc;
}
620773c0f228SLiu Bo 
62082b19a1feSLiu Bo static bool need_full_stripe(enum btrfs_map_op op)
62092b19a1feSLiu Bo {
62102b19a1feSLiu Bo 	return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
62112b19a1feSLiu Bo }
62122b19a1feSLiu Bo 
/*
 * Calculate the geometry of a particular (address, len) tuple. This
 * information is used to calculate how big a particular bio can get before it
 * straddles a stripe.
 *
 * @fs_info: the filesystem
 * @em:      mapping containing the logical extent
 * @op:      type of operation - write or read
 * @logical: address that we want to figure out the geometry of
 * @io_geom: pointer used to return values
 *
 * Returns < 0 if the stripe math is inconsistent with the chunk mapping,
 * which usually shouldn't happen unless @logical or the chunk item is
 * corrupted, 0 otherwise.
 */
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
			  enum btrfs_map_op op, u64 logical,
			  struct btrfs_io_geometry *io_geom)
{
	struct map_lookup *map;
	u64 len;
	u64 offset;
	u64 stripe_offset;
	u64 stripe_nr;
	u64 stripe_len;
	u64 raid56_full_stripe_start = (u64)-1;
	int data_stripes;

	/* Discards are mapped by __btrfs_map_block_for_discard() instead. */
	ASSERT(op != BTRFS_MAP_DISCARD);

	map = em->map_lookup;
	/* Offset of this logical address in the chunk */
	offset = logical - em->start;
	/* Len of a stripe in a chunk */
	stripe_len = map->stripe_len;
	/* Stripe where this block falls in */
	stripe_nr = div64_u64(offset, stripe_len);
	/* Offset of stripe in the chunk */
	stripe_offset = stripe_nr * stripe_len;
	if (offset < stripe_offset) {
		btrfs_crit(fs_info,
"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
			stripe_offset, offset, em->start, logical, stripe_len);
		return -EINVAL;
	}

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_offset;
	data_stripes = nr_data_stripes(map);

	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
		/* By default the bio may not cross a stripe boundary. */
		u64 max_len = stripe_len - stripe_offset;

		/*
		 * In case of raid56, we need to know the stripe aligned start
		 */
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
			unsigned long full_stripe_len = stripe_len * data_stripes;
			raid56_full_stripe_start = offset;

			/*
			 * Allow a write of a full stripe, but make sure we
			 * don't allow straddling of stripes
			 */
			raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
					full_stripe_len);
			raid56_full_stripe_start *= full_stripe_len;

			/*
			 * For writes to RAID[56], allow a full stripeset across
			 * all disks. For other RAID types and for RAID[56]
			 * reads, just allow a single stripe (on a single disk).
			 */
			if (op == BTRFS_MAP_WRITE) {
				max_len = stripe_len * data_stripes -
					  (offset - raid56_full_stripe_start);
			}
		}
		len = min_t(u64, em->len - offset, max_len);
	} else {
		/* Single profile: the whole remainder of the chunk is usable. */
		len = em->len - offset;
	}

	io_geom->len = len;
	io_geom->offset = offset;
	io_geom->stripe_len = stripe_len;
	io_geom->stripe_nr = stripe_nr;
	io_geom->stripe_offset = stripe_offset;
	io_geom->raid56_stripe_offset = raid56_full_stripe_start;

	return 0;
}
63045f141126SNikolay Borisov 
6305cf8cddd3SChristoph Hellwig static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
6306cf8cddd3SChristoph Hellwig 			     enum btrfs_map_op op,
6307cea9e445SChris Mason 			     u64 logical, u64 *length,
63084c664611SQu Wenruo 			     struct btrfs_io_context **bioc_ret,
63098e5cfb55SZhao Lei 			     int mirror_num, int need_raid_map)
63100b86a832SChris Mason {
63110b86a832SChris Mason 	struct extent_map *em;
63120b86a832SChris Mason 	struct map_lookup *map;
6313593060d7SChris Mason 	u64 stripe_offset;
6314593060d7SChris Mason 	u64 stripe_nr;
631553b381b3SDavid Woodhouse 	u64 stripe_len;
63169d644a62SDavid Sterba 	u32 stripe_index;
6317cff82672SDavid Sterba 	int data_stripes;
6318cea9e445SChris Mason 	int i;
6319de11cc12SLi Zefan 	int ret = 0;
6320f2d8d74dSChris Mason 	int num_stripes;
6321a236aed1SChris Mason 	int max_errors = 0;
63222c8cdd6eSMiao Xie 	int tgtdev_indexes = 0;
63234c664611SQu Wenruo 	struct btrfs_io_context *bioc = NULL;
6324472262f3SStefan Behrens 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
6325472262f3SStefan Behrens 	int dev_replace_is_ongoing = 0;
6326472262f3SStefan Behrens 	int num_alloc_stripes;
6327ad6d620eSStefan Behrens 	int patch_the_first_stripe_for_dev_replace = 0;
6328ad6d620eSStefan Behrens 	u64 physical_to_patch_in_first_stripe = 0;
632953b381b3SDavid Woodhouse 	u64 raid56_full_stripe_start = (u64)-1;
633089b798adSNikolay Borisov 	struct btrfs_io_geometry geom;
633189b798adSNikolay Borisov 
63324c664611SQu Wenruo 	ASSERT(bioc_ret);
633375fb2e9eSDavid Sterba 	ASSERT(op != BTRFS_MAP_DISCARD);
63340b3d4cd3SLiu Bo 
633542034313SMichal Rostecki 	em = btrfs_get_chunk_map(fs_info, logical, *length);
633642034313SMichal Rostecki 	ASSERT(!IS_ERR(em));
633742034313SMichal Rostecki 
633843c0d1a5SQu Wenruo 	ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom);
633989b798adSNikolay Borisov 	if (ret < 0)
634089b798adSNikolay Borisov 		return ret;
634189b798adSNikolay Borisov 
634295617d69SJeff Mahoney 	map = em->map_lookup;
6343593060d7SChris Mason 
634489b798adSNikolay Borisov 	*length = geom.len;
634589b798adSNikolay Borisov 	stripe_len = geom.stripe_len;
634689b798adSNikolay Borisov 	stripe_nr = geom.stripe_nr;
634789b798adSNikolay Borisov 	stripe_offset = geom.stripe_offset;
634889b798adSNikolay Borisov 	raid56_full_stripe_start = geom.raid56_stripe_offset;
6349cff82672SDavid Sterba 	data_stripes = nr_data_stripes(map);
6350593060d7SChris Mason 
6351cb5583ddSDavid Sterba 	down_read(&dev_replace->rwsem);
6352472262f3SStefan Behrens 	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
635353176ddeSDavid Sterba 	/*
635453176ddeSDavid Sterba 	 * Hold the semaphore for read during the whole operation, write is
635553176ddeSDavid Sterba 	 * requested at commit time but must wait.
635653176ddeSDavid Sterba 	 */
6357472262f3SStefan Behrens 	if (!dev_replace_is_ongoing)
6358cb5583ddSDavid Sterba 		up_read(&dev_replace->rwsem);
6359472262f3SStefan Behrens 
6360ad6d620eSStefan Behrens 	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
63612b19a1feSLiu Bo 	    !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
63625ab56090SLiu Bo 		ret = get_extra_mirror_from_replace(fs_info, logical, *length,
63635ab56090SLiu Bo 						    dev_replace->srcdev->devid,
63645ab56090SLiu Bo 						    &mirror_num,
63655ab56090SLiu Bo 					    &physical_to_patch_in_first_stripe);
63665ab56090SLiu Bo 		if (ret)
6367ad6d620eSStefan Behrens 			goto out;
63685ab56090SLiu Bo 		else
636994a97dfeSZhao Lei 			patch_the_first_stripe_for_dev_replace = 1;
6370ad6d620eSStefan Behrens 	} else if (mirror_num > map->num_stripes) {
6371ad6d620eSStefan Behrens 		mirror_num = 0;
6372ad6d620eSStefan Behrens 	}
6373ad6d620eSStefan Behrens 
6374f2d8d74dSChris Mason 	num_stripes = 1;
6375cea9e445SChris Mason 	stripe_index = 0;
6376fce3bb9aSLi Dongyang 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
637747c5713fSDavid Sterba 		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
637847c5713fSDavid Sterba 				&stripe_index);
6379de483734SAnand Jain 		if (!need_full_stripe(op))
638028e1cc7dSMiao Xie 			mirror_num = 1;
6381c7369b3fSDavid Sterba 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
6382de483734SAnand Jain 		if (need_full_stripe(op))
6383f2d8d74dSChris Mason 			num_stripes = map->num_stripes;
63842fff734fSChris Mason 		else if (mirror_num)
6385f188591eSChris Mason 			stripe_index = mirror_num - 1;
6386dfe25020SChris Mason 		else {
638730d9861fSStefan Behrens 			stripe_index = find_live_mirror(fs_info, map, 0,
638830d9861fSStefan Behrens 					    dev_replace_is_ongoing);
6389a1d3c478SJan Schmidt 			mirror_num = stripe_index + 1;
6390dfe25020SChris Mason 		}
63912fff734fSChris Mason 
6392611f0e00SChris Mason 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
6393de483734SAnand Jain 		if (need_full_stripe(op)) {
6394f2d8d74dSChris Mason 			num_stripes = map->num_stripes;
6395a1d3c478SJan Schmidt 		} else if (mirror_num) {
6396f188591eSChris Mason 			stripe_index = mirror_num - 1;
6397a1d3c478SJan Schmidt 		} else {
6398a1d3c478SJan Schmidt 			mirror_num = 1;
6399a1d3c478SJan Schmidt 		}
64002fff734fSChris Mason 
6401321aecc6SChris Mason 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
64029d644a62SDavid Sterba 		u32 factor = map->num_stripes / map->sub_stripes;
6403321aecc6SChris Mason 
640447c5713fSDavid Sterba 		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
6405321aecc6SChris Mason 		stripe_index *= map->sub_stripes;
6406321aecc6SChris Mason 
6407de483734SAnand Jain 		if (need_full_stripe(op))
6408f2d8d74dSChris Mason 			num_stripes = map->sub_stripes;
6409321aecc6SChris Mason 		else if (mirror_num)
6410321aecc6SChris Mason 			stripe_index += mirror_num - 1;
6411dfe25020SChris Mason 		else {
64123e74317aSJan Schmidt 			int old_stripe_index = stripe_index;
641330d9861fSStefan Behrens 			stripe_index = find_live_mirror(fs_info, map,
641430d9861fSStefan Behrens 					      stripe_index,
641530d9861fSStefan Behrens 					      dev_replace_is_ongoing);
64163e74317aSJan Schmidt 			mirror_num = stripe_index - old_stripe_index + 1;
6417dfe25020SChris Mason 		}
641853b381b3SDavid Woodhouse 
6419ffe2d203SZhao Lei 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6420de483734SAnand Jain 		if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
642153b381b3SDavid Woodhouse 			/* push stripe_nr back to the start of the full stripe */
642242c61ab6SLiu Bo 			stripe_nr = div64_u64(raid56_full_stripe_start,
6423cff82672SDavid Sterba 					stripe_len * data_stripes);
642453b381b3SDavid Woodhouse 
642553b381b3SDavid Woodhouse 			/* RAID[56] write or recovery. Return all stripes */
642653b381b3SDavid Woodhouse 			num_stripes = map->num_stripes;
642753b381b3SDavid Woodhouse 			max_errors = nr_parity_stripes(map);
642853b381b3SDavid Woodhouse 
642953b381b3SDavid Woodhouse 			*length = map->stripe_len;
643053b381b3SDavid Woodhouse 			stripe_index = 0;
643153b381b3SDavid Woodhouse 			stripe_offset = 0;
643253b381b3SDavid Woodhouse 		} else {
643353b381b3SDavid Woodhouse 			/*
643453b381b3SDavid Woodhouse 			 * Mirror #0 or #1 means the original data block.
643553b381b3SDavid Woodhouse 			 * Mirror #2 is RAID5 parity block.
643653b381b3SDavid Woodhouse 			 * Mirror #3 is RAID6 Q block.
643753b381b3SDavid Woodhouse 			 */
643847c5713fSDavid Sterba 			stripe_nr = div_u64_rem(stripe_nr,
6439cff82672SDavid Sterba 					data_stripes, &stripe_index);
644053b381b3SDavid Woodhouse 			if (mirror_num > 1)
6441cff82672SDavid Sterba 				stripe_index = data_stripes + mirror_num - 2;
644253b381b3SDavid Woodhouse 
644353b381b3SDavid Woodhouse 			/* We distribute the parity blocks across stripes */
644447c5713fSDavid Sterba 			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
644547c5713fSDavid Sterba 					&stripe_index);
6446de483734SAnand Jain 			if (!need_full_stripe(op) && mirror_num <= 1)
644728e1cc7dSMiao Xie 				mirror_num = 1;
644853b381b3SDavid Woodhouse 		}
64498790d502SChris Mason 	} else {
6450593060d7SChris Mason 		/*
645147c5713fSDavid Sterba 		 * after this, stripe_nr is the number of stripes on this
645247c5713fSDavid Sterba 		 * device we have to walk to find the data, and stripe_index is
645347c5713fSDavid Sterba 		 * the number of our device in the stripe array
6454593060d7SChris Mason 		 */
645547c5713fSDavid Sterba 		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
645647c5713fSDavid Sterba 				&stripe_index);
6457a1d3c478SJan Schmidt 		mirror_num = stripe_index + 1;
64588790d502SChris Mason 	}
6459e042d1ecSJosef Bacik 	if (stripe_index >= map->num_stripes) {
64605d163e0eSJeff Mahoney 		btrfs_crit(fs_info,
64615d163e0eSJeff Mahoney 			   "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
6462e042d1ecSJosef Bacik 			   stripe_index, map->num_stripes);
6463e042d1ecSJosef Bacik 		ret = -EINVAL;
6464e042d1ecSJosef Bacik 		goto out;
6465e042d1ecSJosef Bacik 	}
6466593060d7SChris Mason 
6467472262f3SStefan Behrens 	num_alloc_stripes = num_stripes;
64686fad823fSLiu Bo 	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
64690b3d4cd3SLiu Bo 		if (op == BTRFS_MAP_WRITE)
6470472262f3SStefan Behrens 			num_alloc_stripes <<= 1;
6471cf8cddd3SChristoph Hellwig 		if (op == BTRFS_MAP_GET_READ_MIRRORS)
6472ad6d620eSStefan Behrens 			num_alloc_stripes++;
64732c8cdd6eSMiao Xie 		tgtdev_indexes = num_stripes;
6474ad6d620eSStefan Behrens 	}
64752c8cdd6eSMiao Xie 
6476731ccf15SQu Wenruo 	bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
64774c664611SQu Wenruo 	if (!bioc) {
6478de11cc12SLi Zefan 		ret = -ENOMEM;
6479de11cc12SLi Zefan 		goto out;
6480de11cc12SLi Zefan 	}
6481608769a4SNikolay Borisov 
6482608769a4SNikolay Borisov 	for (i = 0; i < num_stripes; i++) {
64834c664611SQu Wenruo 		bioc->stripes[i].physical = map->stripes[stripe_index].physical +
6484608769a4SNikolay Borisov 			stripe_offset + stripe_nr * map->stripe_len;
64854c664611SQu Wenruo 		bioc->stripes[i].dev = map->stripes[stripe_index].dev;
6486608769a4SNikolay Borisov 		stripe_index++;
6487608769a4SNikolay Borisov 	}
6488de11cc12SLi Zefan 
64894c664611SQu Wenruo 	/* Build raid_map */
64902b19a1feSLiu Bo 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
64912b19a1feSLiu Bo 	    (need_full_stripe(op) || mirror_num > 1)) {
64928e5cfb55SZhao Lei 		u64 tmp;
64939d644a62SDavid Sterba 		unsigned rot;
64948e5cfb55SZhao Lei 
64958e5cfb55SZhao Lei 		/* Work out the disk rotation on this stripe-set */
649647c5713fSDavid Sterba 		div_u64_rem(stripe_nr, num_stripes, &rot);
64978e5cfb55SZhao Lei 
64988e5cfb55SZhao Lei 		/* Fill in the logical address of each stripe */
6499cff82672SDavid Sterba 		tmp = stripe_nr * data_stripes;
6500cff82672SDavid Sterba 		for (i = 0; i < data_stripes; i++)
65014c664611SQu Wenruo 			bioc->raid_map[(i + rot) % num_stripes] =
65028e5cfb55SZhao Lei 				em->start + (tmp + i) * map->stripe_len;
65038e5cfb55SZhao Lei 
65044c664611SQu Wenruo 		bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
65058e5cfb55SZhao Lei 		if (map->type & BTRFS_BLOCK_GROUP_RAID6)
65064c664611SQu Wenruo 			bioc->raid_map[(i + rot + 1) % num_stripes] =
65078e5cfb55SZhao Lei 				RAID6_Q_STRIPE;
65088e5cfb55SZhao Lei 
65094c664611SQu Wenruo 		sort_parity_stripes(bioc, num_stripes);
6510593060d7SChris Mason 	}
6511de11cc12SLi Zefan 
65122b19a1feSLiu Bo 	if (need_full_stripe(op))
6513d20983b4SMiao Xie 		max_errors = btrfs_chunk_max_errors(map);
6514de11cc12SLi Zefan 
651573c0f228SLiu Bo 	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
65162b19a1feSLiu Bo 	    need_full_stripe(op)) {
65174c664611SQu Wenruo 		handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
65186143c23cSNaohiro Aota 					  &num_stripes, &max_errors);
6519ad6d620eSStefan Behrens 	}
6520472262f3SStefan Behrens 
65214c664611SQu Wenruo 	*bioc_ret = bioc;
65224c664611SQu Wenruo 	bioc->map_type = map->type;
65234c664611SQu Wenruo 	bioc->num_stripes = num_stripes;
65244c664611SQu Wenruo 	bioc->max_errors = max_errors;
65254c664611SQu Wenruo 	bioc->mirror_num = mirror_num;
6526ad6d620eSStefan Behrens 
6527ad6d620eSStefan Behrens 	/*
6528ad6d620eSStefan Behrens 	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
6529ad6d620eSStefan Behrens 	 * mirror_num == num_stripes + 1 && dev_replace target drive is
6530ad6d620eSStefan Behrens 	 * available as a mirror
6531ad6d620eSStefan Behrens 	 */
6532ad6d620eSStefan Behrens 	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
6533ad6d620eSStefan Behrens 		WARN_ON(num_stripes > 1);
65344c664611SQu Wenruo 		bioc->stripes[0].dev = dev_replace->tgtdev;
65354c664611SQu Wenruo 		bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
65364c664611SQu Wenruo 		bioc->mirror_num = map->num_stripes + 1;
6537ad6d620eSStefan Behrens 	}
6538cea9e445SChris Mason out:
653973beece9SLiu Bo 	if (dev_replace_is_ongoing) {
654053176ddeSDavid Sterba 		lockdep_assert_held(&dev_replace->rwsem);
654153176ddeSDavid Sterba 		/* Unlock and let waiting writers proceed */
6542cb5583ddSDavid Sterba 		up_read(&dev_replace->rwsem);
654373beece9SLiu Bo 	}
65440b86a832SChris Mason 	free_extent_map(em);
6545de11cc12SLi Zefan 	return ret;
65460b86a832SChris Mason }
65470b86a832SChris Mason 
6548cf8cddd3SChristoph Hellwig int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6549f2d8d74dSChris Mason 		      u64 logical, u64 *length,
65504c664611SQu Wenruo 		      struct btrfs_io_context **bioc_ret, int mirror_num)
6551f2d8d74dSChris Mason {
655275fb2e9eSDavid Sterba 	if (op == BTRFS_MAP_DISCARD)
655375fb2e9eSDavid Sterba 		return __btrfs_map_block_for_discard(fs_info, logical,
65544c664611SQu Wenruo 						     length, bioc_ret);
655575fb2e9eSDavid Sterba 
65564c664611SQu Wenruo 	return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
65578e5cfb55SZhao Lei 				 mirror_num, 0);
6558f2d8d74dSChris Mason }
6559f2d8d74dSChris Mason 
6560af8e2d1dSMiao Xie /* For Scrub/replace */
6561cf8cddd3SChristoph Hellwig int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6562af8e2d1dSMiao Xie 		     u64 logical, u64 *length,
65634c664611SQu Wenruo 		     struct btrfs_io_context **bioc_ret)
6564af8e2d1dSMiao Xie {
65654c664611SQu Wenruo 	return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
6566af8e2d1dSMiao Xie }
6567af8e2d1dSMiao Xie 
65684c664611SQu Wenruo static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
65698408c716SMiao Xie {
65704c664611SQu Wenruo 	bio->bi_private = bioc->private;
65714c664611SQu Wenruo 	bio->bi_end_io = bioc->end_io;
65724246a0b6SChristoph Hellwig 	bio_endio(bio);
6573326e1dbbSMike Snitzer 
65744c664611SQu Wenruo 	btrfs_put_bioc(bioc);
65758408c716SMiao Xie }
65768408c716SMiao Xie 
/*
 * Per-stripe bio completion handler.
 *
 * Records device error statistics for failed stripes, and when the last
 * outstanding stripe of the io_context completes, decides the overall
 * status: OK if the error count stayed within bioc->max_errors, IOERR
 * otherwise. Only then is the original bio completed via btrfs_end_bioc().
 */
static void btrfs_end_bio(struct bio *bio)
{
	struct btrfs_io_context *bioc = bio->bi_private;
	int is_orig_bio = 0;

	if (bio->bi_status) {
		atomic_inc(&bioc->error);
		if (bio->bi_status == BLK_STS_IOERR ||
		    bio->bi_status == BLK_STS_TARGET) {
			struct btrfs_device *dev = btrfs_bio(bio)->device;

			/* Bump the matching per-device error counter. */
			ASSERT(dev->bdev);
			if (btrfs_op(bio) == BTRFS_MAP_WRITE)
				btrfs_dev_stat_inc_and_print(dev,
						BTRFS_DEV_STAT_WRITE_ERRS);
			else if (!(bio->bi_opf & REQ_RAHEAD))
				btrfs_dev_stat_inc_and_print(dev,
						BTRFS_DEV_STAT_READ_ERRS);
			if (bio->bi_opf & REQ_PREFLUSH)
				btrfs_dev_stat_inc_and_print(dev,
						BTRFS_DEV_STAT_FLUSH_ERRS);
		}
	}

	if (bio == bioc->orig_bio)
		is_orig_bio = 1;

	btrfs_bio_counter_dec(bioc->fs_info);

	/* Last stripe to finish completes the original bio. */
	if (atomic_dec_and_test(&bioc->stripes_pending)) {
		if (!is_orig_bio) {
			bio_put(bio);
			bio = bioc->orig_bio;
		}

		btrfs_bio(bio)->mirror_num = bioc->mirror_num;
		/* only send an error to the higher layers if it is
		 * beyond the tolerance of the btrfs bio
		 */
		if (atomic_read(&bioc->error) > bioc->max_errors) {
			bio->bi_status = BLK_STS_IOERR;
		} else {
			/*
			 * this bio is actually up to date, we didn't
			 * go over the max number of errors
			 */
			bio->bi_status = BLK_STS_OK;
		}

		btrfs_end_bioc(bioc, bio);
	} else if (!is_orig_bio) {
		bio_put(bio);
	}
}
66318790d502SChris Mason 
/*
 * Point @bio at one stripe of @bioc and submit it.
 *
 * Sets the completion context (btrfs_end_bio), the target sector and device,
 * and rewrites zone-append bios so bi_sector is the start of the zone (or
 * falls back to a regular write on conventional zones). Also bumps the
 * in-flight bio counter that btrfs_end_bio() decrements.
 *
 * @bioc:     io_context this stripe belongs to
 * @bio:      the bio (or clone) to submit for this stripe
 * @physical: physical byte offset on @dev
 * @dev:      target device
 */
static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
			      u64 physical, struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;

	bio->bi_private = bioc;
	btrfs_bio(bio)->device = dev;
	bio->bi_end_io = btrfs_end_bio;
	bio->bi_iter.bi_sector = physical >> 9;
	/*
	 * For zone append writing, bi_sector must point the beginning of the
	 * zone
	 */
	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		if (btrfs_dev_is_sequential(dev, physical)) {
			u64 zone_start = round_down(physical, fs_info->zone_size);

			bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
		} else {
			/* Conventional zone: emulate the append as a write. */
			bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
			bio->bi_opf |= REQ_OP_WRITE;
		}
	}
	btrfs_debug_in_rcu(fs_info,
	"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
		bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
		(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
		dev->devid, bio->bi_iter.bi_size);
	bio_set_dev(bio, dev->bdev);

	btrfs_bio_counter_inc_noblocked(fs_info);

	btrfsic_submit_bio(bio);
}
6666de1ee92aSJosef Bacik 
/*
 * Account a stripe that could not even be submitted (e.g. missing device).
 *
 * Bumps the io_context error count and, if this was the last outstanding
 * stripe, completes the original bio with a status derived from whether the
 * errors exceeded bioc->max_errors. bi_sector is reset to @logical because
 * the bio never reached a device.
 */
static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
{
	atomic_inc(&bioc->error);
	if (atomic_dec_and_test(&bioc->stripes_pending)) {
		/* Should be the original bio. */
		WARN_ON(bio != bioc->orig_bio);

		btrfs_bio(bio)->mirror_num = bioc->mirror_num;
		bio->bi_iter.bi_sector = logical >> 9;
		if (atomic_read(&bioc->error) > bioc->max_errors)
			bio->bi_status = BLK_STS_IOERR;
		else
			bio->bi_status = BLK_STS_OK;
		btrfs_end_bioc(bioc, bio);
	}
}
6683de1ee92aSJosef Bacik 
/*
 * Map a logical bio to its physical stripe(s) and submit it.
 *
 * @fs_info:    the filesystem
 * @bio:        bio whose bi_sector is a logical address; consumed here
 * @mirror_num: which copy to read; NOTE(review): 0 appears to let the
 *              mapping code choose and >1 forces a specific mirror —
 *              confirm against __btrfs_map_block()
 *
 * RAID5/6 writes and multi-mirror reads are handed whole to the raid56
 * layer.  Otherwise one bio per stripe is submitted: the original bio is
 * used for the last stripe and clones for the rest.  Stripes without a
 * usable device are failed through bioc_error().  The fs-wide bio counter
 * is held across the mapping/submission to block dev-replace.
 *
 * Returns BLK_STS_OK once all stripes are submitted, or an errno-derived
 * block status if the mapping step failed.
 */
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
			   int mirror_num)
{
	struct btrfs_device *dev;
	struct bio *first_bio = bio;
	u64 logical = bio->bi_iter.bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	int ret;
	int dev_nr;
	int total_devs;
	struct btrfs_io_context *bioc = NULL;

	length = bio->bi_iter.bi_size;
	map_length = length;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
				&map_length, &bioc, mirror_num, 1);
	if (ret) {
		btrfs_bio_counter_dec(fs_info);
		return errno_to_blk_status(ret);
	}

	/* Completion context for the per-stripe bios. */
	total_devs = bioc->num_stripes;
	bioc->orig_bio = first_bio;
	bioc->private = first_bio->bi_private;
	bioc->end_io = first_bio->bi_end_io;
	atomic_set(&bioc->stripes_pending, bioc->num_stripes);

	if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
	    ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
		/* In this case, map_length has been set to the length of
		   a single stripe; not the whole write */
		if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
			ret = raid56_parity_write(bio, bioc, map_length);
		} else {
			ret = raid56_parity_recover(bio, bioc, map_length,
						    mirror_num, 1);
		}

		btrfs_bio_counter_dec(fs_info);
		return errno_to_blk_status(ret);
	}

	/* A non-RAID56 mapping must always cover the whole bio. */
	if (map_length < length) {
		btrfs_crit(fs_info,
			   "mapping failed logical %llu bio len %llu len %llu",
			   logical, length, map_length);
		BUG();
	}

	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
		dev = bioc->stripes[dev_nr].dev;
		if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
						   &dev->dev_state) ||
		    (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
			/* No usable device for this stripe: count it failed. */
			bioc_error(bioc, first_bio, logical);
			continue;
		}

		/* Clone for all stripes but the last, which reuses the original. */
		if (dev_nr < total_devs - 1)
			bio = btrfs_bio_clone(first_bio);
		else
			bio = first_bio;

		submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
	}
	btrfs_bio_counter_dec(fs_info);
	return BLK_STS_OK;
}
67560b86a832SChris Mason 
675709ba3bc9SAnand Jain /*
675809ba3bc9SAnand Jain  * Find a device specified by @devid or @uuid in the list of @fs_devices, or
675909ba3bc9SAnand Jain  * return NULL.
676009ba3bc9SAnand Jain  *
676109ba3bc9SAnand Jain  * If devid and uuid are both specified, the match must be exact, otherwise
676209ba3bc9SAnand Jain  * only devid is used.
676309ba3bc9SAnand Jain  */
6764e4319cd9SAnand Jain struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6765b2598edfSAnand Jain 				       u64 devid, u8 *uuid, u8 *fsid)
67660b86a832SChris Mason {
67672b82032cSYan Zheng 	struct btrfs_device *device;
6768944d3f9fSNikolay Borisov 	struct btrfs_fs_devices *seed_devs;
67690b86a832SChris Mason 
6770944d3f9fSNikolay Borisov 	if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6771944d3f9fSNikolay Borisov 		list_for_each_entry(device, &fs_devices->devices, dev_list) {
6772944d3f9fSNikolay Borisov 			if (device->devid == devid &&
6773944d3f9fSNikolay Borisov 			    (!uuid || memcmp(device->uuid, uuid,
6774944d3f9fSNikolay Borisov 					     BTRFS_UUID_SIZE) == 0))
6775944d3f9fSNikolay Borisov 				return device;
6776944d3f9fSNikolay Borisov 		}
6777944d3f9fSNikolay Borisov 	}
6778944d3f9fSNikolay Borisov 
6779944d3f9fSNikolay Borisov 	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
67802b82032cSYan Zheng 		if (!fsid ||
6781944d3f9fSNikolay Borisov 		    !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6782944d3f9fSNikolay Borisov 			list_for_each_entry(device, &seed_devs->devices,
678309ba3bc9SAnand Jain 					    dev_list) {
678409ba3bc9SAnand Jain 				if (device->devid == devid &&
678509ba3bc9SAnand Jain 				    (!uuid || memcmp(device->uuid, uuid,
678609ba3bc9SAnand Jain 						     BTRFS_UUID_SIZE) == 0))
67872b82032cSYan Zheng 					return device;
67882b82032cSYan Zheng 			}
678909ba3bc9SAnand Jain 		}
67902b82032cSYan Zheng 	}
6791944d3f9fSNikolay Borisov 
67922b82032cSYan Zheng 	return NULL;
67930b86a832SChris Mason }
67940b86a832SChris Mason 
/*
 * Allocate a placeholder btrfs_device for @devid/@dev_uuid that is
 * referenced by metadata but has no block device present, link it into
 * @fs_devices and mark it BTRFS_DEV_STATE_MISSING.
 *
 * Returns the new device or an ERR_PTR from btrfs_alloc_device().
 */
static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
					    u64 devid, u8 *dev_uuid)
{
	struct btrfs_device *device;
	unsigned int nofs_flag;

	/*
	 * We call this under the chunk_mutex, so we want to use NOFS for this
	 * allocation, however we don't want to change btrfs_alloc_device() to
	 * always do NOFS because we use it in a lot of other GFP_KERNEL safe
	 * places.
	 */
	nofs_flag = memalloc_nofs_save();
	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(device))
		return device;

	list_add(&device->dev_list, &fs_devices->devices);
	device->fs_devices = fs_devices;
	fs_devices->num_devices++;

	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
	fs_devices->missing_devices++;

	return device;
}
6822dfe25020SChris Mason 
682312bd2fc0SIlya Dryomov /**
682412bd2fc0SIlya Dryomov  * btrfs_alloc_device - allocate struct btrfs_device
682512bd2fc0SIlya Dryomov  * @fs_info:	used only for generating a new devid, can be NULL if
682612bd2fc0SIlya Dryomov  *		devid is provided (i.e. @devid != NULL).
682712bd2fc0SIlya Dryomov  * @devid:	a pointer to devid for this device.  If NULL a new devid
682812bd2fc0SIlya Dryomov  *		is generated.
682912bd2fc0SIlya Dryomov  * @uuid:	a pointer to UUID for this device.  If NULL a new UUID
683012bd2fc0SIlya Dryomov  *		is generated.
683112bd2fc0SIlya Dryomov  *
683212bd2fc0SIlya Dryomov  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
683348dae9cfSDavid Sterba  * on error.  Returned struct is not linked onto any lists and must be
6834a425f9d4SDavid Sterba  * destroyed with btrfs_free_device.
683512bd2fc0SIlya Dryomov  */
683612bd2fc0SIlya Dryomov struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
683712bd2fc0SIlya Dryomov 					const u64 *devid,
683812bd2fc0SIlya Dryomov 					const u8 *uuid)
683912bd2fc0SIlya Dryomov {
684012bd2fc0SIlya Dryomov 	struct btrfs_device *dev;
684112bd2fc0SIlya Dryomov 	u64 tmp;
684212bd2fc0SIlya Dryomov 
6843fae7f21cSDulshani Gunawardhana 	if (WARN_ON(!devid && !fs_info))
684412bd2fc0SIlya Dryomov 		return ERR_PTR(-EINVAL);
684512bd2fc0SIlya Dryomov 
6846fe4f46d4SDavid Sterba 	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
6847fe4f46d4SDavid Sterba 	if (!dev)
6848fe4f46d4SDavid Sterba 		return ERR_PTR(-ENOMEM);
6849fe4f46d4SDavid Sterba 
6850fe4f46d4SDavid Sterba 	/*
6851fe4f46d4SDavid Sterba 	 * Preallocate a bio that's always going to be used for flushing device
6852fe4f46d4SDavid Sterba 	 * barriers and matches the device lifespan
6853fe4f46d4SDavid Sterba 	 */
6854fe4f46d4SDavid Sterba 	dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
6855fe4f46d4SDavid Sterba 	if (!dev->flush_bio) {
6856fe4f46d4SDavid Sterba 		kfree(dev);
6857fe4f46d4SDavid Sterba 		return ERR_PTR(-ENOMEM);
6858fe4f46d4SDavid Sterba 	}
6859fe4f46d4SDavid Sterba 
6860fe4f46d4SDavid Sterba 	INIT_LIST_HEAD(&dev->dev_list);
6861fe4f46d4SDavid Sterba 	INIT_LIST_HEAD(&dev->dev_alloc_list);
6862fe4f46d4SDavid Sterba 	INIT_LIST_HEAD(&dev->post_commit_list);
6863fe4f46d4SDavid Sterba 
6864fe4f46d4SDavid Sterba 	atomic_set(&dev->reada_in_flight, 0);
6865fe4f46d4SDavid Sterba 	atomic_set(&dev->dev_stats_ccnt, 0);
6866fe4f46d4SDavid Sterba 	btrfs_device_data_ordered_init(dev);
6867fe4f46d4SDavid Sterba 	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
6868fe4f46d4SDavid Sterba 	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
6869fe4f46d4SDavid Sterba 	extent_io_tree_init(fs_info, &dev->alloc_state,
6870fe4f46d4SDavid Sterba 			    IO_TREE_DEVICE_ALLOC_STATE, NULL);
687112bd2fc0SIlya Dryomov 
687212bd2fc0SIlya Dryomov 	if (devid)
687312bd2fc0SIlya Dryomov 		tmp = *devid;
687412bd2fc0SIlya Dryomov 	else {
687512bd2fc0SIlya Dryomov 		int ret;
687612bd2fc0SIlya Dryomov 
687712bd2fc0SIlya Dryomov 		ret = find_next_devid(fs_info, &tmp);
687812bd2fc0SIlya Dryomov 		if (ret) {
6879a425f9d4SDavid Sterba 			btrfs_free_device(dev);
688012bd2fc0SIlya Dryomov 			return ERR_PTR(ret);
688112bd2fc0SIlya Dryomov 		}
688212bd2fc0SIlya Dryomov 	}
688312bd2fc0SIlya Dryomov 	dev->devid = tmp;
688412bd2fc0SIlya Dryomov 
688512bd2fc0SIlya Dryomov 	if (uuid)
688612bd2fc0SIlya Dryomov 		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
688712bd2fc0SIlya Dryomov 	else
688812bd2fc0SIlya Dryomov 		generate_random_uuid(dev->uuid);
688912bd2fc0SIlya Dryomov 
689012bd2fc0SIlya Dryomov 	return dev;
689112bd2fc0SIlya Dryomov }
689212bd2fc0SIlya Dryomov 
68935a2b8e60SAnand Jain static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
68942b902dfcSAnand Jain 					u64 devid, u8 *uuid, bool error)
68955a2b8e60SAnand Jain {
68962b902dfcSAnand Jain 	if (error)
68972b902dfcSAnand Jain 		btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
68982b902dfcSAnand Jain 			      devid, uuid);
68992b902dfcSAnand Jain 	else
69002b902dfcSAnand Jain 		btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
69012b902dfcSAnand Jain 			      devid, uuid);
69025a2b8e60SAnand Jain }
69035a2b8e60SAnand Jain 
690439e264a4SNikolay Borisov static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
690539e264a4SNikolay Borisov {
6906d58ede8dSDavid Sterba 	const int data_stripes = calc_data_stripes(type, num_stripes);
6907e4f6c6beSDavid Sterba 
690839e264a4SNikolay Borisov 	return div_u64(chunk_len, data_stripes);
690939e264a4SNikolay Borisov }
691039e264a4SNikolay Borisov 
6911e9306ad4SQu Wenruo #if BITS_PER_LONG == 32
6912e9306ad4SQu Wenruo /*
6913e9306ad4SQu Wenruo  * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
6914e9306ad4SQu Wenruo  * can't be accessed on 32bit systems.
6915e9306ad4SQu Wenruo  *
6916e9306ad4SQu Wenruo  * This function do mount time check to reject the fs if it already has
6917e9306ad4SQu Wenruo  * metadata chunk beyond that limit.
6918e9306ad4SQu Wenruo  */
6919e9306ad4SQu Wenruo static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
6920e9306ad4SQu Wenruo 				  u64 logical, u64 length, u64 type)
6921e9306ad4SQu Wenruo {
6922e9306ad4SQu Wenruo 	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
6923e9306ad4SQu Wenruo 		return 0;
6924e9306ad4SQu Wenruo 
6925e9306ad4SQu Wenruo 	if (logical + length < MAX_LFS_FILESIZE)
6926e9306ad4SQu Wenruo 		return 0;
6927e9306ad4SQu Wenruo 
6928e9306ad4SQu Wenruo 	btrfs_err_32bit_limit(fs_info);
6929e9306ad4SQu Wenruo 	return -EOVERFLOW;
6930e9306ad4SQu Wenruo }
6931e9306ad4SQu Wenruo 
6932e9306ad4SQu Wenruo /*
6933e9306ad4SQu Wenruo  * This is to give early warning for any metadata chunk reaching
6934e9306ad4SQu Wenruo  * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
6935e9306ad4SQu Wenruo  * Although we can still access the metadata, it's not going to be possible
6936e9306ad4SQu Wenruo  * once the limit is reached.
6937e9306ad4SQu Wenruo  */
6938e9306ad4SQu Wenruo static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
6939e9306ad4SQu Wenruo 				  u64 logical, u64 length, u64 type)
6940e9306ad4SQu Wenruo {
6941e9306ad4SQu Wenruo 	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
6942e9306ad4SQu Wenruo 		return;
6943e9306ad4SQu Wenruo 
6944e9306ad4SQu Wenruo 	if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
6945e9306ad4SQu Wenruo 		return;
6946e9306ad4SQu Wenruo 
6947e9306ad4SQu Wenruo 	btrfs_warn_32bit_limit(fs_info);
6948e9306ad4SQu Wenruo }
6949e9306ad4SQu Wenruo #endif
6950e9306ad4SQu Wenruo 
/*
 * Read one chunk item and insert the corresponding logical->physical
 * mapping into fs_info->mapping_tree.
 *
 * @key:   key of the chunk item; key->offset is the chunk's logical start
 * @leaf:  extent buffer holding the item; the superblock's sys_chunk_array
 *         is read through an artificial buffer starting at
 *         BTRFS_SUPER_INFO_OFFSET
 * @chunk: the chunk item body
 *
 * Stripes referencing absent devices are only tolerated when mounted with
 * -o degraded (a missing placeholder device is created); otherwise
 * -ENOENT is returned.  Returns 0 on success, including when the chunk is
 * already mapped, or a negative errno.
 */
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
			  struct btrfs_chunk *chunk)
{
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	u64 logical;
	u64 length;
	u64 devid;
	u64 type;
	u8 uuid[BTRFS_UUID_SIZE];
	int num_stripes;
	int ret;
	int i;

	logical = key->offset;
	length = btrfs_chunk_length(leaf, chunk);
	type = btrfs_chunk_type(leaf, chunk);
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

#if BITS_PER_LONG == 32
	/* Reject/warn about metadata chunks beyond the 32bit page cache limit. */
	ret = check_32bit_meta_chunk(fs_info, logical, length, type);
	if (ret < 0)
		return ret;
	warn_32bit_meta_chunk(fs_info, logical, length, type);
#endif

	/*
	 * Only need to verify chunk item if we're reading from sys chunk array,
	 * as chunk item in tree block is already verified by tree-checker.
	 */
	if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
		ret = btrfs_check_chunk_valid(leaf, chunk, logical);
		if (ret)
			return ret;
	}

	read_lock(&map_tree->lock);
	em = lookup_extent_mapping(map_tree, logical, 1);
	read_unlock(&map_tree->lock);

	/* already mapped? */
	if (em && em->start <= logical && em->start + em->len > logical) {
		free_extent_map(em);
		return 0;
	} else if (em) {
		free_extent_map(em);
	}

	em = alloc_extent_map();
	if (!em)
		return -ENOMEM;
	/* map_lookup has a flexible stripes[] array sized by num_stripes. */
	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		free_extent_map(em);
		return -ENOMEM;
	}

	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	em->map_lookup = map;
	em->start = logical;
	em->len = length;
	em->orig_start = 0;
	em->block_start = 0;
	em->block_len = em->len;

	map->num_stripes = num_stripes;
	map->io_width = btrfs_chunk_io_width(leaf, chunk);
	map->io_align = btrfs_chunk_io_align(leaf, chunk);
	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
	map->type = type;
	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
	map->verified_stripes = 0;
	em->orig_block_len = calc_stripe_length(type, em->len,
						map->num_stripes);
	/* Resolve each stripe's device by devid + device uuid. */
	for (i = 0; i < num_stripes; i++) {
		map->stripes[i].physical =
			btrfs_stripe_offset_nr(leaf, chunk, i);
		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
		read_extent_buffer(leaf, uuid, (unsigned long)
				   btrfs_stripe_dev_uuid_nr(chunk, i),
				   BTRFS_UUID_SIZE);
		map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
							devid, uuid, NULL);
		if (!map->stripes[i].dev &&
		    !btrfs_test_opt(fs_info, DEGRADED)) {
			/* Device absent and not mounted degraded: hard fail. */
			free_extent_map(em);
			btrfs_report_missing_device(fs_info, devid, uuid, true);
			return -ENOENT;
		}
		if (!map->stripes[i].dev) {
			/* Degraded mount: stand in a missing placeholder. */
			map->stripes[i].dev =
				add_missing_dev(fs_info->fs_devices, devid,
						uuid);
			if (IS_ERR(map->stripes[i].dev)) {
				free_extent_map(em);
				btrfs_err(fs_info,
					"failed to init missing dev %llu: %ld",
					devid, PTR_ERR(map->stripes[i].dev));
				return PTR_ERR(map->stripes[i].dev);
			}
			btrfs_report_missing_device(fs_info, devid, uuid, false);
		}
		set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				&(map->stripes[i].dev->dev_state));

	}

	write_lock(&map_tree->lock);
	ret = add_extent_mapping(map_tree, em, 0);
	write_unlock(&map_tree->lock);
	if (ret < 0) {
		btrfs_err(fs_info,
			  "failed to add chunk map, start=%llu len=%llu: %d",
			  em->start, em->len, ret);
	}
	/* Drop our lookup reference; the tree holds its own on success. */
	free_extent_map(em);

	return ret;
}
70720b86a832SChris Mason 
7073143bede5SJeff Mahoney static void fill_device_from_item(struct extent_buffer *leaf,
70740b86a832SChris Mason 				 struct btrfs_dev_item *dev_item,
70750b86a832SChris Mason 				 struct btrfs_device *device)
70760b86a832SChris Mason {
70770b86a832SChris Mason 	unsigned long ptr;
70780b86a832SChris Mason 
70790b86a832SChris Mason 	device->devid = btrfs_device_id(leaf, dev_item);
7080d6397baeSChris Ball 	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
7081d6397baeSChris Ball 	device->total_bytes = device->disk_total_bytes;
7082935e5cc9SMiao Xie 	device->commit_total_bytes = device->disk_total_bytes;
70830b86a832SChris Mason 	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
7084ce7213c7SMiao Xie 	device->commit_bytes_used = device->bytes_used;
70850b86a832SChris Mason 	device->type = btrfs_device_type(leaf, dev_item);
70860b86a832SChris Mason 	device->io_align = btrfs_device_io_align(leaf, dev_item);
70870b86a832SChris Mason 	device->io_width = btrfs_device_io_width(leaf, dev_item);
70880b86a832SChris Mason 	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
70898dabb742SStefan Behrens 	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
7090401e29c1SAnand Jain 	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
70910b86a832SChris Mason 
7092410ba3a2SGeert Uytterhoeven 	ptr = btrfs_device_uuid(dev_item);
7093e17cade2SChris Mason 	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
70940b86a832SChris Mason }
70950b86a832SChris Mason 
/*
 * Find or create the btrfs_fs_devices for a seed filesystem with @fsid and
 * anchor it on fs_info->fs_devices->seed_list.
 *
 * With -o degraded a placeholder (seeding, opened=1 but without any real
 * devices) may be created when the seed fs is not registered at all.
 * Returns the fs_devices or an ERR_PTR.  Caller must hold uuid_mutex
 * (asserted below).
 */
static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
						  u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

	lockdep_assert_held(&uuid_mutex);
	ASSERT(fsid);

	/* This will match only for multi-device seed fs */
	list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
		if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
			return fs_devices;


	fs_devices = find_fsid(fsid, NULL);
	if (!fs_devices) {
		if (!btrfs_test_opt(fs_info, DEGRADED))
			return ERR_PTR(-ENOENT);

		/* Degraded: fabricate an empty seed fs_devices placeholder. */
		fs_devices = alloc_fs_devices(fsid, NULL);
		if (IS_ERR(fs_devices))
			return fs_devices;

		fs_devices->seeding = true;
		fs_devices->opened = 1;
		return fs_devices;
	}

	/*
	 * Upon first call for a seed fs fsid, just create a private copy of the
	 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
	 */
	fs_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(fs_devices))
		return fs_devices;

	/* Seed devices are opened read-only. */
	ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
	if (ret) {
		free_fs_devices(fs_devices);
		return ERR_PTR(ret);
	}

	/* A device claimed as seed must actually carry the seeding flag. */
	if (!fs_devices->seeding) {
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
		return ERR_PTR(-EINVAL);
	}

	list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);

	return fs_devices;
}
71492b82032cSYan Zheng 
/*
 * Read one dev item and update (or create) the matching in-memory
 * btrfs_device.
 *
 * Handles seed devices (items whose fsid differs from this filesystem's
 * metadata_uuid), devices referenced by metadata but currently absent
 * (allowed only with -o degraded), and validates total_bytes against the
 * size of the underlying block device.
 *
 * Returns 0 on success or a negative errno.
 */
static int read_one_dev(struct extent_buffer *leaf,
			struct btrfs_dev_item *dev_item)
{
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 devid;
	int ret;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];

	devid = btrfs_device_id(leaf, dev_item);
	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
			   BTRFS_UUID_SIZE);
	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
			   BTRFS_FSID_SIZE);

	/* A foreign fsid means this dev item belongs to a seed fs. */
	if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
		fs_devices = open_seed_devices(fs_info, fs_uuid);
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);
	}

	device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
				   fs_uuid);
	if (!device) {
		if (!btrfs_test_opt(fs_info, DEGRADED)) {
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, true);
			return -ENOENT;
		}

		/* Degraded mount: stand in a missing placeholder device. */
		device = add_missing_dev(fs_devices, devid, dev_uuid);
		if (IS_ERR(device)) {
			btrfs_err(fs_info,
				"failed to add missing dev %llu: %ld",
				devid, PTR_ERR(device));
			return PTR_ERR(device);
		}
		btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
	} else {
		if (!device->bdev) {
			if (!btrfs_test_opt(fs_info, DEGRADED)) {
				btrfs_report_missing_device(fs_info,
						devid, dev_uuid, true);
				return -ENOENT;
			}
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, false);
		}

		if (!device->bdev &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			/*
			 * this happens when a device that was properly setup
			 * in the device info lists suddenly goes bad.
			 * device->bdev is NULL, and so we have to set
			 * device->missing to one here
			 */
			device->fs_devices->missing_devices++;
			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}

		/* Move the device to its own fs_devices */
		if (device->fs_devices != fs_devices) {
			ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
							&device->dev_state));

			list_move(&device->dev_list, &fs_devices->devices);
			device->fs_devices->num_devices--;
			fs_devices->num_devices++;

			device->fs_devices->missing_devices--;
			fs_devices->missing_devices++;

			device->fs_devices = fs_devices;
		}
	}

	/* A device on a seed fs must be read-only and generation-matched. */
	if (device->fs_devices != fs_info->fs_devices) {
		BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
		if (device->generation !=
		    btrfs_device_generation(leaf, dev_item))
			return -EINVAL;
	}

	fill_device_from_item(leaf, dev_item, device);
	if (device->bdev) {
		/* The recorded size must fit on the actual block device. */
		u64 max_total_bytes = i_size_read(device->bdev->bd_inode);

		if (device->total_bytes > max_total_bytes) {
			btrfs_err(fs_info,
			"device total_bytes should be at most %llu but found %llu",
				  max_total_bytes, device->total_bytes);
			return -EINVAL;
		}
	}
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	/* Only writeable, non-replace-target devices count toward rw space. */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	   !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		device->fs_devices->total_rw_bytes += device->total_bytes;
		atomic64_add(device->total_bytes - device->bytes_used,
				&fs_info->free_chunk_space);
	}
	ret = 0;
	return ret;
}
72570b86a832SChris Mason 
72586bccf3abSJeff Mahoney int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
72590b86a832SChris Mason {
72606bccf3abSJeff Mahoney 	struct btrfs_root *root = fs_info->tree_root;
7261ab8d0fc4SJeff Mahoney 	struct btrfs_super_block *super_copy = fs_info->super_copy;
7262a061fc8dSChris Mason 	struct extent_buffer *sb;
72630b86a832SChris Mason 	struct btrfs_disk_key *disk_key;
72640b86a832SChris Mason 	struct btrfs_chunk *chunk;
72651ffb22cfSDavid Sterba 	u8 *array_ptr;
72661ffb22cfSDavid Sterba 	unsigned long sb_array_offset;
726784eed90fSChris Mason 	int ret = 0;
72680b86a832SChris Mason 	u32 num_stripes;
72690b86a832SChris Mason 	u32 array_size;
72700b86a832SChris Mason 	u32 len = 0;
72711ffb22cfSDavid Sterba 	u32 cur_offset;
7272e06cd3ddSLiu Bo 	u64 type;
727384eed90fSChris Mason 	struct btrfs_key key;
72740b86a832SChris Mason 
72750b246afaSJeff Mahoney 	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
7276a83fffb7SDavid Sterba 	/*
7277a83fffb7SDavid Sterba 	 * This will create extent buffer of nodesize, superblock size is
7278a83fffb7SDavid Sterba 	 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
7279a83fffb7SDavid Sterba 	 * overallocate but we can keep it as-is, only the first page is used.
7280a83fffb7SDavid Sterba 	 */
72813fbaf258SJosef Bacik 	sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
72823fbaf258SJosef Bacik 					  root->root_key.objectid, 0);
7283c871b0f2SLiu Bo 	if (IS_ERR(sb))
7284c871b0f2SLiu Bo 		return PTR_ERR(sb);
72854db8c528SDavid Sterba 	set_extent_buffer_uptodate(sb);
72868a334426SDavid Sterba 	/*
728701327610SNicholas D Steeves 	 * The sb extent buffer is artificial and just used to read the system array.
72884db8c528SDavid Sterba 	 * set_extent_buffer_uptodate() call does not properly mark all it's
72898a334426SDavid Sterba 	 * pages up-to-date when the page is larger: extent does not cover the
72908a334426SDavid Sterba 	 * whole page and consequently check_page_uptodate does not find all
72918a334426SDavid Sterba 	 * the page's extents up-to-date (the hole beyond sb),
72928a334426SDavid Sterba 	 * write_extent_buffer then triggers a WARN_ON.
72938a334426SDavid Sterba 	 *
72948a334426SDavid Sterba 	 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
72958a334426SDavid Sterba 	 * but sb spans only this function. Add an explicit SetPageUptodate call
72968a334426SDavid Sterba 	 * to silence the warning eg. on PowerPC 64.
72978a334426SDavid Sterba 	 */
729809cbfeafSKirill A. Shutemov 	if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
7299727011e0SChris Mason 		SetPageUptodate(sb->pages[0]);
73004008c04aSChris Mason 
7301a061fc8dSChris Mason 	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
73020b86a832SChris Mason 	array_size = btrfs_super_sys_array_size(super_copy);
73030b86a832SChris Mason 
73041ffb22cfSDavid Sterba 	array_ptr = super_copy->sys_chunk_array;
73051ffb22cfSDavid Sterba 	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
73061ffb22cfSDavid Sterba 	cur_offset = 0;
73070b86a832SChris Mason 
73081ffb22cfSDavid Sterba 	while (cur_offset < array_size) {
73091ffb22cfSDavid Sterba 		disk_key = (struct btrfs_disk_key *)array_ptr;
7310e3540eabSDavid Sterba 		len = sizeof(*disk_key);
7311e3540eabSDavid Sterba 		if (cur_offset + len > array_size)
7312e3540eabSDavid Sterba 			goto out_short_read;
7313e3540eabSDavid Sterba 
73140b86a832SChris Mason 		btrfs_disk_key_to_cpu(&key, disk_key);
73150b86a832SChris Mason 
73161ffb22cfSDavid Sterba 		array_ptr += len;
73171ffb22cfSDavid Sterba 		sb_array_offset += len;
73181ffb22cfSDavid Sterba 		cur_offset += len;
73190b86a832SChris Mason 
732032ab3d1bSJohannes Thumshirn 		if (key.type != BTRFS_CHUNK_ITEM_KEY) {
732132ab3d1bSJohannes Thumshirn 			btrfs_err(fs_info,
732232ab3d1bSJohannes Thumshirn 			    "unexpected item type %u in sys_array at offset %u",
732332ab3d1bSJohannes Thumshirn 				  (u32)key.type, cur_offset);
732432ab3d1bSJohannes Thumshirn 			ret = -EIO;
732532ab3d1bSJohannes Thumshirn 			break;
732632ab3d1bSJohannes Thumshirn 		}
732732ab3d1bSJohannes Thumshirn 
73281ffb22cfSDavid Sterba 		chunk = (struct btrfs_chunk *)sb_array_offset;
7329e3540eabSDavid Sterba 		/*
733032ab3d1bSJohannes Thumshirn 		 * At least one btrfs_chunk with one stripe must be present,
733132ab3d1bSJohannes Thumshirn 		 * exact stripe count check comes afterwards
7332e3540eabSDavid Sterba 		 */
7333e3540eabSDavid Sterba 		len = btrfs_chunk_item_size(1);
7334e3540eabSDavid Sterba 		if (cur_offset + len > array_size)
7335e3540eabSDavid Sterba 			goto out_short_read;
7336e3540eabSDavid Sterba 
7337e3540eabSDavid Sterba 		num_stripes = btrfs_chunk_num_stripes(sb, chunk);
7338f5cdedd7SDavid Sterba 		if (!num_stripes) {
7339ab8d0fc4SJeff Mahoney 			btrfs_err(fs_info,
7340ab8d0fc4SJeff Mahoney 			"invalid number of stripes %u in sys_array at offset %u",
7341f5cdedd7SDavid Sterba 				  num_stripes, cur_offset);
7342f5cdedd7SDavid Sterba 			ret = -EIO;
7343f5cdedd7SDavid Sterba 			break;
7344f5cdedd7SDavid Sterba 		}
7345f5cdedd7SDavid Sterba 
7346e06cd3ddSLiu Bo 		type = btrfs_chunk_type(sb, chunk);
7347e06cd3ddSLiu Bo 		if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
7348ab8d0fc4SJeff Mahoney 			btrfs_err(fs_info,
7349e06cd3ddSLiu Bo 			"invalid chunk type %llu in sys_array at offset %u",
7350e06cd3ddSLiu Bo 				  type, cur_offset);
7351e06cd3ddSLiu Bo 			ret = -EIO;
7352e06cd3ddSLiu Bo 			break;
7353e06cd3ddSLiu Bo 		}
7354e06cd3ddSLiu Bo 
7355e3540eabSDavid Sterba 		len = btrfs_chunk_item_size(num_stripes);
7356e3540eabSDavid Sterba 		if (cur_offset + len > array_size)
7357e3540eabSDavid Sterba 			goto out_short_read;
7358e3540eabSDavid Sterba 
73599690ac09SDavid Sterba 		ret = read_one_chunk(&key, sb, chunk);
736084eed90fSChris Mason 		if (ret)
736184eed90fSChris Mason 			break;
736232ab3d1bSJohannes Thumshirn 
73631ffb22cfSDavid Sterba 		array_ptr += len;
73641ffb22cfSDavid Sterba 		sb_array_offset += len;
73651ffb22cfSDavid Sterba 		cur_offset += len;
73660b86a832SChris Mason 	}
7367d865177aSLiu Bo 	clear_extent_buffer_uptodate(sb);
73681c8b5b6eSLiu Bo 	free_extent_buffer_stale(sb);
736984eed90fSChris Mason 	return ret;
7370e3540eabSDavid Sterba 
7371e3540eabSDavid Sterba out_short_read:
7372ab8d0fc4SJeff Mahoney 	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
7373e3540eabSDavid Sterba 			len, cur_offset);
7374d865177aSLiu Bo 	clear_extent_buffer_uptodate(sb);
73751c8b5b6eSLiu Bo 	free_extent_buffer_stale(sb);
7376e3540eabSDavid Sterba 	return -EIO;
73770b86a832SChris Mason }
73780b86a832SChris Mason 
/*
 * Check if all chunks in the fs are OK for read-write degraded mount
 *
 * If the @failing_dev is specified, it's accounted as missing.
 *
 * Return true if all chunks meet the minimal RW mount requirements.
 * Return false if any chunk doesn't meet the minimal RW mount requirements.
 */
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
					struct btrfs_device *failing_dev)
{
	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	u64 next_start = 0;
	bool ret = true;

	/* Grab the first chunk mapping; the tree lock only guards the lookup. */
	read_lock(&map_tree->lock);
	em = lookup_extent_mapping(map_tree, 0, (u64)-1);
	read_unlock(&map_tree->lock);
	/* No chunk at all? Return false anyway */
	if (!em) {
		ret = false;
		goto out;
	}
	/* Walk every chunk mapping in logical address order. */
	while (em) {
		struct map_lookup *map;
		int missing = 0;
		int max_tolerated;
		int i;

		map = em->map_lookup;
		max_tolerated =
			btrfs_get_num_tolerated_disk_barrier_failures(
					map->type);
		/*
		 * Count stripes whose device is absent, not opened, flagged
		 * missing, or has seen a flush error; @failing_dev (if given)
		 * is treated as if it were already missing.
		 */
		for (i = 0; i < map->num_stripes; i++) {
			struct btrfs_device *dev = map->stripes[i].dev;

			if (!dev || !dev->bdev ||
			    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
			    dev->last_flush_error)
				missing++;
			else if (failing_dev && failing_dev == dev)
				missing++;
		}
		if (missing > max_tolerated) {
			/* Only warn on the real (non-hypothetical) check. */
			if (!failing_dev)
				btrfs_warn(fs_info,
	"chunk %llu missing %d devices, max tolerance is %d for writable mount",
				   em->start, missing, max_tolerated);
			free_extent_map(em);
			ret = false;
			goto out;
		}
		/* Drop our reference before looking up the next mapping. */
		next_start = extent_map_end(em);
		free_extent_map(em);

		read_lock(&map_tree->lock);
		em = lookup_extent_mapping(map_tree, next_start,
					   (u64)(-1) - next_start);
		read_unlock(&map_tree->lock);
	}
out:
	return ret;
}
744321634a19SQu Wenruo 
/* Kick off readahead for every child block referenced by @node. */
static void readahead_tree_node_children(struct extent_buffer *node)
{
	const int nritems = btrfs_header_nritems(node);
	int slot;

	for (slot = 0; slot < nritems; slot++)
		btrfs_readahead_node_child(node, slot);
}
7452d85327b1SDavid Sterba 
/*
 * Read the chunk tree at mount time: load all device items, then all chunk
 * items, and cross-validate the totals against the superblock.
 *
 * Returns 0 on success or a negative errno (-ENOMEM, -EINVAL, or an error
 * propagated from the tree search / item readers).
 */
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;
	u64 total_dev = 0;	/* device items seen while scanning */
	u64 last_ra_node = 0;	/* last level-1 node we issued readahead for */

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * uuid_mutex is needed only if we are mounting a sprout FS
	 * otherwise we don't need it.
	 */
	mutex_lock(&uuid_mutex);

	/*
	 * It is possible for mount and umount to race in such a way that
	 * we execute this code path, but open_fs_devices failed to clear
	 * total_rw_bytes. We certainly want it cleared before reading the
	 * device items, so clear it here.
	 */
	fs_info->fs_devices->total_rw_bytes = 0;

	/*
	 * Read all device items, and then all the chunk items. All
	 * device items are found before any chunk item (their object id
	 * is smaller than the lowest possible object id for a chunk
	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	while (1) {
		struct extent_buffer *node;

		leaf = path->nodes[0];
		slot = path->slots[0];
		/* Exhausted the current leaf, advance to the next one. */
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			break;
		}
		/*
		 * The nodes on level 1 are not locked but we don't need to do
		 * that during mount time as nothing else can access the tree
		 */
		node = path->nodes[1];
		if (node) {
			/* Readahead each level-1 node's children only once. */
			if (last_ra_node != node->start) {
				readahead_tree_node_children(node);
				last_ra_node = node->start;
			}
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
			struct btrfs_dev_item *dev_item;
			dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
			ret = read_one_dev(leaf, dev_item);
			if (ret)
				goto error;
			total_dev++;
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;

			/*
			 * We are only called at mount time, so no need to take
			 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
			 * we always lock first fs_info->chunk_mutex before
			 * acquiring any locks on the chunk tree. This is a
			 * requirement for chunk allocation, see the comment on
			 * top of btrfs_chunk_alloc() for details.
			 */
			ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(&found_key, leaf, chunk);
			if (ret)
				goto error;
		}
		path->slots[0]++;
	}

	/*
	 * After loading chunk tree, we've got all device information,
	 * do another round of validation checks.
	 */
	if (total_dev != fs_info->fs_devices->total_devices) {
		btrfs_err(fs_info,
	   "super_num_devices %llu mismatch with num_devices %llu found here",
			  btrfs_super_num_devices(fs_info->super_copy),
			  total_dev);
		ret = -EINVAL;
		goto error;
	}
	if (btrfs_super_total_bytes(fs_info->super_copy) <
	    fs_info->fs_devices->total_rw_bytes) {
		btrfs_err(fs_info,
	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
			  btrfs_super_total_bytes(fs_info->super_copy),
			  fs_info->fs_devices->total_rw_bytes);
		ret = -EINVAL;
		goto error;
	}
	ret = 0;
error:
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}
7576442a4f63SStefan Behrens 
7577cb517eabSMiao Xie void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7578cb517eabSMiao Xie {
7579944d3f9fSNikolay Borisov 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7580cb517eabSMiao Xie 	struct btrfs_device *device;
7581cb517eabSMiao Xie 
7582944d3f9fSNikolay Borisov 	fs_devices->fs_info = fs_info;
7583944d3f9fSNikolay Borisov 
7584cb517eabSMiao Xie 	mutex_lock(&fs_devices->device_list_mutex);
7585cb517eabSMiao Xie 	list_for_each_entry(device, &fs_devices->devices, dev_list)
7586fb456252SJeff Mahoney 		device->fs_info = fs_info;
758729cc83f6SLiu Bo 
7588944d3f9fSNikolay Borisov 	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7589944d3f9fSNikolay Borisov 		list_for_each_entry(device, &seed_devs->devices, dev_list)
7590944d3f9fSNikolay Borisov 			device->fs_info = fs_info;
7591944d3f9fSNikolay Borisov 
7592944d3f9fSNikolay Borisov 		seed_devs->fs_info = fs_info;
759329cc83f6SLiu Bo 	}
7594e17125b5SAnand Jain 	mutex_unlock(&fs_devices->device_list_mutex);
7595cb517eabSMiao Xie }
7596cb517eabSMiao Xie 
75971dc990dfSDavid Sterba static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
75981dc990dfSDavid Sterba 				 const struct btrfs_dev_stats_item *ptr,
75991dc990dfSDavid Sterba 				 int index)
76001dc990dfSDavid Sterba {
76011dc990dfSDavid Sterba 	u64 val;
76021dc990dfSDavid Sterba 
76031dc990dfSDavid Sterba 	read_extent_buffer(eb, &val,
76041dc990dfSDavid Sterba 			   offsetof(struct btrfs_dev_stats_item, values) +
76051dc990dfSDavid Sterba 			    ((unsigned long)ptr) + (index * sizeof(u64)),
76061dc990dfSDavid Sterba 			   sizeof(val));
76071dc990dfSDavid Sterba 	return val;
76081dc990dfSDavid Sterba }
76091dc990dfSDavid Sterba 
76101dc990dfSDavid Sterba static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
76111dc990dfSDavid Sterba 				      struct btrfs_dev_stats_item *ptr,
76121dc990dfSDavid Sterba 				      int index, u64 val)
76131dc990dfSDavid Sterba {
76141dc990dfSDavid Sterba 	write_extent_buffer(eb, &val,
76151dc990dfSDavid Sterba 			    offsetof(struct btrfs_dev_stats_item, values) +
76161dc990dfSDavid Sterba 			     ((unsigned long)ptr) + (index * sizeof(u64)),
76171dc990dfSDavid Sterba 			    sizeof(val));
76181dc990dfSDavid Sterba }
76191dc990dfSDavid Sterba 
/*
 * Load the persistent per-device error stats for @device from the dev tree.
 *
 * If no dev_stats item exists the in-memory counters are zeroed and that is
 * not an error.  @path is caller-supplied and is released before returning.
 * Returns 0 on success or a negative errno from the tree search.
 */
static int btrfs_device_init_dev_stats(struct btrfs_device *device,
				       struct btrfs_path *path)
{
	struct btrfs_dev_stats_item *ptr;
	struct extent_buffer *eb;
	struct btrfs_key key;
	int item_size;
	int i, ret, slot;

	/* No dev root (e.g. not yet loaded): nothing to read. */
	if (!device->fs_info->dev_root)
		return 0;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;
	ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
	if (ret) {
		/* Item not found (ret > 0): start with zeroed counters. */
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			btrfs_dev_stat_set(device, i, 0);
		device->dev_stats_valid = 1;
		btrfs_release_path(path);
		return ret < 0 ? ret : 0;
	}
	slot = path->slots[0];
	eb = path->nodes[0];
	item_size = btrfs_item_size_nr(eb, slot);

	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);

	/*
	 * Only read counters the (possibly older, smaller) item actually
	 * stores; zero the rest.
	 */
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
		if (item_size >= (1 + i) * sizeof(__le64))
			btrfs_dev_stat_set(device, i,
					   btrfs_dev_stats_value(eb, ptr, i));
		else
			btrfs_dev_stat_set(device, i, 0);
	}

	device->dev_stats_valid = 1;
	btrfs_dev_stat_print_on_load(device);
	btrfs_release_path(path);

	return 0;
}
7663124604ebSJosef Bacik 
7664124604ebSJosef Bacik int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7665124604ebSJosef Bacik {
7666124604ebSJosef Bacik 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7667124604ebSJosef Bacik 	struct btrfs_device *device;
7668124604ebSJosef Bacik 	struct btrfs_path *path = NULL;
766992e26df4SJosef Bacik 	int ret = 0;
7670124604ebSJosef Bacik 
7671124604ebSJosef Bacik 	path = btrfs_alloc_path();
7672124604ebSJosef Bacik 	if (!path)
7673124604ebSJosef Bacik 		return -ENOMEM;
7674124604ebSJosef Bacik 
7675124604ebSJosef Bacik 	mutex_lock(&fs_devices->device_list_mutex);
767692e26df4SJosef Bacik 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
767792e26df4SJosef Bacik 		ret = btrfs_device_init_dev_stats(device, path);
767892e26df4SJosef Bacik 		if (ret)
767992e26df4SJosef Bacik 			goto out;
7680124604ebSJosef Bacik 	}
768192e26df4SJosef Bacik 	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
768292e26df4SJosef Bacik 		list_for_each_entry(device, &seed_devs->devices, dev_list) {
768392e26df4SJosef Bacik 			ret = btrfs_device_init_dev_stats(device, path);
768492e26df4SJosef Bacik 			if (ret)
768592e26df4SJosef Bacik 				goto out;
768692e26df4SJosef Bacik 		}
768792e26df4SJosef Bacik 	}
768892e26df4SJosef Bacik out:
7689733f4fbbSStefan Behrens 	mutex_unlock(&fs_devices->device_list_mutex);
7690733f4fbbSStefan Behrens 
7691733f4fbbSStefan Behrens 	btrfs_free_path(path);
769292e26df4SJosef Bacik 	return ret;
7693733f4fbbSStefan Behrens }
7694733f4fbbSStefan Behrens 
/*
 * Write the in-memory error counters of @device to its dev_stats item in the
 * dev tree, creating or replacing the item as needed.
 *
 * Returns 0 on success or a negative errno; failures are logged but left for
 * the caller to handle (the in-memory counters stay intact).
 */
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	/* COW search (-1/1): we intend to modify or replace the item. */
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			      ret, rcu_str_deref(device->name));
		goto out;
	}

	/*
	 * ret == 0: an item exists.  If it is smaller than the current
	 * format it cannot hold all counters, so drop it and fall through
	 * to the insert path below (by faking ret = 1, "not found").
	 */
	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	/* path now points at a full-size item; copy all counters into it. */
	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}
7759733f4fbbSStefan Behrens 
/*
 * called from commit_transaction. Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		/* Skip devices with no valid stats or no changes since the
		 * last successful write-out. */
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;


		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values which requires
		 * reading the in-memory counters. Such control dependencies
		 * require explicit read memory barriers.
		 *
		 * This memory barriers pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stats_read_and_reset
		 */
		smp_rmb();

		ret = update_dev_stat_item(trans, device);
		/*
		 * Subtract only the changes we know we wrote; increments that
		 * raced in after the atomic_read above stay pending.
		 */
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}
7799733f4fbbSStefan Behrens 
/* Bump error counter @index on @dev and log the updated counters. */
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}
7805442a4f63SStefan Behrens 
/* Rate-limited dump of all error counters of @dev after an error. */
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	/* Counters are meaningless until loaded/initialized from disk. */
	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}
7819c11d2c23SStefan Behrens 
7820733f4fbbSStefan Behrens static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7821733f4fbbSStefan Behrens {
7822a98cdb85SStefan Behrens 	int i;
7823a98cdb85SStefan Behrens 
7824a98cdb85SStefan Behrens 	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7825a98cdb85SStefan Behrens 		if (btrfs_dev_stat_read(dev, i) != 0)
7826a98cdb85SStefan Behrens 			break;
7827a98cdb85SStefan Behrens 	if (i == BTRFS_DEV_STAT_VALUES_MAX)
7828a98cdb85SStefan Behrens 		return; /* all values == 0, suppress message */
7829a98cdb85SStefan Behrens 
7830fb456252SJeff Mahoney 	btrfs_info_in_rcu(dev->fs_info,
7831ecaeb14bSDavid Sterba 		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7832606686eeSJosef Bacik 	       rcu_str_deref(dev->name),
7833733f4fbbSStefan Behrens 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7834733f4fbbSStefan Behrens 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7835733f4fbbSStefan Behrens 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7836733f4fbbSStefan Behrens 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7837733f4fbbSStefan Behrens 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7838733f4fbbSStefan Behrens }
7839733f4fbbSStefan Behrens 
78402ff7e61eSJeff Mahoney int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7841b27f7c0cSDavid Sterba 			struct btrfs_ioctl_get_dev_stats *stats)
7842c11d2c23SStefan Behrens {
7843c11d2c23SStefan Behrens 	struct btrfs_device *dev;
78440b246afaSJeff Mahoney 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7845c11d2c23SStefan Behrens 	int i;
7846c11d2c23SStefan Behrens 
7847c11d2c23SStefan Behrens 	mutex_lock(&fs_devices->device_list_mutex);
7848b2598edfSAnand Jain 	dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
7849c11d2c23SStefan Behrens 	mutex_unlock(&fs_devices->device_list_mutex);
7850c11d2c23SStefan Behrens 
7851c11d2c23SStefan Behrens 	if (!dev) {
78520b246afaSJeff Mahoney 		btrfs_warn(fs_info, "get dev_stats failed, device not found");
7853c11d2c23SStefan Behrens 		return -ENODEV;
7854733f4fbbSStefan Behrens 	} else if (!dev->dev_stats_valid) {
78550b246afaSJeff Mahoney 		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7856733f4fbbSStefan Behrens 		return -ENODEV;
7857b27f7c0cSDavid Sterba 	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7858c11d2c23SStefan Behrens 		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7859c11d2c23SStefan Behrens 			if (stats->nr_items > i)
7860c11d2c23SStefan Behrens 				stats->values[i] =
7861c11d2c23SStefan Behrens 					btrfs_dev_stat_read_and_reset(dev, i);
7862c11d2c23SStefan Behrens 			else
78634e411a7dSAnand Jain 				btrfs_dev_stat_set(dev, i, 0);
7864c11d2c23SStefan Behrens 		}
7865a69976bcSAnand Jain 		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
7866a69976bcSAnand Jain 			   current->comm, task_pid_nr(current));
7867c11d2c23SStefan Behrens 	} else {
7868c11d2c23SStefan Behrens 		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7869c11d2c23SStefan Behrens 			if (stats->nr_items > i)
7870c11d2c23SStefan Behrens 				stats->values[i] = btrfs_dev_stat_read(dev, i);
7871c11d2c23SStefan Behrens 	}
7872c11d2c23SStefan Behrens 	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7873c11d2c23SStefan Behrens 		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7874c11d2c23SStefan Behrens 	return 0;
7875c11d2c23SStefan Behrens }
7876a8a6dab7SStefan Behrens 
7877935e5cc9SMiao Xie /*
7878bbbf7243SNikolay Borisov  * Update the size and bytes used for each device where it changed.  This is
7879bbbf7243SNikolay Borisov  * delayed since we would otherwise get errors while writing out the
7880935e5cc9SMiao Xie  * superblocks.
7881bbbf7243SNikolay Borisov  *
7882bbbf7243SNikolay Borisov  * Must be invoked during transaction commit.
7883935e5cc9SMiao Xie  */
7884bbbf7243SNikolay Borisov void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
7885935e5cc9SMiao Xie {
7886935e5cc9SMiao Xie 	struct btrfs_device *curr, *next;
7887935e5cc9SMiao Xie 
7888bbbf7243SNikolay Borisov 	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7889bbbf7243SNikolay Borisov 
7890bbbf7243SNikolay Borisov 	if (list_empty(&trans->dev_update_list))
7891935e5cc9SMiao Xie 		return;
7892935e5cc9SMiao Xie 
7893bbbf7243SNikolay Borisov 	/*
7894bbbf7243SNikolay Borisov 	 * We don't need the device_list_mutex here.  This list is owned by the
7895bbbf7243SNikolay Borisov 	 * transaction and the transaction must complete before the device is
7896bbbf7243SNikolay Borisov 	 * released.
7897bbbf7243SNikolay Borisov 	 */
7898bbbf7243SNikolay Borisov 	mutex_lock(&trans->fs_info->chunk_mutex);
7899bbbf7243SNikolay Borisov 	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7900bbbf7243SNikolay Borisov 				 post_commit_list) {
7901bbbf7243SNikolay Borisov 		list_del_init(&curr->post_commit_list);
7902935e5cc9SMiao Xie 		curr->commit_total_bytes = curr->disk_total_bytes;
7903bbbf7243SNikolay Borisov 		curr->commit_bytes_used = curr->bytes_used;
7904935e5cc9SMiao Xie 	}
7905bbbf7243SNikolay Borisov 	mutex_unlock(&trans->fs_info->chunk_mutex);
7906ce7213c7SMiao Xie }
79075a13f430SAnand Jain 
790846df06b8SDavid Sterba /*
790946df06b8SDavid Sterba  * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
791046df06b8SDavid Sterba  */
791146df06b8SDavid Sterba int btrfs_bg_type_to_factor(u64 flags)
791246df06b8SDavid Sterba {
791344b28adaSDavid Sterba 	const int index = btrfs_bg_flags_to_raid_index(flags);
791444b28adaSDavid Sterba 
791544b28adaSDavid Sterba 	return btrfs_raid_array[index].ncopies;
791646df06b8SDavid Sterba }
7917cf90d884SQu Wenruo 
7918cf90d884SQu Wenruo 
7919cf90d884SQu Wenruo 
/*
 * Cross-check one dev extent item against the in-memory chunk mapping.
 *
 * Verifies that the dev extent at @physical_offset/@physical_len on @devid:
 *  - has a chunk mapping at @chunk_offset,
 *  - has a length matching the per-device stripe length of that chunk,
 *  - matches exactly one stripe of the chunk (counted in verified_stripes),
 *  - lies fully inside the device, and
 *  - is zone aligned on zoned devices.
 *
 * Returns 0 if all checks pass, -EUCLEAN on any mismatch.
 */
static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
				 u64 chunk_offset, u64 devid,
				 u64 physical_offset, u64 physical_len)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	u64 stripe_len;
	bool found = false;
	int ret = 0;
	int i;

	/* The chunk this dev extent points back to must exist in memory. */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
		goto out;
	}

	/* Every stripe of a chunk occupies the same length on its device. */
	map = em->map_lookup;
	stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
	if (physical_len != stripe_len) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
			  physical_offset, devid, em->start, physical_len,
			  stripe_len);
		ret = -EUCLEAN;
		goto out;
	}

	/*
	 * Find the stripe of this chunk that matches (devid, physical).
	 * verified_stripes counts matches across all dev extents so
	 * verify_chunk_dev_extent_mapping() can later detect missing ones;
	 * more matches than stripes means duplicate dev extents.
	 */
	for (i = 0; i < map->num_stripes; i++) {
		if (map->stripes[i].dev->devid == devid &&
		    map->stripes[i].physical == physical_offset) {
			found = true;
			if (map->verified_stripes >= map->num_stripes) {
				btrfs_err(fs_info,
				"too many dev extents for chunk %llu found",
					  em->start);
				ret = -EUCLEAN;
				goto out;
			}
			map->verified_stripes++;
			break;
		}
	}
	if (!found) {
		btrfs_err(fs_info,
	"dev extent physical offset %llu devid %llu has no corresponding chunk",
			physical_offset, devid);
		ret = -EUCLEAN;
	}
	/*
	 * Deliberate fall-through on !found: ret stays -EUCLEAN since the
	 * checks below only ever set -EUCLEAN, never clear it.
	 */

	/* Make sure no dev extent is beyond device boundary */
	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
	if (!dev) {
		btrfs_err(fs_info, "failed to find devid %llu", devid);
		ret = -EUCLEAN;
		goto out;
	}

	if (physical_offset + physical_len > dev->disk_total_bytes) {
		btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
			  devid, physical_offset, physical_len,
			  dev->disk_total_bytes);
		ret = -EUCLEAN;
		goto out;
	}

	/* On zoned devices every dev extent must be zone aligned. */
	if (dev->zone_info) {
		u64 zone_size = dev->zone_info->zone_size;

		if (!IS_ALIGNED(physical_offset, zone_size) ||
		    !IS_ALIGNED(physical_len, zone_size)) {
			btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
				  devid, physical_offset, physical_len);
			ret = -EUCLEAN;
			goto out;
		}
	}

out:
	/* free_extent_map() handles a NULL em from the failed lookup. */
	free_extent_map(em);
	return ret;
}
8012cf90d884SQu Wenruo 
8013cf90d884SQu Wenruo static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
8014cf90d884SQu Wenruo {
8015c8bf1b67SDavid Sterba 	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
8016cf90d884SQu Wenruo 	struct extent_map *em;
8017cf90d884SQu Wenruo 	struct rb_node *node;
8018cf90d884SQu Wenruo 	int ret = 0;
8019cf90d884SQu Wenruo 
8020cf90d884SQu Wenruo 	read_lock(&em_tree->lock);
802107e1ce09SLiu Bo 	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
8022cf90d884SQu Wenruo 		em = rb_entry(node, struct extent_map, rb_node);
8023cf90d884SQu Wenruo 		if (em->map_lookup->num_stripes !=
8024cf90d884SQu Wenruo 		    em->map_lookup->verified_stripes) {
8025cf90d884SQu Wenruo 			btrfs_err(fs_info,
8026cf90d884SQu Wenruo 			"chunk %llu has missing dev extent, have %d expect %d",
8027cf90d884SQu Wenruo 				  em->start, em->map_lookup->verified_stripes,
8028cf90d884SQu Wenruo 				  em->map_lookup->num_stripes);
8029cf90d884SQu Wenruo 			ret = -EUCLEAN;
8030cf90d884SQu Wenruo 			goto out;
8031cf90d884SQu Wenruo 		}
8032cf90d884SQu Wenruo 	}
8033cf90d884SQu Wenruo out:
8034cf90d884SQu Wenruo 	read_unlock(&em_tree->lock);
8035cf90d884SQu Wenruo 	return ret;
8036cf90d884SQu Wenruo }
8037cf90d884SQu Wenruo 
/*
 * Ensure that all dev extents are mapped to correct chunk, otherwise
 * later chunk allocation/free would cause unexpected behavior.
 *
 * NOTE: This will iterate through the whole device tree, which should be of
 * the same size level as the chunk tree.  This slightly increases mount time.
 */
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	/* Track the previous dev extent's end to detect overlaps per device. */
	u64 prev_devid = 0;
	u64 prev_dev_ext_end = 0;
	int ret = 0;

	/*
	 * We don't have a dev_root because we mounted with ignorebadroots and
	 * failed to load the root, so we want to skip the verification in this
	 * case for sure.
	 *
	 * However if the dev root is fine, but the tree itself is corrupted
	 * we'd still fail to mount.  This verification is only to make sure
	 * writes can happen safely, so instead just bypass this check
	 * completely in the case of IGNOREBADROOTS.
	 */
	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
		return 0;

	/* Start at the lowest possible dev extent key (devids start at 1). */
	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	/* The search may land past the last item of a leaf; step forward. */
	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	/* Walk every BTRFS_DEV_EXTENT_KEY item in key order. */
	while (1) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_dev_extent *dext;
		int slot = path->slots[0];
		u64 chunk_offset;
		u64 physical_offset;
		u64 physical_len;
		u64 devid;

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type != BTRFS_DEV_EXTENT_KEY)
			break;
		/* Key layout: (devid, BTRFS_DEV_EXTENT_KEY, physical offset). */
		devid = key.objectid;
		physical_offset = key.offset;

		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
		physical_len = btrfs_dev_extent_length(leaf, dext);

		/* Check if this dev extent overlaps with the previous one */
		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
			btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
				  devid, physical_offset, prev_dev_ext_end);
			ret = -EUCLEAN;
			goto out;
		}

		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
					    physical_offset, physical_len);
		if (ret < 0)
			goto out;
		prev_devid = devid;
		prev_dev_ext_end = physical_offset + physical_len;

		ret = btrfs_next_item(root, path);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			/* End of the device tree reached. */
			ret = 0;
			break;
		}
	}

	/* Ensure all chunks have corresponding dev extents */
	ret = verify_chunk_dev_extent_mapping(fs_info);
out:
	btrfs_free_path(path);
	return ret;
}
8140eede2bf3SOmar Sandoval 
8141eede2bf3SOmar Sandoval /*
8142eede2bf3SOmar Sandoval  * Check whether the given block group or device is pinned by any inode being
8143eede2bf3SOmar Sandoval  * used as a swapfile.
8144eede2bf3SOmar Sandoval  */
8145eede2bf3SOmar Sandoval bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
8146eede2bf3SOmar Sandoval {
8147eede2bf3SOmar Sandoval 	struct btrfs_swapfile_pin *sp;
8148eede2bf3SOmar Sandoval 	struct rb_node *node;
8149eede2bf3SOmar Sandoval 
8150eede2bf3SOmar Sandoval 	spin_lock(&fs_info->swapfile_pins_lock);
8151eede2bf3SOmar Sandoval 	node = fs_info->swapfile_pins.rb_node;
8152eede2bf3SOmar Sandoval 	while (node) {
8153eede2bf3SOmar Sandoval 		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
8154eede2bf3SOmar Sandoval 		if (ptr < sp->ptr)
8155eede2bf3SOmar Sandoval 			node = node->rb_left;
8156eede2bf3SOmar Sandoval 		else if (ptr > sp->ptr)
8157eede2bf3SOmar Sandoval 			node = node->rb_right;
8158eede2bf3SOmar Sandoval 		else
8159eede2bf3SOmar Sandoval 			break;
8160eede2bf3SOmar Sandoval 	}
8161eede2bf3SOmar Sandoval 	spin_unlock(&fs_info->swapfile_pins_lock);
8162eede2bf3SOmar Sandoval 	return node != NULL;
8163eede2bf3SOmar Sandoval }
8164f7ef5287SNaohiro Aota 
/*
 * Worker that relocates one block group to repair an I/O failure on a zoned
 * filesystem.
 *
 * @data holds a referenced struct btrfs_block_group; this thread consumes
 * that reference (it records the start offset and drops the ref immediately,
 * re-looking the group up under reclaim_bgs_lock before acting on it).
 */
static int relocating_repair_kthread(void *data)
{
	struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 target;
	int ret = 0;

	/* Drop the caller's reference; only the logical start is needed. */
	target = cache->start;
	btrfs_put_block_group(cache);

	/* Serialize against balance/device ops via the exclusive op slot. */
	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		btrfs_info(fs_info,
			   "zoned: skip relocating block group %llu to repair: EBUSY",
			   target);
		return -EBUSY;
	}

	mutex_lock(&fs_info->reclaim_bgs_lock);

	/* Ensure block group still exists */
	cache = btrfs_lookup_block_group(fs_info, target);
	if (!cache)
		goto out;

	/* Bail if another path already cleared the repair request. */
	if (!cache->relocating_repair)
		goto out;

	/* Make sure a new data chunk can be allocated before moving data. */
	ret = btrfs_may_alloc_data_chunk(fs_info, target);
	if (ret < 0)
		goto out;

	btrfs_info(fs_info,
		   "zoned: relocating block group %llu to repair IO failure",
		   target);
	ret = btrfs_relocate_chunk(fs_info, target);

out:
	if (cache)
		btrfs_put_block_group(cache);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_exclop_finish(fs_info);

	return ret;
}
8209f7ef5287SNaohiro Aota 
8210f7ef5287SNaohiro Aota int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
8211f7ef5287SNaohiro Aota {
8212f7ef5287SNaohiro Aota 	struct btrfs_block_group *cache;
8213f7ef5287SNaohiro Aota 
8214f7ef5287SNaohiro Aota 	/* Do not attempt to repair in degraded state */
8215f7ef5287SNaohiro Aota 	if (btrfs_test_opt(fs_info, DEGRADED))
8216f7ef5287SNaohiro Aota 		return 0;
8217f7ef5287SNaohiro Aota 
8218f7ef5287SNaohiro Aota 	cache = btrfs_lookup_block_group(fs_info, logical);
8219f7ef5287SNaohiro Aota 	if (!cache)
8220f7ef5287SNaohiro Aota 		return 0;
8221f7ef5287SNaohiro Aota 
8222f7ef5287SNaohiro Aota 	spin_lock(&cache->lock);
8223f7ef5287SNaohiro Aota 	if (cache->relocating_repair) {
8224f7ef5287SNaohiro Aota 		spin_unlock(&cache->lock);
8225f7ef5287SNaohiro Aota 		btrfs_put_block_group(cache);
8226f7ef5287SNaohiro Aota 		return 0;
8227f7ef5287SNaohiro Aota 	}
8228f7ef5287SNaohiro Aota 	cache->relocating_repair = 1;
8229f7ef5287SNaohiro Aota 	spin_unlock(&cache->lock);
8230f7ef5287SNaohiro Aota 
8231f7ef5287SNaohiro Aota 	kthread_run(relocating_repair_kthread, cache,
8232f7ef5287SNaohiro Aota 		    "btrfs-relocating-repair");
8233f7ef5287SNaohiro Aota 
8234f7ef5287SNaohiro Aota 	return 0;
8235f7ef5287SNaohiro Aota }
8236