xref: /openbmc/linux/fs/btrfs/volumes.c (revision 278002edb19bce2c628fafb0af936e77000f3a5b)
1c1d7c514SDavid Sterba // SPDX-License-Identifier: GPL-2.0
20b86a832SChris Mason /*
30b86a832SChris Mason  * Copyright (C) 2007 Oracle.  All rights reserved.
40b86a832SChris Mason  */
5c1d7c514SDavid Sterba 
60b86a832SChris Mason #include <linux/sched.h>
7fccc0007SJosef Bacik #include <linux/sched/mm.h>
85a0e3ad6STejun Heo #include <linux/slab.h>
9442a4f63SStefan Behrens #include <linux/ratelimit.h>
1059641015SIlya Dryomov #include <linux/kthread.h>
11803b2f54SStefan Behrens #include <linux/semaphore.h>
128da4b8c4SAndy Shevchenko #include <linux/uuid.h>
13f8e10cd3SAnand Jain #include <linux/list_sort.h>
1454fde91fSJosef Bacik #include <linux/namei.h>
15784352feSDavid Sterba #include "misc.h"
160b86a832SChris Mason #include "ctree.h"
170b86a832SChris Mason #include "extent_map.h"
180b86a832SChris Mason #include "disk-io.h"
190b86a832SChris Mason #include "transaction.h"
200b86a832SChris Mason #include "print-tree.h"
210b86a832SChris Mason #include "volumes.h"
2253b381b3SDavid Woodhouse #include "raid56.h"
23606686eeSJosef Bacik #include "rcu-string.h"
248dabb742SStefan Behrens #include "dev-replace.h"
2599994cdeSAnand Jain #include "sysfs.h"
2682fc28fbSQu Wenruo #include "tree-checker.h"
278719aaaeSJosef Bacik #include "space-info.h"
28aac0023cSJosef Bacik #include "block-group.h"
29b0643e59SDennis Zhou #include "discard.h"
305b316468SNaohiro Aota #include "zoned.h"
31c7f13d42SJosef Bacik #include "fs.h"
3207e81dc9SJosef Bacik #include "accessors.h"
33c7a03b52SJosef Bacik #include "uuid-tree.h"
347572dec8SJosef Bacik #include "ioctl.h"
3567707479SJosef Bacik #include "relocation.h"
362fc6822cSJosef Bacik #include "scrub.h"
377f0add25SJosef Bacik #include "super.h"
380b86a832SChris Mason 
/* Block group profiles that stripe data across multiple devices. */
#define BTRFS_BLOCK_GROUP_STRIPE_MASK	(BTRFS_BLOCK_GROUP_RAID0 | \
					 BTRFS_BLOCK_GROUP_RAID10 | \
					 BTRFS_BLOCK_GROUP_RAID56_MASK)
42bf08387fSQu Wenruo 
/*
 * Per-profile geometry and constraints, indexed by enum btrfs_raid_types
 * (see btrfs_bg_flags_to_raid_index()).  Fields as used in this file:
 * sub_stripes/dev_stripes describe the stripe layout, devs_max/devs_min
 * bound the device count (0 == no upper bound), tolerated_failures is the
 * number of devices that may be lost, ncopies/nparity the redundancy, and
 * bg_flag/mindev_error tie the entry back to on-disk flags and error codes.
 */
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "raid10",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "raid1",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 3,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 3,
		.ncopies	= 3,
		.nparity        = 0,
		.raid_name	= "raid1c3",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 4,
		.devs_min	= 4,
		.tolerated_failures = 3,
		.devs_increment	= 4,
		.ncopies	= 4,
		.nparity        = 0,
		.raid_name	= "raid1c4",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,	/* two copies on the same device */
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "dup",
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 0,
		.raid_name	= "raid0",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 0,
		.raid_name	= "single",
		.bg_flag	= 0,	/* no dedicated block group flag */
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 1,
		.raid_name	= "raid5",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 2,
		.raid_name	= "raid6",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};
162af902047SZhao Lei 
163500a44c9SDavid Sterba /*
164500a44c9SDavid Sterba  * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
165500a44c9SDavid Sterba  * can be used as index to access btrfs_raid_array[].
166500a44c9SDavid Sterba  */
btrfs_bg_flags_to_raid_index(u64 flags)167500a44c9SDavid Sterba enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
168500a44c9SDavid Sterba {
169719fae89SQu Wenruo 	const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
170500a44c9SDavid Sterba 
171719fae89SQu Wenruo 	if (!profile)
172719fae89SQu Wenruo 		return BTRFS_RAID_SINGLE;
173719fae89SQu Wenruo 
174719fae89SQu Wenruo 	return BTRFS_BG_FLAG_TO_INDEX(profile);
175500a44c9SDavid Sterba }
176500a44c9SDavid Sterba 
/* Map block group flags to the textual profile name, NULL if unrecognized. */
const char *btrfs_bg_type_to_raid_name(u64 flags)
{
	const enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(flags);

	/* Out-of-range index means the flags did not match any known profile. */
	if (index < BTRFS_NR_RAID_TYPES)
		return btrfs_raid_array[index].raid_name;

	return NULL;
}
186ed23467bSAnand Jain 
/* Return the number of parity stripes for the profile given by @type. */
int btrfs_nr_parity_stripes(u64 type)
{
	return btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].nparity;
}
1930b30f719SQu Wenruo 
/*
 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 * bytes including terminating null byte.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
	int i;
	int ret;
	char *bp = buf;		/* current write position in @buf */
	u64 flags = bg_flags;	/* flags not yet described */
	u32 size_bp = size_buf;	/* bytes still available at @bp */

	if (!flags) {
		strcpy(bp, "NONE");
		return;
	}

/*
 * Append "<desc>|" if @flag is set, clear it from @flags and advance
 * bp/size_bp.  Jumps to out_overflow when the output would be truncated.
 */
#define DESCRIBE_FLAG(flag, desc)						\
	do {								\
		if (flags & (flag)) {					\
			ret = snprintf(bp, size_bp, "%s|", (desc));	\
			if (ret < 0 || ret >= size_bp)			\
				goto out_overflow;			\
			size_bp -= ret;					\
			bp += ret;					\
			flags &= ~(flag);				\
		}							\
	} while (0)

	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
			      btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

	/* Any leftover unknown bits are emitted verbatim in hex. */
	if (flags) {
		ret = snprintf(bp, size_bp, "0x%llx|", flags);
		size_bp -= ret;
	}

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

	/*
	 * The text is trimmed, it's up to the caller to provide sufficiently
	 * large buffer
	 */
out_overflow:;
}
247f89e09cfSAnand Jain 
2486f8e0fc7SDavid Sterba static int init_first_rw_device(struct btrfs_trans_handle *trans);
2492ff7e61eSJeff Mahoney static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
250733f4fbbSStefan Behrens static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
2512b82032cSYan Zheng 
2529c6b1c4dSDavid Sterba /*
2539c6b1c4dSDavid Sterba  * Device locking
2549c6b1c4dSDavid Sterba  * ==============
2559c6b1c4dSDavid Sterba  *
2569c6b1c4dSDavid Sterba  * There are several mutexes that protect manipulation of devices and low-level
2579c6b1c4dSDavid Sterba  * structures like chunks but not block groups, extents or files
2589c6b1c4dSDavid Sterba  *
2599c6b1c4dSDavid Sterba  * uuid_mutex (global lock)
2609c6b1c4dSDavid Sterba  * ------------------------
2619c6b1c4dSDavid Sterba  * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
2629c6b1c4dSDavid Sterba  * the SCAN_DEV ioctl registration or from mount either implicitly (the first
2639c6b1c4dSDavid Sterba  * device) or requested by the device= mount option
2649c6b1c4dSDavid Sterba  *
2659c6b1c4dSDavid Sterba  * the mutex can be very coarse and can cover long-running operations
2669c6b1c4dSDavid Sterba  *
2679c6b1c4dSDavid Sterba  * protects: updates to fs_devices counters like missing devices, rw devices,
26852042d8eSAndrea Gelmini  * seeding, structure cloning, opening/closing devices at mount/umount time
2699c6b1c4dSDavid Sterba  *
2709c6b1c4dSDavid Sterba  * global::fs_devs - add, remove, updates to the global list
2719c6b1c4dSDavid Sterba  *
27218c850fdSJosef Bacik  * does not protect: manipulation of the fs_devices::devices list in general
27318c850fdSJosef Bacik  * but in mount context it could be used to exclude list modifications by eg.
27418c850fdSJosef Bacik  * scan ioctl
2759c6b1c4dSDavid Sterba  *
2769c6b1c4dSDavid Sterba  * btrfs_device::name - renames (write side), read is RCU
2779c6b1c4dSDavid Sterba  *
2789c6b1c4dSDavid Sterba  * fs_devices::device_list_mutex (per-fs, with RCU)
2799c6b1c4dSDavid Sterba  * ------------------------------------------------
2809c6b1c4dSDavid Sterba  * protects updates to fs_devices::devices, ie. adding and deleting
2819c6b1c4dSDavid Sterba  *
2829c6b1c4dSDavid Sterba  * simple list traversal with read-only actions can be done with RCU protection
2839c6b1c4dSDavid Sterba  *
2849c6b1c4dSDavid Sterba  * may be used to exclude some operations from running concurrently without any
2859c6b1c4dSDavid Sterba  * modifications to the list (see write_all_supers)
2869c6b1c4dSDavid Sterba  *
28718c850fdSJosef Bacik  * Is not required at mount and close times, because our device list is
28818c850fdSJosef Bacik  * protected by the uuid_mutex at that point.
28918c850fdSJosef Bacik  *
2909c6b1c4dSDavid Sterba  * balance_mutex
2919c6b1c4dSDavid Sterba  * -------------
2929c6b1c4dSDavid Sterba  * protects balance structures (status, state) and context accessed from
2939c6b1c4dSDavid Sterba  * several places (internally, ioctl)
2949c6b1c4dSDavid Sterba  *
2959c6b1c4dSDavid Sterba  * chunk_mutex
2969c6b1c4dSDavid Sterba  * -----------
2979c6b1c4dSDavid Sterba  * protects chunks, adding or removing during allocation, trim or when a new
2980b6f5d40SNikolay Borisov  * device is added/removed. Additionally it also protects post_commit_list of
2990b6f5d40SNikolay Borisov  * individual devices, since they can be added to the transaction's
3000b6f5d40SNikolay Borisov  * post_commit_list only with chunk_mutex held.
3019c6b1c4dSDavid Sterba  *
3029c6b1c4dSDavid Sterba  * cleaner_mutex
3039c6b1c4dSDavid Sterba  * -------------
3049c6b1c4dSDavid Sterba  * a big lock that is held by the cleaner thread and prevents running subvolume
3059c6b1c4dSDavid Sterba  * cleaning together with relocation or delayed iputs
3069c6b1c4dSDavid Sterba  *
3079c6b1c4dSDavid Sterba  *
3089c6b1c4dSDavid Sterba  * Lock nesting
3099c6b1c4dSDavid Sterba  * ============
3109c6b1c4dSDavid Sterba  *
3119c6b1c4dSDavid Sterba  * uuid_mutex
3129c6b1c4dSDavid Sterba  *   device_list_mutex
3139c6b1c4dSDavid Sterba  *     chunk_mutex
3149c6b1c4dSDavid Sterba  *   balance_mutex
31589595e80SAnand Jain  *
31689595e80SAnand Jain  *
317c3e1f96cSGoldwyn Rodrigues  * Exclusive operations
318c3e1f96cSGoldwyn Rodrigues  * ====================
31989595e80SAnand Jain  *
32089595e80SAnand Jain  * Maintains the exclusivity of the following operations that apply to the
32189595e80SAnand Jain  * whole filesystem and cannot run in parallel.
32289595e80SAnand Jain  *
32389595e80SAnand Jain  * - Balance (*)
32489595e80SAnand Jain  * - Device add
32589595e80SAnand Jain  * - Device remove
32689595e80SAnand Jain  * - Device replace (*)
32789595e80SAnand Jain  * - Resize
32889595e80SAnand Jain  *
32989595e80SAnand Jain  * The device operations (as above) can be in one of the following states:
33089595e80SAnand Jain  *
33189595e80SAnand Jain  * - Running state
33289595e80SAnand Jain  * - Paused state
33389595e80SAnand Jain  * - Completed state
33489595e80SAnand Jain  *
33589595e80SAnand Jain  * Only device operations marked with (*) can go into the Paused state for the
33689595e80SAnand Jain  * following reasons:
33789595e80SAnand Jain  *
33889595e80SAnand Jain  * - ioctl (only Balance can be Paused through ioctl)
33989595e80SAnand Jain  * - filesystem remounted as read-only
34089595e80SAnand Jain  * - filesystem unmounted and mounted as read-only
34189595e80SAnand Jain  * - system power-cycle and filesystem mounted as read-only
34289595e80SAnand Jain  * - filesystem or device errors leading to forced read-only
34389595e80SAnand Jain  *
344c3e1f96cSGoldwyn Rodrigues  * The status of exclusive operation is set and cleared atomically.
345c3e1f96cSGoldwyn Rodrigues  * During the course of Paused state, fs_info::exclusive_operation remains set.
34689595e80SAnand Jain  * A device operation in Paused or Running state can be canceled or resumed
34789595e80SAnand Jain  * either by ioctl (Balance only) or when remounted as read-write.
348c3e1f96cSGoldwyn Rodrigues  * The exclusive status is cleared when the device operation is canceled or
34989595e80SAnand Jain  * completed.
3509c6b1c4dSDavid Sterba  */
3519c6b1c4dSDavid Sterba 
35267a2c45eSMiao Xie DEFINE_MUTEX(uuid_mutex);
3538a4b83ccSChris Mason static LIST_HEAD(fs_uuids);
/* Return the global list of scanned filesystems (fs_devices), see fs_uuids. */
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}
3588a4b83ccSChris Mason 
3592dfeca9bSDavid Sterba /*
3602dfeca9bSDavid Sterba  * alloc_fs_devices - allocate struct btrfs_fs_devices
3617239ff4bSNikolay Borisov  * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
3627239ff4bSNikolay Borisov  * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
3632dfeca9bSDavid Sterba  *
3642dfeca9bSDavid Sterba  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
3652dfeca9bSDavid Sterba  * The returned struct is not linked onto any lists and can be destroyed with
3662dfeca9bSDavid Sterba  * kfree() right away.
3672dfeca9bSDavid Sterba  */
alloc_fs_devices(const u8 * fsid,const u8 * metadata_fsid)3687239ff4bSNikolay Borisov static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
3697239ff4bSNikolay Borisov 						 const u8 *metadata_fsid)
3702208a378SIlya Dryomov {
3712208a378SIlya Dryomov 	struct btrfs_fs_devices *fs_devs;
3722208a378SIlya Dryomov 
37319c4c49cSAnand Jain 	ASSERT(fsid || !metadata_fsid);
37419c4c49cSAnand Jain 
37578f2c9e6SDavid Sterba 	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
3762208a378SIlya Dryomov 	if (!fs_devs)
3772208a378SIlya Dryomov 		return ERR_PTR(-ENOMEM);
3782208a378SIlya Dryomov 
3792208a378SIlya Dryomov 	mutex_init(&fs_devs->device_list_mutex);
3802208a378SIlya Dryomov 
3812208a378SIlya Dryomov 	INIT_LIST_HEAD(&fs_devs->devices);
3822208a378SIlya Dryomov 	INIT_LIST_HEAD(&fs_devs->alloc_list);
383c4babc5eSAnand Jain 	INIT_LIST_HEAD(&fs_devs->fs_list);
384944d3f9fSNikolay Borisov 	INIT_LIST_HEAD(&fs_devs->seed_list);
3852208a378SIlya Dryomov 
38619c4c49cSAnand Jain 	if (fsid) {
38719c4c49cSAnand Jain 		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
38819c4c49cSAnand Jain 		memcpy(fs_devs->metadata_uuid,
38919c4c49cSAnand Jain 		       metadata_fsid ?: fsid, BTRFS_FSID_SIZE);
39019c4c49cSAnand Jain 	}
3917239ff4bSNikolay Borisov 
3922208a378SIlya Dryomov 	return fs_devs;
3932208a378SIlya Dryomov }
3942208a378SIlya Dryomov 
/*
 * Release a btrfs_device and the resources hanging off it.  The device must
 * already be unlinked from all lists (checked for post_commit_list below).
 */
static void btrfs_free_device(struct btrfs_device *device)
{
	WARN_ON(!list_empty(&device->post_commit_list));
	rcu_string_free(device->name);
	extent_io_tree_release(&device->alloc_state);
	btrfs_destroy_dev_zone_info(device);
	kfree(device);
}
40348dae9cfSDavid Sterba 
free_fs_devices(struct btrfs_fs_devices * fs_devices)404e4404d6eSYan Zheng static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
405e4404d6eSYan Zheng {
406e4404d6eSYan Zheng 	struct btrfs_device *device;
4075f58d783SAnand Jain 
408e4404d6eSYan Zheng 	WARN_ON(fs_devices->opened);
409e4404d6eSYan Zheng 	while (!list_empty(&fs_devices->devices)) {
410e4404d6eSYan Zheng 		device = list_entry(fs_devices->devices.next,
411e4404d6eSYan Zheng 				    struct btrfs_device, dev_list);
412e4404d6eSYan Zheng 		list_del(&device->dev_list);
413a425f9d4SDavid Sterba 		btrfs_free_device(device);
414e4404d6eSYan Zheng 	}
415e4404d6eSYan Zheng 	kfree(fs_devices);
416e4404d6eSYan Zheng }
417e4404d6eSYan Zheng 
btrfs_cleanup_fs_uuids(void)418ffc5a379SDavid Sterba void __exit btrfs_cleanup_fs_uuids(void)
4198a4b83ccSChris Mason {
4208a4b83ccSChris Mason 	struct btrfs_fs_devices *fs_devices;
4218a4b83ccSChris Mason 
4222b82032cSYan Zheng 	while (!list_empty(&fs_uuids)) {
4232b82032cSYan Zheng 		fs_devices = list_entry(fs_uuids.next,
424c4babc5eSAnand Jain 					struct btrfs_fs_devices, fs_list);
425c4babc5eSAnand Jain 		list_del(&fs_devices->fs_list);
426e4404d6eSYan Zheng 		free_fs_devices(fs_devices);
4278a4b83ccSChris Mason 	}
4288a4b83ccSChris Mason }
4298a4b83ccSChris Mason 
match_fsid_fs_devices(const struct btrfs_fs_devices * fs_devices,const u8 * fsid,const u8 * metadata_fsid)4301a898345SAnand Jain static bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices,
4311a898345SAnand Jain 				  const u8 *fsid, const u8 *metadata_fsid)
4321a898345SAnand Jain {
4331a898345SAnand Jain 	if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0)
4341a898345SAnand Jain 		return false;
4351a898345SAnand Jain 
4361a898345SAnand Jain 	if (!metadata_fsid)
4371a898345SAnand Jain 		return true;
4381a898345SAnand Jain 
4391a898345SAnand Jain 	if (memcmp(metadata_fsid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0)
4401a898345SAnand Jain 		return false;
4411a898345SAnand Jain 
4421a898345SAnand Jain 	return true;
4431a898345SAnand Jain }
4441a898345SAnand Jain 
find_fsid(const u8 * fsid,const u8 * metadata_fsid)4457239ff4bSNikolay Borisov static noinline struct btrfs_fs_devices *find_fsid(
4467239ff4bSNikolay Borisov 		const u8 *fsid, const u8 *metadata_fsid)
4478a4b83ccSChris Mason {
4488a4b83ccSChris Mason 	struct btrfs_fs_devices *fs_devices;
4498a4b83ccSChris Mason 
4507239ff4bSNikolay Borisov 	ASSERT(fsid);
4517239ff4bSNikolay Borisov 
452c6730a0eSSu Yue 	/* Handle non-split brain cases */
453c6730a0eSSu Yue 	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
4541a898345SAnand Jain 		if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid))
455c6730a0eSSu Yue 			return fs_devices;
456c6730a0eSSu Yue 	}
457c6730a0eSSu Yue 	return NULL;
458c6730a0eSSu Yue }
459c6730a0eSSu Yue 
460a3c54b0bSAnand Jain /*
461a3c54b0bSAnand Jain  * First check if the metadata_uuid is different from the fsid in the given
462a3c54b0bSAnand Jain  * fs_devices. Then check if the given fsid is the same as the metadata_uuid
463a3c54b0bSAnand Jain  * in the fs_devices. If it is, return true; otherwise, return false.
464a3c54b0bSAnand Jain  */
check_fsid_changed(const struct btrfs_fs_devices * fs_devices,const u8 * fsid)465a3c54b0bSAnand Jain static inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices,
466a3c54b0bSAnand Jain 				      const u8 *fsid)
467a3c54b0bSAnand Jain {
468a3c54b0bSAnand Jain 	return memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
469a3c54b0bSAnand Jain 		      BTRFS_FSID_SIZE) != 0 &&
470a3c54b0bSAnand Jain 	       memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0;
471a3c54b0bSAnand Jain }
472a3c54b0bSAnand Jain 
/*
 * Find the fs_devices for a scanned device whose superblock carries a
 * separate metadata_uuid, accounting for an fsid change (CHANGING_FSID_V2)
 * that may have completed on this device but not on the one that created
 * the fs_devices.  The two loops below must stay in this order: they handle
 * the two distinct stale-fs_devices shapes before falling back to a plain
 * lookup.
 */
static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
				struct btrfs_super_block *disk_super)
{

	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by first scanning
	 * a device which didn't have its fsid/metadata_uuid changed
	 * at all and the CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (!fs_devices->fsid_change)
			continue;

		if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid,
					  fs_devices->fsid))
			return fs_devices;
	}

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by a device that
	 * has an outdated pair of fsid/metadata_uuid and
	 * CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (!fs_devices->fsid_change)
			continue;

		if (check_fsid_changed(fs_devices, disk_super->metadata_uuid))
			return fs_devices;
	}

	/* No in-progress change matched, do a straight fsid lookup. */
	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}
5107a62d0f0SNikolay Borisov 
5118a4b83ccSChris Mason 
512beaf8ab3SStefan Behrens static int
btrfs_get_bdev_and_sb(const char * device_path,blk_mode_t flags,void * holder,int flush,struct block_device ** bdev,struct btrfs_super_block ** disk_super)51305bdb996SChristoph Hellwig btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
514beaf8ab3SStefan Behrens 		      int flush, struct block_device **bdev,
5158f32380dSJohannes Thumshirn 		      struct btrfs_super_block **disk_super)
516beaf8ab3SStefan Behrens {
517beaf8ab3SStefan Behrens 	int ret;
518beaf8ab3SStefan Behrens 
5190718afd4SChristoph Hellwig 	*bdev = blkdev_get_by_path(device_path, flags, holder, NULL);
520beaf8ab3SStefan Behrens 
521beaf8ab3SStefan Behrens 	if (IS_ERR(*bdev)) {
522beaf8ab3SStefan Behrens 		ret = PTR_ERR(*bdev);
523beaf8ab3SStefan Behrens 		goto error;
524beaf8ab3SStefan Behrens 	}
525beaf8ab3SStefan Behrens 
526beaf8ab3SStefan Behrens 	if (flush)
5271226dfffSChristoph Hellwig 		sync_blockdev(*bdev);
5289f6d2510SDavid Sterba 	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
529beaf8ab3SStefan Behrens 	if (ret) {
5302736e8eeSChristoph Hellwig 		blkdev_put(*bdev, holder);
531beaf8ab3SStefan Behrens 		goto error;
532beaf8ab3SStefan Behrens 	}
533beaf8ab3SStefan Behrens 	invalidate_bdev(*bdev);
5348f32380dSJohannes Thumshirn 	*disk_super = btrfs_read_dev_super(*bdev);
5358f32380dSJohannes Thumshirn 	if (IS_ERR(*disk_super)) {
5368f32380dSJohannes Thumshirn 		ret = PTR_ERR(*disk_super);
5372736e8eeSChristoph Hellwig 		blkdev_put(*bdev, holder);
538beaf8ab3SStefan Behrens 		goto error;
539beaf8ab3SStefan Behrens 	}
540beaf8ab3SStefan Behrens 
541beaf8ab3SStefan Behrens 	return 0;
542beaf8ab3SStefan Behrens 
543beaf8ab3SStefan Behrens error:
544beaf8ab3SStefan Behrens 	*bdev = NULL;
545beaf8ab3SStefan Behrens 	return ret;
546beaf8ab3SStefan Behrens }
547beaf8ab3SStefan Behrens 
/*
 *  Search and remove all stale devices (which are not mounted).  When both
 *  inputs are NULL, it will search and release all stale devices.
 *
 *  @devt:         Optional. When provided will it release all unmounted devices
 *                 matching this devt only.
 *  @skip_device:  Optional. Will skip this device when searching for the stale
 *                 devices.
 *
 *  Return:	0 for success or if @devt is 0.
 *		-EBUSY if @devt is a mounted device.
 *		-ENOENT if @devt does not match any device in the list.
 */
static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
	int ret = 0;

	lockdep_assert_held(&uuid_mutex);

	/* Assume "not found" until a matching device is freed or found busy. */
	if (devt)
		ret = -ENOENT;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {

		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
				continue;
			if (devt && devt != device->devt)
				continue;
			if (fs_devices->opened) {
				/* for an already deleted device return 0 */
				if (devt && ret != 0)
					ret = -EBUSY;
				break;
			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			ret = 0;
		}
		mutex_unlock(&fs_devices->device_list_mutex);

		/* Drop the fs_devices itself once its last device is gone. */
		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}

	return ret;
}
6064fde46f0SAnand Jain 
60718c850fdSJosef Bacik /*
60818c850fdSJosef Bacik  * This is only used on mount, and we are protected from competing things
60918c850fdSJosef Bacik  * messing with our fs_devices by the uuid_mutex, thus we do not need the
61018c850fdSJosef Bacik  * fs_devices->device_list_mutex here.
61118c850fdSJosef Bacik  */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, blk_mode_t flags,
			void *holder)
{
	struct block_device *bdev;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	/* Refuse a device that is already open or has no registered path. */
	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	/* Open the block device and read its primary super block (copy 1). */
	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &disk_super);
	if (ret)
		return ret;

	/* The on-disk devid and uuid must match the registered device. */
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_free_page;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_free_page;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		/* A seeding device must not also carry a changed metadata uuid. */
		if (btrfs_super_incompat_flags(disk_super) &
		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
			pr_err(
		"BTRFS: Invalid seeding and uuid-changed device detected\n");
			goto error_free_page;
		}

		/* Seed devices are never writable from this filesystem. */
		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = true;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	/* A single rotational device marks the whole fs as rotating. */
	if (!bdev_nonrot(bdev))
		fs_devices->rotating = true;

	if (bdev_max_discard_sectors(bdev))
		fs_devices->discardable = true;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->holder = holder;

	fs_devices->open_devices++;
	/* The replace target never participates in chunk allocation. */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	btrfs_release_disk_super(disk_super);

	return 0;

error_free_page:
	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, holder);

	return -EINVAL;
}
6830fb08bccSAnand Jain 
btrfs_sb_fsid_ptr(struct btrfs_super_block * sb)6844844c366SAnand Jain u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
6854844c366SAnand Jain {
6864844c366SAnand Jain 	bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
6874844c366SAnand Jain 				  BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
6884844c366SAnand Jain 
6894844c366SAnand Jain 	return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
6904844c366SAnand Jain }
6914844c366SAnand Jain 
/*
 * Check whether @new_path resolves to the same object as the path currently
 * recorded in @device->name, following symlinks on both sides.
 *
 * Returns false on any failure (missing name, allocation failure or a path
 * lookup error) as well as on a genuine mismatch.
 */
static bool is_same_device(struct btrfs_device *device, const char *new_path)
{
	/* Zero-initialized so the unconditional path_put() calls below are safe. */
	struct path old = { .mnt = NULL, .dentry = NULL };
	struct path new = { .mnt = NULL, .dentry = NULL };
	char *old_path = NULL;
	bool is_same = false;
	int ret;

	if (!device->name)
		goto out;

	old_path = kzalloc(PATH_MAX, GFP_NOFS);
	if (!old_path)
		goto out;

	/* device->name is RCU-protected; take a stable copy before the lookup. */
	rcu_read_lock();
	ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX);
	rcu_read_unlock();
	if (ret < 0)
		goto out;

	ret = kern_path(old_path, LOOKUP_FOLLOW, &old);
	if (ret)
		goto out;
	ret = kern_path(new_path, LOOKUP_FOLLOW, &new);
	if (ret)
		goto out;
	if (path_equal(&old, &new))
		is_same = true;
out:
	kfree(old_path);
	path_put(&old);
	path_put(&new);
	return is_same;
}
727*a5d74fa2SQu Wenruo 
72860999ca4SDavid Sterba /*
7297a62d0f0SNikolay Borisov  * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
730c0d81c7cSSu Yue  * being created with a disk that has already completed its fsid change. Such
731c0d81c7cSSu Yue  * disk can belong to an fs which has its FSID changed or to one which doesn't.
732c0d81c7cSSu Yue  * Handle both cases here.
7337a62d0f0SNikolay Borisov  */
find_fsid_inprogress(struct btrfs_super_block * disk_super)7347a62d0f0SNikolay Borisov static struct btrfs_fs_devices *find_fsid_inprogress(
7357a62d0f0SNikolay Borisov 					struct btrfs_super_block *disk_super)
7367a62d0f0SNikolay Borisov {
7377a62d0f0SNikolay Borisov 	struct btrfs_fs_devices *fs_devices;
7387a62d0f0SNikolay Borisov 
7397a62d0f0SNikolay Borisov 	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
740a3c54b0bSAnand Jain 		if (fs_devices->fsid_change)
741a3c54b0bSAnand Jain 			continue;
742a3c54b0bSAnand Jain 
743a3c54b0bSAnand Jain 		if (check_fsid_changed(fs_devices,  disk_super->fsid))
7447a62d0f0SNikolay Borisov 			return fs_devices;
7457a62d0f0SNikolay Borisov 	}
7467a62d0f0SNikolay Borisov 
747c0d81c7cSSu Yue 	return find_fsid(disk_super->fsid, NULL);
7487a62d0f0SNikolay Borisov }
7497a62d0f0SNikolay Borisov 
find_fsid_changed(struct btrfs_super_block * disk_super)750cc5de4e7SNikolay Borisov static struct btrfs_fs_devices *find_fsid_changed(
751cc5de4e7SNikolay Borisov 					struct btrfs_super_block *disk_super)
752cc5de4e7SNikolay Borisov {
753cc5de4e7SNikolay Borisov 	struct btrfs_fs_devices *fs_devices;
754cc5de4e7SNikolay Borisov 
755cc5de4e7SNikolay Borisov 	/*
756cc5de4e7SNikolay Borisov 	 * Handles the case where scanned device is part of an fs that had
7571a9fd417SDavid Sterba 	 * multiple successful changes of FSID but currently device didn't
75805840710SNikolay Borisov 	 * observe it. Meaning our fsid will be different than theirs. We need
75905840710SNikolay Borisov 	 * to handle two subcases :
76005840710SNikolay Borisov 	 *  1 - The fs still continues to have different METADATA/FSID uuids.
76105840710SNikolay Borisov 	 *  2 - The fs is switched back to its original FSID (METADATA/FSID
76205840710SNikolay Borisov 	 *  are equal).
763cc5de4e7SNikolay Borisov 	 */
764cc5de4e7SNikolay Borisov 	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
76505840710SNikolay Borisov 		/* Changed UUIDs */
766a3c54b0bSAnand Jain 		if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) &&
767cc5de4e7SNikolay Borisov 		    memcmp(fs_devices->fsid, disk_super->fsid,
76805840710SNikolay Borisov 			   BTRFS_FSID_SIZE) != 0)
769cc5de4e7SNikolay Borisov 			return fs_devices;
77005840710SNikolay Borisov 
77105840710SNikolay Borisov 		/* Unchanged UUIDs */
77205840710SNikolay Borisov 		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
77305840710SNikolay Borisov 			   BTRFS_FSID_SIZE) == 0 &&
77405840710SNikolay Borisov 		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
77505840710SNikolay Borisov 			   BTRFS_FSID_SIZE) == 0)
77605840710SNikolay Borisov 			return fs_devices;
777cc5de4e7SNikolay Borisov 	}
778cc5de4e7SNikolay Borisov 
779cc5de4e7SNikolay Borisov 	return NULL;
780cc5de4e7SNikolay Borisov }
7811362089dSNikolay Borisov 
find_fsid_reverted_metadata(struct btrfs_super_block * disk_super)7821362089dSNikolay Borisov static struct btrfs_fs_devices *find_fsid_reverted_metadata(
7831362089dSNikolay Borisov 				struct btrfs_super_block *disk_super)
7841362089dSNikolay Borisov {
7851362089dSNikolay Borisov 	struct btrfs_fs_devices *fs_devices;
7861362089dSNikolay Borisov 
7871362089dSNikolay Borisov 	/*
7881362089dSNikolay Borisov 	 * Handle the case where the scanned device is part of an fs whose last
7891362089dSNikolay Borisov 	 * metadata UUID change reverted it to the original FSID. At the same
79067da05b3SColin Ian King 	 * time fs_devices was first created by another constituent device
7911362089dSNikolay Borisov 	 * which didn't fully observe the operation. This results in an
7921362089dSNikolay Borisov 	 * btrfs_fs_devices created with metadata/fsid different AND
7931362089dSNikolay Borisov 	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
7941362089dSNikolay Borisov 	 * fs_devices equal to the FSID of the disk.
7951362089dSNikolay Borisov 	 */
7961362089dSNikolay Borisov 	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
797a3c54b0bSAnand Jain 		if (!fs_devices->fsid_change)
798a3c54b0bSAnand Jain 			continue;
799a3c54b0bSAnand Jain 
800a3c54b0bSAnand Jain 		if (check_fsid_changed(fs_devices, disk_super->fsid))
8011362089dSNikolay Borisov 			return fs_devices;
8021362089dSNikolay Borisov 	}
8031362089dSNikolay Borisov 
8041362089dSNikolay Borisov 	return NULL;
8051362089dSNikolay Borisov }
8067a62d0f0SNikolay Borisov /*
80760999ca4SDavid Sterba  * Add new device to list of registered devices
80860999ca4SDavid Sterba  *
80960999ca4SDavid Sterba  * Returns:
810e124ece5SAnand Jain  * device pointer which was just added or updated when successful
811e124ece5SAnand Jain  * error pointer when failed
81260999ca4SDavid Sterba  */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
	dev_t path_devt;
	int error;
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

	/* Resolve the scanned path to a dev_t before taking any locks. */
	error = lookup_bdev(path, &path_devt);
	if (error) {
		btrfs_err(NULL, "failed to lookup block device for path %s: %d",
			  path, error);
		return ERR_PTR(error);
	}

	/*
	 * Pick the fs_devices lookup strategy based on the super block's
	 * fsid-change and metadata-uuid state.
	 */
	if (fsid_change_in_progress) {
		if (!has_metadata_uuid)
			fs_devices = find_fsid_inprogress(disk_super);
		else
			fs_devices = find_fsid_changed(disk_super);
	} else if (has_metadata_uuid) {
		fs_devices = find_fsid_with_metadata_uuid(disk_super);
	} else {
		fs_devices = find_fsid_reverted_metadata(disk_super);
		if (!fs_devices)
			fs_devices = find_fsid(disk_super->fsid, NULL);
	}


	if (!fs_devices) {
		/* First device seen for this fsid, create a new fs_devices. */
		fs_devices = alloc_fs_devices(disk_super->fsid,
				has_metadata_uuid ? disk_super->metadata_uuid : NULL);
		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		struct btrfs_dev_lookup_args args = {
			.devid = devid,
			.uuid = disk_super->dev_item.uuid,
		};

		mutex_lock(&fs_devices->device_list_mutex);
		device = btrfs_find_device(fs_devices, &args);

		/*
		 * If this disk has been pulled into an fs devices created by
		 * a device which had the CHANGING_FSID_V2 flag then replace the
		 * metadata_uuid/fsid values of the fs_devices.
		 */
		if (fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
					BTRFS_FSID_SIZE);
			memcpy(fs_devices->metadata_uuid,
			       btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE);
			fs_devices->fsid_change = false;
		}
	}

	if (!device) {
		unsigned int nofs_flag;

		/* A new device must not join an already mounted filesystem. */
		if (fs_devices->opened) {
			btrfs_err(NULL,
"device %s belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)",
				  path, fs_devices->fsid, current->comm,
				  task_pid_nr(current));
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		/* Avoid fs reclaim recursing into us while allocating. */
		nofs_flag = memalloc_nofs_save();
		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid, path);
		memalloc_nofs_restore(nofs_flag);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		device->devt = path_devt;

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info(
	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
		else
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));

	} else if (!device->name || !is_same_device(device, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         different name. or
		 *      b. The missing-disk-which-was-replaced, has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be a spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at all time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted.  We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid.We keep the one
			 * with larger generation number or the last-in if
			 * generation are equal.
			 */
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_err(NULL,
"device %s already registered with a higher generation, found %llu expect %llu",
				  path, found_transid, device->generation);
			return ERR_PTR(-EEXIST);
		}

		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted
		 *
		 * NOTE: the device->fs_info may not be reliable here so pass
		 * in a NULL to message helpers instead. This avoids a possible
		 * use-after-free when the fs_info and fs_info->sb are already
		 * torn down.
		 */
		if (device->bdev) {
			if (device->devt != path_devt) {
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_warn_in_rcu(NULL,
	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
						  path, devid, found_transid,
						  current->comm,
						  task_pid_nr(current));
				return ERR_PTR(-EEXIST);
			}
			btrfs_info_in_rcu(NULL,
	"devid %llu device path %s changed to %s scanned by %s (%d)",
					  devid, btrfs_dev_name(device),
					  path, current->comm,
					  task_pid_nr(current));
		}

		/* Swap in the new path under RCU protection. */
		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
		device->devt = path_devt;
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened) {
		device->generation = found_transid;
		fs_devices->latest_generation = max_t(u64, found_transid,
						fs_devices->latest_generation);
	}

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	mutex_unlock(&fs_devices->device_list_mutex);
	return device;
}
10268a4b83ccSChris Mason 
/*
 * Duplicate @orig and every device it holds into a freshly allocated
 * btrfs_fs_devices.  Cloned devices carry the original devid/uuid/path and a
 * copy of the zone info when present, but no open bdev.  Returns the clone or
 * an ERR_PTR; on failure everything allocated so far is freed.
 */
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
	int ret = 0;

	lockdep_assert_held(&uuid_mutex);

	fs_devices = alloc_fs_devices(orig->fsid, NULL);
	if (IS_ERR(fs_devices))
		return fs_devices;

	fs_devices->total_devices = orig->total_devices;

	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		const char *dev_path = NULL;

		/*
		 * This is ok to do without RCU read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name)
			dev_path = orig_dev->name->str;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid, dev_path);
		if (IS_ERR(device)) {
			ret = PTR_ERR(device);
			goto error;
		}

		/* Zoned devices need their own copy of the zone state. */
		if (orig_dev->zone_info) {
			struct btrfs_zoned_device_info *zone_info;

			zone_info = btrfs_clone_dev_zone_info(orig_dev);
			if (!zone_info) {
				btrfs_free_device(device);
				ret = -ENOMEM;
				goto error;
			}
			device->zone_info = zone_info;
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	return fs_devices;
error:
	free_fs_devices(fs_devices);
	return ERR_PTR(ret);
}
1080e4404d6eSYan Zheng 
/*
 * Drop every device in @fs_devices that is not part of the filesystem's
 * metadata, closing its bdev and freeing it.  While walking, track in
 * @latest_dev the surviving device with the highest generation (skipping the
 * replace target and missing devices).
 */
static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
				      struct btrfs_device **latest_dev)
{
	struct btrfs_device *device, *next;

	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
			/* Keep the device, possibly as the new latest_dev. */
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    !test_bit(BTRFS_DEV_STATE_MISSING,
				      &device->dev_state) &&
			    (!*latest_dev ||
			     device->generation > (*latest_dev)->generation)) {
				*latest_dev = device;
			}
			continue;
		}

		/*
		 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
		 * in btrfs_init_dev_replace() so just continue.
		 */
		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
			continue;

		/* Not in metadata: close, unlink and free the extra device. */
		if (device->bdev) {
			blkdev_put(device->bdev, device->holder);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}

}
11233712ccb7SNikolay Borisov 
11243712ccb7SNikolay Borisov /*
11253712ccb7SNikolay Borisov  * After we have read the system tree and know devids belonging to this
11263712ccb7SNikolay Borisov  * filesystem, remove the device which does not belong there.
11273712ccb7SNikolay Borisov  */
btrfs_free_extra_devids(struct btrfs_fs_devices * fs_devices)1128bacce86aSAnand Jain void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
11293712ccb7SNikolay Borisov {
11303712ccb7SNikolay Borisov 	struct btrfs_device *latest_dev = NULL;
1131944d3f9fSNikolay Borisov 	struct btrfs_fs_devices *seed_dev;
11323712ccb7SNikolay Borisov 
11333712ccb7SNikolay Borisov 	mutex_lock(&uuid_mutex);
1134bacce86aSAnand Jain 	__btrfs_free_extra_devids(fs_devices, &latest_dev);
1135944d3f9fSNikolay Borisov 
1136944d3f9fSNikolay Borisov 	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
1137bacce86aSAnand Jain 		__btrfs_free_extra_devids(seed_dev, &latest_dev);
11382b82032cSYan Zheng 
1139d24fa5c1SAnand Jain 	fs_devices->latest_dev = latest_dev;
1140a6b0d5c8SChris Mason 
1141dfe25020SChris Mason 	mutex_unlock(&uuid_mutex);
1142dfe25020SChris Mason }
1143a0af469bSChris Mason 
btrfs_close_bdev(struct btrfs_device * device)114414238819SAnand Jain static void btrfs_close_bdev(struct btrfs_device *device)
114514238819SAnand Jain {
114608ffcae8SDavid Sterba 	if (!device->bdev)
114708ffcae8SDavid Sterba 		return;
114808ffcae8SDavid Sterba 
1149ebbede42SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
115014238819SAnand Jain 		sync_blockdev(device->bdev);
115114238819SAnand Jain 		invalidate_bdev(device->bdev);
115214238819SAnand Jain 	}
115314238819SAnand Jain 
11542736e8eeSChristoph Hellwig 	blkdev_put(device->bdev, device->holder);
115514238819SAnand Jain }
115614238819SAnand Jain 
/*
 * Close @device and return it to a pristine state so the btrfs_device struct
 * can be reused on a subsequent mount: drop it from the alloc list, clear all
 * runtime state bits, close the bdev and adjust the fs_devices counters.
 */
static void btrfs_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (device->devid == BTRFS_DEV_REPLACE_DEVID)
		clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
		clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		fs_devices->missing_devices--;
	}

	/* Close before NULLing bdev; the counter is tied to bdev being set. */
	btrfs_close_bdev(device);
	if (device->bdev) {
		fs_devices->open_devices--;
		device->bdev = NULL;
	}
	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	btrfs_destroy_dev_zone_info(device);

	device->fs_info = NULL;
	atomic_set(&device->dev_stats_ccnt, 0);
	extent_io_tree_release(&device->alloc_state);

	/*
	 * Reset the flush error record. We might have a transient flush error
	 * in this mount, and if so we aborted the current transaction and set
	 * the fs to an error state, guaranteeing no super blocks can be further
	 * committed. However that error might be transient and if we unmount the
	 * filesystem and mount it again, we should allow the mount to succeed
	 * (btrfs_check_rw_degradable() should not fail) - if after mounting the
	 * filesystem again we still get flush errors, then we will again abort
	 * any transaction and set the error state, guaranteeing no commits of
	 * unsafe super blocks.
	 */
	device->last_flush_error = 0;

	/* Verify the device is back in a pristine state  */
	WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	WARN_ON(!list_empty(&device->dev_alloc_list));
	WARN_ON(!list_empty(&device->post_commit_list));
}
1206f448341aSAnand Jain 
/*
 * Drop one opener reference on @fs_devices; when the last reference goes
 * away, close every member device and reset the per-mount bookkeeping so
 * the structure can be reopened later.  Caller must hold uuid_mutex
 * (asserted below).
 */
close_fs_devices(struct btrfs_fs_devices * fs_devices)120754eed6aeSNikolay Borisov static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
12088a4b83ccSChris Mason {
12092037a093SSasha Levin 	struct btrfs_device *device, *tmp;
1210e4404d6eSYan Zheng 
1211425c6ed6SJosef Bacik 	lockdep_assert_held(&uuid_mutex);
1212425c6ed6SJosef Bacik 
	/* Still opened by someone else (e.g. another mount reference)? */
12132b82032cSYan Zheng 	if (--fs_devices->opened > 0)
121454eed6aeSNikolay Borisov 		return;
12158a4b83ccSChris Mason 
1216425c6ed6SJosef Bacik 	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
1217959b1c04SNikolay Borisov 		btrfs_close_one_device(device);
1218c9513edbSXiao Guangrong 
	/* After closing all devices these counters must have dropped to 0. */
1219e4404d6eSYan Zheng 	WARN_ON(fs_devices->open_devices);
1220e4404d6eSYan Zheng 	WARN_ON(fs_devices->rw_devices);
12212b82032cSYan Zheng 	fs_devices->opened = 0;
12220395d84fSJohannes Thumshirn 	fs_devices->seeding = false;
1223c4989c2fSNikolay Borisov 	fs_devices->fs_info = NULL;
12248a4b83ccSChris Mason }
12258a4b83ccSChris Mason 
/*
 * Close @fs_devices and, if it is no longer opened, detach and close all
 * of its seed fs_devices as well.  A single-device fs_devices that is
 * fully closed is removed from the global fs_list and freed so a later
 * mount starts from a clean device scan (see comment below).
 * Takes and releases uuid_mutex internally.
 */
btrfs_close_devices(struct btrfs_fs_devices * fs_devices)122754eed6aeSNikolay Borisov void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
12272b82032cSYan Zheng {
1228944d3f9fSNikolay Borisov 	LIST_HEAD(list);
1229944d3f9fSNikolay Borisov 	struct btrfs_fs_devices *tmp;
12302b82032cSYan Zheng 
12312b82032cSYan Zheng 	mutex_lock(&uuid_mutex);
123254eed6aeSNikolay Borisov 	close_fs_devices(fs_devices);
12335f58d783SAnand Jain 	if (!fs_devices->opened) {
		/* Detach the seed devices list for separate teardown below. */
1234944d3f9fSNikolay Borisov 		list_splice_init(&fs_devices->seed_list, &list);
1235e4404d6eSYan Zheng 
12365f58d783SAnand Jain 		/*
12375f58d783SAnand Jain 		 * If the struct btrfs_fs_devices is not assembled with any
12385f58d783SAnand Jain 		 * other device, it can be re-initialized during the next mount
12395f58d783SAnand Jain 		 * without the needing device-scan step. Therefore, it can be
12405f58d783SAnand Jain 		 * fully freed.
12415f58d783SAnand Jain 		 */
12425f58d783SAnand Jain 		if (fs_devices->num_devices == 1) {
12435f58d783SAnand Jain 			list_del(&fs_devices->fs_list);
12445f58d783SAnand Jain 			free_fs_devices(fs_devices);
12455f58d783SAnand Jain 		}
12465f58d783SAnand Jain 	}
12475f58d783SAnand Jain 
12485f58d783SAnand Jain 
	/* Close and free every seed fs_devices detached above. */
1249944d3f9fSNikolay Borisov 	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
12500226e0ebSAnand Jain 		close_fs_devices(fs_devices);
1251944d3f9fSNikolay Borisov 		list_del(&fs_devices->seed_list);
1252e4404d6eSYan Zheng 		free_fs_devices(fs_devices);
1253e4404d6eSYan Zheng 	}
1254425c6ed6SJosef Bacik 	mutex_unlock(&uuid_mutex);
12552b82032cSYan Zheng }
12562b82032cSYan Zheng 
/*
 * Open every device belonging to @fs_devices with the given block-open
 * @flags/@holder.  Devices that report -ENODATA (no btrfs signature -
 * presumably wiped or reused; TODO confirm against btrfs_open_one_device)
 * are dropped from the list and freed.  The device with the highest
 * generation becomes latest_dev.  Returns 0 if at least one device was
 * opened, otherwise the first open error (or -EINVAL if none recorded).
 */
open_fs_devices(struct btrfs_fs_devices * fs_devices,blk_mode_t flags,void * holder)1257897fb573SAnand Jain static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
125805bdb996SChristoph Hellwig 				blk_mode_t flags, void *holder)
12598a4b83ccSChris Mason {
12608a4b83ccSChris Mason 	struct btrfs_device *device;
1261443f24feSMiao Xie 	struct btrfs_device *latest_dev = NULL;
126296c2e067SAnand Jain 	struct btrfs_device *tmp_device;
12631ea068f5SAnand Jain 	int ret = 0;
12648a4b83ccSChris Mason 
126596c2e067SAnand Jain 	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
126696c2e067SAnand Jain 				 dev_list) {
12671ea068f5SAnand Jain 		int ret2;
1268a0af469bSChris Mason 
12691ea068f5SAnand Jain 		ret2 = btrfs_open_one_device(fs_devices, device, flags, holder);
12701ea068f5SAnand Jain 		if (ret2 == 0 &&
127196c2e067SAnand Jain 		    (!latest_dev || device->generation > latest_dev->generation)) {
12729f050db4SAnand Jain 			latest_dev = device;
12731ea068f5SAnand Jain 		} else if (ret2 == -ENODATA) {
127496c2e067SAnand Jain 			fs_devices->num_devices--;
127596c2e067SAnand Jain 			list_del(&device->dev_list);
127696c2e067SAnand Jain 			btrfs_free_device(device);
127796c2e067SAnand Jain 		}
		/* Remember only the first failure; keep trying the rest. */
12781ea068f5SAnand Jain 		if (ret == 0 && ret2 != 0)
12791ea068f5SAnand Jain 			ret = ret2;
12808a4b83ccSChris Mason 	}
12811ea068f5SAnand Jain 
12821ea068f5SAnand Jain 	if (fs_devices->open_devices == 0) {
12831ea068f5SAnand Jain 		if (ret)
12841ea068f5SAnand Jain 			return ret;
12851ed802c9SAnand Jain 		return -EINVAL;
12861ea068f5SAnand Jain 	}
12871ed802c9SAnand Jain 
12882b82032cSYan Zheng 	fs_devices->opened = 1;
1289d24fa5c1SAnand Jain 	fs_devices->latest_dev = latest_dev;
12902b82032cSYan Zheng 	fs_devices->total_rw_bytes = 0;
1291c4a816c6SNaohiro Aota 	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
129233fd2f71SAnand Jain 	fs_devices->read_policy = BTRFS_READ_POLICY_PID;
12931ed802c9SAnand Jain 
12941ed802c9SAnand Jain 	return 0;
12952b82032cSYan Zheng }
12962b82032cSYan Zheng 
/*
 * list_sort() comparator: order btrfs_device entries by ascending devid.
 * @priv is unused.
 */
devid_cmp(void * priv,const struct list_head * a,const struct list_head * b)12974f0f586bSSami Tolvanen static int devid_cmp(void *priv, const struct list_head *a,
12984f0f586bSSami Tolvanen 		     const struct list_head *b)
1299f8e10cd3SAnand Jain {
1300214cc184SDavid Sterba 	const struct btrfs_device *dev1, *dev2;
1301f8e10cd3SAnand Jain 
1302f8e10cd3SAnand Jain 	dev1 = list_entry(a, struct btrfs_device, dev_list);
1303f8e10cd3SAnand Jain 	dev2 = list_entry(b, struct btrfs_device, dev_list);
1304f8e10cd3SAnand Jain 
1305f8e10cd3SAnand Jain 	if (dev1->devid < dev2->devid)
1306f8e10cd3SAnand Jain 		return -1;
1307f8e10cd3SAnand Jain 	else if (dev1->devid > dev2->devid)
1308f8e10cd3SAnand Jain 		return 1;
1309f8e10cd3SAnand Jain 	return 0;
1310f8e10cd3SAnand Jain }
1311f8e10cd3SAnand Jain 
/*
 * Public entry point to open an fs_devices set.  If it is already opened
 * just bump the opener refcount; otherwise sort the device list by devid
 * and open every device via open_fs_devices().  Caller must hold
 * uuid_mutex (asserted below).
 */
btrfs_open_devices(struct btrfs_fs_devices * fs_devices,blk_mode_t flags,void * holder)13122b82032cSYan Zheng int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
131305bdb996SChristoph Hellwig 		       blk_mode_t flags, void *holder)
13142b82032cSYan Zheng {
13152b82032cSYan Zheng 	int ret;
13162b82032cSYan Zheng 
1317f5194e34SDavid Sterba 	lockdep_assert_held(&uuid_mutex);
131818c850fdSJosef Bacik 	/*
131918c850fdSJosef Bacik 	 * The device_list_mutex cannot be taken here in case opening the
1320a8698707SChristoph Hellwig 	 * underlying device takes further locks like open_mutex.
132118c850fdSJosef Bacik 	 *
132218c850fdSJosef Bacik 	 * We also don't need the lock here as this is called during mount and
132318c850fdSJosef Bacik 	 * exclusion is provided by uuid_mutex
132418c850fdSJosef Bacik 	 */
1325f5194e34SDavid Sterba 
13262b82032cSYan Zheng 	if (fs_devices->opened) {
13272b82032cSYan Zheng 		fs_devices->opened++;
13282b82032cSYan Zheng 		ret = 0;
13292b82032cSYan Zheng 	} else {
1330f8e10cd3SAnand Jain 		list_sort(NULL, &fs_devices->devices, devid_cmp);
1331897fb573SAnand Jain 		ret = open_fs_devices(fs_devices, flags, holder);
13322b82032cSYan Zheng 	}
1333542c5908SAnand Jain 
13348a4b83ccSChris Mason 	return ret;
13358a4b83ccSChris Mason }
13368a4b83ccSChris Mason 
/*
 * Release the page-cache reference taken by btrfs_read_disk_super().
 * @super points into a page obtained from read_cache_page_gfp(); drop
 * that page's reference.
 */
btrfs_release_disk_super(struct btrfs_super_block * super)13378f32380dSJohannes Thumshirn void btrfs_release_disk_super(struct btrfs_super_block *super)
13386cf86a00SAnand Jain {
13398f32380dSJohannes Thumshirn 	struct page *page = virt_to_page(super);
13408f32380dSJohannes Thumshirn 
13416cf86a00SAnand Jain 	put_page(page);
13426cf86a00SAnand Jain }
13436cf86a00SAnand Jain 
/*
 * Read a btrfs super block from @bdev at byte offset @bytenr through the
 * page cache (no set_blocksize, safe outside the mount path).
 * @bytenr_orig is the logical superblock offset expected inside the super
 * block itself (differs from @bytenr on zoned devices - TODO confirm).
 * On success returns a pointer into the cached page; the caller must
 * release it with btrfs_release_disk_super().  Returns ERR_PTR on
 * bad location, short device, or magic/bytenr mismatch.
 */
btrfs_read_disk_super(struct block_device * bdev,u64 bytenr,u64 bytenr_orig)1344b335eab8SNikolay Borisov static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
134512659251SNaohiro Aota 						       u64 bytenr, u64 bytenr_orig)
13466cf86a00SAnand Jain {
1347b335eab8SNikolay Borisov 	struct btrfs_super_block *disk_super;
1348b335eab8SNikolay Borisov 	struct page *page;
13496cf86a00SAnand Jain 	void *p;
13506cf86a00SAnand Jain 	pgoff_t index;
13516cf86a00SAnand Jain 
13526cf86a00SAnand Jain 	/* make sure our super fits in the device */
1353cda00ebaSChristoph Hellwig 	if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
1354b335eab8SNikolay Borisov 		return ERR_PTR(-EINVAL);
13556cf86a00SAnand Jain 
13566cf86a00SAnand Jain 	/* make sure our super fits in the page */
1357b335eab8SNikolay Borisov 	if (sizeof(*disk_super) > PAGE_SIZE)
1358b335eab8SNikolay Borisov 		return ERR_PTR(-EINVAL);
13596cf86a00SAnand Jain 
13606cf86a00SAnand Jain 	/* make sure our super doesn't straddle pages on disk */
13616cf86a00SAnand Jain 	index = bytenr >> PAGE_SHIFT;
1362b335eab8SNikolay Borisov 	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
1363b335eab8SNikolay Borisov 		return ERR_PTR(-EINVAL);
13646cf86a00SAnand Jain 
13656cf86a00SAnand Jain 	/* pull in the page with our super */
1366b335eab8SNikolay Borisov 	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
13676cf86a00SAnand Jain 
1368b335eab8SNikolay Borisov 	if (IS_ERR(page))
1369b335eab8SNikolay Borisov 		return ERR_CAST(page);
13706cf86a00SAnand Jain 
1371b335eab8SNikolay Borisov 	p = page_address(page);
13726cf86a00SAnand Jain 
13736cf86a00SAnand Jain 	/* align our pointer to the offset of the super block */
1374b335eab8SNikolay Borisov 	disk_super = p + offset_in_page(bytenr);
13756cf86a00SAnand Jain 
	/* Reject anything that is not a btrfs super at the expected offset. */
137612659251SNaohiro Aota 	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
1377b335eab8SNikolay Borisov 	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
13788f32380dSJohannes Thumshirn 		btrfs_release_disk_super(p);
1379b335eab8SNikolay Borisov 		return ERR_PTR(-EINVAL);
13806cf86a00SAnand Jain 	}
13816cf86a00SAnand Jain 
	/* Force NUL termination of the label if it fills the whole buffer. */
1382b335eab8SNikolay Borisov 	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
1383b335eab8SNikolay Borisov 		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
13846cf86a00SAnand Jain 
1385b335eab8SNikolay Borisov 	return disk_super;
13866cf86a00SAnand Jain }
13876cf86a00SAnand Jain 
/*
 * Forget any stale (unmounted) scanned device matching @devt, under
 * uuid_mutex.  Returns the result of btrfs_free_stale_devices().
 */
btrfs_forget_devices(dev_t devt)138816cab91aSAnand Jain int btrfs_forget_devices(dev_t devt)
1389228a73abSAnand Jain {
1390228a73abSAnand Jain 	int ret;
1391228a73abSAnand Jain 
1392228a73abSAnand Jain 	mutex_lock(&uuid_mutex);
139316cab91aSAnand Jain 	ret = btrfs_free_stale_devices(devt, NULL);
1394228a73abSAnand Jain 	mutex_unlock(&uuid_mutex);
1395228a73abSAnand Jain 
1396228a73abSAnand Jain 	return ret;
1397228a73abSAnand Jain }
1398228a73abSAnand Jain 
13996f60cbd3SDavid Sterba /*
14006f60cbd3SDavid Sterba  * Look for a btrfs signature on a device. This may be called out of the mount path
14016f60cbd3SDavid Sterba  * and we are not allowed to call set_blocksize during the scan. The superblock
14026f60cbd3SDavid Sterba  * is read via pagecache
14036f60cbd3SDavid Sterba  */
/*
 * Scan @path for a btrfs super block (primary copy only) and register the
 * device in the global device list via device_list_add().  Caller must
 * hold uuid_mutex.  Returns the btrfs_device on success or an ERR_PTR.
 */
btrfs_scan_one_device(const char * path,blk_mode_t flags)140405bdb996SChristoph Hellwig struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
14058a4b83ccSChris Mason {
14068a4b83ccSChris Mason 	struct btrfs_super_block *disk_super;
14074306a974SAnand Jain 	bool new_device_added = false;
140836350e95SGu Jinxiang 	struct btrfs_device *device = NULL;
14098a4b83ccSChris Mason 	struct block_device *bdev;
141012659251SNaohiro Aota 	u64 bytenr, bytenr_orig;
141112659251SNaohiro Aota 	int ret;
14128a4b83ccSChris Mason 
1413899f9307SDavid Sterba 	lockdep_assert_held(&uuid_mutex);
1414899f9307SDavid Sterba 
14156f60cbd3SDavid Sterba 	/*
14166f60cbd3SDavid Sterba 	 * we would like to check all the supers, but that would make
14176f60cbd3SDavid Sterba 	 * a btrfs mount succeed after a mkfs from a different FS.
14186f60cbd3SDavid Sterba 	 * So, we need to add a special mount option to scan for
14196f60cbd3SDavid Sterba 	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
14206f60cbd3SDavid Sterba 	 */
14216f60cbd3SDavid Sterba 
142250d281fcSAnand Jain 	/*
14232ef78928SChristoph Hellwig 	 * Avoid an exclusive open here, as the systemd-udev may initiate the
14242ef78928SChristoph Hellwig 	 * device scan which may race with the user's mount or mkfs command,
14252ef78928SChristoph Hellwig 	 * resulting in failure.
14262ef78928SChristoph Hellwig 	 * Since the device scan is solely for reading purposes, there is no
14272ef78928SChristoph Hellwig 	 * need for an exclusive open. Additionally, the devices are read again
142850d281fcSAnand Jain 	 * during the mount process. It is ok to get some inconsistent
142950d281fcSAnand Jain 	 * values temporarily, as the device paths of the fsid are the only
143050d281fcSAnand Jain 	 * required information for assembling the volume.
143150d281fcSAnand Jain 	 */
14322ef78928SChristoph Hellwig 	bdev = blkdev_get_by_path(path, flags, NULL, NULL);
1433b6ed73bcSAnand Jain 	if (IS_ERR(bdev))
143436350e95SGu Jinxiang 		return ERR_CAST(bdev);
14356f60cbd3SDavid Sterba 
	/* Resolve where super block 0 actually lives (may move on zoned). */
143612659251SNaohiro Aota 	bytenr_orig = btrfs_sb_offset(0);
143712659251SNaohiro Aota 	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
14384989d4a0SShin'ichiro Kawasaki 	if (ret) {
14394989d4a0SShin'ichiro Kawasaki 		device = ERR_PTR(ret);
14404989d4a0SShin'ichiro Kawasaki 		goto error_bdev_put;
14414989d4a0SShin'ichiro Kawasaki 	}
144212659251SNaohiro Aota 
144312659251SNaohiro Aota 	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
1444b335eab8SNikolay Borisov 	if (IS_ERR(disk_super)) {
1445b335eab8SNikolay Borisov 		device = ERR_CAST(disk_super);
14466f60cbd3SDavid Sterba 		goto error_bdev_put;
144705a5c55dSAnand Jain 	}
14486f60cbd3SDavid Sterba 
14494306a974SAnand Jain 	device = device_list_add(path, disk_super, &new_device_added);
	/* A newly added device supersedes stale entries with the same devt. */
14504889bc05SAnand Jain 	if (!IS_ERR(device) && new_device_added)
14514889bc05SAnand Jain 		btrfs_free_stale_devices(device->devt, device);
14526f60cbd3SDavid Sterba 
14538f32380dSJohannes Thumshirn 	btrfs_release_disk_super(disk_super);
14546f60cbd3SDavid Sterba 
14556f60cbd3SDavid Sterba error_bdev_put:
14562736e8eeSChristoph Hellwig 	blkdev_put(bdev, NULL);
1457b6ed73bcSAnand Jain 
145836350e95SGu Jinxiang 	return device;
14598a4b83ccSChris Mason }
14600b86a832SChris Mason 
1461c152b63eSFilipe Manana /*
14621c11b63eSJeff Mahoney  * Try to find a chunk that intersects [start, start + len] range and when one
14631c11b63eSJeff Mahoney  * such is found, record the end of it in *start
1464c152b63eSFilipe Manana  */
contains_pending_extent(struct btrfs_device * device,u64 * start,u64 len)14651c11b63eSJeff Mahoney static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
14661c11b63eSJeff Mahoney 				    u64 len)
14671c11b63eSJeff Mahoney {
14681c11b63eSJeff Mahoney 	u64 physical_start, physical_end;
14696df9a95eSJosef Bacik 
14701c11b63eSJeff Mahoney 	lockdep_assert_held(&device->fs_info->chunk_mutex);
14711c11b63eSJeff Mahoney 
	/* Look up an allocated chunk range at or after *start. */
1472e5860f82SFilipe Manana 	if (find_first_extent_bit(&device->alloc_state, *start,
14731c11b63eSJeff Mahoney 				  &physical_start, &physical_end,
14741c11b63eSJeff Mahoney 				  CHUNK_ALLOCATED, NULL)) {
14751c11b63eSJeff Mahoney 
		/* Overlap with [*start, *start + len)?  Skip past the chunk. */
14761c11b63eSJeff Mahoney 		if (in_range(physical_start, *start, len) ||
14771c11b63eSJeff Mahoney 		    in_range(*start, physical_start,
147851dad05fSFilipe Manana 			     physical_end + 1 - physical_start)) {
14791c11b63eSJeff Mahoney 			*start = physical_end + 1;
14801c11b63eSJeff Mahoney 			return true;
14811c11b63eSJeff Mahoney 		}
14821c11b63eSJeff Mahoney 	}
14831c11b63eSJeff Mahoney 	return false;
14846df9a95eSJosef Bacik }
14856df9a95eSJosef Bacik 
/*
 * Return the first byte offset on @device at which dev extent allocation
 * may start, depending on the chunk allocation policy.
 */
dev_extent_search_start(struct btrfs_device * device)1486ed8947bcSFilipe Manana static u64 dev_extent_search_start(struct btrfs_device *device)
14873b4ffa40SNaohiro Aota {
14883b4ffa40SNaohiro Aota 	switch (device->fs_devices->chunk_alloc_policy) {
14893b4ffa40SNaohiro Aota 	case BTRFS_CHUNK_ALLOC_REGULAR:
1490ed8947bcSFilipe Manana 		return BTRFS_DEVICE_RANGE_RESERVED;
14911cd6121fSNaohiro Aota 	case BTRFS_CHUNK_ALLOC_ZONED:
14921cd6121fSNaohiro Aota 		/*
14931cd6121fSNaohiro Aota 		 * We don't care about the starting region like regular
14941cd6121fSNaohiro Aota 		 * allocator, because we anyway use/reserve the first two zones
14951cd6121fSNaohiro Aota 		 * for superblock logging.
14961cd6121fSNaohiro Aota 		 */
1497ed8947bcSFilipe Manana 		return 0;
14983b4ffa40SNaohiro Aota 	default:
14993b4ffa40SNaohiro Aota 		BUG();
15003b4ffa40SNaohiro Aota 	}
15013b4ffa40SNaohiro Aota }
15023b4ffa40SNaohiro Aota 
/*
 * Zoned-device variant of the hole check: shrink/advance the hole
 * [*hole_start, *hole_start + *hole_size) until it begins at an
 * allocatable, empty run of zones big enough for @num_bytes.
 * Returns true if the hole was modified.
 */
dev_extent_hole_check_zoned(struct btrfs_device * device,u64 * hole_start,u64 * hole_size,u64 num_bytes)15031cd6121fSNaohiro Aota static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
15041cd6121fSNaohiro Aota 					u64 *hole_start, u64 *hole_size,
15051cd6121fSNaohiro Aota 					u64 num_bytes)
15061cd6121fSNaohiro Aota {
15071cd6121fSNaohiro Aota 	u64 zone_size = device->zone_info->zone_size;
15081cd6121fSNaohiro Aota 	u64 pos;
15091cd6121fSNaohiro Aota 	int ret;
15101cd6121fSNaohiro Aota 	bool changed = false;
15111cd6121fSNaohiro Aota 
15121cd6121fSNaohiro Aota 	ASSERT(IS_ALIGNED(*hole_start, zone_size));
15131cd6121fSNaohiro Aota 
15141cd6121fSNaohiro Aota 	while (*hole_size > 0) {
		/* Find the first allocatable position inside the hole. */
15151cd6121fSNaohiro Aota 		pos = btrfs_find_allocatable_zones(device, *hole_start,
15161cd6121fSNaohiro Aota 						   *hole_start + *hole_size,
15171cd6121fSNaohiro Aota 						   num_bytes);
15181cd6121fSNaohiro Aota 		if (pos != *hole_start) {
15191cd6121fSNaohiro Aota 			*hole_size = *hole_start + *hole_size - pos;
15201cd6121fSNaohiro Aota 			*hole_start = pos;
15211cd6121fSNaohiro Aota 			changed = true;
15221cd6121fSNaohiro Aota 			if (*hole_size < num_bytes)
15231cd6121fSNaohiro Aota 				break;
15241cd6121fSNaohiro Aota 		}
15251cd6121fSNaohiro Aota 
15261cd6121fSNaohiro Aota 		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
15271cd6121fSNaohiro Aota 
15281cd6121fSNaohiro Aota 		/* Range is ensured to be empty */
15291cd6121fSNaohiro Aota 		if (!ret)
15301cd6121fSNaohiro Aota 			return changed;
15311cd6121fSNaohiro Aota 
15321cd6121fSNaohiro Aota 		/* Given hole range was invalid (outside of device) */
15331cd6121fSNaohiro Aota 		if (ret == -ERANGE) {
15341cd6121fSNaohiro Aota 			*hole_start += *hole_size;
1535d6f67afbSJohannes Thumshirn 			*hole_size = 0;
15367000babdSJiapeng Chong 			return true;
15371cd6121fSNaohiro Aota 		}
15381cd6121fSNaohiro Aota 
		/* Not empty: skip one zone and retry from there. */
15391cd6121fSNaohiro Aota 		*hole_start += zone_size;
15401cd6121fSNaohiro Aota 		*hole_size -= zone_size;
15411cd6121fSNaohiro Aota 		changed = true;
15421cd6121fSNaohiro Aota 	}
15431cd6121fSNaohiro Aota 
15441cd6121fSNaohiro Aota 	return changed;
15451cd6121fSNaohiro Aota }
15461cd6121fSNaohiro Aota 
154743dd529aSDavid Sterba /*
154843dd529aSDavid Sterba  * Check if specified hole is suitable for allocation.
154943dd529aSDavid Sterba  *
15503b4ffa40SNaohiro Aota  * @device:	the device which we have the hole
15513b4ffa40SNaohiro Aota  * @hole_start: starting position of the hole
15523b4ffa40SNaohiro Aota  * @hole_size:	the size of the hole
15533b4ffa40SNaohiro Aota  * @num_bytes:	the size of the free space that we need
15543b4ffa40SNaohiro Aota  *
15551cd6121fSNaohiro Aota  * This function may modify @hole_start and @hole_size to reflect the suitable
15563b4ffa40SNaohiro Aota  * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
15573b4ffa40SNaohiro Aota  */
dev_extent_hole_check(struct btrfs_device * device,u64 * hole_start,u64 * hole_size,u64 num_bytes)15583b4ffa40SNaohiro Aota static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
15593b4ffa40SNaohiro Aota 				  u64 *hole_size, u64 num_bytes)
15603b4ffa40SNaohiro Aota {
15613b4ffa40SNaohiro Aota 	bool changed = false;
15623b4ffa40SNaohiro Aota 	u64 hole_end = *hole_start + *hole_size;
15633b4ffa40SNaohiro Aota 
15641cd6121fSNaohiro Aota 	for (;;) {
15653b4ffa40SNaohiro Aota 		/*
15663b4ffa40SNaohiro Aota 		 * Check before we set max_hole_start, otherwise we could end up
15673b4ffa40SNaohiro Aota 		 * sending back this offset anyway.
15683b4ffa40SNaohiro Aota 		 */
15693b4ffa40SNaohiro Aota 		if (contains_pending_extent(device, hole_start, *hole_size)) {
15703b4ffa40SNaohiro Aota 			if (hole_end >= *hole_start)
15713b4ffa40SNaohiro Aota 				*hole_size = hole_end - *hole_start;
15723b4ffa40SNaohiro Aota 			else
15733b4ffa40SNaohiro Aota 				*hole_size = 0;
15743b4ffa40SNaohiro Aota 			changed = true;
15753b4ffa40SNaohiro Aota 		}
15763b4ffa40SNaohiro Aota 
		/* Apply any additional, policy specific constraints. */
15773b4ffa40SNaohiro Aota 		switch (device->fs_devices->chunk_alloc_policy) {
15783b4ffa40SNaohiro Aota 		case BTRFS_CHUNK_ALLOC_REGULAR:
15793b4ffa40SNaohiro Aota 			/* No extra check */
15803b4ffa40SNaohiro Aota 			break;
15811cd6121fSNaohiro Aota 		case BTRFS_CHUNK_ALLOC_ZONED:
15821cd6121fSNaohiro Aota 			if (dev_extent_hole_check_zoned(device, hole_start,
15831cd6121fSNaohiro Aota 							hole_size, num_bytes)) {
15841cd6121fSNaohiro Aota 				changed = true;
15851cd6121fSNaohiro Aota 				/*
15861cd6121fSNaohiro Aota 				 * The changed hole can contain pending extent.
15871cd6121fSNaohiro Aota 				 * Loop again to check that.
15881cd6121fSNaohiro Aota 				 */
15891cd6121fSNaohiro Aota 				continue;
15901cd6121fSNaohiro Aota 			}
15911cd6121fSNaohiro Aota 			break;
15923b4ffa40SNaohiro Aota 		default:
15933b4ffa40SNaohiro Aota 			BUG();
15943b4ffa40SNaohiro Aota 		}
15953b4ffa40SNaohiro Aota 
15961cd6121fSNaohiro Aota 		break;
15971cd6121fSNaohiro Aota 	}
15981cd6121fSNaohiro Aota 
15993b4ffa40SNaohiro Aota 	return changed;
16003b4ffa40SNaohiro Aota }
16016df9a95eSJosef Bacik 
16020b86a832SChris Mason /*
160343dd529aSDavid Sterba  * Find free space in the specified device.
160443dd529aSDavid Sterba  *
16057bfc837dSMiao Xie  * @device:	  the device which we search the free space in
16067bfc837dSMiao Xie  * @num_bytes:	  the size of the free space that we need
1607499f377fSJeff Mahoney  * @search_start: the position from which to begin the search
16087bfc837dSMiao Xie  * @start:	  store the start of the free space.
1609499f377fSJeff Mahoney  * @len:	  the size of the free space. that we find, or the size
1610499f377fSJeff Mahoney  *		  of the max free space if we don't find suitable free space
16117bfc837dSMiao Xie  *
161243dd529aSDavid Sterba  * This does a pretty simple search, the expectation is that it is called very
161343dd529aSDavid Sterba  * infrequently and that a given device has a small number of extents.
16147bfc837dSMiao Xie  *
16157bfc837dSMiao Xie  * @start is used to store the start of the free space if we find. But if we
16167bfc837dSMiao Xie  * don't find suitable free space, it will be used to store the start position
16177bfc837dSMiao Xie  * of the max free space.
16187bfc837dSMiao Xie  *
16197bfc837dSMiao Xie  * @len is used to store the size of the free space that we find.
16207bfc837dSMiao Xie  * But if we don't find suitable free space, it is used to store the size of
16217bfc837dSMiao Xie  * the max free space.
1622135da976SQu Wenruo  *
1623135da976SQu Wenruo  * NOTE: This function will search *commit* root of device tree, and does extra
1624135da976SQu Wenruo  * check to ensure dev extents are not double allocated.
1625135da976SQu Wenruo  * This makes the function safe to allocate dev extents but may not report
1626135da976SQu Wenruo  * correct usable device space, as device extent freed in current transaction
16271a9fd417SDavid Sterba  * is not reported as available.
16280b86a832SChris Mason  */
find_free_dev_extent(struct btrfs_device * device,u64 num_bytes,u64 * start,u64 * len)1629ed8947bcSFilipe Manana static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1630ed8947bcSFilipe Manana 				u64 *start, u64 *len)
16310b86a832SChris Mason {
16320b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = device->fs_info;
16330b246afaSJeff Mahoney 	struct btrfs_root *root = fs_info->dev_root;
16340b86a832SChris Mason 	struct btrfs_key key;
16357bfc837dSMiao Xie 	struct btrfs_dev_extent *dev_extent;
16362b82032cSYan Zheng 	struct btrfs_path *path;
1637ed8947bcSFilipe Manana 	u64 search_start;
16387bfc837dSMiao Xie 	u64 hole_size;
16397bfc837dSMiao Xie 	u64 max_hole_start;
164020218dfbSJosef Bacik 	u64 max_hole_size = 0;
16417bfc837dSMiao Xie 	u64 extent_end;
16420b86a832SChris Mason 	u64 search_end = device->total_bytes;
16430b86a832SChris Mason 	int ret;
16447bfc837dSMiao Xie 	int slot;
16450b86a832SChris Mason 	struct extent_buffer *l;
16468cdc7c5bSFilipe Manana 
	/* Start searching after any policy-reserved region. */
1647ed8947bcSFilipe Manana 	search_start = dev_extent_search_start(device);
164820218dfbSJosef Bacik 	max_hole_start = search_start;
16490b86a832SChris Mason 
16501cd6121fSNaohiro Aota 	WARN_ON(device->zone_info &&
16511cd6121fSNaohiro Aota 		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));
16521cd6121fSNaohiro Aota 
16536df9a95eSJosef Bacik 	path = btrfs_alloc_path();
165420218dfbSJosef Bacik 	if (!path) {
165520218dfbSJosef Bacik 		ret = -ENOMEM;
165620218dfbSJosef Bacik 		goto out;
165720218dfbSJosef Bacik 	}
1658f2ab7618SZhao Lei again:
1659401e29c1SAnand Jain 	if (search_start >= search_end ||
1660401e29c1SAnand Jain 		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
16617bfc837dSMiao Xie 		ret = -ENOSPC;
16626df9a95eSJosef Bacik 		goto out;
16637bfc837dSMiao Xie 	}
16647bfc837dSMiao Xie 
	/* See function comment above: search the commit root, lock-free. */
1665e4058b54SDavid Sterba 	path->reada = READA_FORWARD;
16666df9a95eSJosef Bacik 	path->search_commit_root = 1;
16676df9a95eSJosef Bacik 	path->skip_locking = 1;
16687bfc837dSMiao Xie 
16690b86a832SChris Mason 	key.objectid = device->devid;
16700b86a832SChris Mason 	key.offset = search_start;
16710b86a832SChris Mason 	key.type = BTRFS_DEV_EXTENT_KEY;
16727bfc837dSMiao Xie 
16730ff40a91SMarcos Paulo de Souza 	ret = btrfs_search_backwards(root, &key, path);
16740b86a832SChris Mason 	if (ret < 0)
16757bfc837dSMiao Xie 		goto out;
16767bfc837dSMiao Xie 
	/* Walk the dev extents in order, measuring the gaps between them. */
16773c538de0SJosef Bacik 	while (search_start < search_end) {
16780b86a832SChris Mason 		l = path->nodes[0];
16790b86a832SChris Mason 		slot = path->slots[0];
16800b86a832SChris Mason 		if (slot >= btrfs_header_nritems(l)) {
16810b86a832SChris Mason 			ret = btrfs_next_leaf(root, path);
16820b86a832SChris Mason 			if (ret == 0)
16830b86a832SChris Mason 				continue;
16840b86a832SChris Mason 			if (ret < 0)
16857bfc837dSMiao Xie 				goto out;
16867bfc837dSMiao Xie 
16877bfc837dSMiao Xie 			break;
16880b86a832SChris Mason 		}
16890b86a832SChris Mason 		btrfs_item_key_to_cpu(l, &key, slot);
16900b86a832SChris Mason 
16910b86a832SChris Mason 		if (key.objectid < device->devid)
16920b86a832SChris Mason 			goto next;
16930b86a832SChris Mason 
16940b86a832SChris Mason 		if (key.objectid > device->devid)
16957bfc837dSMiao Xie 			break;
16960b86a832SChris Mason 
1697962a298fSDavid Sterba 		if (key.type != BTRFS_DEV_EXTENT_KEY)
16980b86a832SChris Mason 			goto next;
16990b86a832SChris Mason 
17003c538de0SJosef Bacik 		if (key.offset > search_end)
17013c538de0SJosef Bacik 			break;
17023c538de0SJosef Bacik 
17037bfc837dSMiao Xie 		if (key.offset > search_start) {
17047bfc837dSMiao Xie 			hole_size = key.offset - search_start;
17053b4ffa40SNaohiro Aota 			dev_extent_hole_check(device, &search_start, &hole_size,
17063b4ffa40SNaohiro Aota 					      num_bytes);
17076df9a95eSJosef Bacik 
17087bfc837dSMiao Xie 			if (hole_size > max_hole_size) {
17097bfc837dSMiao Xie 				max_hole_start = search_start;
17107bfc837dSMiao Xie 				max_hole_size = hole_size;
17117bfc837dSMiao Xie 			}
17127bfc837dSMiao Xie 
17137bfc837dSMiao Xie 			/*
17147bfc837dSMiao Xie 			 * If this free space is greater than which we need,
17157bfc837dSMiao Xie 			 * it must be the max free space that we have found
17167bfc837dSMiao Xie 			 * until now, so max_hole_start must point to the start
17177bfc837dSMiao Xie 			 * of this free space and the length of this free space
17187bfc837dSMiao Xie 			 * is stored in max_hole_size. Thus, we return
17197bfc837dSMiao Xie 			 * max_hole_start and max_hole_size and go back to the
17207bfc837dSMiao Xie 			 * caller.
17217bfc837dSMiao Xie 			 */
17227bfc837dSMiao Xie 			if (hole_size >= num_bytes) {
17237bfc837dSMiao Xie 				ret = 0;
17247bfc837dSMiao Xie 				goto out;
17257bfc837dSMiao Xie 			}
17267bfc837dSMiao Xie 		}
17277bfc837dSMiao Xie 
		/* Move search_start past the end of this dev extent. */
17280b86a832SChris Mason 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
17297bfc837dSMiao Xie 		extent_end = key.offset + btrfs_dev_extent_length(l,
17307bfc837dSMiao Xie 								  dev_extent);
17317bfc837dSMiao Xie 		if (extent_end > search_start)
17327bfc837dSMiao Xie 			search_start = extent_end;
17330b86a832SChris Mason next:
17340b86a832SChris Mason 		path->slots[0]++;
17350b86a832SChris Mason 		cond_resched();
17360b86a832SChris Mason 	}
17370b86a832SChris Mason 
173838c01b96Sliubo 	/*
173938c01b96Sliubo 	 * At this point, search_start should be the end of
174038c01b96Sliubo 	 * allocated dev extents, and when shrinking the device,
174138c01b96Sliubo 	 * search_end may be smaller than search_start.
174238c01b96Sliubo 	 */
1743f2ab7618SZhao Lei 	if (search_end > search_start) {
17447bfc837dSMiao Xie 		hole_size = search_end - search_start;
17453b4ffa40SNaohiro Aota 		if (dev_extent_hole_check(device, &search_start, &hole_size,
17463b4ffa40SNaohiro Aota 					  num_bytes)) {
1747f2ab7618SZhao Lei 			btrfs_release_path(path);
1748f2ab7618SZhao Lei 			goto again;
1749f2ab7618SZhao Lei 		}
1750f2ab7618SZhao Lei 
17517bfc837dSMiao Xie 		if (hole_size > max_hole_size) {
17527bfc837dSMiao Xie 			max_hole_start = search_start;
17537bfc837dSMiao Xie 			max_hole_size = hole_size;
17540b86a832SChris Mason 		}
17556df9a95eSJosef Bacik 	}
17566df9a95eSJosef Bacik 
17577bfc837dSMiao Xie 	/* See above. */
1758f2ab7618SZhao Lei 	if (max_hole_size < num_bytes)
17597bfc837dSMiao Xie 		ret = -ENOSPC;
17607bfc837dSMiao Xie 	else
17612b82032cSYan Zheng 		ret = 0;
17620b86a832SChris Mason 
17633c538de0SJosef Bacik 	ASSERT(max_hole_start + max_hole_size <= search_end);
17647bfc837dSMiao Xie out:
17652b82032cSYan Zheng 	btrfs_free_path(path);
17667bfc837dSMiao Xie 	*start = max_hole_start;
1767b2117a39SMiao Xie 	if (len)
17687bfc837dSMiao Xie 		*len = max_hole_size;
17690b86a832SChris Mason 	return ret;
17700b86a832SChris Mason }
17710b86a832SChris Mason 
/*
 * Delete the dev extent item on @device that covers byte @start and
 * return its length in *@dev_extent_len.  If an exact key match is not
 * found, fall back to the previous dev extent item, which must cover
 * @start (BUG_ON otherwise), and retry the deletion with its key.
 */
btrfs_free_dev_extent(struct btrfs_trans_handle * trans,struct btrfs_device * device,u64 start,u64 * dev_extent_len)1772b2950863SChristoph Hellwig static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
17738f18cf13SChris Mason 			  struct btrfs_device *device,
17742196d6e8SMiao Xie 			  u64 start, u64 *dev_extent_len)
17758f18cf13SChris Mason {
17760b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = device->fs_info;
17770b246afaSJeff Mahoney 	struct btrfs_root *root = fs_info->dev_root;
17788f18cf13SChris Mason 	int ret;
17798f18cf13SChris Mason 	struct btrfs_path *path;
17808f18cf13SChris Mason 	struct btrfs_key key;
1781a061fc8dSChris Mason 	struct btrfs_key found_key;
1782a061fc8dSChris Mason 	struct extent_buffer *leaf = NULL;
1783a061fc8dSChris Mason 	struct btrfs_dev_extent *extent = NULL;
17848f18cf13SChris Mason 
17858f18cf13SChris Mason 	path = btrfs_alloc_path();
17868f18cf13SChris Mason 	if (!path)
17878f18cf13SChris Mason 		return -ENOMEM;
17888f18cf13SChris Mason 
17898f18cf13SChris Mason 	key.objectid = device->devid;
17908f18cf13SChris Mason 	key.offset = start;
17918f18cf13SChris Mason 	key.type = BTRFS_DEV_EXTENT_KEY;
1792924cd8fbSMiao Xie again:
17938f18cf13SChris Mason 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1794a061fc8dSChris Mason 	if (ret > 0) {
		/* No exact match: the previous dev extent must contain @start. */
1795a061fc8dSChris Mason 		ret = btrfs_previous_item(root, path, key.objectid,
1796a061fc8dSChris Mason 					  BTRFS_DEV_EXTENT_KEY);
1797b0b802d7STsutomu Itoh 		if (ret)
1798b0b802d7STsutomu Itoh 			goto out;
1799a061fc8dSChris Mason 		leaf = path->nodes[0];
1800a061fc8dSChris Mason 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1801a061fc8dSChris Mason 		extent = btrfs_item_ptr(leaf, path->slots[0],
1802a061fc8dSChris Mason 					struct btrfs_dev_extent);
1803a061fc8dSChris Mason 		BUG_ON(found_key.offset > start || found_key.offset +
1804a061fc8dSChris Mason 		       btrfs_dev_extent_length(leaf, extent) < start);
1805924cd8fbSMiao Xie 		key = found_key;
1806924cd8fbSMiao Xie 		btrfs_release_path(path);
1807924cd8fbSMiao Xie 		goto again;
1808a061fc8dSChris Mason 	} else if (ret == 0) {
1809a061fc8dSChris Mason 		leaf = path->nodes[0];
1810a061fc8dSChris Mason 		extent = btrfs_item_ptr(leaf, path->slots[0],
1811a061fc8dSChris Mason 					struct btrfs_dev_extent);
181279787eaaSJeff Mahoney 	} else {
181379787eaaSJeff Mahoney 		goto out;
1814a061fc8dSChris Mason 	}
18158f18cf13SChris Mason 
18162196d6e8SMiao Xie 	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
18172196d6e8SMiao Xie 
18188f18cf13SChris Mason 	ret = btrfs_del_item(trans, root, path);
	/* Freed dev extent means this transaction now has free block groups. */
181979bd3712SFilipe Manana 	if (ret == 0)
18203204d33cSJosef Bacik 		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1821b0b802d7STsutomu Itoh out:
18228f18cf13SChris Mason 	btrfs_free_path(path);
18238f18cf13SChris Mason 	return ret;
18248f18cf13SChris Mason }
18258f18cf13SChris Mason 
/*
 * Return the logical start offset for the next chunk: the end (start + len)
 * of the last extent map in the mapping tree, or 0 if no chunks exist yet.
 */
find_next_chunk(struct btrfs_fs_info * fs_info)18266df9a95eSJosef Bacik static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
18270b86a832SChris Mason {
18286df9a95eSJosef Bacik 	struct extent_map_tree *em_tree;
18296df9a95eSJosef Bacik 	struct extent_map *em;
18306df9a95eSJosef Bacik 	struct rb_node *n;
18316df9a95eSJosef Bacik 	u64 ret = 0;
18320b86a832SChris Mason 
1833c8bf1b67SDavid Sterba 	em_tree = &fs_info->mapping_tree;
18346df9a95eSJosef Bacik 	read_lock(&em_tree->lock);
	/* rb_last() gives the extent map with the highest start offset. */
183507e1ce09SLiu Bo 	n = rb_last(&em_tree->map.rb_root);
18366df9a95eSJosef Bacik 	if (n) {
18376df9a95eSJosef Bacik 		em = rb_entry(n, struct extent_map, rb_node);
18386df9a95eSJosef Bacik 		ret = em->start + em->len;
1839e17cade2SChris Mason 	}
18406df9a95eSJosef Bacik 	read_unlock(&em_tree->lock);
18416df9a95eSJosef Bacik 
18420b86a832SChris Mason 	return ret;
18430b86a832SChris Mason }
18440b86a832SChris Mason 
/*
 * Find the next free device id by locating the DEV_ITEM with the highest
 * devid in the chunk tree.
 *
 * Sets *devid_ret to (highest existing devid + 1), or to 1 if no DEV_ITEM
 * exists.  Returns 0 on success, -EUCLEAN if the search unexpectedly hits
 * an exact match for devid (u64)-1 (corruption), or a negative errno.
 */
find_next_devid(struct btrfs_fs_info * fs_info,u64 * devid_ret)184753f10659SIlya Dryomov static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
184653f10659SIlya Dryomov 				    u64 *devid_ret)
18470b86a832SChris Mason {
18480b86a832SChris Mason 	int ret;
18490b86a832SChris Mason 	struct btrfs_key key;
18500b86a832SChris Mason 	struct btrfs_key found_key;
18512b82032cSYan Zheng 	struct btrfs_path *path;
18522b82032cSYan Zheng 
18532b82032cSYan Zheng 	path = btrfs_alloc_path();
18542b82032cSYan Zheng 	if (!path)
18552b82032cSYan Zheng 		return -ENOMEM;
18560b86a832SChris Mason 
	/* Search for the impossible devid (u64)-1 to land past the last item. */
18570b86a832SChris Mason 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
18580b86a832SChris Mason 	key.type = BTRFS_DEV_ITEM_KEY;
18590b86a832SChris Mason 	key.offset = (u64)-1;
18600b86a832SChris Mason 
186153f10659SIlya Dryomov 	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
18610b86a832SChris Mason 	if (ret < 0)
18620b86a832SChris Mason 		goto error;
18630b86a832SChris Mason 
1864a06dee4dSAnand Jain 	if (ret == 0) {
1865a06dee4dSAnand Jain 		/* Corruption */
1866a06dee4dSAnand Jain 		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1867a06dee4dSAnand Jain 		ret = -EUCLEAN;
1868a06dee4dSAnand Jain 		goto error;
1869a06dee4dSAnand Jain 	}
18700b86a832SChris Mason 
187153f10659SIlya Dryomov 	ret = btrfs_previous_item(fs_info->chunk_root, path,
187253f10659SIlya Dryomov 				  BTRFS_DEV_ITEMS_OBJECTID,
18730b86a832SChris Mason 				  BTRFS_DEV_ITEM_KEY);
18740b86a832SChris Mason 	if (ret) {
		/* No DEV_ITEM at all; devids start at 1. */
187553f10659SIlya Dryomov 		*devid_ret = 1;
18760b86a832SChris Mason 	} else {
18770b86a832SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
18780b86a832SChris Mason 				      path->slots[0]);
		/* key.offset of a DEV_ITEM is its devid. */
187953f10659SIlya Dryomov 		*devid_ret = found_key.offset + 1;
18800b86a832SChris Mason 	}
18810b86a832SChris Mason 	ret = 0;
18820b86a832SChris Mason error:
18832b82032cSYan Zheng 	btrfs_free_path(path);
18840b86a832SChris Mason 	return ret;
18850b86a832SChris Mason }
18870b86a832SChris Mason 
18880b86a832SChris Mason /*
18890b86a832SChris Mason  * the device information is stored in the chunk root
18900b86a832SChris Mason  * the btrfs_device struct should be fully filled in
 *
 * Inserts a DEV_ITEM keyed by device->devid and copies the device's
 * geometry, sizes, uuid and the filesystem's metadata_uuid into it.
18910b86a832SChris Mason  */
btrfs_add_dev_item(struct btrfs_trans_handle * trans,struct btrfs_device * device)1892c74a0b02SAnand Jain static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
18930b86a832SChris Mason 			    struct btrfs_device *device)
18940b86a832SChris Mason {
18950b86a832SChris Mason 	int ret;
18960b86a832SChris Mason 	struct btrfs_path *path;
18970b86a832SChris Mason 	struct btrfs_dev_item *dev_item;
18980b86a832SChris Mason 	struct extent_buffer *leaf;
18990b86a832SChris Mason 	struct btrfs_key key;
19000b86a832SChris Mason 	unsigned long ptr;
19010b86a832SChris Mason 
19020b86a832SChris Mason 	path = btrfs_alloc_path();
19030b86a832SChris Mason 	if (!path)
19040b86a832SChris Mason 		return -ENOMEM;
19050b86a832SChris Mason 
19060b86a832SChris Mason 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
19070b86a832SChris Mason 	key.type = BTRFS_DEV_ITEM_KEY;
19082b82032cSYan Zheng 	key.offset = device->devid;
19090b86a832SChris Mason 
	/* Chunk-tree modifications need their own metadata reservation. */
19102bb2e00eSFilipe Manana 	btrfs_reserve_chunk_metadata(trans, true);
19118e87e856SNikolay Borisov 	ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
19128e87e856SNikolay Borisov 				      &key, sizeof(*dev_item));
19132bb2e00eSFilipe Manana 	btrfs_trans_release_chunk_metadata(trans);
19140b86a832SChris Mason 	if (ret)
19150b86a832SChris Mason 		goto out;
19160b86a832SChris Mason 
19170b86a832SChris Mason 	leaf = path->nodes[0];
19180b86a832SChris Mason 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
19190b86a832SChris Mason 
19200b86a832SChris Mason 	btrfs_set_device_id(leaf, dev_item, device->devid);
19212b82032cSYan Zheng 	btrfs_set_device_generation(leaf, dev_item, 0);
19220b86a832SChris Mason 	btrfs_set_device_type(leaf, dev_item, device->type);
19230b86a832SChris Mason 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
19240b86a832SChris Mason 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
19250b86a832SChris Mason 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
19267cc8e58dSMiao Xie 	btrfs_set_device_total_bytes(leaf, dev_item,
19277cc8e58dSMiao Xie 				     btrfs_device_get_disk_total_bytes(device));
19287cc8e58dSMiao Xie 	btrfs_set_device_bytes_used(leaf, dev_item,
19297cc8e58dSMiao Xie 				    btrfs_device_get_bytes_used(device));
	/* Group/seek_speed/bandwidth/start_offset are unused; always zero. */
1930e17cade2SChris Mason 	btrfs_set_device_group(leaf, dev_item, 0);
1931e17cade2SChris Mason 	btrfs_set_device_seek_speed(leaf, dev_item, 0);
1932e17cade2SChris Mason 	btrfs_set_device_bandwidth(leaf, dev_item, 0);
1933c3027eb5SChris Mason 	btrfs_set_device_start_offset(leaf, dev_item, 0);
19340b86a832SChris Mason 
1935410ba3a2SGeert Uytterhoeven 	ptr = btrfs_device_uuid(dev_item);
1936e17cade2SChris Mason 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
19371473b24eSGeert Uytterhoeven 	ptr = btrfs_device_fsid(dev_item);
1938de37aa51SNikolay Borisov 	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1939de37aa51SNikolay Borisov 			    ptr, BTRFS_FSID_SIZE);
1940d5e09e38SFilipe Manana 	btrfs_mark_buffer_dirty(trans, leaf);
19410b86a832SChris Mason 
19422b82032cSYan Zheng 	ret = 0;
19430b86a832SChris Mason out:
19440b86a832SChris Mason 	btrfs_free_path(path);
19450b86a832SChris Mason 	return ret;
19460b86a832SChris Mason }
19478f18cf13SChris Mason 
19485a1972bdSQu Wenruo /*
19495a1972bdSQu Wenruo  * Function to update ctime/mtime for a given device path.
19505a1972bdSQu Wenruo  * Mainly used for ctime/mtime based probe like libblkid.
195154fde91fSJosef Bacik  *
195254fde91fSJosef Bacik  * We don't care about errors here, this is just to be kind to userspace.
19535a1972bdSQu Wenruo  */
update_dev_time(const char * device_path)195454fde91fSJosef Bacik static void update_dev_time(const char *device_path)
19555a1972bdSQu Wenruo {
195654fde91fSJosef Bacik 	struct path path;
195754fde91fSJosef Bacik 	int ret;
19585a1972bdSQu Wenruo 
	/* Resolve the path (following symlinks); silently give up on failure. */
195954fde91fSJosef Bacik 	ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
196054fde91fSJosef Bacik 	if (ret)
19615a1972bdSQu Wenruo 		return;
19628f96a5bfSJosef Bacik 
1963913e9928SJeff Layton 	inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION);
196454fde91fSJosef Bacik 	path_put(&path);
19655a1972bdSQu Wenruo }
19665a1972bdSQu Wenruo 
/*
 * Delete the DEV_ITEM for @device from the chunk tree.
 *
 * Returns 0 on success, -ENOENT if the item does not exist, or a negative
 * errno on search/delete failure.
 */
btrfs_rm_dev_item(struct btrfs_trans_handle * trans,struct btrfs_device * device)1967bbac5869SQu Wenruo static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
1968bbac5869SQu Wenruo 			     struct btrfs_device *device)
1969a061fc8dSChris Mason {
1970f331a952SDavid Sterba 	struct btrfs_root *root = device->fs_info->chunk_root;
1971a061fc8dSChris Mason 	int ret;
1972a061fc8dSChris Mason 	struct btrfs_path *path;
1973a061fc8dSChris Mason 	struct btrfs_key key;
1974a061fc8dSChris Mason 
1975a061fc8dSChris Mason 	path = btrfs_alloc_path();
1976a061fc8dSChris Mason 	if (!path)
1977a061fc8dSChris Mason 		return -ENOMEM;
1978a061fc8dSChris Mason 
1979a061fc8dSChris Mason 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1980a061fc8dSChris Mason 	key.type = BTRFS_DEV_ITEM_KEY;
1981a061fc8dSChris Mason 	key.offset = device->devid;
1982a061fc8dSChris Mason 
	/* Chunk-tree modifications need their own metadata reservation. */
19832bb2e00eSFilipe Manana 	btrfs_reserve_chunk_metadata(trans, false);
1984a061fc8dSChris Mason 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
19852bb2e00eSFilipe Manana 	btrfs_trans_release_chunk_metadata(trans);
19865e9f2ad5SNikolay Borisov 	if (ret) {
		/* ret > 0 means no exact match: the item is already gone. */
19875e9f2ad5SNikolay Borisov 		if (ret > 0)
1988a061fc8dSChris Mason 			ret = -ENOENT;
1989a061fc8dSChris Mason 		goto out;
1990a061fc8dSChris Mason 	}
1991a061fc8dSChris Mason 
1992a061fc8dSChris Mason 	ret = btrfs_del_item(trans, root, path);
1993a061fc8dSChris Mason out:
1994a061fc8dSChris Mason 	btrfs_free_path(path);
1995a061fc8dSChris Mason 	return ret;
1996a061fc8dSChris Mason }
1997a061fc8dSChris Mason 
19983cc31a0dSDavid Sterba /*
19993cc31a0dSDavid Sterba  * Verify that @num_devices satisfies the RAID profile constraints in the whole
20003cc31a0dSDavid Sterba  * filesystem. It's up to the caller to adjust that number regarding eg. device
20013cc31a0dSDavid Sterba  * replace.
 *
 * Returns 0 if every allocation profile currently in use can live with
 * @num_devices devices, otherwise the profile's mindev_error code.
20023cc31a0dSDavid Sterba  */
btrfs_check_raid_min_devices(struct btrfs_fs_info * fs_info,u64 num_devices)20033cc31a0dSDavid Sterba static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
20043cc31a0dSDavid Sterba 		u64 num_devices)
2005a061fc8dSChris Mason {
2006a061fc8dSChris Mason 	u64 all_avail;
2007de98ced9SMiao Xie 	unsigned seq;
2008418775a2SDavid Sterba 	int i;
2009a061fc8dSChris Mason 
	/* Take a consistent snapshot of the in-use profile bits (seqlock). */
2010de98ced9SMiao Xie 	do {
2011bd45ffbcSAnand Jain 		seq = read_seqbegin(&fs_info->profiles_lock);
2012de98ced9SMiao Xie 
2013bd45ffbcSAnand Jain 		all_avail = fs_info->avail_data_alloc_bits |
2014bd45ffbcSAnand Jain 			    fs_info->avail_system_alloc_bits |
2015bd45ffbcSAnand Jain 			    fs_info->avail_metadata_alloc_bits;
2016bd45ffbcSAnand Jain 	} while (read_seqretry(&fs_info->profiles_lock, seq));
2017f1fa7f26SAnand Jain 
2018418775a2SDavid Sterba 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
201941a6e891SAnand Jain 		if (!(all_avail & btrfs_raid_array[i].bg_flag))
2020418775a2SDavid Sterba 			continue;
2021a061fc8dSChris Mason 
		/* Profile is in use; enforce its minimum device count. */
2022efc222f8SAnand Jain 		if (num_devices < btrfs_raid_array[i].devs_min)
2023efc222f8SAnand Jain 			return btrfs_raid_array[i].mindev_error;
2024bd45ffbcSAnand Jain 	}
2025bd45ffbcSAnand Jain 
2026bd45ffbcSAnand Jain 	return 0;
2027f1fa7f26SAnand Jain }
2028f1fa7f26SAnand Jain 
/*
 * Return the first device in @fs_devs that is not @device, is not missing
 * and has an open bdev, or NULL if there is no such device.
 */
btrfs_find_next_active_device(struct btrfs_fs_devices * fs_devs,struct btrfs_device * device)2029c9162bdfSOmar Sandoval static struct btrfs_device * btrfs_find_next_active_device(
2030c9162bdfSOmar Sandoval 		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
203188acff64SAnand Jain {
203288acff64SAnand Jain 	struct btrfs_device *next_device;
203388acff64SAnand Jain 
203488acff64SAnand Jain 	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
203588acff64SAnand Jain 		if (next_device != device &&
2036e6e674bdSAnand Jain 		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
2037e6e674bdSAnand Jain 		    && next_device->bdev)
203888acff64SAnand Jain 			return next_device;
203988acff64SAnand Jain 	}
204088acff64SAnand Jain 
204188acff64SAnand Jain 	return NULL;
204288acff64SAnand Jain }
204388acff64SAnand Jain 
204488acff64SAnand Jain /*
2045d24fa5c1SAnand Jain  * Helper function to check if the given device is part of s_bdev / latest_dev
204688acff64SAnand Jain  * and replace it with the provided or the next active device.  In the context
204788acff64SAnand Jain  * where this function is called, there should always be another device (or
204888acff64SAnand Jain  * next_device) which is active.
204988acff64SAnand Jain  */
btrfs_assign_next_active_device(struct btrfs_device * device,struct btrfs_device * next_device)2050b105e927SDavid Sterba void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
2051e493e8f9SAnand Jain 					    struct btrfs_device *next_device)
205288acff64SAnand Jain {
2053d6507cf1SNikolay Borisov 	struct btrfs_fs_info *fs_info = device->fs_info;
205488acff64SAnand Jain 
	/* Caller may pass NULL; pick any remaining active device ourselves. */
2055e493e8f9SAnand Jain 	if (!next_device)
205688acff64SAnand Jain 		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
205788acff64SAnand Jain 							    device);
205888acff64SAnand Jain 	ASSERT(next_device);
205988acff64SAnand Jain 
206088acff64SAnand Jain 	if (fs_info->sb->s_bdev &&
206188acff64SAnand Jain 			(fs_info->sb->s_bdev == device->bdev))
206288acff64SAnand Jain 		fs_info->sb->s_bdev = next_device->bdev;
206388acff64SAnand Jain 
2064d24fa5c1SAnand Jain 	if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
2065d24fa5c1SAnand Jain 		fs_info->fs_devices->latest_dev = next_device;
206688acff64SAnand Jain }
206788acff64SAnand Jain 
20681da73967SAnand Jain /*
20691da73967SAnand Jain  * Return btrfs_fs_devices::num_devices excluding the device that's being
20701da73967SAnand Jain  * currently replaced.
20711da73967SAnand Jain  */
btrfs_num_devices(struct btrfs_fs_info * fs_info)20721da73967SAnand Jain static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
20731da73967SAnand Jain {
20741da73967SAnand Jain 	u64 num_devices = fs_info->fs_devices->num_devices;
20751da73967SAnand Jain 
	/* dev_replace.rwsem keeps the replace state stable while we check it. */
2076cb5583ddSDavid Sterba 	down_read(&fs_info->dev_replace.rwsem);
20771da73967SAnand Jain 	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
20781da73967SAnand Jain 		ASSERT(num_devices > 1);
20791da73967SAnand Jain 		num_devices--;
20801da73967SAnand Jain 	}
2081cb5583ddSDavid Sterba 	up_read(&fs_info->dev_replace.rwsem);
20821da73967SAnand Jain 
20831da73967SAnand Jain 	return num_devices;
20841da73967SAnand Jain }
20851da73967SAnand Jain 
/*
 * Wipe the magic from superblock copy @copy_num on @bdev so the device is
 * no longer recognized as btrfs, then sync the range back to disk.
 *
 * Errors are only warned about; this is best-effort cleanup.
 */
btrfs_scratch_superblock(struct btrfs_fs_info * fs_info,struct block_device * bdev,int copy_num)20860e0078f7SChristoph Hellwig static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info,
20870e0078f7SChristoph Hellwig 				     struct block_device *bdev, int copy_num)
20886fbceb9fSJohannes Thumshirn {
20896fbceb9fSJohannes Thumshirn 	struct btrfs_super_block *disk_super;
209026ecf243SChristoph Hellwig 	const size_t len = sizeof(disk_super->magic);
209126ecf243SChristoph Hellwig 	const u64 bytenr = btrfs_sb_offset(copy_num);
20928f32380dSJohannes Thumshirn 	int ret;
20938f32380dSJohannes Thumshirn 
	/* Returns an error pointer if the copy is absent or unreadable. */
209426ecf243SChristoph Hellwig 	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr);
20958f32380dSJohannes Thumshirn 	if (IS_ERR(disk_super))
20960e0078f7SChristoph Hellwig 		return;
209712659251SNaohiro Aota 
	/* Zeroing just the magic is enough to defeat btrfs detection. */
209826ecf243SChristoph Hellwig 	memset(&disk_super->magic, 0, len);
209926ecf243SChristoph Hellwig 	folio_mark_dirty(virt_to_folio(disk_super));
210026ecf243SChristoph Hellwig 	btrfs_release_disk_super(disk_super);
210126ecf243SChristoph Hellwig 
210226ecf243SChristoph Hellwig 	ret = sync_blockdev_range(bdev, bytenr, bytenr + len - 1);
21038f32380dSJohannes Thumshirn 	if (ret)
21040e0078f7SChristoph Hellwig 		btrfs_warn(fs_info, "error clearing superblock number %d (%d)",
21058f32380dSJohannes Thumshirn 			copy_num, ret);
21060e0078f7SChristoph Hellwig }
21078f32380dSJohannes Thumshirn 
/*
 * Invalidate all superblock copies on @bdev so the device stops being
 * detected as btrfs, then notify userspace (udev, libblkid).
 *
 * On zoned devices the superblock log zones are reset instead of being
 * overwritten in place.  No-op if @bdev is NULL (e.g. a missing device).
 */
btrfs_scratch_superblocks(struct btrfs_fs_info * fs_info,struct block_device * bdev,const char * device_path)21080e0078f7SChristoph Hellwig void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
21090e0078f7SChristoph Hellwig 			       struct block_device *bdev,
21100e0078f7SChristoph Hellwig 			       const char *device_path)
21110e0078f7SChristoph Hellwig {
21120e0078f7SChristoph Hellwig 	int copy_num;
21130e0078f7SChristoph Hellwig 
21140e0078f7SChristoph Hellwig 	if (!bdev)
21150e0078f7SChristoph Hellwig 		return;
21160e0078f7SChristoph Hellwig 
21170e0078f7SChristoph Hellwig 	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
21180e0078f7SChristoph Hellwig 		if (bdev_is_zoned(bdev))
21190e0078f7SChristoph Hellwig 			btrfs_reset_sb_log_zones(bdev, copy_num);
21200e0078f7SChristoph Hellwig 		else
21210e0078f7SChristoph Hellwig 			btrfs_scratch_superblock(fs_info, bdev, copy_num);
21226fbceb9fSJohannes Thumshirn 	}
21236fbceb9fSJohannes Thumshirn 
21246fbceb9fSJohannes Thumshirn 	/* Notify udev that device has changed */
21256fbceb9fSJohannes Thumshirn 	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
21266fbceb9fSJohannes Thumshirn 
21276fbceb9fSJohannes Thumshirn 	/* Update ctime/mtime for device path for libblkid */
212854fde91fSJosef Bacik 	update_dev_time(device_path);
21296fbceb9fSJohannes Thumshirn }
21306fbceb9fSJohannes Thumshirn 
/*
 * Remove the device described by @args from a mounted filesystem.
 *
 * Checks RAID minimum-device constraints and swapfile pinning, shrinks the
 * device to zero, deletes its DEV_ITEM in a transaction, unlinks it from
 * the device lists and sysfs, updates the superblock device count, and
 * scratches its superblocks.  The block_device and holder are handed back
 * to the caller via @bdev/@holder for the final blkdev_put (we cannot do
 * it here, see the comment before the scratch step below).  On failure
 * before the transaction starts, the device is restored to the alloc list
 * (error_undo).
 */
btrfs_rm_device(struct btrfs_fs_info * fs_info,struct btrfs_dev_lookup_args * args,struct block_device ** bdev,void ** holder)21311a15eb72SJosef Bacik int btrfs_rm_device(struct btrfs_fs_info *fs_info,
21321a15eb72SJosef Bacik 		    struct btrfs_dev_lookup_args *args,
21332736e8eeSChristoph Hellwig 		    struct block_device **bdev, void **holder)
2134f1fa7f26SAnand Jain {
2135bbac5869SQu Wenruo 	struct btrfs_trans_handle *trans;
2136f1fa7f26SAnand Jain 	struct btrfs_device *device;
2137f1fa7f26SAnand Jain 	struct btrfs_fs_devices *cur_devices;
2138b5185197SAnand Jain 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2139f1fa7f26SAnand Jain 	u64 num_devices;
2140f1fa7f26SAnand Jain 	int ret = 0;
2141f1fa7f26SAnand Jain 
2142914a519bSJosef Bacik 	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
2143914a519bSJosef Bacik 		btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
2144914a519bSJosef Bacik 		return -EINVAL;
2145914a519bSJosef Bacik 	}
2146914a519bSJosef Bacik 
21478ef9dc0fSJosef Bacik 	/*
21488ef9dc0fSJosef Bacik 	 * The device list in fs_devices is accessed without locks (neither
21498ef9dc0fSJosef Bacik 	 * uuid_mutex nor device_list_mutex) as it won't change on a mounted
21508ef9dc0fSJosef Bacik 	 * filesystem and another device rm cannot run.
21518ef9dc0fSJosef Bacik 	 */
21521da73967SAnand Jain 	num_devices = btrfs_num_devices(fs_info);
2153a061fc8dSChris Mason 
	/* Make sure the RAID profiles in use survive losing one device. */
21540b246afaSJeff Mahoney 	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
2155beaf8ab3SStefan Behrens 	if (ret)
2156bbac5869SQu Wenruo 		return ret;
2157f1fa7f26SAnand Jain 
21581a15eb72SJosef Bacik 	device = btrfs_find_device(fs_info->fs_devices, args);
21591a15eb72SJosef Bacik 	if (!device) {
21601a15eb72SJosef Bacik 		if (args->missing)
2161a27a94c2SNikolay Borisov 			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2162a27a94c2SNikolay Borisov 		else
21631a15eb72SJosef Bacik 			ret = -ENOENT;
2164bbac5869SQu Wenruo 		return ret;
2165a27a94c2SNikolay Borisov 	}
21662b82032cSYan Zheng 
2167eede2bf3SOmar Sandoval 	if (btrfs_pinned_by_swapfile(fs_info, device)) {
2168eede2bf3SOmar Sandoval 		btrfs_warn_in_rcu(fs_info,
2169eede2bf3SOmar Sandoval 		  "cannot remove device %s (devid %llu) due to active swapfile",
2170cb3e217bSQu Wenruo 				  btrfs_dev_name(device), device->devid);
2171bbac5869SQu Wenruo 		return -ETXTBSY;
2172eede2bf3SOmar Sandoval 	}
2173eede2bf3SOmar Sandoval 
2174bbac5869SQu Wenruo 	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
2175bbac5869SQu Wenruo 		return BTRFS_ERROR_DEV_TGT_REPLACE;
217663a212abSStefan Behrens 
	/* Refuse to remove the last writeable device. */
2177ebbede42SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
2178bbac5869SQu Wenruo 	    fs_info->fs_devices->rw_devices == 1)
2179bbac5869SQu Wenruo 		return BTRFS_ERROR_DEV_ONLY_WRITABLE;
21802b82032cSYan Zheng 
	/*
	 * Take the device out of the allocation list first so no new chunks
	 * land on it while we shrink it.  Undone in error_undo on failure.
	 */
2181ebbede42SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
218234441361SDavid Sterba 		mutex_lock(&fs_info->chunk_mutex);
21832b82032cSYan Zheng 		list_del_init(&device->dev_alloc_list);
2184c3929c36SMiao Xie 		device->fs_devices->rw_devices--;
218534441361SDavid Sterba 		mutex_unlock(&fs_info->chunk_mutex);
21862b82032cSYan Zheng 	}
2187a061fc8dSChris Mason 
	/* Relocate all chunks off the device by shrinking it to zero. */
2188a061fc8dSChris Mason 	ret = btrfs_shrink_device(device, 0);
2189a061fc8dSChris Mason 	if (ret)
21909b3517e9SIlya Dryomov 		goto error_undo;
2191a061fc8dSChris Mason 
2192bbac5869SQu Wenruo 	trans = btrfs_start_transaction(fs_info->chunk_root, 0);
2193bbac5869SQu Wenruo 	if (IS_ERR(trans)) {
2194bbac5869SQu Wenruo 		ret = PTR_ERR(trans);
21959b3517e9SIlya Dryomov 		goto error_undo;
2196bbac5869SQu Wenruo 	}
2197bbac5869SQu Wenruo 
2198bbac5869SQu Wenruo 	ret = btrfs_rm_dev_item(trans, device);
2199bbac5869SQu Wenruo 	if (ret) {
2200bbac5869SQu Wenruo 		/* Any error in dev item removal is critical */
2201bbac5869SQu Wenruo 		btrfs_crit(fs_info,
2202bbac5869SQu Wenruo 			   "failed to remove device item for devid %llu: %d",
2203bbac5869SQu Wenruo 			   device->devid, ret);
2204bbac5869SQu Wenruo 		btrfs_abort_transaction(trans, ret);
2205bbac5869SQu Wenruo 		btrfs_end_transaction(trans);
2206bbac5869SQu Wenruo 		return ret;
2207bbac5869SQu Wenruo 	}
2208a061fc8dSChris Mason 
2209e12c9621SAnand Jain 	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2210163e97eeSDavid Sterba 	btrfs_scrub_cancel_dev(device);
2211e5e9a520SChris Mason 
2212e5e9a520SChris Mason 	/*
2213e5e9a520SChris Mason 	 * the device list mutex makes sure that we don't change
2214e5e9a520SChris Mason 	 * the device list while someone else is writing out all
2215d7306801SFilipe David Borba Manana 	 * the device supers. Whoever is writing all supers, should
2216d7306801SFilipe David Borba Manana 	 * lock the device list mutex before getting the number of
2217d7306801SFilipe David Borba Manana 	 * devices in the super block (super_copy). Conversely,
2218d7306801SFilipe David Borba Manana 	 * whoever updates the number of devices in the super block
2219d7306801SFilipe David Borba Manana 	 * (super_copy) should hold the device list mutex.
2220e5e9a520SChris Mason 	 */
22211f78160cSXiao Guangrong 
222241a52a0fSAnand Jain 	/*
222341a52a0fSAnand Jain 	 * In normal cases the cur_devices == fs_devices. But in case
222441a52a0fSAnand Jain 	 * of deleting a seed device, the cur_devices should point to
22259675ea8cSSu Yue 	 * its own fs_devices listed under the fs_devices->seed_list.
222641a52a0fSAnand Jain 	 */
22271f78160cSXiao Guangrong 	cur_devices = device->fs_devices;
2228b5185197SAnand Jain 	mutex_lock(&fs_devices->device_list_mutex);
22291f78160cSXiao Guangrong 	list_del_rcu(&device->dev_list);
2230e5e9a520SChris Mason 
223141a52a0fSAnand Jain 	cur_devices->num_devices--;
223241a52a0fSAnand Jain 	cur_devices->total_devices--;
2233b4993e64SAnand Jain 	/* Update total_devices of the parent fs_devices if it's seed */
2234b4993e64SAnand Jain 	if (cur_devices != fs_devices)
2235b4993e64SAnand Jain 		fs_devices->total_devices--;
22362b82032cSYan Zheng 
2237e6e674bdSAnand Jain 	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
223841a52a0fSAnand Jain 		cur_devices->missing_devices--;
2239cd02dca5SChris Mason 
	/* Repoint sb->s_bdev / latest_dev away from the removed device. */
2240d6507cf1SNikolay Borisov 	btrfs_assign_next_active_device(device, NULL);
22412b82032cSYan Zheng 
22420bfaa9c5SEric Sandeen 	if (device->bdev) {
224341a52a0fSAnand Jain 		cur_devices->open_devices--;
224499994cdeSAnand Jain 		/* remove sysfs entry */
224553f8a74cSAnand Jain 		btrfs_sysfs_remove_device(device);
22460bfaa9c5SEric Sandeen 	}
224799994cdeSAnand Jain 
22480b246afaSJeff Mahoney 	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
22490b246afaSJeff Mahoney 	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2250b5185197SAnand Jain 	mutex_unlock(&fs_devices->device_list_mutex);
2251e4404d6eSYan Zheng 
2252cea67ab9SJeff Mahoney 	/*
22533fa421deSJosef Bacik 	 * At this point, the device is zero sized and detached from the
22543fa421deSJosef Bacik 	 * devices list.  All that's left is to zero out the old supers and
22553fa421deSJosef Bacik 	 * free the device.
22563fa421deSJosef Bacik 	 *
22573fa421deSJosef Bacik 	 * We cannot call btrfs_close_bdev() here because we're holding the sb
22583fa421deSJosef Bacik 	 * write lock, and blkdev_put() will pull in the ->open_mutex on the
22593fa421deSJosef Bacik 	 * block device and it's dependencies.  Instead just flush the device
22603fa421deSJosef Bacik 	 * and let the caller do the final blkdev_put.
2261cea67ab9SJeff Mahoney 	 */
22623fa421deSJosef Bacik 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
22638f32380dSJohannes Thumshirn 		btrfs_scratch_superblocks(fs_info, device->bdev,
22648f32380dSJohannes Thumshirn 					  device->name->str);
22653fa421deSJosef Bacik 		if (device->bdev) {
22663fa421deSJosef Bacik 			sync_blockdev(device->bdev);
22673fa421deSJosef Bacik 			invalidate_bdev(device->bdev);
22683fa421deSJosef Bacik 		}
22693fa421deSJosef Bacik 	}
2270cea67ab9SJeff Mahoney 
	/* Hand the bdev/holder to the caller for the final blkdev_put. */
22713fa421deSJosef Bacik 	*bdev = device->bdev;
22722736e8eeSChristoph Hellwig 	*holder = device->holder;
	/* Wait out RCU readers of the dev_list before freeing the device. */
22738e75fd89SNikolay Borisov 	synchronize_rcu();
22748e75fd89SNikolay Borisov 	btrfs_free_device(device);
2275cea67ab9SJeff Mahoney 
22768b41393fSJosef Bacik 	/*
22778b41393fSJosef Bacik 	 * This can happen if cur_devices is the private seed devices list.  We
22788b41393fSJosef Bacik 	 * cannot call close_fs_devices() here because it expects the uuid_mutex
22798b41393fSJosef Bacik 	 * to be held, but in fact we don't need that for the private
22808b41393fSJosef Bacik 	 * seed_devices, we can simply decrement cur_devices->opened and then
22818b41393fSJosef Bacik 	 * remove it from our list and free the fs_devices.
22828b41393fSJosef Bacik 	 */
22838e906945SAnand Jain 	if (cur_devices->num_devices == 0) {
2284944d3f9fSNikolay Borisov 		list_del_init(&cur_devices->seed_list);
22858b41393fSJosef Bacik 		ASSERT(cur_devices->opened == 1);
22868b41393fSJosef Bacik 		cur_devices->opened--;
22871f78160cSXiao Guangrong 		free_fs_devices(cur_devices);
22882b82032cSYan Zheng 	}
22892b82032cSYan Zheng 
2290bbac5869SQu Wenruo 	ret = btrfs_commit_transaction(trans);
2291bbac5869SQu Wenruo 
2292a061fc8dSChris Mason 	return ret;
229324fc572fSAnand Jain 
22949b3517e9SIlya Dryomov error_undo:
	/* Put the device back on the allocation list; removal did not happen. */
2295ebbede42SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
229634441361SDavid Sterba 		mutex_lock(&fs_info->chunk_mutex);
22979b3517e9SIlya Dryomov 		list_add(&device->dev_alloc_list,
2298b5185197SAnand Jain 			 &fs_devices->alloc_list);
2299c3929c36SMiao Xie 		device->fs_devices->rw_devices++;
230034441361SDavid Sterba 		mutex_unlock(&fs_info->chunk_mutex);
23019b3517e9SIlya Dryomov 	}
2302bbac5869SQu Wenruo 	return ret;
2303a061fc8dSChris Mason }
2304a061fc8dSChris Mason 
/*
 * Unlink the replace source device @srcdev from its fs_devices and adjust
 * the device counters.  Caller must hold device_list_mutex (asserted
 * below); the device itself is closed/freed later by
 * btrfs_rm_dev_replace_free_srcdev().
 */
btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device * srcdev)230568a9db5fSNikolay Borisov void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2306e93c89c1SStefan Behrens {
2307d51908ceSAnand Jain 	struct btrfs_fs_devices *fs_devices;
2308d51908ceSAnand Jain 
230968a9db5fSNikolay Borisov 	lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
23101357272fSIlya Dryomov 
231125e8e911SAnand Jain 	/*
231225e8e911SAnand Jain 	 * in case of fs with no seed, srcdev->fs_devices will point
231325e8e911SAnand Jain 	 * to fs_devices of fs_info. However when the dev being replaced is
231425e8e911SAnand Jain 	 * a seed dev it will point to the seed's local fs_devices. In short
231525e8e911SAnand Jain 	 * srcdev will have its correct fs_devices in both the cases.
231625e8e911SAnand Jain 	 */
231725e8e911SAnand Jain 	fs_devices = srcdev->fs_devices;
2318d51908ceSAnand Jain 
2319e93c89c1SStefan Behrens 	list_del_rcu(&srcdev->dev_list);
2320619c47f3SDavid Sterba 	list_del(&srcdev->dev_alloc_list);
2321d51908ceSAnand Jain 	fs_devices->num_devices--;
2322e6e674bdSAnand Jain 	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2323d51908ceSAnand Jain 		fs_devices->missing_devices--;
2324e93c89c1SStefan Behrens 
2325ebbede42SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
232682372bc8SMiao Xie 		fs_devices->rw_devices--;
23271357272fSIlya Dryomov 
232882372bc8SMiao Xie 	if (srcdev->bdev)
232982372bc8SMiao Xie 		fs_devices->open_devices--;
2330084b6e7cSQu Wenruo }
2331084b6e7cSQu Wenruo 
/*
 * Close and free the replace source device @srcdev.  If its fs_devices
 * (a seed's private list) is left with no devices, tear that fs_devices
 * down as well.  Takes uuid_mutex for the fs_devices teardown.
 */
btrfs_rm_dev_replace_free_srcdev(struct btrfs_device * srcdev)233365237ee3SDavid Sterba void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2333084b6e7cSQu Wenruo {
2334084b6e7cSQu Wenruo 	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
233582372bc8SMiao Xie 
2336a466c85eSJosef Bacik 	mutex_lock(&uuid_mutex);
2337a466c85eSJosef Bacik 
233814238819SAnand Jain 	btrfs_close_bdev(srcdev);
	/* Wait out RCU readers of the dev_list before freeing the device. */
23388e75fd89SNikolay Borisov 	synchronize_rcu();
23398e75fd89SNikolay Borisov 	btrfs_free_device(srcdev);
234094d5f0c2SAnand Jain 
234194d5f0c2SAnand Jain 	/* if this is no devs we rather delete the fs_devices */
234294d5f0c2SAnand Jain 	if (!fs_devices->num_devices) {
23436dd38f81SAnand Jain 		/*
23446dd38f81SAnand Jain 		 * On a mounted FS, num_devices can't be zero unless it's a
23456dd38f81SAnand Jain 		 * seed. In case of a seed device being replaced, the replace
23466dd38f81SAnand Jain 		 * target added to the sprout FS, so there will be no more
23476dd38f81SAnand Jain 		 * device left under the seed FS.
23486dd38f81SAnand Jain 		 */
23496dd38f81SAnand Jain 		ASSERT(fs_devices->seeding);
23506dd38f81SAnand Jain 
2351944d3f9fSNikolay Borisov 		list_del_init(&fs_devices->seed_list);
23520226e0ebSAnand Jain 		close_fs_devices(fs_devices);
23538bef8401SAnand Jain 		free_fs_devices(fs_devices);
235494d5f0c2SAnand Jain 	}
2355a466c85eSJosef Bacik 	mutex_unlock(&uuid_mutex);
2356e93c89c1SStefan Behrens }
2358e93c89c1SStefan Behrens 
/*
 * Tear down the replace target device @tgtdev: remove it from sysfs and
 * the device lists (under device_list_mutex), scratch its superblocks so
 * it is no longer detected as btrfs, then close and free it.
 */
btrfs_destroy_dev_replace_tgtdev(struct btrfs_device * tgtdev)23594f5ad7bdSNikolay Borisov void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2360e93c89c1SStefan Behrens {
23614f5ad7bdSNikolay Borisov 	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2362d2ff1b20SAnand Jain 
2363d9a071f0SAnand Jain 	mutex_lock(&fs_devices->device_list_mutex);
2364d9a071f0SAnand Jain 
236553f8a74cSAnand Jain 	btrfs_sysfs_remove_device(tgtdev);
2366d2ff1b20SAnand Jain 
2367779bf3feSAnand Jain 	if (tgtdev->bdev)
2368d9a071f0SAnand Jain 		fs_devices->open_devices--;
2369779bf3feSAnand Jain 
2370d9a071f0SAnand Jain 	fs_devices->num_devices--;
2371e93c89c1SStefan Behrens 
	/* Repoint sb->s_bdev / latest_dev away from the removed device. */
2372d6507cf1SNikolay Borisov 	btrfs_assign_next_active_device(tgtdev, NULL);
2373e93c89c1SStefan Behrens 
2374e93c89c1SStefan Behrens 	list_del_rcu(&tgtdev->dev_list);
2375e93c89c1SStefan Behrens 
2376d9a071f0SAnand Jain 	mutex_unlock(&fs_devices->device_list_mutex);
2377779bf3feSAnand Jain 
23788f32380dSJohannes Thumshirn 	btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
23798f32380dSJohannes Thumshirn 				  tgtdev->name->str);
238014238819SAnand Jain 
238114238819SAnand Jain 	btrfs_close_bdev(tgtdev);
	/* Wait out RCU readers of the dev_list before freeing the device. */
23828e75fd89SNikolay Borisov 	synchronize_rcu();
23838e75fd89SNikolay Borisov 	btrfs_free_device(tgtdev);
2384e93c89c1SStefan Behrens }
2385e93c89c1SStefan Behrens 
/*
 * Populate args from device at path.
 *
 * @fs_info:	the filesystem
 * @args:	the args to populate
 * @path:	the path to the device
 *
 * This will read the super block of the device at @path and populate @args with
 * the devid, fsid, and uuid.  This is meant to be used for ioctls that need to
 * lookup a device to operate on, but need to do it before we take any locks.
 * This properly handles the special case of "missing" that a user may pass in,
 * and does some basic sanity checks.  The caller must make sure that @path is
 * properly NUL terminated before calling in, and must call
 * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
 * uuid buffers.
 *
 * Return: 0 for success, -errno for failure
 */
int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
				 struct btrfs_dev_lookup_args *args,
				 const char *path)
{
	struct btrfs_super_block *disk_super;
	struct block_device *bdev;
	int ret;

	if (!path || !path[0])
		return -EINVAL;
	/* For "missing" only record the flag; no superblock is read. */
	if (!strcmp(path, "missing")) {
		args->missing = true;
		return 0;
	}

	/* Temporary buffers, released by btrfs_put_dev_args_from_path(). */
	args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
	args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
	if (!args->uuid || !args->fsid) {
		btrfs_put_dev_args_from_path(args);
		return -ENOMEM;
	}

	/* Open read-only just long enough to read the superblock. */
	ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
				    &bdev, &disk_super);
	if (ret) {
		btrfs_put_dev_args_from_path(args);
		return ret;
	}

	args->devid = btrfs_stack_device_id(&disk_super->dev_item);
	memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
	/* With METADATA_UUID set, lookups key on metadata_uuid, not fsid. */
	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
		memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
	else
		memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, NULL);
	return 0;
}
24437ba15b7dSStefan Behrens 
24442b82032cSYan Zheng /*
2445faa775c4SJosef Bacik  * Only use this jointly with btrfs_get_dev_args_from_path() because we will
2446faa775c4SJosef Bacik  * allocate our ->uuid and ->fsid pointers, everybody else uses local variables
2447faa775c4SJosef Bacik  * that don't need to be freed.
24485c5c0df0SDavid Sterba  */
btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args * args)2449faa775c4SJosef Bacik void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
2450faa775c4SJosef Bacik {
2451faa775c4SJosef Bacik 	kfree(args->uuid);
2452faa775c4SJosef Bacik 	kfree(args->fsid);
2453faa775c4SJosef Bacik 	args->uuid = NULL;
2454faa775c4SJosef Bacik 	args->fsid = NULL;
2455faa775c4SJosef Bacik }
2456faa775c4SJosef Bacik 
/*
 * Look up a device either by a non-zero devid or by a path specification.
 *
 * @fs_info:	the filesystem to search
 * @devid:	device id to look up, or 0 to use @device_path instead
 * @device_path: path (or "missing") identifying the device when @devid == 0
 *
 * Return: the matching btrfs_device, or an ERR_PTR (-ENOENT if no device
 * matches, or the error from reading the superblock at @device_path).
 */
struct btrfs_device *btrfs_find_device_by_devspec(
		struct btrfs_fs_info *fs_info, u64 devid,
		const char *device_path)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_device *device;
	int ret;

	if (!devid) {
		/* No devid given, resolve the path into lookup args first. */
		ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
		if (ret)
			return ERR_PTR(ret);
		device = btrfs_find_device(fs_info->fs_devices, &args);
		btrfs_put_dev_args_from_path(&args);
	} else {
		args.devid = devid;
		device = btrfs_find_device(fs_info->fs_devices, &args);
	}

	return device ? device : ERR_PTR(-ENOENT);
}
248224e0474bSAnand Jain 
/*
 * Prepare the fs_devices structures needed to sprout a writable filesystem
 * from a seed filesystem.
 *
 * Allocates the private seed fs_devices (the device structs themselves are
 * moved over later by btrfs_setup_sprout()) and registers a clone of the
 * current fs_devices in the global fs_uuids list so the seed fsid stays
 * discoverable.
 *
 * Return: the new seed fs_devices on success, an ERR_PTR on failure.
 */
static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;

	lockdep_assert_held(&uuid_mutex);
	/* Sprouting only makes sense from a seeding filesystem. */
	if (!fs_devices->seeding)
		return ERR_PTR(-EINVAL);

	/*
	 * Private copy of the seed devices, anchored at
	 * fs_info->fs_devices->seed_list
	 */
	seed_devices = alloc_fs_devices(NULL, NULL);
	if (IS_ERR(seed_devices))
		return seed_devices;

	/*
	 * It's necessary to retain a copy of the original seed fs_devices in
	 * fs_uuids so that filesystems which have been seeded can successfully
	 * reference the seed device from open_seed_devices. This also supports
	 * multiple fs seed.
	 */
	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return old_devices;
	}

	list_add(&old_devices->fs_list, &fs_uuids);

	/*
	 * Start from a field-wise copy of the mounted fs_devices, then reset
	 * the list heads and the mutex, which must not be shared with the
	 * original.
	 */
	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	return seed_devices;
}
2523849eae5eSAnand Jain 
/*
 * Splice seed devices into the sprout fs_devices.
 * Generate a new fsid for the sprouted read-write filesystem.
 *
 * @fs_info:	 the filesystem being sprouted (its fs_devices becomes the
 *		 sprout)
 * @seed_devices: the seed fs_devices prepared by btrfs_init_sprout()
 */
static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
			       struct btrfs_fs_devices *seed_devices)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	/*
	 * We are updating the fsid, the thread leading to device_list_add()
	 * could race, so uuid_mutex is needed.
	 */
	lockdep_assert_held(&uuid_mutex);

	/*
	 * The threads listed below may traverse dev_list but can do that without
	 * device_list_mutex:
	 * - All device ops and balance - as we are in btrfs_exclop_start.
	 * - Various dev_list readers - are using RCU.
	 * - btrfs_ioctl_fitrim() - is using RCU.
	 *
	 * For-read threads as below are using device_list_mutex:
	 * - Readonly scrub btrfs_scrub_dev()
	 * - Readonly scrub btrfs_scrub_progress()
	 * - btrfs_get_dev_stats()
	 */
	lockdep_assert_held(&fs_devices->device_list_mutex);

	/* Move all current devices over to the seed copy, RCU-safely. */
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			      synchronize_rcu);
	list_for_each_entry(device, &seed_devices->devices, dev_list)
		device->fs_devices = seed_devices;

	/* The sprout starts empty; counters are rebuilt as devices are added. */
	fs_devices->seeding = false;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->missing_devices = 0;
	fs_devices->rotating = false;
	list_add(&seed_devices->seed_list, &fs_devices->seed_list);

	/* Fresh fsid for the sprout; keep metadata_uuid in sync with it. */
	generate_random_uuid(fs_devices->fsid);
	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);

	/* The sprout is writable, so drop the SEEDING flag from its super. */
	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);
}
25762b82032cSYan Zheng 
/*
 * Store the expected generation for seed devices in device items.
 *
 * Walks every DEV_ITEM in the chunk tree and, for each device that still
 * belongs to a seeding fs_devices, writes the in-memory generation into the
 * on-disk item.
 *
 * Return: 0 on success, negative errno on btree search/iteration failure.
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dev_item *dev_item;
	struct btrfs_device *device;
	struct btrfs_key key;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Start at the first possible DEV_ITEM key. */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_DEV_ITEM_KEY;

	while (1) {
		/* The search may COW tree blocks, so reserve metadata first. */
		btrfs_reserve_chunk_metadata(trans, false);
		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		btrfs_trans_release_chunk_metadata(trans);
		if (ret < 0)
			goto error;

		leaf = path->nodes[0];
next_slot:
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;
			if (ret < 0)
				goto error;
			leaf = path->nodes[0];
			/* Remember where we are and restart the search there. */
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			btrfs_release_path(path);
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		/* Past the last DEV_ITEM: done. */
		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
		    key.type != BTRFS_DEV_ITEM_KEY)
			break;

		dev_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_dev_item);
		args.devid = btrfs_device_id(leaf, dev_item);
		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
				   BTRFS_UUID_SIZE);
		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
				   BTRFS_FSID_SIZE);
		args.uuid = dev_uuid;
		args.fsid = fs_uuid;
		device = btrfs_find_device(fs_info->fs_devices, &args);
		BUG_ON(!device); /* Logic error */

		/* Only items of devices still on a seed need the update. */
		if (device->fs_devices->seeding) {
			btrfs_set_device_generation(leaf, dev_item,
						    device->generation);
			btrfs_mark_buffer_dirty(trans, leaf);
		}

		path->slots[0]++;
		goto next_slot;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}
26542b82032cSYan Zheng 
/*
 * Add a new device at @device_path to the mounted filesystem.
 *
 * @fs_info:	 the filesystem to add the device to
 * @device_path: path to the block device to add
 *
 * Opens the block device, allocates and initializes a btrfs_device for it,
 * links it into fs_devices under device_list_mutex/chunk_mutex, writes the
 * DEV_ITEM and commits.  If the filesystem is a seed, this also performs the
 * sprout: the seed devices are spliced away and the filesystem gets a new
 * fsid (see btrfs_init_sprout()/btrfs_setup_sprout()).
 *
 * Return: 0 on success, negative errno on failure; on failure all in-memory
 * and superblock changes are rolled back via the error_* labels.
 */
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct super_block *sb = fs_info->sb;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *seed_devices = NULL;
	u64 orig_super_total_bytes;
	u64 orig_super_num_devices;
	int ret = 0;
	bool seeding_dev = false;
	bool locked = false;

	/* A read-only mount may only grow by sprouting from a seed. */
	if (sb_rdonly(sb) && !fs_devices->seeding)
		return -EROFS;

	bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE,
				  fs_info->bdev_holder, NULL);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	if (!btrfs_check_device_zone_type(fs_info, bdev)) {
		ret = -EINVAL;
		goto error;
	}

	if (fs_devices->seeding) {
		seeding_dev = true;
		/* Sprouting rewrites the fsid, see btrfs_setup_sprout(). */
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
		locked = true;
	}

	sync_blockdev(bdev);

	/* Reject adding a block device that is already a member. */
	rcu_read_lock();
	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
		if (device->bdev == bdev) {
			ret = -EEXIST;
			rcu_read_unlock();
			goto error;
		}
	}
	rcu_read_unlock();

	device = btrfs_alloc_device(fs_info, NULL, NULL, device_path);
	if (IS_ERR(device)) {
		/* we can safely leave the fs_devices entry around */
		ret = PTR_ERR(device);
		goto error;
	}

	device->fs_info = fs_info;
	device->bdev = bdev;
	ret = lookup_bdev(device_path, &device->devt);
	if (ret)
		goto error_free_device;

	ret = btrfs_get_dev_zone_info(device, false);
	if (ret)
		goto error_free_device;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error_free_zone;
	}

	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	device->generation = trans->transid;
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
	/* Usable size is the device size rounded down to a full sector. */
	device->total_bytes =
		round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
	device->disk_total_bytes = device->total_bytes;
	device->commit_total_bytes = device->total_bytes;
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
	device->holder = fs_info->bdev_holder;
	device->dev_stats_valid = 1;
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);

	if (seeding_dev) {
		/* GFP_KERNEL allocation must not be under device_list_mutex */
		seed_devices = btrfs_init_sprout(fs_info);
		if (IS_ERR(seed_devices)) {
			ret = PTR_ERR(seed_devices);
			btrfs_abort_transaction(trans, ret);
			goto error_trans;
		}
	}

	mutex_lock(&fs_devices->device_list_mutex);
	if (seeding_dev) {
		btrfs_setup_sprout(fs_info, seed_devices);
		btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
						device);
	}

	device->fs_devices = fs_devices;

	/* chunk_mutex protects the counters and allocation list below. */
	mutex_lock(&fs_info->chunk_mutex);
	list_add_rcu(&device->dev_list, &fs_devices->devices);
	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
	fs_devices->num_devices++;
	fs_devices->open_devices++;
	fs_devices->rw_devices++;
	fs_devices->total_devices++;
	fs_devices->total_rw_bytes += device->total_bytes;

	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);

	if (!bdev_nonrot(bdev))
		fs_devices->rotating = true;

	/* Remember the original values so the error path can restore them. */
	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
	btrfs_set_super_total_bytes(fs_info->super_copy,
		round_down(orig_super_total_bytes + device->total_bytes,
			   fs_info->sectorsize));

	orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices + 1);

	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
	btrfs_clear_space_info_full(fs_info);

	mutex_unlock(&fs_info->chunk_mutex);

	/* Add sysfs device entry */
	btrfs_sysfs_add_device(device);

	mutex_unlock(&fs_devices->device_list_mutex);

	if (seeding_dev) {
		mutex_lock(&fs_info->chunk_mutex);
		ret = init_first_rw_device(trans);
		mutex_unlock(&fs_info->chunk_mutex);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}
	}

	ret = btrfs_add_dev_item(trans, device);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error_sysfs;
	}

	if (seeding_dev) {
		ret = btrfs_finish_sprout(trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}

		/*
		 * fs_devices now represents the newly sprouted filesystem and
		 * its fsid has been changed by btrfs_setup_sprout().
		 */
		btrfs_sysfs_update_sprout_fsid(fs_devices);
	}

	ret = btrfs_commit_transaction(trans);

	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
		locked = false;

		if (ret) /* transaction commit */
			return ret;

		ret = btrfs_relocate_sys_chunks(fs_info);
		if (ret < 0)
			btrfs_handle_fs_error(fs_info, ret,
				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			/* -ENOENT means no transaction is left to commit. */
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
			ret = PTR_ERR(trans);
			trans = NULL;
			goto error_sysfs;
		}
		ret = btrfs_commit_transaction(trans);
	}

	/*
	 * Now that we have written a new super block to this device, check all
	 * other fs_devices list if device_path alienates any other scanned
	 * device.
	 * We can ignore the return value as it typically returns -EINVAL and
	 * only succeeds if the device was an alien.
	 */
	btrfs_forget_devices(device->devt);

	/* Update ctime/mtime for blkid or udev */
	update_dev_time(device_path);

	return ret;

error_sysfs:
	/* Undo every in-memory and superblock change made above. */
	btrfs_sysfs_remove_device(device);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_del_rcu(&device->dev_list);
	list_del(&device->dev_alloc_list);
	fs_info->fs_devices->num_devices--;
	fs_info->fs_devices->open_devices--;
	fs_info->fs_devices->rw_devices--;
	fs_info->fs_devices->total_devices--;
	fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
	atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
	btrfs_set_super_total_bytes(fs_info->super_copy,
				    orig_super_total_bytes);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices);
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
	if (trans)
		btrfs_end_transaction(trans);
error_free_zone:
	btrfs_destroy_dev_zone_info(device);
error_free_device:
	btrfs_free_device(device);
error:
	blkdev_put(bdev, fs_info->bdev_holder);
	if (locked) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
	return ret;
}
2897788f20ebSChris Mason 
btrfs_update_device(struct btrfs_trans_handle * trans,struct btrfs_device * device)2898d397712bSChris Mason static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
28990b86a832SChris Mason 					struct btrfs_device *device)
29000b86a832SChris Mason {
29010b86a832SChris Mason 	int ret;
29020b86a832SChris Mason 	struct btrfs_path *path;
29030b246afaSJeff Mahoney 	struct btrfs_root *root = device->fs_info->chunk_root;
29040b86a832SChris Mason 	struct btrfs_dev_item *dev_item;
29050b86a832SChris Mason 	struct extent_buffer *leaf;
29060b86a832SChris Mason 	struct btrfs_key key;
29070b86a832SChris Mason 
29080b86a832SChris Mason 	path = btrfs_alloc_path();
29090b86a832SChris Mason 	if (!path)
29100b86a832SChris Mason 		return -ENOMEM;
29110b86a832SChris Mason 
29120b86a832SChris Mason 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
29130b86a832SChris Mason 	key.type = BTRFS_DEV_ITEM_KEY;
29140b86a832SChris Mason 	key.offset = device->devid;
29150b86a832SChris Mason 
29160b86a832SChris Mason 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
29170b86a832SChris Mason 	if (ret < 0)
29180b86a832SChris Mason 		goto out;
29190b86a832SChris Mason 
29200b86a832SChris Mason 	if (ret > 0) {
29210b86a832SChris Mason 		ret = -ENOENT;
29220b86a832SChris Mason 		goto out;
29230b86a832SChris Mason 	}
29240b86a832SChris Mason 
29250b86a832SChris Mason 	leaf = path->nodes[0];
29260b86a832SChris Mason 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
29270b86a832SChris Mason 
29280b86a832SChris Mason 	btrfs_set_device_id(leaf, dev_item, device->devid);
29290b86a832SChris Mason 	btrfs_set_device_type(leaf, dev_item, device->type);
29300b86a832SChris Mason 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
29310b86a832SChris Mason 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
29320b86a832SChris Mason 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
29337cc8e58dSMiao Xie 	btrfs_set_device_total_bytes(leaf, dev_item,
29347cc8e58dSMiao Xie 				     btrfs_device_get_disk_total_bytes(device));
29357cc8e58dSMiao Xie 	btrfs_set_device_bytes_used(leaf, dev_item,
29367cc8e58dSMiao Xie 				    btrfs_device_get_bytes_used(device));
2937d5e09e38SFilipe Manana 	btrfs_mark_buffer_dirty(trans, leaf);
29380b86a832SChris Mason 
29390b86a832SChris Mason out:
29400b86a832SChris Mason 	btrfs_free_path(path);
29410b86a832SChris Mason 	return ret;
29420b86a832SChris Mason }
29430b86a832SChris Mason 
/*
 * Grow @device to @new_size and update the superblock and DEV_ITEM.
 *
 * @trans:	running transaction
 * @device:	device to grow (must be writable, must not be a replace
 *		target)
 * @new_size:	requested size; rounded down to the sector size, must be
 *		strictly larger than the current total_bytes
 *
 * Return: 0 on success, -EACCES/-EINVAL on invalid device or size, or an
 * errno from updating the device item.
 */
int btrfs_grow_device(struct btrfs_trans_handle *trans,
		      struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	u64 old_total;
	u64 diff;
	int ret;

	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return -EACCES;

	new_size = round_down(new_size, fs_info->sectorsize);

	/* chunk_mutex protects super_copy sizes and total_rw_bytes. */
	mutex_lock(&fs_info->chunk_mutex);
	old_total = btrfs_super_total_bytes(super_copy);
	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);

	/* Only growing is allowed, and never on a replace target. */
	if (new_size <= device->total_bytes ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		mutex_unlock(&fs_info->chunk_mutex);
		return -EINVAL;
	}

	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total + diff, fs_info->sectorsize));
	device->fs_devices->total_rw_bytes += diff;

	btrfs_device_set_total_bytes(device, new_size);
	btrfs_device_set_disk_total_bytes(device, new_size);
	btrfs_clear_space_info_full(device->fs_info);
	/* Queue the device so commit writes back its new size. */
	if (list_empty(&device->post_commit_list))
		list_add_tail(&device->post_commit_list,
			      &trans->transaction->dev_update_list);
	mutex_unlock(&fs_info->chunk_mutex);

	/* Updating the DEV_ITEM may COW chunk tree blocks; reserve first. */
	btrfs_reserve_chunk_metadata(trans, false);
	ret = btrfs_update_device(trans, device);
	btrfs_trans_release_chunk_metadata(trans);

	return ret;
}
29868f18cf13SChris Mason 
/*
 * Delete the chunk item at @chunk_offset from the chunk btree.
 *
 * A missing item is treated as a logic error or corruption: the filesystem is
 * flagged via btrfs_handle_fs_error() and -ENOENT is returned.  Other errors
 * from the search or deletion are passed through.
 */
btrfs_free_chunk(struct btrfs_trans_handle * trans,u64 chunk_offset)2987f4208794SNikolay Borisov static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
29888f18cf13SChris Mason {
2989f4208794SNikolay Borisov 	struct btrfs_fs_info *fs_info = trans->fs_info;
29905b4aacefSJeff Mahoney 	struct btrfs_root *root = fs_info->chunk_root;
29918f18cf13SChris Mason 	int ret;
29928f18cf13SChris Mason 	struct btrfs_path *path;
29938f18cf13SChris Mason 	struct btrfs_key key;
29948f18cf13SChris Mason 
29958f18cf13SChris Mason 	path = btrfs_alloc_path();
29968f18cf13SChris Mason 	if (!path)
29978f18cf13SChris Mason 		return -ENOMEM;
29988f18cf13SChris Mason 
	/* All chunk items live under the fixed chunk tree objectid. */
2999408fbf19SNikolay Borisov 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
30008f18cf13SChris Mason 	key.offset = chunk_offset;
30018f18cf13SChris Mason 	key.type = BTRFS_CHUNK_ITEM_KEY;
30028f18cf13SChris Mason 
30038f18cf13SChris Mason 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
300479787eaaSJeff Mahoney 	if (ret < 0)
300579787eaaSJeff Mahoney 		goto out;
300679787eaaSJeff Mahoney 	else if (ret > 0) { /* Logic error or corruption */
30070b246afaSJeff Mahoney 		btrfs_handle_fs_error(fs_info, -ENOENT,
300879787eaaSJeff Mahoney 				      "Failed lookup while freeing chunk.");
300979787eaaSJeff Mahoney 		ret = -ENOENT;
301079787eaaSJeff Mahoney 		goto out;
301179787eaaSJeff Mahoney 	}
30128f18cf13SChris Mason 
30138f18cf13SChris Mason 	ret = btrfs_del_item(trans, root, path);
301479787eaaSJeff Mahoney 	if (ret < 0)
30150b246afaSJeff Mahoney 		btrfs_handle_fs_error(fs_info, ret,
301679787eaaSJeff Mahoney 				      "Failed to delete chunk item.");
301779787eaaSJeff Mahoney out:
30188f18cf13SChris Mason 	btrfs_free_path(path);
301965a246c5STsutomu Itoh 	return ret;
30208f18cf13SChris Mason }
30218f18cf13SChris Mason 
/*
 * Remove the entry for the chunk at @chunk_offset from the in-memory copy of
 * the superblock's sys_chunk_array, compacting the array in place.
 *
 * The array is a packed sequence of (disk_key, chunk item) pairs, so each
 * entry's length depends on its stripe count.  Returns 0 on success (also
 * when no matching entry is found) or -EIO if a non-chunk key is encountered,
 * which means the array is corrupt.
 */
btrfs_del_sys_chunk(struct btrfs_fs_info * fs_info,u64 chunk_offset)3022408fbf19SNikolay Borisov static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
30238f18cf13SChris Mason {
30240b246afaSJeff Mahoney 	struct btrfs_super_block *super_copy = fs_info->super_copy;
30258f18cf13SChris Mason 	struct btrfs_disk_key *disk_key;
30268f18cf13SChris Mason 	struct btrfs_chunk *chunk;
30278f18cf13SChris Mason 	u8 *ptr;
30288f18cf13SChris Mason 	int ret = 0;
30298f18cf13SChris Mason 	u32 num_stripes;
30308f18cf13SChris Mason 	u32 array_size;
30318f18cf13SChris Mason 	u32 len = 0;
30328f18cf13SChris Mason 	u32 cur;
30338f18cf13SChris Mason 	struct btrfs_key key;
30348f18cf13SChris Mason 
	/* The superblock's system chunk array is protected by chunk_mutex. */
303579bd3712SFilipe Manana 	lockdep_assert_held(&fs_info->chunk_mutex);
30368f18cf13SChris Mason 	array_size = btrfs_super_sys_array_size(super_copy);
30378f18cf13SChris Mason 
30388f18cf13SChris Mason 	ptr = super_copy->sys_chunk_array;
30398f18cf13SChris Mason 	cur = 0;
30408f18cf13SChris Mason 
30418f18cf13SChris Mason 	while (cur < array_size) {
30428f18cf13SChris Mason 		disk_key = (struct btrfs_disk_key *)ptr;
30438f18cf13SChris Mason 		btrfs_disk_key_to_cpu(&key, disk_key);
30448f18cf13SChris Mason 
30458f18cf13SChris Mason 		len = sizeof(*disk_key);
30468f18cf13SChris Mason 
		/* Entry length = key + chunk item sized by its stripe count. */
30478f18cf13SChris Mason 		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
30488f18cf13SChris Mason 			chunk = (struct btrfs_chunk *)(ptr + len);
30498f18cf13SChris Mason 			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
30508f18cf13SChris Mason 			len += btrfs_chunk_item_size(num_stripes);
30518f18cf13SChris Mason 		} else {
30528f18cf13SChris Mason 			ret = -EIO;
30538f18cf13SChris Mason 			break;
30548f18cf13SChris Mason 		}
		/*
		 * On a match, shift the tail of the array down over this
		 * entry and shrink the recorded array size; do not advance
		 * ptr/cur since new bytes now occupy the current position.
		 */
3055408fbf19SNikolay Borisov 		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
30568f18cf13SChris Mason 		    key.offset == chunk_offset) {
30578f18cf13SChris Mason 			memmove(ptr, ptr + len, array_size - (cur + len));
30588f18cf13SChris Mason 			array_size -= len;
30598f18cf13SChris Mason 			btrfs_set_super_sys_array_size(super_copy, array_size);
30608f18cf13SChris Mason 		} else {
30618f18cf13SChris Mason 			ptr += len;
30628f18cf13SChris Mason 			cur += len;
30638f18cf13SChris Mason 		}
30648f18cf13SChris Mason 	}
30658f18cf13SChris Mason 	return ret;
30668f18cf13SChris Mason }
30678f18cf13SChris Mason 
306860ca842eSOmar Sandoval /*
306960ca842eSOmar Sandoval  * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
 * @fs_info: Filesystem to search.
307060ca842eSOmar Sandoval  * @logical: Logical block offset in bytes.
307160ca842eSOmar Sandoval  * @length: Length of extent in bytes.
307260ca842eSOmar Sandoval  *
 * The returned mapping is validated to actually contain @logical; a miss or a
 * bogus mapping is reported with btrfs_crit() and ERR_PTR(-EINVAL) returned.
 *
307360ca842eSOmar Sandoval  * Return: Chunk mapping or ERR_PTR.
307460ca842eSOmar Sandoval  */
btrfs_get_chunk_map(struct btrfs_fs_info * fs_info,u64 logical,u64 length)307560ca842eSOmar Sandoval struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
3076592d92eeSLiu Bo 				       u64 logical, u64 length)
3077592d92eeSLiu Bo {
3078592d92eeSLiu Bo 	struct extent_map_tree *em_tree;
3079592d92eeSLiu Bo 	struct extent_map *em;
3080592d92eeSLiu Bo 
3081c8bf1b67SDavid Sterba 	em_tree = &fs_info->mapping_tree;
3082592d92eeSLiu Bo 	read_lock(&em_tree->lock);
3083592d92eeSLiu Bo 	em = lookup_extent_mapping(em_tree, logical, length);
3084592d92eeSLiu Bo 	read_unlock(&em_tree->lock);
3085592d92eeSLiu Bo 
3086592d92eeSLiu Bo 	if (!em) {
308747ec6065SFilipe Manana 		btrfs_crit(fs_info,
308847ec6065SFilipe Manana 			   "unable to find chunk map for logical %llu length %llu",
3089592d92eeSLiu Bo 			   logical, length);
3090592d92eeSLiu Bo 		return ERR_PTR(-EINVAL);
3091592d92eeSLiu Bo 	}
3092592d92eeSLiu Bo 
	/* Sanity check: the mapping must cover the requested start offset. */
30933952f84eSFilipe Manana 	if (em->start > logical || em->start + em->len <= logical) {
3094592d92eeSLiu Bo 		btrfs_crit(fs_info,
309547ec6065SFilipe Manana 			   "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
309647ec6065SFilipe Manana 			   logical, logical + length, em->start, em->start + em->len);
3097592d92eeSLiu Bo 		free_extent_map(em);
3098592d92eeSLiu Bo 		return ERR_PTR(-EINVAL);
3099592d92eeSLiu Bo 	}
3100592d92eeSLiu Bo 
3101592d92eeSLiu Bo 	/* callers are responsible for dropping em's ref. */
3102592d92eeSLiu Bo 	return em;
3103592d92eeSLiu Bo }
3104592d92eeSLiu Bo 
/*
 * Update the device items for all of @map's stripes and then delete the
 * chunk item at @chunk_offset from the chunk btree.
 *
 * Returns 0 on success or the first error from btrfs_update_device() /
 * btrfs_free_chunk().
 */
remove_chunk_item(struct btrfs_trans_handle * trans,struct map_lookup * map,u64 chunk_offset)310579bd3712SFilipe Manana static int remove_chunk_item(struct btrfs_trans_handle *trans,
310679bd3712SFilipe Manana 			     struct map_lookup *map, u64 chunk_offset)
310779bd3712SFilipe Manana {
310879bd3712SFilipe Manana 	int i;
310979bd3712SFilipe Manana 
311079bd3712SFilipe Manana 	/*
311179bd3712SFilipe Manana 	 * Removing chunk items and updating the device items in the chunks btree
311279bd3712SFilipe Manana 	 * requires holding the chunk_mutex.
311379bd3712SFilipe Manana 	 * See the comment at btrfs_chunk_alloc() for the details.
311479bd3712SFilipe Manana 	 */
311579bd3712SFilipe Manana 	lockdep_assert_held(&trans->fs_info->chunk_mutex);
311679bd3712SFilipe Manana 
311779bd3712SFilipe Manana 	for (i = 0; i < map->num_stripes; i++) {
311879bd3712SFilipe Manana 		int ret;
311979bd3712SFilipe Manana 
312079bd3712SFilipe Manana 		ret = btrfs_update_device(trans, map->stripes[i].dev);
312179bd3712SFilipe Manana 		if (ret)
312279bd3712SFilipe Manana 			return ret;
312379bd3712SFilipe Manana 	}
312479bd3712SFilipe Manana 
312579bd3712SFilipe Manana 	return btrfs_free_chunk(trans, chunk_offset);
312679bd3712SFilipe Manana }
312779bd3712SFilipe Manana 
/*
 * Remove the chunk at @chunk_offset: free its device extents, delete the
 * chunk item (and the superblock sys_chunk_array entry for SYSTEM chunks)
 * and finally remove the backing block group.
 *
 * Lock ordering is critical here: device extents are deleted under
 * device_list_mutex only, while the chunk btree updates happen under
 * chunk_mutex (see the in-line comments below).  Any failure after the
 * device extents have been touched aborts the transaction.
 */
btrfs_remove_chunk(struct btrfs_trans_handle * trans,u64 chunk_offset)312897aff912SNikolay Borisov int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
312947ab2a6cSJosef Bacik {
313097aff912SNikolay Borisov 	struct btrfs_fs_info *fs_info = trans->fs_info;
313147ab2a6cSJosef Bacik 	struct extent_map *em;
313247ab2a6cSJosef Bacik 	struct map_lookup *map;
313347ab2a6cSJosef Bacik 	u64 dev_extent_len = 0;
313447ab2a6cSJosef Bacik 	int i, ret = 0;
31350b246afaSJeff Mahoney 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
313647ab2a6cSJosef Bacik 
313760ca842eSOmar Sandoval 	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
3138592d92eeSLiu Bo 	if (IS_ERR(em)) {
313947ab2a6cSJosef Bacik 		/*
314047ab2a6cSJosef Bacik 		 * This is a logic error, but we don't want to just rely on the
3141bb7ab3b9SAdam Buchbinder 		 * user having built with ASSERT enabled, so if ASSERT doesn't
314247ab2a6cSJosef Bacik 		 * do anything we still error out.
314347ab2a6cSJosef Bacik 		 */
314447ab2a6cSJosef Bacik 		ASSERT(0);
3145592d92eeSLiu Bo 		return PTR_ERR(em);
314647ab2a6cSJosef Bacik 	}
314795617d69SJeff Mahoney 	map = em->map_lookup;
314847ab2a6cSJosef Bacik 
314957ba4cb8SFilipe Manana 	/*
315079bd3712SFilipe Manana 	 * First delete the device extent items from the devices btree.
315179bd3712SFilipe Manana 	 * We take the device_list_mutex to avoid racing with the finishing phase
315279bd3712SFilipe Manana 	 * of a device replace operation. See the comment below before acquiring
315379bd3712SFilipe Manana 	 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
315479bd3712SFilipe Manana 	 * because that can result in a deadlock when deleting the device extent
315579bd3712SFilipe Manana 	 * items from the devices btree - COWing an extent buffer from the btree
315679bd3712SFilipe Manana 	 * may result in allocating a new metadata chunk, which would attempt to
315779bd3712SFilipe Manana 	 * lock again fs_info->chunk_mutex.
315879bd3712SFilipe Manana 	 */
315957ba4cb8SFilipe Manana 	mutex_lock(&fs_devices->device_list_mutex);
316047ab2a6cSJosef Bacik 	for (i = 0; i < map->num_stripes; i++) {
316147ab2a6cSJosef Bacik 		struct btrfs_device *device = map->stripes[i].dev;
316247ab2a6cSJosef Bacik 		ret = btrfs_free_dev_extent(trans, device,
316347ab2a6cSJosef Bacik 					    map->stripes[i].physical,
316447ab2a6cSJosef Bacik 					    &dev_extent_len);
316547ab2a6cSJosef Bacik 		if (ret) {
316657ba4cb8SFilipe Manana 			mutex_unlock(&fs_devices->device_list_mutex);
316766642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
316847ab2a6cSJosef Bacik 			goto out;
316947ab2a6cSJosef Bacik 		}
317047ab2a6cSJosef Bacik 
		/* Return the freed extent's bytes to the device's free space. */
317147ab2a6cSJosef Bacik 		if (device->bytes_used > 0) {
317234441361SDavid Sterba 			mutex_lock(&fs_info->chunk_mutex);
317347ab2a6cSJosef Bacik 			btrfs_device_set_bytes_used(device,
317447ab2a6cSJosef Bacik 					device->bytes_used - dev_extent_len);
3175a5ed45f8SNikolay Borisov 			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
31760b246afaSJeff Mahoney 			btrfs_clear_space_info_full(fs_info);
317734441361SDavid Sterba 			mutex_unlock(&fs_info->chunk_mutex);
317847ab2a6cSJosef Bacik 		}
317979bd3712SFilipe Manana 	}
318057ba4cb8SFilipe Manana 	mutex_unlock(&fs_devices->device_list_mutex);
318179bd3712SFilipe Manana 
318279bd3712SFilipe Manana 	/*
318379bd3712SFilipe Manana 	 * We acquire fs_info->chunk_mutex for 2 reasons:
318479bd3712SFilipe Manana 	 *
318579bd3712SFilipe Manana 	 * 1) Just like with the first phase of the chunk allocation, we must
318679bd3712SFilipe Manana 	 *    reserve system space, do all chunk btree updates and deletions, and
318779bd3712SFilipe Manana 	 *    update the system chunk array in the superblock while holding this
318879bd3712SFilipe Manana 	 *    mutex. This is for similar reasons as explained on the comment at
318979bd3712SFilipe Manana 	 *    the top of btrfs_chunk_alloc();
319079bd3712SFilipe Manana 	 *
319179bd3712SFilipe Manana 	 * 2) Prevent races with the final phase of a device replace operation
319279bd3712SFilipe Manana 	 *    that replaces the device object associated with the map's stripes,
319379bd3712SFilipe Manana 	 *    because the device object's id can change at any time during that
319479bd3712SFilipe Manana 	 *    final phase of the device replace operation
319579bd3712SFilipe Manana 	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
319679bd3712SFilipe Manana 	 *    replaced device and then see it with an ID of
319779bd3712SFilipe Manana 	 *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
319879bd3712SFilipe Manana 	 *    the device item, which does not exist on the chunk btree.
319979bd3712SFilipe Manana 	 *    The finishing phase of device replace acquires both the
320079bd3712SFilipe Manana 	 *    device_list_mutex and the chunk_mutex, in that order, so we are
320179bd3712SFilipe Manana 	 *    safe by just acquiring the chunk_mutex.
320279bd3712SFilipe Manana 	 */
320379bd3712SFilipe Manana 	trans->removing_chunk = true;
320479bd3712SFilipe Manana 	mutex_lock(&fs_info->chunk_mutex);
320579bd3712SFilipe Manana 
320679bd3712SFilipe Manana 	check_system_chunk(trans, map->type);
320779bd3712SFilipe Manana 
320879bd3712SFilipe Manana 	ret = remove_chunk_item(trans, map, chunk_offset);
320979bd3712SFilipe Manana 	/*
321079bd3712SFilipe Manana 	 * Normally we should not get -ENOSPC since we reserved space before
321179bd3712SFilipe Manana 	 * through the call to check_system_chunk().
321279bd3712SFilipe Manana 	 *
321379bd3712SFilipe Manana 	 * Despite our system space_info having enough free space, we may not
321479bd3712SFilipe Manana 	 * be able to allocate extents from its block groups, because all have
321579bd3712SFilipe Manana 	 * an incompatible profile, which will force us to allocate a new system
321679bd3712SFilipe Manana 	 * block group with the right profile, or right after we called
321779bd3712SFilipe Manana 	 * check_system_chunk() above, a scrub turned the only system block group
321879bd3712SFilipe Manana 	 * with enough free space into RO mode.
321979bd3712SFilipe Manana 	 * This is explained with more detail at do_chunk_alloc().
322079bd3712SFilipe Manana 	 *
322179bd3712SFilipe Manana 	 * So if we get -ENOSPC, allocate a new system chunk and retry once.
322279bd3712SFilipe Manana 	 */
322379bd3712SFilipe Manana 	if (ret == -ENOSPC) {
322479bd3712SFilipe Manana 		const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
322579bd3712SFilipe Manana 		struct btrfs_block_group *sys_bg;
322679bd3712SFilipe Manana 
3227f6f39f7aSNikolay Borisov 		sys_bg = btrfs_create_chunk(trans, sys_flags);
322879bd3712SFilipe Manana 		if (IS_ERR(sys_bg)) {
322979bd3712SFilipe Manana 			ret = PTR_ERR(sys_bg);
323066642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
323147ab2a6cSJosef Bacik 			goto out;
323247ab2a6cSJosef Bacik 		}
323357ba4cb8SFilipe Manana 
323479bd3712SFilipe Manana 		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
323547ab2a6cSJosef Bacik 		if (ret) {
323666642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
323747ab2a6cSJosef Bacik 			goto out;
323847ab2a6cSJosef Bacik 		}
323947ab2a6cSJosef Bacik 
324079bd3712SFilipe Manana 		ret = remove_chunk_item(trans, map, chunk_offset);
324179bd3712SFilipe Manana 		if (ret) {
324279bd3712SFilipe Manana 			btrfs_abort_transaction(trans, ret);
324379bd3712SFilipe Manana 			goto out;
324479bd3712SFilipe Manana 		}
324579bd3712SFilipe Manana 	} else if (ret) {
324679bd3712SFilipe Manana 		btrfs_abort_transaction(trans, ret);
324779bd3712SFilipe Manana 		goto out;
324879bd3712SFilipe Manana 	}
324979bd3712SFilipe Manana 
32506bccf3abSJeff Mahoney 	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
325147ab2a6cSJosef Bacik 
	/* SYSTEM chunks are also mirrored in the superblock's chunk array. */
325247ab2a6cSJosef Bacik 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3253408fbf19SNikolay Borisov 		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
325447ab2a6cSJosef Bacik 		if (ret) {
325566642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
325647ab2a6cSJosef Bacik 			goto out;
325747ab2a6cSJosef Bacik 		}
325847ab2a6cSJosef Bacik 	}
325947ab2a6cSJosef Bacik 
326079bd3712SFilipe Manana 	mutex_unlock(&fs_info->chunk_mutex);
326179bd3712SFilipe Manana 	trans->removing_chunk = false;
326279bd3712SFilipe Manana 
326379bd3712SFilipe Manana 	/*
326479bd3712SFilipe Manana 	 * We are done with chunk btree updates and deletions, so release the
326579bd3712SFilipe Manana 	 * system space we previously reserved (with check_system_chunk()).
326679bd3712SFilipe Manana 	 */
326779bd3712SFilipe Manana 	btrfs_trans_release_chunk_metadata(trans);
326879bd3712SFilipe Manana 
32695a98ec01SNikolay Borisov 	ret = btrfs_remove_block_group(trans, chunk_offset, em);
327047ab2a6cSJosef Bacik 	if (ret) {
327166642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
327247ab2a6cSJosef Bacik 		goto out;
327347ab2a6cSJosef Bacik 	}
327447ab2a6cSJosef Bacik 
327547ab2a6cSJosef Bacik out:
	/* Error paths may arrive here still holding the chunk_mutex. */
327679bd3712SFilipe Manana 	if (trans->removing_chunk) {
327779bd3712SFilipe Manana 		mutex_unlock(&fs_info->chunk_mutex);
327879bd3712SFilipe Manana 		trans->removing_chunk = false;
327979bd3712SFilipe Manana 	}
328047ab2a6cSJosef Bacik 	/* once for us */
328147ab2a6cSJosef Bacik 	free_extent_map(em);
32828f18cf13SChris Mason 	return ret;
32838f18cf13SChris Mason }
32848f18cf13SChris Mason 
/*
 * Relocate the chunk at @chunk_offset: move all its extents elsewhere
 * (step one) and then delete the device extents and chunk tree entries
 * (step two) in a fresh transaction.
 *
 * The caller must hold fs_info->reclaim_bgs_lock (see the comment below).
 * Not supported on extent-tree-v2 filesystems yet.
 */
btrfs_relocate_chunk(struct btrfs_fs_info * fs_info,u64 chunk_offset)328518bb8bbfSJohannes Thumshirn int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
32868f18cf13SChris Mason {
32875b4aacefSJeff Mahoney 	struct btrfs_root *root = fs_info->chunk_root;
328819c4d2f9SChris Mason 	struct btrfs_trans_handle *trans;
3289b0643e59SDennis Zhou 	struct btrfs_block_group *block_group;
329001e86008SJohannes Thumshirn 	u64 length;
32918f18cf13SChris Mason 	int ret;
32928f18cf13SChris Mason 
32934b349253SJosef Bacik 	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
32944b349253SJosef Bacik 		btrfs_err(fs_info,
32954b349253SJosef Bacik 			  "relocate: not supported on extent tree v2 yet");
32964b349253SJosef Bacik 		return -EINVAL;
32974b349253SJosef Bacik 	}
32984b349253SJosef Bacik 
329967c5e7d4SFilipe Manana 	/*
330067c5e7d4SFilipe Manana 	 * Prevent races with automatic removal of unused block groups.
330167c5e7d4SFilipe Manana 	 * After we relocate and before we remove the chunk with offset
330267c5e7d4SFilipe Manana 	 * chunk_offset, automatic removal of the block group can kick in,
330367c5e7d4SFilipe Manana 	 * resulting in a failure when calling btrfs_remove_chunk() below.
330467c5e7d4SFilipe Manana 	 *
330567c5e7d4SFilipe Manana 	 * Make sure to acquire this mutex before doing a tree search (dev
330667c5e7d4SFilipe Manana 	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
330767c5e7d4SFilipe Manana 	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
330867c5e7d4SFilipe Manana 	 * we release the path used to search the chunk/dev tree and before
330967c5e7d4SFilipe Manana 	 * the current task acquires this mutex and calls us.
331067c5e7d4SFilipe Manana 	 */
3311f3372065SJohannes Thumshirn 	lockdep_assert_held(&fs_info->reclaim_bgs_lock);
331267c5e7d4SFilipe Manana 
33138f18cf13SChris Mason 	/* step one, relocate all the extents inside this chunk */
	/* Relocation cannot run concurrently with scrub, so pause it. */
33142ff7e61eSJeff Mahoney 	btrfs_scrub_pause(fs_info);
33150b246afaSJeff Mahoney 	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
33162ff7e61eSJeff Mahoney 	btrfs_scrub_continue(fs_info);
33172d82a40aSFilipe Manana 	if (ret) {
33182d82a40aSFilipe Manana 		/*
33192d82a40aSFilipe Manana 		 * If we had a transaction abort, stop all running scrubs.
33202d82a40aSFilipe Manana 		 * See transaction.c:cleanup_transaction() why we do it here.
33212d82a40aSFilipe Manana 		 */
33222d82a40aSFilipe Manana 		if (BTRFS_FS_ERROR(fs_info))
33232d82a40aSFilipe Manana 			btrfs_scrub_cancel(fs_info);
3324a22285a6SYan, Zheng 		return ret;
33252d82a40aSFilipe Manana 	}
33268f18cf13SChris Mason 
	/* Cancel any pending discard work and remember the group's length. */
3327b0643e59SDennis Zhou 	block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3328b0643e59SDennis Zhou 	if (!block_group)
3329b0643e59SDennis Zhou 		return -ENOENT;
3330b0643e59SDennis Zhou 	btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
333101e86008SJohannes Thumshirn 	length = block_group->length;
3332b0643e59SDennis Zhou 	btrfs_put_block_group(block_group);
3333b0643e59SDennis Zhou 
333401e86008SJohannes Thumshirn 	/*
333501e86008SJohannes Thumshirn 	 * On a zoned file system, discard the whole block group, this will
333601e86008SJohannes Thumshirn 	 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
333701e86008SJohannes Thumshirn 	 * resetting the zone fails, don't treat it as a fatal problem from the
333801e86008SJohannes Thumshirn 	 * filesystem's point of view.
333901e86008SJohannes Thumshirn 	 */
334001e86008SJohannes Thumshirn 	if (btrfs_is_zoned(fs_info)) {
334101e86008SJohannes Thumshirn 		ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
334201e86008SJohannes Thumshirn 		if (ret)
334301e86008SJohannes Thumshirn 			btrfs_info(fs_info,
334401e86008SJohannes Thumshirn 				"failed to reset zone %llu after relocation",
334501e86008SJohannes Thumshirn 				chunk_offset);
334601e86008SJohannes Thumshirn 	}
334701e86008SJohannes Thumshirn 
334819c4d2f9SChris Mason 	trans = btrfs_start_trans_remove_block_group(root->fs_info,
334919c4d2f9SChris Mason 						     chunk_offset);
335019c4d2f9SChris Mason 	if (IS_ERR(trans)) {
335119c4d2f9SChris Mason 		ret = PTR_ERR(trans);
335219c4d2f9SChris Mason 		btrfs_handle_fs_error(root->fs_info, ret, NULL);
335319c4d2f9SChris Mason 		return ret;
335419c4d2f9SChris Mason 	}
33555d8eb6feSNaohiro Aota 
335619c4d2f9SChris Mason 	/*
335719c4d2f9SChris Mason 	 * step two, delete the device extents and the
335819c4d2f9SChris Mason 	 * chunk tree entries
335919c4d2f9SChris Mason 	 */
336097aff912SNikolay Borisov 	ret = btrfs_remove_chunk(trans, chunk_offset);
33613a45bb20SJeff Mahoney 	btrfs_end_transaction(trans);
336219c4d2f9SChris Mason 	return ret;
33638f18cf13SChris Mason }
33648f18cf13SChris Mason 
/*
 * Walk the chunk tree backwards and relocate every SYSTEM chunk.
 *
 * Chunks that fail with -ENOSPC are counted and the whole scan is retried
 * once (relocating other chunks may have freed space); if failures persist
 * after the retry, -ENOSPC is returned.  Any other relocation failure is
 * treated as a fatal logic error (BUG_ON).
 */
btrfs_relocate_sys_chunks(struct btrfs_fs_info * fs_info)33652ff7e61eSJeff Mahoney static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
33662b82032cSYan Zheng {
33670b246afaSJeff Mahoney 	struct btrfs_root *chunk_root = fs_info->chunk_root;
33682b82032cSYan Zheng 	struct btrfs_path *path;
33692b82032cSYan Zheng 	struct extent_buffer *leaf;
33702b82032cSYan Zheng 	struct btrfs_chunk *chunk;
33712b82032cSYan Zheng 	struct btrfs_key key;
33722b82032cSYan Zheng 	struct btrfs_key found_key;
33732b82032cSYan Zheng 	u64 chunk_type;
3374ba1bf481SJosef Bacik 	bool retried = false;
3375ba1bf481SJosef Bacik 	int failed = 0;
33762b82032cSYan Zheng 	int ret;
33772b82032cSYan Zheng 
33782b82032cSYan Zheng 	path = btrfs_alloc_path();
33792b82032cSYan Zheng 	if (!path)
33802b82032cSYan Zheng 		return -ENOMEM;
33812b82032cSYan Zheng 
3382ba1bf481SJosef Bacik again:
	/* Start from the highest possible offset and iterate backwards. */
33832b82032cSYan Zheng 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
33842b82032cSYan Zheng 	key.offset = (u64)-1;
33852b82032cSYan Zheng 	key.type = BTRFS_CHUNK_ITEM_KEY;
33862b82032cSYan Zheng 
33872b82032cSYan Zheng 	while (1) {
		/*
		 * reclaim_bgs_lock must be held across the search and the
		 * relocation itself; see btrfs_relocate_chunk().
		 */
3388f3372065SJohannes Thumshirn 		mutex_lock(&fs_info->reclaim_bgs_lock);
33892b82032cSYan Zheng 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
339067c5e7d4SFilipe Manana 		if (ret < 0) {
3391f3372065SJohannes Thumshirn 			mutex_unlock(&fs_info->reclaim_bgs_lock);
33922b82032cSYan Zheng 			goto error;
339367c5e7d4SFilipe Manana 		}
33940d23b34cSDavid Sterba 		if (ret == 0) {
33950d23b34cSDavid Sterba 			/*
33960d23b34cSDavid Sterba 			 * On the first search we would find chunk tree with
33970d23b34cSDavid Sterba 			 * offset -1, which is not possible. On subsequent
33980d23b34cSDavid Sterba 			 * loops this would find an existing item on an invalid
33990d23b34cSDavid Sterba 			 * offset (one less than the previous one, wrong
34000d23b34cSDavid Sterba 			 * alignment and size).
34010d23b34cSDavid Sterba 			 */
34020d23b34cSDavid Sterba 			ret = -EUCLEAN;
3403e42004fdSDominique Martinet 			mutex_unlock(&fs_info->reclaim_bgs_lock);
34040d23b34cSDavid Sterba 			goto error;
34050d23b34cSDavid Sterba 		}
34062b82032cSYan Zheng 
34072b82032cSYan Zheng 		ret = btrfs_previous_item(chunk_root, path, key.objectid,
34082b82032cSYan Zheng 					  key.type);
340967c5e7d4SFilipe Manana 		if (ret)
3410f3372065SJohannes Thumshirn 			mutex_unlock(&fs_info->reclaim_bgs_lock);
34112b82032cSYan Zheng 		if (ret < 0)
34122b82032cSYan Zheng 			goto error;
34132b82032cSYan Zheng 		if (ret > 0)
34142b82032cSYan Zheng 			break;
34152b82032cSYan Zheng 
34162b82032cSYan Zheng 		leaf = path->nodes[0];
34172b82032cSYan Zheng 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
34182b82032cSYan Zheng 
34192b82032cSYan Zheng 		chunk = btrfs_item_ptr(leaf, path->slots[0],
34202b82032cSYan Zheng 				       struct btrfs_chunk);
34212b82032cSYan Zheng 		chunk_type = btrfs_chunk_type(leaf, chunk);
		/* Release the path before relocating to avoid holding locks. */
3422b3b4aa74SDavid Sterba 		btrfs_release_path(path);
34232b82032cSYan Zheng 
34242b82032cSYan Zheng 		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
34250b246afaSJeff Mahoney 			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3426ba1bf481SJosef Bacik 			if (ret == -ENOSPC)
3427ba1bf481SJosef Bacik 				failed++;
342814586651SHIMANGI SARAOGI 			else
342914586651SHIMANGI SARAOGI 				BUG_ON(ret);
34302b82032cSYan Zheng 		}
3431f3372065SJohannes Thumshirn 		mutex_unlock(&fs_info->reclaim_bgs_lock);
34322b82032cSYan Zheng 
34332b82032cSYan Zheng 		if (found_key.offset == 0)
34342b82032cSYan Zheng 			break;
34352b82032cSYan Zheng 		key.offset = found_key.offset - 1;
34362b82032cSYan Zheng 	}
34372b82032cSYan Zheng 	ret = 0;
	/* Retry once: relocating other chunks may have freed up space. */
3438ba1bf481SJosef Bacik 	if (failed && !retried) {
3439ba1bf481SJosef Bacik 		failed = 0;
3440ba1bf481SJosef Bacik 		retried = true;
3441ba1bf481SJosef Bacik 		goto again;
3442fae7f21cSDulshani Gunawardhana 	} else if (WARN_ON(failed && retried)) {
3443ba1bf481SJosef Bacik 		ret = -ENOSPC;
3444ba1bf481SJosef Bacik 	}
34452b82032cSYan Zheng error:
34462b82032cSYan Zheng 	btrfs_free_path(path);
34472b82032cSYan Zheng 	return ret;
34482b82032cSYan Zheng }
34492b82032cSYan Zheng 
3450a6f93c71SLiu Bo /*
 * Make sure a data chunk exists before relocating the data block group at
 * @chunk_offset: if the block group is a data one and the data space_info
 * currently has zero bytes used, force-allocate a fresh data chunk so the
 * relocation has somewhere to copy extents to.
 *
3451a6f93c71SLiu Bo  * return 1 : allocate a data chunk successfully,
3452a6f93c71SLiu Bo  * return <0: errors during allocating a data chunk,
3453a6f93c71SLiu Bo  * return 0 : no need to allocate a data chunk.
3454a6f93c71SLiu Bo  */
btrfs_may_alloc_data_chunk(struct btrfs_fs_info * fs_info,u64 chunk_offset)3455a6f93c71SLiu Bo static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3456a6f93c71SLiu Bo 				      u64 chunk_offset)
3457a6f93c71SLiu Bo {
345832da5386SDavid Sterba 	struct btrfs_block_group *cache;
3459a6f93c71SLiu Bo 	u64 bytes_used;
3460a6f93c71SLiu Bo 	u64 chunk_type;
3461a6f93c71SLiu Bo 
3462a6f93c71SLiu Bo 	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3463a6f93c71SLiu Bo 	ASSERT(cache);
3464a6f93c71SLiu Bo 	chunk_type = cache->flags;
3465a6f93c71SLiu Bo 	btrfs_put_block_group(cache);
3466a6f93c71SLiu Bo 
	/* Only data block groups need this treatment. */
34675ae21692SJohannes Thumshirn 	if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
34685ae21692SJohannes Thumshirn 		return 0;
34695ae21692SJohannes Thumshirn 
3470a6f93c71SLiu Bo 	spin_lock(&fs_info->data_sinfo->lock);
3471a6f93c71SLiu Bo 	bytes_used = fs_info->data_sinfo->bytes_used;
3472a6f93c71SLiu Bo 	spin_unlock(&fs_info->data_sinfo->lock);
3473a6f93c71SLiu Bo 
3474a6f93c71SLiu Bo 	if (!bytes_used) {
3475a6f93c71SLiu Bo 		struct btrfs_trans_handle *trans;
3476a6f93c71SLiu Bo 		int ret;
3477a6f93c71SLiu Bo 
3478a6f93c71SLiu Bo 		trans =	btrfs_join_transaction(fs_info->tree_root);
3479a6f93c71SLiu Bo 		if (IS_ERR(trans))
3480a6f93c71SLiu Bo 			return PTR_ERR(trans);
3481a6f93c71SLiu Bo 
34825ae21692SJohannes Thumshirn 		ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3483a6f93c71SLiu Bo 		btrfs_end_transaction(trans);
3484a6f93c71SLiu Bo 		if (ret < 0)
3485a6f93c71SLiu Bo 			return ret;
3486a6f93c71SLiu Bo 		return 1;
3487a6f93c71SLiu Bo 	}
34885ae21692SJohannes Thumshirn 
3489a6f93c71SLiu Bo 	return 0;
3490a6f93c71SLiu Bo }
3491a6f93c71SLiu Bo 
/*
 * Persist the balance control @bctl as a balance item in the tree root so
 * an interrupted balance can be resumed after a mount.
 *
 * Runs in its own transaction which is committed before returning; the
 * commit error is returned if the insertion itself succeeded.
 */
insert_balance_item(struct btrfs_fs_info * fs_info,struct btrfs_balance_control * bctl)34926bccf3abSJeff Mahoney static int insert_balance_item(struct btrfs_fs_info *fs_info,
34930940ebf6SIlya Dryomov 			       struct btrfs_balance_control *bctl)
34940940ebf6SIlya Dryomov {
34956bccf3abSJeff Mahoney 	struct btrfs_root *root = fs_info->tree_root;
34960940ebf6SIlya Dryomov 	struct btrfs_trans_handle *trans;
34970940ebf6SIlya Dryomov 	struct btrfs_balance_item *item;
34980940ebf6SIlya Dryomov 	struct btrfs_disk_balance_args disk_bargs;
34990940ebf6SIlya Dryomov 	struct btrfs_path *path;
35000940ebf6SIlya Dryomov 	struct extent_buffer *leaf;
35010940ebf6SIlya Dryomov 	struct btrfs_key key;
35020940ebf6SIlya Dryomov 	int ret, err;
35030940ebf6SIlya Dryomov 
35040940ebf6SIlya Dryomov 	path = btrfs_alloc_path();
35050940ebf6SIlya Dryomov 	if (!path)
35060940ebf6SIlya Dryomov 		return -ENOMEM;
35070940ebf6SIlya Dryomov 
35080940ebf6SIlya Dryomov 	trans = btrfs_start_transaction(root, 0);
35090940ebf6SIlya Dryomov 	if (IS_ERR(trans)) {
35100940ebf6SIlya Dryomov 		btrfs_free_path(path);
35110940ebf6SIlya Dryomov 		return PTR_ERR(trans);
35120940ebf6SIlya Dryomov 	}
35130940ebf6SIlya Dryomov 
	/* Balance state lives at a fixed, well-known key in the tree root. */
35140940ebf6SIlya Dryomov 	key.objectid = BTRFS_BALANCE_OBJECTID;
3515c479cb4fSDavid Sterba 	key.type = BTRFS_TEMPORARY_ITEM_KEY;
35160940ebf6SIlya Dryomov 	key.offset = 0;
35170940ebf6SIlya Dryomov 
35180940ebf6SIlya Dryomov 	ret = btrfs_insert_empty_item(trans, root, path, &key,
35190940ebf6SIlya Dryomov 				      sizeof(*item));
35200940ebf6SIlya Dryomov 	if (ret)
35210940ebf6SIlya Dryomov 		goto out;
35220940ebf6SIlya Dryomov 
35230940ebf6SIlya Dryomov 	leaf = path->nodes[0];
35240940ebf6SIlya Dryomov 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
35250940ebf6SIlya Dryomov 
3526b159fa28SDavid Sterba 	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
35270940ebf6SIlya Dryomov 
	/* Convert each per-type arg set to its on-disk format and store it. */
35280940ebf6SIlya Dryomov 	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
35290940ebf6SIlya Dryomov 	btrfs_set_balance_data(leaf, item, &disk_bargs);
35300940ebf6SIlya Dryomov 	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
35310940ebf6SIlya Dryomov 	btrfs_set_balance_meta(leaf, item, &disk_bargs);
35320940ebf6SIlya Dryomov 	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
35330940ebf6SIlya Dryomov 	btrfs_set_balance_sys(leaf, item, &disk_bargs);
35340940ebf6SIlya Dryomov 
35350940ebf6SIlya Dryomov 	btrfs_set_balance_flags(leaf, item, bctl->flags);
35360940ebf6SIlya Dryomov 
3537d5e09e38SFilipe Manana 	btrfs_mark_buffer_dirty(trans, leaf);
35380940ebf6SIlya Dryomov out:
35390940ebf6SIlya Dryomov 	btrfs_free_path(path);
	/* Commit unconditionally; report commit error only if insert was ok. */
35403a45bb20SJeff Mahoney 	err = btrfs_commit_transaction(trans);
35410940ebf6SIlya Dryomov 	if (err && !ret)
35420940ebf6SIlya Dryomov 		ret = err;
35430940ebf6SIlya Dryomov 	return ret;
35440940ebf6SIlya Dryomov }
35450940ebf6SIlya Dryomov 
/*
 * Delete the persisted balance item from the tree root, used when a balance
 * completes or is canceled.
 *
 * Returns -ENOENT if no balance item exists; otherwise the result of the
 * deletion, or the transaction commit error if the deletion succeeded.
 */
del_balance_item(struct btrfs_fs_info * fs_info)35466bccf3abSJeff Mahoney static int del_balance_item(struct btrfs_fs_info *fs_info)
35470940ebf6SIlya Dryomov {
35486bccf3abSJeff Mahoney 	struct btrfs_root *root = fs_info->tree_root;
35490940ebf6SIlya Dryomov 	struct btrfs_trans_handle *trans;
35500940ebf6SIlya Dryomov 	struct btrfs_path *path;
35510940ebf6SIlya Dryomov 	struct btrfs_key key;
35520940ebf6SIlya Dryomov 	int ret, err;
35530940ebf6SIlya Dryomov 
35540940ebf6SIlya Dryomov 	path = btrfs_alloc_path();
35550940ebf6SIlya Dryomov 	if (!path)
35560940ebf6SIlya Dryomov 		return -ENOMEM;
35570940ebf6SIlya Dryomov 
35583502a8c0SDavid Sterba 	trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
35590940ebf6SIlya Dryomov 	if (IS_ERR(trans)) {
35600940ebf6SIlya Dryomov 		btrfs_free_path(path);
35610940ebf6SIlya Dryomov 		return PTR_ERR(trans);
35620940ebf6SIlya Dryomov 	}
35630940ebf6SIlya Dryomov 
	/* Same fixed key used by insert_balance_item(). */
35640940ebf6SIlya Dryomov 	key.objectid = BTRFS_BALANCE_OBJECTID;
3565c479cb4fSDavid Sterba 	key.type = BTRFS_TEMPORARY_ITEM_KEY;
35660940ebf6SIlya Dryomov 	key.offset = 0;
35670940ebf6SIlya Dryomov 
35680940ebf6SIlya Dryomov 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
35690940ebf6SIlya Dryomov 	if (ret < 0)
35700940ebf6SIlya Dryomov 		goto out;
35710940ebf6SIlya Dryomov 	if (ret > 0) {
35720940ebf6SIlya Dryomov 		ret = -ENOENT;
35730940ebf6SIlya Dryomov 		goto out;
35740940ebf6SIlya Dryomov 	}
35750940ebf6SIlya Dryomov 
35760940ebf6SIlya Dryomov 	ret = btrfs_del_item(trans, root, path);
35770940ebf6SIlya Dryomov out:
35780940ebf6SIlya Dryomov 	btrfs_free_path(path);
	/* Commit unconditionally; report commit error only if delete was ok. */
35793a45bb20SJeff Mahoney 	err = btrfs_commit_transaction(trans);
35800940ebf6SIlya Dryomov 	if (err && !ret)
35810940ebf6SIlya Dryomov 		ret = err;
35820940ebf6SIlya Dryomov 	return ret;
35830940ebf6SIlya Dryomov }
35840940ebf6SIlya Dryomov 
3585c9e9f97bSIlya Dryomov /*
358659641015SIlya Dryomov  * This is a heuristic used to reduce the number of chunks balanced on
358759641015SIlya Dryomov  * resume after balance was interrupted.
358859641015SIlya Dryomov  */
update_balance_args(struct btrfs_balance_control * bctl)358959641015SIlya Dryomov static void update_balance_args(struct btrfs_balance_control *bctl)
359059641015SIlya Dryomov {
359159641015SIlya Dryomov 	/*
359259641015SIlya Dryomov 	 * Turn on soft mode for chunk types that were being converted.
359359641015SIlya Dryomov 	 */
359459641015SIlya Dryomov 	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
359559641015SIlya Dryomov 		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
359659641015SIlya Dryomov 	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
359759641015SIlya Dryomov 		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
359859641015SIlya Dryomov 	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
359959641015SIlya Dryomov 		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
360059641015SIlya Dryomov 
360159641015SIlya Dryomov 	/*
360259641015SIlya Dryomov 	 * Turn on usage filter if is not already used.  The idea is
360359641015SIlya Dryomov 	 * that chunks that we have already balanced should be
360459641015SIlya Dryomov 	 * reasonably full.  Don't do it for chunks that are being
360559641015SIlya Dryomov 	 * converted - that will keep us from relocating unconverted
360659641015SIlya Dryomov 	 * (albeit full) chunks.
360759641015SIlya Dryomov 	 */
360859641015SIlya Dryomov 	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3609bc309467SDavid Sterba 	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
361059641015SIlya Dryomov 	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
361159641015SIlya Dryomov 		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
361259641015SIlya Dryomov 		bctl->data.usage = 90;
361359641015SIlya Dryomov 	}
361459641015SIlya Dryomov 	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3615bc309467SDavid Sterba 	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
361659641015SIlya Dryomov 	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
361759641015SIlya Dryomov 		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
361859641015SIlya Dryomov 		bctl->sys.usage = 90;
361959641015SIlya Dryomov 	}
362059641015SIlya Dryomov 	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3621bc309467SDavid Sterba 	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
362259641015SIlya Dryomov 	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
362359641015SIlya Dryomov 		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
362459641015SIlya Dryomov 		bctl->meta.usage = 90;
362559641015SIlya Dryomov 	}
362659641015SIlya Dryomov }
362759641015SIlya Dryomov 
/*
 * Clear the balance status in fs_info and delete the balance item from disk.
 *
 * Any error from deleting the on-disk item is escalated via
 * btrfs_handle_fs_error() since the in-memory state is already gone.
 */
static void reset_balance_state(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	int ret;

	BUG_ON(!fs_info->balance_ctl);

	/* Detach the control structure under the lock before freeing it. */
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = NULL;
	spin_unlock(&fs_info->balance_lock);

	kfree(bctl);
	ret = del_balance_item(fs_info);
	if (ret)
		btrfs_handle_fs_error(fs_info, ret, NULL);
}
3647c9e9f97bSIlya Dryomov 
3648ed25e9b2SIlya Dryomov /*
3649ed25e9b2SIlya Dryomov  * Balance filters.  Return 1 if chunk should be filtered out
3650ed25e9b2SIlya Dryomov  * (should not be balanced).
3651ed25e9b2SIlya Dryomov  */
chunk_profiles_filter(u64 chunk_type,struct btrfs_balance_args * bargs)3652899c81eaSIlya Dryomov static int chunk_profiles_filter(u64 chunk_type,
3653ed25e9b2SIlya Dryomov 				 struct btrfs_balance_args *bargs)
3654ed25e9b2SIlya Dryomov {
3655899c81eaSIlya Dryomov 	chunk_type = chunk_to_extended(chunk_type) &
3656899c81eaSIlya Dryomov 				BTRFS_EXTENDED_PROFILE_MASK;
3657ed25e9b2SIlya Dryomov 
3658899c81eaSIlya Dryomov 	if (bargs->profiles & chunk_type)
3659ed25e9b2SIlya Dryomov 		return 0;
3660ed25e9b2SIlya Dryomov 
3661ed25e9b2SIlya Dryomov 	return 1;
3662ed25e9b2SIlya Dryomov }
3663ed25e9b2SIlya Dryomov 
/*
 * Usage-range filter: keep (return 0) the chunk when the block group's used
 * byte count falls inside [usage_min%, usage_max%) of its length, otherwise
 * filter it out (return 1).
 */
static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group *bg;
	u64 used;
	u64 min_thresh;
	u64 max_thresh;
	int filtered;

	bg = btrfs_lookup_block_group(fs_info, chunk_offset);
	used = bg->used;

	min_thresh = bargs->usage_min == 0 ?
			0 : mult_perc(bg->length, bargs->usage_min);

	if (bargs->usage_max == 0)
		max_thresh = 1;
	else if (bargs->usage_max > 100)
		max_thresh = bg->length;
	else
		max_thresh = mult_perc(bg->length, bargs->usage_max);

	filtered = !(min_thresh <= used && used < max_thresh);

	btrfs_put_block_group(bg);
	return filtered;
}
3694bc309467SDavid Sterba 
/*
 * Single-value usage filter: keep (return 0) the chunk when the block
 * group's used byte count is below usage% of its length, otherwise filter
 * it out (return 1).
 */
static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
		u64 chunk_offset, struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group *cache;
	u64 chunk_used, user_thresh;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = cache->used;

	/*
	 * NOTE(review): the zero test reads ->usage_min while the threshold
	 * math reads ->usage - presumably the two fields share storage in
	 * struct btrfs_balance_args; confirm against the UAPI definition
	 * before touching this.
	 */
	if (bargs->usage_min == 0)
		user_thresh = 1;
	else if (bargs->usage > 100)
		/* More than 100% means no threshold - everything is "used". */
		user_thresh = cache->length;
	else
		user_thresh = mult_perc(cache->length, bargs->usage);

	if (chunk_used < user_thresh)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}
37185ce5b3c0SIlya Dryomov 
chunk_devid_filter(struct extent_buffer * leaf,struct btrfs_chunk * chunk,struct btrfs_balance_args * bargs)3719409d404bSIlya Dryomov static int chunk_devid_filter(struct extent_buffer *leaf,
3720409d404bSIlya Dryomov 			      struct btrfs_chunk *chunk,
3721409d404bSIlya Dryomov 			      struct btrfs_balance_args *bargs)
3722409d404bSIlya Dryomov {
3723409d404bSIlya Dryomov 	struct btrfs_stripe *stripe;
3724409d404bSIlya Dryomov 	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3725409d404bSIlya Dryomov 	int i;
3726409d404bSIlya Dryomov 
3727409d404bSIlya Dryomov 	for (i = 0; i < num_stripes; i++) {
3728409d404bSIlya Dryomov 		stripe = btrfs_stripe_nr(chunk, i);
3729409d404bSIlya Dryomov 		if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3730409d404bSIlya Dryomov 			return 0;
3731409d404bSIlya Dryomov 	}
3732409d404bSIlya Dryomov 
3733409d404bSIlya Dryomov 	return 1;
3734409d404bSIlya Dryomov }
3735409d404bSIlya Dryomov 
/*
 * Number of data-carrying stripes for a chunk of the given block group
 * @type with @num_stripes total stripes: parity stripes are subtracted and
 * the remainder divided by the copy count of the profile.
 */
static u64 calc_data_stripes(u64 type, int num_stripes)
{
	const int index = btrfs_bg_flags_to_raid_index(type);

	return (num_stripes - btrfs_raid_array[index].nparity) /
	       btrfs_raid_array[index].ncopies;
}
3744946c9256SDavid Sterba 
374594e60d5aSIlya Dryomov /* [pstart, pend) */
chunk_drange_filter(struct extent_buffer * leaf,struct btrfs_chunk * chunk,struct btrfs_balance_args * bargs)374694e60d5aSIlya Dryomov static int chunk_drange_filter(struct extent_buffer *leaf,
374794e60d5aSIlya Dryomov 			       struct btrfs_chunk *chunk,
374894e60d5aSIlya Dryomov 			       struct btrfs_balance_args *bargs)
374994e60d5aSIlya Dryomov {
375094e60d5aSIlya Dryomov 	struct btrfs_stripe *stripe;
375194e60d5aSIlya Dryomov 	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
375294e60d5aSIlya Dryomov 	u64 stripe_offset;
375394e60d5aSIlya Dryomov 	u64 stripe_length;
3754946c9256SDavid Sterba 	u64 type;
375594e60d5aSIlya Dryomov 	int factor;
375694e60d5aSIlya Dryomov 	int i;
375794e60d5aSIlya Dryomov 
375894e60d5aSIlya Dryomov 	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
375994e60d5aSIlya Dryomov 		return 0;
376094e60d5aSIlya Dryomov 
3761946c9256SDavid Sterba 	type = btrfs_chunk_type(leaf, chunk);
3762946c9256SDavid Sterba 	factor = calc_data_stripes(type, num_stripes);
376394e60d5aSIlya Dryomov 
376494e60d5aSIlya Dryomov 	for (i = 0; i < num_stripes; i++) {
376594e60d5aSIlya Dryomov 		stripe = btrfs_stripe_nr(chunk, i);
376694e60d5aSIlya Dryomov 		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
376794e60d5aSIlya Dryomov 			continue;
376894e60d5aSIlya Dryomov 
376994e60d5aSIlya Dryomov 		stripe_offset = btrfs_stripe_offset(leaf, stripe);
377094e60d5aSIlya Dryomov 		stripe_length = btrfs_chunk_length(leaf, chunk);
3771b8b93addSDavid Sterba 		stripe_length = div_u64(stripe_length, factor);
377294e60d5aSIlya Dryomov 
377394e60d5aSIlya Dryomov 		if (stripe_offset < bargs->pend &&
377494e60d5aSIlya Dryomov 		    stripe_offset + stripe_length > bargs->pstart)
377594e60d5aSIlya Dryomov 			return 0;
377694e60d5aSIlya Dryomov 	}
377794e60d5aSIlya Dryomov 
377894e60d5aSIlya Dryomov 	return 1;
377994e60d5aSIlya Dryomov }
378094e60d5aSIlya Dryomov 
3781ea67176aSIlya Dryomov /* [vstart, vend) */
chunk_vrange_filter(struct extent_buffer * leaf,struct btrfs_chunk * chunk,u64 chunk_offset,struct btrfs_balance_args * bargs)3782ea67176aSIlya Dryomov static int chunk_vrange_filter(struct extent_buffer *leaf,
3783ea67176aSIlya Dryomov 			       struct btrfs_chunk *chunk,
3784ea67176aSIlya Dryomov 			       u64 chunk_offset,
3785ea67176aSIlya Dryomov 			       struct btrfs_balance_args *bargs)
3786ea67176aSIlya Dryomov {
3787ea67176aSIlya Dryomov 	if (chunk_offset < bargs->vend &&
3788ea67176aSIlya Dryomov 	    chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3789ea67176aSIlya Dryomov 		/* at least part of the chunk is inside this vrange */
3790ea67176aSIlya Dryomov 		return 0;
3791ea67176aSIlya Dryomov 
3792ea67176aSIlya Dryomov 	return 1;
3793ea67176aSIlya Dryomov }
3794ea67176aSIlya Dryomov 
chunk_stripes_range_filter(struct extent_buffer * leaf,struct btrfs_chunk * chunk,struct btrfs_balance_args * bargs)3795dee32d0aSGabríel Arthúr Pétursson static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3796dee32d0aSGabríel Arthúr Pétursson 			       struct btrfs_chunk *chunk,
3797dee32d0aSGabríel Arthúr Pétursson 			       struct btrfs_balance_args *bargs)
3798dee32d0aSGabríel Arthúr Pétursson {
3799dee32d0aSGabríel Arthúr Pétursson 	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3800dee32d0aSGabríel Arthúr Pétursson 
3801dee32d0aSGabríel Arthúr Pétursson 	if (bargs->stripes_min <= num_stripes
3802dee32d0aSGabríel Arthúr Pétursson 			&& num_stripes <= bargs->stripes_max)
3803dee32d0aSGabríel Arthúr Pétursson 		return 0;
3804dee32d0aSGabríel Arthúr Pétursson 
3805dee32d0aSGabríel Arthúr Pétursson 	return 1;
3806dee32d0aSGabríel Arthúr Pétursson }
3807dee32d0aSGabríel Arthúr Pétursson 
/*
 * Soft-convert filter: when a convert target is set, filter out (return 1)
 * chunks whose extended profile already matches the target, so only the
 * unconverted ones are relocated.
 */
static int chunk_soft_convert_filter(u64 chunk_type,
				     struct btrfs_balance_args *bargs)
{
	u64 extended_type;

	/* Without a convert target there is nothing to skip. */
	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
		return 0;

	extended_type = chunk_to_extended(chunk_type) &
			BTRFS_EXTENDED_PROFILE_MASK;

	return (bargs->target == extended_type) ? 1 : 0;
}
3822cfa4c961SIlya Dryomov 
/*
 * Decide whether a chunk should be relocated by the current balance run.
 *
 * Applies the balance filters configured for the chunk's type (data,
 * metadata or system) in order; the first filter that rejects the chunk
 * short-circuits.  Returns 1 if the chunk should be balanced, 0 if it is
 * filtered out.
 *
 * Note: the limit filters decrement counters in the balance args, so this
 * function has side effects and repeated calls for the same chunk are not
 * idempotent.
 */
static int should_balance_chunk(struct extent_buffer *leaf,
				struct btrfs_chunk *chunk, u64 chunk_offset)
{
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_balance_args *bargs = NULL;
	u64 chunk_type = btrfs_chunk_type(leaf, chunk);

	/* type filter */
	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
		return 0;
	}

	/* Pick the per-type filter arguments matching this chunk. */
	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
		bargs = &bctl->data;
	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
		bargs = &bctl->sys;
	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
		bargs = &bctl->meta;

	/* profiles filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
	    chunk_profiles_filter(chunk_type, bargs)) {
		return 0;
	}

	/* usage filter (single value and range variants are exclusive) */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	}

	/* devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
	    chunk_devid_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* drange filter, makes sense only with devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
	    chunk_drange_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* vrange filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
		return 0;
	}

	/* stripes filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* soft profile changing mode */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
	    chunk_soft_convert_filter(chunk_type, bargs)) {
		return 0;
	}

	/*
	 * limited by count, must be the last filter because it consumes the
	 * remaining budget only for chunks that passed everything above
	 */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
		if (bargs->limit == 0)
			return 0;
		else
			bargs->limit--;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
		/*
		 * Same logic as the 'limit' filter; the minimum cannot be
		 * determined here because we do not have the global information
		 * about the count of all chunks that satisfy the filters.
		 */
		if (bargs->limit_max == 0)
			return 0;
		else
			bargs->limit_max--;
	}

	return 1;
}
3911f43ffb60SIlya Dryomov 
/*
 * Main balance loop.  Walks the chunk tree backwards (highest chunk offset
 * first) in two passes:
 *
 * 1) counting pass - apply the filters to every chunk, only updating the
 *    stat counters and the per-type expected counts;
 * 2) balancing pass - re-apply the filters with the limit values restored
 *    from the saved copies and relocate each chunk that passes.
 *
 * Returns 0 on success, -ECANCELED when a pause or cancel was requested,
 * -ENOSPC if any relocation failed with ENOSPC, or another negative errno.
 */
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	u64 chunk_type;
	struct btrfs_chunk *chunk;
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int slot;
	int ret;
	int enospc_errors = 0;
	bool counting = true;
	/*
	 * The single value limit and min/max limits use the same bytes in
	 * the balance args; save them so the counting pass (which decrements
	 * them) does not corrupt the values needed by the balancing pass.
	 */
	u64 limit_data = bctl->data.limit;
	u64 limit_meta = bctl->meta.limit;
	u64 limit_sys = bctl->sys.limit;
	u32 count_data = 0;
	u32 count_meta = 0;
	u32 count_sys = 0;
	int chunk_reserved = 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}

	/* zero out stat counters */
	spin_lock(&fs_info->balance_lock);
	memset(&bctl->stat, 0, sizeof(bctl->stat));
	spin_unlock(&fs_info->balance_lock);
again:
	if (!counting) {
		/*
		 * Second pass: restore the limit values consumed by the
		 * counting pass (see the comment at the declarations above).
		 */
		bctl->data.limit = limit_data;
		bctl->meta.limit = limit_meta;
		bctl->sys.limit = limit_sys;
	}
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		/* Pause only interrupts the balancing pass; cancel stops both. */
		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
		    atomic_read(&fs_info->balance_cancel_req)) {
			ret = -ECANCELED;
			goto error;
		}

		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto error;
		}

		/*
		 * this shouldn't happen, it means the last relocate
		 * failed
		 */
		if (ret == 0)
			BUG(); /* FIXME break ? */

		ret = btrfs_previous_item(chunk_root, path, 0,
					  BTRFS_CHUNK_ITEM_KEY);
		if (ret) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			ret = 0;
			break;
		}

		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid != key.objectid) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			break;
		}

		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);

		if (!counting) {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.considered++;
			spin_unlock(&fs_info->balance_lock);
		}

		ret = should_balance_chunk(leaf, chunk, found_key.offset);

		btrfs_release_path(path);
		if (!ret) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto loop;
		}

		if (counting) {
			/* First pass: only count chunks that pass the filters. */
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			spin_lock(&fs_info->balance_lock);
			bctl->stat.expected++;
			spin_unlock(&fs_info->balance_lock);

			if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				count_data++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				count_sys++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				count_meta++;

			goto loop;
		}

		/*
		 * Apply limit_min filter, no need to check if the LIMITS
		 * filter is used, limit_min is 0 by default
		 */
		if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
					count_data < bctl->data.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
					count_meta < bctl->meta.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
					count_sys < bctl->sys.limit_min)) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto loop;
		}

		if (!chunk_reserved) {
			/*
			 * We may be relocating the only data chunk we have,
			 * which could potentially end up with losing data's
			 * raid profile, so lets allocate an empty one in
			 * advance.
			 */
			ret = btrfs_may_alloc_data_chunk(fs_info,
							 found_key.offset);
			if (ret < 0) {
				mutex_unlock(&fs_info->reclaim_bgs_lock);
				goto error;
			} else if (ret == 1) {
				chunk_reserved = 1;
			}
		}

		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
		mutex_unlock(&fs_info->reclaim_bgs_lock);
		if (ret == -ENOSPC) {
			/* Tally but keep going; reported after the loop. */
			enospc_errors++;
		} else if (ret == -ETXTBSY) {
			btrfs_info(fs_info,
	   "skipping relocation of block group %llu due to active swapfile",
				   found_key.offset);
			ret = 0;
		} else if (ret) {
			goto error;
		} else {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.completed++;
			spin_unlock(&fs_info->balance_lock);
		}
loop:
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}

	/* Counting pass finished - restart the walk as the balancing pass. */
	if (counting) {
		btrfs_release_path(path);
		counting = false;
		goto again;
	}
error:
	btrfs_free_path(path);
	if (enospc_errors) {
		btrfs_info(fs_info, "%d enospc errors during balance",
			   enospc_errors);
		if (!ret)
			ret = -ENOSPC;
	}

	return ret;
}
4099ec44a35cSChris Mason 
410043dd529aSDavid Sterba /*
410143dd529aSDavid Sterba  * See if a given profile is valid and reduced.
410243dd529aSDavid Sterba  *
41030c460c0dSIlya Dryomov  * @flags:     profile to validate
41040c460c0dSIlya Dryomov  * @extended:  if true @flags is treated as an extended profile
41050c460c0dSIlya Dryomov  */
alloc_profile_is_valid(u64 flags,int extended)41060c460c0dSIlya Dryomov static int alloc_profile_is_valid(u64 flags, int extended)
41070c460c0dSIlya Dryomov {
41080c460c0dSIlya Dryomov 	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
41090c460c0dSIlya Dryomov 			       BTRFS_BLOCK_GROUP_PROFILE_MASK);
41100c460c0dSIlya Dryomov 
41110c460c0dSIlya Dryomov 	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
41120c460c0dSIlya Dryomov 
41130c460c0dSIlya Dryomov 	/* 1) check that all other bits are zeroed */
41140c460c0dSIlya Dryomov 	if (flags & ~mask)
41150c460c0dSIlya Dryomov 		return 0;
41160c460c0dSIlya Dryomov 
41170c460c0dSIlya Dryomov 	/* 2) see if profile is reduced */
41180c460c0dSIlya Dryomov 	if (flags == 0)
41190c460c0dSIlya Dryomov 		return !extended; /* "0" is valid for usual profiles */
41200c460c0dSIlya Dryomov 
4121c1499166SDavid Sterba 	return has_single_bit_set(flags);
41220c460c0dSIlya Dryomov }
41230c460c0dSIlya Dryomov 
41245ba366c3SDavid Sterba /*
41255ba366c3SDavid Sterba  * Validate target profile against allowed profiles and return true if it's OK.
41265ba366c3SDavid Sterba  * Otherwise print the error message and return false.
41275ba366c3SDavid Sterba  */
validate_convert_profile(struct btrfs_fs_info * fs_info,const struct btrfs_balance_args * bargs,u64 allowed,const char * type)41285ba366c3SDavid Sterba static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
41295ba366c3SDavid Sterba 		const struct btrfs_balance_args *bargs,
41305ba366c3SDavid Sterba 		u64 allowed, const char *type)
4131bdcd3c97SAlexandru Moise {
41325ba366c3SDavid Sterba 	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
41335ba366c3SDavid Sterba 		return true;
41345ba366c3SDavid Sterba 
41355ba366c3SDavid Sterba 	/* Profile is valid and does not have bits outside of the allowed set */
41365ba366c3SDavid Sterba 	if (alloc_profile_is_valid(bargs->target, 1) &&
41375ba366c3SDavid Sterba 	    (bargs->target & ~allowed) == 0)
41385ba366c3SDavid Sterba 		return true;
41395ba366c3SDavid Sterba 
41405ba366c3SDavid Sterba 	btrfs_err(fs_info, "balance: invalid convert %s profile %s",
41415ba366c3SDavid Sterba 			type, btrfs_bg_type_to_raid_name(bargs->target));
41425ba366c3SDavid Sterba 	return false;
4143bdcd3c97SAlexandru Moise }
4144bdcd3c97SAlexandru Moise 
/*
 * Fill @buf with textual description of balance filter flags @bargs, up to
 * @size_buf including the terminating null. The output may be trimmed if it
 * does not fit into the provided buffer.
 */
static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
				 u32 size_buf)
{
	int ret;
	/* Remaining space in the output buffer. */
	u32 size_bp = size_buf;
	/* Current write position in the output buffer. */
	char *bp = buf;
	u64 flags = bargs->flags;
	char tmp_buf[128] = {'\0'};

	/* No filter flags set: leave the buffer untouched. */
	if (!flags)
		return;

/*
 * Helpers appending to the buffer; all jump to out_overflow on snprintf
 * error or when the formatted text would not fit in the remaining space.
 */
#define CHECK_APPEND_NOARG(a)						\
	do {								\
		ret = snprintf(bp, size_bp, (a));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

#define CHECK_APPEND_1ARG(a, v1)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

#define CHECK_APPEND_2ARG(a, v1, v2)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1), (v2));		\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

	/* One comma-terminated fragment per set filter flag. */
	if (flags & BTRFS_BALANCE_ARGS_CONVERT)
		CHECK_APPEND_1ARG("convert=%s,",
				  btrfs_bg_type_to_raid_name(bargs->target));

	if (flags & BTRFS_BALANCE_ARGS_SOFT)
		CHECK_APPEND_NOARG("soft,");

	if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
		btrfs_describe_block_groups(bargs->profiles, tmp_buf,
					    sizeof(tmp_buf));
		CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
	}

	if (flags & BTRFS_BALANCE_ARGS_USAGE)
		CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);

	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
		CHECK_APPEND_2ARG("usage=%u..%u,",
				  bargs->usage_min, bargs->usage_max);

	if (flags & BTRFS_BALANCE_ARGS_DEVID)
		CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);

	if (flags & BTRFS_BALANCE_ARGS_DRANGE)
		CHECK_APPEND_2ARG("drange=%llu..%llu,",
				  bargs->pstart, bargs->pend);

	if (flags & BTRFS_BALANCE_ARGS_VRANGE)
		CHECK_APPEND_2ARG("vrange=%llu..%llu,",
				  bargs->vstart, bargs->vend);

	if (flags & BTRFS_BALANCE_ARGS_LIMIT)
		CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);

	if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
		CHECK_APPEND_2ARG("limit=%u..%u,",
				bargs->limit_min, bargs->limit_max);

	if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
		CHECK_APPEND_2ARG("stripes=%u..%u,",
				  bargs->stripes_min, bargs->stripes_max);

#undef CHECK_APPEND_2ARG
#undef CHECK_APPEND_1ARG
#undef CHECK_APPEND_NOARG

out_overflow:

	/* Drop the trailing separator; on total overflow return empty string. */
	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
	else
		buf[0] = '\0';
}
424256fc37d9SAnand Jain 
/*
 * Log a one-line description of a balance start or resume in a format that
 * mirrors the "btrfs balance start" command line options (-f, -d, -m, -s),
 * with per-type filters rendered by describe_balance_args().
 */
static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
{
	u32 size_buf = 1024;
	/* Scratch buffer for one filter description at a time. */
	char tmp_buf[192] = {'\0'};
	char *buf;
	char *bp;
	u32 size_bp = size_buf;
	int ret;
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;

	/* Best effort only: silently skip the message if allocation fails. */
	buf = kzalloc(size_buf, GFP_KERNEL);
	if (!buf)
		return;

	bp = buf;

/* Append to the buffer; bail out on snprintf error or truncation. */
#define CHECK_APPEND_1ARG(a, v1)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

	if (bctl->flags & BTRFS_BALANCE_FORCE)
		CHECK_APPEND_1ARG("%s", "-f ");

	if (bctl->flags & BTRFS_BALANCE_DATA) {
		describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-d%s ", tmp_buf);
	}

	if (bctl->flags & BTRFS_BALANCE_METADATA) {
		describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-m%s ", tmp_buf);
	}

	if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
		describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-s%s ", tmp_buf);
	}

#undef CHECK_APPEND_1ARG

out_overflow:

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
	btrfs_info(fs_info, "balance: %s %s",
		   (bctl->flags & BTRFS_BALANCE_RESUME) ?
		   "resume" : "start", buf);

	kfree(buf);
}
429856fc37d9SAnand Jain 
/*
 * Main entry point for starting or resuming a balance: validate the
 * requested filters and convert targets, persist the balance item and run
 * __btrfs_balance().
 *
 * Should be called with balance mutex held. The mutex is dropped around the
 * actual balance work and re-taken afterwards.
 *
 * Returns the result of __btrfs_balance() (0, -ECANCELED, -EINTR, ...) or
 * -EINVAL when validation of the arguments fails.
 */
int btrfs_balance(struct btrfs_fs_info *fs_info,
		  struct btrfs_balance_control *bctl,
		  struct btrfs_ioctl_balance_args *bargs)
{
	u64 meta_target, data_target;
	u64 allowed;
	int mixed = 0;
	int ret;
	u64 num_devices;
	unsigned seq;
	bool reducing_redundancy;
	bool paused = false;
	int i;

	/* Refuse to start if shutdown, pause or cancel is already pending. */
	if (btrfs_fs_closing(fs_info) ||
	    atomic_read(&fs_info->balance_pause_req) ||
	    btrfs_should_cancel_balance(fs_info)) {
		ret = -EINVAL;
		goto out;
	}

	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	/*
	 * In case of mixed groups both data and meta should be picked,
	 * and identical options should be given for both of them.
	 */
	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
	if (mixed && (bctl->flags & allowed)) {
		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
			btrfs_err(fs_info,
	  "balance: mixed groups data and metadata options must be the same");
			ret = -EINVAL;
			goto out;
		}
	}

	/*
	 * rw_devices will not change at the moment, device add/delete/replace
	 * are exclusive
	 */
	num_devices = fs_info->fs_devices->rw_devices;

	/*
	 * SINGLE profile on-disk has no profile bit, but in-memory we have a
	 * special bit for it, to make it easier to distinguish.  Thus we need
	 * to set it manually, or balance would refuse the profile.
	 */
	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
	/* Allow only profiles for which we have enough rw devices. */
	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
		if (num_devices >= btrfs_raid_array[i].devs_min)
			allowed |= btrfs_raid_array[i].bg_flag;

	if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
	    !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
	    !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * Allow to reduce metadata or system integrity only if force set for
	 * profiles with redundancy (copies, parity)
	 */
	allowed = 0;
	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
		if (btrfs_raid_array[i].ncopies >= 2 ||
		    btrfs_raid_array[i].tolerated_failures >= 1)
			allowed |= btrfs_raid_array[i].bg_flag;
	}
	/* Sample the available profile bits consistently via the seqlock. */
	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_system_alloc_bits & allowed) &&
		     !(bctl->sys.target & allowed)) ||
		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_metadata_alloc_bits & allowed) &&
		     !(bctl->meta.target & allowed)))
			reducing_redundancy = true;
		else
			reducing_redundancy = false;

		/* if we're not converting, the target field is uninitialized */
		meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
			bctl->meta.target : fs_info->avail_metadata_alloc_bits;
		data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
			bctl->data.target : fs_info->avail_data_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	/* Reducing redundancy requires an explicit --force from the user. */
	if (reducing_redundancy) {
		if (bctl->flags & BTRFS_BALANCE_FORCE) {
			btrfs_info(fs_info,
			   "balance: force reducing metadata redundancy");
		} else {
			btrfs_err(fs_info,
	"balance: reduces metadata redundancy, use --force if you want this");
			ret = -EINVAL;
			goto out;
		}
	}

	/* Warn (but proceed) if metadata ends up less redundant than data. */
	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
		btrfs_warn(fs_info,
	"balance: metadata profile %s has lower redundancy than data profile %s",
				btrfs_bg_type_to_raid_name(meta_target),
				btrfs_bg_type_to_raid_name(data_target));
	}

	/* -EEXIST means a balance item already exists on disk (resume case). */
	ret = insert_balance_item(fs_info, bctl);
	if (ret && ret != -EEXIST)
		goto out;

	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
		/* Fresh balance: the item must not have existed before. */
		BUG_ON(ret == -EEXIST);
		BUG_ON(fs_info->balance_ctl);
		spin_lock(&fs_info->balance_lock);
		fs_info->balance_ctl = bctl;
		spin_unlock(&fs_info->balance_lock);
	} else {
		/* Resume: the item must already be there; refresh the args. */
		BUG_ON(ret != -EEXIST);
		spin_lock(&fs_info->balance_lock);
		update_balance_args(bctl);
		spin_unlock(&fs_info->balance_lock);
	}

	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
	set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
	describe_balance_start_or_resume(fs_info);
	/* Drop the mutex while the actual relocation work runs. */
	mutex_unlock(&fs_info->balance_mutex);

	ret = __btrfs_balance(fs_info);

	mutex_lock(&fs_info->balance_mutex);
	if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
		btrfs_info(fs_info, "balance: paused");
		btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
		paused = true;
	}
	/*
	 * Balance can be canceled by:
	 *
	 * - Regular cancel request
	 *   Then ret == -ECANCELED and balance_cancel_req > 0
	 *
	 * - Fatal signal to "btrfs" process
	 *   Either the signal caught by wait_reserve_ticket() and callers
	 *   got -EINTR, or caught by btrfs_should_cancel_balance() and
	 *   got -ECANCELED.
	 *   Either way, in this case balance_cancel_req = 0, and
	 *   ret == -EINTR or ret == -ECANCELED.
	 *
	 * So here we only check the return value to catch canceled balance.
	 */
	else if (ret == -ECANCELED || ret == -EINTR)
		btrfs_info(fs_info, "balance: canceled");
	else
		btrfs_info(fs_info, "balance: ended with status: %d", ret);

	clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);

	/* Report the final state back to the ioctl caller if requested. */
	if (bargs) {
		memset(bargs, 0, sizeof(*bargs));
		btrfs_update_ioctl_balance_args(fs_info, bargs);
	}

	/* We didn't pause, we can clean everything up. */
	if (!paused) {
		reset_balance_state(fs_info);
		btrfs_exclop_finish(fs_info);
	}

	wake_up(&fs_info->balance_wait_q);

	return ret;
out:
	/* Validation failed before the balance started; undo state setup. */
	if (bctl->flags & BTRFS_BALANCE_RESUME)
		reset_balance_state(fs_info);
	else
		kfree(bctl);
	btrfs_exclop_finish(fs_info);

	return ret;
}
44918f18cf13SChris Mason 
balance_kthread(void * data)449259641015SIlya Dryomov static int balance_kthread(void *data)
449359641015SIlya Dryomov {
44942b6ba629SIlya Dryomov 	struct btrfs_fs_info *fs_info = data;
44959555c6c1SIlya Dryomov 	int ret = 0;
449659641015SIlya Dryomov 
4497a690e5f2SNaohiro Aota 	sb_start_write(fs_info->sb);
449859641015SIlya Dryomov 	mutex_lock(&fs_info->balance_mutex);
449956fc37d9SAnand Jain 	if (fs_info->balance_ctl)
45006fcf6e2bSDavid Sterba 		ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
450159641015SIlya Dryomov 	mutex_unlock(&fs_info->balance_mutex);
4502a690e5f2SNaohiro Aota 	sb_end_write(fs_info->sb);
45032b6ba629SIlya Dryomov 
450459641015SIlya Dryomov 	return ret;
450559641015SIlya Dryomov }
450659641015SIlya Dryomov 
/*
 * Kick off an asynchronous resume of a paused balance by spawning the
 * btrfs-balance kthread. Returns 0 when there is nothing to resume or
 * resume is skipped, otherwise 0 or the kthread_run() error.
 */
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
	struct task_struct *tsk;

	/* Nothing to do unless a paused balance was recovered earlier. */
	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return 0;
	}
	mutex_unlock(&fs_info->balance_mutex);

	/* Honour the skip_balance mount option: leave the balance paused. */
	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
		btrfs_info(fs_info, "balance: resume skipped");
		return 0;
	}

	/* Flip the exclusive operation from paused to running balance. */
	spin_lock(&fs_info->super_lock);
	ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
	fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
	spin_unlock(&fs_info->super_lock);
	/*
	 * A ro->rw remount sequence should continue with the paused balance
	 * regardless of who pauses it, system or the user as of now, so set
	 * the resume flag.
	 */
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
	spin_unlock(&fs_info->balance_lock);

	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
	return PTR_ERR_OR_ZERO(tsk);
}
45392b6ba629SIlya Dryomov 
/*
 * Recover a paused balance from the on-disk balance item (called during
 * mount). If the item exists, rebuild fs_info->balance_ctl from it with the
 * RESUME flag set and claim the paused-balance exclusive operation status.
 *
 * Returns 0 on success or when no balance item exists, negative errno on
 * failure.
 */
int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* The balance item lives in the tree root under this fixed key. */
	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	/* No balance item: nothing to recover, not an error. */
	if (ret > 0) { /* ret = -ENOENT; */
		ret = 0;
		goto out;
	}

	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
	if (!bctl) {
		ret = -ENOMEM;
		goto out;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	bctl->flags = btrfs_balance_flags(leaf, item);
	bctl->flags |= BTRFS_BALANCE_RESUME;

	/* Convert the on-disk per-type args to their in-memory form. */
	btrfs_balance_data(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
	btrfs_balance_meta(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
	btrfs_balance_sys(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);

	/*
	 * This should never happen, as the paused balance state is recovered
	 * during mount without any chance of other exclusive ops to collide.
	 *
	 * This gives the exclusive op status to balance and keeps in paused
	 * state until user intervention (cancel or umount). If the ownership
	 * cannot be assigned, show a message but do not fail. The balance
	 * is in a paused state and must have fs_info::balance_ctl properly
	 * set up.
	 */
	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED))
		btrfs_warn(fs_info,
	"balance: cannot set exclusive op status, resume manually");

	btrfs_release_path(path);

	mutex_lock(&fs_info->balance_mutex);
	BUG_ON(fs_info->balance_ctl);
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = bctl;
	spin_unlock(&fs_info->balance_lock);
	mutex_unlock(&fs_info->balance_mutex);
out:
	btrfs_free_path(path);
	return ret;
}
461159641015SIlya Dryomov 
/*
 * Request a pause of a running balance and wait until the balance work has
 * actually stopped (BTRFS_FS_BALANCE_RUNNING cleared).
 *
 * Returns 0 on success, -ENOTCONN when no balance is in progress.
 */
int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
{
	int ret = 0;

	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
		/*
		 * Signal the pause request, then drop the mutex so the
		 * balance loop can observe it and stop.
		 */
		atomic_inc(&fs_info->balance_pause_req);
		mutex_unlock(&fs_info->balance_mutex);

		wait_event(fs_info->balance_wait_q,
			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));

		mutex_lock(&fs_info->balance_mutex);
		/* we are good with balance_ctl ripped off from under us */
		BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
		atomic_dec(&fs_info->balance_pause_req);
	} else {
		/* Balance exists but is not running (already paused). */
		ret = -ENOTCONN;
	}

	mutex_unlock(&fs_info->balance_mutex);
	return ret;
}
4640837d5b6eSIlya Dryomov 
/*
 * Cancel a running or paused balance. For a running balance, wait until the
 * balance work stops (it deletes the balance item itself); for a paused
 * balance on a read-write mount, reset the state and release the exclusive
 * operation here.
 *
 * Returns 0 on success, -ENOTCONN when there is no balance, -EROFS when the
 * filesystem is mounted read-only.
 */
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	/*
	 * A paused balance with the item stored on disk can be resumed at
	 * mount time if the mount is read-write. Otherwise it's still paused
	 * and we must not allow cancelling as it deletes the item.
	 */
	if (sb_rdonly(fs_info->sb)) {
		mutex_unlock(&fs_info->balance_mutex);
		return -EROFS;
	}

	atomic_inc(&fs_info->balance_cancel_req);
	/*
	 * if we are running just wait and return, balance item is
	 * deleted in btrfs_balance in this case
	 */
	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
		mutex_unlock(&fs_info->balance_mutex);
		wait_event(fs_info->balance_wait_q,
			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
		mutex_lock(&fs_info->balance_mutex);
	} else {
		mutex_unlock(&fs_info->balance_mutex);
		/*
		 * Lock released to allow other waiters to continue, we'll
		 * reexamine the status again.
		 */
		mutex_lock(&fs_info->balance_mutex);

		/* Paused balance: tear the state down ourselves. */
		if (fs_info->balance_ctl) {
			reset_balance_state(fs_info);
			btrfs_exclop_finish(fs_info);
			btrfs_info(fs_info, "balance: canceled");
		}
	}

	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
	atomic_dec(&fs_info->balance_cancel_req);
	mutex_unlock(&fs_info->balance_mutex);
	return 0;
}
4689a7e99c69SIlya Dryomov 
btrfs_uuid_scan_kthread(void * data)469097f4dd09SNikolay Borisov int btrfs_uuid_scan_kthread(void *data)
4691803b2f54SStefan Behrens {
4692803b2f54SStefan Behrens 	struct btrfs_fs_info *fs_info = data;
4693803b2f54SStefan Behrens 	struct btrfs_root *root = fs_info->tree_root;
4694803b2f54SStefan Behrens 	struct btrfs_key key;
4695803b2f54SStefan Behrens 	struct btrfs_path *path = NULL;
4696803b2f54SStefan Behrens 	int ret = 0;
4697803b2f54SStefan Behrens 	struct extent_buffer *eb;
4698803b2f54SStefan Behrens 	int slot;
4699803b2f54SStefan Behrens 	struct btrfs_root_item root_item;
4700803b2f54SStefan Behrens 	u32 item_size;
4701f45388f3SFilipe David Borba Manana 	struct btrfs_trans_handle *trans = NULL;
4702c94bec2cSJosef Bacik 	bool closing = false;
4703803b2f54SStefan Behrens 
4704803b2f54SStefan Behrens 	path = btrfs_alloc_path();
4705803b2f54SStefan Behrens 	if (!path) {
4706803b2f54SStefan Behrens 		ret = -ENOMEM;
4707803b2f54SStefan Behrens 		goto out;
4708803b2f54SStefan Behrens 	}
4709803b2f54SStefan Behrens 
4710803b2f54SStefan Behrens 	key.objectid = 0;
4711803b2f54SStefan Behrens 	key.type = BTRFS_ROOT_ITEM_KEY;
4712803b2f54SStefan Behrens 	key.offset = 0;
4713803b2f54SStefan Behrens 
4714803b2f54SStefan Behrens 	while (1) {
4715c94bec2cSJosef Bacik 		if (btrfs_fs_closing(fs_info)) {
4716c94bec2cSJosef Bacik 			closing = true;
4717c94bec2cSJosef Bacik 			break;
4718c94bec2cSJosef Bacik 		}
47197c829b72SAnand Jain 		ret = btrfs_search_forward(root, &key, path,
47207c829b72SAnand Jain 				BTRFS_OLDEST_GENERATION);
4721803b2f54SStefan Behrens 		if (ret) {
4722803b2f54SStefan Behrens 			if (ret > 0)
4723803b2f54SStefan Behrens 				ret = 0;
4724803b2f54SStefan Behrens 			break;
4725803b2f54SStefan Behrens 		}
4726803b2f54SStefan Behrens 
4727803b2f54SStefan Behrens 		if (key.type != BTRFS_ROOT_ITEM_KEY ||
4728803b2f54SStefan Behrens 		    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4729803b2f54SStefan Behrens 		     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4730803b2f54SStefan Behrens 		    key.objectid > BTRFS_LAST_FREE_OBJECTID)
4731803b2f54SStefan Behrens 			goto skip;
4732803b2f54SStefan Behrens 
4733803b2f54SStefan Behrens 		eb = path->nodes[0];
4734803b2f54SStefan Behrens 		slot = path->slots[0];
47353212fa14SJosef Bacik 		item_size = btrfs_item_size(eb, slot);
4736803b2f54SStefan Behrens 		if (item_size < sizeof(root_item))
4737803b2f54SStefan Behrens 			goto skip;
4738803b2f54SStefan Behrens 
4739803b2f54SStefan Behrens 		read_extent_buffer(eb, &root_item,
4740803b2f54SStefan Behrens 				   btrfs_item_ptr_offset(eb, slot),
4741803b2f54SStefan Behrens 				   (int)sizeof(root_item));
4742803b2f54SStefan Behrens 		if (btrfs_root_refs(&root_item) == 0)
4743803b2f54SStefan Behrens 			goto skip;
4744f45388f3SFilipe David Borba Manana 
4745f45388f3SFilipe David Borba Manana 		if (!btrfs_is_empty_uuid(root_item.uuid) ||
4746f45388f3SFilipe David Borba Manana 		    !btrfs_is_empty_uuid(root_item.received_uuid)) {
4747f45388f3SFilipe David Borba Manana 			if (trans)
4748f45388f3SFilipe David Borba Manana 				goto update_tree;
4749f45388f3SFilipe David Borba Manana 
4750f45388f3SFilipe David Borba Manana 			btrfs_release_path(path);
4751803b2f54SStefan Behrens 			/*
4752803b2f54SStefan Behrens 			 * 1 - subvol uuid item
4753803b2f54SStefan Behrens 			 * 1 - received_subvol uuid item
4754803b2f54SStefan Behrens 			 */
4755803b2f54SStefan Behrens 			trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4756803b2f54SStefan Behrens 			if (IS_ERR(trans)) {
4757803b2f54SStefan Behrens 				ret = PTR_ERR(trans);
4758803b2f54SStefan Behrens 				break;
4759803b2f54SStefan Behrens 			}
4760f45388f3SFilipe David Borba Manana 			continue;
4761f45388f3SFilipe David Borba Manana 		} else {
4762f45388f3SFilipe David Borba Manana 			goto skip;
4763f45388f3SFilipe David Borba Manana 		}
4764f45388f3SFilipe David Borba Manana update_tree:
47659771a5cfSJosef Bacik 		btrfs_release_path(path);
4766f45388f3SFilipe David Borba Manana 		if (!btrfs_is_empty_uuid(root_item.uuid)) {
4767cdb345a8SLu Fengqi 			ret = btrfs_uuid_tree_add(trans, root_item.uuid,
4768803b2f54SStefan Behrens 						  BTRFS_UUID_KEY_SUBVOL,
4769803b2f54SStefan Behrens 						  key.objectid);
4770803b2f54SStefan Behrens 			if (ret < 0) {
4771efe120a0SFrank Holton 				btrfs_warn(fs_info, "uuid_tree_add failed %d",
4772803b2f54SStefan Behrens 					ret);
4773803b2f54SStefan Behrens 				break;
4774803b2f54SStefan Behrens 			}
4775803b2f54SStefan Behrens 		}
4776803b2f54SStefan Behrens 
4777803b2f54SStefan Behrens 		if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4778cdb345a8SLu Fengqi 			ret = btrfs_uuid_tree_add(trans,
4779803b2f54SStefan Behrens 						  root_item.received_uuid,
4780803b2f54SStefan Behrens 						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4781803b2f54SStefan Behrens 						  key.objectid);
4782803b2f54SStefan Behrens 			if (ret < 0) {
4783efe120a0SFrank Holton 				btrfs_warn(fs_info, "uuid_tree_add failed %d",
4784803b2f54SStefan Behrens 					ret);
4785803b2f54SStefan Behrens 				break;
4786803b2f54SStefan Behrens 			}
4787803b2f54SStefan Behrens 		}
4788803b2f54SStefan Behrens 
4789f45388f3SFilipe David Borba Manana skip:
47909771a5cfSJosef Bacik 		btrfs_release_path(path);
4791803b2f54SStefan Behrens 		if (trans) {
47923a45bb20SJeff Mahoney 			ret = btrfs_end_transaction(trans);
4793f45388f3SFilipe David Borba Manana 			trans = NULL;
4794803b2f54SStefan Behrens 			if (ret)
4795803b2f54SStefan Behrens 				break;
4796803b2f54SStefan Behrens 		}
4797803b2f54SStefan Behrens 
4798803b2f54SStefan Behrens 		if (key.offset < (u64)-1) {
4799803b2f54SStefan Behrens 			key.offset++;
4800803b2f54SStefan Behrens 		} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4801803b2f54SStefan Behrens 			key.offset = 0;
4802803b2f54SStefan Behrens 			key.type = BTRFS_ROOT_ITEM_KEY;
4803803b2f54SStefan Behrens 		} else if (key.objectid < (u64)-1) {
4804803b2f54SStefan Behrens 			key.offset = 0;
4805803b2f54SStefan Behrens 			key.type = BTRFS_ROOT_ITEM_KEY;
4806803b2f54SStefan Behrens 			key.objectid++;
4807803b2f54SStefan Behrens 		} else {
4808803b2f54SStefan Behrens 			break;
4809803b2f54SStefan Behrens 		}
4810803b2f54SStefan Behrens 		cond_resched();
4811803b2f54SStefan Behrens 	}
4812803b2f54SStefan Behrens 
4813803b2f54SStefan Behrens out:
4814803b2f54SStefan Behrens 	btrfs_free_path(path);
4815f45388f3SFilipe David Borba Manana 	if (trans && !IS_ERR(trans))
48163a45bb20SJeff Mahoney 		btrfs_end_transaction(trans);
4817803b2f54SStefan Behrens 	if (ret)
4818efe120a0SFrank Holton 		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4819c94bec2cSJosef Bacik 	else if (!closing)
4820afcdd129SJosef Bacik 		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4821803b2f54SStefan Behrens 	up(&fs_info->uuid_tree_rescan_sem);
4822803b2f54SStefan Behrens 	return 0;
4823803b2f54SStefan Behrens }
4824803b2f54SStefan Behrens 
/*
 * Create the UUID tree and start a background kthread
 * (btrfs_uuid_scan_kthread) that walks all subvolume roots and fills the
 * tree with subvolume / received-subvolume UUID entries.
 *
 * Returns 0 on success or a negative errno.  On success the rescan
 * semaphore is taken here and released by the kthread once the scan is
 * finished.
 */
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *uuid_root;
	struct task_struct *task;
	int ret;

	/*
	 * 1 - root node
	 * 1 - root item
	 */
	trans = btrfs_start_transaction(tree_root, 2);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
	if (IS_ERR(uuid_root)) {
		ret = PTR_ERR(uuid_root);
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}

	fs_info->uuid_root = uuid_root;

	/* Commit so the new tree root is persisted before the scan starts. */
	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	down(&fs_info->uuid_tree_rescan_sem);
	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
	if (IS_ERR(task)) {
		/* fs_info->update_uuid_tree_gen remains 0 in all error case */
		btrfs_warn(fs_info, "failed to start uuid_scan task");
		up(&fs_info->uuid_tree_rescan_sem);
		return PTR_ERR(task);
	}

	return 0;
}
4866803b2f54SStefan Behrens 
/*
 * Shrinking a device means finding all of the device extents past the new
 * size, and then following the back refs to the chunks.  The chunk
 * relocation code actually frees the device extents.
 */
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 length;
	u64 chunk_offset;
	int ret;
	int slot;
	int failed = 0;
	bool retried = false;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	u64 old_total = btrfs_super_total_bytes(super_copy);
	u64 old_size = btrfs_device_get_total_bytes(device);
	u64 diff;
	u64 start;

	/* Sector-align the target size; diff is the number of bytes removed. */
	new_size = round_down(new_size, fs_info->sectorsize);
	start = new_size;
	diff = round_down(old_size - new_size, fs_info->sectorsize);

	/* A device-replace target must not be resized. */
	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return -EINVAL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_BACK;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	mutex_lock(&fs_info->chunk_mutex);

	/*
	 * Shrink the in-memory size first so the allocator stops handing out
	 * space past new_size; this is rolled back in the error path below.
	 */
	btrfs_device_set_total_bytes(device, new_size);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		device->fs_devices->total_rw_bytes -= diff;
		atomic64_sub(diff, &fs_info->free_chunk_space);
	}

	/*
	 * Once the device's size has been set to the new size, ensure all
	 * in-memory chunks are synced to disk so that the loop below sees them
	 * and relocates them accordingly.
	 */
	if (contains_pending_extent(device, &start, diff)) {
		mutex_unlock(&fs_info->chunk_mutex);
		ret = btrfs_commit_transaction(trans);
		if (ret)
			goto done;
	} else {
		mutex_unlock(&fs_info->chunk_mutex);
		btrfs_end_transaction(trans);
	}

again:
	/*
	 * Walk the device extents of this device backwards from the highest
	 * offset, relocating every chunk that still overlaps [new_size, end).
	 */
	key.objectid = device->devid;
	key.offset = (u64)-1;
	key.type = BTRFS_DEV_EXTENT_KEY;

	do {
		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto done;
		}

		ret = btrfs_previous_item(root, path, 0, key.type);
		if (ret) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			if (ret < 0)
				goto done;
			ret = 0;
			btrfs_release_path(path);
			break;
		}

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, path->slots[0]);

		if (key.objectid != device->devid) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			btrfs_release_path(path);
			break;
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		/* Extents fully below new_size can stay; we are done. */
		if (key.offset + length <= new_size) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			btrfs_release_path(path);
			break;
		}

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
		btrfs_release_path(path);

		/*
		 * We may be relocating the only data chunk we have,
		 * which could potentially end up with losing data's
		 * raid profile, so lets allocate an empty one in
		 * advance.
		 */
		ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto done;
		}

		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
		mutex_unlock(&fs_info->reclaim_bgs_lock);
		if (ret == -ENOSPC) {
			/* Count it and retry the whole pass once below. */
			failed++;
		} else if (ret) {
			if (ret == -ETXTBSY) {
				btrfs_warn(fs_info,
		   "could not shrink block group %llu due to active swapfile",
					   chunk_offset);
			}
			goto done;
		}
	} while (key.offset-- > 0);

	/* One retry pass for chunks that failed with -ENOSPC. */
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (failed && retried) {
		ret = -ENOSPC;
		goto done;
	}

	/* Shrinking succeeded, else we would be at "done". */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto done;
	}

	mutex_lock(&fs_info->chunk_mutex);
	/* Clear all state bits beyond the shrunk device size */
	clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
			  CHUNK_STATE_MASK);

	btrfs_device_set_disk_total_bytes(device, new_size);
	if (list_empty(&device->post_commit_list))
		list_add_tail(&device->post_commit_list,
			      &trans->transaction->dev_update_list);

	WARN_ON(diff > old_total);
	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total - diff, fs_info->sectorsize));
	mutex_unlock(&fs_info->chunk_mutex);

	btrfs_reserve_chunk_metadata(trans, false);
	/* Now btrfs_update_device() will change the on-disk size. */
	ret = btrfs_update_device(trans, device);
	btrfs_trans_release_chunk_metadata(trans);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	} else {
		ret = btrfs_commit_transaction(trans);
	}
done:
	btrfs_free_path(path);
	if (ret) {
		/* Roll back the in-memory size reduction done above. */
		mutex_lock(&fs_info->chunk_mutex);
		btrfs_device_set_total_bytes(device, old_size);
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
			device->fs_devices->total_rw_bytes += diff;
		atomic64_add(diff, &fs_info->free_chunk_space);
		mutex_unlock(&fs_info->chunk_mutex);
	}
	return ret;
}
50598f18cf13SChris Mason 
btrfs_add_system_chunk(struct btrfs_fs_info * fs_info,struct btrfs_key * key,struct btrfs_chunk * chunk,int item_size)50602ff7e61eSJeff Mahoney static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
50610b86a832SChris Mason 			   struct btrfs_key *key,
50620b86a832SChris Mason 			   struct btrfs_chunk *chunk, int item_size)
50630b86a832SChris Mason {
50640b246afaSJeff Mahoney 	struct btrfs_super_block *super_copy = fs_info->super_copy;
50650b86a832SChris Mason 	struct btrfs_disk_key disk_key;
50660b86a832SChris Mason 	u32 array_size;
50670b86a832SChris Mason 	u8 *ptr;
50680b86a832SChris Mason 
506979bd3712SFilipe Manana 	lockdep_assert_held(&fs_info->chunk_mutex);
507079bd3712SFilipe Manana 
50710b86a832SChris Mason 	array_size = btrfs_super_sys_array_size(super_copy);
50725f43f86eSGui Hecheng 	if (array_size + item_size + sizeof(disk_key)
507379bd3712SFilipe Manana 			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
50740b86a832SChris Mason 		return -EFBIG;
50750b86a832SChris Mason 
50760b86a832SChris Mason 	ptr = super_copy->sys_chunk_array + array_size;
50770b86a832SChris Mason 	btrfs_cpu_key_to_disk(&disk_key, key);
50780b86a832SChris Mason 	memcpy(ptr, &disk_key, sizeof(disk_key));
50790b86a832SChris Mason 	ptr += sizeof(disk_key);
50800b86a832SChris Mason 	memcpy(ptr, chunk, item_size);
50810b86a832SChris Mason 	item_size += sizeof(disk_key);
50820b86a832SChris Mason 	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
5083fe48a5c0SMiao Xie 
50840b86a832SChris Mason 	return 0;
50850b86a832SChris Mason }
50860b86a832SChris Mason 
50879f680ce0SChris Mason /*
508873c5de00SArne Jansen  * sort the devices in descending order by max_avail, total_avail
50899f680ce0SChris Mason  */
btrfs_cmp_device_info(const void * a,const void * b)509073c5de00SArne Jansen static int btrfs_cmp_device_info(const void *a, const void *b)
50912b82032cSYan Zheng {
509273c5de00SArne Jansen 	const struct btrfs_device_info *di_a = a;
509373c5de00SArne Jansen 	const struct btrfs_device_info *di_b = b;
50942b82032cSYan Zheng 
509573c5de00SArne Jansen 	if (di_a->max_avail > di_b->max_avail)
5096a40a90a0SChris Mason 		return -1;
509773c5de00SArne Jansen 	if (di_a->max_avail < di_b->max_avail)
50989b3f68b9SChris Mason 		return 1;
509973c5de00SArne Jansen 	if (di_a->total_avail > di_b->total_avail)
510073c5de00SArne Jansen 		return -1;
510173c5de00SArne Jansen 	if (di_a->total_avail < di_b->total_avail)
510273c5de00SArne Jansen 		return 1;
5103b2117a39SMiao Xie 	return 0;
5104b2117a39SMiao Xie }
5105b2117a39SMiao Xie 
/* Set the RAID56 incompat bit if the profile uses a raid5/6 layout. */
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
	if (type & BTRFS_BLOCK_GROUP_RAID56_MASK)
		btrfs_set_fs_incompat(info, RAID56);
}
511353b381b3SDavid Woodhouse 
/* Set the RAID1C34 incompat bit if the profile is raid1c3 or raid1c4. */
static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
	if (type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))
		btrfs_set_fs_incompat(info, RAID1C34);
}
5121cfbb825cSDavid Sterba 
/*
 * Structure used internally for btrfs_create_chunk() function.
 * Wraps needed parameters.  All sizes below are in bytes.
 */
struct alloc_chunk_ctl {
	/* Logical start offset for the chunk (presumably set by the caller) */
	u64 start;
	/* Block group type/profile flags (BTRFS_BLOCK_GROUP_*) */
	u64 type;
	/* Total number of stripes to allocate */
	int num_stripes;
	/* sub_stripes info for map */
	int sub_stripes;
	/* Stripes per device */
	int dev_stripes;
	/* Maximum number of devices to use */
	int devs_max;
	/* Minimum number of devices to use */
	int devs_min;
	/* ndevs has to be a multiple of this */
	int devs_increment;
	/* Number of copies */
	int ncopies;
	/* Number of stripes worth of bytes to store parity information */
	int nparity;
	u64 max_stripe_size;
	u64 max_chunk_size;
	/* Minimum free space a device must have to be considered */
	u64 dev_extent_min;
	u64 stripe_size;
	u64 chunk_size;
	/* Number of devices actually selected for the chunk */
	int ndevs;
};
51524f2bafe8SNaohiro Aota 
/*
 * Fill in the size limits for chunk allocation on regular (non-zoned)
 * devices: the stripe/chunk caps come from the target space_info's
 * configured chunk size, clamped to 10% of the writable space.
 */
static void init_alloc_chunk_ctl_policy_regular(
				struct btrfs_fs_devices *fs_devices,
				struct alloc_chunk_ctl *ctl)
{
	struct btrfs_space_info *space_info;

	space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type);
	ASSERT(space_info);

	/* chunk_size can be updated concurrently, hence READ_ONCE(). */
	ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
	/* A single stripe never exceeds 1GiB nor the chunk size itself. */
	ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G);

	if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
		ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);

	/* We don't want a chunk larger than 10% of writable space */
	ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10),
				  ctl->max_chunk_size);
	ctl->dev_extent_min = btrfs_stripe_nr_to_offset(ctl->dev_stripes);
}
517327c314d5SNaohiro Aota 
init_alloc_chunk_ctl_policy_zoned(struct btrfs_fs_devices * fs_devices,struct alloc_chunk_ctl * ctl)51741cd6121fSNaohiro Aota static void init_alloc_chunk_ctl_policy_zoned(
51751cd6121fSNaohiro Aota 				      struct btrfs_fs_devices *fs_devices,
51761cd6121fSNaohiro Aota 				      struct alloc_chunk_ctl *ctl)
51771cd6121fSNaohiro Aota {
51781cd6121fSNaohiro Aota 	u64 zone_size = fs_devices->fs_info->zone_size;
51791cd6121fSNaohiro Aota 	u64 limit;
51801cd6121fSNaohiro Aota 	int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
51811cd6121fSNaohiro Aota 	int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
51821cd6121fSNaohiro Aota 	u64 min_chunk_size = min_data_stripes * zone_size;
51831cd6121fSNaohiro Aota 	u64 type = ctl->type;
51841cd6121fSNaohiro Aota 
51851cd6121fSNaohiro Aota 	ctl->max_stripe_size = zone_size;
51861cd6121fSNaohiro Aota 	if (type & BTRFS_BLOCK_GROUP_DATA) {
51871cd6121fSNaohiro Aota 		ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
51881cd6121fSNaohiro Aota 						 zone_size);
51891cd6121fSNaohiro Aota 	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
51901cd6121fSNaohiro Aota 		ctl->max_chunk_size = ctl->max_stripe_size;
51911cd6121fSNaohiro Aota 	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
51921cd6121fSNaohiro Aota 		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
51931cd6121fSNaohiro Aota 		ctl->devs_max = min_t(int, ctl->devs_max,
51941cd6121fSNaohiro Aota 				      BTRFS_MAX_DEVS_SYS_CHUNK);
5195bb05b298SArnd Bergmann 	} else {
5196bb05b298SArnd Bergmann 		BUG();
51971cd6121fSNaohiro Aota 	}
51981cd6121fSNaohiro Aota 
51991cd6121fSNaohiro Aota 	/* We don't want a chunk larger than 10% of writable space */
5200428c8e03SDavid Sterba 	limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10),
52011cd6121fSNaohiro Aota 			       zone_size),
52021cd6121fSNaohiro Aota 		    min_chunk_size);
52031cd6121fSNaohiro Aota 	ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
52041cd6121fSNaohiro Aota 	ctl->dev_extent_min = zone_size * ctl->dev_stripes;
52051cd6121fSNaohiro Aota }
52061cd6121fSNaohiro Aota 
init_alloc_chunk_ctl(struct btrfs_fs_devices * fs_devices,struct alloc_chunk_ctl * ctl)520727c314d5SNaohiro Aota static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
520827c314d5SNaohiro Aota 				 struct alloc_chunk_ctl *ctl)
520927c314d5SNaohiro Aota {
521027c314d5SNaohiro Aota 	int index = btrfs_bg_flags_to_raid_index(ctl->type);
521127c314d5SNaohiro Aota 
521227c314d5SNaohiro Aota 	ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
521327c314d5SNaohiro Aota 	ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
521427c314d5SNaohiro Aota 	ctl->devs_max = btrfs_raid_array[index].devs_max;
521527c314d5SNaohiro Aota 	if (!ctl->devs_max)
521627c314d5SNaohiro Aota 		ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
521727c314d5SNaohiro Aota 	ctl->devs_min = btrfs_raid_array[index].devs_min;
521827c314d5SNaohiro Aota 	ctl->devs_increment = btrfs_raid_array[index].devs_increment;
521927c314d5SNaohiro Aota 	ctl->ncopies = btrfs_raid_array[index].ncopies;
522027c314d5SNaohiro Aota 	ctl->nparity = btrfs_raid_array[index].nparity;
522127c314d5SNaohiro Aota 	ctl->ndevs = 0;
522227c314d5SNaohiro Aota 
522327c314d5SNaohiro Aota 	switch (fs_devices->chunk_alloc_policy) {
522427c314d5SNaohiro Aota 	case BTRFS_CHUNK_ALLOC_REGULAR:
522527c314d5SNaohiro Aota 		init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
522627c314d5SNaohiro Aota 		break;
52271cd6121fSNaohiro Aota 	case BTRFS_CHUNK_ALLOC_ZONED:
52281cd6121fSNaohiro Aota 		init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
52291cd6121fSNaohiro Aota 		break;
523027c314d5SNaohiro Aota 	default:
523127c314d5SNaohiro Aota 		BUG();
523227c314d5SNaohiro Aota 	}
523327c314d5SNaohiro Aota }
523427c314d5SNaohiro Aota 
/*
 * For every writeable, non-replace-target device in the allocation list,
 * record the largest free extent that could host ctl->dev_stripes stripes
 * into devices_info[], set ctl->ndevs, and sort the result by available
 * space (largest first) for the stripe-size decision that follows.
 *
 * Returns 0 on success or a negative errno propagated from
 * find_free_dev_extent() (its -ENOSPC merely skips the device).
 */
static int gather_device_info(struct btrfs_fs_devices *fs_devices,
			      struct alloc_chunk_ctl *ctl,
			      struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = fs_devices->fs_info;
	struct btrfs_device *device;
	u64 total_avail;
	/* Hole size wanted per device: one stripe slot per dev_stripe. */
	u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
	int ret;
	int ndevs = 0;
	u64 max_avail;
	u64 dev_offset;

	/*
	 * in the first pass through the devices list, we gather information
	 * about the available holes on each device.
	 */
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			WARN(1, KERN_ERR
			       "BTRFS: read-only device in alloc_list\n");
			continue;
		}

		/* Skip devices not in the metadata or mid device replace. */
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
					&device->dev_state) ||
		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
			continue;

		if (device->total_bytes > device->bytes_used)
			total_avail = device->total_bytes - device->bytes_used;
		else
			total_avail = 0;

		/* If there is no space on this device, skip it. */
		if (total_avail < ctl->dev_extent_min)
			continue;

		ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
					   &max_avail);
		if (ret && ret != -ENOSPC)
			return ret;

		if (ret == 0)
			max_avail = dev_extent_want;

		if (max_avail < ctl->dev_extent_min) {
			if (btrfs_test_opt(info, ENOSPC_DEBUG))
				btrfs_debug(info,
			"%s: devid %llu has no free space, have=%llu want=%llu",
					    __func__, device->devid, max_avail,
					    ctl->dev_extent_min);
			continue;
		}

		/* devices_info[] is sized for rw_devices entries at most. */
		if (ndevs == fs_devices->rw_devices) {
			WARN(1, "%s: found more than %llu devices\n",
			     __func__, fs_devices->rw_devices);
			break;
		}
		devices_info[ndevs].dev_offset = dev_offset;
		devices_info[ndevs].max_avail = max_avail;
		devices_info[ndevs].total_avail = total_avail;
		devices_info[ndevs].dev = device;
		++ndevs;
	}
	ctl->ndevs = ndevs;

	/*
	 * now sort the devices by hole size / available space
	 */
	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
	     btrfs_cmp_device_info, NULL);

	return 0;
}
5311560156cbSNaohiro Aota 
decide_stripe_size_regular(struct alloc_chunk_ctl * ctl,struct btrfs_device_info * devices_info)53125badf512SNaohiro Aota static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
53135badf512SNaohiro Aota 				      struct btrfs_device_info *devices_info)
53145badf512SNaohiro Aota {
53155badf512SNaohiro Aota 	/* Number of stripes that count for block group size */
53165badf512SNaohiro Aota 	int data_stripes;
53175badf512SNaohiro Aota 
53185badf512SNaohiro Aota 	/*
53195badf512SNaohiro Aota 	 * The primary goal is to maximize the number of stripes, so use as
53205badf512SNaohiro Aota 	 * many devices as possible, even if the stripes are not maximum sized.
53215badf512SNaohiro Aota 	 *
53225badf512SNaohiro Aota 	 * The DUP profile stores more than one stripe per device, the
53235badf512SNaohiro Aota 	 * max_avail is the total size so we have to adjust.
53245badf512SNaohiro Aota 	 */
53255badf512SNaohiro Aota 	ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
53265badf512SNaohiro Aota 				   ctl->dev_stripes);
53275badf512SNaohiro Aota 	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
53285badf512SNaohiro Aota 
53295badf512SNaohiro Aota 	/* This will have to be fixed for RAID1 and RAID10 over more drives */
53305badf512SNaohiro Aota 	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
53315badf512SNaohiro Aota 
53325badf512SNaohiro Aota 	/*
53335badf512SNaohiro Aota 	 * Use the number of data stripes to figure out how big this chunk is
53345badf512SNaohiro Aota 	 * really going to be in terms of logical address space, and compare
53355badf512SNaohiro Aota 	 * that answer with the max chunk size. If it's higher, we try to
53365badf512SNaohiro Aota 	 * reduce stripe_size.
53375badf512SNaohiro Aota 	 */
53385badf512SNaohiro Aota 	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
53395badf512SNaohiro Aota 		/*
53405badf512SNaohiro Aota 		 * Reduce stripe_size, round it up to a 16MB boundary again and
53415badf512SNaohiro Aota 		 * then use it, unless it ends up being even bigger than the
53425badf512SNaohiro Aota 		 * previous value we had already.
53435badf512SNaohiro Aota 		 */
53445badf512SNaohiro Aota 		ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
53455badf512SNaohiro Aota 							data_stripes), SZ_16M),
53465badf512SNaohiro Aota 				       ctl->stripe_size);
53475badf512SNaohiro Aota 	}
53485badf512SNaohiro Aota 
53495da431b7SQu Wenruo 	/* Stripe size should not go beyond 1G. */
53505da431b7SQu Wenruo 	ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);
53515da431b7SQu Wenruo 
53525badf512SNaohiro Aota 	/* Align to BTRFS_STRIPE_LEN */
53535badf512SNaohiro Aota 	ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
53545badf512SNaohiro Aota 	ctl->chunk_size = ctl->stripe_size * data_stripes;
53555badf512SNaohiro Aota 
53565badf512SNaohiro Aota 	return 0;
53575badf512SNaohiro Aota }
53585badf512SNaohiro Aota 
/*
 * Compute stripe and chunk sizes for a zoned allocation.
 *
 * On zoned filesystems each device stripe must be exactly one zone, so the
 * stripe size is fixed to the zone size and only the number of devices can
 * be reduced to respect ctl->max_chunk_size.
 */
static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
				    struct btrfs_device_info *devices_info)
{
	/* All zoned devices share the same zone size; take it from the first. */
	u64 zone_size = devices_info[0].dev->zone_info->zone_size;
	/* Number of stripes that count for block group size */
	int data_stripes;

	/*
	 * It should hold because:
	 *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
	 */
	ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);

	ctl->stripe_size = zone_size;
	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;

	/* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */
	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
		/*
		 * Solve for the largest ndevs such that
		 * stripe_size * data_stripes <= max_chunk_size, inverting
		 * data_stripes = (ndevs * dev_stripes - nparity) / ncopies.
		 */
		ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
					     ctl->stripe_size) + ctl->nparity,
				     ctl->dev_stripes);
		ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
		data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
		ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
	}

	ctl->chunk_size = ctl->stripe_size * data_stripes;

	return 0;
}
53901cd6121fSNaohiro Aota 
/*
 * Decide stripe size/count for a new chunk based on the allocation policy.
 *
 * Clamps ctl->ndevs to [devs_min, devs_max] (rounded down to a multiple of
 * devs_increment) and then dispatches to the regular or zoned sizing helper.
 *
 * Returns 0 on success, -ENOSPC when too few devices have free space.
 */
static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
			      struct alloc_chunk_ctl *ctl,
			      struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = fs_devices->fs_info;

	/*
	 * Round down to number of usable stripes, devs_increment can be any
	 * number so we can't use round_down() that requires power of 2, while
	 * rounddown is safe.
	 */
	ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);

	if (ctl->ndevs < ctl->devs_min) {
		if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
			btrfs_debug(info,
	"%s: not enough devices with free space: have=%d minimum required=%d",
				    __func__, ctl->ndevs, ctl->devs_min);
		}
		return -ENOSPC;
	}

	ctl->ndevs = min(ctl->ndevs, ctl->devs_max);

	switch (fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		return decide_stripe_size_regular(ctl, devices_info);
	case BTRFS_CHUNK_ALLOC_ZONED:
		return decide_stripe_size_zoned(ctl, devices_info);
	default:
		/* Unknown policy is a programming error. */
		BUG();
	}
}
54245badf512SNaohiro Aota 
/*
 * Create the in-memory representation of a new chunk.
 *
 * Builds the stripe map from the sizing decisions in @ctl and the chosen
 * devices in @devices_info, inserts an extent map into the chunk mapping
 * tree, and creates the block group. The on-disk chunk item is added later
 * (see btrfs_chunk_alloc_add_chunk_item()).
 *
 * Returns the new block group, or an ERR_PTR on failure.
 */
static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
			struct alloc_chunk_ctl *ctl,
			struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct map_lookup *map = NULL;
	struct extent_map_tree *em_tree;
	struct btrfs_block_group *block_group;
	struct extent_map *em;
	u64 start = ctl->start;
	u64 type = ctl->type;
	int ret;
	int i;
	int j;

	map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
	if (!map)
		return ERR_PTR(-ENOMEM);
	map->num_stripes = ctl->num_stripes;

	/*
	 * Lay out the stripes: each device contributes dev_stripes
	 * consecutive stripes (dev_stripes > 1 only for DUP-like profiles).
	 */
	for (i = 0; i < ctl->ndevs; ++i) {
		for (j = 0; j < ctl->dev_stripes; ++j) {
			int s = i * ctl->dev_stripes + j;
			map->stripes[s].dev = devices_info[i].dev;
			map->stripes[s].physical = devices_info[i].dev_offset +
						   j * ctl->stripe_size;
		}
	}
	map->io_align = BTRFS_STRIPE_LEN;
	map->io_width = BTRFS_STRIPE_LEN;
	map->type = type;
	map->sub_stripes = ctl->sub_stripes;

	trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);

	em = alloc_extent_map();
	if (!em) {
		kfree(map);
		return ERR_PTR(-ENOMEM);
	}
	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	/* The extent map takes ownership of @map from here on. */
	em->map_lookup = map;
	em->start = start;
	em->len = ctl->chunk_size;
	em->block_start = 0;
	em->block_len = em->len;
	em->orig_block_len = ctl->stripe_size;

	em_tree = &info->mapping_tree;
	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em, 0);
	if (ret) {
		write_unlock(&em_tree->lock);
		/* Drops our ref and frees @map along with the extent map. */
		free_extent_map(em);
		return ERR_PTR(ret);
	}
	write_unlock(&em_tree->lock);

	block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
	if (IS_ERR(block_group))
		goto error_del_extent;

	/*
	 * Account the new stripes on each device and queue the devices for a
	 * device-item update at transaction commit.
	 */
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *dev = map->stripes[i].dev;

		btrfs_device_set_bytes_used(dev,
					    dev->bytes_used + ctl->stripe_size);
		if (list_empty(&dev->post_commit_list))
			list_add_tail(&dev->post_commit_list,
				      &trans->transaction->dev_update_list);
	}

	atomic64_sub(ctl->stripe_size * map->num_stripes,
		     &info->free_chunk_space);

	/* Drop our lookup ref; the mapping tree keeps its own. */
	free_extent_map(em);
	check_raid56_incompat_flag(info, type);
	check_raid1c34_incompat_flag(info, type);

	return block_group;

error_del_extent:
	write_lock(&em_tree->lock);
	remove_extent_mapping(em_tree, em);
	write_unlock(&em_tree->lock);

	/* One for our allocation */
	free_extent_map(em);
	/* One for the tree reference */
	free_extent_map(em);

	/* block_group holds the ERR_PTR from btrfs_make_block_group(). */
	return block_group;
}
5518dce580caSNaohiro Aota 
/*
 * Allocate a new chunk of the given @type (data/metadata/system plus a
 * valid RAID profile).
 *
 * Must be called with fs_info->chunk_mutex held. Gathers per-device free
 * space, decides the stripe layout and creates the chunk's in-memory
 * structures and block group.
 *
 * Returns the new block group or an ERR_PTR on failure.
 */
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
					    u64 type)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = info->fs_devices;
	struct btrfs_device_info *devices_info;
	struct alloc_chunk_ctl ctl;
	struct btrfs_block_group *block_group;
	int ret;

	lockdep_assert_held(&info->chunk_mutex);

	if (!alloc_profile_is_valid(type, 0)) {
		ASSERT(0);
		return ERR_PTR(-EINVAL);
	}

	if (list_empty(&fs_devices->alloc_list)) {
		if (btrfs_test_opt(info, ENOSPC_DEBUG))
			btrfs_debug(info, "%s: no writable device", __func__);
		return ERR_PTR(-ENOSPC);
	}

	if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
		btrfs_err(info, "invalid chunk type 0x%llx requested", type);
		ASSERT(0);
		return ERR_PTR(-EINVAL);
	}

	ctl.start = find_next_chunk(info);
	ctl.type = type;
	init_alloc_chunk_ctl(fs_devices, &ctl);

	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
			       GFP_NOFS);
	if (!devices_info)
		return ERR_PTR(-ENOMEM);

	/* Collect free-space info, size the stripes, then build the chunk. */
	ret = gather_device_info(fs_devices, &ctl, devices_info);
	if (ret == 0)
		ret = decide_stripe_size(fs_devices, &ctl, devices_info);

	if (ret < 0)
		block_group = ERR_PTR(ret);
	else
		block_group = create_chunk(trans, &ctl, devices_info);

	kfree(devices_info);
	return block_group;
}
55752b82032cSYan Zheng 
/*
 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
 * chunks.
 *
 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
 * phases.
 *
 * Serializes the chunk described by @bg's extent map into an on-disk chunk
 * item, updates the device items for all member devices, inserts the chunk
 * item into the chunk btree and, for system chunks, also appends it to the
 * superblock's system chunk array.
 */
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
				     struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_key key;
	struct btrfs_chunk *chunk;
	struct btrfs_stripe *stripe;
	struct extent_map *em;
	struct map_lookup *map;
	size_t item_size;
	int i;
	int ret;

	/*
	 * We take the chunk_mutex for 2 reasons:
	 *
	 * 1) Updates and insertions in the chunk btree must be done while holding
	 *    the chunk_mutex, as well as updating the system chunk array in the
	 *    superblock. See the comment on top of btrfs_chunk_alloc() for the
	 *    details;
	 *
	 * 2) To prevent races with the final phase of a device replace operation
	 *    that replaces the device object associated with the map's stripes,
	 *    because the device object's id can change at any time during that
	 *    final phase of the device replace operation
	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
	 *    replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
	 *    which would cause a failure when updating the device item, which does
	 *    not exists, or persisting a stripe of the chunk item with such ID.
	 *    Here we can't use the device_list_mutex because our caller already
	 *    has locked the chunk_mutex, and the final phase of device replace
	 *    acquires both mutexes - first the device_list_mutex and then the
	 *    chunk_mutex. Using any of those two mutexes protects us from a
	 *    concurrent device replace.
	 */
	lockdep_assert_held(&fs_info->chunk_mutex);

	em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	map = em->map_lookup;
	item_size = btrfs_chunk_item_size(map->num_stripes);

	chunk = kzalloc(item_size, GFP_NOFS);
	if (!chunk) {
		ret = -ENOMEM;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	/* Persist the updated bytes_used of every member device first. */
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;

		ret = btrfs_update_device(trans, device);
		if (ret)
			goto out;
	}

	/* Fill in one on-disk stripe entry per in-memory stripe. */
	stripe = &chunk->stripe;
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		const u64 dev_offset = map->stripes[i].physical;

		btrfs_set_stack_stripe_devid(stripe, device->devid);
		btrfs_set_stack_stripe_offset(stripe, dev_offset);
		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
		stripe++;
	}

	btrfs_set_stack_chunk_length(chunk, bg->length);
	btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
	btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN);
	btrfs_set_stack_chunk_type(chunk, map->type);
	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
	btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN);
	btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN);
	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = bg->start;

	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
	if (ret)
		goto out;

	set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags);

	/* System chunks are additionally mirrored in the superblock array. */
	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
		if (ret)
			goto out;
	}

out:
	kfree(chunk);
	free_extent_map(em);
	return ret;
}
56892b82032cSYan Zheng 
init_first_rw_device(struct btrfs_trans_handle * trans)56906f8e0fc7SDavid Sterba static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
56912b82032cSYan Zheng {
56926f8e0fc7SDavid Sterba 	struct btrfs_fs_info *fs_info = trans->fs_info;
56932b82032cSYan Zheng 	u64 alloc_profile;
569479bd3712SFilipe Manana 	struct btrfs_block_group *meta_bg;
569579bd3712SFilipe Manana 	struct btrfs_block_group *sys_bg;
569679bd3712SFilipe Manana 
569779bd3712SFilipe Manana 	/*
569879bd3712SFilipe Manana 	 * When adding a new device for sprouting, the seed device is read-only
569979bd3712SFilipe Manana 	 * so we must first allocate a metadata and a system chunk. But before
570079bd3712SFilipe Manana 	 * adding the block group items to the extent, device and chunk btrees,
570179bd3712SFilipe Manana 	 * we must first:
570279bd3712SFilipe Manana 	 *
570379bd3712SFilipe Manana 	 * 1) Create both chunks without doing any changes to the btrees, as
570479bd3712SFilipe Manana 	 *    otherwise we would get -ENOSPC since the block groups from the
570579bd3712SFilipe Manana 	 *    seed device are read-only;
570679bd3712SFilipe Manana 	 *
570779bd3712SFilipe Manana 	 * 2) Add the device item for the new sprout device - finishing the setup
570879bd3712SFilipe Manana 	 *    of a new block group requires updating the device item in the chunk
570979bd3712SFilipe Manana 	 *    btree, so it must exist when we attempt to do it. The previous step
571079bd3712SFilipe Manana 	 *    ensures this does not fail with -ENOSPC.
571179bd3712SFilipe Manana 	 *
571279bd3712SFilipe Manana 	 * After that we can add the block group items to their btrees:
571379bd3712SFilipe Manana 	 * update existing device item in the chunk btree, add a new block group
571479bd3712SFilipe Manana 	 * item to the extent btree, add a new chunk item to the chunk btree and
571579bd3712SFilipe Manana 	 * finally add the new device extent items to the devices btree.
571679bd3712SFilipe Manana 	 */
57172b82032cSYan Zheng 
57181b86826dSJeff Mahoney 	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5719f6f39f7aSNikolay Borisov 	meta_bg = btrfs_create_chunk(trans, alloc_profile);
572079bd3712SFilipe Manana 	if (IS_ERR(meta_bg))
572179bd3712SFilipe Manana 		return PTR_ERR(meta_bg);
57222b82032cSYan Zheng 
57231b86826dSJeff Mahoney 	alloc_profile = btrfs_system_alloc_profile(fs_info);
5724f6f39f7aSNikolay Borisov 	sys_bg = btrfs_create_chunk(trans, alloc_profile);
572579bd3712SFilipe Manana 	if (IS_ERR(sys_bg))
572679bd3712SFilipe Manana 		return PTR_ERR(sys_bg);
572779bd3712SFilipe Manana 
572879bd3712SFilipe Manana 	return 0;
5729005d6427SDavid Sterba }
57302b82032cSYan Zheng 
btrfs_chunk_max_errors(struct map_lookup * map)5731d20983b4SMiao Xie static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5732d20983b4SMiao Xie {
5733fc9a2ac7SDavid Sterba 	const int index = btrfs_bg_flags_to_raid_index(map->type);
5734d20983b4SMiao Xie 
5735fc9a2ac7SDavid Sterba 	return btrfs_raid_array[index].tolerated_failures;
57362b82032cSYan Zheng }
57372b82032cSYan Zheng 
btrfs_chunk_writeable(struct btrfs_fs_info * fs_info,u64 chunk_offset)5738a09f23c3SAnand Jain bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
57392b82032cSYan Zheng {
57402b82032cSYan Zheng 	struct extent_map *em;
57412b82032cSYan Zheng 	struct map_lookup *map;
5742d20983b4SMiao Xie 	int miss_ndevs = 0;
57432b82032cSYan Zheng 	int i;
5744a09f23c3SAnand Jain 	bool ret = true;
57452b82032cSYan Zheng 
574660ca842eSOmar Sandoval 	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5747592d92eeSLiu Bo 	if (IS_ERR(em))
5748a09f23c3SAnand Jain 		return false;
57492b82032cSYan Zheng 
575095617d69SJeff Mahoney 	map = em->map_lookup;
57512b82032cSYan Zheng 	for (i = 0; i < map->num_stripes; i++) {
5752e6e674bdSAnand Jain 		if (test_bit(BTRFS_DEV_STATE_MISSING,
5753e6e674bdSAnand Jain 					&map->stripes[i].dev->dev_state)) {
5754d20983b4SMiao Xie 			miss_ndevs++;
5755d20983b4SMiao Xie 			continue;
5756d20983b4SMiao Xie 		}
5757ebbede42SAnand Jain 		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5758ebbede42SAnand Jain 					&map->stripes[i].dev->dev_state)) {
5759a09f23c3SAnand Jain 			ret = false;
5760d20983b4SMiao Xie 			goto end;
57612b82032cSYan Zheng 		}
57622b82032cSYan Zheng 	}
5763d20983b4SMiao Xie 
5764d20983b4SMiao Xie 	/*
5765a09f23c3SAnand Jain 	 * If the number of missing devices is larger than max errors, we can
5766a09f23c3SAnand Jain 	 * not write the data into that chunk successfully.
5767d20983b4SMiao Xie 	 */
5768d20983b4SMiao Xie 	if (miss_ndevs > btrfs_chunk_max_errors(map))
5769a09f23c3SAnand Jain 		ret = false;
5770d20983b4SMiao Xie end:
57712b82032cSYan Zheng 	free_extent_map(em);
5772a09f23c3SAnand Jain 	return ret;
57730b86a832SChris Mason }
57740b86a832SChris Mason 
btrfs_mapping_tree_free(struct extent_map_tree * tree)5775c8bf1b67SDavid Sterba void btrfs_mapping_tree_free(struct extent_map_tree *tree)
57760b86a832SChris Mason {
57770b86a832SChris Mason 	struct extent_map *em;
57780b86a832SChris Mason 
57790b86a832SChris Mason 	while (1) {
5780c8bf1b67SDavid Sterba 		write_lock(&tree->lock);
5781c8bf1b67SDavid Sterba 		em = lookup_extent_mapping(tree, 0, (u64)-1);
57820b86a832SChris Mason 		if (em)
5783c8bf1b67SDavid Sterba 			remove_extent_mapping(tree, em);
5784c8bf1b67SDavid Sterba 		write_unlock(&tree->lock);
57850b86a832SChris Mason 		if (!em)
57860b86a832SChris Mason 			break;
57870b86a832SChris Mason 		/* once for us */
57880b86a832SChris Mason 		free_extent_map(em);
57890b86a832SChris Mason 		/* once for the tree */
57900b86a832SChris Mason 		free_extent_map(em);
57910b86a832SChris Mason 	}
57920b86a832SChris Mason }
57930b86a832SChris Mason 
/*
 * Return how many independent copies of the data at [@logical, @logical+@len)
 * exist, i.e. how many read retries against different mirrors make sense.
 */
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct extent_map *em;
	struct map_lookup *map;
	int ret;

	em = btrfs_get_chunk_map(fs_info, logical, len);
	if (IS_ERR(em))
		/*
		 * We could return errors for these cases, but that could get
		 * ugly and we'd probably do the same thing which is just not do
		 * anything else and exit, so return 1 so the callers don't try
		 * to use other copies.
		 */
		return 1;

	map = em->map_lookup;
	if (map->type & BTRFS_BLOCK_GROUP_RAID5) {
		/* Data plus one parity: the block can be rebuilt once. */
		ret = 2;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
		/*
		 * There could be two corrupted data stripes, we need
		 * to loop retry in order to rebuild the correct data.
		 *
		 * Fail a stripe at a time on every retry except the
		 * stripe under reconstruction.
		 */
		ret = map->num_stripes;
	} else {
		/* Non-RAID56, use their ncopies from btrfs_raid_array. */
		enum btrfs_raid_types index =
			btrfs_bg_flags_to_raid_index(map->type);

		ret = btrfs_raid_array[index].ncopies;
	}
	free_extent_map(em);
	return ret;
}
5831f188591eSChris Mason 
/*
 * Length of a full stripe for the chunk containing @logical.
 *
 * For RAID5/6 chunks this is the number of data stripes times the stripe
 * length; everywhere else it is just the sector size.
 */
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
				    u64 logical)
{
	unsigned long len = fs_info->sectorsize;
	struct extent_map *em;

	/* No RAID56 chunks can exist without the incompat bit. */
	if (!btrfs_fs_incompat(fs_info, RAID56))
		return len;

	em = btrfs_get_chunk_map(fs_info, logical, len);
	if (!WARN_ON(IS_ERR(em))) {
		struct map_lookup *map = em->map_lookup;

		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			len = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
		free_extent_map(em);
	}
	return len;
}
585253b381b3SDavid Woodhouse 
/* Return 1 if the chunk containing @logical uses a RAID5/6 profile. */
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct extent_map *em;
	int ret = 0;

	/* Without the incompat bit there can be no RAID56 chunks at all. */
	if (!btrfs_fs_incompat(fs_info, RAID56))
		return 0;

	em = btrfs_get_chunk_map(fs_info, logical, len);
	if (!WARN_ON(IS_ERR(em))) {
		if (em->map_lookup->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			ret = 1;
		free_extent_map(em);
	}
	return ret;
}
587253b381b3SDavid Woodhouse 
/*
 * Pick a live mirror stripe to read from, for RAID1-like and RAID10 chunks.
 *
 * @first: index of the first stripe of the mirror set within @map.
 * @dev_replace_is_ongoing: when set, prefer mirrors other than the replace
 *	source device if possible.
 *
 * Returns a stripe index in [first, first + num_stripes). Always returns
 * something, even if no fully healthy mirror exists; IO error handling
 * deals with a bad choice later.
 */
static int find_live_mirror(struct btrfs_fs_info *fs_info,
			    struct map_lookup *map, int first,
			    int dev_replace_is_ongoing)
{
	int i;
	int num_stripes;
	int preferred_mirror;
	int tolerance;
	struct btrfs_device *srcdev;

	/* Only mirrored profiles have a choice of copies to read from. */
	ASSERT((map->type &
		 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));

	/* For RAID10 a mirror set spans sub_stripes, not all stripes. */
	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		num_stripes = map->sub_stripes;
	else
		num_stripes = map->num_stripes;

	/*
	 * Note: default comes first so an unknown policy value falls
	 * through into the PID policy after being reset.
	 */
	switch (fs_info->fs_devices->read_policy) {
	default:
		/* Shouldn't happen, just warn and use pid instead of failing */
		btrfs_warn_rl(fs_info,
			      "unknown read_policy type %u, reset to pid",
			      fs_info->fs_devices->read_policy);
		fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
		fallthrough;
	case BTRFS_READ_POLICY_PID:
		/* Spread readers across mirrors by process id. */
		preferred_mirror = first + (current->pid % num_stripes);
		break;
	}

	if (dev_replace_is_ongoing &&
	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
		srcdev = fs_info->dev_replace.srcdev;
	else
		srcdev = NULL;

	/*
	 * try to avoid the drive that is the source drive for a
	 * dev-replace procedure, only choose it if no other non-missing
	 * mirror is available
	 */
	for (tolerance = 0; tolerance < 2; tolerance++) {
		/* Pass 0 excludes srcdev; pass 1 (tolerance) allows it. */
		if (map->stripes[preferred_mirror].dev->bdev &&
		    (tolerance || map->stripes[preferred_mirror].dev != srcdev))
			return preferred_mirror;
		for (i = first; i < first + num_stripes; i++) {
			if (map->stripes[i].dev->bdev &&
			    (tolerance || map->stripes[i].dev != srcdev))
				return i;
		}
	}

	/* we couldn't find one that doesn't fail.  Just return something
	 * and the io error handling code will clean up eventually
	 */
	return preferred_mirror;
}
5932dfe25020SChris Mason 
/*
 * Allocate a btrfs_io_context together with its trailing array of
 * @total_stripes btrfs_io_stripe entries, with a single reference held.
 *
 * Returns NULL on allocation failure; free with btrfs_put_bioc().
 */
static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
						       u16 total_stripes)
{
	struct btrfs_io_context *bioc;

	/*
	 * struct_size() computes sizeof(*bioc) plus the flexible stripes[]
	 * array and is safe against size-calculation overflow, unlike the
	 * open-coded sizeof() + sizeof() * n form.
	 */
	bioc = kzalloc(struct_size(bioc, stripes, total_stripes), GFP_NOFS);
	if (!bioc)
		return NULL;

	refcount_set(&bioc->refs, 1);

	bioc->fs_info = fs_info;
	/* No replace source stripe recorded yet. */
	bioc->replace_stripe_src = -1;
	/* Only meaningful for RAID56 full-stripe maps; -1 means unset. */
	bioc->full_stripe_logical = (u64)-1;

	return bioc;
}
59566e9606d2SZhao Lei 
btrfs_get_bioc(struct btrfs_io_context * bioc)59574c664611SQu Wenruo void btrfs_get_bioc(struct btrfs_io_context *bioc)
59586e9606d2SZhao Lei {
59594c664611SQu Wenruo 	WARN_ON(!refcount_read(&bioc->refs));
59604c664611SQu Wenruo 	refcount_inc(&bioc->refs);
59616e9606d2SZhao Lei }
59626e9606d2SZhao Lei 
btrfs_put_bioc(struct btrfs_io_context * bioc)59634c664611SQu Wenruo void btrfs_put_bioc(struct btrfs_io_context *bioc)
59646e9606d2SZhao Lei {
59654c664611SQu Wenruo 	if (!bioc)
59666e9606d2SZhao Lei 		return;
59674c664611SQu Wenruo 	if (refcount_dec_and_test(&bioc->refs))
59684c664611SQu Wenruo 		kfree(bioc);
59696e9606d2SZhao Lei }
59706e9606d2SZhao Lei 
/*
 * Map a discard request covering [@logical, @logical + *@length_ret) to the
 * physical device ranges it spans.
 *
 * On success returns a kcalloc'ed array of *@num_stripes entries (caller
 * frees with kfree()) and clamps *@length_ret to the containing chunk.
 * Returns an ERR_PTR on failure (-EOPNOTSUPP for RAID56, -ENOMEM, or the
 * chunk-map lookup error).
 *
 * Please note that, discard won't be sent to target device of device
 * replace.
 */
struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
					       u64 logical, u64 *length_ret,
					       u32 *num_stripes)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_discard_stripe *stripes;
	u64 length = *length_ret;
	u64 offset;
	u32 stripe_nr;
	u32 stripe_nr_end;
	u32 stripe_cnt;
	u64 stripe_end_offset;
	u64 stripe_offset;
	u32 stripe_index;
	u32 factor = 0;
	u32 sub_stripes = 0;
	u32 stripes_per_dev = 0;
	u32 remaining_stripes = 0;
	u32 last_stripe = 0;
	int ret;
	int i;

	em = btrfs_get_chunk_map(fs_info, logical, length);
	if (IS_ERR(em))
		return ERR_CAST(em);

	map = em->map_lookup;

	/* we don't discard raid56 yet */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ret = -EOPNOTSUPP;
		goto out_free_map;
	}

	offset = logical - em->start;
	/* Clamp the request to the end of this chunk and report it back. */
	length = min_t(u64, em->start + em->len - logical, length);
	*length_ret = length;

	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr);

	/* First stripe index past the end, and the unused tail of it. */
	stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >>
			BTRFS_STRIPE_LEN_SHIFT;
	stripe_cnt = stripe_nr_end - stripe_nr;
	stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr_end) -
			    (offset + length);
	/*
	 * after this, stripe_nr is the number of stripes on this
	 * device we have to walk to find the data, and stripe_index is
	 * the number of our device in the stripe array
	 */
	*num_stripes = 1;
	stripe_index = 0;
	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			 BTRFS_BLOCK_GROUP_RAID10)) {
		/* RAID0 behaves like RAID10 with a mirror width of 1. */
		if (map->type & BTRFS_BLOCK_GROUP_RAID0)
			sub_stripes = 1;
		else
			sub_stripes = map->sub_stripes;

		factor = map->num_stripes / sub_stripes;
		*num_stripes = min_t(u64, map->num_stripes,
				    sub_stripes * stripe_cnt);
		stripe_index = stripe_nr % factor;
		stripe_nr /= factor;
		stripe_index *= sub_stripes;

		remaining_stripes = stripe_cnt % factor;
		stripes_per_dev = stripe_cnt / factor;
		last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes;
	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
				BTRFS_BLOCK_GROUP_DUP)) {
		/* Mirrored profiles: discard the same range on every copy. */
		*num_stripes = map->num_stripes;
	} else {
		/* SINGLE: one stripe, rotated across the devices. */
		stripe_index = stripe_nr % map->num_stripes;
		stripe_nr /= map->num_stripes;
	}

	stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
	if (!stripes) {
		ret = -ENOMEM;
		goto out_free_map;
	}

	for (i = 0; i < *num_stripes; i++) {
		stripes[i].physical =
			map->stripes[stripe_index].physical +
			stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
		stripes[i].dev = map->stripes[stripe_index].dev;

		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
				 BTRFS_BLOCK_GROUP_RAID10)) {
			/* Base length: full stripes this device receives. */
			stripes[i].length = btrfs_stripe_nr_to_offset(stripes_per_dev);

			/* Devices before the remainder get one extra stripe. */
			if (i / sub_stripes < remaining_stripes)
				stripes[i].length += BTRFS_STRIPE_LEN;

			/*
			 * Special for the first stripe and
			 * the last stripe:
			 *
			 * |-------|...|-------|
			 *     |----------|
			 *    off     end_off
			 */
			if (i < sub_stripes)
				stripes[i].length -= stripe_offset;

			if (stripe_index >= last_stripe &&
			    stripe_index <= (last_stripe +
					     sub_stripes - 1))
				stripes[i].length -= stripe_end_offset;

			/* stripe_offset only trims the first mirror group. */
			if (i == sub_stripes - 1)
				stripe_offset = 0;
		} else {
			/* Mirrored/SINGLE: whole (clamped) length per copy. */
			stripes[i].length = length;
		}

		stripe_index++;
		if (stripe_index == map->num_stripes) {
			stripe_index = 0;
			stripe_nr++;
		}
	}

	free_extent_map(em);
	return stripes;
out_free_map:
	free_extent_map(em);
	return ERR_PTR(ret);
}
61140b3d4cd3SLiu Bo 
is_block_group_to_copy(struct btrfs_fs_info * fs_info,u64 logical)61156143c23cSNaohiro Aota static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
61166143c23cSNaohiro Aota {
61176143c23cSNaohiro Aota 	struct btrfs_block_group *cache;
61186143c23cSNaohiro Aota 	bool ret;
61196143c23cSNaohiro Aota 
6120de17addcSNaohiro Aota 	/* Non zoned filesystem does not use "to_copy" flag */
61216143c23cSNaohiro Aota 	if (!btrfs_is_zoned(fs_info))
61226143c23cSNaohiro Aota 		return false;
61236143c23cSNaohiro Aota 
61246143c23cSNaohiro Aota 	cache = btrfs_lookup_block_group(fs_info, logical);
61256143c23cSNaohiro Aota 
61263349b57fSJosef Bacik 	ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
61276143c23cSNaohiro Aota 
61286143c23cSNaohiro Aota 	btrfs_put_block_group(cache);
61296143c23cSNaohiro Aota 	return ret;
61306143c23cSNaohiro Aota }
61316143c23cSNaohiro Aota 
/*
 * Adjust @bioc's stripes for an ongoing device replace: every stripe that
 * targets the replace source device gets a duplicate stripe appended that
 * targets the replace target device, so writes land on both disks.
 *
 * *@num_stripes_ret and *@max_errors_ret are increased by the number of
 * stripes added; the duplication is recorded in @bioc via
 * replace_nr_stripes and (for RAID56) replace_stripe_src.
 */
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
				      struct btrfs_io_context *bioc,
				      struct btrfs_dev_replace *dev_replace,
				      u64 logical,
				      int *num_stripes_ret, int *max_errors_ret)
{
	u64 srcdev_devid = dev_replace->srcdev->devid;
	/*
	 * At this stage, num_stripes is still the real number of stripes,
	 * excluding the duplicated stripes.
	 */
	int num_stripes = *num_stripes_ret;
	int nr_extra_stripes = 0;
	int max_errors = *max_errors_ret;
	int i;

	/*
	 * A block group which has "to_copy" set will eventually be copied by
	 * the dev-replace process. We can avoid cloning IO here.
	 */
	if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
		return;

	/*
	 * Duplicate the write operations while the dev-replace procedure is
	 * running. Since the copying of the old disk to the new disk takes
	 * place at run time while the filesystem is mounted writable, the
	 * regular write operations to the old disk have to be duplicated to go
	 * to the new disk as well.
	 *
	 * Note that device->missing is handled by the caller, and that the
	 * write to the old disk is already set up in the stripes array.
	 */
	for (i = 0; i < num_stripes; i++) {
		struct btrfs_io_stripe *old = &bioc->stripes[i];
		struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes];

		if (old->dev->devid != srcdev_devid)
			continue;

		/* Clone the stripe, redirected at the replace target. */
		new->physical = old->physical;
		new->dev = dev_replace->tgtdev;
		if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			bioc->replace_stripe_src = i;
		nr_extra_stripes++;
	}

	/* We can only have at most 2 extra nr_stripes (for DUP). */
	ASSERT(nr_extra_stripes <= 2);
	/*
	 * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for
	 * replace.
	 * If we have 2 extra stripes, only choose the one with smaller physical.
	 */
	if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
		struct btrfs_io_stripe *first = &bioc->stripes[num_stripes];
		struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];

		/* Only DUP can have two extra stripes. */
		ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP);

		/*
		 * Swap the last stripe stripes and reduce @nr_extra_stripes.
		 * The extra stripe would still be there, but won't be accessed.
		 */
		if (first->physical > second->physical) {
			swap(second->physical, first->physical);
			swap(second->dev, first->dev);
			nr_extra_stripes--;
		}
	}

	*num_stripes_ret = num_stripes + nr_extra_stripes;
	*max_errors_ret = max_errors + nr_extra_stripes;
	bioc->replace_nr_stripes = nr_extra_stripes;
}
620873c0f228SLiu Bo 
/*
 * Compute how far an I/O starting at chunk-relative @offset may extend
 * without crossing a boundary the caller would have to split at: a single
 * stripe for striped profiles, a full stripe set for RAID56 writes, or
 * unlimited for unstriped (mirrored/SINGLE) profiles.
 *
 * Outputs *@stripe_nr and *@stripe_offset for @offset, and for RAID56 also
 * *@full_stripe_start.  Returns the maximum I/O length from @offset.
 */
static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
			    u64 offset, u32 *stripe_nr, u64 *stripe_offset,
			    u64 *full_stripe_start)
{
	/*
	 * Stripe_nr is the stripe where this block falls.  stripe_offset is
	 * the offset of this block in its stripe.
	 */
	*stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
	*stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
	ASSERT(*stripe_offset < U32_MAX);

	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		unsigned long full_stripe_len =
			btrfs_stripe_nr_to_offset(nr_data_stripes(map));

		/*
		 * For full stripe start, we use previously calculated
		 * @stripe_nr. Align it to nr_data_stripes, then multiply with
		 * STRIPE_LEN.
		 *
		 * By this we can avoid u64 division completely.  And we have
		 * to go rounddown(), not round_down(), as nr_data_stripes is
		 * not ensured to be power of 2.
		 */
		*full_stripe_start =
			btrfs_stripe_nr_to_offset(
				rounddown(*stripe_nr, nr_data_stripes(map)));

		/* @offset must fall inside the computed full stripe. */
		ASSERT(*full_stripe_start + full_stripe_len > offset);
		ASSERT(*full_stripe_start <= offset);
		/*
		 * For writes to RAID56, allow to write a full stripe set, but
		 * no straddling of stripe sets.
		 */
		if (op == BTRFS_MAP_WRITE)
			return full_stripe_len - (offset - *full_stripe_start);
	}

	/*
	 * For other RAID types and for RAID56 reads, allow a single stripe (on
	 * a single disk).
	 */
	if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
		return BTRFS_STRIPE_LEN - *stripe_offset;
	return U64_MAX;
}
62565f141126SNikolay Borisov 
/*
 * Fill @dst with the device and physical byte offset that
 * (@stripe_index, @stripe_nr, @stripe_offset) resolves to within @map.
 */
static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
			  u32 stripe_index, u64 stripe_offset, u32 stripe_nr)
{
	dst->dev = map->stripes[stripe_index].dev;
	dst->physical = stripe_offset +
			btrfs_stripe_nr_to_offset(stripe_nr) +
			map->stripes[stripe_index].physical;
}
626403793cbbSChristoph Hellwig 
6265ed3764f7SQu Wenruo /*
6266ed3764f7SQu Wenruo  * Map one logical range to one or more physical ranges.
6267ed3764f7SQu Wenruo  *
6268ed3764f7SQu Wenruo  * @length:		(Mandatory) mapped length of this run.
6269ed3764f7SQu Wenruo  *			One logical range can be split into different segments
6270ed3764f7SQu Wenruo  *			due to factors like zones and RAID0/5/6/10 stripe
6271ed3764f7SQu Wenruo  *			boundaries.
6272ed3764f7SQu Wenruo  *
6273ed3764f7SQu Wenruo  * @bioc_ret:		(Mandatory) returned btrfs_io_context structure.
6274ed3764f7SQu Wenruo  *			which has one or more physical ranges (btrfs_io_stripe)
6275ed3764f7SQu Wenruo  *			recorded inside.
6276ed3764f7SQu Wenruo  *			Caller should call btrfs_put_bioc() to free it after use.
6277ed3764f7SQu Wenruo  *
6278ed3764f7SQu Wenruo  * @smap:		(Optional) single physical range optimization.
6279ed3764f7SQu Wenruo  *			If the map request can be fulfilled by one single
6280ed3764f7SQu Wenruo  *			physical range, and this parameter is not NULL,
6281ed3764f7SQu Wenruo  *			then @bioc_ret would be NULL, and @smap would be
6282ed3764f7SQu Wenruo  *			updated.
6283ed3764f7SQu Wenruo  *
6284ed3764f7SQu Wenruo  * @mirror_num_ret:	(Mandatory) returned mirror number if the original
6285ed3764f7SQu Wenruo  *			value is 0.
6286ed3764f7SQu Wenruo  *
6287ed3764f7SQu Wenruo  *			Mirror number 0 means to choose any live mirrors.
6288ed3764f7SQu Wenruo  *
6289ed3764f7SQu Wenruo  *			For non-RAID56 profiles, non-zero mirror_num means
6290ed3764f7SQu Wenruo  *			the Nth mirror. (e.g. mirror_num 1 means the first
6291ed3764f7SQu Wenruo  *			copy).
6292ed3764f7SQu Wenruo  *
6293ed3764f7SQu Wenruo  *			For RAID56 profile, mirror 1 means rebuild from P and
6294ed3764f7SQu Wenruo  *			the remaining data stripes.
6295ed3764f7SQu Wenruo  *
6296ed3764f7SQu Wenruo  *			For RAID6 profile, mirror > 2 means mark another
6297ed3764f7SQu Wenruo  *			data/P stripe error and rebuild from the remaining
6298ed3764f7SQu Wenruo  *			stripes.
6299ed3764f7SQu Wenruo  *
6300ed3764f7SQu Wenruo  * @need_raid_map:	(Used only for integrity checker) whether the map wants
6301ed3764f7SQu Wenruo  *                      a full stripe map (including all data and P/Q stripes)
6302ed3764f7SQu Wenruo  *                      for RAID56. Should always be 1 except integrity checker.
6303ed3764f7SQu Wenruo  */
btrfs_map_block(struct btrfs_fs_info * fs_info,enum btrfs_map_op op,u64 logical,u64 * length,struct btrfs_io_context ** bioc_ret,struct btrfs_io_stripe * smap,int * mirror_num_ret,int need_raid_map)6304cd4efd21SChristoph Hellwig int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6305103c1972SChristoph Hellwig 		    u64 logical, u64 *length,
63064c664611SQu Wenruo 		    struct btrfs_io_context **bioc_ret,
6307103c1972SChristoph Hellwig 		    struct btrfs_io_stripe *smap, int *mirror_num_ret,
6308103c1972SChristoph Hellwig 		    int need_raid_map)
63090b86a832SChris Mason {
63100b86a832SChris Mason 	struct extent_map *em;
63110b86a832SChris Mason 	struct map_lookup *map;
6312f8a02dc6SChristoph Hellwig 	u64 map_offset;
6313593060d7SChris Mason 	u64 stripe_offset;
63146ded22c1SQu Wenruo 	u32 stripe_nr;
63159d644a62SDavid Sterba 	u32 stripe_index;
6316cff82672SDavid Sterba 	int data_stripes;
6317cea9e445SChris Mason 	int i;
6318de11cc12SLi Zefan 	int ret = 0;
631903793cbbSChristoph Hellwig 	int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
6320f2d8d74dSChris Mason 	int num_stripes;
63215f50fa91SQu Wenruo 	int num_copies;
6322a236aed1SChris Mason 	int max_errors = 0;
63234c664611SQu Wenruo 	struct btrfs_io_context *bioc = NULL;
6324472262f3SStefan Behrens 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
6325472262f3SStefan Behrens 	int dev_replace_is_ongoing = 0;
63264ced85f8SQu Wenruo 	u16 num_alloc_stripes;
632753b381b3SDavid Woodhouse 	u64 raid56_full_stripe_start = (u64)-1;
6328f8a02dc6SChristoph Hellwig 	u64 max_len;
632989b798adSNikolay Borisov 
63304c664611SQu Wenruo 	ASSERT(bioc_ret);
63310b3d4cd3SLiu Bo 
63325f50fa91SQu Wenruo 	num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize);
63335f50fa91SQu Wenruo 	if (mirror_num > num_copies)
63345f50fa91SQu Wenruo 		return -EINVAL;
63355f50fa91SQu Wenruo 
633642034313SMichal Rostecki 	em = btrfs_get_chunk_map(fs_info, logical, *length);
63371c3ab6dfSQu Wenruo 	if (IS_ERR(em))
63381c3ab6dfSQu Wenruo 		return PTR_ERR(em);
633942034313SMichal Rostecki 
634095617d69SJeff Mahoney 	map = em->map_lookup;
6341cff82672SDavid Sterba 	data_stripes = nr_data_stripes(map);
6342f8a02dc6SChristoph Hellwig 
6343f8a02dc6SChristoph Hellwig 	map_offset = logical - em->start;
6344f8a02dc6SChristoph Hellwig 	max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr,
6345f8a02dc6SChristoph Hellwig 				   &stripe_offset, &raid56_full_stripe_start);
6346f8a02dc6SChristoph Hellwig 	*length = min_t(u64, em->len - map_offset, max_len);
6347593060d7SChris Mason 
6348a5bc4e03SJohannes Thumshirn 	if (dev_replace->replace_task != current)
6349cb5583ddSDavid Sterba 		down_read(&dev_replace->rwsem);
6350a5bc4e03SJohannes Thumshirn 
6351472262f3SStefan Behrens 	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
635253176ddeSDavid Sterba 	/*
635353176ddeSDavid Sterba 	 * Hold the semaphore for read during the whole operation, write is
635453176ddeSDavid Sterba 	 * requested at commit time but must wait.
635553176ddeSDavid Sterba 	 */
6356a5bc4e03SJohannes Thumshirn 	if (!dev_replace_is_ongoing && dev_replace->replace_task != current)
6357cb5583ddSDavid Sterba 		up_read(&dev_replace->rwsem);
6358472262f3SStefan Behrens 
6359f2d8d74dSChris Mason 	num_stripes = 1;
6360cea9e445SChris Mason 	stripe_index = 0;
6361fce3bb9aSLi Dongyang 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
63626ded22c1SQu Wenruo 		stripe_index = stripe_nr % map->num_stripes;
63636ded22c1SQu Wenruo 		stripe_nr /= map->num_stripes;
63648680e587SChristoph Hellwig 		if (op == BTRFS_MAP_READ)
636528e1cc7dSMiao Xie 			mirror_num = 1;
6366c7369b3fSDavid Sterba 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
63678680e587SChristoph Hellwig 		if (op != BTRFS_MAP_READ) {
6368f2d8d74dSChris Mason 			num_stripes = map->num_stripes;
63698680e587SChristoph Hellwig 		} else if (mirror_num) {
6370f188591eSChris Mason 			stripe_index = mirror_num - 1;
63718680e587SChristoph Hellwig 		} else {
637230d9861fSStefan Behrens 			stripe_index = find_live_mirror(fs_info, map, 0,
637330d9861fSStefan Behrens 					    dev_replace_is_ongoing);
6374a1d3c478SJan Schmidt 			mirror_num = stripe_index + 1;
6375dfe25020SChris Mason 		}
63762fff734fSChris Mason 
6377611f0e00SChris Mason 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
63788680e587SChristoph Hellwig 		if (op != BTRFS_MAP_READ) {
6379f2d8d74dSChris Mason 			num_stripes = map->num_stripes;
6380a1d3c478SJan Schmidt 		} else if (mirror_num) {
6381f188591eSChris Mason 			stripe_index = mirror_num - 1;
6382a1d3c478SJan Schmidt 		} else {
6383a1d3c478SJan Schmidt 			mirror_num = 1;
6384a1d3c478SJan Schmidt 		}
63852fff734fSChris Mason 
6386321aecc6SChris Mason 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
63879d644a62SDavid Sterba 		u32 factor = map->num_stripes / map->sub_stripes;
6388321aecc6SChris Mason 
63896ded22c1SQu Wenruo 		stripe_index = (stripe_nr % factor) * map->sub_stripes;
63906ded22c1SQu Wenruo 		stripe_nr /= factor;
6391321aecc6SChris Mason 
63928680e587SChristoph Hellwig 		if (op != BTRFS_MAP_READ)
6393f2d8d74dSChris Mason 			num_stripes = map->sub_stripes;
6394321aecc6SChris Mason 		else if (mirror_num)
6395321aecc6SChris Mason 			stripe_index += mirror_num - 1;
6396dfe25020SChris Mason 		else {
63973e74317aSJan Schmidt 			int old_stripe_index = stripe_index;
639830d9861fSStefan Behrens 			stripe_index = find_live_mirror(fs_info, map,
639930d9861fSStefan Behrens 					      stripe_index,
640030d9861fSStefan Behrens 					      dev_replace_is_ongoing);
64013e74317aSJan Schmidt 			mirror_num = stripe_index - old_stripe_index + 1;
6402dfe25020SChris Mason 		}
640353b381b3SDavid Woodhouse 
6404ffe2d203SZhao Lei 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
64058680e587SChristoph Hellwig 		if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) {
64066ded22c1SQu Wenruo 			/*
64076ded22c1SQu Wenruo 			 * Push stripe_nr back to the start of the full stripe
64086ded22c1SQu Wenruo 			 * For those cases needing a full stripe, @stripe_nr
64096ded22c1SQu Wenruo 			 * is the full stripe number.
64106ded22c1SQu Wenruo 			 *
64116ded22c1SQu Wenruo 			 * Originally we go raid56_full_stripe_start / full_stripe_len,
64126ded22c1SQu Wenruo 			 * but that can be expensive.  Here we just divide
64136ded22c1SQu Wenruo 			 * @stripe_nr with @data_stripes.
64146ded22c1SQu Wenruo 			 */
64156ded22c1SQu Wenruo 			stripe_nr /= data_stripes;
641653b381b3SDavid Woodhouse 
641753b381b3SDavid Woodhouse 			/* RAID[56] write or recovery. Return all stripes */
641853b381b3SDavid Woodhouse 			num_stripes = map->num_stripes;
64196dead96cSQu Wenruo 			max_errors = btrfs_chunk_max_errors(map);
642053b381b3SDavid Woodhouse 
6421462b0b2aSQu Wenruo 			/* Return the length to the full stripe end */
6422462b0b2aSQu Wenruo 			*length = min(logical + *length,
6423462b0b2aSQu Wenruo 				      raid56_full_stripe_start + em->start +
6424cb091225SQu Wenruo 				      btrfs_stripe_nr_to_offset(data_stripes)) -
6425cb091225SQu Wenruo 				  logical;
642653b381b3SDavid Woodhouse 			stripe_index = 0;
642753b381b3SDavid Woodhouse 			stripe_offset = 0;
642853b381b3SDavid Woodhouse 		} else {
642953b381b3SDavid Woodhouse 			/*
643053b381b3SDavid Woodhouse 			 * Mirror #0 or #1 means the original data block.
643153b381b3SDavid Woodhouse 			 * Mirror #2 is RAID5 parity block.
643253b381b3SDavid Woodhouse 			 * Mirror #3 is RAID6 Q block.
643353b381b3SDavid Woodhouse 			 */
64346ded22c1SQu Wenruo 			stripe_index = stripe_nr % data_stripes;
64356ded22c1SQu Wenruo 			stripe_nr /= data_stripes;
643653b381b3SDavid Woodhouse 			if (mirror_num > 1)
6437cff82672SDavid Sterba 				stripe_index = data_stripes + mirror_num - 2;
643853b381b3SDavid Woodhouse 
643953b381b3SDavid Woodhouse 			/* We distribute the parity blocks across stripes */
64406ded22c1SQu Wenruo 			stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
64418680e587SChristoph Hellwig 			if (op == BTRFS_MAP_READ && mirror_num <= 1)
644228e1cc7dSMiao Xie 				mirror_num = 1;
644353b381b3SDavid Woodhouse 		}
64448790d502SChris Mason 	} else {
6445593060d7SChris Mason 		/*
64466ded22c1SQu Wenruo 		 * After this, stripe_nr is the number of stripes on this
644747c5713fSDavid Sterba 		 * device we have to walk to find the data, and stripe_index is
644847c5713fSDavid Sterba 		 * the number of our device in the stripe array
6449593060d7SChris Mason 		 */
64506ded22c1SQu Wenruo 		stripe_index = stripe_nr % map->num_stripes;
64516ded22c1SQu Wenruo 		stripe_nr /= map->num_stripes;
6452a1d3c478SJan Schmidt 		mirror_num = stripe_index + 1;
64538790d502SChris Mason 	}
6454e042d1ecSJosef Bacik 	if (stripe_index >= map->num_stripes) {
64555d163e0eSJeff Mahoney 		btrfs_crit(fs_info,
64565d163e0eSJeff Mahoney 			   "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
6457e042d1ecSJosef Bacik 			   stripe_index, map->num_stripes);
6458e042d1ecSJosef Bacik 		ret = -EINVAL;
6459e042d1ecSJosef Bacik 		goto out;
6460e042d1ecSJosef Bacik 	}
6461593060d7SChris Mason 
6462472262f3SStefan Behrens 	num_alloc_stripes = num_stripes;
64631faf3885SQu Wenruo 	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
64641faf3885SQu Wenruo 	    op != BTRFS_MAP_READ)
64651faf3885SQu Wenruo 		/*
64661faf3885SQu Wenruo 		 * For replace case, we need to add extra stripes for extra
64671faf3885SQu Wenruo 		 * duplicated stripes.
64681faf3885SQu Wenruo 		 *
64691faf3885SQu Wenruo 		 * For both WRITE and GET_READ_MIRRORS, we may have at most
64701faf3885SQu Wenruo 		 * 2 more stripes (DUP types, otherwise 1).
64711faf3885SQu Wenruo 		 */
64721faf3885SQu Wenruo 		num_alloc_stripes += 2;
64732c8cdd6eSMiao Xie 
647403793cbbSChristoph Hellwig 	/*
647503793cbbSChristoph Hellwig 	 * If this I/O maps to a single device, try to return the device and
647603793cbbSChristoph Hellwig 	 * physical block information on the stack instead of allocating an
647703793cbbSChristoph Hellwig 	 * I/O context structure.
647803793cbbSChristoph Hellwig 	 */
647903793cbbSChristoph Hellwig 	if (smap && num_alloc_stripes == 1 &&
6480b2cc4400SChristoph Hellwig 	    !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) {
64815f50fa91SQu Wenruo 		set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
64824e7de35eSChristoph Hellwig 		if (mirror_num_ret)
648303793cbbSChristoph Hellwig 			*mirror_num_ret = mirror_num;
648403793cbbSChristoph Hellwig 		*bioc_ret = NULL;
648503793cbbSChristoph Hellwig 		ret = 0;
648603793cbbSChristoph Hellwig 		goto out;
648703793cbbSChristoph Hellwig 	}
648803793cbbSChristoph Hellwig 
64891faf3885SQu Wenruo 	bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
64904c664611SQu Wenruo 	if (!bioc) {
6491de11cc12SLi Zefan 		ret = -ENOMEM;
6492de11cc12SLi Zefan 		goto out;
6493de11cc12SLi Zefan 	}
64941faf3885SQu Wenruo 	bioc->map_type = map->type;
6495608769a4SNikolay Borisov 
649618d758a2SQu Wenruo 	/*
649718d758a2SQu Wenruo 	 * For RAID56 full map, we need to make sure the stripes[] follows the
649818d758a2SQu Wenruo 	 * rule that data stripes are all ordered, then followed with P and Q
649918d758a2SQu Wenruo 	 * (if we have).
650018d758a2SQu Wenruo 	 *
650118d758a2SQu Wenruo 	 * It's still mostly the same as other profiles, just with extra rotation.
650218d758a2SQu Wenruo 	 */
65032b19a1feSLiu Bo 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
65048680e587SChristoph Hellwig 	    (op != BTRFS_MAP_READ || mirror_num > 1)) {
650518d758a2SQu Wenruo 		/*
650618d758a2SQu Wenruo 		 * For RAID56 @stripe_nr is already the number of full stripes
650718d758a2SQu Wenruo 		 * before us, which is also the rotation value (needs to modulo
650818d758a2SQu Wenruo 		 * with num_stripes).
650918d758a2SQu Wenruo 		 *
651018d758a2SQu Wenruo 		 * In this case, we just add @stripe_nr with @i, then do the
651118d758a2SQu Wenruo 		 * modulo, to reduce one modulo call.
651218d758a2SQu Wenruo 		 */
651318d758a2SQu Wenruo 		bioc->full_stripe_logical = em->start +
6514cb091225SQu Wenruo 			btrfs_stripe_nr_to_offset(stripe_nr * data_stripes);
651518d758a2SQu Wenruo 		for (i = 0; i < num_stripes; i++)
651618d758a2SQu Wenruo 			set_io_stripe(&bioc->stripes[i], map,
651718d758a2SQu Wenruo 				      (i + stripe_nr) % num_stripes,
651818d758a2SQu Wenruo 				      stripe_offset, stripe_nr);
651918d758a2SQu Wenruo 	} else {
652018d758a2SQu Wenruo 		/*
652118d758a2SQu Wenruo 		 * For all other non-RAID56 profiles, just copy the target
652218d758a2SQu Wenruo 		 * stripe into the bioc.
652318d758a2SQu Wenruo 		 */
652418d758a2SQu Wenruo 		for (i = 0; i < num_stripes; i++) {
652518d758a2SQu Wenruo 			set_io_stripe(&bioc->stripes[i], map, stripe_index,
652618d758a2SQu Wenruo 				      stripe_offset, stripe_nr);
652718d758a2SQu Wenruo 			stripe_index++;
652818d758a2SQu Wenruo 		}
6529593060d7SChris Mason 	}
6530de11cc12SLi Zefan 
65318680e587SChristoph Hellwig 	if (op != BTRFS_MAP_READ)
6532d20983b4SMiao Xie 		max_errors = btrfs_chunk_max_errors(map);
6533de11cc12SLi Zefan 
653473c0f228SLiu Bo 	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
65358680e587SChristoph Hellwig 	    op != BTRFS_MAP_READ) {
6536be5c7edbSQu Wenruo 		handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
65376143c23cSNaohiro Aota 					  &num_stripes, &max_errors);
6538ad6d620eSStefan Behrens 	}
6539472262f3SStefan Behrens 
65404c664611SQu Wenruo 	*bioc_ret = bioc;
65414c664611SQu Wenruo 	bioc->num_stripes = num_stripes;
65424c664611SQu Wenruo 	bioc->max_errors = max_errors;
65434c664611SQu Wenruo 	bioc->mirror_num = mirror_num;
6544ad6d620eSStefan Behrens 
6545cea9e445SChris Mason out:
6546a5bc4e03SJohannes Thumshirn 	if (dev_replace_is_ongoing && dev_replace->replace_task != current) {
654753176ddeSDavid Sterba 		lockdep_assert_held(&dev_replace->rwsem);
654853176ddeSDavid Sterba 		/* Unlock and let waiting writers proceed */
6549cb5583ddSDavid Sterba 		up_read(&dev_replace->rwsem);
655073beece9SLiu Bo 	}
65510b86a832SChris Mason 	free_extent_map(em);
6552de11cc12SLi Zefan 	return ret;
65530b86a832SChris Mason }
65540b86a832SChris Mason 
dev_args_match_fs_devices(const struct btrfs_dev_lookup_args * args,const struct btrfs_fs_devices * fs_devices)6555562d7b15SJosef Bacik static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
6556562d7b15SJosef Bacik 				      const struct btrfs_fs_devices *fs_devices)
6557562d7b15SJosef Bacik {
6558562d7b15SJosef Bacik 	if (args->fsid == NULL)
6559562d7b15SJosef Bacik 		return true;
6560562d7b15SJosef Bacik 	if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
6561562d7b15SJosef Bacik 		return true;
6562562d7b15SJosef Bacik 	return false;
6563562d7b15SJosef Bacik }
6564562d7b15SJosef Bacik 
dev_args_match_device(const struct btrfs_dev_lookup_args * args,const struct btrfs_device * device)6565562d7b15SJosef Bacik static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
6566562d7b15SJosef Bacik 				  const struct btrfs_device *device)
6567562d7b15SJosef Bacik {
65680fca385dSLiu Shixin 	if (args->missing) {
6569562d7b15SJosef Bacik 		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
6570562d7b15SJosef Bacik 		    !device->bdev)
6571562d7b15SJosef Bacik 			return true;
6572562d7b15SJosef Bacik 		return false;
6573562d7b15SJosef Bacik 	}
6574562d7b15SJosef Bacik 
65750fca385dSLiu Shixin 	if (device->devid != args->devid)
65760fca385dSLiu Shixin 		return false;
65770fca385dSLiu Shixin 	if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
65780fca385dSLiu Shixin 		return false;
65790fca385dSLiu Shixin 	return true;
65800fca385dSLiu Shixin }
65810fca385dSLiu Shixin 
/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 *
 * The primary device list of @fs_devices is searched first (subject to the
 * fsid constraint in @args), then each attached seed filesystem's list.
 */
struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
				       const struct btrfs_dev_lookup_args *args)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *seed_devs;

	/* Search the main (sprouted) filesystem's own device list first. */
	if (dev_args_match_fs_devices(args, fs_devices)) {
		list_for_each_entry(device, &fs_devices->devices, dev_list) {
			if (dev_args_match_device(args, device))
				return device;
		}
	}

	/* Fall back to any seed filesystems anchored on this one. */
	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		if (!dev_args_match_fs_devices(args, seed_devs))
			continue;
		list_for_each_entry(device, &seed_devs->devices, dev_list) {
			if (dev_args_match_device(args, device))
				return device;
		}
	}

	return NULL;
}
66130b86a832SChris Mason 
/*
 * Allocate a placeholder btrfs_device for a device that is referenced by the
 * metadata but not currently present, link it into @fs_devices and mark it
 * MISSING.  Returns the new device or an ERR_PTR from btrfs_alloc_device().
 */
static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
					    u64 devid, u8 *dev_uuid)
{
	struct btrfs_device *device;
	unsigned int nofs_flag;

	/*
	 * We call this under the chunk_mutex, so we want to use NOFS for this
	 * allocation, however we don't want to change btrfs_alloc_device() to
	 * always do NOFS because we use it in a lot of other GFP_KERNEL safe
	 * places.
	 */

	nofs_flag = memalloc_nofs_save();
	device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(device))
		return device;

	/* Link into the fs_devices list and account for the new member. */
	list_add(&device->dev_list, &fs_devices->devices);
	device->fs_devices = fs_devices;
	fs_devices->num_devices++;

	/* No bdev is attached; record it as a missing device. */
	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
	fs_devices->missing_devices++;

	return device;
}
6642dfe25020SChris Mason 
/*
 * Allocate new device struct, set up devid and UUID.
 *
 * @fs_info:	used only for generating a new devid, can be NULL if
 *		devid is provided (i.e. @devid != NULL).
 * @devid:	a pointer to devid for this device.  If NULL a new devid
 *		is generated.
 * @uuid:	a pointer to UUID for this device.  If NULL a new UUID
 *		is generated.
 * @path:	a pointer to device path if available, NULL otherwise.
 *
 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
 * on error.  Returned struct is not linked onto any lists and must be
 * destroyed with btrfs_free_device.
 */
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
					const u64 *devid, const u8 *uuid,
					const char *path)
{
	struct btrfs_device *dev;
	u64 tmp;

	/* A devid source is mandatory: either an explicit one or an fs_info. */
	if (WARN_ON(!devid && !fs_info))
		return ERR_PTR(-EINVAL);

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->post_commit_list);

	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE);

	if (devid)
		tmp = *devid;
	else {
		int ret;

		/* Derive the next free devid from the existing device items. */
		ret = find_next_devid(fs_info, &tmp);
		if (ret) {
			btrfs_free_device(dev);
			return ERR_PTR(ret);
		}
	}
	dev->devid = tmp;

	if (uuid)
		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
	else
		generate_random_uuid(dev->uuid);

	if (path) {
		struct rcu_string *name;

		/* Device names are RCU-protected strings. */
		name = rcu_string_strdup(path, GFP_KERNEL);
		if (!name) {
			btrfs_free_device(dev);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(dev->name, name);
	}

	return dev;
}
671112bd2fc0SIlya Dryomov 
/*
 * Log (ratelimited) that a device is missing: at error severity when
 * @error is true (mount without -o degraded), warning severity otherwise.
 */
static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
					u64 devid, u8 *uuid, bool error)
{
	if (error)
		btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
	else
		btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
}
67225a2b8e60SAnand Jain 
btrfs_calc_stripe_length(const struct extent_map * em)6723bc88b486SQu Wenruo u64 btrfs_calc_stripe_length(const struct extent_map *em)
672439e264a4SNikolay Borisov {
6725bc88b486SQu Wenruo 	const struct map_lookup *map = em->map_lookup;
6726bc88b486SQu Wenruo 	const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
6727e4f6c6beSDavid Sterba 
6728bc88b486SQu Wenruo 	return div_u64(em->len, data_stripes);
672939e264a4SNikolay Borisov }
673039e264a4SNikolay Borisov 
6731e9306ad4SQu Wenruo #if BITS_PER_LONG == 32
6732e9306ad4SQu Wenruo /*
6733e9306ad4SQu Wenruo  * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
6734e9306ad4SQu Wenruo  * can't be accessed on 32bit systems.
6735e9306ad4SQu Wenruo  *
6736e9306ad4SQu Wenruo  * This function do mount time check to reject the fs if it already has
6737e9306ad4SQu Wenruo  * metadata chunk beyond that limit.
6738e9306ad4SQu Wenruo  */
static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
				  u64 logical, u64 length, u64 type)
{
	/*
	 * Only metadata chunks are subject to the limit, and only when the
	 * chunk end reaches MAX_LFS_FILESIZE.
	 */
	if (!(type & BTRFS_BLOCK_GROUP_METADATA) ||
	    logical + length < MAX_LFS_FILESIZE)
		return 0;

	btrfs_err_32bit_limit(fs_info);
	return -EOVERFLOW;
}
6751e9306ad4SQu Wenruo 
6752e9306ad4SQu Wenruo /*
6753e9306ad4SQu Wenruo  * This is to give early warning for any metadata chunk reaching
6754e9306ad4SQu Wenruo  * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
6755e9306ad4SQu Wenruo  * Although we can still access the metadata, it's not going to be possible
6756e9306ad4SQu Wenruo  * once the limit is reached.
6757e9306ad4SQu Wenruo  */
static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
				  u64 logical, u64 length, u64 type)
{
	/* Warn only for metadata chunks that reach the early-warn threshold. */
	if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
	    logical + length >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
		btrfs_warn_32bit_limit(fs_info);
}
6769e9306ad4SQu Wenruo #endif
6770e9306ad4SQu Wenruo 
/*
 * React to a devid referenced by a chunk item but absent from the device
 * lists.  Without -o degraded this is fatal (-ENOENT); with it, create a
 * MISSING placeholder device and log the condition at warning level.
 */
static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
						  u64 devid, u8 *uuid)
{
	struct btrfs_device *dev;

	if (!btrfs_test_opt(fs_info, DEGRADED)) {
		btrfs_report_missing_device(fs_info, devid, uuid, true);
		return ERR_PTR(-ENOENT);
	}

	dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
	if (IS_ERR(dev))
		btrfs_err(fs_info, "failed to init missing device %llu: %ld",
			  devid, PTR_ERR(dev));
	else
		btrfs_report_missing_device(fs_info, devid, uuid, false);

	return dev;
}
6791ff37c89fSNikolay Borisov 
/*
 * Turn one on-disk chunk item into an in-memory extent_map/map_lookup pair
 * and insert it into fs_info->mapping_tree.
 *
 * @key:   the chunk item's key; key->offset is the chunk's logical start.
 * @leaf:  extent buffer holding the chunk item (a tree leaf, or the
 *         superblock's sys chunk array).
 * @chunk: pointer to the chunk item inside @leaf.
 *
 * Returns 0 on success (including when the range is already mapped) or a
 * negative errno.
 */
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
			  struct btrfs_chunk *chunk)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	u64 logical;
	u64 length;
	u64 devid;
	u64 type;
	u8 uuid[BTRFS_UUID_SIZE];
	int index;
	int num_stripes;
	int ret;
	int i;

	logical = key->offset;
	length = btrfs_chunk_length(leaf, chunk);
	type = btrfs_chunk_type(leaf, chunk);
	index = btrfs_bg_flags_to_raid_index(type);
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

#if BITS_PER_LONG == 32
	/* Metadata chunks beyond the 32-bit page cache limit are rejected. */
	ret = check_32bit_meta_chunk(fs_info, logical, length, type);
	if (ret < 0)
		return ret;
	warn_32bit_meta_chunk(fs_info, logical, length, type);
#endif

	/*
	 * Only need to verify chunk item if we're reading from sys chunk array,
	 * as chunk item in tree block is already verified by tree-checker.
	 */
	if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
		ret = btrfs_check_chunk_valid(leaf, chunk, logical);
		if (ret)
			return ret;
	}

	read_lock(&map_tree->lock);
	em = lookup_extent_mapping(map_tree, logical, 1);
	read_unlock(&map_tree->lock);

	/* already mapped? */
	if (em && em->start <= logical && em->start + em->len > logical) {
		free_extent_map(em);
		return 0;
	} else if (em) {
		free_extent_map(em);
	}

	em = alloc_extent_map();
	if (!em)
		return -ENOMEM;
	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		free_extent_map(em);
		return -ENOMEM;
	}

	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	em->map_lookup = map;
	em->start = logical;
	em->len = length;
	em->orig_start = 0;
	em->block_start = 0;
	em->block_len = em->len;

	map->num_stripes = num_stripes;
	map->io_width = btrfs_chunk_io_width(leaf, chunk);
	map->io_align = btrfs_chunk_io_align(leaf, chunk);
	map->type = type;
	/*
	 * We can't use the sub_stripes value, as for profiles other than
	 * RAID10, they may have 0 as sub_stripes for filesystems created by
	 * older mkfs (<v5.4).
	 * In that case, it can cause divide-by-zero errors later.
	 * Since currently sub_stripes is fixed for each profile, let's
	 * use the trusted value instead.
	 */
	map->sub_stripes = btrfs_raid_array[index].sub_stripes;
	map->verified_stripes = 0;
	em->orig_block_len = btrfs_calc_stripe_length(em);
	/* Resolve every stripe's (devid, uuid) pair to a btrfs_device. */
	for (i = 0; i < num_stripes; i++) {
		map->stripes[i].physical =
			btrfs_stripe_offset_nr(leaf, chunk, i);
		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
		args.devid = devid;
		read_extent_buffer(leaf, uuid, (unsigned long)
				   btrfs_stripe_dev_uuid_nr(chunk, i),
				   BTRFS_UUID_SIZE);
		args.uuid = uuid;
		map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
		if (!map->stripes[i].dev) {
			/* Absent device: fatal, or a placeholder if degraded. */
			map->stripes[i].dev = handle_missing_device(fs_info,
								    devid, uuid);
			if (IS_ERR(map->stripes[i].dev)) {
				ret = PTR_ERR(map->stripes[i].dev);
				free_extent_map(em);
				return ret;
			}
		}

		set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				&(map->stripes[i].dev->dev_state));
	}

	write_lock(&map_tree->lock);
	ret = add_extent_mapping(map_tree, em, 0);
	write_unlock(&map_tree->lock);
	if (ret < 0) {
		btrfs_err(fs_info,
			  "failed to add chunk map, start=%llu len=%llu: %d",
			  em->start, em->len, ret);
	}
	/* Drop our reference; the mapping tree holds its own on success. */
	free_extent_map(em);

	return ret;
}
69130b86a832SChris Mason 
/*
 * Populate an in-memory btrfs_device from an on-disk dev item in @leaf.
 * Commit/total byte counters are initialized from the on-disk values, and
 * any stale replace-target state is cleared.
 */
static void fill_device_from_item(struct extent_buffer *leaf,
				 struct btrfs_dev_item *dev_item,
				 struct btrfs_device *device)
{
	unsigned long ptr;

	device->devid = btrfs_device_id(leaf, dev_item);
	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
	/* Both in-memory and committed sizes start from the on-disk size. */
	device->total_bytes = device->disk_total_bytes;
	device->commit_total_bytes = device->disk_total_bytes;
	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
	device->commit_bytes_used = device->bytes_used;
	device->type = btrfs_device_type(leaf, dev_item);
	device->io_align = btrfs_device_io_align(leaf, dev_item);
	device->io_width = btrfs_device_io_width(leaf, dev_item);
	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
	/* A replace target devid must never appear as a regular dev item. */
	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

	ptr = btrfs_device_uuid(dev_item);
	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}
69360b86a832SChris Mason 
/*
 * Find or set up the btrfs_fs_devices for a seed filesystem identified by
 * @fsid, so devices from that seed fs can be looked up during mount.
 *
 * Must be called with uuid_mutex held.  Returns the (possibly newly cloned
 * and opened) fs_devices, or an ERR_PTR on failure.
 */
static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
						  u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

	lockdep_assert_held(&uuid_mutex);
	ASSERT(fsid);

	/* This will match only for multi-device seed fs */
	list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
		if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
			return fs_devices;


	fs_devices = find_fsid(fsid, NULL);
	if (!fs_devices) {
		/* Unknown seed fsid: only tolerable with -o degraded. */
		if (!btrfs_test_opt(fs_info, DEGRADED))
			return ERR_PTR(-ENOENT);

		fs_devices = alloc_fs_devices(fsid, NULL);
		if (IS_ERR(fs_devices))
			return fs_devices;

		/* Empty placeholder, marked opened so close paths balance. */
		fs_devices->seeding = true;
		fs_devices->opened = 1;
		return fs_devices;
	}

	/*
	 * Upon first call for a seed fs fsid, just create a private copy of the
	 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
	 */
	fs_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(fs_devices))
		return fs_devices;

	ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder);
	if (ret) {
		free_fs_devices(fs_devices);
		return ERR_PTR(ret);
	}

	/* A seed fs must carry the seeding flag; anything else is invalid. */
	if (!fs_devices->seeding) {
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
		return ERR_PTR(-EINVAL);
	}

	list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);

	return fs_devices;
}
69902b82032cSYan Zheng 
/*
 * Process one on-disk dev item: locate (or, when mounted degraded, create a
 * placeholder for) the matching btrfs_device, move it to the right
 * fs_devices if needed, and fill its fields from the item.
 *
 * Returns 0 on success or a negative errno (-ENOENT when a required device
 * is missing and -o degraded is not set, -EINVAL on validation failures).
 */
static int read_one_dev(struct extent_buffer *leaf,
			struct btrfs_dev_item *dev_item)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 devid;
	int ret;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];

	devid = btrfs_device_id(leaf, dev_item);
	args.devid = devid;
	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
			   BTRFS_UUID_SIZE);
	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
			   BTRFS_FSID_SIZE);
	args.uuid = dev_uuid;
	args.fsid = fs_uuid;

	/* A foreign fsid means this dev item belongs to a seed filesystem. */
	if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
		fs_devices = open_seed_devices(fs_info, fs_uuid);
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);
	}

	device = btrfs_find_device(fs_info->fs_devices, &args);
	if (!device) {
		if (!btrfs_test_opt(fs_info, DEGRADED)) {
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, true);
			return -ENOENT;
		}

		device = add_missing_dev(fs_devices, devid, dev_uuid);
		if (IS_ERR(device)) {
			btrfs_err(fs_info,
				"failed to add missing dev %llu: %ld",
				devid, PTR_ERR(device));
			return PTR_ERR(device);
		}
		btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
	} else {
		/* Known device but without a bdev: only OK when degraded. */
		if (!device->bdev) {
			if (!btrfs_test_opt(fs_info, DEGRADED)) {
				btrfs_report_missing_device(fs_info,
						devid, dev_uuid, true);
				return -ENOENT;
			}
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, false);
		}

		if (!device->bdev &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			/*
			 * this happens when a device that was properly setup
			 * in the device info lists suddenly goes bad.
			 * device->bdev is NULL, and so we have to set
			 * device->missing to one here
			 */
			device->fs_devices->missing_devices++;
			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}

		/* Move the device to its own fs_devices */
		if (device->fs_devices != fs_devices) {
			ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
							&device->dev_state));

			list_move(&device->dev_list, &fs_devices->devices);
			device->fs_devices->num_devices--;
			fs_devices->num_devices++;

			device->fs_devices->missing_devices--;
			fs_devices->missing_devices++;

			device->fs_devices = fs_devices;
		}
	}

	/* Seed devices must be read-only and have a matching generation. */
	if (device->fs_devices != fs_info->fs_devices) {
		BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
		if (device->generation !=
		    btrfs_device_generation(leaf, dev_item))
			return -EINVAL;
	}

	fill_device_from_item(leaf, dev_item, device);
	if (device->bdev) {
		u64 max_total_bytes = bdev_nr_bytes(device->bdev);

		/* The recorded size may not exceed the real block device. */
		if (device->total_bytes > max_total_bytes) {
			btrfs_err(fs_info,
			"device total_bytes should be at most %llu but found %llu",
				  max_total_bytes, device->total_bytes);
			return -EINVAL;
		}
	}
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	   !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		/* Account writable capacity and remaining free chunk space. */
		device->fs_devices->total_rw_bytes += device->total_bytes;
		atomic64_add(device->total_bytes - device->bytes_used,
				&fs_info->free_chunk_space);
	}
	ret = 0;
	return ret;
}
71010b86a832SChris Mason 
btrfs_read_sys_array(struct btrfs_fs_info * fs_info)71026bccf3abSJeff Mahoney int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
71030b86a832SChris Mason {
7104ab8d0fc4SJeff Mahoney 	struct btrfs_super_block *super_copy = fs_info->super_copy;
7105a061fc8dSChris Mason 	struct extent_buffer *sb;
71060b86a832SChris Mason 	struct btrfs_disk_key *disk_key;
71070b86a832SChris Mason 	struct btrfs_chunk *chunk;
71081ffb22cfSDavid Sterba 	u8 *array_ptr;
71091ffb22cfSDavid Sterba 	unsigned long sb_array_offset;
711084eed90fSChris Mason 	int ret = 0;
71110b86a832SChris Mason 	u32 num_stripes;
71120b86a832SChris Mason 	u32 array_size;
71130b86a832SChris Mason 	u32 len = 0;
71141ffb22cfSDavid Sterba 	u32 cur_offset;
7115e06cd3ddSLiu Bo 	u64 type;
711684eed90fSChris Mason 	struct btrfs_key key;
71170b86a832SChris Mason 
71180b246afaSJeff Mahoney 	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
7119e959d3c1SQu Wenruo 
7120a83fffb7SDavid Sterba 	/*
7121e959d3c1SQu Wenruo 	 * We allocated a dummy extent, just to use extent buffer accessors.
7122e959d3c1SQu Wenruo 	 * There will be unused space after BTRFS_SUPER_INFO_SIZE, but
7123e959d3c1SQu Wenruo 	 * that's fine, we will not go beyond system chunk array anyway.
7124a83fffb7SDavid Sterba 	 */
7125e959d3c1SQu Wenruo 	sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
7126e959d3c1SQu Wenruo 	if (!sb)
7127e959d3c1SQu Wenruo 		return -ENOMEM;
71284db8c528SDavid Sterba 	set_extent_buffer_uptodate(sb);
71294008c04aSChris Mason 
7130a061fc8dSChris Mason 	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
71310b86a832SChris Mason 	array_size = btrfs_super_sys_array_size(super_copy);
71320b86a832SChris Mason 
71331ffb22cfSDavid Sterba 	array_ptr = super_copy->sys_chunk_array;
71341ffb22cfSDavid Sterba 	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
71351ffb22cfSDavid Sterba 	cur_offset = 0;
71360b86a832SChris Mason 
71371ffb22cfSDavid Sterba 	while (cur_offset < array_size) {
71381ffb22cfSDavid Sterba 		disk_key = (struct btrfs_disk_key *)array_ptr;
7139e3540eabSDavid Sterba 		len = sizeof(*disk_key);
7140e3540eabSDavid Sterba 		if (cur_offset + len > array_size)
7141e3540eabSDavid Sterba 			goto out_short_read;
7142e3540eabSDavid Sterba 
71430b86a832SChris Mason 		btrfs_disk_key_to_cpu(&key, disk_key);
71440b86a832SChris Mason 
71451ffb22cfSDavid Sterba 		array_ptr += len;
71461ffb22cfSDavid Sterba 		sb_array_offset += len;
71471ffb22cfSDavid Sterba 		cur_offset += len;
71480b86a832SChris Mason 
714932ab3d1bSJohannes Thumshirn 		if (key.type != BTRFS_CHUNK_ITEM_KEY) {
715032ab3d1bSJohannes Thumshirn 			btrfs_err(fs_info,
715132ab3d1bSJohannes Thumshirn 			    "unexpected item type %u in sys_array at offset %u",
715232ab3d1bSJohannes Thumshirn 				  (u32)key.type, cur_offset);
715332ab3d1bSJohannes Thumshirn 			ret = -EIO;
715432ab3d1bSJohannes Thumshirn 			break;
715532ab3d1bSJohannes Thumshirn 		}
715632ab3d1bSJohannes Thumshirn 
71571ffb22cfSDavid Sterba 		chunk = (struct btrfs_chunk *)sb_array_offset;
7158e3540eabSDavid Sterba 		/*
715932ab3d1bSJohannes Thumshirn 		 * At least one btrfs_chunk with one stripe must be present,
716032ab3d1bSJohannes Thumshirn 		 * exact stripe count check comes afterwards
7161e3540eabSDavid Sterba 		 */
7162e3540eabSDavid Sterba 		len = btrfs_chunk_item_size(1);
7163e3540eabSDavid Sterba 		if (cur_offset + len > array_size)
7164e3540eabSDavid Sterba 			goto out_short_read;
7165e3540eabSDavid Sterba 
7166e3540eabSDavid Sterba 		num_stripes = btrfs_chunk_num_stripes(sb, chunk);
7167f5cdedd7SDavid Sterba 		if (!num_stripes) {
7168ab8d0fc4SJeff Mahoney 			btrfs_err(fs_info,
7169ab8d0fc4SJeff Mahoney 			"invalid number of stripes %u in sys_array at offset %u",
7170f5cdedd7SDavid Sterba 				  num_stripes, cur_offset);
7171f5cdedd7SDavid Sterba 			ret = -EIO;
7172f5cdedd7SDavid Sterba 			break;
7173f5cdedd7SDavid Sterba 		}
7174f5cdedd7SDavid Sterba 
7175e06cd3ddSLiu Bo 		type = btrfs_chunk_type(sb, chunk);
7176e06cd3ddSLiu Bo 		if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
7177ab8d0fc4SJeff Mahoney 			btrfs_err(fs_info,
7178e06cd3ddSLiu Bo 			"invalid chunk type %llu in sys_array at offset %u",
7179e06cd3ddSLiu Bo 				  type, cur_offset);
7180e06cd3ddSLiu Bo 			ret = -EIO;
7181e06cd3ddSLiu Bo 			break;
7182e06cd3ddSLiu Bo 		}
7183e06cd3ddSLiu Bo 
7184e3540eabSDavid Sterba 		len = btrfs_chunk_item_size(num_stripes);
7185e3540eabSDavid Sterba 		if (cur_offset + len > array_size)
7186e3540eabSDavid Sterba 			goto out_short_read;
7187e3540eabSDavid Sterba 
71889690ac09SDavid Sterba 		ret = read_one_chunk(&key, sb, chunk);
718984eed90fSChris Mason 		if (ret)
719084eed90fSChris Mason 			break;
719132ab3d1bSJohannes Thumshirn 
71921ffb22cfSDavid Sterba 		array_ptr += len;
71931ffb22cfSDavid Sterba 		sb_array_offset += len;
71941ffb22cfSDavid Sterba 		cur_offset += len;
71950b86a832SChris Mason 	}
7196d865177aSLiu Bo 	clear_extent_buffer_uptodate(sb);
71971c8b5b6eSLiu Bo 	free_extent_buffer_stale(sb);
719884eed90fSChris Mason 	return ret;
7199e3540eabSDavid Sterba 
7200e3540eabSDavid Sterba out_short_read:
7201ab8d0fc4SJeff Mahoney 	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
7202e3540eabSDavid Sterba 			len, cur_offset);
7203d865177aSLiu Bo 	clear_extent_buffer_uptodate(sb);
72041c8b5b6eSLiu Bo 	free_extent_buffer_stale(sb);
7205e3540eabSDavid Sterba 	return -EIO;
72060b86a832SChris Mason }
72070b86a832SChris Mason 
720821634a19SQu Wenruo /*
720921634a19SQu Wenruo  * Check if all chunks in the fs are OK for read-write degraded mount
721021634a19SQu Wenruo  *
72116528b99dSAnand Jain  * If the @failing_dev is specified, it's accounted as missing.
72126528b99dSAnand Jain  *
721321634a19SQu Wenruo  * Return true if all chunks meet the minimal RW mount requirements.
721421634a19SQu Wenruo  * Return false if any chunk doesn't meet the minimal RW mount requirements.
721521634a19SQu Wenruo  */
btrfs_check_rw_degradable(struct btrfs_fs_info * fs_info,struct btrfs_device * failing_dev)72166528b99dSAnand Jain bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
72176528b99dSAnand Jain 					struct btrfs_device *failing_dev)
721821634a19SQu Wenruo {
7219c8bf1b67SDavid Sterba 	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
722021634a19SQu Wenruo 	struct extent_map *em;
722121634a19SQu Wenruo 	u64 next_start = 0;
722221634a19SQu Wenruo 	bool ret = true;
722321634a19SQu Wenruo 
7224c8bf1b67SDavid Sterba 	read_lock(&map_tree->lock);
7225c8bf1b67SDavid Sterba 	em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7226c8bf1b67SDavid Sterba 	read_unlock(&map_tree->lock);
722721634a19SQu Wenruo 	/* No chunk at all? Return false anyway */
722821634a19SQu Wenruo 	if (!em) {
722921634a19SQu Wenruo 		ret = false;
723021634a19SQu Wenruo 		goto out;
723121634a19SQu Wenruo 	}
723221634a19SQu Wenruo 	while (em) {
723321634a19SQu Wenruo 		struct map_lookup *map;
723421634a19SQu Wenruo 		int missing = 0;
723521634a19SQu Wenruo 		int max_tolerated;
723621634a19SQu Wenruo 		int i;
723721634a19SQu Wenruo 
723821634a19SQu Wenruo 		map = em->map_lookup;
723921634a19SQu Wenruo 		max_tolerated =
724021634a19SQu Wenruo 			btrfs_get_num_tolerated_disk_barrier_failures(
724121634a19SQu Wenruo 					map->type);
724221634a19SQu Wenruo 		for (i = 0; i < map->num_stripes; i++) {
724321634a19SQu Wenruo 			struct btrfs_device *dev = map->stripes[i].dev;
724421634a19SQu Wenruo 
7245e6e674bdSAnand Jain 			if (!dev || !dev->bdev ||
7246e6e674bdSAnand Jain 			    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
724721634a19SQu Wenruo 			    dev->last_flush_error)
724821634a19SQu Wenruo 				missing++;
72496528b99dSAnand Jain 			else if (failing_dev && failing_dev == dev)
72506528b99dSAnand Jain 				missing++;
725121634a19SQu Wenruo 		}
725221634a19SQu Wenruo 		if (missing > max_tolerated) {
72536528b99dSAnand Jain 			if (!failing_dev)
725421634a19SQu Wenruo 				btrfs_warn(fs_info,
725552042d8eSAndrea Gelmini 	"chunk %llu missing %d devices, max tolerance is %d for writable mount",
725621634a19SQu Wenruo 				   em->start, missing, max_tolerated);
725721634a19SQu Wenruo 			free_extent_map(em);
725821634a19SQu Wenruo 			ret = false;
725921634a19SQu Wenruo 			goto out;
726021634a19SQu Wenruo 		}
726121634a19SQu Wenruo 		next_start = extent_map_end(em);
726221634a19SQu Wenruo 		free_extent_map(em);
726321634a19SQu Wenruo 
7264c8bf1b67SDavid Sterba 		read_lock(&map_tree->lock);
7265c8bf1b67SDavid Sterba 		em = lookup_extent_mapping(map_tree, next_start,
726621634a19SQu Wenruo 					   (u64)(-1) - next_start);
7267c8bf1b67SDavid Sterba 		read_unlock(&map_tree->lock);
726821634a19SQu Wenruo 	}
726921634a19SQu Wenruo out:
727021634a19SQu Wenruo 	return ret;
727121634a19SQu Wenruo }
727221634a19SQu Wenruo 
/* Queue readahead for every child block referenced by the given node. */
static void readahead_tree_node_children(struct extent_buffer *node)
{
	const int nritems = btrfs_header_nritems(node);
	int slot;

	for (slot = 0; slot < nritems; slot++)
		btrfs_readahead_node_child(node, slot);
}
7281d85327b1SDavid Sterba 
/*
 * Read the whole chunk tree at mount time: first all DEV_ITEMs, then all
 * CHUNK_ITEMs, building the device list and the in-memory chunk mappings.
 * Afterwards cross-check the device count and the total byte counters
 * against the superblock, repairing the num_devices mismatch in memory.
 *
 * Returns 0 on success or a negative errno on failure.
 */
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;
	int iter_ret = 0;
	u64 total_dev = 0;	/* number of DEV_ITEMs seen during iteration */
	u64 last_ra_node = 0;	/* start of the last level-1 node we read ahead */

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * uuid_mutex is needed only if we are mounting a sprout FS
	 * otherwise we don't need it.
	 */
	mutex_lock(&uuid_mutex);

	/*
	 * It is possible for mount and umount to race in such a way that
	 * we execute this code path, but open_fs_devices failed to clear
	 * total_rw_bytes. We certainly want it cleared before reading the
	 * device items, so clear it here.
	 */
	fs_info->fs_devices->total_rw_bytes = 0;

	/*
	 * Lockdep complains about possible circular locking dependency between
	 * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
	 * used for freeze procection of a fs (struct super_block.s_writers),
	 * which we take when starting a transaction, and extent buffers of the
	 * chunk tree if we call read_one_dev() while holding a lock on an
	 * extent buffer of the chunk tree. Since we are mounting the filesystem
	 * and at this point there can't be any concurrent task modifying the
	 * chunk tree, to keep it simple, just skip locking on the chunk tree.
	 */
	ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
	path->skip_locking = 1;

	/*
	 * Read all device items, and then all the chunk items. All
	 * device items are found before any chunk item (their object id
	 * is smaller than the lowest possible object id for a chunk
	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
		struct extent_buffer *node = path->nodes[1];

		leaf = path->nodes[0];
		slot = path->slots[0];

		/*
		 * Read ahead all children of the level-1 node once per node,
		 * so the leaves we are about to iterate are already in flight.
		 */
		if (node) {
			if (last_ra_node != node->start) {
				readahead_tree_node_children(node);
				last_ra_node = node->start;
			}
		}
		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
			struct btrfs_dev_item *dev_item;
			dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
			ret = read_one_dev(leaf, dev_item);
			if (ret)
				goto error;
			total_dev++;
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;

			/*
			 * We are only called at mount time, so no need to take
			 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
			 * we always lock first fs_info->chunk_mutex before
			 * acquiring any locks on the chunk tree. This is a
			 * requirement for chunk allocation, see the comment on
			 * top of btrfs_chunk_alloc() for details.
			 */
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(&found_key, leaf, chunk);
			if (ret)
				goto error;
		}
	}
	/* Catch error found during iteration */
	if (iter_ret < 0) {
		ret = iter_ret;
		goto error;
	}

	/*
	 * After loading chunk tree, we've got all device information,
	 * do another round of validation checks.
	 */
	if (total_dev != fs_info->fs_devices->total_devices) {
		btrfs_warn(fs_info,
"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
			  btrfs_super_num_devices(fs_info->super_copy),
			  total_dev);
		fs_info->fs_devices->total_devices = total_dev;
		btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
	}
	if (btrfs_super_total_bytes(fs_info->super_copy) <
	    fs_info->fs_devices->total_rw_bytes) {
		btrfs_err(fs_info,
	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
			  btrfs_super_total_bytes(fs_info->super_copy),
			  fs_info->fs_devices->total_rw_bytes);
		ret = -EINVAL;
		goto error;
	}
	ret = 0;
error:
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}
7406442a4f63SStefan Behrens 
btrfs_init_devices_late(struct btrfs_fs_info * fs_info)7407a8d1b164SJohannes Thumshirn int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7408cb517eabSMiao Xie {
7409944d3f9fSNikolay Borisov 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7410cb517eabSMiao Xie 	struct btrfs_device *device;
7411a8d1b164SJohannes Thumshirn 	int ret = 0;
7412cb517eabSMiao Xie 
7413944d3f9fSNikolay Borisov 	fs_devices->fs_info = fs_info;
7414944d3f9fSNikolay Borisov 
7415cb517eabSMiao Xie 	mutex_lock(&fs_devices->device_list_mutex);
7416cb517eabSMiao Xie 	list_for_each_entry(device, &fs_devices->devices, dev_list)
7417fb456252SJeff Mahoney 		device->fs_info = fs_info;
741829cc83f6SLiu Bo 
7419944d3f9fSNikolay Borisov 	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7420a8d1b164SJohannes Thumshirn 		list_for_each_entry(device, &seed_devs->devices, dev_list) {
7421944d3f9fSNikolay Borisov 			device->fs_info = fs_info;
7422a8d1b164SJohannes Thumshirn 			ret = btrfs_get_dev_zone_info(device, false);
7423a8d1b164SJohannes Thumshirn 			if (ret)
7424a8d1b164SJohannes Thumshirn 				break;
7425a8d1b164SJohannes Thumshirn 		}
7426944d3f9fSNikolay Borisov 
7427944d3f9fSNikolay Borisov 		seed_devs->fs_info = fs_info;
742829cc83f6SLiu Bo 	}
7429e17125b5SAnand Jain 	mutex_unlock(&fs_devices->device_list_mutex);
7430a8d1b164SJohannes Thumshirn 
7431a8d1b164SJohannes Thumshirn 	return ret;
7432cb517eabSMiao Xie }
7433cb517eabSMiao Xie 
btrfs_dev_stats_value(const struct extent_buffer * eb,const struct btrfs_dev_stats_item * ptr,int index)74341dc990dfSDavid Sterba static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
74351dc990dfSDavid Sterba 				 const struct btrfs_dev_stats_item *ptr,
74361dc990dfSDavid Sterba 				 int index)
74371dc990dfSDavid Sterba {
74381dc990dfSDavid Sterba 	u64 val;
74391dc990dfSDavid Sterba 
74401dc990dfSDavid Sterba 	read_extent_buffer(eb, &val,
74411dc990dfSDavid Sterba 			   offsetof(struct btrfs_dev_stats_item, values) +
74421dc990dfSDavid Sterba 			    ((unsigned long)ptr) + (index * sizeof(u64)),
74431dc990dfSDavid Sterba 			   sizeof(val));
74441dc990dfSDavid Sterba 	return val;
74451dc990dfSDavid Sterba }
74461dc990dfSDavid Sterba 
/* Write the index-th persisted error counter of a dev_stats item. */
static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
				      struct btrfs_dev_stats_item *ptr,
				      int index, u64 val)
{
	/* @ptr is an offset inside @eb, not a dereferenceable pointer. */
	unsigned long offset = (unsigned long)ptr +
			       offsetof(struct btrfs_dev_stats_item, values) +
			       index * sizeof(u64);

	write_extent_buffer(eb, &val, offset, sizeof(val));
}
74561dc990dfSDavid Sterba 
btrfs_device_init_dev_stats(struct btrfs_device * device,struct btrfs_path * path)745792e26df4SJosef Bacik static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7458124604ebSJosef Bacik 				       struct btrfs_path *path)
7459733f4fbbSStefan Behrens {
7460733f4fbbSStefan Behrens 	struct btrfs_dev_stats_item *ptr;
7461124604ebSJosef Bacik 	struct extent_buffer *eb;
7462124604ebSJosef Bacik 	struct btrfs_key key;
7463124604ebSJosef Bacik 	int item_size;
7464124604ebSJosef Bacik 	int i, ret, slot;
7465733f4fbbSStefan Behrens 
746682d62d06SJosef Bacik 	if (!device->fs_info->dev_root)
746782d62d06SJosef Bacik 		return 0;
746882d62d06SJosef Bacik 
7469242e2956SDavid Sterba 	key.objectid = BTRFS_DEV_STATS_OBJECTID;
7470242e2956SDavid Sterba 	key.type = BTRFS_PERSISTENT_ITEM_KEY;
7471733f4fbbSStefan Behrens 	key.offset = device->devid;
7472124604ebSJosef Bacik 	ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7473733f4fbbSStefan Behrens 	if (ret) {
7474ae4b9b4cSAnand Jain 		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7475ae4b9b4cSAnand Jain 			btrfs_dev_stat_set(device, i, 0);
7476733f4fbbSStefan Behrens 		device->dev_stats_valid = 1;
7477733f4fbbSStefan Behrens 		btrfs_release_path(path);
747892e26df4SJosef Bacik 		return ret < 0 ? ret : 0;
7479733f4fbbSStefan Behrens 	}
7480733f4fbbSStefan Behrens 	slot = path->slots[0];
7481733f4fbbSStefan Behrens 	eb = path->nodes[0];
74823212fa14SJosef Bacik 	item_size = btrfs_item_size(eb, slot);
7483733f4fbbSStefan Behrens 
7484124604ebSJosef Bacik 	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7485733f4fbbSStefan Behrens 
7486733f4fbbSStefan Behrens 	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7487733f4fbbSStefan Behrens 		if (item_size >= (1 + i) * sizeof(__le64))
7488733f4fbbSStefan Behrens 			btrfs_dev_stat_set(device, i,
7489733f4fbbSStefan Behrens 					   btrfs_dev_stats_value(eb, ptr, i));
7490733f4fbbSStefan Behrens 		else
74914e411a7dSAnand Jain 			btrfs_dev_stat_set(device, i, 0);
7492733f4fbbSStefan Behrens 	}
7493733f4fbbSStefan Behrens 
7494733f4fbbSStefan Behrens 	device->dev_stats_valid = 1;
7495733f4fbbSStefan Behrens 	btrfs_dev_stat_print_on_load(device);
7496733f4fbbSStefan Behrens 	btrfs_release_path(path);
749792e26df4SJosef Bacik 
749892e26df4SJosef Bacik 	return 0;
7499733f4fbbSStefan Behrens }
7500124604ebSJosef Bacik 
btrfs_init_dev_stats(struct btrfs_fs_info * fs_info)7501124604ebSJosef Bacik int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7502124604ebSJosef Bacik {
7503124604ebSJosef Bacik 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7504124604ebSJosef Bacik 	struct btrfs_device *device;
7505124604ebSJosef Bacik 	struct btrfs_path *path = NULL;
750692e26df4SJosef Bacik 	int ret = 0;
7507124604ebSJosef Bacik 
7508124604ebSJosef Bacik 	path = btrfs_alloc_path();
7509124604ebSJosef Bacik 	if (!path)
7510124604ebSJosef Bacik 		return -ENOMEM;
7511124604ebSJosef Bacik 
7512124604ebSJosef Bacik 	mutex_lock(&fs_devices->device_list_mutex);
751392e26df4SJosef Bacik 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
751492e26df4SJosef Bacik 		ret = btrfs_device_init_dev_stats(device, path);
751592e26df4SJosef Bacik 		if (ret)
751692e26df4SJosef Bacik 			goto out;
7517124604ebSJosef Bacik 	}
751892e26df4SJosef Bacik 	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
751992e26df4SJosef Bacik 		list_for_each_entry(device, &seed_devs->devices, dev_list) {
752092e26df4SJosef Bacik 			ret = btrfs_device_init_dev_stats(device, path);
752192e26df4SJosef Bacik 			if (ret)
752292e26df4SJosef Bacik 				goto out;
752392e26df4SJosef Bacik 		}
752492e26df4SJosef Bacik 	}
752592e26df4SJosef Bacik out:
7526733f4fbbSStefan Behrens 	mutex_unlock(&fs_devices->device_list_mutex);
7527733f4fbbSStefan Behrens 
7528733f4fbbSStefan Behrens 	btrfs_free_path(path);
752992e26df4SJosef Bacik 	return ret;
7530733f4fbbSStefan Behrens }
7531733f4fbbSStefan Behrens 
/*
 * Write the in-memory error counters of @device into its dev_stats item
 * in the dev tree, creating the item if it does not exist and replacing
 * it if the existing item is too small for the current counter set.
 *
 * Returns 0 on success, a negative errno on failure.
 */
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	/* ret == 0: item found; ret == 1: not found, path at insert slot. */
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
				  ret, btrfs_dev_name(device));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
					  btrfs_dev_name(device), ret);
			goto out;
		}
		/* Pretend the item was not found so it is recreated below. */
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				btrfs_dev_name(device), ret);
			goto out;
		}
	}

	/* Path now points at a sufficiently sized item; fill in all counters. */
	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(trans, eb);

out:
	btrfs_free_path(path);
	return ret;
}
7596733f4fbbSStefan Behrens 
/*
 * called from commit_transaction. Writes all changed device stats to disk.
 *
 * Devices whose change counter (dev_stats_ccnt) is zero are skipped; on a
 * successful write the counter is decreased by the amount that was flushed
 * so that increments racing with the write remain pending for the next
 * commit.  Returns 0 or the last error from update_dev_stat_item().
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;


		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values which requires
		 * reading the in-memory counters. Such control dependencies
		 * require explicit read memory barriers.
		 *
		 * This memory barriers pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stats_read_and_reset
		 */
		smp_rmb();

		ret = update_dev_stat_item(trans, device);
		if (!ret)
			/* Only subtract what we flushed; new increments stay pending. */
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}
7636733f4fbbSStefan Behrens 
btrfs_dev_stat_inc_and_print(struct btrfs_device * dev,int index)7637442a4f63SStefan Behrens void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
7638442a4f63SStefan Behrens {
7639442a4f63SStefan Behrens 	btrfs_dev_stat_inc(dev, index);
7640442a4f63SStefan Behrens 
7641733f4fbbSStefan Behrens 	if (!dev->dev_stats_valid)
7642733f4fbbSStefan Behrens 		return;
7643fb456252SJeff Mahoney 	btrfs_err_rl_in_rcu(dev->fs_info,
7644b14af3b4SDavid Sterba 		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7645cb3e217bSQu Wenruo 			   btrfs_dev_name(dev),
7646442a4f63SStefan Behrens 			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7647442a4f63SStefan Behrens 			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7648442a4f63SStefan Behrens 			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7649efe120a0SFrank Holton 			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7650efe120a0SFrank Holton 			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7651442a4f63SStefan Behrens }
7652c11d2c23SStefan Behrens 
/*
 * Log the persisted error counters for @dev once after they were loaded,
 * but only if at least one counter is non-zero (all-zero stats are not
 * worth a message).
 */
btrfs_dev_stat_print_on_load(struct btrfs_device * dev)7653733f4fbbSStefan Behrens static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7654733f4fbbSStefan Behrens {
7655a98cdb85SStefan Behrens 	int i;
7656a98cdb85SStefan Behrens 
	/* Scan for the first non-zero counter; bail out if none exists. */
7657a98cdb85SStefan Behrens 	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7658a98cdb85SStefan Behrens 		if (btrfs_dev_stat_read(dev, i) != 0)
7659a98cdb85SStefan Behrens 			break;
7660a98cdb85SStefan Behrens 	if (i == BTRFS_DEV_STAT_VALUES_MAX)
7661a98cdb85SStefan Behrens 		return; /* all values == 0, suppress message */
7662a98cdb85SStefan Behrens 
7663fb456252SJeff Mahoney 	btrfs_info_in_rcu(dev->fs_info,
7664ecaeb14bSDavid Sterba 		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7665cb3e217bSQu Wenruo 	       btrfs_dev_name(dev),
7666733f4fbbSStefan Behrens 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7667733f4fbbSStefan Behrens 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7668733f4fbbSStefan Behrens 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7669733f4fbbSStefan Behrens 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7670733f4fbbSStefan Behrens 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7671733f4fbbSStefan Behrens }
7672733f4fbbSStefan Behrens 
/*
 * Handler for the get-dev-stats ioctl: look up the device named by
 * stats->devid and copy its error counters into stats->values.
 *
 * If BTRFS_DEV_STATS_RESET is set in stats->flags, the counters that are
 * copied out are simultaneously zeroed (read-and-reset semantics).
 *
 * Returns 0 on success, -ENODEV if the device does not exist or its stats
 * were never successfully loaded.
 */
btrfs_get_dev_stats(struct btrfs_fs_info * fs_info,struct btrfs_ioctl_get_dev_stats * stats)76732ff7e61eSJeff Mahoney int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7674b27f7c0cSDavid Sterba 			struct btrfs_ioctl_get_dev_stats *stats)
7675c11d2c23SStefan Behrens {
7676562d7b15SJosef Bacik 	BTRFS_DEV_LOOKUP_ARGS(args);
7677c11d2c23SStefan Behrens 	struct btrfs_device *dev;
76780b246afaSJeff Mahoney 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7679c11d2c23SStefan Behrens 	int i;
7680c11d2c23SStefan Behrens 
	/* The lookup itself is done under the device list mutex. */
7681c11d2c23SStefan Behrens 	mutex_lock(&fs_devices->device_list_mutex);
7682562d7b15SJosef Bacik 	args.devid = stats->devid;
7683562d7b15SJosef Bacik 	dev = btrfs_find_device(fs_info->fs_devices, &args);
7684c11d2c23SStefan Behrens 	mutex_unlock(&fs_devices->device_list_mutex);
7685c11d2c23SStefan Behrens 
7686c11d2c23SStefan Behrens 	if (!dev) {
76870b246afaSJeff Mahoney 		btrfs_warn(fs_info, "get dev_stats failed, device not found");
7688c11d2c23SStefan Behrens 		return -ENODEV;
7689733f4fbbSStefan Behrens 	} else if (!dev->dev_stats_valid) {
76900b246afaSJeff Mahoney 		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7691733f4fbbSStefan Behrens 		return -ENODEV;
7692b27f7c0cSDavid Sterba 	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		/* Copy out and zero each counter the caller has room for. */
7693c11d2c23SStefan Behrens 		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7694c11d2c23SStefan Behrens 			if (stats->nr_items > i)
7695c11d2c23SStefan Behrens 				stats->values[i] =
7696c11d2c23SStefan Behrens 					btrfs_dev_stat_read_and_reset(dev, i);
7697c11d2c23SStefan Behrens 			else
76984e411a7dSAnand Jain 				btrfs_dev_stat_set(dev, i, 0);
7699c11d2c23SStefan Behrens 		}
7700a69976bcSAnand Jain 		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
7701a69976bcSAnand Jain 			   current->comm, task_pid_nr(current));
7702c11d2c23SStefan Behrens 	} else {
		/* Plain read: counters are left untouched. */
7703c11d2c23SStefan Behrens 		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7704c11d2c23SStefan Behrens 			if (stats->nr_items > i)
7705c11d2c23SStefan Behrens 				stats->values[i] = btrfs_dev_stat_read(dev, i);
7706c11d2c23SStefan Behrens 	}
	/* Tell the caller how many items were actually filled in. */
7707c11d2c23SStefan Behrens 	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7708c11d2c23SStefan Behrens 		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7709c11d2c23SStefan Behrens 	return 0;
7710c11d2c23SStefan Behrens }
7711a8a6dab7SStefan Behrens 
7712935e5cc9SMiao Xie /*
7713bbbf7243SNikolay Borisov  * Update the size and bytes used for each device where it changed.  This is
7714bbbf7243SNikolay Borisov  * delayed since we would otherwise get errors while writing out the
7715935e5cc9SMiao Xie  * superblocks.
7716bbbf7243SNikolay Borisov  *
7717bbbf7243SNikolay Borisov  * Must be invoked during transaction commit.
7718935e5cc9SMiao Xie  */
btrfs_commit_device_sizes(struct btrfs_transaction * trans)7719bbbf7243SNikolay Borisov void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
7720935e5cc9SMiao Xie {
7721935e5cc9SMiao Xie 	struct btrfs_device *curr, *next;
7722935e5cc9SMiao Xie 
7723bbbf7243SNikolay Borisov 	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7724bbbf7243SNikolay Borisov 
7725bbbf7243SNikolay Borisov 	if (list_empty(&trans->dev_update_list))
7726935e5cc9SMiao Xie 		return;
7727935e5cc9SMiao Xie 
7728bbbf7243SNikolay Borisov 	/*
7729bbbf7243SNikolay Borisov 	 * We don't need the device_list_mutex here.  This list is owned by the
7730bbbf7243SNikolay Borisov 	 * transaction and the transaction must complete before the device is
7731bbbf7243SNikolay Borisov 	 * released.
7732bbbf7243SNikolay Borisov 	 */
	/* chunk_mutex guards the commit_* fields updated below. */
7733bbbf7243SNikolay Borisov 	mutex_lock(&trans->fs_info->chunk_mutex);
7734bbbf7243SNikolay Borisov 	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7735bbbf7243SNikolay Borisov 				 post_commit_list) {
7736bbbf7243SNikolay Borisov 		list_del_init(&curr->post_commit_list);
		/* Latch the in-memory sizes as the committed (on-disk) ones. */
7737935e5cc9SMiao Xie 		curr->commit_total_bytes = curr->disk_total_bytes;
7738bbbf7243SNikolay Borisov 		curr->commit_bytes_used = curr->bytes_used;
7739935e5cc9SMiao Xie 	}
7740bbbf7243SNikolay Borisov 	mutex_unlock(&trans->fs_info->chunk_mutex);
7741ce7213c7SMiao Xie }
77425a13f430SAnand Jain 
774346df06b8SDavid Sterba /*
774446df06b8SDavid Sterba  * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
774546df06b8SDavid Sterba  */
/*
 * Map the block-group @flags to the number of copies (ncopies) from the
 * btrfs_raid_array profile table.
 */
btrfs_bg_type_to_factor(u64 flags)774646df06b8SDavid Sterba int btrfs_bg_type_to_factor(u64 flags)
774746df06b8SDavid Sterba {
774844b28adaSDavid Sterba 	const int index = btrfs_bg_flags_to_raid_index(flags);
774944b28adaSDavid Sterba 
775044b28adaSDavid Sterba 	return btrfs_raid_array[index].ncopies;
775146df06b8SDavid Sterba }
7752cf90d884SQu Wenruo 
7753cf90d884SQu Wenruo 
7754cf90d884SQu Wenruo 
/*
 * Verify a single dev extent item against the chunk mapping tree.
 *
 * Checks performed:
 * - a chunk mapping exists at @chunk_offset
 * - the dev extent length matches the stripe length computed from the chunk
 * - exactly one stripe of the chunk points at (@devid, @physical_offset),
 *   bumping map->verified_stripes for the later whole-chunk check
 * - the extent does not run past the device's disk_total_bytes
 * - on zoned devices, offset and length are zone-size aligned
 *
 * Returns 0 if everything matches, -EUCLEAN on any inconsistency.
 */
verify_one_dev_extent(struct btrfs_fs_info * fs_info,u64 chunk_offset,u64 devid,u64 physical_offset,u64 physical_len)7755cf90d884SQu Wenruo static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7756cf90d884SQu Wenruo 				 u64 chunk_offset, u64 devid,
7757cf90d884SQu Wenruo 				 u64 physical_offset, u64 physical_len)
7758cf90d884SQu Wenruo {
7759562d7b15SJosef Bacik 	struct btrfs_dev_lookup_args args = { .devid = devid };
7760c8bf1b67SDavid Sterba 	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7761cf90d884SQu Wenruo 	struct extent_map *em;
7762cf90d884SQu Wenruo 	struct map_lookup *map;
776305a37c48SQu Wenruo 	struct btrfs_device *dev;
7764cf90d884SQu Wenruo 	u64 stripe_len;
7765cf90d884SQu Wenruo 	bool found = false;
7766cf90d884SQu Wenruo 	int ret = 0;
7767cf90d884SQu Wenruo 	int i;
7768cf90d884SQu Wenruo 
7769cf90d884SQu Wenruo 	read_lock(&em_tree->lock);
7770cf90d884SQu Wenruo 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
7771cf90d884SQu Wenruo 	read_unlock(&em_tree->lock);
7772cf90d884SQu Wenruo 
7773cf90d884SQu Wenruo 	if (!em) {
7774cf90d884SQu Wenruo 		btrfs_err(fs_info,
7775cf90d884SQu Wenruo "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
7776cf90d884SQu Wenruo 			  physical_offset, devid);
7777cf90d884SQu Wenruo 		ret = -EUCLEAN;
7778cf90d884SQu Wenruo 		goto out;
7779cf90d884SQu Wenruo 	}
7780cf90d884SQu Wenruo 
7781cf90d884SQu Wenruo 	map = em->map_lookup;
7782bc88b486SQu Wenruo 	stripe_len = btrfs_calc_stripe_length(em);
7783cf90d884SQu Wenruo 	if (physical_len != stripe_len) {
7784cf90d884SQu Wenruo 		btrfs_err(fs_info,
7785cf90d884SQu Wenruo "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
7786cf90d884SQu Wenruo 			  physical_offset, devid, em->start, physical_len,
7787cf90d884SQu Wenruo 			  stripe_len);
7788cf90d884SQu Wenruo 		ret = -EUCLEAN;
7789cf90d884SQu Wenruo 		goto out;
7790cf90d884SQu Wenruo 	}
7791cf90d884SQu Wenruo 
77923613249aSQu Wenruo 	/*
77933613249aSQu Wenruo 	 * Very old mkfs.btrfs (before v4.1) will not respect the reserved
77943613249aSQu Wenruo 	 * space. Although kernel can handle it without problem, better to warn
77953613249aSQu Wenruo 	 * the users.
77963613249aSQu Wenruo 	 */
77973613249aSQu Wenruo 	if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED)
77983613249aSQu Wenruo 		btrfs_warn(fs_info,
77993613249aSQu Wenruo 		"devid %llu physical %llu len %llu inside the reserved space",
78003613249aSQu Wenruo 			   devid, physical_offset, physical_len);
78013613249aSQu Wenruo 
	/* Find the chunk stripe that backs this exact dev extent. */
7802cf90d884SQu Wenruo 	for (i = 0; i < map->num_stripes; i++) {
7803cf90d884SQu Wenruo 		if (map->stripes[i].dev->devid == devid &&
7804cf90d884SQu Wenruo 		    map->stripes[i].physical == physical_offset) {
7805cf90d884SQu Wenruo 			found = true;
7806cf90d884SQu Wenruo 			if (map->verified_stripes >= map->num_stripes) {
7807cf90d884SQu Wenruo 				btrfs_err(fs_info,
7808cf90d884SQu Wenruo 				"too many dev extents for chunk %llu found",
7809cf90d884SQu Wenruo 					  em->start);
7810cf90d884SQu Wenruo 				ret = -EUCLEAN;
7811cf90d884SQu Wenruo 				goto out;
7812cf90d884SQu Wenruo 			}
7813cf90d884SQu Wenruo 			map->verified_stripes++;
7814cf90d884SQu Wenruo 			break;
7815cf90d884SQu Wenruo 		}
7816cf90d884SQu Wenruo 	}
7817cf90d884SQu Wenruo 	if (!found) {
7818cf90d884SQu Wenruo 		btrfs_err(fs_info,
7819cf90d884SQu Wenruo 	"dev extent physical offset %llu devid %llu has no corresponding chunk",
7820cf90d884SQu Wenruo 			physical_offset, devid);
7821cf90d884SQu Wenruo 		ret = -EUCLEAN;
7822cf90d884SQu Wenruo 	}
782305a37c48SQu Wenruo 
78241a9fd417SDavid Sterba 	/* Make sure no dev extent is beyond device boundary */
7825562d7b15SJosef Bacik 	dev = btrfs_find_device(fs_info->fs_devices, &args);
782605a37c48SQu Wenruo 	if (!dev) {
782705a37c48SQu Wenruo 		btrfs_err(fs_info, "failed to find devid %llu", devid);
782805a37c48SQu Wenruo 		ret = -EUCLEAN;
782905a37c48SQu Wenruo 		goto out;
783005a37c48SQu Wenruo 	}
78311b3922a8SQu Wenruo 
783205a37c48SQu Wenruo 	if (physical_offset + physical_len > dev->disk_total_bytes) {
783305a37c48SQu Wenruo 		btrfs_err(fs_info,
783405a37c48SQu Wenruo "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
783505a37c48SQu Wenruo 			  devid, physical_offset, physical_len,
783605a37c48SQu Wenruo 			  dev->disk_total_bytes);
783705a37c48SQu Wenruo 		ret = -EUCLEAN;
783805a37c48SQu Wenruo 		goto out;
783905a37c48SQu Wenruo 	}
7840381a696eSNaohiro Aota 
	/* Zoned devices additionally require zone-size alignment. */
7841381a696eSNaohiro Aota 	if (dev->zone_info) {
7842381a696eSNaohiro Aota 		u64 zone_size = dev->zone_info->zone_size;
7843381a696eSNaohiro Aota 
7844381a696eSNaohiro Aota 		if (!IS_ALIGNED(physical_offset, zone_size) ||
7845381a696eSNaohiro Aota 		    !IS_ALIGNED(physical_len, zone_size)) {
7846381a696eSNaohiro Aota 			btrfs_err(fs_info,
7847381a696eSNaohiro Aota "zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
7848381a696eSNaohiro Aota 				  devid, physical_offset, physical_len);
7849381a696eSNaohiro Aota 			ret = -EUCLEAN;
7850381a696eSNaohiro Aota 			goto out;
7851381a696eSNaohiro Aota 		}
7852381a696eSNaohiro Aota 	}
7853381a696eSNaohiro Aota 
7854cf90d884SQu Wenruo out:
7855cf90d884SQu Wenruo 	free_extent_map(em);
7856cf90d884SQu Wenruo 	return ret;
7857cf90d884SQu Wenruo }
7858cf90d884SQu Wenruo 
/*
 * After all dev extent items were walked, check that every chunk mapping
 * had all of its stripes matched (verified_stripes == num_stripes).
 * A mismatch means a chunk is missing one or more dev extents; return
 * -EUCLEAN in that case, 0 otherwise.
 */
verify_chunk_dev_extent_mapping(struct btrfs_fs_info * fs_info)7859cf90d884SQu Wenruo static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
7860cf90d884SQu Wenruo {
7861c8bf1b67SDavid Sterba 	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7862cf90d884SQu Wenruo 	struct extent_map *em;
7863cf90d884SQu Wenruo 	struct rb_node *node;
7864cf90d884SQu Wenruo 	int ret = 0;
7865cf90d884SQu Wenruo 
7866cf90d884SQu Wenruo 	read_lock(&em_tree->lock);
	/* Walk every chunk mapping in the tree, in offset order. */
786707e1ce09SLiu Bo 	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
7868cf90d884SQu Wenruo 		em = rb_entry(node, struct extent_map, rb_node);
7869cf90d884SQu Wenruo 		if (em->map_lookup->num_stripes !=
7870cf90d884SQu Wenruo 		    em->map_lookup->verified_stripes) {
7871cf90d884SQu Wenruo 			btrfs_err(fs_info,
7872cf90d884SQu Wenruo 			"chunk %llu has missing dev extent, have %d expect %d",
7873cf90d884SQu Wenruo 				  em->start, em->map_lookup->verified_stripes,
7874cf90d884SQu Wenruo 				  em->map_lookup->num_stripes);
7875cf90d884SQu Wenruo 			ret = -EUCLEAN;
7876cf90d884SQu Wenruo 			goto out;
7877cf90d884SQu Wenruo 		}
7878cf90d884SQu Wenruo 	}
7879cf90d884SQu Wenruo out:
7880cf90d884SQu Wenruo 	read_unlock(&em_tree->lock);
7881cf90d884SQu Wenruo 	return ret;
7882cf90d884SQu Wenruo }
7883cf90d884SQu Wenruo 
7884cf90d884SQu Wenruo /*
7885cf90d884SQu Wenruo  * Ensure that all dev extents are mapped to correct chunk, otherwise
7886cf90d884SQu Wenruo  * later chunk allocation/free would cause unexpected behavior.
7887cf90d884SQu Wenruo  *
7888cf90d884SQu Wenruo  * NOTE: This will iterate through the whole device tree, which should be of
7889cf90d884SQu Wenruo  * the same size level as the chunk tree.  This slightly increases mount time.
7890cf90d884SQu Wenruo  */
/*
 * Walk the whole device tree and cross-check every DEV_EXTENT item against
 * the in-memory chunk mappings (see verify_one_dev_extent()), also catching
 * overlapping dev extents on the same device.  Finally verify that every
 * chunk had all of its stripes accounted for.
 *
 * Returns 0 on success, -EUCLEAN on any inconsistency, or a negative errno
 * from the tree search helpers.
 */
btrfs_verify_dev_extents(struct btrfs_fs_info * fs_info)7891cf90d884SQu Wenruo int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
7892cf90d884SQu Wenruo {
7893cf90d884SQu Wenruo 	struct btrfs_path *path;
7894cf90d884SQu Wenruo 	struct btrfs_root *root = fs_info->dev_root;
7895cf90d884SQu Wenruo 	struct btrfs_key key;
	/* Track the previous item to detect overlaps within one device. */
78965eb19381SQu Wenruo 	u64 prev_devid = 0;
78975eb19381SQu Wenruo 	u64 prev_dev_ext_end = 0;
7898cf90d884SQu Wenruo 	int ret = 0;
7899cf90d884SQu Wenruo 
790042437a63SJosef Bacik 	/*
790142437a63SJosef Bacik 	 * We don't have a dev_root because we mounted with ignorebadroots and
790242437a63SJosef Bacik 	 * failed to load the root, so we want to skip the verification in this
790342437a63SJosef Bacik 	 * case for sure.
790442437a63SJosef Bacik 	 *
790542437a63SJosef Bacik 	 * However if the dev root is fine, but the tree itself is corrupted
790642437a63SJosef Bacik 	 * we'd still fail to mount.  This verification is only to make sure
790742437a63SJosef Bacik 	 * writes can happen safely, so instead just bypass this check
790842437a63SJosef Bacik 	 * completely in the case of IGNOREBADROOTS.
790942437a63SJosef Bacik 	 */
791042437a63SJosef Bacik 	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
791142437a63SJosef Bacik 		return 0;
791242437a63SJosef Bacik 
	/* Start at the lowest possible dev extent key (devid 1, offset 0). */
7913cf90d884SQu Wenruo 	key.objectid = 1;
7914cf90d884SQu Wenruo 	key.type = BTRFS_DEV_EXTENT_KEY;
7915cf90d884SQu Wenruo 	key.offset = 0;
7916cf90d884SQu Wenruo 
7917cf90d884SQu Wenruo 	path = btrfs_alloc_path();
7918cf90d884SQu Wenruo 	if (!path)
7919cf90d884SQu Wenruo 		return -ENOMEM;
7920cf90d884SQu Wenruo 
7921cf90d884SQu Wenruo 	path->reada = READA_FORWARD;
7922cf90d884SQu Wenruo 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7923cf90d884SQu Wenruo 	if (ret < 0)
7924cf90d884SQu Wenruo 		goto out;
7925cf90d884SQu Wenruo 
7926cf90d884SQu Wenruo 	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
7927ad9a9378SMarcos Paulo de Souza 		ret = btrfs_next_leaf(root, path);
7928cf90d884SQu Wenruo 		if (ret < 0)
7929cf90d884SQu Wenruo 			goto out;
7930cf90d884SQu Wenruo 		/* No dev extents at all? Not good */
7931cf90d884SQu Wenruo 		if (ret > 0) {
7932cf90d884SQu Wenruo 			ret = -EUCLEAN;
7933cf90d884SQu Wenruo 			goto out;
7934cf90d884SQu Wenruo 		}
7935cf90d884SQu Wenruo 	}
7936cf90d884SQu Wenruo 	while (1) {
7937cf90d884SQu Wenruo 		struct extent_buffer *leaf = path->nodes[0];
7938cf90d884SQu Wenruo 		struct btrfs_dev_extent *dext;
7939cf90d884SQu Wenruo 		int slot = path->slots[0];
7940cf90d884SQu Wenruo 		u64 chunk_offset;
7941cf90d884SQu Wenruo 		u64 physical_offset;
7942cf90d884SQu Wenruo 		u64 physical_len;
7943cf90d884SQu Wenruo 		u64 devid;
7944cf90d884SQu Wenruo 
7945cf90d884SQu Wenruo 		btrfs_item_key_to_cpu(leaf, &key, slot);
7946cf90d884SQu Wenruo 		if (key.type != BTRFS_DEV_EXTENT_KEY)
7947cf90d884SQu Wenruo 			break;
7948cf90d884SQu Wenruo 		devid = key.objectid;
7949cf90d884SQu Wenruo 		physical_offset = key.offset;
7950cf90d884SQu Wenruo 
7951cf90d884SQu Wenruo 		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
7952cf90d884SQu Wenruo 		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
7953cf90d884SQu Wenruo 		physical_len = btrfs_dev_extent_length(leaf, dext);
7954cf90d884SQu Wenruo 
79555eb19381SQu Wenruo 		/* Check if this dev extent overlaps with the previous one */
79565eb19381SQu Wenruo 		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
79575eb19381SQu Wenruo 			btrfs_err(fs_info,
79585eb19381SQu Wenruo "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
79595eb19381SQu Wenruo 				  devid, physical_offset, prev_dev_ext_end);
79605eb19381SQu Wenruo 			ret = -EUCLEAN;
79615eb19381SQu Wenruo 			goto out;
79625eb19381SQu Wenruo 		}
79635eb19381SQu Wenruo 
7964cf90d884SQu Wenruo 		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
7965cf90d884SQu Wenruo 					    physical_offset, physical_len);
7966cf90d884SQu Wenruo 		if (ret < 0)
7967cf90d884SQu Wenruo 			goto out;
79685eb19381SQu Wenruo 		prev_devid = devid;
79695eb19381SQu Wenruo 		prev_dev_ext_end = physical_offset + physical_len;
79705eb19381SQu Wenruo 
7971cf90d884SQu Wenruo 		ret = btrfs_next_item(root, path);
7972cf90d884SQu Wenruo 		if (ret < 0)
7973cf90d884SQu Wenruo 			goto out;
7974cf90d884SQu Wenruo 		if (ret > 0) {
7975cf90d884SQu Wenruo 			ret = 0;
7976cf90d884SQu Wenruo 			break;
7977cf90d884SQu Wenruo 		}
7978cf90d884SQu Wenruo 	}
7979cf90d884SQu Wenruo 
7980cf90d884SQu Wenruo 	/* Ensure all chunks have corresponding dev extents */
7981cf90d884SQu Wenruo 	ret = verify_chunk_dev_extent_mapping(fs_info);
7982cf90d884SQu Wenruo out:
7983cf90d884SQu Wenruo 	btrfs_free_path(path);
7984cf90d884SQu Wenruo 	return ret;
7985cf90d884SQu Wenruo }
7986eede2bf3SOmar Sandoval 
7987eede2bf3SOmar Sandoval /*
7988eede2bf3SOmar Sandoval  * Check whether the given block group or device is pinned by any inode being
7989eede2bf3SOmar Sandoval  * used as a swapfile.
7990eede2bf3SOmar Sandoval  */
btrfs_pinned_by_swapfile(struct btrfs_fs_info * fs_info,void * ptr)7991eede2bf3SOmar Sandoval bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
7992eede2bf3SOmar Sandoval {
7993eede2bf3SOmar Sandoval 	struct btrfs_swapfile_pin *sp;
7994eede2bf3SOmar Sandoval 	struct rb_node *node;
7995eede2bf3SOmar Sandoval 
	/* Binary-search the swapfile pin rbtree, keyed by the pinned pointer. */
7996eede2bf3SOmar Sandoval 	spin_lock(&fs_info->swapfile_pins_lock);
7997eede2bf3SOmar Sandoval 	node = fs_info->swapfile_pins.rb_node;
7998eede2bf3SOmar Sandoval 	while (node) {
7999eede2bf3SOmar Sandoval 		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
8000eede2bf3SOmar Sandoval 		if (ptr < sp->ptr)
8001eede2bf3SOmar Sandoval 			node = node->rb_left;
8002eede2bf3SOmar Sandoval 		else if (ptr > sp->ptr)
8003eede2bf3SOmar Sandoval 			node = node->rb_right;
8004eede2bf3SOmar Sandoval 		else
8005eede2bf3SOmar Sandoval 			break;
8006eede2bf3SOmar Sandoval 	}
8007eede2bf3SOmar Sandoval 	spin_unlock(&fs_info->swapfile_pins_lock);
	/* Non-NULL node means an exact match was found, i.e. it is pinned. */
8008eede2bf3SOmar Sandoval 	return node != NULL;
8009eede2bf3SOmar Sandoval }
8010f7ef5287SNaohiro Aota 
/*
 * Kthread body started by btrfs_repair_one_zone(): relocate the block group
 * passed in @data (a struct btrfs_block_group, with a reference we own and
 * drop early) to repair an I/O failure on a zoned filesystem.
 *
 * Takes the BALANCE exclusive-op slot and reclaim_bgs_lock for the duration,
 * and holds an sb write reference so the fs cannot be frozen underneath us.
 */
relocating_repair_kthread(void * data)8011f7ef5287SNaohiro Aota static int relocating_repair_kthread(void *data)
8012f7ef5287SNaohiro Aota {
80130d031dc4SYu Zhe 	struct btrfs_block_group *cache = data;
8014f7ef5287SNaohiro Aota 	struct btrfs_fs_info *fs_info = cache->fs_info;
8015f7ef5287SNaohiro Aota 	u64 target;
8016f7ef5287SNaohiro Aota 	int ret = 0;
8017f7ef5287SNaohiro Aota 
	/* Remember only the start offset; re-look the group up under the lock. */
8018f7ef5287SNaohiro Aota 	target = cache->start;
8019f7ef5287SNaohiro Aota 	btrfs_put_block_group(cache);
8020f7ef5287SNaohiro Aota 
8021ca5e4ea0SNaohiro Aota 	sb_start_write(fs_info->sb);
8022f7ef5287SNaohiro Aota 	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
8023f7ef5287SNaohiro Aota 		btrfs_info(fs_info,
8024f7ef5287SNaohiro Aota 			   "zoned: skip relocating block group %llu to repair: EBUSY",
8025f7ef5287SNaohiro Aota 			   target);
8026ca5e4ea0SNaohiro Aota 		sb_end_write(fs_info->sb);
8027f7ef5287SNaohiro Aota 		return -EBUSY;
8028f7ef5287SNaohiro Aota 	}
8029f7ef5287SNaohiro Aota 
8030f3372065SJohannes Thumshirn 	mutex_lock(&fs_info->reclaim_bgs_lock);
8031f7ef5287SNaohiro Aota 
8032f7ef5287SNaohiro Aota 	/* Ensure block group still exists */
8033f7ef5287SNaohiro Aota 	cache = btrfs_lookup_block_group(fs_info, target);
8034f7ef5287SNaohiro Aota 	if (!cache)
8035f7ef5287SNaohiro Aota 		goto out;
8036f7ef5287SNaohiro Aota 
	/* Someone may have cleared the repair flag meanwhile; nothing to do. */
80373349b57fSJosef Bacik 	if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags))
8038f7ef5287SNaohiro Aota 		goto out;
8039f7ef5287SNaohiro Aota 
8040f7ef5287SNaohiro Aota 	ret = btrfs_may_alloc_data_chunk(fs_info, target);
8041f7ef5287SNaohiro Aota 	if (ret < 0)
8042f7ef5287SNaohiro Aota 		goto out;
8043f7ef5287SNaohiro Aota 
8044f7ef5287SNaohiro Aota 	btrfs_info(fs_info,
8045f7ef5287SNaohiro Aota 		   "zoned: relocating block group %llu to repair IO failure",
8046f7ef5287SNaohiro Aota 		   target);
8047f7ef5287SNaohiro Aota 	ret = btrfs_relocate_chunk(fs_info, target);
8048f7ef5287SNaohiro Aota 
8049f7ef5287SNaohiro Aota out:
8050f7ef5287SNaohiro Aota 	if (cache)
8051f7ef5287SNaohiro Aota 		btrfs_put_block_group(cache);
8052f3372065SJohannes Thumshirn 	mutex_unlock(&fs_info->reclaim_bgs_lock);
8053f7ef5287SNaohiro Aota 	btrfs_exclop_finish(fs_info);
8054ca5e4ea0SNaohiro Aota 	sb_end_write(fs_info->sb);
8055f7ef5287SNaohiro Aota 
8056f7ef5287SNaohiro Aota 	return ret;
8057f7ef5287SNaohiro Aota }
8058f7ef5287SNaohiro Aota 
/*
 * On a zoned filesystem, kick off asynchronous relocation of the block
 * group containing @logical to repair an I/O failure.  Returns false only
 * when the fs is not zoned (so the caller falls back to regular handling);
 * returns true in every zoned case, whether or not a repair was started
 * (degraded mount, missing block group, or repair already in flight).
 */
btrfs_repair_one_zone(struct btrfs_fs_info * fs_info,u64 logical)8059554aed7dSJohannes Thumshirn bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
8060f7ef5287SNaohiro Aota {
8061f7ef5287SNaohiro Aota 	struct btrfs_block_group *cache;
8062f7ef5287SNaohiro Aota 
8063554aed7dSJohannes Thumshirn 	if (!btrfs_is_zoned(fs_info))
8064554aed7dSJohannes Thumshirn 		return false;
8065554aed7dSJohannes Thumshirn 
8066f7ef5287SNaohiro Aota 	/* Do not attempt to repair in degraded state */
8067f7ef5287SNaohiro Aota 	if (btrfs_test_opt(fs_info, DEGRADED))
8068554aed7dSJohannes Thumshirn 		return true;
8069f7ef5287SNaohiro Aota 
8070f7ef5287SNaohiro Aota 	cache = btrfs_lookup_block_group(fs_info, logical);
8071f7ef5287SNaohiro Aota 	if (!cache)
8072554aed7dSJohannes Thumshirn 		return true;
8073f7ef5287SNaohiro Aota 
	/* Flag already set: a repair kthread is (or was) already handling it. */
80743349b57fSJosef Bacik 	if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) {
8075f7ef5287SNaohiro Aota 		btrfs_put_block_group(cache);
8076554aed7dSJohannes Thumshirn 		return true;
8077f7ef5287SNaohiro Aota 	}
8078f7ef5287SNaohiro Aota 
	/* The kthread takes over our block group reference. */
8079f7ef5287SNaohiro Aota 	kthread_run(relocating_repair_kthread, cache,
8080f7ef5287SNaohiro Aota 		    "btrfs-relocating-repair");
8081f7ef5287SNaohiro Aota 
8082554aed7dSJohannes Thumshirn 	return true;
8083f7ef5287SNaohiro Aota }
80844886ff7bSQu Wenruo 
/*
 * For a RAID56 repair write, resolve @logical to the single data stripe of
 * @bioc that covers it and fill @smap with that stripe's device and
 * physical offset.  @logical must fall inside one of the data stripes
 * (asserted); parity stripes are never the target of a repair write.
 */
map_raid56_repair_block(struct btrfs_io_context * bioc,struct btrfs_io_stripe * smap,u64 logical)80854886ff7bSQu Wenruo static void map_raid56_repair_block(struct btrfs_io_context *bioc,
80864886ff7bSQu Wenruo 				    struct btrfs_io_stripe *smap,
80874886ff7bSQu Wenruo 				    u64 logical)
80884886ff7bSQu Wenruo {
80894886ff7bSQu Wenruo 	int data_stripes = nr_bioc_data_stripes(bioc);
80904886ff7bSQu Wenruo 	int i;
80914886ff7bSQu Wenruo 
	/* Find the data stripe whose BTRFS_STRIPE_LEN window contains @logical. */
80924886ff7bSQu Wenruo 	for (i = 0; i < data_stripes; i++) {
80934886ff7bSQu Wenruo 		u64 stripe_start = bioc->full_stripe_logical +
8094cb091225SQu Wenruo 				   btrfs_stripe_nr_to_offset(i);
80954886ff7bSQu Wenruo 
80964886ff7bSQu Wenruo 		if (logical >= stripe_start &&
80974886ff7bSQu Wenruo 		    logical < stripe_start + BTRFS_STRIPE_LEN)
80984886ff7bSQu Wenruo 			break;
80994886ff7bSQu Wenruo 	}
81004886ff7bSQu Wenruo 	ASSERT(i < data_stripes);
81014886ff7bSQu Wenruo 	smap->dev = bioc->stripes[i].dev;
	/* Offset within the stripe is the logical offset masked to stripe size. */
81024886ff7bSQu Wenruo 	smap->physical = bioc->stripes[i].physical +
81034886ff7bSQu Wenruo 			((logical - bioc->full_stripe_logical) &
81044886ff7bSQu Wenruo 			 BTRFS_STRIPE_LEN_MASK);
81054886ff7bSQu Wenruo }
81064886ff7bSQu Wenruo 
81074886ff7bSQu Wenruo /*
81084886ff7bSQu Wenruo  * Map a repair write into a single device.
81094886ff7bSQu Wenruo  *
81104886ff7bSQu Wenruo  * A repair write is triggered by read time repair or scrub, which would only
81114886ff7bSQu Wenruo  * update the contents of a single device.
81124886ff7bSQu Wenruo  * Not update any other mirrors nor go through RMW path.
81134886ff7bSQu Wenruo  *
81144886ff7bSQu Wenruo  * Callers should ensure:
81154886ff7bSQu Wenruo  *
81164886ff7bSQu Wenruo  * - Call btrfs_bio_counter_inc_blocked() first
81174886ff7bSQu Wenruo  * - The range does not cross stripe boundary
81184886ff7bSQu Wenruo  * - Has a valid @mirror_num passed in.
81194886ff7bSQu Wenruo  */
/*
 * Resolve a repair write at (@logical, @length) for mirror @mirror_num to a
 * single device stripe in @smap.  See the preceding comment block for the
 * caller requirements (bio counter held, no stripe-boundary crossing,
 * valid mirror_num).  Returns 0 on success or a negative errno from
 * btrfs_map_block().
 */
btrfs_map_repair_block(struct btrfs_fs_info * fs_info,struct btrfs_io_stripe * smap,u64 logical,u32 length,int mirror_num)81204886ff7bSQu Wenruo int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
81214886ff7bSQu Wenruo 			   struct btrfs_io_stripe *smap, u64 logical,
81224886ff7bSQu Wenruo 			   u32 length, int mirror_num)
81234886ff7bSQu Wenruo {
81244886ff7bSQu Wenruo 	struct btrfs_io_context *bioc = NULL;
81254886ff7bSQu Wenruo 	u64 map_length = length;
81264886ff7bSQu Wenruo 	int mirror_ret = mirror_num;
81274886ff7bSQu Wenruo 	int ret;
81284886ff7bSQu Wenruo 
81294886ff7bSQu Wenruo 	ASSERT(mirror_num > 0);
81304886ff7bSQu Wenruo 
8131cd4efd21SChristoph Hellwig 	ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
81324886ff7bSQu Wenruo 			      &bioc, smap, &mirror_ret, true);
81334886ff7bSQu Wenruo 	if (ret < 0)
81344886ff7bSQu Wenruo 		return ret;
81354886ff7bSQu Wenruo 
81364886ff7bSQu Wenruo 	/* The map range should not cross stripe boundary. */
81374886ff7bSQu Wenruo 	ASSERT(map_length >= length);
81384886ff7bSQu Wenruo 
81394886ff7bSQu Wenruo 	/* Already mapped to single stripe. */
81404886ff7bSQu Wenruo 	if (!bioc)
81414886ff7bSQu Wenruo 		goto out;
81424886ff7bSQu Wenruo 
81434886ff7bSQu Wenruo 	/* Map the RAID56 multi-stripe writes to a single one. */
81444886ff7bSQu Wenruo 	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
81454886ff7bSQu Wenruo 		map_raid56_repair_block(bioc, smap, logical);
81464886ff7bSQu Wenruo 		goto out;
81474886ff7bSQu Wenruo 	}
81484886ff7bSQu Wenruo 
	/* Mirrored profiles: pick the stripe for the requested mirror (1-based). */
81494886ff7bSQu Wenruo 	ASSERT(mirror_num <= bioc->num_stripes);
81504886ff7bSQu Wenruo 	smap->dev = bioc->stripes[mirror_num - 1].dev;
81514886ff7bSQu Wenruo 	smap->physical = bioc->stripes[mirror_num - 1].physical;
81524886ff7bSQu Wenruo out:
81534886ff7bSQu Wenruo 	btrfs_put_bioc(bioc);
81544886ff7bSQu Wenruo 	ASSERT(smap->dev);
81554886ff7bSQu Wenruo 	return 0;
81564886ff7bSQu Wenruo }
8157