xref: /openbmc/linux/fs/btrfs/volumes.c (revision bacce86ae8a7b8b3c7d8398eb57d151a808043d1)
1c1d7c514SDavid Sterba // SPDX-License-Identifier: GPL-2.0
20b86a832SChris Mason /*
30b86a832SChris Mason  * Copyright (C) 2007 Oracle.  All rights reserved.
40b86a832SChris Mason  */
5c1d7c514SDavid Sterba 
60b86a832SChris Mason #include <linux/sched.h>
7fccc0007SJosef Bacik #include <linux/sched/mm.h>
80b86a832SChris Mason #include <linux/bio.h>
95a0e3ad6STejun Heo #include <linux/slab.h>
10f2d8d74dSChris Mason #include <linux/blkdev.h>
11442a4f63SStefan Behrens #include <linux/ratelimit.h>
1259641015SIlya Dryomov #include <linux/kthread.h>
1353b381b3SDavid Woodhouse #include <linux/raid/pq.h>
14803b2f54SStefan Behrens #include <linux/semaphore.h>
158da4b8c4SAndy Shevchenko #include <linux/uuid.h>
16f8e10cd3SAnand Jain #include <linux/list_sort.h>
17784352feSDavid Sterba #include "misc.h"
180b86a832SChris Mason #include "ctree.h"
190b86a832SChris Mason #include "extent_map.h"
200b86a832SChris Mason #include "disk-io.h"
210b86a832SChris Mason #include "transaction.h"
220b86a832SChris Mason #include "print-tree.h"
230b86a832SChris Mason #include "volumes.h"
2453b381b3SDavid Woodhouse #include "raid56.h"
258b712842SChris Mason #include "async-thread.h"
2621adbd5cSStefan Behrens #include "check-integrity.h"
27606686eeSJosef Bacik #include "rcu-string.h"
288dabb742SStefan Behrens #include "dev-replace.h"
2999994cdeSAnand Jain #include "sysfs.h"
3082fc28fbSQu Wenruo #include "tree-checker.h"
318719aaaeSJosef Bacik #include "space-info.h"
32aac0023cSJosef Bacik #include "block-group.h"
33b0643e59SDennis Zhou #include "discard.h"
340b86a832SChris Mason 
35af902047SZhao Lei const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
36af902047SZhao Lei 	[BTRFS_RAID_RAID10] = {
37af902047SZhao Lei 		.sub_stripes	= 2,
38af902047SZhao Lei 		.dev_stripes	= 1,
39af902047SZhao Lei 		.devs_max	= 0,	/* 0 == as many as possible */
40af902047SZhao Lei 		.devs_min	= 4,
418789f4feSZhao Lei 		.tolerated_failures = 1,
42af902047SZhao Lei 		.devs_increment	= 2,
43af902047SZhao Lei 		.ncopies	= 2,
44b50836edSHans van Kranenburg 		.nparity        = 0,
45ed23467bSAnand Jain 		.raid_name	= "raid10",
4641a6e891SAnand Jain 		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
47f9fbcaa2SAnand Jain 		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
48af902047SZhao Lei 	},
49af902047SZhao Lei 	[BTRFS_RAID_RAID1] = {
50af902047SZhao Lei 		.sub_stripes	= 1,
51af902047SZhao Lei 		.dev_stripes	= 1,
52af902047SZhao Lei 		.devs_max	= 2,
53af902047SZhao Lei 		.devs_min	= 2,
548789f4feSZhao Lei 		.tolerated_failures = 1,
55af902047SZhao Lei 		.devs_increment	= 2,
56af902047SZhao Lei 		.ncopies	= 2,
57b50836edSHans van Kranenburg 		.nparity        = 0,
58ed23467bSAnand Jain 		.raid_name	= "raid1",
5941a6e891SAnand Jain 		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
60f9fbcaa2SAnand Jain 		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
61af902047SZhao Lei 	},
6247e6f742SDavid Sterba 	[BTRFS_RAID_RAID1C3] = {
6347e6f742SDavid Sterba 		.sub_stripes	= 1,
6447e6f742SDavid Sterba 		.dev_stripes	= 1,
65cf93e15eSDavid Sterba 		.devs_max	= 3,
6647e6f742SDavid Sterba 		.devs_min	= 3,
6747e6f742SDavid Sterba 		.tolerated_failures = 2,
6847e6f742SDavid Sterba 		.devs_increment	= 3,
6947e6f742SDavid Sterba 		.ncopies	= 3,
70db26a024SDavid Sterba 		.nparity        = 0,
7147e6f742SDavid Sterba 		.raid_name	= "raid1c3",
7247e6f742SDavid Sterba 		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
7347e6f742SDavid Sterba 		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
7447e6f742SDavid Sterba 	},
758d6fac00SDavid Sterba 	[BTRFS_RAID_RAID1C4] = {
768d6fac00SDavid Sterba 		.sub_stripes	= 1,
778d6fac00SDavid Sterba 		.dev_stripes	= 1,
78cf93e15eSDavid Sterba 		.devs_max	= 4,
798d6fac00SDavid Sterba 		.devs_min	= 4,
808d6fac00SDavid Sterba 		.tolerated_failures = 3,
818d6fac00SDavid Sterba 		.devs_increment	= 4,
828d6fac00SDavid Sterba 		.ncopies	= 4,
83db26a024SDavid Sterba 		.nparity        = 0,
848d6fac00SDavid Sterba 		.raid_name	= "raid1c4",
858d6fac00SDavid Sterba 		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C4,
868d6fac00SDavid Sterba 		.mindev_error	= BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
878d6fac00SDavid Sterba 	},
88af902047SZhao Lei 	[BTRFS_RAID_DUP] = {
89af902047SZhao Lei 		.sub_stripes	= 1,
90af902047SZhao Lei 		.dev_stripes	= 2,
91af902047SZhao Lei 		.devs_max	= 1,
92af902047SZhao Lei 		.devs_min	= 1,
938789f4feSZhao Lei 		.tolerated_failures = 0,
94af902047SZhao Lei 		.devs_increment	= 1,
95af902047SZhao Lei 		.ncopies	= 2,
96b50836edSHans van Kranenburg 		.nparity        = 0,
97ed23467bSAnand Jain 		.raid_name	= "dup",
9841a6e891SAnand Jain 		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
99f9fbcaa2SAnand Jain 		.mindev_error	= 0,
100af902047SZhao Lei 	},
101af902047SZhao Lei 	[BTRFS_RAID_RAID0] = {
102af902047SZhao Lei 		.sub_stripes	= 1,
103af902047SZhao Lei 		.dev_stripes	= 1,
104af902047SZhao Lei 		.devs_max	= 0,
105af902047SZhao Lei 		.devs_min	= 2,
1068789f4feSZhao Lei 		.tolerated_failures = 0,
107af902047SZhao Lei 		.devs_increment	= 1,
108af902047SZhao Lei 		.ncopies	= 1,
109b50836edSHans van Kranenburg 		.nparity        = 0,
110ed23467bSAnand Jain 		.raid_name	= "raid0",
11141a6e891SAnand Jain 		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
112f9fbcaa2SAnand Jain 		.mindev_error	= 0,
113af902047SZhao Lei 	},
114af902047SZhao Lei 	[BTRFS_RAID_SINGLE] = {
115af902047SZhao Lei 		.sub_stripes	= 1,
116af902047SZhao Lei 		.dev_stripes	= 1,
117af902047SZhao Lei 		.devs_max	= 1,
118af902047SZhao Lei 		.devs_min	= 1,
1198789f4feSZhao Lei 		.tolerated_failures = 0,
120af902047SZhao Lei 		.devs_increment	= 1,
121af902047SZhao Lei 		.ncopies	= 1,
122b50836edSHans van Kranenburg 		.nparity        = 0,
123ed23467bSAnand Jain 		.raid_name	= "single",
12441a6e891SAnand Jain 		.bg_flag	= 0,
125f9fbcaa2SAnand Jain 		.mindev_error	= 0,
126af902047SZhao Lei 	},
127af902047SZhao Lei 	[BTRFS_RAID_RAID5] = {
128af902047SZhao Lei 		.sub_stripes	= 1,
129af902047SZhao Lei 		.dev_stripes	= 1,
130af902047SZhao Lei 		.devs_max	= 0,
131af902047SZhao Lei 		.devs_min	= 2,
1328789f4feSZhao Lei 		.tolerated_failures = 1,
133af902047SZhao Lei 		.devs_increment	= 1,
134da612e31SHans van Kranenburg 		.ncopies	= 1,
135b50836edSHans van Kranenburg 		.nparity        = 1,
136ed23467bSAnand Jain 		.raid_name	= "raid5",
13741a6e891SAnand Jain 		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
138f9fbcaa2SAnand Jain 		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
139af902047SZhao Lei 	},
140af902047SZhao Lei 	[BTRFS_RAID_RAID6] = {
141af902047SZhao Lei 		.sub_stripes	= 1,
142af902047SZhao Lei 		.dev_stripes	= 1,
143af902047SZhao Lei 		.devs_max	= 0,
144af902047SZhao Lei 		.devs_min	= 3,
1458789f4feSZhao Lei 		.tolerated_failures = 2,
146af902047SZhao Lei 		.devs_increment	= 1,
147da612e31SHans van Kranenburg 		.ncopies	= 1,
148b50836edSHans van Kranenburg 		.nparity        = 2,
149ed23467bSAnand Jain 		.raid_name	= "raid6",
15041a6e891SAnand Jain 		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
151f9fbcaa2SAnand Jain 		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
152af902047SZhao Lei 	},
153af902047SZhao Lei };
154af902047SZhao Lei 
155158da513SDavid Sterba const char *btrfs_bg_type_to_raid_name(u64 flags)
156ed23467bSAnand Jain {
157158da513SDavid Sterba 	const int index = btrfs_bg_flags_to_raid_index(flags);
158158da513SDavid Sterba 
159158da513SDavid Sterba 	if (index >= BTRFS_NR_RAID_TYPES)
160ed23467bSAnand Jain 		return NULL;
161ed23467bSAnand Jain 
162158da513SDavid Sterba 	return btrfs_raid_array[index].raid_name;
163ed23467bSAnand Jain }
164ed23467bSAnand Jain 
165f89e09cfSAnand Jain /*
166f89e09cfSAnand Jain  * Fill @buf with textual description of @bg_flags, no more than @size_buf
167f89e09cfSAnand Jain  * bytes including terminating null byte.
168f89e09cfSAnand Jain  */
169f89e09cfSAnand Jain void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
170f89e09cfSAnand Jain {
171f89e09cfSAnand Jain 	int i;
172f89e09cfSAnand Jain 	int ret;
173f89e09cfSAnand Jain 	char *bp = buf;
174f89e09cfSAnand Jain 	u64 flags = bg_flags;
175f89e09cfSAnand Jain 	u32 size_bp = size_buf;
176f89e09cfSAnand Jain 
177f89e09cfSAnand Jain 	if (!flags) {
178f89e09cfSAnand Jain 		strcpy(bp, "NONE");
179f89e09cfSAnand Jain 		return;
180f89e09cfSAnand Jain 	}
181f89e09cfSAnand Jain 
182f89e09cfSAnand Jain #define DESCRIBE_FLAG(flag, desc)						\
183f89e09cfSAnand Jain 	do {								\
184f89e09cfSAnand Jain 		if (flags & (flag)) {					\
185f89e09cfSAnand Jain 			ret = snprintf(bp, size_bp, "%s|", (desc));	\
186f89e09cfSAnand Jain 			if (ret < 0 || ret >= size_bp)			\
187f89e09cfSAnand Jain 				goto out_overflow;			\
188f89e09cfSAnand Jain 			size_bp -= ret;					\
189f89e09cfSAnand Jain 			bp += ret;					\
190f89e09cfSAnand Jain 			flags &= ~(flag);				\
191f89e09cfSAnand Jain 		}							\
192f89e09cfSAnand Jain 	} while (0)
193f89e09cfSAnand Jain 
194f89e09cfSAnand Jain 	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
195f89e09cfSAnand Jain 	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
196f89e09cfSAnand Jain 	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
197f89e09cfSAnand Jain 
198f89e09cfSAnand Jain 	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
199f89e09cfSAnand Jain 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
200f89e09cfSAnand Jain 		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
201f89e09cfSAnand Jain 			      btrfs_raid_array[i].raid_name);
202f89e09cfSAnand Jain #undef DESCRIBE_FLAG
203f89e09cfSAnand Jain 
204f89e09cfSAnand Jain 	if (flags) {
205f89e09cfSAnand Jain 		ret = snprintf(bp, size_bp, "0x%llx|", flags);
206f89e09cfSAnand Jain 		size_bp -= ret;
207f89e09cfSAnand Jain 	}
208f89e09cfSAnand Jain 
209f89e09cfSAnand Jain 	if (size_bp < size_buf)
210f89e09cfSAnand Jain 		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
211f89e09cfSAnand Jain 
212f89e09cfSAnand Jain 	/*
213f89e09cfSAnand Jain 	 * The text is trimmed, it's up to the caller to provide sufficiently
214f89e09cfSAnand Jain 	 * large buffer
215f89e09cfSAnand Jain 	 */
216f89e09cfSAnand Jain out_overflow:;
217f89e09cfSAnand Jain }
218f89e09cfSAnand Jain 
2196f8e0fc7SDavid Sterba static int init_first_rw_device(struct btrfs_trans_handle *trans);
2202ff7e61eSJeff Mahoney static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
22148a3b636SEric Sandeen static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
222733f4fbbSStefan Behrens static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
2235ab56090SLiu Bo static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
2245ab56090SLiu Bo 			     enum btrfs_map_op op,
2255ab56090SLiu Bo 			     u64 logical, u64 *length,
2265ab56090SLiu Bo 			     struct btrfs_bio **bbio_ret,
2275ab56090SLiu Bo 			     int mirror_num, int need_raid_map);
2282b82032cSYan Zheng 
2299c6b1c4dSDavid Sterba /*
2309c6b1c4dSDavid Sterba  * Device locking
2319c6b1c4dSDavid Sterba  * ==============
2329c6b1c4dSDavid Sterba  *
2339c6b1c4dSDavid Sterba  * There are several mutexes that protect manipulation of devices and low-level
2349c6b1c4dSDavid Sterba  * structures like chunks but not block groups, extents or files
2359c6b1c4dSDavid Sterba  *
2369c6b1c4dSDavid Sterba  * uuid_mutex (global lock)
2379c6b1c4dSDavid Sterba  * ------------------------
2389c6b1c4dSDavid Sterba  * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
2399c6b1c4dSDavid Sterba  * the SCAN_DEV ioctl registration or from mount either implicitly (the first
2409c6b1c4dSDavid Sterba  * device) or requested by the device= mount option
2419c6b1c4dSDavid Sterba  *
2429c6b1c4dSDavid Sterba  * the mutex can be very coarse and can cover long-running operations
2439c6b1c4dSDavid Sterba  *
2449c6b1c4dSDavid Sterba  * protects: updates to fs_devices counters like missing devices, rw devices,
24552042d8eSAndrea Gelmini  * seeding, structure cloning, opening/closing devices at mount/umount time
2469c6b1c4dSDavid Sterba  *
2479c6b1c4dSDavid Sterba  * global::fs_devs - add, remove, updates to the global list
2489c6b1c4dSDavid Sterba  *
24918c850fdSJosef Bacik  * does not protect: manipulation of the fs_devices::devices list in general
25018c850fdSJosef Bacik  * but in mount context it could be used to exclude list modifications by eg.
25118c850fdSJosef Bacik  * scan ioctl
2529c6b1c4dSDavid Sterba  *
2539c6b1c4dSDavid Sterba  * btrfs_device::name - renames (write side), read is RCU
2549c6b1c4dSDavid Sterba  *
2559c6b1c4dSDavid Sterba  * fs_devices::device_list_mutex (per-fs, with RCU)
2569c6b1c4dSDavid Sterba  * ------------------------------------------------
2579c6b1c4dSDavid Sterba  * protects updates to fs_devices::devices, ie. adding and deleting
2589c6b1c4dSDavid Sterba  *
2599c6b1c4dSDavid Sterba  * simple list traversal with read-only actions can be done with RCU protection
2609c6b1c4dSDavid Sterba  *
2619c6b1c4dSDavid Sterba  * may be used to exclude some operations from running concurrently without any
2629c6b1c4dSDavid Sterba  * modifications to the list (see write_all_supers)
2639c6b1c4dSDavid Sterba  *
26418c850fdSJosef Bacik  * Is not required at mount and close times, because our device list is
26518c850fdSJosef Bacik  * protected by the uuid_mutex at that point.
26618c850fdSJosef Bacik  *
2679c6b1c4dSDavid Sterba  * balance_mutex
2689c6b1c4dSDavid Sterba  * -------------
2699c6b1c4dSDavid Sterba  * protects balance structures (status, state) and context accessed from
2709c6b1c4dSDavid Sterba  * several places (internally, ioctl)
2719c6b1c4dSDavid Sterba  *
2729c6b1c4dSDavid Sterba  * chunk_mutex
2739c6b1c4dSDavid Sterba  * -----------
2749c6b1c4dSDavid Sterba  * protects chunks, adding or removing during allocation, trim or when a new
2750b6f5d40SNikolay Borisov  * device is added/removed. Additionally it also protects post_commit_list of
2760b6f5d40SNikolay Borisov  * individual devices, since they can be added to the transaction's
2770b6f5d40SNikolay Borisov  * post_commit_list only with chunk_mutex held.
2789c6b1c4dSDavid Sterba  *
2799c6b1c4dSDavid Sterba  * cleaner_mutex
2809c6b1c4dSDavid Sterba  * -------------
2819c6b1c4dSDavid Sterba  * a big lock that is held by the cleaner thread and prevents running subvolume
2829c6b1c4dSDavid Sterba  * cleaning together with relocation or delayed iputs
2839c6b1c4dSDavid Sterba  *
2849c6b1c4dSDavid Sterba  *
2859c6b1c4dSDavid Sterba  * Lock nesting
2869c6b1c4dSDavid Sterba  * ============
2879c6b1c4dSDavid Sterba  *
2889c6b1c4dSDavid Sterba  * uuid_mutex
2899c6b1c4dSDavid Sterba  *   device_list_mutex
2909c6b1c4dSDavid Sterba  *     chunk_mutex
2919c6b1c4dSDavid Sterba  *   balance_mutex
29289595e80SAnand Jain  *
29389595e80SAnand Jain  *
294c3e1f96cSGoldwyn Rodrigues  * Exclusive operations
295c3e1f96cSGoldwyn Rodrigues  * ====================
29689595e80SAnand Jain  *
29789595e80SAnand Jain  * Maintains the exclusivity of the following operations that apply to the
29889595e80SAnand Jain  * whole filesystem and cannot run in parallel.
29989595e80SAnand Jain  *
30089595e80SAnand Jain  * - Balance (*)
30189595e80SAnand Jain  * - Device add
30289595e80SAnand Jain  * - Device remove
30389595e80SAnand Jain  * - Device replace (*)
30489595e80SAnand Jain  * - Resize
30589595e80SAnand Jain  *
30689595e80SAnand Jain  * The device operations (as above) can be in one of the following states:
30789595e80SAnand Jain  *
30889595e80SAnand Jain  * - Running state
30989595e80SAnand Jain  * - Paused state
31089595e80SAnand Jain  * - Completed state
31189595e80SAnand Jain  *
31289595e80SAnand Jain  * Only device operations marked with (*) can go into the Paused state for the
31389595e80SAnand Jain  * following reasons:
31489595e80SAnand Jain  *
31589595e80SAnand Jain  * - ioctl (only Balance can be Paused through ioctl)
31689595e80SAnand Jain  * - filesystem remounted as read-only
31789595e80SAnand Jain  * - filesystem unmounted and mounted as read-only
31889595e80SAnand Jain  * - system power-cycle and filesystem mounted as read-only
31989595e80SAnand Jain  * - filesystem or device errors leading to forced read-only
32089595e80SAnand Jain  *
321c3e1f96cSGoldwyn Rodrigues  * The status of exclusive operation is set and cleared atomically.
322c3e1f96cSGoldwyn Rodrigues  * During the course of Paused state, fs_info::exclusive_operation remains set.
32389595e80SAnand Jain  * A device operation in Paused or Running state can be canceled or resumed
32489595e80SAnand Jain  * either by ioctl (Balance only) or when remounted as read-write.
325c3e1f96cSGoldwyn Rodrigues  * The exclusive status is cleared when the device operation is canceled or
32689595e80SAnand Jain  * completed.
3279c6b1c4dSDavid Sterba  */
3289c6b1c4dSDavid Sterba 
32967a2c45eSMiao Xie DEFINE_MUTEX(uuid_mutex);
3308a4b83ccSChris Mason static LIST_HEAD(fs_uuids);
3314143cb8bSDavid Sterba struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
332c73eccf7SAnand Jain {
333c73eccf7SAnand Jain 	return &fs_uuids;
334c73eccf7SAnand Jain }
3358a4b83ccSChris Mason 
3362dfeca9bSDavid Sterba /*
3372dfeca9bSDavid Sterba  * alloc_fs_devices - allocate struct btrfs_fs_devices
3387239ff4bSNikolay Borisov  * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
3397239ff4bSNikolay Borisov  * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
3402dfeca9bSDavid Sterba  *
3412dfeca9bSDavid Sterba  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
3422dfeca9bSDavid Sterba  * The returned struct is not linked onto any lists and can be destroyed with
3432dfeca9bSDavid Sterba  * kfree() right away.
3442dfeca9bSDavid Sterba  */
3457239ff4bSNikolay Borisov static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
3467239ff4bSNikolay Borisov 						 const u8 *metadata_fsid)
3472208a378SIlya Dryomov {
3482208a378SIlya Dryomov 	struct btrfs_fs_devices *fs_devs;
3492208a378SIlya Dryomov 
35078f2c9e6SDavid Sterba 	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
3512208a378SIlya Dryomov 	if (!fs_devs)
3522208a378SIlya Dryomov 		return ERR_PTR(-ENOMEM);
3532208a378SIlya Dryomov 
3542208a378SIlya Dryomov 	mutex_init(&fs_devs->device_list_mutex);
3552208a378SIlya Dryomov 
3562208a378SIlya Dryomov 	INIT_LIST_HEAD(&fs_devs->devices);
3572208a378SIlya Dryomov 	INIT_LIST_HEAD(&fs_devs->alloc_list);
358c4babc5eSAnand Jain 	INIT_LIST_HEAD(&fs_devs->fs_list);
359944d3f9fSNikolay Borisov 	INIT_LIST_HEAD(&fs_devs->seed_list);
3602208a378SIlya Dryomov 	if (fsid)
3612208a378SIlya Dryomov 		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
3622208a378SIlya Dryomov 
3637239ff4bSNikolay Borisov 	if (metadata_fsid)
3647239ff4bSNikolay Borisov 		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
3657239ff4bSNikolay Borisov 	else if (fsid)
3667239ff4bSNikolay Borisov 		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
3677239ff4bSNikolay Borisov 
3682208a378SIlya Dryomov 	return fs_devs;
3692208a378SIlya Dryomov }
3702208a378SIlya Dryomov 
371a425f9d4SDavid Sterba void btrfs_free_device(struct btrfs_device *device)
37248dae9cfSDavid Sterba {
373bbbf7243SNikolay Borisov 	WARN_ON(!list_empty(&device->post_commit_list));
37448dae9cfSDavid Sterba 	rcu_string_free(device->name);
3751c11b63eSJeff Mahoney 	extent_io_tree_release(&device->alloc_state);
37648dae9cfSDavid Sterba 	bio_put(device->flush_bio);
37748dae9cfSDavid Sterba 	kfree(device);
37848dae9cfSDavid Sterba }
37948dae9cfSDavid Sterba 
380e4404d6eSYan Zheng static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
381e4404d6eSYan Zheng {
382e4404d6eSYan Zheng 	struct btrfs_device *device;
383e4404d6eSYan Zheng 	WARN_ON(fs_devices->opened);
384e4404d6eSYan Zheng 	while (!list_empty(&fs_devices->devices)) {
385e4404d6eSYan Zheng 		device = list_entry(fs_devices->devices.next,
386e4404d6eSYan Zheng 				    struct btrfs_device, dev_list);
387e4404d6eSYan Zheng 		list_del(&device->dev_list);
388a425f9d4SDavid Sterba 		btrfs_free_device(device);
389e4404d6eSYan Zheng 	}
390e4404d6eSYan Zheng 	kfree(fs_devices);
391e4404d6eSYan Zheng }
392e4404d6eSYan Zheng 
393ffc5a379SDavid Sterba void __exit btrfs_cleanup_fs_uuids(void)
3948a4b83ccSChris Mason {
3958a4b83ccSChris Mason 	struct btrfs_fs_devices *fs_devices;
3968a4b83ccSChris Mason 
3972b82032cSYan Zheng 	while (!list_empty(&fs_uuids)) {
3982b82032cSYan Zheng 		fs_devices = list_entry(fs_uuids.next,
399c4babc5eSAnand Jain 					struct btrfs_fs_devices, fs_list);
400c4babc5eSAnand Jain 		list_del(&fs_devices->fs_list);
401e4404d6eSYan Zheng 		free_fs_devices(fs_devices);
4028a4b83ccSChris Mason 	}
4038a4b83ccSChris Mason }
4048a4b83ccSChris Mason 
40548dae9cfSDavid Sterba /*
40648dae9cfSDavid Sterba  * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
40748dae9cfSDavid Sterba  * Returned struct is not linked onto any lists and must be destroyed using
408a425f9d4SDavid Sterba  * btrfs_free_device.
40948dae9cfSDavid Sterba  */
410154f7cb8SQu Wenruo static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
41112bd2fc0SIlya Dryomov {
41212bd2fc0SIlya Dryomov 	struct btrfs_device *dev;
41312bd2fc0SIlya Dryomov 
41478f2c9e6SDavid Sterba 	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
41512bd2fc0SIlya Dryomov 	if (!dev)
41612bd2fc0SIlya Dryomov 		return ERR_PTR(-ENOMEM);
41712bd2fc0SIlya Dryomov 
418e0ae9994SDavid Sterba 	/*
419e0ae9994SDavid Sterba 	 * Preallocate a bio that's always going to be used for flushing device
420e0ae9994SDavid Sterba 	 * barriers and matches the device lifespan
421e0ae9994SDavid Sterba 	 */
422e0ae9994SDavid Sterba 	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
423e0ae9994SDavid Sterba 	if (!dev->flush_bio) {
424e0ae9994SDavid Sterba 		kfree(dev);
425e0ae9994SDavid Sterba 		return ERR_PTR(-ENOMEM);
426e0ae9994SDavid Sterba 	}
427e0ae9994SDavid Sterba 
42812bd2fc0SIlya Dryomov 	INIT_LIST_HEAD(&dev->dev_list);
42912bd2fc0SIlya Dryomov 	INIT_LIST_HEAD(&dev->dev_alloc_list);
430bbbf7243SNikolay Borisov 	INIT_LIST_HEAD(&dev->post_commit_list);
43112bd2fc0SIlya Dryomov 
43212bd2fc0SIlya Dryomov 	atomic_set(&dev->reada_in_flight, 0);
433addc3fa7SMiao Xie 	atomic_set(&dev->dev_stats_ccnt, 0);
434d5c82388SDavidlohr Bueso 	btrfs_device_data_ordered_init(dev, fs_info);
4359bcaaea7SChris Mason 	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
436d0164adcSMel Gorman 	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
437154f7cb8SQu Wenruo 	extent_io_tree_init(fs_info, &dev->alloc_state,
438154f7cb8SQu Wenruo 			    IO_TREE_DEVICE_ALLOC_STATE, NULL);
43912bd2fc0SIlya Dryomov 
44012bd2fc0SIlya Dryomov 	return dev;
44112bd2fc0SIlya Dryomov }
44212bd2fc0SIlya Dryomov 
4437239ff4bSNikolay Borisov static noinline struct btrfs_fs_devices *find_fsid(
4447239ff4bSNikolay Borisov 		const u8 *fsid, const u8 *metadata_fsid)
4458a4b83ccSChris Mason {
4468a4b83ccSChris Mason 	struct btrfs_fs_devices *fs_devices;
4478a4b83ccSChris Mason 
4487239ff4bSNikolay Borisov 	ASSERT(fsid);
4497239ff4bSNikolay Borisov 
450c6730a0eSSu Yue 	/* Handle non-split brain cases */
451c6730a0eSSu Yue 	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
4527a62d0f0SNikolay Borisov 		if (metadata_fsid) {
453c6730a0eSSu Yue 			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
454c6730a0eSSu Yue 			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
455c6730a0eSSu Yue 				      BTRFS_FSID_SIZE) == 0)
456c6730a0eSSu Yue 				return fs_devices;
457c6730a0eSSu Yue 		} else {
458c6730a0eSSu Yue 			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
459c6730a0eSSu Yue 				return fs_devices;
460c6730a0eSSu Yue 		}
461c6730a0eSSu Yue 	}
462c6730a0eSSu Yue 	return NULL;
463c6730a0eSSu Yue }
464c6730a0eSSu Yue 
465c6730a0eSSu Yue static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
466c6730a0eSSu Yue 				struct btrfs_super_block *disk_super)
467c6730a0eSSu Yue {
468c6730a0eSSu Yue 
469c6730a0eSSu Yue 	struct btrfs_fs_devices *fs_devices;
470c6730a0eSSu Yue 
4717a62d0f0SNikolay Borisov 	/*
4727a62d0f0SNikolay Borisov 	 * Handle scanned device having completed its fsid change but
4737a62d0f0SNikolay Borisov 	 * belonging to a fs_devices that was created by first scanning
4747a62d0f0SNikolay Borisov 	 * a device which didn't have its fsid/metadata_uuid changed
4757a62d0f0SNikolay Borisov 	 * at all and the CHANGING_FSID_V2 flag set.
4767a62d0f0SNikolay Borisov 	 */
4777a62d0f0SNikolay Borisov 	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
4787a62d0f0SNikolay Borisov 		if (fs_devices->fsid_change &&
479c6730a0eSSu Yue 		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
4807a62d0f0SNikolay Borisov 			   BTRFS_FSID_SIZE) == 0 &&
4817a62d0f0SNikolay Borisov 		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
4827a62d0f0SNikolay Borisov 			   BTRFS_FSID_SIZE) == 0) {
4837a62d0f0SNikolay Borisov 			return fs_devices;
4847a62d0f0SNikolay Borisov 		}
4857a62d0f0SNikolay Borisov 	}
486cc5de4e7SNikolay Borisov 	/*
487cc5de4e7SNikolay Borisov 	 * Handle scanned device having completed its fsid change but
488cc5de4e7SNikolay Borisov 	 * belonging to a fs_devices that was created by a device that
489cc5de4e7SNikolay Borisov 	 * has an outdated pair of fsid/metadata_uuid and
490cc5de4e7SNikolay Borisov 	 * CHANGING_FSID_V2 flag set.
491cc5de4e7SNikolay Borisov 	 */
492cc5de4e7SNikolay Borisov 	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
493cc5de4e7SNikolay Borisov 		if (fs_devices->fsid_change &&
494cc5de4e7SNikolay Borisov 		    memcmp(fs_devices->metadata_uuid,
495cc5de4e7SNikolay Borisov 			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
496c6730a0eSSu Yue 		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
497cc5de4e7SNikolay Borisov 			   BTRFS_FSID_SIZE) == 0) {
498cc5de4e7SNikolay Borisov 			return fs_devices;
499cc5de4e7SNikolay Borisov 		}
500cc5de4e7SNikolay Borisov 	}
501c6730a0eSSu Yue 
502c6730a0eSSu Yue 	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
5037a62d0f0SNikolay Borisov }
5047a62d0f0SNikolay Borisov 
5058a4b83ccSChris Mason 
506beaf8ab3SStefan Behrens static int
507beaf8ab3SStefan Behrens btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
508beaf8ab3SStefan Behrens 		      int flush, struct block_device **bdev,
5098f32380dSJohannes Thumshirn 		      struct btrfs_super_block **disk_super)
510beaf8ab3SStefan Behrens {
511beaf8ab3SStefan Behrens 	int ret;
512beaf8ab3SStefan Behrens 
513beaf8ab3SStefan Behrens 	*bdev = blkdev_get_by_path(device_path, flags, holder);
514beaf8ab3SStefan Behrens 
515beaf8ab3SStefan Behrens 	if (IS_ERR(*bdev)) {
516beaf8ab3SStefan Behrens 		ret = PTR_ERR(*bdev);
517beaf8ab3SStefan Behrens 		goto error;
518beaf8ab3SStefan Behrens 	}
519beaf8ab3SStefan Behrens 
520beaf8ab3SStefan Behrens 	if (flush)
521beaf8ab3SStefan Behrens 		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
5229f6d2510SDavid Sterba 	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
523beaf8ab3SStefan Behrens 	if (ret) {
524beaf8ab3SStefan Behrens 		blkdev_put(*bdev, flags);
525beaf8ab3SStefan Behrens 		goto error;
526beaf8ab3SStefan Behrens 	}
527beaf8ab3SStefan Behrens 	invalidate_bdev(*bdev);
5288f32380dSJohannes Thumshirn 	*disk_super = btrfs_read_dev_super(*bdev);
5298f32380dSJohannes Thumshirn 	if (IS_ERR(*disk_super)) {
5308f32380dSJohannes Thumshirn 		ret = PTR_ERR(*disk_super);
531beaf8ab3SStefan Behrens 		blkdev_put(*bdev, flags);
532beaf8ab3SStefan Behrens 		goto error;
533beaf8ab3SStefan Behrens 	}
534beaf8ab3SStefan Behrens 
535beaf8ab3SStefan Behrens 	return 0;
536beaf8ab3SStefan Behrens 
537beaf8ab3SStefan Behrens error:
538beaf8ab3SStefan Behrens 	*bdev = NULL;
539beaf8ab3SStefan Behrens 	return ret;
540beaf8ab3SStefan Behrens }
541beaf8ab3SStefan Behrens 
54270bc7088SAnand Jain static bool device_path_matched(const char *path, struct btrfs_device *device)
54370bc7088SAnand Jain {
54470bc7088SAnand Jain 	int found;
54570bc7088SAnand Jain 
54670bc7088SAnand Jain 	rcu_read_lock();
54770bc7088SAnand Jain 	found = strcmp(rcu_str_deref(device->name), path);
54870bc7088SAnand Jain 	rcu_read_unlock();
54970bc7088SAnand Jain 
55070bc7088SAnand Jain 	return found == 0;
55170bc7088SAnand Jain }
55270bc7088SAnand Jain 
553d8367db3SAnand Jain /*
554d8367db3SAnand Jain  *  Search and remove all stale (devices which are not mounted) devices.
555d8367db3SAnand Jain  *  When both inputs are NULL, it will search and release all stale devices.
556d8367db3SAnand Jain  *  path:	Optional. When provided will it release all unmounted devices
557d8367db3SAnand Jain  *		matching this path only.
558d8367db3SAnand Jain  *  skip_dev:	Optional. Will skip this device when searching for the stale
559d8367db3SAnand Jain  *		devices.
56070bc7088SAnand Jain  *  Return:	0 for success or if @path is NULL.
56170bc7088SAnand Jain  * 		-EBUSY if @path is a mounted device.
56270bc7088SAnand Jain  * 		-ENOENT if @path does not match any device in the list.
563d8367db3SAnand Jain  */
56470bc7088SAnand Jain static int btrfs_free_stale_devices(const char *path,
565fa6d2ae5SAnand Jain 				     struct btrfs_device *skip_device)
5664fde46f0SAnand Jain {
567fa6d2ae5SAnand Jain 	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
568fa6d2ae5SAnand Jain 	struct btrfs_device *device, *tmp_device;
56970bc7088SAnand Jain 	int ret = 0;
57070bc7088SAnand Jain 
57170bc7088SAnand Jain 	if (path)
57270bc7088SAnand Jain 		ret = -ENOENT;
5734fde46f0SAnand Jain 
574fa6d2ae5SAnand Jain 	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
5754fde46f0SAnand Jain 
57670bc7088SAnand Jain 		mutex_lock(&fs_devices->device_list_mutex);
577fa6d2ae5SAnand Jain 		list_for_each_entry_safe(device, tmp_device,
578fa6d2ae5SAnand Jain 					 &fs_devices->devices, dev_list) {
579fa6d2ae5SAnand Jain 			if (skip_device && skip_device == device)
580d8367db3SAnand Jain 				continue;
581fa6d2ae5SAnand Jain 			if (path && !device->name)
5824fde46f0SAnand Jain 				continue;
58370bc7088SAnand Jain 			if (path && !device_path_matched(path, device))
58438cf665dSAnand Jain 				continue;
58570bc7088SAnand Jain 			if (fs_devices->opened) {
58670bc7088SAnand Jain 				/* for an already deleted device return 0 */
58770bc7088SAnand Jain 				if (path && ret != 0)
58870bc7088SAnand Jain 					ret = -EBUSY;
58970bc7088SAnand Jain 				break;
59070bc7088SAnand Jain 			}
5914fde46f0SAnand Jain 
5924fde46f0SAnand Jain 			/* delete the stale device */
593fa6d2ae5SAnand Jain 			fs_devices->num_devices--;
594fa6d2ae5SAnand Jain 			list_del(&device->dev_list);
595fa6d2ae5SAnand Jain 			btrfs_free_device(device);
5967bcb8164SAnand Jain 
59770bc7088SAnand Jain 			ret = 0;
5984fde46f0SAnand Jain 		}
5997bcb8164SAnand Jain 		mutex_unlock(&fs_devices->device_list_mutex);
60070bc7088SAnand Jain 
6017bcb8164SAnand Jain 		if (fs_devices->num_devices == 0) {
6027bcb8164SAnand Jain 			btrfs_sysfs_remove_fsid(fs_devices);
6037bcb8164SAnand Jain 			list_del(&fs_devices->fs_list);
6047bcb8164SAnand Jain 			free_fs_devices(fs_devices);
6054fde46f0SAnand Jain 		}
6064fde46f0SAnand Jain 	}
60770bc7088SAnand Jain 
60870bc7088SAnand Jain 	return ret;
6094fde46f0SAnand Jain }
6104fde46f0SAnand Jain 
61118c850fdSJosef Bacik /*
61218c850fdSJosef Bacik  * This is only used on mount, and we are protected from competing things
61318c850fdSJosef Bacik  * messing with our fs_devices by the uuid_mutex, thus we do not need the
61418c850fdSJosef Bacik  * fs_devices->device_list_mutex here.
61518c850fdSJosef Bacik  */
6160fb08bccSAnand Jain static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
6170fb08bccSAnand Jain 			struct btrfs_device *device, fmode_t flags,
6180fb08bccSAnand Jain 			void *holder)
6190fb08bccSAnand Jain {
6200fb08bccSAnand Jain 	struct request_queue *q;
6210fb08bccSAnand Jain 	struct block_device *bdev;
6220fb08bccSAnand Jain 	struct btrfs_super_block *disk_super;
6230fb08bccSAnand Jain 	u64 devid;
6240fb08bccSAnand Jain 	int ret;
6250fb08bccSAnand Jain 
6260fb08bccSAnand Jain 	if (device->bdev)
6270fb08bccSAnand Jain 		return -EINVAL;
6280fb08bccSAnand Jain 	if (!device->name)
6290fb08bccSAnand Jain 		return -EINVAL;
6300fb08bccSAnand Jain 
6310fb08bccSAnand Jain 	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
6328f32380dSJohannes Thumshirn 				    &bdev, &disk_super);
6330fb08bccSAnand Jain 	if (ret)
6340fb08bccSAnand Jain 		return ret;
6350fb08bccSAnand Jain 
6360fb08bccSAnand Jain 	devid = btrfs_stack_device_id(&disk_super->dev_item);
6370fb08bccSAnand Jain 	if (devid != device->devid)
6388f32380dSJohannes Thumshirn 		goto error_free_page;
6390fb08bccSAnand Jain 
6400fb08bccSAnand Jain 	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
6418f32380dSJohannes Thumshirn 		goto error_free_page;
6420fb08bccSAnand Jain 
6430fb08bccSAnand Jain 	device->generation = btrfs_super_generation(disk_super);
6440fb08bccSAnand Jain 
6450fb08bccSAnand Jain 	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
6467239ff4bSNikolay Borisov 		if (btrfs_super_incompat_flags(disk_super) &
6477239ff4bSNikolay Borisov 		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
6487239ff4bSNikolay Borisov 			pr_err(
6497239ff4bSNikolay Borisov 		"BTRFS: Invalid seeding and uuid-changed device detected\n");
6508f32380dSJohannes Thumshirn 			goto error_free_page;
6517239ff4bSNikolay Borisov 		}
6527239ff4bSNikolay Borisov 
653ebbede42SAnand Jain 		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
6540395d84fSJohannes Thumshirn 		fs_devices->seeding = true;
6550fb08bccSAnand Jain 	} else {
656ebbede42SAnand Jain 		if (bdev_read_only(bdev))
657ebbede42SAnand Jain 			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
658ebbede42SAnand Jain 		else
659ebbede42SAnand Jain 			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
6600fb08bccSAnand Jain 	}
6610fb08bccSAnand Jain 
6620fb08bccSAnand Jain 	q = bdev_get_queue(bdev);
6630fb08bccSAnand Jain 	if (!blk_queue_nonrot(q))
6647f0432d0SJohannes Thumshirn 		fs_devices->rotating = true;
6650fb08bccSAnand Jain 
6660fb08bccSAnand Jain 	device->bdev = bdev;
667e12c9621SAnand Jain 	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
6680fb08bccSAnand Jain 	device->mode = flags;
6690fb08bccSAnand Jain 
6700fb08bccSAnand Jain 	fs_devices->open_devices++;
671ebbede42SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
672ebbede42SAnand Jain 	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
6730fb08bccSAnand Jain 		fs_devices->rw_devices++;
674b1b8e386SAnand Jain 		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
6750fb08bccSAnand Jain 	}
6768f32380dSJohannes Thumshirn 	btrfs_release_disk_super(disk_super);
6770fb08bccSAnand Jain 
6780fb08bccSAnand Jain 	return 0;
6790fb08bccSAnand Jain 
6808f32380dSJohannes Thumshirn error_free_page:
6818f32380dSJohannes Thumshirn 	btrfs_release_disk_super(disk_super);
6820fb08bccSAnand Jain 	blkdev_put(bdev, flags);
6830fb08bccSAnand Jain 
6840fb08bccSAnand Jain 	return -EINVAL;
6850fb08bccSAnand Jain }
6860fb08bccSAnand Jain 
68760999ca4SDavid Sterba /*
6887a62d0f0SNikolay Borisov  * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
689c0d81c7cSSu Yue  * being created with a disk that has already completed its fsid change. Such
690c0d81c7cSSu Yue  * disk can belong to an fs which has its FSID changed or to one which doesn't.
691c0d81c7cSSu Yue  * Handle both cases here.
6927a62d0f0SNikolay Borisov  */
6937a62d0f0SNikolay Borisov static struct btrfs_fs_devices *find_fsid_inprogress(
6947a62d0f0SNikolay Borisov 					struct btrfs_super_block *disk_super)
6957a62d0f0SNikolay Borisov {
6967a62d0f0SNikolay Borisov 	struct btrfs_fs_devices *fs_devices;
6977a62d0f0SNikolay Borisov 
6987a62d0f0SNikolay Borisov 	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
6997a62d0f0SNikolay Borisov 		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
7007a62d0f0SNikolay Borisov 			   BTRFS_FSID_SIZE) != 0 &&
7017a62d0f0SNikolay Borisov 		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
7027a62d0f0SNikolay Borisov 			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
7037a62d0f0SNikolay Borisov 			return fs_devices;
7047a62d0f0SNikolay Borisov 		}
7057a62d0f0SNikolay Borisov 	}
7067a62d0f0SNikolay Borisov 
707c0d81c7cSSu Yue 	return find_fsid(disk_super->fsid, NULL);
7087a62d0f0SNikolay Borisov }
7097a62d0f0SNikolay Borisov 
710cc5de4e7SNikolay Borisov 
711cc5de4e7SNikolay Borisov static struct btrfs_fs_devices *find_fsid_changed(
712cc5de4e7SNikolay Borisov 					struct btrfs_super_block *disk_super)
713cc5de4e7SNikolay Borisov {
714cc5de4e7SNikolay Borisov 	struct btrfs_fs_devices *fs_devices;
715cc5de4e7SNikolay Borisov 
716cc5de4e7SNikolay Borisov 	/*
717cc5de4e7SNikolay Borisov 	 * Handles the case where scanned device is part of an fs that had
718cc5de4e7SNikolay Borisov 	 * multiple successful changes of FSID but curently device didn't
71905840710SNikolay Borisov 	 * observe it. Meaning our fsid will be different than theirs. We need
72005840710SNikolay Borisov 	 * to handle two subcases :
72105840710SNikolay Borisov 	 *  1 - The fs still continues to have different METADATA/FSID uuids.
72205840710SNikolay Borisov 	 *  2 - The fs is switched back to its original FSID (METADATA/FSID
72305840710SNikolay Borisov 	 *  are equal).
724cc5de4e7SNikolay Borisov 	 */
725cc5de4e7SNikolay Borisov 	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
72605840710SNikolay Borisov 		/* Changed UUIDs */
727cc5de4e7SNikolay Borisov 		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
728cc5de4e7SNikolay Borisov 			   BTRFS_FSID_SIZE) != 0 &&
729cc5de4e7SNikolay Borisov 		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
730cc5de4e7SNikolay Borisov 			   BTRFS_FSID_SIZE) == 0 &&
731cc5de4e7SNikolay Borisov 		    memcmp(fs_devices->fsid, disk_super->fsid,
73205840710SNikolay Borisov 			   BTRFS_FSID_SIZE) != 0)
733cc5de4e7SNikolay Borisov 			return fs_devices;
73405840710SNikolay Borisov 
73505840710SNikolay Borisov 		/* Unchanged UUIDs */
73605840710SNikolay Borisov 		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
73705840710SNikolay Borisov 			   BTRFS_FSID_SIZE) == 0 &&
73805840710SNikolay Borisov 		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
73905840710SNikolay Borisov 			   BTRFS_FSID_SIZE) == 0)
74005840710SNikolay Borisov 			return fs_devices;
741cc5de4e7SNikolay Borisov 	}
742cc5de4e7SNikolay Borisov 
743cc5de4e7SNikolay Borisov 	return NULL;
744cc5de4e7SNikolay Borisov }
7451362089dSNikolay Borisov 
7461362089dSNikolay Borisov static struct btrfs_fs_devices *find_fsid_reverted_metadata(
7471362089dSNikolay Borisov 				struct btrfs_super_block *disk_super)
7481362089dSNikolay Borisov {
7491362089dSNikolay Borisov 	struct btrfs_fs_devices *fs_devices;
7501362089dSNikolay Borisov 
7511362089dSNikolay Borisov 	/*
7521362089dSNikolay Borisov 	 * Handle the case where the scanned device is part of an fs whose last
7531362089dSNikolay Borisov 	 * metadata UUID change reverted it to the original FSID. At the same
7541362089dSNikolay Borisov 	 * time * fs_devices was first created by another constitutent device
7551362089dSNikolay Borisov 	 * which didn't fully observe the operation. This results in an
7561362089dSNikolay Borisov 	 * btrfs_fs_devices created with metadata/fsid different AND
7571362089dSNikolay Borisov 	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
7581362089dSNikolay Borisov 	 * fs_devices equal to the FSID of the disk.
7591362089dSNikolay Borisov 	 */
7601362089dSNikolay Borisov 	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
7611362089dSNikolay Borisov 		if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
7621362089dSNikolay Borisov 			   BTRFS_FSID_SIZE) != 0 &&
7631362089dSNikolay Borisov 		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
7641362089dSNikolay Borisov 			   BTRFS_FSID_SIZE) == 0 &&
7651362089dSNikolay Borisov 		    fs_devices->fsid_change)
7661362089dSNikolay Borisov 			return fs_devices;
7671362089dSNikolay Borisov 	}
7681362089dSNikolay Borisov 
7691362089dSNikolay Borisov 	return NULL;
7701362089dSNikolay Borisov }
7717a62d0f0SNikolay Borisov /*
77260999ca4SDavid Sterba  * Add new device to list of registered devices
77360999ca4SDavid Sterba  *
 * @path:             path of the scanned device; a copy is stored as
 *                    device->name
 * @disk_super:       super block read from the scanned device
 * @new_device_added: set to true only when a new btrfs_device was allocated
 *                    and linked into the fs_devices; it is never written
 *                    otherwise, so the caller has to initialize it
 *
 * The matching btrfs_fs_devices is looked up (or allocated and put on
 * fs_uuids), then the device is either created, or its path and generation
 * are refreshed.  fs_devices->device_list_mutex is taken once the fs_devices
 * is known and is released on every return path.
 *
77460999ca4SDavid Sterba  * Returns:
775e124ece5SAnand Jain  * device pointer which was just added or updated when successful
776e124ece5SAnand Jain  * error pointer when failed
 *   -EBUSY  a not yet registered device was scanned for an opened fs_devices
 *   -EEXIST the scanned device has an older generation than the registered
 *           one (fs_devices not opened), or the path resolves to a block
 *           device other than the one already open for this devid
 *   -ENOMEM the path could not be duplicated
 *   other   errors from alloc_fs_devices(), btrfs_alloc_device() or
 *           lookup_bdev()
77760999ca4SDavid Sterba  */
778e124ece5SAnand Jain static noinline struct btrfs_device *device_list_add(const char *path,
7794306a974SAnand Jain 			   struct btrfs_super_block *disk_super,
7804306a974SAnand Jain 			   bool *new_device_added)
7818a4b83ccSChris Mason {
7828a4b83ccSChris Mason 	struct btrfs_device *device;
7837a62d0f0SNikolay Borisov 	struct btrfs_fs_devices *fs_devices = NULL;
784606686eeSJosef Bacik 	struct rcu_string *name;
7858a4b83ccSChris Mason 	u64 found_transid = btrfs_super_generation(disk_super);
7863acbcbfcSAnand Jain 	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
7877239ff4bSNikolay Borisov 	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
7887239ff4bSNikolay Borisov 		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
789d1a63002SNikolay Borisov 	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
790d1a63002SNikolay Borisov 					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
7918a4b83ccSChris Mason 
	/*
	 * Pick the lookup helper from the two super block properties: whether
	 * an fsid change is flagged as in progress and whether the
	 * METADATA_UUID incompat feature is set.
	 */
792cc5de4e7SNikolay Borisov 	if (fsid_change_in_progress) {
793c0d81c7cSSu Yue 		if (!has_metadata_uuid)
7947a62d0f0SNikolay Borisov 			fs_devices = find_fsid_inprogress(disk_super);
795c0d81c7cSSu Yue 		else
796cc5de4e7SNikolay Borisov 			fs_devices = find_fsid_changed(disk_super);
7977a62d0f0SNikolay Borisov 	} else if (has_metadata_uuid) {
798c6730a0eSSu Yue 		fs_devices = find_fsid_with_metadata_uuid(disk_super);
7997a62d0f0SNikolay Borisov 	} else {
8001362089dSNikolay Borisov 		fs_devices = find_fsid_reverted_metadata(disk_super);
8011362089dSNikolay Borisov 		if (!fs_devices)
8027a62d0f0SNikolay Borisov 			fs_devices = find_fsid(disk_super->fsid, NULL);
8037a62d0f0SNikolay Borisov 	}
8047a62d0f0SNikolay Borisov 
8057a62d0f0SNikolay Borisov 
8068a4b83ccSChris Mason 	if (!fs_devices) {
		/* First device seen for this filesystem: register a new set. */
8077239ff4bSNikolay Borisov 		if (has_metadata_uuid)
8087239ff4bSNikolay Borisov 			fs_devices = alloc_fs_devices(disk_super->fsid,
8097239ff4bSNikolay Borisov 						      disk_super->metadata_uuid);
8107239ff4bSNikolay Borisov 		else
8117239ff4bSNikolay Borisov 			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
8127239ff4bSNikolay Borisov 
8132208a378SIlya Dryomov 		if (IS_ERR(fs_devices))
814e124ece5SAnand Jain 			return ERR_CAST(fs_devices);
8152208a378SIlya Dryomov 
		/*
		 * Remember that the creating device was mid fsid change; a
		 * later scan with a newer generation may overwrite the ids
		 * (see the fsid_change handling in the else branch below).
		 */
81692900e51SAl Viro 		fs_devices->fsid_change = fsid_change_in_progress;
81792900e51SAl Viro 
		/* Held from here on; every exit path below unlocks it. */
8189c6d173eSAnand Jain 		mutex_lock(&fs_devices->device_list_mutex);
819c4babc5eSAnand Jain 		list_add(&fs_devices->fs_list, &fs_uuids);
8202208a378SIlya Dryomov 
8218a4b83ccSChris Mason 		device = NULL;
8228a4b83ccSChris Mason 	} else {
8239c6d173eSAnand Jain 		mutex_lock(&fs_devices->device_list_mutex);
82409ba3bc9SAnand Jain 		device = btrfs_find_device(fs_devices, devid,
82509ba3bc9SAnand Jain 				disk_super->dev_item.uuid, NULL, false);
8267a62d0f0SNikolay Borisov 
8277a62d0f0SNikolay Borisov 		/*
8287a62d0f0SNikolay Borisov 		 * If this disk has been pulled into an fs devices created by
8297a62d0f0SNikolay Borisov 		 * a device which had the CHANGING_FSID_V2 flag then replace the
8307a62d0f0SNikolay Borisov 		 * metadata_uuid/fsid values of the fs_devices.
8317a62d0f0SNikolay Borisov 		 */
8321362089dSNikolay Borisov 		if (fs_devices->fsid_change &&
8337a62d0f0SNikolay Borisov 		    found_transid > fs_devices->latest_generation) {
8347a62d0f0SNikolay Borisov 			memcpy(fs_devices->fsid, disk_super->fsid,
8357a62d0f0SNikolay Borisov 					BTRFS_FSID_SIZE);
8361362089dSNikolay Borisov 
8371362089dSNikolay Borisov 			if (has_metadata_uuid)
8387a62d0f0SNikolay Borisov 				memcpy(fs_devices->metadata_uuid,
8391362089dSNikolay Borisov 				       disk_super->metadata_uuid,
8401362089dSNikolay Borisov 				       BTRFS_FSID_SIZE);
8411362089dSNikolay Borisov 			else
8421362089dSNikolay Borisov 				memcpy(fs_devices->metadata_uuid,
8431362089dSNikolay Borisov 				       disk_super->fsid, BTRFS_FSID_SIZE);
8447a62d0f0SNikolay Borisov 
8457a62d0f0SNikolay Borisov 			fs_devices->fsid_change = false;
8467a62d0f0SNikolay Borisov 		}
8478a4b83ccSChris Mason 	}
848443f24feSMiao Xie 
8498a4b83ccSChris Mason 	if (!device) {
		/*
		 * The device is not registered yet.  New devices are refused
		 * while the fs_devices is opened.
		 */
8509c6d173eSAnand Jain 		if (fs_devices->opened) {
8519c6d173eSAnand Jain 			mutex_unlock(&fs_devices->device_list_mutex);
852e124ece5SAnand Jain 			return ERR_PTR(-EBUSY);
8539c6d173eSAnand Jain 		}
8542b82032cSYan Zheng 
85512bd2fc0SIlya Dryomov 		device = btrfs_alloc_device(NULL, &devid,
85612bd2fc0SIlya Dryomov 					    disk_super->dev_item.uuid);
85712bd2fc0SIlya Dryomov 		if (IS_ERR(device)) {
8589c6d173eSAnand Jain 			mutex_unlock(&fs_devices->device_list_mutex);
8598a4b83ccSChris Mason 			/* we can safely leave the fs_devices entry around */
860e124ece5SAnand Jain 			return device;
8618a4b83ccSChris Mason 		}
862606686eeSJosef Bacik 
863606686eeSJosef Bacik 		name = rcu_string_strdup(path, GFP_NOFS);
864606686eeSJosef Bacik 		if (!name) {
865a425f9d4SDavid Sterba 			btrfs_free_device(device);
8669c6d173eSAnand Jain 			mutex_unlock(&fs_devices->device_list_mutex);
867e124ece5SAnand Jain 			return ERR_PTR(-ENOMEM);
8688a4b83ccSChris Mason 		}
869606686eeSJosef Bacik 		rcu_assign_pointer(device->name, name);
87090519d66SArne Jansen 
8711f78160cSXiao Guangrong 		list_add_rcu(&device->dev_list, &fs_devices->devices);
872f7171750SFilipe David Borba Manana 		fs_devices->num_devices++;
873e5e9a520SChris Mason 
8742b82032cSYan Zheng 		device->fs_devices = fs_devices;
8754306a974SAnand Jain 		*new_device_added = true;
876327f18ccSAnand Jain 
		/* Log by label when the super block has one, by fsid otherwise. */
877327f18ccSAnand Jain 		if (disk_super->label[0])
878aa6c0df7SAnand Jain 			pr_info(
879aa6c0df7SAnand Jain 	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
880aa6c0df7SAnand Jain 				disk_super->label, devid, found_transid, path,
881aa6c0df7SAnand Jain 				current->comm, task_pid_nr(current));
882327f18ccSAnand Jain 		else
883aa6c0df7SAnand Jain 			pr_info(
884aa6c0df7SAnand Jain 	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
885aa6c0df7SAnand Jain 				disk_super->fsid, devid, found_transid, path,
886aa6c0df7SAnand Jain 				current->comm, task_pid_nr(current));
887327f18ccSAnand Jain 
888606686eeSJosef Bacik 	} else if (!device->name || strcmp(device->name->str, path)) {
889b96de000SAnand Jain 		/*
890b96de000SAnand Jain 		 * When FS is already mounted.
891b96de000SAnand Jain 		 * 1. If you are here and if the device->name is NULL that
892b96de000SAnand Jain 		 *    means this device was missing at time of FS mount.
893b96de000SAnand Jain 		 * 2. If you are here and if the device->name is different
894b96de000SAnand Jain 		 *    from 'path' that means either
895b96de000SAnand Jain 		 *      a. The same device disappeared and reappeared with
896b96de000SAnand Jain 		 *         different name. or
897b96de000SAnand Jain 		 *      b. The missing-disk-which-was-replaced, has
898b96de000SAnand Jain 		 *         reappeared now.
899b96de000SAnand Jain 		 *
900b96de000SAnand Jain 		 * We must allow 1 and 2a above. But 2b would be a spurious
901b96de000SAnand Jain 		 * and unintentional.
902b96de000SAnand Jain 		 *
903b96de000SAnand Jain 		 * Further in case of 1 and 2a above, the disk at 'path'
904b96de000SAnand Jain 		 * would have missed some transaction when it was away and
905b96de000SAnand Jain 		 * in case of 2a the stale bdev has to be updated as well.
906b96de000SAnand Jain 		 * 2b must not be allowed at all time.
907b96de000SAnand Jain 		 */
908b96de000SAnand Jain 
909b96de000SAnand Jain 		/*
9100f23ae74SChris Mason 		 * For now, we do allow update to btrfs_fs_device through the
9110f23ae74SChris Mason 		 * btrfs dev scan cli after FS has been mounted.  We're still
9120f23ae74SChris Mason 		 * tracking a problem where systems fail mount by subvolume id
9130f23ae74SChris Mason 		 * when we reject replacement on a mounted FS.
914b96de000SAnand Jain 		 */
9150f23ae74SChris Mason 		if (!fs_devices->opened && found_transid < device->generation) {
91677bdae4dSAnand Jain 			/*
91777bdae4dSAnand Jain 			 * That is if the FS is _not_ mounted and if you
91877bdae4dSAnand Jain 			 * are here, that means there is more than one
91977bdae4dSAnand Jain 			 * disk with same uuid and devid. We keep the one
92077bdae4dSAnand Jain 			 * with larger generation number or the last-in if
92177bdae4dSAnand Jain 			 * generation are equal.
92277bdae4dSAnand Jain 			 */
9239c6d173eSAnand Jain 			mutex_unlock(&fs_devices->device_list_mutex);
924e124ece5SAnand Jain 			return ERR_PTR(-EEXIST);
92577bdae4dSAnand Jain 		}
926b96de000SAnand Jain 
927a9261d41SAnand Jain 		/*
928a9261d41SAnand Jain 		 * We are going to replace the device path for a given devid,
929a9261d41SAnand Jain 		 * make sure it's the same device if the device is mounted
930a9261d41SAnand Jain 		 */
931a9261d41SAnand Jain 		if (device->bdev) {
932a9261d41SAnand Jain 			struct block_device *path_bdev;
933a9261d41SAnand Jain 
934a9261d41SAnand Jain 			path_bdev = lookup_bdev(path);
935a9261d41SAnand Jain 			if (IS_ERR(path_bdev)) {
936a9261d41SAnand Jain 				mutex_unlock(&fs_devices->device_list_mutex);
937a9261d41SAnand Jain 				return ERR_CAST(path_bdev);
938a9261d41SAnand Jain 			}
939a9261d41SAnand Jain 
940a9261d41SAnand Jain 			if (device->bdev != path_bdev) {
941a9261d41SAnand Jain 				bdput(path_bdev);
942a9261d41SAnand Jain 				mutex_unlock(&fs_devices->device_list_mutex);
9430697d9a6SJohannes Thumshirn 				/*
9440697d9a6SJohannes Thumshirn 				 * device->fs_info may not be reliable here, so
9450697d9a6SJohannes Thumshirn 				 * pass in a NULL instead. This avoids a
9460697d9a6SJohannes Thumshirn 				 * possible use-after-free when the fs_info and
9470697d9a6SJohannes Thumshirn 				 * fs_info->sb are already torn down.
9480697d9a6SJohannes Thumshirn 				 */
9490697d9a6SJohannes Thumshirn 				btrfs_warn_in_rcu(NULL,
95079dae17dSAnand Jain 	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
95179dae17dSAnand Jain 						  path, devid, found_transid,
95279dae17dSAnand Jain 						  current->comm,
95379dae17dSAnand Jain 						  task_pid_nr(current));
954a9261d41SAnand Jain 				return ERR_PTR(-EEXIST);
955a9261d41SAnand Jain 			}
			/* Same block device under a new path: drop the lookup reference. */
956a9261d41SAnand Jain 			bdput(path_bdev);
957a9261d41SAnand Jain 			btrfs_info_in_rcu(device->fs_info,
95879dae17dSAnand Jain 	"devid %llu device path %s changed to %s scanned by %s (%d)",
95979dae17dSAnand Jain 					  devid, rcu_str_deref(device->name),
96079dae17dSAnand Jain 					  path, current->comm,
96179dae17dSAnand Jain 					  task_pid_nr(current));
962a9261d41SAnand Jain 		}
963a9261d41SAnand Jain 
		/* Swap in the new path; the old name is only freed once the copy exists. */
964606686eeSJosef Bacik 		name = rcu_string_strdup(path, GFP_NOFS);
9659c6d173eSAnand Jain 		if (!name) {
9669c6d173eSAnand Jain 			mutex_unlock(&fs_devices->device_list_mutex);
967e124ece5SAnand Jain 			return ERR_PTR(-ENOMEM);
9689c6d173eSAnand Jain 		}
969606686eeSJosef Bacik 		rcu_string_free(device->name);
970606686eeSJosef Bacik 		rcu_assign_pointer(device->name, name);
		/* The device has a path again, so stop counting it as missing. */
971e6e674bdSAnand Jain 		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
972cd02dca5SChris Mason 			fs_devices->missing_devices--;
973e6e674bdSAnand Jain 			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
974cd02dca5SChris Mason 		}
9758a4b83ccSChris Mason 	}
9768a4b83ccSChris Mason 
97777bdae4dSAnand Jain 	/*
97877bdae4dSAnand Jain 	 * Unmount does not free the btrfs_device struct but would zero
97977bdae4dSAnand Jain 	 * generation along with most of the other members. So just update
98077bdae4dSAnand Jain 	 * it back. We need it to pick the disk with largest generation
98177bdae4dSAnand Jain 	 * (as above).
98277bdae4dSAnand Jain 	 */
983d1a63002SNikolay Borisov 	if (!fs_devices->opened) {
98477bdae4dSAnand Jain 		device->generation = found_transid;
985d1a63002SNikolay Borisov 		fs_devices->latest_generation = max_t(u64, found_transid,
986d1a63002SNikolay Borisov 						fs_devices->latest_generation);
987d1a63002SNikolay Borisov 	}
98877bdae4dSAnand Jain 
	/* Refreshed on every successful scan, opened or not. */
989f2788d2fSAnand Jain 	fs_devices->total_devices = btrfs_super_num_devices(disk_super);
990f2788d2fSAnand Jain 
9919c6d173eSAnand Jain 	mutex_unlock(&fs_devices->device_list_mutex);
992e124ece5SAnand Jain 	return device;
9938a4b83ccSChris Mason }
9948a4b83ccSChris Mason 
995e4404d6eSYan Zheng static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
996e4404d6eSYan Zheng {
997e4404d6eSYan Zheng 	struct btrfs_fs_devices *fs_devices;
998e4404d6eSYan Zheng 	struct btrfs_device *device;
999e4404d6eSYan Zheng 	struct btrfs_device *orig_dev;
1000d2979aa2SAnand Jain 	int ret = 0;
1001e4404d6eSYan Zheng 
10027239ff4bSNikolay Borisov 	fs_devices = alloc_fs_devices(orig->fsid, NULL);
10032208a378SIlya Dryomov 	if (IS_ERR(fs_devices))
10042208a378SIlya Dryomov 		return fs_devices;
1005e4404d6eSYan Zheng 
1006adbbb863SMiao Xie 	mutex_lock(&orig->device_list_mutex);
100702db0844SJosef Bacik 	fs_devices->total_devices = orig->total_devices;
1008e4404d6eSYan Zheng 
1009e4404d6eSYan Zheng 	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
1010606686eeSJosef Bacik 		struct rcu_string *name;
1011606686eeSJosef Bacik 
101212bd2fc0SIlya Dryomov 		device = btrfs_alloc_device(NULL, &orig_dev->devid,
101312bd2fc0SIlya Dryomov 					    orig_dev->uuid);
1014d2979aa2SAnand Jain 		if (IS_ERR(device)) {
1015d2979aa2SAnand Jain 			ret = PTR_ERR(device);
1016e4404d6eSYan Zheng 			goto error;
1017d2979aa2SAnand Jain 		}
1018e4404d6eSYan Zheng 
1019606686eeSJosef Bacik 		/*
1020606686eeSJosef Bacik 		 * This is ok to do without rcu read locked because we hold the
1021606686eeSJosef Bacik 		 * uuid mutex so nothing we touch in here is going to disappear.
1022606686eeSJosef Bacik 		 */
1023e755f780SAnand Jain 		if (orig_dev->name) {
102478f2c9e6SDavid Sterba 			name = rcu_string_strdup(orig_dev->name->str,
102578f2c9e6SDavid Sterba 					GFP_KERNEL);
1026606686eeSJosef Bacik 			if (!name) {
1027a425f9d4SDavid Sterba 				btrfs_free_device(device);
1028d2979aa2SAnand Jain 				ret = -ENOMEM;
1029e4404d6eSYan Zheng 				goto error;
1030fd2696f3SJulia Lawall 			}
1031606686eeSJosef Bacik 			rcu_assign_pointer(device->name, name);
1032e755f780SAnand Jain 		}
1033e4404d6eSYan Zheng 
1034e4404d6eSYan Zheng 		list_add(&device->dev_list, &fs_devices->devices);
1035e4404d6eSYan Zheng 		device->fs_devices = fs_devices;
1036e4404d6eSYan Zheng 		fs_devices->num_devices++;
1037e4404d6eSYan Zheng 	}
1038adbbb863SMiao Xie 	mutex_unlock(&orig->device_list_mutex);
1039e4404d6eSYan Zheng 	return fs_devices;
1040e4404d6eSYan Zheng error:
1041adbbb863SMiao Xie 	mutex_unlock(&orig->device_list_mutex);
1042e4404d6eSYan Zheng 	free_fs_devices(fs_devices);
1043d2979aa2SAnand Jain 	return ERR_PTR(ret);
1044e4404d6eSYan Zheng }
1045e4404d6eSYan Zheng 
10463712ccb7SNikolay Borisov static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
1047*bacce86aSAnand Jain 				      struct btrfs_device **latest_dev)
1048dfe25020SChris Mason {
1049c6e30871SQinghuang Feng 	struct btrfs_device *device, *next;
1050a6b0d5c8SChris Mason 
105146224705SXiao Guangrong 	/* This is the initialized path, it is safe to release the devices. */
1052c6e30871SQinghuang Feng 	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
10533712ccb7SNikolay Borisov 		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
1054401e29c1SAnand Jain 			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
1055401e29c1SAnand Jain 				      &device->dev_state) &&
1056998a0671SAnand Jain 			    !test_bit(BTRFS_DEV_STATE_MISSING,
1057998a0671SAnand Jain 				      &device->dev_state) &&
10583712ccb7SNikolay Borisov 			    (!*latest_dev ||
10593712ccb7SNikolay Borisov 			     device->generation > (*latest_dev)->generation)) {
10603712ccb7SNikolay Borisov 				*latest_dev = device;
1061a6b0d5c8SChris Mason 			}
10622b82032cSYan Zheng 			continue;
1063a6b0d5c8SChris Mason 		}
10642b82032cSYan Zheng 
10658dabb742SStefan Behrens 		/*
1066cf89af14SAnand Jain 		 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
1067cf89af14SAnand Jain 		 * in btrfs_init_dev_replace() so just continue.
10688dabb742SStefan Behrens 		 */
1069cf89af14SAnand Jain 		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
10708dabb742SStefan Behrens 			continue;
1071cf89af14SAnand Jain 
1072a74a4b97SChris Mason 		if (device->bdev) {
1073d4d77629STejun Heo 			blkdev_put(device->bdev, device->mode);
10742b82032cSYan Zheng 			device->bdev = NULL;
1075a74a4b97SChris Mason 			fs_devices->open_devices--;
1076a74a4b97SChris Mason 		}
1077ebbede42SAnand Jain 		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
10782b82032cSYan Zheng 			list_del_init(&device->dev_alloc_list);
1079ebbede42SAnand Jain 			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
10802b82032cSYan Zheng 		}
10812b82032cSYan Zheng 		list_del_init(&device->dev_list);
10822b82032cSYan Zheng 		fs_devices->num_devices--;
1083a425f9d4SDavid Sterba 		btrfs_free_device(device);
10842b82032cSYan Zheng 	}
10852b82032cSYan Zheng 
10863712ccb7SNikolay Borisov }
10873712ccb7SNikolay Borisov 
10883712ccb7SNikolay Borisov /*
10893712ccb7SNikolay Borisov  * After we have read the system tree and know devids belonging to this
10903712ccb7SNikolay Borisov  * filesystem, remove the device which does not belong there.
10913712ccb7SNikolay Borisov  */
1092*bacce86aSAnand Jain void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
10933712ccb7SNikolay Borisov {
10943712ccb7SNikolay Borisov 	struct btrfs_device *latest_dev = NULL;
1095944d3f9fSNikolay Borisov 	struct btrfs_fs_devices *seed_dev;
10963712ccb7SNikolay Borisov 
10973712ccb7SNikolay Borisov 	mutex_lock(&uuid_mutex);
1098*bacce86aSAnand Jain 	__btrfs_free_extra_devids(fs_devices, &latest_dev);
1099944d3f9fSNikolay Borisov 
1100944d3f9fSNikolay Borisov 	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
1101*bacce86aSAnand Jain 		__btrfs_free_extra_devids(seed_dev, &latest_dev);
11022b82032cSYan Zheng 
1103443f24feSMiao Xie 	fs_devices->latest_bdev = latest_dev->bdev;
1104a6b0d5c8SChris Mason 
1105dfe25020SChris Mason 	mutex_unlock(&uuid_mutex);
1106dfe25020SChris Mason }
1107a0af469bSChris Mason 
110814238819SAnand Jain static void btrfs_close_bdev(struct btrfs_device *device)
110914238819SAnand Jain {
111008ffcae8SDavid Sterba 	if (!device->bdev)
111108ffcae8SDavid Sterba 		return;
111208ffcae8SDavid Sterba 
1113ebbede42SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
111414238819SAnand Jain 		sync_blockdev(device->bdev);
111514238819SAnand Jain 		invalidate_bdev(device->bdev);
111614238819SAnand Jain 	}
111714238819SAnand Jain 
111814238819SAnand Jain 	blkdev_put(device->bdev, device->mode);
111914238819SAnand Jain }
112014238819SAnand Jain 
1121959b1c04SNikolay Borisov static void btrfs_close_one_device(struct btrfs_device *device)
1122f448341aSAnand Jain {
1123f448341aSAnand Jain 	struct btrfs_fs_devices *fs_devices = device->fs_devices;
1124f448341aSAnand Jain 
1125ebbede42SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1126f448341aSAnand Jain 	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
1127f448341aSAnand Jain 		list_del_init(&device->dev_alloc_list);
1128f448341aSAnand Jain 		fs_devices->rw_devices--;
1129f448341aSAnand Jain 	}
1130f448341aSAnand Jain 
1131e6e674bdSAnand Jain 	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
1132f448341aSAnand Jain 		fs_devices->missing_devices--;
1133f448341aSAnand Jain 
1134959b1c04SNikolay Borisov 	btrfs_close_bdev(device);
1135321f69f8SJohannes Thumshirn 	if (device->bdev) {
11363fff3975SJohannes Thumshirn 		fs_devices->open_devices--;
1137321f69f8SJohannes Thumshirn 		device->bdev = NULL;
1138f448341aSAnand Jain 	}
1139321f69f8SJohannes Thumshirn 	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1140f448341aSAnand Jain 
1141321f69f8SJohannes Thumshirn 	device->fs_info = NULL;
1142321f69f8SJohannes Thumshirn 	atomic_set(&device->dev_stats_ccnt, 0);
1143321f69f8SJohannes Thumshirn 	extent_io_tree_release(&device->alloc_state);
1144959b1c04SNikolay Borisov 
1145321f69f8SJohannes Thumshirn 	/* Verify the device is back in a pristine state  */
1146321f69f8SJohannes Thumshirn 	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
1147321f69f8SJohannes Thumshirn 	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1148321f69f8SJohannes Thumshirn 	ASSERT(list_empty(&device->dev_alloc_list));
1149321f69f8SJohannes Thumshirn 	ASSERT(list_empty(&device->post_commit_list));
1150321f69f8SJohannes Thumshirn 	ASSERT(atomic_read(&device->reada_in_flight) == 0);
1151f448341aSAnand Jain }
1152f448341aSAnand Jain 
115354eed6aeSNikolay Borisov static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
11548a4b83ccSChris Mason {
11552037a093SSasha Levin 	struct btrfs_device *device, *tmp;
1156e4404d6eSYan Zheng 
1157425c6ed6SJosef Bacik 	lockdep_assert_held(&uuid_mutex);
1158425c6ed6SJosef Bacik 
11592b82032cSYan Zheng 	if (--fs_devices->opened > 0)
116054eed6aeSNikolay Borisov 		return;
11618a4b83ccSChris Mason 
1162425c6ed6SJosef Bacik 	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
1163959b1c04SNikolay Borisov 		btrfs_close_one_device(device);
1164c9513edbSXiao Guangrong 
1165e4404d6eSYan Zheng 	WARN_ON(fs_devices->open_devices);
1166e4404d6eSYan Zheng 	WARN_ON(fs_devices->rw_devices);
11672b82032cSYan Zheng 	fs_devices->opened = 0;
11680395d84fSJohannes Thumshirn 	fs_devices->seeding = false;
1169c4989c2fSNikolay Borisov 	fs_devices->fs_info = NULL;
11708a4b83ccSChris Mason }
11718a4b83ccSChris Mason 
117254eed6aeSNikolay Borisov void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
11732b82032cSYan Zheng {
1174944d3f9fSNikolay Borisov 	LIST_HEAD(list);
1175944d3f9fSNikolay Borisov 	struct btrfs_fs_devices *tmp;
11762b82032cSYan Zheng 
11772b82032cSYan Zheng 	mutex_lock(&uuid_mutex);
117854eed6aeSNikolay Borisov 	close_fs_devices(fs_devices);
1179944d3f9fSNikolay Borisov 	if (!fs_devices->opened)
1180944d3f9fSNikolay Borisov 		list_splice_init(&fs_devices->seed_list, &list);
1181e4404d6eSYan Zheng 
1182944d3f9fSNikolay Borisov 	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
11830226e0ebSAnand Jain 		close_fs_devices(fs_devices);
1184944d3f9fSNikolay Borisov 		list_del(&fs_devices->seed_list);
1185e4404d6eSYan Zheng 		free_fs_devices(fs_devices);
1186e4404d6eSYan Zheng 	}
1187425c6ed6SJosef Bacik 	mutex_unlock(&uuid_mutex);
11882b82032cSYan Zheng }
11892b82032cSYan Zheng 
1190897fb573SAnand Jain static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
119197288f2cSChristoph Hellwig 				fmode_t flags, void *holder)
11928a4b83ccSChris Mason {
11938a4b83ccSChris Mason 	struct btrfs_device *device;
1194443f24feSMiao Xie 	struct btrfs_device *latest_dev = NULL;
119596c2e067SAnand Jain 	struct btrfs_device *tmp_device;
11968a4b83ccSChris Mason 
1197d4d77629STejun Heo 	flags |= FMODE_EXCL;
1198d4d77629STejun Heo 
119996c2e067SAnand Jain 	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
120096c2e067SAnand Jain 				 dev_list) {
120196c2e067SAnand Jain 		int ret;
1202a0af469bSChris Mason 
120396c2e067SAnand Jain 		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
120496c2e067SAnand Jain 		if (ret == 0 &&
120596c2e067SAnand Jain 		    (!latest_dev || device->generation > latest_dev->generation)) {
12069f050db4SAnand Jain 			latest_dev = device;
120796c2e067SAnand Jain 		} else if (ret == -ENODATA) {
120896c2e067SAnand Jain 			fs_devices->num_devices--;
120996c2e067SAnand Jain 			list_del(&device->dev_list);
121096c2e067SAnand Jain 			btrfs_free_device(device);
121196c2e067SAnand Jain 		}
12128a4b83ccSChris Mason 	}
12131ed802c9SAnand Jain 	if (fs_devices->open_devices == 0)
12141ed802c9SAnand Jain 		return -EINVAL;
12151ed802c9SAnand Jain 
12162b82032cSYan Zheng 	fs_devices->opened = 1;
1217443f24feSMiao Xie 	fs_devices->latest_bdev = latest_dev->bdev;
12182b82032cSYan Zheng 	fs_devices->total_rw_bytes = 0;
1219c4a816c6SNaohiro Aota 	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
122033fd2f71SAnand Jain 	fs_devices->read_policy = BTRFS_READ_POLICY_PID;
12211ed802c9SAnand Jain 
12221ed802c9SAnand Jain 	return 0;
12232b82032cSYan Zheng }
12242b82032cSYan Zheng 
1225f8e10cd3SAnand Jain static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
1226f8e10cd3SAnand Jain {
1227f8e10cd3SAnand Jain 	struct btrfs_device *dev1, *dev2;
1228f8e10cd3SAnand Jain 
1229f8e10cd3SAnand Jain 	dev1 = list_entry(a, struct btrfs_device, dev_list);
1230f8e10cd3SAnand Jain 	dev2 = list_entry(b, struct btrfs_device, dev_list);
1231f8e10cd3SAnand Jain 
1232f8e10cd3SAnand Jain 	if (dev1->devid < dev2->devid)
1233f8e10cd3SAnand Jain 		return -1;
1234f8e10cd3SAnand Jain 	else if (dev1->devid > dev2->devid)
1235f8e10cd3SAnand Jain 		return 1;
1236f8e10cd3SAnand Jain 	return 0;
1237f8e10cd3SAnand Jain }
1238f8e10cd3SAnand Jain 
12392b82032cSYan Zheng int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
124097288f2cSChristoph Hellwig 		       fmode_t flags, void *holder)
12412b82032cSYan Zheng {
12422b82032cSYan Zheng 	int ret;
12432b82032cSYan Zheng 
1244f5194e34SDavid Sterba 	lockdep_assert_held(&uuid_mutex);
124518c850fdSJosef Bacik 	/*
124618c850fdSJosef Bacik 	 * The device_list_mutex cannot be taken here in case opening the
124718c850fdSJosef Bacik 	 * underlying device takes further locks like bd_mutex.
124818c850fdSJosef Bacik 	 *
124918c850fdSJosef Bacik 	 * We also don't need the lock here as this is called during mount and
125018c850fdSJosef Bacik 	 * exclusion is provided by uuid_mutex
125118c850fdSJosef Bacik 	 */
1252f5194e34SDavid Sterba 
12532b82032cSYan Zheng 	if (fs_devices->opened) {
12542b82032cSYan Zheng 		fs_devices->opened++;
12552b82032cSYan Zheng 		ret = 0;
12562b82032cSYan Zheng 	} else {
1257f8e10cd3SAnand Jain 		list_sort(NULL, &fs_devices->devices, devid_cmp);
1258897fb573SAnand Jain 		ret = open_fs_devices(fs_devices, flags, holder);
12592b82032cSYan Zheng 	}
1260542c5908SAnand Jain 
12618a4b83ccSChris Mason 	return ret;
12628a4b83ccSChris Mason }
12638a4b83ccSChris Mason 
/*
 * Drop the reference on the page that backs @super.  @super may be any
 * address inside that page; the page is looked up with virt_to_page().
 */
void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	put_page(virt_to_page(super));
}
12706cf86a00SAnand Jain 
1271b335eab8SNikolay Borisov static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
1272b335eab8SNikolay Borisov 						       u64 bytenr)
12736cf86a00SAnand Jain {
1274b335eab8SNikolay Borisov 	struct btrfs_super_block *disk_super;
1275b335eab8SNikolay Borisov 	struct page *page;
12766cf86a00SAnand Jain 	void *p;
12776cf86a00SAnand Jain 	pgoff_t index;
12786cf86a00SAnand Jain 
12796cf86a00SAnand Jain 	/* make sure our super fits in the device */
12806cf86a00SAnand Jain 	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
1281b335eab8SNikolay Borisov 		return ERR_PTR(-EINVAL);
12826cf86a00SAnand Jain 
12836cf86a00SAnand Jain 	/* make sure our super fits in the page */
1284b335eab8SNikolay Borisov 	if (sizeof(*disk_super) > PAGE_SIZE)
1285b335eab8SNikolay Borisov 		return ERR_PTR(-EINVAL);
12866cf86a00SAnand Jain 
12876cf86a00SAnand Jain 	/* make sure our super doesn't straddle pages on disk */
12886cf86a00SAnand Jain 	index = bytenr >> PAGE_SHIFT;
1289b335eab8SNikolay Borisov 	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
1290b335eab8SNikolay Borisov 		return ERR_PTR(-EINVAL);
12916cf86a00SAnand Jain 
12926cf86a00SAnand Jain 	/* pull in the page with our super */
1293b335eab8SNikolay Borisov 	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
12946cf86a00SAnand Jain 
1295b335eab8SNikolay Borisov 	if (IS_ERR(page))
1296b335eab8SNikolay Borisov 		return ERR_CAST(page);
12976cf86a00SAnand Jain 
1298b335eab8SNikolay Borisov 	p = page_address(page);
12996cf86a00SAnand Jain 
13006cf86a00SAnand Jain 	/* align our pointer to the offset of the super block */
1301b335eab8SNikolay Borisov 	disk_super = p + offset_in_page(bytenr);
13026cf86a00SAnand Jain 
1303b335eab8SNikolay Borisov 	if (btrfs_super_bytenr(disk_super) != bytenr ||
1304b335eab8SNikolay Borisov 	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
13058f32380dSJohannes Thumshirn 		btrfs_release_disk_super(p);
1306b335eab8SNikolay Borisov 		return ERR_PTR(-EINVAL);
13076cf86a00SAnand Jain 	}
13086cf86a00SAnand Jain 
1309b335eab8SNikolay Borisov 	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
1310b335eab8SNikolay Borisov 		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
13116cf86a00SAnand Jain 
1312b335eab8SNikolay Borisov 	return disk_super;
13136cf86a00SAnand Jain }
13146cf86a00SAnand Jain 
1315228a73abSAnand Jain int btrfs_forget_devices(const char *path)
1316228a73abSAnand Jain {
1317228a73abSAnand Jain 	int ret;
1318228a73abSAnand Jain 
1319228a73abSAnand Jain 	mutex_lock(&uuid_mutex);
1320228a73abSAnand Jain 	ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
1321228a73abSAnand Jain 	mutex_unlock(&uuid_mutex);
1322228a73abSAnand Jain 
1323228a73abSAnand Jain 	return ret;
1324228a73abSAnand Jain }
1325228a73abSAnand Jain 
13266f60cbd3SDavid Sterba /*
13276f60cbd3SDavid Sterba  * Look for a btrfs signature on a device. This may be called out of the mount path
13286f60cbd3SDavid Sterba  * and we are not allowed to call set_blocksize during the scan. The superblock
13296f60cbd3SDavid Sterba  * is read via pagecache
13306f60cbd3SDavid Sterba  */
133136350e95SGu Jinxiang struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
133236350e95SGu Jinxiang 					   void *holder)
13338a4b83ccSChris Mason {
13348a4b83ccSChris Mason 	struct btrfs_super_block *disk_super;
13354306a974SAnand Jain 	bool new_device_added = false;
133636350e95SGu Jinxiang 	struct btrfs_device *device = NULL;
13378a4b83ccSChris Mason 	struct block_device *bdev;
13386f60cbd3SDavid Sterba 	u64 bytenr;
13398a4b83ccSChris Mason 
1340899f9307SDavid Sterba 	lockdep_assert_held(&uuid_mutex);
1341899f9307SDavid Sterba 
13426f60cbd3SDavid Sterba 	/*
13436f60cbd3SDavid Sterba 	 * we would like to check all the supers, but that would make
13446f60cbd3SDavid Sterba 	 * a btrfs mount succeed after a mkfs from a different FS.
13456f60cbd3SDavid Sterba 	 * So, we need to add a special mount option to scan for
13466f60cbd3SDavid Sterba 	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
13476f60cbd3SDavid Sterba 	 */
13486f60cbd3SDavid Sterba 	bytenr = btrfs_sb_offset(0);
1349d4d77629STejun Heo 	flags |= FMODE_EXCL;
13506f60cbd3SDavid Sterba 
13516f60cbd3SDavid Sterba 	bdev = blkdev_get_by_path(path, flags, holder);
1352b6ed73bcSAnand Jain 	if (IS_ERR(bdev))
135336350e95SGu Jinxiang 		return ERR_CAST(bdev);
13546f60cbd3SDavid Sterba 
1355b335eab8SNikolay Borisov 	disk_super = btrfs_read_disk_super(bdev, bytenr);
1356b335eab8SNikolay Borisov 	if (IS_ERR(disk_super)) {
1357b335eab8SNikolay Borisov 		device = ERR_CAST(disk_super);
13586f60cbd3SDavid Sterba 		goto error_bdev_put;
135905a5c55dSAnand Jain 	}
13606f60cbd3SDavid Sterba 
13614306a974SAnand Jain 	device = device_list_add(path, disk_super, &new_device_added);
136236350e95SGu Jinxiang 	if (!IS_ERR(device)) {
13634306a974SAnand Jain 		if (new_device_added)
13644306a974SAnand Jain 			btrfs_free_stale_devices(path, device);
13654306a974SAnand Jain 	}
13666f60cbd3SDavid Sterba 
13678f32380dSJohannes Thumshirn 	btrfs_release_disk_super(disk_super);
13686f60cbd3SDavid Sterba 
13696f60cbd3SDavid Sterba error_bdev_put:
1370d4d77629STejun Heo 	blkdev_put(bdev, flags);
1371b6ed73bcSAnand Jain 
137236350e95SGu Jinxiang 	return device;
13738a4b83ccSChris Mason }
13740b86a832SChris Mason 
1375c152b63eSFilipe Manana /*
13761c11b63eSJeff Mahoney  * Try to find a chunk that intersects [start, start + len] range and when one
13771c11b63eSJeff Mahoney  * such is found, record the end of it in *start
1378c152b63eSFilipe Manana  */
13791c11b63eSJeff Mahoney static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
13801c11b63eSJeff Mahoney 				    u64 len)
13811c11b63eSJeff Mahoney {
13821c11b63eSJeff Mahoney 	u64 physical_start, physical_end;
13836df9a95eSJosef Bacik 
13841c11b63eSJeff Mahoney 	lockdep_assert_held(&device->fs_info->chunk_mutex);
13851c11b63eSJeff Mahoney 
13861c11b63eSJeff Mahoney 	if (!find_first_extent_bit(&device->alloc_state, *start,
13871c11b63eSJeff Mahoney 				   &physical_start, &physical_end,
13881c11b63eSJeff Mahoney 				   CHUNK_ALLOCATED, NULL)) {
13891c11b63eSJeff Mahoney 
13901c11b63eSJeff Mahoney 		if (in_range(physical_start, *start, len) ||
13911c11b63eSJeff Mahoney 		    in_range(*start, physical_start,
13921c11b63eSJeff Mahoney 			     physical_end - physical_start)) {
13931c11b63eSJeff Mahoney 			*start = physical_end + 1;
13941c11b63eSJeff Mahoney 			return true;
13951c11b63eSJeff Mahoney 		}
13961c11b63eSJeff Mahoney 	}
13971c11b63eSJeff Mahoney 	return false;
13986df9a95eSJosef Bacik }
13996df9a95eSJosef Bacik 
14003b4ffa40SNaohiro Aota static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
14013b4ffa40SNaohiro Aota {
14023b4ffa40SNaohiro Aota 	switch (device->fs_devices->chunk_alloc_policy) {
14033b4ffa40SNaohiro Aota 	case BTRFS_CHUNK_ALLOC_REGULAR:
14043b4ffa40SNaohiro Aota 		/*
14053b4ffa40SNaohiro Aota 		 * We don't want to overwrite the superblock on the drive nor
14063b4ffa40SNaohiro Aota 		 * any area used by the boot loader (grub for example), so we
14073b4ffa40SNaohiro Aota 		 * make sure to start at an offset of at least 1MB.
14083b4ffa40SNaohiro Aota 		 */
14093b4ffa40SNaohiro Aota 		return max_t(u64, start, SZ_1M);
14103b4ffa40SNaohiro Aota 	default:
14113b4ffa40SNaohiro Aota 		BUG();
14123b4ffa40SNaohiro Aota 	}
14133b4ffa40SNaohiro Aota }
14143b4ffa40SNaohiro Aota 
14153b4ffa40SNaohiro Aota /**
14163b4ffa40SNaohiro Aota  * dev_extent_hole_check - check if specified hole is suitable for allocation
14173b4ffa40SNaohiro Aota  * @device:	the device which we have the hole
14183b4ffa40SNaohiro Aota  * @hole_start: starting position of the hole
14193b4ffa40SNaohiro Aota  * @hole_size:	the size of the hole
14203b4ffa40SNaohiro Aota  * @num_bytes:	the size of the free space that we need
14213b4ffa40SNaohiro Aota  *
14223b4ffa40SNaohiro Aota  * This function may modify @hole_start and @hole_end to reflect the suitable
14233b4ffa40SNaohiro Aota  * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
14243b4ffa40SNaohiro Aota  */
14253b4ffa40SNaohiro Aota static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
14263b4ffa40SNaohiro Aota 				  u64 *hole_size, u64 num_bytes)
14273b4ffa40SNaohiro Aota {
14283b4ffa40SNaohiro Aota 	bool changed = false;
14293b4ffa40SNaohiro Aota 	u64 hole_end = *hole_start + *hole_size;
14303b4ffa40SNaohiro Aota 
14313b4ffa40SNaohiro Aota 	/*
14323b4ffa40SNaohiro Aota 	 * Check before we set max_hole_start, otherwise we could end up
14333b4ffa40SNaohiro Aota 	 * sending back this offset anyway.
14343b4ffa40SNaohiro Aota 	 */
14353b4ffa40SNaohiro Aota 	if (contains_pending_extent(device, hole_start, *hole_size)) {
14363b4ffa40SNaohiro Aota 		if (hole_end >= *hole_start)
14373b4ffa40SNaohiro Aota 			*hole_size = hole_end - *hole_start;
14383b4ffa40SNaohiro Aota 		else
14393b4ffa40SNaohiro Aota 			*hole_size = 0;
14403b4ffa40SNaohiro Aota 		changed = true;
14413b4ffa40SNaohiro Aota 	}
14423b4ffa40SNaohiro Aota 
14433b4ffa40SNaohiro Aota 	switch (device->fs_devices->chunk_alloc_policy) {
14443b4ffa40SNaohiro Aota 	case BTRFS_CHUNK_ALLOC_REGULAR:
14453b4ffa40SNaohiro Aota 		/* No extra check */
14463b4ffa40SNaohiro Aota 		break;
14473b4ffa40SNaohiro Aota 	default:
14483b4ffa40SNaohiro Aota 		BUG();
14493b4ffa40SNaohiro Aota 	}
14503b4ffa40SNaohiro Aota 
14513b4ffa40SNaohiro Aota 	return changed;
14523b4ffa40SNaohiro Aota }
14536df9a95eSJosef Bacik 
14540b86a832SChris Mason /*
1455499f377fSJeff Mahoney  * find_free_dev_extent_start - find free space in the specified device
14567bfc837dSMiao Xie  * @device:	  the device which we search the free space in
14577bfc837dSMiao Xie  * @num_bytes:	  the size of the free space that we need
1458499f377fSJeff Mahoney  * @search_start: the position from which to begin the search
14597bfc837dSMiao Xie  * @start:	  store the start of the free space.
1460499f377fSJeff Mahoney  * @len:	  the size of the free space. that we find, or the size
1461499f377fSJeff Mahoney  *		  of the max free space if we don't find suitable free space
14627bfc837dSMiao Xie  *
14630b86a832SChris Mason  * this uses a pretty simple search, the expectation is that it is
14640b86a832SChris Mason  * called very infrequently and that a given device has a small number
14650b86a832SChris Mason  * of extents
14667bfc837dSMiao Xie  *
14677bfc837dSMiao Xie  * @start is used to store the start of the free space if we find. But if we
14687bfc837dSMiao Xie  * don't find suitable free space, it will be used to store the start position
14697bfc837dSMiao Xie  * of the max free space.
14707bfc837dSMiao Xie  *
14717bfc837dSMiao Xie  * @len is used to store the size of the free space that we find.
14727bfc837dSMiao Xie  * But if we don't find suitable free space, it is used to store the size of
14737bfc837dSMiao Xie  * the max free space.
1474135da976SQu Wenruo  *
1475135da976SQu Wenruo  * NOTE: This function will search *commit* root of device tree, and does extra
1476135da976SQu Wenruo  * check to ensure dev extents are not double allocated.
1477135da976SQu Wenruo  * This makes the function safe to allocate dev extents but may not report
1478135da976SQu Wenruo  * correct usable device space, as device extent freed in current transaction
1479135da976SQu Wenruo  * is not reported as avaiable.
14800b86a832SChris Mason  */
14819e3246a5SQu Wenruo static int find_free_dev_extent_start(struct btrfs_device *device,
14829e3246a5SQu Wenruo 				u64 num_bytes, u64 search_start, u64 *start,
14839e3246a5SQu Wenruo 				u64 *len)
14840b86a832SChris Mason {
14850b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = device->fs_info;
14860b246afaSJeff Mahoney 	struct btrfs_root *root = fs_info->dev_root;
14870b86a832SChris Mason 	struct btrfs_key key;
14887bfc837dSMiao Xie 	struct btrfs_dev_extent *dev_extent;
14892b82032cSYan Zheng 	struct btrfs_path *path;
14907bfc837dSMiao Xie 	u64 hole_size;
14917bfc837dSMiao Xie 	u64 max_hole_start;
14927bfc837dSMiao Xie 	u64 max_hole_size;
14937bfc837dSMiao Xie 	u64 extent_end;
14940b86a832SChris Mason 	u64 search_end = device->total_bytes;
14950b86a832SChris Mason 	int ret;
14967bfc837dSMiao Xie 	int slot;
14970b86a832SChris Mason 	struct extent_buffer *l;
14988cdc7c5bSFilipe Manana 
14993b4ffa40SNaohiro Aota 	search_start = dev_extent_search_start(device, search_start);
15000b86a832SChris Mason 
15016df9a95eSJosef Bacik 	path = btrfs_alloc_path();
15026df9a95eSJosef Bacik 	if (!path)
15036df9a95eSJosef Bacik 		return -ENOMEM;
1504f2ab7618SZhao Lei 
15057bfc837dSMiao Xie 	max_hole_start = search_start;
15067bfc837dSMiao Xie 	max_hole_size = 0;
15077bfc837dSMiao Xie 
1508f2ab7618SZhao Lei again:
1509401e29c1SAnand Jain 	if (search_start >= search_end ||
1510401e29c1SAnand Jain 		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
15117bfc837dSMiao Xie 		ret = -ENOSPC;
15126df9a95eSJosef Bacik 		goto out;
15137bfc837dSMiao Xie 	}
15147bfc837dSMiao Xie 
1515e4058b54SDavid Sterba 	path->reada = READA_FORWARD;
15166df9a95eSJosef Bacik 	path->search_commit_root = 1;
15176df9a95eSJosef Bacik 	path->skip_locking = 1;
15187bfc837dSMiao Xie 
15190b86a832SChris Mason 	key.objectid = device->devid;
15200b86a832SChris Mason 	key.offset = search_start;
15210b86a832SChris Mason 	key.type = BTRFS_DEV_EXTENT_KEY;
15227bfc837dSMiao Xie 
1523125ccb0aSLi Zefan 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
15240b86a832SChris Mason 	if (ret < 0)
15257bfc837dSMiao Xie 		goto out;
15261fcbac58SYan Zheng 	if (ret > 0) {
15271fcbac58SYan Zheng 		ret = btrfs_previous_item(root, path, key.objectid, key.type);
15280b86a832SChris Mason 		if (ret < 0)
15297bfc837dSMiao Xie 			goto out;
15301fcbac58SYan Zheng 	}
15317bfc837dSMiao Xie 
15320b86a832SChris Mason 	while (1) {
15330b86a832SChris Mason 		l = path->nodes[0];
15340b86a832SChris Mason 		slot = path->slots[0];
15350b86a832SChris Mason 		if (slot >= btrfs_header_nritems(l)) {
15360b86a832SChris Mason 			ret = btrfs_next_leaf(root, path);
15370b86a832SChris Mason 			if (ret == 0)
15380b86a832SChris Mason 				continue;
15390b86a832SChris Mason 			if (ret < 0)
15407bfc837dSMiao Xie 				goto out;
15417bfc837dSMiao Xie 
15427bfc837dSMiao Xie 			break;
15430b86a832SChris Mason 		}
15440b86a832SChris Mason 		btrfs_item_key_to_cpu(l, &key, slot);
15450b86a832SChris Mason 
15460b86a832SChris Mason 		if (key.objectid < device->devid)
15470b86a832SChris Mason 			goto next;
15480b86a832SChris Mason 
15490b86a832SChris Mason 		if (key.objectid > device->devid)
15507bfc837dSMiao Xie 			break;
15510b86a832SChris Mason 
1552962a298fSDavid Sterba 		if (key.type != BTRFS_DEV_EXTENT_KEY)
15530b86a832SChris Mason 			goto next;
15540b86a832SChris Mason 
15557bfc837dSMiao Xie 		if (key.offset > search_start) {
15567bfc837dSMiao Xie 			hole_size = key.offset - search_start;
15573b4ffa40SNaohiro Aota 			dev_extent_hole_check(device, &search_start, &hole_size,
15583b4ffa40SNaohiro Aota 					      num_bytes);
15596df9a95eSJosef Bacik 
15607bfc837dSMiao Xie 			if (hole_size > max_hole_size) {
15617bfc837dSMiao Xie 				max_hole_start = search_start;
15627bfc837dSMiao Xie 				max_hole_size = hole_size;
15637bfc837dSMiao Xie 			}
15647bfc837dSMiao Xie 
15657bfc837dSMiao Xie 			/*
15667bfc837dSMiao Xie 			 * If this free space is greater than which we need,
15677bfc837dSMiao Xie 			 * it must be the max free space that we have found
15687bfc837dSMiao Xie 			 * until now, so max_hole_start must point to the start
15697bfc837dSMiao Xie 			 * of this free space and the length of this free space
15707bfc837dSMiao Xie 			 * is stored in max_hole_size. Thus, we return
15717bfc837dSMiao Xie 			 * max_hole_start and max_hole_size and go back to the
15727bfc837dSMiao Xie 			 * caller.
15737bfc837dSMiao Xie 			 */
15747bfc837dSMiao Xie 			if (hole_size >= num_bytes) {
15757bfc837dSMiao Xie 				ret = 0;
15767bfc837dSMiao Xie 				goto out;
15777bfc837dSMiao Xie 			}
15787bfc837dSMiao Xie 		}
15797bfc837dSMiao Xie 
15800b86a832SChris Mason 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
15817bfc837dSMiao Xie 		extent_end = key.offset + btrfs_dev_extent_length(l,
15827bfc837dSMiao Xie 								  dev_extent);
15837bfc837dSMiao Xie 		if (extent_end > search_start)
15847bfc837dSMiao Xie 			search_start = extent_end;
15850b86a832SChris Mason next:
15860b86a832SChris Mason 		path->slots[0]++;
15870b86a832SChris Mason 		cond_resched();
15880b86a832SChris Mason 	}
15890b86a832SChris Mason 
159038c01b96Sliubo 	/*
159138c01b96Sliubo 	 * At this point, search_start should be the end of
159238c01b96Sliubo 	 * allocated dev extents, and when shrinking the device,
159338c01b96Sliubo 	 * search_end may be smaller than search_start.
159438c01b96Sliubo 	 */
1595f2ab7618SZhao Lei 	if (search_end > search_start) {
15967bfc837dSMiao Xie 		hole_size = search_end - search_start;
15973b4ffa40SNaohiro Aota 		if (dev_extent_hole_check(device, &search_start, &hole_size,
15983b4ffa40SNaohiro Aota 					  num_bytes)) {
1599f2ab7618SZhao Lei 			btrfs_release_path(path);
1600f2ab7618SZhao Lei 			goto again;
1601f2ab7618SZhao Lei 		}
1602f2ab7618SZhao Lei 
16037bfc837dSMiao Xie 		if (hole_size > max_hole_size) {
16047bfc837dSMiao Xie 			max_hole_start = search_start;
16057bfc837dSMiao Xie 			max_hole_size = hole_size;
16060b86a832SChris Mason 		}
16076df9a95eSJosef Bacik 	}
16086df9a95eSJosef Bacik 
16097bfc837dSMiao Xie 	/* See above. */
1610f2ab7618SZhao Lei 	if (max_hole_size < num_bytes)
16117bfc837dSMiao Xie 		ret = -ENOSPC;
16127bfc837dSMiao Xie 	else
16132b82032cSYan Zheng 		ret = 0;
16140b86a832SChris Mason 
16157bfc837dSMiao Xie out:
16162b82032cSYan Zheng 	btrfs_free_path(path);
16177bfc837dSMiao Xie 	*start = max_hole_start;
1618b2117a39SMiao Xie 	if (len)
16197bfc837dSMiao Xie 		*len = max_hole_size;
16200b86a832SChris Mason 	return ret;
16210b86a832SChris Mason }
16220b86a832SChris Mason 
162360dfdf25SNikolay Borisov int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1624499f377fSJeff Mahoney 			 u64 *start, u64 *len)
1625499f377fSJeff Mahoney {
1626499f377fSJeff Mahoney 	/* FIXME use last free of some kind */
162760dfdf25SNikolay Borisov 	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
1628499f377fSJeff Mahoney }
1629499f377fSJeff Mahoney 
1630b2950863SChristoph Hellwig static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
16318f18cf13SChris Mason 			  struct btrfs_device *device,
16322196d6e8SMiao Xie 			  u64 start, u64 *dev_extent_len)
16338f18cf13SChris Mason {
16340b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = device->fs_info;
16350b246afaSJeff Mahoney 	struct btrfs_root *root = fs_info->dev_root;
16368f18cf13SChris Mason 	int ret;
16378f18cf13SChris Mason 	struct btrfs_path *path;
16388f18cf13SChris Mason 	struct btrfs_key key;
1639a061fc8dSChris Mason 	struct btrfs_key found_key;
1640a061fc8dSChris Mason 	struct extent_buffer *leaf = NULL;
1641a061fc8dSChris Mason 	struct btrfs_dev_extent *extent = NULL;
16428f18cf13SChris Mason 
16438f18cf13SChris Mason 	path = btrfs_alloc_path();
16448f18cf13SChris Mason 	if (!path)
16458f18cf13SChris Mason 		return -ENOMEM;
16468f18cf13SChris Mason 
16478f18cf13SChris Mason 	key.objectid = device->devid;
16488f18cf13SChris Mason 	key.offset = start;
16498f18cf13SChris Mason 	key.type = BTRFS_DEV_EXTENT_KEY;
1650924cd8fbSMiao Xie again:
16518f18cf13SChris Mason 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1652a061fc8dSChris Mason 	if (ret > 0) {
1653a061fc8dSChris Mason 		ret = btrfs_previous_item(root, path, key.objectid,
1654a061fc8dSChris Mason 					  BTRFS_DEV_EXTENT_KEY);
1655b0b802d7STsutomu Itoh 		if (ret)
1656b0b802d7STsutomu Itoh 			goto out;
1657a061fc8dSChris Mason 		leaf = path->nodes[0];
1658a061fc8dSChris Mason 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1659a061fc8dSChris Mason 		extent = btrfs_item_ptr(leaf, path->slots[0],
1660a061fc8dSChris Mason 					struct btrfs_dev_extent);
1661a061fc8dSChris Mason 		BUG_ON(found_key.offset > start || found_key.offset +
1662a061fc8dSChris Mason 		       btrfs_dev_extent_length(leaf, extent) < start);
1663924cd8fbSMiao Xie 		key = found_key;
1664924cd8fbSMiao Xie 		btrfs_release_path(path);
1665924cd8fbSMiao Xie 		goto again;
1666a061fc8dSChris Mason 	} else if (ret == 0) {
1667a061fc8dSChris Mason 		leaf = path->nodes[0];
1668a061fc8dSChris Mason 		extent = btrfs_item_ptr(leaf, path->slots[0],
1669a061fc8dSChris Mason 					struct btrfs_dev_extent);
167079787eaaSJeff Mahoney 	} else {
16710b246afaSJeff Mahoney 		btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
167279787eaaSJeff Mahoney 		goto out;
1673a061fc8dSChris Mason 	}
16748f18cf13SChris Mason 
16752196d6e8SMiao Xie 	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
16762196d6e8SMiao Xie 
16778f18cf13SChris Mason 	ret = btrfs_del_item(trans, root, path);
167879787eaaSJeff Mahoney 	if (ret) {
16790b246afaSJeff Mahoney 		btrfs_handle_fs_error(fs_info, ret,
168079787eaaSJeff Mahoney 				      "Failed to remove dev extent item");
168113212b54SZhao Lei 	} else {
16823204d33cSJosef Bacik 		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
168379787eaaSJeff Mahoney 	}
1684b0b802d7STsutomu Itoh out:
16858f18cf13SChris Mason 	btrfs_free_path(path);
16868f18cf13SChris Mason 	return ret;
16878f18cf13SChris Mason }
16888f18cf13SChris Mason 
168948a3b636SEric Sandeen static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
16900b86a832SChris Mason 				  struct btrfs_device *device,
16912b82032cSYan Zheng 				  u64 chunk_offset, u64 start, u64 num_bytes)
16920b86a832SChris Mason {
16930b86a832SChris Mason 	int ret;
16940b86a832SChris Mason 	struct btrfs_path *path;
16950b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = device->fs_info;
16960b246afaSJeff Mahoney 	struct btrfs_root *root = fs_info->dev_root;
16970b86a832SChris Mason 	struct btrfs_dev_extent *extent;
16980b86a832SChris Mason 	struct extent_buffer *leaf;
16990b86a832SChris Mason 	struct btrfs_key key;
17000b86a832SChris Mason 
1701e12c9621SAnand Jain 	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1702401e29c1SAnand Jain 	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
17030b86a832SChris Mason 	path = btrfs_alloc_path();
17040b86a832SChris Mason 	if (!path)
17050b86a832SChris Mason 		return -ENOMEM;
17060b86a832SChris Mason 
17070b86a832SChris Mason 	key.objectid = device->devid;
17082b82032cSYan Zheng 	key.offset = start;
17090b86a832SChris Mason 	key.type = BTRFS_DEV_EXTENT_KEY;
17100b86a832SChris Mason 	ret = btrfs_insert_empty_item(trans, root, path, &key,
17110b86a832SChris Mason 				      sizeof(*extent));
17122cdcecbcSMark Fasheh 	if (ret)
17132cdcecbcSMark Fasheh 		goto out;
17140b86a832SChris Mason 
17150b86a832SChris Mason 	leaf = path->nodes[0];
17160b86a832SChris Mason 	extent = btrfs_item_ptr(leaf, path->slots[0],
17170b86a832SChris Mason 				struct btrfs_dev_extent);
1718b5d9071cSNikolay Borisov 	btrfs_set_dev_extent_chunk_tree(leaf, extent,
1719b5d9071cSNikolay Borisov 					BTRFS_CHUNK_TREE_OBJECTID);
17200ca00afbSNikolay Borisov 	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
17210ca00afbSNikolay Borisov 					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1722e17cade2SChris Mason 	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1723e17cade2SChris Mason 
17240b86a832SChris Mason 	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
17250b86a832SChris Mason 	btrfs_mark_buffer_dirty(leaf);
17262cdcecbcSMark Fasheh out:
17270b86a832SChris Mason 	btrfs_free_path(path);
17280b86a832SChris Mason 	return ret;
17290b86a832SChris Mason }
17300b86a832SChris Mason 
17316df9a95eSJosef Bacik static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
17320b86a832SChris Mason {
17336df9a95eSJosef Bacik 	struct extent_map_tree *em_tree;
17346df9a95eSJosef Bacik 	struct extent_map *em;
17356df9a95eSJosef Bacik 	struct rb_node *n;
17366df9a95eSJosef Bacik 	u64 ret = 0;
17370b86a832SChris Mason 
1738c8bf1b67SDavid Sterba 	em_tree = &fs_info->mapping_tree;
17396df9a95eSJosef Bacik 	read_lock(&em_tree->lock);
174007e1ce09SLiu Bo 	n = rb_last(&em_tree->map.rb_root);
17416df9a95eSJosef Bacik 	if (n) {
17426df9a95eSJosef Bacik 		em = rb_entry(n, struct extent_map, rb_node);
17436df9a95eSJosef Bacik 		ret = em->start + em->len;
1744e17cade2SChris Mason 	}
17456df9a95eSJosef Bacik 	read_unlock(&em_tree->lock);
17466df9a95eSJosef Bacik 
17470b86a832SChris Mason 	return ret;
17480b86a832SChris Mason }
17490b86a832SChris Mason 
175053f10659SIlya Dryomov static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
175153f10659SIlya Dryomov 				    u64 *devid_ret)
17520b86a832SChris Mason {
17530b86a832SChris Mason 	int ret;
17540b86a832SChris Mason 	struct btrfs_key key;
17550b86a832SChris Mason 	struct btrfs_key found_key;
17562b82032cSYan Zheng 	struct btrfs_path *path;
17572b82032cSYan Zheng 
17582b82032cSYan Zheng 	path = btrfs_alloc_path();
17592b82032cSYan Zheng 	if (!path)
17602b82032cSYan Zheng 		return -ENOMEM;
17610b86a832SChris Mason 
17620b86a832SChris Mason 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
17630b86a832SChris Mason 	key.type = BTRFS_DEV_ITEM_KEY;
17640b86a832SChris Mason 	key.offset = (u64)-1;
17650b86a832SChris Mason 
176653f10659SIlya Dryomov 	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
17670b86a832SChris Mason 	if (ret < 0)
17680b86a832SChris Mason 		goto error;
17690b86a832SChris Mason 
1770a06dee4dSAnand Jain 	if (ret == 0) {
1771a06dee4dSAnand Jain 		/* Corruption */
1772a06dee4dSAnand Jain 		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1773a06dee4dSAnand Jain 		ret = -EUCLEAN;
1774a06dee4dSAnand Jain 		goto error;
1775a06dee4dSAnand Jain 	}
17760b86a832SChris Mason 
177753f10659SIlya Dryomov 	ret = btrfs_previous_item(fs_info->chunk_root, path,
177853f10659SIlya Dryomov 				  BTRFS_DEV_ITEMS_OBJECTID,
17790b86a832SChris Mason 				  BTRFS_DEV_ITEM_KEY);
17800b86a832SChris Mason 	if (ret) {
178153f10659SIlya Dryomov 		*devid_ret = 1;
17820b86a832SChris Mason 	} else {
17830b86a832SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
17840b86a832SChris Mason 				      path->slots[0]);
178553f10659SIlya Dryomov 		*devid_ret = found_key.offset + 1;
17860b86a832SChris Mason 	}
17870b86a832SChris Mason 	ret = 0;
17880b86a832SChris Mason error:
17892b82032cSYan Zheng 	btrfs_free_path(path);
17900b86a832SChris Mason 	return ret;
17910b86a832SChris Mason }
17920b86a832SChris Mason 
17930b86a832SChris Mason /*
17940b86a832SChris Mason  * the device information is stored in the chunk root
17950b86a832SChris Mason  * the btrfs_device struct should be fully filled in
17960b86a832SChris Mason  */
1797c74a0b02SAnand Jain static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
17980b86a832SChris Mason 			    struct btrfs_device *device)
17990b86a832SChris Mason {
18000b86a832SChris Mason 	int ret;
18010b86a832SChris Mason 	struct btrfs_path *path;
18020b86a832SChris Mason 	struct btrfs_dev_item *dev_item;
18030b86a832SChris Mason 	struct extent_buffer *leaf;
18040b86a832SChris Mason 	struct btrfs_key key;
18050b86a832SChris Mason 	unsigned long ptr;
18060b86a832SChris Mason 
18070b86a832SChris Mason 	path = btrfs_alloc_path();
18080b86a832SChris Mason 	if (!path)
18090b86a832SChris Mason 		return -ENOMEM;
18100b86a832SChris Mason 
18110b86a832SChris Mason 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
18120b86a832SChris Mason 	key.type = BTRFS_DEV_ITEM_KEY;
18132b82032cSYan Zheng 	key.offset = device->devid;
18140b86a832SChris Mason 
18158e87e856SNikolay Borisov 	ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
18168e87e856SNikolay Borisov 				      &key, sizeof(*dev_item));
18170b86a832SChris Mason 	if (ret)
18180b86a832SChris Mason 		goto out;
18190b86a832SChris Mason 
18200b86a832SChris Mason 	leaf = path->nodes[0];
18210b86a832SChris Mason 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
18220b86a832SChris Mason 
18230b86a832SChris Mason 	btrfs_set_device_id(leaf, dev_item, device->devid);
18242b82032cSYan Zheng 	btrfs_set_device_generation(leaf, dev_item, 0);
18250b86a832SChris Mason 	btrfs_set_device_type(leaf, dev_item, device->type);
18260b86a832SChris Mason 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
18270b86a832SChris Mason 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
18280b86a832SChris Mason 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
18297cc8e58dSMiao Xie 	btrfs_set_device_total_bytes(leaf, dev_item,
18307cc8e58dSMiao Xie 				     btrfs_device_get_disk_total_bytes(device));
18317cc8e58dSMiao Xie 	btrfs_set_device_bytes_used(leaf, dev_item,
18327cc8e58dSMiao Xie 				    btrfs_device_get_bytes_used(device));
1833e17cade2SChris Mason 	btrfs_set_device_group(leaf, dev_item, 0);
1834e17cade2SChris Mason 	btrfs_set_device_seek_speed(leaf, dev_item, 0);
1835e17cade2SChris Mason 	btrfs_set_device_bandwidth(leaf, dev_item, 0);
1836c3027eb5SChris Mason 	btrfs_set_device_start_offset(leaf, dev_item, 0);
18370b86a832SChris Mason 
1838410ba3a2SGeert Uytterhoeven 	ptr = btrfs_device_uuid(dev_item);
1839e17cade2SChris Mason 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
18401473b24eSGeert Uytterhoeven 	ptr = btrfs_device_fsid(dev_item);
1841de37aa51SNikolay Borisov 	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1842de37aa51SNikolay Borisov 			    ptr, BTRFS_FSID_SIZE);
18430b86a832SChris Mason 	btrfs_mark_buffer_dirty(leaf);
18440b86a832SChris Mason 
18452b82032cSYan Zheng 	ret = 0;
18460b86a832SChris Mason out:
18470b86a832SChris Mason 	btrfs_free_path(path);
18480b86a832SChris Mason 	return ret;
18490b86a832SChris Mason }
18508f18cf13SChris Mason 
18515a1972bdSQu Wenruo /*
18525a1972bdSQu Wenruo  * Function to update ctime/mtime for a given device path.
18535a1972bdSQu Wenruo  * Mainly used for ctime/mtime based probe like libblkid.
18545a1972bdSQu Wenruo  */
1855da353f6bSDavid Sterba static void update_dev_time(const char *path_name)
18565a1972bdSQu Wenruo {
18575a1972bdSQu Wenruo 	struct file *filp;
18585a1972bdSQu Wenruo 
18595a1972bdSQu Wenruo 	filp = filp_open(path_name, O_RDWR, 0);
186098af592fSAl Viro 	if (IS_ERR(filp))
18615a1972bdSQu Wenruo 		return;
18625a1972bdSQu Wenruo 	file_update_time(filp);
18635a1972bdSQu Wenruo 	filp_close(filp, NULL);
18645a1972bdSQu Wenruo }
18655a1972bdSQu Wenruo 
1866f331a952SDavid Sterba static int btrfs_rm_dev_item(struct btrfs_device *device)
1867a061fc8dSChris Mason {
1868f331a952SDavid Sterba 	struct btrfs_root *root = device->fs_info->chunk_root;
1869a061fc8dSChris Mason 	int ret;
1870a061fc8dSChris Mason 	struct btrfs_path *path;
1871a061fc8dSChris Mason 	struct btrfs_key key;
1872a061fc8dSChris Mason 	struct btrfs_trans_handle *trans;
1873a061fc8dSChris Mason 
1874a061fc8dSChris Mason 	path = btrfs_alloc_path();
1875a061fc8dSChris Mason 	if (!path)
1876a061fc8dSChris Mason 		return -ENOMEM;
1877a061fc8dSChris Mason 
1878a22285a6SYan, Zheng 	trans = btrfs_start_transaction(root, 0);
187998d5dc13STsutomu Itoh 	if (IS_ERR(trans)) {
188098d5dc13STsutomu Itoh 		btrfs_free_path(path);
188198d5dc13STsutomu Itoh 		return PTR_ERR(trans);
188298d5dc13STsutomu Itoh 	}
1883a061fc8dSChris Mason 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1884a061fc8dSChris Mason 	key.type = BTRFS_DEV_ITEM_KEY;
1885a061fc8dSChris Mason 	key.offset = device->devid;
1886a061fc8dSChris Mason 
1887a061fc8dSChris Mason 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
18885e9f2ad5SNikolay Borisov 	if (ret) {
18895e9f2ad5SNikolay Borisov 		if (ret > 0)
1890a061fc8dSChris Mason 			ret = -ENOENT;
18915e9f2ad5SNikolay Borisov 		btrfs_abort_transaction(trans, ret);
18925e9f2ad5SNikolay Borisov 		btrfs_end_transaction(trans);
1893a061fc8dSChris Mason 		goto out;
1894a061fc8dSChris Mason 	}
1895a061fc8dSChris Mason 
1896a061fc8dSChris Mason 	ret = btrfs_del_item(trans, root, path);
18975e9f2ad5SNikolay Borisov 	if (ret) {
18985e9f2ad5SNikolay Borisov 		btrfs_abort_transaction(trans, ret);
18995e9f2ad5SNikolay Borisov 		btrfs_end_transaction(trans);
19005e9f2ad5SNikolay Borisov 	}
19015e9f2ad5SNikolay Borisov 
1902a061fc8dSChris Mason out:
1903a061fc8dSChris Mason 	btrfs_free_path(path);
19045e9f2ad5SNikolay Borisov 	if (!ret)
19055e9f2ad5SNikolay Borisov 		ret = btrfs_commit_transaction(trans);
1906a061fc8dSChris Mason 	return ret;
1907a061fc8dSChris Mason }
1908a061fc8dSChris Mason 
19093cc31a0dSDavid Sterba /*
19103cc31a0dSDavid Sterba  * Verify that @num_devices satisfies the RAID profile constraints in the whole
19113cc31a0dSDavid Sterba  * filesystem. It's up to the caller to adjust that number regarding eg. device
19123cc31a0dSDavid Sterba  * replace.
19133cc31a0dSDavid Sterba  */
19143cc31a0dSDavid Sterba static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
19153cc31a0dSDavid Sterba 		u64 num_devices)
1916a061fc8dSChris Mason {
1917a061fc8dSChris Mason 	u64 all_avail;
1918de98ced9SMiao Xie 	unsigned seq;
1919418775a2SDavid Sterba 	int i;
1920a061fc8dSChris Mason 
1921de98ced9SMiao Xie 	do {
1922bd45ffbcSAnand Jain 		seq = read_seqbegin(&fs_info->profiles_lock);
1923de98ced9SMiao Xie 
1924bd45ffbcSAnand Jain 		all_avail = fs_info->avail_data_alloc_bits |
1925bd45ffbcSAnand Jain 			    fs_info->avail_system_alloc_bits |
1926bd45ffbcSAnand Jain 			    fs_info->avail_metadata_alloc_bits;
1927bd45ffbcSAnand Jain 	} while (read_seqretry(&fs_info->profiles_lock, seq));
1928f1fa7f26SAnand Jain 
1929418775a2SDavid Sterba 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
193041a6e891SAnand Jain 		if (!(all_avail & btrfs_raid_array[i].bg_flag))
1931418775a2SDavid Sterba 			continue;
1932a061fc8dSChris Mason 
1933418775a2SDavid Sterba 		if (num_devices < btrfs_raid_array[i].devs_min) {
1934f9fbcaa2SAnand Jain 			int ret = btrfs_raid_array[i].mindev_error;
1935a061fc8dSChris Mason 
1936418775a2SDavid Sterba 			if (ret)
1937418775a2SDavid Sterba 				return ret;
193853b381b3SDavid Woodhouse 		}
1939bd45ffbcSAnand Jain 	}
1940bd45ffbcSAnand Jain 
1941bd45ffbcSAnand Jain 	return 0;
1942f1fa7f26SAnand Jain }
1943f1fa7f26SAnand Jain 
1944c9162bdfSOmar Sandoval static struct btrfs_device * btrfs_find_next_active_device(
1945c9162bdfSOmar Sandoval 		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
194688acff64SAnand Jain {
194788acff64SAnand Jain 	struct btrfs_device *next_device;
194888acff64SAnand Jain 
194988acff64SAnand Jain 	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
195088acff64SAnand Jain 		if (next_device != device &&
1951e6e674bdSAnand Jain 		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1952e6e674bdSAnand Jain 		    && next_device->bdev)
195388acff64SAnand Jain 			return next_device;
195488acff64SAnand Jain 	}
195588acff64SAnand Jain 
195688acff64SAnand Jain 	return NULL;
195788acff64SAnand Jain }
195888acff64SAnand Jain 
195988acff64SAnand Jain /*
196088acff64SAnand Jain  * Helper function to check if the given device is part of s_bdev / latest_bdev
196188acff64SAnand Jain  * and replace it with the provided or the next active device, in the context
196288acff64SAnand Jain  * where this function called, there should be always be another device (or
196388acff64SAnand Jain  * this_dev) which is active.
196488acff64SAnand Jain  */
1965b105e927SDavid Sterba void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
1966e493e8f9SAnand Jain 					    struct btrfs_device *next_device)
196788acff64SAnand Jain {
1968d6507cf1SNikolay Borisov 	struct btrfs_fs_info *fs_info = device->fs_info;
196988acff64SAnand Jain 
1970e493e8f9SAnand Jain 	if (!next_device)
197188acff64SAnand Jain 		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
197288acff64SAnand Jain 							    device);
197388acff64SAnand Jain 	ASSERT(next_device);
197488acff64SAnand Jain 
197588acff64SAnand Jain 	if (fs_info->sb->s_bdev &&
197688acff64SAnand Jain 			(fs_info->sb->s_bdev == device->bdev))
197788acff64SAnand Jain 		fs_info->sb->s_bdev = next_device->bdev;
197888acff64SAnand Jain 
197988acff64SAnand Jain 	if (fs_info->fs_devices->latest_bdev == device->bdev)
198088acff64SAnand Jain 		fs_info->fs_devices->latest_bdev = next_device->bdev;
198188acff64SAnand Jain }
198288acff64SAnand Jain 
19831da73967SAnand Jain /*
19841da73967SAnand Jain  * Return btrfs_fs_devices::num_devices excluding the device that's being
19851da73967SAnand Jain  * currently replaced.
19861da73967SAnand Jain  */
19871da73967SAnand Jain static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
19881da73967SAnand Jain {
19891da73967SAnand Jain 	u64 num_devices = fs_info->fs_devices->num_devices;
19901da73967SAnand Jain 
1991cb5583ddSDavid Sterba 	down_read(&fs_info->dev_replace.rwsem);
19921da73967SAnand Jain 	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
19931da73967SAnand Jain 		ASSERT(num_devices > 1);
19941da73967SAnand Jain 		num_devices--;
19951da73967SAnand Jain 	}
1996cb5583ddSDavid Sterba 	up_read(&fs_info->dev_replace.rwsem);
19971da73967SAnand Jain 
19981da73967SAnand Jain 	return num_devices;
19991da73967SAnand Jain }
20001da73967SAnand Jain 
2001313b0858SJosef Bacik void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
20028f32380dSJohannes Thumshirn 			       struct block_device *bdev,
20036fbceb9fSJohannes Thumshirn 			       const char *device_path)
20046fbceb9fSJohannes Thumshirn {
20056fbceb9fSJohannes Thumshirn 	struct btrfs_super_block *disk_super;
20066fbceb9fSJohannes Thumshirn 	int copy_num;
20076fbceb9fSJohannes Thumshirn 
20086fbceb9fSJohannes Thumshirn 	if (!bdev)
20096fbceb9fSJohannes Thumshirn 		return;
20106fbceb9fSJohannes Thumshirn 
20116fbceb9fSJohannes Thumshirn 	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
20128f32380dSJohannes Thumshirn 		struct page *page;
20138f32380dSJohannes Thumshirn 		int ret;
20148f32380dSJohannes Thumshirn 
20158f32380dSJohannes Thumshirn 		disk_super = btrfs_read_dev_one_super(bdev, copy_num);
20168f32380dSJohannes Thumshirn 		if (IS_ERR(disk_super))
20176fbceb9fSJohannes Thumshirn 			continue;
20186fbceb9fSJohannes Thumshirn 
20196fbceb9fSJohannes Thumshirn 		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
20208f32380dSJohannes Thumshirn 
20218f32380dSJohannes Thumshirn 		page = virt_to_page(disk_super);
20228f32380dSJohannes Thumshirn 		set_page_dirty(page);
20238f32380dSJohannes Thumshirn 		lock_page(page);
20248f32380dSJohannes Thumshirn 		/* write_on_page() unlocks the page */
20258f32380dSJohannes Thumshirn 		ret = write_one_page(page);
20268f32380dSJohannes Thumshirn 		if (ret)
20278f32380dSJohannes Thumshirn 			btrfs_warn(fs_info,
20288f32380dSJohannes Thumshirn 				"error clearing superblock number %d (%d)",
20298f32380dSJohannes Thumshirn 				copy_num, ret);
20308f32380dSJohannes Thumshirn 		btrfs_release_disk_super(disk_super);
20318f32380dSJohannes Thumshirn 
20326fbceb9fSJohannes Thumshirn 	}
20336fbceb9fSJohannes Thumshirn 
20346fbceb9fSJohannes Thumshirn 	/* Notify udev that device has changed */
20356fbceb9fSJohannes Thumshirn 	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
20366fbceb9fSJohannes Thumshirn 
20376fbceb9fSJohannes Thumshirn 	/* Update ctime/mtime for device path for libblkid */
20386fbceb9fSJohannes Thumshirn 	update_dev_time(device_path);
20396fbceb9fSJohannes Thumshirn }
20406fbceb9fSJohannes Thumshirn 
2041da353f6bSDavid Sterba int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
2042da353f6bSDavid Sterba 		    u64 devid)
2043f1fa7f26SAnand Jain {
2044f1fa7f26SAnand Jain 	struct btrfs_device *device;
2045f1fa7f26SAnand Jain 	struct btrfs_fs_devices *cur_devices;
2046b5185197SAnand Jain 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2047f1fa7f26SAnand Jain 	u64 num_devices;
2048f1fa7f26SAnand Jain 	int ret = 0;
2049f1fa7f26SAnand Jain 
2050f1fa7f26SAnand Jain 	mutex_lock(&uuid_mutex);
2051a061fc8dSChris Mason 
20521da73967SAnand Jain 	num_devices = btrfs_num_devices(fs_info);
2053a061fc8dSChris Mason 
20540b246afaSJeff Mahoney 	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
2055beaf8ab3SStefan Behrens 	if (ret)
2056a061fc8dSChris Mason 		goto out;
2057f1fa7f26SAnand Jain 
2058a27a94c2SNikolay Borisov 	device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
2059a27a94c2SNikolay Borisov 
2060a27a94c2SNikolay Borisov 	if (IS_ERR(device)) {
2061a27a94c2SNikolay Borisov 		if (PTR_ERR(device) == -ENOENT &&
2062a27a94c2SNikolay Borisov 		    strcmp(device_path, "missing") == 0)
2063a27a94c2SNikolay Borisov 			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2064a27a94c2SNikolay Borisov 		else
2065a27a94c2SNikolay Borisov 			ret = PTR_ERR(device);
2066a061fc8dSChris Mason 		goto out;
2067a27a94c2SNikolay Borisov 	}
20682b82032cSYan Zheng 
2069eede2bf3SOmar Sandoval 	if (btrfs_pinned_by_swapfile(fs_info, device)) {
2070eede2bf3SOmar Sandoval 		btrfs_warn_in_rcu(fs_info,
2071eede2bf3SOmar Sandoval 		  "cannot remove device %s (devid %llu) due to active swapfile",
2072eede2bf3SOmar Sandoval 				  rcu_str_deref(device->name), device->devid);
2073eede2bf3SOmar Sandoval 		ret = -ETXTBSY;
2074eede2bf3SOmar Sandoval 		goto out;
2075eede2bf3SOmar Sandoval 	}
2076eede2bf3SOmar Sandoval 
2077401e29c1SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2078183860f6SAnand Jain 		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
207924fc572fSAnand Jain 		goto out;
208063a212abSStefan Behrens 	}
208163a212abSStefan Behrens 
2082ebbede42SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
2083ebbede42SAnand Jain 	    fs_info->fs_devices->rw_devices == 1) {
2084183860f6SAnand Jain 		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
208524fc572fSAnand Jain 		goto out;
20862b82032cSYan Zheng 	}
20872b82032cSYan Zheng 
2088ebbede42SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
208934441361SDavid Sterba 		mutex_lock(&fs_info->chunk_mutex);
20902b82032cSYan Zheng 		list_del_init(&device->dev_alloc_list);
2091c3929c36SMiao Xie 		device->fs_devices->rw_devices--;
209234441361SDavid Sterba 		mutex_unlock(&fs_info->chunk_mutex);
20932b82032cSYan Zheng 	}
2094a061fc8dSChris Mason 
2095d7901554SCarey Underwood 	mutex_unlock(&uuid_mutex);
2096a061fc8dSChris Mason 	ret = btrfs_shrink_device(device, 0);
209766d204a1SFilipe Manana 	if (!ret)
209866d204a1SFilipe Manana 		btrfs_reada_remove_dev(device);
2099d7901554SCarey Underwood 	mutex_lock(&uuid_mutex);
2100a061fc8dSChris Mason 	if (ret)
21019b3517e9SIlya Dryomov 		goto error_undo;
2102a061fc8dSChris Mason 
210363a212abSStefan Behrens 	/*
210463a212abSStefan Behrens 	 * TODO: the superblock still includes this device in its num_devices
210563a212abSStefan Behrens 	 * counter although write_all_supers() is not locked out. This
210663a212abSStefan Behrens 	 * could give a filesystem state which requires a degraded mount.
210763a212abSStefan Behrens 	 */
2108f331a952SDavid Sterba 	ret = btrfs_rm_dev_item(device);
2109a061fc8dSChris Mason 	if (ret)
21109b3517e9SIlya Dryomov 		goto error_undo;
2111a061fc8dSChris Mason 
2112e12c9621SAnand Jain 	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2113163e97eeSDavid Sterba 	btrfs_scrub_cancel_dev(device);
2114e5e9a520SChris Mason 
2115e5e9a520SChris Mason 	/*
2116e5e9a520SChris Mason 	 * the device list mutex makes sure that we don't change
2117e5e9a520SChris Mason 	 * the device list while someone else is writing out all
2118d7306801SFilipe David Borba Manana 	 * the device supers. Whoever is writing all supers, should
2119d7306801SFilipe David Borba Manana 	 * lock the device list mutex before getting the number of
2120d7306801SFilipe David Borba Manana 	 * devices in the super block (super_copy). Conversely,
2121d7306801SFilipe David Borba Manana 	 * whoever updates the number of devices in the super block
2122d7306801SFilipe David Borba Manana 	 * (super_copy) should hold the device list mutex.
2123e5e9a520SChris Mason 	 */
21241f78160cSXiao Guangrong 
212541a52a0fSAnand Jain 	/*
212641a52a0fSAnand Jain 	 * In normal cases the cur_devices == fs_devices. But in case
212741a52a0fSAnand Jain 	 * of deleting a seed device, the cur_devices should point to
212841a52a0fSAnand Jain 	 * its own fs_devices listed under the fs_devices->seed.
212941a52a0fSAnand Jain 	 */
21301f78160cSXiao Guangrong 	cur_devices = device->fs_devices;
2131b5185197SAnand Jain 	mutex_lock(&fs_devices->device_list_mutex);
21321f78160cSXiao Guangrong 	list_del_rcu(&device->dev_list);
2133e5e9a520SChris Mason 
213441a52a0fSAnand Jain 	cur_devices->num_devices--;
213541a52a0fSAnand Jain 	cur_devices->total_devices--;
2136b4993e64SAnand Jain 	/* Update total_devices of the parent fs_devices if it's seed */
2137b4993e64SAnand Jain 	if (cur_devices != fs_devices)
2138b4993e64SAnand Jain 		fs_devices->total_devices--;
21392b82032cSYan Zheng 
2140e6e674bdSAnand Jain 	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
214141a52a0fSAnand Jain 		cur_devices->missing_devices--;
2142cd02dca5SChris Mason 
2143d6507cf1SNikolay Borisov 	btrfs_assign_next_active_device(device, NULL);
21442b82032cSYan Zheng 
21450bfaa9c5SEric Sandeen 	if (device->bdev) {
214641a52a0fSAnand Jain 		cur_devices->open_devices--;
214799994cdeSAnand Jain 		/* remove sysfs entry */
214853f8a74cSAnand Jain 		btrfs_sysfs_remove_device(device);
21490bfaa9c5SEric Sandeen 	}
215099994cdeSAnand Jain 
21510b246afaSJeff Mahoney 	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
21520b246afaSJeff Mahoney 	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2153b5185197SAnand Jain 	mutex_unlock(&fs_devices->device_list_mutex);
2154e4404d6eSYan Zheng 
2155cea67ab9SJeff Mahoney 	/*
2156cea67ab9SJeff Mahoney 	 * at this point, the device is zero sized and detached from
2157cea67ab9SJeff Mahoney 	 * the devices list.  All that's left is to zero out the old
2158cea67ab9SJeff Mahoney 	 * supers and free the device.
2159cea67ab9SJeff Mahoney 	 */
2160ebbede42SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
21618f32380dSJohannes Thumshirn 		btrfs_scratch_superblocks(fs_info, device->bdev,
21628f32380dSJohannes Thumshirn 					  device->name->str);
2163cea67ab9SJeff Mahoney 
2164cea67ab9SJeff Mahoney 	btrfs_close_bdev(device);
21658e75fd89SNikolay Borisov 	synchronize_rcu();
21668e75fd89SNikolay Borisov 	btrfs_free_device(device);
2167cea67ab9SJeff Mahoney 
21681f78160cSXiao Guangrong 	if (cur_devices->open_devices == 0) {
2169944d3f9fSNikolay Borisov 		list_del_init(&cur_devices->seed_list);
21700226e0ebSAnand Jain 		close_fs_devices(cur_devices);
21711f78160cSXiao Guangrong 		free_fs_devices(cur_devices);
21722b82032cSYan Zheng 	}
21732b82032cSYan Zheng 
2174a061fc8dSChris Mason out:
2175a061fc8dSChris Mason 	mutex_unlock(&uuid_mutex);
2176a061fc8dSChris Mason 	return ret;
217724fc572fSAnand Jain 
21789b3517e9SIlya Dryomov error_undo:
217966d204a1SFilipe Manana 	btrfs_reada_undo_remove_dev(device);
2180ebbede42SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
218134441361SDavid Sterba 		mutex_lock(&fs_info->chunk_mutex);
21829b3517e9SIlya Dryomov 		list_add(&device->dev_alloc_list,
2183b5185197SAnand Jain 			 &fs_devices->alloc_list);
2184c3929c36SMiao Xie 		device->fs_devices->rw_devices++;
218534441361SDavid Sterba 		mutex_unlock(&fs_info->chunk_mutex);
21869b3517e9SIlya Dryomov 	}
218724fc572fSAnand Jain 	goto out;
2188a061fc8dSChris Mason }
2189a061fc8dSChris Mason 
219068a9db5fSNikolay Borisov void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2191e93c89c1SStefan Behrens {
2192d51908ceSAnand Jain 	struct btrfs_fs_devices *fs_devices;
2193d51908ceSAnand Jain 
219468a9db5fSNikolay Borisov 	lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
21951357272fSIlya Dryomov 
219625e8e911SAnand Jain 	/*
219725e8e911SAnand Jain 	 * in case of fs with no seed, srcdev->fs_devices will point
219825e8e911SAnand Jain 	 * to fs_devices of fs_info. However when the dev being replaced is
219925e8e911SAnand Jain 	 * a seed dev it will point to the seed's local fs_devices. In short
220025e8e911SAnand Jain 	 * srcdev will have its correct fs_devices in both the cases.
220125e8e911SAnand Jain 	 */
220225e8e911SAnand Jain 	fs_devices = srcdev->fs_devices;
2203d51908ceSAnand Jain 
2204e93c89c1SStefan Behrens 	list_del_rcu(&srcdev->dev_list);
2205619c47f3SDavid Sterba 	list_del(&srcdev->dev_alloc_list);
2206d51908ceSAnand Jain 	fs_devices->num_devices--;
2207e6e674bdSAnand Jain 	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2208d51908ceSAnand Jain 		fs_devices->missing_devices--;
2209e93c89c1SStefan Behrens 
2210ebbede42SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
221182372bc8SMiao Xie 		fs_devices->rw_devices--;
22121357272fSIlya Dryomov 
221382372bc8SMiao Xie 	if (srcdev->bdev)
221482372bc8SMiao Xie 		fs_devices->open_devices--;
2215084b6e7cSQu Wenruo }
2216084b6e7cSQu Wenruo 
221765237ee3SDavid Sterba void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2218084b6e7cSQu Wenruo {
2219084b6e7cSQu Wenruo 	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
222082372bc8SMiao Xie 
2221a466c85eSJosef Bacik 	mutex_lock(&uuid_mutex);
2222a466c85eSJosef Bacik 
222314238819SAnand Jain 	btrfs_close_bdev(srcdev);
22248e75fd89SNikolay Borisov 	synchronize_rcu();
22258e75fd89SNikolay Borisov 	btrfs_free_device(srcdev);
222694d5f0c2SAnand Jain 
222794d5f0c2SAnand Jain 	/* if this is no devs we rather delete the fs_devices */
222894d5f0c2SAnand Jain 	if (!fs_devices->num_devices) {
22296dd38f81SAnand Jain 		/*
22306dd38f81SAnand Jain 		 * On a mounted FS, num_devices can't be zero unless it's a
22316dd38f81SAnand Jain 		 * seed. In case of a seed device being replaced, the replace
22326dd38f81SAnand Jain 		 * target added to the sprout FS, so there will be no more
22336dd38f81SAnand Jain 		 * device left under the seed FS.
22346dd38f81SAnand Jain 		 */
22356dd38f81SAnand Jain 		ASSERT(fs_devices->seeding);
22366dd38f81SAnand Jain 
2237944d3f9fSNikolay Borisov 		list_del_init(&fs_devices->seed_list);
22380226e0ebSAnand Jain 		close_fs_devices(fs_devices);
22398bef8401SAnand Jain 		free_fs_devices(fs_devices);
224094d5f0c2SAnand Jain 	}
2241a466c85eSJosef Bacik 	mutex_unlock(&uuid_mutex);
2242e93c89c1SStefan Behrens }
2243e93c89c1SStefan Behrens 
22444f5ad7bdSNikolay Borisov void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2245e93c89c1SStefan Behrens {
22464f5ad7bdSNikolay Borisov 	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2247d2ff1b20SAnand Jain 
2248d9a071f0SAnand Jain 	mutex_lock(&fs_devices->device_list_mutex);
2249d9a071f0SAnand Jain 
225053f8a74cSAnand Jain 	btrfs_sysfs_remove_device(tgtdev);
2251d2ff1b20SAnand Jain 
2252779bf3feSAnand Jain 	if (tgtdev->bdev)
2253d9a071f0SAnand Jain 		fs_devices->open_devices--;
2254779bf3feSAnand Jain 
2255d9a071f0SAnand Jain 	fs_devices->num_devices--;
2256e93c89c1SStefan Behrens 
2257d6507cf1SNikolay Borisov 	btrfs_assign_next_active_device(tgtdev, NULL);
2258e93c89c1SStefan Behrens 
2259e93c89c1SStefan Behrens 	list_del_rcu(&tgtdev->dev_list);
2260e93c89c1SStefan Behrens 
2261d9a071f0SAnand Jain 	mutex_unlock(&fs_devices->device_list_mutex);
2262779bf3feSAnand Jain 
2263779bf3feSAnand Jain 	/*
2264779bf3feSAnand Jain 	 * The update_dev_time() with in btrfs_scratch_superblocks()
2265779bf3feSAnand Jain 	 * may lead to a call to btrfs_show_devname() which will try
2266779bf3feSAnand Jain 	 * to hold device_list_mutex. And here this device
2267779bf3feSAnand Jain 	 * is already out of device list, so we don't have to hold
2268779bf3feSAnand Jain 	 * the device_list_mutex lock.
2269779bf3feSAnand Jain 	 */
22708f32380dSJohannes Thumshirn 	btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
22718f32380dSJohannes Thumshirn 				  tgtdev->name->str);
227214238819SAnand Jain 
227314238819SAnand Jain 	btrfs_close_bdev(tgtdev);
22748e75fd89SNikolay Borisov 	synchronize_rcu();
22758e75fd89SNikolay Borisov 	btrfs_free_device(tgtdev);
2276e93c89c1SStefan Behrens }
2277e93c89c1SStefan Behrens 
2278b444ad46SNikolay Borisov static struct btrfs_device *btrfs_find_device_by_path(
2279b444ad46SNikolay Borisov 		struct btrfs_fs_info *fs_info, const char *device_path)
22807ba15b7dSStefan Behrens {
22817ba15b7dSStefan Behrens 	int ret = 0;
22827ba15b7dSStefan Behrens 	struct btrfs_super_block *disk_super;
22837ba15b7dSStefan Behrens 	u64 devid;
22847ba15b7dSStefan Behrens 	u8 *dev_uuid;
22857ba15b7dSStefan Behrens 	struct block_device *bdev;
2286b444ad46SNikolay Borisov 	struct btrfs_device *device;
22877ba15b7dSStefan Behrens 
22887ba15b7dSStefan Behrens 	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
22898f32380dSJohannes Thumshirn 				    fs_info->bdev_holder, 0, &bdev, &disk_super);
22907ba15b7dSStefan Behrens 	if (ret)
2291b444ad46SNikolay Borisov 		return ERR_PTR(ret);
22928f32380dSJohannes Thumshirn 
22937ba15b7dSStefan Behrens 	devid = btrfs_stack_device_id(&disk_super->dev_item);
22947ba15b7dSStefan Behrens 	dev_uuid = disk_super->dev_item.uuid;
22957239ff4bSNikolay Borisov 	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2296e4319cd9SAnand Jain 		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
229709ba3bc9SAnand Jain 					   disk_super->metadata_uuid, true);
22987239ff4bSNikolay Borisov 	else
2299e4319cd9SAnand Jain 		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
230009ba3bc9SAnand Jain 					   disk_super->fsid, true);
23017239ff4bSNikolay Borisov 
23028f32380dSJohannes Thumshirn 	btrfs_release_disk_super(disk_super);
2303b444ad46SNikolay Borisov 	if (!device)
2304b444ad46SNikolay Borisov 		device = ERR_PTR(-ENOENT);
23057ba15b7dSStefan Behrens 	blkdev_put(bdev, FMODE_READ);
2306b444ad46SNikolay Borisov 	return device;
23077ba15b7dSStefan Behrens }
23087ba15b7dSStefan Behrens 
23092b82032cSYan Zheng /*
23105c5c0df0SDavid Sterba  * Lookup a device given by device id, or the path if the id is 0.
23115c5c0df0SDavid Sterba  */
2312a27a94c2SNikolay Borisov struct btrfs_device *btrfs_find_device_by_devspec(
23136e927cebSAnand Jain 		struct btrfs_fs_info *fs_info, u64 devid,
23146e927cebSAnand Jain 		const char *device_path)
231524e0474bSAnand Jain {
2316a27a94c2SNikolay Borisov 	struct btrfs_device *device;
231724e0474bSAnand Jain 
23185c5c0df0SDavid Sterba 	if (devid) {
2319e4319cd9SAnand Jain 		device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
232009ba3bc9SAnand Jain 					   NULL, true);
2321a27a94c2SNikolay Borisov 		if (!device)
2322a27a94c2SNikolay Borisov 			return ERR_PTR(-ENOENT);
23236e927cebSAnand Jain 		return device;
23246e927cebSAnand Jain 	}
23256e927cebSAnand Jain 
23266e927cebSAnand Jain 	if (!device_path || !device_path[0])
2327a27a94c2SNikolay Borisov 		return ERR_PTR(-EINVAL);
2328d95a830cSAnand Jain 
23296e927cebSAnand Jain 	if (strcmp(device_path, "missing") == 0) {
23306e927cebSAnand Jain 		/* Find first missing device */
2331d95a830cSAnand Jain 		list_for_each_entry(device, &fs_info->fs_devices->devices,
2332d95a830cSAnand Jain 				    dev_list) {
2333d95a830cSAnand Jain 			if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
23346e927cebSAnand Jain 				     &device->dev_state) && !device->bdev)
2335d95a830cSAnand Jain 				return device;
2336d95a830cSAnand Jain 		}
2337d95a830cSAnand Jain 		return ERR_PTR(-ENOENT);
2338d95a830cSAnand Jain 	}
23396e927cebSAnand Jain 
23406e927cebSAnand Jain 	return btrfs_find_device_by_path(fs_info, device_path);
234124e0474bSAnand Jain }
234224e0474bSAnand Jain 
23432b82032cSYan Zheng /*
23442b82032cSYan Zheng  * does all the dirty work required for changing file system's UUID.
23452b82032cSYan Zheng  */
23462ff7e61eSJeff Mahoney static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
23472b82032cSYan Zheng {
23480b246afaSJeff Mahoney 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
23492b82032cSYan Zheng 	struct btrfs_fs_devices *old_devices;
2350e4404d6eSYan Zheng 	struct btrfs_fs_devices *seed_devices;
23510b246afaSJeff Mahoney 	struct btrfs_super_block *disk_super = fs_info->super_copy;
23522b82032cSYan Zheng 	struct btrfs_device *device;
23532b82032cSYan Zheng 	u64 super_flags;
23542b82032cSYan Zheng 
2355a32bf9a3SDavid Sterba 	lockdep_assert_held(&uuid_mutex);
2356e4404d6eSYan Zheng 	if (!fs_devices->seeding)
23572b82032cSYan Zheng 		return -EINVAL;
23582b82032cSYan Zheng 
2359427c8fddSNikolay Borisov 	/*
2360427c8fddSNikolay Borisov 	 * Private copy of the seed devices, anchored at
2361427c8fddSNikolay Borisov 	 * fs_info->fs_devices->seed_list
2362427c8fddSNikolay Borisov 	 */
23637239ff4bSNikolay Borisov 	seed_devices = alloc_fs_devices(NULL, NULL);
23642208a378SIlya Dryomov 	if (IS_ERR(seed_devices))
23652208a378SIlya Dryomov 		return PTR_ERR(seed_devices);
23662b82032cSYan Zheng 
2367427c8fddSNikolay Borisov 	/*
2368427c8fddSNikolay Borisov 	 * It's necessary to retain a copy of the original seed fs_devices in
2369427c8fddSNikolay Borisov 	 * fs_uuids so that filesystems which have been seeded can successfully
2370427c8fddSNikolay Borisov 	 * reference the seed device from open_seed_devices. This also supports
2371427c8fddSNikolay Borisov 	 * multiple fs seed.
2372427c8fddSNikolay Borisov 	 */
2373e4404d6eSYan Zheng 	old_devices = clone_fs_devices(fs_devices);
2374e4404d6eSYan Zheng 	if (IS_ERR(old_devices)) {
2375e4404d6eSYan Zheng 		kfree(seed_devices);
2376e4404d6eSYan Zheng 		return PTR_ERR(old_devices);
23772b82032cSYan Zheng 	}
2378e4404d6eSYan Zheng 
2379c4babc5eSAnand Jain 	list_add(&old_devices->fs_list, &fs_uuids);
23802b82032cSYan Zheng 
2381e4404d6eSYan Zheng 	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2382e4404d6eSYan Zheng 	seed_devices->opened = 1;
2383e4404d6eSYan Zheng 	INIT_LIST_HEAD(&seed_devices->devices);
2384e4404d6eSYan Zheng 	INIT_LIST_HEAD(&seed_devices->alloc_list);
2385e5e9a520SChris Mason 	mutex_init(&seed_devices->device_list_mutex);
2386c9513edbSXiao Guangrong 
2387321a4bf7SAnand Jain 	mutex_lock(&fs_devices->device_list_mutex);
23881f78160cSXiao Guangrong 	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
23891f78160cSXiao Guangrong 			      synchronize_rcu);
23902196d6e8SMiao Xie 	list_for_each_entry(device, &seed_devices->devices, dev_list)
2391e4404d6eSYan Zheng 		device->fs_devices = seed_devices;
23922196d6e8SMiao Xie 
23930395d84fSJohannes Thumshirn 	fs_devices->seeding = false;
23942b82032cSYan Zheng 	fs_devices->num_devices = 0;
23952b82032cSYan Zheng 	fs_devices->open_devices = 0;
239669611ac8SMiao Xie 	fs_devices->missing_devices = 0;
23977f0432d0SJohannes Thumshirn 	fs_devices->rotating = false;
2398944d3f9fSNikolay Borisov 	list_add(&seed_devices->seed_list, &fs_devices->seed_list);
23992b82032cSYan Zheng 
24002b82032cSYan Zheng 	generate_random_uuid(fs_devices->fsid);
24017239ff4bSNikolay Borisov 	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
24022b82032cSYan Zheng 	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2403321a4bf7SAnand Jain 	mutex_unlock(&fs_devices->device_list_mutex);
2404f7171750SFilipe David Borba Manana 
24052b82032cSYan Zheng 	super_flags = btrfs_super_flags(disk_super) &
24062b82032cSYan Zheng 		      ~BTRFS_SUPER_FLAG_SEEDING;
24072b82032cSYan Zheng 	btrfs_set_super_flags(disk_super, super_flags);
24082b82032cSYan Zheng 
24092b82032cSYan Zheng 	return 0;
24102b82032cSYan Zheng }
24112b82032cSYan Zheng 
24122b82032cSYan Zheng /*
241301327610SNicholas D Steeves  * Store the expected generation for seed devices in device items.
24142b82032cSYan Zheng  */
24155c466629SDavid Sterba static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
24162b82032cSYan Zheng {
24175c466629SDavid Sterba 	struct btrfs_fs_info *fs_info = trans->fs_info;
24185b4aacefSJeff Mahoney 	struct btrfs_root *root = fs_info->chunk_root;
24192b82032cSYan Zheng 	struct btrfs_path *path;
24202b82032cSYan Zheng 	struct extent_buffer *leaf;
24212b82032cSYan Zheng 	struct btrfs_dev_item *dev_item;
24222b82032cSYan Zheng 	struct btrfs_device *device;
24232b82032cSYan Zheng 	struct btrfs_key key;
242444880fdcSAnand Jain 	u8 fs_uuid[BTRFS_FSID_SIZE];
24252b82032cSYan Zheng 	u8 dev_uuid[BTRFS_UUID_SIZE];
24262b82032cSYan Zheng 	u64 devid;
24272b82032cSYan Zheng 	int ret;
24282b82032cSYan Zheng 
24292b82032cSYan Zheng 	path = btrfs_alloc_path();
24302b82032cSYan Zheng 	if (!path)
24312b82032cSYan Zheng 		return -ENOMEM;
24322b82032cSYan Zheng 
24332b82032cSYan Zheng 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
24342b82032cSYan Zheng 	key.offset = 0;
24352b82032cSYan Zheng 	key.type = BTRFS_DEV_ITEM_KEY;
24362b82032cSYan Zheng 
24372b82032cSYan Zheng 	while (1) {
24382b82032cSYan Zheng 		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
24392b82032cSYan Zheng 		if (ret < 0)
24402b82032cSYan Zheng 			goto error;
24412b82032cSYan Zheng 
24422b82032cSYan Zheng 		leaf = path->nodes[0];
24432b82032cSYan Zheng next_slot:
24442b82032cSYan Zheng 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
24452b82032cSYan Zheng 			ret = btrfs_next_leaf(root, path);
24462b82032cSYan Zheng 			if (ret > 0)
24472b82032cSYan Zheng 				break;
24482b82032cSYan Zheng 			if (ret < 0)
24492b82032cSYan Zheng 				goto error;
24502b82032cSYan Zheng 			leaf = path->nodes[0];
24512b82032cSYan Zheng 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2452b3b4aa74SDavid Sterba 			btrfs_release_path(path);
24532b82032cSYan Zheng 			continue;
24542b82032cSYan Zheng 		}
24552b82032cSYan Zheng 
24562b82032cSYan Zheng 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
24572b82032cSYan Zheng 		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
24582b82032cSYan Zheng 		    key.type != BTRFS_DEV_ITEM_KEY)
24592b82032cSYan Zheng 			break;
24602b82032cSYan Zheng 
24612b82032cSYan Zheng 		dev_item = btrfs_item_ptr(leaf, path->slots[0],
24622b82032cSYan Zheng 					  struct btrfs_dev_item);
24632b82032cSYan Zheng 		devid = btrfs_device_id(leaf, dev_item);
2464410ba3a2SGeert Uytterhoeven 		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
24652b82032cSYan Zheng 				   BTRFS_UUID_SIZE);
24661473b24eSGeert Uytterhoeven 		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
246744880fdcSAnand Jain 				   BTRFS_FSID_SIZE);
2468e4319cd9SAnand Jain 		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
246909ba3bc9SAnand Jain 					   fs_uuid, true);
247079787eaaSJeff Mahoney 		BUG_ON(!device); /* Logic error */
24712b82032cSYan Zheng 
24722b82032cSYan Zheng 		if (device->fs_devices->seeding) {
24732b82032cSYan Zheng 			btrfs_set_device_generation(leaf, dev_item,
24742b82032cSYan Zheng 						    device->generation);
24752b82032cSYan Zheng 			btrfs_mark_buffer_dirty(leaf);
24762b82032cSYan Zheng 		}
24772b82032cSYan Zheng 
24782b82032cSYan Zheng 		path->slots[0]++;
24792b82032cSYan Zheng 		goto next_slot;
24802b82032cSYan Zheng 	}
24812b82032cSYan Zheng 	ret = 0;
24822b82032cSYan Zheng error:
24832b82032cSYan Zheng 	btrfs_free_path(path);
24842b82032cSYan Zheng 	return ret;
24852b82032cSYan Zheng }
24862b82032cSYan Zheng 
2487da353f6bSDavid Sterba int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2488788f20ebSChris Mason {
24895112febbSJeff Mahoney 	struct btrfs_root *root = fs_info->dev_root;
2490d5e2003cSJosef Bacik 	struct request_queue *q;
2491788f20ebSChris Mason 	struct btrfs_trans_handle *trans;
2492788f20ebSChris Mason 	struct btrfs_device *device;
2493788f20ebSChris Mason 	struct block_device *bdev;
24940b246afaSJeff Mahoney 	struct super_block *sb = fs_info->sb;
2495606686eeSJosef Bacik 	struct rcu_string *name;
24965da54bc1SAnand Jain 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
249739379faaSNaohiro Aota 	u64 orig_super_total_bytes;
249839379faaSNaohiro Aota 	u64 orig_super_num_devices;
24992b82032cSYan Zheng 	int seeding_dev = 0;
2500788f20ebSChris Mason 	int ret = 0;
250144cab9baSNikolay Borisov 	bool locked = false;
2502788f20ebSChris Mason 
25035da54bc1SAnand Jain 	if (sb_rdonly(sb) && !fs_devices->seeding)
2504f8c5d0b4SLiu Bo 		return -EROFS;
2505788f20ebSChris Mason 
2506a5d16333SLi Zefan 	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
25070b246afaSJeff Mahoney 				  fs_info->bdev_holder);
25087f59203aSJosef Bacik 	if (IS_ERR(bdev))
25097f59203aSJosef Bacik 		return PTR_ERR(bdev);
2510a2135011SChris Mason 
25115da54bc1SAnand Jain 	if (fs_devices->seeding) {
25122b82032cSYan Zheng 		seeding_dev = 1;
25132b82032cSYan Zheng 		down_write(&sb->s_umount);
25142b82032cSYan Zheng 		mutex_lock(&uuid_mutex);
251544cab9baSNikolay Borisov 		locked = true;
25162b82032cSYan Zheng 	}
25172b82032cSYan Zheng 
2518b9ba017fSNikolay Borisov 	sync_blockdev(bdev);
2519a2135011SChris Mason 
2520f4cfa9bdSNikolay Borisov 	rcu_read_lock();
2521f4cfa9bdSNikolay Borisov 	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
2522788f20ebSChris Mason 		if (device->bdev == bdev) {
2523788f20ebSChris Mason 			ret = -EEXIST;
2524f4cfa9bdSNikolay Borisov 			rcu_read_unlock();
25252b82032cSYan Zheng 			goto error;
2526788f20ebSChris Mason 		}
2527788f20ebSChris Mason 	}
2528f4cfa9bdSNikolay Borisov 	rcu_read_unlock();
2529788f20ebSChris Mason 
25300b246afaSJeff Mahoney 	device = btrfs_alloc_device(fs_info, NULL, NULL);
253112bd2fc0SIlya Dryomov 	if (IS_ERR(device)) {
2532788f20ebSChris Mason 		/* we can safely leave the fs_devices entry around */
253312bd2fc0SIlya Dryomov 		ret = PTR_ERR(device);
25342b82032cSYan Zheng 		goto error;
2535788f20ebSChris Mason 	}
2536788f20ebSChris Mason 
253778f2c9e6SDavid Sterba 	name = rcu_string_strdup(device_path, GFP_KERNEL);
2538606686eeSJosef Bacik 	if (!name) {
25392b82032cSYan Zheng 		ret = -ENOMEM;
25405c4cf6c9SDavid Sterba 		goto error_free_device;
2541788f20ebSChris Mason 	}
2542606686eeSJosef Bacik 	rcu_assign_pointer(device->name, name);
25432b82032cSYan Zheng 
2544a22285a6SYan, Zheng 	trans = btrfs_start_transaction(root, 0);
254598d5dc13STsutomu Itoh 	if (IS_ERR(trans)) {
254698d5dc13STsutomu Itoh 		ret = PTR_ERR(trans);
25475c4cf6c9SDavid Sterba 		goto error_free_device;
254898d5dc13STsutomu Itoh 	}
254998d5dc13STsutomu Itoh 
2550d5e2003cSJosef Bacik 	q = bdev_get_queue(bdev);
2551ebbede42SAnand Jain 	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
25522b82032cSYan Zheng 	device->generation = trans->transid;
25530b246afaSJeff Mahoney 	device->io_width = fs_info->sectorsize;
25540b246afaSJeff Mahoney 	device->io_align = fs_info->sectorsize;
25550b246afaSJeff Mahoney 	device->sector_size = fs_info->sectorsize;
25567dfb8be1SNikolay Borisov 	device->total_bytes = round_down(i_size_read(bdev->bd_inode),
25577dfb8be1SNikolay Borisov 					 fs_info->sectorsize);
25582cc3c559SYan Zheng 	device->disk_total_bytes = device->total_bytes;
2559935e5cc9SMiao Xie 	device->commit_total_bytes = device->total_bytes;
2560fb456252SJeff Mahoney 	device->fs_info = fs_info;
2561788f20ebSChris Mason 	device->bdev = bdev;
2562e12c9621SAnand Jain 	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2563401e29c1SAnand Jain 	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2564fb01aa85SIlya Dryomov 	device->mode = FMODE_EXCL;
256527087f37SStefan Behrens 	device->dev_stats_valid = 1;
25669f6d2510SDavid Sterba 	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2567325cd4baSZheng Yan 
25682b82032cSYan Zheng 	if (seeding_dev) {
25691751e8a6SLinus Torvalds 		sb->s_flags &= ~SB_RDONLY;
25702ff7e61eSJeff Mahoney 		ret = btrfs_prepare_sprout(fs_info);
2571d31c32f6SAnand Jain 		if (ret) {
2572d31c32f6SAnand Jain 			btrfs_abort_transaction(trans, ret);
2573d31c32f6SAnand Jain 			goto error_trans;
2574d31c32f6SAnand Jain 		}
25752b82032cSYan Zheng 	}
25762b82032cSYan Zheng 
25775da54bc1SAnand Jain 	device->fs_devices = fs_devices;
2578e5e9a520SChris Mason 
25795da54bc1SAnand Jain 	mutex_lock(&fs_devices->device_list_mutex);
258034441361SDavid Sterba 	mutex_lock(&fs_info->chunk_mutex);
25815da54bc1SAnand Jain 	list_add_rcu(&device->dev_list, &fs_devices->devices);
25825da54bc1SAnand Jain 	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
25835da54bc1SAnand Jain 	fs_devices->num_devices++;
25845da54bc1SAnand Jain 	fs_devices->open_devices++;
25855da54bc1SAnand Jain 	fs_devices->rw_devices++;
25865da54bc1SAnand Jain 	fs_devices->total_devices++;
25875da54bc1SAnand Jain 	fs_devices->total_rw_bytes += device->total_bytes;
25882b82032cSYan Zheng 
2589a5ed45f8SNikolay Borisov 	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
25902bf64758SJosef Bacik 
2591e884f4f0SAnand Jain 	if (!blk_queue_nonrot(q))
25927f0432d0SJohannes Thumshirn 		fs_devices->rotating = true;
2593c289811cSChris Mason 
259439379faaSNaohiro Aota 	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
25950b246afaSJeff Mahoney 	btrfs_set_super_total_bytes(fs_info->super_copy,
259639379faaSNaohiro Aota 		round_down(orig_super_total_bytes + device->total_bytes,
259739379faaSNaohiro Aota 			   fs_info->sectorsize));
2598788f20ebSChris Mason 
259939379faaSNaohiro Aota 	orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
260039379faaSNaohiro Aota 	btrfs_set_super_num_devices(fs_info->super_copy,
260139379faaSNaohiro Aota 				    orig_super_num_devices + 1);
26020d39376aSAnand Jain 
26032196d6e8SMiao Xie 	/*
26042196d6e8SMiao Xie 	 * we've got more storage, clear any full flags on the space
26052196d6e8SMiao Xie 	 * infos
26062196d6e8SMiao Xie 	 */
26070b246afaSJeff Mahoney 	btrfs_clear_space_info_full(fs_info);
26082196d6e8SMiao Xie 
260934441361SDavid Sterba 	mutex_unlock(&fs_info->chunk_mutex);
2610ca10845aSJosef Bacik 
2611ca10845aSJosef Bacik 	/* Add sysfs device entry */
2612cd36da2eSAnand Jain 	btrfs_sysfs_add_device(device);
2613ca10845aSJosef Bacik 
26145da54bc1SAnand Jain 	mutex_unlock(&fs_devices->device_list_mutex);
2615788f20ebSChris Mason 
26162b82032cSYan Zheng 	if (seeding_dev) {
261734441361SDavid Sterba 		mutex_lock(&fs_info->chunk_mutex);
26186f8e0fc7SDavid Sterba 		ret = init_first_rw_device(trans);
261934441361SDavid Sterba 		mutex_unlock(&fs_info->chunk_mutex);
2620005d6427SDavid Sterba 		if (ret) {
262166642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
2622d31c32f6SAnand Jain 			goto error_sysfs;
2623005d6427SDavid Sterba 		}
26242196d6e8SMiao Xie 	}
26252196d6e8SMiao Xie 
26268e87e856SNikolay Borisov 	ret = btrfs_add_dev_item(trans, device);
26272196d6e8SMiao Xie 	if (ret) {
262866642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
2629d31c32f6SAnand Jain 		goto error_sysfs;
26302196d6e8SMiao Xie 	}
26312196d6e8SMiao Xie 
26322196d6e8SMiao Xie 	if (seeding_dev) {
26335c466629SDavid Sterba 		ret = btrfs_finish_sprout(trans);
2634005d6427SDavid Sterba 		if (ret) {
263566642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
2636d31c32f6SAnand Jain 			goto error_sysfs;
2637005d6427SDavid Sterba 		}
2638b2373f25SAnand Jain 
26398e560081SNikolay Borisov 		/*
26408e560081SNikolay Borisov 		 * fs_devices now represents the newly sprouted filesystem and
26418e560081SNikolay Borisov 		 * its fsid has been changed by btrfs_prepare_sprout
26428e560081SNikolay Borisov 		 */
26438e560081SNikolay Borisov 		btrfs_sysfs_update_sprout_fsid(fs_devices);
2644005d6427SDavid Sterba 	}
26452b82032cSYan Zheng 
26463a45bb20SJeff Mahoney 	ret = btrfs_commit_transaction(trans);
26472b82032cSYan Zheng 
26482b82032cSYan Zheng 	if (seeding_dev) {
26492b82032cSYan Zheng 		mutex_unlock(&uuid_mutex);
26502b82032cSYan Zheng 		up_write(&sb->s_umount);
265144cab9baSNikolay Borisov 		locked = false;
26522b82032cSYan Zheng 
265379787eaaSJeff Mahoney 		if (ret) /* transaction commit */
265479787eaaSJeff Mahoney 			return ret;
265579787eaaSJeff Mahoney 
26562ff7e61eSJeff Mahoney 		ret = btrfs_relocate_sys_chunks(fs_info);
265779787eaaSJeff Mahoney 		if (ret < 0)
26580b246afaSJeff Mahoney 			btrfs_handle_fs_error(fs_info, ret,
26595d163e0eSJeff Mahoney 				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2660671415b7SMiao Xie 		trans = btrfs_attach_transaction(root);
2661671415b7SMiao Xie 		if (IS_ERR(trans)) {
2662671415b7SMiao Xie 			if (PTR_ERR(trans) == -ENOENT)
2663671415b7SMiao Xie 				return 0;
26647132a262SAnand Jain 			ret = PTR_ERR(trans);
26657132a262SAnand Jain 			trans = NULL;
26667132a262SAnand Jain 			goto error_sysfs;
2667671415b7SMiao Xie 		}
26683a45bb20SJeff Mahoney 		ret = btrfs_commit_transaction(trans);
26692b82032cSYan Zheng 	}
2670c9e9f97bSIlya Dryomov 
26717f551d96SAnand Jain 	/*
26727f551d96SAnand Jain 	 * Now that we have written a new super block to this device, check all
26737f551d96SAnand Jain 	 * other fs_devices list if device_path alienates any other scanned
26747f551d96SAnand Jain 	 * device.
26757f551d96SAnand Jain 	 * We can ignore the return value as it typically returns -EINVAL and
26767f551d96SAnand Jain 	 * only succeeds if the device was an alien.
26777f551d96SAnand Jain 	 */
26787f551d96SAnand Jain 	btrfs_forget_devices(device_path);
26797f551d96SAnand Jain 
26807f551d96SAnand Jain 	/* Update ctime/mtime for blkid or udev */
26815a1972bdSQu Wenruo 	update_dev_time(device_path);
26827f551d96SAnand Jain 
2683788f20ebSChris Mason 	return ret;
268479787eaaSJeff Mahoney 
2685d31c32f6SAnand Jain error_sysfs:
268653f8a74cSAnand Jain 	btrfs_sysfs_remove_device(device);
268739379faaSNaohiro Aota 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
268839379faaSNaohiro Aota 	mutex_lock(&fs_info->chunk_mutex);
268939379faaSNaohiro Aota 	list_del_rcu(&device->dev_list);
269039379faaSNaohiro Aota 	list_del(&device->dev_alloc_list);
269139379faaSNaohiro Aota 	fs_info->fs_devices->num_devices--;
269239379faaSNaohiro Aota 	fs_info->fs_devices->open_devices--;
269339379faaSNaohiro Aota 	fs_info->fs_devices->rw_devices--;
269439379faaSNaohiro Aota 	fs_info->fs_devices->total_devices--;
269539379faaSNaohiro Aota 	fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
269639379faaSNaohiro Aota 	atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
269739379faaSNaohiro Aota 	btrfs_set_super_total_bytes(fs_info->super_copy,
269839379faaSNaohiro Aota 				    orig_super_total_bytes);
269939379faaSNaohiro Aota 	btrfs_set_super_num_devices(fs_info->super_copy,
270039379faaSNaohiro Aota 				    orig_super_num_devices);
270139379faaSNaohiro Aota 	mutex_unlock(&fs_info->chunk_mutex);
270239379faaSNaohiro Aota 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
270379787eaaSJeff Mahoney error_trans:
27040af2c4bfSAnand Jain 	if (seeding_dev)
27051751e8a6SLinus Torvalds 		sb->s_flags |= SB_RDONLY;
27067132a262SAnand Jain 	if (trans)
27073a45bb20SJeff Mahoney 		btrfs_end_transaction(trans);
27085c4cf6c9SDavid Sterba error_free_device:
2709a425f9d4SDavid Sterba 	btrfs_free_device(device);
27102b82032cSYan Zheng error:
2711e525fd89STejun Heo 	blkdev_put(bdev, FMODE_EXCL);
271244cab9baSNikolay Borisov 	if (locked) {
27132b82032cSYan Zheng 		mutex_unlock(&uuid_mutex);
27142b82032cSYan Zheng 		up_write(&sb->s_umount);
27152b82032cSYan Zheng 	}
2716c9e9f97bSIlya Dryomov 	return ret;
2717788f20ebSChris Mason }
2718788f20ebSChris Mason 
2719d397712bSChris Mason static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
27200b86a832SChris Mason 					struct btrfs_device *device)
27210b86a832SChris Mason {
27220b86a832SChris Mason 	int ret;
27230b86a832SChris Mason 	struct btrfs_path *path;
27240b246afaSJeff Mahoney 	struct btrfs_root *root = device->fs_info->chunk_root;
27250b86a832SChris Mason 	struct btrfs_dev_item *dev_item;
27260b86a832SChris Mason 	struct extent_buffer *leaf;
27270b86a832SChris Mason 	struct btrfs_key key;
27280b86a832SChris Mason 
27290b86a832SChris Mason 	path = btrfs_alloc_path();
27300b86a832SChris Mason 	if (!path)
27310b86a832SChris Mason 		return -ENOMEM;
27320b86a832SChris Mason 
27330b86a832SChris Mason 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
27340b86a832SChris Mason 	key.type = BTRFS_DEV_ITEM_KEY;
27350b86a832SChris Mason 	key.offset = device->devid;
27360b86a832SChris Mason 
27370b86a832SChris Mason 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
27380b86a832SChris Mason 	if (ret < 0)
27390b86a832SChris Mason 		goto out;
27400b86a832SChris Mason 
27410b86a832SChris Mason 	if (ret > 0) {
27420b86a832SChris Mason 		ret = -ENOENT;
27430b86a832SChris Mason 		goto out;
27440b86a832SChris Mason 	}
27450b86a832SChris Mason 
27460b86a832SChris Mason 	leaf = path->nodes[0];
27470b86a832SChris Mason 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
27480b86a832SChris Mason 
27490b86a832SChris Mason 	btrfs_set_device_id(leaf, dev_item, device->devid);
27500b86a832SChris Mason 	btrfs_set_device_type(leaf, dev_item, device->type);
27510b86a832SChris Mason 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
27520b86a832SChris Mason 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
27530b86a832SChris Mason 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
27547cc8e58dSMiao Xie 	btrfs_set_device_total_bytes(leaf, dev_item,
27557cc8e58dSMiao Xie 				     btrfs_device_get_disk_total_bytes(device));
27567cc8e58dSMiao Xie 	btrfs_set_device_bytes_used(leaf, dev_item,
27577cc8e58dSMiao Xie 				    btrfs_device_get_bytes_used(device));
27580b86a832SChris Mason 	btrfs_mark_buffer_dirty(leaf);
27590b86a832SChris Mason 
27600b86a832SChris Mason out:
27610b86a832SChris Mason 	btrfs_free_path(path);
27620b86a832SChris Mason 	return ret;
27630b86a832SChris Mason }
27640b86a832SChris Mason 
27652196d6e8SMiao Xie int btrfs_grow_device(struct btrfs_trans_handle *trans,
27668f18cf13SChris Mason 		      struct btrfs_device *device, u64 new_size)
27678f18cf13SChris Mason {
27680b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = device->fs_info;
27690b246afaSJeff Mahoney 	struct btrfs_super_block *super_copy = fs_info->super_copy;
27702196d6e8SMiao Xie 	u64 old_total;
27712196d6e8SMiao Xie 	u64 diff;
27728f18cf13SChris Mason 
2773ebbede42SAnand Jain 	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
27742b82032cSYan Zheng 		return -EACCES;
27752196d6e8SMiao Xie 
27767dfb8be1SNikolay Borisov 	new_size = round_down(new_size, fs_info->sectorsize);
27777dfb8be1SNikolay Borisov 
277834441361SDavid Sterba 	mutex_lock(&fs_info->chunk_mutex);
27792196d6e8SMiao Xie 	old_total = btrfs_super_total_bytes(super_copy);
27800e4324a4SNikolay Borisov 	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
27812196d6e8SMiao Xie 
278263a212abSStefan Behrens 	if (new_size <= device->total_bytes ||
2783401e29c1SAnand Jain 	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
278434441361SDavid Sterba 		mutex_unlock(&fs_info->chunk_mutex);
27852b82032cSYan Zheng 		return -EINVAL;
27862196d6e8SMiao Xie 	}
27872b82032cSYan Zheng 
27887dfb8be1SNikolay Borisov 	btrfs_set_super_total_bytes(super_copy,
27897dfb8be1SNikolay Borisov 			round_down(old_total + diff, fs_info->sectorsize));
27902b82032cSYan Zheng 	device->fs_devices->total_rw_bytes += diff;
27912b82032cSYan Zheng 
27927cc8e58dSMiao Xie 	btrfs_device_set_total_bytes(device, new_size);
27937cc8e58dSMiao Xie 	btrfs_device_set_disk_total_bytes(device, new_size);
2794fb456252SJeff Mahoney 	btrfs_clear_space_info_full(device->fs_info);
2795bbbf7243SNikolay Borisov 	if (list_empty(&device->post_commit_list))
2796bbbf7243SNikolay Borisov 		list_add_tail(&device->post_commit_list,
2797bbbf7243SNikolay Borisov 			      &trans->transaction->dev_update_list);
279834441361SDavid Sterba 	mutex_unlock(&fs_info->chunk_mutex);
27994184ea7fSChris Mason 
28008f18cf13SChris Mason 	return btrfs_update_device(trans, device);
28018f18cf13SChris Mason }
28028f18cf13SChris Mason 
2803f4208794SNikolay Borisov static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
28048f18cf13SChris Mason {
2805f4208794SNikolay Borisov 	struct btrfs_fs_info *fs_info = trans->fs_info;
28065b4aacefSJeff Mahoney 	struct btrfs_root *root = fs_info->chunk_root;
28078f18cf13SChris Mason 	int ret;
28088f18cf13SChris Mason 	struct btrfs_path *path;
28098f18cf13SChris Mason 	struct btrfs_key key;
28108f18cf13SChris Mason 
28118f18cf13SChris Mason 	path = btrfs_alloc_path();
28128f18cf13SChris Mason 	if (!path)
28138f18cf13SChris Mason 		return -ENOMEM;
28148f18cf13SChris Mason 
2815408fbf19SNikolay Borisov 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
28168f18cf13SChris Mason 	key.offset = chunk_offset;
28178f18cf13SChris Mason 	key.type = BTRFS_CHUNK_ITEM_KEY;
28188f18cf13SChris Mason 
28198f18cf13SChris Mason 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
282079787eaaSJeff Mahoney 	if (ret < 0)
282179787eaaSJeff Mahoney 		goto out;
282279787eaaSJeff Mahoney 	else if (ret > 0) { /* Logic error or corruption */
28230b246afaSJeff Mahoney 		btrfs_handle_fs_error(fs_info, -ENOENT,
282479787eaaSJeff Mahoney 				      "Failed lookup while freeing chunk.");
282579787eaaSJeff Mahoney 		ret = -ENOENT;
282679787eaaSJeff Mahoney 		goto out;
282779787eaaSJeff Mahoney 	}
28288f18cf13SChris Mason 
28298f18cf13SChris Mason 	ret = btrfs_del_item(trans, root, path);
283079787eaaSJeff Mahoney 	if (ret < 0)
28310b246afaSJeff Mahoney 		btrfs_handle_fs_error(fs_info, ret,
283279787eaaSJeff Mahoney 				      "Failed to delete chunk item.");
283379787eaaSJeff Mahoney out:
28348f18cf13SChris Mason 	btrfs_free_path(path);
283565a246c5STsutomu Itoh 	return ret;
28368f18cf13SChris Mason }
28378f18cf13SChris Mason 
2838408fbf19SNikolay Borisov static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
28398f18cf13SChris Mason {
28400b246afaSJeff Mahoney 	struct btrfs_super_block *super_copy = fs_info->super_copy;
28418f18cf13SChris Mason 	struct btrfs_disk_key *disk_key;
28428f18cf13SChris Mason 	struct btrfs_chunk *chunk;
28438f18cf13SChris Mason 	u8 *ptr;
28448f18cf13SChris Mason 	int ret = 0;
28458f18cf13SChris Mason 	u32 num_stripes;
28468f18cf13SChris Mason 	u32 array_size;
28478f18cf13SChris Mason 	u32 len = 0;
28488f18cf13SChris Mason 	u32 cur;
28498f18cf13SChris Mason 	struct btrfs_key key;
28508f18cf13SChris Mason 
285134441361SDavid Sterba 	mutex_lock(&fs_info->chunk_mutex);
28528f18cf13SChris Mason 	array_size = btrfs_super_sys_array_size(super_copy);
28538f18cf13SChris Mason 
28548f18cf13SChris Mason 	ptr = super_copy->sys_chunk_array;
28558f18cf13SChris Mason 	cur = 0;
28568f18cf13SChris Mason 
28578f18cf13SChris Mason 	while (cur < array_size) {
28588f18cf13SChris Mason 		disk_key = (struct btrfs_disk_key *)ptr;
28598f18cf13SChris Mason 		btrfs_disk_key_to_cpu(&key, disk_key);
28608f18cf13SChris Mason 
28618f18cf13SChris Mason 		len = sizeof(*disk_key);
28628f18cf13SChris Mason 
28638f18cf13SChris Mason 		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
28648f18cf13SChris Mason 			chunk = (struct btrfs_chunk *)(ptr + len);
28658f18cf13SChris Mason 			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
28668f18cf13SChris Mason 			len += btrfs_chunk_item_size(num_stripes);
28678f18cf13SChris Mason 		} else {
28688f18cf13SChris Mason 			ret = -EIO;
28698f18cf13SChris Mason 			break;
28708f18cf13SChris Mason 		}
2871408fbf19SNikolay Borisov 		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
28728f18cf13SChris Mason 		    key.offset == chunk_offset) {
28738f18cf13SChris Mason 			memmove(ptr, ptr + len, array_size - (cur + len));
28748f18cf13SChris Mason 			array_size -= len;
28758f18cf13SChris Mason 			btrfs_set_super_sys_array_size(super_copy, array_size);
28768f18cf13SChris Mason 		} else {
28778f18cf13SChris Mason 			ptr += len;
28788f18cf13SChris Mason 			cur += len;
28798f18cf13SChris Mason 		}
28808f18cf13SChris Mason 	}
288134441361SDavid Sterba 	mutex_unlock(&fs_info->chunk_mutex);
28828f18cf13SChris Mason 	return ret;
28838f18cf13SChris Mason }
28848f18cf13SChris Mason 
288560ca842eSOmar Sandoval /*
288660ca842eSOmar Sandoval  * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
288760ca842eSOmar Sandoval  * @logical: Logical block offset in bytes.
288860ca842eSOmar Sandoval  * @length: Length of extent in bytes.
288960ca842eSOmar Sandoval  *
289060ca842eSOmar Sandoval  * Return: Chunk mapping or ERR_PTR.
289160ca842eSOmar Sandoval  */
289260ca842eSOmar Sandoval struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2893592d92eeSLiu Bo 				       u64 logical, u64 length)
2894592d92eeSLiu Bo {
2895592d92eeSLiu Bo 	struct extent_map_tree *em_tree;
2896592d92eeSLiu Bo 	struct extent_map *em;
2897592d92eeSLiu Bo 
2898c8bf1b67SDavid Sterba 	em_tree = &fs_info->mapping_tree;
2899592d92eeSLiu Bo 	read_lock(&em_tree->lock);
2900592d92eeSLiu Bo 	em = lookup_extent_mapping(em_tree, logical, length);
2901592d92eeSLiu Bo 	read_unlock(&em_tree->lock);
2902592d92eeSLiu Bo 
2903592d92eeSLiu Bo 	if (!em) {
2904592d92eeSLiu Bo 		btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2905592d92eeSLiu Bo 			   logical, length);
2906592d92eeSLiu Bo 		return ERR_PTR(-EINVAL);
2907592d92eeSLiu Bo 	}
2908592d92eeSLiu Bo 
2909592d92eeSLiu Bo 	if (em->start > logical || em->start + em->len < logical) {
2910592d92eeSLiu Bo 		btrfs_crit(fs_info,
2911592d92eeSLiu Bo 			   "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2912592d92eeSLiu Bo 			   logical, length, em->start, em->start + em->len);
2913592d92eeSLiu Bo 		free_extent_map(em);
2914592d92eeSLiu Bo 		return ERR_PTR(-EINVAL);
2915592d92eeSLiu Bo 	}
2916592d92eeSLiu Bo 
2917592d92eeSLiu Bo 	/* callers are responsible for dropping em's ref. */
2918592d92eeSLiu Bo 	return em;
2919592d92eeSLiu Bo }
2920592d92eeSLiu Bo 
292197aff912SNikolay Borisov int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
292247ab2a6cSJosef Bacik {
292397aff912SNikolay Borisov 	struct btrfs_fs_info *fs_info = trans->fs_info;
292447ab2a6cSJosef Bacik 	struct extent_map *em;
292547ab2a6cSJosef Bacik 	struct map_lookup *map;
292647ab2a6cSJosef Bacik 	u64 dev_extent_len = 0;
292747ab2a6cSJosef Bacik 	int i, ret = 0;
29280b246afaSJeff Mahoney 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
292947ab2a6cSJosef Bacik 
293060ca842eSOmar Sandoval 	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
2931592d92eeSLiu Bo 	if (IS_ERR(em)) {
293247ab2a6cSJosef Bacik 		/*
293347ab2a6cSJosef Bacik 		 * This is a logic error, but we don't want to just rely on the
2934bb7ab3b9SAdam Buchbinder 		 * user having built with ASSERT enabled, so if ASSERT doesn't
293547ab2a6cSJosef Bacik 		 * do anything we still error out.
293647ab2a6cSJosef Bacik 		 */
293747ab2a6cSJosef Bacik 		ASSERT(0);
2938592d92eeSLiu Bo 		return PTR_ERR(em);
293947ab2a6cSJosef Bacik 	}
294095617d69SJeff Mahoney 	map = em->map_lookup;
294134441361SDavid Sterba 	mutex_lock(&fs_info->chunk_mutex);
2942451a2c13SNikolay Borisov 	check_system_chunk(trans, map->type);
294334441361SDavid Sterba 	mutex_unlock(&fs_info->chunk_mutex);
294447ab2a6cSJosef Bacik 
294557ba4cb8SFilipe Manana 	/*
294657ba4cb8SFilipe Manana 	 * Take the device list mutex to prevent races with the final phase of
294757ba4cb8SFilipe Manana 	 * a device replace operation that replaces the device object associated
294857ba4cb8SFilipe Manana 	 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
294957ba4cb8SFilipe Manana 	 */
295057ba4cb8SFilipe Manana 	mutex_lock(&fs_devices->device_list_mutex);
295147ab2a6cSJosef Bacik 	for (i = 0; i < map->num_stripes; i++) {
295247ab2a6cSJosef Bacik 		struct btrfs_device *device = map->stripes[i].dev;
295347ab2a6cSJosef Bacik 		ret = btrfs_free_dev_extent(trans, device,
295447ab2a6cSJosef Bacik 					    map->stripes[i].physical,
295547ab2a6cSJosef Bacik 					    &dev_extent_len);
295647ab2a6cSJosef Bacik 		if (ret) {
295757ba4cb8SFilipe Manana 			mutex_unlock(&fs_devices->device_list_mutex);
295866642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
295947ab2a6cSJosef Bacik 			goto out;
296047ab2a6cSJosef Bacik 		}
296147ab2a6cSJosef Bacik 
296247ab2a6cSJosef Bacik 		if (device->bytes_used > 0) {
296334441361SDavid Sterba 			mutex_lock(&fs_info->chunk_mutex);
296447ab2a6cSJosef Bacik 			btrfs_device_set_bytes_used(device,
296547ab2a6cSJosef Bacik 					device->bytes_used - dev_extent_len);
2966a5ed45f8SNikolay Borisov 			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
29670b246afaSJeff Mahoney 			btrfs_clear_space_info_full(fs_info);
296834441361SDavid Sterba 			mutex_unlock(&fs_info->chunk_mutex);
296947ab2a6cSJosef Bacik 		}
297047ab2a6cSJosef Bacik 
297164bc6c2aSNikolay Borisov 		ret = btrfs_update_device(trans, device);
297247ab2a6cSJosef Bacik 		if (ret) {
297357ba4cb8SFilipe Manana 			mutex_unlock(&fs_devices->device_list_mutex);
297466642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
297547ab2a6cSJosef Bacik 			goto out;
297647ab2a6cSJosef Bacik 		}
297747ab2a6cSJosef Bacik 	}
297857ba4cb8SFilipe Manana 	mutex_unlock(&fs_devices->device_list_mutex);
297957ba4cb8SFilipe Manana 
2980f4208794SNikolay Borisov 	ret = btrfs_free_chunk(trans, chunk_offset);
298147ab2a6cSJosef Bacik 	if (ret) {
298266642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
298347ab2a6cSJosef Bacik 		goto out;
298447ab2a6cSJosef Bacik 	}
298547ab2a6cSJosef Bacik 
29866bccf3abSJeff Mahoney 	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
298747ab2a6cSJosef Bacik 
298847ab2a6cSJosef Bacik 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2989408fbf19SNikolay Borisov 		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
299047ab2a6cSJosef Bacik 		if (ret) {
299166642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
299247ab2a6cSJosef Bacik 			goto out;
299347ab2a6cSJosef Bacik 		}
299447ab2a6cSJosef Bacik 	}
299547ab2a6cSJosef Bacik 
29965a98ec01SNikolay Borisov 	ret = btrfs_remove_block_group(trans, chunk_offset, em);
299747ab2a6cSJosef Bacik 	if (ret) {
299866642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
299947ab2a6cSJosef Bacik 		goto out;
300047ab2a6cSJosef Bacik 	}
300147ab2a6cSJosef Bacik 
300247ab2a6cSJosef Bacik out:
300347ab2a6cSJosef Bacik 	/* once for us */
300447ab2a6cSJosef Bacik 	free_extent_map(em);
30058f18cf13SChris Mason 	return ret;
30068f18cf13SChris Mason }
30078f18cf13SChris Mason 
30085b4aacefSJeff Mahoney static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
30098f18cf13SChris Mason {
30105b4aacefSJeff Mahoney 	struct btrfs_root *root = fs_info->chunk_root;
301119c4d2f9SChris Mason 	struct btrfs_trans_handle *trans;
3012b0643e59SDennis Zhou 	struct btrfs_block_group *block_group;
30138f18cf13SChris Mason 	int ret;
30148f18cf13SChris Mason 
301567c5e7d4SFilipe Manana 	/*
301667c5e7d4SFilipe Manana 	 * Prevent races with automatic removal of unused block groups.
301767c5e7d4SFilipe Manana 	 * After we relocate and before we remove the chunk with offset
301867c5e7d4SFilipe Manana 	 * chunk_offset, automatic removal of the block group can kick in,
301967c5e7d4SFilipe Manana 	 * resulting in a failure when calling btrfs_remove_chunk() below.
302067c5e7d4SFilipe Manana 	 *
302167c5e7d4SFilipe Manana 	 * Make sure to acquire this mutex before doing a tree search (dev
302267c5e7d4SFilipe Manana 	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
302367c5e7d4SFilipe Manana 	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
302467c5e7d4SFilipe Manana 	 * we release the path used to search the chunk/dev tree and before
302567c5e7d4SFilipe Manana 	 * the current task acquires this mutex and calls us.
302667c5e7d4SFilipe Manana 	 */
3027a32bf9a3SDavid Sterba 	lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
302867c5e7d4SFilipe Manana 
30298f18cf13SChris Mason 	/* step one, relocate all the extents inside this chunk */
30302ff7e61eSJeff Mahoney 	btrfs_scrub_pause(fs_info);
30310b246afaSJeff Mahoney 	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
30322ff7e61eSJeff Mahoney 	btrfs_scrub_continue(fs_info);
3033a22285a6SYan, Zheng 	if (ret)
3034a22285a6SYan, Zheng 		return ret;
30358f18cf13SChris Mason 
3036b0643e59SDennis Zhou 	block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3037b0643e59SDennis Zhou 	if (!block_group)
3038b0643e59SDennis Zhou 		return -ENOENT;
3039b0643e59SDennis Zhou 	btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3040b0643e59SDennis Zhou 	btrfs_put_block_group(block_group);
3041b0643e59SDennis Zhou 
304219c4d2f9SChris Mason 	trans = btrfs_start_trans_remove_block_group(root->fs_info,
304319c4d2f9SChris Mason 						     chunk_offset);
304419c4d2f9SChris Mason 	if (IS_ERR(trans)) {
304519c4d2f9SChris Mason 		ret = PTR_ERR(trans);
304619c4d2f9SChris Mason 		btrfs_handle_fs_error(root->fs_info, ret, NULL);
304719c4d2f9SChris Mason 		return ret;
304819c4d2f9SChris Mason 	}
30495d8eb6feSNaohiro Aota 
305019c4d2f9SChris Mason 	/*
305119c4d2f9SChris Mason 	 * step two, delete the device extents and the
305219c4d2f9SChris Mason 	 * chunk tree entries
305319c4d2f9SChris Mason 	 */
305497aff912SNikolay Borisov 	ret = btrfs_remove_chunk(trans, chunk_offset);
30553a45bb20SJeff Mahoney 	btrfs_end_transaction(trans);
305619c4d2f9SChris Mason 	return ret;
30578f18cf13SChris Mason }
30588f18cf13SChris Mason 
30592ff7e61eSJeff Mahoney static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
30602b82032cSYan Zheng {
30610b246afaSJeff Mahoney 	struct btrfs_root *chunk_root = fs_info->chunk_root;
30622b82032cSYan Zheng 	struct btrfs_path *path;
30632b82032cSYan Zheng 	struct extent_buffer *leaf;
30642b82032cSYan Zheng 	struct btrfs_chunk *chunk;
30652b82032cSYan Zheng 	struct btrfs_key key;
30662b82032cSYan Zheng 	struct btrfs_key found_key;
30672b82032cSYan Zheng 	u64 chunk_type;
3068ba1bf481SJosef Bacik 	bool retried = false;
3069ba1bf481SJosef Bacik 	int failed = 0;
30702b82032cSYan Zheng 	int ret;
30712b82032cSYan Zheng 
30722b82032cSYan Zheng 	path = btrfs_alloc_path();
30732b82032cSYan Zheng 	if (!path)
30742b82032cSYan Zheng 		return -ENOMEM;
30752b82032cSYan Zheng 
3076ba1bf481SJosef Bacik again:
30772b82032cSYan Zheng 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
30782b82032cSYan Zheng 	key.offset = (u64)-1;
30792b82032cSYan Zheng 	key.type = BTRFS_CHUNK_ITEM_KEY;
30802b82032cSYan Zheng 
30812b82032cSYan Zheng 	while (1) {
30820b246afaSJeff Mahoney 		mutex_lock(&fs_info->delete_unused_bgs_mutex);
30832b82032cSYan Zheng 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
308467c5e7d4SFilipe Manana 		if (ret < 0) {
30850b246afaSJeff Mahoney 			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
30862b82032cSYan Zheng 			goto error;
308767c5e7d4SFilipe Manana 		}
308879787eaaSJeff Mahoney 		BUG_ON(ret == 0); /* Corruption */
30892b82032cSYan Zheng 
30902b82032cSYan Zheng 		ret = btrfs_previous_item(chunk_root, path, key.objectid,
30912b82032cSYan Zheng 					  key.type);
309267c5e7d4SFilipe Manana 		if (ret)
30930b246afaSJeff Mahoney 			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
30942b82032cSYan Zheng 		if (ret < 0)
30952b82032cSYan Zheng 			goto error;
30962b82032cSYan Zheng 		if (ret > 0)
30972b82032cSYan Zheng 			break;
30982b82032cSYan Zheng 
30992b82032cSYan Zheng 		leaf = path->nodes[0];
31002b82032cSYan Zheng 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
31012b82032cSYan Zheng 
31022b82032cSYan Zheng 		chunk = btrfs_item_ptr(leaf, path->slots[0],
31032b82032cSYan Zheng 				       struct btrfs_chunk);
31042b82032cSYan Zheng 		chunk_type = btrfs_chunk_type(leaf, chunk);
3105b3b4aa74SDavid Sterba 		btrfs_release_path(path);
31062b82032cSYan Zheng 
31072b82032cSYan Zheng 		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
31080b246afaSJeff Mahoney 			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3109ba1bf481SJosef Bacik 			if (ret == -ENOSPC)
3110ba1bf481SJosef Bacik 				failed++;
311114586651SHIMANGI SARAOGI 			else
311214586651SHIMANGI SARAOGI 				BUG_ON(ret);
31132b82032cSYan Zheng 		}
31140b246afaSJeff Mahoney 		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
31152b82032cSYan Zheng 
31162b82032cSYan Zheng 		if (found_key.offset == 0)
31172b82032cSYan Zheng 			break;
31182b82032cSYan Zheng 		key.offset = found_key.offset - 1;
31192b82032cSYan Zheng 	}
31202b82032cSYan Zheng 	ret = 0;
3121ba1bf481SJosef Bacik 	if (failed && !retried) {
3122ba1bf481SJosef Bacik 		failed = 0;
3123ba1bf481SJosef Bacik 		retried = true;
3124ba1bf481SJosef Bacik 		goto again;
3125fae7f21cSDulshani Gunawardhana 	} else if (WARN_ON(failed && retried)) {
3126ba1bf481SJosef Bacik 		ret = -ENOSPC;
3127ba1bf481SJosef Bacik 	}
31282b82032cSYan Zheng error:
31292b82032cSYan Zheng 	btrfs_free_path(path);
31302b82032cSYan Zheng 	return ret;
31312b82032cSYan Zheng }
31322b82032cSYan Zheng 
3133a6f93c71SLiu Bo /*
3134a6f93c71SLiu Bo  * return 1 : allocate a data chunk successfully,
3135a6f93c71SLiu Bo  * return <0: errors during allocating a data chunk,
3136a6f93c71SLiu Bo  * return 0 : no need to allocate a data chunk.
3137a6f93c71SLiu Bo  */
3138a6f93c71SLiu Bo static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3139a6f93c71SLiu Bo 				      u64 chunk_offset)
3140a6f93c71SLiu Bo {
314132da5386SDavid Sterba 	struct btrfs_block_group *cache;
3142a6f93c71SLiu Bo 	u64 bytes_used;
3143a6f93c71SLiu Bo 	u64 chunk_type;
3144a6f93c71SLiu Bo 
3145a6f93c71SLiu Bo 	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3146a6f93c71SLiu Bo 	ASSERT(cache);
3147a6f93c71SLiu Bo 	chunk_type = cache->flags;
3148a6f93c71SLiu Bo 	btrfs_put_block_group(cache);
3149a6f93c71SLiu Bo 
31505ae21692SJohannes Thumshirn 	if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
31515ae21692SJohannes Thumshirn 		return 0;
31525ae21692SJohannes Thumshirn 
3153a6f93c71SLiu Bo 	spin_lock(&fs_info->data_sinfo->lock);
3154a6f93c71SLiu Bo 	bytes_used = fs_info->data_sinfo->bytes_used;
3155a6f93c71SLiu Bo 	spin_unlock(&fs_info->data_sinfo->lock);
3156a6f93c71SLiu Bo 
3157a6f93c71SLiu Bo 	if (!bytes_used) {
3158a6f93c71SLiu Bo 		struct btrfs_trans_handle *trans;
3159a6f93c71SLiu Bo 		int ret;
3160a6f93c71SLiu Bo 
3161a6f93c71SLiu Bo 		trans =	btrfs_join_transaction(fs_info->tree_root);
3162a6f93c71SLiu Bo 		if (IS_ERR(trans))
3163a6f93c71SLiu Bo 			return PTR_ERR(trans);
3164a6f93c71SLiu Bo 
31655ae21692SJohannes Thumshirn 		ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3166a6f93c71SLiu Bo 		btrfs_end_transaction(trans);
3167a6f93c71SLiu Bo 		if (ret < 0)
3168a6f93c71SLiu Bo 			return ret;
3169a6f93c71SLiu Bo 		return 1;
3170a6f93c71SLiu Bo 	}
31715ae21692SJohannes Thumshirn 
3172a6f93c71SLiu Bo 	return 0;
3173a6f93c71SLiu Bo }
3174a6f93c71SLiu Bo 
31756bccf3abSJeff Mahoney static int insert_balance_item(struct btrfs_fs_info *fs_info,
31760940ebf6SIlya Dryomov 			       struct btrfs_balance_control *bctl)
31770940ebf6SIlya Dryomov {
31786bccf3abSJeff Mahoney 	struct btrfs_root *root = fs_info->tree_root;
31790940ebf6SIlya Dryomov 	struct btrfs_trans_handle *trans;
31800940ebf6SIlya Dryomov 	struct btrfs_balance_item *item;
31810940ebf6SIlya Dryomov 	struct btrfs_disk_balance_args disk_bargs;
31820940ebf6SIlya Dryomov 	struct btrfs_path *path;
31830940ebf6SIlya Dryomov 	struct extent_buffer *leaf;
31840940ebf6SIlya Dryomov 	struct btrfs_key key;
31850940ebf6SIlya Dryomov 	int ret, err;
31860940ebf6SIlya Dryomov 
31870940ebf6SIlya Dryomov 	path = btrfs_alloc_path();
31880940ebf6SIlya Dryomov 	if (!path)
31890940ebf6SIlya Dryomov 		return -ENOMEM;
31900940ebf6SIlya Dryomov 
31910940ebf6SIlya Dryomov 	trans = btrfs_start_transaction(root, 0);
31920940ebf6SIlya Dryomov 	if (IS_ERR(trans)) {
31930940ebf6SIlya Dryomov 		btrfs_free_path(path);
31940940ebf6SIlya Dryomov 		return PTR_ERR(trans);
31950940ebf6SIlya Dryomov 	}
31960940ebf6SIlya Dryomov 
31970940ebf6SIlya Dryomov 	key.objectid = BTRFS_BALANCE_OBJECTID;
3198c479cb4fSDavid Sterba 	key.type = BTRFS_TEMPORARY_ITEM_KEY;
31990940ebf6SIlya Dryomov 	key.offset = 0;
32000940ebf6SIlya Dryomov 
32010940ebf6SIlya Dryomov 	ret = btrfs_insert_empty_item(trans, root, path, &key,
32020940ebf6SIlya Dryomov 				      sizeof(*item));
32030940ebf6SIlya Dryomov 	if (ret)
32040940ebf6SIlya Dryomov 		goto out;
32050940ebf6SIlya Dryomov 
32060940ebf6SIlya Dryomov 	leaf = path->nodes[0];
32070940ebf6SIlya Dryomov 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
32080940ebf6SIlya Dryomov 
3209b159fa28SDavid Sterba 	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
32100940ebf6SIlya Dryomov 
32110940ebf6SIlya Dryomov 	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
32120940ebf6SIlya Dryomov 	btrfs_set_balance_data(leaf, item, &disk_bargs);
32130940ebf6SIlya Dryomov 	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
32140940ebf6SIlya Dryomov 	btrfs_set_balance_meta(leaf, item, &disk_bargs);
32150940ebf6SIlya Dryomov 	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
32160940ebf6SIlya Dryomov 	btrfs_set_balance_sys(leaf, item, &disk_bargs);
32170940ebf6SIlya Dryomov 
32180940ebf6SIlya Dryomov 	btrfs_set_balance_flags(leaf, item, bctl->flags);
32190940ebf6SIlya Dryomov 
32200940ebf6SIlya Dryomov 	btrfs_mark_buffer_dirty(leaf);
32210940ebf6SIlya Dryomov out:
32220940ebf6SIlya Dryomov 	btrfs_free_path(path);
32233a45bb20SJeff Mahoney 	err = btrfs_commit_transaction(trans);
32240940ebf6SIlya Dryomov 	if (err && !ret)
32250940ebf6SIlya Dryomov 		ret = err;
32260940ebf6SIlya Dryomov 	return ret;
32270940ebf6SIlya Dryomov }
32280940ebf6SIlya Dryomov 
32296bccf3abSJeff Mahoney static int del_balance_item(struct btrfs_fs_info *fs_info)
32300940ebf6SIlya Dryomov {
32316bccf3abSJeff Mahoney 	struct btrfs_root *root = fs_info->tree_root;
32320940ebf6SIlya Dryomov 	struct btrfs_trans_handle *trans;
32330940ebf6SIlya Dryomov 	struct btrfs_path *path;
32340940ebf6SIlya Dryomov 	struct btrfs_key key;
32350940ebf6SIlya Dryomov 	int ret, err;
32360940ebf6SIlya Dryomov 
32370940ebf6SIlya Dryomov 	path = btrfs_alloc_path();
32380940ebf6SIlya Dryomov 	if (!path)
32390940ebf6SIlya Dryomov 		return -ENOMEM;
32400940ebf6SIlya Dryomov 
32413502a8c0SDavid Sterba 	trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
32420940ebf6SIlya Dryomov 	if (IS_ERR(trans)) {
32430940ebf6SIlya Dryomov 		btrfs_free_path(path);
32440940ebf6SIlya Dryomov 		return PTR_ERR(trans);
32450940ebf6SIlya Dryomov 	}
32460940ebf6SIlya Dryomov 
32470940ebf6SIlya Dryomov 	key.objectid = BTRFS_BALANCE_OBJECTID;
3248c479cb4fSDavid Sterba 	key.type = BTRFS_TEMPORARY_ITEM_KEY;
32490940ebf6SIlya Dryomov 	key.offset = 0;
32500940ebf6SIlya Dryomov 
32510940ebf6SIlya Dryomov 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
32520940ebf6SIlya Dryomov 	if (ret < 0)
32530940ebf6SIlya Dryomov 		goto out;
32540940ebf6SIlya Dryomov 	if (ret > 0) {
32550940ebf6SIlya Dryomov 		ret = -ENOENT;
32560940ebf6SIlya Dryomov 		goto out;
32570940ebf6SIlya Dryomov 	}
32580940ebf6SIlya Dryomov 
32590940ebf6SIlya Dryomov 	ret = btrfs_del_item(trans, root, path);
32600940ebf6SIlya Dryomov out:
32610940ebf6SIlya Dryomov 	btrfs_free_path(path);
32623a45bb20SJeff Mahoney 	err = btrfs_commit_transaction(trans);
32630940ebf6SIlya Dryomov 	if (err && !ret)
32640940ebf6SIlya Dryomov 		ret = err;
32650940ebf6SIlya Dryomov 	return ret;
32660940ebf6SIlya Dryomov }
32670940ebf6SIlya Dryomov 
3268c9e9f97bSIlya Dryomov /*
326959641015SIlya Dryomov  * This is a heuristic used to reduce the number of chunks balanced on
327059641015SIlya Dryomov  * resume after balance was interrupted.
327159641015SIlya Dryomov  */
327259641015SIlya Dryomov static void update_balance_args(struct btrfs_balance_control *bctl)
327359641015SIlya Dryomov {
327459641015SIlya Dryomov 	/*
327559641015SIlya Dryomov 	 * Turn on soft mode for chunk types that were being converted.
327659641015SIlya Dryomov 	 */
327759641015SIlya Dryomov 	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
327859641015SIlya Dryomov 		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
327959641015SIlya Dryomov 	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
328059641015SIlya Dryomov 		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
328159641015SIlya Dryomov 	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
328259641015SIlya Dryomov 		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
328359641015SIlya Dryomov 
328459641015SIlya Dryomov 	/*
328559641015SIlya Dryomov 	 * Turn on usage filter if is not already used.  The idea is
328659641015SIlya Dryomov 	 * that chunks that we have already balanced should be
328759641015SIlya Dryomov 	 * reasonably full.  Don't do it for chunks that are being
328859641015SIlya Dryomov 	 * converted - that will keep us from relocating unconverted
328959641015SIlya Dryomov 	 * (albeit full) chunks.
329059641015SIlya Dryomov 	 */
329159641015SIlya Dryomov 	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3292bc309467SDavid Sterba 	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
329359641015SIlya Dryomov 	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
329459641015SIlya Dryomov 		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
329559641015SIlya Dryomov 		bctl->data.usage = 90;
329659641015SIlya Dryomov 	}
329759641015SIlya Dryomov 	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3298bc309467SDavid Sterba 	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
329959641015SIlya Dryomov 	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
330059641015SIlya Dryomov 		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
330159641015SIlya Dryomov 		bctl->sys.usage = 90;
330259641015SIlya Dryomov 	}
330359641015SIlya Dryomov 	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3304bc309467SDavid Sterba 	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
330559641015SIlya Dryomov 	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
330659641015SIlya Dryomov 		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
330759641015SIlya Dryomov 		bctl->meta.usage = 90;
330859641015SIlya Dryomov 	}
330959641015SIlya Dryomov }
331059641015SIlya Dryomov 
331159641015SIlya Dryomov /*
3312149196a2SDavid Sterba  * Clear the balance status in fs_info and delete the balance item from disk.
3313149196a2SDavid Sterba  */
3314149196a2SDavid Sterba static void reset_balance_state(struct btrfs_fs_info *fs_info)
3315c9e9f97bSIlya Dryomov {
3316c9e9f97bSIlya Dryomov 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3317149196a2SDavid Sterba 	int ret;
3318c9e9f97bSIlya Dryomov 
3319c9e9f97bSIlya Dryomov 	BUG_ON(!fs_info->balance_ctl);
3320c9e9f97bSIlya Dryomov 
3321c9e9f97bSIlya Dryomov 	spin_lock(&fs_info->balance_lock);
3322c9e9f97bSIlya Dryomov 	fs_info->balance_ctl = NULL;
3323c9e9f97bSIlya Dryomov 	spin_unlock(&fs_info->balance_lock);
3324c9e9f97bSIlya Dryomov 
3325c9e9f97bSIlya Dryomov 	kfree(bctl);
3326149196a2SDavid Sterba 	ret = del_balance_item(fs_info);
3327149196a2SDavid Sterba 	if (ret)
3328149196a2SDavid Sterba 		btrfs_handle_fs_error(fs_info, ret, NULL);
3329c9e9f97bSIlya Dryomov }
3330c9e9f97bSIlya Dryomov 
3331ed25e9b2SIlya Dryomov /*
3332ed25e9b2SIlya Dryomov  * Balance filters.  Return 1 if chunk should be filtered out
3333ed25e9b2SIlya Dryomov  * (should not be balanced).
3334ed25e9b2SIlya Dryomov  */
3335899c81eaSIlya Dryomov static int chunk_profiles_filter(u64 chunk_type,
3336ed25e9b2SIlya Dryomov 				 struct btrfs_balance_args *bargs)
3337ed25e9b2SIlya Dryomov {
3338899c81eaSIlya Dryomov 	chunk_type = chunk_to_extended(chunk_type) &
3339899c81eaSIlya Dryomov 				BTRFS_EXTENDED_PROFILE_MASK;
3340ed25e9b2SIlya Dryomov 
3341899c81eaSIlya Dryomov 	if (bargs->profiles & chunk_type)
3342ed25e9b2SIlya Dryomov 		return 0;
3343ed25e9b2SIlya Dryomov 
3344ed25e9b2SIlya Dryomov 	return 1;
3345ed25e9b2SIlya Dryomov }
3346ed25e9b2SIlya Dryomov 
3347dba72cb3SHolger Hoffstätte static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
33485ce5b3c0SIlya Dryomov 			      struct btrfs_balance_args *bargs)
33495ce5b3c0SIlya Dryomov {
335032da5386SDavid Sterba 	struct btrfs_block_group *cache;
3351bc309467SDavid Sterba 	u64 chunk_used;
3352bc309467SDavid Sterba 	u64 user_thresh_min;
3353bc309467SDavid Sterba 	u64 user_thresh_max;
3354bc309467SDavid Sterba 	int ret = 1;
3355bc309467SDavid Sterba 
3356bc309467SDavid Sterba 	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3357bf38be65SDavid Sterba 	chunk_used = cache->used;
3358bc309467SDavid Sterba 
3359bc309467SDavid Sterba 	if (bargs->usage_min == 0)
3360bc309467SDavid Sterba 		user_thresh_min = 0;
3361bc309467SDavid Sterba 	else
3362b3470b5dSDavid Sterba 		user_thresh_min = div_factor_fine(cache->length,
3363bc309467SDavid Sterba 						  bargs->usage_min);
3364bc309467SDavid Sterba 
3365bc309467SDavid Sterba 	if (bargs->usage_max == 0)
3366bc309467SDavid Sterba 		user_thresh_max = 1;
3367bc309467SDavid Sterba 	else if (bargs->usage_max > 100)
3368b3470b5dSDavid Sterba 		user_thresh_max = cache->length;
3369bc309467SDavid Sterba 	else
3370b3470b5dSDavid Sterba 		user_thresh_max = div_factor_fine(cache->length,
3371bc309467SDavid Sterba 						  bargs->usage_max);
3372bc309467SDavid Sterba 
3373bc309467SDavid Sterba 	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3374bc309467SDavid Sterba 		ret = 0;
3375bc309467SDavid Sterba 
3376bc309467SDavid Sterba 	btrfs_put_block_group(cache);
3377bc309467SDavid Sterba 	return ret;
3378bc309467SDavid Sterba }
3379bc309467SDavid Sterba 
3380dba72cb3SHolger Hoffstätte static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3381bc309467SDavid Sterba 		u64 chunk_offset, struct btrfs_balance_args *bargs)
3382bc309467SDavid Sterba {
338332da5386SDavid Sterba 	struct btrfs_block_group *cache;
33845ce5b3c0SIlya Dryomov 	u64 chunk_used, user_thresh;
33855ce5b3c0SIlya Dryomov 	int ret = 1;
33865ce5b3c0SIlya Dryomov 
33875ce5b3c0SIlya Dryomov 	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3388bf38be65SDavid Sterba 	chunk_used = cache->used;
33895ce5b3c0SIlya Dryomov 
3390bc309467SDavid Sterba 	if (bargs->usage_min == 0)
33913e39cea6SIlya Dryomov 		user_thresh = 1;
3392a105bb88SIlya Dryomov 	else if (bargs->usage > 100)
3393b3470b5dSDavid Sterba 		user_thresh = cache->length;
3394a105bb88SIlya Dryomov 	else
3395b3470b5dSDavid Sterba 		user_thresh = div_factor_fine(cache->length, bargs->usage);
3396a105bb88SIlya Dryomov 
33975ce5b3c0SIlya Dryomov 	if (chunk_used < user_thresh)
33985ce5b3c0SIlya Dryomov 		ret = 0;
33995ce5b3c0SIlya Dryomov 
34005ce5b3c0SIlya Dryomov 	btrfs_put_block_group(cache);
34015ce5b3c0SIlya Dryomov 	return ret;
34025ce5b3c0SIlya Dryomov }
34035ce5b3c0SIlya Dryomov 
3404409d404bSIlya Dryomov static int chunk_devid_filter(struct extent_buffer *leaf,
3405409d404bSIlya Dryomov 			      struct btrfs_chunk *chunk,
3406409d404bSIlya Dryomov 			      struct btrfs_balance_args *bargs)
3407409d404bSIlya Dryomov {
3408409d404bSIlya Dryomov 	struct btrfs_stripe *stripe;
3409409d404bSIlya Dryomov 	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3410409d404bSIlya Dryomov 	int i;
3411409d404bSIlya Dryomov 
3412409d404bSIlya Dryomov 	for (i = 0; i < num_stripes; i++) {
3413409d404bSIlya Dryomov 		stripe = btrfs_stripe_nr(chunk, i);
3414409d404bSIlya Dryomov 		if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3415409d404bSIlya Dryomov 			return 0;
3416409d404bSIlya Dryomov 	}
3417409d404bSIlya Dryomov 
3418409d404bSIlya Dryomov 	return 1;
3419409d404bSIlya Dryomov }
3420409d404bSIlya Dryomov 
3421946c9256SDavid Sterba static u64 calc_data_stripes(u64 type, int num_stripes)
3422946c9256SDavid Sterba {
3423946c9256SDavid Sterba 	const int index = btrfs_bg_flags_to_raid_index(type);
3424946c9256SDavid Sterba 	const int ncopies = btrfs_raid_array[index].ncopies;
3425946c9256SDavid Sterba 	const int nparity = btrfs_raid_array[index].nparity;
3426946c9256SDavid Sterba 
3427946c9256SDavid Sterba 	if (nparity)
3428946c9256SDavid Sterba 		return num_stripes - nparity;
3429946c9256SDavid Sterba 	else
3430946c9256SDavid Sterba 		return num_stripes / ncopies;
3431946c9256SDavid Sterba }
3432946c9256SDavid Sterba 
343394e60d5aSIlya Dryomov /* [pstart, pend) */
343494e60d5aSIlya Dryomov static int chunk_drange_filter(struct extent_buffer *leaf,
343594e60d5aSIlya Dryomov 			       struct btrfs_chunk *chunk,
343694e60d5aSIlya Dryomov 			       struct btrfs_balance_args *bargs)
343794e60d5aSIlya Dryomov {
343894e60d5aSIlya Dryomov 	struct btrfs_stripe *stripe;
343994e60d5aSIlya Dryomov 	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
344094e60d5aSIlya Dryomov 	u64 stripe_offset;
344194e60d5aSIlya Dryomov 	u64 stripe_length;
3442946c9256SDavid Sterba 	u64 type;
344394e60d5aSIlya Dryomov 	int factor;
344494e60d5aSIlya Dryomov 	int i;
344594e60d5aSIlya Dryomov 
344694e60d5aSIlya Dryomov 	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
344794e60d5aSIlya Dryomov 		return 0;
344894e60d5aSIlya Dryomov 
3449946c9256SDavid Sterba 	type = btrfs_chunk_type(leaf, chunk);
3450946c9256SDavid Sterba 	factor = calc_data_stripes(type, num_stripes);
345194e60d5aSIlya Dryomov 
345294e60d5aSIlya Dryomov 	for (i = 0; i < num_stripes; i++) {
345394e60d5aSIlya Dryomov 		stripe = btrfs_stripe_nr(chunk, i);
345494e60d5aSIlya Dryomov 		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
345594e60d5aSIlya Dryomov 			continue;
345694e60d5aSIlya Dryomov 
345794e60d5aSIlya Dryomov 		stripe_offset = btrfs_stripe_offset(leaf, stripe);
345894e60d5aSIlya Dryomov 		stripe_length = btrfs_chunk_length(leaf, chunk);
3459b8b93addSDavid Sterba 		stripe_length = div_u64(stripe_length, factor);
346094e60d5aSIlya Dryomov 
346194e60d5aSIlya Dryomov 		if (stripe_offset < bargs->pend &&
346294e60d5aSIlya Dryomov 		    stripe_offset + stripe_length > bargs->pstart)
346394e60d5aSIlya Dryomov 			return 0;
346494e60d5aSIlya Dryomov 	}
346594e60d5aSIlya Dryomov 
346694e60d5aSIlya Dryomov 	return 1;
346794e60d5aSIlya Dryomov }
346894e60d5aSIlya Dryomov 
3469ea67176aSIlya Dryomov /* [vstart, vend) */
3470ea67176aSIlya Dryomov static int chunk_vrange_filter(struct extent_buffer *leaf,
3471ea67176aSIlya Dryomov 			       struct btrfs_chunk *chunk,
3472ea67176aSIlya Dryomov 			       u64 chunk_offset,
3473ea67176aSIlya Dryomov 			       struct btrfs_balance_args *bargs)
3474ea67176aSIlya Dryomov {
3475ea67176aSIlya Dryomov 	if (chunk_offset < bargs->vend &&
3476ea67176aSIlya Dryomov 	    chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3477ea67176aSIlya Dryomov 		/* at least part of the chunk is inside this vrange */
3478ea67176aSIlya Dryomov 		return 0;
3479ea67176aSIlya Dryomov 
3480ea67176aSIlya Dryomov 	return 1;
3481ea67176aSIlya Dryomov }
3482ea67176aSIlya Dryomov 
3483dee32d0aSGabríel Arthúr Pétursson static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3484dee32d0aSGabríel Arthúr Pétursson 			       struct btrfs_chunk *chunk,
3485dee32d0aSGabríel Arthúr Pétursson 			       struct btrfs_balance_args *bargs)
3486dee32d0aSGabríel Arthúr Pétursson {
3487dee32d0aSGabríel Arthúr Pétursson 	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3488dee32d0aSGabríel Arthúr Pétursson 
3489dee32d0aSGabríel Arthúr Pétursson 	if (bargs->stripes_min <= num_stripes
3490dee32d0aSGabríel Arthúr Pétursson 			&& num_stripes <= bargs->stripes_max)
3491dee32d0aSGabríel Arthúr Pétursson 		return 0;
3492dee32d0aSGabríel Arthúr Pétursson 
3493dee32d0aSGabríel Arthúr Pétursson 	return 1;
3494dee32d0aSGabríel Arthúr Pétursson }
3495dee32d0aSGabríel Arthúr Pétursson 
3496899c81eaSIlya Dryomov static int chunk_soft_convert_filter(u64 chunk_type,
3497cfa4c961SIlya Dryomov 				     struct btrfs_balance_args *bargs)
3498cfa4c961SIlya Dryomov {
3499cfa4c961SIlya Dryomov 	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3500cfa4c961SIlya Dryomov 		return 0;
3501cfa4c961SIlya Dryomov 
3502899c81eaSIlya Dryomov 	chunk_type = chunk_to_extended(chunk_type) &
3503899c81eaSIlya Dryomov 				BTRFS_EXTENDED_PROFILE_MASK;
3504cfa4c961SIlya Dryomov 
3505899c81eaSIlya Dryomov 	if (bargs->target == chunk_type)
3506cfa4c961SIlya Dryomov 		return 1;
3507cfa4c961SIlya Dryomov 
3508cfa4c961SIlya Dryomov 	return 0;
3509cfa4c961SIlya Dryomov }
3510cfa4c961SIlya Dryomov 
35116ec0896cSDavid Sterba static int should_balance_chunk(struct extent_buffer *leaf,
3512f43ffb60SIlya Dryomov 				struct btrfs_chunk *chunk, u64 chunk_offset)
3513f43ffb60SIlya Dryomov {
35146ec0896cSDavid Sterba 	struct btrfs_fs_info *fs_info = leaf->fs_info;
35150b246afaSJeff Mahoney 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3516f43ffb60SIlya Dryomov 	struct btrfs_balance_args *bargs = NULL;
3517f43ffb60SIlya Dryomov 	u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3518f43ffb60SIlya Dryomov 
3519f43ffb60SIlya Dryomov 	/* type filter */
3520f43ffb60SIlya Dryomov 	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3521f43ffb60SIlya Dryomov 	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3522f43ffb60SIlya Dryomov 		return 0;
3523f43ffb60SIlya Dryomov 	}
3524f43ffb60SIlya Dryomov 
3525f43ffb60SIlya Dryomov 	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3526f43ffb60SIlya Dryomov 		bargs = &bctl->data;
3527f43ffb60SIlya Dryomov 	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3528f43ffb60SIlya Dryomov 		bargs = &bctl->sys;
3529f43ffb60SIlya Dryomov 	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3530f43ffb60SIlya Dryomov 		bargs = &bctl->meta;
3531f43ffb60SIlya Dryomov 
3532ed25e9b2SIlya Dryomov 	/* profiles filter */
3533ed25e9b2SIlya Dryomov 	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3534ed25e9b2SIlya Dryomov 	    chunk_profiles_filter(chunk_type, bargs)) {
3535ed25e9b2SIlya Dryomov 		return 0;
3536ed25e9b2SIlya Dryomov 	}
3537ed25e9b2SIlya Dryomov 
35385ce5b3c0SIlya Dryomov 	/* usage filter */
35395ce5b3c0SIlya Dryomov 	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
35400b246afaSJeff Mahoney 	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
35415ce5b3c0SIlya Dryomov 		return 0;
3542bc309467SDavid Sterba 	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
35430b246afaSJeff Mahoney 	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3544bc309467SDavid Sterba 		return 0;
35455ce5b3c0SIlya Dryomov 	}
35465ce5b3c0SIlya Dryomov 
3547409d404bSIlya Dryomov 	/* devid filter */
3548409d404bSIlya Dryomov 	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3549409d404bSIlya Dryomov 	    chunk_devid_filter(leaf, chunk, bargs)) {
3550409d404bSIlya Dryomov 		return 0;
3551409d404bSIlya Dryomov 	}
3552409d404bSIlya Dryomov 
355394e60d5aSIlya Dryomov 	/* drange filter, makes sense only with devid filter */
355494e60d5aSIlya Dryomov 	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3555e4ff5fb5SNikolay Borisov 	    chunk_drange_filter(leaf, chunk, bargs)) {
355694e60d5aSIlya Dryomov 		return 0;
355794e60d5aSIlya Dryomov 	}
355894e60d5aSIlya Dryomov 
3559ea67176aSIlya Dryomov 	/* vrange filter */
3560ea67176aSIlya Dryomov 	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3561ea67176aSIlya Dryomov 	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3562ea67176aSIlya Dryomov 		return 0;
3563ea67176aSIlya Dryomov 	}
3564ea67176aSIlya Dryomov 
3565dee32d0aSGabríel Arthúr Pétursson 	/* stripes filter */
3566dee32d0aSGabríel Arthúr Pétursson 	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3567dee32d0aSGabríel Arthúr Pétursson 	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
3568dee32d0aSGabríel Arthúr Pétursson 		return 0;
3569dee32d0aSGabríel Arthúr Pétursson 	}
3570dee32d0aSGabríel Arthúr Pétursson 
3571cfa4c961SIlya Dryomov 	/* soft profile changing mode */
3572cfa4c961SIlya Dryomov 	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3573cfa4c961SIlya Dryomov 	    chunk_soft_convert_filter(chunk_type, bargs)) {
3574cfa4c961SIlya Dryomov 		return 0;
3575cfa4c961SIlya Dryomov 	}
3576cfa4c961SIlya Dryomov 
35777d824b6fSDavid Sterba 	/*
35787d824b6fSDavid Sterba 	 * limited by count, must be the last filter
35797d824b6fSDavid Sterba 	 */
35807d824b6fSDavid Sterba 	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
35817d824b6fSDavid Sterba 		if (bargs->limit == 0)
35827d824b6fSDavid Sterba 			return 0;
35837d824b6fSDavid Sterba 		else
35847d824b6fSDavid Sterba 			bargs->limit--;
358512907fc7SDavid Sterba 	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
358612907fc7SDavid Sterba 		/*
358712907fc7SDavid Sterba 		 * Same logic as the 'limit' filter; the minimum cannot be
358801327610SNicholas D Steeves 		 * determined here because we do not have the global information
358912907fc7SDavid Sterba 		 * about the count of all chunks that satisfy the filters.
359012907fc7SDavid Sterba 		 */
359112907fc7SDavid Sterba 		if (bargs->limit_max == 0)
359212907fc7SDavid Sterba 			return 0;
359312907fc7SDavid Sterba 		else
359412907fc7SDavid Sterba 			bargs->limit_max--;
35957d824b6fSDavid Sterba 	}
35967d824b6fSDavid Sterba 
3597f43ffb60SIlya Dryomov 	return 1;
3598f43ffb60SIlya Dryomov }
3599f43ffb60SIlya Dryomov 
3600c9e9f97bSIlya Dryomov static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3601ec44a35cSChris Mason {
360219a39dceSIlya Dryomov 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3603c9e9f97bSIlya Dryomov 	struct btrfs_root *chunk_root = fs_info->chunk_root;
360412907fc7SDavid Sterba 	u64 chunk_type;
3605f43ffb60SIlya Dryomov 	struct btrfs_chunk *chunk;
36065a488b9dSLiu Bo 	struct btrfs_path *path = NULL;
3607ec44a35cSChris Mason 	struct btrfs_key key;
3608ec44a35cSChris Mason 	struct btrfs_key found_key;
3609f43ffb60SIlya Dryomov 	struct extent_buffer *leaf;
3610f43ffb60SIlya Dryomov 	int slot;
3611c9e9f97bSIlya Dryomov 	int ret;
3612c9e9f97bSIlya Dryomov 	int enospc_errors = 0;
361319a39dceSIlya Dryomov 	bool counting = true;
361412907fc7SDavid Sterba 	/* The single value limit and min/max limits use the same bytes in the */
36157d824b6fSDavid Sterba 	u64 limit_data = bctl->data.limit;
36167d824b6fSDavid Sterba 	u64 limit_meta = bctl->meta.limit;
36177d824b6fSDavid Sterba 	u64 limit_sys = bctl->sys.limit;
361812907fc7SDavid Sterba 	u32 count_data = 0;
361912907fc7SDavid Sterba 	u32 count_meta = 0;
362012907fc7SDavid Sterba 	u32 count_sys = 0;
36212c9fe835SZhao Lei 	int chunk_reserved = 0;
3622ec44a35cSChris Mason 
3623ec44a35cSChris Mason 	path = btrfs_alloc_path();
362417e9f796SMark Fasheh 	if (!path) {
362517e9f796SMark Fasheh 		ret = -ENOMEM;
362617e9f796SMark Fasheh 		goto error;
362717e9f796SMark Fasheh 	}
362819a39dceSIlya Dryomov 
362919a39dceSIlya Dryomov 	/* zero out stat counters */
363019a39dceSIlya Dryomov 	spin_lock(&fs_info->balance_lock);
363119a39dceSIlya Dryomov 	memset(&bctl->stat, 0, sizeof(bctl->stat));
363219a39dceSIlya Dryomov 	spin_unlock(&fs_info->balance_lock);
363319a39dceSIlya Dryomov again:
36347d824b6fSDavid Sterba 	if (!counting) {
363512907fc7SDavid Sterba 		/*
363612907fc7SDavid Sterba 		 * The single value limit and min/max limits use the same bytes
363712907fc7SDavid Sterba 		 * in the
363812907fc7SDavid Sterba 		 */
36397d824b6fSDavid Sterba 		bctl->data.limit = limit_data;
36407d824b6fSDavid Sterba 		bctl->meta.limit = limit_meta;
36417d824b6fSDavid Sterba 		bctl->sys.limit = limit_sys;
36427d824b6fSDavid Sterba 	}
3643ec44a35cSChris Mason 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3644ec44a35cSChris Mason 	key.offset = (u64)-1;
3645ec44a35cSChris Mason 	key.type = BTRFS_CHUNK_ITEM_KEY;
3646ec44a35cSChris Mason 
3647ec44a35cSChris Mason 	while (1) {
364819a39dceSIlya Dryomov 		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3649a7e99c69SIlya Dryomov 		    atomic_read(&fs_info->balance_cancel_req)) {
3650837d5b6eSIlya Dryomov 			ret = -ECANCELED;
3651837d5b6eSIlya Dryomov 			goto error;
3652837d5b6eSIlya Dryomov 		}
3653837d5b6eSIlya Dryomov 
365467c5e7d4SFilipe Manana 		mutex_lock(&fs_info->delete_unused_bgs_mutex);
3655ec44a35cSChris Mason 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
365667c5e7d4SFilipe Manana 		if (ret < 0) {
365767c5e7d4SFilipe Manana 			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3658ec44a35cSChris Mason 			goto error;
365967c5e7d4SFilipe Manana 		}
3660ec44a35cSChris Mason 
3661ec44a35cSChris Mason 		/*
3662ec44a35cSChris Mason 		 * this shouldn't happen, it means the last relocate
3663ec44a35cSChris Mason 		 * failed
3664ec44a35cSChris Mason 		 */
3665ec44a35cSChris Mason 		if (ret == 0)
3666c9e9f97bSIlya Dryomov 			BUG(); /* FIXME break ? */
3667ec44a35cSChris Mason 
3668ec44a35cSChris Mason 		ret = btrfs_previous_item(chunk_root, path, 0,
3669ec44a35cSChris Mason 					  BTRFS_CHUNK_ITEM_KEY);
3670c9e9f97bSIlya Dryomov 		if (ret) {
367167c5e7d4SFilipe Manana 			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3672c9e9f97bSIlya Dryomov 			ret = 0;
3673ec44a35cSChris Mason 			break;
3674c9e9f97bSIlya Dryomov 		}
36757d9eb12cSChris Mason 
3676f43ffb60SIlya Dryomov 		leaf = path->nodes[0];
3677f43ffb60SIlya Dryomov 		slot = path->slots[0];
3678f43ffb60SIlya Dryomov 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3679f43ffb60SIlya Dryomov 
368067c5e7d4SFilipe Manana 		if (found_key.objectid != key.objectid) {
368167c5e7d4SFilipe Manana 			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3682ec44a35cSChris Mason 			break;
368367c5e7d4SFilipe Manana 		}
36847d9eb12cSChris Mason 
3685f43ffb60SIlya Dryomov 		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
368612907fc7SDavid Sterba 		chunk_type = btrfs_chunk_type(leaf, chunk);
3687f43ffb60SIlya Dryomov 
368819a39dceSIlya Dryomov 		if (!counting) {
368919a39dceSIlya Dryomov 			spin_lock(&fs_info->balance_lock);
369019a39dceSIlya Dryomov 			bctl->stat.considered++;
369119a39dceSIlya Dryomov 			spin_unlock(&fs_info->balance_lock);
369219a39dceSIlya Dryomov 		}
369319a39dceSIlya Dryomov 
36946ec0896cSDavid Sterba 		ret = should_balance_chunk(leaf, chunk, found_key.offset);
36952c9fe835SZhao Lei 
3696b3b4aa74SDavid Sterba 		btrfs_release_path(path);
369767c5e7d4SFilipe Manana 		if (!ret) {
369867c5e7d4SFilipe Manana 			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3699f43ffb60SIlya Dryomov 			goto loop;
370067c5e7d4SFilipe Manana 		}
3701f43ffb60SIlya Dryomov 
370219a39dceSIlya Dryomov 		if (counting) {
370367c5e7d4SFilipe Manana 			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
370419a39dceSIlya Dryomov 			spin_lock(&fs_info->balance_lock);
370519a39dceSIlya Dryomov 			bctl->stat.expected++;
370619a39dceSIlya Dryomov 			spin_unlock(&fs_info->balance_lock);
370712907fc7SDavid Sterba 
370812907fc7SDavid Sterba 			if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
370912907fc7SDavid Sterba 				count_data++;
371012907fc7SDavid Sterba 			else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
371112907fc7SDavid Sterba 				count_sys++;
371212907fc7SDavid Sterba 			else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
371312907fc7SDavid Sterba 				count_meta++;
371412907fc7SDavid Sterba 
371512907fc7SDavid Sterba 			goto loop;
371612907fc7SDavid Sterba 		}
371712907fc7SDavid Sterba 
371812907fc7SDavid Sterba 		/*
371912907fc7SDavid Sterba 		 * Apply limit_min filter, no need to check if the LIMITS
372012907fc7SDavid Sterba 		 * filter is used, limit_min is 0 by default
372112907fc7SDavid Sterba 		 */
372212907fc7SDavid Sterba 		if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
372312907fc7SDavid Sterba 					count_data < bctl->data.limit_min)
372412907fc7SDavid Sterba 				|| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
372512907fc7SDavid Sterba 					count_meta < bctl->meta.limit_min)
372612907fc7SDavid Sterba 				|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
372712907fc7SDavid Sterba 					count_sys < bctl->sys.limit_min)) {
372812907fc7SDavid Sterba 			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
372919a39dceSIlya Dryomov 			goto loop;
373019a39dceSIlya Dryomov 		}
373119a39dceSIlya Dryomov 
3732a6f93c71SLiu Bo 		if (!chunk_reserved) {
3733a6f93c71SLiu Bo 			/*
3734a6f93c71SLiu Bo 			 * We may be relocating the only data chunk we have,
3735a6f93c71SLiu Bo 			 * which could potentially end up with losing data's
3736a6f93c71SLiu Bo 			 * raid profile, so lets allocate an empty one in
3737a6f93c71SLiu Bo 			 * advance.
3738a6f93c71SLiu Bo 			 */
3739a6f93c71SLiu Bo 			ret = btrfs_may_alloc_data_chunk(fs_info,
3740a6f93c71SLiu Bo 							 found_key.offset);
37412c9fe835SZhao Lei 			if (ret < 0) {
37422c9fe835SZhao Lei 				mutex_unlock(&fs_info->delete_unused_bgs_mutex);
37432c9fe835SZhao Lei 				goto error;
3744a6f93c71SLiu Bo 			} else if (ret == 1) {
37452c9fe835SZhao Lei 				chunk_reserved = 1;
37462c9fe835SZhao Lei 			}
3747a6f93c71SLiu Bo 		}
37482c9fe835SZhao Lei 
37495b4aacefSJeff Mahoney 		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
375067c5e7d4SFilipe Manana 		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
375119a39dceSIlya Dryomov 		if (ret == -ENOSPC) {
3752c9e9f97bSIlya Dryomov 			enospc_errors++;
3753eede2bf3SOmar Sandoval 		} else if (ret == -ETXTBSY) {
3754eede2bf3SOmar Sandoval 			btrfs_info(fs_info,
3755eede2bf3SOmar Sandoval 	   "skipping relocation of block group %llu due to active swapfile",
3756eede2bf3SOmar Sandoval 				   found_key.offset);
3757eede2bf3SOmar Sandoval 			ret = 0;
3758eede2bf3SOmar Sandoval 		} else if (ret) {
3759eede2bf3SOmar Sandoval 			goto error;
376019a39dceSIlya Dryomov 		} else {
376119a39dceSIlya Dryomov 			spin_lock(&fs_info->balance_lock);
376219a39dceSIlya Dryomov 			bctl->stat.completed++;
376319a39dceSIlya Dryomov 			spin_unlock(&fs_info->balance_lock);
376419a39dceSIlya Dryomov 		}
3765f43ffb60SIlya Dryomov loop:
3766795a3321SIlya Dryomov 		if (found_key.offset == 0)
3767795a3321SIlya Dryomov 			break;
3768ba1bf481SJosef Bacik 		key.offset = found_key.offset - 1;
3769ec44a35cSChris Mason 	}
3770c9e9f97bSIlya Dryomov 
377119a39dceSIlya Dryomov 	if (counting) {
377219a39dceSIlya Dryomov 		btrfs_release_path(path);
377319a39dceSIlya Dryomov 		counting = false;
377419a39dceSIlya Dryomov 		goto again;
377519a39dceSIlya Dryomov 	}
3776ec44a35cSChris Mason error:
3777ec44a35cSChris Mason 	btrfs_free_path(path);
3778c9e9f97bSIlya Dryomov 	if (enospc_errors) {
3779efe120a0SFrank Holton 		btrfs_info(fs_info, "%d enospc errors during balance",
3780c9e9f97bSIlya Dryomov 			   enospc_errors);
3781c9e9f97bSIlya Dryomov 		if (!ret)
3782c9e9f97bSIlya Dryomov 			ret = -ENOSPC;
3783c9e9f97bSIlya Dryomov 	}
3784c9e9f97bSIlya Dryomov 
3785ec44a35cSChris Mason 	return ret;
3786ec44a35cSChris Mason }
3787ec44a35cSChris Mason 
37880c460c0dSIlya Dryomov /**
37890c460c0dSIlya Dryomov  * alloc_profile_is_valid - see if a given profile is valid and reduced
37900c460c0dSIlya Dryomov  * @flags: profile to validate
37910c460c0dSIlya Dryomov  * @extended: if true @flags is treated as an extended profile
37920c460c0dSIlya Dryomov  */
37930c460c0dSIlya Dryomov static int alloc_profile_is_valid(u64 flags, int extended)
37940c460c0dSIlya Dryomov {
37950c460c0dSIlya Dryomov 	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
37960c460c0dSIlya Dryomov 			       BTRFS_BLOCK_GROUP_PROFILE_MASK);
37970c460c0dSIlya Dryomov 
37980c460c0dSIlya Dryomov 	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
37990c460c0dSIlya Dryomov 
38000c460c0dSIlya Dryomov 	/* 1) check that all other bits are zeroed */
38010c460c0dSIlya Dryomov 	if (flags & ~mask)
38020c460c0dSIlya Dryomov 		return 0;
38030c460c0dSIlya Dryomov 
38040c460c0dSIlya Dryomov 	/* 2) see if profile is reduced */
38050c460c0dSIlya Dryomov 	if (flags == 0)
38060c460c0dSIlya Dryomov 		return !extended; /* "0" is valid for usual profiles */
38070c460c0dSIlya Dryomov 
3808c1499166SDavid Sterba 	return has_single_bit_set(flags);
38090c460c0dSIlya Dryomov }
38100c460c0dSIlya Dryomov 
3811837d5b6eSIlya Dryomov static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3812837d5b6eSIlya Dryomov {
3813a7e99c69SIlya Dryomov 	/* cancel requested || normal exit path */
3814a7e99c69SIlya Dryomov 	return atomic_read(&fs_info->balance_cancel_req) ||
3815a7e99c69SIlya Dryomov 		(atomic_read(&fs_info->balance_pause_req) == 0 &&
3816a7e99c69SIlya Dryomov 		 atomic_read(&fs_info->balance_cancel_req) == 0);
3817837d5b6eSIlya Dryomov }
3818837d5b6eSIlya Dryomov 
38195ba366c3SDavid Sterba /*
38205ba366c3SDavid Sterba  * Validate target profile against allowed profiles and return true if it's OK.
38215ba366c3SDavid Sterba  * Otherwise print the error message and return false.
38225ba366c3SDavid Sterba  */
38235ba366c3SDavid Sterba static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
38245ba366c3SDavid Sterba 		const struct btrfs_balance_args *bargs,
38255ba366c3SDavid Sterba 		u64 allowed, const char *type)
3826bdcd3c97SAlexandru Moise {
38275ba366c3SDavid Sterba 	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
38285ba366c3SDavid Sterba 		return true;
38295ba366c3SDavid Sterba 
38305ba366c3SDavid Sterba 	/* Profile is valid and does not have bits outside of the allowed set */
38315ba366c3SDavid Sterba 	if (alloc_profile_is_valid(bargs->target, 1) &&
38325ba366c3SDavid Sterba 	    (bargs->target & ~allowed) == 0)
38335ba366c3SDavid Sterba 		return true;
38345ba366c3SDavid Sterba 
38355ba366c3SDavid Sterba 	btrfs_err(fs_info, "balance: invalid convert %s profile %s",
38365ba366c3SDavid Sterba 			type, btrfs_bg_type_to_raid_name(bargs->target));
38375ba366c3SDavid Sterba 	return false;
3838bdcd3c97SAlexandru Moise }
3839bdcd3c97SAlexandru Moise 
3840c9e9f97bSIlya Dryomov /*
384156fc37d9SAnand Jain  * Fill @buf with textual description of balance filter flags @bargs, up to
384256fc37d9SAnand Jain  * @size_buf including the terminating null. The output may be trimmed if it
384356fc37d9SAnand Jain  * does not fit into the provided buffer.
384456fc37d9SAnand Jain  */
384556fc37d9SAnand Jain static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
384656fc37d9SAnand Jain 				 u32 size_buf)
384756fc37d9SAnand Jain {
384856fc37d9SAnand Jain 	int ret;
384956fc37d9SAnand Jain 	u32 size_bp = size_buf;
385056fc37d9SAnand Jain 	char *bp = buf;
385156fc37d9SAnand Jain 	u64 flags = bargs->flags;
385256fc37d9SAnand Jain 	char tmp_buf[128] = {'\0'};
385356fc37d9SAnand Jain 
385456fc37d9SAnand Jain 	if (!flags)
385556fc37d9SAnand Jain 		return;
385656fc37d9SAnand Jain 
385756fc37d9SAnand Jain #define CHECK_APPEND_NOARG(a)						\
385856fc37d9SAnand Jain 	do {								\
385956fc37d9SAnand Jain 		ret = snprintf(bp, size_bp, (a));			\
386056fc37d9SAnand Jain 		if (ret < 0 || ret >= size_bp)				\
386156fc37d9SAnand Jain 			goto out_overflow;				\
386256fc37d9SAnand Jain 		size_bp -= ret;						\
386356fc37d9SAnand Jain 		bp += ret;						\
386456fc37d9SAnand Jain 	} while (0)
386556fc37d9SAnand Jain 
386656fc37d9SAnand Jain #define CHECK_APPEND_1ARG(a, v1)					\
386756fc37d9SAnand Jain 	do {								\
386856fc37d9SAnand Jain 		ret = snprintf(bp, size_bp, (a), (v1));			\
386956fc37d9SAnand Jain 		if (ret < 0 || ret >= size_bp)				\
387056fc37d9SAnand Jain 			goto out_overflow;				\
387156fc37d9SAnand Jain 		size_bp -= ret;						\
387256fc37d9SAnand Jain 		bp += ret;						\
387356fc37d9SAnand Jain 	} while (0)
387456fc37d9SAnand Jain 
387556fc37d9SAnand Jain #define CHECK_APPEND_2ARG(a, v1, v2)					\
387656fc37d9SAnand Jain 	do {								\
387756fc37d9SAnand Jain 		ret = snprintf(bp, size_bp, (a), (v1), (v2));		\
387856fc37d9SAnand Jain 		if (ret < 0 || ret >= size_bp)				\
387956fc37d9SAnand Jain 			goto out_overflow;				\
388056fc37d9SAnand Jain 		size_bp -= ret;						\
388156fc37d9SAnand Jain 		bp += ret;						\
388256fc37d9SAnand Jain 	} while (0)
388356fc37d9SAnand Jain 
3884158da513SDavid Sterba 	if (flags & BTRFS_BALANCE_ARGS_CONVERT)
3885158da513SDavid Sterba 		CHECK_APPEND_1ARG("convert=%s,",
3886158da513SDavid Sterba 				  btrfs_bg_type_to_raid_name(bargs->target));
388756fc37d9SAnand Jain 
388856fc37d9SAnand Jain 	if (flags & BTRFS_BALANCE_ARGS_SOFT)
388956fc37d9SAnand Jain 		CHECK_APPEND_NOARG("soft,");
389056fc37d9SAnand Jain 
389156fc37d9SAnand Jain 	if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
389256fc37d9SAnand Jain 		btrfs_describe_block_groups(bargs->profiles, tmp_buf,
389356fc37d9SAnand Jain 					    sizeof(tmp_buf));
389456fc37d9SAnand Jain 		CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
389556fc37d9SAnand Jain 	}
389656fc37d9SAnand Jain 
389756fc37d9SAnand Jain 	if (flags & BTRFS_BALANCE_ARGS_USAGE)
389856fc37d9SAnand Jain 		CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
389956fc37d9SAnand Jain 
390056fc37d9SAnand Jain 	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
390156fc37d9SAnand Jain 		CHECK_APPEND_2ARG("usage=%u..%u,",
390256fc37d9SAnand Jain 				  bargs->usage_min, bargs->usage_max);
390356fc37d9SAnand Jain 
390456fc37d9SAnand Jain 	if (flags & BTRFS_BALANCE_ARGS_DEVID)
390556fc37d9SAnand Jain 		CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
390656fc37d9SAnand Jain 
390756fc37d9SAnand Jain 	if (flags & BTRFS_BALANCE_ARGS_DRANGE)
390856fc37d9SAnand Jain 		CHECK_APPEND_2ARG("drange=%llu..%llu,",
390956fc37d9SAnand Jain 				  bargs->pstart, bargs->pend);
391056fc37d9SAnand Jain 
391156fc37d9SAnand Jain 	if (flags & BTRFS_BALANCE_ARGS_VRANGE)
391256fc37d9SAnand Jain 		CHECK_APPEND_2ARG("vrange=%llu..%llu,",
391356fc37d9SAnand Jain 				  bargs->vstart, bargs->vend);
391456fc37d9SAnand Jain 
391556fc37d9SAnand Jain 	if (flags & BTRFS_BALANCE_ARGS_LIMIT)
391656fc37d9SAnand Jain 		CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
391756fc37d9SAnand Jain 
391856fc37d9SAnand Jain 	if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
391956fc37d9SAnand Jain 		CHECK_APPEND_2ARG("limit=%u..%u,",
392056fc37d9SAnand Jain 				bargs->limit_min, bargs->limit_max);
392156fc37d9SAnand Jain 
392256fc37d9SAnand Jain 	if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
392356fc37d9SAnand Jain 		CHECK_APPEND_2ARG("stripes=%u..%u,",
392456fc37d9SAnand Jain 				  bargs->stripes_min, bargs->stripes_max);
392556fc37d9SAnand Jain 
392656fc37d9SAnand Jain #undef CHECK_APPEND_2ARG
392756fc37d9SAnand Jain #undef CHECK_APPEND_1ARG
392856fc37d9SAnand Jain #undef CHECK_APPEND_NOARG
392956fc37d9SAnand Jain 
393056fc37d9SAnand Jain out_overflow:
393156fc37d9SAnand Jain 
393256fc37d9SAnand Jain 	if (size_bp < size_buf)
393356fc37d9SAnand Jain 		buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
393456fc37d9SAnand Jain 	else
393556fc37d9SAnand Jain 		buf[0] = '\0';
393656fc37d9SAnand Jain }
393756fc37d9SAnand Jain 
393856fc37d9SAnand Jain static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
393956fc37d9SAnand Jain {
394056fc37d9SAnand Jain 	u32 size_buf = 1024;
394156fc37d9SAnand Jain 	char tmp_buf[192] = {'\0'};
394256fc37d9SAnand Jain 	char *buf;
394356fc37d9SAnand Jain 	char *bp;
394456fc37d9SAnand Jain 	u32 size_bp = size_buf;
394556fc37d9SAnand Jain 	int ret;
394656fc37d9SAnand Jain 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
394756fc37d9SAnand Jain 
394856fc37d9SAnand Jain 	buf = kzalloc(size_buf, GFP_KERNEL);
394956fc37d9SAnand Jain 	if (!buf)
395056fc37d9SAnand Jain 		return;
395156fc37d9SAnand Jain 
395256fc37d9SAnand Jain 	bp = buf;
395356fc37d9SAnand Jain 
395456fc37d9SAnand Jain #define CHECK_APPEND_1ARG(a, v1)					\
395556fc37d9SAnand Jain 	do {								\
395656fc37d9SAnand Jain 		ret = snprintf(bp, size_bp, (a), (v1));			\
395756fc37d9SAnand Jain 		if (ret < 0 || ret >= size_bp)				\
395856fc37d9SAnand Jain 			goto out_overflow;				\
395956fc37d9SAnand Jain 		size_bp -= ret;						\
396056fc37d9SAnand Jain 		bp += ret;						\
396156fc37d9SAnand Jain 	} while (0)
396256fc37d9SAnand Jain 
396356fc37d9SAnand Jain 	if (bctl->flags & BTRFS_BALANCE_FORCE)
396456fc37d9SAnand Jain 		CHECK_APPEND_1ARG("%s", "-f ");
396556fc37d9SAnand Jain 
396656fc37d9SAnand Jain 	if (bctl->flags & BTRFS_BALANCE_DATA) {
396756fc37d9SAnand Jain 		describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
396856fc37d9SAnand Jain 		CHECK_APPEND_1ARG("-d%s ", tmp_buf);
396956fc37d9SAnand Jain 	}
397056fc37d9SAnand Jain 
397156fc37d9SAnand Jain 	if (bctl->flags & BTRFS_BALANCE_METADATA) {
397256fc37d9SAnand Jain 		describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
397356fc37d9SAnand Jain 		CHECK_APPEND_1ARG("-m%s ", tmp_buf);
397456fc37d9SAnand Jain 	}
397556fc37d9SAnand Jain 
397656fc37d9SAnand Jain 	if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
397756fc37d9SAnand Jain 		describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
397856fc37d9SAnand Jain 		CHECK_APPEND_1ARG("-s%s ", tmp_buf);
397956fc37d9SAnand Jain 	}
398056fc37d9SAnand Jain 
398156fc37d9SAnand Jain #undef CHECK_APPEND_1ARG
398256fc37d9SAnand Jain 
398356fc37d9SAnand Jain out_overflow:
398456fc37d9SAnand Jain 
398556fc37d9SAnand Jain 	if (size_bp < size_buf)
398656fc37d9SAnand Jain 		buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
398756fc37d9SAnand Jain 	btrfs_info(fs_info, "balance: %s %s",
398856fc37d9SAnand Jain 		   (bctl->flags & BTRFS_BALANCE_RESUME) ?
398956fc37d9SAnand Jain 		   "resume" : "start", buf);
399056fc37d9SAnand Jain 
399156fc37d9SAnand Jain 	kfree(buf);
399256fc37d9SAnand Jain }
399356fc37d9SAnand Jain 
399456fc37d9SAnand Jain /*
3995dccdb07bSDavid Sterba  * Should be called with balance mutexe held
3996c9e9f97bSIlya Dryomov  */
39976fcf6e2bSDavid Sterba int btrfs_balance(struct btrfs_fs_info *fs_info,
39986fcf6e2bSDavid Sterba 		  struct btrfs_balance_control *bctl,
3999c9e9f97bSIlya Dryomov 		  struct btrfs_ioctl_balance_args *bargs)
4000c9e9f97bSIlya Dryomov {
400114506127SAdam Borowski 	u64 meta_target, data_target;
4002f43ffb60SIlya Dryomov 	u64 allowed;
4003e4837f8fSIlya Dryomov 	int mixed = 0;
4004c9e9f97bSIlya Dryomov 	int ret;
40058dabb742SStefan Behrens 	u64 num_devices;
4006de98ced9SMiao Xie 	unsigned seq;
4007e62869beSAnand Jain 	bool reducing_redundancy;
4008081db89bSDavid Sterba 	int i;
4009c9e9f97bSIlya Dryomov 
4010837d5b6eSIlya Dryomov 	if (btrfs_fs_closing(fs_info) ||
4011a7e99c69SIlya Dryomov 	    atomic_read(&fs_info->balance_pause_req) ||
4012726a3421SQu Wenruo 	    btrfs_should_cancel_balance(fs_info)) {
4013c9e9f97bSIlya Dryomov 		ret = -EINVAL;
4014c9e9f97bSIlya Dryomov 		goto out;
4015c9e9f97bSIlya Dryomov 	}
4016c9e9f97bSIlya Dryomov 
4017e4837f8fSIlya Dryomov 	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4018e4837f8fSIlya Dryomov 	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4019e4837f8fSIlya Dryomov 		mixed = 1;
4020e4837f8fSIlya Dryomov 
4021f43ffb60SIlya Dryomov 	/*
4022f43ffb60SIlya Dryomov 	 * In case of mixed groups both data and meta should be picked,
4023f43ffb60SIlya Dryomov 	 * and identical options should be given for both of them.
4024f43ffb60SIlya Dryomov 	 */
4025e4837f8fSIlya Dryomov 	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4026e4837f8fSIlya Dryomov 	if (mixed && (bctl->flags & allowed)) {
4027f43ffb60SIlya Dryomov 		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4028f43ffb60SIlya Dryomov 		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4029f43ffb60SIlya Dryomov 		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
40305d163e0eSJeff Mahoney 			btrfs_err(fs_info,
40316dac13f8SAnand Jain 	  "balance: mixed groups data and metadata options must be the same");
4032f43ffb60SIlya Dryomov 			ret = -EINVAL;
4033f43ffb60SIlya Dryomov 			goto out;
4034f43ffb60SIlya Dryomov 		}
4035f43ffb60SIlya Dryomov 	}
4036f43ffb60SIlya Dryomov 
4037b35cf1f0SJosef Bacik 	/*
4038b35cf1f0SJosef Bacik 	 * rw_devices will not change at the moment, device add/delete/replace
4039c3e1f96cSGoldwyn Rodrigues 	 * are exclusive
4040b35cf1f0SJosef Bacik 	 */
4041b35cf1f0SJosef Bacik 	num_devices = fs_info->fs_devices->rw_devices;
4042fab27359SQu Wenruo 
4043fab27359SQu Wenruo 	/*
4044fab27359SQu Wenruo 	 * SINGLE profile on-disk has no profile bit, but in-memory we have a
4045fab27359SQu Wenruo 	 * special bit for it, to make it easier to distinguish.  Thus we need
4046fab27359SQu Wenruo 	 * to set it manually, or balance would refuse the profile.
4047fab27359SQu Wenruo 	 */
4048fab27359SQu Wenruo 	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4049081db89bSDavid Sterba 	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4050081db89bSDavid Sterba 		if (num_devices >= btrfs_raid_array[i].devs_min)
4051081db89bSDavid Sterba 			allowed |= btrfs_raid_array[i].bg_flag;
40521da73967SAnand Jain 
40535ba366c3SDavid Sterba 	if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
40545ba366c3SDavid Sterba 	    !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
40555ba366c3SDavid Sterba 	    !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
4056e4d8ec0fSIlya Dryomov 		ret = -EINVAL;
4057e4d8ec0fSIlya Dryomov 		goto out;
4058e4d8ec0fSIlya Dryomov 	}
4059e4d8ec0fSIlya Dryomov 
40606079e12cSDavid Sterba 	/*
40616079e12cSDavid Sterba 	 * Allow to reduce metadata or system integrity only if force set for
40626079e12cSDavid Sterba 	 * profiles with redundancy (copies, parity)
40636079e12cSDavid Sterba 	 */
40646079e12cSDavid Sterba 	allowed = 0;
40656079e12cSDavid Sterba 	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
40666079e12cSDavid Sterba 		if (btrfs_raid_array[i].ncopies >= 2 ||
40676079e12cSDavid Sterba 		    btrfs_raid_array[i].tolerated_failures >= 1)
40686079e12cSDavid Sterba 			allowed |= btrfs_raid_array[i].bg_flag;
40696079e12cSDavid Sterba 	}
4070de98ced9SMiao Xie 	do {
4071de98ced9SMiao Xie 		seq = read_seqbegin(&fs_info->profiles_lock);
4072de98ced9SMiao Xie 
4073e4d8ec0fSIlya Dryomov 		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4074e4d8ec0fSIlya Dryomov 		     (fs_info->avail_system_alloc_bits & allowed) &&
4075e4d8ec0fSIlya Dryomov 		     !(bctl->sys.target & allowed)) ||
4076e4d8ec0fSIlya Dryomov 		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4077e4d8ec0fSIlya Dryomov 		     (fs_info->avail_metadata_alloc_bits & allowed) &&
40785a8067c0SFilipe Manana 		     !(bctl->meta.target & allowed)))
4079e62869beSAnand Jain 			reducing_redundancy = true;
40805a8067c0SFilipe Manana 		else
4081e62869beSAnand Jain 			reducing_redundancy = false;
40825a8067c0SFilipe Manana 
40835a8067c0SFilipe Manana 		/* if we're not converting, the target field is uninitialized */
40845a8067c0SFilipe Manana 		meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
40855a8067c0SFilipe Manana 			bctl->meta.target : fs_info->avail_metadata_alloc_bits;
40865a8067c0SFilipe Manana 		data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
40875a8067c0SFilipe Manana 			bctl->data.target : fs_info->avail_data_alloc_bits;
40885a8067c0SFilipe Manana 	} while (read_seqretry(&fs_info->profiles_lock, seq));
40895a8067c0SFilipe Manana 
4090e62869beSAnand Jain 	if (reducing_redundancy) {
4091e4d8ec0fSIlya Dryomov 		if (bctl->flags & BTRFS_BALANCE_FORCE) {
40925d163e0eSJeff Mahoney 			btrfs_info(fs_info,
4093e62869beSAnand Jain 			   "balance: force reducing metadata redundancy");
4094e4d8ec0fSIlya Dryomov 		} else {
40955d163e0eSJeff Mahoney 			btrfs_err(fs_info,
4096e62869beSAnand Jain 	"balance: reduces metadata redundancy, use --force if you want this");
4097e4d8ec0fSIlya Dryomov 			ret = -EINVAL;
4098e4d8ec0fSIlya Dryomov 			goto out;
4099e4d8ec0fSIlya Dryomov 		}
4100e4d8ec0fSIlya Dryomov 	}
4101e4d8ec0fSIlya Dryomov 
410214506127SAdam Borowski 	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
410314506127SAdam Borowski 		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4104ee592d07SSam Tygier 		btrfs_warn(fs_info,
41056dac13f8SAnand Jain 	"balance: metadata profile %s has lower redundancy than data profile %s",
4106158da513SDavid Sterba 				btrfs_bg_type_to_raid_name(meta_target),
4107158da513SDavid Sterba 				btrfs_bg_type_to_raid_name(data_target));
4108ee592d07SSam Tygier 	}
4109ee592d07SSam Tygier 
41109e967495SFilipe Manana 	if (fs_info->send_in_progress) {
41119e967495SFilipe Manana 		btrfs_warn_rl(fs_info,
41129e967495SFilipe Manana "cannot run balance while send operations are in progress (%d in progress)",
41139e967495SFilipe Manana 			      fs_info->send_in_progress);
41149e967495SFilipe Manana 		ret = -EAGAIN;
41159e967495SFilipe Manana 		goto out;
41169e967495SFilipe Manana 	}
41179e967495SFilipe Manana 
41186bccf3abSJeff Mahoney 	ret = insert_balance_item(fs_info, bctl);
411959641015SIlya Dryomov 	if (ret && ret != -EEXIST)
41200940ebf6SIlya Dryomov 		goto out;
41210940ebf6SIlya Dryomov 
412259641015SIlya Dryomov 	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
412359641015SIlya Dryomov 		BUG_ON(ret == -EEXIST);
4124833aae18SDavid Sterba 		BUG_ON(fs_info->balance_ctl);
4125833aae18SDavid Sterba 		spin_lock(&fs_info->balance_lock);
4126833aae18SDavid Sterba 		fs_info->balance_ctl = bctl;
4127833aae18SDavid Sterba 		spin_unlock(&fs_info->balance_lock);
412859641015SIlya Dryomov 	} else {
412959641015SIlya Dryomov 		BUG_ON(ret != -EEXIST);
413059641015SIlya Dryomov 		spin_lock(&fs_info->balance_lock);
413159641015SIlya Dryomov 		update_balance_args(bctl);
413259641015SIlya Dryomov 		spin_unlock(&fs_info->balance_lock);
413359641015SIlya Dryomov 	}
4134c9e9f97bSIlya Dryomov 
41353009a62fSDavid Sterba 	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
41363009a62fSDavid Sterba 	set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
413756fc37d9SAnand Jain 	describe_balance_start_or_resume(fs_info);
4138c9e9f97bSIlya Dryomov 	mutex_unlock(&fs_info->balance_mutex);
4139c9e9f97bSIlya Dryomov 
4140c9e9f97bSIlya Dryomov 	ret = __btrfs_balance(fs_info);
4141c9e9f97bSIlya Dryomov 
4142c9e9f97bSIlya Dryomov 	mutex_lock(&fs_info->balance_mutex);
41437333bd02SAnand Jain 	if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
41447333bd02SAnand Jain 		btrfs_info(fs_info, "balance: paused");
414544d354abSQu Wenruo 	/*
414644d354abSQu Wenruo 	 * Balance can be canceled by:
414744d354abSQu Wenruo 	 *
414844d354abSQu Wenruo 	 * - Regular cancel request
414944d354abSQu Wenruo 	 *   Then ret == -ECANCELED and balance_cancel_req > 0
415044d354abSQu Wenruo 	 *
415144d354abSQu Wenruo 	 * - Fatal signal to "btrfs" process
415244d354abSQu Wenruo 	 *   Either the signal caught by wait_reserve_ticket() and callers
415344d354abSQu Wenruo 	 *   got -EINTR, or caught by btrfs_should_cancel_balance() and
415444d354abSQu Wenruo 	 *   got -ECANCELED.
415544d354abSQu Wenruo 	 *   Either way, in this case balance_cancel_req = 0, and
415644d354abSQu Wenruo 	 *   ret == -EINTR or ret == -ECANCELED.
415744d354abSQu Wenruo 	 *
415844d354abSQu Wenruo 	 * So here we only check the return value to catch canceled balance.
415944d354abSQu Wenruo 	 */
416044d354abSQu Wenruo 	else if (ret == -ECANCELED || ret == -EINTR)
41617333bd02SAnand Jain 		btrfs_info(fs_info, "balance: canceled");
41627333bd02SAnand Jain 	else
41637333bd02SAnand Jain 		btrfs_info(fs_info, "balance: ended with status: %d", ret);
41647333bd02SAnand Jain 
41653009a62fSDavid Sterba 	clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4166c9e9f97bSIlya Dryomov 
4167c9e9f97bSIlya Dryomov 	if (bargs) {
4168c9e9f97bSIlya Dryomov 		memset(bargs, 0, sizeof(*bargs));
4169008ef096SDavid Sterba 		btrfs_update_ioctl_balance_args(fs_info, bargs);
4170c9e9f97bSIlya Dryomov 	}
4171c9e9f97bSIlya Dryomov 
41723a01aa7aSIlya Dryomov 	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
41733a01aa7aSIlya Dryomov 	    balance_need_close(fs_info)) {
4174149196a2SDavid Sterba 		reset_balance_state(fs_info);
4175c3e1f96cSGoldwyn Rodrigues 		btrfs_exclop_finish(fs_info);
41763a01aa7aSIlya Dryomov 	}
41773a01aa7aSIlya Dryomov 
4178837d5b6eSIlya Dryomov 	wake_up(&fs_info->balance_wait_q);
4179c9e9f97bSIlya Dryomov 
4180c9e9f97bSIlya Dryomov 	return ret;
4181c9e9f97bSIlya Dryomov out:
418259641015SIlya Dryomov 	if (bctl->flags & BTRFS_BALANCE_RESUME)
4183149196a2SDavid Sterba 		reset_balance_state(fs_info);
4184a17c95dfSDavid Sterba 	else
4185c9e9f97bSIlya Dryomov 		kfree(bctl);
4186c3e1f96cSGoldwyn Rodrigues 	btrfs_exclop_finish(fs_info);
4187a17c95dfSDavid Sterba 
41888f18cf13SChris Mason 	return ret;
41898f18cf13SChris Mason }
41908f18cf13SChris Mason 
419159641015SIlya Dryomov static int balance_kthread(void *data)
419259641015SIlya Dryomov {
41932b6ba629SIlya Dryomov 	struct btrfs_fs_info *fs_info = data;
41949555c6c1SIlya Dryomov 	int ret = 0;
419559641015SIlya Dryomov 
419659641015SIlya Dryomov 	mutex_lock(&fs_info->balance_mutex);
419756fc37d9SAnand Jain 	if (fs_info->balance_ctl)
41986fcf6e2bSDavid Sterba 		ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
419959641015SIlya Dryomov 	mutex_unlock(&fs_info->balance_mutex);
42002b6ba629SIlya Dryomov 
420159641015SIlya Dryomov 	return ret;
420259641015SIlya Dryomov }
420359641015SIlya Dryomov 
42042b6ba629SIlya Dryomov int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
42052b6ba629SIlya Dryomov {
42062b6ba629SIlya Dryomov 	struct task_struct *tsk;
42072b6ba629SIlya Dryomov 
42081354e1a1SDavid Sterba 	mutex_lock(&fs_info->balance_mutex);
42092b6ba629SIlya Dryomov 	if (!fs_info->balance_ctl) {
42101354e1a1SDavid Sterba 		mutex_unlock(&fs_info->balance_mutex);
42112b6ba629SIlya Dryomov 		return 0;
42122b6ba629SIlya Dryomov 	}
42131354e1a1SDavid Sterba 	mutex_unlock(&fs_info->balance_mutex);
42142b6ba629SIlya Dryomov 
42153cdde224SJeff Mahoney 	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
42166dac13f8SAnand Jain 		btrfs_info(fs_info, "balance: resume skipped");
42172b6ba629SIlya Dryomov 		return 0;
42182b6ba629SIlya Dryomov 	}
42192b6ba629SIlya Dryomov 
422002ee654dSAnand Jain 	/*
422102ee654dSAnand Jain 	 * A ro->rw remount sequence should continue with the paused balance
422202ee654dSAnand Jain 	 * regardless of who pauses it, system or the user as of now, so set
422302ee654dSAnand Jain 	 * the resume flag.
422402ee654dSAnand Jain 	 */
422502ee654dSAnand Jain 	spin_lock(&fs_info->balance_lock);
422602ee654dSAnand Jain 	fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
422702ee654dSAnand Jain 	spin_unlock(&fs_info->balance_lock);
422802ee654dSAnand Jain 
42292b6ba629SIlya Dryomov 	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4230cd633972SSachin Kamat 	return PTR_ERR_OR_ZERO(tsk);
42312b6ba629SIlya Dryomov }
42322b6ba629SIlya Dryomov 
423368310a5eSIlya Dryomov int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
423459641015SIlya Dryomov {
423559641015SIlya Dryomov 	struct btrfs_balance_control *bctl;
423659641015SIlya Dryomov 	struct btrfs_balance_item *item;
423759641015SIlya Dryomov 	struct btrfs_disk_balance_args disk_bargs;
423859641015SIlya Dryomov 	struct btrfs_path *path;
423959641015SIlya Dryomov 	struct extent_buffer *leaf;
424059641015SIlya Dryomov 	struct btrfs_key key;
424159641015SIlya Dryomov 	int ret;
424259641015SIlya Dryomov 
424359641015SIlya Dryomov 	path = btrfs_alloc_path();
424459641015SIlya Dryomov 	if (!path)
424559641015SIlya Dryomov 		return -ENOMEM;
424659641015SIlya Dryomov 
424768310a5eSIlya Dryomov 	key.objectid = BTRFS_BALANCE_OBJECTID;
4248c479cb4fSDavid Sterba 	key.type = BTRFS_TEMPORARY_ITEM_KEY;
424968310a5eSIlya Dryomov 	key.offset = 0;
425068310a5eSIlya Dryomov 
425168310a5eSIlya Dryomov 	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
425268310a5eSIlya Dryomov 	if (ret < 0)
425368310a5eSIlya Dryomov 		goto out;
425468310a5eSIlya Dryomov 	if (ret > 0) { /* ret = -ENOENT; */
425568310a5eSIlya Dryomov 		ret = 0;
425668310a5eSIlya Dryomov 		goto out;
425768310a5eSIlya Dryomov 	}
425868310a5eSIlya Dryomov 
425959641015SIlya Dryomov 	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
426059641015SIlya Dryomov 	if (!bctl) {
426159641015SIlya Dryomov 		ret = -ENOMEM;
426259641015SIlya Dryomov 		goto out;
426359641015SIlya Dryomov 	}
426459641015SIlya Dryomov 
426559641015SIlya Dryomov 	leaf = path->nodes[0];
426659641015SIlya Dryomov 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
426759641015SIlya Dryomov 
426868310a5eSIlya Dryomov 	bctl->flags = btrfs_balance_flags(leaf, item);
426968310a5eSIlya Dryomov 	bctl->flags |= BTRFS_BALANCE_RESUME;
427059641015SIlya Dryomov 
427159641015SIlya Dryomov 	btrfs_balance_data(leaf, item, &disk_bargs);
427259641015SIlya Dryomov 	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
427359641015SIlya Dryomov 	btrfs_balance_meta(leaf, item, &disk_bargs);
427459641015SIlya Dryomov 	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
427559641015SIlya Dryomov 	btrfs_balance_sys(leaf, item, &disk_bargs);
427659641015SIlya Dryomov 	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
427759641015SIlya Dryomov 
4278eee95e3fSDavid Sterba 	/*
4279eee95e3fSDavid Sterba 	 * This should never happen, as the paused balance state is recovered
4280eee95e3fSDavid Sterba 	 * during mount without any chance of other exclusive ops to collide.
4281eee95e3fSDavid Sterba 	 *
4282eee95e3fSDavid Sterba 	 * This gives the exclusive op status to balance and keeps in paused
4283eee95e3fSDavid Sterba 	 * state until user intervention (cancel or umount). If the ownership
4284eee95e3fSDavid Sterba 	 * cannot be assigned, show a message but do not fail. The balance
4285eee95e3fSDavid Sterba 	 * is in a paused state and must have fs_info::balance_ctl properly
4286eee95e3fSDavid Sterba 	 * set up.
4287eee95e3fSDavid Sterba 	 */
4288c3e1f96cSGoldwyn Rodrigues 	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
4289eee95e3fSDavid Sterba 		btrfs_warn(fs_info,
42906dac13f8SAnand Jain 	"balance: cannot set exclusive op status, resume manually");
4291ed0fb78fSIlya Dryomov 
429268310a5eSIlya Dryomov 	mutex_lock(&fs_info->balance_mutex);
4293833aae18SDavid Sterba 	BUG_ON(fs_info->balance_ctl);
4294833aae18SDavid Sterba 	spin_lock(&fs_info->balance_lock);
4295833aae18SDavid Sterba 	fs_info->balance_ctl = bctl;
4296833aae18SDavid Sterba 	spin_unlock(&fs_info->balance_lock);
429768310a5eSIlya Dryomov 	mutex_unlock(&fs_info->balance_mutex);
429859641015SIlya Dryomov out:
429959641015SIlya Dryomov 	btrfs_free_path(path);
430059641015SIlya Dryomov 	return ret;
430159641015SIlya Dryomov }
430259641015SIlya Dryomov 
4303837d5b6eSIlya Dryomov int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4304837d5b6eSIlya Dryomov {
4305837d5b6eSIlya Dryomov 	int ret = 0;
4306837d5b6eSIlya Dryomov 
4307837d5b6eSIlya Dryomov 	mutex_lock(&fs_info->balance_mutex);
4308837d5b6eSIlya Dryomov 	if (!fs_info->balance_ctl) {
4309837d5b6eSIlya Dryomov 		mutex_unlock(&fs_info->balance_mutex);
4310837d5b6eSIlya Dryomov 		return -ENOTCONN;
4311837d5b6eSIlya Dryomov 	}
4312837d5b6eSIlya Dryomov 
43133009a62fSDavid Sterba 	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4314837d5b6eSIlya Dryomov 		atomic_inc(&fs_info->balance_pause_req);
4315837d5b6eSIlya Dryomov 		mutex_unlock(&fs_info->balance_mutex);
4316837d5b6eSIlya Dryomov 
4317837d5b6eSIlya Dryomov 		wait_event(fs_info->balance_wait_q,
43183009a62fSDavid Sterba 			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4319837d5b6eSIlya Dryomov 
4320837d5b6eSIlya Dryomov 		mutex_lock(&fs_info->balance_mutex);
4321837d5b6eSIlya Dryomov 		/* we are good with balance_ctl ripped off from under us */
43223009a62fSDavid Sterba 		BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4323837d5b6eSIlya Dryomov 		atomic_dec(&fs_info->balance_pause_req);
4324837d5b6eSIlya Dryomov 	} else {
4325837d5b6eSIlya Dryomov 		ret = -ENOTCONN;
4326837d5b6eSIlya Dryomov 	}
4327837d5b6eSIlya Dryomov 
4328837d5b6eSIlya Dryomov 	mutex_unlock(&fs_info->balance_mutex);
4329837d5b6eSIlya Dryomov 	return ret;
4330837d5b6eSIlya Dryomov }
4331837d5b6eSIlya Dryomov 
4332a7e99c69SIlya Dryomov int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4333a7e99c69SIlya Dryomov {
4334a7e99c69SIlya Dryomov 	mutex_lock(&fs_info->balance_mutex);
4335a7e99c69SIlya Dryomov 	if (!fs_info->balance_ctl) {
4336a7e99c69SIlya Dryomov 		mutex_unlock(&fs_info->balance_mutex);
4337a7e99c69SIlya Dryomov 		return -ENOTCONN;
4338a7e99c69SIlya Dryomov 	}
4339a7e99c69SIlya Dryomov 
4340cf7d20f4SDavid Sterba 	/*
4341cf7d20f4SDavid Sterba 	 * A paused balance with the item stored on disk can be resumed at
4342cf7d20f4SDavid Sterba 	 * mount time if the mount is read-write. Otherwise it's still paused
4343cf7d20f4SDavid Sterba 	 * and we must not allow cancelling as it deletes the item.
4344cf7d20f4SDavid Sterba 	 */
4345cf7d20f4SDavid Sterba 	if (sb_rdonly(fs_info->sb)) {
4346cf7d20f4SDavid Sterba 		mutex_unlock(&fs_info->balance_mutex);
4347cf7d20f4SDavid Sterba 		return -EROFS;
4348cf7d20f4SDavid Sterba 	}
4349cf7d20f4SDavid Sterba 
4350a7e99c69SIlya Dryomov 	atomic_inc(&fs_info->balance_cancel_req);
4351a7e99c69SIlya Dryomov 	/*
4352a7e99c69SIlya Dryomov 	 * if we are running just wait and return, balance item is
4353a7e99c69SIlya Dryomov 	 * deleted in btrfs_balance in this case
4354a7e99c69SIlya Dryomov 	 */
43553009a62fSDavid Sterba 	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4356a7e99c69SIlya Dryomov 		mutex_unlock(&fs_info->balance_mutex);
4357a7e99c69SIlya Dryomov 		wait_event(fs_info->balance_wait_q,
43583009a62fSDavid Sterba 			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4359a7e99c69SIlya Dryomov 		mutex_lock(&fs_info->balance_mutex);
4360a7e99c69SIlya Dryomov 	} else {
4361a7e99c69SIlya Dryomov 		mutex_unlock(&fs_info->balance_mutex);
4362dccdb07bSDavid Sterba 		/*
4363dccdb07bSDavid Sterba 		 * Lock released to allow other waiters to continue, we'll
4364dccdb07bSDavid Sterba 		 * reexamine the status again.
4365dccdb07bSDavid Sterba 		 */
4366a7e99c69SIlya Dryomov 		mutex_lock(&fs_info->balance_mutex);
4367a7e99c69SIlya Dryomov 
4368a17c95dfSDavid Sterba 		if (fs_info->balance_ctl) {
4369149196a2SDavid Sterba 			reset_balance_state(fs_info);
4370c3e1f96cSGoldwyn Rodrigues 			btrfs_exclop_finish(fs_info);
43716dac13f8SAnand Jain 			btrfs_info(fs_info, "balance: canceled");
4372a17c95dfSDavid Sterba 		}
4373a7e99c69SIlya Dryomov 	}
4374a7e99c69SIlya Dryomov 
43753009a62fSDavid Sterba 	BUG_ON(fs_info->balance_ctl ||
43763009a62fSDavid Sterba 		test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4377a7e99c69SIlya Dryomov 	atomic_dec(&fs_info->balance_cancel_req);
4378a7e99c69SIlya Dryomov 	mutex_unlock(&fs_info->balance_mutex);
4379a7e99c69SIlya Dryomov 	return 0;
4380a7e99c69SIlya Dryomov }
4381a7e99c69SIlya Dryomov 
/*
 * Thread function (started via kthread_run() in btrfs_create_uuid_tree())
 * that walks the ROOT_ITEM keys of fs_info->tree_root and, for every
 * qualifying root item with a non-empty uuid and/or received_uuid, adds the
 * corresponding entry with btrfs_uuid_tree_add().
 *
 * @data: the struct btrfs_fs_info of the filesystem to scan.
 *
 * The scan stops early when btrfs_fs_closing() reports true. When the scan
 * finishes without error and was not stopped early,
 * BTRFS_FS_UPDATE_UUID_TREE_GEN is set in fs_info->flags. Failures are only
 * logged: the function always releases fs_info->uuid_tree_rescan_sem and
 * returns 0.
 */
438297f4dd09SNikolay Borisov int btrfs_uuid_scan_kthread(void *data)
4383803b2f54SStefan Behrens {
4384803b2f54SStefan Behrens 	struct btrfs_fs_info *fs_info = data;
4385803b2f54SStefan Behrens 	struct btrfs_root *root = fs_info->tree_root;
4386803b2f54SStefan Behrens 	struct btrfs_key key;
4387803b2f54SStefan Behrens 	struct btrfs_path *path = NULL;
4388803b2f54SStefan Behrens 	int ret = 0;
4389803b2f54SStefan Behrens 	struct extent_buffer *eb;
4390803b2f54SStefan Behrens 	int slot;
4391803b2f54SStefan Behrens 	struct btrfs_root_item root_item;
4392803b2f54SStefan Behrens 	u32 item_size;
4393f45388f3SFilipe David Borba Manana 	struct btrfs_trans_handle *trans = NULL;
4394c94bec2cSJosef Bacik 	bool closing = false;
4395803b2f54SStefan Behrens 
4396803b2f54SStefan Behrens 	path = btrfs_alloc_path();
4397803b2f54SStefan Behrens 	if (!path) {
4398803b2f54SStefan Behrens 		ret = -ENOMEM;
4399803b2f54SStefan Behrens 		goto out;
4400803b2f54SStefan Behrens 	}
4401803b2f54SStefan Behrens 
	/* Start the forward search at the smallest possible ROOT_ITEM key. */
4402803b2f54SStefan Behrens 	key.objectid = 0;
4403803b2f54SStefan Behrens 	key.type = BTRFS_ROOT_ITEM_KEY;
4404803b2f54SStefan Behrens 	key.offset = 0;
4405803b2f54SStefan Behrens 
4406803b2f54SStefan Behrens 	while (1) {
		/*
		 * Give up once the filesystem is closing; remembered in
		 * 'closing' so the generation flag is not set at the end.
		 */
4407c94bec2cSJosef Bacik 		if (btrfs_fs_closing(fs_info)) {
4408c94bec2cSJosef Bacik 			closing = true;
4409c94bec2cSJosef Bacik 			break;
4410c94bec2cSJosef Bacik 		}
44117c829b72SAnand Jain 		ret = btrfs_search_forward(root, &key, path,
44127c829b72SAnand Jain 				BTRFS_OLDEST_GENERATION);
		/*
		 * Any non-zero result ends the scan; a positive result is not
		 * treated as an error and is converted to 0.
		 */
4413803b2f54SStefan Behrens 		if (ret) {
4414803b2f54SStefan Behrens 			if (ret > 0)
4415803b2f54SStefan Behrens 				ret = 0;
4416803b2f54SStefan Behrens 			break;
4417803b2f54SStefan Behrens 		}
4418803b2f54SStefan Behrens 
		/*
		 * Only ROOT_ITEM keys whose objectid is the FS tree or lies in
		 * [BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID] are
		 * processed; everything else is skipped.
		 */
4419803b2f54SStefan Behrens 		if (key.type != BTRFS_ROOT_ITEM_KEY ||
4420803b2f54SStefan Behrens 		    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4421803b2f54SStefan Behrens 		     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4422803b2f54SStefan Behrens 		    key.objectid > BTRFS_LAST_FREE_OBJECTID)
4423803b2f54SStefan Behrens 			goto skip;
4424803b2f54SStefan Behrens 
4425803b2f54SStefan Behrens 		eb = path->nodes[0];
4426803b2f54SStefan Behrens 		slot = path->slots[0];
4427803b2f54SStefan Behrens 		item_size = btrfs_item_size_nr(eb, slot);
		/*
		 * Items shorter than struct btrfs_root_item cannot be copied
		 * out in full below, so they are skipped.
		 * NOTE(review): presumably an older on-disk root item format
		 * without uuid fields -- confirm.
		 */
4428803b2f54SStefan Behrens 		if (item_size < sizeof(root_item))
4429803b2f54SStefan Behrens 			goto skip;
4430803b2f54SStefan Behrens 
4431803b2f54SStefan Behrens 		read_extent_buffer(eb, &root_item,
4432803b2f54SStefan Behrens 				   btrfs_item_ptr_offset(eb, slot),
4433803b2f54SStefan Behrens 				   (int)sizeof(root_item));
		/* Root items with a zero reference count are skipped. */
4434803b2f54SStefan Behrens 		if (btrfs_root_refs(&root_item) == 0)
4435803b2f54SStefan Behrens 			goto skip;
4436f45388f3SFilipe David Borba Manana 
4437f45388f3SFilipe David Borba Manana 		if (!btrfs_is_empty_uuid(root_item.uuid) ||
4438f45388f3SFilipe David Borba Manana 		    !btrfs_is_empty_uuid(root_item.received_uuid)) {
			/*
			 * A transaction is already open, i.e. this is the
			 * repeated lookup of the same key: do the update.
			 */
4439f45388f3SFilipe David Borba Manana 			if (trans)
4440f45388f3SFilipe David Borba Manana 				goto update_tree;
4441f45388f3SFilipe David Borba Manana 
			/*
			 * First time this item is seen: release the path, start
			 * a transaction and repeat the search for the same key
			 * ('continue' does not advance the key).
			 */
4442f45388f3SFilipe David Borba Manana 			btrfs_release_path(path);
4443803b2f54SStefan Behrens 			/*
4444803b2f54SStefan Behrens 			 * 1 - subvol uuid item
4445803b2f54SStefan Behrens 			 * 1 - received_subvol uuid item
4446803b2f54SStefan Behrens 			 */
4447803b2f54SStefan Behrens 			trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4448803b2f54SStefan Behrens 			if (IS_ERR(trans)) {
4449803b2f54SStefan Behrens 				ret = PTR_ERR(trans);
4450803b2f54SStefan Behrens 				break;
4451803b2f54SStefan Behrens 			}
4452f45388f3SFilipe David Borba Manana 			continue;
4453f45388f3SFilipe David Borba Manana 		} else {
4454f45388f3SFilipe David Borba Manana 			goto skip;
4455f45388f3SFilipe David Borba Manana 		}
4456f45388f3SFilipe David Borba Manana update_tree:
		/* root_item is a local copy, so the path can be dropped first. */
44579771a5cfSJosef Bacik 		btrfs_release_path(path);
4458f45388f3SFilipe David Borba Manana 		if (!btrfs_is_empty_uuid(root_item.uuid)) {
4459cdb345a8SLu Fengqi 			ret = btrfs_uuid_tree_add(trans, root_item.uuid,
4460803b2f54SStefan Behrens 						  BTRFS_UUID_KEY_SUBVOL,
4461803b2f54SStefan Behrens 						  key.objectid);
4462803b2f54SStefan Behrens 			if (ret < 0) {
4463efe120a0SFrank Holton 				btrfs_warn(fs_info, "uuid_tree_add failed %d",
4464803b2f54SStefan Behrens 					ret);
4465803b2f54SStefan Behrens 				break;
4466803b2f54SStefan Behrens 			}
4467803b2f54SStefan Behrens 		}
4468803b2f54SStefan Behrens 
4469803b2f54SStefan Behrens 		if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4470cdb345a8SLu Fengqi 			ret = btrfs_uuid_tree_add(trans,
4471803b2f54SStefan Behrens 						  root_item.received_uuid,
4472803b2f54SStefan Behrens 						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4473803b2f54SStefan Behrens 						  key.objectid);
4474803b2f54SStefan Behrens 			if (ret < 0) {
4475efe120a0SFrank Holton 				btrfs_warn(fs_info, "uuid_tree_add failed %d",
4476803b2f54SStefan Behrens 					ret);
4477803b2f54SStefan Behrens 				break;
4478803b2f54SStefan Behrens 			}
4479803b2f54SStefan Behrens 		}
4480803b2f54SStefan Behrens 
4481f45388f3SFilipe David Borba Manana skip:
44829771a5cfSJosef Bacik 		btrfs_release_path(path);
		/* End the per-item transaction, if one was opened. */
4483803b2f54SStefan Behrens 		if (trans) {
44843a45bb20SJeff Mahoney 			ret = btrfs_end_transaction(trans);
4485f45388f3SFilipe David Borba Manana 			trans = NULL;
4486803b2f54SStefan Behrens 			if (ret)
4487803b2f54SStefan Behrens 				break;
4488803b2f54SStefan Behrens 		}
4489803b2f54SStefan Behrens 
		/*
		 * Advance to the next possible key: bump offset first, then
		 * type, then objectid; stop when the key space is exhausted.
		 */
4490803b2f54SStefan Behrens 		if (key.offset < (u64)-1) {
4491803b2f54SStefan Behrens 			key.offset++;
4492803b2f54SStefan Behrens 		} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4493803b2f54SStefan Behrens 			key.offset = 0;
4494803b2f54SStefan Behrens 			key.type = BTRFS_ROOT_ITEM_KEY;
4495803b2f54SStefan Behrens 		} else if (key.objectid < (u64)-1) {
4496803b2f54SStefan Behrens 			key.offset = 0;
4497803b2f54SStefan Behrens 			key.type = BTRFS_ROOT_ITEM_KEY;
4498803b2f54SStefan Behrens 			key.objectid++;
4499803b2f54SStefan Behrens 		} else {
4500803b2f54SStefan Behrens 			break;
4501803b2f54SStefan Behrens 		}
4502803b2f54SStefan Behrens 		cond_resched();
4503803b2f54SStefan Behrens 	}
4504803b2f54SStefan Behrens 
4505803b2f54SStefan Behrens out:
4506803b2f54SStefan Behrens 	btrfs_free_path(path);
	/*
	 * trans can be NULL, a live handle (loop left via break), or an
	 * ERR_PTR when btrfs_start_transaction() failed above.
	 */
4507f45388f3SFilipe David Borba Manana 	if (trans && !IS_ERR(trans))
45083a45bb20SJeff Mahoney 		btrfs_end_transaction(trans);
4509803b2f54SStefan Behrens 	if (ret)
4510efe120a0SFrank Holton 		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4511c94bec2cSJosef Bacik 	else if (!closing)
4512afcdd129SJosef Bacik 		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
	/* Pairs with the down() done before this thread was started. */
4513803b2f54SStefan Behrens 	up(&fs_info->uuid_tree_rescan_sem);
4514803b2f54SStefan Behrens 	return 0;
4515803b2f54SStefan Behrens }
4516803b2f54SStefan Behrens 
4517f7a81ea4SStefan Behrens int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4518f7a81ea4SStefan Behrens {
4519f7a81ea4SStefan Behrens 	struct btrfs_trans_handle *trans;
4520f7a81ea4SStefan Behrens 	struct btrfs_root *tree_root = fs_info->tree_root;
4521f7a81ea4SStefan Behrens 	struct btrfs_root *uuid_root;
4522803b2f54SStefan Behrens 	struct task_struct *task;
4523803b2f54SStefan Behrens 	int ret;
4524f7a81ea4SStefan Behrens 
4525f7a81ea4SStefan Behrens 	/*
4526f7a81ea4SStefan Behrens 	 * 1 - root node
4527f7a81ea4SStefan Behrens 	 * 1 - root item
4528f7a81ea4SStefan Behrens 	 */
4529f7a81ea4SStefan Behrens 	trans = btrfs_start_transaction(tree_root, 2);
4530f7a81ea4SStefan Behrens 	if (IS_ERR(trans))
4531f7a81ea4SStefan Behrens 		return PTR_ERR(trans);
4532f7a81ea4SStefan Behrens 
45339b7a2440SDavid Sterba 	uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4534f7a81ea4SStefan Behrens 	if (IS_ERR(uuid_root)) {
45356d13f549SDavid Sterba 		ret = PTR_ERR(uuid_root);
453666642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
45373a45bb20SJeff Mahoney 		btrfs_end_transaction(trans);
45386d13f549SDavid Sterba 		return ret;
4539f7a81ea4SStefan Behrens 	}
4540f7a81ea4SStefan Behrens 
4541f7a81ea4SStefan Behrens 	fs_info->uuid_root = uuid_root;
4542f7a81ea4SStefan Behrens 
45433a45bb20SJeff Mahoney 	ret = btrfs_commit_transaction(trans);
4544803b2f54SStefan Behrens 	if (ret)
4545803b2f54SStefan Behrens 		return ret;
4546803b2f54SStefan Behrens 
4547803b2f54SStefan Behrens 	down(&fs_info->uuid_tree_rescan_sem);
4548803b2f54SStefan Behrens 	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4549803b2f54SStefan Behrens 	if (IS_ERR(task)) {
455070f80175SStefan Behrens 		/* fs_info->update_uuid_tree_gen remains 0 in all error case */
4551efe120a0SFrank Holton 		btrfs_warn(fs_info, "failed to start uuid_scan task");
4552803b2f54SStefan Behrens 		up(&fs_info->uuid_tree_rescan_sem);
4553803b2f54SStefan Behrens 		return PTR_ERR(task);
4554f7a81ea4SStefan Behrens 	}
4555803b2f54SStefan Behrens 
4556803b2f54SStefan Behrens 	return 0;
4557803b2f54SStefan Behrens }
4558803b2f54SStefan Behrens 
45598f18cf13SChris Mason /*
45608f18cf13SChris Mason  * shrinking a device means finding all of the device extents past
45618f18cf13SChris Mason  * the new size, and then following the back refs to the chunks.
45628f18cf13SChris Mason  * The chunk relocation code actually frees the device extent
45638f18cf13SChris Mason  */
45648f18cf13SChris Mason int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
45658f18cf13SChris Mason {
45660b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = device->fs_info;
45670b246afaSJeff Mahoney 	struct btrfs_root *root = fs_info->dev_root;
45688f18cf13SChris Mason 	struct btrfs_trans_handle *trans;
45698f18cf13SChris Mason 	struct btrfs_dev_extent *dev_extent = NULL;
45708f18cf13SChris Mason 	struct btrfs_path *path;
45718f18cf13SChris Mason 	u64 length;
45728f18cf13SChris Mason 	u64 chunk_offset;
45738f18cf13SChris Mason 	int ret;
45748f18cf13SChris Mason 	int slot;
4575ba1bf481SJosef Bacik 	int failed = 0;
4576ba1bf481SJosef Bacik 	bool retried = false;
45778f18cf13SChris Mason 	struct extent_buffer *l;
45788f18cf13SChris Mason 	struct btrfs_key key;
45790b246afaSJeff Mahoney 	struct btrfs_super_block *super_copy = fs_info->super_copy;
45808f18cf13SChris Mason 	u64 old_total = btrfs_super_total_bytes(super_copy);
45817cc8e58dSMiao Xie 	u64 old_size = btrfs_device_get_total_bytes(device);
45827dfb8be1SNikolay Borisov 	u64 diff;
458361d0d0d2SNikolay Borisov 	u64 start;
45847dfb8be1SNikolay Borisov 
45857dfb8be1SNikolay Borisov 	new_size = round_down(new_size, fs_info->sectorsize);
458661d0d0d2SNikolay Borisov 	start = new_size;
45870e4324a4SNikolay Borisov 	diff = round_down(old_size - new_size, fs_info->sectorsize);
45888f18cf13SChris Mason 
4589401e29c1SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
459063a212abSStefan Behrens 		return -EINVAL;
459163a212abSStefan Behrens 
45928f18cf13SChris Mason 	path = btrfs_alloc_path();
45938f18cf13SChris Mason 	if (!path)
45948f18cf13SChris Mason 		return -ENOMEM;
45958f18cf13SChris Mason 
45960338dff6SGu Jinxiang 	path->reada = READA_BACK;
45978f18cf13SChris Mason 
459861d0d0d2SNikolay Borisov 	trans = btrfs_start_transaction(root, 0);
459961d0d0d2SNikolay Borisov 	if (IS_ERR(trans)) {
460061d0d0d2SNikolay Borisov 		btrfs_free_path(path);
460161d0d0d2SNikolay Borisov 		return PTR_ERR(trans);
460261d0d0d2SNikolay Borisov 	}
460361d0d0d2SNikolay Borisov 
460434441361SDavid Sterba 	mutex_lock(&fs_info->chunk_mutex);
46057d9eb12cSChris Mason 
46067cc8e58dSMiao Xie 	btrfs_device_set_total_bytes(device, new_size);
4607ebbede42SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
46082b82032cSYan Zheng 		device->fs_devices->total_rw_bytes -= diff;
4609a5ed45f8SNikolay Borisov 		atomic64_sub(diff, &fs_info->free_chunk_space);
46102bf64758SJosef Bacik 	}
461161d0d0d2SNikolay Borisov 
461261d0d0d2SNikolay Borisov 	/*
461361d0d0d2SNikolay Borisov 	 * Once the device's size has been set to the new size, ensure all
461461d0d0d2SNikolay Borisov 	 * in-memory chunks are synced to disk so that the loop below sees them
461561d0d0d2SNikolay Borisov 	 * and relocates them accordingly.
461661d0d0d2SNikolay Borisov 	 */
46171c11b63eSJeff Mahoney 	if (contains_pending_extent(device, &start, diff)) {
461834441361SDavid Sterba 		mutex_unlock(&fs_info->chunk_mutex);
461961d0d0d2SNikolay Borisov 		ret = btrfs_commit_transaction(trans);
462061d0d0d2SNikolay Borisov 		if (ret)
462161d0d0d2SNikolay Borisov 			goto done;
462261d0d0d2SNikolay Borisov 	} else {
462361d0d0d2SNikolay Borisov 		mutex_unlock(&fs_info->chunk_mutex);
462461d0d0d2SNikolay Borisov 		btrfs_end_transaction(trans);
462561d0d0d2SNikolay Borisov 	}
46268f18cf13SChris Mason 
4627ba1bf481SJosef Bacik again:
46288f18cf13SChris Mason 	key.objectid = device->devid;
46298f18cf13SChris Mason 	key.offset = (u64)-1;
46308f18cf13SChris Mason 	key.type = BTRFS_DEV_EXTENT_KEY;
46318f18cf13SChris Mason 
4632213e64daSIlya Dryomov 	do {
46330b246afaSJeff Mahoney 		mutex_lock(&fs_info->delete_unused_bgs_mutex);
46348f18cf13SChris Mason 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
463567c5e7d4SFilipe Manana 		if (ret < 0) {
46360b246afaSJeff Mahoney 			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
46378f18cf13SChris Mason 			goto done;
463867c5e7d4SFilipe Manana 		}
46398f18cf13SChris Mason 
46408f18cf13SChris Mason 		ret = btrfs_previous_item(root, path, 0, key.type);
464167c5e7d4SFilipe Manana 		if (ret)
46420b246afaSJeff Mahoney 			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
46438f18cf13SChris Mason 		if (ret < 0)
46448f18cf13SChris Mason 			goto done;
46458f18cf13SChris Mason 		if (ret) {
46468f18cf13SChris Mason 			ret = 0;
4647b3b4aa74SDavid Sterba 			btrfs_release_path(path);
4648bf1fb512SYan Zheng 			break;
46498f18cf13SChris Mason 		}
46508f18cf13SChris Mason 
46518f18cf13SChris Mason 		l = path->nodes[0];
46528f18cf13SChris Mason 		slot = path->slots[0];
46538f18cf13SChris Mason 		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
46548f18cf13SChris Mason 
4655ba1bf481SJosef Bacik 		if (key.objectid != device->devid) {
46560b246afaSJeff Mahoney 			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4657b3b4aa74SDavid Sterba 			btrfs_release_path(path);
4658bf1fb512SYan Zheng 			break;
4659ba1bf481SJosef Bacik 		}
46608f18cf13SChris Mason 
46618f18cf13SChris Mason 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
46628f18cf13SChris Mason 		length = btrfs_dev_extent_length(l, dev_extent);
46638f18cf13SChris Mason 
4664ba1bf481SJosef Bacik 		if (key.offset + length <= new_size) {
46650b246afaSJeff Mahoney 			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4666b3b4aa74SDavid Sterba 			btrfs_release_path(path);
4667d6397baeSChris Ball 			break;
4668ba1bf481SJosef Bacik 		}
46698f18cf13SChris Mason 
46708f18cf13SChris Mason 		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4671b3b4aa74SDavid Sterba 		btrfs_release_path(path);
46728f18cf13SChris Mason 
4673a6f93c71SLiu Bo 		/*
4674a6f93c71SLiu Bo 		 * We may be relocating the only data chunk we have,
4675a6f93c71SLiu Bo 		 * which could potentially end up with losing data's
4676a6f93c71SLiu Bo 		 * raid profile, so lets allocate an empty one in
4677a6f93c71SLiu Bo 		 * advance.
4678a6f93c71SLiu Bo 		 */
4679a6f93c71SLiu Bo 		ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4680a6f93c71SLiu Bo 		if (ret < 0) {
4681a6f93c71SLiu Bo 			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4682a6f93c71SLiu Bo 			goto done;
4683a6f93c71SLiu Bo 		}
4684a6f93c71SLiu Bo 
46850b246afaSJeff Mahoney 		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
46860b246afaSJeff Mahoney 		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4687eede2bf3SOmar Sandoval 		if (ret == -ENOSPC) {
4688ba1bf481SJosef Bacik 			failed++;
4689eede2bf3SOmar Sandoval 		} else if (ret) {
4690eede2bf3SOmar Sandoval 			if (ret == -ETXTBSY) {
4691eede2bf3SOmar Sandoval 				btrfs_warn(fs_info,
4692eede2bf3SOmar Sandoval 		   "could not shrink block group %llu due to active swapfile",
4693eede2bf3SOmar Sandoval 					   chunk_offset);
4694eede2bf3SOmar Sandoval 			}
4695eede2bf3SOmar Sandoval 			goto done;
4696eede2bf3SOmar Sandoval 		}
4697213e64daSIlya Dryomov 	} while (key.offset-- > 0);
4698ba1bf481SJosef Bacik 
4699ba1bf481SJosef Bacik 	if (failed && !retried) {
4700ba1bf481SJosef Bacik 		failed = 0;
4701ba1bf481SJosef Bacik 		retried = true;
4702ba1bf481SJosef Bacik 		goto again;
4703ba1bf481SJosef Bacik 	} else if (failed && retried) {
4704ba1bf481SJosef Bacik 		ret = -ENOSPC;
47058f18cf13SChris Mason 		goto done;
47068f18cf13SChris Mason 	}
47078f18cf13SChris Mason 
4708d6397baeSChris Ball 	/* Shrinking succeeded, else we would be at "done". */
4709a22285a6SYan, Zheng 	trans = btrfs_start_transaction(root, 0);
471098d5dc13STsutomu Itoh 	if (IS_ERR(trans)) {
471198d5dc13STsutomu Itoh 		ret = PTR_ERR(trans);
471298d5dc13STsutomu Itoh 		goto done;
471398d5dc13STsutomu Itoh 	}
471498d5dc13STsutomu Itoh 
471534441361SDavid Sterba 	mutex_lock(&fs_info->chunk_mutex);
4716c57dd1f2SQu Wenruo 	/* Clear all state bits beyond the shrunk device size */
4717c57dd1f2SQu Wenruo 	clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4718c57dd1f2SQu Wenruo 			  CHUNK_STATE_MASK);
4719c57dd1f2SQu Wenruo 
47207cc8e58dSMiao Xie 	btrfs_device_set_disk_total_bytes(device, new_size);
4721bbbf7243SNikolay Borisov 	if (list_empty(&device->post_commit_list))
4722bbbf7243SNikolay Borisov 		list_add_tail(&device->post_commit_list,
4723bbbf7243SNikolay Borisov 			      &trans->transaction->dev_update_list);
4724d6397baeSChris Ball 
4725d6397baeSChris Ball 	WARN_ON(diff > old_total);
47267dfb8be1SNikolay Borisov 	btrfs_set_super_total_bytes(super_copy,
47277dfb8be1SNikolay Borisov 			round_down(old_total - diff, fs_info->sectorsize));
472834441361SDavid Sterba 	mutex_unlock(&fs_info->chunk_mutex);
47292196d6e8SMiao Xie 
47302196d6e8SMiao Xie 	/* Now btrfs_update_device() will change the on-disk size. */
47312196d6e8SMiao Xie 	ret = btrfs_update_device(trans, device);
4732801660b0SAnand Jain 	if (ret < 0) {
4733801660b0SAnand Jain 		btrfs_abort_transaction(trans, ret);
47343a45bb20SJeff Mahoney 		btrfs_end_transaction(trans);
4735801660b0SAnand Jain 	} else {
4736801660b0SAnand Jain 		ret = btrfs_commit_transaction(trans);
4737801660b0SAnand Jain 	}
47388f18cf13SChris Mason done:
47398f18cf13SChris Mason 	btrfs_free_path(path);
474053e489bcSFilipe Manana 	if (ret) {
474134441361SDavid Sterba 		mutex_lock(&fs_info->chunk_mutex);
474253e489bcSFilipe Manana 		btrfs_device_set_total_bytes(device, old_size);
4743ebbede42SAnand Jain 		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
474453e489bcSFilipe Manana 			device->fs_devices->total_rw_bytes += diff;
4745a5ed45f8SNikolay Borisov 		atomic64_add(diff, &fs_info->free_chunk_space);
474634441361SDavid Sterba 		mutex_unlock(&fs_info->chunk_mutex);
474753e489bcSFilipe Manana 	}
47488f18cf13SChris Mason 	return ret;
47498f18cf13SChris Mason }
47508f18cf13SChris Mason 
47512ff7e61eSJeff Mahoney static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
47520b86a832SChris Mason 			   struct btrfs_key *key,
47530b86a832SChris Mason 			   struct btrfs_chunk *chunk, int item_size)
47540b86a832SChris Mason {
47550b246afaSJeff Mahoney 	struct btrfs_super_block *super_copy = fs_info->super_copy;
47560b86a832SChris Mason 	struct btrfs_disk_key disk_key;
47570b86a832SChris Mason 	u32 array_size;
47580b86a832SChris Mason 	u8 *ptr;
47590b86a832SChris Mason 
476034441361SDavid Sterba 	mutex_lock(&fs_info->chunk_mutex);
47610b86a832SChris Mason 	array_size = btrfs_super_sys_array_size(super_copy);
47625f43f86eSGui Hecheng 	if (array_size + item_size + sizeof(disk_key)
4763fe48a5c0SMiao Xie 			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
476434441361SDavid Sterba 		mutex_unlock(&fs_info->chunk_mutex);
47650b86a832SChris Mason 		return -EFBIG;
4766fe48a5c0SMiao Xie 	}
47670b86a832SChris Mason 
47680b86a832SChris Mason 	ptr = super_copy->sys_chunk_array + array_size;
47690b86a832SChris Mason 	btrfs_cpu_key_to_disk(&disk_key, key);
47700b86a832SChris Mason 	memcpy(ptr, &disk_key, sizeof(disk_key));
47710b86a832SChris Mason 	ptr += sizeof(disk_key);
47720b86a832SChris Mason 	memcpy(ptr, chunk, item_size);
47730b86a832SChris Mason 	item_size += sizeof(disk_key);
47740b86a832SChris Mason 	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
477534441361SDavid Sterba 	mutex_unlock(&fs_info->chunk_mutex);
4776fe48a5c0SMiao Xie 
47770b86a832SChris Mason 	return 0;
47780b86a832SChris Mason }
47790b86a832SChris Mason 
47809f680ce0SChris Mason /*
478173c5de00SArne Jansen  * sort the devices in descending order by max_avail, total_avail
47829f680ce0SChris Mason  */
478373c5de00SArne Jansen static int btrfs_cmp_device_info(const void *a, const void *b)
47842b82032cSYan Zheng {
478573c5de00SArne Jansen 	const struct btrfs_device_info *di_a = a;
478673c5de00SArne Jansen 	const struct btrfs_device_info *di_b = b;
47872b82032cSYan Zheng 
478873c5de00SArne Jansen 	if (di_a->max_avail > di_b->max_avail)
4789a40a90a0SChris Mason 		return -1;
479073c5de00SArne Jansen 	if (di_a->max_avail < di_b->max_avail)
47919b3f68b9SChris Mason 		return 1;
479273c5de00SArne Jansen 	if (di_a->total_avail > di_b->total_avail)
479373c5de00SArne Jansen 		return -1;
479473c5de00SArne Jansen 	if (di_a->total_avail < di_b->total_avail)
479573c5de00SArne Jansen 		return 1;
4796b2117a39SMiao Xie 	return 0;
4797b2117a39SMiao Xie }
4798b2117a39SMiao Xie 
479953b381b3SDavid Woodhouse static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
480053b381b3SDavid Woodhouse {
4801ffe2d203SZhao Lei 	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
480253b381b3SDavid Woodhouse 		return;
480353b381b3SDavid Woodhouse 
4804ceda0864SMiao Xie 	btrfs_set_fs_incompat(info, RAID56);
480553b381b3SDavid Woodhouse }
480653b381b3SDavid Woodhouse 
4807cfbb825cSDavid Sterba static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
4808cfbb825cSDavid Sterba {
4809cfbb825cSDavid Sterba 	if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
4810cfbb825cSDavid Sterba 		return;
4811cfbb825cSDavid Sterba 
4812cfbb825cSDavid Sterba 	btrfs_set_fs_incompat(info, RAID1C34);
4813cfbb825cSDavid Sterba }
4814cfbb825cSDavid Sterba 
48154f2bafe8SNaohiro Aota /*
48164f2bafe8SNaohiro Aota  * Structure used internally for __btrfs_alloc_chunk() function.
48174f2bafe8SNaohiro Aota  * Wraps needed parameters.
48184f2bafe8SNaohiro Aota  */
48194f2bafe8SNaohiro Aota struct alloc_chunk_ctl {
48204f2bafe8SNaohiro Aota 	u64 start;
48214f2bafe8SNaohiro Aota 	u64 type;
48224f2bafe8SNaohiro Aota 	/* Total number of stripes to allocate */
48234f2bafe8SNaohiro Aota 	int num_stripes;
48244f2bafe8SNaohiro Aota 	/* sub_stripes info for map */
48254f2bafe8SNaohiro Aota 	int sub_stripes;
48264f2bafe8SNaohiro Aota 	/* Stripes per device */
48274f2bafe8SNaohiro Aota 	int dev_stripes;
48284f2bafe8SNaohiro Aota 	/* Maximum number of devices to use */
48294f2bafe8SNaohiro Aota 	int devs_max;
48304f2bafe8SNaohiro Aota 	/* Minimum number of devices to use */
48314f2bafe8SNaohiro Aota 	int devs_min;
48324f2bafe8SNaohiro Aota 	/* ndevs has to be a multiple of this */
48334f2bafe8SNaohiro Aota 	int devs_increment;
48344f2bafe8SNaohiro Aota 	/* Number of copies */
48354f2bafe8SNaohiro Aota 	int ncopies;
48364f2bafe8SNaohiro Aota 	/* Number of stripes worth of bytes to store parity information */
48374f2bafe8SNaohiro Aota 	int nparity;
48384f2bafe8SNaohiro Aota 	u64 max_stripe_size;
48394f2bafe8SNaohiro Aota 	u64 max_chunk_size;
48406aafb303SNaohiro Aota 	u64 dev_extent_min;
48414f2bafe8SNaohiro Aota 	u64 stripe_size;
48424f2bafe8SNaohiro Aota 	u64 chunk_size;
48434f2bafe8SNaohiro Aota 	int ndevs;
48444f2bafe8SNaohiro Aota };
48454f2bafe8SNaohiro Aota 
484627c314d5SNaohiro Aota static void init_alloc_chunk_ctl_policy_regular(
484727c314d5SNaohiro Aota 				struct btrfs_fs_devices *fs_devices,
484827c314d5SNaohiro Aota 				struct alloc_chunk_ctl *ctl)
484927c314d5SNaohiro Aota {
485027c314d5SNaohiro Aota 	u64 type = ctl->type;
485127c314d5SNaohiro Aota 
485227c314d5SNaohiro Aota 	if (type & BTRFS_BLOCK_GROUP_DATA) {
485327c314d5SNaohiro Aota 		ctl->max_stripe_size = SZ_1G;
485427c314d5SNaohiro Aota 		ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
485527c314d5SNaohiro Aota 	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
485627c314d5SNaohiro Aota 		/* For larger filesystems, use larger metadata chunks */
485727c314d5SNaohiro Aota 		if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
485827c314d5SNaohiro Aota 			ctl->max_stripe_size = SZ_1G;
485927c314d5SNaohiro Aota 		else
486027c314d5SNaohiro Aota 			ctl->max_stripe_size = SZ_256M;
486127c314d5SNaohiro Aota 		ctl->max_chunk_size = ctl->max_stripe_size;
486227c314d5SNaohiro Aota 	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
486327c314d5SNaohiro Aota 		ctl->max_stripe_size = SZ_32M;
486427c314d5SNaohiro Aota 		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
486527c314d5SNaohiro Aota 		ctl->devs_max = min_t(int, ctl->devs_max,
486627c314d5SNaohiro Aota 				      BTRFS_MAX_DEVS_SYS_CHUNK);
486727c314d5SNaohiro Aota 	} else {
486827c314d5SNaohiro Aota 		BUG();
486927c314d5SNaohiro Aota 	}
487027c314d5SNaohiro Aota 
487127c314d5SNaohiro Aota 	/* We don't want a chunk larger than 10% of writable space */
487227c314d5SNaohiro Aota 	ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
487327c314d5SNaohiro Aota 				  ctl->max_chunk_size);
48746aafb303SNaohiro Aota 	ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
487527c314d5SNaohiro Aota }
487627c314d5SNaohiro Aota 
487727c314d5SNaohiro Aota static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
487827c314d5SNaohiro Aota 				 struct alloc_chunk_ctl *ctl)
487927c314d5SNaohiro Aota {
488027c314d5SNaohiro Aota 	int index = btrfs_bg_flags_to_raid_index(ctl->type);
488127c314d5SNaohiro Aota 
488227c314d5SNaohiro Aota 	ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
488327c314d5SNaohiro Aota 	ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
488427c314d5SNaohiro Aota 	ctl->devs_max = btrfs_raid_array[index].devs_max;
488527c314d5SNaohiro Aota 	if (!ctl->devs_max)
488627c314d5SNaohiro Aota 		ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
488727c314d5SNaohiro Aota 	ctl->devs_min = btrfs_raid_array[index].devs_min;
488827c314d5SNaohiro Aota 	ctl->devs_increment = btrfs_raid_array[index].devs_increment;
488927c314d5SNaohiro Aota 	ctl->ncopies = btrfs_raid_array[index].ncopies;
489027c314d5SNaohiro Aota 	ctl->nparity = btrfs_raid_array[index].nparity;
489127c314d5SNaohiro Aota 	ctl->ndevs = 0;
489227c314d5SNaohiro Aota 
489327c314d5SNaohiro Aota 	switch (fs_devices->chunk_alloc_policy) {
489427c314d5SNaohiro Aota 	case BTRFS_CHUNK_ALLOC_REGULAR:
489527c314d5SNaohiro Aota 		init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
489627c314d5SNaohiro Aota 		break;
489727c314d5SNaohiro Aota 	default:
489827c314d5SNaohiro Aota 		BUG();
489927c314d5SNaohiro Aota 	}
490027c314d5SNaohiro Aota }
490127c314d5SNaohiro Aota 
4902560156cbSNaohiro Aota static int gather_device_info(struct btrfs_fs_devices *fs_devices,
4903560156cbSNaohiro Aota 			      struct alloc_chunk_ctl *ctl,
4904560156cbSNaohiro Aota 			      struct btrfs_device_info *devices_info)
4905560156cbSNaohiro Aota {
4906560156cbSNaohiro Aota 	struct btrfs_fs_info *info = fs_devices->fs_info;
4907560156cbSNaohiro Aota 	struct btrfs_device *device;
4908560156cbSNaohiro Aota 	u64 total_avail;
4909560156cbSNaohiro Aota 	u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
4910560156cbSNaohiro Aota 	int ret;
4911560156cbSNaohiro Aota 	int ndevs = 0;
4912560156cbSNaohiro Aota 	u64 max_avail;
4913560156cbSNaohiro Aota 	u64 dev_offset;
4914560156cbSNaohiro Aota 
4915560156cbSNaohiro Aota 	/*
4916560156cbSNaohiro Aota 	 * in the first pass through the devices list, we gather information
4917560156cbSNaohiro Aota 	 * about the available holes on each device.
4918560156cbSNaohiro Aota 	 */
4919560156cbSNaohiro Aota 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
4920560156cbSNaohiro Aota 		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4921560156cbSNaohiro Aota 			WARN(1, KERN_ERR
4922560156cbSNaohiro Aota 			       "BTRFS: read-only device in alloc_list\n");
4923560156cbSNaohiro Aota 			continue;
4924560156cbSNaohiro Aota 		}
4925560156cbSNaohiro Aota 
4926560156cbSNaohiro Aota 		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
4927560156cbSNaohiro Aota 					&device->dev_state) ||
4928560156cbSNaohiro Aota 		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4929560156cbSNaohiro Aota 			continue;
4930560156cbSNaohiro Aota 
4931560156cbSNaohiro Aota 		if (device->total_bytes > device->bytes_used)
4932560156cbSNaohiro Aota 			total_avail = device->total_bytes - device->bytes_used;
4933560156cbSNaohiro Aota 		else
4934560156cbSNaohiro Aota 			total_avail = 0;
4935560156cbSNaohiro Aota 
4936560156cbSNaohiro Aota 		/* If there is no space on this device, skip it. */
49376aafb303SNaohiro Aota 		if (total_avail < ctl->dev_extent_min)
4938560156cbSNaohiro Aota 			continue;
4939560156cbSNaohiro Aota 
4940560156cbSNaohiro Aota 		ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
4941560156cbSNaohiro Aota 					   &max_avail);
4942560156cbSNaohiro Aota 		if (ret && ret != -ENOSPC)
4943560156cbSNaohiro Aota 			return ret;
4944560156cbSNaohiro Aota 
4945560156cbSNaohiro Aota 		if (ret == 0)
4946560156cbSNaohiro Aota 			max_avail = dev_extent_want;
4947560156cbSNaohiro Aota 
49486aafb303SNaohiro Aota 		if (max_avail < ctl->dev_extent_min) {
4949560156cbSNaohiro Aota 			if (btrfs_test_opt(info, ENOSPC_DEBUG))
4950560156cbSNaohiro Aota 				btrfs_debug(info,
4951560156cbSNaohiro Aota 			"%s: devid %llu has no free space, have=%llu want=%llu",
4952560156cbSNaohiro Aota 					    __func__, device->devid, max_avail,
49536aafb303SNaohiro Aota 					    ctl->dev_extent_min);
4954560156cbSNaohiro Aota 			continue;
4955560156cbSNaohiro Aota 		}
4956560156cbSNaohiro Aota 
4957560156cbSNaohiro Aota 		if (ndevs == fs_devices->rw_devices) {
4958560156cbSNaohiro Aota 			WARN(1, "%s: found more than %llu devices\n",
4959560156cbSNaohiro Aota 			     __func__, fs_devices->rw_devices);
4960560156cbSNaohiro Aota 			break;
4961560156cbSNaohiro Aota 		}
4962560156cbSNaohiro Aota 		devices_info[ndevs].dev_offset = dev_offset;
4963560156cbSNaohiro Aota 		devices_info[ndevs].max_avail = max_avail;
4964560156cbSNaohiro Aota 		devices_info[ndevs].total_avail = total_avail;
4965560156cbSNaohiro Aota 		devices_info[ndevs].dev = device;
4966560156cbSNaohiro Aota 		++ndevs;
4967560156cbSNaohiro Aota 	}
4968560156cbSNaohiro Aota 	ctl->ndevs = ndevs;
4969560156cbSNaohiro Aota 
4970560156cbSNaohiro Aota 	/*
4971560156cbSNaohiro Aota 	 * now sort the devices by hole size / available space
4972560156cbSNaohiro Aota 	 */
4973560156cbSNaohiro Aota 	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
4974560156cbSNaohiro Aota 	     btrfs_cmp_device_info, NULL);
4975560156cbSNaohiro Aota 
4976560156cbSNaohiro Aota 	return 0;
4977560156cbSNaohiro Aota }
4978560156cbSNaohiro Aota 
49795badf512SNaohiro Aota static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
49805badf512SNaohiro Aota 				      struct btrfs_device_info *devices_info)
49815badf512SNaohiro Aota {
49825badf512SNaohiro Aota 	/* Number of stripes that count for block group size */
49835badf512SNaohiro Aota 	int data_stripes;
49845badf512SNaohiro Aota 
49855badf512SNaohiro Aota 	/*
49865badf512SNaohiro Aota 	 * The primary goal is to maximize the number of stripes, so use as
49875badf512SNaohiro Aota 	 * many devices as possible, even if the stripes are not maximum sized.
49885badf512SNaohiro Aota 	 *
49895badf512SNaohiro Aota 	 * The DUP profile stores more than one stripe per device, the
49905badf512SNaohiro Aota 	 * max_avail is the total size so we have to adjust.
49915badf512SNaohiro Aota 	 */
49925badf512SNaohiro Aota 	ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
49935badf512SNaohiro Aota 				   ctl->dev_stripes);
49945badf512SNaohiro Aota 	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
49955badf512SNaohiro Aota 
49965badf512SNaohiro Aota 	/* This will have to be fixed for RAID1 and RAID10 over more drives */
49975badf512SNaohiro Aota 	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
49985badf512SNaohiro Aota 
49995badf512SNaohiro Aota 	/*
50005badf512SNaohiro Aota 	 * Use the number of data stripes to figure out how big this chunk is
50015badf512SNaohiro Aota 	 * really going to be in terms of logical address space, and compare
50025badf512SNaohiro Aota 	 * that answer with the max chunk size. If it's higher, we try to
50035badf512SNaohiro Aota 	 * reduce stripe_size.
50045badf512SNaohiro Aota 	 */
50055badf512SNaohiro Aota 	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
50065badf512SNaohiro Aota 		/*
50075badf512SNaohiro Aota 		 * Reduce stripe_size, round it up to a 16MB boundary again and
50085badf512SNaohiro Aota 		 * then use it, unless it ends up being even bigger than the
50095badf512SNaohiro Aota 		 * previous value we had already.
50105badf512SNaohiro Aota 		 */
50115badf512SNaohiro Aota 		ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
50125badf512SNaohiro Aota 							data_stripes), SZ_16M),
50135badf512SNaohiro Aota 				       ctl->stripe_size);
50145badf512SNaohiro Aota 	}
50155badf512SNaohiro Aota 
50165badf512SNaohiro Aota 	/* Align to BTRFS_STRIPE_LEN */
50175badf512SNaohiro Aota 	ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
50185badf512SNaohiro Aota 	ctl->chunk_size = ctl->stripe_size * data_stripes;
50195badf512SNaohiro Aota 
50205badf512SNaohiro Aota 	return 0;
50215badf512SNaohiro Aota }
50225badf512SNaohiro Aota 
50235badf512SNaohiro Aota static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
50245badf512SNaohiro Aota 			      struct alloc_chunk_ctl *ctl,
50255badf512SNaohiro Aota 			      struct btrfs_device_info *devices_info)
50265badf512SNaohiro Aota {
50275badf512SNaohiro Aota 	struct btrfs_fs_info *info = fs_devices->fs_info;
50285badf512SNaohiro Aota 
50295badf512SNaohiro Aota 	/*
50305badf512SNaohiro Aota 	 * Round down to number of usable stripes, devs_increment can be any
50315badf512SNaohiro Aota 	 * number so we can't use round_down() that requires power of 2, while
50325badf512SNaohiro Aota 	 * rounddown is safe.
50335badf512SNaohiro Aota 	 */
50345badf512SNaohiro Aota 	ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
50355badf512SNaohiro Aota 
50365badf512SNaohiro Aota 	if (ctl->ndevs < ctl->devs_min) {
50375badf512SNaohiro Aota 		if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
50385badf512SNaohiro Aota 			btrfs_debug(info,
50395badf512SNaohiro Aota 	"%s: not enough devices with free space: have=%d minimum required=%d",
50405badf512SNaohiro Aota 				    __func__, ctl->ndevs, ctl->devs_min);
50415badf512SNaohiro Aota 		}
50425badf512SNaohiro Aota 		return -ENOSPC;
50435badf512SNaohiro Aota 	}
50445badf512SNaohiro Aota 
50455badf512SNaohiro Aota 	ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
50465badf512SNaohiro Aota 
50475badf512SNaohiro Aota 	switch (fs_devices->chunk_alloc_policy) {
50485badf512SNaohiro Aota 	case BTRFS_CHUNK_ALLOC_REGULAR:
50495badf512SNaohiro Aota 		return decide_stripe_size_regular(ctl, devices_info);
50505badf512SNaohiro Aota 	default:
50515badf512SNaohiro Aota 		BUG();
50525badf512SNaohiro Aota 	}
50535badf512SNaohiro Aota }
50545badf512SNaohiro Aota 
5055dce580caSNaohiro Aota static int create_chunk(struct btrfs_trans_handle *trans,
5056dce580caSNaohiro Aota 			struct alloc_chunk_ctl *ctl,
5057dce580caSNaohiro Aota 			struct btrfs_device_info *devices_info)
5058dce580caSNaohiro Aota {
5059dce580caSNaohiro Aota 	struct btrfs_fs_info *info = trans->fs_info;
5060dce580caSNaohiro Aota 	struct map_lookup *map = NULL;
5061dce580caSNaohiro Aota 	struct extent_map_tree *em_tree;
5062dce580caSNaohiro Aota 	struct extent_map *em;
5063dce580caSNaohiro Aota 	u64 start = ctl->start;
5064dce580caSNaohiro Aota 	u64 type = ctl->type;
5065dce580caSNaohiro Aota 	int ret;
5066dce580caSNaohiro Aota 	int i;
5067dce580caSNaohiro Aota 	int j;
5068dce580caSNaohiro Aota 
5069dce580caSNaohiro Aota 	map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5070dce580caSNaohiro Aota 	if (!map)
5071dce580caSNaohiro Aota 		return -ENOMEM;
5072dce580caSNaohiro Aota 	map->num_stripes = ctl->num_stripes;
5073dce580caSNaohiro Aota 
5074dce580caSNaohiro Aota 	for (i = 0; i < ctl->ndevs; ++i) {
5075dce580caSNaohiro Aota 		for (j = 0; j < ctl->dev_stripes; ++j) {
5076dce580caSNaohiro Aota 			int s = i * ctl->dev_stripes + j;
5077dce580caSNaohiro Aota 			map->stripes[s].dev = devices_info[i].dev;
5078dce580caSNaohiro Aota 			map->stripes[s].physical = devices_info[i].dev_offset +
5079dce580caSNaohiro Aota 						   j * ctl->stripe_size;
5080dce580caSNaohiro Aota 		}
5081dce580caSNaohiro Aota 	}
5082dce580caSNaohiro Aota 	map->stripe_len = BTRFS_STRIPE_LEN;
5083dce580caSNaohiro Aota 	map->io_align = BTRFS_STRIPE_LEN;
5084dce580caSNaohiro Aota 	map->io_width = BTRFS_STRIPE_LEN;
5085dce580caSNaohiro Aota 	map->type = type;
5086dce580caSNaohiro Aota 	map->sub_stripes = ctl->sub_stripes;
5087dce580caSNaohiro Aota 
5088dce580caSNaohiro Aota 	trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
5089dce580caSNaohiro Aota 
5090dce580caSNaohiro Aota 	em = alloc_extent_map();
5091dce580caSNaohiro Aota 	if (!em) {
5092dce580caSNaohiro Aota 		kfree(map);
5093dce580caSNaohiro Aota 		return -ENOMEM;
5094dce580caSNaohiro Aota 	}
5095dce580caSNaohiro Aota 	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
5096dce580caSNaohiro Aota 	em->map_lookup = map;
5097dce580caSNaohiro Aota 	em->start = start;
5098dce580caSNaohiro Aota 	em->len = ctl->chunk_size;
5099dce580caSNaohiro Aota 	em->block_start = 0;
5100dce580caSNaohiro Aota 	em->block_len = em->len;
5101dce580caSNaohiro Aota 	em->orig_block_len = ctl->stripe_size;
5102dce580caSNaohiro Aota 
5103dce580caSNaohiro Aota 	em_tree = &info->mapping_tree;
5104dce580caSNaohiro Aota 	write_lock(&em_tree->lock);
5105dce580caSNaohiro Aota 	ret = add_extent_mapping(em_tree, em, 0);
5106dce580caSNaohiro Aota 	if (ret) {
5107dce580caSNaohiro Aota 		write_unlock(&em_tree->lock);
5108dce580caSNaohiro Aota 		free_extent_map(em);
5109dce580caSNaohiro Aota 		return ret;
5110dce580caSNaohiro Aota 	}
5111dce580caSNaohiro Aota 	write_unlock(&em_tree->lock);
5112dce580caSNaohiro Aota 
5113dce580caSNaohiro Aota 	ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
5114dce580caSNaohiro Aota 	if (ret)
5115dce580caSNaohiro Aota 		goto error_del_extent;
5116dce580caSNaohiro Aota 
5117dce580caSNaohiro Aota 	for (i = 0; i < map->num_stripes; i++) {
5118dce580caSNaohiro Aota 		struct btrfs_device *dev = map->stripes[i].dev;
5119dce580caSNaohiro Aota 
5120dce580caSNaohiro Aota 		btrfs_device_set_bytes_used(dev,
5121dce580caSNaohiro Aota 					    dev->bytes_used + ctl->stripe_size);
5122dce580caSNaohiro Aota 		if (list_empty(&dev->post_commit_list))
5123dce580caSNaohiro Aota 			list_add_tail(&dev->post_commit_list,
5124dce580caSNaohiro Aota 				      &trans->transaction->dev_update_list);
5125dce580caSNaohiro Aota 	}
5126dce580caSNaohiro Aota 
5127dce580caSNaohiro Aota 	atomic64_sub(ctl->stripe_size * map->num_stripes,
5128dce580caSNaohiro Aota 		     &info->free_chunk_space);
5129dce580caSNaohiro Aota 
5130dce580caSNaohiro Aota 	free_extent_map(em);
5131dce580caSNaohiro Aota 	check_raid56_incompat_flag(info, type);
5132dce580caSNaohiro Aota 	check_raid1c34_incompat_flag(info, type);
5133dce580caSNaohiro Aota 
5134dce580caSNaohiro Aota 	return 0;
5135dce580caSNaohiro Aota 
5136dce580caSNaohiro Aota error_del_extent:
5137dce580caSNaohiro Aota 	write_lock(&em_tree->lock);
5138dce580caSNaohiro Aota 	remove_extent_mapping(em_tree, em);
5139dce580caSNaohiro Aota 	write_unlock(&em_tree->lock);
5140dce580caSNaohiro Aota 
5141dce580caSNaohiro Aota 	/* One for our allocation */
5142dce580caSNaohiro Aota 	free_extent_map(em);
5143dce580caSNaohiro Aota 	/* One for the tree reference */
5144dce580caSNaohiro Aota 	free_extent_map(em);
5145dce580caSNaohiro Aota 
5146dce580caSNaohiro Aota 	return ret;
5147dce580caSNaohiro Aota }
5148dce580caSNaohiro Aota 
514911c67b1aSNikolay Borisov int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
5150b2117a39SMiao Xie {
51512ff7e61eSJeff Mahoney 	struct btrfs_fs_info *info = trans->fs_info;
5152b2117a39SMiao Xie 	struct btrfs_fs_devices *fs_devices = info->fs_devices;
515373c5de00SArne Jansen 	struct btrfs_device_info *devices_info = NULL;
51544f2bafe8SNaohiro Aota 	struct alloc_chunk_ctl ctl;
5155b2117a39SMiao Xie 	int ret;
5156b2117a39SMiao Xie 
515711c67b1aSNikolay Borisov 	lockdep_assert_held(&info->chunk_mutex);
515811c67b1aSNikolay Borisov 
5159b25c19f4SNaohiro Aota 	if (!alloc_profile_is_valid(type, 0)) {
5160b25c19f4SNaohiro Aota 		ASSERT(0);
5161b25c19f4SNaohiro Aota 		return -EINVAL;
5162b25c19f4SNaohiro Aota 	}
516373c5de00SArne Jansen 
51644117f207SQu Wenruo 	if (list_empty(&fs_devices->alloc_list)) {
51654117f207SQu Wenruo 		if (btrfs_test_opt(info, ENOSPC_DEBUG))
51664117f207SQu Wenruo 			btrfs_debug(info, "%s: no writable device", __func__);
5167b2117a39SMiao Xie 		return -ENOSPC;
51684117f207SQu Wenruo 	}
5169b2117a39SMiao Xie 
517027c314d5SNaohiro Aota 	if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
517127c314d5SNaohiro Aota 		btrfs_err(info, "invalid chunk type 0x%llx requested", type);
517227c314d5SNaohiro Aota 		ASSERT(0);
517327c314d5SNaohiro Aota 		return -EINVAL;
517473c5de00SArne Jansen 	}
517573c5de00SArne Jansen 
517611c67b1aSNikolay Borisov 	ctl.start = find_next_chunk(info);
517727c314d5SNaohiro Aota 	ctl.type = type;
517827c314d5SNaohiro Aota 	init_alloc_chunk_ctl(fs_devices, &ctl);
5179b2117a39SMiao Xie 
518031e818feSDavid Sterba 	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5181b2117a39SMiao Xie 			       GFP_NOFS);
5182b2117a39SMiao Xie 	if (!devices_info)
5183b2117a39SMiao Xie 		return -ENOMEM;
5184b2117a39SMiao Xie 
5185560156cbSNaohiro Aota 	ret = gather_device_info(fs_devices, &ctl, devices_info);
5186560156cbSNaohiro Aota 	if (ret < 0)
5187dce580caSNaohiro Aota 		goto out;
518873c5de00SArne Jansen 
51895badf512SNaohiro Aota 	ret = decide_stripe_size(fs_devices, &ctl, devices_info);
51905badf512SNaohiro Aota 	if (ret < 0)
5191dce580caSNaohiro Aota 		goto out;
519273c5de00SArne Jansen 
5193dce580caSNaohiro Aota 	ret = create_chunk(trans, &ctl, devices_info);
51949b3f68b9SChris Mason 
5195dce580caSNaohiro Aota out:
5196b2117a39SMiao Xie 	kfree(devices_info);
5197b2117a39SMiao Xie 	return ret;
51982b82032cSYan Zheng }
51992b82032cSYan Zheng 
520011c67b1aSNikolay Borisov /*
520111c67b1aSNikolay Borisov  * Chunk allocation falls into two parts. The first part does work
520211c67b1aSNikolay Borisov  * that makes the new allocated chunk usable, but does not do any operation
520311c67b1aSNikolay Borisov  * that modifies the chunk tree. The second part does the work that
520411c67b1aSNikolay Borisov  * requires modifying the chunk tree. This division is important for the
520511c67b1aSNikolay Borisov  * bootstrap process of adding storage to a seed btrfs.
520611c67b1aSNikolay Borisov  */
52076df9a95eSJosef Bacik int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
52086df9a95eSJosef Bacik 			     u64 chunk_offset, u64 chunk_size)
52092b82032cSYan Zheng {
521097aff912SNikolay Borisov 	struct btrfs_fs_info *fs_info = trans->fs_info;
52116bccf3abSJeff Mahoney 	struct btrfs_root *extent_root = fs_info->extent_root;
52126bccf3abSJeff Mahoney 	struct btrfs_root *chunk_root = fs_info->chunk_root;
52132b82032cSYan Zheng 	struct btrfs_key key;
52142b82032cSYan Zheng 	struct btrfs_device *device;
52152b82032cSYan Zheng 	struct btrfs_chunk *chunk;
52162b82032cSYan Zheng 	struct btrfs_stripe *stripe;
52176df9a95eSJosef Bacik 	struct extent_map *em;
52186df9a95eSJosef Bacik 	struct map_lookup *map;
52196df9a95eSJosef Bacik 	size_t item_size;
52206df9a95eSJosef Bacik 	u64 dev_offset;
52216df9a95eSJosef Bacik 	u64 stripe_size;
52226df9a95eSJosef Bacik 	int i = 0;
5223140e639fSChris Mason 	int ret = 0;
52242b82032cSYan Zheng 
522560ca842eSOmar Sandoval 	em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
5226592d92eeSLiu Bo 	if (IS_ERR(em))
5227592d92eeSLiu Bo 		return PTR_ERR(em);
52286df9a95eSJosef Bacik 
522995617d69SJeff Mahoney 	map = em->map_lookup;
52306df9a95eSJosef Bacik 	item_size = btrfs_chunk_item_size(map->num_stripes);
52316df9a95eSJosef Bacik 	stripe_size = em->orig_block_len;
52326df9a95eSJosef Bacik 
52336df9a95eSJosef Bacik 	chunk = kzalloc(item_size, GFP_NOFS);
52346df9a95eSJosef Bacik 	if (!chunk) {
52356df9a95eSJosef Bacik 		ret = -ENOMEM;
52366df9a95eSJosef Bacik 		goto out;
52376df9a95eSJosef Bacik 	}
52386df9a95eSJosef Bacik 
523950460e37SFilipe Manana 	/*
524050460e37SFilipe Manana 	 * Take the device list mutex to prevent races with the final phase of
524150460e37SFilipe Manana 	 * a device replace operation that replaces the device object associated
524250460e37SFilipe Manana 	 * with the map's stripes, because the device object's id can change
524350460e37SFilipe Manana 	 * at any time during that final phase of the device replace operation
524450460e37SFilipe Manana 	 * (dev-replace.c:btrfs_dev_replace_finishing()).
524550460e37SFilipe Manana 	 */
52460b246afaSJeff Mahoney 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
52476df9a95eSJosef Bacik 	for (i = 0; i < map->num_stripes; i++) {
52486df9a95eSJosef Bacik 		device = map->stripes[i].dev;
52496df9a95eSJosef Bacik 		dev_offset = map->stripes[i].physical;
52506df9a95eSJosef Bacik 
52512b82032cSYan Zheng 		ret = btrfs_update_device(trans, device);
52523acd3953SMark Fasheh 		if (ret)
525350460e37SFilipe Manana 			break;
5254b5d9071cSNikolay Borisov 		ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
5255b5d9071cSNikolay Borisov 					     dev_offset, stripe_size);
52566df9a95eSJosef Bacik 		if (ret)
525750460e37SFilipe Manana 			break;
525850460e37SFilipe Manana 	}
525950460e37SFilipe Manana 	if (ret) {
52600b246afaSJeff Mahoney 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
52616df9a95eSJosef Bacik 		goto out;
52622b82032cSYan Zheng 	}
52632b82032cSYan Zheng 
52642b82032cSYan Zheng 	stripe = &chunk->stripe;
52656df9a95eSJosef Bacik 	for (i = 0; i < map->num_stripes; i++) {
52666df9a95eSJosef Bacik 		device = map->stripes[i].dev;
52676df9a95eSJosef Bacik 		dev_offset = map->stripes[i].physical;
52682b82032cSYan Zheng 
52692b82032cSYan Zheng 		btrfs_set_stack_stripe_devid(stripe, device->devid);
52702b82032cSYan Zheng 		btrfs_set_stack_stripe_offset(stripe, dev_offset);
52712b82032cSYan Zheng 		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
52722b82032cSYan Zheng 		stripe++;
52732b82032cSYan Zheng 	}
52740b246afaSJeff Mahoney 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
52752b82032cSYan Zheng 
52762b82032cSYan Zheng 	btrfs_set_stack_chunk_length(chunk, chunk_size);
52772b82032cSYan Zheng 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
52782b82032cSYan Zheng 	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
52792b82032cSYan Zheng 	btrfs_set_stack_chunk_type(chunk, map->type);
52802b82032cSYan Zheng 	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
52812b82032cSYan Zheng 	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
52822b82032cSYan Zheng 	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
52830b246afaSJeff Mahoney 	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
52842b82032cSYan Zheng 	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
52852b82032cSYan Zheng 
52862b82032cSYan Zheng 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
52872b82032cSYan Zheng 	key.type = BTRFS_CHUNK_ITEM_KEY;
52882b82032cSYan Zheng 	key.offset = chunk_offset;
52892b82032cSYan Zheng 
52902b82032cSYan Zheng 	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
52914ed1d16eSMark Fasheh 	if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
52924ed1d16eSMark Fasheh 		/*
52934ed1d16eSMark Fasheh 		 * TODO: Cleanup of inserted chunk root in case of
52944ed1d16eSMark Fasheh 		 * failure.
52954ed1d16eSMark Fasheh 		 */
52962ff7e61eSJeff Mahoney 		ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
52972b82032cSYan Zheng 	}
52981abe9b8aSliubo 
52996df9a95eSJosef Bacik out:
53002b82032cSYan Zheng 	kfree(chunk);
53016df9a95eSJosef Bacik 	free_extent_map(em);
53024ed1d16eSMark Fasheh 	return ret;
53032b82032cSYan Zheng }
53042b82032cSYan Zheng 
53056f8e0fc7SDavid Sterba static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
53062b82032cSYan Zheng {
53076f8e0fc7SDavid Sterba 	struct btrfs_fs_info *fs_info = trans->fs_info;
53082b82032cSYan Zheng 	u64 alloc_profile;
53092b82032cSYan Zheng 	int ret;
53102b82032cSYan Zheng 
53111b86826dSJeff Mahoney 	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
531211c67b1aSNikolay Borisov 	ret = btrfs_alloc_chunk(trans, alloc_profile);
531379787eaaSJeff Mahoney 	if (ret)
531479787eaaSJeff Mahoney 		return ret;
53152b82032cSYan Zheng 
53161b86826dSJeff Mahoney 	alloc_profile = btrfs_system_alloc_profile(fs_info);
531711c67b1aSNikolay Borisov 	ret = btrfs_alloc_chunk(trans, alloc_profile);
531879787eaaSJeff Mahoney 	return ret;
53192b82032cSYan Zheng }
53202b82032cSYan Zheng 
5321d20983b4SMiao Xie static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5322d20983b4SMiao Xie {
5323fc9a2ac7SDavid Sterba 	const int index = btrfs_bg_flags_to_raid_index(map->type);
5324d20983b4SMiao Xie 
5325fc9a2ac7SDavid Sterba 	return btrfs_raid_array[index].tolerated_failures;
53262b82032cSYan Zheng }
53272b82032cSYan Zheng 
53282ff7e61eSJeff Mahoney int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
53292b82032cSYan Zheng {
53302b82032cSYan Zheng 	struct extent_map *em;
53312b82032cSYan Zheng 	struct map_lookup *map;
53322b82032cSYan Zheng 	int readonly = 0;
5333d20983b4SMiao Xie 	int miss_ndevs = 0;
53342b82032cSYan Zheng 	int i;
53352b82032cSYan Zheng 
533660ca842eSOmar Sandoval 	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5337592d92eeSLiu Bo 	if (IS_ERR(em))
53382b82032cSYan Zheng 		return 1;
53392b82032cSYan Zheng 
534095617d69SJeff Mahoney 	map = em->map_lookup;
53412b82032cSYan Zheng 	for (i = 0; i < map->num_stripes; i++) {
5342e6e674bdSAnand Jain 		if (test_bit(BTRFS_DEV_STATE_MISSING,
5343e6e674bdSAnand Jain 					&map->stripes[i].dev->dev_state)) {
5344d20983b4SMiao Xie 			miss_ndevs++;
5345d20983b4SMiao Xie 			continue;
5346d20983b4SMiao Xie 		}
5347ebbede42SAnand Jain 		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5348ebbede42SAnand Jain 					&map->stripes[i].dev->dev_state)) {
53492b82032cSYan Zheng 			readonly = 1;
5350d20983b4SMiao Xie 			goto end;
53512b82032cSYan Zheng 		}
53522b82032cSYan Zheng 	}
5353d20983b4SMiao Xie 
5354d20983b4SMiao Xie 	/*
5355d20983b4SMiao Xie 	 * If the number of missing devices is larger than max errors,
5356d20983b4SMiao Xie 	 * we can not write the data into that chunk successfully, so
5357d20983b4SMiao Xie 	 * set it readonly.
5358d20983b4SMiao Xie 	 */
5359d20983b4SMiao Xie 	if (miss_ndevs > btrfs_chunk_max_errors(map))
5360d20983b4SMiao Xie 		readonly = 1;
5361d20983b4SMiao Xie end:
53622b82032cSYan Zheng 	free_extent_map(em);
53632b82032cSYan Zheng 	return readonly;
53640b86a832SChris Mason }
53650b86a832SChris Mason 
5366c8bf1b67SDavid Sterba void btrfs_mapping_tree_free(struct extent_map_tree *tree)
53670b86a832SChris Mason {
53680b86a832SChris Mason 	struct extent_map *em;
53690b86a832SChris Mason 
53700b86a832SChris Mason 	while (1) {
5371c8bf1b67SDavid Sterba 		write_lock(&tree->lock);
5372c8bf1b67SDavid Sterba 		em = lookup_extent_mapping(tree, 0, (u64)-1);
53730b86a832SChris Mason 		if (em)
5374c8bf1b67SDavid Sterba 			remove_extent_mapping(tree, em);
5375c8bf1b67SDavid Sterba 		write_unlock(&tree->lock);
53760b86a832SChris Mason 		if (!em)
53770b86a832SChris Mason 			break;
53780b86a832SChris Mason 		/* once for us */
53790b86a832SChris Mason 		free_extent_map(em);
53800b86a832SChris Mason 		/* once for the tree */
53810b86a832SChris Mason 		free_extent_map(em);
53820b86a832SChris Mason 	}
53830b86a832SChris Mason }
53840b86a832SChris Mason 
53855d964051SStefan Behrens int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5386f188591eSChris Mason {
5387f188591eSChris Mason 	struct extent_map *em;
5388f188591eSChris Mason 	struct map_lookup *map;
5389f188591eSChris Mason 	int ret;
5390f188591eSChris Mason 
539160ca842eSOmar Sandoval 	em = btrfs_get_chunk_map(fs_info, logical, len);
5392592d92eeSLiu Bo 	if (IS_ERR(em))
5393fb7669b5SJosef Bacik 		/*
5394592d92eeSLiu Bo 		 * We could return errors for these cases, but that could get
5395592d92eeSLiu Bo 		 * ugly and we'd probably do the same thing which is just not do
5396592d92eeSLiu Bo 		 * anything else and exit, so return 1 so the callers don't try
5397592d92eeSLiu Bo 		 * to use other copies.
5398fb7669b5SJosef Bacik 		 */
5399fb7669b5SJosef Bacik 		return 1;
5400fb7669b5SJosef Bacik 
540195617d69SJeff Mahoney 	map = em->map_lookup;
5402c7369b3fSDavid Sterba 	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
5403f188591eSChris Mason 		ret = map->num_stripes;
5404321aecc6SChris Mason 	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5405321aecc6SChris Mason 		ret = map->sub_stripes;
540653b381b3SDavid Woodhouse 	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
540753b381b3SDavid Woodhouse 		ret = 2;
540853b381b3SDavid Woodhouse 	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
54098810f751SLiu Bo 		/*
54108810f751SLiu Bo 		 * There could be two corrupted data stripes, we need
54118810f751SLiu Bo 		 * to loop retry in order to rebuild the correct data.
54128810f751SLiu Bo 		 *
54138810f751SLiu Bo 		 * Fail a stripe at a time on every retry except the
54148810f751SLiu Bo 		 * stripe under reconstruction.
54158810f751SLiu Bo 		 */
54168810f751SLiu Bo 		ret = map->num_stripes;
5417f188591eSChris Mason 	else
5418f188591eSChris Mason 		ret = 1;
5419f188591eSChris Mason 	free_extent_map(em);
5420ad6d620eSStefan Behrens 
5421cb5583ddSDavid Sterba 	down_read(&fs_info->dev_replace.rwsem);
54226fad823fSLiu Bo 	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
54236fad823fSLiu Bo 	    fs_info->dev_replace.tgtdev)
5424ad6d620eSStefan Behrens 		ret++;
5425cb5583ddSDavid Sterba 	up_read(&fs_info->dev_replace.rwsem);
5426ad6d620eSStefan Behrens 
5427f188591eSChris Mason 	return ret;
5428f188591eSChris Mason }
5429f188591eSChris Mason 
54302ff7e61eSJeff Mahoney unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
543153b381b3SDavid Woodhouse 				    u64 logical)
543253b381b3SDavid Woodhouse {
543353b381b3SDavid Woodhouse 	struct extent_map *em;
543453b381b3SDavid Woodhouse 	struct map_lookup *map;
54350b246afaSJeff Mahoney 	unsigned long len = fs_info->sectorsize;
543653b381b3SDavid Woodhouse 
543760ca842eSOmar Sandoval 	em = btrfs_get_chunk_map(fs_info, logical, len);
543853b381b3SDavid Woodhouse 
543969f03f13SNikolay Borisov 	if (!WARN_ON(IS_ERR(em))) {
544095617d69SJeff Mahoney 		map = em->map_lookup;
5441ffe2d203SZhao Lei 		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
544253b381b3SDavid Woodhouse 			len = map->stripe_len * nr_data_stripes(map);
544353b381b3SDavid Woodhouse 		free_extent_map(em);
544469f03f13SNikolay Borisov 	}
544553b381b3SDavid Woodhouse 	return len;
544653b381b3SDavid Woodhouse }
544753b381b3SDavid Woodhouse 
5448e4ff5fb5SNikolay Borisov int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
544953b381b3SDavid Woodhouse {
545053b381b3SDavid Woodhouse 	struct extent_map *em;
545153b381b3SDavid Woodhouse 	struct map_lookup *map;
545253b381b3SDavid Woodhouse 	int ret = 0;
545353b381b3SDavid Woodhouse 
545460ca842eSOmar Sandoval 	em = btrfs_get_chunk_map(fs_info, logical, len);
545553b381b3SDavid Woodhouse 
545669f03f13SNikolay Borisov 	if(!WARN_ON(IS_ERR(em))) {
545795617d69SJeff Mahoney 		map = em->map_lookup;
5458ffe2d203SZhao Lei 		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
545953b381b3SDavid Woodhouse 			ret = 1;
546053b381b3SDavid Woodhouse 		free_extent_map(em);
546169f03f13SNikolay Borisov 	}
546253b381b3SDavid Woodhouse 	return ret;
546353b381b3SDavid Woodhouse }
546453b381b3SDavid Woodhouse 
546530d9861fSStefan Behrens static int find_live_mirror(struct btrfs_fs_info *fs_info,
546699f92a7cSAnand Jain 			    struct map_lookup *map, int first,
54678ba0ae78SAnand Jain 			    int dev_replace_is_ongoing)
5468dfe25020SChris Mason {
5469dfe25020SChris Mason 	int i;
547099f92a7cSAnand Jain 	int num_stripes;
54718ba0ae78SAnand Jain 	int preferred_mirror;
547230d9861fSStefan Behrens 	int tolerance;
547330d9861fSStefan Behrens 	struct btrfs_device *srcdev;
547430d9861fSStefan Behrens 
547599f92a7cSAnand Jain 	ASSERT((map->type &
5476c7369b3fSDavid Sterba 		 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
547799f92a7cSAnand Jain 
547899f92a7cSAnand Jain 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
547999f92a7cSAnand Jain 		num_stripes = map->sub_stripes;
548099f92a7cSAnand Jain 	else
548199f92a7cSAnand Jain 		num_stripes = map->num_stripes;
548299f92a7cSAnand Jain 
548333fd2f71SAnand Jain 	switch (fs_info->fs_devices->read_policy) {
548433fd2f71SAnand Jain 	default:
548533fd2f71SAnand Jain 		/* Shouldn't happen, just warn and use pid instead of failing */
548633fd2f71SAnand Jain 		btrfs_warn_rl(fs_info,
548733fd2f71SAnand Jain 			      "unknown read_policy type %u, reset to pid",
548833fd2f71SAnand Jain 			      fs_info->fs_devices->read_policy);
548933fd2f71SAnand Jain 		fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
549033fd2f71SAnand Jain 		fallthrough;
549133fd2f71SAnand Jain 	case BTRFS_READ_POLICY_PID:
549233fd2f71SAnand Jain 		preferred_mirror = first + (current->pid % num_stripes);
549333fd2f71SAnand Jain 		break;
549433fd2f71SAnand Jain 	}
54958ba0ae78SAnand Jain 
549630d9861fSStefan Behrens 	if (dev_replace_is_ongoing &&
549730d9861fSStefan Behrens 	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
549830d9861fSStefan Behrens 	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
549930d9861fSStefan Behrens 		srcdev = fs_info->dev_replace.srcdev;
550030d9861fSStefan Behrens 	else
550130d9861fSStefan Behrens 		srcdev = NULL;
550230d9861fSStefan Behrens 
550330d9861fSStefan Behrens 	/*
550430d9861fSStefan Behrens 	 * try to avoid the drive that is the source drive for a
550530d9861fSStefan Behrens 	 * dev-replace procedure, only choose it if no other non-missing
550630d9861fSStefan Behrens 	 * mirror is available
550730d9861fSStefan Behrens 	 */
550830d9861fSStefan Behrens 	for (tolerance = 0; tolerance < 2; tolerance++) {
55098ba0ae78SAnand Jain 		if (map->stripes[preferred_mirror].dev->bdev &&
55108ba0ae78SAnand Jain 		    (tolerance || map->stripes[preferred_mirror].dev != srcdev))
55118ba0ae78SAnand Jain 			return preferred_mirror;
551299f92a7cSAnand Jain 		for (i = first; i < first + num_stripes; i++) {
551330d9861fSStefan Behrens 			if (map->stripes[i].dev->bdev &&
551430d9861fSStefan Behrens 			    (tolerance || map->stripes[i].dev != srcdev))
5515dfe25020SChris Mason 				return i;
5516dfe25020SChris Mason 		}
551730d9861fSStefan Behrens 	}
551830d9861fSStefan Behrens 
5519dfe25020SChris Mason 	/* we couldn't find one that doesn't fail.  Just return something
5520dfe25020SChris Mason 	 * and the io error handling code will clean up eventually
5521dfe25020SChris Mason 	 */
55228ba0ae78SAnand Jain 	return preferred_mirror;
5523dfe25020SChris Mason }
5524dfe25020SChris Mason 
552553b381b3SDavid Woodhouse /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
55268e5cfb55SZhao Lei static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
552753b381b3SDavid Woodhouse {
552853b381b3SDavid Woodhouse 	int i;
552953b381b3SDavid Woodhouse 	int again = 1;
553053b381b3SDavid Woodhouse 
553153b381b3SDavid Woodhouse 	while (again) {
553253b381b3SDavid Woodhouse 		again = 0;
5533cc7539edSZhao Lei 		for (i = 0; i < num_stripes - 1; i++) {
5534eeb6f172SDavid Sterba 			/* Swap if parity is on a smaller index */
5535eeb6f172SDavid Sterba 			if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
5536eeb6f172SDavid Sterba 				swap(bbio->stripes[i], bbio->stripes[i + 1]);
5537eeb6f172SDavid Sterba 				swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
553853b381b3SDavid Woodhouse 				again = 1;
553953b381b3SDavid Woodhouse 			}
554053b381b3SDavid Woodhouse 		}
554153b381b3SDavid Woodhouse 	}
554253b381b3SDavid Woodhouse }
554353b381b3SDavid Woodhouse 
55446e9606d2SZhao Lei static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
55456e9606d2SZhao Lei {
55466e9606d2SZhao Lei 	struct btrfs_bio *bbio = kzalloc(
5547e57cf21eSChris Mason 		 /* the size of the btrfs_bio */
55486e9606d2SZhao Lei 		sizeof(struct btrfs_bio) +
5549e57cf21eSChris Mason 		/* plus the variable array for the stripes */
55506e9606d2SZhao Lei 		sizeof(struct btrfs_bio_stripe) * (total_stripes) +
5551e57cf21eSChris Mason 		/* plus the variable array for the tgt dev */
55526e9606d2SZhao Lei 		sizeof(int) * (real_stripes) +
5553e57cf21eSChris Mason 		/*
5554e57cf21eSChris Mason 		 * plus the raid_map, which includes both the tgt dev
5555e57cf21eSChris Mason 		 * and the stripes
5556e57cf21eSChris Mason 		 */
5557e57cf21eSChris Mason 		sizeof(u64) * (total_stripes),
5558277fb5fcSMichal Hocko 		GFP_NOFS|__GFP_NOFAIL);
55596e9606d2SZhao Lei 
55606e9606d2SZhao Lei 	atomic_set(&bbio->error, 0);
5561140475aeSElena Reshetova 	refcount_set(&bbio->refs, 1);
55626e9606d2SZhao Lei 
5563608769a4SNikolay Borisov 	bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
5564608769a4SNikolay Borisov 	bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
5565608769a4SNikolay Borisov 
55666e9606d2SZhao Lei 	return bbio;
55676e9606d2SZhao Lei }
55686e9606d2SZhao Lei 
55696e9606d2SZhao Lei void btrfs_get_bbio(struct btrfs_bio *bbio)
55706e9606d2SZhao Lei {
5571140475aeSElena Reshetova 	WARN_ON(!refcount_read(&bbio->refs));
5572140475aeSElena Reshetova 	refcount_inc(&bbio->refs);
55736e9606d2SZhao Lei }
55746e9606d2SZhao Lei 
55756e9606d2SZhao Lei void btrfs_put_bbio(struct btrfs_bio *bbio)
55766e9606d2SZhao Lei {
55776e9606d2SZhao Lei 	if (!bbio)
55786e9606d2SZhao Lei 		return;
5579140475aeSElena Reshetova 	if (refcount_dec_and_test(&bbio->refs))
55806e9606d2SZhao Lei 		kfree(bbio);
55816e9606d2SZhao Lei }
55826e9606d2SZhao Lei 
55830b3d4cd3SLiu Bo /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
55840b3d4cd3SLiu Bo /*
55850b3d4cd3SLiu Bo  * Please note that, discard won't be sent to target device of device
55860b3d4cd3SLiu Bo  * replace.
55870b3d4cd3SLiu Bo  */
55880b3d4cd3SLiu Bo static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
55896b7faaddSQu Wenruo 					 u64 logical, u64 *length_ret,
55900b3d4cd3SLiu Bo 					 struct btrfs_bio **bbio_ret)
55910b3d4cd3SLiu Bo {
55920b3d4cd3SLiu Bo 	struct extent_map *em;
55930b3d4cd3SLiu Bo 	struct map_lookup *map;
55940b3d4cd3SLiu Bo 	struct btrfs_bio *bbio;
55956b7faaddSQu Wenruo 	u64 length = *length_ret;
55960b3d4cd3SLiu Bo 	u64 offset;
55970b3d4cd3SLiu Bo 	u64 stripe_nr;
55980b3d4cd3SLiu Bo 	u64 stripe_nr_end;
55990b3d4cd3SLiu Bo 	u64 stripe_end_offset;
56000b3d4cd3SLiu Bo 	u64 stripe_cnt;
56010b3d4cd3SLiu Bo 	u64 stripe_len;
56020b3d4cd3SLiu Bo 	u64 stripe_offset;
56030b3d4cd3SLiu Bo 	u64 num_stripes;
56040b3d4cd3SLiu Bo 	u32 stripe_index;
56050b3d4cd3SLiu Bo 	u32 factor = 0;
56060b3d4cd3SLiu Bo 	u32 sub_stripes = 0;
56070b3d4cd3SLiu Bo 	u64 stripes_per_dev = 0;
56080b3d4cd3SLiu Bo 	u32 remaining_stripes = 0;
56090b3d4cd3SLiu Bo 	u32 last_stripe = 0;
56100b3d4cd3SLiu Bo 	int ret = 0;
56110b3d4cd3SLiu Bo 	int i;
56120b3d4cd3SLiu Bo 
56130b3d4cd3SLiu Bo 	/* discard always return a bbio */
56140b3d4cd3SLiu Bo 	ASSERT(bbio_ret);
56150b3d4cd3SLiu Bo 
561660ca842eSOmar Sandoval 	em = btrfs_get_chunk_map(fs_info, logical, length);
56170b3d4cd3SLiu Bo 	if (IS_ERR(em))
56180b3d4cd3SLiu Bo 		return PTR_ERR(em);
56190b3d4cd3SLiu Bo 
56200b3d4cd3SLiu Bo 	map = em->map_lookup;
56210b3d4cd3SLiu Bo 	/* we don't discard raid56 yet */
56220b3d4cd3SLiu Bo 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
56230b3d4cd3SLiu Bo 		ret = -EOPNOTSUPP;
56240b3d4cd3SLiu Bo 		goto out;
56250b3d4cd3SLiu Bo 	}
56260b3d4cd3SLiu Bo 
56270b3d4cd3SLiu Bo 	offset = logical - em->start;
56282d974619SQu Wenruo 	length = min_t(u64, em->start + em->len - logical, length);
56296b7faaddSQu Wenruo 	*length_ret = length;
56300b3d4cd3SLiu Bo 
56310b3d4cd3SLiu Bo 	stripe_len = map->stripe_len;
56320b3d4cd3SLiu Bo 	/*
56330b3d4cd3SLiu Bo 	 * stripe_nr counts the total number of stripes we have to stride
56340b3d4cd3SLiu Bo 	 * to get to this block
56350b3d4cd3SLiu Bo 	 */
56360b3d4cd3SLiu Bo 	stripe_nr = div64_u64(offset, stripe_len);
56370b3d4cd3SLiu Bo 
56380b3d4cd3SLiu Bo 	/* stripe_offset is the offset of this block in its stripe */
56390b3d4cd3SLiu Bo 	stripe_offset = offset - stripe_nr * stripe_len;
56400b3d4cd3SLiu Bo 
56410b3d4cd3SLiu Bo 	stripe_nr_end = round_up(offset + length, map->stripe_len);
564242c61ab6SLiu Bo 	stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
56430b3d4cd3SLiu Bo 	stripe_cnt = stripe_nr_end - stripe_nr;
56440b3d4cd3SLiu Bo 	stripe_end_offset = stripe_nr_end * map->stripe_len -
56450b3d4cd3SLiu Bo 			    (offset + length);
56460b3d4cd3SLiu Bo 	/*
56470b3d4cd3SLiu Bo 	 * after this, stripe_nr is the number of stripes on this
56480b3d4cd3SLiu Bo 	 * device we have to walk to find the data, and stripe_index is
56490b3d4cd3SLiu Bo 	 * the number of our device in the stripe array
56500b3d4cd3SLiu Bo 	 */
56510b3d4cd3SLiu Bo 	num_stripes = 1;
56520b3d4cd3SLiu Bo 	stripe_index = 0;
56530b3d4cd3SLiu Bo 	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
56540b3d4cd3SLiu Bo 			 BTRFS_BLOCK_GROUP_RAID10)) {
56550b3d4cd3SLiu Bo 		if (map->type & BTRFS_BLOCK_GROUP_RAID0)
56560b3d4cd3SLiu Bo 			sub_stripes = 1;
56570b3d4cd3SLiu Bo 		else
56580b3d4cd3SLiu Bo 			sub_stripes = map->sub_stripes;
56590b3d4cd3SLiu Bo 
56600b3d4cd3SLiu Bo 		factor = map->num_stripes / sub_stripes;
56610b3d4cd3SLiu Bo 		num_stripes = min_t(u64, map->num_stripes,
56620b3d4cd3SLiu Bo 				    sub_stripes * stripe_cnt);
56630b3d4cd3SLiu Bo 		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
56640b3d4cd3SLiu Bo 		stripe_index *= sub_stripes;
56650b3d4cd3SLiu Bo 		stripes_per_dev = div_u64_rem(stripe_cnt, factor,
56660b3d4cd3SLiu Bo 					      &remaining_stripes);
56670b3d4cd3SLiu Bo 		div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
56680b3d4cd3SLiu Bo 		last_stripe *= sub_stripes;
5669c7369b3fSDavid Sterba 	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
56700b3d4cd3SLiu Bo 				BTRFS_BLOCK_GROUP_DUP)) {
56710b3d4cd3SLiu Bo 		num_stripes = map->num_stripes;
56720b3d4cd3SLiu Bo 	} else {
56730b3d4cd3SLiu Bo 		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
56740b3d4cd3SLiu Bo 					&stripe_index);
56750b3d4cd3SLiu Bo 	}
56760b3d4cd3SLiu Bo 
56770b3d4cd3SLiu Bo 	bbio = alloc_btrfs_bio(num_stripes, 0);
56780b3d4cd3SLiu Bo 	if (!bbio) {
56790b3d4cd3SLiu Bo 		ret = -ENOMEM;
56800b3d4cd3SLiu Bo 		goto out;
56810b3d4cd3SLiu Bo 	}
56820b3d4cd3SLiu Bo 
56830b3d4cd3SLiu Bo 	for (i = 0; i < num_stripes; i++) {
56840b3d4cd3SLiu Bo 		bbio->stripes[i].physical =
56850b3d4cd3SLiu Bo 			map->stripes[stripe_index].physical +
56860b3d4cd3SLiu Bo 			stripe_offset + stripe_nr * map->stripe_len;
56870b3d4cd3SLiu Bo 		bbio->stripes[i].dev = map->stripes[stripe_index].dev;
56880b3d4cd3SLiu Bo 
56890b3d4cd3SLiu Bo 		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
56900b3d4cd3SLiu Bo 				 BTRFS_BLOCK_GROUP_RAID10)) {
56910b3d4cd3SLiu Bo 			bbio->stripes[i].length = stripes_per_dev *
56920b3d4cd3SLiu Bo 				map->stripe_len;
56930b3d4cd3SLiu Bo 
56940b3d4cd3SLiu Bo 			if (i / sub_stripes < remaining_stripes)
56950b3d4cd3SLiu Bo 				bbio->stripes[i].length +=
56960b3d4cd3SLiu Bo 					map->stripe_len;
56970b3d4cd3SLiu Bo 
56980b3d4cd3SLiu Bo 			/*
56990b3d4cd3SLiu Bo 			 * Special for the first stripe and
57000b3d4cd3SLiu Bo 			 * the last stripe:
57010b3d4cd3SLiu Bo 			 *
57020b3d4cd3SLiu Bo 			 * |-------|...|-------|
57030b3d4cd3SLiu Bo 			 *     |----------|
57040b3d4cd3SLiu Bo 			 *    off     end_off
57050b3d4cd3SLiu Bo 			 */
57060b3d4cd3SLiu Bo 			if (i < sub_stripes)
57070b3d4cd3SLiu Bo 				bbio->stripes[i].length -=
57080b3d4cd3SLiu Bo 					stripe_offset;
57090b3d4cd3SLiu Bo 
57100b3d4cd3SLiu Bo 			if (stripe_index >= last_stripe &&
57110b3d4cd3SLiu Bo 			    stripe_index <= (last_stripe +
57120b3d4cd3SLiu Bo 					     sub_stripes - 1))
57130b3d4cd3SLiu Bo 				bbio->stripes[i].length -=
57140b3d4cd3SLiu Bo 					stripe_end_offset;
57150b3d4cd3SLiu Bo 
57160b3d4cd3SLiu Bo 			if (i == sub_stripes - 1)
57170b3d4cd3SLiu Bo 				stripe_offset = 0;
57180b3d4cd3SLiu Bo 		} else {
57190b3d4cd3SLiu Bo 			bbio->stripes[i].length = length;
57200b3d4cd3SLiu Bo 		}
57210b3d4cd3SLiu Bo 
57220b3d4cd3SLiu Bo 		stripe_index++;
57230b3d4cd3SLiu Bo 		if (stripe_index == map->num_stripes) {
57240b3d4cd3SLiu Bo 			stripe_index = 0;
57250b3d4cd3SLiu Bo 			stripe_nr++;
57260b3d4cd3SLiu Bo 		}
57270b3d4cd3SLiu Bo 	}
57280b3d4cd3SLiu Bo 
57290b3d4cd3SLiu Bo 	*bbio_ret = bbio;
57300b3d4cd3SLiu Bo 	bbio->map_type = map->type;
57310b3d4cd3SLiu Bo 	bbio->num_stripes = num_stripes;
57320b3d4cd3SLiu Bo out:
57330b3d4cd3SLiu Bo 	free_extent_map(em);
57340b3d4cd3SLiu Bo 	return ret;
57350b3d4cd3SLiu Bo }
57360b3d4cd3SLiu Bo 
57375ab56090SLiu Bo /*
57385ab56090SLiu Bo  * In dev-replace case, for repair case (that's the only case where the mirror
57395ab56090SLiu Bo  * is selected explicitly when calling btrfs_map_block), blocks left of the
57405ab56090SLiu Bo  * left cursor can also be read from the target drive.
57415ab56090SLiu Bo  *
57425ab56090SLiu Bo  * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
57435ab56090SLiu Bo  * array of stripes.
57445ab56090SLiu Bo  * For READ, it also needs to be supported using the same mirror number.
57455ab56090SLiu Bo  *
57465ab56090SLiu Bo  * If the requested block is not left of the left cursor, EIO is returned. This
57475ab56090SLiu Bo  * can happen because btrfs_num_copies() returns one more in the dev-replace
57485ab56090SLiu Bo  * case.
57495ab56090SLiu Bo  */
57505ab56090SLiu Bo static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
57515ab56090SLiu Bo 					 u64 logical, u64 length,
57525ab56090SLiu Bo 					 u64 srcdev_devid, int *mirror_num,
57535ab56090SLiu Bo 					 u64 *physical)
57545ab56090SLiu Bo {
57555ab56090SLiu Bo 	struct btrfs_bio *bbio = NULL;
57565ab56090SLiu Bo 	int num_stripes;
57575ab56090SLiu Bo 	int index_srcdev = 0;
57585ab56090SLiu Bo 	int found = 0;
57595ab56090SLiu Bo 	u64 physical_of_found = 0;
57605ab56090SLiu Bo 	int i;
57615ab56090SLiu Bo 	int ret = 0;
57625ab56090SLiu Bo 
57635ab56090SLiu Bo 	ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
57645ab56090SLiu Bo 				logical, &length, &bbio, 0, 0);
57655ab56090SLiu Bo 	if (ret) {
57665ab56090SLiu Bo 		ASSERT(bbio == NULL);
57675ab56090SLiu Bo 		return ret;
57685ab56090SLiu Bo 	}
57695ab56090SLiu Bo 
57705ab56090SLiu Bo 	num_stripes = bbio->num_stripes;
57715ab56090SLiu Bo 	if (*mirror_num > num_stripes) {
57725ab56090SLiu Bo 		/*
57735ab56090SLiu Bo 		 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
57745ab56090SLiu Bo 		 * that means that the requested area is not left of the left
57755ab56090SLiu Bo 		 * cursor
57765ab56090SLiu Bo 		 */
57775ab56090SLiu Bo 		btrfs_put_bbio(bbio);
57785ab56090SLiu Bo 		return -EIO;
57795ab56090SLiu Bo 	}
57805ab56090SLiu Bo 
57815ab56090SLiu Bo 	/*
57825ab56090SLiu Bo 	 * process the rest of the function using the mirror_num of the source
57835ab56090SLiu Bo 	 * drive. Therefore look it up first.  At the end, patch the device
57845ab56090SLiu Bo 	 * pointer to the one of the target drive.
57855ab56090SLiu Bo 	 */
57865ab56090SLiu Bo 	for (i = 0; i < num_stripes; i++) {
57875ab56090SLiu Bo 		if (bbio->stripes[i].dev->devid != srcdev_devid)
57885ab56090SLiu Bo 			continue;
57895ab56090SLiu Bo 
57905ab56090SLiu Bo 		/*
57915ab56090SLiu Bo 		 * In case of DUP, in order to keep it simple, only add the
57925ab56090SLiu Bo 		 * mirror with the lowest physical address
57935ab56090SLiu Bo 		 */
57945ab56090SLiu Bo 		if (found &&
57955ab56090SLiu Bo 		    physical_of_found <= bbio->stripes[i].physical)
57965ab56090SLiu Bo 			continue;
57975ab56090SLiu Bo 
57985ab56090SLiu Bo 		index_srcdev = i;
57995ab56090SLiu Bo 		found = 1;
58005ab56090SLiu Bo 		physical_of_found = bbio->stripes[i].physical;
58015ab56090SLiu Bo 	}
58025ab56090SLiu Bo 
58035ab56090SLiu Bo 	btrfs_put_bbio(bbio);
58045ab56090SLiu Bo 
58055ab56090SLiu Bo 	ASSERT(found);
58065ab56090SLiu Bo 	if (!found)
58075ab56090SLiu Bo 		return -EIO;
58085ab56090SLiu Bo 
58095ab56090SLiu Bo 	*mirror_num = index_srcdev + 1;
58105ab56090SLiu Bo 	*physical = physical_of_found;
58115ab56090SLiu Bo 	return ret;
58125ab56090SLiu Bo }
58135ab56090SLiu Bo 
581473c0f228SLiu Bo static void handle_ops_on_dev_replace(enum btrfs_map_op op,
581573c0f228SLiu Bo 				      struct btrfs_bio **bbio_ret,
581673c0f228SLiu Bo 				      struct btrfs_dev_replace *dev_replace,
581773c0f228SLiu Bo 				      int *num_stripes_ret, int *max_errors_ret)
581873c0f228SLiu Bo {
581973c0f228SLiu Bo 	struct btrfs_bio *bbio = *bbio_ret;
582073c0f228SLiu Bo 	u64 srcdev_devid = dev_replace->srcdev->devid;
582173c0f228SLiu Bo 	int tgtdev_indexes = 0;
582273c0f228SLiu Bo 	int num_stripes = *num_stripes_ret;
582373c0f228SLiu Bo 	int max_errors = *max_errors_ret;
582473c0f228SLiu Bo 	int i;
582573c0f228SLiu Bo 
582673c0f228SLiu Bo 	if (op == BTRFS_MAP_WRITE) {
582773c0f228SLiu Bo 		int index_where_to_add;
582873c0f228SLiu Bo 
582973c0f228SLiu Bo 		/*
583073c0f228SLiu Bo 		 * duplicate the write operations while the dev replace
583173c0f228SLiu Bo 		 * procedure is running. Since the copying of the old disk to
583273c0f228SLiu Bo 		 * the new disk takes place at run time while the filesystem is
583373c0f228SLiu Bo 		 * mounted writable, the regular write operations to the old
583473c0f228SLiu Bo 		 * disk have to be duplicated to go to the new disk as well.
583573c0f228SLiu Bo 		 *
583673c0f228SLiu Bo 		 * Note that device->missing is handled by the caller, and that
583773c0f228SLiu Bo 		 * the write to the old disk is already set up in the stripes
583873c0f228SLiu Bo 		 * array.
583973c0f228SLiu Bo 		 */
584073c0f228SLiu Bo 		index_where_to_add = num_stripes;
584173c0f228SLiu Bo 		for (i = 0; i < num_stripes; i++) {
584273c0f228SLiu Bo 			if (bbio->stripes[i].dev->devid == srcdev_devid) {
584373c0f228SLiu Bo 				/* write to new disk, too */
584473c0f228SLiu Bo 				struct btrfs_bio_stripe *new =
584573c0f228SLiu Bo 					bbio->stripes + index_where_to_add;
584673c0f228SLiu Bo 				struct btrfs_bio_stripe *old =
584773c0f228SLiu Bo 					bbio->stripes + i;
584873c0f228SLiu Bo 
584973c0f228SLiu Bo 				new->physical = old->physical;
585073c0f228SLiu Bo 				new->length = old->length;
585173c0f228SLiu Bo 				new->dev = dev_replace->tgtdev;
585273c0f228SLiu Bo 				bbio->tgtdev_map[i] = index_where_to_add;
585373c0f228SLiu Bo 				index_where_to_add++;
585473c0f228SLiu Bo 				max_errors++;
585573c0f228SLiu Bo 				tgtdev_indexes++;
585673c0f228SLiu Bo 			}
585773c0f228SLiu Bo 		}
585873c0f228SLiu Bo 		num_stripes = index_where_to_add;
585973c0f228SLiu Bo 	} else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
586073c0f228SLiu Bo 		int index_srcdev = 0;
586173c0f228SLiu Bo 		int found = 0;
586273c0f228SLiu Bo 		u64 physical_of_found = 0;
586373c0f228SLiu Bo 
586473c0f228SLiu Bo 		/*
586573c0f228SLiu Bo 		 * During the dev-replace procedure, the target drive can also
586673c0f228SLiu Bo 		 * be used to read data in case it is needed to repair a corrupt
586773c0f228SLiu Bo 		 * block elsewhere. This is possible if the requested area is
586873c0f228SLiu Bo 		 * left of the left cursor. In this area, the target drive is a
586973c0f228SLiu Bo 		 * full copy of the source drive.
587073c0f228SLiu Bo 		 */
587173c0f228SLiu Bo 		for (i = 0; i < num_stripes; i++) {
587273c0f228SLiu Bo 			if (bbio->stripes[i].dev->devid == srcdev_devid) {
587373c0f228SLiu Bo 				/*
587473c0f228SLiu Bo 				 * In case of DUP, in order to keep it simple,
587573c0f228SLiu Bo 				 * only add the mirror with the lowest physical
587673c0f228SLiu Bo 				 * address
587773c0f228SLiu Bo 				 */
587873c0f228SLiu Bo 				if (found &&
587973c0f228SLiu Bo 				    physical_of_found <=
588073c0f228SLiu Bo 				     bbio->stripes[i].physical)
588173c0f228SLiu Bo 					continue;
588273c0f228SLiu Bo 				index_srcdev = i;
588373c0f228SLiu Bo 				found = 1;
588473c0f228SLiu Bo 				physical_of_found = bbio->stripes[i].physical;
588573c0f228SLiu Bo 			}
588673c0f228SLiu Bo 		}
588773c0f228SLiu Bo 		if (found) {
588873c0f228SLiu Bo 			struct btrfs_bio_stripe *tgtdev_stripe =
588973c0f228SLiu Bo 				bbio->stripes + num_stripes;
589073c0f228SLiu Bo 
589173c0f228SLiu Bo 			tgtdev_stripe->physical = physical_of_found;
589273c0f228SLiu Bo 			tgtdev_stripe->length =
589373c0f228SLiu Bo 				bbio->stripes[index_srcdev].length;
589473c0f228SLiu Bo 			tgtdev_stripe->dev = dev_replace->tgtdev;
589573c0f228SLiu Bo 			bbio->tgtdev_map[index_srcdev] = num_stripes;
589673c0f228SLiu Bo 
589773c0f228SLiu Bo 			tgtdev_indexes++;
589873c0f228SLiu Bo 			num_stripes++;
589973c0f228SLiu Bo 		}
590073c0f228SLiu Bo 	}
590173c0f228SLiu Bo 
590273c0f228SLiu Bo 	*num_stripes_ret = num_stripes;
590373c0f228SLiu Bo 	*max_errors_ret = max_errors;
590473c0f228SLiu Bo 	bbio->num_tgtdevs = tgtdev_indexes;
590573c0f228SLiu Bo 	*bbio_ret = bbio;
590673c0f228SLiu Bo }
590773c0f228SLiu Bo 
59082b19a1feSLiu Bo static bool need_full_stripe(enum btrfs_map_op op)
59092b19a1feSLiu Bo {
59102b19a1feSLiu Bo 	return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
59112b19a1feSLiu Bo }
59122b19a1feSLiu Bo 
59135f141126SNikolay Borisov /*
59145f141126SNikolay Borisov  * btrfs_get_io_geometry - calculates the geomery of a particular (address, len)
59155f141126SNikolay Borisov  *		       tuple. This information is used to calculate how big a
59165f141126SNikolay Borisov  *		       particular bio can get before it straddles a stripe.
59175f141126SNikolay Borisov  *
59185f141126SNikolay Borisov  * @fs_info - the filesystem
59195f141126SNikolay Borisov  * @logical - address that we want to figure out the geometry of
59205f141126SNikolay Borisov  * @len	    - the length of IO we are going to perform, starting at @logical
59215f141126SNikolay Borisov  * @op      - type of operation - write or read
59225f141126SNikolay Borisov  * @io_geom - pointer used to return values
59235f141126SNikolay Borisov  *
59245f141126SNikolay Borisov  * Returns < 0 in case a chunk for the given logical address cannot be found,
59255f141126SNikolay Borisov  * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
59265f141126SNikolay Borisov  */
59275f141126SNikolay Borisov int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
59285f141126SNikolay Borisov 			u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
59295f141126SNikolay Borisov {
59305f141126SNikolay Borisov 	struct extent_map *em;
59315f141126SNikolay Borisov 	struct map_lookup *map;
59325f141126SNikolay Borisov 	u64 offset;
59335f141126SNikolay Borisov 	u64 stripe_offset;
59345f141126SNikolay Borisov 	u64 stripe_nr;
59355f141126SNikolay Borisov 	u64 stripe_len;
59365f141126SNikolay Borisov 	u64 raid56_full_stripe_start = (u64)-1;
59375f141126SNikolay Borisov 	int data_stripes;
5938373c3b80SJohannes Thumshirn 	int ret = 0;
59395f141126SNikolay Borisov 
59405f141126SNikolay Borisov 	ASSERT(op != BTRFS_MAP_DISCARD);
59415f141126SNikolay Borisov 
59425f141126SNikolay Borisov 	em = btrfs_get_chunk_map(fs_info, logical, len);
59435f141126SNikolay Borisov 	if (IS_ERR(em))
59445f141126SNikolay Borisov 		return PTR_ERR(em);
59455f141126SNikolay Borisov 
59465f141126SNikolay Borisov 	map = em->map_lookup;
59475f141126SNikolay Borisov 	/* Offset of this logical address in the chunk */
59485f141126SNikolay Borisov 	offset = logical - em->start;
59495f141126SNikolay Borisov 	/* Len of a stripe in a chunk */
59505f141126SNikolay Borisov 	stripe_len = map->stripe_len;
59515f141126SNikolay Borisov 	/* Stripe wher this block falls in */
59525f141126SNikolay Borisov 	stripe_nr = div64_u64(offset, stripe_len);
59535f141126SNikolay Borisov 	/* Offset of stripe in the chunk */
59545f141126SNikolay Borisov 	stripe_offset = stripe_nr * stripe_len;
59555f141126SNikolay Borisov 	if (offset < stripe_offset) {
59565f141126SNikolay Borisov 		btrfs_crit(fs_info,
59575f141126SNikolay Borisov "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
59585f141126SNikolay Borisov 			stripe_offset, offset, em->start, logical, stripe_len);
5959373c3b80SJohannes Thumshirn 		ret = -EINVAL;
5960373c3b80SJohannes Thumshirn 		goto out;
59615f141126SNikolay Borisov 	}
59625f141126SNikolay Borisov 
59635f141126SNikolay Borisov 	/* stripe_offset is the offset of this block in its stripe */
59645f141126SNikolay Borisov 	stripe_offset = offset - stripe_offset;
59655f141126SNikolay Borisov 	data_stripes = nr_data_stripes(map);
59665f141126SNikolay Borisov 
59675f141126SNikolay Borisov 	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
59685f141126SNikolay Borisov 		u64 max_len = stripe_len - stripe_offset;
59695f141126SNikolay Borisov 
59705f141126SNikolay Borisov 		/*
59715f141126SNikolay Borisov 		 * In case of raid56, we need to know the stripe aligned start
59725f141126SNikolay Borisov 		 */
59735f141126SNikolay Borisov 		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
59745f141126SNikolay Borisov 			unsigned long full_stripe_len = stripe_len * data_stripes;
59755f141126SNikolay Borisov 			raid56_full_stripe_start = offset;
59765f141126SNikolay Borisov 
59775f141126SNikolay Borisov 			/*
59785f141126SNikolay Borisov 			 * Allow a write of a full stripe, but make sure we
59795f141126SNikolay Borisov 			 * don't allow straddling of stripes
59805f141126SNikolay Borisov 			 */
59815f141126SNikolay Borisov 			raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
59825f141126SNikolay Borisov 					full_stripe_len);
59835f141126SNikolay Borisov 			raid56_full_stripe_start *= full_stripe_len;
59845f141126SNikolay Borisov 
59855f141126SNikolay Borisov 			/*
59865f141126SNikolay Borisov 			 * For writes to RAID[56], allow a full stripeset across
59875f141126SNikolay Borisov 			 * all disks. For other RAID types and for RAID[56]
59885f141126SNikolay Borisov 			 * reads, just allow a single stripe (on a single disk).
59895f141126SNikolay Borisov 			 */
59905f141126SNikolay Borisov 			if (op == BTRFS_MAP_WRITE) {
59915f141126SNikolay Borisov 				max_len = stripe_len * data_stripes -
59925f141126SNikolay Borisov 					  (offset - raid56_full_stripe_start);
59935f141126SNikolay Borisov 			}
59945f141126SNikolay Borisov 		}
59955f141126SNikolay Borisov 		len = min_t(u64, em->len - offset, max_len);
59965f141126SNikolay Borisov 	} else {
59975f141126SNikolay Borisov 		len = em->len - offset;
59985f141126SNikolay Borisov 	}
59995f141126SNikolay Borisov 
60005f141126SNikolay Borisov 	io_geom->len = len;
60015f141126SNikolay Borisov 	io_geom->offset = offset;
60025f141126SNikolay Borisov 	io_geom->stripe_len = stripe_len;
60035f141126SNikolay Borisov 	io_geom->stripe_nr = stripe_nr;
60045f141126SNikolay Borisov 	io_geom->stripe_offset = stripe_offset;
60055f141126SNikolay Borisov 	io_geom->raid56_stripe_offset = raid56_full_stripe_start;
60065f141126SNikolay Borisov 
6007373c3b80SJohannes Thumshirn out:
6008373c3b80SJohannes Thumshirn 	/* once for us */
6009373c3b80SJohannes Thumshirn 	free_extent_map(em);
6010373c3b80SJohannes Thumshirn 	return ret;
60115f141126SNikolay Borisov }
60125f141126SNikolay Borisov 
6013cf8cddd3SChristoph Hellwig static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
6014cf8cddd3SChristoph Hellwig 			     enum btrfs_map_op op,
6015cea9e445SChris Mason 			     u64 logical, u64 *length,
6016a1d3c478SJan Schmidt 			     struct btrfs_bio **bbio_ret,
60178e5cfb55SZhao Lei 			     int mirror_num, int need_raid_map)
60180b86a832SChris Mason {
60190b86a832SChris Mason 	struct extent_map *em;
60200b86a832SChris Mason 	struct map_lookup *map;
6021593060d7SChris Mason 	u64 stripe_offset;
6022593060d7SChris Mason 	u64 stripe_nr;
602353b381b3SDavid Woodhouse 	u64 stripe_len;
60249d644a62SDavid Sterba 	u32 stripe_index;
6025cff82672SDavid Sterba 	int data_stripes;
6026cea9e445SChris Mason 	int i;
6027de11cc12SLi Zefan 	int ret = 0;
6028f2d8d74dSChris Mason 	int num_stripes;
6029a236aed1SChris Mason 	int max_errors = 0;
60302c8cdd6eSMiao Xie 	int tgtdev_indexes = 0;
6031a1d3c478SJan Schmidt 	struct btrfs_bio *bbio = NULL;
6032472262f3SStefan Behrens 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
6033472262f3SStefan Behrens 	int dev_replace_is_ongoing = 0;
6034472262f3SStefan Behrens 	int num_alloc_stripes;
6035ad6d620eSStefan Behrens 	int patch_the_first_stripe_for_dev_replace = 0;
6036ad6d620eSStefan Behrens 	u64 physical_to_patch_in_first_stripe = 0;
603753b381b3SDavid Woodhouse 	u64 raid56_full_stripe_start = (u64)-1;
603889b798adSNikolay Borisov 	struct btrfs_io_geometry geom;
603989b798adSNikolay Borisov 
604089b798adSNikolay Borisov 	ASSERT(bbio_ret);
604175fb2e9eSDavid Sterba 	ASSERT(op != BTRFS_MAP_DISCARD);
60420b3d4cd3SLiu Bo 
604389b798adSNikolay Borisov 	ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
604489b798adSNikolay Borisov 	if (ret < 0)
604589b798adSNikolay Borisov 		return ret;
604689b798adSNikolay Borisov 
604760ca842eSOmar Sandoval 	em = btrfs_get_chunk_map(fs_info, logical, *length);
6048f1136989SDan Carpenter 	ASSERT(!IS_ERR(em));
604995617d69SJeff Mahoney 	map = em->map_lookup;
6050593060d7SChris Mason 
605189b798adSNikolay Borisov 	*length = geom.len;
605289b798adSNikolay Borisov 	stripe_len = geom.stripe_len;
605389b798adSNikolay Borisov 	stripe_nr = geom.stripe_nr;
605489b798adSNikolay Borisov 	stripe_offset = geom.stripe_offset;
605589b798adSNikolay Borisov 	raid56_full_stripe_start = geom.raid56_stripe_offset;
6056cff82672SDavid Sterba 	data_stripes = nr_data_stripes(map);
6057593060d7SChris Mason 
6058cb5583ddSDavid Sterba 	down_read(&dev_replace->rwsem);
6059472262f3SStefan Behrens 	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
606053176ddeSDavid Sterba 	/*
606153176ddeSDavid Sterba 	 * Hold the semaphore for read during the whole operation, write is
606253176ddeSDavid Sterba 	 * requested at commit time but must wait.
606353176ddeSDavid Sterba 	 */
6064472262f3SStefan Behrens 	if (!dev_replace_is_ongoing)
6065cb5583ddSDavid Sterba 		up_read(&dev_replace->rwsem);
6066472262f3SStefan Behrens 
6067ad6d620eSStefan Behrens 	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
60682b19a1feSLiu Bo 	    !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
60695ab56090SLiu Bo 		ret = get_extra_mirror_from_replace(fs_info, logical, *length,
60705ab56090SLiu Bo 						    dev_replace->srcdev->devid,
60715ab56090SLiu Bo 						    &mirror_num,
60725ab56090SLiu Bo 					    &physical_to_patch_in_first_stripe);
60735ab56090SLiu Bo 		if (ret)
6074ad6d620eSStefan Behrens 			goto out;
60755ab56090SLiu Bo 		else
607694a97dfeSZhao Lei 			patch_the_first_stripe_for_dev_replace = 1;
6077ad6d620eSStefan Behrens 	} else if (mirror_num > map->num_stripes) {
6078ad6d620eSStefan Behrens 		mirror_num = 0;
6079ad6d620eSStefan Behrens 	}
6080ad6d620eSStefan Behrens 
6081f2d8d74dSChris Mason 	num_stripes = 1;
6082cea9e445SChris Mason 	stripe_index = 0;
6083fce3bb9aSLi Dongyang 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
608447c5713fSDavid Sterba 		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
608547c5713fSDavid Sterba 				&stripe_index);
6086de483734SAnand Jain 		if (!need_full_stripe(op))
608728e1cc7dSMiao Xie 			mirror_num = 1;
6088c7369b3fSDavid Sterba 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
6089de483734SAnand Jain 		if (need_full_stripe(op))
6090f2d8d74dSChris Mason 			num_stripes = map->num_stripes;
60912fff734fSChris Mason 		else if (mirror_num)
6092f188591eSChris Mason 			stripe_index = mirror_num - 1;
6093dfe25020SChris Mason 		else {
609430d9861fSStefan Behrens 			stripe_index = find_live_mirror(fs_info, map, 0,
609530d9861fSStefan Behrens 					    dev_replace_is_ongoing);
6096a1d3c478SJan Schmidt 			mirror_num = stripe_index + 1;
6097dfe25020SChris Mason 		}
60982fff734fSChris Mason 
6099611f0e00SChris Mason 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
6100de483734SAnand Jain 		if (need_full_stripe(op)) {
6101f2d8d74dSChris Mason 			num_stripes = map->num_stripes;
6102a1d3c478SJan Schmidt 		} else if (mirror_num) {
6103f188591eSChris Mason 			stripe_index = mirror_num - 1;
6104a1d3c478SJan Schmidt 		} else {
6105a1d3c478SJan Schmidt 			mirror_num = 1;
6106a1d3c478SJan Schmidt 		}
61072fff734fSChris Mason 
6108321aecc6SChris Mason 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
61099d644a62SDavid Sterba 		u32 factor = map->num_stripes / map->sub_stripes;
6110321aecc6SChris Mason 
611147c5713fSDavid Sterba 		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
6112321aecc6SChris Mason 		stripe_index *= map->sub_stripes;
6113321aecc6SChris Mason 
6114de483734SAnand Jain 		if (need_full_stripe(op))
6115f2d8d74dSChris Mason 			num_stripes = map->sub_stripes;
6116321aecc6SChris Mason 		else if (mirror_num)
6117321aecc6SChris Mason 			stripe_index += mirror_num - 1;
6118dfe25020SChris Mason 		else {
61193e74317aSJan Schmidt 			int old_stripe_index = stripe_index;
612030d9861fSStefan Behrens 			stripe_index = find_live_mirror(fs_info, map,
612130d9861fSStefan Behrens 					      stripe_index,
612230d9861fSStefan Behrens 					      dev_replace_is_ongoing);
61233e74317aSJan Schmidt 			mirror_num = stripe_index - old_stripe_index + 1;
6124dfe25020SChris Mason 		}
612553b381b3SDavid Woodhouse 
6126ffe2d203SZhao Lei 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6127de483734SAnand Jain 		if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
612853b381b3SDavid Woodhouse 			/* push stripe_nr back to the start of the full stripe */
612942c61ab6SLiu Bo 			stripe_nr = div64_u64(raid56_full_stripe_start,
6130cff82672SDavid Sterba 					stripe_len * data_stripes);
613153b381b3SDavid Woodhouse 
613253b381b3SDavid Woodhouse 			/* RAID[56] write or recovery. Return all stripes */
613353b381b3SDavid Woodhouse 			num_stripes = map->num_stripes;
613453b381b3SDavid Woodhouse 			max_errors = nr_parity_stripes(map);
613553b381b3SDavid Woodhouse 
613653b381b3SDavid Woodhouse 			*length = map->stripe_len;
613753b381b3SDavid Woodhouse 			stripe_index = 0;
613853b381b3SDavid Woodhouse 			stripe_offset = 0;
613953b381b3SDavid Woodhouse 		} else {
614053b381b3SDavid Woodhouse 			/*
614153b381b3SDavid Woodhouse 			 * Mirror #0 or #1 means the original data block.
614253b381b3SDavid Woodhouse 			 * Mirror #2 is RAID5 parity block.
614353b381b3SDavid Woodhouse 			 * Mirror #3 is RAID6 Q block.
614453b381b3SDavid Woodhouse 			 */
614547c5713fSDavid Sterba 			stripe_nr = div_u64_rem(stripe_nr,
6146cff82672SDavid Sterba 					data_stripes, &stripe_index);
614753b381b3SDavid Woodhouse 			if (mirror_num > 1)
6148cff82672SDavid Sterba 				stripe_index = data_stripes + mirror_num - 2;
614953b381b3SDavid Woodhouse 
615053b381b3SDavid Woodhouse 			/* We distribute the parity blocks across stripes */
615147c5713fSDavid Sterba 			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
615247c5713fSDavid Sterba 					&stripe_index);
6153de483734SAnand Jain 			if (!need_full_stripe(op) && mirror_num <= 1)
615428e1cc7dSMiao Xie 				mirror_num = 1;
615553b381b3SDavid Woodhouse 		}
61568790d502SChris Mason 	} else {
6157593060d7SChris Mason 		/*
615847c5713fSDavid Sterba 		 * after this, stripe_nr is the number of stripes on this
615947c5713fSDavid Sterba 		 * device we have to walk to find the data, and stripe_index is
616047c5713fSDavid Sterba 		 * the number of our device in the stripe array
6161593060d7SChris Mason 		 */
616247c5713fSDavid Sterba 		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
616347c5713fSDavid Sterba 				&stripe_index);
6164a1d3c478SJan Schmidt 		mirror_num = stripe_index + 1;
61658790d502SChris Mason 	}
6166e042d1ecSJosef Bacik 	if (stripe_index >= map->num_stripes) {
61675d163e0eSJeff Mahoney 		btrfs_crit(fs_info,
61685d163e0eSJeff Mahoney 			   "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
6169e042d1ecSJosef Bacik 			   stripe_index, map->num_stripes);
6170e042d1ecSJosef Bacik 		ret = -EINVAL;
6171e042d1ecSJosef Bacik 		goto out;
6172e042d1ecSJosef Bacik 	}
6173593060d7SChris Mason 
6174472262f3SStefan Behrens 	num_alloc_stripes = num_stripes;
61756fad823fSLiu Bo 	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
61760b3d4cd3SLiu Bo 		if (op == BTRFS_MAP_WRITE)
6177472262f3SStefan Behrens 			num_alloc_stripes <<= 1;
6178cf8cddd3SChristoph Hellwig 		if (op == BTRFS_MAP_GET_READ_MIRRORS)
6179ad6d620eSStefan Behrens 			num_alloc_stripes++;
61802c8cdd6eSMiao Xie 		tgtdev_indexes = num_stripes;
6181ad6d620eSStefan Behrens 	}
61822c8cdd6eSMiao Xie 
61836e9606d2SZhao Lei 	bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
6184de11cc12SLi Zefan 	if (!bbio) {
6185de11cc12SLi Zefan 		ret = -ENOMEM;
6186de11cc12SLi Zefan 		goto out;
6187de11cc12SLi Zefan 	}
6188608769a4SNikolay Borisov 
6189608769a4SNikolay Borisov 	for (i = 0; i < num_stripes; i++) {
6190608769a4SNikolay Borisov 		bbio->stripes[i].physical = map->stripes[stripe_index].physical +
6191608769a4SNikolay Borisov 			stripe_offset + stripe_nr * map->stripe_len;
6192608769a4SNikolay Borisov 		bbio->stripes[i].dev = map->stripes[stripe_index].dev;
6193608769a4SNikolay Borisov 		stripe_index++;
6194608769a4SNikolay Borisov 	}
6195de11cc12SLi Zefan 
61968e5cfb55SZhao Lei 	/* build raid_map */
61972b19a1feSLiu Bo 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
61982b19a1feSLiu Bo 	    (need_full_stripe(op) || mirror_num > 1)) {
61998e5cfb55SZhao Lei 		u64 tmp;
62009d644a62SDavid Sterba 		unsigned rot;
62018e5cfb55SZhao Lei 
62028e5cfb55SZhao Lei 		/* Work out the disk rotation on this stripe-set */
620347c5713fSDavid Sterba 		div_u64_rem(stripe_nr, num_stripes, &rot);
62048e5cfb55SZhao Lei 
62058e5cfb55SZhao Lei 		/* Fill in the logical address of each stripe */
6206cff82672SDavid Sterba 		tmp = stripe_nr * data_stripes;
6207cff82672SDavid Sterba 		for (i = 0; i < data_stripes; i++)
62088e5cfb55SZhao Lei 			bbio->raid_map[(i+rot) % num_stripes] =
62098e5cfb55SZhao Lei 				em->start + (tmp + i) * map->stripe_len;
62108e5cfb55SZhao Lei 
62118e5cfb55SZhao Lei 		bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
62128e5cfb55SZhao Lei 		if (map->type & BTRFS_BLOCK_GROUP_RAID6)
62138e5cfb55SZhao Lei 			bbio->raid_map[(i+rot+1) % num_stripes] =
62148e5cfb55SZhao Lei 				RAID6_Q_STRIPE;
62158e5cfb55SZhao Lei 
6216608769a4SNikolay Borisov 		sort_parity_stripes(bbio, num_stripes);
6217593060d7SChris Mason 	}
6218de11cc12SLi Zefan 
62192b19a1feSLiu Bo 	if (need_full_stripe(op))
6220d20983b4SMiao Xie 		max_errors = btrfs_chunk_max_errors(map);
6221de11cc12SLi Zefan 
622273c0f228SLiu Bo 	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
62232b19a1feSLiu Bo 	    need_full_stripe(op)) {
622473c0f228SLiu Bo 		handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
622573c0f228SLiu Bo 					  &max_errors);
6226ad6d620eSStefan Behrens 	}
6227472262f3SStefan Behrens 
6228a1d3c478SJan Schmidt 	*bbio_ret = bbio;
622910f11900SZhao Lei 	bbio->map_type = map->type;
6230a1d3c478SJan Schmidt 	bbio->num_stripes = num_stripes;
6231a1d3c478SJan Schmidt 	bbio->max_errors = max_errors;
6232a1d3c478SJan Schmidt 	bbio->mirror_num = mirror_num;
6233ad6d620eSStefan Behrens 
6234ad6d620eSStefan Behrens 	/*
6235ad6d620eSStefan Behrens 	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
6236ad6d620eSStefan Behrens 	 * mirror_num == num_stripes + 1 && dev_replace target drive is
6237ad6d620eSStefan Behrens 	 * available as a mirror
6238ad6d620eSStefan Behrens 	 */
6239ad6d620eSStefan Behrens 	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
6240ad6d620eSStefan Behrens 		WARN_ON(num_stripes > 1);
6241ad6d620eSStefan Behrens 		bbio->stripes[0].dev = dev_replace->tgtdev;
6242ad6d620eSStefan Behrens 		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
6243ad6d620eSStefan Behrens 		bbio->mirror_num = map->num_stripes + 1;
6244ad6d620eSStefan Behrens 	}
6245cea9e445SChris Mason out:
624673beece9SLiu Bo 	if (dev_replace_is_ongoing) {
624753176ddeSDavid Sterba 		lockdep_assert_held(&dev_replace->rwsem);
624853176ddeSDavid Sterba 		/* Unlock and let waiting writers proceed */
6249cb5583ddSDavid Sterba 		up_read(&dev_replace->rwsem);
625073beece9SLiu Bo 	}
62510b86a832SChris Mason 	free_extent_map(em);
6252de11cc12SLi Zefan 	return ret;
62530b86a832SChris Mason }
62540b86a832SChris Mason 
6255cf8cddd3SChristoph Hellwig int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6256f2d8d74dSChris Mason 		      u64 logical, u64 *length,
6257a1d3c478SJan Schmidt 		      struct btrfs_bio **bbio_ret, int mirror_num)
6258f2d8d74dSChris Mason {
625975fb2e9eSDavid Sterba 	if (op == BTRFS_MAP_DISCARD)
626075fb2e9eSDavid Sterba 		return __btrfs_map_block_for_discard(fs_info, logical,
626175fb2e9eSDavid Sterba 						     length, bbio_ret);
626275fb2e9eSDavid Sterba 
6263b3d3fa51SMike Christie 	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
62648e5cfb55SZhao Lei 				 mirror_num, 0);
6265f2d8d74dSChris Mason }
6266f2d8d74dSChris Mason 
6267af8e2d1dSMiao Xie /* For Scrub/replace */
6268cf8cddd3SChristoph Hellwig int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6269af8e2d1dSMiao Xie 		     u64 logical, u64 *length,
6270825ad4c9SDavid Sterba 		     struct btrfs_bio **bbio_ret)
6271af8e2d1dSMiao Xie {
6272825ad4c9SDavid Sterba 	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
6273af8e2d1dSMiao Xie }
6274af8e2d1dSMiao Xie 
62754246a0b6SChristoph Hellwig static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
62768408c716SMiao Xie {
6277326e1dbbSMike Snitzer 	bio->bi_private = bbio->private;
6278326e1dbbSMike Snitzer 	bio->bi_end_io = bbio->end_io;
62794246a0b6SChristoph Hellwig 	bio_endio(bio);
6280326e1dbbSMike Snitzer 
62816e9606d2SZhao Lei 	btrfs_put_bbio(bbio);
62828408c716SMiao Xie }
62838408c716SMiao Xie 
62844246a0b6SChristoph Hellwig static void btrfs_end_bio(struct bio *bio)
62858790d502SChris Mason {
62869be3395bSChris Mason 	struct btrfs_bio *bbio = bio->bi_private;
62877d2b4daaSChris Mason 	int is_orig_bio = 0;
62888790d502SChris Mason 
62894e4cbee9SChristoph Hellwig 	if (bio->bi_status) {
6290a1d3c478SJan Schmidt 		atomic_inc(&bbio->error);
62914e4cbee9SChristoph Hellwig 		if (bio->bi_status == BLK_STS_IOERR ||
62924e4cbee9SChristoph Hellwig 		    bio->bi_status == BLK_STS_TARGET) {
6293c31efbdfSNikolay Borisov 			struct btrfs_device *dev = btrfs_io_bio(bio)->device;
6294442a4f63SStefan Behrens 
62953eee86c8SNikolay Borisov 			ASSERT(dev->bdev);
629637226b21SMike Christie 			if (bio_op(bio) == REQ_OP_WRITE)
62971cb34c8eSAnand Jain 				btrfs_dev_stat_inc_and_print(dev,
6298442a4f63SStefan Behrens 						BTRFS_DEV_STAT_WRITE_ERRS);
62990cc068e6SDavid Sterba 			else if (!(bio->bi_opf & REQ_RAHEAD))
63001cb34c8eSAnand Jain 				btrfs_dev_stat_inc_and_print(dev,
6301442a4f63SStefan Behrens 						BTRFS_DEV_STAT_READ_ERRS);
630270fd7614SChristoph Hellwig 			if (bio->bi_opf & REQ_PREFLUSH)
63031cb34c8eSAnand Jain 				btrfs_dev_stat_inc_and_print(dev,
6304442a4f63SStefan Behrens 						BTRFS_DEV_STAT_FLUSH_ERRS);
6305442a4f63SStefan Behrens 		}
6306442a4f63SStefan Behrens 	}
63078790d502SChris Mason 
6308a1d3c478SJan Schmidt 	if (bio == bbio->orig_bio)
63097d2b4daaSChris Mason 		is_orig_bio = 1;
63107d2b4daaSChris Mason 
6311c404e0dcSMiao Xie 	btrfs_bio_counter_dec(bbio->fs_info);
6312c404e0dcSMiao Xie 
6313a1d3c478SJan Schmidt 	if (atomic_dec_and_test(&bbio->stripes_pending)) {
63147d2b4daaSChris Mason 		if (!is_orig_bio) {
63157d2b4daaSChris Mason 			bio_put(bio);
6316a1d3c478SJan Schmidt 			bio = bbio->orig_bio;
63177d2b4daaSChris Mason 		}
6318c7b22bb1SMuthu Kumar 
63199be3395bSChris Mason 		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6320a236aed1SChris Mason 		/* only send an error to the higher layers if it is
632153b381b3SDavid Woodhouse 		 * beyond the tolerance of the btrfs bio
6322a236aed1SChris Mason 		 */
6323a1d3c478SJan Schmidt 		if (atomic_read(&bbio->error) > bbio->max_errors) {
63244e4cbee9SChristoph Hellwig 			bio->bi_status = BLK_STS_IOERR;
63255dbc8fcaSChris Mason 		} else {
63261259ab75SChris Mason 			/*
63271259ab75SChris Mason 			 * this bio is actually up to date, we didn't
63281259ab75SChris Mason 			 * go over the max number of errors
63291259ab75SChris Mason 			 */
63302dbe0c77SAnand Jain 			bio->bi_status = BLK_STS_OK;
63311259ab75SChris Mason 		}
6332c55f1396SMiao Xie 
63334246a0b6SChristoph Hellwig 		btrfs_end_bbio(bbio, bio);
63347d2b4daaSChris Mason 	} else if (!is_orig_bio) {
63358790d502SChris Mason 		bio_put(bio);
63368790d502SChris Mason 	}
63378790d502SChris Mason }
63388790d502SChris Mason 
63392ff7e61eSJeff Mahoney static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6340c31efbdfSNikolay Borisov 			      u64 physical, struct btrfs_device *dev)
6341de1ee92aSJosef Bacik {
63422ff7e61eSJeff Mahoney 	struct btrfs_fs_info *fs_info = bbio->fs_info;
6343de1ee92aSJosef Bacik 
6344de1ee92aSJosef Bacik 	bio->bi_private = bbio;
6345c31efbdfSNikolay Borisov 	btrfs_io_bio(bio)->device = dev;
6346de1ee92aSJosef Bacik 	bio->bi_end_io = btrfs_end_bio;
63474f024f37SKent Overstreet 	bio->bi_iter.bi_sector = physical >> 9;
6348672d5990SMisono Tomohiro 	btrfs_debug_in_rcu(fs_info,
6349ab8d0fc4SJeff Mahoney 	"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6350672d5990SMisono Tomohiro 		bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
63511db45a35SDavid Sterba 		(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
63521db45a35SDavid Sterba 		dev->devid, bio->bi_iter.bi_size);
635374d46992SChristoph Hellwig 	bio_set_dev(bio, dev->bdev);
6354c404e0dcSMiao Xie 
63552ff7e61eSJeff Mahoney 	btrfs_bio_counter_inc_noblocked(fs_info);
6356c404e0dcSMiao Xie 
63574e49ea4aSMike Christie 	btrfsic_submit_bio(bio);
6358de1ee92aSJosef Bacik }
6359de1ee92aSJosef Bacik 
6360de1ee92aSJosef Bacik static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
6361de1ee92aSJosef Bacik {
6362de1ee92aSJosef Bacik 	atomic_inc(&bbio->error);
6363de1ee92aSJosef Bacik 	if (atomic_dec_and_test(&bbio->stripes_pending)) {
636401327610SNicholas D Steeves 		/* Should be the original bio. */
63658408c716SMiao Xie 		WARN_ON(bio != bbio->orig_bio);
63668408c716SMiao Xie 
63679be3395bSChris Mason 		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
63684f024f37SKent Overstreet 		bio->bi_iter.bi_sector = logical >> 9;
6369102ed2c5SAnand Jain 		if (atomic_read(&bbio->error) > bbio->max_errors)
63704e4cbee9SChristoph Hellwig 			bio->bi_status = BLK_STS_IOERR;
6371102ed2c5SAnand Jain 		else
6372102ed2c5SAnand Jain 			bio->bi_status = BLK_STS_OK;
63734246a0b6SChristoph Hellwig 		btrfs_end_bbio(bbio, bio);
6374de1ee92aSJosef Bacik 	}
6375de1ee92aSJosef Bacik }
6376de1ee92aSJosef Bacik 
637758efbc9fSOmar Sandoval blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
637808635baeSChris Mason 			   int mirror_num)
63790b86a832SChris Mason {
63800b86a832SChris Mason 	struct btrfs_device *dev;
63818790d502SChris Mason 	struct bio *first_bio = bio;
63824f024f37SKent Overstreet 	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
63830b86a832SChris Mason 	u64 length = 0;
63840b86a832SChris Mason 	u64 map_length;
63850b86a832SChris Mason 	int ret;
638608da757dSZhao Lei 	int dev_nr;
638708da757dSZhao Lei 	int total_devs;
6388a1d3c478SJan Schmidt 	struct btrfs_bio *bbio = NULL;
63890b86a832SChris Mason 
63904f024f37SKent Overstreet 	length = bio->bi_iter.bi_size;
63910b86a832SChris Mason 	map_length = length;
6392cea9e445SChris Mason 
63930b246afaSJeff Mahoney 	btrfs_bio_counter_inc_blocked(fs_info);
6394bd7d63c2SLiu Bo 	ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
639537226b21SMike Christie 				&map_length, &bbio, mirror_num, 1);
6396c404e0dcSMiao Xie 	if (ret) {
63970b246afaSJeff Mahoney 		btrfs_bio_counter_dec(fs_info);
639858efbc9fSOmar Sandoval 		return errno_to_blk_status(ret);
6399c404e0dcSMiao Xie 	}
6400cea9e445SChris Mason 
6401a1d3c478SJan Schmidt 	total_devs = bbio->num_stripes;
640253b381b3SDavid Woodhouse 	bbio->orig_bio = first_bio;
640353b381b3SDavid Woodhouse 	bbio->private = first_bio->bi_private;
640453b381b3SDavid Woodhouse 	bbio->end_io = first_bio->bi_end_io;
64050b246afaSJeff Mahoney 	bbio->fs_info = fs_info;
640653b381b3SDavid Woodhouse 	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
640753b381b3SDavid Woodhouse 
6408ad1ba2a0SZhao Lei 	if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
640937226b21SMike Christie 	    ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
641053b381b3SDavid Woodhouse 		/* In this case, map_length has been set to the length of
641153b381b3SDavid Woodhouse 		   a single stripe; not the whole write */
641237226b21SMike Christie 		if (bio_op(bio) == REQ_OP_WRITE) {
64132ff7e61eSJeff Mahoney 			ret = raid56_parity_write(fs_info, bio, bbio,
64142ff7e61eSJeff Mahoney 						  map_length);
641553b381b3SDavid Woodhouse 		} else {
64162ff7e61eSJeff Mahoney 			ret = raid56_parity_recover(fs_info, bio, bbio,
64172ff7e61eSJeff Mahoney 						    map_length, mirror_num, 1);
641853b381b3SDavid Woodhouse 		}
64194245215dSMiao Xie 
64200b246afaSJeff Mahoney 		btrfs_bio_counter_dec(fs_info);
642158efbc9fSOmar Sandoval 		return errno_to_blk_status(ret);
642253b381b3SDavid Woodhouse 	}
642353b381b3SDavid Woodhouse 
6424239b14b3SChris Mason 	if (map_length < length) {
64250b246afaSJeff Mahoney 		btrfs_crit(fs_info,
64265d163e0eSJeff Mahoney 			   "mapping failed logical %llu bio len %llu len %llu",
6427c1c9ff7cSGeert Uytterhoeven 			   logical, length, map_length);
6428239b14b3SChris Mason 		BUG();
6429239b14b3SChris Mason 	}
6430a1d3c478SJan Schmidt 
643108da757dSZhao Lei 	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6432de1ee92aSJosef Bacik 		dev = bbio->stripes[dev_nr].dev;
6433fc8a168aSNikolay Borisov 		if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
6434fc8a168aSNikolay Borisov 						   &dev->dev_state) ||
6435ebbede42SAnand Jain 		    (bio_op(first_bio) == REQ_OP_WRITE &&
6436ebbede42SAnand Jain 		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
6437de1ee92aSJosef Bacik 			bbio_error(bbio, first_bio, logical);
6438de1ee92aSJosef Bacik 			continue;
6439de1ee92aSJosef Bacik 		}
6440de1ee92aSJosef Bacik 
64413aa8e074SDavid Sterba 		if (dev_nr < total_devs - 1)
64428b6c1d56SDavid Sterba 			bio = btrfs_bio_clone(first_bio);
64433aa8e074SDavid Sterba 		else
64448790d502SChris Mason 			bio = first_bio;
6445606686eeSJosef Bacik 
6446c31efbdfSNikolay Borisov 		submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
64478790d502SChris Mason 	}
64480b246afaSJeff Mahoney 	btrfs_bio_counter_dec(fs_info);
644958efbc9fSOmar Sandoval 	return BLK_STS_OK;
64500b86a832SChris Mason }
64510b86a832SChris Mason 
645209ba3bc9SAnand Jain /*
645309ba3bc9SAnand Jain  * Find a device specified by @devid or @uuid in the list of @fs_devices, or
645409ba3bc9SAnand Jain  * return NULL.
645509ba3bc9SAnand Jain  *
645609ba3bc9SAnand Jain  * If devid and uuid are both specified, the match must be exact, otherwise
645709ba3bc9SAnand Jain  * only devid is used.
645809ba3bc9SAnand Jain  *
645909ba3bc9SAnand Jain  * If @seed is true, traverse through the seed devices.
646009ba3bc9SAnand Jain  */
6461e4319cd9SAnand Jain struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
646209ba3bc9SAnand Jain 				       u64 devid, u8 *uuid, u8 *fsid,
646309ba3bc9SAnand Jain 				       bool seed)
64640b86a832SChris Mason {
64652b82032cSYan Zheng 	struct btrfs_device *device;
6466944d3f9fSNikolay Borisov 	struct btrfs_fs_devices *seed_devs;
64670b86a832SChris Mason 
6468944d3f9fSNikolay Borisov 	if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6469944d3f9fSNikolay Borisov 		list_for_each_entry(device, &fs_devices->devices, dev_list) {
6470944d3f9fSNikolay Borisov 			if (device->devid == devid &&
6471944d3f9fSNikolay Borisov 			    (!uuid || memcmp(device->uuid, uuid,
6472944d3f9fSNikolay Borisov 					     BTRFS_UUID_SIZE) == 0))
6473944d3f9fSNikolay Borisov 				return device;
6474944d3f9fSNikolay Borisov 		}
6475944d3f9fSNikolay Borisov 	}
6476944d3f9fSNikolay Borisov 
6477944d3f9fSNikolay Borisov 	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
64782b82032cSYan Zheng 		if (!fsid ||
6479944d3f9fSNikolay Borisov 		    !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6480944d3f9fSNikolay Borisov 			list_for_each_entry(device, &seed_devs->devices,
648109ba3bc9SAnand Jain 					    dev_list) {
648209ba3bc9SAnand Jain 				if (device->devid == devid &&
648309ba3bc9SAnand Jain 				    (!uuid || memcmp(device->uuid, uuid,
648409ba3bc9SAnand Jain 						     BTRFS_UUID_SIZE) == 0))
64852b82032cSYan Zheng 					return device;
64862b82032cSYan Zheng 			}
648709ba3bc9SAnand Jain 		}
64882b82032cSYan Zheng 	}
6489944d3f9fSNikolay Borisov 
64902b82032cSYan Zheng 	return NULL;
64910b86a832SChris Mason }
64920b86a832SChris Mason 
64932ff7e61eSJeff Mahoney static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6494dfe25020SChris Mason 					    u64 devid, u8 *dev_uuid)
6495dfe25020SChris Mason {
6496dfe25020SChris Mason 	struct btrfs_device *device;
6497fccc0007SJosef Bacik 	unsigned int nofs_flag;
6498dfe25020SChris Mason 
6499fccc0007SJosef Bacik 	/*
6500fccc0007SJosef Bacik 	 * We call this under the chunk_mutex, so we want to use NOFS for this
6501fccc0007SJosef Bacik 	 * allocation, however we don't want to change btrfs_alloc_device() to
6502fccc0007SJosef Bacik 	 * always do NOFS because we use it in a lot of other GFP_KERNEL safe
6503fccc0007SJosef Bacik 	 * places.
6504fccc0007SJosef Bacik 	 */
6505fccc0007SJosef Bacik 	nofs_flag = memalloc_nofs_save();
650612bd2fc0SIlya Dryomov 	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6507fccc0007SJosef Bacik 	memalloc_nofs_restore(nofs_flag);
650812bd2fc0SIlya Dryomov 	if (IS_ERR(device))
6509adfb69afSAnand Jain 		return device;
651012bd2fc0SIlya Dryomov 
651112bd2fc0SIlya Dryomov 	list_add(&device->dev_list, &fs_devices->devices);
6512e4404d6eSYan Zheng 	device->fs_devices = fs_devices;
6513dfe25020SChris Mason 	fs_devices->num_devices++;
651412bd2fc0SIlya Dryomov 
6515e6e674bdSAnand Jain 	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6516cd02dca5SChris Mason 	fs_devices->missing_devices++;
651712bd2fc0SIlya Dryomov 
6518dfe25020SChris Mason 	return device;
6519dfe25020SChris Mason }
6520dfe25020SChris Mason 
652112bd2fc0SIlya Dryomov /**
652212bd2fc0SIlya Dryomov  * btrfs_alloc_device - allocate struct btrfs_device
652312bd2fc0SIlya Dryomov  * @fs_info:	used only for generating a new devid, can be NULL if
652412bd2fc0SIlya Dryomov  *		devid is provided (i.e. @devid != NULL).
652512bd2fc0SIlya Dryomov  * @devid:	a pointer to devid for this device.  If NULL a new devid
652612bd2fc0SIlya Dryomov  *		is generated.
652712bd2fc0SIlya Dryomov  * @uuid:	a pointer to UUID for this device.  If NULL a new UUID
652812bd2fc0SIlya Dryomov  *		is generated.
652912bd2fc0SIlya Dryomov  *
653012bd2fc0SIlya Dryomov  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
653148dae9cfSDavid Sterba  * on error.  Returned struct is not linked onto any lists and must be
6532a425f9d4SDavid Sterba  * destroyed with btrfs_free_device.
653312bd2fc0SIlya Dryomov  */
653412bd2fc0SIlya Dryomov struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
653512bd2fc0SIlya Dryomov 					const u64 *devid,
653612bd2fc0SIlya Dryomov 					const u8 *uuid)
653712bd2fc0SIlya Dryomov {
653812bd2fc0SIlya Dryomov 	struct btrfs_device *dev;
653912bd2fc0SIlya Dryomov 	u64 tmp;
654012bd2fc0SIlya Dryomov 
6541fae7f21cSDulshani Gunawardhana 	if (WARN_ON(!devid && !fs_info))
654212bd2fc0SIlya Dryomov 		return ERR_PTR(-EINVAL);
654312bd2fc0SIlya Dryomov 
6544154f7cb8SQu Wenruo 	dev = __alloc_device(fs_info);
654512bd2fc0SIlya Dryomov 	if (IS_ERR(dev))
654612bd2fc0SIlya Dryomov 		return dev;
654712bd2fc0SIlya Dryomov 
654812bd2fc0SIlya Dryomov 	if (devid)
654912bd2fc0SIlya Dryomov 		tmp = *devid;
655012bd2fc0SIlya Dryomov 	else {
655112bd2fc0SIlya Dryomov 		int ret;
655212bd2fc0SIlya Dryomov 
655312bd2fc0SIlya Dryomov 		ret = find_next_devid(fs_info, &tmp);
655412bd2fc0SIlya Dryomov 		if (ret) {
6555a425f9d4SDavid Sterba 			btrfs_free_device(dev);
655612bd2fc0SIlya Dryomov 			return ERR_PTR(ret);
655712bd2fc0SIlya Dryomov 		}
655812bd2fc0SIlya Dryomov 	}
655912bd2fc0SIlya Dryomov 	dev->devid = tmp;
656012bd2fc0SIlya Dryomov 
656112bd2fc0SIlya Dryomov 	if (uuid)
656212bd2fc0SIlya Dryomov 		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
656312bd2fc0SIlya Dryomov 	else
656412bd2fc0SIlya Dryomov 		generate_random_uuid(dev->uuid);
656512bd2fc0SIlya Dryomov 
656612bd2fc0SIlya Dryomov 	return dev;
656712bd2fc0SIlya Dryomov }
656812bd2fc0SIlya Dryomov 
65695a2b8e60SAnand Jain static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
65702b902dfcSAnand Jain 					u64 devid, u8 *uuid, bool error)
65715a2b8e60SAnand Jain {
65722b902dfcSAnand Jain 	if (error)
65732b902dfcSAnand Jain 		btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
65742b902dfcSAnand Jain 			      devid, uuid);
65752b902dfcSAnand Jain 	else
65762b902dfcSAnand Jain 		btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
65772b902dfcSAnand Jain 			      devid, uuid);
65785a2b8e60SAnand Jain }
65795a2b8e60SAnand Jain 
658039e264a4SNikolay Borisov static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
658139e264a4SNikolay Borisov {
658239e264a4SNikolay Borisov 	int index = btrfs_bg_flags_to_raid_index(type);
658339e264a4SNikolay Borisov 	int ncopies = btrfs_raid_array[index].ncopies;
6584e4f6c6beSDavid Sterba 	const int nparity = btrfs_raid_array[index].nparity;
658539e264a4SNikolay Borisov 	int data_stripes;
658639e264a4SNikolay Borisov 
6587e4f6c6beSDavid Sterba 	if (nparity)
6588e4f6c6beSDavid Sterba 		data_stripes = num_stripes - nparity;
6589e4f6c6beSDavid Sterba 	else
659039e264a4SNikolay Borisov 		data_stripes = num_stripes / ncopies;
6591e4f6c6beSDavid Sterba 
659239e264a4SNikolay Borisov 	return div_u64(chunk_len, data_stripes);
659339e264a4SNikolay Borisov }
659439e264a4SNikolay Borisov 
65959690ac09SDavid Sterba static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
65960b86a832SChris Mason 			  struct btrfs_chunk *chunk)
65970b86a832SChris Mason {
65989690ac09SDavid Sterba 	struct btrfs_fs_info *fs_info = leaf->fs_info;
6599c8bf1b67SDavid Sterba 	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
66000b86a832SChris Mason 	struct map_lookup *map;
66010b86a832SChris Mason 	struct extent_map *em;
66020b86a832SChris Mason 	u64 logical;
66030b86a832SChris Mason 	u64 length;
66040b86a832SChris Mason 	u64 devid;
6605a443755fSChris Mason 	u8 uuid[BTRFS_UUID_SIZE];
6606593060d7SChris Mason 	int num_stripes;
66070b86a832SChris Mason 	int ret;
6608593060d7SChris Mason 	int i;
66090b86a832SChris Mason 
6610e17cade2SChris Mason 	logical = key->offset;
6611e17cade2SChris Mason 	length = btrfs_chunk_length(leaf, chunk);
6612f04b772bSQu Wenruo 	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6613e06cd3ddSLiu Bo 
6614075cb3c7SQu Wenruo 	/*
6615075cb3c7SQu Wenruo 	 * Only need to verify chunk item if we're reading from sys chunk array,
6616075cb3c7SQu Wenruo 	 * as chunk item in tree block is already verified by tree-checker.
6617075cb3c7SQu Wenruo 	 */
6618075cb3c7SQu Wenruo 	if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6619ddaf1d5aSDavid Sterba 		ret = btrfs_check_chunk_valid(leaf, chunk, logical);
6620e06cd3ddSLiu Bo 		if (ret)
6621e06cd3ddSLiu Bo 			return ret;
6622075cb3c7SQu Wenruo 	}
6623a061fc8dSChris Mason 
6624c8bf1b67SDavid Sterba 	read_lock(&map_tree->lock);
6625c8bf1b67SDavid Sterba 	em = lookup_extent_mapping(map_tree, logical, 1);
6626c8bf1b67SDavid Sterba 	read_unlock(&map_tree->lock);
66270b86a832SChris Mason 
66280b86a832SChris Mason 	/* already mapped? */
66290b86a832SChris Mason 	if (em && em->start <= logical && em->start + em->len > logical) {
66300b86a832SChris Mason 		free_extent_map(em);
66310b86a832SChris Mason 		return 0;
66320b86a832SChris Mason 	} else if (em) {
66330b86a832SChris Mason 		free_extent_map(em);
66340b86a832SChris Mason 	}
66350b86a832SChris Mason 
6636172ddd60SDavid Sterba 	em = alloc_extent_map();
66370b86a832SChris Mason 	if (!em)
66380b86a832SChris Mason 		return -ENOMEM;
6639593060d7SChris Mason 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
66400b86a832SChris Mason 	if (!map) {
66410b86a832SChris Mason 		free_extent_map(em);
66420b86a832SChris Mason 		return -ENOMEM;
66430b86a832SChris Mason 	}
66440b86a832SChris Mason 
6645298a8f9cSWang Shilong 	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
664695617d69SJeff Mahoney 	em->map_lookup = map;
66470b86a832SChris Mason 	em->start = logical;
66480b86a832SChris Mason 	em->len = length;
664970c8a91cSJosef Bacik 	em->orig_start = 0;
66500b86a832SChris Mason 	em->block_start = 0;
6651c8b97818SChris Mason 	em->block_len = em->len;
66520b86a832SChris Mason 
6653593060d7SChris Mason 	map->num_stripes = num_stripes;
6654593060d7SChris Mason 	map->io_width = btrfs_chunk_io_width(leaf, chunk);
6655593060d7SChris Mason 	map->io_align = btrfs_chunk_io_align(leaf, chunk);
6656593060d7SChris Mason 	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6657593060d7SChris Mason 	map->type = btrfs_chunk_type(leaf, chunk);
6658321aecc6SChris Mason 	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6659cf90d884SQu Wenruo 	map->verified_stripes = 0;
666039e264a4SNikolay Borisov 	em->orig_block_len = calc_stripe_length(map->type, em->len,
666139e264a4SNikolay Borisov 						map->num_stripes);
6662593060d7SChris Mason 	for (i = 0; i < num_stripes; i++) {
6663593060d7SChris Mason 		map->stripes[i].physical =
6664593060d7SChris Mason 			btrfs_stripe_offset_nr(leaf, chunk, i);
6665593060d7SChris Mason 		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
6666a443755fSChris Mason 		read_extent_buffer(leaf, uuid, (unsigned long)
6667a443755fSChris Mason 				   btrfs_stripe_dev_uuid_nr(chunk, i),
6668a443755fSChris Mason 				   BTRFS_UUID_SIZE);
6669e4319cd9SAnand Jain 		map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
667009ba3bc9SAnand Jain 							devid, uuid, NULL, true);
66713cdde224SJeff Mahoney 		if (!map->stripes[i].dev &&
66720b246afaSJeff Mahoney 		    !btrfs_test_opt(fs_info, DEGRADED)) {
6673dfe25020SChris Mason 			free_extent_map(em);
66742b902dfcSAnand Jain 			btrfs_report_missing_device(fs_info, devid, uuid, true);
667545dbdbc9SAnand Jain 			return -ENOENT;
6676dfe25020SChris Mason 		}
6677dfe25020SChris Mason 		if (!map->stripes[i].dev) {
6678dfe25020SChris Mason 			map->stripes[i].dev =
66792ff7e61eSJeff Mahoney 				add_missing_dev(fs_info->fs_devices, devid,
66802ff7e61eSJeff Mahoney 						uuid);
6681adfb69afSAnand Jain 			if (IS_ERR(map->stripes[i].dev)) {
66820b86a832SChris Mason 				free_extent_map(em);
6683adfb69afSAnand Jain 				btrfs_err(fs_info,
6684adfb69afSAnand Jain 					"failed to init missing dev %llu: %ld",
6685adfb69afSAnand Jain 					devid, PTR_ERR(map->stripes[i].dev));
6686adfb69afSAnand Jain 				return PTR_ERR(map->stripes[i].dev);
66870b86a832SChris Mason 			}
66882b902dfcSAnand Jain 			btrfs_report_missing_device(fs_info, devid, uuid, false);
6689593060d7SChris Mason 		}
6690e12c9621SAnand Jain 		set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
6691e12c9621SAnand Jain 				&(map->stripes[i].dev->dev_state));
6692e12c9621SAnand Jain 
6693dfe25020SChris Mason 	}
66940b86a832SChris Mason 
6695c8bf1b67SDavid Sterba 	write_lock(&map_tree->lock);
6696c8bf1b67SDavid Sterba 	ret = add_extent_mapping(map_tree, em, 0);
6697c8bf1b67SDavid Sterba 	write_unlock(&map_tree->lock);
669864f64f43SQu Wenruo 	if (ret < 0) {
669964f64f43SQu Wenruo 		btrfs_err(fs_info,
670064f64f43SQu Wenruo 			  "failed to add chunk map, start=%llu len=%llu: %d",
670164f64f43SQu Wenruo 			  em->start, em->len, ret);
670264f64f43SQu Wenruo 	}
67030b86a832SChris Mason 	free_extent_map(em);
67040b86a832SChris Mason 
670564f64f43SQu Wenruo 	return ret;
67060b86a832SChris Mason }
67070b86a832SChris Mason 
6708143bede5SJeff Mahoney static void fill_device_from_item(struct extent_buffer *leaf,
67090b86a832SChris Mason 				 struct btrfs_dev_item *dev_item,
67100b86a832SChris Mason 				 struct btrfs_device *device)
67110b86a832SChris Mason {
67120b86a832SChris Mason 	unsigned long ptr;
67130b86a832SChris Mason 
67140b86a832SChris Mason 	device->devid = btrfs_device_id(leaf, dev_item);
6715d6397baeSChris Ball 	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6716d6397baeSChris Ball 	device->total_bytes = device->disk_total_bytes;
6717935e5cc9SMiao Xie 	device->commit_total_bytes = device->disk_total_bytes;
67180b86a832SChris Mason 	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6719ce7213c7SMiao Xie 	device->commit_bytes_used = device->bytes_used;
67200b86a832SChris Mason 	device->type = btrfs_device_type(leaf, dev_item);
67210b86a832SChris Mason 	device->io_align = btrfs_device_io_align(leaf, dev_item);
67220b86a832SChris Mason 	device->io_width = btrfs_device_io_width(leaf, dev_item);
67230b86a832SChris Mason 	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
67248dabb742SStefan Behrens 	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6725401e29c1SAnand Jain 	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
67260b86a832SChris Mason 
6727410ba3a2SGeert Uytterhoeven 	ptr = btrfs_device_uuid(dev_item);
6728e17cade2SChris Mason 	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
67290b86a832SChris Mason }
67300b86a832SChris Mason 
67312ff7e61eSJeff Mahoney static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
67325f375835SMiao Xie 						  u8 *fsid)
67332b82032cSYan Zheng {
67342b82032cSYan Zheng 	struct btrfs_fs_devices *fs_devices;
67352b82032cSYan Zheng 	int ret;
67362b82032cSYan Zheng 
6737a32bf9a3SDavid Sterba 	lockdep_assert_held(&uuid_mutex);
67382dfeca9bSDavid Sterba 	ASSERT(fsid);
67392b82032cSYan Zheng 
6740427c8fddSNikolay Borisov 	/* This will match only for multi-device seed fs */
6741944d3f9fSNikolay Borisov 	list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
674244880fdcSAnand Jain 		if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
67435f375835SMiao Xie 			return fs_devices;
67445f375835SMiao Xie 
67452b82032cSYan Zheng 
67467239ff4bSNikolay Borisov 	fs_devices = find_fsid(fsid, NULL);
67472b82032cSYan Zheng 	if (!fs_devices) {
67480b246afaSJeff Mahoney 		if (!btrfs_test_opt(fs_info, DEGRADED))
67495f375835SMiao Xie 			return ERR_PTR(-ENOENT);
67505f375835SMiao Xie 
67517239ff4bSNikolay Borisov 		fs_devices = alloc_fs_devices(fsid, NULL);
67525f375835SMiao Xie 		if (IS_ERR(fs_devices))
67535f375835SMiao Xie 			return fs_devices;
67545f375835SMiao Xie 
67550395d84fSJohannes Thumshirn 		fs_devices->seeding = true;
67565f375835SMiao Xie 		fs_devices->opened = 1;
67575f375835SMiao Xie 		return fs_devices;
67582b82032cSYan Zheng 	}
6759e4404d6eSYan Zheng 
6760427c8fddSNikolay Borisov 	/*
6761427c8fddSNikolay Borisov 	 * Upon first call for a seed fs fsid, just create a private copy of the
6762427c8fddSNikolay Borisov 	 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
6763427c8fddSNikolay Borisov 	 */
6764e4404d6eSYan Zheng 	fs_devices = clone_fs_devices(fs_devices);
67655f375835SMiao Xie 	if (IS_ERR(fs_devices))
67665f375835SMiao Xie 		return fs_devices;
67672b82032cSYan Zheng 
6768897fb573SAnand Jain 	ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
676948d28232SJulia Lawall 	if (ret) {
677048d28232SJulia Lawall 		free_fs_devices(fs_devices);
6771c83b60c0SAnand Jain 		return ERR_PTR(ret);
677248d28232SJulia Lawall 	}
67732b82032cSYan Zheng 
67742b82032cSYan Zheng 	if (!fs_devices->seeding) {
67750226e0ebSAnand Jain 		close_fs_devices(fs_devices);
6776e4404d6eSYan Zheng 		free_fs_devices(fs_devices);
6777c83b60c0SAnand Jain 		return ERR_PTR(-EINVAL);
67782b82032cSYan Zheng 	}
67792b82032cSYan Zheng 
6780944d3f9fSNikolay Borisov 	list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
6781c83b60c0SAnand Jain 
67825f375835SMiao Xie 	return fs_devices;
67832b82032cSYan Zheng }
67842b82032cSYan Zheng 
678517850759SDavid Sterba static int read_one_dev(struct extent_buffer *leaf,
67860b86a832SChris Mason 			struct btrfs_dev_item *dev_item)
67870b86a832SChris Mason {
678817850759SDavid Sterba 	struct btrfs_fs_info *fs_info = leaf->fs_info;
67890b246afaSJeff Mahoney 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
67900b86a832SChris Mason 	struct btrfs_device *device;
67910b86a832SChris Mason 	u64 devid;
67920b86a832SChris Mason 	int ret;
679344880fdcSAnand Jain 	u8 fs_uuid[BTRFS_FSID_SIZE];
6794a443755fSChris Mason 	u8 dev_uuid[BTRFS_UUID_SIZE];
6795a443755fSChris Mason 
67960b86a832SChris Mason 	devid = btrfs_device_id(leaf, dev_item);
6797410ba3a2SGeert Uytterhoeven 	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
6798a443755fSChris Mason 			   BTRFS_UUID_SIZE);
67991473b24eSGeert Uytterhoeven 	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
680044880fdcSAnand Jain 			   BTRFS_FSID_SIZE);
68012b82032cSYan Zheng 
6802de37aa51SNikolay Borisov 	if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
68032ff7e61eSJeff Mahoney 		fs_devices = open_seed_devices(fs_info, fs_uuid);
68045f375835SMiao Xie 		if (IS_ERR(fs_devices))
68055f375835SMiao Xie 			return PTR_ERR(fs_devices);
68062b82032cSYan Zheng 	}
68072b82032cSYan Zheng 
6808e4319cd9SAnand Jain 	device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
680909ba3bc9SAnand Jain 				   fs_uuid, true);
68105f375835SMiao Xie 	if (!device) {
6811c5502451SQu Wenruo 		if (!btrfs_test_opt(fs_info, DEGRADED)) {
68122b902dfcSAnand Jain 			btrfs_report_missing_device(fs_info, devid,
68132b902dfcSAnand Jain 							dev_uuid, true);
681445dbdbc9SAnand Jain 			return -ENOENT;
6815c5502451SQu Wenruo 		}
68162b82032cSYan Zheng 
68172ff7e61eSJeff Mahoney 		device = add_missing_dev(fs_devices, devid, dev_uuid);
6818adfb69afSAnand Jain 		if (IS_ERR(device)) {
6819adfb69afSAnand Jain 			btrfs_err(fs_info,
6820adfb69afSAnand Jain 				"failed to add missing dev %llu: %ld",
6821adfb69afSAnand Jain 				devid, PTR_ERR(device));
6822adfb69afSAnand Jain 			return PTR_ERR(device);
6823adfb69afSAnand Jain 		}
68242b902dfcSAnand Jain 		btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
68255f375835SMiao Xie 	} else {
6826c5502451SQu Wenruo 		if (!device->bdev) {
68272b902dfcSAnand Jain 			if (!btrfs_test_opt(fs_info, DEGRADED)) {
68282b902dfcSAnand Jain 				btrfs_report_missing_device(fs_info,
68292b902dfcSAnand Jain 						devid, dev_uuid, true);
683045dbdbc9SAnand Jain 				return -ENOENT;
6831c5502451SQu Wenruo 			}
68322b902dfcSAnand Jain 			btrfs_report_missing_device(fs_info, devid,
68332b902dfcSAnand Jain 							dev_uuid, false);
68342b902dfcSAnand Jain 		}
68355f375835SMiao Xie 
6836e6e674bdSAnand Jain 		if (!device->bdev &&
6837e6e674bdSAnand Jain 		    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
6838cd02dca5SChris Mason 			/*
6839cd02dca5SChris Mason 			 * this happens when a device that was properly setup
6840cd02dca5SChris Mason 			 * in the device info lists suddenly goes bad.
6841cd02dca5SChris Mason 			 * device->bdev is NULL, and so we have to set
6842cd02dca5SChris Mason 			 * device->missing to one here
6843cd02dca5SChris Mason 			 */
68445f375835SMiao Xie 			device->fs_devices->missing_devices++;
6845e6e674bdSAnand Jain 			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
68466324fbf3SChris Mason 		}
68475f375835SMiao Xie 
68485f375835SMiao Xie 		/* Move the device to its own fs_devices */
68495f375835SMiao Xie 		if (device->fs_devices != fs_devices) {
6850e6e674bdSAnand Jain 			ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
6851e6e674bdSAnand Jain 							&device->dev_state));
68525f375835SMiao Xie 
68535f375835SMiao Xie 			list_move(&device->dev_list, &fs_devices->devices);
68545f375835SMiao Xie 			device->fs_devices->num_devices--;
68555f375835SMiao Xie 			fs_devices->num_devices++;
68565f375835SMiao Xie 
68575f375835SMiao Xie 			device->fs_devices->missing_devices--;
68585f375835SMiao Xie 			fs_devices->missing_devices++;
68595f375835SMiao Xie 
68605f375835SMiao Xie 			device->fs_devices = fs_devices;
68615f375835SMiao Xie 		}
68622b82032cSYan Zheng 	}
68632b82032cSYan Zheng 
68640b246afaSJeff Mahoney 	if (device->fs_devices != fs_info->fs_devices) {
6865ebbede42SAnand Jain 		BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
68662b82032cSYan Zheng 		if (device->generation !=
68672b82032cSYan Zheng 		    btrfs_device_generation(leaf, dev_item))
68682b82032cSYan Zheng 			return -EINVAL;
68692b82032cSYan Zheng 	}
68700b86a832SChris Mason 
68710b86a832SChris Mason 	fill_device_from_item(leaf, dev_item, device);
6872e12c9621SAnand Jain 	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
6873ebbede42SAnand Jain 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
6874401e29c1SAnand Jain 	   !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
68752b82032cSYan Zheng 		device->fs_devices->total_rw_bytes += device->total_bytes;
6876a5ed45f8SNikolay Borisov 		atomic64_add(device->total_bytes - device->bytes_used,
6877a5ed45f8SNikolay Borisov 				&fs_info->free_chunk_space);
68782bf64758SJosef Bacik 	}
68790b86a832SChris Mason 	ret = 0;
68800b86a832SChris Mason 	return ret;
68810b86a832SChris Mason }
68820b86a832SChris Mason 
68836bccf3abSJeff Mahoney int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
68840b86a832SChris Mason {
68856bccf3abSJeff Mahoney 	struct btrfs_root *root = fs_info->tree_root;
6886ab8d0fc4SJeff Mahoney 	struct btrfs_super_block *super_copy = fs_info->super_copy;
6887a061fc8dSChris Mason 	struct extent_buffer *sb;
68880b86a832SChris Mason 	struct btrfs_disk_key *disk_key;
68890b86a832SChris Mason 	struct btrfs_chunk *chunk;
68901ffb22cfSDavid Sterba 	u8 *array_ptr;
68911ffb22cfSDavid Sterba 	unsigned long sb_array_offset;
689284eed90fSChris Mason 	int ret = 0;
68930b86a832SChris Mason 	u32 num_stripes;
68940b86a832SChris Mason 	u32 array_size;
68950b86a832SChris Mason 	u32 len = 0;
68961ffb22cfSDavid Sterba 	u32 cur_offset;
6897e06cd3ddSLiu Bo 	u64 type;
689884eed90fSChris Mason 	struct btrfs_key key;
68990b86a832SChris Mason 
69000b246afaSJeff Mahoney 	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
6901a83fffb7SDavid Sterba 	/*
6902a83fffb7SDavid Sterba 	 * This will create extent buffer of nodesize, superblock size is
6903a83fffb7SDavid Sterba 	 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
6904a83fffb7SDavid Sterba 	 * overallocate but we can keep it as-is, only the first page is used.
6905a83fffb7SDavid Sterba 	 */
69063fbaf258SJosef Bacik 	sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
69073fbaf258SJosef Bacik 					  root->root_key.objectid, 0);
6908c871b0f2SLiu Bo 	if (IS_ERR(sb))
6909c871b0f2SLiu Bo 		return PTR_ERR(sb);
69104db8c528SDavid Sterba 	set_extent_buffer_uptodate(sb);
69118a334426SDavid Sterba 	/*
691201327610SNicholas D Steeves 	 * The sb extent buffer is artificial and just used to read the system array.
69134db8c528SDavid Sterba 	 * set_extent_buffer_uptodate() call does not properly mark all it's
69148a334426SDavid Sterba 	 * pages up-to-date when the page is larger: extent does not cover the
69158a334426SDavid Sterba 	 * whole page and consequently check_page_uptodate does not find all
69168a334426SDavid Sterba 	 * the page's extents up-to-date (the hole beyond sb),
69178a334426SDavid Sterba 	 * write_extent_buffer then triggers a WARN_ON.
69188a334426SDavid Sterba 	 *
69198a334426SDavid Sterba 	 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
69208a334426SDavid Sterba 	 * but sb spans only this function. Add an explicit SetPageUptodate call
69218a334426SDavid Sterba 	 * to silence the warning eg. on PowerPC 64.
69228a334426SDavid Sterba 	 */
692309cbfeafSKirill A. Shutemov 	if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
6924727011e0SChris Mason 		SetPageUptodate(sb->pages[0]);
69254008c04aSChris Mason 
6926a061fc8dSChris Mason 	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
69270b86a832SChris Mason 	array_size = btrfs_super_sys_array_size(super_copy);
69280b86a832SChris Mason 
69291ffb22cfSDavid Sterba 	array_ptr = super_copy->sys_chunk_array;
69301ffb22cfSDavid Sterba 	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
69311ffb22cfSDavid Sterba 	cur_offset = 0;
69320b86a832SChris Mason 
69331ffb22cfSDavid Sterba 	while (cur_offset < array_size) {
69341ffb22cfSDavid Sterba 		disk_key = (struct btrfs_disk_key *)array_ptr;
6935e3540eabSDavid Sterba 		len = sizeof(*disk_key);
6936e3540eabSDavid Sterba 		if (cur_offset + len > array_size)
6937e3540eabSDavid Sterba 			goto out_short_read;
6938e3540eabSDavid Sterba 
69390b86a832SChris Mason 		btrfs_disk_key_to_cpu(&key, disk_key);
69400b86a832SChris Mason 
69411ffb22cfSDavid Sterba 		array_ptr += len;
69421ffb22cfSDavid Sterba 		sb_array_offset += len;
69431ffb22cfSDavid Sterba 		cur_offset += len;
69440b86a832SChris Mason 
694532ab3d1bSJohannes Thumshirn 		if (key.type != BTRFS_CHUNK_ITEM_KEY) {
694632ab3d1bSJohannes Thumshirn 			btrfs_err(fs_info,
694732ab3d1bSJohannes Thumshirn 			    "unexpected item type %u in sys_array at offset %u",
694832ab3d1bSJohannes Thumshirn 				  (u32)key.type, cur_offset);
694932ab3d1bSJohannes Thumshirn 			ret = -EIO;
695032ab3d1bSJohannes Thumshirn 			break;
695132ab3d1bSJohannes Thumshirn 		}
695232ab3d1bSJohannes Thumshirn 
69531ffb22cfSDavid Sterba 		chunk = (struct btrfs_chunk *)sb_array_offset;
6954e3540eabSDavid Sterba 		/*
695532ab3d1bSJohannes Thumshirn 		 * At least one btrfs_chunk with one stripe must be present,
695632ab3d1bSJohannes Thumshirn 		 * exact stripe count check comes afterwards
6957e3540eabSDavid Sterba 		 */
6958e3540eabSDavid Sterba 		len = btrfs_chunk_item_size(1);
6959e3540eabSDavid Sterba 		if (cur_offset + len > array_size)
6960e3540eabSDavid Sterba 			goto out_short_read;
6961e3540eabSDavid Sterba 
6962e3540eabSDavid Sterba 		num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6963f5cdedd7SDavid Sterba 		if (!num_stripes) {
6964ab8d0fc4SJeff Mahoney 			btrfs_err(fs_info,
6965ab8d0fc4SJeff Mahoney 			"invalid number of stripes %u in sys_array at offset %u",
6966f5cdedd7SDavid Sterba 				  num_stripes, cur_offset);
6967f5cdedd7SDavid Sterba 			ret = -EIO;
6968f5cdedd7SDavid Sterba 			break;
6969f5cdedd7SDavid Sterba 		}
6970f5cdedd7SDavid Sterba 
6971e06cd3ddSLiu Bo 		type = btrfs_chunk_type(sb, chunk);
6972e06cd3ddSLiu Bo 		if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
6973ab8d0fc4SJeff Mahoney 			btrfs_err(fs_info,
6974e06cd3ddSLiu Bo 			"invalid chunk type %llu in sys_array at offset %u",
6975e06cd3ddSLiu Bo 				  type, cur_offset);
6976e06cd3ddSLiu Bo 			ret = -EIO;
6977e06cd3ddSLiu Bo 			break;
6978e06cd3ddSLiu Bo 		}
6979e06cd3ddSLiu Bo 
6980e3540eabSDavid Sterba 		len = btrfs_chunk_item_size(num_stripes);
6981e3540eabSDavid Sterba 		if (cur_offset + len > array_size)
6982e3540eabSDavid Sterba 			goto out_short_read;
6983e3540eabSDavid Sterba 
69849690ac09SDavid Sterba 		ret = read_one_chunk(&key, sb, chunk);
698584eed90fSChris Mason 		if (ret)
698684eed90fSChris Mason 			break;
698732ab3d1bSJohannes Thumshirn 
69881ffb22cfSDavid Sterba 		array_ptr += len;
69891ffb22cfSDavid Sterba 		sb_array_offset += len;
69901ffb22cfSDavid Sterba 		cur_offset += len;
69910b86a832SChris Mason 	}
6992d865177aSLiu Bo 	clear_extent_buffer_uptodate(sb);
69931c8b5b6eSLiu Bo 	free_extent_buffer_stale(sb);
699484eed90fSChris Mason 	return ret;
6995e3540eabSDavid Sterba 
6996e3540eabSDavid Sterba out_short_read:
6997ab8d0fc4SJeff Mahoney 	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
6998e3540eabSDavid Sterba 			len, cur_offset);
6999d865177aSLiu Bo 	clear_extent_buffer_uptodate(sb);
70001c8b5b6eSLiu Bo 	free_extent_buffer_stale(sb);
7001e3540eabSDavid Sterba 	return -EIO;
70020b86a832SChris Mason }
70030b86a832SChris Mason 
700421634a19SQu Wenruo /*
700521634a19SQu Wenruo  * Check if all chunks in the fs are OK for read-write degraded mount
700621634a19SQu Wenruo  *
70076528b99dSAnand Jain  * If the @failing_dev is specified, it's accounted as missing.
70086528b99dSAnand Jain  *
700921634a19SQu Wenruo  * Return true if all chunks meet the minimal RW mount requirements.
701021634a19SQu Wenruo  * Return false if any chunk doesn't meet the minimal RW mount requirements.
701121634a19SQu Wenruo  */
70126528b99dSAnand Jain bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
70136528b99dSAnand Jain 					struct btrfs_device *failing_dev)
701421634a19SQu Wenruo {
7015c8bf1b67SDavid Sterba 	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
701621634a19SQu Wenruo 	struct extent_map *em;
701721634a19SQu Wenruo 	u64 next_start = 0;
701821634a19SQu Wenruo 	bool ret = true;
701921634a19SQu Wenruo 
7020c8bf1b67SDavid Sterba 	read_lock(&map_tree->lock);
7021c8bf1b67SDavid Sterba 	em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7022c8bf1b67SDavid Sterba 	read_unlock(&map_tree->lock);
702321634a19SQu Wenruo 	/* No chunk at all? Return false anyway */
702421634a19SQu Wenruo 	if (!em) {
702521634a19SQu Wenruo 		ret = false;
702621634a19SQu Wenruo 		goto out;
702721634a19SQu Wenruo 	}
702821634a19SQu Wenruo 	while (em) {
702921634a19SQu Wenruo 		struct map_lookup *map;
703021634a19SQu Wenruo 		int missing = 0;
703121634a19SQu Wenruo 		int max_tolerated;
703221634a19SQu Wenruo 		int i;
703321634a19SQu Wenruo 
703421634a19SQu Wenruo 		map = em->map_lookup;
703521634a19SQu Wenruo 		max_tolerated =
703621634a19SQu Wenruo 			btrfs_get_num_tolerated_disk_barrier_failures(
703721634a19SQu Wenruo 					map->type);
703821634a19SQu Wenruo 		for (i = 0; i < map->num_stripes; i++) {
703921634a19SQu Wenruo 			struct btrfs_device *dev = map->stripes[i].dev;
704021634a19SQu Wenruo 
7041e6e674bdSAnand Jain 			if (!dev || !dev->bdev ||
7042e6e674bdSAnand Jain 			    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
704321634a19SQu Wenruo 			    dev->last_flush_error)
704421634a19SQu Wenruo 				missing++;
70456528b99dSAnand Jain 			else if (failing_dev && failing_dev == dev)
70466528b99dSAnand Jain 				missing++;
704721634a19SQu Wenruo 		}
704821634a19SQu Wenruo 		if (missing > max_tolerated) {
70496528b99dSAnand Jain 			if (!failing_dev)
705021634a19SQu Wenruo 				btrfs_warn(fs_info,
705152042d8eSAndrea Gelmini 	"chunk %llu missing %d devices, max tolerance is %d for writable mount",
705221634a19SQu Wenruo 				   em->start, missing, max_tolerated);
705321634a19SQu Wenruo 			free_extent_map(em);
705421634a19SQu Wenruo 			ret = false;
705521634a19SQu Wenruo 			goto out;
705621634a19SQu Wenruo 		}
705721634a19SQu Wenruo 		next_start = extent_map_end(em);
705821634a19SQu Wenruo 		free_extent_map(em);
705921634a19SQu Wenruo 
7060c8bf1b67SDavid Sterba 		read_lock(&map_tree->lock);
7061c8bf1b67SDavid Sterba 		em = lookup_extent_mapping(map_tree, next_start,
706221634a19SQu Wenruo 					   (u64)(-1) - next_start);
7063c8bf1b67SDavid Sterba 		read_unlock(&map_tree->lock);
706421634a19SQu Wenruo 	}
706521634a19SQu Wenruo out:
706621634a19SQu Wenruo 	return ret;
706721634a19SQu Wenruo }
706821634a19SQu Wenruo 
/* Request readahead for every child slot of @node. */
static void readahead_tree_node_children(struct extent_buffer *node)
{
	const int nr_items = btrfs_header_nritems(node);
	int slot = 0;

	while (slot < nr_items) {
		btrfs_readahead_node_child(node, slot);
		slot++;
	}
}
7077d85327b1SDavid Sterba 
70785b4aacefSJeff Mahoney int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
70790b86a832SChris Mason {
70805b4aacefSJeff Mahoney 	struct btrfs_root *root = fs_info->chunk_root;
70810b86a832SChris Mason 	struct btrfs_path *path;
70820b86a832SChris Mason 	struct extent_buffer *leaf;
70830b86a832SChris Mason 	struct btrfs_key key;
70840b86a832SChris Mason 	struct btrfs_key found_key;
70850b86a832SChris Mason 	int ret;
70860b86a832SChris Mason 	int slot;
708799e3ecfcSLiu Bo 	u64 total_dev = 0;
7088d85327b1SDavid Sterba 	u64 last_ra_node = 0;
70890b86a832SChris Mason 
70900b86a832SChris Mason 	path = btrfs_alloc_path();
70910b86a832SChris Mason 	if (!path)
70920b86a832SChris Mason 		return -ENOMEM;
70930b86a832SChris Mason 
70943dd0f7a3SAnand Jain 	/*
70953dd0f7a3SAnand Jain 	 * uuid_mutex is needed only if we are mounting a sprout FS
70963dd0f7a3SAnand Jain 	 * otherwise we don't need it.
70973dd0f7a3SAnand Jain 	 */
7098b367e47fSLi Zefan 	mutex_lock(&uuid_mutex);
7099b367e47fSLi Zefan 
7100395927a9SFilipe David Borba Manana 	/*
710148cfa61bSBoris Burkov 	 * It is possible for mount and umount to race in such a way that
710248cfa61bSBoris Burkov 	 * we execute this code path, but open_fs_devices failed to clear
710348cfa61bSBoris Burkov 	 * total_rw_bytes. We certainly want it cleared before reading the
710448cfa61bSBoris Burkov 	 * device items, so clear it here.
710548cfa61bSBoris Burkov 	 */
710648cfa61bSBoris Burkov 	fs_info->fs_devices->total_rw_bytes = 0;
710748cfa61bSBoris Burkov 
710848cfa61bSBoris Burkov 	/*
7109395927a9SFilipe David Borba Manana 	 * Read all device items, and then all the chunk items. All
7110395927a9SFilipe David Borba Manana 	 * device items are found before any chunk item (their object id
7111395927a9SFilipe David Borba Manana 	 * is smaller than the lowest possible object id for a chunk
7112395927a9SFilipe David Borba Manana 	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
71130b86a832SChris Mason 	 */
71140b86a832SChris Mason 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
71150b86a832SChris Mason 	key.offset = 0;
71160b86a832SChris Mason 	key.type = 0;
71170b86a832SChris Mason 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7118ab59381eSZhao Lei 	if (ret < 0)
7119ab59381eSZhao Lei 		goto error;
71200b86a832SChris Mason 	while (1) {
7121d85327b1SDavid Sterba 		struct extent_buffer *node;
7122d85327b1SDavid Sterba 
71230b86a832SChris Mason 		leaf = path->nodes[0];
71240b86a832SChris Mason 		slot = path->slots[0];
71250b86a832SChris Mason 		if (slot >= btrfs_header_nritems(leaf)) {
71260b86a832SChris Mason 			ret = btrfs_next_leaf(root, path);
71270b86a832SChris Mason 			if (ret == 0)
71280b86a832SChris Mason 				continue;
71290b86a832SChris Mason 			if (ret < 0)
71300b86a832SChris Mason 				goto error;
71310b86a832SChris Mason 			break;
71320b86a832SChris Mason 		}
7133d85327b1SDavid Sterba 		/*
7134d85327b1SDavid Sterba 		 * The nodes on level 1 are not locked but we don't need to do
7135d85327b1SDavid Sterba 		 * that during mount time as nothing else can access the tree
7136d85327b1SDavid Sterba 		 */
7137d85327b1SDavid Sterba 		node = path->nodes[1];
7138d85327b1SDavid Sterba 		if (node) {
7139d85327b1SDavid Sterba 			if (last_ra_node != node->start) {
7140d85327b1SDavid Sterba 				readahead_tree_node_children(node);
7141d85327b1SDavid Sterba 				last_ra_node = node->start;
7142d85327b1SDavid Sterba 			}
7143d85327b1SDavid Sterba 		}
71440b86a832SChris Mason 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
71450b86a832SChris Mason 		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
71460b86a832SChris Mason 			struct btrfs_dev_item *dev_item;
71470b86a832SChris Mason 			dev_item = btrfs_item_ptr(leaf, slot,
71480b86a832SChris Mason 						  struct btrfs_dev_item);
714917850759SDavid Sterba 			ret = read_one_dev(leaf, dev_item);
71502b82032cSYan Zheng 			if (ret)
71512b82032cSYan Zheng 				goto error;
715299e3ecfcSLiu Bo 			total_dev++;
71530b86a832SChris Mason 		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
71540b86a832SChris Mason 			struct btrfs_chunk *chunk;
71550b86a832SChris Mason 			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
715601d01cafSJosef Bacik 			mutex_lock(&fs_info->chunk_mutex);
71579690ac09SDavid Sterba 			ret = read_one_chunk(&found_key, leaf, chunk);
715801d01cafSJosef Bacik 			mutex_unlock(&fs_info->chunk_mutex);
71592b82032cSYan Zheng 			if (ret)
71602b82032cSYan Zheng 				goto error;
71610b86a832SChris Mason 		}
71620b86a832SChris Mason 		path->slots[0]++;
71630b86a832SChris Mason 	}
716499e3ecfcSLiu Bo 
716599e3ecfcSLiu Bo 	/*
716699e3ecfcSLiu Bo 	 * After loading chunk tree, we've got all device information,
716799e3ecfcSLiu Bo 	 * do another round of validation checks.
716899e3ecfcSLiu Bo 	 */
71690b246afaSJeff Mahoney 	if (total_dev != fs_info->fs_devices->total_devices) {
71700b246afaSJeff Mahoney 		btrfs_err(fs_info,
717199e3ecfcSLiu Bo 	   "super_num_devices %llu mismatch with num_devices %llu found here",
71720b246afaSJeff Mahoney 			  btrfs_super_num_devices(fs_info->super_copy),
717399e3ecfcSLiu Bo 			  total_dev);
717499e3ecfcSLiu Bo 		ret = -EINVAL;
717599e3ecfcSLiu Bo 		goto error;
717699e3ecfcSLiu Bo 	}
71770b246afaSJeff Mahoney 	if (btrfs_super_total_bytes(fs_info->super_copy) <
71780b246afaSJeff Mahoney 	    fs_info->fs_devices->total_rw_bytes) {
71790b246afaSJeff Mahoney 		btrfs_err(fs_info,
718099e3ecfcSLiu Bo 	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
71810b246afaSJeff Mahoney 			  btrfs_super_total_bytes(fs_info->super_copy),
71820b246afaSJeff Mahoney 			  fs_info->fs_devices->total_rw_bytes);
718399e3ecfcSLiu Bo 		ret = -EINVAL;
718499e3ecfcSLiu Bo 		goto error;
718599e3ecfcSLiu Bo 	}
71860b86a832SChris Mason 	ret = 0;
71870b86a832SChris Mason error:
7188b367e47fSLi Zefan 	mutex_unlock(&uuid_mutex);
7189b367e47fSLi Zefan 
71902b82032cSYan Zheng 	btrfs_free_path(path);
71910b86a832SChris Mason 	return ret;
71920b86a832SChris Mason }
7193442a4f63SStefan Behrens 
7194cb517eabSMiao Xie void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7195cb517eabSMiao Xie {
7196944d3f9fSNikolay Borisov 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7197cb517eabSMiao Xie 	struct btrfs_device *device;
7198cb517eabSMiao Xie 
7199944d3f9fSNikolay Borisov 	fs_devices->fs_info = fs_info;
7200944d3f9fSNikolay Borisov 
7201cb517eabSMiao Xie 	mutex_lock(&fs_devices->device_list_mutex);
7202cb517eabSMiao Xie 	list_for_each_entry(device, &fs_devices->devices, dev_list)
7203fb456252SJeff Mahoney 		device->fs_info = fs_info;
720429cc83f6SLiu Bo 
7205944d3f9fSNikolay Borisov 	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7206944d3f9fSNikolay Borisov 		list_for_each_entry(device, &seed_devs->devices, dev_list)
7207944d3f9fSNikolay Borisov 			device->fs_info = fs_info;
7208944d3f9fSNikolay Borisov 
7209944d3f9fSNikolay Borisov 		seed_devs->fs_info = fs_info;
721029cc83f6SLiu Bo 	}
7211e17125b5SAnand Jain 	mutex_unlock(&fs_devices->device_list_mutex);
7212cb517eabSMiao Xie }
7213cb517eabSMiao Xie 
72141dc990dfSDavid Sterba static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
72151dc990dfSDavid Sterba 				 const struct btrfs_dev_stats_item *ptr,
72161dc990dfSDavid Sterba 				 int index)
72171dc990dfSDavid Sterba {
72181dc990dfSDavid Sterba 	u64 val;
72191dc990dfSDavid Sterba 
72201dc990dfSDavid Sterba 	read_extent_buffer(eb, &val,
72211dc990dfSDavid Sterba 			   offsetof(struct btrfs_dev_stats_item, values) +
72221dc990dfSDavid Sterba 			    ((unsigned long)ptr) + (index * sizeof(u64)),
72231dc990dfSDavid Sterba 			   sizeof(val));
72241dc990dfSDavid Sterba 	return val;
72251dc990dfSDavid Sterba }
72261dc990dfSDavid Sterba 
72271dc990dfSDavid Sterba static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
72281dc990dfSDavid Sterba 				      struct btrfs_dev_stats_item *ptr,
72291dc990dfSDavid Sterba 				      int index, u64 val)
72301dc990dfSDavid Sterba {
72311dc990dfSDavid Sterba 	write_extent_buffer(eb, &val,
72321dc990dfSDavid Sterba 			    offsetof(struct btrfs_dev_stats_item, values) +
72331dc990dfSDavid Sterba 			     ((unsigned long)ptr) + (index * sizeof(u64)),
72341dc990dfSDavid Sterba 			    sizeof(val));
72351dc990dfSDavid Sterba }
72361dc990dfSDavid Sterba 
723792e26df4SJosef Bacik static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7238124604ebSJosef Bacik 				       struct btrfs_path *path)
7239733f4fbbSStefan Behrens {
7240733f4fbbSStefan Behrens 	struct btrfs_dev_stats_item *ptr;
7241124604ebSJosef Bacik 	struct extent_buffer *eb;
7242124604ebSJosef Bacik 	struct btrfs_key key;
7243124604ebSJosef Bacik 	int item_size;
7244124604ebSJosef Bacik 	int i, ret, slot;
7245733f4fbbSStefan Behrens 
7246242e2956SDavid Sterba 	key.objectid = BTRFS_DEV_STATS_OBJECTID;
7247242e2956SDavid Sterba 	key.type = BTRFS_PERSISTENT_ITEM_KEY;
7248733f4fbbSStefan Behrens 	key.offset = device->devid;
7249124604ebSJosef Bacik 	ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7250733f4fbbSStefan Behrens 	if (ret) {
7251ae4b9b4cSAnand Jain 		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7252ae4b9b4cSAnand Jain 			btrfs_dev_stat_set(device, i, 0);
7253733f4fbbSStefan Behrens 		device->dev_stats_valid = 1;
7254733f4fbbSStefan Behrens 		btrfs_release_path(path);
725592e26df4SJosef Bacik 		return ret < 0 ? ret : 0;
7256733f4fbbSStefan Behrens 	}
7257733f4fbbSStefan Behrens 	slot = path->slots[0];
7258733f4fbbSStefan Behrens 	eb = path->nodes[0];
7259733f4fbbSStefan Behrens 	item_size = btrfs_item_size_nr(eb, slot);
7260733f4fbbSStefan Behrens 
7261124604ebSJosef Bacik 	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7262733f4fbbSStefan Behrens 
7263733f4fbbSStefan Behrens 	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7264733f4fbbSStefan Behrens 		if (item_size >= (1 + i) * sizeof(__le64))
7265733f4fbbSStefan Behrens 			btrfs_dev_stat_set(device, i,
7266733f4fbbSStefan Behrens 					   btrfs_dev_stats_value(eb, ptr, i));
7267733f4fbbSStefan Behrens 		else
72684e411a7dSAnand Jain 			btrfs_dev_stat_set(device, i, 0);
7269733f4fbbSStefan Behrens 	}
7270733f4fbbSStefan Behrens 
7271733f4fbbSStefan Behrens 	device->dev_stats_valid = 1;
7272733f4fbbSStefan Behrens 	btrfs_dev_stat_print_on_load(device);
7273733f4fbbSStefan Behrens 	btrfs_release_path(path);
727492e26df4SJosef Bacik 
727592e26df4SJosef Bacik 	return 0;
7276733f4fbbSStefan Behrens }
7277124604ebSJosef Bacik 
7278124604ebSJosef Bacik int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7279124604ebSJosef Bacik {
7280124604ebSJosef Bacik 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7281124604ebSJosef Bacik 	struct btrfs_device *device;
7282124604ebSJosef Bacik 	struct btrfs_path *path = NULL;
728392e26df4SJosef Bacik 	int ret = 0;
7284124604ebSJosef Bacik 
7285124604ebSJosef Bacik 	path = btrfs_alloc_path();
7286124604ebSJosef Bacik 	if (!path)
7287124604ebSJosef Bacik 		return -ENOMEM;
7288124604ebSJosef Bacik 
7289124604ebSJosef Bacik 	mutex_lock(&fs_devices->device_list_mutex);
729092e26df4SJosef Bacik 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
729192e26df4SJosef Bacik 		ret = btrfs_device_init_dev_stats(device, path);
729292e26df4SJosef Bacik 		if (ret)
729392e26df4SJosef Bacik 			goto out;
7294124604ebSJosef Bacik 	}
729592e26df4SJosef Bacik 	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
729692e26df4SJosef Bacik 		list_for_each_entry(device, &seed_devs->devices, dev_list) {
729792e26df4SJosef Bacik 			ret = btrfs_device_init_dev_stats(device, path);
729892e26df4SJosef Bacik 			if (ret)
729992e26df4SJosef Bacik 				goto out;
730092e26df4SJosef Bacik 		}
730192e26df4SJosef Bacik 	}
730292e26df4SJosef Bacik out:
7303733f4fbbSStefan Behrens 	mutex_unlock(&fs_devices->device_list_mutex);
7304733f4fbbSStefan Behrens 
7305733f4fbbSStefan Behrens 	btrfs_free_path(path);
730692e26df4SJosef Bacik 	return ret;
7307733f4fbbSStefan Behrens }
7308733f4fbbSStefan Behrens 
7309733f4fbbSStefan Behrens static int update_dev_stat_item(struct btrfs_trans_handle *trans,
7310733f4fbbSStefan Behrens 				struct btrfs_device *device)
7311733f4fbbSStefan Behrens {
73125495f195SNikolay Borisov 	struct btrfs_fs_info *fs_info = trans->fs_info;
73136bccf3abSJeff Mahoney 	struct btrfs_root *dev_root = fs_info->dev_root;
7314733f4fbbSStefan Behrens 	struct btrfs_path *path;
7315733f4fbbSStefan Behrens 	struct btrfs_key key;
7316733f4fbbSStefan Behrens 	struct extent_buffer *eb;
7317733f4fbbSStefan Behrens 	struct btrfs_dev_stats_item *ptr;
7318733f4fbbSStefan Behrens 	int ret;
7319733f4fbbSStefan Behrens 	int i;
7320733f4fbbSStefan Behrens 
7321242e2956SDavid Sterba 	key.objectid = BTRFS_DEV_STATS_OBJECTID;
7322242e2956SDavid Sterba 	key.type = BTRFS_PERSISTENT_ITEM_KEY;
7323733f4fbbSStefan Behrens 	key.offset = device->devid;
7324733f4fbbSStefan Behrens 
7325733f4fbbSStefan Behrens 	path = btrfs_alloc_path();
7326fa252992SDavid Sterba 	if (!path)
7327fa252992SDavid Sterba 		return -ENOMEM;
7328733f4fbbSStefan Behrens 	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
7329733f4fbbSStefan Behrens 	if (ret < 0) {
73300b246afaSJeff Mahoney 		btrfs_warn_in_rcu(fs_info,
7331ecaeb14bSDavid Sterba 			"error %d while searching for dev_stats item for device %s",
7332606686eeSJosef Bacik 			      ret, rcu_str_deref(device->name));
7333733f4fbbSStefan Behrens 		goto out;
7334733f4fbbSStefan Behrens 	}
7335733f4fbbSStefan Behrens 
7336733f4fbbSStefan Behrens 	if (ret == 0 &&
7337733f4fbbSStefan Behrens 	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
7338733f4fbbSStefan Behrens 		/* need to delete old one and insert a new one */
7339733f4fbbSStefan Behrens 		ret = btrfs_del_item(trans, dev_root, path);
7340733f4fbbSStefan Behrens 		if (ret != 0) {
73410b246afaSJeff Mahoney 			btrfs_warn_in_rcu(fs_info,
7342ecaeb14bSDavid Sterba 				"delete too small dev_stats item for device %s failed %d",
7343606686eeSJosef Bacik 				      rcu_str_deref(device->name), ret);
7344733f4fbbSStefan Behrens 			goto out;
7345733f4fbbSStefan Behrens 		}
7346733f4fbbSStefan Behrens 		ret = 1;
7347733f4fbbSStefan Behrens 	}
7348733f4fbbSStefan Behrens 
7349733f4fbbSStefan Behrens 	if (ret == 1) {
7350733f4fbbSStefan Behrens 		/* need to insert a new item */
7351733f4fbbSStefan Behrens 		btrfs_release_path(path);
7352733f4fbbSStefan Behrens 		ret = btrfs_insert_empty_item(trans, dev_root, path,
7353733f4fbbSStefan Behrens 					      &key, sizeof(*ptr));
7354733f4fbbSStefan Behrens 		if (ret < 0) {
73550b246afaSJeff Mahoney 			btrfs_warn_in_rcu(fs_info,
7356ecaeb14bSDavid Sterba 				"insert dev_stats item for device %s failed %d",
7357606686eeSJosef Bacik 				rcu_str_deref(device->name), ret);
7358733f4fbbSStefan Behrens 			goto out;
7359733f4fbbSStefan Behrens 		}
7360733f4fbbSStefan Behrens 	}
7361733f4fbbSStefan Behrens 
7362733f4fbbSStefan Behrens 	eb = path->nodes[0];
7363733f4fbbSStefan Behrens 	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
7364733f4fbbSStefan Behrens 	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7365733f4fbbSStefan Behrens 		btrfs_set_dev_stats_value(eb, ptr, i,
7366733f4fbbSStefan Behrens 					  btrfs_dev_stat_read(device, i));
7367733f4fbbSStefan Behrens 	btrfs_mark_buffer_dirty(eb);
7368733f4fbbSStefan Behrens 
7369733f4fbbSStefan Behrens out:
7370733f4fbbSStefan Behrens 	btrfs_free_path(path);
7371733f4fbbSStefan Behrens 	return ret;
7372733f4fbbSStefan Behrens }
7373733f4fbbSStefan Behrens 
7374733f4fbbSStefan Behrens /*
7375733f4fbbSStefan Behrens  * called from commit_transaction. Writes all changed device stats to disk.
7376733f4fbbSStefan Behrens  */
7377196c9d8dSDavid Sterba int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
7378733f4fbbSStefan Behrens {
7379196c9d8dSDavid Sterba 	struct btrfs_fs_info *fs_info = trans->fs_info;
7380733f4fbbSStefan Behrens 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7381733f4fbbSStefan Behrens 	struct btrfs_device *device;
7382addc3fa7SMiao Xie 	int stats_cnt;
7383733f4fbbSStefan Behrens 	int ret = 0;
7384733f4fbbSStefan Behrens 
7385733f4fbbSStefan Behrens 	mutex_lock(&fs_devices->device_list_mutex);
7386733f4fbbSStefan Behrens 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
73879deae968SNikolay Borisov 		stats_cnt = atomic_read(&device->dev_stats_ccnt);
73889deae968SNikolay Borisov 		if (!device->dev_stats_valid || stats_cnt == 0)
7389733f4fbbSStefan Behrens 			continue;
7390733f4fbbSStefan Behrens 
73919deae968SNikolay Borisov 
73929deae968SNikolay Borisov 		/*
73939deae968SNikolay Borisov 		 * There is a LOAD-LOAD control dependency between the value of
73949deae968SNikolay Borisov 		 * dev_stats_ccnt and updating the on-disk values which requires
73959deae968SNikolay Borisov 		 * reading the in-memory counters. Such control dependencies
73969deae968SNikolay Borisov 		 * require explicit read memory barriers.
73979deae968SNikolay Borisov 		 *
73989deae968SNikolay Borisov 		 * This memory barriers pairs with smp_mb__before_atomic in
73999deae968SNikolay Borisov 		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
74009deae968SNikolay Borisov 		 * barrier implied by atomic_xchg in
74019deae968SNikolay Borisov 		 * btrfs_dev_stats_read_and_reset
74029deae968SNikolay Borisov 		 */
74039deae968SNikolay Borisov 		smp_rmb();
74049deae968SNikolay Borisov 
74055495f195SNikolay Borisov 		ret = update_dev_stat_item(trans, device);
7406733f4fbbSStefan Behrens 		if (!ret)
7407addc3fa7SMiao Xie 			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
7408733f4fbbSStefan Behrens 	}
7409733f4fbbSStefan Behrens 	mutex_unlock(&fs_devices->device_list_mutex);
7410733f4fbbSStefan Behrens 
7411733f4fbbSStefan Behrens 	return ret;
7412733f4fbbSStefan Behrens }
7413733f4fbbSStefan Behrens 
/*
 * Call btrfs_dev_stat_inc() for the counter @index of @dev, then report the
 * device's counters through btrfs_dev_stat_print_on_error().
 */
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}
7419442a4f63SStefan Behrens 
742048a3b636SEric Sandeen static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
7421442a4f63SStefan Behrens {
7422733f4fbbSStefan Behrens 	if (!dev->dev_stats_valid)
7423733f4fbbSStefan Behrens 		return;
7424fb456252SJeff Mahoney 	btrfs_err_rl_in_rcu(dev->fs_info,
7425b14af3b4SDavid Sterba 		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7426606686eeSJosef Bacik 			   rcu_str_deref(dev->name),
7427442a4f63SStefan Behrens 			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7428442a4f63SStefan Behrens 			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7429442a4f63SStefan Behrens 			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7430efe120a0SFrank Holton 			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7431efe120a0SFrank Holton 			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7432442a4f63SStefan Behrens }
7433c11d2c23SStefan Behrens 
7434733f4fbbSStefan Behrens static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7435733f4fbbSStefan Behrens {
7436a98cdb85SStefan Behrens 	int i;
7437a98cdb85SStefan Behrens 
7438a98cdb85SStefan Behrens 	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7439a98cdb85SStefan Behrens 		if (btrfs_dev_stat_read(dev, i) != 0)
7440a98cdb85SStefan Behrens 			break;
7441a98cdb85SStefan Behrens 	if (i == BTRFS_DEV_STAT_VALUES_MAX)
7442a98cdb85SStefan Behrens 		return; /* all values == 0, suppress message */
7443a98cdb85SStefan Behrens 
7444fb456252SJeff Mahoney 	btrfs_info_in_rcu(dev->fs_info,
7445ecaeb14bSDavid Sterba 		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7446606686eeSJosef Bacik 	       rcu_str_deref(dev->name),
7447733f4fbbSStefan Behrens 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7448733f4fbbSStefan Behrens 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7449733f4fbbSStefan Behrens 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7450733f4fbbSStefan Behrens 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7451733f4fbbSStefan Behrens 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7452733f4fbbSStefan Behrens }
7453733f4fbbSStefan Behrens 
74542ff7e61eSJeff Mahoney int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7455b27f7c0cSDavid Sterba 			struct btrfs_ioctl_get_dev_stats *stats)
7456c11d2c23SStefan Behrens {
7457c11d2c23SStefan Behrens 	struct btrfs_device *dev;
74580b246afaSJeff Mahoney 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7459c11d2c23SStefan Behrens 	int i;
7460c11d2c23SStefan Behrens 
7461c11d2c23SStefan Behrens 	mutex_lock(&fs_devices->device_list_mutex);
746209ba3bc9SAnand Jain 	dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
746309ba3bc9SAnand Jain 				true);
7464c11d2c23SStefan Behrens 	mutex_unlock(&fs_devices->device_list_mutex);
7465c11d2c23SStefan Behrens 
7466c11d2c23SStefan Behrens 	if (!dev) {
74670b246afaSJeff Mahoney 		btrfs_warn(fs_info, "get dev_stats failed, device not found");
7468c11d2c23SStefan Behrens 		return -ENODEV;
7469733f4fbbSStefan Behrens 	} else if (!dev->dev_stats_valid) {
74700b246afaSJeff Mahoney 		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7471733f4fbbSStefan Behrens 		return -ENODEV;
7472b27f7c0cSDavid Sterba 	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7473c11d2c23SStefan Behrens 		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7474c11d2c23SStefan Behrens 			if (stats->nr_items > i)
7475c11d2c23SStefan Behrens 				stats->values[i] =
7476c11d2c23SStefan Behrens 					btrfs_dev_stat_read_and_reset(dev, i);
7477c11d2c23SStefan Behrens 			else
74784e411a7dSAnand Jain 				btrfs_dev_stat_set(dev, i, 0);
7479c11d2c23SStefan Behrens 		}
7480a69976bcSAnand Jain 		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
7481a69976bcSAnand Jain 			   current->comm, task_pid_nr(current));
7482c11d2c23SStefan Behrens 	} else {
7483c11d2c23SStefan Behrens 		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7484c11d2c23SStefan Behrens 			if (stats->nr_items > i)
7485c11d2c23SStefan Behrens 				stats->values[i] = btrfs_dev_stat_read(dev, i);
7486c11d2c23SStefan Behrens 	}
7487c11d2c23SStefan Behrens 	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7488c11d2c23SStefan Behrens 		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7489c11d2c23SStefan Behrens 	return 0;
7490c11d2c23SStefan Behrens }
7491a8a6dab7SStefan Behrens 
7492935e5cc9SMiao Xie /*
7493bbbf7243SNikolay Borisov  * Update the size and bytes used for each device where it changed.  This is
7494bbbf7243SNikolay Borisov  * delayed since we would otherwise get errors while writing out the
7495935e5cc9SMiao Xie  * superblocks.
7496bbbf7243SNikolay Borisov  *
7497bbbf7243SNikolay Borisov  * Must be invoked during transaction commit.
7498935e5cc9SMiao Xie  */
7499bbbf7243SNikolay Borisov void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
7500935e5cc9SMiao Xie {
7501935e5cc9SMiao Xie 	struct btrfs_device *curr, *next;
7502935e5cc9SMiao Xie 
7503bbbf7243SNikolay Borisov 	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7504bbbf7243SNikolay Borisov 
7505bbbf7243SNikolay Borisov 	if (list_empty(&trans->dev_update_list))
7506935e5cc9SMiao Xie 		return;
7507935e5cc9SMiao Xie 
7508bbbf7243SNikolay Borisov 	/*
7509bbbf7243SNikolay Borisov 	 * We don't need the device_list_mutex here.  This list is owned by the
7510bbbf7243SNikolay Borisov 	 * transaction and the transaction must complete before the device is
7511bbbf7243SNikolay Borisov 	 * released.
7512bbbf7243SNikolay Borisov 	 */
7513bbbf7243SNikolay Borisov 	mutex_lock(&trans->fs_info->chunk_mutex);
7514bbbf7243SNikolay Borisov 	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7515bbbf7243SNikolay Borisov 				 post_commit_list) {
7516bbbf7243SNikolay Borisov 		list_del_init(&curr->post_commit_list);
7517935e5cc9SMiao Xie 		curr->commit_total_bytes = curr->disk_total_bytes;
7518bbbf7243SNikolay Borisov 		curr->commit_bytes_used = curr->bytes_used;
7519935e5cc9SMiao Xie 	}
7520bbbf7243SNikolay Borisov 	mutex_unlock(&trans->fs_info->chunk_mutex);
7521ce7213c7SMiao Xie }
75225a13f430SAnand Jain 
752346df06b8SDavid Sterba /*
752446df06b8SDavid Sterba  * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
752546df06b8SDavid Sterba  */
752646df06b8SDavid Sterba int btrfs_bg_type_to_factor(u64 flags)
752746df06b8SDavid Sterba {
752844b28adaSDavid Sterba 	const int index = btrfs_bg_flags_to_raid_index(flags);
752944b28adaSDavid Sterba 
753044b28adaSDavid Sterba 	return btrfs_raid_array[index].ncopies;
753146df06b8SDavid Sterba }
7532cf90d884SQu Wenruo 
7533cf90d884SQu Wenruo 
7534cf90d884SQu Wenruo 
7535cf90d884SQu Wenruo static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7536cf90d884SQu Wenruo 				 u64 chunk_offset, u64 devid,
7537cf90d884SQu Wenruo 				 u64 physical_offset, u64 physical_len)
7538cf90d884SQu Wenruo {
7539c8bf1b67SDavid Sterba 	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7540cf90d884SQu Wenruo 	struct extent_map *em;
7541cf90d884SQu Wenruo 	struct map_lookup *map;
754205a37c48SQu Wenruo 	struct btrfs_device *dev;
7543cf90d884SQu Wenruo 	u64 stripe_len;
7544cf90d884SQu Wenruo 	bool found = false;
7545cf90d884SQu Wenruo 	int ret = 0;
7546cf90d884SQu Wenruo 	int i;
7547cf90d884SQu Wenruo 
7548cf90d884SQu Wenruo 	read_lock(&em_tree->lock);
7549cf90d884SQu Wenruo 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
7550cf90d884SQu Wenruo 	read_unlock(&em_tree->lock);
7551cf90d884SQu Wenruo 
7552cf90d884SQu Wenruo 	if (!em) {
7553cf90d884SQu Wenruo 		btrfs_err(fs_info,
7554cf90d884SQu Wenruo "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
7555cf90d884SQu Wenruo 			  physical_offset, devid);
7556cf90d884SQu Wenruo 		ret = -EUCLEAN;
7557cf90d884SQu Wenruo 		goto out;
7558cf90d884SQu Wenruo 	}
7559cf90d884SQu Wenruo 
7560cf90d884SQu Wenruo 	map = em->map_lookup;
7561cf90d884SQu Wenruo 	stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
7562cf90d884SQu Wenruo 	if (physical_len != stripe_len) {
7563cf90d884SQu Wenruo 		btrfs_err(fs_info,
7564cf90d884SQu Wenruo "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
7565cf90d884SQu Wenruo 			  physical_offset, devid, em->start, physical_len,
7566cf90d884SQu Wenruo 			  stripe_len);
7567cf90d884SQu Wenruo 		ret = -EUCLEAN;
7568cf90d884SQu Wenruo 		goto out;
7569cf90d884SQu Wenruo 	}
7570cf90d884SQu Wenruo 
7571cf90d884SQu Wenruo 	for (i = 0; i < map->num_stripes; i++) {
7572cf90d884SQu Wenruo 		if (map->stripes[i].dev->devid == devid &&
7573cf90d884SQu Wenruo 		    map->stripes[i].physical == physical_offset) {
7574cf90d884SQu Wenruo 			found = true;
7575cf90d884SQu Wenruo 			if (map->verified_stripes >= map->num_stripes) {
7576cf90d884SQu Wenruo 				btrfs_err(fs_info,
7577cf90d884SQu Wenruo 				"too many dev extents for chunk %llu found",
7578cf90d884SQu Wenruo 					  em->start);
7579cf90d884SQu Wenruo 				ret = -EUCLEAN;
7580cf90d884SQu Wenruo 				goto out;
7581cf90d884SQu Wenruo 			}
7582cf90d884SQu Wenruo 			map->verified_stripes++;
7583cf90d884SQu Wenruo 			break;
7584cf90d884SQu Wenruo 		}
7585cf90d884SQu Wenruo 	}
7586cf90d884SQu Wenruo 	if (!found) {
7587cf90d884SQu Wenruo 		btrfs_err(fs_info,
7588cf90d884SQu Wenruo 	"dev extent physical offset %llu devid %llu has no corresponding chunk",
7589cf90d884SQu Wenruo 			physical_offset, devid);
7590cf90d884SQu Wenruo 		ret = -EUCLEAN;
7591cf90d884SQu Wenruo 	}
759205a37c48SQu Wenruo 
759305a37c48SQu Wenruo 	/* Make sure no dev extent is beyond device bondary */
759409ba3bc9SAnand Jain 	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
759505a37c48SQu Wenruo 	if (!dev) {
759605a37c48SQu Wenruo 		btrfs_err(fs_info, "failed to find devid %llu", devid);
759705a37c48SQu Wenruo 		ret = -EUCLEAN;
759805a37c48SQu Wenruo 		goto out;
759905a37c48SQu Wenruo 	}
76001b3922a8SQu Wenruo 
76011b3922a8SQu Wenruo 	/* It's possible this device is a dummy for seed device */
76021b3922a8SQu Wenruo 	if (dev->disk_total_bytes == 0) {
7603944d3f9fSNikolay Borisov 		struct btrfs_fs_devices *devs;
7604944d3f9fSNikolay Borisov 
7605944d3f9fSNikolay Borisov 		devs = list_first_entry(&fs_info->fs_devices->seed_list,
7606944d3f9fSNikolay Borisov 					struct btrfs_fs_devices, seed_list);
7607944d3f9fSNikolay Borisov 		dev = btrfs_find_device(devs, devid, NULL, NULL, false);
76081b3922a8SQu Wenruo 		if (!dev) {
76091b3922a8SQu Wenruo 			btrfs_err(fs_info, "failed to find seed devid %llu",
76101b3922a8SQu Wenruo 				  devid);
76111b3922a8SQu Wenruo 			ret = -EUCLEAN;
76121b3922a8SQu Wenruo 			goto out;
76131b3922a8SQu Wenruo 		}
76141b3922a8SQu Wenruo 	}
76151b3922a8SQu Wenruo 
761605a37c48SQu Wenruo 	if (physical_offset + physical_len > dev->disk_total_bytes) {
761705a37c48SQu Wenruo 		btrfs_err(fs_info,
761805a37c48SQu Wenruo "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
761905a37c48SQu Wenruo 			  devid, physical_offset, physical_len,
762005a37c48SQu Wenruo 			  dev->disk_total_bytes);
762105a37c48SQu Wenruo 		ret = -EUCLEAN;
762205a37c48SQu Wenruo 		goto out;
762305a37c48SQu Wenruo 	}
7624cf90d884SQu Wenruo out:
7625cf90d884SQu Wenruo 	free_extent_map(em);
7626cf90d884SQu Wenruo 	return ret;
7627cf90d884SQu Wenruo }
7628cf90d884SQu Wenruo 
7629cf90d884SQu Wenruo static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
7630cf90d884SQu Wenruo {
7631c8bf1b67SDavid Sterba 	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7632cf90d884SQu Wenruo 	struct extent_map *em;
7633cf90d884SQu Wenruo 	struct rb_node *node;
7634cf90d884SQu Wenruo 	int ret = 0;
7635cf90d884SQu Wenruo 
7636cf90d884SQu Wenruo 	read_lock(&em_tree->lock);
763707e1ce09SLiu Bo 	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
7638cf90d884SQu Wenruo 		em = rb_entry(node, struct extent_map, rb_node);
7639cf90d884SQu Wenruo 		if (em->map_lookup->num_stripes !=
7640cf90d884SQu Wenruo 		    em->map_lookup->verified_stripes) {
7641cf90d884SQu Wenruo 			btrfs_err(fs_info,
7642cf90d884SQu Wenruo 			"chunk %llu has missing dev extent, have %d expect %d",
7643cf90d884SQu Wenruo 				  em->start, em->map_lookup->verified_stripes,
7644cf90d884SQu Wenruo 				  em->map_lookup->num_stripes);
7645cf90d884SQu Wenruo 			ret = -EUCLEAN;
7646cf90d884SQu Wenruo 			goto out;
7647cf90d884SQu Wenruo 		}
7648cf90d884SQu Wenruo 	}
7649cf90d884SQu Wenruo out:
7650cf90d884SQu Wenruo 	read_unlock(&em_tree->lock);
7651cf90d884SQu Wenruo 	return ret;
7652cf90d884SQu Wenruo }
7653cf90d884SQu Wenruo 
7654cf90d884SQu Wenruo /*
7655cf90d884SQu Wenruo  * Ensure that all dev extents are mapped to correct chunk, otherwise
7656cf90d884SQu Wenruo  * later chunk allocation/free would cause unexpected behavior.
7657cf90d884SQu Wenruo  *
7658cf90d884SQu Wenruo  * NOTE: This will iterate through the whole device tree, which should be of
7659cf90d884SQu Wenruo  * the same size level as the chunk tree.  This slightly increases mount time.
7660cf90d884SQu Wenruo  */
7661cf90d884SQu Wenruo int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
7662cf90d884SQu Wenruo {
7663cf90d884SQu Wenruo 	struct btrfs_path *path;
7664cf90d884SQu Wenruo 	struct btrfs_root *root = fs_info->dev_root;
7665cf90d884SQu Wenruo 	struct btrfs_key key;
76665eb19381SQu Wenruo 	u64 prev_devid = 0;
76675eb19381SQu Wenruo 	u64 prev_dev_ext_end = 0;
7668cf90d884SQu Wenruo 	int ret = 0;
7669cf90d884SQu Wenruo 
767042437a63SJosef Bacik 	/*
767142437a63SJosef Bacik 	 * We don't have a dev_root because we mounted with ignorebadroots and
767242437a63SJosef Bacik 	 * failed to load the root, so we want to skip the verification in this
767342437a63SJosef Bacik 	 * case for sure.
767442437a63SJosef Bacik 	 *
767542437a63SJosef Bacik 	 * However if the dev root is fine, but the tree itself is corrupted
767642437a63SJosef Bacik 	 * we'd still fail to mount.  This verification is only to make sure
767742437a63SJosef Bacik 	 * writes can happen safely, so instead just bypass this check
767842437a63SJosef Bacik 	 * completely in the case of IGNOREBADROOTS.
767942437a63SJosef Bacik 	 */
768042437a63SJosef Bacik 	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
768142437a63SJosef Bacik 		return 0;
768242437a63SJosef Bacik 
7683cf90d884SQu Wenruo 	key.objectid = 1;
7684cf90d884SQu Wenruo 	key.type = BTRFS_DEV_EXTENT_KEY;
7685cf90d884SQu Wenruo 	key.offset = 0;
7686cf90d884SQu Wenruo 
7687cf90d884SQu Wenruo 	path = btrfs_alloc_path();
7688cf90d884SQu Wenruo 	if (!path)
7689cf90d884SQu Wenruo 		return -ENOMEM;
7690cf90d884SQu Wenruo 
7691cf90d884SQu Wenruo 	path->reada = READA_FORWARD;
7692cf90d884SQu Wenruo 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7693cf90d884SQu Wenruo 	if (ret < 0)
7694cf90d884SQu Wenruo 		goto out;
7695cf90d884SQu Wenruo 
7696cf90d884SQu Wenruo 	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
7697cf90d884SQu Wenruo 		ret = btrfs_next_item(root, path);
7698cf90d884SQu Wenruo 		if (ret < 0)
7699cf90d884SQu Wenruo 			goto out;
7700cf90d884SQu Wenruo 		/* No dev extents at all? Not good */
7701cf90d884SQu Wenruo 		if (ret > 0) {
7702cf90d884SQu Wenruo 			ret = -EUCLEAN;
7703cf90d884SQu Wenruo 			goto out;
7704cf90d884SQu Wenruo 		}
7705cf90d884SQu Wenruo 	}
7706cf90d884SQu Wenruo 	while (1) {
7707cf90d884SQu Wenruo 		struct extent_buffer *leaf = path->nodes[0];
7708cf90d884SQu Wenruo 		struct btrfs_dev_extent *dext;
7709cf90d884SQu Wenruo 		int slot = path->slots[0];
7710cf90d884SQu Wenruo 		u64 chunk_offset;
7711cf90d884SQu Wenruo 		u64 physical_offset;
7712cf90d884SQu Wenruo 		u64 physical_len;
7713cf90d884SQu Wenruo 		u64 devid;
7714cf90d884SQu Wenruo 
7715cf90d884SQu Wenruo 		btrfs_item_key_to_cpu(leaf, &key, slot);
7716cf90d884SQu Wenruo 		if (key.type != BTRFS_DEV_EXTENT_KEY)
7717cf90d884SQu Wenruo 			break;
7718cf90d884SQu Wenruo 		devid = key.objectid;
7719cf90d884SQu Wenruo 		physical_offset = key.offset;
7720cf90d884SQu Wenruo 
7721cf90d884SQu Wenruo 		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
7722cf90d884SQu Wenruo 		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
7723cf90d884SQu Wenruo 		physical_len = btrfs_dev_extent_length(leaf, dext);
7724cf90d884SQu Wenruo 
77255eb19381SQu Wenruo 		/* Check if this dev extent overlaps with the previous one */
77265eb19381SQu Wenruo 		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
77275eb19381SQu Wenruo 			btrfs_err(fs_info,
77285eb19381SQu Wenruo "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
77295eb19381SQu Wenruo 				  devid, physical_offset, prev_dev_ext_end);
77305eb19381SQu Wenruo 			ret = -EUCLEAN;
77315eb19381SQu Wenruo 			goto out;
77325eb19381SQu Wenruo 		}
77335eb19381SQu Wenruo 
7734cf90d884SQu Wenruo 		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
7735cf90d884SQu Wenruo 					    physical_offset, physical_len);
7736cf90d884SQu Wenruo 		if (ret < 0)
7737cf90d884SQu Wenruo 			goto out;
77385eb19381SQu Wenruo 		prev_devid = devid;
77395eb19381SQu Wenruo 		prev_dev_ext_end = physical_offset + physical_len;
77405eb19381SQu Wenruo 
7741cf90d884SQu Wenruo 		ret = btrfs_next_item(root, path);
7742cf90d884SQu Wenruo 		if (ret < 0)
7743cf90d884SQu Wenruo 			goto out;
7744cf90d884SQu Wenruo 		if (ret > 0) {
7745cf90d884SQu Wenruo 			ret = 0;
7746cf90d884SQu Wenruo 			break;
7747cf90d884SQu Wenruo 		}
7748cf90d884SQu Wenruo 	}
7749cf90d884SQu Wenruo 
7750cf90d884SQu Wenruo 	/* Ensure all chunks have corresponding dev extents */
7751cf90d884SQu Wenruo 	ret = verify_chunk_dev_extent_mapping(fs_info);
7752cf90d884SQu Wenruo out:
7753cf90d884SQu Wenruo 	btrfs_free_path(path);
7754cf90d884SQu Wenruo 	return ret;
7755cf90d884SQu Wenruo }
7756eede2bf3SOmar Sandoval 
7757eede2bf3SOmar Sandoval /*
7758eede2bf3SOmar Sandoval  * Check whether the given block group or device is pinned by any inode being
7759eede2bf3SOmar Sandoval  * used as a swapfile.
7760eede2bf3SOmar Sandoval  */
7761eede2bf3SOmar Sandoval bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
7762eede2bf3SOmar Sandoval {
7763eede2bf3SOmar Sandoval 	struct btrfs_swapfile_pin *sp;
7764eede2bf3SOmar Sandoval 	struct rb_node *node;
7765eede2bf3SOmar Sandoval 
7766eede2bf3SOmar Sandoval 	spin_lock(&fs_info->swapfile_pins_lock);
7767eede2bf3SOmar Sandoval 	node = fs_info->swapfile_pins.rb_node;
7768eede2bf3SOmar Sandoval 	while (node) {
7769eede2bf3SOmar Sandoval 		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
7770eede2bf3SOmar Sandoval 		if (ptr < sp->ptr)
7771eede2bf3SOmar Sandoval 			node = node->rb_left;
7772eede2bf3SOmar Sandoval 		else if (ptr > sp->ptr)
7773eede2bf3SOmar Sandoval 			node = node->rb_right;
7774eede2bf3SOmar Sandoval 		else
7775eede2bf3SOmar Sandoval 			break;
7776eede2bf3SOmar Sandoval 	}
7777eede2bf3SOmar Sandoval 	spin_unlock(&fs_info->swapfile_pins_lock);
7778eede2bf3SOmar Sandoval 	return node != NULL;
7779eede2bf3SOmar Sandoval }
7780