1c1d7c514SDavid Sterba // SPDX-License-Identifier: GPL-2.0
20b86a832SChris Mason /*
30b86a832SChris Mason * Copyright (C) 2007 Oracle. All rights reserved.
40b86a832SChris Mason */
5c1d7c514SDavid Sterba
60b86a832SChris Mason #include <linux/sched.h>
7fccc0007SJosef Bacik #include <linux/sched/mm.h>
85a0e3ad6STejun Heo #include <linux/slab.h>
9442a4f63SStefan Behrens #include <linux/ratelimit.h>
1059641015SIlya Dryomov #include <linux/kthread.h>
11803b2f54SStefan Behrens #include <linux/semaphore.h>
128da4b8c4SAndy Shevchenko #include <linux/uuid.h>
13f8e10cd3SAnand Jain #include <linux/list_sort.h>
1454fde91fSJosef Bacik #include <linux/namei.h>
15784352feSDavid Sterba #include "misc.h"
160b86a832SChris Mason #include "ctree.h"
170b86a832SChris Mason #include "extent_map.h"
180b86a832SChris Mason #include "disk-io.h"
190b86a832SChris Mason #include "transaction.h"
200b86a832SChris Mason #include "print-tree.h"
210b86a832SChris Mason #include "volumes.h"
2253b381b3SDavid Woodhouse #include "raid56.h"
23606686eeSJosef Bacik #include "rcu-string.h"
248dabb742SStefan Behrens #include "dev-replace.h"
2599994cdeSAnand Jain #include "sysfs.h"
2682fc28fbSQu Wenruo #include "tree-checker.h"
278719aaaeSJosef Bacik #include "space-info.h"
28aac0023cSJosef Bacik #include "block-group.h"
29b0643e59SDennis Zhou #include "discard.h"
305b316468SNaohiro Aota #include "zoned.h"
31c7f13d42SJosef Bacik #include "fs.h"
3207e81dc9SJosef Bacik #include "accessors.h"
33c7a03b52SJosef Bacik #include "uuid-tree.h"
347572dec8SJosef Bacik #include "ioctl.h"
3567707479SJosef Bacik #include "relocation.h"
362fc6822cSJosef Bacik #include "scrub.h"
377f0add25SJosef Bacik #include "super.h"
380b86a832SChris Mason
39bf08387fSQu Wenruo #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
40bf08387fSQu Wenruo BTRFS_BLOCK_GROUP_RAID10 | \
41bf08387fSQu Wenruo BTRFS_BLOCK_GROUP_RAID56_MASK)
42bf08387fSQu Wenruo
43af902047SZhao Lei const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
44af902047SZhao Lei [BTRFS_RAID_RAID10] = {
45af902047SZhao Lei .sub_stripes = 2,
46af902047SZhao Lei .dev_stripes = 1,
47af902047SZhao Lei .devs_max = 0, /* 0 == as many as possible */
48b2f78e88SDavid Sterba .devs_min = 2,
498789f4feSZhao Lei .tolerated_failures = 1,
50af902047SZhao Lei .devs_increment = 2,
51af902047SZhao Lei .ncopies = 2,
52b50836edSHans van Kranenburg .nparity = 0,
53ed23467bSAnand Jain .raid_name = "raid10",
5441a6e891SAnand Jain .bg_flag = BTRFS_BLOCK_GROUP_RAID10,
55f9fbcaa2SAnand Jain .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
56af902047SZhao Lei },
57af902047SZhao Lei [BTRFS_RAID_RAID1] = {
58af902047SZhao Lei .sub_stripes = 1,
59af902047SZhao Lei .dev_stripes = 1,
60af902047SZhao Lei .devs_max = 2,
61af902047SZhao Lei .devs_min = 2,
628789f4feSZhao Lei .tolerated_failures = 1,
63af902047SZhao Lei .devs_increment = 2,
64af902047SZhao Lei .ncopies = 2,
65b50836edSHans van Kranenburg .nparity = 0,
66ed23467bSAnand Jain .raid_name = "raid1",
6741a6e891SAnand Jain .bg_flag = BTRFS_BLOCK_GROUP_RAID1,
68f9fbcaa2SAnand Jain .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
69af902047SZhao Lei },
7047e6f742SDavid Sterba [BTRFS_RAID_RAID1C3] = {
7147e6f742SDavid Sterba .sub_stripes = 1,
7247e6f742SDavid Sterba .dev_stripes = 1,
73cf93e15eSDavid Sterba .devs_max = 3,
7447e6f742SDavid Sterba .devs_min = 3,
7547e6f742SDavid Sterba .tolerated_failures = 2,
7647e6f742SDavid Sterba .devs_increment = 3,
7747e6f742SDavid Sterba .ncopies = 3,
78db26a024SDavid Sterba .nparity = 0,
7947e6f742SDavid Sterba .raid_name = "raid1c3",
8047e6f742SDavid Sterba .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
8147e6f742SDavid Sterba .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
8247e6f742SDavid Sterba },
838d6fac00SDavid Sterba [BTRFS_RAID_RAID1C4] = {
848d6fac00SDavid Sterba .sub_stripes = 1,
858d6fac00SDavid Sterba .dev_stripes = 1,
86cf93e15eSDavid Sterba .devs_max = 4,
878d6fac00SDavid Sterba .devs_min = 4,
888d6fac00SDavid Sterba .tolerated_failures = 3,
898d6fac00SDavid Sterba .devs_increment = 4,
908d6fac00SDavid Sterba .ncopies = 4,
91db26a024SDavid Sterba .nparity = 0,
928d6fac00SDavid Sterba .raid_name = "raid1c4",
938d6fac00SDavid Sterba .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
948d6fac00SDavid Sterba .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
958d6fac00SDavid Sterba },
96af902047SZhao Lei [BTRFS_RAID_DUP] = {
97af902047SZhao Lei .sub_stripes = 1,
98af902047SZhao Lei .dev_stripes = 2,
99af902047SZhao Lei .devs_max = 1,
100af902047SZhao Lei .devs_min = 1,
1018789f4feSZhao Lei .tolerated_failures = 0,
102af902047SZhao Lei .devs_increment = 1,
103af902047SZhao Lei .ncopies = 2,
104b50836edSHans van Kranenburg .nparity = 0,
105ed23467bSAnand Jain .raid_name = "dup",
10641a6e891SAnand Jain .bg_flag = BTRFS_BLOCK_GROUP_DUP,
107f9fbcaa2SAnand Jain .mindev_error = 0,
108af902047SZhao Lei },
109af902047SZhao Lei [BTRFS_RAID_RAID0] = {
110af902047SZhao Lei .sub_stripes = 1,
111af902047SZhao Lei .dev_stripes = 1,
112af902047SZhao Lei .devs_max = 0,
113b2f78e88SDavid Sterba .devs_min = 1,
1148789f4feSZhao Lei .tolerated_failures = 0,
115af902047SZhao Lei .devs_increment = 1,
116af902047SZhao Lei .ncopies = 1,
117b50836edSHans van Kranenburg .nparity = 0,
118ed23467bSAnand Jain .raid_name = "raid0",
11941a6e891SAnand Jain .bg_flag = BTRFS_BLOCK_GROUP_RAID0,
120f9fbcaa2SAnand Jain .mindev_error = 0,
121af902047SZhao Lei },
122af902047SZhao Lei [BTRFS_RAID_SINGLE] = {
123af902047SZhao Lei .sub_stripes = 1,
124af902047SZhao Lei .dev_stripes = 1,
125af902047SZhao Lei .devs_max = 1,
126af902047SZhao Lei .devs_min = 1,
1278789f4feSZhao Lei .tolerated_failures = 0,
128af902047SZhao Lei .devs_increment = 1,
129af902047SZhao Lei .ncopies = 1,
130b50836edSHans van Kranenburg .nparity = 0,
131ed23467bSAnand Jain .raid_name = "single",
13241a6e891SAnand Jain .bg_flag = 0,
133f9fbcaa2SAnand Jain .mindev_error = 0,
134af902047SZhao Lei },
135af902047SZhao Lei [BTRFS_RAID_RAID5] = {
136af902047SZhao Lei .sub_stripes = 1,
137af902047SZhao Lei .dev_stripes = 1,
138af902047SZhao Lei .devs_max = 0,
139af902047SZhao Lei .devs_min = 2,
1408789f4feSZhao Lei .tolerated_failures = 1,
141af902047SZhao Lei .devs_increment = 1,
142da612e31SHans van Kranenburg .ncopies = 1,
143b50836edSHans van Kranenburg .nparity = 1,
144ed23467bSAnand Jain .raid_name = "raid5",
14541a6e891SAnand Jain .bg_flag = BTRFS_BLOCK_GROUP_RAID5,
146f9fbcaa2SAnand Jain .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
147af902047SZhao Lei },
148af902047SZhao Lei [BTRFS_RAID_RAID6] = {
149af902047SZhao Lei .sub_stripes = 1,
150af902047SZhao Lei .dev_stripes = 1,
151af902047SZhao Lei .devs_max = 0,
152af902047SZhao Lei .devs_min = 3,
1538789f4feSZhao Lei .tolerated_failures = 2,
154af902047SZhao Lei .devs_increment = 1,
155da612e31SHans van Kranenburg .ncopies = 1,
156b50836edSHans van Kranenburg .nparity = 2,
157ed23467bSAnand Jain .raid_name = "raid6",
15841a6e891SAnand Jain .bg_flag = BTRFS_BLOCK_GROUP_RAID6,
159f9fbcaa2SAnand Jain .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
160af902047SZhao Lei },
161af902047SZhao Lei };
162af902047SZhao Lei
163500a44c9SDavid Sterba /*
164500a44c9SDavid Sterba * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
165500a44c9SDavid Sterba * can be used as index to access btrfs_raid_array[].
166500a44c9SDavid Sterba */
btrfs_bg_flags_to_raid_index(u64 flags)167500a44c9SDavid Sterba enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
168500a44c9SDavid Sterba {
169719fae89SQu Wenruo const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
170500a44c9SDavid Sterba
171719fae89SQu Wenruo if (!profile)
172719fae89SQu Wenruo return BTRFS_RAID_SINGLE;
173719fae89SQu Wenruo
174719fae89SQu Wenruo return BTRFS_BG_FLAG_TO_INDEX(profile);
175500a44c9SDavid Sterba }
176500a44c9SDavid Sterba
/* Return the textual profile name for @flags, or NULL for an invalid index. */
const char *btrfs_bg_type_to_raid_name(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	if (index < BTRFS_NR_RAID_TYPES)
		return btrfs_raid_array[index].raid_name;

	return NULL;
}
186ed23467bSAnand Jain
/* Number of parity stripes for the profile encoded in @type (0, 1 or 2). */
int btrfs_nr_parity_stripes(u64 type)
{
	return btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].nparity;
}
1930b30f719SQu Wenruo
194f89e09cfSAnand Jain /*
195f89e09cfSAnand Jain * Fill @buf with textual description of @bg_flags, no more than @size_buf
196f89e09cfSAnand Jain * bytes including terminating null byte.
197f89e09cfSAnand Jain */
btrfs_describe_block_groups(u64 bg_flags,char * buf,u32 size_buf)198f89e09cfSAnand Jain void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
199f89e09cfSAnand Jain {
200f89e09cfSAnand Jain int i;
201f89e09cfSAnand Jain int ret;
202f89e09cfSAnand Jain char *bp = buf;
203f89e09cfSAnand Jain u64 flags = bg_flags;
204f89e09cfSAnand Jain u32 size_bp = size_buf;
205f89e09cfSAnand Jain
206f89e09cfSAnand Jain if (!flags) {
207f89e09cfSAnand Jain strcpy(bp, "NONE");
208f89e09cfSAnand Jain return;
209f89e09cfSAnand Jain }
210f89e09cfSAnand Jain
211f89e09cfSAnand Jain #define DESCRIBE_FLAG(flag, desc) \
212f89e09cfSAnand Jain do { \
213f89e09cfSAnand Jain if (flags & (flag)) { \
214f89e09cfSAnand Jain ret = snprintf(bp, size_bp, "%s|", (desc)); \
215f89e09cfSAnand Jain if (ret < 0 || ret >= size_bp) \
216f89e09cfSAnand Jain goto out_overflow; \
217f89e09cfSAnand Jain size_bp -= ret; \
218f89e09cfSAnand Jain bp += ret; \
219f89e09cfSAnand Jain flags &= ~(flag); \
220f89e09cfSAnand Jain } \
221f89e09cfSAnand Jain } while (0)
222f89e09cfSAnand Jain
223f89e09cfSAnand Jain DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
224f89e09cfSAnand Jain DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
225f89e09cfSAnand Jain DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
226f89e09cfSAnand Jain
227f89e09cfSAnand Jain DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
228f89e09cfSAnand Jain for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
229f89e09cfSAnand Jain DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
230f89e09cfSAnand Jain btrfs_raid_array[i].raid_name);
231f89e09cfSAnand Jain #undef DESCRIBE_FLAG
232f89e09cfSAnand Jain
233f89e09cfSAnand Jain if (flags) {
234f89e09cfSAnand Jain ret = snprintf(bp, size_bp, "0x%llx|", flags);
235f89e09cfSAnand Jain size_bp -= ret;
236f89e09cfSAnand Jain }
237f89e09cfSAnand Jain
238f89e09cfSAnand Jain if (size_bp < size_buf)
239f89e09cfSAnand Jain buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
240f89e09cfSAnand Jain
241f89e09cfSAnand Jain /*
242f89e09cfSAnand Jain * The text is trimmed, it's up to the caller to provide sufficiently
243f89e09cfSAnand Jain * large buffer
244f89e09cfSAnand Jain */
245f89e09cfSAnand Jain out_overflow:;
246f89e09cfSAnand Jain }
247f89e09cfSAnand Jain
2486f8e0fc7SDavid Sterba static int init_first_rw_device(struct btrfs_trans_handle *trans);
2492ff7e61eSJeff Mahoney static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
250733f4fbbSStefan Behrens static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
2512b82032cSYan Zheng
2529c6b1c4dSDavid Sterba /*
2539c6b1c4dSDavid Sterba * Device locking
2549c6b1c4dSDavid Sterba * ==============
2559c6b1c4dSDavid Sterba *
2569c6b1c4dSDavid Sterba * There are several mutexes that protect manipulation of devices and low-level
2579c6b1c4dSDavid Sterba * structures like chunks but not block groups, extents or files
2589c6b1c4dSDavid Sterba *
2599c6b1c4dSDavid Sterba * uuid_mutex (global lock)
2609c6b1c4dSDavid Sterba * ------------------------
2619c6b1c4dSDavid Sterba * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
2629c6b1c4dSDavid Sterba * the SCAN_DEV ioctl registration or from mount either implicitly (the first
2639c6b1c4dSDavid Sterba * device) or requested by the device= mount option
2649c6b1c4dSDavid Sterba *
2659c6b1c4dSDavid Sterba * the mutex can be very coarse and can cover long-running operations
2669c6b1c4dSDavid Sterba *
2679c6b1c4dSDavid Sterba * protects: updates to fs_devices counters like missing devices, rw devices,
26852042d8eSAndrea Gelmini * seeding, structure cloning, opening/closing devices at mount/umount time
2699c6b1c4dSDavid Sterba *
2709c6b1c4dSDavid Sterba * global::fs_devs - add, remove, updates to the global list
2719c6b1c4dSDavid Sterba *
27218c850fdSJosef Bacik * does not protect: manipulation of the fs_devices::devices list in general
27318c850fdSJosef Bacik * but in mount context it could be used to exclude list modifications by eg.
27418c850fdSJosef Bacik * scan ioctl
2759c6b1c4dSDavid Sterba *
2769c6b1c4dSDavid Sterba * btrfs_device::name - renames (write side), read is RCU
2779c6b1c4dSDavid Sterba *
2789c6b1c4dSDavid Sterba * fs_devices::device_list_mutex (per-fs, with RCU)
2799c6b1c4dSDavid Sterba * ------------------------------------------------
2809c6b1c4dSDavid Sterba * protects updates to fs_devices::devices, ie. adding and deleting
2819c6b1c4dSDavid Sterba *
2829c6b1c4dSDavid Sterba * simple list traversal with read-only actions can be done with RCU protection
2839c6b1c4dSDavid Sterba *
2849c6b1c4dSDavid Sterba * may be used to exclude some operations from running concurrently without any
2859c6b1c4dSDavid Sterba * modifications to the list (see write_all_supers)
2869c6b1c4dSDavid Sterba *
28718c850fdSJosef Bacik * Is not required at mount and close times, because our device list is
28818c850fdSJosef Bacik * protected by the uuid_mutex at that point.
28918c850fdSJosef Bacik *
2909c6b1c4dSDavid Sterba * balance_mutex
2919c6b1c4dSDavid Sterba * -------------
2929c6b1c4dSDavid Sterba * protects balance structures (status, state) and context accessed from
2939c6b1c4dSDavid Sterba * several places (internally, ioctl)
2949c6b1c4dSDavid Sterba *
2959c6b1c4dSDavid Sterba * chunk_mutex
2969c6b1c4dSDavid Sterba * -----------
2979c6b1c4dSDavid Sterba * protects chunks, adding or removing during allocation, trim or when a new
2980b6f5d40SNikolay Borisov * device is added/removed. Additionally it also protects post_commit_list of
2990b6f5d40SNikolay Borisov * individual devices, since they can be added to the transaction's
3000b6f5d40SNikolay Borisov * post_commit_list only with chunk_mutex held.
3019c6b1c4dSDavid Sterba *
3029c6b1c4dSDavid Sterba * cleaner_mutex
3039c6b1c4dSDavid Sterba * -------------
3049c6b1c4dSDavid Sterba * a big lock that is held by the cleaner thread and prevents running subvolume
3059c6b1c4dSDavid Sterba * cleaning together with relocation or delayed iputs
3069c6b1c4dSDavid Sterba *
3079c6b1c4dSDavid Sterba *
3089c6b1c4dSDavid Sterba * Lock nesting
3099c6b1c4dSDavid Sterba * ============
3109c6b1c4dSDavid Sterba *
3119c6b1c4dSDavid Sterba * uuid_mutex
3129c6b1c4dSDavid Sterba * device_list_mutex
3139c6b1c4dSDavid Sterba * chunk_mutex
3149c6b1c4dSDavid Sterba * balance_mutex
31589595e80SAnand Jain *
31689595e80SAnand Jain *
317c3e1f96cSGoldwyn Rodrigues * Exclusive operations
318c3e1f96cSGoldwyn Rodrigues * ====================
31989595e80SAnand Jain *
32089595e80SAnand Jain * Maintains the exclusivity of the following operations that apply to the
32189595e80SAnand Jain * whole filesystem and cannot run in parallel.
32289595e80SAnand Jain *
32389595e80SAnand Jain * - Balance (*)
32489595e80SAnand Jain * - Device add
32589595e80SAnand Jain * - Device remove
32689595e80SAnand Jain * - Device replace (*)
32789595e80SAnand Jain * - Resize
32889595e80SAnand Jain *
32989595e80SAnand Jain * The device operations (as above) can be in one of the following states:
33089595e80SAnand Jain *
33189595e80SAnand Jain * - Running state
33289595e80SAnand Jain * - Paused state
33389595e80SAnand Jain * - Completed state
33489595e80SAnand Jain *
33589595e80SAnand Jain * Only device operations marked with (*) can go into the Paused state for the
33689595e80SAnand Jain * following reasons:
33789595e80SAnand Jain *
33889595e80SAnand Jain * - ioctl (only Balance can be Paused through ioctl)
33989595e80SAnand Jain * - filesystem remounted as read-only
34089595e80SAnand Jain * - filesystem unmounted and mounted as read-only
34189595e80SAnand Jain * - system power-cycle and filesystem mounted as read-only
34289595e80SAnand Jain * - filesystem or device errors leading to forced read-only
34389595e80SAnand Jain *
344c3e1f96cSGoldwyn Rodrigues * The status of exclusive operation is set and cleared atomically.
345c3e1f96cSGoldwyn Rodrigues * During the course of Paused state, fs_info::exclusive_operation remains set.
34689595e80SAnand Jain * A device operation in Paused or Running state can be canceled or resumed
34789595e80SAnand Jain * either by ioctl (Balance only) or when remounted as read-write.
348c3e1f96cSGoldwyn Rodrigues * The exclusive status is cleared when the device operation is canceled or
34989595e80SAnand Jain * completed.
3509c6b1c4dSDavid Sterba */
3519c6b1c4dSDavid Sterba
35267a2c45eSMiao Xie DEFINE_MUTEX(uuid_mutex);
3538a4b83ccSChris Mason static LIST_HEAD(fs_uuids);
btrfs_get_fs_uuids(void)3544143cb8bSDavid Sterba struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
355c73eccf7SAnand Jain {
356c73eccf7SAnand Jain return &fs_uuids;
357c73eccf7SAnand Jain }
3588a4b83ccSChris Mason
3592dfeca9bSDavid Sterba /*
3602dfeca9bSDavid Sterba * alloc_fs_devices - allocate struct btrfs_fs_devices
3617239ff4bSNikolay Borisov * @fsid: if not NULL, copy the UUID to fs_devices::fsid
3627239ff4bSNikolay Borisov * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
3632dfeca9bSDavid Sterba *
3642dfeca9bSDavid Sterba * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
3652dfeca9bSDavid Sterba * The returned struct is not linked onto any lists and can be destroyed with
3662dfeca9bSDavid Sterba * kfree() right away.
3672dfeca9bSDavid Sterba */
alloc_fs_devices(const u8 * fsid,const u8 * metadata_fsid)3687239ff4bSNikolay Borisov static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
3697239ff4bSNikolay Borisov const u8 *metadata_fsid)
3702208a378SIlya Dryomov {
3712208a378SIlya Dryomov struct btrfs_fs_devices *fs_devs;
3722208a378SIlya Dryomov
37319c4c49cSAnand Jain ASSERT(fsid || !metadata_fsid);
37419c4c49cSAnand Jain
37578f2c9e6SDavid Sterba fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
3762208a378SIlya Dryomov if (!fs_devs)
3772208a378SIlya Dryomov return ERR_PTR(-ENOMEM);
3782208a378SIlya Dryomov
3792208a378SIlya Dryomov mutex_init(&fs_devs->device_list_mutex);
3802208a378SIlya Dryomov
3812208a378SIlya Dryomov INIT_LIST_HEAD(&fs_devs->devices);
3822208a378SIlya Dryomov INIT_LIST_HEAD(&fs_devs->alloc_list);
383c4babc5eSAnand Jain INIT_LIST_HEAD(&fs_devs->fs_list);
384944d3f9fSNikolay Borisov INIT_LIST_HEAD(&fs_devs->seed_list);
3852208a378SIlya Dryomov
38619c4c49cSAnand Jain if (fsid) {
38719c4c49cSAnand Jain memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
38819c4c49cSAnand Jain memcpy(fs_devs->metadata_uuid,
38919c4c49cSAnand Jain metadata_fsid ?: fsid, BTRFS_FSID_SIZE);
39019c4c49cSAnand Jain }
3917239ff4bSNikolay Borisov
3922208a378SIlya Dryomov return fs_devs;
3932208a378SIlya Dryomov }
3942208a378SIlya Dryomov
btrfs_free_device(struct btrfs_device * device)395f2db4d5cSFilipe Manana static void btrfs_free_device(struct btrfs_device *device)
39648dae9cfSDavid Sterba {
397bbbf7243SNikolay Borisov WARN_ON(!list_empty(&device->post_commit_list));
39848dae9cfSDavid Sterba rcu_string_free(device->name);
399611ccc58SFilipe Manana extent_io_tree_release(&device->alloc_state);
4005b316468SNaohiro Aota btrfs_destroy_dev_zone_info(device);
40148dae9cfSDavid Sterba kfree(device);
40248dae9cfSDavid Sterba }
40348dae9cfSDavid Sterba
free_fs_devices(struct btrfs_fs_devices * fs_devices)404e4404d6eSYan Zheng static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
405e4404d6eSYan Zheng {
406e4404d6eSYan Zheng struct btrfs_device *device;
4075f58d783SAnand Jain
408e4404d6eSYan Zheng WARN_ON(fs_devices->opened);
409e4404d6eSYan Zheng while (!list_empty(&fs_devices->devices)) {
410e4404d6eSYan Zheng device = list_entry(fs_devices->devices.next,
411e4404d6eSYan Zheng struct btrfs_device, dev_list);
412e4404d6eSYan Zheng list_del(&device->dev_list);
413a425f9d4SDavid Sterba btrfs_free_device(device);
414e4404d6eSYan Zheng }
415e4404d6eSYan Zheng kfree(fs_devices);
416e4404d6eSYan Zheng }
417e4404d6eSYan Zheng
btrfs_cleanup_fs_uuids(void)418ffc5a379SDavid Sterba void __exit btrfs_cleanup_fs_uuids(void)
4198a4b83ccSChris Mason {
4208a4b83ccSChris Mason struct btrfs_fs_devices *fs_devices;
4218a4b83ccSChris Mason
4222b82032cSYan Zheng while (!list_empty(&fs_uuids)) {
4232b82032cSYan Zheng fs_devices = list_entry(fs_uuids.next,
424c4babc5eSAnand Jain struct btrfs_fs_devices, fs_list);
425c4babc5eSAnand Jain list_del(&fs_devices->fs_list);
426e4404d6eSYan Zheng free_fs_devices(fs_devices);
4278a4b83ccSChris Mason }
4288a4b83ccSChris Mason }
4298a4b83ccSChris Mason
match_fsid_fs_devices(const struct btrfs_fs_devices * fs_devices,const u8 * fsid,const u8 * metadata_fsid)4301a898345SAnand Jain static bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices,
4311a898345SAnand Jain const u8 *fsid, const u8 *metadata_fsid)
4321a898345SAnand Jain {
4331a898345SAnand Jain if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0)
4341a898345SAnand Jain return false;
4351a898345SAnand Jain
4361a898345SAnand Jain if (!metadata_fsid)
4371a898345SAnand Jain return true;
4381a898345SAnand Jain
4391a898345SAnand Jain if (memcmp(metadata_fsid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0)
4401a898345SAnand Jain return false;
4411a898345SAnand Jain
4421a898345SAnand Jain return true;
4431a898345SAnand Jain }
4441a898345SAnand Jain
find_fsid(const u8 * fsid,const u8 * metadata_fsid)4457239ff4bSNikolay Borisov static noinline struct btrfs_fs_devices *find_fsid(
4467239ff4bSNikolay Borisov const u8 *fsid, const u8 *metadata_fsid)
4478a4b83ccSChris Mason {
4488a4b83ccSChris Mason struct btrfs_fs_devices *fs_devices;
4498a4b83ccSChris Mason
4507239ff4bSNikolay Borisov ASSERT(fsid);
4517239ff4bSNikolay Borisov
452c6730a0eSSu Yue /* Handle non-split brain cases */
453c6730a0eSSu Yue list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
4541a898345SAnand Jain if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid))
455c6730a0eSSu Yue return fs_devices;
456c6730a0eSSu Yue }
457c6730a0eSSu Yue return NULL;
458c6730a0eSSu Yue }
459c6730a0eSSu Yue
460a3c54b0bSAnand Jain /*
461a3c54b0bSAnand Jain * First check if the metadata_uuid is different from the fsid in the given
462a3c54b0bSAnand Jain * fs_devices. Then check if the given fsid is the same as the metadata_uuid
463a3c54b0bSAnand Jain * in the fs_devices. If it is, return true; otherwise, return false.
464a3c54b0bSAnand Jain */
check_fsid_changed(const struct btrfs_fs_devices * fs_devices,const u8 * fsid)465a3c54b0bSAnand Jain static inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices,
466a3c54b0bSAnand Jain const u8 *fsid)
467a3c54b0bSAnand Jain {
468a3c54b0bSAnand Jain return memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
469a3c54b0bSAnand Jain BTRFS_FSID_SIZE) != 0 &&
470a3c54b0bSAnand Jain memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0;
471a3c54b0bSAnand Jain }
472a3c54b0bSAnand Jain
find_fsid_with_metadata_uuid(struct btrfs_super_block * disk_super)473c6730a0eSSu Yue static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
474c6730a0eSSu Yue struct btrfs_super_block *disk_super)
475c6730a0eSSu Yue {
476c6730a0eSSu Yue
477c6730a0eSSu Yue struct btrfs_fs_devices *fs_devices;
478c6730a0eSSu Yue
4797a62d0f0SNikolay Borisov /*
4807a62d0f0SNikolay Borisov * Handle scanned device having completed its fsid change but
4817a62d0f0SNikolay Borisov * belonging to a fs_devices that was created by first scanning
4827a62d0f0SNikolay Borisov * a device which didn't have its fsid/metadata_uuid changed
4837a62d0f0SNikolay Borisov * at all and the CHANGING_FSID_V2 flag set.
4847a62d0f0SNikolay Borisov */
4857a62d0f0SNikolay Borisov list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
4861a898345SAnand Jain if (!fs_devices->fsid_change)
4871a898345SAnand Jain continue;
4881a898345SAnand Jain
4891a898345SAnand Jain if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid,
4901a898345SAnand Jain fs_devices->fsid))
4917a62d0f0SNikolay Borisov return fs_devices;
4927a62d0f0SNikolay Borisov }
4931a898345SAnand Jain
494cc5de4e7SNikolay Borisov /*
495cc5de4e7SNikolay Borisov * Handle scanned device having completed its fsid change but
496cc5de4e7SNikolay Borisov * belonging to a fs_devices that was created by a device that
497cc5de4e7SNikolay Borisov * has an outdated pair of fsid/metadata_uuid and
498cc5de4e7SNikolay Borisov * CHANGING_FSID_V2 flag set.
499cc5de4e7SNikolay Borisov */
500cc5de4e7SNikolay Borisov list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
501a3c54b0bSAnand Jain if (!fs_devices->fsid_change)
502a3c54b0bSAnand Jain continue;
503a3c54b0bSAnand Jain
504a3c54b0bSAnand Jain if (check_fsid_changed(fs_devices, disk_super->metadata_uuid))
505cc5de4e7SNikolay Borisov return fs_devices;
506cc5de4e7SNikolay Borisov }
507c6730a0eSSu Yue
508c6730a0eSSu Yue return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
5097a62d0f0SNikolay Borisov }
5107a62d0f0SNikolay Borisov
5118a4b83ccSChris Mason
512beaf8ab3SStefan Behrens static int
btrfs_get_bdev_and_sb(const char * device_path,blk_mode_t flags,void * holder,int flush,struct block_device ** bdev,struct btrfs_super_block ** disk_super)51305bdb996SChristoph Hellwig btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
514beaf8ab3SStefan Behrens int flush, struct block_device **bdev,
5158f32380dSJohannes Thumshirn struct btrfs_super_block **disk_super)
516beaf8ab3SStefan Behrens {
517beaf8ab3SStefan Behrens int ret;
518beaf8ab3SStefan Behrens
5190718afd4SChristoph Hellwig *bdev = blkdev_get_by_path(device_path, flags, holder, NULL);
520beaf8ab3SStefan Behrens
521beaf8ab3SStefan Behrens if (IS_ERR(*bdev)) {
522beaf8ab3SStefan Behrens ret = PTR_ERR(*bdev);
523beaf8ab3SStefan Behrens goto error;
524beaf8ab3SStefan Behrens }
525beaf8ab3SStefan Behrens
526beaf8ab3SStefan Behrens if (flush)
5271226dfffSChristoph Hellwig sync_blockdev(*bdev);
5289f6d2510SDavid Sterba ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
529beaf8ab3SStefan Behrens if (ret) {
5302736e8eeSChristoph Hellwig blkdev_put(*bdev, holder);
531beaf8ab3SStefan Behrens goto error;
532beaf8ab3SStefan Behrens }
533beaf8ab3SStefan Behrens invalidate_bdev(*bdev);
5348f32380dSJohannes Thumshirn *disk_super = btrfs_read_dev_super(*bdev);
5358f32380dSJohannes Thumshirn if (IS_ERR(*disk_super)) {
5368f32380dSJohannes Thumshirn ret = PTR_ERR(*disk_super);
5372736e8eeSChristoph Hellwig blkdev_put(*bdev, holder);
538beaf8ab3SStefan Behrens goto error;
539beaf8ab3SStefan Behrens }
540beaf8ab3SStefan Behrens
541beaf8ab3SStefan Behrens return 0;
542beaf8ab3SStefan Behrens
543beaf8ab3SStefan Behrens error:
544beaf8ab3SStefan Behrens *bdev = NULL;
545beaf8ab3SStefan Behrens return ret;
546beaf8ab3SStefan Behrens }
547beaf8ab3SStefan Behrens
54843dd529aSDavid Sterba /*
54943dd529aSDavid Sterba * Search and remove all stale devices (which are not mounted). When both
55043dd529aSDavid Sterba * inputs are NULL, it will search and release all stale devices.
55116cab91aSAnand Jain *
55216cab91aSAnand Jain * @devt: Optional. When provided will it release all unmounted devices
55316cab91aSAnand Jain * matching this devt only.
55416cab91aSAnand Jain * @skip_device: Optional. Will skip this device when searching for the stale
555d8367db3SAnand Jain * devices.
55616cab91aSAnand Jain *
55716cab91aSAnand Jain * Return: 0 for success or if @devt is 0.
55816cab91aSAnand Jain * -EBUSY if @devt is a mounted device.
55916cab91aSAnand Jain * -ENOENT if @devt does not match any device in the list.
560d8367db3SAnand Jain */
btrfs_free_stale_devices(dev_t devt,struct btrfs_device * skip_device)56116cab91aSAnand Jain static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
5624fde46f0SAnand Jain {
563fa6d2ae5SAnand Jain struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
564fa6d2ae5SAnand Jain struct btrfs_device *device, *tmp_device;
56570bc7088SAnand Jain int ret = 0;
56670bc7088SAnand Jain
567c1247069SAnand Jain lockdep_assert_held(&uuid_mutex);
568c1247069SAnand Jain
56916cab91aSAnand Jain if (devt)
57070bc7088SAnand Jain ret = -ENOENT;
5714fde46f0SAnand Jain
572fa6d2ae5SAnand Jain list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
5734fde46f0SAnand Jain
57470bc7088SAnand Jain mutex_lock(&fs_devices->device_list_mutex);
575fa6d2ae5SAnand Jain list_for_each_entry_safe(device, tmp_device,
576fa6d2ae5SAnand Jain &fs_devices->devices, dev_list) {
577fa6d2ae5SAnand Jain if (skip_device && skip_device == device)
578d8367db3SAnand Jain continue;
579330a5bf4SAnand Jain if (devt && devt != device->devt)
58038cf665dSAnand Jain continue;
58170bc7088SAnand Jain if (fs_devices->opened) {
58270bc7088SAnand Jain /* for an already deleted device return 0 */
58316cab91aSAnand Jain if (devt && ret != 0)
58470bc7088SAnand Jain ret = -EBUSY;
58570bc7088SAnand Jain break;
58670bc7088SAnand Jain }
5874fde46f0SAnand Jain
5884fde46f0SAnand Jain /* delete the stale device */
589fa6d2ae5SAnand Jain fs_devices->num_devices--;
590fa6d2ae5SAnand Jain list_del(&device->dev_list);
591fa6d2ae5SAnand Jain btrfs_free_device(device);
5927bcb8164SAnand Jain
59370bc7088SAnand Jain ret = 0;
5944fde46f0SAnand Jain }
5957bcb8164SAnand Jain mutex_unlock(&fs_devices->device_list_mutex);
59670bc7088SAnand Jain
5977bcb8164SAnand Jain if (fs_devices->num_devices == 0) {
5987bcb8164SAnand Jain btrfs_sysfs_remove_fsid(fs_devices);
5997bcb8164SAnand Jain list_del(&fs_devices->fs_list);
6007bcb8164SAnand Jain free_fs_devices(fs_devices);
6014fde46f0SAnand Jain }
6024fde46f0SAnand Jain }
60370bc7088SAnand Jain
60470bc7088SAnand Jain return ret;
6054fde46f0SAnand Jain }
6064fde46f0SAnand Jain
/*
 * Open one device for a mount and validate its superblock against the
 * btrfs_device we already registered for it.
 *
 * This is only used on mount, and we are protected from competing things
 * messing with our fs_devices by the uuid_mutex, thus we do not need the
 * fs_devices->device_list_mutex here.
 *
 * On success the device is wired into @fs_devices (open_devices, and for
 * writeable non-replace devices also alloc_list/rw_devices) and 0 is
 * returned.  Any validation failure releases the bdev and returns -EINVAL.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, blk_mode_t flags,
			void *holder)
{
	struct block_device *bdev;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	/* Already open, or registered without a path: nothing we can do. */
	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	/* Open the block device and read its primary super block copy. */
	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &disk_super);
	if (ret)
		return ret;

	/* The on-disk identity must match the device we expected to open. */
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_free_page;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_free_page;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		/*
		 * Seeding devices are read-only by definition; a seeding
		 * device combined with a changed metadata uuid is invalid.
		 */
		if (btrfs_super_incompat_flags(disk_super) &
		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
			pr_err(
		"BTRFS: Invalid seeding and uuid-changed device detected\n");
			goto error_free_page;
		}

		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = true;
	} else {
		/* Honour a read-only block device for the writeable state. */
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	/* Any rotational member marks the whole filesystem as rotating. */
	if (!bdev_nonrot(bdev))
		fs_devices->rotating = true;

	if (bdev_max_discard_sectors(bdev))
		fs_devices->discardable = true;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->holder = holder;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	btrfs_release_disk_super(disk_super);

	return 0;

error_free_page:
	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, holder);

	return -EINVAL;
}
6830fb08bccSAnand Jain
btrfs_sb_fsid_ptr(struct btrfs_super_block * sb)6844844c366SAnand Jain u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
6854844c366SAnand Jain {
6864844c366SAnand Jain bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
6874844c366SAnand Jain BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
6884844c366SAnand Jain
6894844c366SAnand Jain return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
6904844c366SAnand Jain }
6914844c366SAnand Jain
is_same_device(struct btrfs_device * device,const char * new_path)692*a5d74fa2SQu Wenruo static bool is_same_device(struct btrfs_device *device, const char *new_path)
693*a5d74fa2SQu Wenruo {
694*a5d74fa2SQu Wenruo struct path old = { .mnt = NULL, .dentry = NULL };
695*a5d74fa2SQu Wenruo struct path new = { .mnt = NULL, .dentry = NULL };
696*a5d74fa2SQu Wenruo char *old_path = NULL;
697*a5d74fa2SQu Wenruo bool is_same = false;
698*a5d74fa2SQu Wenruo int ret;
699*a5d74fa2SQu Wenruo
700*a5d74fa2SQu Wenruo if (!device->name)
701*a5d74fa2SQu Wenruo goto out;
702*a5d74fa2SQu Wenruo
703*a5d74fa2SQu Wenruo old_path = kzalloc(PATH_MAX, GFP_NOFS);
704*a5d74fa2SQu Wenruo if (!old_path)
705*a5d74fa2SQu Wenruo goto out;
706*a5d74fa2SQu Wenruo
707*a5d74fa2SQu Wenruo rcu_read_lock();
708*a5d74fa2SQu Wenruo ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX);
709*a5d74fa2SQu Wenruo rcu_read_unlock();
710*a5d74fa2SQu Wenruo if (ret < 0)
711*a5d74fa2SQu Wenruo goto out;
712*a5d74fa2SQu Wenruo
713*a5d74fa2SQu Wenruo ret = kern_path(old_path, LOOKUP_FOLLOW, &old);
714*a5d74fa2SQu Wenruo if (ret)
715*a5d74fa2SQu Wenruo goto out;
716*a5d74fa2SQu Wenruo ret = kern_path(new_path, LOOKUP_FOLLOW, &new);
717*a5d74fa2SQu Wenruo if (ret)
718*a5d74fa2SQu Wenruo goto out;
719*a5d74fa2SQu Wenruo if (path_equal(&old, &new))
720*a5d74fa2SQu Wenruo is_same = true;
721*a5d74fa2SQu Wenruo out:
722*a5d74fa2SQu Wenruo kfree(old_path);
723*a5d74fa2SQu Wenruo path_put(&old);
724*a5d74fa2SQu Wenruo path_put(&new);
725*a5d74fa2SQu Wenruo return is_same;
726*a5d74fa2SQu Wenruo }
727*a5d74fa2SQu Wenruo
72860999ca4SDavid Sterba /*
7297a62d0f0SNikolay Borisov * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
730c0d81c7cSSu Yue * being created with a disk that has already completed its fsid change. Such
731c0d81c7cSSu Yue * disk can belong to an fs which has its FSID changed or to one which doesn't.
732c0d81c7cSSu Yue * Handle both cases here.
7337a62d0f0SNikolay Borisov */
find_fsid_inprogress(struct btrfs_super_block * disk_super)7347a62d0f0SNikolay Borisov static struct btrfs_fs_devices *find_fsid_inprogress(
7357a62d0f0SNikolay Borisov struct btrfs_super_block *disk_super)
7367a62d0f0SNikolay Borisov {
7377a62d0f0SNikolay Borisov struct btrfs_fs_devices *fs_devices;
7387a62d0f0SNikolay Borisov
7397a62d0f0SNikolay Borisov list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
740a3c54b0bSAnand Jain if (fs_devices->fsid_change)
741a3c54b0bSAnand Jain continue;
742a3c54b0bSAnand Jain
743a3c54b0bSAnand Jain if (check_fsid_changed(fs_devices, disk_super->fsid))
7447a62d0f0SNikolay Borisov return fs_devices;
7457a62d0f0SNikolay Borisov }
7467a62d0f0SNikolay Borisov
747c0d81c7cSSu Yue return find_fsid(disk_super->fsid, NULL);
7487a62d0f0SNikolay Borisov }
7497a62d0f0SNikolay Borisov
find_fsid_changed(struct btrfs_super_block * disk_super)750cc5de4e7SNikolay Borisov static struct btrfs_fs_devices *find_fsid_changed(
751cc5de4e7SNikolay Borisov struct btrfs_super_block *disk_super)
752cc5de4e7SNikolay Borisov {
753cc5de4e7SNikolay Borisov struct btrfs_fs_devices *fs_devices;
754cc5de4e7SNikolay Borisov
755cc5de4e7SNikolay Borisov /*
756cc5de4e7SNikolay Borisov * Handles the case where scanned device is part of an fs that had
7571a9fd417SDavid Sterba * multiple successful changes of FSID but currently device didn't
75805840710SNikolay Borisov * observe it. Meaning our fsid will be different than theirs. We need
75905840710SNikolay Borisov * to handle two subcases :
76005840710SNikolay Borisov * 1 - The fs still continues to have different METADATA/FSID uuids.
76105840710SNikolay Borisov * 2 - The fs is switched back to its original FSID (METADATA/FSID
76205840710SNikolay Borisov * are equal).
763cc5de4e7SNikolay Borisov */
764cc5de4e7SNikolay Borisov list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
76505840710SNikolay Borisov /* Changed UUIDs */
766a3c54b0bSAnand Jain if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) &&
767cc5de4e7SNikolay Borisov memcmp(fs_devices->fsid, disk_super->fsid,
76805840710SNikolay Borisov BTRFS_FSID_SIZE) != 0)
769cc5de4e7SNikolay Borisov return fs_devices;
77005840710SNikolay Borisov
77105840710SNikolay Borisov /* Unchanged UUIDs */
77205840710SNikolay Borisov if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
77305840710SNikolay Borisov BTRFS_FSID_SIZE) == 0 &&
77405840710SNikolay Borisov memcmp(fs_devices->fsid, disk_super->metadata_uuid,
77505840710SNikolay Borisov BTRFS_FSID_SIZE) == 0)
77605840710SNikolay Borisov return fs_devices;
777cc5de4e7SNikolay Borisov }
778cc5de4e7SNikolay Borisov
779cc5de4e7SNikolay Borisov return NULL;
780cc5de4e7SNikolay Borisov }
7811362089dSNikolay Borisov
find_fsid_reverted_metadata(struct btrfs_super_block * disk_super)7821362089dSNikolay Borisov static struct btrfs_fs_devices *find_fsid_reverted_metadata(
7831362089dSNikolay Borisov struct btrfs_super_block *disk_super)
7841362089dSNikolay Borisov {
7851362089dSNikolay Borisov struct btrfs_fs_devices *fs_devices;
7861362089dSNikolay Borisov
7871362089dSNikolay Borisov /*
7881362089dSNikolay Borisov * Handle the case where the scanned device is part of an fs whose last
7891362089dSNikolay Borisov * metadata UUID change reverted it to the original FSID. At the same
79067da05b3SColin Ian King * time fs_devices was first created by another constituent device
7911362089dSNikolay Borisov * which didn't fully observe the operation. This results in an
7921362089dSNikolay Borisov * btrfs_fs_devices created with metadata/fsid different AND
7931362089dSNikolay Borisov * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
7941362089dSNikolay Borisov * fs_devices equal to the FSID of the disk.
7951362089dSNikolay Borisov */
7961362089dSNikolay Borisov list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
797a3c54b0bSAnand Jain if (!fs_devices->fsid_change)
798a3c54b0bSAnand Jain continue;
799a3c54b0bSAnand Jain
800a3c54b0bSAnand Jain if (check_fsid_changed(fs_devices, disk_super->fsid))
8011362089dSNikolay Borisov return fs_devices;
8021362089dSNikolay Borisov }
8031362089dSNikolay Borisov
8041362089dSNikolay Borisov return NULL;
8051362089dSNikolay Borisov }
/*
 * Add new device to list of registered devices
 *
 * Called from device scan with the uuid_mutex held.  Finds (or creates)
 * the matching btrfs_fs_devices, then either registers a brand new
 * btrfs_device for @path or updates the path of an existing one.
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
	dev_t path_devt;
	int error;
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

	/* Resolve the path to a dev_t so duplicates can be told apart later. */
	error = lookup_bdev(path, &path_devt);
	if (error) {
		btrfs_err(NULL, "failed to lookup block device for path %s: %d",
			  path, error);
		return ERR_PTR(error);
	}

	/*
	 * Pick the lookup strategy depending on whether the disk is in the
	 * middle of an fsid change and whether it carries a metadata uuid.
	 */
	if (fsid_change_in_progress) {
		if (!has_metadata_uuid)
			fs_devices = find_fsid_inprogress(disk_super);
		else
			fs_devices = find_fsid_changed(disk_super);
	} else if (has_metadata_uuid) {
		fs_devices = find_fsid_with_metadata_uuid(disk_super);
	} else {
		fs_devices = find_fsid_reverted_metadata(disk_super);
		if (!fs_devices)
			fs_devices = find_fsid(disk_super->fsid, NULL);
	}


	if (!fs_devices) {
		/* First device of this filesystem we have ever seen. */
		fs_devices = alloc_fs_devices(disk_super->fsid,
				has_metadata_uuid ? disk_super->metadata_uuid : NULL);
		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		struct btrfs_dev_lookup_args args = {
			.devid = devid,
			.uuid = disk_super->dev_item.uuid,
		};

		mutex_lock(&fs_devices->device_list_mutex);
		device = btrfs_find_device(fs_devices, &args);

		/*
		 * If this disk has been pulled into an fs devices created by
		 * a device which had the CHANGING_FSID_V2 flag then replace the
		 * metadata_uuid/fsid values of the fs_devices.
		 */
		if (fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
					BTRFS_FSID_SIZE);
			memcpy(fs_devices->metadata_uuid,
			       btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE);
			fs_devices->fsid_change = false;
		}
	}

	if (!device) {
		unsigned int nofs_flag;

		/* No new devices may be added to a mounted filesystem. */
		if (fs_devices->opened) {
			btrfs_err(NULL,
"device %s belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)",
				  path, fs_devices->fsid, current->comm,
				  task_pid_nr(current));
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		nofs_flag = memalloc_nofs_save();
		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid, path);
		memalloc_nofs_restore(nofs_flag);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		device->devt = path_devt;

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info(
	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
		else
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));

	} else if (!device->name || !is_same_device(device, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         different name. or
		 *      b. The missing-disk-which-was-replaced, has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be a spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at all time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted. We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid.We keep the one
			 * with larger generation number or the last-in if
			 * generation are equal.
			 */
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_err(NULL,
"device %s already registered with a higher generation, found %llu expect %llu",
				  path, found_transid, device->generation);
			return ERR_PTR(-EEXIST);
		}

		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted
		 *
		 * NOTE: the device->fs_info may not be reliable here so pass
		 * in a NULL to message helpers instead. This avoids a possible
		 * use-after-free when the fs_info and fs_info->sb are already
		 * torn down.
		 */
		if (device->bdev) {
			if (device->devt != path_devt) {
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_warn_in_rcu(NULL,
	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
					path, devid, found_transid,
					current->comm,
					task_pid_nr(current));
				return ERR_PTR(-EEXIST);
			}
			btrfs_info_in_rcu(NULL,
	"devid %llu device path %s changed to %s scanned by %s (%d)",
				devid, btrfs_dev_name(device),
				path, current->comm,
				task_pid_nr(current));
		}

		/* Record the new path; readers access device->name under RCU. */
		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
		device->devt = path_devt;
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened) {
		device->generation = found_transid;
		fs_devices->latest_generation = max_t(u64, found_transid,
						fs_devices->latest_generation);
	}

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	mutex_unlock(&fs_devices->device_list_mutex);
	return device;
}
10268a4b83ccSChris Mason
/*
 * Create a copy of @orig: a new btrfs_fs_devices with a freshly allocated
 * btrfs_device for every device on @orig's list (including any zone info).
 *
 * Must be called with the uuid_mutex held.  On failure everything that was
 * allocated so far is freed and an ERR_PTR is returned.
 */
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
	int ret = 0;

	lockdep_assert_held(&uuid_mutex);

	fs_devices = alloc_fs_devices(orig->fsid, NULL);
	if (IS_ERR(fs_devices))
		return fs_devices;

	fs_devices->total_devices = orig->total_devices;

	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		const char *dev_path = NULL;

		/*
		 * This is ok to do without RCU read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name)
			dev_path = orig_dev->name->str;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid, dev_path);
		if (IS_ERR(device)) {
			ret = PTR_ERR(device);
			goto error;
		}

		/* Deep-copy the zoned-device info if the original has any. */
		if (orig_dev->zone_info) {
			struct btrfs_zoned_device_info *zone_info;

			zone_info = btrfs_clone_dev_zone_info(orig_dev);
			if (!zone_info) {
				btrfs_free_device(device);
				ret = -ENOMEM;
				goto error;
			}
			device->zone_info = zone_info;
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	return fs_devices;
error:
	/* Frees the partially built list of cloned devices as well. */
	free_fs_devices(fs_devices);
	return ERR_PTR(ret);
}
1080e4404d6eSYan Zheng
__btrfs_free_extra_devids(struct btrfs_fs_devices * fs_devices,struct btrfs_device ** latest_dev)10813712ccb7SNikolay Borisov static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
1082bacce86aSAnand Jain struct btrfs_device **latest_dev)
1083dfe25020SChris Mason {
1084c6e30871SQinghuang Feng struct btrfs_device *device, *next;
1085a6b0d5c8SChris Mason
108646224705SXiao Guangrong /* This is the initialized path, it is safe to release the devices. */
1087c6e30871SQinghuang Feng list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
10883712ccb7SNikolay Borisov if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
1089401e29c1SAnand Jain if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
1090401e29c1SAnand Jain &device->dev_state) &&
1091998a0671SAnand Jain !test_bit(BTRFS_DEV_STATE_MISSING,
1092998a0671SAnand Jain &device->dev_state) &&
10933712ccb7SNikolay Borisov (!*latest_dev ||
10943712ccb7SNikolay Borisov device->generation > (*latest_dev)->generation)) {
10953712ccb7SNikolay Borisov *latest_dev = device;
1096a6b0d5c8SChris Mason }
10972b82032cSYan Zheng continue;
1098a6b0d5c8SChris Mason }
10992b82032cSYan Zheng
11008dabb742SStefan Behrens /*
1101cf89af14SAnand Jain * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
1102cf89af14SAnand Jain * in btrfs_init_dev_replace() so just continue.
11038dabb742SStefan Behrens */
1104cf89af14SAnand Jain if (device->devid == BTRFS_DEV_REPLACE_DEVID)
11058dabb742SStefan Behrens continue;
1106cf89af14SAnand Jain
1107a74a4b97SChris Mason if (device->bdev) {
11082736e8eeSChristoph Hellwig blkdev_put(device->bdev, device->holder);
11092b82032cSYan Zheng device->bdev = NULL;
1110a74a4b97SChris Mason fs_devices->open_devices--;
1111a74a4b97SChris Mason }
1112ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
11132b82032cSYan Zheng list_del_init(&device->dev_alloc_list);
1114ebbede42SAnand Jain clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1115b2a61667SDesmond Cheong Zhi Xi fs_devices->rw_devices--;
11162b82032cSYan Zheng }
11172b82032cSYan Zheng list_del_init(&device->dev_list);
11182b82032cSYan Zheng fs_devices->num_devices--;
1119a425f9d4SDavid Sterba btrfs_free_device(device);
11202b82032cSYan Zheng }
11212b82032cSYan Zheng
11223712ccb7SNikolay Borisov }
11233712ccb7SNikolay Borisov
11243712ccb7SNikolay Borisov /*
11253712ccb7SNikolay Borisov * After we have read the system tree and know devids belonging to this
11263712ccb7SNikolay Borisov * filesystem, remove the device which does not belong there.
11273712ccb7SNikolay Borisov */
btrfs_free_extra_devids(struct btrfs_fs_devices * fs_devices)1128bacce86aSAnand Jain void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
11293712ccb7SNikolay Borisov {
11303712ccb7SNikolay Borisov struct btrfs_device *latest_dev = NULL;
1131944d3f9fSNikolay Borisov struct btrfs_fs_devices *seed_dev;
11323712ccb7SNikolay Borisov
11333712ccb7SNikolay Borisov mutex_lock(&uuid_mutex);
1134bacce86aSAnand Jain __btrfs_free_extra_devids(fs_devices, &latest_dev);
1135944d3f9fSNikolay Borisov
1136944d3f9fSNikolay Borisov list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
1137bacce86aSAnand Jain __btrfs_free_extra_devids(seed_dev, &latest_dev);
11382b82032cSYan Zheng
1139d24fa5c1SAnand Jain fs_devices->latest_dev = latest_dev;
1140a6b0d5c8SChris Mason
1141dfe25020SChris Mason mutex_unlock(&uuid_mutex);
1142dfe25020SChris Mason }
1143a0af469bSChris Mason
btrfs_close_bdev(struct btrfs_device * device)114414238819SAnand Jain static void btrfs_close_bdev(struct btrfs_device *device)
114514238819SAnand Jain {
114608ffcae8SDavid Sterba if (!device->bdev)
114708ffcae8SDavid Sterba return;
114808ffcae8SDavid Sterba
1149ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
115014238819SAnand Jain sync_blockdev(device->bdev);
115114238819SAnand Jain invalidate_bdev(device->bdev);
115214238819SAnand Jain }
115314238819SAnand Jain
11542736e8eeSChristoph Hellwig blkdev_put(device->bdev, device->holder);
115514238819SAnand Jain }
115614238819SAnand Jain
btrfs_close_one_device(struct btrfs_device * device)1157959b1c04SNikolay Borisov static void btrfs_close_one_device(struct btrfs_device *device)
1158f448341aSAnand Jain {
1159f448341aSAnand Jain struct btrfs_fs_devices *fs_devices = device->fs_devices;
1160f448341aSAnand Jain
1161ebbede42SAnand Jain if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1162f448341aSAnand Jain device->devid != BTRFS_DEV_REPLACE_DEVID) {
1163f448341aSAnand Jain list_del_init(&device->dev_alloc_list);
1164f448341aSAnand Jain fs_devices->rw_devices--;
1165f448341aSAnand Jain }
1166f448341aSAnand Jain
11670d977e0eSDesmond Cheong Zhi Xi if (device->devid == BTRFS_DEV_REPLACE_DEVID)
11680d977e0eSDesmond Cheong Zhi Xi clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
11690d977e0eSDesmond Cheong Zhi Xi
11705d03dbebSLi Zhang if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
11715d03dbebSLi Zhang clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
1172f448341aSAnand Jain fs_devices->missing_devices--;
11735d03dbebSLi Zhang }
1174f448341aSAnand Jain
1175959b1c04SNikolay Borisov btrfs_close_bdev(device);
1176321f69f8SJohannes Thumshirn if (device->bdev) {
11773fff3975SJohannes Thumshirn fs_devices->open_devices--;
1178321f69f8SJohannes Thumshirn device->bdev = NULL;
1179f448341aSAnand Jain }
1180321f69f8SJohannes Thumshirn clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
11815b316468SNaohiro Aota btrfs_destroy_dev_zone_info(device);
1182f448341aSAnand Jain
1183321f69f8SJohannes Thumshirn device->fs_info = NULL;
1184321f69f8SJohannes Thumshirn atomic_set(&device->dev_stats_ccnt, 0);
1185321f69f8SJohannes Thumshirn extent_io_tree_release(&device->alloc_state);
1186959b1c04SNikolay Borisov
11876b225baaSFilipe Manana /*
11886b225baaSFilipe Manana * Reset the flush error record. We might have a transient flush error
11896b225baaSFilipe Manana * in this mount, and if so we aborted the current transaction and set
11906b225baaSFilipe Manana * the fs to an error state, guaranteeing no super blocks can be further
11916b225baaSFilipe Manana * committed. However that error might be transient and if we unmount the
11926b225baaSFilipe Manana * filesystem and mount it again, we should allow the mount to succeed
11936b225baaSFilipe Manana * (btrfs_check_rw_degradable() should not fail) - if after mounting the
11946b225baaSFilipe Manana * filesystem again we still get flush errors, then we will again abort
11956b225baaSFilipe Manana * any transaction and set the error state, guaranteeing no commits of
11966b225baaSFilipe Manana * unsafe super blocks.
11976b225baaSFilipe Manana */
11986b225baaSFilipe Manana device->last_flush_error = 0;
11996b225baaSFilipe Manana
1200321f69f8SJohannes Thumshirn /* Verify the device is back in a pristine state */
12011f16033cSAnand Jain WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
12021f16033cSAnand Jain WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
12031f16033cSAnand Jain WARN_ON(!list_empty(&device->dev_alloc_list));
12041f16033cSAnand Jain WARN_ON(!list_empty(&device->post_commit_list));
1205f448341aSAnand Jain }
1206f448341aSAnand Jain
close_fs_devices(struct btrfs_fs_devices * fs_devices)120754eed6aeSNikolay Borisov static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
12088a4b83ccSChris Mason {
12092037a093SSasha Levin struct btrfs_device *device, *tmp;
1210e4404d6eSYan Zheng
1211425c6ed6SJosef Bacik lockdep_assert_held(&uuid_mutex);
1212425c6ed6SJosef Bacik
12132b82032cSYan Zheng if (--fs_devices->opened > 0)
121454eed6aeSNikolay Borisov return;
12158a4b83ccSChris Mason
1216425c6ed6SJosef Bacik list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
1217959b1c04SNikolay Borisov btrfs_close_one_device(device);
1218c9513edbSXiao Guangrong
1219e4404d6eSYan Zheng WARN_ON(fs_devices->open_devices);
1220e4404d6eSYan Zheng WARN_ON(fs_devices->rw_devices);
12212b82032cSYan Zheng fs_devices->opened = 0;
12220395d84fSJohannes Thumshirn fs_devices->seeding = false;
1223c4989c2fSNikolay Borisov fs_devices->fs_info = NULL;
12248a4b83ccSChris Mason }
12258a4b83ccSChris Mason
btrfs_close_devices(struct btrfs_fs_devices * fs_devices)122654eed6aeSNikolay Borisov void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
12272b82032cSYan Zheng {
1228944d3f9fSNikolay Borisov LIST_HEAD(list);
1229944d3f9fSNikolay Borisov struct btrfs_fs_devices *tmp;
12302b82032cSYan Zheng
12312b82032cSYan Zheng mutex_lock(&uuid_mutex);
123254eed6aeSNikolay Borisov close_fs_devices(fs_devices);
12335f58d783SAnand Jain if (!fs_devices->opened) {
1234944d3f9fSNikolay Borisov list_splice_init(&fs_devices->seed_list, &list);
1235e4404d6eSYan Zheng
12365f58d783SAnand Jain /*
12375f58d783SAnand Jain * If the struct btrfs_fs_devices is not assembled with any
12385f58d783SAnand Jain * other device, it can be re-initialized during the next mount
12395f58d783SAnand Jain * without the needing device-scan step. Therefore, it can be
12405f58d783SAnand Jain * fully freed.
12415f58d783SAnand Jain */
12425f58d783SAnand Jain if (fs_devices->num_devices == 1) {
12435f58d783SAnand Jain list_del(&fs_devices->fs_list);
12445f58d783SAnand Jain free_fs_devices(fs_devices);
12455f58d783SAnand Jain }
12465f58d783SAnand Jain }
12475f58d783SAnand Jain
12485f58d783SAnand Jain
1249944d3f9fSNikolay Borisov list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
12500226e0ebSAnand Jain close_fs_devices(fs_devices);
1251944d3f9fSNikolay Borisov list_del(&fs_devices->seed_list);
1252e4404d6eSYan Zheng free_fs_devices(fs_devices);
1253e4404d6eSYan Zheng }
1254425c6ed6SJosef Bacik mutex_unlock(&uuid_mutex);
12552b82032cSYan Zheng }
12562b82032cSYan Zheng
/*
 * Open all member devices of @fs_devices.
 *
 * Devices whose open fails with -ENODATA (no btrfs superblock found) are
 * unlinked from the list and freed.  Other per-device errors are remembered
 * so the first one can be reported if nothing ends up open.  The device
 * whose superblock carries the highest generation becomes
 * fs_devices->latest_dev.
 *
 * Returns 0 if at least one device was opened; otherwise the first error
 * seen, or -EINVAL when no device reported an error yet none opened.
 */
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
			   blk_mode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_device *tmp_device;
	int ret = 0;

	/* _safe variant: the -ENODATA branch deletes entries while iterating. */
	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
				 dev_list) {
		int ret2;

		ret2 = btrfs_open_one_device(fs_devices, device, flags, holder);
		if (ret2 == 0 &&
		    (!latest_dev || device->generation > latest_dev->generation)) {
			/* Track the device with the newest superblock. */
			latest_dev = device;
		} else if (ret2 == -ENODATA) {
			/* Not a btrfs device: drop it from the list entirely. */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);
		}
		/* Remember only the first failure; keep opening the rest. */
		if (ret == 0 && ret2 != 0)
			ret = ret2;
	}

	if (fs_devices->open_devices == 0) {
		/* Nothing opened: report the first error if there was one. */
		if (ret)
			return ret;
		return -EINVAL;
	}

	/* At least one device is usable: initialize the open-state defaults. */
	fs_devices->opened = 1;
	fs_devices->latest_dev = latest_dev;
	fs_devices->total_rw_bytes = 0;
	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
	fs_devices->read_policy = BTRFS_READ_POLICY_PID;

	return 0;
}
12962b82032cSYan Zheng
devid_cmp(void * priv,const struct list_head * a,const struct list_head * b)12974f0f586bSSami Tolvanen static int devid_cmp(void *priv, const struct list_head *a,
12984f0f586bSSami Tolvanen const struct list_head *b)
1299f8e10cd3SAnand Jain {
1300214cc184SDavid Sterba const struct btrfs_device *dev1, *dev2;
1301f8e10cd3SAnand Jain
1302f8e10cd3SAnand Jain dev1 = list_entry(a, struct btrfs_device, dev_list);
1303f8e10cd3SAnand Jain dev2 = list_entry(b, struct btrfs_device, dev_list);
1304f8e10cd3SAnand Jain
1305f8e10cd3SAnand Jain if (dev1->devid < dev2->devid)
1306f8e10cd3SAnand Jain return -1;
1307f8e10cd3SAnand Jain else if (dev1->devid > dev2->devid)
1308f8e10cd3SAnand Jain return 1;
1309f8e10cd3SAnand Jain return 0;
1310f8e10cd3SAnand Jain }
1311f8e10cd3SAnand Jain
/*
 * Open @fs_devices, or take another reference if it is already open.
 *
 * On the first open the device list is sorted by devid so that it has a
 * stable, predictable order before the per-device opens happen.
 */
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       blk_mode_t flags, void *holder)
{
	lockdep_assert_held(&uuid_mutex);
	/*
	 * The device_list_mutex cannot be taken here in case opening the
	 * underlying device takes further locks like open_mutex.
	 *
	 * We also don't need the lock here as this is called during mount and
	 * exclusion is provided by uuid_mutex
	 */

	/* Already open: just bump the open count. */
	if (fs_devices->opened) {
		fs_devices->opened++;
		return 0;
	}

	list_sort(NULL, &fs_devices->devices, devid_cmp);
	return open_fs_devices(fs_devices, flags, holder);
}
13368a4b83ccSChris Mason
/*
 * Release the pagecache page reference taken by btrfs_read_disk_super()
 * for the page backing @super.
 */
void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	put_page(virt_to_page(super));
}
13436cf86a00SAnand Jain
/*
 * Read a btrfs superblock from @bdev through the pagecache.
 *
 * @bytenr:      physical byte offset to read the superblock from
 * @bytenr_orig: expected value of the bytenr field stored inside the
 *               superblock (differs from @bytenr on zoned devices)
 *
 * Returns a pointer into the pagecache page holding the superblock, to be
 * released with btrfs_release_disk_super(), or ERR_PTR(-EINVAL) when the
 * superblock would not fit the device or a page, or fails the magic/bytenr
 * checks.
 */
static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
						       u64 bytenr, u64 bytenr_orig)
{
	struct btrfs_super_block *disk_super;
	struct page *page;
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
		return ERR_PTR(-EINVAL);

	/* make sure our super fits in the page */
	if (sizeof(*disk_super) > PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
		return ERR_PTR(-EINVAL);

	/* pull in the page with our super */
	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);

	if (IS_ERR(page))
		return ERR_CAST(page);

	p = page_address(page);

	/* align our pointer to the offset of the super block */
	disk_super = p + offset_in_page(bytenr);

	/* Reject anything that is not the superblock we asked for. */
	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(p);
		return ERR_PTR(-EINVAL);
	}

	/* Force NUL-termination of the label read from disk. */
	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;

	return disk_super;
}
13876cf86a00SAnand Jain
/*
 * Drop any stale scanned-device records matching @devt.
 *
 * Thin wrapper taking uuid_mutex around btrfs_free_stale_devices().
 */
int btrfs_forget_devices(dev_t devt)
{
	int err;

	mutex_lock(&uuid_mutex);
	err = btrfs_free_stale_devices(devt, NULL);
	mutex_unlock(&uuid_mutex);

	return err;
}
1398228a73abSAnand Jain
/*
 * Look for a btrfs signature on a device. This may be called out of the
 * mount path and we are not allowed to call set_blocksize during the scan.
 * The superblock is read via the pagecache.
 *
 * Returns the registered btrfs_device on success, or an ERR_PTR on failure
 * (no superblock found, read error, or registration failure).
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
{
	struct btrfs_super_block *disk_super;
	bool new_device_added = false;
	struct btrfs_device *device = NULL;
	struct block_device *bdev;
	u64 bytenr, bytenr_orig;
	int ret;

	lockdep_assert_held(&uuid_mutex);

	/*
	 * We would like to check all the supers, but that would make a btrfs
	 * mount succeed after a mkfs from a different FS.  So, we need to add
	 * a special mount option to scan for later supers, using
	 * BTRFS_SUPER_MIRROR_MAX instead.
	 */

	/*
	 * Avoid an exclusive open here, as the systemd-udev may initiate the
	 * device scan which may race with the user's mount or mkfs command,
	 * resulting in failure.
	 * Since the device scan is solely for reading purposes, there is no
	 * need for an exclusive open. Additionally, the devices are read again
	 * during the mount process. It is ok to get some inconsistent
	 * values temporarily, as the device paths of the fsid are the only
	 * required information for assembling the volume.
	 */
	bdev = blkdev_get_by_path(path, flags, NULL, NULL);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);

	/* Zone-aware lookup of where the primary superblock actually lives. */
	bytenr_orig = btrfs_sb_offset(0);
	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
	if (ret) {
		device = ERR_PTR(ret);
		goto error_bdev_put;
	}

	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
	if (IS_ERR(disk_super)) {
		device = ERR_CAST(disk_super);
		goto error_bdev_put;
	}

	/* Register (or look up) the device in the global scanned list. */
	device = device_list_add(path, disk_super, &new_device_added);
	if (!IS_ERR(device) && new_device_added)
		btrfs_free_stale_devices(device->devt, device);

	btrfs_release_disk_super(disk_super);

error_bdev_put:
	blkdev_put(bdev, NULL);

	return device;
}
14600b86a832SChris Mason
1461c152b63eSFilipe Manana /*
14621c11b63eSJeff Mahoney * Try to find a chunk that intersects [start, start + len] range and when one
14631c11b63eSJeff Mahoney * such is found, record the end of it in *start
1464c152b63eSFilipe Manana */
contains_pending_extent(struct btrfs_device * device,u64 * start,u64 len)14651c11b63eSJeff Mahoney static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
14661c11b63eSJeff Mahoney u64 len)
14671c11b63eSJeff Mahoney {
14681c11b63eSJeff Mahoney u64 physical_start, physical_end;
14696df9a95eSJosef Bacik
14701c11b63eSJeff Mahoney lockdep_assert_held(&device->fs_info->chunk_mutex);
14711c11b63eSJeff Mahoney
1472e5860f82SFilipe Manana if (find_first_extent_bit(&device->alloc_state, *start,
14731c11b63eSJeff Mahoney &physical_start, &physical_end,
14741c11b63eSJeff Mahoney CHUNK_ALLOCATED, NULL)) {
14751c11b63eSJeff Mahoney
14761c11b63eSJeff Mahoney if (in_range(physical_start, *start, len) ||
14771c11b63eSJeff Mahoney in_range(*start, physical_start,
147851dad05fSFilipe Manana physical_end + 1 - physical_start)) {
14791c11b63eSJeff Mahoney *start = physical_end + 1;
14801c11b63eSJeff Mahoney return true;
14811c11b63eSJeff Mahoney }
14821c11b63eSJeff Mahoney }
14831c11b63eSJeff Mahoney return false;
14846df9a95eSJosef Bacik }
14856df9a95eSJosef Bacik
/*
 * Return the first byte offset on @device that the chunk allocator may
 * consider when searching for free space.
 */
static u64 dev_extent_search_start(struct btrfs_device *device)
{
	switch (device->fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		/* Skip the reserved range at the start of the device. */
		return BTRFS_DEVICE_RANGE_RESERVED;
	case BTRFS_CHUNK_ALLOC_ZONED:
		/*
		 * We don't care about the starting region like regular
		 * allocator, because we anyway use/reserve the first two zones
		 * for superblock logging.
		 */
		return 0;
	default:
		BUG();
	}
}
15023b4ffa40SNaohiro Aota
/*
 * Adjust a free-space hole on a zoned device so it starts at an
 * allocatable, empty run of zones.
 *
 * @hole_start/@hole_size are advanced/shrunk in place: first to the next
 * allocatable zone position, then zone by zone while the zones cannot be
 * ensured empty.  Returns true if either value was modified.
 */
static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
					u64 *hole_start, u64 *hole_size,
					u64 num_bytes)
{
	u64 zone_size = device->zone_info->zone_size;
	u64 pos;
	int ret;
	bool changed = false;

	/* Callers pass zone-aligned holes; see find_free_dev_extent(). */
	ASSERT(IS_ALIGNED(*hole_start, zone_size));

	while (*hole_size > 0) {
		pos = btrfs_find_allocatable_zones(device, *hole_start,
						   *hole_start + *hole_size,
						   num_bytes);
		if (pos != *hole_start) {
			/* Shrink the hole to begin at the allocatable position. */
			*hole_size = *hole_start + *hole_size - pos;
			*hole_start = pos;
			changed = true;
			if (*hole_size < num_bytes)
				break;
		}

		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);

		/* Range is ensured to be empty */
		if (!ret)
			return changed;

		/* Given hole range was invalid (outside of device) */
		if (ret == -ERANGE) {
			*hole_start += *hole_size;
			*hole_size = 0;
			return true;
		}

		/* Zones not empty: skip one zone and retry. */
		*hole_start += zone_size;
		*hole_size -= zone_size;
		changed = true;
	}

	return changed;
}
15461cd6121fSNaohiro Aota
/*
 * Check if the specified hole is suitable for allocation.
 *
 * @device:	the device which we have the hole
 * @hole_start:	starting position of the hole
 * @hole_size:	the size of the hole
 * @num_bytes:	the size of the free space that we need
 *
 * This function may modify @hole_start and @hole_size to reflect the
 * suitable position for allocation.  Returns true if the hole position was
 * updated, false otherwise.
 */
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
				  u64 *hole_size, u64 num_bytes)
{
	bool changed = false;
	u64 hole_end = *hole_start + *hole_size;

	for (;;) {
		/*
		 * Check before we set max_hole_start, otherwise we could end up
		 * sending back this offset anyway.
		 */
		if (contains_pending_extent(device, hole_start, *hole_size)) {
			/* *hole_start moved forward; recompute the remainder. */
			if (hole_end >= *hole_start)
				*hole_size = hole_end - *hole_start;
			else
				*hole_size = 0;
			changed = true;
		}

		switch (device->fs_devices->chunk_alloc_policy) {
		case BTRFS_CHUNK_ALLOC_REGULAR:
			/* No extra check */
			break;
		case BTRFS_CHUNK_ALLOC_ZONED:
			if (dev_extent_hole_check_zoned(device, hole_start,
							hole_size, num_bytes)) {
				changed = true;
				/*
				 * The changed hole can contain pending extent.
				 * Loop again to check that.
				 */
				continue;
			}
			break;
		default:
			BUG();
		}

		break;
	}

	return changed;
}
16016df9a95eSJosef Bacik
/*
 * Find free space in the specified device.
 *
 * @device:	the device which we search the free space in
 * @num_bytes:	the size of the free space that we need
 * @start:	store the start of the free space.
 * @len:	the size of the free space that we find, or the size
 *		of the max free space if we don't find suitable free space
 *
 * This does a pretty simple search, the expectation is that it is called
 * very infrequently and that a given device has a small number of extents.
 *
 * @start is used to store the start of the free space we find.  But if we
 * don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 *
 * NOTE: This function will search *commit* root of device tree, and does extra
 * check to ensure dev extents are not double allocated.
 * This makes the function safe to allocate dev extents but may not report
 * correct usable device space, as device extent freed in current transaction
 * is not reported as available.
 */
static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
				u64 *start, u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 search_start;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size = 0;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	search_start = dev_extent_search_start(device);
	max_hole_start = search_start;

	/* Zoned allocation must be done in zone-size multiples. */
	WARN_ON(device->zone_info &&
		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
again:
	/* No room left, or the device is a replace target being rebuilt. */
	if (search_start >= search_end ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	/* Read-only walk of the commit root; no tree locks needed. */
	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_backwards(root, &key, path);
	if (ret < 0)
		goto out;

	/* Walk dev extents in order, measuring the gaps between them. */
	while (search_start < search_end) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_end)
			break;

		if (key.offset > search_start) {
			/* Gap before this extent: a candidate hole. */
			hole_size = key.offset - search_start;
			dev_extent_hole_check(device, &search_start, &hole_size,
					      num_bytes);

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than which we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;
		if (dev_extent_hole_check(device, &search_start, &hole_size,
					  num_bytes)) {
			/* Hole moved: rescan from the new position. */
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

	ASSERT(max_hole_start + max_hole_size <= search_end);
out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}
17710b86a832SChris Mason
/*
 * Delete the dev extent item covering byte @start on @device from the
 * device tree, reporting the extent's length in @dev_extent_len.
 *
 * When no item keyed exactly at @start exists, the previous dev extent item
 * is taken and must cover @start (enforced by BUG_ON), after which the
 * search restarts at that item's key.
 */
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				 struct btrfs_device *device,
				 u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		/* No exact match: fall back to the preceding dev extent. */
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		/* The previous extent must contain @start. */
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret == 0)
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
out:
	btrfs_free_path(path);
	return ret;
}
18258f18cf13SChris Mason
find_next_chunk(struct btrfs_fs_info * fs_info)18266df9a95eSJosef Bacik static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
18270b86a832SChris Mason {
18286df9a95eSJosef Bacik struct extent_map_tree *em_tree;
18296df9a95eSJosef Bacik struct extent_map *em;
18306df9a95eSJosef Bacik struct rb_node *n;
18316df9a95eSJosef Bacik u64 ret = 0;
18320b86a832SChris Mason
1833c8bf1b67SDavid Sterba em_tree = &fs_info->mapping_tree;
18346df9a95eSJosef Bacik read_lock(&em_tree->lock);
183507e1ce09SLiu Bo n = rb_last(&em_tree->map.rb_root);
18366df9a95eSJosef Bacik if (n) {
18376df9a95eSJosef Bacik em = rb_entry(n, struct extent_map, rb_node);
18386df9a95eSJosef Bacik ret = em->start + em->len;
1839e17cade2SChris Mason }
18406df9a95eSJosef Bacik read_unlock(&em_tree->lock);
18416df9a95eSJosef Bacik
18420b86a832SChris Mason return ret;
18430b86a832SChris Mason }
18440b86a832SChris Mason
/*
 * Find the next available device id.
 *
 * Searches the chunk tree for the last BTRFS_DEV_ITEM_KEY item and stores
 * one past its devid in @devid_ret (1 when no device item exists yet).
 * Returns 0 on success, a negative errno on failure, or -EUCLEAN if an
 * item keyed at devid (u64)-1 is found, which indicates corruption.
 */
static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Seek to the end of the device items with the largest possible key. */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	if (ret == 0) {
		/* Corruption */
		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
		ret = -EUCLEAN;
		goto error;
	}

	/* Step back to the last real device item, if any. */
	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}
18870b86a832SChris Mason
18880b86a832SChris Mason /*
18890b86a832SChris Mason * the device information is stored in the chunk root
18900b86a832SChris Mason * the btrfs_device struct should be fully filled in
18910b86a832SChris Mason */
btrfs_add_dev_item(struct btrfs_trans_handle * trans,struct btrfs_device * device)1892c74a0b02SAnand Jain static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
18930b86a832SChris Mason struct btrfs_device *device)
18940b86a832SChris Mason {
18950b86a832SChris Mason int ret;
18960b86a832SChris Mason struct btrfs_path *path;
18970b86a832SChris Mason struct btrfs_dev_item *dev_item;
18980b86a832SChris Mason struct extent_buffer *leaf;
18990b86a832SChris Mason struct btrfs_key key;
19000b86a832SChris Mason unsigned long ptr;
19010b86a832SChris Mason
19020b86a832SChris Mason path = btrfs_alloc_path();
19030b86a832SChris Mason if (!path)
19040b86a832SChris Mason return -ENOMEM;
19050b86a832SChris Mason
19060b86a832SChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
19070b86a832SChris Mason key.type = BTRFS_DEV_ITEM_KEY;
19082b82032cSYan Zheng key.offset = device->devid;
19090b86a832SChris Mason
19102bb2e00eSFilipe Manana btrfs_reserve_chunk_metadata(trans, true);
19118e87e856SNikolay Borisov ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
19128e87e856SNikolay Borisov &key, sizeof(*dev_item));
19132bb2e00eSFilipe Manana btrfs_trans_release_chunk_metadata(trans);
19140b86a832SChris Mason if (ret)
19150b86a832SChris Mason goto out;
19160b86a832SChris Mason
19170b86a832SChris Mason leaf = path->nodes[0];
19180b86a832SChris Mason dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
19190b86a832SChris Mason
19200b86a832SChris Mason btrfs_set_device_id(leaf, dev_item, device->devid);
19212b82032cSYan Zheng btrfs_set_device_generation(leaf, dev_item, 0);
19220b86a832SChris Mason btrfs_set_device_type(leaf, dev_item, device->type);
19230b86a832SChris Mason btrfs_set_device_io_align(leaf, dev_item, device->io_align);
19240b86a832SChris Mason btrfs_set_device_io_width(leaf, dev_item, device->io_width);
19250b86a832SChris Mason btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
19267cc8e58dSMiao Xie btrfs_set_device_total_bytes(leaf, dev_item,
19277cc8e58dSMiao Xie btrfs_device_get_disk_total_bytes(device));
19287cc8e58dSMiao Xie btrfs_set_device_bytes_used(leaf, dev_item,
19297cc8e58dSMiao Xie btrfs_device_get_bytes_used(device));
1930e17cade2SChris Mason btrfs_set_device_group(leaf, dev_item, 0);
1931e17cade2SChris Mason btrfs_set_device_seek_speed(leaf, dev_item, 0);
1932e17cade2SChris Mason btrfs_set_device_bandwidth(leaf, dev_item, 0);
1933c3027eb5SChris Mason btrfs_set_device_start_offset(leaf, dev_item, 0);
19340b86a832SChris Mason
1935410ba3a2SGeert Uytterhoeven ptr = btrfs_device_uuid(dev_item);
1936e17cade2SChris Mason write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
19371473b24eSGeert Uytterhoeven ptr = btrfs_device_fsid(dev_item);
1938de37aa51SNikolay Borisov write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1939de37aa51SNikolay Borisov ptr, BTRFS_FSID_SIZE);
1940d5e09e38SFilipe Manana btrfs_mark_buffer_dirty(trans, leaf);
19410b86a832SChris Mason
19422b82032cSYan Zheng ret = 0;
19430b86a832SChris Mason out:
19440b86a832SChris Mason btrfs_free_path(path);
19450b86a832SChris Mason return ret;
19460b86a832SChris Mason }
19478f18cf13SChris Mason
19485a1972bdSQu Wenruo /*
19495a1972bdSQu Wenruo * Function to update ctime/mtime for a given device path.
19505a1972bdSQu Wenruo * Mainly used for ctime/mtime based probe like libblkid.
195154fde91fSJosef Bacik *
195254fde91fSJosef Bacik * We don't care about errors here, this is just to be kind to userspace.
19535a1972bdSQu Wenruo */
update_dev_time(const char * device_path)195454fde91fSJosef Bacik static void update_dev_time(const char *device_path)
19555a1972bdSQu Wenruo {
195654fde91fSJosef Bacik struct path path;
195754fde91fSJosef Bacik int ret;
19585a1972bdSQu Wenruo
195954fde91fSJosef Bacik ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
196054fde91fSJosef Bacik if (ret)
19615a1972bdSQu Wenruo return;
19628f96a5bfSJosef Bacik
1963913e9928SJeff Layton inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION);
196454fde91fSJosef Bacik path_put(&path);
19655a1972bdSQu Wenruo }
19665a1972bdSQu Wenruo
btrfs_rm_dev_item(struct btrfs_trans_handle * trans,struct btrfs_device * device)1967bbac5869SQu Wenruo static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
1968bbac5869SQu Wenruo struct btrfs_device *device)
1969a061fc8dSChris Mason {
1970f331a952SDavid Sterba struct btrfs_root *root = device->fs_info->chunk_root;
1971a061fc8dSChris Mason int ret;
1972a061fc8dSChris Mason struct btrfs_path *path;
1973a061fc8dSChris Mason struct btrfs_key key;
1974a061fc8dSChris Mason
1975a061fc8dSChris Mason path = btrfs_alloc_path();
1976a061fc8dSChris Mason if (!path)
1977a061fc8dSChris Mason return -ENOMEM;
1978a061fc8dSChris Mason
1979a061fc8dSChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1980a061fc8dSChris Mason key.type = BTRFS_DEV_ITEM_KEY;
1981a061fc8dSChris Mason key.offset = device->devid;
1982a061fc8dSChris Mason
19832bb2e00eSFilipe Manana btrfs_reserve_chunk_metadata(trans, false);
1984a061fc8dSChris Mason ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
19852bb2e00eSFilipe Manana btrfs_trans_release_chunk_metadata(trans);
19865e9f2ad5SNikolay Borisov if (ret) {
19875e9f2ad5SNikolay Borisov if (ret > 0)
1988a061fc8dSChris Mason ret = -ENOENT;
1989a061fc8dSChris Mason goto out;
1990a061fc8dSChris Mason }
1991a061fc8dSChris Mason
1992a061fc8dSChris Mason ret = btrfs_del_item(trans, root, path);
1993a061fc8dSChris Mason out:
1994a061fc8dSChris Mason btrfs_free_path(path);
1995a061fc8dSChris Mason return ret;
1996a061fc8dSChris Mason }
1997a061fc8dSChris Mason
19983cc31a0dSDavid Sterba /*
19993cc31a0dSDavid Sterba * Verify that @num_devices satisfies the RAID profile constraints in the whole
20003cc31a0dSDavid Sterba * filesystem. It's up to the caller to adjust that number regarding eg. device
20013cc31a0dSDavid Sterba * replace.
20023cc31a0dSDavid Sterba */
btrfs_check_raid_min_devices(struct btrfs_fs_info * fs_info,u64 num_devices)20033cc31a0dSDavid Sterba static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
20043cc31a0dSDavid Sterba u64 num_devices)
2005a061fc8dSChris Mason {
2006a061fc8dSChris Mason u64 all_avail;
2007de98ced9SMiao Xie unsigned seq;
2008418775a2SDavid Sterba int i;
2009a061fc8dSChris Mason
2010de98ced9SMiao Xie do {
2011bd45ffbcSAnand Jain seq = read_seqbegin(&fs_info->profiles_lock);
2012de98ced9SMiao Xie
2013bd45ffbcSAnand Jain all_avail = fs_info->avail_data_alloc_bits |
2014bd45ffbcSAnand Jain fs_info->avail_system_alloc_bits |
2015bd45ffbcSAnand Jain fs_info->avail_metadata_alloc_bits;
2016bd45ffbcSAnand Jain } while (read_seqretry(&fs_info->profiles_lock, seq));
2017f1fa7f26SAnand Jain
2018418775a2SDavid Sterba for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
201941a6e891SAnand Jain if (!(all_avail & btrfs_raid_array[i].bg_flag))
2020418775a2SDavid Sterba continue;
2021a061fc8dSChris Mason
2022efc222f8SAnand Jain if (num_devices < btrfs_raid_array[i].devs_min)
2023efc222f8SAnand Jain return btrfs_raid_array[i].mindev_error;
2024bd45ffbcSAnand Jain }
2025bd45ffbcSAnand Jain
2026bd45ffbcSAnand Jain return 0;
2027f1fa7f26SAnand Jain }
2028f1fa7f26SAnand Jain
btrfs_find_next_active_device(struct btrfs_fs_devices * fs_devs,struct btrfs_device * device)2029c9162bdfSOmar Sandoval static struct btrfs_device * btrfs_find_next_active_device(
2030c9162bdfSOmar Sandoval struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
203188acff64SAnand Jain {
203288acff64SAnand Jain struct btrfs_device *next_device;
203388acff64SAnand Jain
203488acff64SAnand Jain list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
203588acff64SAnand Jain if (next_device != device &&
2036e6e674bdSAnand Jain !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
2037e6e674bdSAnand Jain && next_device->bdev)
203888acff64SAnand Jain return next_device;
203988acff64SAnand Jain }
204088acff64SAnand Jain
204188acff64SAnand Jain return NULL;
204288acff64SAnand Jain }
204388acff64SAnand Jain
204488acff64SAnand Jain /*
2045d24fa5c1SAnand Jain * Helper function to check if the given device is part of s_bdev / latest_dev
204688acff64SAnand Jain * and replace it with the provided or the next active device, in the context
204788acff64SAnand Jain * where this function called, there should be always be another device (or
204888acff64SAnand Jain * this_dev) which is active.
204988acff64SAnand Jain */
btrfs_assign_next_active_device(struct btrfs_device * device,struct btrfs_device * next_device)2050b105e927SDavid Sterba void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
2051e493e8f9SAnand Jain struct btrfs_device *next_device)
205288acff64SAnand Jain {
2053d6507cf1SNikolay Borisov struct btrfs_fs_info *fs_info = device->fs_info;
205488acff64SAnand Jain
2055e493e8f9SAnand Jain if (!next_device)
205688acff64SAnand Jain next_device = btrfs_find_next_active_device(fs_info->fs_devices,
205788acff64SAnand Jain device);
205888acff64SAnand Jain ASSERT(next_device);
205988acff64SAnand Jain
206088acff64SAnand Jain if (fs_info->sb->s_bdev &&
206188acff64SAnand Jain (fs_info->sb->s_bdev == device->bdev))
206288acff64SAnand Jain fs_info->sb->s_bdev = next_device->bdev;
206388acff64SAnand Jain
2064d24fa5c1SAnand Jain if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
2065d24fa5c1SAnand Jain fs_info->fs_devices->latest_dev = next_device;
206688acff64SAnand Jain }
206788acff64SAnand Jain
20681da73967SAnand Jain /*
20691da73967SAnand Jain * Return btrfs_fs_devices::num_devices excluding the device that's being
20701da73967SAnand Jain * currently replaced.
20711da73967SAnand Jain */
btrfs_num_devices(struct btrfs_fs_info * fs_info)20721da73967SAnand Jain static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
20731da73967SAnand Jain {
20741da73967SAnand Jain u64 num_devices = fs_info->fs_devices->num_devices;
20751da73967SAnand Jain
2076cb5583ddSDavid Sterba down_read(&fs_info->dev_replace.rwsem);
20771da73967SAnand Jain if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
20781da73967SAnand Jain ASSERT(num_devices > 1);
20791da73967SAnand Jain num_devices--;
20801da73967SAnand Jain }
2081cb5583ddSDavid Sterba up_read(&fs_info->dev_replace.rwsem);
20821da73967SAnand Jain
20831da73967SAnand Jain return num_devices;
20841da73967SAnand Jain }
20851da73967SAnand Jain
btrfs_scratch_superblock(struct btrfs_fs_info * fs_info,struct block_device * bdev,int copy_num)20860e0078f7SChristoph Hellwig static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info,
20870e0078f7SChristoph Hellwig struct block_device *bdev, int copy_num)
20886fbceb9fSJohannes Thumshirn {
20896fbceb9fSJohannes Thumshirn struct btrfs_super_block *disk_super;
209026ecf243SChristoph Hellwig const size_t len = sizeof(disk_super->magic);
209126ecf243SChristoph Hellwig const u64 bytenr = btrfs_sb_offset(copy_num);
20928f32380dSJohannes Thumshirn int ret;
20938f32380dSJohannes Thumshirn
209426ecf243SChristoph Hellwig disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr);
20958f32380dSJohannes Thumshirn if (IS_ERR(disk_super))
20960e0078f7SChristoph Hellwig return;
209712659251SNaohiro Aota
209826ecf243SChristoph Hellwig memset(&disk_super->magic, 0, len);
209926ecf243SChristoph Hellwig folio_mark_dirty(virt_to_folio(disk_super));
210026ecf243SChristoph Hellwig btrfs_release_disk_super(disk_super);
210126ecf243SChristoph Hellwig
210226ecf243SChristoph Hellwig ret = sync_blockdev_range(bdev, bytenr, bytenr + len - 1);
21038f32380dSJohannes Thumshirn if (ret)
21040e0078f7SChristoph Hellwig btrfs_warn(fs_info, "error clearing superblock number %d (%d)",
21058f32380dSJohannes Thumshirn copy_num, ret);
21060e0078f7SChristoph Hellwig }
21078f32380dSJohannes Thumshirn
btrfs_scratch_superblocks(struct btrfs_fs_info * fs_info,struct block_device * bdev,const char * device_path)21080e0078f7SChristoph Hellwig void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
21090e0078f7SChristoph Hellwig struct block_device *bdev,
21100e0078f7SChristoph Hellwig const char *device_path)
21110e0078f7SChristoph Hellwig {
21120e0078f7SChristoph Hellwig int copy_num;
21130e0078f7SChristoph Hellwig
21140e0078f7SChristoph Hellwig if (!bdev)
21150e0078f7SChristoph Hellwig return;
21160e0078f7SChristoph Hellwig
21170e0078f7SChristoph Hellwig for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
21180e0078f7SChristoph Hellwig if (bdev_is_zoned(bdev))
21190e0078f7SChristoph Hellwig btrfs_reset_sb_log_zones(bdev, copy_num);
21200e0078f7SChristoph Hellwig else
21210e0078f7SChristoph Hellwig btrfs_scratch_superblock(fs_info, bdev, copy_num);
21226fbceb9fSJohannes Thumshirn }
21236fbceb9fSJohannes Thumshirn
21246fbceb9fSJohannes Thumshirn /* Notify udev that device has changed */
21256fbceb9fSJohannes Thumshirn btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
21266fbceb9fSJohannes Thumshirn
21276fbceb9fSJohannes Thumshirn /* Update ctime/mtime for device path for libblkid */
212854fde91fSJosef Bacik update_dev_time(device_path);
21296fbceb9fSJohannes Thumshirn }
21306fbceb9fSJohannes Thumshirn
/*
 * Remove a device from a mounted filesystem.
 *
 * @fs_info:	the filesystem
 * @args:	lookup args identifying the device (devid, uuid/fsid or
 *		"missing")
 * @bdev:	returned block device of the removed device
 * @holder:	returned holder of @bdev
 *
 * Shrinks the device to zero (relocating its chunks), deletes its dev item
 * in a transaction, unlinks it from the in-memory device lists and
 * scratches its superblocks.  The bdev is handed back via @bdev/@holder
 * because the final blkdev_put() must be done by the caller (see the
 * comment before the scratch step below).
 *
 * Returns 0 on success, a negative errno, or a positive BTRFS_ERROR_DEV_*
 * code for conditions reported to the ioctl caller.
 */
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
		    struct btrfs_dev_lookup_args *args,
		    struct block_device **bdev, void **holder)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 num_devices;
	int ret = 0;

	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
		btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
		return -EINVAL;
	}

	/*
	 * The device list in fs_devices is accessed without locks (neither
	 * uuid_mutex nor device_list_mutex) as it won't change on a mounted
	 * filesystem and another device rm cannot run.
	 */
	num_devices = btrfs_num_devices(fs_info);

	/* Check the RAID profile constraints with one device less. */
	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
	if (ret)
		return ret;

	device = btrfs_find_device(fs_info->fs_devices, args);
	if (!device) {
		if (args->missing)
			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
		else
			ret = -ENOENT;
		return ret;
	}

	if (btrfs_pinned_by_swapfile(fs_info, device)) {
		btrfs_warn_in_rcu(fs_info,
		  "cannot remove device %s (devid %llu) due to active swapfile",
				  btrfs_dev_name(device), device->devid);
		return -ETXTBSY;
	}

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return BTRFS_ERROR_DEV_TGT_REPLACE;

	/* Refuse to remove the last writeable device. */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    fs_info->fs_devices->rw_devices == 1)
		return BTRFS_ERROR_DEV_ONLY_WRITABLE;

	/*
	 * Take the device out of the allocation list first so no new chunks
	 * land on it while we shrink it; undone in error_undo on failure.
	 */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_del_init(&device->dev_alloc_list);
		device->fs_devices->rw_devices--;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	/* Relocate all chunks away by shrinking the device to size 0. */
	ret = btrfs_shrink_device(device, 0);
	if (ret)
		goto error_undo;

	trans = btrfs_start_transaction(fs_info->chunk_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error_undo;
	}

	ret = btrfs_rm_dev_item(trans, device);
	if (ret) {
		/* Any error in dev item removal is critical */
		btrfs_crit(fs_info,
			   "failed to remove device item for devid %llu: %d",
			   device->devid, ret);
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}

	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	btrfs_scrub_cancel_dev(device);

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
	 * the device supers. Whoever is writing all supers, should
	 * lock the device list mutex before getting the number of
	 * devices in the super block (super_copy). Conversely,
	 * whoever updates the number of devices in the super block
	 * (super_copy) should hold the device list mutex.
	 */

	/*
	 * In normal cases the cur_devices == fs_devices. But in case
	 * of deleting a seed device, the cur_devices should point to
	 * its own fs_devices listed under the fs_devices->seed_list.
	 */
	cur_devices = device->fs_devices;
	mutex_lock(&fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	cur_devices->num_devices--;
	cur_devices->total_devices--;
	/* Update total_devices of the parent fs_devices if it's seed */
	if (cur_devices != fs_devices)
		fs_devices->total_devices--;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		cur_devices->missing_devices--;

	/* Repoint sb->s_bdev / latest_dev if they used this device. */
	btrfs_assign_next_active_device(device, NULL);

	if (device->bdev) {
		cur_devices->open_devices--;
		/* remove sysfs entry */
		btrfs_sysfs_remove_device(device);
	}

	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * At this point, the device is zero sized and detached from the
	 * devices list. All that's left is to zero out the old supers and
	 * free the device.
	 *
	 * We cannot call btrfs_close_bdev() here because we're holding the sb
	 * write lock, and blkdev_put() will pull in the ->open_mutex on the
	 * block device and it's dependencies. Instead just flush the device
	 * and let the caller do the final blkdev_put.
	 */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		btrfs_scratch_superblocks(fs_info, device->bdev,
					  device->name->str);
		if (device->bdev) {
			sync_blockdev(device->bdev);
			invalidate_bdev(device->bdev);
		}
	}

	*bdev = device->bdev;
	*holder = device->holder;
	synchronize_rcu();
	btrfs_free_device(device);

	/*
	 * This can happen if cur_devices is the private seed devices list. We
	 * cannot call close_fs_devices() here because it expects the uuid_mutex
	 * to be held, but in fact we don't need that for the private
	 * seed_devices, we can simply decrement cur_devices->opened and then
	 * remove it from our list and free the fs_devices.
	 */
	if (cur_devices->num_devices == 0) {
		list_del_init(&cur_devices->seed_list);
		ASSERT(cur_devices->opened == 1);
		cur_devices->opened--;
		free_fs_devices(cur_devices);
	}

	ret = btrfs_commit_transaction(trans);

	return ret;

error_undo:
	/* Put the device back on the allocation list. */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_add(&device->dev_alloc_list,
			 &fs_devices->alloc_list);
		device->fs_devices->rw_devices++;
		mutex_unlock(&fs_info->chunk_mutex);
	}
	return ret;
}
2304a061fc8dSChris Mason
/*
 * Unlink the replace source device @srcdev from its fs_devices and adjust
 * the device counters.  Caller must hold device_list_mutex; the actual
 * close/free of the device happens later in
 * btrfs_rm_dev_replace_free_srcdev().
 */
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices;

	lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);

	/*
	 * in case of fs with no seed, srcdev->fs_devices will point
	 * to fs_devices of fs_info. However when the dev being replaced is
	 * a seed dev it will point to the seed's local fs_devices. In short
	 * srcdev will have its correct fs_devices in both the cases.
	 */
	fs_devices = srcdev->fs_devices;

	list_del_rcu(&srcdev->dev_list);
	list_del(&srcdev->dev_alloc_list);
	fs_devices->num_devices--;
	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
		fs_devices->missing_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
		fs_devices->rw_devices--;

	if (srcdev->bdev)
		fs_devices->open_devices--;
}
2331084b6e7cSQu Wenruo
/*
 * Close and free the replace source device @srcdev, previously unlinked by
 * btrfs_rm_dev_replace_remove_srcdev().  If this empties its fs_devices
 * (a replaced seed device), tear down that seed fs_devices as well.
 */
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;

	mutex_lock(&uuid_mutex);

	btrfs_close_bdev(srcdev);
	/* Wait for RCU readers of the device list before freeing. */
	synchronize_rcu();
	btrfs_free_device(srcdev);

	/* if this is no devs we rather delete the fs_devices */
	if (!fs_devices->num_devices) {
		/*
		 * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target added to the sprout FS, so there will be no more
		 * device left under the seed FS.
		 */
		ASSERT(fs_devices->seeding);

		list_del_init(&fs_devices->seed_list);
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	mutex_unlock(&uuid_mutex);
}
2358e93c89c1SStefan Behrens
/*
 * Tear down the replace target device @tgtdev: unlink it from the device
 * list under device_list_mutex, scratch its superblocks, then close and
 * free it.
 */
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
{
	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;

	mutex_lock(&fs_devices->device_list_mutex);

	btrfs_sysfs_remove_device(tgtdev);

	if (tgtdev->bdev)
		fs_devices->open_devices--;

	fs_devices->num_devices--;

	/* Repoint sb->s_bdev / latest_dev if they used this device. */
	btrfs_assign_next_active_device(tgtdev, NULL);

	list_del_rcu(&tgtdev->dev_list);

	mutex_unlock(&fs_devices->device_list_mutex);

	btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
				  tgtdev->name->str);

	btrfs_close_bdev(tgtdev);
	/* Wait for RCU readers of the device list before freeing. */
	synchronize_rcu();
	btrfs_free_device(tgtdev);
}
2385e93c89c1SStefan Behrens
238643dd529aSDavid Sterba /*
238743dd529aSDavid Sterba * Populate args from device at path.
2388faa775c4SJosef Bacik *
2389faa775c4SJosef Bacik * @fs_info: the filesystem
2390faa775c4SJosef Bacik * @args: the args to populate
2391faa775c4SJosef Bacik * @path: the path to the device
2392faa775c4SJosef Bacik *
2393faa775c4SJosef Bacik * This will read the super block of the device at @path and populate @args with
2394faa775c4SJosef Bacik * the devid, fsid, and uuid. This is meant to be used for ioctls that need to
2395faa775c4SJosef Bacik * lookup a device to operate on, but need to do it before we take any locks.
2396faa775c4SJosef Bacik * This properly handles the special case of "missing" that a user may pass in,
2397faa775c4SJosef Bacik * and does some basic sanity checks. The caller must make sure that @path is
2398faa775c4SJosef Bacik * properly NUL terminated before calling in, and must call
2399faa775c4SJosef Bacik * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
2400faa775c4SJosef Bacik * uuid buffers.
2401faa775c4SJosef Bacik *
2402faa775c4SJosef Bacik * Return: 0 for success, -errno for failure
2403faa775c4SJosef Bacik */
btrfs_get_dev_args_from_path(struct btrfs_fs_info * fs_info,struct btrfs_dev_lookup_args * args,const char * path)2404faa775c4SJosef Bacik int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
2405faa775c4SJosef Bacik struct btrfs_dev_lookup_args *args,
2406faa775c4SJosef Bacik const char *path)
24077ba15b7dSStefan Behrens {
24087ba15b7dSStefan Behrens struct btrfs_super_block *disk_super;
24097ba15b7dSStefan Behrens struct block_device *bdev;
2410faa775c4SJosef Bacik int ret;
24117ba15b7dSStefan Behrens
2412faa775c4SJosef Bacik if (!path || !path[0])
2413faa775c4SJosef Bacik return -EINVAL;
2414faa775c4SJosef Bacik if (!strcmp(path, "missing")) {
2415faa775c4SJosef Bacik args->missing = true;
2416faa775c4SJosef Bacik return 0;
2417faa775c4SJosef Bacik }
2418faa775c4SJosef Bacik
2419faa775c4SJosef Bacik args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
2420faa775c4SJosef Bacik args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
2421faa775c4SJosef Bacik if (!args->uuid || !args->fsid) {
2422faa775c4SJosef Bacik btrfs_put_dev_args_from_path(args);
2423faa775c4SJosef Bacik return -ENOMEM;
2424faa775c4SJosef Bacik }
2425faa775c4SJosef Bacik
242605bdb996SChristoph Hellwig ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
2427faa775c4SJosef Bacik &bdev, &disk_super);
24289ea0106aSZixuan Fu if (ret) {
24299ea0106aSZixuan Fu btrfs_put_dev_args_from_path(args);
2430faa775c4SJosef Bacik return ret;
24319ea0106aSZixuan Fu }
24329ea0106aSZixuan Fu
2433faa775c4SJosef Bacik args->devid = btrfs_stack_device_id(&disk_super->dev_item);
2434faa775c4SJosef Bacik memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
24357239ff4bSNikolay Borisov if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2436faa775c4SJosef Bacik memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
24377239ff4bSNikolay Borisov else
2438faa775c4SJosef Bacik memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
24398f32380dSJohannes Thumshirn btrfs_release_disk_super(disk_super);
24402736e8eeSChristoph Hellwig blkdev_put(bdev, NULL);
2441faa775c4SJosef Bacik return 0;
24427ba15b7dSStefan Behrens }
24437ba15b7dSStefan Behrens
24442b82032cSYan Zheng /*
2445faa775c4SJosef Bacik * Only use this jointly with btrfs_get_dev_args_from_path() because we will
2446faa775c4SJosef Bacik * allocate our ->uuid and ->fsid pointers, everybody else uses local variables
2447faa775c4SJosef Bacik * that don't need to be freed.
24485c5c0df0SDavid Sterba */
btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args * args)2449faa775c4SJosef Bacik void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
2450faa775c4SJosef Bacik {
2451faa775c4SJosef Bacik kfree(args->uuid);
2452faa775c4SJosef Bacik kfree(args->fsid);
2453faa775c4SJosef Bacik args->uuid = NULL;
2454faa775c4SJosef Bacik args->fsid = NULL;
2455faa775c4SJosef Bacik }
2456faa775c4SJosef Bacik
/*
 * Look up a device either by a non-zero devid or by the identifiers read
 * from the superblock at @device_path.  Returns the device or an ERR_PTR
 * (-ENOENT when no matching device is known).
 */
struct btrfs_device *btrfs_find_device_by_devspec(
		struct btrfs_fs_info *fs_info, u64 devid,
		const char *device_path)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_device *device;
	int ret;

	if (devid) {
		/* An explicit devid takes precedence over the path. */
		args.devid = devid;
		device = btrfs_find_device(fs_info->fs_devices, &args);
	} else {
		/* Read devid/uuid/fsid off the device's superblock. */
		ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
		if (ret)
			return ERR_PTR(ret);
		device = btrfs_find_device(fs_info->fs_devices, &args);
		btrfs_put_dev_args_from_path(&args);
	}

	if (!device)
		return ERR_PTR(-ENOENT);
	return device;
}
248224e0474bSAnand Jain
/*
 * Prepare sprouting a read-write filesystem from a seed filesystem: allocate
 * the fs_devices that will hold the seed devices and register a clone of the
 * original seed fs_devices in fs_uuids.  The actual device splice happens
 * later in btrfs_setup_sprout().
 *
 * Returns the new (still empty) seed fs_devices or an ERR_PTR.
 */
static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;

	lockdep_assert_held(&uuid_mutex);
	/* Sprouting only makes sense on a filesystem flagged as seed. */
	if (!fs_devices->seeding)
		return ERR_PTR(-EINVAL);

	/*
	 * Private copy of the seed devices, anchored at
	 * fs_info->fs_devices->seed_list
	 */
	seed_devices = alloc_fs_devices(NULL, NULL);
	if (IS_ERR(seed_devices))
		return seed_devices;

	/*
	 * It's necessary to retain a copy of the original seed fs_devices in
	 * fs_uuids so that filesystems which have been seeded can successfully
	 * reference the seed device from open_seed_devices. This also supports
	 * multiple fs seed.
	 */
	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return old_devices;
	}

	list_add(&old_devices->fs_list, &fs_uuids);

	/*
	 * Start seed_devices as a byte copy of the current fs_devices, then
	 * reset its lists/lock; the devices themselves are moved over in
	 * btrfs_setup_sprout().
	 */
	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	return seed_devices;
}
2523849eae5eSAnand Jain
2524849eae5eSAnand Jain /*
2525849eae5eSAnand Jain * Splice seed devices into the sprout fs_devices.
2526849eae5eSAnand Jain * Generate a new fsid for the sprouted read-write filesystem.
2527849eae5eSAnand Jain */
static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
			       struct btrfs_fs_devices *seed_devices)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	/*
	 * We are updating the fsid, the thread leading to device_list_add()
	 * could race, so uuid_mutex is needed.
	 */
	lockdep_assert_held(&uuid_mutex);

	/*
	 * The threads listed below may traverse dev_list but can do that without
	 * device_list_mutex:
	 * - All device ops and balance - as we are in btrfs_exclop_start.
	 * - Various dev_list readers - are using RCU.
	 * - btrfs_ioctl_fitrim() - is using RCU.
	 *
	 * For-read threads as below are using device_list_mutex:
	 * - Readonly scrub btrfs_scrub_dev()
	 * - Readonly scrub btrfs_scrub_progress()
	 * - btrfs_get_dev_stats()
	 */
	lockdep_assert_held(&fs_devices->device_list_mutex);

	/* Move all current devices over to the seed fs_devices. */
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			      synchronize_rcu);
	list_for_each_entry(device, &seed_devices->devices, dev_list)
		device->fs_devices = seed_devices;

	/* The sprout starts out writable and with no devices of its own. */
	fs_devices->seeding = false;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->missing_devices = 0;
	fs_devices->rotating = false;
	list_add(&seed_devices->seed_list, &fs_devices->seed_list);

	/* Give the sprouted filesystem its own identity. */
	generate_random_uuid(fs_devices->fsid);
	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);

	/* The sprout is read-write, so it is no longer a seed itself. */
	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);
}
25762b82032cSYan Zheng
25772b82032cSYan Zheng /*
257801327610SNicholas D Steeves * Store the expected generation for seed devices in device items.
25792b82032cSYan Zheng */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dev_item *dev_item;
	struct btrfs_device *device;
	struct btrfs_key key;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Walk every DEV_ITEM in the chunk tree, starting from the lowest. */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_DEV_ITEM_KEY;

	while (1) {
		/* The search may COW tree blocks, so reserve chunk metadata. */
		btrfs_reserve_chunk_metadata(trans, false);
		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		btrfs_trans_release_chunk_metadata(trans);
		if (ret < 0)
			goto error;

		leaf = path->nodes[0];
next_slot:
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;	/* No more leaves: we are done. */
			if (ret < 0)
				goto error;
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			/* Restart the (write) search at the key just read. */
			btrfs_release_path(path);
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
		    key.type != BTRFS_DEV_ITEM_KEY)
			break;

		dev_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_dev_item);
		args.devid = btrfs_device_id(leaf, dev_item);
		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
				   BTRFS_UUID_SIZE);
		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
				   BTRFS_FSID_SIZE);
		args.uuid = dev_uuid;
		args.fsid = fs_uuid;
		device = btrfs_find_device(fs_info->fs_devices, &args);
		BUG_ON(!device); /* Logic error */

		/*
		 * Devices that ended up on the seed fs_devices get their
		 * in-memory generation written into the device item.
		 */
		if (device->fs_devices->seeding) {
			btrfs_set_device_generation(leaf, dev_item,
						    device->generation);
			btrfs_mark_buffer_dirty(trans, leaf);
		}

		path->slots[0]++;
		goto next_slot;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}
26542b82032cSYan Zheng
/*
 * Add the block device at @device_path as a new read-write device to the
 * filesystem.  If the filesystem is currently a seed, this sprouts a new
 * writable filesystem on top of it (new fsid, seed devices moved to
 * fs_devices->seed_list).
 *
 * Returns 0 on success or a negative errno; on failure all in-memory and
 * on-disk accounting changes are rolled back via the error labels below.
 */
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct super_block *sb = fs_info->sb;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *seed_devices = NULL;
	u64 orig_super_total_bytes;
	u64 orig_super_num_devices;
	int ret = 0;
	bool seeding_dev = false;
	bool locked = false;

	/* Adding to a read-only fs is only allowed when sprouting a seed. */
	if (sb_rdonly(sb) && !fs_devices->seeding)
		return -EROFS;

	bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE,
				  fs_info->bdev_holder, NULL);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	/* Zoned filesystems only accept devices with a matching zone model. */
	if (!btrfs_check_device_zone_type(fs_info, bdev)) {
		ret = -EINVAL;
		goto error;
	}

	/*
	 * Sprouting rewrites the fsid, so take s_umount and uuid_mutex to
	 * keep mounts and device scanning away while we do it.
	 */
	if (fs_devices->seeding) {
		seeding_dev = true;
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
		locked = true;
	}

	sync_blockdev(bdev);

	/* Reject a device that is already part of this filesystem. */
	rcu_read_lock();
	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
		if (device->bdev == bdev) {
			ret = -EEXIST;
			rcu_read_unlock();
			goto error;
		}
	}
	rcu_read_unlock();

	device = btrfs_alloc_device(fs_info, NULL, NULL, device_path);
	if (IS_ERR(device)) {
		/* we can safely leave the fs_devices entry around */
		ret = PTR_ERR(device);
		goto error;
	}

	device->fs_info = fs_info;
	device->bdev = bdev;
	ret = lookup_bdev(device_path, &device->devt);
	if (ret)
		goto error_free_device;

	ret = btrfs_get_dev_zone_info(device, false);
	if (ret)
		goto error_free_device;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error_free_zone;
	}

	/* Initialize the in-memory device to the filesystem's geometry. */
	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	device->generation = trans->transid;
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
	device->total_bytes =
		round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
	device->disk_total_bytes = device->total_bytes;
	device->commit_total_bytes = device->total_bytes;
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
	device->holder = fs_info->bdev_holder;
	device->dev_stats_valid = 1;
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);

	if (seeding_dev) {
		/* GFP_KERNEL allocation must not be under device_list_mutex */
		seed_devices = btrfs_init_sprout(fs_info);
		if (IS_ERR(seed_devices)) {
			ret = PTR_ERR(seed_devices);
			btrfs_abort_transaction(trans, ret);
			goto error_trans;
		}
	}

	mutex_lock(&fs_devices->device_list_mutex);
	if (seeding_dev) {
		btrfs_setup_sprout(fs_info, seed_devices);
		btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
						device);
	}

	device->fs_devices = fs_devices;

	/* Publish the device and update all in-memory accounting. */
	mutex_lock(&fs_info->chunk_mutex);
	list_add_rcu(&device->dev_list, &fs_devices->devices);
	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
	fs_devices->num_devices++;
	fs_devices->open_devices++;
	fs_devices->rw_devices++;
	fs_devices->total_devices++;
	fs_devices->total_rw_bytes += device->total_bytes;

	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);

	if (!bdev_nonrot(bdev))
		fs_devices->rotating = true;

	/* Remember the old values so the error path can restore them. */
	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
	btrfs_set_super_total_bytes(fs_info->super_copy,
		round_down(orig_super_total_bytes + device->total_bytes,
			   fs_info->sectorsize));

	orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices + 1);

	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
	btrfs_clear_space_info_full(fs_info);

	mutex_unlock(&fs_info->chunk_mutex);

	/* Add sysfs device entry */
	btrfs_sysfs_add_device(device);

	mutex_unlock(&fs_devices->device_list_mutex);

	if (seeding_dev) {
		mutex_lock(&fs_info->chunk_mutex);
		ret = init_first_rw_device(trans);
		mutex_unlock(&fs_info->chunk_mutex);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}
	}

	ret = btrfs_add_dev_item(trans, device);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error_sysfs;
	}

	if (seeding_dev) {
		ret = btrfs_finish_sprout(trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}

		/*
		 * fs_devices now represents the newly sprouted filesystem and
		 * its fsid has been changed by btrfs_sprout_splice().
		 */
		btrfs_sysfs_update_sprout_fsid(fs_devices);
	}

	ret = btrfs_commit_transaction(trans);

	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
		locked = false;

		if (ret) /* transaction commit */
			return ret;

		/*
		 * Sys chunks still live on the seed; move them to the sprout
		 * so the seed stays untouched.  Failure here is non-fatal.
		 */
		ret = btrfs_relocate_sys_chunks(fs_info);
		if (ret < 0)
			btrfs_handle_fs_error(fs_info, ret,
				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			/* -ENOENT means no running transaction: nothing to do. */
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
			ret = PTR_ERR(trans);
			trans = NULL;
			goto error_sysfs;
		}
		ret = btrfs_commit_transaction(trans);
	}

	/*
	 * Now that we have written a new super block to this device, check all
	 * other fs_devices list if device_path alienates any other scanned
	 * device.
	 * We can ignore the return value as it typically returns -EINVAL and
	 * only succeeds if the device was an alien.
	 */
	btrfs_forget_devices(device->devt);

	/* Update ctime/mtime for blkid or udev */
	update_dev_time(device_path);

	return ret;

error_sysfs:
	/* Undo the sysfs entry and all in-memory/superblock accounting. */
	btrfs_sysfs_remove_device(device);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_del_rcu(&device->dev_list);
	list_del(&device->dev_alloc_list);
	fs_info->fs_devices->num_devices--;
	fs_info->fs_devices->open_devices--;
	fs_info->fs_devices->rw_devices--;
	fs_info->fs_devices->total_devices--;
	fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
	atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
	btrfs_set_super_total_bytes(fs_info->super_copy,
				    orig_super_total_bytes);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices);
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
	if (trans)
		btrfs_end_transaction(trans);
error_free_zone:
	btrfs_destroy_dev_zone_info(device);
error_free_device:
	btrfs_free_device(device);
error:
	blkdev_put(bdev, fs_info->bdev_holder);
	if (locked) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
	return ret;
}
2897788f20ebSChris Mason
btrfs_update_device(struct btrfs_trans_handle * trans,struct btrfs_device * device)2898d397712bSChris Mason static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
28990b86a832SChris Mason struct btrfs_device *device)
29000b86a832SChris Mason {
29010b86a832SChris Mason int ret;
29020b86a832SChris Mason struct btrfs_path *path;
29030b246afaSJeff Mahoney struct btrfs_root *root = device->fs_info->chunk_root;
29040b86a832SChris Mason struct btrfs_dev_item *dev_item;
29050b86a832SChris Mason struct extent_buffer *leaf;
29060b86a832SChris Mason struct btrfs_key key;
29070b86a832SChris Mason
29080b86a832SChris Mason path = btrfs_alloc_path();
29090b86a832SChris Mason if (!path)
29100b86a832SChris Mason return -ENOMEM;
29110b86a832SChris Mason
29120b86a832SChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
29130b86a832SChris Mason key.type = BTRFS_DEV_ITEM_KEY;
29140b86a832SChris Mason key.offset = device->devid;
29150b86a832SChris Mason
29160b86a832SChris Mason ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
29170b86a832SChris Mason if (ret < 0)
29180b86a832SChris Mason goto out;
29190b86a832SChris Mason
29200b86a832SChris Mason if (ret > 0) {
29210b86a832SChris Mason ret = -ENOENT;
29220b86a832SChris Mason goto out;
29230b86a832SChris Mason }
29240b86a832SChris Mason
29250b86a832SChris Mason leaf = path->nodes[0];
29260b86a832SChris Mason dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
29270b86a832SChris Mason
29280b86a832SChris Mason btrfs_set_device_id(leaf, dev_item, device->devid);
29290b86a832SChris Mason btrfs_set_device_type(leaf, dev_item, device->type);
29300b86a832SChris Mason btrfs_set_device_io_align(leaf, dev_item, device->io_align);
29310b86a832SChris Mason btrfs_set_device_io_width(leaf, dev_item, device->io_width);
29320b86a832SChris Mason btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
29337cc8e58dSMiao Xie btrfs_set_device_total_bytes(leaf, dev_item,
29347cc8e58dSMiao Xie btrfs_device_get_disk_total_bytes(device));
29357cc8e58dSMiao Xie btrfs_set_device_bytes_used(leaf, dev_item,
29367cc8e58dSMiao Xie btrfs_device_get_bytes_used(device));
2937d5e09e38SFilipe Manana btrfs_mark_buffer_dirty(trans, leaf);
29380b86a832SChris Mason
29390b86a832SChris Mason out:
29400b86a832SChris Mason btrfs_free_path(path);
29410b86a832SChris Mason return ret;
29420b86a832SChris Mason }
29430b86a832SChris Mason
/*
 * Grow @device to @new_size (rounded down to the sector size), updating the
 * superblock total, the fs_devices accounting and the on-disk device item.
 *
 * Returns 0 on success, -EACCES for a non-writable device, -EINVAL when the
 * size does not actually grow or the device is a replace target.
 */
int btrfs_grow_device(struct btrfs_trans_handle *trans,
		      struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	u64 old_total;
	u64 diff;
	int ret;

	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return -EACCES;

	new_size = round_down(new_size, fs_info->sectorsize);

	/* chunk_mutex protects the superblock copy and the size fields. */
	mutex_lock(&fs_info->chunk_mutex);
	old_total = btrfs_super_total_bytes(super_copy);
	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);

	/* Only growing is supported, and never on a replace target. */
	if (new_size <= device->total_bytes ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		mutex_unlock(&fs_info->chunk_mutex);
		return -EINVAL;
	}

	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total + diff, fs_info->sectorsize));
	device->fs_devices->total_rw_bytes += diff;

	btrfs_device_set_total_bytes(device, new_size);
	btrfs_device_set_disk_total_bytes(device, new_size);
	btrfs_clear_space_info_full(device->fs_info);
	/* Queue the device so its sizes are committed with the transaction. */
	if (list_empty(&device->post_commit_list))
		list_add_tail(&device->post_commit_list,
			      &trans->transaction->dev_update_list);
	mutex_unlock(&fs_info->chunk_mutex);

	/* Persist the new sizes into the chunk tree's device item. */
	btrfs_reserve_chunk_metadata(trans, false);
	ret = btrfs_update_device(trans, device);
	btrfs_trans_release_chunk_metadata(trans);

	return ret;
}
29868f18cf13SChris Mason
/*
 * Delete the chunk item at @chunk_offset from the chunk tree.  The item is
 * expected to exist; not finding it is treated as corruption.
 */
static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = chunk_offset;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		/* Logic error or corruption */
		btrfs_handle_fs_error(fs_info, -ENOENT,
				      "Failed lookup while freeing chunk.");
		ret = -ENOENT;
	}
	if (ret)
		goto out;

	ret = btrfs_del_item(trans, root, path);
	if (ret < 0)
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to delete chunk item.");
out:
	btrfs_free_path(path);
	return ret;
}
30218f18cf13SChris Mason
/*
 * Remove the entry for the system chunk at @chunk_offset from the packed
 * sys_chunk_array in the superblock copy, compacting the array in place.
 *
 * Returns 0 on success (also when no matching entry exists) or -EIO when a
 * non-CHUNK_ITEM key is found in the array (corrupted array).
 */
static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	lockdep_assert_held(&fs_info->chunk_mutex);
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	cur = 0;

	/* Entries are a disk key immediately followed by a chunk item. */
	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)(ptr + len);
			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			len += btrfs_chunk_item_size(num_stripes);
		} else {
			/* Only chunk items may live in the sys array. */
			ret = -EIO;
			break;
		}
		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
		    key.offset == chunk_offset) {
			/*
			 * Shift the tail of the array over the matching entry.
			 * ptr/cur are deliberately not advanced: the next
			 * iteration re-examines the entry just moved here.
			 */
			memmove(ptr, ptr + len, array_size - (cur + len));
			array_size -= len;
			btrfs_set_super_sys_array_size(super_copy, array_size);
		} else {
			ptr += len;
			cur += len;
		}
	}
	return ret;
}
30678f18cf13SChris Mason
306860ca842eSOmar Sandoval /*
306960ca842eSOmar Sandoval  * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
307060ca842eSOmar Sandoval  * @logical: Logical block offset in bytes.
307160ca842eSOmar Sandoval  * @length: Length of extent in bytes.
307260ca842eSOmar Sandoval  *
307360ca842eSOmar Sandoval  * Return: Chunk mapping or ERR_PTR.
307460ca842eSOmar Sandoval  */
/*
 * NOTE(review): @fs_info is not documented above: the filesystem whose chunk
 * mapping tree (fs_info->mapping_tree) is searched.  On success the caller
 * owns one reference on the returned extent_map and must drop it with
 * free_extent_map().  ERR_PTR(-EINVAL) is returned both when no mapping is
 * found and when the found mapping does not contain @logical.
 */
btrfs_get_chunk_map(struct btrfs_fs_info * fs_info,u64 logical,u64 length)307560ca842eSOmar Sandoval struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
3076592d92eeSLiu Bo u64 logical, u64 length)
3077592d92eeSLiu Bo {
3078592d92eeSLiu Bo struct extent_map_tree *em_tree;
3079592d92eeSLiu Bo struct extent_map *em;
3080592d92eeSLiu Bo
3081c8bf1b67SDavid Sterba em_tree = &fs_info->mapping_tree;
/* Read lock is enough: the lookup does not modify the mapping tree. */
3082592d92eeSLiu Bo read_lock(&em_tree->lock);
3083592d92eeSLiu Bo em = lookup_extent_mapping(em_tree, logical, length);
3084592d92eeSLiu Bo read_unlock(&em_tree->lock);
3085592d92eeSLiu Bo
/* No chunk mapping covers @logical at all. */
3086592d92eeSLiu Bo if (!em) {
308747ec6065SFilipe Manana btrfs_crit(fs_info,
308847ec6065SFilipe Manana "unable to find chunk map for logical %llu length %llu",
3089592d92eeSLiu Bo logical, length);
3090592d92eeSLiu Bo return ERR_PTR(-EINVAL);
3091592d92eeSLiu Bo }
3092592d92eeSLiu Bo
/* A mapping was found but its [start, start+len) range excludes @logical. */
30933952f84eSFilipe Manana if (em->start > logical || em->start + em->len <= logical) {
3094592d92eeSLiu Bo btrfs_crit(fs_info,
309547ec6065SFilipe Manana "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
309647ec6065SFilipe Manana logical, logical + length, em->start, em->start + em->len);
3097592d92eeSLiu Bo free_extent_map(em);
3098592d92eeSLiu Bo return ERR_PTR(-EINVAL);
3099592d92eeSLiu Bo }
3100592d92eeSLiu Bo
3101592d92eeSLiu Bo /* callers are responsible for dropping em's ref. */
3102592d92eeSLiu Bo return em;
3103592d92eeSLiu Bo }
3104592d92eeSLiu Bo
/*
 * Update the device item of every device that holds a stripe of @map, then
 * delete the chunk item at @chunk_offset from the chunk btree.
 *
 * Caller must hold fs_info->chunk_mutex (asserted below).  Returns 0 on
 * success or a negative errno from btrfs_update_device()/btrfs_free_chunk().
 */
remove_chunk_item(struct btrfs_trans_handle * trans,struct map_lookup * map,u64 chunk_offset)310579bd3712SFilipe Manana static int remove_chunk_item(struct btrfs_trans_handle *trans,
310679bd3712SFilipe Manana struct map_lookup *map, u64 chunk_offset)
310779bd3712SFilipe Manana {
310879bd3712SFilipe Manana int i;
310979bd3712SFilipe Manana
311079bd3712SFilipe Manana /*
311179bd3712SFilipe Manana * Removing chunk items and updating the device items in the chunks btree
311279bd3712SFilipe Manana * requires holding the chunk_mutex.
311379bd3712SFilipe Manana * See the comment at btrfs_chunk_alloc() for the details.
311479bd3712SFilipe Manana */
311579bd3712SFilipe Manana lockdep_assert_held(&trans->fs_info->chunk_mutex);
311679bd3712SFilipe Manana
311779bd3712SFilipe Manana for (i = 0; i < map->num_stripes; i++) {
311879bd3712SFilipe Manana int ret;
311979bd3712SFilipe Manana
312079bd3712SFilipe Manana ret = btrfs_update_device(trans, map->stripes[i].dev);
312179bd3712SFilipe Manana if (ret)
312279bd3712SFilipe Manana return ret;
312379bd3712SFilipe Manana }
312479bd3712SFilipe Manana
312579bd3712SFilipe Manana return btrfs_free_chunk(trans, chunk_offset);
312679bd3712SFilipe Manana }
312779bd3712SFilipe Manana
/*
 * Remove the chunk at @chunk_offset: free its device extents, delete the
 * chunk item (retrying once after allocating a new SYSTEM chunk if the
 * deletion hits -ENOSPC), drop the superblock's system chunk array entry
 * for SYSTEM chunks, and finally remove the corresponding block group.
 * The transaction is aborted on any failure.  The lock ordering
 * (device_list_mutex for device extent removal, then chunk_mutex for the
 * chunk btree updates) is explained by the inline comments below.
 */
btrfs_remove_chunk(struct btrfs_trans_handle * trans,u64 chunk_offset)312897aff912SNikolay Borisov int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
312947ab2a6cSJosef Bacik {
313097aff912SNikolay Borisov struct btrfs_fs_info *fs_info = trans->fs_info;
313147ab2a6cSJosef Bacik struct extent_map *em;
313247ab2a6cSJosef Bacik struct map_lookup *map;
313347ab2a6cSJosef Bacik u64 dev_extent_len = 0;
313447ab2a6cSJosef Bacik int i, ret = 0;
31350b246afaSJeff Mahoney struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
313647ab2a6cSJosef Bacik
313760ca842eSOmar Sandoval em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
3138592d92eeSLiu Bo if (IS_ERR(em)) {
313947ab2a6cSJosef Bacik /*
314047ab2a6cSJosef Bacik * This is a logic error, but we don't want to just rely on the
3141bb7ab3b9SAdam Buchbinder * user having built with ASSERT enabled, so if ASSERT doesn't
314247ab2a6cSJosef Bacik * do anything we still error out.
314347ab2a6cSJosef Bacik */
314447ab2a6cSJosef Bacik ASSERT(0);
3145592d92eeSLiu Bo return PTR_ERR(em);
314647ab2a6cSJosef Bacik }
314795617d69SJeff Mahoney map = em->map_lookup;
314847ab2a6cSJosef Bacik
314957ba4cb8SFilipe Manana /*
315079bd3712SFilipe Manana * First delete the device extent items from the devices btree.
315179bd3712SFilipe Manana * We take the device_list_mutex to avoid racing with the finishing phase
315279bd3712SFilipe Manana * of a device replace operation. See the comment below before acquiring
315379bd3712SFilipe Manana * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
315479bd3712SFilipe Manana * because that can result in a deadlock when deleting the device extent
315579bd3712SFilipe Manana * items from the devices btree - COWing an extent buffer from the btree
315679bd3712SFilipe Manana * may result in allocating a new metadata chunk, which would attempt to
315779bd3712SFilipe Manana * lock again fs_info->chunk_mutex.
315857ba4cb8SFilipe Manana */
315957ba4cb8SFilipe Manana mutex_lock(&fs_devices->device_list_mutex);
316047ab2a6cSJosef Bacik for (i = 0; i < map->num_stripes; i++) {
316147ab2a6cSJosef Bacik struct btrfs_device *device = map->stripes[i].dev;
316247ab2a6cSJosef Bacik ret = btrfs_free_dev_extent(trans, device,
316347ab2a6cSJosef Bacik map->stripes[i].physical,
316447ab2a6cSJosef Bacik &dev_extent_len);
316547ab2a6cSJosef Bacik if (ret) {
316657ba4cb8SFilipe Manana mutex_unlock(&fs_devices->device_list_mutex);
316766642832SJeff Mahoney btrfs_abort_transaction(trans, ret);
316847ab2a6cSJosef Bacik goto out;
316947ab2a6cSJosef Bacik }
317047ab2a6cSJosef Bacik
/* Return the freed extent's bytes to the device and global accounting. */
317147ab2a6cSJosef Bacik if (device->bytes_used > 0) {
317234441361SDavid Sterba mutex_lock(&fs_info->chunk_mutex);
317347ab2a6cSJosef Bacik btrfs_device_set_bytes_used(device,
317447ab2a6cSJosef Bacik device->bytes_used - dev_extent_len);
3175a5ed45f8SNikolay Borisov atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
31760b246afaSJeff Mahoney btrfs_clear_space_info_full(fs_info);
317734441361SDavid Sterba mutex_unlock(&fs_info->chunk_mutex);
317847ab2a6cSJosef Bacik }
317979bd3712SFilipe Manana }
318057ba4cb8SFilipe Manana mutex_unlock(&fs_devices->device_list_mutex);
318179bd3712SFilipe Manana
318279bd3712SFilipe Manana /*
318379bd3712SFilipe Manana * We acquire fs_info->chunk_mutex for 2 reasons:
318479bd3712SFilipe Manana *
318579bd3712SFilipe Manana * 1) Just like with the first phase of the chunk allocation, we must
318679bd3712SFilipe Manana * reserve system space, do all chunk btree updates and deletions, and
318779bd3712SFilipe Manana * update the system chunk array in the superblock while holding this
318879bd3712SFilipe Manana * mutex. This is for similar reasons as explained on the comment at
318979bd3712SFilipe Manana * the top of btrfs_chunk_alloc();
319079bd3712SFilipe Manana *
319179bd3712SFilipe Manana * 2) Prevent races with the final phase of a device replace operation
319279bd3712SFilipe Manana * that replaces the device object associated with the map's stripes,
319379bd3712SFilipe Manana * because the device object's id can change at any time during that
319479bd3712SFilipe Manana * final phase of the device replace operation
319579bd3712SFilipe Manana * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
319679bd3712SFilipe Manana * replaced device and then see it with an ID of
319779bd3712SFilipe Manana * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
319879bd3712SFilipe Manana * the device item, which does not exists on the chunk btree.
319979bd3712SFilipe Manana * The finishing phase of device replace acquires both the
320079bd3712SFilipe Manana * device_list_mutex and the chunk_mutex, in that order, so we are
320179bd3712SFilipe Manana * safe by just acquiring the chunk_mutex.
320279bd3712SFilipe Manana */
320379bd3712SFilipe Manana trans->removing_chunk = true;
320479bd3712SFilipe Manana mutex_lock(&fs_info->chunk_mutex);
320579bd3712SFilipe Manana
320679bd3712SFilipe Manana check_system_chunk(trans, map->type);
320779bd3712SFilipe Manana
320879bd3712SFilipe Manana ret = remove_chunk_item(trans, map, chunk_offset);
320979bd3712SFilipe Manana /*
321079bd3712SFilipe Manana * Normally we should not get -ENOSPC since we reserved space before
321179bd3712SFilipe Manana * through the call to check_system_chunk().
321279bd3712SFilipe Manana *
321379bd3712SFilipe Manana * Despite our system space_info having enough free space, we may not
321479bd3712SFilipe Manana * be able to allocate extents from its block groups, because all have
321579bd3712SFilipe Manana * an incompatible profile, which will force us to allocate a new system
321679bd3712SFilipe Manana * block group with the right profile, or right after we called
321779bd3712SFilipe Manana * check_system_space() above, a scrub turned the only system block group
321879bd3712SFilipe Manana * with enough free space into RO mode.
321979bd3712SFilipe Manana * This is explained with more detail at do_chunk_alloc().
322079bd3712SFilipe Manana *
322179bd3712SFilipe Manana * So if we get -ENOSPC, allocate a new system chunk and retry once.
322279bd3712SFilipe Manana */
322379bd3712SFilipe Manana if (ret == -ENOSPC) {
322479bd3712SFilipe Manana const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
322579bd3712SFilipe Manana struct btrfs_block_group *sys_bg;
322679bd3712SFilipe Manana
3227f6f39f7aSNikolay Borisov sys_bg = btrfs_create_chunk(trans, sys_flags);
322879bd3712SFilipe Manana if (IS_ERR(sys_bg)) {
322979bd3712SFilipe Manana ret = PTR_ERR(sys_bg);
323066642832SJeff Mahoney btrfs_abort_transaction(trans, ret);
323147ab2a6cSJosef Bacik goto out;
323247ab2a6cSJosef Bacik }
323357ba4cb8SFilipe Manana
323479bd3712SFilipe Manana ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
323547ab2a6cSJosef Bacik if (ret) {
323666642832SJeff Mahoney btrfs_abort_transaction(trans, ret);
323747ab2a6cSJosef Bacik goto out;
323847ab2a6cSJosef Bacik }
323947ab2a6cSJosef Bacik
324079bd3712SFilipe Manana ret = remove_chunk_item(trans, map, chunk_offset);
324179bd3712SFilipe Manana if (ret) {
324279bd3712SFilipe Manana btrfs_abort_transaction(trans, ret);
324379bd3712SFilipe Manana goto out;
324479bd3712SFilipe Manana }
324579bd3712SFilipe Manana } else if (ret) {
324679bd3712SFilipe Manana btrfs_abort_transaction(trans, ret);
324779bd3712SFilipe Manana goto out;
324879bd3712SFilipe Manana }
324979bd3712SFilipe Manana
32506bccf3abSJeff Mahoney trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
325147ab2a6cSJosef Bacik
/* SYSTEM chunks are also mirrored in the superblock's sys_chunk_array. */
325247ab2a6cSJosef Bacik if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3253408fbf19SNikolay Borisov ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
325447ab2a6cSJosef Bacik if (ret) {
325566642832SJeff Mahoney btrfs_abort_transaction(trans, ret);
325647ab2a6cSJosef Bacik goto out;
325747ab2a6cSJosef Bacik }
325847ab2a6cSJosef Bacik }
325947ab2a6cSJosef Bacik
326079bd3712SFilipe Manana mutex_unlock(&fs_info->chunk_mutex);
326179bd3712SFilipe Manana trans->removing_chunk = false;
326279bd3712SFilipe Manana
326379bd3712SFilipe Manana /*
326479bd3712SFilipe Manana * We are done with chunk btree updates and deletions, so release the
326579bd3712SFilipe Manana * system space we previously reserved (with check_system_chunk()).
326679bd3712SFilipe Manana */
326779bd3712SFilipe Manana btrfs_trans_release_chunk_metadata(trans);
326879bd3712SFilipe Manana
32695a98ec01SNikolay Borisov ret = btrfs_remove_block_group(trans, chunk_offset, em);
327047ab2a6cSJosef Bacik if (ret) {
327166642832SJeff Mahoney btrfs_abort_transaction(trans, ret);
327247ab2a6cSJosef Bacik goto out;
327347ab2a6cSJosef Bacik }
327447ab2a6cSJosef Bacik
327547ab2a6cSJosef Bacik out:
/* Error paths may jump here while still holding chunk_mutex. */
327679bd3712SFilipe Manana if (trans->removing_chunk) {
327779bd3712SFilipe Manana mutex_unlock(&fs_info->chunk_mutex);
327879bd3712SFilipe Manana trans->removing_chunk = false;
327979bd3712SFilipe Manana }
328047ab2a6cSJosef Bacik /* once for us */
328147ab2a6cSJosef Bacik free_extent_map(em);
32828f18cf13SChris Mason return ret;
32838f18cf13SChris Mason }
32848f18cf13SChris Mason
/*
 * Relocate all extents out of the chunk at @chunk_offset and then remove the
 * chunk: pause scrub around the relocation, cancel pending discard work for
 * the block group, reset the zone on zoned filesystems (best effort), and
 * finally delete the device extents and chunk tree entries in a dedicated
 * transaction.  Caller must hold fs_info->reclaim_bgs_lock (asserted below).
 */
btrfs_relocate_chunk(struct btrfs_fs_info * fs_info,u64 chunk_offset)328518bb8bbfSJohannes Thumshirn int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
32868f18cf13SChris Mason {
32875b4aacefSJeff Mahoney struct btrfs_root *root = fs_info->chunk_root;
328819c4d2f9SChris Mason struct btrfs_trans_handle *trans;
3289b0643e59SDennis Zhou struct btrfs_block_group *block_group;
329001e86008SJohannes Thumshirn u64 length;
32918f18cf13SChris Mason int ret;
32928f18cf13SChris Mason
32934b349253SJosef Bacik if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
32944b349253SJosef Bacik btrfs_err(fs_info,
32954b349253SJosef Bacik "relocate: not supported on extent tree v2 yet");
32964b349253SJosef Bacik return -EINVAL;
32974b349253SJosef Bacik }
32984b349253SJosef Bacik
329967c5e7d4SFilipe Manana /*
330067c5e7d4SFilipe Manana * Prevent races with automatic removal of unused block groups.
330167c5e7d4SFilipe Manana * After we relocate and before we remove the chunk with offset
330267c5e7d4SFilipe Manana * chunk_offset, automatic removal of the block group can kick in,
330367c5e7d4SFilipe Manana * resulting in a failure when calling btrfs_remove_chunk() below.
330467c5e7d4SFilipe Manana *
330567c5e7d4SFilipe Manana * Make sure to acquire this mutex before doing a tree search (dev
330667c5e7d4SFilipe Manana * or chunk trees) to find chunks. Otherwise the cleaner kthread might
330767c5e7d4SFilipe Manana * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
330867c5e7d4SFilipe Manana * we release the path used to search the chunk/dev tree and before
330967c5e7d4SFilipe Manana * the current task acquires this mutex and calls us.
331067c5e7d4SFilipe Manana */
3311f3372065SJohannes Thumshirn lockdep_assert_held(&fs_info->reclaim_bgs_lock);
331267c5e7d4SFilipe Manana
33138f18cf13SChris Mason /* step one, relocate all the extents inside this chunk */
33142ff7e61eSJeff Mahoney btrfs_scrub_pause(fs_info);
33150b246afaSJeff Mahoney ret = btrfs_relocate_block_group(fs_info, chunk_offset);
33162ff7e61eSJeff Mahoney btrfs_scrub_continue(fs_info);
33172d82a40aSFilipe Manana if (ret) {
33182d82a40aSFilipe Manana /*
33192d82a40aSFilipe Manana * If we had a transaction abort, stop all running scrubs.
33202d82a40aSFilipe Manana * See transaction.c:cleanup_transaction() why we do it here.
33212d82a40aSFilipe Manana */
33222d82a40aSFilipe Manana if (BTRFS_FS_ERROR(fs_info))
33232d82a40aSFilipe Manana btrfs_scrub_cancel(fs_info);
3324a22285a6SYan, Zheng return ret;
33252d82a40aSFilipe Manana }
33268f18cf13SChris Mason
/* Cache the length before dropping the block group reference. */
3327b0643e59SDennis Zhou block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3328b0643e59SDennis Zhou if (!block_group)
3329b0643e59SDennis Zhou return -ENOENT;
3330b0643e59SDennis Zhou btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
333101e86008SJohannes Thumshirn length = block_group->length;
3332b0643e59SDennis Zhou btrfs_put_block_group(block_group);
3333b0643e59SDennis Zhou
333401e86008SJohannes Thumshirn /*
333501e86008SJohannes Thumshirn * On a zoned file system, discard the whole block group, this will
333601e86008SJohannes Thumshirn * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
333701e86008SJohannes Thumshirn * resetting the zone fails, don't treat it as a fatal problem from the
333801e86008SJohannes Thumshirn * filesystem's point of view.
333901e86008SJohannes Thumshirn */
334001e86008SJohannes Thumshirn if (btrfs_is_zoned(fs_info)) {
334101e86008SJohannes Thumshirn ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
334201e86008SJohannes Thumshirn if (ret)
334301e86008SJohannes Thumshirn btrfs_info(fs_info,
334401e86008SJohannes Thumshirn "failed to reset zone %llu after relocation",
334501e86008SJohannes Thumshirn chunk_offset);
334601e86008SJohannes Thumshirn }
334701e86008SJohannes Thumshirn
334819c4d2f9SChris Mason trans = btrfs_start_trans_remove_block_group(root->fs_info,
334919c4d2f9SChris Mason chunk_offset);
335019c4d2f9SChris Mason if (IS_ERR(trans)) {
335119c4d2f9SChris Mason ret = PTR_ERR(trans);
335219c4d2f9SChris Mason btrfs_handle_fs_error(root->fs_info, ret, NULL);
335319c4d2f9SChris Mason return ret;
335419c4d2f9SChris Mason }
33555d8eb6feSNaohiro Aota
335619c4d2f9SChris Mason /*
335719c4d2f9SChris Mason * step two, delete the device extents and the
335819c4d2f9SChris Mason * chunk tree entries
335919c4d2f9SChris Mason */
336097aff912SNikolay Borisov ret = btrfs_remove_chunk(trans, chunk_offset);
33613a45bb20SJeff Mahoney btrfs_end_transaction(trans);
336219c4d2f9SChris Mason return ret;
33638f18cf13SChris Mason }
33648f18cf13SChris Mason
/*
 * Relocate every SYSTEM chunk, walking the chunk tree from the highest key
 * offset downwards.  Chunks whose relocation fails with -ENOSPC are counted
 * and the whole walk is retried once; if failures remain after the retry,
 * return -ENOSPC.  Any other relocation failure is treated as fatal
 * (BUG_ON below).  The reclaim_bgs_lock is taken around each tree search
 * and relocation to avoid races with automatic block group removal (see
 * the comment in btrfs_relocate_chunk()).
 */
btrfs_relocate_sys_chunks(struct btrfs_fs_info * fs_info)33652ff7e61eSJeff Mahoney static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
33662b82032cSYan Zheng {
33670b246afaSJeff Mahoney struct btrfs_root *chunk_root = fs_info->chunk_root;
33682b82032cSYan Zheng struct btrfs_path *path;
33692b82032cSYan Zheng struct extent_buffer *leaf;
33702b82032cSYan Zheng struct btrfs_chunk *chunk;
33712b82032cSYan Zheng struct btrfs_key key;
33722b82032cSYan Zheng struct btrfs_key found_key;
33732b82032cSYan Zheng u64 chunk_type;
3374ba1bf481SJosef Bacik bool retried = false;
3375ba1bf481SJosef Bacik int failed = 0;
33762b82032cSYan Zheng int ret;
33772b82032cSYan Zheng
33782b82032cSYan Zheng path = btrfs_alloc_path();
33792b82032cSYan Zheng if (!path)
33802b82032cSYan Zheng return -ENOMEM;
33812b82032cSYan Zheng
3382ba1bf481SJosef Bacik again:
33832b82032cSYan Zheng key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
33842b82032cSYan Zheng key.offset = (u64)-1;
33852b82032cSYan Zheng key.type = BTRFS_CHUNK_ITEM_KEY;
33862b82032cSYan Zheng
33872b82032cSYan Zheng while (1) {
3388f3372065SJohannes Thumshirn mutex_lock(&fs_info->reclaim_bgs_lock);
33892b82032cSYan Zheng ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
339067c5e7d4SFilipe Manana if (ret < 0) {
3391f3372065SJohannes Thumshirn mutex_unlock(&fs_info->reclaim_bgs_lock);
33922b82032cSYan Zheng goto error;
339367c5e7d4SFilipe Manana }
33940d23b34cSDavid Sterba if (ret == 0) {
33950d23b34cSDavid Sterba /*
33960d23b34cSDavid Sterba * On the first search we would find chunk tree with
33970d23b34cSDavid Sterba * offset -1, which is not possible. On subsequent
33980d23b34cSDavid Sterba * loops this would find an existing item on an invalid
33990d23b34cSDavid Sterba * offset (one less than the previous one, wrong
34000d23b34cSDavid Sterba * alignment and size).
34010d23b34cSDavid Sterba */
34020d23b34cSDavid Sterba ret = -EUCLEAN;
3403e42004fdSDominique Martinet mutex_unlock(&fs_info->reclaim_bgs_lock);
34040d23b34cSDavid Sterba goto error;
34050d23b34cSDavid Sterba }
34062b82032cSYan Zheng
34072b82032cSYan Zheng ret = btrfs_previous_item(chunk_root, path, key.objectid,
34082b82032cSYan Zheng key.type);
/* Nonzero means no earlier chunk item: nothing left to relocate below. */
340967c5e7d4SFilipe Manana if (ret)
3410f3372065SJohannes Thumshirn mutex_unlock(&fs_info->reclaim_bgs_lock);
34112b82032cSYan Zheng if (ret < 0)
34122b82032cSYan Zheng goto error;
34132b82032cSYan Zheng if (ret > 0)
34142b82032cSYan Zheng break;
34152b82032cSYan Zheng
34162b82032cSYan Zheng leaf = path->nodes[0];
34172b82032cSYan Zheng btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
34182b82032cSYan Zheng
34192b82032cSYan Zheng chunk = btrfs_item_ptr(leaf, path->slots[0],
34202b82032cSYan Zheng struct btrfs_chunk);
34212b82032cSYan Zheng chunk_type = btrfs_chunk_type(leaf, chunk);
/* Release the path before relocating - relocation modifies the trees. */
3422b3b4aa74SDavid Sterba btrfs_release_path(path);
34232b82032cSYan Zheng
34242b82032cSYan Zheng if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
34250b246afaSJeff Mahoney ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3426ba1bf481SJosef Bacik if (ret == -ENOSPC)
3427ba1bf481SJosef Bacik failed++;
342814586651SHIMANGI SARAOGI else
342914586651SHIMANGI SARAOGI BUG_ON(ret);
34302b82032cSYan Zheng }
3431f3372065SJohannes Thumshirn mutex_unlock(&fs_info->reclaim_bgs_lock);
34322b82032cSYan Zheng
34332b82032cSYan Zheng if (found_key.offset == 0)
34342b82032cSYan Zheng break;
34352b82032cSYan Zheng key.offset = found_key.offset - 1;
34362b82032cSYan Zheng }
34372b82032cSYan Zheng ret = 0;
3438ba1bf481SJosef Bacik if (failed && !retried) {
3439ba1bf481SJosef Bacik failed = 0;
3440ba1bf481SJosef Bacik retried = true;
3441ba1bf481SJosef Bacik goto again;
3442fae7f21cSDulshani Gunawardhana } else if (WARN_ON(failed && retried)) {
3443ba1bf481SJosef Bacik ret = -ENOSPC;
3444ba1bf481SJosef Bacik }
34452b82032cSYan Zheng error:
34462b82032cSYan Zheng btrfs_free_path(path);
34472b82032cSYan Zheng return ret;
34482b82032cSYan Zheng }
34492b82032cSYan Zheng
3450a6f93c71SLiu Bo /*
3451a6f93c71SLiu Bo * return 1 : allocate a data chunk successfully,
3452a6f93c71SLiu Bo * return <0: errors during allocating a data chunk,
3453a6f93c71SLiu Bo * return 0 : no need to allocate a data chunk.
3454a6f93c71SLiu Bo */
/*
 * NOTE(review): if the chunk at @chunk_offset is a DATA chunk and the data
 * space_info has zero bytes used, a new data chunk is force-allocated in a
 * joined transaction before returning 1.  Presumably this keeps at least one
 * data chunk available when the last (empty) one is about to be relocated -
 * confirm against the callers in the shrink/balance paths.
 */
btrfs_may_alloc_data_chunk(struct btrfs_fs_info * fs_info,u64 chunk_offset)3455a6f93c71SLiu Bo static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3456a6f93c71SLiu Bo u64 chunk_offset)
3457a6f93c71SLiu Bo {
345832da5386SDavid Sterba struct btrfs_block_group *cache;
3459a6f93c71SLiu Bo u64 bytes_used;
3460a6f93c71SLiu Bo u64 chunk_type;
3461a6f93c71SLiu Bo
3462a6f93c71SLiu Bo cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3463a6f93c71SLiu Bo ASSERT(cache);
3464a6f93c71SLiu Bo chunk_type = cache->flags;
3465a6f93c71SLiu Bo btrfs_put_block_group(cache);
3466a6f93c71SLiu Bo
/* Only DATA chunks are of interest here. */
34675ae21692SJohannes Thumshirn if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
34685ae21692SJohannes Thumshirn return 0;
34695ae21692SJohannes Thumshirn
3470a6f93c71SLiu Bo spin_lock(&fs_info->data_sinfo->lock);
3471a6f93c71SLiu Bo bytes_used = fs_info->data_sinfo->bytes_used;
3472a6f93c71SLiu Bo spin_unlock(&fs_info->data_sinfo->lock);
3473a6f93c71SLiu Bo
3474a6f93c71SLiu Bo if (!bytes_used) {
3475a6f93c71SLiu Bo struct btrfs_trans_handle *trans;
3476a6f93c71SLiu Bo int ret;
3477a6f93c71SLiu Bo
3478a6f93c71SLiu Bo trans = btrfs_join_transaction(fs_info->tree_root);
3479a6f93c71SLiu Bo if (IS_ERR(trans))
3480a6f93c71SLiu Bo return PTR_ERR(trans);
3481a6f93c71SLiu Bo
34825ae21692SJohannes Thumshirn ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3483a6f93c71SLiu Bo btrfs_end_transaction(trans);
3484a6f93c71SLiu Bo if (ret < 0)
3485a6f93c71SLiu Bo return ret;
3486a6f93c71SLiu Bo return 1;
3487a6f93c71SLiu Bo }
34885ae21692SJohannes Thumshirn
3489a6f93c71SLiu Bo return 0;
3490a6f93c71SLiu Bo }
3491a6f93c71SLiu Bo
/*
 * Persist the balance state: insert the balance item (objectid
 * BTRFS_BALANCE_OBJECTID, type BTRFS_TEMPORARY_ITEM_KEY, offset 0) into the
 * tree root with the data/meta/sys args and flags from @bctl, then commit
 * the transaction.  Returns 0 on success, or a negative errno from the
 * insertion or the commit (the commit error is reported only if the
 * insertion itself succeeded).
 */
insert_balance_item(struct btrfs_fs_info * fs_info,struct btrfs_balance_control * bctl)34926bccf3abSJeff Mahoney static int insert_balance_item(struct btrfs_fs_info *fs_info,
34930940ebf6SIlya Dryomov struct btrfs_balance_control *bctl)
34940940ebf6SIlya Dryomov {
34956bccf3abSJeff Mahoney struct btrfs_root *root = fs_info->tree_root;
34960940ebf6SIlya Dryomov struct btrfs_trans_handle *trans;
34970940ebf6SIlya Dryomov struct btrfs_balance_item *item;
34980940ebf6SIlya Dryomov struct btrfs_disk_balance_args disk_bargs;
34990940ebf6SIlya Dryomov struct btrfs_path *path;
35000940ebf6SIlya Dryomov struct extent_buffer *leaf;
35010940ebf6SIlya Dryomov struct btrfs_key key;
35020940ebf6SIlya Dryomov int ret, err;
35030940ebf6SIlya Dryomov
35040940ebf6SIlya Dryomov path = btrfs_alloc_path();
35050940ebf6SIlya Dryomov if (!path)
35060940ebf6SIlya Dryomov return -ENOMEM;
35070940ebf6SIlya Dryomov
35080940ebf6SIlya Dryomov trans = btrfs_start_transaction(root, 0);
35090940ebf6SIlya Dryomov if (IS_ERR(trans)) {
35100940ebf6SIlya Dryomov btrfs_free_path(path);
35110940ebf6SIlya Dryomov return PTR_ERR(trans);
35120940ebf6SIlya Dryomov }
35130940ebf6SIlya Dryomov
35140940ebf6SIlya Dryomov key.objectid = BTRFS_BALANCE_OBJECTID;
3515c479cb4fSDavid Sterba key.type = BTRFS_TEMPORARY_ITEM_KEY;
35160940ebf6SIlya Dryomov key.offset = 0;
35170940ebf6SIlya Dryomov
35180940ebf6SIlya Dryomov ret = btrfs_insert_empty_item(trans, root, path, &key,
35190940ebf6SIlya Dryomov sizeof(*item));
35200940ebf6SIlya Dryomov if (ret)
35210940ebf6SIlya Dryomov goto out;
35220940ebf6SIlya Dryomov
35230940ebf6SIlya Dryomov leaf = path->nodes[0];
35240940ebf6SIlya Dryomov item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
35250940ebf6SIlya Dryomov
/* Zero first: the item may contain fields the args below don't cover. */
3526b159fa28SDavid Sterba memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
35270940ebf6SIlya Dryomov
35280940ebf6SIlya Dryomov btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
35290940ebf6SIlya Dryomov btrfs_set_balance_data(leaf, item, &disk_bargs);
35300940ebf6SIlya Dryomov btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
35310940ebf6SIlya Dryomov btrfs_set_balance_meta(leaf, item, &disk_bargs);
35320940ebf6SIlya Dryomov btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
35330940ebf6SIlya Dryomov btrfs_set_balance_sys(leaf, item, &disk_bargs);
35340940ebf6SIlya Dryomov
35350940ebf6SIlya Dryomov btrfs_set_balance_flags(leaf, item, bctl->flags);
35360940ebf6SIlya Dryomov
3537d5e09e38SFilipe Manana btrfs_mark_buffer_dirty(trans, leaf);
35380940ebf6SIlya Dryomov out:
35390940ebf6SIlya Dryomov btrfs_free_path(path);
35403a45bb20SJeff Mahoney err = btrfs_commit_transaction(trans);
35410940ebf6SIlya Dryomov if (err && !ret)
35420940ebf6SIlya Dryomov ret = err;
35430940ebf6SIlya Dryomov return ret;
35440940ebf6SIlya Dryomov }
35450940ebf6SIlya Dryomov
/*
 * Delete the on-disk balance item from the tree root and commit the
 * transaction.  Returns -ENOENT when no balance item exists, 0 on success,
 * or a negative errno from the search/deletion/commit (the commit error is
 * reported only if the earlier steps succeeded).
 */
del_balance_item(struct btrfs_fs_info * fs_info)35466bccf3abSJeff Mahoney static int del_balance_item(struct btrfs_fs_info *fs_info)
35470940ebf6SIlya Dryomov {
35486bccf3abSJeff Mahoney struct btrfs_root *root = fs_info->tree_root;
35490940ebf6SIlya Dryomov struct btrfs_trans_handle *trans;
35500940ebf6SIlya Dryomov struct btrfs_path *path;
35510940ebf6SIlya Dryomov struct btrfs_key key;
35520940ebf6SIlya Dryomov int ret, err;
35530940ebf6SIlya Dryomov
35540940ebf6SIlya Dryomov path = btrfs_alloc_path();
35550940ebf6SIlya Dryomov if (!path)
35560940ebf6SIlya Dryomov return -ENOMEM;
35570940ebf6SIlya Dryomov
/* May run during ENOSPC cleanup: fall back to the global reserve. */
35583502a8c0SDavid Sterba trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
35590940ebf6SIlya Dryomov if (IS_ERR(trans)) {
35600940ebf6SIlya Dryomov btrfs_free_path(path);
35610940ebf6SIlya Dryomov return PTR_ERR(trans);
35620940ebf6SIlya Dryomov }
35630940ebf6SIlya Dryomov
35640940ebf6SIlya Dryomov key.objectid = BTRFS_BALANCE_OBJECTID;
3565c479cb4fSDavid Sterba key.type = BTRFS_TEMPORARY_ITEM_KEY;
35660940ebf6SIlya Dryomov key.offset = 0;
35670940ebf6SIlya Dryomov
35680940ebf6SIlya Dryomov ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
35690940ebf6SIlya Dryomov if (ret < 0)
35700940ebf6SIlya Dryomov goto out;
35710940ebf6SIlya Dryomov if (ret > 0) {
35720940ebf6SIlya Dryomov ret = -ENOENT;
35730940ebf6SIlya Dryomov goto out;
35740940ebf6SIlya Dryomov }
35750940ebf6SIlya Dryomov
35760940ebf6SIlya Dryomov ret = btrfs_del_item(trans, root, path);
35770940ebf6SIlya Dryomov out:
35780940ebf6SIlya Dryomov btrfs_free_path(path);
35793a45bb20SJeff Mahoney err = btrfs_commit_transaction(trans);
35800940ebf6SIlya Dryomov if (err && !ret)
35810940ebf6SIlya Dryomov ret = err;
35820940ebf6SIlya Dryomov return ret;
35830940ebf6SIlya Dryomov }
35840940ebf6SIlya Dryomov
3585c9e9f97bSIlya Dryomov /*
358659641015SIlya Dryomov * This is a heuristic used to reduce the number of chunks balanced on
358759641015SIlya Dryomov * resume after balance was interrupted.
358859641015SIlya Dryomov */
update_balance_args(struct btrfs_balance_control * bctl)358959641015SIlya Dryomov static void update_balance_args(struct btrfs_balance_control *bctl)
359059641015SIlya Dryomov {
359159641015SIlya Dryomov /*
359259641015SIlya Dryomov * Turn on soft mode for chunk types that were being converted.
359359641015SIlya Dryomov */
359459641015SIlya Dryomov if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
359559641015SIlya Dryomov bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
359659641015SIlya Dryomov if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
359759641015SIlya Dryomov bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
359859641015SIlya Dryomov if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
359959641015SIlya Dryomov bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
360059641015SIlya Dryomov
360159641015SIlya Dryomov /*
360259641015SIlya Dryomov * Turn on usage filter if is not already used. The idea is
360359641015SIlya Dryomov * that chunks that we have already balanced should be
360459641015SIlya Dryomov * reasonably full. Don't do it for chunks that are being
360559641015SIlya Dryomov * converted - that will keep us from relocating unconverted
360659641015SIlya Dryomov * (albeit full) chunks.
360759641015SIlya Dryomov */
360859641015SIlya Dryomov if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3609bc309467SDavid Sterba !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
361059641015SIlya Dryomov !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
361159641015SIlya Dryomov bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
361259641015SIlya Dryomov bctl->data.usage = 90;
361359641015SIlya Dryomov }
361459641015SIlya Dryomov if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3615bc309467SDavid Sterba !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
361659641015SIlya Dryomov !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
361759641015SIlya Dryomov bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
361859641015SIlya Dryomov bctl->sys.usage = 90;
361959641015SIlya Dryomov }
362059641015SIlya Dryomov if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3621bc309467SDavid Sterba !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
362259641015SIlya Dryomov !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
362359641015SIlya Dryomov bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
362459641015SIlya Dryomov bctl->meta.usage = 90;
362559641015SIlya Dryomov }
362659641015SIlya Dryomov }
362759641015SIlya Dryomov
362859641015SIlya Dryomov /*
3629149196a2SDavid Sterba * Clear the balance status in fs_info and delete the balance item from disk.
3630149196a2SDavid Sterba */
reset_balance_state(struct btrfs_fs_info * fs_info)3631149196a2SDavid Sterba static void reset_balance_state(struct btrfs_fs_info *fs_info)
3632c9e9f97bSIlya Dryomov {
3633c9e9f97bSIlya Dryomov struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3634149196a2SDavid Sterba int ret;
3635c9e9f97bSIlya Dryomov
3636c9e9f97bSIlya Dryomov BUG_ON(!fs_info->balance_ctl);
3637c9e9f97bSIlya Dryomov
/* Detach the control structure under balance_lock before freeing it. */
3638c9e9f97bSIlya Dryomov spin_lock(&fs_info->balance_lock);
3639c9e9f97bSIlya Dryomov fs_info->balance_ctl = NULL;
3640c9e9f97bSIlya Dryomov spin_unlock(&fs_info->balance_lock);
3641c9e9f97bSIlya Dryomov
3642c9e9f97bSIlya Dryomov kfree(bctl);
/* Best effort: a failed on-disk delete is reported, not returned. */
3643149196a2SDavid Sterba ret = del_balance_item(fs_info);
3644149196a2SDavid Sterba if (ret)
3645149196a2SDavid Sterba btrfs_handle_fs_error(fs_info, ret, NULL);
3646c9e9f97bSIlya Dryomov }
3647c9e9f97bSIlya Dryomov
3648ed25e9b2SIlya Dryomov /*
3649ed25e9b2SIlya Dryomov * Balance filters. Return 1 if chunk should be filtered out
3650ed25e9b2SIlya Dryomov * (should not be balanced).
3651ed25e9b2SIlya Dryomov */
/*
 * Balance filter on the chunk's raid profile.  Returns 1 (filter out, do
 * not balance) when the chunk's extended profile is not present in the
 * requested @bargs->profiles mask, 0 (balance it) otherwise.
 */
static int chunk_profiles_filter(u64 chunk_type,
				 struct btrfs_balance_args *bargs)
{
	const u64 extended_profile = chunk_to_extended(chunk_type) &
				     BTRFS_EXTENDED_PROFILE_MASK;

	return (bargs->profiles & extended_profile) ? 0 : 1;
}
3663ed25e9b2SIlya Dryomov
/*
 * Balance filter on a block group usage range.  Returns 0 (do balance) when
 * user_thresh_min <= used bytes < user_thresh_max, 1 (skip) otherwise.
 * The thresholds come from @bargs as percentages of the block group length:
 * usage_min == 0 means a lower bound of 0 bytes; usage_max == 0 is treated
 * as 1 byte and usage_max > 100 as the full block group length.
 * NOTE(review): the btrfs_lookup_block_group() result is dereferenced
 * without a NULL check - callers presumably guarantee the block group
 * exists for the duration of the balance; confirm.
 */
chunk_usage_range_filter(struct btrfs_fs_info * fs_info,u64 chunk_offset,struct btrfs_balance_args * bargs)3664dba72cb3SHolger Hoffstätte static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
36655ce5b3c0SIlya Dryomov struct btrfs_balance_args *bargs)
36665ce5b3c0SIlya Dryomov {
366732da5386SDavid Sterba struct btrfs_block_group *cache;
3668bc309467SDavid Sterba u64 chunk_used;
3669bc309467SDavid Sterba u64 user_thresh_min;
3670bc309467SDavid Sterba u64 user_thresh_max;
3671bc309467SDavid Sterba int ret = 1;
3672bc309467SDavid Sterba
3673bc309467SDavid Sterba cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3674bf38be65SDavid Sterba chunk_used = cache->used;
3675bc309467SDavid Sterba
3676bc309467SDavid Sterba if (bargs->usage_min == 0)
3677bc309467SDavid Sterba user_thresh_min = 0;
3678bc309467SDavid Sterba else
3679428c8e03SDavid Sterba user_thresh_min = mult_perc(cache->length, bargs->usage_min);
3680bc309467SDavid Sterba
3681bc309467SDavid Sterba if (bargs->usage_max == 0)
3682bc309467SDavid Sterba user_thresh_max = 1;
3683bc309467SDavid Sterba else if (bargs->usage_max > 100)
3684b3470b5dSDavid Sterba user_thresh_max = cache->length;
3685bc309467SDavid Sterba else
3686428c8e03SDavid Sterba user_thresh_max = mult_perc(cache->length, bargs->usage_max);
3687bc309467SDavid Sterba
3688bc309467SDavid Sterba if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3689bc309467SDavid Sterba ret = 0;
3690bc309467SDavid Sterba
3691bc309467SDavid Sterba btrfs_put_block_group(cache);
3692bc309467SDavid Sterba return ret;
3693bc309467SDavid Sterba }
3694bc309467SDavid Sterba
/*
 * Single-value usage filter: keep (return 0) chunks whose used bytes are
 * below usage% of the block group length; otherwise filter out (return 1).
 */
static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
			      u64 chunk_offset, struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group *bg;
	u64 used;
	u64 thresh;
	int filtered = 1;

	bg = btrfs_lookup_block_group(fs_info, chunk_offset);
	used = bg->used;

	/*
	 * NOTE(review): the zero test reads usage_min while the threshold
	 * below uses usage — presumably these fields alias each other in
	 * struct btrfs_balance_args; confirm against the UAPI definition
	 * before changing either side.
	 */
	if (bargs->usage_min == 0)
		thresh = 1;
	else if (bargs->usage > 100)
		thresh = bg->length;
	else
		thresh = mult_perc(bg->length, bargs->usage);

	if (used < thresh)
		filtered = 0;

	btrfs_put_block_group(bg);
	return filtered;
}
37185ce5b3c0SIlya Dryomov
chunk_devid_filter(struct extent_buffer * leaf,struct btrfs_chunk * chunk,struct btrfs_balance_args * bargs)3719409d404bSIlya Dryomov static int chunk_devid_filter(struct extent_buffer *leaf,
3720409d404bSIlya Dryomov struct btrfs_chunk *chunk,
3721409d404bSIlya Dryomov struct btrfs_balance_args *bargs)
3722409d404bSIlya Dryomov {
3723409d404bSIlya Dryomov struct btrfs_stripe *stripe;
3724409d404bSIlya Dryomov int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3725409d404bSIlya Dryomov int i;
3726409d404bSIlya Dryomov
3727409d404bSIlya Dryomov for (i = 0; i < num_stripes; i++) {
3728409d404bSIlya Dryomov stripe = btrfs_stripe_nr(chunk, i);
3729409d404bSIlya Dryomov if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3730409d404bSIlya Dryomov return 0;
3731409d404bSIlya Dryomov }
3732409d404bSIlya Dryomov
3733409d404bSIlya Dryomov return 1;
3734409d404bSIlya Dryomov }
3735409d404bSIlya Dryomov
/*
 * Number of data-carrying stripes for a chunk of the given RAID @type:
 * parity stripes hold no data, and the remainder is divided among copies.
 */
static u64 calc_data_stripes(u64 type, int num_stripes)
{
	const struct btrfs_raid_attr *attr =
		&btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)];

	return (num_stripes - attr->nparity) / attr->ncopies;
}
3744946c9256SDavid Sterba
374594e60d5aSIlya Dryomov /* [pstart, pend) */
chunk_drange_filter(struct extent_buffer * leaf,struct btrfs_chunk * chunk,struct btrfs_balance_args * bargs)374694e60d5aSIlya Dryomov static int chunk_drange_filter(struct extent_buffer *leaf,
374794e60d5aSIlya Dryomov struct btrfs_chunk *chunk,
374894e60d5aSIlya Dryomov struct btrfs_balance_args *bargs)
374994e60d5aSIlya Dryomov {
375094e60d5aSIlya Dryomov struct btrfs_stripe *stripe;
375194e60d5aSIlya Dryomov int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
375294e60d5aSIlya Dryomov u64 stripe_offset;
375394e60d5aSIlya Dryomov u64 stripe_length;
3754946c9256SDavid Sterba u64 type;
375594e60d5aSIlya Dryomov int factor;
375694e60d5aSIlya Dryomov int i;
375794e60d5aSIlya Dryomov
375894e60d5aSIlya Dryomov if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
375994e60d5aSIlya Dryomov return 0;
376094e60d5aSIlya Dryomov
3761946c9256SDavid Sterba type = btrfs_chunk_type(leaf, chunk);
3762946c9256SDavid Sterba factor = calc_data_stripes(type, num_stripes);
376394e60d5aSIlya Dryomov
376494e60d5aSIlya Dryomov for (i = 0; i < num_stripes; i++) {
376594e60d5aSIlya Dryomov stripe = btrfs_stripe_nr(chunk, i);
376694e60d5aSIlya Dryomov if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
376794e60d5aSIlya Dryomov continue;
376894e60d5aSIlya Dryomov
376994e60d5aSIlya Dryomov stripe_offset = btrfs_stripe_offset(leaf, stripe);
377094e60d5aSIlya Dryomov stripe_length = btrfs_chunk_length(leaf, chunk);
3771b8b93addSDavid Sterba stripe_length = div_u64(stripe_length, factor);
377294e60d5aSIlya Dryomov
377394e60d5aSIlya Dryomov if (stripe_offset < bargs->pend &&
377494e60d5aSIlya Dryomov stripe_offset + stripe_length > bargs->pstart)
377594e60d5aSIlya Dryomov return 0;
377694e60d5aSIlya Dryomov }
377794e60d5aSIlya Dryomov
377894e60d5aSIlya Dryomov return 1;
377994e60d5aSIlya Dryomov }
378094e60d5aSIlya Dryomov
3781ea67176aSIlya Dryomov /* [vstart, vend) */
/*
 * Virtual-range filter ([vstart, vend)): keep (return 0) the chunk when
 * any part of its logical address range overlaps the requested range;
 * otherwise filter out (return 1).
 */
static int chunk_vrange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       u64 chunk_offset,
			       struct btrfs_balance_args *bargs)
{
	const u64 chunk_end = chunk_offset + btrfs_chunk_length(leaf, chunk);

	if (chunk_offset < bargs->vend && chunk_end > bargs->vstart)
		return 0;

	return 1;
}
3794ea67176aSIlya Dryomov
chunk_stripes_range_filter(struct extent_buffer * leaf,struct btrfs_chunk * chunk,struct btrfs_balance_args * bargs)3795dee32d0aSGabríel Arthúr Pétursson static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3796dee32d0aSGabríel Arthúr Pétursson struct btrfs_chunk *chunk,
3797dee32d0aSGabríel Arthúr Pétursson struct btrfs_balance_args *bargs)
3798dee32d0aSGabríel Arthúr Pétursson {
3799dee32d0aSGabríel Arthúr Pétursson int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3800dee32d0aSGabríel Arthúr Pétursson
3801dee32d0aSGabríel Arthúr Pétursson if (bargs->stripes_min <= num_stripes
3802dee32d0aSGabríel Arthúr Pétursson && num_stripes <= bargs->stripes_max)
3803dee32d0aSGabríel Arthúr Pétursson return 0;
3804dee32d0aSGabríel Arthúr Pétursson
3805dee32d0aSGabríel Arthúr Pétursson return 1;
3806dee32d0aSGabríel Arthúr Pétursson }
3807dee32d0aSGabríel Arthúr Pétursson
/*
 * Soft-convert filter: when a profile conversion is requested with the
 * "soft" option, filter out (return 1) chunks that already carry the
 * target profile so they are not needlessly rewritten.
 */
static int chunk_soft_convert_filter(u64 chunk_type,
				     struct btrfs_balance_args *bargs)
{
	u64 extended;

	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
		return 0;

	extended = chunk_to_extended(chunk_type) & BTRFS_EXTENDED_PROFILE_MASK;

	return (bargs->target == extended) ? 1 : 0;
}
3822cfa4c961SIlya Dryomov
/*
 * Decide whether a chunk should be relocated by the currently running
 * balance operation.
 *
 * Applies, in order, the per-type balance filters configured in
 * fs_info->balance_ctl: type, profiles, usage / usage-range, devid,
 * drange, vrange, stripes, soft-convert and finally the limit filters.
 *
 * Returns 1 if the chunk should be balanced, 0 if it is filtered out.
 *
 * Note: the limit filters MUTATE the balance args (decrementing the
 * remaining-chunk counters), so calling this function has a side effect;
 * that is why the limit check must stay last.
 */
static int should_balance_chunk(struct extent_buffer *leaf,
				struct btrfs_chunk *chunk, u64 chunk_offset)
{
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_balance_args *bargs = NULL;
	u64 chunk_type = btrfs_chunk_type(leaf, chunk);

	/* type filter */
	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
		return 0;
	}

	/*
	 * Pick the args matching the chunk type; the type filter above
	 * guarantees exactly one of these branches sets bargs.
	 */
	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
		bargs = &bctl->data;
	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
		bargs = &bctl->sys;
	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
		bargs = &bctl->meta;

	/* profiles filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
	    chunk_profiles_filter(chunk_type, bargs)) {
		return 0;
	}

	/* usage filter (single value and range forms are mutually exclusive) */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	}

	/* devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
	    chunk_devid_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* drange filter, makes sense only with devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
	    chunk_drange_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* vrange filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
		return 0;
	}

	/* stripes filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* soft profile changing mode */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
	    chunk_soft_convert_filter(chunk_type, bargs)) {
		return 0;
	}

	/*
	 * limited by count, must be the last filter
	 */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
		if (bargs->limit == 0)
			return 0;
		else
			bargs->limit--;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
		/*
		 * Same logic as the 'limit' filter; the minimum cannot be
		 * determined here because we do not have the global information
		 * about the count of all chunks that satisfy the filters.
		 */
		if (bargs->limit_max == 0)
			return 0;
		else
			bargs->limit_max--;
	}

	return 1;
}
3911f43ffb60SIlya Dryomov
/*
 * Core balance loop.
 *
 * Walks the chunk tree backwards twice: a first "counting" pass that only
 * tallies how many chunks of each type match the filters (feeding the
 * stat counters and the limit_min logic), and a second pass that actually
 * relocates the matching chunks via btrfs_relocate_chunk().
 *
 * Returns 0 on success, -ECANCELED if the balance was paused/cancelled,
 * -ENOSPC if any relocation hit ENOSPC, or another negative errno.
 */
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	u64 chunk_type;
	struct btrfs_chunk *chunk;
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int slot;
	int ret;
	int enospc_errors = 0;
	bool counting = true;
	/*
	 * Save the configured limits: should_balance_chunk() decrements them
	 * during the counting pass, so they must be restored before the real
	 * pass (see the !counting branch below).
	 */
	u64 limit_data = bctl->data.limit;
	u64 limit_meta = bctl->meta.limit;
	u64 limit_sys = bctl->sys.limit;
	u32 count_data = 0;
	u32 count_meta = 0;
	u32 count_sys = 0;
	int chunk_reserved = 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}

	/* zero out stat counters */
	spin_lock(&fs_info->balance_lock);
	memset(&bctl->stat, 0, sizeof(bctl->stat));
	spin_unlock(&fs_info->balance_lock);
again:
	if (!counting) {
		/*
		 * Restore the limits consumed by the counting pass so the
		 * relocation pass starts from the user-configured values.
		 */
		bctl->data.limit = limit_data;
		bctl->meta.limit = limit_meta;
		bctl->sys.limit = limit_sys;
	}
	/* Start from the highest possible chunk key and iterate backwards. */
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		/* Pause only interrupts the relocation pass; cancel stops both. */
		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
		    atomic_read(&fs_info->balance_cancel_req)) {
			ret = -ECANCELED;
			goto error;
		}

		/*
		 * reclaim_bgs_lock is held across the search and the possible
		 * relocation below; every exit path out of this iteration
		 * must drop it.
		 */
		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto error;
		}

		/*
		 * this shouldn't happen, it means the last relocate
		 * failed
		 */
		if (ret == 0)
			BUG(); /* FIXME break ? */

		ret = btrfs_previous_item(chunk_root, path, 0,
					  BTRFS_CHUNK_ITEM_KEY);
		if (ret) {
			/* No more chunk items: the pass is complete. */
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			ret = 0;
			break;
		}

		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid != key.objectid) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			break;
		}

		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);

		if (!counting) {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.considered++;
			spin_unlock(&fs_info->balance_lock);
		}

		ret = should_balance_chunk(leaf, chunk, found_key.offset);

		btrfs_release_path(path);
		if (!ret) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto loop;
		}

		if (counting) {
			/* First pass: only count matching chunks per type. */
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			spin_lock(&fs_info->balance_lock);
			bctl->stat.expected++;
			spin_unlock(&fs_info->balance_lock);

			if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				count_data++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				count_sys++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				count_meta++;

			goto loop;
		}

		/*
		 * Apply limit_min filter, no need to check if the LIMITS
		 * filter is used, limit_min is 0 by default
		 */
		if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
					count_data < bctl->data.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
					count_meta < bctl->meta.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
					count_sys < bctl->sys.limit_min)) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto loop;
		}

		if (!chunk_reserved) {
			/*
			 * We may be relocating the only data chunk we have,
			 * which could potentially end up with losing data's
			 * raid profile, so lets allocate an empty one in
			 * advance.
			 */
			ret = btrfs_may_alloc_data_chunk(fs_info,
							 found_key.offset);
			if (ret < 0) {
				mutex_unlock(&fs_info->reclaim_bgs_lock);
				goto error;
			} else if (ret == 1) {
				chunk_reserved = 1;
			}
		}

		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
		mutex_unlock(&fs_info->reclaim_bgs_lock);
		if (ret == -ENOSPC) {
			/* Remember ENOSPC but keep going; reported at the end. */
			enospc_errors++;
		} else if (ret == -ETXTBSY) {
			/* Block group pinned by an active swapfile: skip it. */
			btrfs_info(fs_info,
	   "skipping relocation of block group %llu due to active swapfile",
				   found_key.offset);
			ret = 0;
		} else if (ret) {
			goto error;
		} else {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.completed++;
			spin_unlock(&fs_info->balance_lock);
		}
loop:
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}

	/* Counting pass finished: restart the walk as the relocation pass. */
	if (counting) {
		btrfs_release_path(path);
		counting = false;
		goto again;
	}
error:
	btrfs_free_path(path);
	if (enospc_errors) {
		btrfs_info(fs_info, "%d enospc errors during balance",
			   enospc_errors);
		if (!ret)
			ret = -ENOSPC;
	}

	return ret;
}
4099ec44a35cSChris Mason
410043dd529aSDavid Sterba /*
410143dd529aSDavid Sterba * See if a given profile is valid and reduced.
410243dd529aSDavid Sterba *
41030c460c0dSIlya Dryomov * @flags: profile to validate
41040c460c0dSIlya Dryomov * @extended: if true @flags is treated as an extended profile
41050c460c0dSIlya Dryomov */
alloc_profile_is_valid(u64 flags,int extended)41060c460c0dSIlya Dryomov static int alloc_profile_is_valid(u64 flags, int extended)
41070c460c0dSIlya Dryomov {
41080c460c0dSIlya Dryomov u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
41090c460c0dSIlya Dryomov BTRFS_BLOCK_GROUP_PROFILE_MASK);
41100c460c0dSIlya Dryomov
41110c460c0dSIlya Dryomov flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
41120c460c0dSIlya Dryomov
41130c460c0dSIlya Dryomov /* 1) check that all other bits are zeroed */
41140c460c0dSIlya Dryomov if (flags & ~mask)
41150c460c0dSIlya Dryomov return 0;
41160c460c0dSIlya Dryomov
41170c460c0dSIlya Dryomov /* 2) see if profile is reduced */
41180c460c0dSIlya Dryomov if (flags == 0)
41190c460c0dSIlya Dryomov return !extended; /* "0" is valid for usual profiles */
41200c460c0dSIlya Dryomov
4121c1499166SDavid Sterba return has_single_bit_set(flags);
41220c460c0dSIlya Dryomov }
41230c460c0dSIlya Dryomov
41245ba366c3SDavid Sterba /*
41255ba366c3SDavid Sterba * Validate target profile against allowed profiles and return true if it's OK.
41265ba366c3SDavid Sterba * Otherwise print the error message and return false.
41275ba366c3SDavid Sterba */
validate_convert_profile(struct btrfs_fs_info * fs_info,const struct btrfs_balance_args * bargs,u64 allowed,const char * type)41285ba366c3SDavid Sterba static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
41295ba366c3SDavid Sterba const struct btrfs_balance_args *bargs,
41305ba366c3SDavid Sterba u64 allowed, const char *type)
4131bdcd3c97SAlexandru Moise {
41325ba366c3SDavid Sterba if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
41335ba366c3SDavid Sterba return true;
41345ba366c3SDavid Sterba
41355ba366c3SDavid Sterba /* Profile is valid and does not have bits outside of the allowed set */
41365ba366c3SDavid Sterba if (alloc_profile_is_valid(bargs->target, 1) &&
41375ba366c3SDavid Sterba (bargs->target & ~allowed) == 0)
41385ba366c3SDavid Sterba return true;
41395ba366c3SDavid Sterba
41405ba366c3SDavid Sterba btrfs_err(fs_info, "balance: invalid convert %s profile %s",
41415ba366c3SDavid Sterba type, btrfs_bg_type_to_raid_name(bargs->target));
41425ba366c3SDavid Sterba return false;
4143bdcd3c97SAlexandru Moise }
4144bdcd3c97SAlexandru Moise
4145c9e9f97bSIlya Dryomov /*
414656fc37d9SAnand Jain * Fill @buf with textual description of balance filter flags @bargs, up to
414756fc37d9SAnand Jain * @size_buf including the terminating null. The output may be trimmed if it
414856fc37d9SAnand Jain * does not fit into the provided buffer.
414956fc37d9SAnand Jain */
static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
				  u32 size_buf)
{
	int ret;
	/* size_bp counts the space still free in buf; bp is the write cursor. */
	u32 size_bp = size_buf;
	char *bp = buf;
	u64 flags = bargs->flags;
	char tmp_buf[128] = {'\0'};

	if (!flags)
		return;

	/*
	 * The CHECK_APPEND_* helpers snprintf into the remaining space and
	 * jump to out_overflow on error or truncation, where the trailing
	 * separator is stripped.
	 */
#define CHECK_APPEND_NOARG(a)						\
	do {								\
		ret = snprintf(bp, size_bp, (a));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

#define CHECK_APPEND_1ARG(a, v1)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

#define CHECK_APPEND_2ARG(a, v1, v2)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1), (v2));		\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

	if (flags & BTRFS_BALANCE_ARGS_CONVERT)
		CHECK_APPEND_1ARG("convert=%s,",
				  btrfs_bg_type_to_raid_name(bargs->target));

	if (flags & BTRFS_BALANCE_ARGS_SOFT)
		CHECK_APPEND_NOARG("soft,");

	if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
		btrfs_describe_block_groups(bargs->profiles, tmp_buf,
					    sizeof(tmp_buf));
		CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
	}

	if (flags & BTRFS_BALANCE_ARGS_USAGE)
		CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);

	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
		CHECK_APPEND_2ARG("usage=%u..%u,",
				  bargs->usage_min, bargs->usage_max);

	if (flags & BTRFS_BALANCE_ARGS_DEVID)
		CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);

	if (flags & BTRFS_BALANCE_ARGS_DRANGE)
		CHECK_APPEND_2ARG("drange=%llu..%llu,",
				  bargs->pstart, bargs->pend);

	if (flags & BTRFS_BALANCE_ARGS_VRANGE)
		CHECK_APPEND_2ARG("vrange=%llu..%llu,",
				  bargs->vstart, bargs->vend);

	if (flags & BTRFS_BALANCE_ARGS_LIMIT)
		CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);

	if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
		CHECK_APPEND_2ARG("limit=%u..%u,",
				  bargs->limit_min, bargs->limit_max);

	if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
		CHECK_APPEND_2ARG("stripes=%u..%u,",
				  bargs->stripes_min, bargs->stripes_max);

#undef CHECK_APPEND_2ARG
#undef CHECK_APPEND_1ARG
#undef CHECK_APPEND_NOARG

out_overflow:

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
	else
		buf[0] = '\0';
}
424256fc37d9SAnand Jain
/*
 * Log a one-line, CLI-like description of the balance being started or
 * resumed ("balance: start -dusage=50 ..."), built from the filter flags
 * in fs_info->balance_ctl.  Allocation failure silently skips the message.
 */
static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
{
	u32 size_buf = 1024;
	char tmp_buf[192] = {'\0'};
	char *buf;
	char *bp;
	/* size_bp counts the space still free in buf; bp is the write cursor. */
	u32 size_bp = size_buf;
	int ret;
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;

	buf = kzalloc(size_buf, GFP_KERNEL);
	if (!buf)
		return;

	bp = buf;

	/* Append into the remaining space; jump to out_overflow on truncation. */
#define CHECK_APPEND_1ARG(a, v1)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

	if (bctl->flags & BTRFS_BALANCE_FORCE)
		CHECK_APPEND_1ARG("%s", "-f ");

	if (bctl->flags & BTRFS_BALANCE_DATA) {
		describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-d%s ", tmp_buf);
	}

	if (bctl->flags & BTRFS_BALANCE_METADATA) {
		describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-m%s ", tmp_buf);
	}

	if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
		describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-s%s ", tmp_buf);
	}

#undef CHECK_APPEND_1ARG

out_overflow:

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
	btrfs_info(fs_info, "balance: %s %s",
		   (bctl->flags & BTRFS_BALANCE_RESUME) ?
		   "resume" : "start", buf);

	kfree(buf);
}
429856fc37d9SAnand Jain
429956fc37d9SAnand Jain /*
4300dccdb07bSDavid Sterba * Should be called with balance mutexe held
4301c9e9f97bSIlya Dryomov */
/*
 * Execute a balance operation, either freshly started or resumed from a
 * paused state.
 *
 * @fs_info: the filesystem to balance
 * @bctl:    balance control with flags, filters and conversion targets; on
 *           failure before the operation starts it is freed here unless
 *           BTRFS_BALANCE_RESUME is set (then the paused state is reset
 *           instead)
 * @bargs:   if non-NULL, filled with the resulting balance args for the
 *           ioctl caller
 *
 * Returns 0 on success, -ECANCELED or -EINTR when the balance was canceled,
 * or another negative errno on failure.  Caller must hold
 * fs_info->balance_mutex; the mutex is dropped while the relocation work
 * itself (__btrfs_balance) runs.
 */
btrfs_balance(struct btrfs_fs_info * fs_info,struct btrfs_balance_control * bctl,struct btrfs_ioctl_balance_args * bargs)43026fcf6e2bSDavid Sterba int btrfs_balance(struct btrfs_fs_info *fs_info,
43036fcf6e2bSDavid Sterba struct btrfs_balance_control *bctl,
4304c9e9f97bSIlya Dryomov struct btrfs_ioctl_balance_args *bargs)
4305c9e9f97bSIlya Dryomov {
430614506127SAdam Borowski u64 meta_target, data_target;
4307f43ffb60SIlya Dryomov u64 allowed;
4308e4837f8fSIlya Dryomov int mixed = 0;
4309c9e9f97bSIlya Dryomov int ret;
43108dabb742SStefan Behrens u64 num_devices;
4311de98ced9SMiao Xie unsigned seq;
4312e62869beSAnand Jain bool reducing_redundancy;
4313b19c98f2SJosef Bacik bool paused = false;
4314081db89bSDavid Sterba int i;
4315c9e9f97bSIlya Dryomov
	/* Refuse to start if a shutdown, pause or cancel request is pending. */
4316837d5b6eSIlya Dryomov if (btrfs_fs_closing(fs_info) ||
4317a7e99c69SIlya Dryomov atomic_read(&fs_info->balance_pause_req) ||
4318726a3421SQu Wenruo btrfs_should_cancel_balance(fs_info)) {
4319c9e9f97bSIlya Dryomov ret = -EINVAL;
4320c9e9f97bSIlya Dryomov goto out;
4321c9e9f97bSIlya Dryomov }
4322c9e9f97bSIlya Dryomov
4323e4837f8fSIlya Dryomov allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4324e4837f8fSIlya Dryomov if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4325e4837f8fSIlya Dryomov mixed = 1;
4326e4837f8fSIlya Dryomov
4327f43ffb60SIlya Dryomov /*
4328f43ffb60SIlya Dryomov * In case of mixed groups both data and meta should be picked,
4329f43ffb60SIlya Dryomov * and identical options should be given for both of them.
4330f43ffb60SIlya Dryomov */
4331e4837f8fSIlya Dryomov allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4332e4837f8fSIlya Dryomov if (mixed && (bctl->flags & allowed)) {
4333f43ffb60SIlya Dryomov if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4334f43ffb60SIlya Dryomov !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4335f43ffb60SIlya Dryomov memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
43365d163e0eSJeff Mahoney btrfs_err(fs_info,
43376dac13f8SAnand Jain "balance: mixed groups data and metadata options must be the same");
4338f43ffb60SIlya Dryomov ret = -EINVAL;
4339f43ffb60SIlya Dryomov goto out;
4340f43ffb60SIlya Dryomov }
4341f43ffb60SIlya Dryomov }
4342f43ffb60SIlya Dryomov
4343b35cf1f0SJosef Bacik /*
4344b35cf1f0SJosef Bacik * rw_devices will not change at the moment, device add/delete/replace
4345c3e1f96cSGoldwyn Rodrigues * are exclusive
4346b35cf1f0SJosef Bacik */
4347b35cf1f0SJosef Bacik num_devices = fs_info->fs_devices->rw_devices;
4348fab27359SQu Wenruo
4349fab27359SQu Wenruo /*
4350fab27359SQu Wenruo * SINGLE profile on-disk has no profile bit, but in-memory we have a
4351fab27359SQu Wenruo * special bit for it, to make it easier to distinguish. Thus we need
4352fab27359SQu Wenruo * to set it manually, or balance would refuse the profile.
4353fab27359SQu Wenruo */
4354fab27359SQu Wenruo allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4355081db89bSDavid Sterba for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4356081db89bSDavid Sterba if (num_devices >= btrfs_raid_array[i].devs_min)
4357081db89bSDavid Sterba allowed |= btrfs_raid_array[i].bg_flag;
43581da73967SAnand Jain
43595ba366c3SDavid Sterba if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
43605ba366c3SDavid Sterba !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
43615ba366c3SDavid Sterba !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) {
4362e4d8ec0fSIlya Dryomov ret = -EINVAL;
4363e4d8ec0fSIlya Dryomov goto out;
4364e4d8ec0fSIlya Dryomov }
4365e4d8ec0fSIlya Dryomov
43666079e12cSDavid Sterba /*
43676079e12cSDavid Sterba * Allow to reduce metadata or system integrity only if force set for
43686079e12cSDavid Sterba * profiles with redundancy (copies, parity)
43696079e12cSDavid Sterba */
43706079e12cSDavid Sterba allowed = 0;
43716079e12cSDavid Sterba for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
43726079e12cSDavid Sterba if (btrfs_raid_array[i].ncopies >= 2 ||
43736079e12cSDavid Sterba btrfs_raid_array[i].tolerated_failures >= 1)
43746079e12cSDavid Sterba allowed |= btrfs_raid_array[i].bg_flag;
43756079e12cSDavid Sterba }
	/* Sample the avail_*_alloc_bits consistently under the profiles seqlock. */
4376de98ced9SMiao Xie do {
4377de98ced9SMiao Xie seq = read_seqbegin(&fs_info->profiles_lock);
4378de98ced9SMiao Xie
4379e4d8ec0fSIlya Dryomov if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4380e4d8ec0fSIlya Dryomov (fs_info->avail_system_alloc_bits & allowed) &&
4381e4d8ec0fSIlya Dryomov !(bctl->sys.target & allowed)) ||
4382e4d8ec0fSIlya Dryomov ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4383e4d8ec0fSIlya Dryomov (fs_info->avail_metadata_alloc_bits & allowed) &&
43845a8067c0SFilipe Manana !(bctl->meta.target & allowed)))
4385e62869beSAnand Jain reducing_redundancy = true;
43865a8067c0SFilipe Manana else
4387e62869beSAnand Jain reducing_redundancy = false;
43885a8067c0SFilipe Manana
43895a8067c0SFilipe Manana /* if we're not converting, the target field is uninitialized */
43905a8067c0SFilipe Manana meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
43915a8067c0SFilipe Manana bctl->meta.target : fs_info->avail_metadata_alloc_bits;
43925a8067c0SFilipe Manana data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
43935a8067c0SFilipe Manana bctl->data.target : fs_info->avail_data_alloc_bits;
43945a8067c0SFilipe Manana } while (read_seqretry(&fs_info->profiles_lock, seq));
43955a8067c0SFilipe Manana
4396e62869beSAnand Jain if (reducing_redundancy) {
4397e4d8ec0fSIlya Dryomov if (bctl->flags & BTRFS_BALANCE_FORCE) {
43985d163e0eSJeff Mahoney btrfs_info(fs_info,
4399e62869beSAnand Jain "balance: force reducing metadata redundancy");
4400e4d8ec0fSIlya Dryomov } else {
44015d163e0eSJeff Mahoney btrfs_err(fs_info,
4402e62869beSAnand Jain "balance: reduces metadata redundancy, use --force if you want this");
4403e4d8ec0fSIlya Dryomov ret = -EINVAL;
4404e4d8ec0fSIlya Dryomov goto out;
4405e4d8ec0fSIlya Dryomov }
4406e4d8ec0fSIlya Dryomov }
4407e4d8ec0fSIlya Dryomov
440814506127SAdam Borowski if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
440914506127SAdam Borowski btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4410ee592d07SSam Tygier btrfs_warn(fs_info,
44116dac13f8SAnand Jain "balance: metadata profile %s has lower redundancy than data profile %s",
4412158da513SDavid Sterba btrfs_bg_type_to_raid_name(meta_target),
4413158da513SDavid Sterba btrfs_bg_type_to_raid_name(data_target));
4414ee592d07SSam Tygier }
4415ee592d07SSam Tygier
	/*
	 * Persist the balance item; -EEXIST means one already exists on disk,
	 * which is only valid when resuming a paused balance.
	 */
44166bccf3abSJeff Mahoney ret = insert_balance_item(fs_info, bctl);
441759641015SIlya Dryomov if (ret && ret != -EEXIST)
44180940ebf6SIlya Dryomov goto out;
44190940ebf6SIlya Dryomov
442059641015SIlya Dryomov if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
442159641015SIlya Dryomov BUG_ON(ret == -EEXIST);
4422833aae18SDavid Sterba BUG_ON(fs_info->balance_ctl);
4423833aae18SDavid Sterba spin_lock(&fs_info->balance_lock);
4424833aae18SDavid Sterba fs_info->balance_ctl = bctl;
4425833aae18SDavid Sterba spin_unlock(&fs_info->balance_lock);
442659641015SIlya Dryomov } else {
442759641015SIlya Dryomov BUG_ON(ret != -EEXIST);
442859641015SIlya Dryomov spin_lock(&fs_info->balance_lock);
442959641015SIlya Dryomov update_balance_args(bctl);
443059641015SIlya Dryomov spin_unlock(&fs_info->balance_lock);
443159641015SIlya Dryomov }
4432c9e9f97bSIlya Dryomov
	/* Mark balance as running and drop the mutex while the work executes. */
44333009a62fSDavid Sterba ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
44343009a62fSDavid Sterba set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
443556fc37d9SAnand Jain describe_balance_start_or_resume(fs_info);
4436c9e9f97bSIlya Dryomov mutex_unlock(&fs_info->balance_mutex);
4437c9e9f97bSIlya Dryomov
4438c9e9f97bSIlya Dryomov ret = __btrfs_balance(fs_info);
4439c9e9f97bSIlya Dryomov
4440c9e9f97bSIlya Dryomov mutex_lock(&fs_info->balance_mutex);
4441efc0e69cSNikolay Borisov if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
44427333bd02SAnand Jain btrfs_info(fs_info, "balance: paused");
4443efc0e69cSNikolay Borisov btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
4444b19c98f2SJosef Bacik paused = true;
4445efc0e69cSNikolay Borisov }
444644d354abSQu Wenruo /*
444744d354abSQu Wenruo * Balance can be canceled by:
444844d354abSQu Wenruo *
444944d354abSQu Wenruo * - Regular cancel request
445044d354abSQu Wenruo * Then ret == -ECANCELED and balance_cancel_req > 0
445144d354abSQu Wenruo *
445244d354abSQu Wenruo * - Fatal signal to "btrfs" process
445344d354abSQu Wenruo * Either the signal caught by wait_reserve_ticket() and callers
445444d354abSQu Wenruo * got -EINTR, or caught by btrfs_should_cancel_balance() and
445544d354abSQu Wenruo * got -ECANCELED.
445644d354abSQu Wenruo * Either way, in this case balance_cancel_req = 0, and
445744d354abSQu Wenruo * ret == -EINTR or ret == -ECANCELED.
445844d354abSQu Wenruo *
445944d354abSQu Wenruo * So here we only check the return value to catch canceled balance.
446044d354abSQu Wenruo */
446144d354abSQu Wenruo else if (ret == -ECANCELED || ret == -EINTR)
44627333bd02SAnand Jain btrfs_info(fs_info, "balance: canceled");
44637333bd02SAnand Jain else
44647333bd02SAnand Jain btrfs_info(fs_info, "balance: ended with status: %d", ret);
44657333bd02SAnand Jain
44663009a62fSDavid Sterba clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4467c9e9f97bSIlya Dryomov
4468c9e9f97bSIlya Dryomov if (bargs) {
4469c9e9f97bSIlya Dryomov memset(bargs, 0, sizeof(*bargs));
4470008ef096SDavid Sterba btrfs_update_ioctl_balance_args(fs_info, bargs);
4471c9e9f97bSIlya Dryomov }
4472c9e9f97bSIlya Dryomov
4473b19c98f2SJosef Bacik /* We didn't pause, we can clean everything up. */
4474b19c98f2SJosef Bacik if (!paused) {
4475149196a2SDavid Sterba reset_balance_state(fs_info);
4476c3e1f96cSGoldwyn Rodrigues btrfs_exclop_finish(fs_info);
44773a01aa7aSIlya Dryomov }
44783a01aa7aSIlya Dryomov
4479837d5b6eSIlya Dryomov wake_up(&fs_info->balance_wait_q);
4480c9e9f97bSIlya Dryomov
4481c9e9f97bSIlya Dryomov return ret;
	/* Error before the balance started: reset resumed state or free bctl. */
4482c9e9f97bSIlya Dryomov out:
448359641015SIlya Dryomov if (bctl->flags & BTRFS_BALANCE_RESUME)
4484149196a2SDavid Sterba reset_balance_state(fs_info);
4485a17c95dfSDavid Sterba else
4486c9e9f97bSIlya Dryomov kfree(bctl);
4487c3e1f96cSGoldwyn Rodrigues btrfs_exclop_finish(fs_info);
4488a17c95dfSDavid Sterba
44898f18cf13SChris Mason return ret;
44908f18cf13SChris Mason }
44918f18cf13SChris Mason
balance_kthread(void * data)449259641015SIlya Dryomov static int balance_kthread(void *data)
449359641015SIlya Dryomov {
44942b6ba629SIlya Dryomov struct btrfs_fs_info *fs_info = data;
44959555c6c1SIlya Dryomov int ret = 0;
449659641015SIlya Dryomov
4497a690e5f2SNaohiro Aota sb_start_write(fs_info->sb);
449859641015SIlya Dryomov mutex_lock(&fs_info->balance_mutex);
449956fc37d9SAnand Jain if (fs_info->balance_ctl)
45006fcf6e2bSDavid Sterba ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
450159641015SIlya Dryomov mutex_unlock(&fs_info->balance_mutex);
4502a690e5f2SNaohiro Aota sb_end_write(fs_info->sb);
45032b6ba629SIlya Dryomov
450459641015SIlya Dryomov return ret;
450559641015SIlya Dryomov }
450659641015SIlya Dryomov
btrfs_resume_balance_async(struct btrfs_fs_info * fs_info)45072b6ba629SIlya Dryomov int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
45082b6ba629SIlya Dryomov {
45092b6ba629SIlya Dryomov struct task_struct *tsk;
45102b6ba629SIlya Dryomov
45111354e1a1SDavid Sterba mutex_lock(&fs_info->balance_mutex);
45122b6ba629SIlya Dryomov if (!fs_info->balance_ctl) {
45131354e1a1SDavid Sterba mutex_unlock(&fs_info->balance_mutex);
45142b6ba629SIlya Dryomov return 0;
45152b6ba629SIlya Dryomov }
45161354e1a1SDavid Sterba mutex_unlock(&fs_info->balance_mutex);
45172b6ba629SIlya Dryomov
45183cdde224SJeff Mahoney if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
45196dac13f8SAnand Jain btrfs_info(fs_info, "balance: resume skipped");
45202b6ba629SIlya Dryomov return 0;
45212b6ba629SIlya Dryomov }
45222b6ba629SIlya Dryomov
4523efc0e69cSNikolay Borisov spin_lock(&fs_info->super_lock);
4524efc0e69cSNikolay Borisov ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
4525efc0e69cSNikolay Borisov fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
4526efc0e69cSNikolay Borisov spin_unlock(&fs_info->super_lock);
452702ee654dSAnand Jain /*
452802ee654dSAnand Jain * A ro->rw remount sequence should continue with the paused balance
452902ee654dSAnand Jain * regardless of who pauses it, system or the user as of now, so set
453002ee654dSAnand Jain * the resume flag.
453102ee654dSAnand Jain */
453202ee654dSAnand Jain spin_lock(&fs_info->balance_lock);
453302ee654dSAnand Jain fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
453402ee654dSAnand Jain spin_unlock(&fs_info->balance_lock);
453502ee654dSAnand Jain
45362b6ba629SIlya Dryomov tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4537cd633972SSachin Kamat return PTR_ERR_OR_ZERO(tsk);
45382b6ba629SIlya Dryomov }
45392b6ba629SIlya Dryomov
/*
 * Recover a paused balance at mount time: read the balance item
 * (BTRFS_BALANCE_OBJECTID / BTRFS_TEMPORARY_ITEM_KEY) from the tree root
 * and recreate fs_info->balance_ctl in a paused state.
 *
 * @fs_info: the filesystem being mounted
 *
 * Returns 0 when there is no balance item or recovery succeeded, negative
 * errno on error.
 */
btrfs_recover_balance(struct btrfs_fs_info * fs_info)454068310a5eSIlya Dryomov int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
{454159641015SIlya Dryomov
454259641015SIlya Dryomov struct btrfs_balance_control *bctl;
454359641015SIlya Dryomov struct btrfs_balance_item *item;
454459641015SIlya Dryomov struct btrfs_disk_balance_args disk_bargs;
454559641015SIlya Dryomov struct btrfs_path *path;
454659641015SIlya Dryomov struct extent_buffer *leaf;
454759641015SIlya Dryomov struct btrfs_key key;
454859641015SIlya Dryomov int ret;
454959641015SIlya Dryomov
455059641015SIlya Dryomov path = btrfs_alloc_path();
455159641015SIlya Dryomov if (!path)
455259641015SIlya Dryomov return -ENOMEM;
455359641015SIlya Dryomov
455468310a5eSIlya Dryomov key.objectid = BTRFS_BALANCE_OBJECTID;
4555c479cb4fSDavid Sterba key.type = BTRFS_TEMPORARY_ITEM_KEY;
455668310a5eSIlya Dryomov key.offset = 0;
455768310a5eSIlya Dryomov
455868310a5eSIlya Dryomov ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
455968310a5eSIlya Dryomov if (ret < 0)
456068310a5eSIlya Dryomov goto out;
456168310a5eSIlya Dryomov if (ret > 0) { /* ret = -ENOENT; */
456268310a5eSIlya Dryomov ret = 0;
456368310a5eSIlya Dryomov goto out;
456468310a5eSIlya Dryomov }
456568310a5eSIlya Dryomov
456659641015SIlya Dryomov bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
456759641015SIlya Dryomov if (!bctl) {
456859641015SIlya Dryomov ret = -ENOMEM;
456959641015SIlya Dryomov goto out;
457059641015SIlya Dryomov }
457159641015SIlya Dryomov
457259641015SIlya Dryomov leaf = path->nodes[0];
457359641015SIlya Dryomov item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
457459641015SIlya Dryomov
	/* A recovered balance always continues as a resume. */
457568310a5eSIlya Dryomov bctl->flags = btrfs_balance_flags(leaf, item);
457668310a5eSIlya Dryomov bctl->flags |= BTRFS_BALANCE_RESUME;
457759641015SIlya Dryomov
457859641015SIlya Dryomov btrfs_balance_data(leaf, item, &disk_bargs);
457959641015SIlya Dryomov btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
458059641015SIlya Dryomov btrfs_balance_meta(leaf, item, &disk_bargs);
458159641015SIlya Dryomov btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
458259641015SIlya Dryomov btrfs_balance_sys(leaf, item, &disk_bargs);
458359641015SIlya Dryomov btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
458459641015SIlya Dryomov
4585eee95e3fSDavid Sterba /*
4586eee95e3fSDavid Sterba * This should never happen, as the paused balance state is recovered
4587eee95e3fSDavid Sterba * during mount without any chance of other exclusive ops to collide.
4588eee95e3fSDavid Sterba *
4589eee95e3fSDavid Sterba * This gives the exclusive op status to balance and keeps in paused
4590eee95e3fSDavid Sterba * state until user intervention (cancel or umount). If the ownership
4591eee95e3fSDavid Sterba * cannot be assigned, show a message but do not fail. The balance
4592eee95e3fSDavid Sterba * is in a paused state and must have fs_info::balance_ctl properly
4593eee95e3fSDavid Sterba * set up.
4594eee95e3fSDavid Sterba */
4595efc0e69cSNikolay Borisov if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED))
4596eee95e3fSDavid Sterba btrfs_warn(fs_info,
45976dac13f8SAnand Jain "balance: cannot set exclusive op status, resume manually");
4598ed0fb78fSIlya Dryomov
4599fb286100SJosef Bacik btrfs_release_path(path);
4600fb286100SJosef Bacik
460168310a5eSIlya Dryomov mutex_lock(&fs_info->balance_mutex);
4602833aae18SDavid Sterba BUG_ON(fs_info->balance_ctl);
4603833aae18SDavid Sterba spin_lock(&fs_info->balance_lock);
4604833aae18SDavid Sterba fs_info->balance_ctl = bctl;
4605833aae18SDavid Sterba spin_unlock(&fs_info->balance_lock);
460668310a5eSIlya Dryomov mutex_unlock(&fs_info->balance_mutex);
460759641015SIlya Dryomov out:
460859641015SIlya Dryomov btrfs_free_path(path);
460959641015SIlya Dryomov return ret;
461059641015SIlya Dryomov }
461159641015SIlya Dryomov
/*
 * Request a pause of a running balance and wait until the relocation loop
 * has actually stopped (BTRFS_FS_BALANCE_RUNNING cleared).
 *
 * @fs_info: the filesystem whose balance is paused
 *
 * Returns 0 if a running balance was paused, -ENOTCONN when there is no
 * balance control or the balance is not currently running.
 */
btrfs_pause_balance(struct btrfs_fs_info * fs_info)4612837d5b6eSIlya Dryomov int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
{4613837d5b6eSIlya Dryomov
4614837d5b6eSIlya Dryomov int ret = 0;
4615837d5b6eSIlya Dryomov
4616837d5b6eSIlya Dryomov mutex_lock(&fs_info->balance_mutex);
4617837d5b6eSIlya Dryomov if (!fs_info->balance_ctl) {
4618837d5b6eSIlya Dryomov mutex_unlock(&fs_info->balance_mutex);
4619837d5b6eSIlya Dryomov return -ENOTCONN;
4620837d5b6eSIlya Dryomov }
4621837d5b6eSIlya Dryomov
46223009a62fSDavid Sterba if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4623837d5b6eSIlya Dryomov atomic_inc(&fs_info->balance_pause_req);
4624837d5b6eSIlya Dryomov mutex_unlock(&fs_info->balance_mutex);
4625837d5b6eSIlya Dryomov
	/* Wait (without the mutex) until the relocation loop observes the request. */
4626837d5b6eSIlya Dryomov wait_event(fs_info->balance_wait_q,
46273009a62fSDavid Sterba !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4628837d5b6eSIlya Dryomov
4629837d5b6eSIlya Dryomov mutex_lock(&fs_info->balance_mutex);
4630837d5b6eSIlya Dryomov /* we are good with balance_ctl ripped off from under us */
46313009a62fSDavid Sterba BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4632837d5b6eSIlya Dryomov atomic_dec(&fs_info->balance_pause_req);
4633837d5b6eSIlya Dryomov } else {
4634837d5b6eSIlya Dryomov ret = -ENOTCONN;
4635837d5b6eSIlya Dryomov }
4636837d5b6eSIlya Dryomov
4637837d5b6eSIlya Dryomov mutex_unlock(&fs_info->balance_mutex);
4638837d5b6eSIlya Dryomov return ret;
4639837d5b6eSIlya Dryomov }
4640837d5b6eSIlya Dryomov
/*
 * Cancel a running or paused balance.
 *
 * @fs_info: the filesystem whose balance is canceled
 *
 * A running balance is signaled via balance_cancel_req and waited for; a
 * paused balance has its state reset here directly.  Returns 0 on success,
 * -ENOTCONN when no balance exists, -EROFS on a read-only mount (canceling
 * would delete the on-disk item that makes the balance resumable).
 */
btrfs_cancel_balance(struct btrfs_fs_info * fs_info)4641a7e99c69SIlya Dryomov int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{4642a7e99c69SIlya Dryomov
4643a7e99c69SIlya Dryomov mutex_lock(&fs_info->balance_mutex);
4644a7e99c69SIlya Dryomov if (!fs_info->balance_ctl) {
4645a7e99c69SIlya Dryomov mutex_unlock(&fs_info->balance_mutex);
4646a7e99c69SIlya Dryomov return -ENOTCONN;
4647a7e99c69SIlya Dryomov }
4648a7e99c69SIlya Dryomov
4649cf7d20f4SDavid Sterba /*
4650cf7d20f4SDavid Sterba * A paused balance with the item stored on disk can be resumed at
4651cf7d20f4SDavid Sterba * mount time if the mount is read-write. Otherwise it's still paused
4652cf7d20f4SDavid Sterba * and we must not allow cancelling as it deletes the item.
4653cf7d20f4SDavid Sterba */
4654cf7d20f4SDavid Sterba if (sb_rdonly(fs_info->sb)) {
4655cf7d20f4SDavid Sterba mutex_unlock(&fs_info->balance_mutex);
4656cf7d20f4SDavid Sterba return -EROFS;
4657cf7d20f4SDavid Sterba }
4658cf7d20f4SDavid Sterba
4659a7e99c69SIlya Dryomov atomic_inc(&fs_info->balance_cancel_req);
4660a7e99c69SIlya Dryomov /*
4661a7e99c69SIlya Dryomov * if we are running just wait and return, balance item is
4662a7e99c69SIlya Dryomov * deleted in btrfs_balance in this case
4663a7e99c69SIlya Dryomov */
46643009a62fSDavid Sterba if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4665a7e99c69SIlya Dryomov mutex_unlock(&fs_info->balance_mutex);
4666a7e99c69SIlya Dryomov wait_event(fs_info->balance_wait_q,
46673009a62fSDavid Sterba !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4668a7e99c69SIlya Dryomov mutex_lock(&fs_info->balance_mutex);
4669a7e99c69SIlya Dryomov } else {
4670a7e99c69SIlya Dryomov mutex_unlock(&fs_info->balance_mutex);
4671dccdb07bSDavid Sterba /*
4672dccdb07bSDavid Sterba * Lock released to allow other waiters to continue, we'll
4673dccdb07bSDavid Sterba * reexamine the status again.
4674dccdb07bSDavid Sterba */
4675a7e99c69SIlya Dryomov mutex_lock(&fs_info->balance_mutex);
4676a7e99c69SIlya Dryomov
	/* Paused balance: nobody else cleans up, so reset the state here. */
4677a17c95dfSDavid Sterba if (fs_info->balance_ctl) {
4678149196a2SDavid Sterba reset_balance_state(fs_info);
4679c3e1f96cSGoldwyn Rodrigues btrfs_exclop_finish(fs_info);
46806dac13f8SAnand Jain btrfs_info(fs_info, "balance: canceled");
4681a17c95dfSDavid Sterba }
4682a7e99c69SIlya Dryomov }
4683a7e99c69SIlya Dryomov
468429eefa6dSxiaoshoukui ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4685a7e99c69SIlya Dryomov atomic_dec(&fs_info->balance_cancel_req);
4686a7e99c69SIlya Dryomov mutex_unlock(&fs_info->balance_mutex);
4687a7e99c69SIlya Dryomov return 0;
4688a7e99c69SIlya Dryomov }
4689a7e99c69SIlya Dryomov
/*
 * Worker that walks every root item in the root tree and inserts missing
 * UUID tree entries (subvolume uuid and received_uuid mappings).
 *
 * @data: the btrfs_fs_info of the filesystem to scan
 *
 * Stops early when the filesystem is closing.  Always returns 0; failures
 * are only logged.  Releases fs_info->uuid_tree_rescan_sem before
 * returning, and sets BTRFS_FS_UPDATE_UUID_TREE_GEN when the full scan
 * completed successfully.
 */
btrfs_uuid_scan_kthread(void * data)469097f4dd09SNikolay Borisov int btrfs_uuid_scan_kthread(void *data)
{4691803b2f54SStefan Behrens
4692803b2f54SStefan Behrens struct btrfs_fs_info *fs_info = data;
4693803b2f54SStefan Behrens struct btrfs_root *root = fs_info->tree_root;
4694803b2f54SStefan Behrens struct btrfs_key key;
4695803b2f54SStefan Behrens struct btrfs_path *path = NULL;
4696803b2f54SStefan Behrens int ret = 0;
4697803b2f54SStefan Behrens struct extent_buffer *eb;
4698803b2f54SStefan Behrens int slot;
4699803b2f54SStefan Behrens struct btrfs_root_item root_item;
4700803b2f54SStefan Behrens u32 item_size;
4701f45388f3SFilipe David Borba Manana struct btrfs_trans_handle *trans = NULL;
4702c94bec2cSJosef Bacik bool closing = false;
4703803b2f54SStefan Behrens
4704803b2f54SStefan Behrens path = btrfs_alloc_path();
4705803b2f54SStefan Behrens if (!path) {
4706803b2f54SStefan Behrens ret = -ENOMEM;
4707803b2f54SStefan Behrens goto out;
4708803b2f54SStefan Behrens }
4709803b2f54SStefan Behrens
4710803b2f54SStefan Behrens key.objectid = 0;
4711803b2f54SStefan Behrens key.type = BTRFS_ROOT_ITEM_KEY;
4712803b2f54SStefan Behrens key.offset = 0;
4713803b2f54SStefan Behrens
	/* Walk all root items, advancing the search key as we go. */
4714803b2f54SStefan Behrens while (1) {
4715c94bec2cSJosef Bacik if (btrfs_fs_closing(fs_info)) {
4716c94bec2cSJosef Bacik closing = true;
4717c94bec2cSJosef Bacik break;
4718c94bec2cSJosef Bacik }
47197c829b72SAnand Jain ret = btrfs_search_forward(root, &key, path,
47207c829b72SAnand Jain BTRFS_OLDEST_GENERATION);
4721803b2f54SStefan Behrens if (ret) {
4722803b2f54SStefan Behrens if (ret > 0)
4723803b2f54SStefan Behrens ret = 0;
4724803b2f54SStefan Behrens break;
4725803b2f54SStefan Behrens }
4726803b2f54SStefan Behrens
4727803b2f54SStefan Behrens if (key.type != BTRFS_ROOT_ITEM_KEY ||
4728803b2f54SStefan Behrens (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4729803b2f54SStefan Behrens key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4730803b2f54SStefan Behrens key.objectid > BTRFS_LAST_FREE_OBJECTID)
4731803b2f54SStefan Behrens goto skip;
4732803b2f54SStefan Behrens
4733803b2f54SStefan Behrens eb = path->nodes[0];
4734803b2f54SStefan Behrens slot = path->slots[0];
47353212fa14SJosef Bacik item_size = btrfs_item_size(eb, slot);
4736803b2f54SStefan Behrens if (item_size < sizeof(root_item))
4737803b2f54SStefan Behrens goto skip;
4738803b2f54SStefan Behrens
4739803b2f54SStefan Behrens read_extent_buffer(eb, &root_item,
4740803b2f54SStefan Behrens btrfs_item_ptr_offset(eb, slot),
4741803b2f54SStefan Behrens (int)sizeof(root_item));
4742803b2f54SStefan Behrens if (btrfs_root_refs(&root_item) == 0)
4743803b2f54SStefan Behrens goto skip;
4744f45388f3SFilipe David Borba Manana
	/* Only roots with a uuid or received_uuid need uuid tree entries. */
4745f45388f3SFilipe David Borba Manana if (!btrfs_is_empty_uuid(root_item.uuid) ||
4746f45388f3SFilipe David Borba Manana !btrfs_is_empty_uuid(root_item.received_uuid)) {
4747f45388f3SFilipe David Borba Manana if (trans)
4748f45388f3SFilipe David Borba Manana goto update_tree;
4749f45388f3SFilipe David Borba Manana
4750f45388f3SFilipe David Borba Manana btrfs_release_path(path);
4751803b2f54SStefan Behrens /*
4752803b2f54SStefan Behrens * 1 - subvol uuid item
4753803b2f54SStefan Behrens * 1 - received_subvol uuid item
4754803b2f54SStefan Behrens */
4755803b2f54SStefan Behrens trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4756803b2f54SStefan Behrens if (IS_ERR(trans)) {
4757803b2f54SStefan Behrens ret = PTR_ERR(trans);
4758803b2f54SStefan Behrens break;
4759803b2f54SStefan Behrens }
	/* Re-run the search with the transaction held before updating. */
4760f45388f3SFilipe David Borba Manana continue;
4761f45388f3SFilipe David Borba Manana } else {
4762f45388f3SFilipe David Borba Manana goto skip;
4763f45388f3SFilipe David Borba Manana }
4764f45388f3SFilipe David Borba Manana update_tree:
47659771a5cfSJosef Bacik btrfs_release_path(path);
4766f45388f3SFilipe David Borba Manana if (!btrfs_is_empty_uuid(root_item.uuid)) {
4767cdb345a8SLu Fengqi ret = btrfs_uuid_tree_add(trans, root_item.uuid,
4768803b2f54SStefan Behrens BTRFS_UUID_KEY_SUBVOL,
4769803b2f54SStefan Behrens key.objectid);
4770803b2f54SStefan Behrens if (ret < 0) {
4771efe120a0SFrank Holton btrfs_warn(fs_info, "uuid_tree_add failed %d",
4772803b2f54SStefan Behrens ret);
4773803b2f54SStefan Behrens break;
4774803b2f54SStefan Behrens }
4775803b2f54SStefan Behrens }
4776803b2f54SStefan Behrens
4777803b2f54SStefan Behrens if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4778cdb345a8SLu Fengqi ret = btrfs_uuid_tree_add(trans,
4779803b2f54SStefan Behrens root_item.received_uuid,
4780803b2f54SStefan Behrens BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4781803b2f54SStefan Behrens key.objectid);
4782803b2f54SStefan Behrens if (ret < 0) {
4783efe120a0SFrank Holton btrfs_warn(fs_info, "uuid_tree_add failed %d",
4784803b2f54SStefan Behrens ret);
4785803b2f54SStefan Behrens break;
4786803b2f54SStefan Behrens }
4787803b2f54SStefan Behrens }
4788803b2f54SStefan Behrens
4789f45388f3SFilipe David Borba Manana skip:
47909771a5cfSJosef Bacik btrfs_release_path(path);
4791803b2f54SStefan Behrens if (trans) {
47923a45bb20SJeff Mahoney ret = btrfs_end_transaction(trans);
4793f45388f3SFilipe David Borba Manana trans = NULL;
4794803b2f54SStefan Behrens if (ret)
4795803b2f54SStefan Behrens break;
4796803b2f54SStefan Behrens }
4797803b2f54SStefan Behrens
	/* Advance the search key: offset first, then type, then objectid. */
4798803b2f54SStefan Behrens if (key.offset < (u64)-1) {
4799803b2f54SStefan Behrens key.offset++;
4800803b2f54SStefan Behrens } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4801803b2f54SStefan Behrens key.offset = 0;
4802803b2f54SStefan Behrens key.type = BTRFS_ROOT_ITEM_KEY;
4803803b2f54SStefan Behrens } else if (key.objectid < (u64)-1) {
4804803b2f54SStefan Behrens key.offset = 0;
4805803b2f54SStefan Behrens key.type = BTRFS_ROOT_ITEM_KEY;
4806803b2f54SStefan Behrens key.objectid++;
4807803b2f54SStefan Behrens } else {
4808803b2f54SStefan Behrens break;
4809803b2f54SStefan Behrens }
4810803b2f54SStefan Behrens cond_resched();
4811803b2f54SStefan Behrens }
4812803b2f54SStefan Behrens
4813803b2f54SStefan Behrens out:
4814803b2f54SStefan Behrens btrfs_free_path(path);
4815f45388f3SFilipe David Borba Manana if (trans && !IS_ERR(trans))
48163a45bb20SJeff Mahoney btrfs_end_transaction(trans);
4817803b2f54SStefan Behrens if (ret)
4818efe120a0SFrank Holton btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4819c94bec2cSJosef Bacik else if (!closing)
4820afcdd129SJosef Bacik set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4821803b2f54SStefan Behrens up(&fs_info->uuid_tree_rescan_sem);
4822803b2f54SStefan Behrens return 0;
4823803b2f54SStefan Behrens }
4824803b2f54SStefan Behrens
btrfs_create_uuid_tree(struct btrfs_fs_info * fs_info)4825f7a81ea4SStefan Behrens int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4826f7a81ea4SStefan Behrens {
4827f7a81ea4SStefan Behrens struct btrfs_trans_handle *trans;
4828f7a81ea4SStefan Behrens struct btrfs_root *tree_root = fs_info->tree_root;
4829f7a81ea4SStefan Behrens struct btrfs_root *uuid_root;
4830803b2f54SStefan Behrens struct task_struct *task;
4831803b2f54SStefan Behrens int ret;
4832f7a81ea4SStefan Behrens
4833f7a81ea4SStefan Behrens /*
4834f7a81ea4SStefan Behrens * 1 - root node
4835f7a81ea4SStefan Behrens * 1 - root item
4836f7a81ea4SStefan Behrens */
4837f7a81ea4SStefan Behrens trans = btrfs_start_transaction(tree_root, 2);
4838f7a81ea4SStefan Behrens if (IS_ERR(trans))
4839f7a81ea4SStefan Behrens return PTR_ERR(trans);
4840f7a81ea4SStefan Behrens
48419b7a2440SDavid Sterba uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4842f7a81ea4SStefan Behrens if (IS_ERR(uuid_root)) {
48436d13f549SDavid Sterba ret = PTR_ERR(uuid_root);
484466642832SJeff Mahoney btrfs_abort_transaction(trans, ret);
48453a45bb20SJeff Mahoney btrfs_end_transaction(trans);
48466d13f549SDavid Sterba return ret;
4847f7a81ea4SStefan Behrens }
4848f7a81ea4SStefan Behrens
4849f7a81ea4SStefan Behrens fs_info->uuid_root = uuid_root;
4850f7a81ea4SStefan Behrens
48513a45bb20SJeff Mahoney ret = btrfs_commit_transaction(trans);
4852803b2f54SStefan Behrens if (ret)
4853803b2f54SStefan Behrens return ret;
4854803b2f54SStefan Behrens
4855803b2f54SStefan Behrens down(&fs_info->uuid_tree_rescan_sem);
4856803b2f54SStefan Behrens task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4857803b2f54SStefan Behrens if (IS_ERR(task)) {
485870f80175SStefan Behrens /* fs_info->update_uuid_tree_gen remains 0 in all error case */
4859efe120a0SFrank Holton btrfs_warn(fs_info, "failed to start uuid_scan task");
4860803b2f54SStefan Behrens up(&fs_info->uuid_tree_rescan_sem);
4861803b2f54SStefan Behrens return PTR_ERR(task);
4862f7a81ea4SStefan Behrens }
4863803b2f54SStefan Behrens
4864803b2f54SStefan Behrens return 0;
4865803b2f54SStefan Behrens }
4866803b2f54SStefan Behrens
48678f18cf13SChris Mason /*
48688f18cf13SChris Mason * shrinking a device means finding all of the device extents past
48698f18cf13SChris Mason * the new size, and then following the back refs to the chunks.
48708f18cf13SChris Mason * The chunk relocation code actually frees the device extent
48718f18cf13SChris Mason */
/*
 * Shrink a device's usable size down to @new_size (rounded down to the
 * filesystem sector size), relocating every chunk that has a device extent
 * lying beyond the new boundary.
 *
 * The in-memory total_bytes of the device is lowered up front (under
 * chunk_mutex) so the allocator stops handing out space past @new_size
 * while the relocation loop runs.  Relocation is best-effort per chunk:
 * -ENOSPC failures are counted and the whole device-extent scan is retried
 * once before giving up.  On any failure the old in-memory size and the
 * free space accounting are restored.
 *
 * Returns 0 on success or a negative errno on failure.
 */
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 length;
	u64 chunk_offset;
	int ret;
	int slot;
	int failed = 0;
	bool retried = false;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	u64 old_total = btrfs_super_total_bytes(super_copy);
	u64 old_size = btrfs_device_get_total_bytes(device);
	u64 diff;
	u64 start;

	new_size = round_down(new_size, fs_info->sectorsize);
	start = new_size;
	/* Number of bytes we are giving back, sector aligned. */
	diff = round_down(old_size - new_size, fs_info->sectorsize);

	/* A replace target device must not be resized. */
	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return -EINVAL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* We walk dev extents from the end backwards, so read ahead backwards. */
	path->reada = READA_BACK;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	mutex_lock(&fs_info->chunk_mutex);

	/* Shrink the in-memory size first so no new allocation goes past it. */
	btrfs_device_set_total_bytes(device, new_size);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		device->fs_devices->total_rw_bytes -= diff;
		atomic64_sub(diff, &fs_info->free_chunk_space);
	}

	/*
	 * Once the device's size has been set to the new size, ensure all
	 * in-memory chunks are synced to disk so that the loop below sees them
	 * and relocates them accordingly.
	 */
	if (contains_pending_extent(device, &start, diff)) {
		mutex_unlock(&fs_info->chunk_mutex);
		ret = btrfs_commit_transaction(trans);
		if (ret)
			goto done;
	} else {
		mutex_unlock(&fs_info->chunk_mutex);
		btrfs_end_transaction(trans);
	}

again:
	/* Scan this device's extents from the highest offset downwards. */
	key.objectid = device->devid;
	key.offset = (u64)-1;
	key.type = BTRFS_DEV_EXTENT_KEY;

	do {
		/* Serializes against background block group reclaim. */
		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto done;
		}

		ret = btrfs_previous_item(root, path, 0, key.type);
		if (ret) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			if (ret < 0)
				goto done;
			/* No more dev extent items at all: we are finished. */
			ret = 0;
			btrfs_release_path(path);
			break;
		}

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, path->slots[0]);

		/* Walked past this device's extents: done. */
		if (key.objectid != device->devid) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			btrfs_release_path(path);
			break;
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		/*
		 * This extent ends at or before the new size; since we scan
		 * from highest offset, everything remaining also fits.
		 */
		if (key.offset + length <= new_size) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			btrfs_release_path(path);
			break;
		}

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
		btrfs_release_path(path);

		/*
		 * We may be relocating the only data chunk we have,
		 * which could potentially end up with losing data's
		 * raid profile, so lets allocate an empty one in
		 * advance.
		 */
		ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto done;
		}

		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
		mutex_unlock(&fs_info->reclaim_bgs_lock);
		if (ret == -ENOSPC) {
			/* Count it and retry the whole scan once below. */
			failed++;
		} else if (ret) {
			if (ret == -ETXTBSY) {
				btrfs_warn(fs_info,
		   "could not shrink block group %llu due to active swapfile",
					   chunk_offset);
			}
			goto done;
		}
	} while (key.offset-- > 0);

	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (failed && retried) {
		ret = -ENOSPC;
		goto done;
	}

	/* Shrinking succeeded, else we would be at "done". */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto done;
	}

	mutex_lock(&fs_info->chunk_mutex);
	/* Clear all state bits beyond the shrunk device size */
	clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
			  CHUNK_STATE_MASK);

	btrfs_device_set_disk_total_bytes(device, new_size);
	if (list_empty(&device->post_commit_list))
		list_add_tail(&device->post_commit_list,
			      &trans->transaction->dev_update_list);

	WARN_ON(diff > old_total);
	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total - diff, fs_info->sectorsize));
	mutex_unlock(&fs_info->chunk_mutex);

	btrfs_reserve_chunk_metadata(trans, false);
	/* Now btrfs_update_device() will change the on-disk size. */
	ret = btrfs_update_device(trans, device);
	btrfs_trans_release_chunk_metadata(trans);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	} else {
		ret = btrfs_commit_transaction(trans);
	}
done:
	btrfs_free_path(path);
	if (ret) {
		/* Undo the early in-memory shrink done at the top. */
		mutex_lock(&fs_info->chunk_mutex);
		btrfs_device_set_total_bytes(device, old_size);
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
			device->fs_devices->total_rw_bytes += diff;
		atomic64_add(diff, &fs_info->free_chunk_space);
		mutex_unlock(&fs_info->chunk_mutex);
	}
	return ret;
}
50598f18cf13SChris Mason
btrfs_add_system_chunk(struct btrfs_fs_info * fs_info,struct btrfs_key * key,struct btrfs_chunk * chunk,int item_size)50602ff7e61eSJeff Mahoney static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
50610b86a832SChris Mason struct btrfs_key *key,
50620b86a832SChris Mason struct btrfs_chunk *chunk, int item_size)
50630b86a832SChris Mason {
50640b246afaSJeff Mahoney struct btrfs_super_block *super_copy = fs_info->super_copy;
50650b86a832SChris Mason struct btrfs_disk_key disk_key;
50660b86a832SChris Mason u32 array_size;
50670b86a832SChris Mason u8 *ptr;
50680b86a832SChris Mason
506979bd3712SFilipe Manana lockdep_assert_held(&fs_info->chunk_mutex);
507079bd3712SFilipe Manana
50710b86a832SChris Mason array_size = btrfs_super_sys_array_size(super_copy);
50725f43f86eSGui Hecheng if (array_size + item_size + sizeof(disk_key)
507379bd3712SFilipe Manana > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
50740b86a832SChris Mason return -EFBIG;
50750b86a832SChris Mason
50760b86a832SChris Mason ptr = super_copy->sys_chunk_array + array_size;
50770b86a832SChris Mason btrfs_cpu_key_to_disk(&disk_key, key);
50780b86a832SChris Mason memcpy(ptr, &disk_key, sizeof(disk_key));
50790b86a832SChris Mason ptr += sizeof(disk_key);
50800b86a832SChris Mason memcpy(ptr, chunk, item_size);
50810b86a832SChris Mason item_size += sizeof(disk_key);
50820b86a832SChris Mason btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
5083fe48a5c0SMiao Xie
50840b86a832SChris Mason return 0;
50850b86a832SChris Mason }
50860b86a832SChris Mason
50879f680ce0SChris Mason /*
508873c5de00SArne Jansen * sort the devices in descending order by max_avail, total_avail
50899f680ce0SChris Mason */
btrfs_cmp_device_info(const void * a,const void * b)509073c5de00SArne Jansen static int btrfs_cmp_device_info(const void *a, const void *b)
50912b82032cSYan Zheng {
509273c5de00SArne Jansen const struct btrfs_device_info *di_a = a;
509373c5de00SArne Jansen const struct btrfs_device_info *di_b = b;
50942b82032cSYan Zheng
509573c5de00SArne Jansen if (di_a->max_avail > di_b->max_avail)
5096a40a90a0SChris Mason return -1;
509773c5de00SArne Jansen if (di_a->max_avail < di_b->max_avail)
50989b3f68b9SChris Mason return 1;
509973c5de00SArne Jansen if (di_a->total_avail > di_b->total_avail)
510073c5de00SArne Jansen return -1;
510173c5de00SArne Jansen if (di_a->total_avail < di_b->total_avail)
510273c5de00SArne Jansen return 1;
5103b2117a39SMiao Xie return 0;
5104b2117a39SMiao Xie }
5105b2117a39SMiao Xie
/* Set the RAID56 incompat bit when a raid5/raid6 block group is created. */
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
	if (type & BTRFS_BLOCK_GROUP_RAID56_MASK)
		btrfs_set_fs_incompat(info, RAID56);
}
511353b381b3SDavid Woodhouse
/* Set the RAID1C34 incompat bit when a raid1c3/raid1c4 block group is created. */
static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
	if (type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))
		btrfs_set_fs_incompat(info, RAID1C34);
}
5121cfbb825cSDavid Sterba
/*
 * Structure used internally for btrfs_create_chunk() function.
 * Wraps needed parameters.
 */
struct alloc_chunk_ctl {
	/* Logical start address for the new chunk */
	u64 start;
	/* Block group type flags (data/metadata/system + raid profile) */
	u64 type;
	/* Total number of stripes to allocate */
	int num_stripes;
	/* sub_stripes info for map */
	int sub_stripes;
	/* Stripes per device */
	int dev_stripes;
	/* Maximum number of devices to use */
	int devs_max;
	/* Minimum number of devices to use */
	int devs_min;
	/* ndevs has to be a multiple of this */
	int devs_increment;
	/* Number of copies */
	int ncopies;
	/* Number of stripes worth of bytes to store parity information */
	int nparity;
	/* Upper bound on the per-device stripe size */
	u64 max_stripe_size;
	/* Upper bound on the chunk's logical size */
	u64 max_chunk_size;
	/* Minimum free space a device must have to be considered */
	u64 dev_extent_min;
	/* Result: chosen per-device stripe size */
	u64 stripe_size;
	/* Result: chosen chunk logical size */
	u64 chunk_size;
	/* Result: number of usable devices found by gather_device_info() */
	int ndevs;
};
51524f2bafe8SNaohiro Aota
init_alloc_chunk_ctl_policy_regular(struct btrfs_fs_devices * fs_devices,struct alloc_chunk_ctl * ctl)515327c314d5SNaohiro Aota static void init_alloc_chunk_ctl_policy_regular(
515427c314d5SNaohiro Aota struct btrfs_fs_devices *fs_devices,
515527c314d5SNaohiro Aota struct alloc_chunk_ctl *ctl)
515627c314d5SNaohiro Aota {
5157f6fca391SStefan Roesch struct btrfs_space_info *space_info;
515827c314d5SNaohiro Aota
5159f6fca391SStefan Roesch space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type);
5160f6fca391SStefan Roesch ASSERT(space_info);
5161f6fca391SStefan Roesch
5162f6fca391SStefan Roesch ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
51638a540e99SZygo Blaxell ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G);
5164f6fca391SStefan Roesch
5165f6fca391SStefan Roesch if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
5166f6fca391SStefan Roesch ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
516727c314d5SNaohiro Aota
516827c314d5SNaohiro Aota /* We don't want a chunk larger than 10% of writable space */
5169428c8e03SDavid Sterba ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10),
517027c314d5SNaohiro Aota ctl->max_chunk_size);
5171cb091225SQu Wenruo ctl->dev_extent_min = btrfs_stripe_nr_to_offset(ctl->dev_stripes);
517227c314d5SNaohiro Aota }
517327c314d5SNaohiro Aota
init_alloc_chunk_ctl_policy_zoned(struct btrfs_fs_devices * fs_devices,struct alloc_chunk_ctl * ctl)51741cd6121fSNaohiro Aota static void init_alloc_chunk_ctl_policy_zoned(
51751cd6121fSNaohiro Aota struct btrfs_fs_devices *fs_devices,
51761cd6121fSNaohiro Aota struct alloc_chunk_ctl *ctl)
51771cd6121fSNaohiro Aota {
51781cd6121fSNaohiro Aota u64 zone_size = fs_devices->fs_info->zone_size;
51791cd6121fSNaohiro Aota u64 limit;
51801cd6121fSNaohiro Aota int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
51811cd6121fSNaohiro Aota int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
51821cd6121fSNaohiro Aota u64 min_chunk_size = min_data_stripes * zone_size;
51831cd6121fSNaohiro Aota u64 type = ctl->type;
51841cd6121fSNaohiro Aota
51851cd6121fSNaohiro Aota ctl->max_stripe_size = zone_size;
51861cd6121fSNaohiro Aota if (type & BTRFS_BLOCK_GROUP_DATA) {
51871cd6121fSNaohiro Aota ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
51881cd6121fSNaohiro Aota zone_size);
51891cd6121fSNaohiro Aota } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
51901cd6121fSNaohiro Aota ctl->max_chunk_size = ctl->max_stripe_size;
51911cd6121fSNaohiro Aota } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
51921cd6121fSNaohiro Aota ctl->max_chunk_size = 2 * ctl->max_stripe_size;
51931cd6121fSNaohiro Aota ctl->devs_max = min_t(int, ctl->devs_max,
51941cd6121fSNaohiro Aota BTRFS_MAX_DEVS_SYS_CHUNK);
5195bb05b298SArnd Bergmann } else {
5196bb05b298SArnd Bergmann BUG();
51971cd6121fSNaohiro Aota }
51981cd6121fSNaohiro Aota
51991cd6121fSNaohiro Aota /* We don't want a chunk larger than 10% of writable space */
5200428c8e03SDavid Sterba limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10),
52011cd6121fSNaohiro Aota zone_size),
52021cd6121fSNaohiro Aota min_chunk_size);
52031cd6121fSNaohiro Aota ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
52041cd6121fSNaohiro Aota ctl->dev_extent_min = zone_size * ctl->dev_stripes;
52051cd6121fSNaohiro Aota }
52061cd6121fSNaohiro Aota
init_alloc_chunk_ctl(struct btrfs_fs_devices * fs_devices,struct alloc_chunk_ctl * ctl)520727c314d5SNaohiro Aota static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
520827c314d5SNaohiro Aota struct alloc_chunk_ctl *ctl)
520927c314d5SNaohiro Aota {
521027c314d5SNaohiro Aota int index = btrfs_bg_flags_to_raid_index(ctl->type);
521127c314d5SNaohiro Aota
521227c314d5SNaohiro Aota ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
521327c314d5SNaohiro Aota ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
521427c314d5SNaohiro Aota ctl->devs_max = btrfs_raid_array[index].devs_max;
521527c314d5SNaohiro Aota if (!ctl->devs_max)
521627c314d5SNaohiro Aota ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
521727c314d5SNaohiro Aota ctl->devs_min = btrfs_raid_array[index].devs_min;
521827c314d5SNaohiro Aota ctl->devs_increment = btrfs_raid_array[index].devs_increment;
521927c314d5SNaohiro Aota ctl->ncopies = btrfs_raid_array[index].ncopies;
522027c314d5SNaohiro Aota ctl->nparity = btrfs_raid_array[index].nparity;
522127c314d5SNaohiro Aota ctl->ndevs = 0;
522227c314d5SNaohiro Aota
522327c314d5SNaohiro Aota switch (fs_devices->chunk_alloc_policy) {
522427c314d5SNaohiro Aota case BTRFS_CHUNK_ALLOC_REGULAR:
522527c314d5SNaohiro Aota init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
522627c314d5SNaohiro Aota break;
52271cd6121fSNaohiro Aota case BTRFS_CHUNK_ALLOC_ZONED:
52281cd6121fSNaohiro Aota init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
52291cd6121fSNaohiro Aota break;
523027c314d5SNaohiro Aota default:
523127c314d5SNaohiro Aota BUG();
523227c314d5SNaohiro Aota }
523327c314d5SNaohiro Aota }
523427c314d5SNaohiro Aota
/*
 * First pass of chunk allocation: for every usable writable device, record
 * the largest free extent ("hole") that could host a stripe, then sort the
 * entries in descending order of hole size (see btrfs_cmp_device_info()).
 *
 * Devices are skipped when they are read-only, not in the FS metadata,
 * replace targets, or have less than ctl->dev_extent_min usable space.
 * On success ctl->ndevs holds the number of entries filled in
 * @devices_info and 0 is returned; a negative errno is returned if the
 * free-extent lookup fails with anything other than -ENOSPC.
 */
static int gather_device_info(struct btrfs_fs_devices *fs_devices,
			      struct alloc_chunk_ctl *ctl,
			      struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = fs_devices->fs_info;
	struct btrfs_device *device;
	u64 total_avail;
	/* Ideal hole size: one maximal stripe per dev_stripes slot. */
	u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
	int ret;
	int ndevs = 0;
	u64 max_avail;
	u64 dev_offset;

	/*
	 * in the first pass through the devices list, we gather information
	 * about the available holes on each device.
	 */
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			WARN(1, KERN_ERR
			       "BTRFS: read-only device in alloc_list\n");
			continue;
		}

		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
					&device->dev_state) ||
		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
			continue;

		if (device->total_bytes > device->bytes_used)
			total_avail = device->total_bytes - device->bytes_used;
		else
			total_avail = 0;

		/* If there is no space on this device, skip it. */
		if (total_avail < ctl->dev_extent_min)
			continue;

		ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
					   &max_avail);
		if (ret && ret != -ENOSPC)
			return ret;

		/* ret == 0 means a hole of the full wanted size was found. */
		if (ret == 0)
			max_avail = dev_extent_want;

		if (max_avail < ctl->dev_extent_min) {
			if (btrfs_test_opt(info, ENOSPC_DEBUG))
				btrfs_debug(info,
			"%s: devid %llu has no free space, have=%llu want=%llu",
					    __func__, device->devid, max_avail,
					    ctl->dev_extent_min);
			continue;
		}

		/* devices_info has room for at most rw_devices entries. */
		if (ndevs == fs_devices->rw_devices) {
			WARN(1, "%s: found more than %llu devices\n",
			     __func__, fs_devices->rw_devices);
			break;
		}
		devices_info[ndevs].dev_offset = dev_offset;
		devices_info[ndevs].max_avail = max_avail;
		devices_info[ndevs].total_avail = total_avail;
		devices_info[ndevs].dev = device;
		++ndevs;
	}
	ctl->ndevs = ndevs;

	/*
	 * now sort the devices by hole size / available space
	 */
	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
	     btrfs_cmp_device_info, NULL);

	return 0;
}
5311560156cbSNaohiro Aota
decide_stripe_size_regular(struct alloc_chunk_ctl * ctl,struct btrfs_device_info * devices_info)53125badf512SNaohiro Aota static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
53135badf512SNaohiro Aota struct btrfs_device_info *devices_info)
53145badf512SNaohiro Aota {
53155badf512SNaohiro Aota /* Number of stripes that count for block group size */
53165badf512SNaohiro Aota int data_stripes;
53175badf512SNaohiro Aota
53185badf512SNaohiro Aota /*
53195badf512SNaohiro Aota * The primary goal is to maximize the number of stripes, so use as
53205badf512SNaohiro Aota * many devices as possible, even if the stripes are not maximum sized.
53215badf512SNaohiro Aota *
53225badf512SNaohiro Aota * The DUP profile stores more than one stripe per device, the
53235badf512SNaohiro Aota * max_avail is the total size so we have to adjust.
53245badf512SNaohiro Aota */
53255badf512SNaohiro Aota ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
53265badf512SNaohiro Aota ctl->dev_stripes);
53275badf512SNaohiro Aota ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
53285badf512SNaohiro Aota
53295badf512SNaohiro Aota /* This will have to be fixed for RAID1 and RAID10 over more drives */
53305badf512SNaohiro Aota data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
53315badf512SNaohiro Aota
53325badf512SNaohiro Aota /*
53335badf512SNaohiro Aota * Use the number of data stripes to figure out how big this chunk is
53345badf512SNaohiro Aota * really going to be in terms of logical address space, and compare
53355badf512SNaohiro Aota * that answer with the max chunk size. If it's higher, we try to
53365badf512SNaohiro Aota * reduce stripe_size.
53375badf512SNaohiro Aota */
53385badf512SNaohiro Aota if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
53395badf512SNaohiro Aota /*
53405badf512SNaohiro Aota * Reduce stripe_size, round it up to a 16MB boundary again and
53415badf512SNaohiro Aota * then use it, unless it ends up being even bigger than the
53425badf512SNaohiro Aota * previous value we had already.
53435badf512SNaohiro Aota */
53445badf512SNaohiro Aota ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
53455badf512SNaohiro Aota data_stripes), SZ_16M),
53465badf512SNaohiro Aota ctl->stripe_size);
53475badf512SNaohiro Aota }
53485badf512SNaohiro Aota
53495da431b7SQu Wenruo /* Stripe size should not go beyond 1G. */
53505da431b7SQu Wenruo ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);
53515da431b7SQu Wenruo
53525badf512SNaohiro Aota /* Align to BTRFS_STRIPE_LEN */
53535badf512SNaohiro Aota ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
53545badf512SNaohiro Aota ctl->chunk_size = ctl->stripe_size * data_stripes;
53555badf512SNaohiro Aota
53565badf512SNaohiro Aota return 0;
53575badf512SNaohiro Aota }
53585badf512SNaohiro Aota
/*
 * Compute stripe and chunk sizes for zoned allocation, where the stripe
 * size is fixed to the device zone size.  If the resulting chunk would
 * exceed max_chunk_size, the number of participating devices is reduced
 * instead of the stripe size.
 */
static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
				    struct btrfs_device_info *devices_info)
{
	u64 zone_size = devices_info[0].dev->zone_info->zone_size;
	/* Number of stripes that count for block group size */
	int data_stripes;

	/*
	 * It should hold because:
	 * dev_extent_min == dev_extent_want == zone_size * dev_stripes
	 */
	ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);

	ctl->stripe_size = zone_size;
	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;

	/* stripe_size is fixed in a zoned filesystem. Reduce ndevs instead. */
	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
		/* Smallest device count whose data stripes still fit the cap. */
		ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
					     ctl->stripe_size) + ctl->nparity,
				     ctl->dev_stripes);
		ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
		data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
		ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
	}

	ctl->chunk_size = ctl->stripe_size * data_stripes;

	return 0;
}
53901cd6121fSNaohiro Aota
decide_stripe_size(struct btrfs_fs_devices * fs_devices,struct alloc_chunk_ctl * ctl,struct btrfs_device_info * devices_info)53915badf512SNaohiro Aota static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
53925badf512SNaohiro Aota struct alloc_chunk_ctl *ctl,
53935badf512SNaohiro Aota struct btrfs_device_info *devices_info)
53945badf512SNaohiro Aota {
53955badf512SNaohiro Aota struct btrfs_fs_info *info = fs_devices->fs_info;
53965badf512SNaohiro Aota
53975badf512SNaohiro Aota /*
53985badf512SNaohiro Aota * Round down to number of usable stripes, devs_increment can be any
53995badf512SNaohiro Aota * number so we can't use round_down() that requires power of 2, while
54005badf512SNaohiro Aota * rounddown is safe.
54015badf512SNaohiro Aota */
54025badf512SNaohiro Aota ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
54035badf512SNaohiro Aota
54045badf512SNaohiro Aota if (ctl->ndevs < ctl->devs_min) {
54055badf512SNaohiro Aota if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
54065badf512SNaohiro Aota btrfs_debug(info,
54075badf512SNaohiro Aota "%s: not enough devices with free space: have=%d minimum required=%d",
54085badf512SNaohiro Aota __func__, ctl->ndevs, ctl->devs_min);
54095badf512SNaohiro Aota }
54105badf512SNaohiro Aota return -ENOSPC;
54115badf512SNaohiro Aota }
54125badf512SNaohiro Aota
54135badf512SNaohiro Aota ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
54145badf512SNaohiro Aota
54155badf512SNaohiro Aota switch (fs_devices->chunk_alloc_policy) {
54165badf512SNaohiro Aota case BTRFS_CHUNK_ALLOC_REGULAR:
54175badf512SNaohiro Aota return decide_stripe_size_regular(ctl, devices_info);
54181cd6121fSNaohiro Aota case BTRFS_CHUNK_ALLOC_ZONED:
54191cd6121fSNaohiro Aota return decide_stripe_size_zoned(ctl, devices_info);
54205badf512SNaohiro Aota default:
54215badf512SNaohiro Aota BUG();
54225badf512SNaohiro Aota }
54235badf512SNaohiro Aota }
54245badf512SNaohiro Aota
/*
 * Materialize a new chunk described by @ctl and @devices_info: build the
 * stripe map, insert an extent_map for it into the fs mapping tree, create
 * the corresponding block group, and update per-device bytes_used plus the
 * global free_chunk_space accounting.
 *
 * Returns the new block group on success, or an ERR_PTR.  On failure after
 * the extent map was inserted, the mapping is removed again and both of
 * its references are dropped.
 */
static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
			struct alloc_chunk_ctl *ctl,
			struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct map_lookup *map = NULL;
	struct extent_map_tree *em_tree;
	struct btrfs_block_group *block_group;
	struct extent_map *em;
	u64 start = ctl->start;
	u64 type = ctl->type;
	int ret;
	int i;
	int j;

	map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
	if (!map)
		return ERR_PTR(-ENOMEM);
	map->num_stripes = ctl->num_stripes;

	/*
	 * Lay out the stripes: device i holds dev_stripes consecutive
	 * stripes starting at its chosen dev_offset.
	 */
	for (i = 0; i < ctl->ndevs; ++i) {
		for (j = 0; j < ctl->dev_stripes; ++j) {
			int s = i * ctl->dev_stripes + j;
			map->stripes[s].dev = devices_info[i].dev;
			map->stripes[s].physical = devices_info[i].dev_offset +
						   j * ctl->stripe_size;
		}
	}
	map->io_align = BTRFS_STRIPE_LEN;
	map->io_width = BTRFS_STRIPE_LEN;
	map->type = type;
	map->sub_stripes = ctl->sub_stripes;

	trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);

	em = alloc_extent_map();
	if (!em) {
		kfree(map);
		return ERR_PTR(-ENOMEM);
	}
	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	em->map_lookup = map;
	em->start = start;
	em->len = ctl->chunk_size;
	em->block_start = 0;
	em->block_len = em->len;
	em->orig_block_len = ctl->stripe_size;

	em_tree = &info->mapping_tree;
	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em, 0);
	if (ret) {
		write_unlock(&em_tree->lock);
		/* em was never in the tree; this also frees map. */
		free_extent_map(em);
		return ERR_PTR(ret);
	}
	write_unlock(&em_tree->lock);

	block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
	if (IS_ERR(block_group))
		goto error_del_extent;

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *dev = map->stripes[i].dev;

		btrfs_device_set_bytes_used(dev,
					    dev->bytes_used + ctl->stripe_size);
		if (list_empty(&dev->post_commit_list))
			list_add_tail(&dev->post_commit_list,
				      &trans->transaction->dev_update_list);
	}

	atomic64_sub(ctl->stripe_size * map->num_stripes,
		     &info->free_chunk_space);

	/* Drop our local reference; the tree still holds one. */
	free_extent_map(em);
	check_raid56_incompat_flag(info, type);
	check_raid1c34_incompat_flag(info, type);

	return block_group;

error_del_extent:
	write_lock(&em_tree->lock);
	remove_extent_mapping(em_tree, em);
	write_unlock(&em_tree->lock);

	/* One for our allocation */
	free_extent_map(em);
	/* One for the tree reference */
	free_extent_map(em);

	/* block_group is the ERR_PTR from btrfs_make_block_group() here. */
	return block_group;
}
5518dce580caSNaohiro Aota
/*
 * Allocate a new chunk of @type and create its in-memory block group.
 *
 * Caller must hold fs_info->chunk_mutex (asserted below).  This only builds
 * the in-memory state (extent map, block group); the chunk item is added to
 * the chunk btree later by btrfs_chunk_alloc_add_chunk_item().
 *
 * Returns the new block group on success, or an ERR_PTR() on failure.
 */
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
					     u64 type)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = info->fs_devices;
	struct btrfs_device_info *devices_info = NULL;
	struct alloc_chunk_ctl ctl;
	struct btrfs_block_group *block_group;
	int ret;

	lockdep_assert_held(&info->chunk_mutex);

	/* Reject profiles that are not a single valid allocation profile. */
	if (!alloc_profile_is_valid(type, 0)) {
		ASSERT(0);
		return ERR_PTR(-EINVAL);
	}

	/* No writable devices at all means nothing can be allocated. */
	if (list_empty(&fs_devices->alloc_list)) {
		if (btrfs_test_opt(info, ENOSPC_DEBUG))
			btrfs_debug(info, "%s: no writable device", __func__);
		return ERR_PTR(-ENOSPC);
	}

	/* At least one of DATA/METADATA/SYSTEM must be set. */
	if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
		btrfs_err(info, "invalid chunk type 0x%llx requested", type);
		ASSERT(0);
		return ERR_PTR(-EINVAL);
	}

	/* Logical start of the new chunk, past the last existing chunk. */
	ctl.start = find_next_chunk(info);
	ctl.type = type;
	init_alloc_chunk_ctl(fs_devices, &ctl);

	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
			       GFP_NOFS);
	if (!devices_info)
		return ERR_PTR(-ENOMEM);

	/* Collect per-device free space, then derive the stripe layout. */
	ret = gather_device_info(fs_devices, &ctl, devices_info);
	if (ret < 0) {
		block_group = ERR_PTR(ret);
		goto out;
	}

	ret = decide_stripe_size(fs_devices, &ctl, devices_info);
	if (ret < 0) {
		block_group = ERR_PTR(ret);
		goto out;
	}

	block_group = create_chunk(trans, &ctl, devices_info);

out:
	kfree(devices_info);
	return block_group;
}
55752b82032cSYan Zheng
/*
 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
 * chunks.
 *
 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
 * phases.
 *
 * It persists the chunk described by @bg: updates the device items of all
 * stripes, inserts the chunk item into the chunk btree and, for SYSTEM
 * chunks, also appends it to the superblock's system chunk array.
 *
 * Returns 0 on success, negative errno on failure (the transaction is
 * aborted on the early failures that leave metadata inconsistent).
 */
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
				     struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_key key;
	struct btrfs_chunk *chunk;
	struct btrfs_stripe *stripe;
	struct extent_map *em;
	struct map_lookup *map;
	size_t item_size;
	int i;
	int ret;

	/*
	 * We take the chunk_mutex for 2 reasons:
	 *
	 * 1) Updates and insertions in the chunk btree must be done while holding
	 *    the chunk_mutex, as well as updating the system chunk array in the
	 *    superblock. See the comment on top of btrfs_chunk_alloc() for the
	 *    details;
	 *
	 * 2) To prevent races with the final phase of a device replace operation
	 *    that replaces the device object associated with the map's stripes,
	 *    because the device object's id can change at any time during that
	 *    final phase of the device replace operation
	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
	 *    replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
	 *    which would cause a failure when updating the device item, which does
	 *    not exists, or persisting a stripe of the chunk item with such ID.
	 *    Here we can't use the device_list_mutex because our caller already
	 *    has locked the chunk_mutex, and the final phase of device replace
	 *    acquires both mutexes - first the device_list_mutex and then the
	 *    chunk_mutex. Using any of those two mutexes protects us from a
	 *    concurrent device replace.
	 */
	lockdep_assert_held(&fs_info->chunk_mutex);

	/* Look up the logical->physical mapping created in phase 1. */
	em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	map = em->map_lookup;
	item_size = btrfs_chunk_item_size(map->num_stripes);

	chunk = kzalloc(item_size, GFP_NOFS);
	if (!chunk) {
		ret = -ENOMEM;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	/*
	 * Persist the updated bytes_used of each stripe's device before the
	 * chunk item references them.
	 */
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;

		ret = btrfs_update_device(trans, device);
		if (ret)
			goto out;
	}

	/* Fill the on-disk stripe array from the in-memory map. */
	stripe = &chunk->stripe;
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		const u64 dev_offset = map->stripes[i].physical;

		btrfs_set_stack_stripe_devid(stripe, device->devid);
		btrfs_set_stack_stripe_offset(stripe, dev_offset);
		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
		stripe++;
	}

	btrfs_set_stack_chunk_length(chunk, bg->length);
	btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
	btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN);
	btrfs_set_stack_chunk_type(chunk, map->type);
	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
	btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN);
	btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN);
	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = bg->start;

	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
	if (ret)
		goto out;

	set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags);

	/* SYSTEM chunks are also mirrored into the superblock's array. */
	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
		if (ret)
			goto out;
	}

out:
	kfree(chunk);
	free_extent_map(em);
	return ret;
}
56892b82032cSYan Zheng
init_first_rw_device(struct btrfs_trans_handle * trans)56906f8e0fc7SDavid Sterba static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
56912b82032cSYan Zheng {
56926f8e0fc7SDavid Sterba struct btrfs_fs_info *fs_info = trans->fs_info;
56932b82032cSYan Zheng u64 alloc_profile;
569479bd3712SFilipe Manana struct btrfs_block_group *meta_bg;
569579bd3712SFilipe Manana struct btrfs_block_group *sys_bg;
569679bd3712SFilipe Manana
569779bd3712SFilipe Manana /*
569879bd3712SFilipe Manana * When adding a new device for sprouting, the seed device is read-only
569979bd3712SFilipe Manana * so we must first allocate a metadata and a system chunk. But before
570079bd3712SFilipe Manana * adding the block group items to the extent, device and chunk btrees,
570179bd3712SFilipe Manana * we must first:
570279bd3712SFilipe Manana *
570379bd3712SFilipe Manana * 1) Create both chunks without doing any changes to the btrees, as
570479bd3712SFilipe Manana * otherwise we would get -ENOSPC since the block groups from the
570579bd3712SFilipe Manana * seed device are read-only;
570679bd3712SFilipe Manana *
570779bd3712SFilipe Manana * 2) Add the device item for the new sprout device - finishing the setup
570879bd3712SFilipe Manana * of a new block group requires updating the device item in the chunk
570979bd3712SFilipe Manana * btree, so it must exist when we attempt to do it. The previous step
571079bd3712SFilipe Manana * ensures this does not fail with -ENOSPC.
571179bd3712SFilipe Manana *
571279bd3712SFilipe Manana * After that we can add the block group items to their btrees:
571379bd3712SFilipe Manana * update existing device item in the chunk btree, add a new block group
571479bd3712SFilipe Manana * item to the extent btree, add a new chunk item to the chunk btree and
571579bd3712SFilipe Manana * finally add the new device extent items to the devices btree.
571679bd3712SFilipe Manana */
57172b82032cSYan Zheng
57181b86826dSJeff Mahoney alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5719f6f39f7aSNikolay Borisov meta_bg = btrfs_create_chunk(trans, alloc_profile);
572079bd3712SFilipe Manana if (IS_ERR(meta_bg))
572179bd3712SFilipe Manana return PTR_ERR(meta_bg);
57222b82032cSYan Zheng
57231b86826dSJeff Mahoney alloc_profile = btrfs_system_alloc_profile(fs_info);
5724f6f39f7aSNikolay Borisov sys_bg = btrfs_create_chunk(trans, alloc_profile);
572579bd3712SFilipe Manana if (IS_ERR(sys_bg))
572679bd3712SFilipe Manana return PTR_ERR(sys_bg);
572779bd3712SFilipe Manana
572879bd3712SFilipe Manana return 0;
5729005d6427SDavid Sterba }
57302b82032cSYan Zheng
btrfs_chunk_max_errors(struct map_lookup * map)5731d20983b4SMiao Xie static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5732d20983b4SMiao Xie {
5733fc9a2ac7SDavid Sterba const int index = btrfs_bg_flags_to_raid_index(map->type);
5734d20983b4SMiao Xie
5735fc9a2ac7SDavid Sterba return btrfs_raid_array[index].tolerated_failures;
57362b82032cSYan Zheng }
57372b82032cSYan Zheng
btrfs_chunk_writeable(struct btrfs_fs_info * fs_info,u64 chunk_offset)5738a09f23c3SAnand Jain bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
57392b82032cSYan Zheng {
57402b82032cSYan Zheng struct extent_map *em;
57412b82032cSYan Zheng struct map_lookup *map;
5742d20983b4SMiao Xie int miss_ndevs = 0;
57432b82032cSYan Zheng int i;
5744a09f23c3SAnand Jain bool ret = true;
57452b82032cSYan Zheng
574660ca842eSOmar Sandoval em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5747592d92eeSLiu Bo if (IS_ERR(em))
5748a09f23c3SAnand Jain return false;
57492b82032cSYan Zheng
575095617d69SJeff Mahoney map = em->map_lookup;
57512b82032cSYan Zheng for (i = 0; i < map->num_stripes; i++) {
5752e6e674bdSAnand Jain if (test_bit(BTRFS_DEV_STATE_MISSING,
5753e6e674bdSAnand Jain &map->stripes[i].dev->dev_state)) {
5754d20983b4SMiao Xie miss_ndevs++;
5755d20983b4SMiao Xie continue;
5756d20983b4SMiao Xie }
5757ebbede42SAnand Jain if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5758ebbede42SAnand Jain &map->stripes[i].dev->dev_state)) {
5759a09f23c3SAnand Jain ret = false;
5760d20983b4SMiao Xie goto end;
57612b82032cSYan Zheng }
57622b82032cSYan Zheng }
5763d20983b4SMiao Xie
5764d20983b4SMiao Xie /*
5765a09f23c3SAnand Jain * If the number of missing devices is larger than max errors, we can
5766a09f23c3SAnand Jain * not write the data into that chunk successfully.
5767d20983b4SMiao Xie */
5768d20983b4SMiao Xie if (miss_ndevs > btrfs_chunk_max_errors(map))
5769a09f23c3SAnand Jain ret = false;
5770d20983b4SMiao Xie end:
57712b82032cSYan Zheng free_extent_map(em);
5772a09f23c3SAnand Jain return ret;
57730b86a832SChris Mason }
57740b86a832SChris Mason
btrfs_mapping_tree_free(struct extent_map_tree * tree)5775c8bf1b67SDavid Sterba void btrfs_mapping_tree_free(struct extent_map_tree *tree)
57760b86a832SChris Mason {
57770b86a832SChris Mason struct extent_map *em;
57780b86a832SChris Mason
57790b86a832SChris Mason while (1) {
5780c8bf1b67SDavid Sterba write_lock(&tree->lock);
5781c8bf1b67SDavid Sterba em = lookup_extent_mapping(tree, 0, (u64)-1);
57820b86a832SChris Mason if (em)
5783c8bf1b67SDavid Sterba remove_extent_mapping(tree, em);
5784c8bf1b67SDavid Sterba write_unlock(&tree->lock);
57850b86a832SChris Mason if (!em)
57860b86a832SChris Mason break;
57870b86a832SChris Mason /* once for us */
57880b86a832SChris Mason free_extent_map(em);
57890b86a832SChris Mason /* once for the tree */
57900b86a832SChris Mason free_extent_map(em);
57910b86a832SChris Mason }
57920b86a832SChris Mason }
57930b86a832SChris Mason
/*
 * Return the number of copies (read retries) available for the range
 * [@logical, @logical + @len).
 */
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct extent_map *em;
	struct map_lookup *map;
	int ret = 1;

	em = btrfs_get_chunk_map(fs_info, logical, len);
	if (IS_ERR(em)) {
		/*
		 * We could return errors for these cases, but that could get
		 * ugly and we'd probably do the same thing which is just not do
		 * anything else and exit, so return 1 so the callers don't try
		 * to use other copies.
		 */
		return 1;
	}

	map = em->map_lookup;
	if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
		/* Non-RAID56, use their ncopies from btrfs_raid_array. */
		ret = btrfs_raid_array[btrfs_bg_flags_to_raid_index(map->type)].ncopies;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID5) {
		ret = 2;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
		/*
		 * There could be two corrupted data stripes, we need
		 * to loop retry in order to rebuild the correct data.
		 *
		 * Fail a stripe at a time on every retry except the
		 * stripe under reconstruction.
		 */
		ret = map->num_stripes;
	}

	free_extent_map(em);
	return ret;
}
5831f188591eSChris Mason
/*
 * Return the full stripe length of the chunk containing @logical: the
 * combined data-stripe width for RAID56 chunks, one sector otherwise.
 */
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
				    u64 logical)
{
	struct extent_map *em;
	unsigned long len = fs_info->sectorsize;

	/* No RAID56 chunks can exist without the incompat flag. */
	if (!btrfs_fs_incompat(fs_info, RAID56))
		return len;

	em = btrfs_get_chunk_map(fs_info, logical, len);
	if (!WARN_ON(IS_ERR(em))) {
		struct map_lookup *map = em->map_lookup;

		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			len = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
		free_extent_map(em);
	}
	return len;
}
585253b381b3SDavid Woodhouse
/* Return 1 if the range [@logical, @logical + @len) lives in a RAID56 chunk. */
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct extent_map *em;
	int ret = 0;

	/* No RAID56 chunks can exist without the incompat flag. */
	if (!btrfs_fs_incompat(fs_info, RAID56))
		return 0;

	em = btrfs_get_chunk_map(fs_info, logical, len);
	if (!WARN_ON(IS_ERR(em))) {
		struct map_lookup *map = em->map_lookup;

		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			ret = 1;
		free_extent_map(em);
	}
	return ret;
}
587253b381b3SDavid Woodhouse
/*
 * Pick the stripe index to read a mirrored chunk from.
 *
 * @map:                    chunk map (must be RAID1* or RAID10, asserted)
 * @first:                  index of the first candidate stripe in the map
 * @dev_replace_is_ongoing: whether a device replace is running
 *
 * Starts from a mirror chosen by the configured read policy and prefers
 * devices that have a bdev and are not the source of an ongoing device
 * replace.  Always returns some index, even if no fully healthy mirror
 * exists, and lets the I/O error handling deal with the fallout.
 */
static int find_live_mirror(struct btrfs_fs_info *fs_info,
			    struct map_lookup *map, int first,
			    int dev_replace_is_ongoing)
{
	int i;
	int num_stripes;
	int preferred_mirror;
	int tolerance;
	struct btrfs_device *srcdev;

	ASSERT((map->type &
		 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));

	/* RAID10 mirrors live within one sub-stripe group. */
	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		num_stripes = map->sub_stripes;
	else
		num_stripes = map->num_stripes;

	switch (fs_info->fs_devices->read_policy) {
	default:
		/* Shouldn't happen, just warn and use pid instead of failing */
		btrfs_warn_rl(fs_info,
			      "unknown read_policy type %u, reset to pid",
			      fs_info->fs_devices->read_policy);
		fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
		fallthrough;
	case BTRFS_READ_POLICY_PID:
		/* Spread readers over mirrors by the caller's pid. */
		preferred_mirror = first + (current->pid % num_stripes);
		break;
	}

	if (dev_replace_is_ongoing &&
	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
	    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
		srcdev = fs_info->dev_replace.srcdev;
	else
		srcdev = NULL;

	/*
	 * try to avoid the drive that is the source drive for a
	 * dev-replace procedure, only choose it if no other non-missing
	 * mirror is available
	 */
	for (tolerance = 0; tolerance < 2; tolerance++) {
		/* Second pass (tolerance == 1) accepts srcdev too. */
		if (map->stripes[preferred_mirror].dev->bdev &&
		    (tolerance || map->stripes[preferred_mirror].dev != srcdev))
			return preferred_mirror;
		for (i = first; i < first + num_stripes; i++) {
			if (map->stripes[i].dev->bdev &&
			    (tolerance || map->stripes[i].dev != srcdev))
				return i;
		}
	}

	/* we couldn't find one that doesn't fail. Just return something
	 * and the io error handling code will clean up eventually
	 */
	return preferred_mirror;
}
5932dfe25020SChris Mason
/*
 * Allocate a btrfs_io_context with room for @total_stripes stripes
 * (including any extra stripes duplicated for device replace).
 * Returns NULL on allocation failure; caller drops it via btrfs_put_bioc().
 */
static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
						       u16 total_stripes)
{
	/* Header plus the trailing variable-length stripe array. */
	const size_t size = sizeof(struct btrfs_io_context) +
			    sizeof(struct btrfs_io_stripe) * total_stripes;
	struct btrfs_io_context *bioc = kzalloc(size, GFP_NOFS);

	if (!bioc)
		return NULL;

	refcount_set(&bioc->refs, 1);
	bioc->fs_info = fs_info;
	/* -1 / (u64)-1 mean "unset" until the mapping code fills them in. */
	bioc->replace_stripe_src = -1;
	bioc->full_stripe_logical = (u64)-1;

	return bioc;
}
59566e9606d2SZhao Lei
/* Take an extra reference on @bioc; warn if the refcount already hit zero. */
void btrfs_get_bioc(struct btrfs_io_context *bioc)
{
	WARN_ON(!refcount_read(&bioc->refs));
	refcount_inc(&bioc->refs);
}
59626e9606d2SZhao Lei
btrfs_put_bioc(struct btrfs_io_context * bioc)59634c664611SQu Wenruo void btrfs_put_bioc(struct btrfs_io_context *bioc)
59646e9606d2SZhao Lei {
59654c664611SQu Wenruo if (!bioc)
59666e9606d2SZhao Lei return;
59674c664611SQu Wenruo if (refcount_dec_and_test(&bioc->refs))
59684c664611SQu Wenruo kfree(bioc);
59696e9606d2SZhao Lei }
59706e9606d2SZhao Lei
/*
 * Map a discard request starting at @logical with length *@length_ret onto
 * the physical stripes it covers.  Returns a kcalloc()ed array of
 * btrfs_discard_stripe (count in *num_stripes) that the caller must kfree(),
 * or an ERR_PTR().  *length_ret is trimmed so the request does not cross the
 * chunk's end.
 *
 * Please note that, discard won't be sent to target device of device
 * replace.
 */
struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
					       u64 logical, u64 *length_ret,
					       u32 *num_stripes)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_discard_stripe *stripes;
	u64 length = *length_ret;
	u64 offset;
	u32 stripe_nr;
	u32 stripe_nr_end;
	u32 stripe_cnt;
	u64 stripe_end_offset;
	u64 stripe_offset;
	u32 stripe_index;
	u32 factor = 0;
	u32 sub_stripes = 0;
	u32 stripes_per_dev = 0;
	u32 remaining_stripes = 0;
	u32 last_stripe = 0;
	int ret;
	int i;

	em = btrfs_get_chunk_map(fs_info, logical, length);
	if (IS_ERR(em))
		return ERR_CAST(em);

	map = em->map_lookup;

	/* we don't discard raid56 yet */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ret = -EOPNOTSUPP;
		goto out_free_map;
	}

	offset = logical - em->start;
	/* Clamp the request so it stays within this chunk. */
	length = min_t(u64, em->start + em->len - logical, length);
	*length_ret = length;

	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr);

	stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >>
			BTRFS_STRIPE_LEN_SHIFT;
	stripe_cnt = stripe_nr_end - stripe_nr;
	/* How far before the end of the last stripe the request stops. */
	stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr_end) -
			    (offset + length);
	/*
	 * after this, stripe_nr is the number of stripes on this
	 * device we have to walk to find the data, and stripe_index is
	 * the number of our device in the stripe array
	 */
	*num_stripes = 1;
	stripe_index = 0;
	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			 BTRFS_BLOCK_GROUP_RAID10)) {
		if (map->type & BTRFS_BLOCK_GROUP_RAID0)
			sub_stripes = 1;
		else
			sub_stripes = map->sub_stripes;

		/* Number of sub-stripe groups across the devices. */
		factor = map->num_stripes / sub_stripes;
		*num_stripes = min_t(u64, map->num_stripes,
				    sub_stripes * stripe_cnt);
		stripe_index = stripe_nr % factor;
		stripe_nr /= factor;
		stripe_index *= sub_stripes;

		remaining_stripes = stripe_cnt % factor;
		stripes_per_dev = stripe_cnt / factor;
		last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes;
	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
				BTRFS_BLOCK_GROUP_DUP)) {
		/* Mirrored profiles: discard the same range on every copy. */
		*num_stripes = map->num_stripes;
	} else {
		/* SINGLE: one stripe, round-robin across devices. */
		stripe_index = stripe_nr % map->num_stripes;
		stripe_nr /= map->num_stripes;
	}

	stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
	if (!stripes) {
		ret = -ENOMEM;
		goto out_free_map;
	}

	for (i = 0; i < *num_stripes; i++) {
		stripes[i].physical =
			map->stripes[stripe_index].physical +
			stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
		stripes[i].dev = map->stripes[stripe_index].dev;

		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
				 BTRFS_BLOCK_GROUP_RAID10)) {
			stripes[i].length = btrfs_stripe_nr_to_offset(stripes_per_dev);

			/* Devices before the remainder get one extra stripe. */
			if (i / sub_stripes < remaining_stripes)
				stripes[i].length += BTRFS_STRIPE_LEN;

			/*
			 * Special for the first stripe and
			 * the last stripe:
			 *
			 * |-------|...|-------|
			 *     |----------|
			 *    off     end_off
			 */
			if (i < sub_stripes)
				stripes[i].length -= stripe_offset;

			if (stripe_index >= last_stripe &&
			    stripe_index <= (last_stripe +
					     sub_stripes - 1))
				stripes[i].length -= stripe_end_offset;

			if (i == sub_stripes - 1)
				stripe_offset = 0;
		} else {
			stripes[i].length = length;
		}

		stripe_index++;
		if (stripe_index == map->num_stripes) {
			stripe_index = 0;
			stripe_nr++;
		}
	}

	free_extent_map(em);
	return stripes;
out_free_map:
	free_extent_map(em);
	return ERR_PTR(ret);
}
61140b3d4cd3SLiu Bo
is_block_group_to_copy(struct btrfs_fs_info * fs_info,u64 logical)61156143c23cSNaohiro Aota static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
61166143c23cSNaohiro Aota {
61176143c23cSNaohiro Aota struct btrfs_block_group *cache;
61186143c23cSNaohiro Aota bool ret;
61196143c23cSNaohiro Aota
6120de17addcSNaohiro Aota /* Non zoned filesystem does not use "to_copy" flag */
61216143c23cSNaohiro Aota if (!btrfs_is_zoned(fs_info))
61226143c23cSNaohiro Aota return false;
61236143c23cSNaohiro Aota
61246143c23cSNaohiro Aota cache = btrfs_lookup_block_group(fs_info, logical);
61256143c23cSNaohiro Aota
61263349b57fSJosef Bacik ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
61276143c23cSNaohiro Aota
61286143c23cSNaohiro Aota btrfs_put_block_group(cache);
61296143c23cSNaohiro Aota return ret;
61306143c23cSNaohiro Aota }
61316143c23cSNaohiro Aota
/*
 * Adjust the stripes of @bioc for a running dev-replace: every stripe that
 * targets the source device gets a duplicate stripe pointing at the same
 * physical offset on the replace target device.
 *
 * @op:              the map operation (extra handling for GET_READ_MIRRORS)
 * @bioc:            the I/O context whose stripes[] array is extended in place
 * @dev_replace:     the ongoing replace operation (source/target devices)
 * @logical:         logical address of the I/O, used to skip block groups
 *                   that the replace process will copy by itself
 * @num_stripes_ret: in/out stripe count, increased by the duplicates added
 * @max_errors_ret:  in/out tolerated error count, increased likewise
 */
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
				      struct btrfs_io_context *bioc,
				      struct btrfs_dev_replace *dev_replace,
				      u64 logical,
				      int *num_stripes_ret, int *max_errors_ret)
{
	u64 srcdev_devid = dev_replace->srcdev->devid;
	/*
	 * At this stage, num_stripes is still the real number of stripes,
	 * excluding the duplicated stripes.
	 */
	int num_stripes = *num_stripes_ret;
	int nr_extra_stripes = 0;
	int max_errors = *max_errors_ret;
	int i;

	/*
	 * A block group which has "to_copy" set will eventually be copied by
	 * the dev-replace process. We can avoid cloning IO here.
	 */
	if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
		return;

	/*
	 * Duplicate the write operations while the dev-replace procedure is
	 * running. Since the copying of the old disk to the new disk takes
	 * place at run time while the filesystem is mounted writable, the
	 * regular write operations to the old disk have to be duplicated to go
	 * to the new disk as well.
	 *
	 * Note that device->missing is handled by the caller, and that the
	 * write to the old disk is already set up in the stripes array.
	 */
	for (i = 0; i < num_stripes; i++) {
		struct btrfs_io_stripe *old = &bioc->stripes[i];
		struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes];

		if (old->dev->devid != srcdev_devid)
			continue;

		/* Mirror the stripe onto the target device at the same offset. */
		new->physical = old->physical;
		new->dev = dev_replace->tgtdev;
		if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			bioc->replace_stripe_src = i;
		nr_extra_stripes++;
	}

	/* We can only have at most 2 extra nr_stripes (for DUP). */
	ASSERT(nr_extra_stripes <= 2);
	/*
	 * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for
	 * replace.
	 * If we have 2 extra stripes, only choose the one with smaller physical.
	 */
	if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
		struct btrfs_io_stripe *first = &bioc->stripes[num_stripes];
		struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];

		/* Only DUP can have two extra stripes. */
		ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP);

		/*
		 * Swap the two extra stripes so the one with the smaller
		 * physical comes first, then drop the second by reducing
		 * @nr_extra_stripes. The dropped stripe stays in the array
		 * but is never accessed.
		 */
		if (first->physical > second->physical) {
			swap(second->physical, first->physical);
			swap(second->dev, first->dev);
			nr_extra_stripes--;
		}
	}

	*num_stripes_ret = num_stripes + nr_extra_stripes;
	*max_errors_ret = max_errors + nr_extra_stripes;
	bioc->replace_nr_stripes = nr_extra_stripes;
}
620873c0f228SLiu Bo
/*
 * Compute the maximum I/O length starting at @offset inside the chunk
 * described by @map so the I/O does not cross a device-stripe boundary (or,
 * for RAID56 writes, a full-stripe boundary).
 *
 * Also outputs @stripe_nr and @stripe_offset for the start of the I/O, and
 * for RAID56 profiles @full_stripe_start (the chunk-relative offset of the
 * containing full stripe).
 */
static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
			    u64 offset, u32 *stripe_nr, u64 *stripe_offset,
			    u64 *full_stripe_start)
{
	/*
	 * Stripe_nr is the stripe where this block falls. stripe_offset is
	 * the offset of this block in its stripe.
	 */
	*stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
	*stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
	ASSERT(*stripe_offset < U32_MAX);

	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		unsigned long full_stripe_len =
			btrfs_stripe_nr_to_offset(nr_data_stripes(map));

		/*
		 * For full stripe start, we use previously calculated
		 * @stripe_nr. Align it to nr_data_stripes, then multiply with
		 * STRIPE_LEN.
		 *
		 * By this we can avoid u64 division completely. And we have
		 * to go rounddown(), not round_down(), as nr_data_stripes is
		 * not ensured to be power of 2.
		 */
		*full_stripe_start =
			btrfs_stripe_nr_to_offset(
				rounddown(*stripe_nr, nr_data_stripes(map)));

		/* @offset must fall inside the computed full stripe. */
		ASSERT(*full_stripe_start + full_stripe_len > offset);
		ASSERT(*full_stripe_start <= offset);
		/*
		 * For writes to RAID56, allow to write a full stripe set, but
		 * no straddling of stripe sets.
		 */
		if (op == BTRFS_MAP_WRITE)
			return full_stripe_len - (offset - *full_stripe_start);
	}

	/*
	 * For other RAID types and for RAID56 reads, allow a single stripe (on
	 * a single disk).
	 */
	if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
		return BTRFS_STRIPE_LEN - *stripe_offset;
	return U64_MAX;
}
62565f141126SNikolay Borisov
/*
 * Fill @dst with the device and physical address of stripe @stripe_index of
 * @map, advanced by @stripe_nr device stripes plus @stripe_offset bytes.
 */
static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
			  u32 stripe_index, u64 stripe_offset, u32 stripe_nr)
{
	const struct btrfs_io_stripe *src = &map->stripes[stripe_index];

	dst->dev = src->dev;
	dst->physical = src->physical + stripe_offset +
			btrfs_stripe_nr_to_offset(stripe_nr);
}
626403793cbbSChristoph Hellwig
/*
 * Map one logical range to one or more physical ranges.
 *
 * @fs_info:	filesystem to map in
 * @op:	the map operation (read, write, ...)
 * @logical:	start of the logical range
 *
 * @length:	(Mandatory) mapped length of this run.
 *		One logical range can be split into different segments
 *		due to factors like zones and RAID0/5/6/10 stripe
 *		boundaries.
 *
 * @bioc_ret:	(Mandatory) returned btrfs_io_context structure.
 *		which has one or more physical ranges (btrfs_io_stripe)
 *		recorded inside.
 *		Caller should call btrfs_put_bioc() to free it after use.
 *
 * @smap:	(Optional) single physical range optimization.
 *		If the map request can be fulfilled by one single
 *		physical range, and this parameter is not NULL,
 *		then @bioc_ret would be NULL, and @smap would be
 *		updated.
 *
 * @mirror_num_ret: (Mandatory) returned mirror number if the original
 *		    value is 0.
 *
 *		    Mirror number 0 means to choose any live mirrors.
 *
 *		    For non-RAID56 profiles, non-zero mirror_num means
 *		    the Nth mirror. (e.g. mirror_num 1 means the first
 *		    copy).
 *
 *		    For RAID56 profile, mirror 1 means rebuild from P and
 *		    the remaining data stripes.
 *
 *		    For RAID6 profile, mirror > 2 means mark another
 *		    data/P stripe error and rebuild from the remaining
 *		    stripes.
 *
 * @need_raid_map: (Used only for integrity checker) whether the map wants
 *		   a full stripe map (including all data and P/Q stripes)
 *		   for RAID56. Should always be 1 except integrity checker.
 *
 * Return: 0 on success, negative errno on failure.
 */
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
		    u64 logical, u64 *length,
		    struct btrfs_io_context **bioc_ret,
		    struct btrfs_io_stripe *smap, int *mirror_num_ret,
		    int need_raid_map)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 map_offset;
	u64 stripe_offset;
	u32 stripe_nr;
	u32 stripe_index;
	int data_stripes;
	int i;
	int ret = 0;
	int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
	int num_stripes;
	int num_copies;
	int max_errors = 0;
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	int dev_replace_is_ongoing = 0;
	u16 num_alloc_stripes;
	u64 raid56_full_stripe_start = (u64)-1;
	u64 max_len;

	ASSERT(bioc_ret);

	/* Reject a mirror number beyond the number of copies at @logical. */
	num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize);
	if (mirror_num > num_copies)
		return -EINVAL;

	em = btrfs_get_chunk_map(fs_info, logical, *length);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	data_stripes = nr_data_stripes(map);

	/* Clamp *length so the mapped run does not cross a stripe boundary. */
	map_offset = logical - em->start;
	max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr,
				   &stripe_offset, &raid56_full_stripe_start);
	*length = min_t(u64, em->len - map_offset, max_len);

	if (dev_replace->replace_task != current)
		down_read(&dev_replace->rwsem);

	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
	/*
	 * Hold the semaphore for read during the whole operation, write is
	 * requested at commit time but must wait.
	 */
	if (!dev_replace_is_ongoing && dev_replace->replace_task != current)
		up_read(&dev_replace->rwsem);

	/*
	 * Per-profile mapping: compute the starting stripe index/number, the
	 * number of stripes involved and the resulting mirror number.
	 */
	num_stripes = 1;
	stripe_index = 0;
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		stripe_index = stripe_nr % map->num_stripes;
		stripe_nr /= map->num_stripes;
		if (op == BTRFS_MAP_READ)
			mirror_num = 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
		if (op != BTRFS_MAP_READ) {
			num_stripes = map->num_stripes;
		} else if (mirror_num) {
			stripe_index = mirror_num - 1;
		} else {
			stripe_index = find_live_mirror(fs_info, map, 0,
							dev_replace_is_ongoing);
			mirror_num = stripe_index + 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		if (op != BTRFS_MAP_READ) {
			num_stripes = map->num_stripes;
		} else if (mirror_num) {
			stripe_index = mirror_num - 1;
		} else {
			mirror_num = 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		u32 factor = map->num_stripes / map->sub_stripes;

		stripe_index = (stripe_nr % factor) * map->sub_stripes;
		stripe_nr /= factor;

		if (op != BTRFS_MAP_READ)
			num_stripes = map->sub_stripes;
		else if (mirror_num)
			stripe_index += mirror_num - 1;
		else {
			int old_stripe_index = stripe_index;
			stripe_index = find_live_mirror(fs_info, map,
							stripe_index,
							dev_replace_is_ongoing);
			mirror_num = stripe_index - old_stripe_index + 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) {
			/*
			 * Push stripe_nr back to the start of the full stripe
			 * For those cases needing a full stripe, @stripe_nr
			 * is the full stripe number.
			 *
			 * Originally we go raid56_full_stripe_start / full_stripe_len,
			 * but that can be expensive. Here we just divide
			 * @stripe_nr with @data_stripes.
			 */
			stripe_nr /= data_stripes;

			/* RAID[56] write or recovery. Return all stripes */
			num_stripes = map->num_stripes;
			max_errors = btrfs_chunk_max_errors(map);

			/* Return the length to the full stripe end */
			*length = min(logical + *length,
				      raid56_full_stripe_start + em->start +
				      btrfs_stripe_nr_to_offset(data_stripes)) -
				  logical;
			stripe_index = 0;
			stripe_offset = 0;
		} else {
			/*
			 * Mirror #0 or #1 means the original data block.
			 * Mirror #2 is RAID5 parity block.
			 * Mirror #3 is RAID6 Q block.
			 */
			stripe_index = stripe_nr % data_stripes;
			stripe_nr /= data_stripes;
			if (mirror_num > 1)
				stripe_index = data_stripes + mirror_num - 2;

			/* We distribute the parity blocks across stripes */
			stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
			if (op == BTRFS_MAP_READ && mirror_num <= 1)
				mirror_num = 1;
		}
	} else {
		/*
		 * After this, stripe_nr is the number of stripes on this
		 * device we have to walk to find the data, and stripe_index is
		 * the number of our device in the stripe array
		 */
		stripe_index = stripe_nr % map->num_stripes;
		stripe_nr /= map->num_stripes;
		mirror_num = stripe_index + 1;
	}
	if (stripe_index >= map->num_stripes) {
		btrfs_crit(fs_info,
			   "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
			   stripe_index, map->num_stripes);
		ret = -EINVAL;
		goto out;
	}

	num_alloc_stripes = num_stripes;
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
	    op != BTRFS_MAP_READ)
		/*
		 * For replace case, we need to add extra stripes for extra
		 * duplicated stripes.
		 *
		 * For both WRITE and GET_READ_MIRRORS, we may have at most
		 * 2 more stripes (DUP types, otherwise 1).
		 */
		num_alloc_stripes += 2;

	/*
	 * If this I/O maps to a single device, try to return the device and
	 * physical block information on the stack instead of allocating an
	 * I/O context structure.
	 */
	if (smap && num_alloc_stripes == 1 &&
	    !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) {
		set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
		if (mirror_num_ret)
			*mirror_num_ret = mirror_num;
		*bioc_ret = NULL;
		ret = 0;
		goto out;
	}

	bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
	if (!bioc) {
		ret = -ENOMEM;
		goto out;
	}
	bioc->map_type = map->type;

	/*
	 * For RAID56 full map, we need to make sure the stripes[] follows the
	 * rule that data stripes are all ordered, then followed with P and Q
	 * (if we have).
	 *
	 * It's still mostly the same as other profiles, just with extra rotation.
	 */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
	    (op != BTRFS_MAP_READ || mirror_num > 1)) {
		/*
		 * For RAID56 @stripe_nr is already the number of full stripes
		 * before us, which is also the rotation value (needs to modulo
		 * with num_stripes).
		 *
		 * In this case, we just add @stripe_nr with @i, then do the
		 * modulo, to reduce one modulo call.
		 */
		bioc->full_stripe_logical = em->start +
			btrfs_stripe_nr_to_offset(stripe_nr * data_stripes);
		for (i = 0; i < num_stripes; i++)
			set_io_stripe(&bioc->stripes[i], map,
				      (i + stripe_nr) % num_stripes,
				      stripe_offset, stripe_nr);
	} else {
		/*
		 * For all other non-RAID56 profiles, just copy the target
		 * stripe into the bioc.
		 */
		for (i = 0; i < num_stripes; i++) {
			set_io_stripe(&bioc->stripes[i], map, stripe_index,
				      stripe_offset, stripe_nr);
			stripe_index++;
		}
	}

	if (op != BTRFS_MAP_READ)
		max_errors = btrfs_chunk_max_errors(map);

	/* Duplicate write stripes onto the replace target if one is attached. */
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
	    op != BTRFS_MAP_READ) {
		handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
					  &num_stripes, &max_errors);
	}

	*bioc_ret = bioc;
	bioc->num_stripes = num_stripes;
	bioc->max_errors = max_errors;
	bioc->mirror_num = mirror_num;

out:
	if (dev_replace_is_ongoing && dev_replace->replace_task != current) {
		lockdep_assert_held(&dev_replace->rwsem);
		/* Unlock and let waiting writers proceed */
		up_read(&dev_replace->rwsem);
	}
	free_extent_map(em);
	return ret;
}
65540b86a832SChris Mason
dev_args_match_fs_devices(const struct btrfs_dev_lookup_args * args,const struct btrfs_fs_devices * fs_devices)6555562d7b15SJosef Bacik static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
6556562d7b15SJosef Bacik const struct btrfs_fs_devices *fs_devices)
6557562d7b15SJosef Bacik {
6558562d7b15SJosef Bacik if (args->fsid == NULL)
6559562d7b15SJosef Bacik return true;
6560562d7b15SJosef Bacik if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
6561562d7b15SJosef Bacik return true;
6562562d7b15SJosef Bacik return false;
6563562d7b15SJosef Bacik }
6564562d7b15SJosef Bacik
dev_args_match_device(const struct btrfs_dev_lookup_args * args,const struct btrfs_device * device)6565562d7b15SJosef Bacik static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
6566562d7b15SJosef Bacik const struct btrfs_device *device)
6567562d7b15SJosef Bacik {
65680fca385dSLiu Shixin if (args->missing) {
6569562d7b15SJosef Bacik if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
6570562d7b15SJosef Bacik !device->bdev)
6571562d7b15SJosef Bacik return true;
6572562d7b15SJosef Bacik return false;
6573562d7b15SJosef Bacik }
6574562d7b15SJosef Bacik
65750fca385dSLiu Shixin if (device->devid != args->devid)
65760fca385dSLiu Shixin return false;
65770fca385dSLiu Shixin if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
65780fca385dSLiu Shixin return false;
65790fca385dSLiu Shixin return true;
65800fca385dSLiu Shixin }
65810fca385dSLiu Shixin
658209ba3bc9SAnand Jain /*
658309ba3bc9SAnand Jain * Find a device specified by @devid or @uuid in the list of @fs_devices, or
658409ba3bc9SAnand Jain * return NULL.
658509ba3bc9SAnand Jain *
658609ba3bc9SAnand Jain * If devid and uuid are both specified, the match must be exact, otherwise
658709ba3bc9SAnand Jain * only devid is used.
658809ba3bc9SAnand Jain */
btrfs_find_device(const struct btrfs_fs_devices * fs_devices,const struct btrfs_dev_lookup_args * args)6589562d7b15SJosef Bacik struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
6590562d7b15SJosef Bacik const struct btrfs_dev_lookup_args *args)
65910b86a832SChris Mason {
65922b82032cSYan Zheng struct btrfs_device *device;
6593944d3f9fSNikolay Borisov struct btrfs_fs_devices *seed_devs;
65940b86a832SChris Mason
6595562d7b15SJosef Bacik if (dev_args_match_fs_devices(args, fs_devices)) {
6596944d3f9fSNikolay Borisov list_for_each_entry(device, &fs_devices->devices, dev_list) {
6597562d7b15SJosef Bacik if (dev_args_match_device(args, device))
6598944d3f9fSNikolay Borisov return device;
6599944d3f9fSNikolay Borisov }
6600944d3f9fSNikolay Borisov }
6601944d3f9fSNikolay Borisov
6602944d3f9fSNikolay Borisov list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
6603562d7b15SJosef Bacik if (!dev_args_match_fs_devices(args, seed_devs))
6604562d7b15SJosef Bacik continue;
6605562d7b15SJosef Bacik list_for_each_entry(device, &seed_devs->devices, dev_list) {
6606562d7b15SJosef Bacik if (dev_args_match_device(args, device))
66072b82032cSYan Zheng return device;
66082b82032cSYan Zheng }
660909ba3bc9SAnand Jain }
6610944d3f9fSNikolay Borisov
66112b82032cSYan Zheng return NULL;
66120b86a832SChris Mason }
66130b86a832SChris Mason
/*
 * Allocate a placeholder btrfs_device for @devid/@dev_uuid, link it into
 * @fs_devices and mark it missing. Returns the device or an ERR_PTR.
 */
static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
					    u64 devid, u8 *dev_uuid)
{
	struct btrfs_device *device;
	unsigned int nofs_flags;

	/*
	 * We call this under the chunk_mutex, so we want to use NOFS for this
	 * allocation, however we don't want to change btrfs_alloc_device() to
	 * always do NOFS because we use it in a lot of other GFP_KERNEL safe
	 * places.
	 */
	nofs_flags = memalloc_nofs_save();
	device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL);
	memalloc_nofs_restore(nofs_flags);
	if (IS_ERR(device))
		return device;

	device->fs_devices = fs_devices;
	list_add(&device->dev_list, &fs_devices->devices);
	fs_devices->num_devices++;

	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
	fs_devices->missing_devices++;

	return device;
}
6642dfe25020SChris Mason
664343dd529aSDavid Sterba /*
664443dd529aSDavid Sterba * Allocate new device struct, set up devid and UUID.
664543dd529aSDavid Sterba *
664612bd2fc0SIlya Dryomov * @fs_info: used only for generating a new devid, can be NULL if
664712bd2fc0SIlya Dryomov * devid is provided (i.e. @devid != NULL).
664812bd2fc0SIlya Dryomov * @devid: a pointer to devid for this device. If NULL a new devid
664912bd2fc0SIlya Dryomov * is generated.
665012bd2fc0SIlya Dryomov * @uuid: a pointer to UUID for this device. If NULL a new UUID
665112bd2fc0SIlya Dryomov * is generated.
6652bb21e302SAnand Jain * @path: a pointer to device path if available, NULL otherwise.
665312bd2fc0SIlya Dryomov *
665412bd2fc0SIlya Dryomov * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
665548dae9cfSDavid Sterba * on error. Returned struct is not linked onto any lists and must be
6656a425f9d4SDavid Sterba * destroyed with btrfs_free_device.
665712bd2fc0SIlya Dryomov */
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
					const u64 *devid, const u8 *uuid,
					const char *path)
{
	struct btrfs_device *dev;
	u64 tmp;

	/* find_next_devid() below needs fs_info when no explicit devid is given */
	if (WARN_ON(!devid && !fs_info))
		return ERR_PTR(-EINVAL);

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->post_commit_list);

	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE);

	/* Use the caller-supplied devid, or allocate the next free one. */
	if (devid)
		tmp = *devid;
	else {
		int ret;

		ret = find_next_devid(fs_info, &tmp);
		if (ret) {
			btrfs_free_device(dev);
			return ERR_PTR(ret);
		}
	}
	dev->devid = tmp;

	/* Copy the caller-supplied uuid, or generate a random one. */
	if (uuid)
		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
	else
		generate_random_uuid(dev->uuid);

	if (path) {
		struct rcu_string *name;

		name = rcu_string_strdup(path, GFP_KERNEL);
		if (!name) {
			btrfs_free_device(dev);
			return ERR_PTR(-ENOMEM);
		}
		/* dev->name is read under RCU, publish it properly. */
		rcu_assign_pointer(dev->name, name);
	}

	return dev;
}
671112bd2fc0SIlya Dryomov
btrfs_report_missing_device(struct btrfs_fs_info * fs_info,u64 devid,u8 * uuid,bool error)67125a2b8e60SAnand Jain static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
67132b902dfcSAnand Jain u64 devid, u8 *uuid, bool error)
67145a2b8e60SAnand Jain {
67152b902dfcSAnand Jain if (error)
67162b902dfcSAnand Jain btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
67172b902dfcSAnand Jain devid, uuid);
67182b902dfcSAnand Jain else
67192b902dfcSAnand Jain btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
67202b902dfcSAnand Jain devid, uuid);
67215a2b8e60SAnand Jain }
67225a2b8e60SAnand Jain
btrfs_calc_stripe_length(const struct extent_map * em)6723bc88b486SQu Wenruo u64 btrfs_calc_stripe_length(const struct extent_map *em)
672439e264a4SNikolay Borisov {
6725bc88b486SQu Wenruo const struct map_lookup *map = em->map_lookup;
6726bc88b486SQu Wenruo const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
6727e4f6c6beSDavid Sterba
6728bc88b486SQu Wenruo return div_u64(em->len, data_stripes);
672939e264a4SNikolay Borisov }
673039e264a4SNikolay Borisov
6731e9306ad4SQu Wenruo #if BITS_PER_LONG == 32
6732e9306ad4SQu Wenruo /*
6733e9306ad4SQu Wenruo * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
6734e9306ad4SQu Wenruo * can't be accessed on 32bit systems.
6735e9306ad4SQu Wenruo *
6736e9306ad4SQu Wenruo * This function do mount time check to reject the fs if it already has
6737e9306ad4SQu Wenruo * metadata chunk beyond that limit.
6738e9306ad4SQu Wenruo */
check_32bit_meta_chunk(struct btrfs_fs_info * fs_info,u64 logical,u64 length,u64 type)6739e9306ad4SQu Wenruo static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
6740e9306ad4SQu Wenruo u64 logical, u64 length, u64 type)
6741e9306ad4SQu Wenruo {
6742e9306ad4SQu Wenruo if (!(type & BTRFS_BLOCK_GROUP_METADATA))
6743e9306ad4SQu Wenruo return 0;
6744e9306ad4SQu Wenruo
6745e9306ad4SQu Wenruo if (logical + length < MAX_LFS_FILESIZE)
6746e9306ad4SQu Wenruo return 0;
6747e9306ad4SQu Wenruo
6748e9306ad4SQu Wenruo btrfs_err_32bit_limit(fs_info);
6749e9306ad4SQu Wenruo return -EOVERFLOW;
6750e9306ad4SQu Wenruo }
6751e9306ad4SQu Wenruo
6752e9306ad4SQu Wenruo /*
6753e9306ad4SQu Wenruo * This is to give early warning for any metadata chunk reaching
6754e9306ad4SQu Wenruo * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
6755e9306ad4SQu Wenruo * Although we can still access the metadata, it's not going to be possible
6756e9306ad4SQu Wenruo * once the limit is reached.
6757e9306ad4SQu Wenruo */
warn_32bit_meta_chunk(struct btrfs_fs_info * fs_info,u64 logical,u64 length,u64 type)6758e9306ad4SQu Wenruo static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
6759e9306ad4SQu Wenruo u64 logical, u64 length, u64 type)
6760e9306ad4SQu Wenruo {
6761e9306ad4SQu Wenruo if (!(type & BTRFS_BLOCK_GROUP_METADATA))
6762e9306ad4SQu Wenruo return;
6763e9306ad4SQu Wenruo
6764e9306ad4SQu Wenruo if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
6765e9306ad4SQu Wenruo return;
6766e9306ad4SQu Wenruo
6767e9306ad4SQu Wenruo btrfs_warn_32bit_limit(fs_info);
6768e9306ad4SQu Wenruo }
6769e9306ad4SQu Wenruo #endif
6770e9306ad4SQu Wenruo
/*
 * Resolve a chunk stripe whose device is not present on the system.
 *
 * Without -o degraded this is fatal: report the device at error level and
 * return ERR_PTR(-ENOENT).  With -o degraded, create a placeholder
 * "missing" device instead and report it at warning level.
 */
static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
						  u64 devid, u8 *uuid)
{
	struct btrfs_device *dev;
	const bool degraded = btrfs_test_opt(fs_info, DEGRADED);

	if (!degraded) {
		btrfs_report_missing_device(fs_info, devid, uuid, true);
		return ERR_PTR(-ENOENT);
	}

	dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
	if (!IS_ERR(dev)) {
		btrfs_report_missing_device(fs_info, devid, uuid, false);
		return dev;
	}

	btrfs_err(fs_info, "failed to init missing device %llu: %ld",
		  devid, PTR_ERR(dev));
	return dev;
}
6791ff37c89fSNikolay Borisov
/*
 * Read one chunk item from @leaf and insert the logical->physical mapping
 * into fs_info->mapping_tree as an extent_map.
 *
 * A chunk whose logical range is already mapped is silently accepted.
 * Stripes referencing an absent device are resolved through
 * handle_missing_device() (placeholder with -o degraded, error otherwise).
 *
 * Returns 0 on success, negative errno on failure.
 */
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
			  struct btrfs_chunk *chunk)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	u64 logical;
	u64 length;
	u64 devid;
	u64 type;
	u8 uuid[BTRFS_UUID_SIZE];
	int index;
	int num_stripes;
	int ret;
	int i;

	logical = key->offset;
	length = btrfs_chunk_length(leaf, chunk);
	type = btrfs_chunk_type(leaf, chunk);
	index = btrfs_bg_flags_to_raid_index(type);
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

#if BITS_PER_LONG == 32
	/* Reject metadata past the 32bit page cache limit, warn when close. */
	ret = check_32bit_meta_chunk(fs_info, logical, length, type);
	if (ret < 0)
		return ret;
	warn_32bit_meta_chunk(fs_info, logical, length, type);
#endif

	/*
	 * Only need to verify chunk item if we're reading from sys chunk array,
	 * as chunk item in tree block is already verified by tree-checker.
	 */
	if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
		ret = btrfs_check_chunk_valid(leaf, chunk, logical);
		if (ret)
			return ret;
	}

	read_lock(&map_tree->lock);
	em = lookup_extent_mapping(map_tree, logical, 1);
	read_unlock(&map_tree->lock);

	/* already mapped? */
	if (em && em->start <= logical && em->start + em->len > logical) {
		free_extent_map(em);
		return 0;
	} else if (em) {
		free_extent_map(em);
	}

	em = alloc_extent_map();
	if (!em)
		return -ENOMEM;
	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		free_extent_map(em);
		return -ENOMEM;
	}

	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	em->map_lookup = map;
	em->start = logical;
	em->len = length;
	em->orig_start = 0;
	em->block_start = 0;
	em->block_len = em->len;

	map->num_stripes = num_stripes;
	map->io_width = btrfs_chunk_io_width(leaf, chunk);
	map->io_align = btrfs_chunk_io_align(leaf, chunk);
	map->type = type;
	/*
	 * We can't use the sub_stripes value, as for profiles other than
	 * RAID10, they may have 0 as sub_stripes for filesystems created by
	 * older mkfs (<v5.4).
	 * In that case, it can cause divide-by-zero errors later.
	 * Since currently sub_stripes is fixed for each profile, let's
	 * use the trusted value instead.
	 */
	map->sub_stripes = btrfs_raid_array[index].sub_stripes;
	map->verified_stripes = 0;
	em->orig_block_len = btrfs_calc_stripe_length(em);
	/* Resolve each stripe's device by (devid, uuid). */
	for (i = 0; i < num_stripes; i++) {
		map->stripes[i].physical =
			btrfs_stripe_offset_nr(leaf, chunk, i);
		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
		args.devid = devid;
		read_extent_buffer(leaf, uuid, (unsigned long)
				   btrfs_stripe_dev_uuid_nr(chunk, i),
				   BTRFS_UUID_SIZE);
		args.uuid = uuid;
		map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
		if (!map->stripes[i].dev) {
			map->stripes[i].dev = handle_missing_device(fs_info,
								    devid, uuid);
			if (IS_ERR(map->stripes[i].dev)) {
				ret = PTR_ERR(map->stripes[i].dev);
				free_extent_map(em);
				return ret;
			}
		}

		set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
			&(map->stripes[i].dev->dev_state));
	}

	write_lock(&map_tree->lock);
	ret = add_extent_mapping(map_tree, em, 0);
	write_unlock(&map_tree->lock);
	if (ret < 0) {
		btrfs_err(fs_info,
			  "failed to add chunk map, start=%llu len=%llu: %d",
			  em->start, em->len, ret);
	}
	/* Drop our reference; the mapping tree holds its own on success. */
	free_extent_map(em);

	return ret;
}
69130b86a832SChris Mason
fill_device_from_item(struct extent_buffer * leaf,struct btrfs_dev_item * dev_item,struct btrfs_device * device)6914143bede5SJeff Mahoney static void fill_device_from_item(struct extent_buffer *leaf,
69150b86a832SChris Mason struct btrfs_dev_item *dev_item,
69160b86a832SChris Mason struct btrfs_device *device)
69170b86a832SChris Mason {
69180b86a832SChris Mason unsigned long ptr;
69190b86a832SChris Mason
69200b86a832SChris Mason device->devid = btrfs_device_id(leaf, dev_item);
6921d6397baeSChris Ball device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6922d6397baeSChris Ball device->total_bytes = device->disk_total_bytes;
6923935e5cc9SMiao Xie device->commit_total_bytes = device->disk_total_bytes;
69240b86a832SChris Mason device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6925ce7213c7SMiao Xie device->commit_bytes_used = device->bytes_used;
69260b86a832SChris Mason device->type = btrfs_device_type(leaf, dev_item);
69270b86a832SChris Mason device->io_align = btrfs_device_io_align(leaf, dev_item);
69280b86a832SChris Mason device->io_width = btrfs_device_io_width(leaf, dev_item);
69290b86a832SChris Mason device->sector_size = btrfs_device_sector_size(leaf, dev_item);
69308dabb742SStefan Behrens WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6931401e29c1SAnand Jain clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
69320b86a832SChris Mason
6933410ba3a2SGeert Uytterhoeven ptr = btrfs_device_uuid(dev_item);
6934e17cade2SChris Mason read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
69350b86a832SChris Mason }
69360b86a832SChris Mason
/*
 * Find or open the fs_devices of the seed filesystem with the given @fsid.
 *
 * Must be called with uuid_mutex held (asserted).  On the first call for a
 * seed fsid a private clone of its fs_devices is created, opened read-only
 * and anchored at fs_info->fs_devices->seed_list; later calls find it there.
 *
 * Returns the fs_devices or an ERR_PTR on failure.
 */
static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
						  u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

	lockdep_assert_held(&uuid_mutex);
	ASSERT(fsid);

	/* This will match only for multi-device seed fs */
	list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
		if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
			return fs_devices;


	fs_devices = find_fsid(fsid, NULL);
	if (!fs_devices) {
		if (!btrfs_test_opt(fs_info, DEGRADED))
			return ERR_PTR(-ENOENT);

		/*
		 * Seed devices are absent: with -o degraded, fabricate an
		 * empty fs_devices marked as seeding and already opened.
		 */
		fs_devices = alloc_fs_devices(fsid, NULL);
		if (IS_ERR(fs_devices))
			return fs_devices;

		fs_devices->seeding = true;
		fs_devices->opened = 1;
		return fs_devices;
	}

	/*
	 * Upon first call for a seed fs fsid, just create a private copy of the
	 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
	 */
	fs_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(fs_devices))
		return fs_devices;

	ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder);
	if (ret) {
		free_fs_devices(fs_devices);
		return ERR_PTR(ret);
	}

	/* A seed fs must actually be marked as seeding once opened. */
	if (!fs_devices->seeding) {
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
		return ERR_PTR(-EINVAL);
	}

	list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);

	return fs_devices;
}
69902b82032cSYan Zheng
/*
 * Read one dev item from @leaf and bind it to the matching in-memory
 * btrfs_device.
 *
 * If the item's fsid differs from the mounted fs, the device belongs to a
 * seed fs and the corresponding seed fs_devices is looked up / opened
 * first.  A device that is referenced but absent is either stubbed out as
 * missing (-o degraded) or fails the read with -ENOENT.
 *
 * Returns 0 on success, negative errno on failure.
 */
static int read_one_dev(struct extent_buffer *leaf,
			struct btrfs_dev_item *dev_item)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 devid;
	int ret;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];

	devid = btrfs_device_id(leaf, dev_item);
	args.devid = devid;
	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
			   BTRFS_UUID_SIZE);
	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
			   BTRFS_FSID_SIZE);
	args.uuid = dev_uuid;
	args.fsid = fs_uuid;

	/* A non-matching fsid means the device comes from a seed fs. */
	if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
		fs_devices = open_seed_devices(fs_info, fs_uuid);
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);
	}

	device = btrfs_find_device(fs_info->fs_devices, &args);
	if (!device) {
		if (!btrfs_test_opt(fs_info, DEGRADED)) {
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, true);
			return -ENOENT;
		}

		device = add_missing_dev(fs_devices, devid, dev_uuid);
		if (IS_ERR(device)) {
			btrfs_err(fs_info,
				"failed to add missing dev %llu: %ld",
				devid, PTR_ERR(device));
			return PTR_ERR(device);
		}
		btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
	} else {
		if (!device->bdev) {
			if (!btrfs_test_opt(fs_info, DEGRADED)) {
				btrfs_report_missing_device(fs_info,
						devid, dev_uuid, true);
				return -ENOENT;
			}
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, false);
		}

		if (!device->bdev &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			/*
			 * this happens when a device that was properly setup
			 * in the device info lists suddenly goes bad.
			 * device->bdev is NULL, and so we have to set
			 * device->missing to one here
			 */
			device->fs_devices->missing_devices++;
			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}

		/* Move the device to its own fs_devices */
		if (device->fs_devices != fs_devices) {
			ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
					&device->dev_state));

			list_move(&device->dev_list, &fs_devices->devices);
			device->fs_devices->num_devices--;
			fs_devices->num_devices++;

			device->fs_devices->missing_devices--;
			fs_devices->missing_devices++;

			device->fs_devices = fs_devices;
		}
	}

	/* Seed devices must be read-only and match the recorded generation. */
	if (device->fs_devices != fs_info->fs_devices) {
		BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
		if (device->generation !=
		    btrfs_device_generation(leaf, dev_item))
			return -EINVAL;
	}

	fill_device_from_item(leaf, dev_item, device);
	if (device->bdev) {
		u64 max_total_bytes = bdev_nr_bytes(device->bdev);

		/* The on-disk size must not exceed the actual block device. */
		if (device->total_bytes > max_total_bytes) {
			btrfs_err(fs_info,
			"device total_bytes should be at most %llu but found %llu",
				  max_total_bytes, device->total_bytes);
			return -EINVAL;
		}
	}
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		device->fs_devices->total_rw_bytes += device->total_bytes;
		atomic64_add(device->total_bytes - device->bytes_used,
				&fs_info->free_chunk_space);
	}
	ret = 0;
	return ret;
}
71010b86a832SChris Mason
/*
 * Read all chunk items embedded in the superblock's sys_chunk_array and
 * insert their mappings via read_one_chunk().
 *
 * The superblock copy is written into a dummy extent buffer so the regular
 * extent buffer accessors can be used on it.  Each array entry is a
 * btrfs_disk_key followed by a btrfs_chunk; entries are validated for
 * length, key type, stripe count and chunk type as they are walked.
 *
 * Returns 0 on success, -EIO on a malformed array, or the error from
 * read_one_chunk().
 */
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct extent_buffer *sb;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *array_ptr;
	unsigned long sb_array_offset;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur_offset;
	u64 type;
	struct btrfs_key key;

	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);

	/*
	 * We allocated a dummy extent, just to use extent buffer accessors.
	 * There will be unused space after BTRFS_SUPER_INFO_SIZE, but
	 * that's fine, we will not go beyond system chunk array anyway.
	 */
	sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
	if (!sb)
		return -ENOMEM;
	set_extent_buffer_uptodate(sb);

	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
	array_size = btrfs_super_sys_array_size(super_copy);

	/*
	 * Walk the array with two synchronized cursors: array_ptr into the
	 * in-memory superblock copy and sb_array_offset into the dummy
	 * extent buffer.
	 */
	array_ptr = super_copy->sys_chunk_array;
	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
	cur_offset = 0;

	while (cur_offset < array_size) {
		disk_key = (struct btrfs_disk_key *)array_ptr;
		len = sizeof(*disk_key);
		if (cur_offset + len > array_size)
			goto out_short_read;

		btrfs_disk_key_to_cpu(&key, disk_key);

		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;

		if (key.type != BTRFS_CHUNK_ITEM_KEY) {
			btrfs_err(fs_info,
			    "unexpected item type %u in sys_array at offset %u",
				  (u32)key.type, cur_offset);
			ret = -EIO;
			break;
		}

		/*
		 * NOTE: this "pointer" is really the offset within the dummy
		 * extent buffer; the eb accessors below take it as such.
		 */
		chunk = (struct btrfs_chunk *)sb_array_offset;
		/*
		 * At least one btrfs_chunk with one stripe must be present,
		 * exact stripe count check comes afterwards
		 */
		len = btrfs_chunk_item_size(1);
		if (cur_offset + len > array_size)
			goto out_short_read;

		num_stripes = btrfs_chunk_num_stripes(sb, chunk);
		if (!num_stripes) {
			btrfs_err(fs_info,
			"invalid number of stripes %u in sys_array at offset %u",
				  num_stripes, cur_offset);
			ret = -EIO;
			break;
		}

		type = btrfs_chunk_type(sb, chunk);
		if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
			btrfs_err(fs_info,
			"invalid chunk type %llu in sys_array at offset %u",
				  type, cur_offset);
			ret = -EIO;
			break;
		}

		len = btrfs_chunk_item_size(num_stripes);
		if (cur_offset + len > array_size)
			goto out_short_read;

		ret = read_one_chunk(&key, sb, chunk);
		if (ret)
			break;

		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;
	}
	clear_extent_buffer_uptodate(sb);
	free_extent_buffer_stale(sb);
	return ret;

out_short_read:
	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
			len, cur_offset);
	clear_extent_buffer_uptodate(sb);
	free_extent_buffer_stale(sb);
	return -EIO;
}
72070b86a832SChris Mason
/*
 * Check if all chunks in the fs are OK for read-write degraded mount
 *
 * If the @failing_dev is specified, it's accounted as missing.
 *
 * Return true if all chunks meet the minimal RW mount requirements.
 * Return false if any chunk doesn't meet the minimal RW mount requirements.
 */
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
			       struct btrfs_device *failing_dev)
{
	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	u64 next_start = 0;
	bool ret = true;

	read_lock(&map_tree->lock);
	em = lookup_extent_mapping(map_tree, 0, (u64)-1);
	read_unlock(&map_tree->lock);
	/* No chunk at all? Return false anyway */
	if (!em) {
		ret = false;
		goto out;
	}
	/* Walk every chunk mapping in logical address order. */
	while (em) {
		struct map_lookup *map;
		int missing = 0;
		int max_tolerated;
		int i;

		map = em->map_lookup;
		max_tolerated =
			btrfs_get_num_tolerated_disk_barrier_failures(
					map->type);
		/* Count stripes whose device is absent, unopened or failing. */
		for (i = 0; i < map->num_stripes; i++) {
			struct btrfs_device *dev = map->stripes[i].dev;

			if (!dev || !dev->bdev ||
			    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
			    dev->last_flush_error)
				missing++;
			else if (failing_dev && failing_dev == dev)
				missing++;
		}
		if (missing > max_tolerated) {
			if (!failing_dev)
				btrfs_warn(fs_info,
	"chunk %llu missing %d devices, max tolerance is %d for writable mount",
				   em->start, missing, max_tolerated);
			free_extent_map(em);
			ret = false;
			goto out;
		}
		next_start = extent_map_end(em);
		free_extent_map(em);

		read_lock(&map_tree->lock);
		em = lookup_extent_mapping(map_tree, next_start,
					   (u64)(-1) - next_start);
		read_unlock(&map_tree->lock);
	}
out:
	return ret;
}
727221634a19SQu Wenruo
/* Kick off readahead for every child block of @node, in slot order. */
static void readahead_tree_node_children(struct extent_buffer *node)
{
	const int nr = btrfs_header_nritems(node);
	int slot;

	for (slot = 0; slot < nr; slot++)
		btrfs_readahead_node_child(node, slot);
}
7281d85327b1SDavid Sterba
/*
 * Load every device item and chunk item from the chunk tree at mount time,
 * building the in-memory device list and chunk mapping, then cross-check the
 * device count and total byte counts against the superblock.
 *
 * Returns 0 on success or a negative errno on failure.
 */
btrfs_read_chunk_tree(struct btrfs_fs_info * fs_info)72825b4aacefSJeff Mahoney int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
72830b86a832SChris Mason {
72845b4aacefSJeff Mahoney struct btrfs_root *root = fs_info->chunk_root;
72850b86a832SChris Mason struct btrfs_path *path;
72860b86a832SChris Mason struct extent_buffer *leaf;
72870b86a832SChris Mason struct btrfs_key key;
72880b86a832SChris Mason struct btrfs_key found_key;
72890b86a832SChris Mason int ret;
72900b86a832SChris Mason int slot;
729143cb1478SGabriel Niebler int iter_ret = 0;
729299e3ecfcSLiu Bo u64 total_dev = 0;
7293d85327b1SDavid Sterba u64 last_ra_node = 0;
72940b86a832SChris Mason
72950b86a832SChris Mason path = btrfs_alloc_path();
72960b86a832SChris Mason if (!path)
72970b86a832SChris Mason return -ENOMEM;
72980b86a832SChris Mason
72993dd0f7a3SAnand Jain /*
73003dd0f7a3SAnand Jain * uuid_mutex is needed only if we are mounting a sprout FS
73013dd0f7a3SAnand Jain * otherwise we don't need it.
73023dd0f7a3SAnand Jain */
7303b367e47fSLi Zefan mutex_lock(&uuid_mutex);
7304b367e47fSLi Zefan
7305395927a9SFilipe David Borba Manana /*
730648cfa61bSBoris Burkov * It is possible for mount and umount to race in such a way that
730748cfa61bSBoris Burkov * we execute this code path, but open_fs_devices failed to clear
730848cfa61bSBoris Burkov * total_rw_bytes. We certainly want it cleared before reading the
730948cfa61bSBoris Burkov * device items, so clear it here.
731048cfa61bSBoris Burkov */
731148cfa61bSBoris Burkov fs_info->fs_devices->total_rw_bytes = 0;
731248cfa61bSBoris Burkov
731348cfa61bSBoris Burkov /*
73144d9380e0SFilipe Manana * Lockdep complains about possible circular locking dependency between
73154d9380e0SFilipe Manana * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
73164d9380e0SFilipe Manana * used for freeze protection of a fs (struct super_block.s_writers),
73174d9380e0SFilipe Manana * which we take when starting a transaction, and extent buffers of the
73184d9380e0SFilipe Manana * chunk tree if we call read_one_dev() while holding a lock on an
73194d9380e0SFilipe Manana * extent buffer of the chunk tree. Since we are mounting the filesystem
73204d9380e0SFilipe Manana * and at this point there can't be any concurrent task modifying the
73214d9380e0SFilipe Manana * chunk tree, to keep it simple, just skip locking on the chunk tree.
73224d9380e0SFilipe Manana */
73234d9380e0SFilipe Manana ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
73244d9380e0SFilipe Manana path->skip_locking = 1;
73254d9380e0SFilipe Manana
73264d9380e0SFilipe Manana /*
7327395927a9SFilipe David Borba Manana * Read all device items, and then all the chunk items. All
7328395927a9SFilipe David Borba Manana * device items are found before any chunk item (their object id
7329395927a9SFilipe David Borba Manana * is smaller than the lowest possible object id for a chunk
7330395927a9SFilipe David Borba Manana * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
73310b86a832SChris Mason */
73320b86a832SChris Mason key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
73330b86a832SChris Mason key.offset = 0;
73340b86a832SChris Mason key.type = 0;
733543cb1478SGabriel Niebler btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
733643cb1478SGabriel Niebler struct extent_buffer *node = path->nodes[1];
7337d85327b1SDavid Sterba
73380b86a832SChris Mason leaf = path->nodes[0];
73390b86a832SChris Mason slot = path->slots[0];
734043cb1478SGabriel Niebler
/* Readahead the parent node's children once per node to speed up the scan. */
7341d85327b1SDavid Sterba if (node) {
7342d85327b1SDavid Sterba if (last_ra_node != node->start) {
7343d85327b1SDavid Sterba readahead_tree_node_children(node);
7344d85327b1SDavid Sterba last_ra_node = node->start;
7345d85327b1SDavid Sterba }
7346d85327b1SDavid Sterba }
73470b86a832SChris Mason if (found_key.type == BTRFS_DEV_ITEM_KEY) {
73480b86a832SChris Mason struct btrfs_dev_item *dev_item;
73490b86a832SChris Mason dev_item = btrfs_item_ptr(leaf, slot,
73500b86a832SChris Mason struct btrfs_dev_item);
735117850759SDavid Sterba ret = read_one_dev(leaf, dev_item);
73522b82032cSYan Zheng if (ret)
73532b82032cSYan Zheng goto error;
735499e3ecfcSLiu Bo total_dev++;
73550b86a832SChris Mason } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
73560b86a832SChris Mason struct btrfs_chunk *chunk;
735779bd3712SFilipe Manana
735879bd3712SFilipe Manana /*
735979bd3712SFilipe Manana * We are only called at mount time, so no need to take
736079bd3712SFilipe Manana * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
736179bd3712SFilipe Manana * we always lock first fs_info->chunk_mutex before
736279bd3712SFilipe Manana * acquiring any locks on the chunk tree. This is a
736379bd3712SFilipe Manana * requirement for chunk allocation, see the comment on
736479bd3712SFilipe Manana * top of btrfs_chunk_alloc() for details.
736579bd3712SFilipe Manana */
73660b86a832SChris Mason chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
73679690ac09SDavid Sterba ret = read_one_chunk(&found_key, leaf, chunk);
73682b82032cSYan Zheng if (ret)
73692b82032cSYan Zheng goto error;
73700b86a832SChris Mason }
737143cb1478SGabriel Niebler }
737243cb1478SGabriel Niebler /* Catch error found during iteration */
737343cb1478SGabriel Niebler if (iter_ret < 0) {
737443cb1478SGabriel Niebler ret = iter_ret;
737543cb1478SGabriel Niebler goto error;
73760b86a832SChris Mason }
737799e3ecfcSLiu Bo
737899e3ecfcSLiu Bo /*
737999e3ecfcSLiu Bo * After loading chunk tree, we've got all device information,
738099e3ecfcSLiu Bo * do another round of validation checks.
738199e3ecfcSLiu Bo */
73820b246afaSJeff Mahoney if (total_dev != fs_info->fs_devices->total_devices) {
7383d201238cSQu Wenruo btrfs_warn(fs_info,
7384d201238cSQu Wenruo "super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
73850b246afaSJeff Mahoney btrfs_super_num_devices(fs_info->super_copy),
738699e3ecfcSLiu Bo total_dev);
7387d201238cSQu Wenruo fs_info->fs_devices->total_devices = total_dev;
7388d201238cSQu Wenruo btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
738999e3ecfcSLiu Bo }
73900b246afaSJeff Mahoney if (btrfs_super_total_bytes(fs_info->super_copy) <
73910b246afaSJeff Mahoney fs_info->fs_devices->total_rw_bytes) {
73920b246afaSJeff Mahoney btrfs_err(fs_info,
739399e3ecfcSLiu Bo "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
73940b246afaSJeff Mahoney btrfs_super_total_bytes(fs_info->super_copy),
73950b246afaSJeff Mahoney fs_info->fs_devices->total_rw_bytes);
739699e3ecfcSLiu Bo ret = -EINVAL;
739799e3ecfcSLiu Bo goto error;
739899e3ecfcSLiu Bo }
73990b86a832SChris Mason ret = 0;
74000b86a832SChris Mason error:
7401b367e47fSLi Zefan mutex_unlock(&uuid_mutex);
7402b367e47fSLi Zefan
74032b82032cSYan Zheng btrfs_free_path(path);
74040b86a832SChris Mason return ret;
74050b86a832SChris Mason }
7406442a4f63SStefan Behrens
/*
 * Late per-device initialization: point fs_devices and every device
 * (including devices of all seed filesystems) at @fs_info, and load zone
 * info for seed devices.  Stops at the first btrfs_get_dev_zone_info()
 * failure and returns that error; returns 0 otherwise.
 */
btrfs_init_devices_late(struct btrfs_fs_info * fs_info)7407a8d1b164SJohannes Thumshirn int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7408cb517eabSMiao Xie {
7409944d3f9fSNikolay Borisov struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7410cb517eabSMiao Xie struct btrfs_device *device;
7411a8d1b164SJohannes Thumshirn int ret = 0;
7412cb517eabSMiao Xie
7413944d3f9fSNikolay Borisov fs_devices->fs_info = fs_info;
7414944d3f9fSNikolay Borisov
7415cb517eabSMiao Xie mutex_lock(&fs_devices->device_list_mutex);
7416cb517eabSMiao Xie list_for_each_entry(device, &fs_devices->devices, dev_list)
7417fb456252SJeff Mahoney device->fs_info = fs_info;
741829cc83f6SLiu Bo
7419944d3f9fSNikolay Borisov list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7420a8d1b164SJohannes Thumshirn list_for_each_entry(device, &seed_devs->devices, dev_list) {
7421944d3f9fSNikolay Borisov device->fs_info = fs_info;
7422a8d1b164SJohannes Thumshirn ret = btrfs_get_dev_zone_info(device, false);
7423a8d1b164SJohannes Thumshirn if (ret)
7424a8d1b164SJohannes Thumshirn break;
7425a8d1b164SJohannes Thumshirn }
7426944d3f9fSNikolay Borisov
/* Note: on inner-loop error, the outer loop still continues to the next seed. */
7427944d3f9fSNikolay Borisov seed_devs->fs_info = fs_info;
742829cc83f6SLiu Bo }
7429e17125b5SAnand Jain mutex_unlock(&fs_devices->device_list_mutex);
7430a8d1b164SJohannes Thumshirn
7431a8d1b164SJohannes Thumshirn return ret;
7432cb517eabSMiao Xie }
7433cb517eabSMiao Xie
btrfs_dev_stats_value(const struct extent_buffer * eb,const struct btrfs_dev_stats_item * ptr,int index)74341dc990dfSDavid Sterba static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
74351dc990dfSDavid Sterba const struct btrfs_dev_stats_item *ptr,
74361dc990dfSDavid Sterba int index)
74371dc990dfSDavid Sterba {
74381dc990dfSDavid Sterba u64 val;
74391dc990dfSDavid Sterba
74401dc990dfSDavid Sterba read_extent_buffer(eb, &val,
74411dc990dfSDavid Sterba offsetof(struct btrfs_dev_stats_item, values) +
74421dc990dfSDavid Sterba ((unsigned long)ptr) + (index * sizeof(u64)),
74431dc990dfSDavid Sterba sizeof(val));
74441dc990dfSDavid Sterba return val;
74451dc990dfSDavid Sterba }
74461dc990dfSDavid Sterba
/*
 * Write @val into the @index-th u64 counter of an on-disk dev_stats item.
 * @ptr is an offset inside @eb, as returned by btrfs_item_ptr().
 */
static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
				      struct btrfs_dev_stats_item *ptr,
				      int index, u64 val)
{
	const unsigned long value_off = (unsigned long)ptr +
		offsetof(struct btrfs_dev_stats_item, values) +
		index * sizeof(u64);

	write_extent_buffer(eb, &val, value_off, sizeof(val));
}
74561dc990dfSDavid Sterba
/*
 * Load the persisted dev_stats counters for one device from the device tree.
 *
 * If the dev root is missing (e.g. ignorebadroots), do nothing and return 0.
 * If no dev_stats item exists for the device, zero the in-memory counters
 * and still mark the stats valid.  An item shorter than expected (older
 * format) yields zeroes for the missing trailing counters.
 */
btrfs_device_init_dev_stats(struct btrfs_device * device,struct btrfs_path * path)745792e26df4SJosef Bacik static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7458124604ebSJosef Bacik struct btrfs_path *path)
7459733f4fbbSStefan Behrens {
7460733f4fbbSStefan Behrens struct btrfs_dev_stats_item *ptr;
7461124604ebSJosef Bacik struct extent_buffer *eb;
7462124604ebSJosef Bacik struct btrfs_key key;
7463124604ebSJosef Bacik int item_size;
7464124604ebSJosef Bacik int i, ret, slot;
7465733f4fbbSStefan Behrens
746682d62d06SJosef Bacik if (!device->fs_info->dev_root)
746782d62d06SJosef Bacik return 0;
746882d62d06SJosef Bacik
7469242e2956SDavid Sterba key.objectid = BTRFS_DEV_STATS_OBJECTID;
7470242e2956SDavid Sterba key.type = BTRFS_PERSISTENT_ITEM_KEY;
7471733f4fbbSStefan Behrens key.offset = device->devid;
7472124604ebSJosef Bacik ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7473733f4fbbSStefan Behrens if (ret) {
/* No item found (ret > 0): stats start at zero but are still valid. */
7474ae4b9b4cSAnand Jain for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7475ae4b9b4cSAnand Jain btrfs_dev_stat_set(device, i, 0);
7476733f4fbbSStefan Behrens device->dev_stats_valid = 1;
7477733f4fbbSStefan Behrens btrfs_release_path(path);
747892e26df4SJosef Bacik return ret < 0 ? ret : 0;
7479733f4fbbSStefan Behrens }
7480733f4fbbSStefan Behrens slot = path->slots[0];
7481733f4fbbSStefan Behrens eb = path->nodes[0];
74823212fa14SJosef Bacik item_size = btrfs_item_size(eb, slot);
7483733f4fbbSStefan Behrens
7484124604ebSJosef Bacik ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7485733f4fbbSStefan Behrens
7486733f4fbbSStefan Behrens for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
/* Guard each read against an undersized (older-format) on-disk item. */
7487733f4fbbSStefan Behrens if (item_size >= (1 + i) * sizeof(__le64))
7488733f4fbbSStefan Behrens btrfs_dev_stat_set(device, i,
7489733f4fbbSStefan Behrens btrfs_dev_stats_value(eb, ptr, i));
7490733f4fbbSStefan Behrens else
74914e411a7dSAnand Jain btrfs_dev_stat_set(device, i, 0);
7492733f4fbbSStefan Behrens }
7493733f4fbbSStefan Behrens
7494733f4fbbSStefan Behrens device->dev_stats_valid = 1;
7495733f4fbbSStefan Behrens btrfs_dev_stat_print_on_load(device);
7496733f4fbbSStefan Behrens btrfs_release_path(path);
749792e26df4SJosef Bacik
749892e26df4SJosef Bacik return 0;
7499733f4fbbSStefan Behrens }
7500124604ebSJosef Bacik
/*
 * Load persisted dev_stats for every device, including devices of all seed
 * filesystems, under device_list_mutex.  Returns 0 on success or the first
 * error from btrfs_device_init_dev_stats().
 */
btrfs_init_dev_stats(struct btrfs_fs_info * fs_info)7501124604ebSJosef Bacik int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7502124604ebSJosef Bacik {
7503124604ebSJosef Bacik struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7504124604ebSJosef Bacik struct btrfs_device *device;
7505124604ebSJosef Bacik struct btrfs_path *path = NULL;
750692e26df4SJosef Bacik int ret = 0;
7507124604ebSJosef Bacik
7508124604ebSJosef Bacik path = btrfs_alloc_path();
7509124604ebSJosef Bacik if (!path)
7510124604ebSJosef Bacik return -ENOMEM;
7511124604ebSJosef Bacik
7512124604ebSJosef Bacik mutex_lock(&fs_devices->device_list_mutex);
751392e26df4SJosef Bacik list_for_each_entry(device, &fs_devices->devices, dev_list) {
751492e26df4SJosef Bacik ret = btrfs_device_init_dev_stats(device, path);
751592e26df4SJosef Bacik if (ret)
751692e26df4SJosef Bacik goto out;
7517124604ebSJosef Bacik }
751892e26df4SJosef Bacik list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
751992e26df4SJosef Bacik list_for_each_entry(device, &seed_devs->devices, dev_list) {
752092e26df4SJosef Bacik ret = btrfs_device_init_dev_stats(device, path);
752192e26df4SJosef Bacik if (ret)
752292e26df4SJosef Bacik goto out;
752392e26df4SJosef Bacik }
752492e26df4SJosef Bacik }
752592e26df4SJosef Bacik out:
7526733f4fbbSStefan Behrens mutex_unlock(&fs_devices->device_list_mutex);
7527733f4fbbSStefan Behrens
7528733f4fbbSStefan Behrens btrfs_free_path(path);
752992e26df4SJosef Bacik return ret;
7530733f4fbbSStefan Behrens }
7531733f4fbbSStefan Behrens
/*
 * Persist the in-memory dev_stats counters of @device into its dev_stats
 * item in the device tree.  An existing item that is too small for the
 * current counter set is deleted and re-inserted at full size.
 */
update_dev_stat_item(struct btrfs_trans_handle * trans,struct btrfs_device * device)7532733f4fbbSStefan Behrens static int update_dev_stat_item(struct btrfs_trans_handle *trans,
7533733f4fbbSStefan Behrens struct btrfs_device *device)
7534733f4fbbSStefan Behrens {
75355495f195SNikolay Borisov struct btrfs_fs_info *fs_info = trans->fs_info;
75366bccf3abSJeff Mahoney struct btrfs_root *dev_root = fs_info->dev_root;
7537733f4fbbSStefan Behrens struct btrfs_path *path;
7538733f4fbbSStefan Behrens struct btrfs_key key;
7539733f4fbbSStefan Behrens struct extent_buffer *eb;
7540733f4fbbSStefan Behrens struct btrfs_dev_stats_item *ptr;
7541733f4fbbSStefan Behrens int ret;
7542733f4fbbSStefan Behrens int i;
7543733f4fbbSStefan Behrens
7544242e2956SDavid Sterba key.objectid = BTRFS_DEV_STATS_OBJECTID;
7545242e2956SDavid Sterba key.type = BTRFS_PERSISTENT_ITEM_KEY;
7546733f4fbbSStefan Behrens key.offset = device->devid;
7547733f4fbbSStefan Behrens
7548733f4fbbSStefan Behrens path = btrfs_alloc_path();
7549fa252992SDavid Sterba if (!path)
7550fa252992SDavid Sterba return -ENOMEM;
7551733f4fbbSStefan Behrens ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
7552733f4fbbSStefan Behrens if (ret < 0) {
75530b246afaSJeff Mahoney btrfs_warn_in_rcu(fs_info,
7554ecaeb14bSDavid Sterba "error %d while searching for dev_stats item for device %s",
7555cb3e217bSQu Wenruo ret, btrfs_dev_name(device));
7556733f4fbbSStefan Behrens goto out;
7557733f4fbbSStefan Behrens }
7558733f4fbbSStefan Behrens
/* ret == 0: item exists; replace it if the on-disk size is too small. */
7559733f4fbbSStefan Behrens if (ret == 0 &&
75603212fa14SJosef Bacik btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
7561733f4fbbSStefan Behrens /* need to delete old one and insert a new one */
7562733f4fbbSStefan Behrens ret = btrfs_del_item(trans, dev_root, path);
7563733f4fbbSStefan Behrens if (ret != 0) {
75640b246afaSJeff Mahoney btrfs_warn_in_rcu(fs_info,
7565ecaeb14bSDavid Sterba "delete too small dev_stats item for device %s failed %d",
7566cb3e217bSQu Wenruo btrfs_dev_name(device), ret);
7567733f4fbbSStefan Behrens goto out;
7568733f4fbbSStefan Behrens }
/* Fall through to the insert path below. */
7569733f4fbbSStefan Behrens ret = 1;
7570733f4fbbSStefan Behrens }
7571733f4fbbSStefan Behrens
7572733f4fbbSStefan Behrens if (ret == 1) {
7573733f4fbbSStefan Behrens /* need to insert a new item */
7574733f4fbbSStefan Behrens btrfs_release_path(path);
7575733f4fbbSStefan Behrens ret = btrfs_insert_empty_item(trans, dev_root, path,
7576733f4fbbSStefan Behrens &key, sizeof(*ptr));
7577733f4fbbSStefan Behrens if (ret < 0) {
75780b246afaSJeff Mahoney btrfs_warn_in_rcu(fs_info,
7579ecaeb14bSDavid Sterba "insert dev_stats item for device %s failed %d",
7580cb3e217bSQu Wenruo btrfs_dev_name(device), ret);
7581733f4fbbSStefan Behrens goto out;
7582733f4fbbSStefan Behrens }
7583733f4fbbSStefan Behrens }
7584733f4fbbSStefan Behrens
7585733f4fbbSStefan Behrens eb = path->nodes[0];
7586733f4fbbSStefan Behrens ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
7587733f4fbbSStefan Behrens for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7588733f4fbbSStefan Behrens btrfs_set_dev_stats_value(eb, ptr, i,
7589733f4fbbSStefan Behrens btrfs_dev_stat_read(device, i));
7590d5e09e38SFilipe Manana btrfs_mark_buffer_dirty(trans, eb);
7591733f4fbbSStefan Behrens
7592733f4fbbSStefan Behrens out:
7593733f4fbbSStefan Behrens btrfs_free_path(path);
7594733f4fbbSStefan Behrens return ret;
7595733f4fbbSStefan Behrens }
7596733f4fbbSStefan Behrens
7597733f4fbbSStefan Behrens /*
7598733f4fbbSStefan Behrens * called from commit_transaction. Writes all changed device stats to disk.
7599733f4fbbSStefan Behrens */
btrfs_run_dev_stats(struct btrfs_trans_handle * trans)7600196c9d8dSDavid Sterba int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
7601733f4fbbSStefan Behrens {
7602196c9d8dSDavid Sterba struct btrfs_fs_info *fs_info = trans->fs_info;
7603733f4fbbSStefan Behrens struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7604733f4fbbSStefan Behrens struct btrfs_device *device;
7605addc3fa7SMiao Xie int stats_cnt;
7606733f4fbbSStefan Behrens int ret = 0;
7607733f4fbbSStefan Behrens
7608733f4fbbSStefan Behrens mutex_lock(&fs_devices->device_list_mutex);
7609733f4fbbSStefan Behrens list_for_each_entry(device, &fs_devices->devices, dev_list) {
/* Snapshot the change counter; it is subtracted (not zeroed) on success. */
76109deae968SNikolay Borisov stats_cnt = atomic_read(&device->dev_stats_ccnt);
76119deae968SNikolay Borisov if (!device->dev_stats_valid || stats_cnt == 0)
7612733f4fbbSStefan Behrens continue;
7613733f4fbbSStefan Behrens
76149deae968SNikolay Borisov
76159deae968SNikolay Borisov /*
76169deae968SNikolay Borisov * There is a LOAD-LOAD control dependency between the value of
76179deae968SNikolay Borisov * dev_stats_ccnt and updating the on-disk values which requires
76189deae968SNikolay Borisov * reading the in-memory counters. Such control dependencies
76199deae968SNikolay Borisov * require explicit read memory barriers.
76209deae968SNikolay Borisov *
76219deae968SNikolay Borisov * This memory barrier pairs with smp_mb__before_atomic in
76229deae968SNikolay Borisov * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
76239deae968SNikolay Borisov * barrier implied by atomic_xchg in
76249deae968SNikolay Borisov * btrfs_dev_stats_read_and_reset
76259deae968SNikolay Borisov */
76269deae968SNikolay Borisov smp_rmb();
76279deae968SNikolay Borisov
76285495f195SNikolay Borisov ret = update_dev_stat_item(trans, device);
7629733f4fbbSStefan Behrens if (!ret)
7630addc3fa7SMiao Xie atomic_sub(stats_cnt, &device->dev_stats_ccnt);
7631733f4fbbSStefan Behrens }
7632733f4fbbSStefan Behrens mutex_unlock(&fs_devices->device_list_mutex);
7633733f4fbbSStefan Behrens
7634733f4fbbSStefan Behrens return ret;
7635733f4fbbSStefan Behrens }
7636733f4fbbSStefan Behrens
/*
 * Increment dev_stats counter @index on @dev and, once the stats have been
 * loaded (dev_stats_valid), log all current error counters.
 */
btrfs_dev_stat_inc_and_print(struct btrfs_device * dev,int index)7637442a4f63SStefan Behrens void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
7638442a4f63SStefan Behrens {
7639442a4f63SStefan Behrens btrfs_dev_stat_inc(dev, index);
7640442a4f63SStefan Behrens
7641733f4fbbSStefan Behrens if (!dev->dev_stats_valid)
7642733f4fbbSStefan Behrens return;
7643fb456252SJeff Mahoney btrfs_err_rl_in_rcu(dev->fs_info,
7644b14af3b4SDavid Sterba "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7645cb3e217bSQu Wenruo btrfs_dev_name(dev),
7646442a4f63SStefan Behrens btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7647442a4f63SStefan Behrens btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7648442a4f63SStefan Behrens btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7649efe120a0SFrank Holton btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7650efe120a0SFrank Holton btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7651442a4f63SStefan Behrens }
7652c11d2c23SStefan Behrens
/*
 * Print a device's error counters when its stats are loaded at mount time.
 * Stays silent when all counters are zero.
 */
btrfs_dev_stat_print_on_load(struct btrfs_device * dev)7653733f4fbbSStefan Behrens static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7654733f4fbbSStefan Behrens {
7655a98cdb85SStefan Behrens int i;
7656a98cdb85SStefan Behrens
7657a98cdb85SStefan Behrens for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7658a98cdb85SStefan Behrens if (btrfs_dev_stat_read(dev, i) != 0)
7659a98cdb85SStefan Behrens break;
7660a98cdb85SStefan Behrens if (i == BTRFS_DEV_STAT_VALUES_MAX)
7661a98cdb85SStefan Behrens return; /* all values == 0, suppress message */
7662a98cdb85SStefan Behrens
7663fb456252SJeff Mahoney btrfs_info_in_rcu(dev->fs_info,
7664ecaeb14bSDavid Sterba "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7665cb3e217bSQu Wenruo btrfs_dev_name(dev),
7666733f4fbbSStefan Behrens btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7667733f4fbbSStefan Behrens btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7668733f4fbbSStefan Behrens btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7669733f4fbbSStefan Behrens btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7670733f4fbbSStefan Behrens btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7671733f4fbbSStefan Behrens }
7672733f4fbbSStefan Behrens
/*
 * Fill @stats with the error counters of the device identified by
 * stats->devid.  If BTRFS_DEV_STATS_RESET is set in stats->flags, the
 * counters are atomically read-and-reset.  Returns -ENODEV when the device
 * is unknown or its stats were never loaded.
 */
btrfs_get_dev_stats(struct btrfs_fs_info * fs_info,struct btrfs_ioctl_get_dev_stats * stats)76732ff7e61eSJeff Mahoney int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7674b27f7c0cSDavid Sterba struct btrfs_ioctl_get_dev_stats *stats)
7675c11d2c23SStefan Behrens {
7676562d7b15SJosef Bacik BTRFS_DEV_LOOKUP_ARGS(args);
7677c11d2c23SStefan Behrens struct btrfs_device *dev;
76780b246afaSJeff Mahoney struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7679c11d2c23SStefan Behrens int i;
7680c11d2c23SStefan Behrens
7681c11d2c23SStefan Behrens mutex_lock(&fs_devices->device_list_mutex);
7682562d7b15SJosef Bacik args.devid = stats->devid;
7683562d7b15SJosef Bacik dev = btrfs_find_device(fs_info->fs_devices, &args);
7684c11d2c23SStefan Behrens mutex_unlock(&fs_devices->device_list_mutex);
7685c11d2c23SStefan Behrens
7686c11d2c23SStefan Behrens if (!dev) {
76870b246afaSJeff Mahoney btrfs_warn(fs_info, "get dev_stats failed, device not found");
7688c11d2c23SStefan Behrens return -ENODEV;
7689733f4fbbSStefan Behrens } else if (!dev->dev_stats_valid) {
76900b246afaSJeff Mahoney btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7691733f4fbbSStefan Behrens return -ENODEV;
7692b27f7c0cSDavid Sterba } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7693c11d2c23SStefan Behrens for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
/* Only copy as many counters as the caller's buffer can hold. */
7694c11d2c23SStefan Behrens if (stats->nr_items > i)
7695c11d2c23SStefan Behrens stats->values[i] =
7696c11d2c23SStefan Behrens btrfs_dev_stat_read_and_reset(dev, i);
7697c11d2c23SStefan Behrens else
76984e411a7dSAnand Jain btrfs_dev_stat_set(dev, i, 0);
7699c11d2c23SStefan Behrens }
7700a69976bcSAnand Jain btrfs_info(fs_info, "device stats zeroed by %s (%d)",
7701a69976bcSAnand Jain current->comm, task_pid_nr(current));
7702c11d2c23SStefan Behrens } else {
7703c11d2c23SStefan Behrens for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7704c11d2c23SStefan Behrens if (stats->nr_items > i)
7705c11d2c23SStefan Behrens stats->values[i] = btrfs_dev_stat_read(dev, i);
7706c11d2c23SStefan Behrens }
/* Report back how many counters actually exist. */
7707c11d2c23SStefan Behrens if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7708c11d2c23SStefan Behrens stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7709c11d2c23SStefan Behrens return 0;
7710c11d2c23SStefan Behrens }
7711a8a6dab7SStefan Behrens
7712935e5cc9SMiao Xie /*
7713bbbf7243SNikolay Borisov * Update the size and bytes used for each device where it changed. This is
7714bbbf7243SNikolay Borisov * delayed since we would otherwise get errors while writing out the
7715935e5cc9SMiao Xie * superblocks.
7716bbbf7243SNikolay Borisov *
7717bbbf7243SNikolay Borisov * Must be invoked during transaction commit.
7718935e5cc9SMiao Xie */
btrfs_commit_device_sizes(struct btrfs_transaction * trans)7719bbbf7243SNikolay Borisov void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
7720935e5cc9SMiao Xie {
7721935e5cc9SMiao Xie struct btrfs_device *curr, *next;
7722935e5cc9SMiao Xie
7723bbbf7243SNikolay Borisov ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7724bbbf7243SNikolay Borisov
7725bbbf7243SNikolay Borisov if (list_empty(&trans->dev_update_list))
7726935e5cc9SMiao Xie return;
7727935e5cc9SMiao Xie
7728bbbf7243SNikolay Borisov /*
7729bbbf7243SNikolay Borisov * We don't need the device_list_mutex here. This list is owned by the
7730bbbf7243SNikolay Borisov * transaction and the transaction must complete before the device is
7731bbbf7243SNikolay Borisov * released.
7732bbbf7243SNikolay Borisov */
7733bbbf7243SNikolay Borisov mutex_lock(&trans->fs_info->chunk_mutex);
7734bbbf7243SNikolay Borisov list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7735bbbf7243SNikolay Borisov post_commit_list) {
7736bbbf7243SNikolay Borisov list_del_init(&curr->post_commit_list);
7737935e5cc9SMiao Xie curr->commit_total_bytes = curr->disk_total_bytes;
7738bbbf7243SNikolay Borisov curr->commit_bytes_used = curr->bytes_used;
7739935e5cc9SMiao Xie }
7740bbbf7243SNikolay Borisov mutex_unlock(&trans->fs_info->chunk_mutex);
7741ce7213c7SMiao Xie }
77425a13f430SAnand Jain
774346df06b8SDavid Sterba /*
774446df06b8SDavid Sterba * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
774546df06b8SDavid Sterba */
btrfs_bg_type_to_factor(u64 flags)774646df06b8SDavid Sterba int btrfs_bg_type_to_factor(u64 flags)
774746df06b8SDavid Sterba {
774844b28adaSDavid Sterba const int index = btrfs_bg_flags_to_raid_index(flags);
774944b28adaSDavid Sterba
775044b28adaSDavid Sterba return btrfs_raid_array[index].ncopies;
775146df06b8SDavid Sterba }
7752cf90d884SQu Wenruo
7753cf90d884SQu Wenruo
7754cf90d884SQu Wenruo
/*
 * Validate a single dev extent against its owning chunk mapping:
 *  - the chunk referenced by the dev extent must exist,
 *  - the extent length must match the chunk's per-device stripe length,
 *  - one stripe of the chunk must sit exactly at this devid/physical offset,
 *  - the extent must fit within the device and, on zoned devices, be
 *    zone-size aligned.
 * Bumps map->verified_stripes on a match so the per-chunk stripe count can
 * be cross-checked later.  Returns 0 or -EUCLEAN on inconsistency.
 */
verify_one_dev_extent(struct btrfs_fs_info * fs_info,u64 chunk_offset,u64 devid,u64 physical_offset,u64 physical_len)7755cf90d884SQu Wenruo static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7756cf90d884SQu Wenruo u64 chunk_offset, u64 devid,
7757cf90d884SQu Wenruo u64 physical_offset, u64 physical_len)
7758cf90d884SQu Wenruo {
7759562d7b15SJosef Bacik struct btrfs_dev_lookup_args args = { .devid = devid };
7760c8bf1b67SDavid Sterba struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7761cf90d884SQu Wenruo struct extent_map *em;
7762cf90d884SQu Wenruo struct map_lookup *map;
776305a37c48SQu Wenruo struct btrfs_device *dev;
7764cf90d884SQu Wenruo u64 stripe_len;
7765cf90d884SQu Wenruo bool found = false;
7766cf90d884SQu Wenruo int ret = 0;
7767cf90d884SQu Wenruo int i;
7768cf90d884SQu Wenruo
7769cf90d884SQu Wenruo read_lock(&em_tree->lock);
7770cf90d884SQu Wenruo em = lookup_extent_mapping(em_tree, chunk_offset, 1);
7771cf90d884SQu Wenruo read_unlock(&em_tree->lock);
7772cf90d884SQu Wenruo
7773cf90d884SQu Wenruo if (!em) {
7774cf90d884SQu Wenruo btrfs_err(fs_info,
7775cf90d884SQu Wenruo "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
7776cf90d884SQu Wenruo physical_offset, devid);
7777cf90d884SQu Wenruo ret = -EUCLEAN;
7778cf90d884SQu Wenruo goto out;
7779cf90d884SQu Wenruo }
7780cf90d884SQu Wenruo
7781cf90d884SQu Wenruo map = em->map_lookup;
7782bc88b486SQu Wenruo stripe_len = btrfs_calc_stripe_length(em);
7783cf90d884SQu Wenruo if (physical_len != stripe_len) {
7784cf90d884SQu Wenruo btrfs_err(fs_info,
7785cf90d884SQu Wenruo "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
7786cf90d884SQu Wenruo physical_offset, devid, em->start, physical_len,
7787cf90d884SQu Wenruo stripe_len);
7788cf90d884SQu Wenruo ret = -EUCLEAN;
7789cf90d884SQu Wenruo goto out;
7790cf90d884SQu Wenruo }
7791cf90d884SQu Wenruo
77923613249aSQu Wenruo /*
77933613249aSQu Wenruo * Very old mkfs.btrfs (before v4.1) will not respect the reserved
77943613249aSQu Wenruo * space. Although kernel can handle it without problem, better to warn
77953613249aSQu Wenruo * the users.
77963613249aSQu Wenruo */
77973613249aSQu Wenruo if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED)
77983613249aSQu Wenruo btrfs_warn(fs_info,
77993613249aSQu Wenruo "devid %llu physical %llu len %llu inside the reserved space",
78003613249aSQu Wenruo devid, physical_offset, physical_len);
78013613249aSQu Wenruo
7802cf90d884SQu Wenruo for (i = 0; i < map->num_stripes; i++) {
7803cf90d884SQu Wenruo if (map->stripes[i].dev->devid == devid &&
7804cf90d884SQu Wenruo map->stripes[i].physical == physical_offset) {
7805cf90d884SQu Wenruo found = true;
/* More dev extents pointing at this chunk than it has stripes. */
7806cf90d884SQu Wenruo if (map->verified_stripes >= map->num_stripes) {
7807cf90d884SQu Wenruo btrfs_err(fs_info,
7808cf90d884SQu Wenruo "too many dev extents for chunk %llu found",
7809cf90d884SQu Wenruo em->start);
7810cf90d884SQu Wenruo ret = -EUCLEAN;
7811cf90d884SQu Wenruo goto out;
7812cf90d884SQu Wenruo }
7813cf90d884SQu Wenruo map->verified_stripes++;
7814cf90d884SQu Wenruo break;
7815cf90d884SQu Wenruo }
7816cf90d884SQu Wenruo }
7817cf90d884SQu Wenruo if (!found) {
7818cf90d884SQu Wenruo btrfs_err(fs_info,
7819cf90d884SQu Wenruo "dev extent physical offset %llu devid %llu has no corresponding chunk",
7820cf90d884SQu Wenruo physical_offset, devid);
7821cf90d884SQu Wenruo ret = -EUCLEAN;
7822cf90d884SQu Wenruo }
782305a37c48SQu Wenruo
78241a9fd417SDavid Sterba /* Make sure no dev extent is beyond device boundary */
7825562d7b15SJosef Bacik dev = btrfs_find_device(fs_info->fs_devices, &args);
782605a37c48SQu Wenruo if (!dev) {
782705a37c48SQu Wenruo btrfs_err(fs_info, "failed to find devid %llu", devid);
782805a37c48SQu Wenruo ret = -EUCLEAN;
782905a37c48SQu Wenruo goto out;
783005a37c48SQu Wenruo }
78311b3922a8SQu Wenruo
783205a37c48SQu Wenruo if (physical_offset + physical_len > dev->disk_total_bytes) {
783305a37c48SQu Wenruo btrfs_err(fs_info,
783405a37c48SQu Wenruo "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
783505a37c48SQu Wenruo devid, physical_offset, physical_len,
783605a37c48SQu Wenruo dev->disk_total_bytes);
783705a37c48SQu Wenruo ret = -EUCLEAN;
783805a37c48SQu Wenruo goto out;
783905a37c48SQu Wenruo }
7840381a696eSNaohiro Aota
7841381a696eSNaohiro Aota if (dev->zone_info) {
7842381a696eSNaohiro Aota u64 zone_size = dev->zone_info->zone_size;
7843381a696eSNaohiro Aota
7844381a696eSNaohiro Aota if (!IS_ALIGNED(physical_offset, zone_size) ||
7845381a696eSNaohiro Aota !IS_ALIGNED(physical_len, zone_size)) {
7846381a696eSNaohiro Aota btrfs_err(fs_info,
7847381a696eSNaohiro Aota "zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
7848381a696eSNaohiro Aota devid, physical_offset, physical_len);
7849381a696eSNaohiro Aota ret = -EUCLEAN;
7850381a696eSNaohiro Aota goto out;
7851381a696eSNaohiro Aota }
7852381a696eSNaohiro Aota }
7853381a696eSNaohiro Aota
7854cf90d884SQu Wenruo out:
7855cf90d884SQu Wenruo free_extent_map(em);
7856cf90d884SQu Wenruo return ret;
7857cf90d884SQu Wenruo }
7858cf90d884SQu Wenruo
verify_chunk_dev_extent_mapping(struct btrfs_fs_info * fs_info)7859cf90d884SQu Wenruo static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
7860cf90d884SQu Wenruo {
7861c8bf1b67SDavid Sterba struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7862cf90d884SQu Wenruo struct extent_map *em;
7863cf90d884SQu Wenruo struct rb_node *node;
7864cf90d884SQu Wenruo int ret = 0;
7865cf90d884SQu Wenruo
7866cf90d884SQu Wenruo read_lock(&em_tree->lock);
786707e1ce09SLiu Bo for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
7868cf90d884SQu Wenruo em = rb_entry(node, struct extent_map, rb_node);
7869cf90d884SQu Wenruo if (em->map_lookup->num_stripes !=
7870cf90d884SQu Wenruo em->map_lookup->verified_stripes) {
7871cf90d884SQu Wenruo btrfs_err(fs_info,
7872cf90d884SQu Wenruo "chunk %llu has missing dev extent, have %d expect %d",
7873cf90d884SQu Wenruo em->start, em->map_lookup->verified_stripes,
7874cf90d884SQu Wenruo em->map_lookup->num_stripes);
7875cf90d884SQu Wenruo ret = -EUCLEAN;
7876cf90d884SQu Wenruo goto out;
7877cf90d884SQu Wenruo }
7878cf90d884SQu Wenruo }
7879cf90d884SQu Wenruo out:
7880cf90d884SQu Wenruo read_unlock(&em_tree->lock);
7881cf90d884SQu Wenruo return ret;
7882cf90d884SQu Wenruo }
7883cf90d884SQu Wenruo
/*
 * Ensure that all dev extents are mapped to correct chunk, otherwise
 * later chunk allocation/free would cause unexpected behavior.
 *
 * NOTE: This will iterate through the whole device tree, which should be of
 * the same size level as the chunk tree.  This slightly increases mount time.
 *
 * Returns 0 if every dev extent is consistent, -EUCLEAN on corruption, or
 * another negative errno on btree/allocation failure.
 */
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	u64 prev_devid = 0;
	u64 prev_dev_ext_end = 0;
	int ret = 0;

	/*
	 * We don't have a dev_root because we mounted with ignorebadroots and
	 * failed to load the root, so we want to skip the verification in this
	 * case for sure.
	 *
	 * However if the dev root is fine, but the tree itself is corrupted
	 * we'd still fail to mount.  This verification is only to make sure
	 * writes can happen safely, so instead just bypass this check
	 * completely in the case of IGNOREBADROOTS.
	 */
	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
		return 0;

	/* Start the walk at the smallest possible dev extent (devid 1, offset 0). */
	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* The whole device tree is scanned sequentially, so readahead helps. */
	path->reada = READA_FORWARD;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	/*
	 * The search may leave the cursor one past the last item of a leaf;
	 * move to the next leaf so the loop below sees a valid slot.
	 */
	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	while (1) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_dev_extent *dext;
		int slot = path->slots[0];
		u64 chunk_offset;
		u64 physical_offset;
		u64 physical_len;
		u64 devid;

		btrfs_item_key_to_cpu(leaf, &key, slot);
		/* Stop at the first item that is not a dev extent. */
		if (key.type != BTRFS_DEV_EXTENT_KEY)
			break;
		devid = key.objectid;
		physical_offset = key.offset;

		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
		physical_len = btrfs_dev_extent_length(leaf, dext);

		/* Check if this dev extent overlaps with the previous one */
		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
			btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
				  devid, physical_offset, prev_dev_ext_end);
			ret = -EUCLEAN;
			goto out;
		}

		/* Cross-check this dev extent against the chunk it points back to. */
		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
					    physical_offset, physical_len);
		if (ret < 0)
			goto out;
		prev_devid = devid;
		prev_dev_ext_end = physical_offset + physical_len;

		ret = btrfs_next_item(root, path);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			/* Walked past the last item of the tree; all checked. */
			ret = 0;
			break;
		}
	}

	/* Ensure all chunks have corresponding dev extents */
	ret = verify_chunk_dev_extent_mapping(fs_info);
out:
	btrfs_free_path(path);
	return ret;
}
7986eede2bf3SOmar Sandoval
7987eede2bf3SOmar Sandoval /*
7988eede2bf3SOmar Sandoval * Check whether the given block group or device is pinned by any inode being
7989eede2bf3SOmar Sandoval * used as a swapfile.
7990eede2bf3SOmar Sandoval */
btrfs_pinned_by_swapfile(struct btrfs_fs_info * fs_info,void * ptr)7991eede2bf3SOmar Sandoval bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
7992eede2bf3SOmar Sandoval {
7993eede2bf3SOmar Sandoval struct btrfs_swapfile_pin *sp;
7994eede2bf3SOmar Sandoval struct rb_node *node;
7995eede2bf3SOmar Sandoval
7996eede2bf3SOmar Sandoval spin_lock(&fs_info->swapfile_pins_lock);
7997eede2bf3SOmar Sandoval node = fs_info->swapfile_pins.rb_node;
7998eede2bf3SOmar Sandoval while (node) {
7999eede2bf3SOmar Sandoval sp = rb_entry(node, struct btrfs_swapfile_pin, node);
8000eede2bf3SOmar Sandoval if (ptr < sp->ptr)
8001eede2bf3SOmar Sandoval node = node->rb_left;
8002eede2bf3SOmar Sandoval else if (ptr > sp->ptr)
8003eede2bf3SOmar Sandoval node = node->rb_right;
8004eede2bf3SOmar Sandoval else
8005eede2bf3SOmar Sandoval break;
8006eede2bf3SOmar Sandoval }
8007eede2bf3SOmar Sandoval spin_unlock(&fs_info->swapfile_pins_lock);
8008eede2bf3SOmar Sandoval return node != NULL;
8009eede2bf3SOmar Sandoval }
8010f7ef5287SNaohiro Aota
relocating_repair_kthread(void * data)8011f7ef5287SNaohiro Aota static int relocating_repair_kthread(void *data)
8012f7ef5287SNaohiro Aota {
80130d031dc4SYu Zhe struct btrfs_block_group *cache = data;
8014f7ef5287SNaohiro Aota struct btrfs_fs_info *fs_info = cache->fs_info;
8015f7ef5287SNaohiro Aota u64 target;
8016f7ef5287SNaohiro Aota int ret = 0;
8017f7ef5287SNaohiro Aota
8018f7ef5287SNaohiro Aota target = cache->start;
8019f7ef5287SNaohiro Aota btrfs_put_block_group(cache);
8020f7ef5287SNaohiro Aota
8021ca5e4ea0SNaohiro Aota sb_start_write(fs_info->sb);
8022f7ef5287SNaohiro Aota if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
8023f7ef5287SNaohiro Aota btrfs_info(fs_info,
8024f7ef5287SNaohiro Aota "zoned: skip relocating block group %llu to repair: EBUSY",
8025f7ef5287SNaohiro Aota target);
8026ca5e4ea0SNaohiro Aota sb_end_write(fs_info->sb);
8027f7ef5287SNaohiro Aota return -EBUSY;
8028f7ef5287SNaohiro Aota }
8029f7ef5287SNaohiro Aota
8030f3372065SJohannes Thumshirn mutex_lock(&fs_info->reclaim_bgs_lock);
8031f7ef5287SNaohiro Aota
8032f7ef5287SNaohiro Aota /* Ensure block group still exists */
8033f7ef5287SNaohiro Aota cache = btrfs_lookup_block_group(fs_info, target);
8034f7ef5287SNaohiro Aota if (!cache)
8035f7ef5287SNaohiro Aota goto out;
8036f7ef5287SNaohiro Aota
80373349b57fSJosef Bacik if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags))
8038f7ef5287SNaohiro Aota goto out;
8039f7ef5287SNaohiro Aota
8040f7ef5287SNaohiro Aota ret = btrfs_may_alloc_data_chunk(fs_info, target);
8041f7ef5287SNaohiro Aota if (ret < 0)
8042f7ef5287SNaohiro Aota goto out;
8043f7ef5287SNaohiro Aota
8044f7ef5287SNaohiro Aota btrfs_info(fs_info,
8045f7ef5287SNaohiro Aota "zoned: relocating block group %llu to repair IO failure",
8046f7ef5287SNaohiro Aota target);
8047f7ef5287SNaohiro Aota ret = btrfs_relocate_chunk(fs_info, target);
8048f7ef5287SNaohiro Aota
8049f7ef5287SNaohiro Aota out:
8050f7ef5287SNaohiro Aota if (cache)
8051f7ef5287SNaohiro Aota btrfs_put_block_group(cache);
8052f3372065SJohannes Thumshirn mutex_unlock(&fs_info->reclaim_bgs_lock);
8053f7ef5287SNaohiro Aota btrfs_exclop_finish(fs_info);
8054ca5e4ea0SNaohiro Aota sb_end_write(fs_info->sb);
8055f7ef5287SNaohiro Aota
8056f7ef5287SNaohiro Aota return ret;
8057f7ef5287SNaohiro Aota }
8058f7ef5287SNaohiro Aota
btrfs_repair_one_zone(struct btrfs_fs_info * fs_info,u64 logical)8059554aed7dSJohannes Thumshirn bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
8060f7ef5287SNaohiro Aota {
8061f7ef5287SNaohiro Aota struct btrfs_block_group *cache;
8062f7ef5287SNaohiro Aota
8063554aed7dSJohannes Thumshirn if (!btrfs_is_zoned(fs_info))
8064554aed7dSJohannes Thumshirn return false;
8065554aed7dSJohannes Thumshirn
8066f7ef5287SNaohiro Aota /* Do not attempt to repair in degraded state */
8067f7ef5287SNaohiro Aota if (btrfs_test_opt(fs_info, DEGRADED))
8068554aed7dSJohannes Thumshirn return true;
8069f7ef5287SNaohiro Aota
8070f7ef5287SNaohiro Aota cache = btrfs_lookup_block_group(fs_info, logical);
8071f7ef5287SNaohiro Aota if (!cache)
8072554aed7dSJohannes Thumshirn return true;
8073f7ef5287SNaohiro Aota
80743349b57fSJosef Bacik if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) {
8075f7ef5287SNaohiro Aota btrfs_put_block_group(cache);
8076554aed7dSJohannes Thumshirn return true;
8077f7ef5287SNaohiro Aota }
8078f7ef5287SNaohiro Aota
8079f7ef5287SNaohiro Aota kthread_run(relocating_repair_kthread, cache,
8080f7ef5287SNaohiro Aota "btrfs-relocating-repair");
8081f7ef5287SNaohiro Aota
8082554aed7dSJohannes Thumshirn return true;
8083f7ef5287SNaohiro Aota }
80844886ff7bSQu Wenruo
map_raid56_repair_block(struct btrfs_io_context * bioc,struct btrfs_io_stripe * smap,u64 logical)80854886ff7bSQu Wenruo static void map_raid56_repair_block(struct btrfs_io_context *bioc,
80864886ff7bSQu Wenruo struct btrfs_io_stripe *smap,
80874886ff7bSQu Wenruo u64 logical)
80884886ff7bSQu Wenruo {
80894886ff7bSQu Wenruo int data_stripes = nr_bioc_data_stripes(bioc);
80904886ff7bSQu Wenruo int i;
80914886ff7bSQu Wenruo
80924886ff7bSQu Wenruo for (i = 0; i < data_stripes; i++) {
80934886ff7bSQu Wenruo u64 stripe_start = bioc->full_stripe_logical +
8094cb091225SQu Wenruo btrfs_stripe_nr_to_offset(i);
80954886ff7bSQu Wenruo
80964886ff7bSQu Wenruo if (logical >= stripe_start &&
80974886ff7bSQu Wenruo logical < stripe_start + BTRFS_STRIPE_LEN)
80984886ff7bSQu Wenruo break;
80994886ff7bSQu Wenruo }
81004886ff7bSQu Wenruo ASSERT(i < data_stripes);
81014886ff7bSQu Wenruo smap->dev = bioc->stripes[i].dev;
81024886ff7bSQu Wenruo smap->physical = bioc->stripes[i].physical +
81034886ff7bSQu Wenruo ((logical - bioc->full_stripe_logical) &
81044886ff7bSQu Wenruo BTRFS_STRIPE_LEN_MASK);
81054886ff7bSQu Wenruo }
81064886ff7bSQu Wenruo
/*
 * Map a repair write into a single device.
 *
 * A repair write is triggered by read time repair or scrub, which would only
 * update the contents of a single device.
 * Not update any other mirrors nor go through RMW path.
 *
 * Callers should ensure:
 *
 * - Call btrfs_bio_counter_inc_blocked() first
 * - The range does not cross stripe boundary
 * - Has a valid @mirror_num passed in.
 *
 * Returns 0 with @smap filled in, or a negative errno from btrfs_map_block().
 */
int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
			   struct btrfs_io_stripe *smap, u64 logical,
			   u32 length, int mirror_num)
{
	struct btrfs_io_context *bioc = NULL;
	u64 map_length = length;
	int mirror_ret = mirror_num;
	int ret;

	ASSERT(mirror_num > 0);

	ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
			      &bioc, smap, &mirror_ret, true);
	if (ret < 0)
		return ret;

	/* The map range should not cross stripe boundary. */
	ASSERT(map_length >= length);

	/* Already mapped to single stripe (@smap filled by btrfs_map_block). */
	if (!bioc)
		goto out;

	/* Map the RAID56 multi-stripe writes to a single one. */
	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		map_raid56_repair_block(bioc, smap, logical);
		goto out;
	}

	/* Otherwise pick the stripe of the requested mirror (1-based). */
	ASSERT(mirror_num <= bioc->num_stripes);
	smap->dev = bioc->stripes[mirror_num - 1].dev;
	smap->physical = bioc->stripes[mirror_num - 1].physical;
out:
	btrfs_put_bioc(bioc);
	ASSERT(smap->dev);
	return 0;
}
8157