xref: /openbmc/linux/fs/btrfs/zoned.c (revision 7b7fd0ac7dc1ffcaf24d9bca0f051b0168e43cd4)
1  // SPDX-License-Identifier: GPL-2.0
2  
3  #include <linux/bitops.h>
4  #include <linux/slab.h>
5  #include <linux/blkdev.h>
6  #include <linux/sched/mm.h>
7  #include <linux/atomic.h>
8  #include <linux/vmalloc.h>
9  #include "ctree.h"
10  #include "volumes.h"
11  #include "zoned.h"
12  #include "rcu-string.h"
13  #include "disk-io.h"
14  #include "block-group.h"
15  #include "transaction.h"
16  #include "dev-replace.h"
17  #include "space-info.h"
18  #include "super.h"
19  #include "fs.h"
20  #include "accessors.h"
21  #include "bio.h"
22  
23  /* Maximum number of zones to report per blkdev_report_zones() call */
24  #define BTRFS_REPORT_NR_ZONES   4096
25  /* Invalid allocation pointer value for missing devices */
26  #define WP_MISSING_DEV ((u64)-1)
27  /* Pseudo write pointer value for conventional zone */
28  #define WP_CONVENTIONAL ((u64)-2)
29  
30  /*
31   * Location of the first zone of superblock logging zone pairs.
32   *
33   * - primary superblock:    0B (zone 0)
34   * - first copy:          512G (zone starting at that offset)
35   * - second copy:           4T (zone starting at that offset)
36   */
37  #define BTRFS_SB_LOG_PRIMARY_OFFSET	(0ULL)
38  #define BTRFS_SB_LOG_FIRST_OFFSET	(512ULL * SZ_1G)
39  #define BTRFS_SB_LOG_SECOND_OFFSET	(4096ULL * SZ_1G)
40  
41  #define BTRFS_SB_LOG_FIRST_SHIFT	const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
42  #define BTRFS_SB_LOG_SECOND_SHIFT	const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
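/*
 * Illustrative worked example (editorial addition): with a 256MiB zone size,
 * zone_size_shift is 28, so sb_zone_number() below maps mirror 1 to zone
 * 1ULL << (39 - 28) = 2048 (512GiB / 256MiB) and mirror 2 to zone
 * 1ULL << (42 - 28) = 16384 (4TiB / 256MiB).
 */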
43  
44  /* Number of superblock log zones */
45  #define BTRFS_NR_SB_LOG_ZONES 2
46  
47  /*
48   * Minimum number of active zones we need:
49   *
50   * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
51   * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
52   * - 1 zone for tree-log dedicated block group
53   * - 1 zone for relocation
54   */
55  #define BTRFS_MIN_ACTIVE_ZONES		(BTRFS_SUPER_MIRROR_MAX + 5)
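/* With BTRFS_SUPER_MIRROR_MAX being 3, this works out to 8 active zones. */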
56  
57  /*
58   * Minimum / maximum supported zone size. Currently, SMR disks have a zone
59   * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
60   * We do not expect the zone size to become larger than 8GiB or smaller than
61   * 4MiB in the near future.
62   */
63  #define BTRFS_MAX_ZONE_SIZE		SZ_8G
64  #define BTRFS_MIN_ZONE_SIZE		SZ_4M
65  
66  #define SUPER_INFO_SECTORS	((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
67  
68  static void wait_eb_writebacks(struct btrfs_block_group *block_group);
69  static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written);
70  
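/*
 * Editorial note: BTRFS_SUPER_INFO_SIZE is 4KiB, so SUPER_INFO_SECTORS is 8;
 * a zone counts as full for superblock logging once fewer than 8 writable
 * sectors remain below its capacity.
 */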
71  static inline bool sb_zone_is_full(const struct blk_zone *zone)
72  {
73  	return (zone->cond == BLK_ZONE_COND_FULL) ||
74  		(zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
75  }
76  
77  static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
78  {
79  	struct blk_zone *zones = data;
80  
81  	memcpy(&zones[idx], zone, sizeof(*zone));
82  
83  	return 0;
84  }
85  
86  static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
87  			    u64 *wp_ret)
88  {
89  	bool empty[BTRFS_NR_SB_LOG_ZONES];
90  	bool full[BTRFS_NR_SB_LOG_ZONES];
91  	sector_t sector;
92  	int i;
93  
94  	for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
95  		ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
96  		empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
97  		full[i] = sb_zone_is_full(&zones[i]);
98  	}
99  
100  	/*
101  	 * Possible states of log buffer zones
102  	 *
103  	 *           Empty[0]  In use[0]  Full[0]
104  	 * Empty[1]         *          0        1
105  	 * In use[1]        x          x        1
106  	 * Full[1]          0          0        C
107  	 *
108  	 * Log position:
109  	 *   *: Special case, no superblock is written
110  	 *   0: Use write pointer of zones[0]
111  	 *   1: Use write pointer of zones[1]
112  	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
113  	 *      one determined by generation
114  	 *   x: Invalid state
115  	 */
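	/*
	 * Example reading of the table (editorial): full[0] set while zones[1]
	 * is neither empty nor full selects row "In use[1]", column "Full[0]",
	 * i.e. "1": the write pointer of zones[1] is returned below.
	 */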
116  
117  	if (empty[0] && empty[1]) {
118  		/* Special case to distinguish no superblock to read */
119  		*wp_ret = zones[0].start << SECTOR_SHIFT;
120  		return -ENOENT;
121  	} else if (full[0] && full[1]) {
122  		/* Compare two super blocks */
123  		struct address_space *mapping = bdev->bd_inode->i_mapping;
124  		struct page *page[BTRFS_NR_SB_LOG_ZONES];
125  		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
126  		int i;
127  
128  		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
129  			u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
130  			u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
131  						BTRFS_SUPER_INFO_SIZE;
132  
133  			page[i] = read_cache_page_gfp(mapping,
134  					bytenr >> PAGE_SHIFT, GFP_NOFS);
135  			if (IS_ERR(page[i])) {
136  				if (i == 1)
137  					btrfs_release_disk_super(super[0]);
138  				return PTR_ERR(page[i]);
139  			}
140  			super[i] = page_address(page[i]);
141  		}
142  
143  		if (btrfs_super_generation(super[0]) >
144  		    btrfs_super_generation(super[1]))
145  			sector = zones[1].start;
146  		else
147  			sector = zones[0].start;
148  
149  		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
150  			btrfs_release_disk_super(super[i]);
151  	} else if (!full[0] && (empty[1] || full[1])) {
152  		sector = zones[0].wp;
153  	} else if (full[0]) {
154  		sector = zones[1].wp;
155  	} else {
156  		return -EUCLEAN;
157  	}
158  	*wp_ret = sector << SECTOR_SHIFT;
159  	return 0;
160  }
161  
162  /*
163   * Get the first zone number of the superblock mirror
164   */
165  static inline u32 sb_zone_number(int shift, int mirror)
166  {
167  	u64 zone = U64_MAX;
168  
169  	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
170  	switch (mirror) {
171  	case 0: zone = 0; break;
172  	case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
173  	case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
174  	}
175  
176  	ASSERT(zone <= U32_MAX);
177  
178  	return (u32)zone;
179  }
180  
181  static inline sector_t zone_start_sector(u32 zone_number,
182  					 struct block_device *bdev)
183  {
184  	return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
185  }
186  
187  static inline u64 zone_start_physical(u32 zone_number,
188  				      struct btrfs_zoned_device_info *zone_info)
189  {
190  	return (u64)zone_number << zone_info->zone_size_shift;
191  }
192  
193  /*
194   * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
195   * device into fixed-size chunks and fakes a conventional zone on each of
196   * them.
197   */
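/*
 * For illustration (assumed geometry, not from the source): a 1GiB regular
 * device with a 256MiB emulated zone size is reported as four conventional
 * zones, each with cond == BLK_ZONE_COND_NOT_WP and the write pointer placed
 * at the zone end.
 */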
198  static int emulate_report_zones(struct btrfs_device *device, u64 pos,
199  				struct blk_zone *zones, unsigned int nr_zones)
200  {
201  	const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
202  	sector_t bdev_size = bdev_nr_sectors(device->bdev);
203  	unsigned int i;
204  
205  	pos >>= SECTOR_SHIFT;
206  	for (i = 0; i < nr_zones; i++) {
207  		zones[i].start = i * zone_sectors + pos;
208  		zones[i].len = zone_sectors;
209  		zones[i].capacity = zone_sectors;
210  		zones[i].wp = zones[i].start + zone_sectors;
211  		zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
212  		zones[i].cond = BLK_ZONE_COND_NOT_WP;
213  
214  		if (zones[i].wp >= bdev_size) {
215  			i++;
216  			break;
217  		}
218  	}
219  
220  	return i;
221  }
222  
223  static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
224  			       struct blk_zone *zones, unsigned int *nr_zones)
225  {
226  	struct btrfs_zoned_device_info *zinfo = device->zone_info;
227  	int ret;
228  
229  	if (!*nr_zones)
230  		return 0;
231  
232  	if (!bdev_is_zoned(device->bdev)) {
233  		ret = emulate_report_zones(device, pos, zones, *nr_zones);
234  		*nr_zones = ret;
235  		return 0;
236  	}
237  
238  	/* Check cache */
239  	if (zinfo->zone_cache) {
240  		unsigned int i;
241  		u32 zno;
242  
243  		ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
244  		zno = pos >> zinfo->zone_size_shift;
245  		/*
246  		 * We cannot report zones beyond the last zone of the device,
247  		 * so it is OK to cap *nr_zones at the end.
248  		 */
249  		*nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
250  
251  		for (i = 0; i < *nr_zones; i++) {
252  			struct blk_zone *zone_info;
253  
254  			zone_info = &zinfo->zone_cache[zno + i];
255  			if (!zone_info->len)
256  				break;
257  		}
258  
259  		if (i == *nr_zones) {
260  			/* Cache hit on all the zones */
261  			memcpy(zones, zinfo->zone_cache + zno,
262  			       sizeof(*zinfo->zone_cache) * *nr_zones);
263  			return 0;
264  		}
265  	}
266  
267  	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
268  				  copy_zone_info_cb, zones);
269  	if (ret < 0) {
270  		btrfs_err_in_rcu(device->fs_info,
271  				 "zoned: failed to read zone %llu on %s (devid %llu)",
272  				 pos, rcu_str_deref(device->name),
273  				 device->devid);
274  		return ret;
275  	}
276  	*nr_zones = ret;
277  	if (!ret)
278  		return -EIO;
279  
280  	/* Populate cache */
281  	if (zinfo->zone_cache) {
282  		u32 zno = pos >> zinfo->zone_size_shift;
283  
284  		memcpy(zinfo->zone_cache + zno, zones,
285  		       sizeof(*zinfo->zone_cache) * *nr_zones);
286  	}
287  
288  	return 0;
289  }
290  
291  /* The emulated zone size is determined from the size of the first device extent */
292  static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
293  {
294  	struct btrfs_path *path;
295  	struct btrfs_root *root = fs_info->dev_root;
296  	struct btrfs_key key;
297  	struct extent_buffer *leaf;
298  	struct btrfs_dev_extent *dext;
299  	int ret = 0;
300  
301  	key.objectid = 1;
302  	key.type = BTRFS_DEV_EXTENT_KEY;
303  	key.offset = 0;
304  
305  	path = btrfs_alloc_path();
306  	if (!path)
307  		return -ENOMEM;
308  
309  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
310  	if (ret < 0)
311  		goto out;
312  
313  	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
314  		ret = btrfs_next_leaf(root, path);
315  		if (ret < 0)
316  			goto out;
317  		/* No dev extents at all? Not good */
318  		if (ret > 0) {
319  			ret = -EUCLEAN;
320  			goto out;
321  		}
322  	}
323  
324  	leaf = path->nodes[0];
325  	dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
326  	fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
327  	ret = 0;
328  
329  out:
330  	btrfs_free_path(path);
331  
332  	return ret;
333  }
334  
335  int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
336  {
337  	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
338  	struct btrfs_device *device;
339  	int ret = 0;
340  
341  	/* fs_info->zone_size might not be set yet. Use the incompat flag here. */
342  	if (!btrfs_fs_incompat(fs_info, ZONED))
343  		return 0;
344  
345  	mutex_lock(&fs_devices->device_list_mutex);
346  	list_for_each_entry(device, &fs_devices->devices, dev_list) {
347  		/* We can skip reading of zone info for missing devices */
348  		if (!device->bdev)
349  			continue;
350  
351  		ret = btrfs_get_dev_zone_info(device, true);
352  		if (ret)
353  			break;
354  	}
355  	mutex_unlock(&fs_devices->device_list_mutex);
356  
357  	return ret;
358  }
359  
360  int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
361  {
362  	struct btrfs_fs_info *fs_info = device->fs_info;
363  	struct btrfs_zoned_device_info *zone_info = NULL;
364  	struct block_device *bdev = device->bdev;
365  	unsigned int max_active_zones;
366  	unsigned int nactive;
367  	sector_t nr_sectors;
368  	sector_t sector = 0;
369  	struct blk_zone *zones = NULL;
370  	unsigned int i, nreported = 0, nr_zones;
371  	sector_t zone_sectors;
372  	char *model, *emulated;
373  	int ret;
374  
375  	/*
376  	 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
377  	 * yet be set.
378  	 */
379  	if (!btrfs_fs_incompat(fs_info, ZONED))
380  		return 0;
381  
382  	if (device->zone_info)
383  		return 0;
384  
385  	zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
386  	if (!zone_info)
387  		return -ENOMEM;
388  
389  	device->zone_info = zone_info;
390  
391  	if (!bdev_is_zoned(bdev)) {
392  		if (!fs_info->zone_size) {
393  			ret = calculate_emulated_zone_size(fs_info);
394  			if (ret)
395  				goto out;
396  		}
397  
398  		ASSERT(fs_info->zone_size);
399  		zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
400  	} else {
401  		zone_sectors = bdev_zone_sectors(bdev);
402  	}
403  
404  	ASSERT(is_power_of_two_u64(zone_sectors));
405  	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
406  
407  	/* We reject devices with a zone size larger than 8GiB */
408  	if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
409  		btrfs_err_in_rcu(fs_info,
410  		"zoned: %s: zone size %llu larger than supported maximum %llu",
411  				 rcu_str_deref(device->name),
412  				 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
413  		ret = -EINVAL;
414  		goto out;
415  	} else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
416  		btrfs_err_in_rcu(fs_info,
417  		"zoned: %s: zone size %llu smaller than supported minimum %u",
418  				 rcu_str_deref(device->name),
419  				 zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
420  		ret = -EINVAL;
421  		goto out;
422  	}
423  
424  	nr_sectors = bdev_nr_sectors(bdev);
425  	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
426  	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
427  	if (!IS_ALIGNED(nr_sectors, zone_sectors))
428  		zone_info->nr_zones++;
429  
430  	max_active_zones = bdev_max_active_zones(bdev);
431  	if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
432  		btrfs_err_in_rcu(fs_info,
433  "zoned: %s: max active zones %u is too small, need at least %u active zones",
434  				 rcu_str_deref(device->name), max_active_zones,
435  				 BTRFS_MIN_ACTIVE_ZONES);
436  		ret = -EINVAL;
437  		goto out;
438  	}
439  	zone_info->max_active_zones = max_active_zones;
440  
441  	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
442  	if (!zone_info->seq_zones) {
443  		ret = -ENOMEM;
444  		goto out;
445  	}
446  
447  	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
448  	if (!zone_info->empty_zones) {
449  		ret = -ENOMEM;
450  		goto out;
451  	}
452  
453  	zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
454  	if (!zone_info->active_zones) {
455  		ret = -ENOMEM;
456  		goto out;
457  	}
458  
459  	zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
460  	if (!zones) {
461  		ret = -ENOMEM;
462  		goto out;
463  	}
464  
465  	/*
466  	 * Enable zone cache only for a zoned device. On a non-zoned device, we
467  	 * fill the zone info with emulated CONVENTIONAL zones, so no need to
468  	 * use the cache.
469  	 */
470  	if (populate_cache && bdev_is_zoned(device->bdev)) {
471  		zone_info->zone_cache = vcalloc(zone_info->nr_zones,
472  						sizeof(struct blk_zone));
473  		if (!zone_info->zone_cache) {
474  			btrfs_err_in_rcu(device->fs_info,
475  				"zoned: failed to allocate zone cache for %s",
476  				rcu_str_deref(device->name));
477  			ret = -ENOMEM;
478  			goto out;
479  		}
480  	}
481  
482  	/* Get zone types */
483  	nactive = 0;
484  	while (sector < nr_sectors) {
485  		nr_zones = BTRFS_REPORT_NR_ZONES;
486  		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
487  					  &nr_zones);
488  		if (ret)
489  			goto out;
490  
491  		for (i = 0; i < nr_zones; i++) {
492  			if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
493  				__set_bit(nreported, zone_info->seq_zones);
494  			switch (zones[i].cond) {
495  			case BLK_ZONE_COND_EMPTY:
496  				__set_bit(nreported, zone_info->empty_zones);
497  				break;
498  			case BLK_ZONE_COND_IMP_OPEN:
499  			case BLK_ZONE_COND_EXP_OPEN:
500  			case BLK_ZONE_COND_CLOSED:
501  				__set_bit(nreported, zone_info->active_zones);
502  				nactive++;
503  				break;
504  			}
505  			nreported++;
506  		}
507  		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
508  	}
509  
510  	if (nreported != zone_info->nr_zones) {
511  		btrfs_err_in_rcu(device->fs_info,
512  				 "inconsistent number of zones on %s (%u/%u)",
513  				 rcu_str_deref(device->name), nreported,
514  				 zone_info->nr_zones);
515  		ret = -EIO;
516  		goto out;
517  	}
518  
519  	if (max_active_zones) {
520  		if (nactive > max_active_zones) {
521  			btrfs_err_in_rcu(device->fs_info,
522  			"zoned: %u active zones on %s exceeds max_active_zones %u",
523  					 nactive, rcu_str_deref(device->name),
524  					 max_active_zones);
525  			ret = -EIO;
526  			goto out;
527  		}
528  		atomic_set(&zone_info->active_zones_left,
529  			   max_active_zones - nactive);
530  		set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
531  	}
532  
533  	/* Validate superblock log */
534  	nr_zones = BTRFS_NR_SB_LOG_ZONES;
535  	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
536  		u32 sb_zone;
537  		u64 sb_wp;
538  		int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
539  
540  		sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
541  		if (sb_zone + 1 >= zone_info->nr_zones)
542  			continue;
543  
544  		ret = btrfs_get_dev_zones(device,
545  					  zone_start_physical(sb_zone, zone_info),
546  					  &zone_info->sb_zones[sb_pos],
547  					  &nr_zones);
548  		if (ret)
549  			goto out;
550  
551  		if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
552  			btrfs_err_in_rcu(device->fs_info,
553  	"zoned: failed to read super block log zone info at devid %llu zone %u",
554  					 device->devid, sb_zone);
555  			ret = -EUCLEAN;
556  			goto out;
557  		}
558  
559  		/*
560  		 * If zones[0] is conventional, always use the beginning of the
561  		 * zone to record the superblock. No need to validate in that case.
562  		 */
563  		if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
564  		    BLK_ZONE_TYPE_CONVENTIONAL)
565  			continue;
566  
567  		ret = sb_write_pointer(device->bdev,
568  				       &zone_info->sb_zones[sb_pos], &sb_wp);
569  		if (ret != -ENOENT && ret) {
570  			btrfs_err_in_rcu(device->fs_info,
571  			"zoned: super block log zone corrupted devid %llu zone %u",
572  					 device->devid, sb_zone);
573  			ret = -EUCLEAN;
574  			goto out;
575  		}
576  	}
577  
578  
579  	kvfree(zones);
580  
581  	switch (bdev_zoned_model(bdev)) {
582  	case BLK_ZONED_HM:
583  		model = "host-managed zoned";
584  		emulated = "";
585  		break;
586  	case BLK_ZONED_HA:
587  		model = "host-aware zoned";
588  		emulated = "";
589  		break;
590  	case BLK_ZONED_NONE:
591  		model = "regular";
592  		emulated = "emulated ";
593  		break;
594  	default:
595  		/* Just in case */
596  		btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
597  				 bdev_zoned_model(bdev),
598  				 rcu_str_deref(device->name));
599  		ret = -EOPNOTSUPP;
600  		goto out_free_zone_info;
601  	}
602  
603  	btrfs_info_in_rcu(fs_info,
604  		"%s block device %s, %u %szones of %llu bytes",
605  		model, rcu_str_deref(device->name), zone_info->nr_zones,
606  		emulated, zone_info->zone_size);
607  
608  	return 0;
609  
610  out:
611  	kvfree(zones);
612  out_free_zone_info:
613  	btrfs_destroy_dev_zone_info(device);
614  
615  	return ret;
616  }
617  
618  void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
619  {
620  	struct btrfs_zoned_device_info *zone_info = device->zone_info;
621  
622  	if (!zone_info)
623  		return;
624  
625  	bitmap_free(zone_info->active_zones);
626  	bitmap_free(zone_info->seq_zones);
627  	bitmap_free(zone_info->empty_zones);
628  	vfree(zone_info->zone_cache);
629  	kfree(zone_info);
630  	device->zone_info = NULL;
631  }
632  
633  struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev)
634  {
635  	struct btrfs_zoned_device_info *zone_info;
636  
637  	zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL);
638  	if (!zone_info)
639  		return NULL;
640  
641  	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
642  	if (!zone_info->seq_zones)
643  		goto out;
644  
645  	bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones,
646  		    zone_info->nr_zones);
647  
648  	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
649  	if (!zone_info->empty_zones)
650  		goto out;
651  
652  	bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones,
653  		    zone_info->nr_zones);
654  
655  	zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
656  	if (!zone_info->active_zones)
657  		goto out;
658  
659  	bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones,
660  		    zone_info->nr_zones);
661  	zone_info->zone_cache = NULL;
662  
663  	return zone_info;
664  
665  out:
666  	bitmap_free(zone_info->seq_zones);
667  	bitmap_free(zone_info->empty_zones);
668  	bitmap_free(zone_info->active_zones);
669  	kfree(zone_info);
670  	return NULL;
671  }
672  
673  int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
674  		       struct blk_zone *zone)
675  {
676  	unsigned int nr_zones = 1;
677  	int ret;
678  
679  	ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
680  	if (ret != 0 || !nr_zones)
681  		return ret ? ret : -EIO;
682  
683  	return 0;
684  }
685  
686  static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
687  {
688  	struct btrfs_device *device;
689  
690  	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
691  		if (device->bdev &&
692  		    bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
693  			btrfs_err(fs_info,
694  				"zoned: mode not enabled but zoned device found: %pg",
695  				device->bdev);
696  			return -EINVAL;
697  		}
698  	}
699  
700  	return 0;
701  }
702  
703  int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
704  {
705  	struct queue_limits *lim = &fs_info->limits;
706  	struct btrfs_device *device;
707  	u64 zone_size = 0;
708  	int ret;
709  
710  	/*
711  	 * Host-Managed devices can't be used without the ZONED flag. With the
712  	 * ZONED flag, all devices can be used, using zone emulation if required.
713  	 */
714  	if (!btrfs_fs_incompat(fs_info, ZONED))
715  		return btrfs_check_for_zoned_device(fs_info);
716  
717  	blk_set_stacking_limits(lim);
718  
719  	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
720  		struct btrfs_zoned_device_info *zone_info = device->zone_info;
721  
722  		if (!device->bdev)
723  			continue;
724  
725  		if (!zone_size) {
726  			zone_size = zone_info->zone_size;
727  		} else if (zone_info->zone_size != zone_size) {
728  			btrfs_err(fs_info,
729  		"zoned: unequal block device zone sizes: have %llu found %llu",
730  				  zone_info->zone_size, zone_size);
731  			return -EINVAL;
732  		}
733  
734  		/*
735  		 * With zone emulation, we can have a non-zoned device in
736  		 * zoned mode. In this case, we don't have a valid max zone
737  		 * append size.
738  		 */
739  		if (bdev_is_zoned(device->bdev)) {
740  			blk_stack_limits(lim,
741  					 &bdev_get_queue(device->bdev)->limits,
742  					 0);
743  		}
744  	}
745  
746  	/*
747  	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
748  	 * btrfs_create_chunk(). Since we want stripe_len == zone_size,
749  	 * check the alignment here.
750  	 */
751  	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
752  		btrfs_err(fs_info,
753  			  "zoned: zone size %llu not aligned to stripe %u",
754  			  zone_size, BTRFS_STRIPE_LEN);
755  		return -EINVAL;
756  	}
757  
758  	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
759  		btrfs_err(fs_info, "zoned: mixed block groups not supported");
760  		return -EINVAL;
761  	}
762  
763  	fs_info->zone_size = zone_size;
764  	/*
765  	 * Also limit max_zone_append_size by max_segments * PAGE_SIZE.
766  	 * Technically, we can have multiple pages per segment. But, since
767  	 * we add the pages one by one to a bio and cannot increase the
768  	 * metadata reservation even if that increases the number of extents,
769  	 * it is safe to stick with the limit.
770  	 */
771  	fs_info->max_zone_append_size = ALIGN_DOWN(
772  		min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT,
773  		     (u64)lim->max_sectors << SECTOR_SHIFT,
774  		     (u64)lim->max_segments << PAGE_SHIFT),
775  		fs_info->sectorsize);
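	/*
	 * Worked example with assumed limits (editorial): max_zone_append_sectors
	 * = 1024 (512KiB), max_sectors = 2560 (1280KiB) and max_segments = 128
	 * with 4KiB pages (512KiB) give min3() = 512KiB, which is already
	 * aligned to a 4KiB sectorsize.
	 */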
776  	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
777  	if (fs_info->max_zone_append_size < fs_info->max_extent_size)
778  		fs_info->max_extent_size = fs_info->max_zone_append_size;
779  
780  	/*
781  	 * Check mount options here, because the zoned mode status might have
782  	 * just changed now that fs_info->zone_size is set.
783  	 */
784  	ret = btrfs_check_mountopts_zoned(fs_info);
785  	if (ret)
786  		return ret;
787  
788  	btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
789  	return 0;
790  }
791  
792  int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
793  {
794  	if (!btrfs_is_zoned(info))
795  		return 0;
796  
797  	/*
798  	 * Space cache writing is not COWed. Disable that to avoid write errors
799  	 * in sequential zones.
800  	 */
801  	if (btrfs_test_opt(info, SPACE_CACHE)) {
802  		btrfs_err(info, "zoned: space cache v1 is not supported");
803  		return -EINVAL;
804  	}
805  
806  	if (btrfs_test_opt(info, NODATACOW)) {
807  		btrfs_err(info, "zoned: NODATACOW not supported");
808  		return -EINVAL;
809  	}
810  
811  	btrfs_clear_and_info(info, DISCARD_ASYNC,
812  			"zoned: async discard ignored and disabled for zoned mode");
813  
814  	return 0;
815  }
816  
817  static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
818  			   int rw, u64 *bytenr_ret)
819  {
820  	u64 wp;
821  	int ret;
822  
823  	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
824  		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
825  		return 0;
826  	}
827  
828  	ret = sb_write_pointer(bdev, zones, &wp);
829  	if (ret != -ENOENT && ret < 0)
830  		return ret;
831  
832  	if (rw == WRITE) {
833  		struct blk_zone *reset = NULL;
834  
835  		if (wp == zones[0].start << SECTOR_SHIFT)
836  			reset = &zones[0];
837  		else if (wp == zones[1].start << SECTOR_SHIFT)
838  			reset = &zones[1];
839  
840  		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
841  			ASSERT(sb_zone_is_full(reset));
842  
843  			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
844  					       reset->start, reset->len,
845  					       GFP_NOFS);
846  			if (ret)
847  				return ret;
848  
849  			reset->cond = BLK_ZONE_COND_EMPTY;
850  			reset->wp = reset->start;
851  		}
852  	} else if (ret != -ENOENT) {
853  		/*
854  		 * For READ, we want the previously written superblock. If the
855  		 * write pointer is at the head of a zone, move it to the end
856  		 * of the other zone first.
856  		 */
857  		u64 zone_end = 0;
858  
859  		if (wp == zones[0].start << SECTOR_SHIFT)
860  			zone_end = zones[1].start + zones[1].capacity;
861  		else if (wp == zones[1].start << SECTOR_SHIFT)
862  			zone_end = zones[0].start + zones[0].capacity;
863  		if (zone_end)
864  			wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
865  					BTRFS_SUPER_INFO_SIZE);
866  
867  		wp -= BTRFS_SUPER_INFO_SIZE;
868  	}
869  
870  	*bytenr_ret = wp;
871  	return 0;
872  
873  }
874  
875  int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
876  			       u64 *bytenr_ret)
877  {
878  	struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
879  	sector_t zone_sectors;
880  	u32 sb_zone;
881  	int ret;
882  	u8 zone_sectors_shift;
883  	sector_t nr_sectors;
884  	u32 nr_zones;
885  
886  	if (!bdev_is_zoned(bdev)) {
887  		*bytenr_ret = btrfs_sb_offset(mirror);
888  		return 0;
889  	}
890  
891  	ASSERT(rw == READ || rw == WRITE);
892  
893  	zone_sectors = bdev_zone_sectors(bdev);
894  	if (!is_power_of_2(zone_sectors))
895  		return -EINVAL;
896  	zone_sectors_shift = ilog2(zone_sectors);
897  	nr_sectors = bdev_nr_sectors(bdev);
898  	nr_zones = nr_sectors >> zone_sectors_shift;
899  
900  	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
901  	if (sb_zone + 1 >= nr_zones)
902  		return -ENOENT;
903  
904  	ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
905  				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
906  				  zones);
907  	if (ret < 0)
908  		return ret;
909  	if (ret != BTRFS_NR_SB_LOG_ZONES)
910  		return -EIO;
911  
912  	return sb_log_location(bdev, zones, rw, bytenr_ret);
913  }
914  
915  int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
916  			  u64 *bytenr_ret)
917  {
918  	struct btrfs_zoned_device_info *zinfo = device->zone_info;
919  	u32 zone_num;
920  
921  	/*
922  	 * For a zoned filesystem on a non-zoned block device, use the same
923  	 * super block locations as a regular filesystem. That way, the super
924  	 * block can always be retrieved and the zoned flag of the volume
925  	 * detected from the super block information.
926  	 */
927  	if (!bdev_is_zoned(device->bdev)) {
928  		*bytenr_ret = btrfs_sb_offset(mirror);
929  		return 0;
930  	}
931  
932  	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
933  	if (zone_num + 1 >= zinfo->nr_zones)
934  		return -ENOENT;
935  
936  	return sb_log_location(device->bdev,
937  			       &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
938  			       rw, bytenr_ret);
939  }
940  
941  static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
942  				  int mirror)
943  {
944  	u32 zone_num;
945  
946  	if (!zinfo)
947  		return false;
948  
949  	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
950  	if (zone_num + 1 >= zinfo->nr_zones)
951  		return false;
952  
953  	if (!test_bit(zone_num, zinfo->seq_zones))
954  		return false;
955  
956  	return true;
957  }
958  
959  int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
960  {
961  	struct btrfs_zoned_device_info *zinfo = device->zone_info;
962  	struct blk_zone *zone;
963  	int i;
964  
965  	if (!is_sb_log_zone(zinfo, mirror))
966  		return 0;
967  
968  	zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
969  	for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
970  		/* This zone is full, advance to the next one */
971  		if (zone->cond == BLK_ZONE_COND_FULL) {
972  			zone++;
973  			continue;
974  		}
975  
976  		if (zone->cond == BLK_ZONE_COND_EMPTY)
977  			zone->cond = BLK_ZONE_COND_IMP_OPEN;
978  
979  		zone->wp += SUPER_INFO_SECTORS;
980  
981  		if (sb_zone_is_full(zone)) {
982  			/*
983  			 * No room left to write a new superblock. Since the
984  			 * superblock is written with REQ_SYNC, it is safe to
985  			 * finish the zone now.
986  			 *
987  			 * If the write pointer is exactly at the capacity,
988  			 * explicit ZONE_FINISH is not necessary.
989  			 */
990  			if (zone->wp != zone->start + zone->capacity) {
991  				int ret;
992  
993  				ret = blkdev_zone_mgmt(device->bdev,
994  						REQ_OP_ZONE_FINISH, zone->start,
995  						zone->len, GFP_NOFS);
996  				if (ret)
997  					return ret;
998  			}
999  
1000  			zone->wp = zone->start + zone->len;
1001  			zone->cond = BLK_ZONE_COND_FULL;
1002  		}
1003  		return 0;
1004  	}
1005  
1006  	/* All the zones are FULL. Should not reach here. */
1007  	ASSERT(0);
1008  	return -EIO;
1009  }
1010  
1011  int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
1012  {
1013  	sector_t zone_sectors;
1014  	sector_t nr_sectors;
1015  	u8 zone_sectors_shift;
1016  	u32 sb_zone;
1017  	u32 nr_zones;
1018  
1019  	zone_sectors = bdev_zone_sectors(bdev);
1020  	zone_sectors_shift = ilog2(zone_sectors);
1021  	nr_sectors = bdev_nr_sectors(bdev);
1022  	nr_zones = nr_sectors >> zone_sectors_shift;
1023  
1024  	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
1025  	if (sb_zone + 1 >= nr_zones)
1026  		return -ENOENT;
1027  
1028  	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
1029  				zone_start_sector(sb_zone, bdev),
1030  				zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
1031  }
1032  
1033  /*
1034   * Find allocatable zones within a given region.
1035   *
1036   * @device:	the device to allocate a region on
1037   * @hole_start: the position of the hole in which to allocate the region
1038   * @num_bytes:	size of the wanted region
1039   * @hole_end:	the end of the hole
1040   * @return:	position of allocatable zones
1041   *
1042   * The allocatable region must not contain any superblock locations.
1043   */
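/*
 * Illustrative example (editorial): with 256MiB zones and num_bytes of 1GiB
 * (4 zones), a candidate region overlapping superblock zone 2048 is skipped
 * and pos advances to the start of zone 2048 + BTRFS_NR_SB_LOG_ZONES = 2050.
 */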
1044  u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
1045  				 u64 hole_end, u64 num_bytes)
1046  {
1047  	struct btrfs_zoned_device_info *zinfo = device->zone_info;
1048  	const u8 shift = zinfo->zone_size_shift;
1049  	u64 nzones = num_bytes >> shift;
1050  	u64 pos = hole_start;
1051  	u64 begin, end;
1052  	bool have_sb;
1053  	int i;
1054  
1055  	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
1056  	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
1057  
1058  	while (pos < hole_end) {
1059  		begin = pos >> shift;
1060  		end = begin + nzones;
1061  
1062  		if (end > zinfo->nr_zones)
1063  			return hole_end;
1064  
1065  		/* Check if zones in the region are all empty */
1066  		if (btrfs_dev_is_sequential(device, pos) &&
1067  		    !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) {
1068  			pos += zinfo->zone_size;
1069  			continue;
1070  		}
1071  
1072  		have_sb = false;
1073  		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1074  			u32 sb_zone;
1075  			u64 sb_pos;
1076  
1077  			sb_zone = sb_zone_number(shift, i);
1078  			if (!(end <= sb_zone ||
1079  			      sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
1080  				have_sb = true;
1081  				pos = zone_start_physical(
1082  					sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
1083  				break;
1084  			}
1085  
1086  			/* We also need to exclude regular superblock positions */
1087  			sb_pos = btrfs_sb_offset(i);
1088  			if (!(pos + num_bytes <= sb_pos ||
1089  			      sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
1090  				have_sb = true;
1091  				pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
1092  					    zinfo->zone_size);
1093  				break;
1094  			}
1095  		}
1096  		if (!have_sb)
1097  			break;
1098  	}
1099  
1100  	return pos;
1101  }
1102  
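/*
 * Editorial note: the atomic_dec_if_positive() + test_and_set_bit() pair
 * below lets concurrent callers race safely; a caller that loses the bit
 * race returns its reservation via atomic_inc().
 */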
1103  static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
1104  {
1105  	struct btrfs_zoned_device_info *zone_info = device->zone_info;
1106  	unsigned int zno = (pos >> zone_info->zone_size_shift);
1107  
1108  	/* We can use any number of zones */
1109  	if (zone_info->max_active_zones == 0)
1110  		return true;
1111  
1112  	if (!test_bit(zno, zone_info->active_zones)) {
1113  		/* Active zone left? */
1114  		if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
1115  			return false;
1116  		if (test_and_set_bit(zno, zone_info->active_zones)) {
1117  			/* Someone already set the bit */
1118  			atomic_inc(&zone_info->active_zones_left);
1119  		}
1120  	}
1121  
1122  	return true;
1123  }
1124  
1125  static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
1126  {
1127  	struct btrfs_zoned_device_info *zone_info = device->zone_info;
1128  	unsigned int zno = (pos >> zone_info->zone_size_shift);
1129  
1130  	/* We can use any number of zones */
1131  	if (zone_info->max_active_zones == 0)
1132  		return;
1133  
1134  	if (test_and_clear_bit(zno, zone_info->active_zones))
1135  		atomic_inc(&zone_info->active_zones_left);
1136  }
1137  
1138  int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
1139  			    u64 length, u64 *bytes)
1140  {
1141  	int ret;
1142  
1143  	*bytes = 0;
1144  	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
1145  			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
1146  			       GFP_NOFS);
1147  	if (ret)
1148  		return ret;
1149  
1150  	*bytes = length;
1151  	while (length) {
1152  		btrfs_dev_set_zone_empty(device, physical);
1153  		btrfs_dev_clear_active_zone(device, physical);
1154  		physical += device->zone_info->zone_size;
1155  		length -= device->zone_info->zone_size;
1156  	}
1157  
1158  	return 0;
1159  }
1160  
1161  int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
1162  {
1163  	struct btrfs_zoned_device_info *zinfo = device->zone_info;
1164  	const u8 shift = zinfo->zone_size_shift;
1165  	unsigned long begin = start >> shift;
1166  	unsigned long nbits = size >> shift;
1167  	u64 pos;
1168  	int ret;
1169  
1170  	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
1171  	ASSERT(IS_ALIGNED(size, zinfo->zone_size));
1172  
1173  	if (begin + nbits > zinfo->nr_zones)
1174  		return -ERANGE;
1175  
1176  	/* All the zones are conventional */
1177  	if (bitmap_test_range_all_zero(zinfo->seq_zones, begin, nbits))
1178  		return 0;
1179  
1180  	/* All the zones are sequential and empty */
1181  	if (bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) &&
1182  	    bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits))
1183  		return 0;
1184  
1185  	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
1186  		u64 reset_bytes;
1187  
1188  		if (!btrfs_dev_is_sequential(device, pos) ||
1189  		    btrfs_dev_is_empty_zone(device, pos))
1190  			continue;
1191  
1192  		/* Free regions should be empty */
1193  		btrfs_warn_in_rcu(
1194  			device->fs_info,
1195  		"zoned: resetting device %s (devid %llu) zone %llu for allocation",
1196  			rcu_str_deref(device->name), device->devid, pos >> shift);
1197  		WARN_ON_ONCE(1);
1198  
1199  		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
1200  					      &reset_bytes);
1201  		if (ret)
1202  			return ret;
1203  	}
1204  
1205  	return 0;
1206  }
1207  
1208  /*
1209   * Calculate an allocation pointer from the extent allocation information
1210   * for a block group consisting of conventional zones. The allocation
1211   * offset points to the end of the highest addressed extent in the block
1212   * group.
1213   */
1214  static int calculate_alloc_pointer(struct btrfs_block_group *cache,
1215  				   u64 *offset_ret, bool new)
1216  {
1217  	struct btrfs_fs_info *fs_info = cache->fs_info;
1218  	struct btrfs_root *root;
1219  	struct btrfs_path *path;
1220  	struct btrfs_key key;
1221  	struct btrfs_key found_key;
1222  	int ret;
1223  	u64 length;
1224  
1225  	/*
1226  	 * Avoid tree lookups for a new block group, there's no use for it.
1227  	 * It must always be 0.
1228  	 *
1229  	 * Also, we have a lock chain of extent buffer lock -> chunk mutex.
1230  	 * For a new block group, this function is called from
1231  	 * btrfs_make_block_group() which is already taking the chunk mutex.
1232  	 * Thus, to avoid a deadlock, we cannot call calculate_alloc_pointer(),
1233  	 * which takes extent buffer locks.
1234  	 */
1235  	if (new) {
1236  		*offset_ret = 0;
1237  		return 0;
1238  	}
1239  
1240  	path = btrfs_alloc_path();
1241  	if (!path)
1242  		return -ENOMEM;
1243  
1244  	key.objectid = cache->start + cache->length;
1245  	key.type = 0;
1246  	key.offset = 0;
1247  
1248  	root = btrfs_extent_root(fs_info, key.objectid);
1249  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1250  	/* We should not find an exact match */
1251  	if (!ret)
1252  		ret = -EUCLEAN;
1253  	if (ret < 0)
1254  		goto out;
1255  
1256  	ret = btrfs_previous_extent_item(root, path, cache->start);
1257  	if (ret) {
1258  		if (ret == 1) {
1259  			ret = 0;
1260  			*offset_ret = 0;
1261  		}
1262  		goto out;
1263  	}
1264  
1265  	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
1266  
1267  	if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
1268  		length = found_key.offset;
1269  	else
1270  		length = fs_info->nodesize;
1271  
1272  	if (!(found_key.objectid >= cache->start &&
1273  	       found_key.objectid + length <= cache->start + cache->length)) {
1274  		ret = -EUCLEAN;
1275  		goto out;
1276  	}
1277  	*offset_ret = found_key.objectid + length - cache->start;
1278  	ret = 0;
1279  
1280  out:
1281  	btrfs_free_path(path);
1282  	return ret;
1283  }
1284  
1285  struct zone_info {
1286  	u64 physical;
1287  	u64 capacity;
1288  	u64 alloc_offset;
1289  };
1290  
1291  static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
1292  				struct zone_info *info, unsigned long *active,
1293  				struct map_lookup *map)
1294  {
1295  	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1296  	struct btrfs_device *device;
1297  	int dev_replace_is_ongoing = 0;
1298  	unsigned int nofs_flag;
1299  	struct blk_zone zone;
1300  	int ret;
1301  
1302  	info->physical = map->stripes[zone_idx].physical;
1303  
1304  	down_read(&dev_replace->rwsem);
1305  	device = map->stripes[zone_idx].dev;
1306  
1307  	if (!device->bdev) {
1308  		up_read(&dev_replace->rwsem);
1309  		info->alloc_offset = WP_MISSING_DEV;
1310  		return 0;
1311  	}
1312  
1313  	/* Consider a zone as active if we can allow any number of active zones. */
1314  	if (!device->zone_info->max_active_zones)
1315  		__set_bit(zone_idx, active);
1316  
1317  	if (!btrfs_dev_is_sequential(device, info->physical)) {
1318  		up_read(&dev_replace->rwsem);
1319  		info->alloc_offset = WP_CONVENTIONAL;
1320  		return 0;
1321  	}
1322  
1323  	/* This zone will be used for allocation, so mark this zone non-empty. */
1324  	btrfs_dev_clear_zone_empty(device, info->physical);
1325  
1326  	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
1327  	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
1328  		btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical);
1329  
1330  	/*
1331  	 * The group is mapped to a sequential zone. Get the zone write pointer
1332  	 * to determine the allocation offset within the zone.
1333  	 */
1334  	WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
1335  	nofs_flag = memalloc_nofs_save();
1336  	ret = btrfs_get_dev_zone(device, info->physical, &zone);
1337  	memalloc_nofs_restore(nofs_flag);
1338  	if (ret) {
1339  		up_read(&dev_replace->rwsem);
1340  		if (ret != -EIO && ret != -EOPNOTSUPP)
1341  			return ret;
1342  		info->alloc_offset = WP_MISSING_DEV;
1343  		return 0;
1344  	}
1345  
1346  	if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
1347  		btrfs_err_in_rcu(fs_info,
1348  		"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
1349  			zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
1350  			device->devid);
1351  		up_read(&dev_replace->rwsem);
1352  		return -EIO;
1353  	}
1354  
1355  	info->capacity = (zone.capacity << SECTOR_SHIFT);
1356  
1357  	switch (zone.cond) {
1358  	case BLK_ZONE_COND_OFFLINE:
1359  	case BLK_ZONE_COND_READONLY:
1360  		btrfs_err_in_rcu(fs_info,
1361  		"zoned: offline/readonly zone %llu on device %s (devid %llu)",
1362  			  (info->physical >> device->zone_info->zone_size_shift),
1363  			  rcu_str_deref(device->name), device->devid);
1364  		info->alloc_offset = WP_MISSING_DEV;
1365  		break;
1366  	case BLK_ZONE_COND_EMPTY:
1367  		info->alloc_offset = 0;
1368  		break;
1369  	case BLK_ZONE_COND_FULL:
1370  		info->alloc_offset = info->capacity;
1371  		break;
1372  	default:
1373  		/* Partially used zone. */
1374  		info->alloc_offset = ((zone.wp - zone.start) << SECTOR_SHIFT);
1375  		__set_bit(zone_idx, active);
1376  		break;
1377  	}
1378  
1379  	up_read(&dev_replace->rwsem);
1380  
1381  	return 0;
1382  }
1383  
1384  static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
1385  					 struct zone_info *info,
1386  					 unsigned long *active)
1387  {
1388  	if (info->alloc_offset == WP_MISSING_DEV) {
1389  		btrfs_err(bg->fs_info,
1390  			"zoned: cannot recover write pointer for zone %llu",
1391  			info->physical);
1392  		return -EIO;
1393  	}
1394  
1395  	bg->alloc_offset = info->alloc_offset;
1396  	bg->zone_capacity = info->capacity;
1397  	if (test_bit(0, active))
1398  		set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
1399  	return 0;
1400  }
1401  
1402  static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
1403  				      struct map_lookup *map,
1404  				      struct zone_info *zone_info,
1405  				      unsigned long *active)
1406  {
1407  	if (map->type & BTRFS_BLOCK_GROUP_DATA) {
1408  		btrfs_err(bg->fs_info,
1409  			  "zoned: profile DUP not yet supported on data bg");
1410  		return -EINVAL;
1411  	}
1412  
1413  	if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
1414  		btrfs_err(bg->fs_info,
1415  			  "zoned: cannot recover write pointer for zone %llu",
1416  			  zone_info[0].physical);
1417  		return -EIO;
1418  	}
1419  	if (zone_info[1].alloc_offset == WP_MISSING_DEV) {
1420  		btrfs_err(bg->fs_info,
1421  			  "zoned: cannot recover write pointer for zone %llu",
1422  			  zone_info[1].physical);
1423  		return -EIO;
1424  	}
1425  	if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
1426  		btrfs_err(bg->fs_info,
1427  			  "zoned: write pointer offset mismatch of zones in DUP profile");
1428  		return -EIO;
1429  	}
1430  
1431  	if (test_bit(0, active) != test_bit(1, active)) {
1432  		if (!btrfs_zone_activate(bg))
1433  			return -EIO;
1434  	} else if (test_bit(0, active)) {
1435  		set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
1436  	}
1437  
1438  	bg->alloc_offset = zone_info[0].alloc_offset;
1439  	bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity);
1440  	return 0;
1441  }
1442  
1443  int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
1444  {
1445  	struct btrfs_fs_info *fs_info = cache->fs_info;
1446  	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
1447  	struct extent_map *em;
1448  	struct map_lookup *map;
1449  	u64 logical = cache->start;
1450  	u64 length = cache->length;
1451  	struct zone_info *zone_info = NULL;
1452  	int ret;
1453  	int i;
1454  	unsigned long *active = NULL;
1455  	u64 last_alloc = 0;
1456  	u32 num_sequential = 0, num_conventional = 0;
1457  
1458  	if (!btrfs_is_zoned(fs_info))
1459  		return 0;
1460  
1461  	/* Sanity check */
1462  	if (!IS_ALIGNED(length, fs_info->zone_size)) {
1463  		btrfs_err(fs_info,
1464  		"zoned: block group %llu len %llu unaligned to zone size %llu",
1465  			  logical, length, fs_info->zone_size);
1466  		return -EIO;
1467  	}
1468  
1469  	/* Get the chunk mapping */
1470  	read_lock(&em_tree->lock);
1471  	em = lookup_extent_mapping(em_tree, logical, length);
1472  	read_unlock(&em_tree->lock);
1473  
1474  	if (!em)
1475  		return -EINVAL;
1476  
1477  	map = em->map_lookup;
1478  
1479  	cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
1480  	if (!cache->physical_map) {
1481  		ret = -ENOMEM;
1482  		goto out;
1483  	}
1484  
1485  	zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS);
1486  	if (!zone_info) {
1487  		ret = -ENOMEM;
1488  		goto out;
1489  	}
1490  
1491  	active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
1492  	if (!active) {
1493  		ret = -ENOMEM;
1494  		goto out;
1495  	}
1496  
1497  	for (i = 0; i < map->num_stripes; i++) {
1498  		ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);
1499  		if (ret)
1500  			goto out;
1501  
1502  		if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
1503  			num_conventional++;
1504  		else
1505  			num_sequential++;
1506  	}
1507  
1508  	if (num_sequential > 0)
1509  		set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
1510  
1511  	if (num_conventional > 0) {
1512  		/* Zone capacity is always zone size in emulation */
1513  		cache->zone_capacity = cache->length;
1514  		ret = calculate_alloc_pointer(cache, &last_alloc, new);
1515  		if (ret) {
1516  			btrfs_err(fs_info,
1517  			"zoned: failed to determine allocation offset of bg %llu",
1518  				  cache->start);
1519  			goto out;
1520  		} else if (map->num_stripes == num_conventional) {
1521  			cache->alloc_offset = last_alloc;
1522  			set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
1523  			goto out;
1524  		}
1525  	}
1526  
1527  	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
1528  	case 0: /* single */
1529  		ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
1530  		break;
1531  	case BTRFS_BLOCK_GROUP_DUP:
1532  		ret = btrfs_load_block_group_dup(cache, map, zone_info, active);
1533  		break;
1534  	case BTRFS_BLOCK_GROUP_RAID1:
1535  	case BTRFS_BLOCK_GROUP_RAID0:
1536  	case BTRFS_BLOCK_GROUP_RAID10:
1537  	case BTRFS_BLOCK_GROUP_RAID5:
1538  	case BTRFS_BLOCK_GROUP_RAID6:
1539  		/* non-single profiles are not supported yet */
1540  	default:
1541  		btrfs_err(fs_info, "zoned: profile %s not yet supported",
1542  			  btrfs_bg_type_to_raid_name(map->type));
1543  		ret = -EINVAL;
1544  		goto out;
1545  	}
1546  
1547  out:
1548  	if (cache->alloc_offset > fs_info->zone_size) {
1549  		btrfs_err(fs_info,
1550  			"zoned: invalid write pointer %llu in block group %llu",
1551  			cache->alloc_offset, cache->start);
1552  		ret = -EIO;
1553  	}
1554  
1555  	if (cache->alloc_offset > cache->zone_capacity) {
1556  		btrfs_err(fs_info,
1557  "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
1558  			  cache->alloc_offset, cache->zone_capacity,
1559  			  cache->start);
1560  		ret = -EIO;
1561  	}
1562  
1563  	/* An extent is allocated after the write pointer */
1564  	if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
1565  		btrfs_err(fs_info,
1566  			  "zoned: got wrong write pointer in BG %llu: %llu > %llu",
1567  			  logical, last_alloc, cache->alloc_offset);
1568  		ret = -EIO;
1569  	}
1570  
1571  	if (!ret) {
1572  		cache->meta_write_pointer = cache->alloc_offset + cache->start;
1573  		if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
1574  			btrfs_get_block_group(cache);
1575  			spin_lock(&fs_info->zone_active_bgs_lock);
1576  			list_add_tail(&cache->active_bg_list,
1577  				      &fs_info->zone_active_bgs);
1578  			spin_unlock(&fs_info->zone_active_bgs_lock);
1579  		}
1580  	} else {
1581  		kfree(cache->physical_map);
1582  		cache->physical_map = NULL;
1583  	}
1584  	bitmap_free(active);
1585  	kfree(zone_info);
1586  	free_extent_map(em);
1587  
1588  	return ret;
1589  }
1590  
1591  void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
1592  {
1593  	u64 unusable, free;
1594  
1595  	if (!btrfs_is_zoned(cache->fs_info))
1596  		return;
1597  
1598  	WARN_ON(cache->bytes_super != 0);
1599  	unusable = (cache->alloc_offset - cache->used) +
1600  		   (cache->length - cache->zone_capacity);
1601  	free = cache->zone_capacity - cache->alloc_offset;
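	/*
	 * Worked example with assumed numbers (editorial): length 256MiB,
	 * zone_capacity 236MiB, alloc_offset 100MiB and used 80MiB give
	 * unusable = (100 - 80) + (256 - 236) = 40MiB and
	 * free = 236 - 100 = 136MiB.
	 */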
1602  
1603  	/* We only need ->free_space in ALLOC_SEQ block groups */
1604  	cache->cached = BTRFS_CACHE_FINISHED;
1605  	cache->free_space_ctl->free_space = free;
1606  	cache->zone_unusable = unusable;
1607  }
1608  
1609  void btrfs_redirty_list_add(struct btrfs_transaction *trans,
1610  			    struct extent_buffer *eb)
1611  {
1612  	if (!btrfs_is_zoned(eb->fs_info) ||
1613  	    btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN))
1614  		return;
1615  
1616  	ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
1617  
1618  	memzero_extent_buffer(eb, 0, eb->len);
1619  	set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
1620  	set_extent_buffer_dirty(eb);
1621  	set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
1622  			EXTENT_DIRTY | EXTENT_NOWAIT, NULL);
1623  }
1624  
1625  bool btrfs_use_zone_append(struct btrfs_bio *bbio)
1626  {
1627  	u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
1628  	struct btrfs_inode *inode = bbio->inode;
1629  	struct btrfs_fs_info *fs_info = bbio->fs_info;
1630  	struct btrfs_block_group *cache;
1631  	bool ret = false;
1632  
1633  	if (!btrfs_is_zoned(fs_info))
1634  		return false;
1635  
1636  	if (!inode || !is_data_inode(&inode->vfs_inode))
1637  		return false;
1638  
1639  	if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
1640  		return false;
1641  
1642  	/*
1643  	 * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
1644  	 * extent layout the relocation code has.
1645  	 * Furthermore, we have set aside our own block group from which only the
1646  	 * relocation "process" can allocate, and make sure only one process at a
1647  	 * time can add pages to an extent that gets relocated, so it's safe to
1648  	 * use regular REQ_OP_WRITE for this special case.
1649  	 */
1650  	if (btrfs_is_data_reloc_root(inode->root))
1651  		return false;
1652  
1653  	cache = btrfs_lookup_block_group(fs_info, start);
1654  	ASSERT(cache);
1655  	if (!cache)
1656  		return false;
1657  
1658  	ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
1659  	btrfs_put_block_group(cache);
1660  
1661  	return ret;
1662  }
1663  
1664  void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
1665  {
1666  	const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
1667  	struct btrfs_ordered_sum *sum = bbio->sums;
1668  
1669  	if (physical < bbio->orig_physical)
1670  		sum->logical -= bbio->orig_physical - physical;
1671  	else
1672  		sum->logical += physical - bbio->orig_physical;
1673  }
1674  
1675  static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
1676  					u64 logical)
1677  {
1678  	struct extent_map_tree *em_tree = &BTRFS_I(ordered->inode)->extent_tree;
1679  	struct extent_map *em;
1680  
1681  	ordered->disk_bytenr = logical;
1682  
1683  	write_lock(&em_tree->lock);
1684  	em = search_extent_mapping(em_tree, ordered->file_offset,
1685  				   ordered->num_bytes);
1686  	em->block_start = logical;
1687  	free_extent_map(em);
1688  	write_unlock(&em_tree->lock);
1689  }
1690  
1691  static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
1692  				      u64 logical, u64 len)
1693  {
1694  	struct btrfs_ordered_extent *new;
1695  
1696  	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
1697  	    split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset,
1698  			     ordered->num_bytes, len, logical))
1699  		return false;
1700  
1701  	new = btrfs_split_ordered_extent(ordered, len);
1702  	if (IS_ERR(new))
1703  		return false;
1704  	new->disk_bytenr = logical;
1705  	btrfs_finish_one_ordered(new);
1706  	return true;
1707  }
1708  
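/*
 * Fix up an ordered extent after zone append completion.
 *
 * Walk the ordered_sum list and merge ranges whose physical addresses turned
 * out to be contiguous. Whenever a discontinuity is found, split off the part
 * written so far into its own ordered extent. Finally, if the start of the
 * remaining range moved, rewrite the ordered extent's logical address.
 */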
1709  void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
1710  {
1711  	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
1712  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1713  	struct btrfs_ordered_sum *sum;
1714  	u64 logical, len;
1715  
1716  	/*
1717  	 * A write to a pre-allocated region is only done for data relocation,
1718  	 * so it uses a regular WRITE operation. No split/rewrite is necessary.
1719  	 */
1720  	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
1721  		return;
1722  
1723  	ASSERT(!list_empty(&ordered->list));
1724  	/* The ordered->list could only be empty in the pre-alloc case handled above. */
1725  	sum = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list);
1726  	logical = sum->logical;
1727  	len = sum->len;
1728  
1729  	while (len < ordered->disk_num_bytes) {
1730  		sum = list_next_entry(sum, list);
1731  		if (sum->logical == logical + len) {
1732  			len += sum->len;
1733  			continue;
1734  		}
1735  		if (!btrfs_zoned_split_ordered(ordered, logical, len)) {
1736  			set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
1737  			btrfs_err(fs_info, "failed to split ordered extent");
1738  			goto out;
1739  		}
1740  		logical = sum->logical;
1741  		len = sum->len;
1742  	}
1743  
1744  	if (ordered->disk_bytenr != logical)
1745  		btrfs_rewrite_logical_zoned(ordered, logical);
1746  
1747  out:
1748  	/*
1749  	 * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures
1750  	 * were allocated by btrfs_alloc_dummy_sum only to record the logical
1751  	 * addresses and don't contain actual checksums.  We thus must free them
1752  	 * here so that we don't attempt to log the csums later.
1753  	 */
1754  	if ((inode->flags & BTRFS_INODE_NODATASUM) ||
1755  	    test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) {
1756  		while ((sum = list_first_entry_or_null(&ordered->list,
1757  						       typeof(*sum), list))) {
1758  			list_del(&sum->list);
1759  			kfree(sum);
1760  		}
1761  	}
1762  }
1763  
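/*
 * Make sure the block group we are about to write metadata into is active.
 *
 * The tree-log block group is activated in place, finishing one other zone if
 * needed. For regular metadata/system block groups, the currently tracked
 * active block group is finished first and @active_bg is pivoted to
 * @ctx->zoned_bg. Returns false if activation is not possible right now.
 */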
1764  static bool check_bg_is_active(struct btrfs_eb_write_context *ctx,
1765  			       struct btrfs_block_group **active_bg)
1766  {
1767  	const struct writeback_control *wbc = ctx->wbc;
1768  	struct btrfs_block_group *block_group = ctx->zoned_bg;
1769  	struct btrfs_fs_info *fs_info = block_group->fs_info;
1770  
1771  	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
1772  		return true;
1773  
1774  	if (fs_info->treelog_bg == block_group->start) {
1775  		if (!btrfs_zone_activate(block_group)) {
1776  			int ret_fin = btrfs_zone_finish_one_bg(fs_info);
1777  
1778  			if (ret_fin != 1 || !btrfs_zone_activate(block_group))
1779  				return false;
1780  		}
1781  	} else if (*active_bg != block_group) {
1782  		struct btrfs_block_group *tgt = *active_bg;
1783  
1784  		/* zoned_meta_io_lock protects fs_info->active_{meta,system}_bg. */
1785  		lockdep_assert_held(&fs_info->zoned_meta_io_lock);
1786  
1787  		if (tgt) {
1788  			/*
1789  			 * If there is unsent IO left in the allocated area, we
1790  			 * cannot wait for it, as that may cause a deadlock.
1791  			 */
1792  			if (tgt->meta_write_pointer < tgt->start + tgt->alloc_offset) {
1793  				if (wbc->sync_mode == WB_SYNC_NONE ||
1794  				    (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync))
1795  					return false;
1796  			}
1797  
1798  			/* Pivot active metadata/system block group. */
1799  			btrfs_zoned_meta_io_unlock(fs_info);
1800  			wait_eb_writebacks(tgt);
1801  			do_zone_finish(tgt, true);
1802  			btrfs_zoned_meta_io_lock(fs_info);
1803  			if (*active_bg == tgt) {
1804  				btrfs_put_block_group(tgt);
1805  				*active_bg = NULL;
1806  			}
1807  		}
1808  		if (!btrfs_zone_activate(block_group))
1809  			return false;
1810  		if (*active_bg != block_group) {
1811  			ASSERT(*active_bg == NULL);
1812  			*active_bg = block_group;
1813  			btrfs_get_block_group(block_group);
1814  		}
1815  	}
1816  
1817  	return true;
1818  }
1819  
1820  /*
1821   * Check if @ctx->eb is aligned to the write pointer.
1822   *
1823   * Return:
1824   *   0:        @ctx->eb is at the write pointer. You can write it.
1825   *   -EAGAIN:  There is a hole. The caller should handle the case.
1826   *   -EBUSY:   There is a hole, but the caller can just bail out.
1827   */
1828  int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
1829  				   struct btrfs_eb_write_context *ctx)
1830  {
1831  	const struct writeback_control *wbc = ctx->wbc;
1832  	const struct extent_buffer *eb = ctx->eb;
1833  	struct btrfs_block_group *block_group = ctx->zoned_bg;
1834  
1835  	if (!btrfs_is_zoned(fs_info))
1836  		return 0;
1837  
1838  	if (block_group) {
1839  		if (block_group->start > eb->start ||
1840  		    block_group->start + block_group->length <= eb->start) {
1841  			btrfs_put_block_group(block_group);
1842  			block_group = NULL;
1843  			ctx->zoned_bg = NULL;
1844  		}
1845  	}
1846  
1847  	if (!block_group) {
1848  		block_group = btrfs_lookup_block_group(fs_info, eb->start);
1849  		if (!block_group)
1850  			return 0;
1851  		ctx->zoned_bg = block_group;
1852  	}
1853  
1854  	if (block_group->meta_write_pointer == eb->start) {
1855  		struct btrfs_block_group **tgt;
1856  
1857  		if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
1858  			return 0;
1859  
1860  		if (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)
1861  			tgt = &fs_info->active_system_bg;
1862  		else
1863  			tgt = &fs_info->active_meta_bg;
1864  		if (check_bg_is_active(ctx, tgt))
1865  			return 0;
1866  	}
1867  
1868  	/*
1869  	 * Since we may release fs_info->zoned_meta_io_lock, someone else may have
1870  	 * already started writing this eb. In that case, we can just bail out.
1871  	 */
1872  	if (block_group->meta_write_pointer > eb->start)
1873  		return -EBUSY;
1874  
1875  	/* If for_sync, this hole will be filled by a transaction commit. */
1876  	if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
1877  		return -EAGAIN;
1878  	return -EBUSY;
1879  }
1880  
1881  int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
1882  {
1883  	if (!btrfs_dev_is_sequential(device, physical))
1884  		return -EOPNOTSUPP;
1885  
1886  	return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
1887  				    length >> SECTOR_SHIFT, GFP_NOFS, 0);
1888  }
1889  
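/*
 * Report the zone containing @logical by querying the underlying devices.
 *
 * The logical address is mapped to its physical stripes and the first mirror
 * that responds successfully fills in @zone. Missing and failing devices are
 * skipped; RAID5/6 chunks are rejected with -EINVAL.
 */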
1890  static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
1891  			  struct blk_zone *zone)
1892  {
1893  	struct btrfs_io_context *bioc = NULL;
1894  	u64 mapped_length = PAGE_SIZE;
1895  	unsigned int nofs_flag;
1896  	int nmirrors;
1897  	int i, ret;
1898  
1899  	ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
1900  			      &mapped_length, &bioc, NULL, NULL, 1);
1901  	if (ret || !bioc || mapped_length < PAGE_SIZE) {
1902  		ret = -EIO;
1903  		goto out_put_bioc;
1904  	}
1905  
1906  	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1907  		ret = -EINVAL;
1908  		goto out_put_bioc;
1909  	}
1910  
1911  	nofs_flag = memalloc_nofs_save();
1912  	nmirrors = (int)bioc->num_stripes;
1913  	for (i = 0; i < nmirrors; i++) {
1914  		u64 physical = bioc->stripes[i].physical;
1915  		struct btrfs_device *dev = bioc->stripes[i].dev;
1916  
1917  		/* Missing device */
1918  		if (!dev->bdev)
1919  			continue;
1920  
1921  		ret = btrfs_get_dev_zone(dev, physical, zone);
1922  		/* Failing device */
1923  		if (ret == -EIO || ret == -EOPNOTSUPP)
1924  			continue;
1925  		break;
1926  	}
1927  	memalloc_nofs_restore(nofs_flag);
1928  out_put_bioc:
1929  	btrfs_put_bioc(bioc);
1930  	return ret;
1931  }
1932  
1933  /*
1934   * Synchronize the write pointer in the zone at @physical_start on @tgt_dev
1935   * by filling zeros from @physical_pos up to the write pointer of the
1936   * dev-replace source device.
1937   */
1938  int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
1939  				    u64 physical_start, u64 physical_pos)
1940  {
1941  	struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
1942  	struct blk_zone zone;
1943  	u64 length;
1944  	u64 wp;
1945  	int ret;
1946  
1947  	if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
1948  		return 0;
1949  
1950  	ret = read_zone_info(fs_info, logical, &zone);
1951  	if (ret)
1952  		return ret;
1953  
1954  	wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);
1955  
1956  	if (physical_pos == wp)
1957  		return 0;
1958  
1959  	if (physical_pos > wp)
1960  		return -EUCLEAN;
1961  
1962  	length = wp - physical_pos;
1963  	return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
1964  }
1965  
1966  /*
1967   * Activate block group and underlying device zones
1968   *
1969   * @block_group: the block group to activate
1970   *
1971   * Return: true on success, false otherwise
1972   */
1973  bool btrfs_zone_activate(struct btrfs_block_group *block_group)
1974  {
1975  	struct btrfs_fs_info *fs_info = block_group->fs_info;
1976  	struct map_lookup *map;
1977  	struct btrfs_device *device;
1978  	u64 physical;
1979  	const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA);
1980  	bool ret;
1981  	int i;
1982  
1983  	if (!btrfs_is_zoned(block_group->fs_info))
1984  		return true;
1985  
1986  	map = block_group->physical_map;
1987  
1988  	spin_lock(&fs_info->zone_active_bgs_lock);
1989  	spin_lock(&block_group->lock);
1990  	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
1991  		ret = true;
1992  		goto out_unlock;
1993  	}
1994  
1995  	/* No space left */
1996  	if (btrfs_zoned_bg_is_full(block_group)) {
1997  		ret = false;
1998  		goto out_unlock;
1999  	}
2000  
2001  	for (i = 0; i < map->num_stripes; i++) {
2002  		struct btrfs_zoned_device_info *zinfo;
2003  		int reserved = 0;
2004  
2005  		device = map->stripes[i].dev;
2006  		physical = map->stripes[i].physical;
2007  		zinfo = device->zone_info;
2008  
2009  		if (zinfo->max_active_zones == 0)
2010  			continue;
2011  
2012  		if (is_data)
2013  			reserved = zinfo->reserved_active_zones;
2014  		/*
2015  		 * For the data block group, leave active zones for one
2016  		 * metadata block group and one system block group.
2017  		 */
2018  		if (atomic_read(&zinfo->active_zones_left) <= reserved) {
2019  			ret = false;
2020  			goto out_unlock;
2021  		}
2022  
2023  		if (!btrfs_dev_set_active_zone(device, physical)) {
2024  			/* Cannot activate the zone */
2025  			ret = false;
2026  			goto out_unlock;
2027  		}
2028  		if (!is_data)
2029  			zinfo->reserved_active_zones--;
2030  	}
2031  
2032  	/* Successfully activated all the zones */
2033  	set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
2034  	spin_unlock(&block_group->lock);
2035  
2036  	/* For the active block group list */
2037  	btrfs_get_block_group(block_group);
2038  	list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
2039  	spin_unlock(&fs_info->zone_active_bgs_lock);
2040  
2041  	return true;
2042  
2043  out_unlock:
2044  	spin_unlock(&block_group->lock);
2045  	spin_unlock(&fs_info->zone_active_bgs_lock);
2046  	return ret;
2047  }
2048  
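/*
 * Wait for writeback of every extent buffer that lives inside @block_group,
 * walking the buffer radix tree over the block group's logical range.
 */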
2049  static void wait_eb_writebacks(struct btrfs_block_group *block_group)
2050  {
2051  	struct btrfs_fs_info *fs_info = block_group->fs_info;
2052  	const u64 end = block_group->start + block_group->length;
2053  	struct radix_tree_iter iter;
2054  	struct extent_buffer *eb;
2055  	void __rcu **slot;
2056  
2057  	rcu_read_lock();
2058  	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
2059  				 block_group->start >> fs_info->sectorsize_bits) {
2060  		eb = radix_tree_deref_slot(slot);
2061  		if (!eb)
2062  			continue;
2063  		if (radix_tree_deref_retry(eb)) {
2064  			slot = radix_tree_iter_retry(&iter);
2065  			continue;
2066  		}
2067  
2068  		if (eb->start < block_group->start)
2069  			continue;
2070  		if (eb->start >= end)
2071  			break;
2072  
2073  		slot = radix_tree_iter_resume(slot, &iter);
2074  		rcu_read_unlock();
2075  		wait_on_extent_buffer_writeback(eb);
2076  		rcu_read_lock();
2077  	}
2078  	rcu_read_unlock();
2079  }
2080  
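/*
 * Finish the zones backing @block_group and deactivate it.
 *
 * When @fully_written is false, the block group is temporarily marked
 * read-only and all outstanding reservations, ordered extents and extent
 * buffer writebacks are waited for before the zones are finished. Returns
 * -EAGAIN if the block group still has unwritten or reserved space.
 */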
2081  static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
2082  {
2083  	struct btrfs_fs_info *fs_info = block_group->fs_info;
2084  	struct map_lookup *map;
2085  	const bool is_metadata = (block_group->flags &
2086  			(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
2087  	int ret = 0;
2088  	int i;
2089  
2090  	spin_lock(&block_group->lock);
2091  	if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
2092  		spin_unlock(&block_group->lock);
2093  		return 0;
2094  	}
2095  
2096  	/* Check if we have unwritten allocated space */
2097  	if (is_metadata &&
2098  	    block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
2099  		spin_unlock(&block_group->lock);
2100  		return -EAGAIN;
2101  	}
2102  
2103  	/*
2104  	 * If we are sure that the block group is full (= no more room left for
2105  	 * new allocation) and the IO for the last usable block is completed, we
2106  	 * don't need to wait for the other IOs. This holds because we ensure
2107  	 * sequential IO submission using the ZONE_APPEND command for data and
2108  	 * the block_group->meta_write_pointer for metadata.
2109  	 */
2110  	if (!fully_written) {
2111  		if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
2112  			spin_unlock(&block_group->lock);
2113  			return -EAGAIN;
2114  		}
2115  		spin_unlock(&block_group->lock);
2116  
2117  		ret = btrfs_inc_block_group_ro(block_group, false);
2118  		if (ret)
2119  			return ret;
2120  
2121  		/* Ensure all writes in this block group finish */
2122  		btrfs_wait_block_group_reservations(block_group);
2123  		/* No need to wait for NOCOW writers. Zoned mode does not allow that */
2124  		btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
2125  					 block_group->length);
2126  		/* Wait for extent buffers to be written. */
2127  		if (is_metadata)
2128  			wait_eb_writebacks(block_group);
2129  
2130  		spin_lock(&block_group->lock);
2131  
2132  		/*
2133  		 * Bail out if someone already deactivated the block group, or
2134  		 * allocated space is left in the block group.
2135  		 */
2136  		if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
2137  			      &block_group->runtime_flags)) {
2138  			spin_unlock(&block_group->lock);
2139  			btrfs_dec_block_group_ro(block_group);
2140  			return 0;
2141  		}
2142  
2143  		if (block_group->reserved ||
2144  		    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
2145  			     &block_group->runtime_flags)) {
2146  			spin_unlock(&block_group->lock);
2147  			btrfs_dec_block_group_ro(block_group);
2148  			return -EAGAIN;
2149  		}
2150  	}
2151  
2152  	clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
2153  	block_group->alloc_offset = block_group->zone_capacity;
2154  	if (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))
2155  		block_group->meta_write_pointer = block_group->start +
2156  						  block_group->zone_capacity;
2157  	block_group->free_space_ctl->free_space = 0;
2158  	btrfs_clear_treelog_bg(block_group);
2159  	btrfs_clear_data_reloc_bg(block_group);
2160  	spin_unlock(&block_group->lock);
2161  
2162  	map = block_group->physical_map;
2163  	for (i = 0; i < map->num_stripes; i++) {
2164  		struct btrfs_device *device = map->stripes[i].dev;
2165  		const u64 physical = map->stripes[i].physical;
2166  		struct btrfs_zoned_device_info *zinfo = device->zone_info;
2167  
2168  		if (zinfo->max_active_zones == 0)
2169  			continue;
2170  
2171  		ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
2172  				       physical >> SECTOR_SHIFT,
2173  				       zinfo->zone_size >> SECTOR_SHIFT,
2174  				       GFP_NOFS);
2175  
2176  		if (ret)
2177  			return ret;
2178  
2179  		if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
2180  			zinfo->reserved_active_zones++;
2181  		btrfs_dev_clear_active_zone(device, physical);
2182  	}
2183  
2184  	if (!fully_written)
2185  		btrfs_dec_block_group_ro(block_group);
2186  
2187  	spin_lock(&fs_info->zone_active_bgs_lock);
2188  	ASSERT(!list_empty(&block_group->active_bg_list));
2189  	list_del_init(&block_group->active_bg_list);
2190  	spin_unlock(&fs_info->zone_active_bgs_lock);
2191  
2192  	/* For active_bg_list */
2193  	btrfs_put_block_group(block_group);
2194  
2195  	clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
2196  
2197  	return 0;
2198  }
2199  
2200  int btrfs_zone_finish(struct btrfs_block_group *block_group)
2201  {
2202  	if (!btrfs_is_zoned(block_group->fs_info))
2203  		return 0;
2204  
2205  	return do_zone_finish(block_group, false);
2206  }
2207  
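/*
 * Check whether a new block group with the given @flags profile could still
 * be activated on some device. A device with no active zone limit always
 * qualifies; otherwise enough active zones must remain (for data block
 * groups, on top of the zones reserved for metadata/system): one free zone
 * for single, two for DUP. If no device qualifies, flag the filesystem as
 * needing a zone finish.
 */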
2208  bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
2209  {
2210  	struct btrfs_fs_info *fs_info = fs_devices->fs_info;
2211  	struct btrfs_device *device;
2212  	bool ret = false;
2213  
2214  	if (!btrfs_is_zoned(fs_info))
2215  		return true;
2216  
2217  	/* Check if there is a device with active zones left */
2218  	mutex_lock(&fs_info->chunk_mutex);
2219  	spin_lock(&fs_info->zone_active_bgs_lock);
2220  	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
2221  		struct btrfs_zoned_device_info *zinfo = device->zone_info;
2222  		int reserved = 0;
2223  
2224  		if (!device->bdev)
2225  			continue;
2226  
2227  		if (!zinfo->max_active_zones) {
2228  			ret = true;
2229  			break;
2230  		}
2231  
2232  		if (flags & BTRFS_BLOCK_GROUP_DATA)
2233  			reserved = zinfo->reserved_active_zones;
2234  
2235  		switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
2236  		case 0: /* single */
2237  			ret = (atomic_read(&zinfo->active_zones_left) >= (1 + reserved));
2238  			break;
2239  		case BTRFS_BLOCK_GROUP_DUP:
2240  			ret = (atomic_read(&zinfo->active_zones_left) >= (2 + reserved));
2241  			break;
2242  		}
2243  		if (ret)
2244  			break;
2245  	}
2246  	spin_unlock(&fs_info->zone_active_bgs_lock);
2247  	mutex_unlock(&fs_info->chunk_mutex);
2248  
2249  	if (!ret)
2250  		set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
2251  
2252  	return ret;
2253  }
2254  
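/*
 * Called once a write at @logical + @length has completed. If the block
 * group no longer has room for even a minimal allocation (one sectorsize for
 * data, one nodesize for metadata), finish its zone right away.
 */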
2255  void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
2256  {
2257  	struct btrfs_block_group *block_group;
2258  	u64 min_alloc_bytes;
2259  
2260  	if (!btrfs_is_zoned(fs_info))
2261  		return;
2262  
2263  	block_group = btrfs_lookup_block_group(fs_info, logical);
2264  	ASSERT(block_group);
2265  
2266  	/* No MIXED_BG on zoned btrfs. */
2267  	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
2268  		min_alloc_bytes = fs_info->sectorsize;
2269  	else
2270  		min_alloc_bytes = fs_info->nodesize;
2271  
2272  	/* Bail out if we can allocate more data from this block group. */
2273  	if (logical + length + min_alloc_bytes <=
2274  	    block_group->start + block_group->zone_capacity)
2275  		goto out;
2276  
2277  	do_zone_finish(block_group, true);
2278  
2279  out:
2280  	btrfs_put_block_group(block_group);
2281  }
2282  
2283  static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
2284  {
2285  	struct btrfs_block_group *bg =
2286  		container_of(work, struct btrfs_block_group, zone_finish_work);
2287  
2288  	wait_on_extent_buffer_writeback(bg->last_eb);
2289  	free_extent_buffer(bg->last_eb);
2290  	btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
2291  	btrfs_put_block_group(bg);
2292  }
2293  
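/*
 * Once @eb, the last extent buffer written into @bg, finishes writeback,
 * finish the block group's zone from a work item. This is only scheduled for
 * sequential-zone block groups where no further tree block fits behind @eb.
 */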
2294  void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
2295  				   struct extent_buffer *eb)
2296  {
2297  	if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) ||
2298  	    eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
2299  		return;
2300  
2301  	if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
2302  		btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
2303  			  bg->start);
2304  		return;
2305  	}
2306  
2307  	/* For the work */
2308  	btrfs_get_block_group(bg);
2309  	atomic_inc(&eb->refs);
2310  	bg->last_eb = eb;
2311  	INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
2312  	queue_work(system_unbound_wq, &bg->zone_finish_work);
2313  }
2314  
2315  void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
2316  {
2317  	struct btrfs_fs_info *fs_info = bg->fs_info;
2318  
2319  	spin_lock(&fs_info->relocation_bg_lock);
2320  	if (fs_info->data_reloc_bg == bg->start)
2321  		fs_info->data_reloc_bg = 0;
2322  	spin_unlock(&fs_info->relocation_bg_lock);
2323  }
2324  
2325  void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
2326  {
2327  	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2328  	struct btrfs_device *device;
2329  
2330  	if (!btrfs_is_zoned(fs_info))
2331  		return;
2332  
2333  	mutex_lock(&fs_devices->device_list_mutex);
2334  	list_for_each_entry(device, &fs_devices->devices, dev_list) {
2335  		if (device->zone_info) {
2336  			vfree(device->zone_info->zone_cache);
2337  			device->zone_info->zone_cache = NULL;
2338  		}
2339  	}
2340  	mutex_unlock(&fs_devices->device_list_mutex);
2341  }
2342  
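/*
 * Decide whether block group reclaim should kick in: true once the used
 * portion of all zoned devices crosses bg_reclaim_threshold percent of the
 * total device size, false if the threshold is disabled (0).
 */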
2343  bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
2344  {
2345  	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2346  	struct btrfs_device *device;
2347  	u64 used = 0;
2348  	u64 total = 0;
2349  	u64 factor;
2350  
2351  	ASSERT(btrfs_is_zoned(fs_info));
2352  
2353  	if (fs_info->bg_reclaim_threshold == 0)
2354  		return false;
2355  
2356  	mutex_lock(&fs_devices->device_list_mutex);
2357  	list_for_each_entry(device, &fs_devices->devices, dev_list) {
2358  		if (!device->bdev)
2359  			continue;
2360  
2361  		total += device->disk_total_bytes;
2362  		used += device->bytes_used;
2363  	}
2364  	mutex_unlock(&fs_devices->device_list_mutex);
2365  
2366  	factor = div64_u64(used * 100, total);
2367  	return factor >= fs_info->bg_reclaim_threshold;
2368  }
2369  
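/*
 * Once the last relocation write at @logical + @length reaches the block
 * group's allocation offset, clear the ZONED_DATA_RELOC flag so the block
 * group becomes available for regular allocation and zone finishing again.
 */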
2370  void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
2371  				       u64 length)
2372  {
2373  	struct btrfs_block_group *block_group;
2374  
2375  	if (!btrfs_is_zoned(fs_info))
2376  		return;
2377  
2378  	block_group = btrfs_lookup_block_group(fs_info, logical);
2379  	/* It should be called on a previous data relocation block group. */
2380  	ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
2381  
2382  	spin_lock(&block_group->lock);
2383  	if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
2384  		goto out;
2385  
2386  	/* All relocation extents are written. */
2387  	if (block_group->start + block_group->alloc_offset == logical + length) {
2388  		/*
2389  		 * Now, release this block group for further allocations and
2390  		 * zone finish.
2391  		 */
2392  		clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
2393  			  &block_group->runtime_flags);
2394  	}
2395  
2396  out:
2397  	spin_unlock(&block_group->lock);
2398  	btrfs_put_block_group(block_group);
2399  }
2400  
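/*
 * Pick the active block group with the least unallocated zone capacity
 * (excluding SYSTEM, reserved, empty and data-relocation block groups) and
 * finish it. Returns 1 if a block group was finished, 0 if none was
 * eligible, or a negative errno.
 */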
2401  int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
2402  {
2403  	struct btrfs_block_group *block_group;
2404  	struct btrfs_block_group *min_bg = NULL;
2405  	u64 min_avail = U64_MAX;
2406  	int ret;
2407  
2408  	spin_lock(&fs_info->zone_active_bgs_lock);
2409  	list_for_each_entry(block_group, &fs_info->zone_active_bgs,
2410  			    active_bg_list) {
2411  		u64 avail;
2412  
2413  		spin_lock(&block_group->lock);
2414  		if (block_group->reserved || block_group->alloc_offset == 0 ||
2415  		    (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
2416  		    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
2417  			spin_unlock(&block_group->lock);
2418  			continue;
2419  		}
2420  
2421  		avail = block_group->zone_capacity - block_group->alloc_offset;
2422  		if (min_avail > avail) {
2423  			if (min_bg)
2424  				btrfs_put_block_group(min_bg);
2425  			min_bg = block_group;
2426  			min_avail = avail;
2427  			btrfs_get_block_group(min_bg);
2428  		}
2429  		spin_unlock(&block_group->lock);
2430  	}
2431  	spin_unlock(&fs_info->zone_active_bgs_lock);
2432  
2433  	if (!min_bg)
2434  		return 0;
2435  
2436  	ret = btrfs_zone_finish(min_bg);
2437  	btrfs_put_block_group(min_bg);
2438  
2439  	return ret < 0 ? ret : 1;
2440  }
2441  
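/*
 * Try to activate one existing metadata/system block group in @space_info.
 *
 * Returns 1 once a block group is activated and 0 if nothing could be
 * activated. With @do_finish set, full zones are finished one at a time to
 * free up active zone resources and the scan is retried.
 */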
2442  int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
2443  				struct btrfs_space_info *space_info,
2444  				bool do_finish)
2445  {
2446  	struct btrfs_block_group *bg;
2447  	int index;
2448  
2449  	if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
2450  		return 0;
2451  
2452  	for (;;) {
2453  		int ret;
2454  		bool need_finish = false;
2455  
2456  		down_read(&space_info->groups_sem);
2457  		for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
2458  			list_for_each_entry(bg, &space_info->block_groups[index],
2459  					    list) {
2460  				if (!spin_trylock(&bg->lock))
2461  					continue;
2462  				if (btrfs_zoned_bg_is_full(bg) ||
2463  				    test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
2464  					     &bg->runtime_flags)) {
2465  					spin_unlock(&bg->lock);
2466  					continue;
2467  				}
2468  				spin_unlock(&bg->lock);
2469  
2470  				if (btrfs_zone_activate(bg)) {
2471  					up_read(&space_info->groups_sem);
2472  					return 1;
2473  				}
2474  
2475  				need_finish = true;
2476  			}
2477  		}
2478  		up_read(&space_info->groups_sem);
2479  
2480  		if (!do_finish || !need_finish)
2481  			break;
2482  
2483  		ret = btrfs_zone_finish_one_bg(fs_info);
2484  		if (ret == 0)
2485  			break;
2486  		if (ret < 0)
2487  			return ret;
2488  	}
2489  
2490  	return 0;
2491  }
2492  
2493  /*
2494   * Reserve zones for one metadata block group, one tree-log block group, and one
2495   * system block group.
2496   */
2497  void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
2498  {
2499  	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2500  	struct btrfs_block_group *block_group;
2501  	struct btrfs_device *device;
2502  	/* Reserve zones for normal SINGLE metadata and tree-log block group. */
2503  	unsigned int metadata_reserve = 2;
2504  	/* Reserve a zone for SINGLE system block group. */
2505  	unsigned int system_reserve = 1;
2506  
2507  	if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
2508  		return;
2509  
2510  	/*
2511  	 * This function is called from the mount context. So, there is no
2512  	 * parallel process touching the bits. No need for read_seqretry().
2513  	 */
2514  	if (fs_info->avail_metadata_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
2515  		metadata_reserve = 4;
2516  	if (fs_info->avail_system_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
2517  		system_reserve = 2;
2518  
2519  	/* Apply the reservation on all the devices. */
2520  	mutex_lock(&fs_devices->device_list_mutex);
2521  	list_for_each_entry(device, &fs_devices->devices, dev_list) {
2522  		if (!device->bdev)
2523  			continue;
2524  
2525  		device->zone_info->reserved_active_zones =
2526  			metadata_reserve + system_reserve;
2527  	}
2528  	mutex_unlock(&fs_devices->device_list_mutex);
2529  
2530  	/* Release reservation for currently active block groups. */
2531  	spin_lock(&fs_info->zone_active_bgs_lock);
2532  	list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
2533  		struct map_lookup *map = block_group->physical_map;
2534  
2535  		if (!(block_group->flags &
2536  		      (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
2537  			continue;
2538  
2539  		for (int i = 0; i < map->num_stripes; i++)
2540  			map->stripes[i].dev->zone_info->reserved_active_zones--;
2541  	}
2542  	spin_unlock(&fs_info->zone_active_bgs_lock);
2543  }
2544