xref: /openbmc/linux/fs/btrfs/zoned.c (revision 31e67366)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include <linux/bitops.h>
4 #include <linux/slab.h>
5 #include <linux/blkdev.h>
6 #include <linux/sched/mm.h>
7 #include "ctree.h"
8 #include "volumes.h"
9 #include "zoned.h"
10 #include "rcu-string.h"
11 #include "disk-io.h"
12 #include "block-group.h"
13 #include "transaction.h"
14 #include "dev-replace.h"
15 #include "space-info.h"
16 
17 /* Maximum number of zones to report per blkdev_report_zones() call */
18 #define BTRFS_REPORT_NR_ZONES   4096
19 /* Invalid allocation pointer value for missing devices */
20 #define WP_MISSING_DEV ((u64)-1)
21 /* Pseudo write pointer value for conventional zone */
22 #define WP_CONVENTIONAL ((u64)-2)
23 
24 /* Number of superblock log zones */
25 #define BTRFS_NR_SB_LOG_ZONES 2
26 
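/*
 * blkdev_report_zones() callback: copy each reported zone descriptor into
 * the caller supplied array so it can be inspected after the report.
 */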
27 static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
28 {
29 	struct blk_zone *zones = data;
30 
31 	memcpy(&zones[idx], zone, sizeof(*zone));
32 
33 	return 0;
34 }
35 
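/*
 * Determine the position of the superblock log write pointer from the two
 * log zones in @zones. On success, the byte offset at which the next
 * superblock should be written is returned in @wp_ret. Returns -ENOENT if
 * both zones are empty (no superblock written yet) and -EUCLEAN for an
 * invalid combination of zone states (see the table below).
 */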
36 static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
37 			    u64 *wp_ret)
38 {
39 	bool empty[BTRFS_NR_SB_LOG_ZONES];
40 	bool full[BTRFS_NR_SB_LOG_ZONES];
41 	sector_t sector;
42 
43 	ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
44 	       zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);
45 
46 	empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
47 	empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
48 	full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
49 	full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);
50 
51 	/*
52 	 * Possible states of log buffer zones
53 	 *
54 	 *           Empty[0]  In use[0]  Full[0]
55 	 * Empty[1]         *          x        0
56 	 * In use[1]        0          x        0
57 	 * Full[1]          1          1        C
58 	 *
59 	 * Log position:
60 	 *   *: Special case, no superblock is written
61 	 *   0: Use write pointer of zones[0]
62 	 *   1: Use write pointer of zones[1]
63 	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
64 	 *      one determined by generation
65 	 *   x: Invalid state
66 	 */
67 
68 	if (empty[0] && empty[1]) {
69 		/* Special case to distinguish no superblock to read */
70 		*wp_ret = zones[0].start << SECTOR_SHIFT;
71 		return -ENOENT;
72 	} else if (full[0] && full[1]) {
73 		/* Compare two super blocks */
74 		struct address_space *mapping = bdev->bd_inode->i_mapping;
75 		struct page *page[BTRFS_NR_SB_LOG_ZONES];
76 		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
77 		int i;
78 
79 		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
80 			u64 bytenr;
81 
82 			bytenr = ((zones[i].start + zones[i].len)
83 				   << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;
84 
85 			page[i] = read_cache_page_gfp(mapping,
86 					bytenr >> PAGE_SHIFT, GFP_NOFS);
87 			if (IS_ERR(page[i])) {
88 				if (i == 1)
89 					btrfs_release_disk_super(super[0]);
90 				return PTR_ERR(page[i]);
91 			}
92 			super[i] = page_address(page[i]);
93 		}
94 
95 		if (super[0]->generation > super[1]->generation)
96 			sector = zones[1].start;
97 		else
98 			sector = zones[0].start;
99 
100 		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
101 			btrfs_release_disk_super(super[i]);
102 	} else if (!full[0] && (empty[1] || full[1])) {
103 		sector = zones[0].wp;
104 	} else if (full[0]) {
105 		sector = zones[1].wp;
106 	} else {
107 		return -EUCLEAN;
108 	}
109 	*wp_ret = sector << SECTOR_SHIFT;
110 	return 0;
111 }
112 
113 /*
114  * The following zones are reserved as the circular buffer on ZONED btrfs.
115  *  - The primary superblock: zones 0 and 1
116  *  - The first copy: zones 16 and 17
117  *  - The second copy: zone 1024 or the zone at 256GB, whichever comes
118  *                     first, and the zone following it
119  */
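/*
 * For example, with 256MB zones (shift == 28) the second copy lands in zone
 * min(256GB >> 28, 1024) == 1024; with 1GB zones (shift == 30) it lands in
 * zone min(256GB >> 30, 1024) == 256.
 */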
120 static inline u32 sb_zone_number(int shift, int mirror)
121 {
122 	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
123 
124 	switch (mirror) {
125 	case 0: return 0;
126 	case 1: return 16;
127 	case 2: return min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024);
128 	}
129 
130 	return 0;
131 }
132 
133 /*
134  * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
135  * device into fixed-size chunks and fakes a conventional zone on each of
136  * them.
137  */
138 static int emulate_report_zones(struct btrfs_device *device, u64 pos,
139 				struct blk_zone *zones, unsigned int nr_zones)
140 {
141 	const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
142 	sector_t bdev_size = bdev_nr_sectors(device->bdev);
143 	unsigned int i;
144 
145 	pos >>= SECTOR_SHIFT;
146 	for (i = 0; i < nr_zones; i++) {
147 		zones[i].start = i * zone_sectors + pos;
148 		zones[i].len = zone_sectors;
149 		zones[i].capacity = zone_sectors;
150 		zones[i].wp = zones[i].start + zone_sectors;
151 		zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
152 		zones[i].cond = BLK_ZONE_COND_NOT_WP;
153 
154 		if (zones[i].wp >= bdev_size) {
155 			i++;
156 			break;
157 		}
158 	}
159 
160 	return i;
161 }
162 
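/*
 * Report up to *nr_zones zones starting at byte offset @pos on @device.
 * Non-zoned devices go through the emulation above, zoned devices through
 * blkdev_report_zones(). On return, *nr_zones holds the number of zones
 * actually reported.
 */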
163 static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
164 			       struct blk_zone *zones, unsigned int *nr_zones)
165 {
166 	int ret;
167 
168 	if (!*nr_zones)
169 		return 0;
170 
171 	if (!bdev_is_zoned(device->bdev)) {
172 		ret = emulate_report_zones(device, pos, zones, *nr_zones);
173 		*nr_zones = ret;
174 		return 0;
175 	}
176 
177 	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
178 				  copy_zone_info_cb, zones);
179 	if (ret < 0) {
180 		btrfs_err_in_rcu(device->fs_info,
181 				 "zoned: failed to read zone %llu on %s (devid %llu)",
182 				 pos, rcu_str_deref(device->name),
183 				 device->devid);
184 		return ret;
185 	}
186 	*nr_zones = ret;
187 	if (!ret)
188 		return -EIO;
189 
190 	return 0;
191 }
192 
193 /* The emulated zone size is determined from the size of the first device extent */
194 static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
195 {
196 	struct btrfs_path *path;
197 	struct btrfs_root *root = fs_info->dev_root;
198 	struct btrfs_key key;
199 	struct extent_buffer *leaf;
200 	struct btrfs_dev_extent *dext;
201 	int ret = 0;
202 
203 	key.objectid = 1;
204 	key.type = BTRFS_DEV_EXTENT_KEY;
205 	key.offset = 0;
206 
207 	path = btrfs_alloc_path();
208 	if (!path)
209 		return -ENOMEM;
210 
211 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
212 	if (ret < 0)
213 		goto out;
214 
215 	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
216 		ret = btrfs_next_item(root, path);
217 		if (ret < 0)
218 			goto out;
219 		/* No dev extents at all? Not good */
220 		if (ret > 0) {
221 			ret = -EUCLEAN;
222 			goto out;
223 		}
224 	}
225 
226 	leaf = path->nodes[0];
227 	dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
228 	fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
229 	ret = 0;
230 
231 out:
232 	btrfs_free_path(path);
233 
234 	return ret;
235 }
236 
237 int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
238 {
239 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
240 	struct btrfs_device *device;
241 	int ret = 0;
242 
243 	/* fs_info->zone_size might not be set yet. Use the incompat flag here. */
244 	if (!btrfs_fs_incompat(fs_info, ZONED))
245 		return 0;
246 
247 	mutex_lock(&fs_devices->device_list_mutex);
248 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
249 		/* We can skip reading zone info for missing devices */
250 		if (!device->bdev)
251 			continue;
252 
253 		ret = btrfs_get_dev_zone_info(device);
254 		if (ret)
255 			break;
256 	}
257 	mutex_unlock(&fs_devices->device_list_mutex);
258 
259 	return ret;
260 }
261 
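/*
 * Collect the zone information of a device: zone size, the bitmaps of
 * sequential and empty zones, and the cached zone descriptors of the
 * superblock log zones. For a non-zoned device the zones are emulated,
 * using the zone size derived from the first device extent.
 */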
262 int btrfs_get_dev_zone_info(struct btrfs_device *device)
263 {
264 	struct btrfs_fs_info *fs_info = device->fs_info;
265 	struct btrfs_zoned_device_info *zone_info = NULL;
266 	struct block_device *bdev = device->bdev;
267 	struct request_queue *queue = bdev_get_queue(bdev);
268 	sector_t nr_sectors;
269 	sector_t sector = 0;
270 	struct blk_zone *zones = NULL;
271 	unsigned int i, nreported = 0, nr_zones;
272 	sector_t zone_sectors;
273 	char *model, *emulated;
274 	int ret;
275 
276 	/*
277 	 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
278 	 * yet be set.
279 	 */
280 	if (!btrfs_fs_incompat(fs_info, ZONED))
281 		return 0;
282 
283 	if (device->zone_info)
284 		return 0;
285 
286 	zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
287 	if (!zone_info)
288 		return -ENOMEM;
289 
290 	if (!bdev_is_zoned(bdev)) {
291 		if (!fs_info->zone_size) {
292 			ret = calculate_emulated_zone_size(fs_info);
293 			if (ret)
294 				goto out;
295 		}
296 
297 		ASSERT(fs_info->zone_size);
298 		zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
299 	} else {
300 		zone_sectors = bdev_zone_sectors(bdev);
301 	}
302 
303 	nr_sectors = bdev_nr_sectors(bdev);
304 	/* Check if it's a power of 2 (see is_power_of_2()) */
305 	ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
306 	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
307 	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
308 	zone_info->max_zone_append_size =
309 		(u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
310 	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
311 	if (!IS_ALIGNED(nr_sectors, zone_sectors))
312 		zone_info->nr_zones++;
313 
314 	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
315 	if (!zone_info->seq_zones) {
316 		ret = -ENOMEM;
317 		goto out;
318 	}
319 
320 	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
321 	if (!zone_info->empty_zones) {
322 		ret = -ENOMEM;
323 		goto out;
324 	}
325 
326 	zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
327 	if (!zones) {
328 		ret = -ENOMEM;
329 		goto out;
330 	}
331 
332 	/* Get zone types */
333 	while (sector < nr_sectors) {
334 		nr_zones = BTRFS_REPORT_NR_ZONES;
335 		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
336 					  &nr_zones);
337 		if (ret)
338 			goto out;
339 
340 		for (i = 0; i < nr_zones; i++) {
341 			if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
342 				__set_bit(nreported, zone_info->seq_zones);
343 			if (zones[i].cond == BLK_ZONE_COND_EMPTY)
344 				__set_bit(nreported, zone_info->empty_zones);
345 			nreported++;
346 		}
347 		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
348 	}
349 
350 	if (nreported != zone_info->nr_zones) {
351 		btrfs_err_in_rcu(device->fs_info,
352 				 "inconsistent number of zones on %s (%u/%u)",
353 				 rcu_str_deref(device->name), nreported,
354 				 zone_info->nr_zones);
355 		ret = -EIO;
356 		goto out;
357 	}
358 
359 	/* Validate superblock log */
360 	nr_zones = BTRFS_NR_SB_LOG_ZONES;
361 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
362 		u32 sb_zone;
363 		u64 sb_wp;
364 		int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
365 
366 		sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
367 		if (sb_zone + 1 >= zone_info->nr_zones)
368 			continue;
369 
370 		sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT);
371 		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT,
372 					  &zone_info->sb_zones[sb_pos],
373 					  &nr_zones);
374 		if (ret)
375 			goto out;
376 
377 		if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
378 			btrfs_err_in_rcu(device->fs_info,
379 	"zoned: failed to read super block log zone info at devid %llu zone %u",
380 					 device->devid, sb_zone);
381 			ret = -EUCLEAN;
382 			goto out;
383 		}
384 
385 		/*
386 		 * If zones[0] is conventional, always use the beginning of the
387 		 * zone to record the superblock. No need to validate in that case.
388 		 */
389 		if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
390 		    BLK_ZONE_TYPE_CONVENTIONAL)
391 			continue;
392 
393 		ret = sb_write_pointer(device->bdev,
394 				       &zone_info->sb_zones[sb_pos], &sb_wp);
395 		if (ret != -ENOENT && ret) {
396 			btrfs_err_in_rcu(device->fs_info,
397 			"zoned: super block log zone corrupted devid %llu zone %u",
398 					 device->devid, sb_zone);
399 			ret = -EUCLEAN;
400 			goto out;
401 		}
402 	}
403 
404 
405 	kfree(zones);
406 
407 	device->zone_info = zone_info;
408 
409 	switch (bdev_zoned_model(bdev)) {
410 	case BLK_ZONED_HM:
411 		model = "host-managed zoned";
412 		emulated = "";
413 		break;
414 	case BLK_ZONED_HA:
415 		model = "host-aware zoned";
416 		emulated = "";
417 		break;
418 	case BLK_ZONED_NONE:
419 		model = "regular";
420 		emulated = "emulated ";
421 		break;
422 	default:
423 		/* Just in case */
424 		btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
425 				 bdev_zoned_model(bdev),
426 				 rcu_str_deref(device->name));
427 		ret = -EOPNOTSUPP;
428 		goto out_free_zone_info;
429 	}
430 
431 	btrfs_info_in_rcu(fs_info,
432 		"%s block device %s, %u %szones of %llu bytes",
433 		model, rcu_str_deref(device->name), zone_info->nr_zones,
434 		emulated, zone_info->zone_size);
435 
436 	return 0;
437 
438 out:
439 	kfree(zones);
440 out_free_zone_info:
441 	bitmap_free(zone_info->empty_zones);
442 	bitmap_free(zone_info->seq_zones);
443 	kfree(zone_info);
444 	device->zone_info = NULL;
445 
446 	return ret;
447 }
448 
449 void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
450 {
451 	struct btrfs_zoned_device_info *zone_info = device->zone_info;
452 
453 	if (!zone_info)
454 		return;
455 
456 	bitmap_free(zone_info->seq_zones);
457 	bitmap_free(zone_info->empty_zones);
458 	kfree(zone_info);
459 	device->zone_info = NULL;
460 }
461 
462 int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
463 		       struct blk_zone *zone)
464 {
465 	unsigned int nr_zones = 1;
466 	int ret;
467 
468 	ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
469 	if (ret != 0 || !nr_zones)
470 		return ret ? ret : -EIO;
471 
472 	return 0;
473 }
474 
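/*
 * Verify that the devices are usable for zoned mode: all of them must have
 * the same zone size, zoned and regular devices must not be mixed, and the
 * zone size must be aligned to BTRFS_STRIPE_LEN. On success, record the
 * zone size and the smallest max_zone_append_size in fs_info.
 */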
475 int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
476 {
477 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
478 	struct btrfs_device *device;
479 	u64 zoned_devices = 0;
480 	u64 nr_devices = 0;
481 	u64 zone_size = 0;
482 	u64 max_zone_append_size = 0;
483 	const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
484 	int ret = 0;
485 
486 	/* Count zoned devices */
487 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
488 		enum blk_zoned_model model;
489 
490 		if (!device->bdev)
491 			continue;
492 
493 		model = bdev_zoned_model(device->bdev);
494 		/*
495 		 * A Host-Managed zoned device must be used as a zoned device.
496 		 * A Host-Aware zoned device and a non-zoned device can be
497 		 * treated as a zoned device, if the ZONED flag is enabled in
498 		 * the superblock.
499 		 */
500 		if (model == BLK_ZONED_HM ||
501 		    (model == BLK_ZONED_HA && incompat_zoned) ||
502 		    (model == BLK_ZONED_NONE && incompat_zoned)) {
503 			struct btrfs_zoned_device_info *zone_info =
504 				device->zone_info;
505 
507 			zoned_devices++;
508 			if (!zone_size) {
509 				zone_size = zone_info->zone_size;
510 			} else if (zone_info->zone_size != zone_size) {
511 				btrfs_err(fs_info,
512 		"zoned: unequal block device zone sizes: have %llu found %llu",
513 					  device->zone_info->zone_size,
514 					  zone_size);
515 				ret = -EINVAL;
516 				goto out;
517 			}
518 			if (!max_zone_append_size ||
519 			    (zone_info->max_zone_append_size &&
520 			     zone_info->max_zone_append_size < max_zone_append_size))
521 				max_zone_append_size =
522 					zone_info->max_zone_append_size;
523 		}
524 		nr_devices++;
525 	}
526 
527 	if (!zoned_devices && !incompat_zoned)
528 		goto out;
529 
530 	if (!zoned_devices && incompat_zoned) {
531 		/* No zoned block device found on ZONED filesystem */
532 		btrfs_err(fs_info,
533 			  "zoned: no zoned devices found on a zoned filesystem");
534 		ret = -EINVAL;
535 		goto out;
536 	}
537 
538 	if (zoned_devices && !incompat_zoned) {
539 		btrfs_err(fs_info,
540 			  "zoned: mode not enabled but zoned device found");
541 		ret = -EINVAL;
542 		goto out;
543 	}
544 
545 	if (zoned_devices != nr_devices) {
546 		btrfs_err(fs_info,
547 			  "zoned: cannot mix zoned and regular devices");
548 		ret = -EINVAL;
549 		goto out;
550 	}
551 
552 	/*
553 	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
554 	 * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
555 	 * check the alignment here.
556 	 */
557 	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
558 		btrfs_err(fs_info,
559 			  "zoned: zone size %llu not aligned to stripe %u",
560 			  zone_size, BTRFS_STRIPE_LEN);
561 		ret = -EINVAL;
562 		goto out;
563 	}
564 
565 	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
566 		btrfs_err(fs_info, "zoned: mixed block groups not supported");
567 		ret = -EINVAL;
568 		goto out;
569 	}
570 
571 	fs_info->zone_size = zone_size;
572 	fs_info->max_zone_append_size = max_zone_append_size;
573 	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
574 
575 	/*
576 	 * Check mount options here, because the zoned status reported by
577 	 * btrfs_is_zoned() is derived from the just-set fs_info->zone_size.
578 	 */
579 	ret = btrfs_check_mountopts_zoned(fs_info);
580 	if (ret)
581 		goto out;
582 
583 	btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
584 out:
585 	return ret;
586 }
587 
588 int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
589 {
590 	if (!btrfs_is_zoned(info))
591 		return 0;
592 
593 	/*
594 	 * Space cache writing is not COWed. Disable that to avoid write errors
595 	 * in sequential zones.
596 	 */
597 	if (btrfs_test_opt(info, SPACE_CACHE)) {
598 		btrfs_err(info, "zoned: space cache v1 is not supported");
599 		return -EINVAL;
600 	}
601 
602 	if (btrfs_test_opt(info, NODATACOW)) {
603 		btrfs_err(info, "zoned: NODATACOW not supported");
604 		return -EINVAL;
605 	}
606 
607 	return 0;
608 }
609 
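/*
 * Translate the superblock log zone pair into the byte offset to read or
 * write the superblock at. For WRITE this is the current write pointer
 * (a full zone at that position is reset first); for READ it is the
 * location of the most recently written superblock.
 */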
610 static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
611 			   int rw, u64 *bytenr_ret)
612 {
613 	u64 wp;
614 	int ret;
615 
616 	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
617 		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
618 		return 0;
619 	}
620 
621 	ret = sb_write_pointer(bdev, zones, &wp);
622 	if (ret != -ENOENT && ret < 0)
623 		return ret;
624 
625 	if (rw == WRITE) {
626 		struct blk_zone *reset = NULL;
627 
628 		if (wp == zones[0].start << SECTOR_SHIFT)
629 			reset = &zones[0];
630 		else if (wp == zones[1].start << SECTOR_SHIFT)
631 			reset = &zones[1];
632 
633 		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
634 			ASSERT(reset->cond == BLK_ZONE_COND_FULL);
635 
636 			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
637 					       reset->start, reset->len,
638 					       GFP_NOFS);
639 			if (ret)
640 				return ret;
641 
642 			reset->cond = BLK_ZONE_COND_EMPTY;
643 			reset->wp = reset->start;
644 		}
645 	} else if (ret != -ENOENT) {
646 		/* For READ, we want the previous one */
647 		if (wp == zones[0].start << SECTOR_SHIFT)
648 			wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
649 		wp -= BTRFS_SUPER_INFO_SIZE;
650 	}
651 
652 	*bytenr_ret = wp;
653 	return 0;
654 
655 }
656 
657 int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
658 			       u64 *bytenr_ret)
659 {
660 	struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
661 	sector_t zone_sectors;
662 	u32 sb_zone;
663 	int ret;
664 	u8 zone_sectors_shift;
665 	sector_t nr_sectors;
666 	u32 nr_zones;
667 
668 	if (!bdev_is_zoned(bdev)) {
669 		*bytenr_ret = btrfs_sb_offset(mirror);
670 		return 0;
671 	}
672 
673 	ASSERT(rw == READ || rw == WRITE);
674 
675 	zone_sectors = bdev_zone_sectors(bdev);
676 	if (!is_power_of_2(zone_sectors))
677 		return -EINVAL;
678 	zone_sectors_shift = ilog2(zone_sectors);
679 	nr_sectors = bdev_nr_sectors(bdev);
680 	nr_zones = nr_sectors >> zone_sectors_shift;
681 
682 	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
683 	if (sb_zone + 1 >= nr_zones)
684 		return -ENOENT;
685 
686 	ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift,
687 				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
688 				  zones);
689 	if (ret < 0)
690 		return ret;
691 	if (ret != BTRFS_NR_SB_LOG_ZONES)
692 		return -EIO;
693 
694 	return sb_log_location(bdev, zones, rw, bytenr_ret);
695 }
696 
697 int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
698 			  u64 *bytenr_ret)
699 {
700 	struct btrfs_zoned_device_info *zinfo = device->zone_info;
701 	u32 zone_num;
702 
703 	/*
704 	 * For a zoned filesystem on a non-zoned block device, use the same
705 	 * super block locations as a regular filesystem. Doing so, the super
706 	 * block can always be retrieved and the zoned flag of the volume can
707 	 * be detected from the super block information.
708 	 */
709 	if (!bdev_is_zoned(device->bdev)) {
710 		*bytenr_ret = btrfs_sb_offset(mirror);
711 		return 0;
712 	}
713 
714 	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
715 	if (zone_num + 1 >= zinfo->nr_zones)
716 		return -ENOENT;
717 
718 	return sb_log_location(device->bdev,
719 			       &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
720 			       rw, bytenr_ret);
721 }
722 
723 static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
724 				  int mirror)
725 {
726 	u32 zone_num;
727 
728 	if (!zinfo)
729 		return false;
730 
731 	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
732 	if (zone_num + 1 >= zinfo->nr_zones)
733 		return false;
734 
735 	if (!test_bit(zone_num, zinfo->seq_zones))
736 		return false;
737 
738 	return true;
739 }
740 
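/*
 * Advance the cached write pointer of the superblock log zones after a
 * superblock write, switching over to the second zone once the first one
 * becomes full.
 */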
741 void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
742 {
743 	struct btrfs_zoned_device_info *zinfo = device->zone_info;
744 	struct blk_zone *zone;
745 
746 	if (!is_sb_log_zone(zinfo, mirror))
747 		return;
748 
749 	zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
750 	if (zone->cond != BLK_ZONE_COND_FULL) {
751 		if (zone->cond == BLK_ZONE_COND_EMPTY)
752 			zone->cond = BLK_ZONE_COND_IMP_OPEN;
753 
754 		zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
755 
756 		if (zone->wp == zone->start + zone->len)
757 			zone->cond = BLK_ZONE_COND_FULL;
758 
759 		return;
760 	}
761 
762 	zone++;
763 	ASSERT(zone->cond != BLK_ZONE_COND_FULL);
764 	if (zone->cond == BLK_ZONE_COND_EMPTY)
765 		zone->cond = BLK_ZONE_COND_IMP_OPEN;
766 
767 	zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
768 
769 	if (zone->wp == zone->start + zone->len)
770 		zone->cond = BLK_ZONE_COND_FULL;
771 }
772 
773 int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
774 {
775 	sector_t zone_sectors;
776 	sector_t nr_sectors;
777 	u8 zone_sectors_shift;
778 	u32 sb_zone;
779 	u32 nr_zones;
780 
781 	zone_sectors = bdev_zone_sectors(bdev);
782 	zone_sectors_shift = ilog2(zone_sectors);
783 	nr_sectors = bdev_nr_sectors(bdev);
784 	nr_zones = nr_sectors >> zone_sectors_shift;
785 
786 	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
787 	if (sb_zone + 1 >= nr_zones)
788 		return -ENOENT;
789 
790 	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
791 				sb_zone << zone_sectors_shift,
792 				zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
793 }
794 
795 /**
796  * btrfs_find_allocatable_zones - find allocatable zones within a given region
797  *
798  * @device:	the device to allocate a region on
799  * @hole_start: the start of the hole to allocate the region from
800  * @hole_end:	the end of the hole
801  * @num_bytes:	size of the wanted region
802  * @return:	position of allocatable zones
803  *
804  * Allocatable region should not contain any superblock locations.
805  */
806 u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
807 				 u64 hole_end, u64 num_bytes)
808 {
809 	struct btrfs_zoned_device_info *zinfo = device->zone_info;
810 	const u8 shift = zinfo->zone_size_shift;
811 	u64 nzones = num_bytes >> shift;
812 	u64 pos = hole_start;
813 	u64 begin, end;
814 	bool have_sb;
815 	int i;
816 
817 	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
818 	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
819 
820 	while (pos < hole_end) {
821 		begin = pos >> shift;
822 		end = begin + nzones;
823 
824 		if (end > zinfo->nr_zones)
825 			return hole_end;
826 
827 		/* Check if zones in the region are all empty */
828 		if (btrfs_dev_is_sequential(device, pos) &&
829 		    find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
830 			pos += zinfo->zone_size;
831 			continue;
832 		}
833 
834 		have_sb = false;
835 		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
836 			u32 sb_zone;
837 			u64 sb_pos;
838 
839 			sb_zone = sb_zone_number(shift, i);
840 			if (!(end <= sb_zone ||
841 			      sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
842 				have_sb = true;
843 				pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift;
844 				break;
845 			}
846 
847 			/* We also need to exclude regular superblock positions */
848 			sb_pos = btrfs_sb_offset(i);
849 			if (!(pos + num_bytes <= sb_pos ||
850 			      sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
851 				have_sb = true;
852 				pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
853 					    zinfo->zone_size);
854 				break;
855 			}
856 		}
857 		if (!have_sb)
858 			break;
859 	}
860 
861 	return pos;
862 }
863 
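/*
 * Reset the device zones covering [@physical, @physical + @length) and mark
 * them empty again. The number of bytes reset is returned in @bytes.
 */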
864 int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
865 			    u64 length, u64 *bytes)
866 {
867 	int ret;
868 
869 	*bytes = 0;
870 	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
871 			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
872 			       GFP_NOFS);
873 	if (ret)
874 		return ret;
875 
876 	*bytes = length;
877 	while (length) {
878 		btrfs_dev_set_zone_empty(device, physical);
879 		physical += device->zone_info->zone_size;
880 		length -= device->zone_info->zone_size;
881 	}
882 
883 	return 0;
884 }
885 
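/*
 * Ensure the zones covering [@start, @start + @size) are ready for
 * allocation: conventional zones are usable as-is, sequential zones must be
 * empty. A non-empty sequential zone in a supposedly free region is reset,
 * with a warning, as free regions should never contain data.
 */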
886 int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
887 {
888 	struct btrfs_zoned_device_info *zinfo = device->zone_info;
889 	const u8 shift = zinfo->zone_size_shift;
890 	unsigned long begin = start >> shift;
891 	unsigned long end = (start + size) >> shift;
892 	u64 pos;
893 	int ret;
894 
895 	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
896 	ASSERT(IS_ALIGNED(size, zinfo->zone_size));
897 
898 	if (end > zinfo->nr_zones)
899 		return -ERANGE;
900 
901 	/* All the zones are conventional */
902 	if (find_next_bit(zinfo->seq_zones, end, begin) == end)
903 		return 0;
904 
905 	/* All the zones are sequential and empty */
906 	if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end &&
907 	    find_next_zero_bit(zinfo->empty_zones, end, begin) == end)
908 		return 0;
909 
910 	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
911 		u64 reset_bytes;
912 
913 		if (!btrfs_dev_is_sequential(device, pos) ||
914 		    btrfs_dev_is_empty_zone(device, pos))
915 			continue;
916 
917 		/* Free regions should be empty */
918 		btrfs_warn_in_rcu(
919 			device->fs_info,
920 		"zoned: resetting device %s (devid %llu) zone %llu for allocation",
921 			rcu_str_deref(device->name), device->devid, pos >> shift);
922 		WARN_ON_ONCE(1);
923 
924 		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
925 					      &reset_bytes);
926 		if (ret)
927 			return ret;
928 	}
929 
930 	return 0;
931 }
932 
933 /*
934  * Calculate an allocation pointer from the extent allocation information
935  * for a block group consisting of conventional zones. It points to the
936  * end of the highest-addressed extent in the block group as the allocation
937  * offset.
938  */
939 static int calculate_alloc_pointer(struct btrfs_block_group *cache,
940 				   u64 *offset_ret)
941 {
942 	struct btrfs_fs_info *fs_info = cache->fs_info;
943 	struct btrfs_root *root = fs_info->extent_root;
944 	struct btrfs_path *path;
945 	struct btrfs_key key;
946 	struct btrfs_key found_key;
947 	int ret;
948 	u64 length;
949 
950 	path = btrfs_alloc_path();
951 	if (!path)
952 		return -ENOMEM;
953 
954 	key.objectid = cache->start + cache->length;
955 	key.type = 0;
956 	key.offset = 0;
957 
958 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
959 	/* We should not find the exact match */
960 	if (!ret)
961 		ret = -EUCLEAN;
962 	if (ret < 0)
963 		goto out;
964 
965 	ret = btrfs_previous_extent_item(root, path, cache->start);
966 	if (ret) {
967 		if (ret == 1) {
968 			ret = 0;
969 			*offset_ret = 0;
970 		}
971 		goto out;
972 	}
973 
974 	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
975 
976 	if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
977 		length = found_key.offset;
978 	else
979 		length = fs_info->nodesize;
980 
981 	if (!(found_key.objectid >= cache->start &&
982 	       found_key.objectid + length <= cache->start + cache->length)) {
983 		ret = -EUCLEAN;
984 		goto out;
985 	}
986 	*offset_ret = found_key.objectid + length - cache->start;
987 	ret = 0;
988 
989 out:
990 	btrfs_free_path(path);
991 	return ret;
992 }
993 
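/*
 * Load the allocation offset (the logical write pointer) of a zoned block
 * group. For sequential zones it is taken from the device zone's write
 * pointer, for conventional zones it is calculated from the extent tree.
 * Only the single profile is supported at this point.
 */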
994 int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
995 {
996 	struct btrfs_fs_info *fs_info = cache->fs_info;
997 	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
998 	struct extent_map *em;
999 	struct map_lookup *map;
1000 	struct btrfs_device *device;
1001 	u64 logical = cache->start;
1002 	u64 length = cache->length;
1003 	u64 physical = 0;
1004 	int ret;
1005 	int i;
1006 	unsigned int nofs_flag;
1007 	u64 *alloc_offsets = NULL;
1008 	u64 last_alloc = 0;
1009 	u32 num_sequential = 0, num_conventional = 0;
1010 
1011 	if (!btrfs_is_zoned(fs_info))
1012 		return 0;
1013 
1014 	/* Sanity check */
1015 	if (!IS_ALIGNED(length, fs_info->zone_size)) {
1016 		btrfs_err(fs_info,
1017 		"zoned: block group %llu len %llu unaligned to zone size %llu",
1018 			  logical, length, fs_info->zone_size);
1019 		return -EIO;
1020 	}
1021 
1022 	/* Get the chunk mapping */
1023 	read_lock(&em_tree->lock);
1024 	em = lookup_extent_mapping(em_tree, logical, length);
1025 	read_unlock(&em_tree->lock);
1026 
1027 	if (!em)
1028 		return -EINVAL;
1029 
1030 	map = em->map_lookup;
1031 
1032 	alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
1033 	if (!alloc_offsets) {
1034 		free_extent_map(em);
1035 		return -ENOMEM;
1036 	}
1037 
1038 	for (i = 0; i < map->num_stripes; i++) {
1039 		bool is_sequential;
1040 		struct blk_zone zone;
1041 		struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1042 		int dev_replace_is_ongoing = 0;
1043 
1044 		device = map->stripes[i].dev;
1045 		physical = map->stripes[i].physical;
1046 
1047 		if (device->bdev == NULL) {
1048 			alloc_offsets[i] = WP_MISSING_DEV;
1049 			continue;
1050 		}
1051 
1052 		is_sequential = btrfs_dev_is_sequential(device, physical);
1053 		if (is_sequential)
1054 			num_sequential++;
1055 		else
1056 			num_conventional++;
1057 
1058 		if (!is_sequential) {
1059 			alloc_offsets[i] = WP_CONVENTIONAL;
1060 			continue;
1061 		}
1062 
1063 		/*
1064 		 * This zone will be used for allocation, so mark this zone
1065 		 * non-empty.
1066 		 */
1067 		btrfs_dev_clear_zone_empty(device, physical);
1068 
1069 		down_read(&dev_replace->rwsem);
1070 		dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
1071 		if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
1072 			btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical);
1073 		up_read(&dev_replace->rwsem);
1074 
1075 		/*
1076 		 * The group is mapped to a sequential zone. Get the zone write
1077 		 * pointer to determine the allocation offset within the zone.
1078 		 */
1079 		WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
1080 		nofs_flag = memalloc_nofs_save();
1081 		ret = btrfs_get_dev_zone(device, physical, &zone);
1082 		memalloc_nofs_restore(nofs_flag);
1083 		if (ret == -EIO || ret == -EOPNOTSUPP) {
1084 			ret = 0;
1085 			alloc_offsets[i] = WP_MISSING_DEV;
1086 			continue;
1087 		} else if (ret) {
1088 			goto out;
1089 		}
1090 
1091 		switch (zone.cond) {
1092 		case BLK_ZONE_COND_OFFLINE:
1093 		case BLK_ZONE_COND_READONLY:
1094 			btrfs_err(fs_info,
1095 		"zoned: offline/readonly zone %llu on device %s (devid %llu)",
1096 				  physical >> device->zone_info->zone_size_shift,
1097 				  rcu_str_deref(device->name), device->devid);
1098 			alloc_offsets[i] = WP_MISSING_DEV;
1099 			break;
1100 		case BLK_ZONE_COND_EMPTY:
1101 			alloc_offsets[i] = 0;
1102 			break;
1103 		case BLK_ZONE_COND_FULL:
1104 			alloc_offsets[i] = fs_info->zone_size;
1105 			break;
1106 		default:
1107 			/* Partially used zone */
1108 			alloc_offsets[i] =
1109 					((zone.wp - zone.start) << SECTOR_SHIFT);
1110 			break;
1111 		}
1112 	}
1113 
1114 	if (num_sequential > 0)
1115 		cache->seq_zone = true;
1116 
1117 	if (num_conventional > 0) {
1118 		/*
1119 		 * Avoid calling calculate_alloc_pointer() for a new BG. It is
1120 		 * of no use for a new BG, whose allocation offset is always 0.
1121 		 *
1122 		 * Also, we have a lock chain of extent buffer lock ->
1123 		 * chunk mutex. For a new BG, this function is called from
1124 		 * btrfs_make_block_group() which is already taking the
1125 		 * chunk mutex. Thus, we cannot call
1126 		 * calculate_alloc_pointer(), which takes extent buffer
1127 		 * locks, without risking a deadlock.
1128 		 */
1129 		if (new) {
1130 			cache->alloc_offset = 0;
1131 			goto out;
1132 		}
1133 		ret = calculate_alloc_pointer(cache, &last_alloc);
1134 		if (ret || map->num_stripes == num_conventional) {
1135 			if (!ret)
1136 				cache->alloc_offset = last_alloc;
1137 			else
1138 				btrfs_err(fs_info,
1139 			"zoned: failed to determine allocation offset of bg %llu",
1140 					  cache->start);
1141 			goto out;
1142 		}
1143 	}
1144 
1145 	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
1146 	case 0: /* single */
1147 		cache->alloc_offset = alloc_offsets[0];
1148 		break;
1149 	case BTRFS_BLOCK_GROUP_DUP:
1150 	case BTRFS_BLOCK_GROUP_RAID1:
1151 	case BTRFS_BLOCK_GROUP_RAID0:
1152 	case BTRFS_BLOCK_GROUP_RAID10:
1153 	case BTRFS_BLOCK_GROUP_RAID5:
1154 	case BTRFS_BLOCK_GROUP_RAID6:
1155 		/* non-single profiles are not supported yet */
1156 	default:
1157 		btrfs_err(fs_info, "zoned: profile %s not yet supported",
1158 			  btrfs_bg_type_to_raid_name(map->type));
1159 		ret = -EINVAL;
1160 		goto out;
1161 	}
1162 
1163 out:
1164 	/* An extent is allocated after the write pointer */
1165 	if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
1166 		btrfs_err(fs_info,
1167 			  "zoned: got wrong write pointer in BG %llu: %llu > %llu",
1168 			  logical, last_alloc, cache->alloc_offset);
1169 		ret = -EIO;
1170 	}
1171 
1172 	if (!ret)
1173 		cache->meta_write_pointer = cache->alloc_offset + cache->start;
1174 
1175 	kfree(alloc_offsets);
1176 	free_extent_map(em);
1177 
1178 	return ret;
1179 }
1180 
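/*
 * Account the gap between the used bytes and the allocation offset as
 * zone_unusable: on sequential zones, freed space cannot be reallocated
 * until the whole zone is reset. Everything past the allocation offset is
 * reported as free space.
 */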
1181 void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
1182 {
1183 	u64 unusable, free;
1184 
1185 	if (!btrfs_is_zoned(cache->fs_info))
1186 		return;
1187 
1188 	WARN_ON(cache->bytes_super != 0);
1189 	unusable = cache->alloc_offset - cache->used;
1190 	free = cache->length - cache->alloc_offset;
1191 
1192 	/* We only need ->free_space in ALLOC_SEQ block groups */
1193 	cache->last_byte_to_unpin = (u64)-1;
1194 	cache->cached = BTRFS_CACHE_FINISHED;
1195 	cache->free_space_ctl->free_space = free;
1196 	cache->zone_unusable = unusable;
1197 
1198 	/* Should not have any excluded extents. Just in case, though */
1199 	btrfs_free_excluded_extents(cache);
1200 }
1201 
1202 void btrfs_redirty_list_add(struct btrfs_transaction *trans,
1203 			    struct extent_buffer *eb)
1204 {
1205 	struct btrfs_fs_info *fs_info = eb->fs_info;
1206 
1207 	if (!btrfs_is_zoned(fs_info) ||
1208 	    btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) ||
1209 	    !list_empty(&eb->release_list))
1210 		return;
1211 
1212 	set_extent_buffer_dirty(eb);
1213 	set_extent_bits_nowait(&trans->dirty_pages, eb->start,
1214 			       eb->start + eb->len - 1, EXTENT_DIRTY);
1215 	memzero_extent_buffer(eb, 0, eb->len);
1216 	set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
1217 
1218 	spin_lock(&trans->releasing_ebs_lock);
1219 	list_add_tail(&eb->release_list, &trans->releasing_ebs);
1220 	spin_unlock(&trans->releasing_ebs_lock);
1221 	atomic_inc(&eb->refs);
1222 }
1223 
1224 void btrfs_free_redirty_list(struct btrfs_transaction *trans)
1225 {
1226 	spin_lock(&trans->releasing_ebs_lock);
1227 	while (!list_empty(&trans->releasing_ebs)) {
1228 		struct extent_buffer *eb;
1229 
1230 		eb = list_first_entry(&trans->releasing_ebs,
1231 				      struct extent_buffer, release_list);
1232 		list_del_init(&eb->release_list);
1233 		free_extent_buffer(eb);
1234 	}
1235 	spin_unlock(&trans->releasing_ebs_lock);
1236 }
1237 
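/*
 * Decide whether a data write should be submitted as a zone append: only on
 * a zoned filesystem with a non-zero max zone append size, only for data
 * inodes, and only when the target block group is backed by sequential
 * zones.
 */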
1238 bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em)
1239 {
1240 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1241 	struct btrfs_block_group *cache;
1242 	bool ret = false;
1243 
1244 	if (!btrfs_is_zoned(fs_info))
1245 		return false;
1246 
1247 	if (!fs_info->max_zone_append_size)
1248 		return false;
1249 
1250 	if (!is_data_inode(&inode->vfs_inode))
1251 		return false;
1252 
1253 	cache = btrfs_lookup_block_group(fs_info, em->block_start);
1254 	ASSERT(cache);
1255 	if (!cache)
1256 		return false;
1257 
1258 	ret = cache->seq_zone;
1259 	btrfs_put_block_group(cache);
1260 
1261 	return ret;
1262 }
1263 
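/*
 * Remember the physical location a zone append bio was actually written to
 * on the ordered extent, so the logical address can be fixed up later in
 * btrfs_rewrite_logical_zoned().
 */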
1264 void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
1265 				 struct bio *bio)
1266 {
1267 	struct btrfs_ordered_extent *ordered;
1268 	const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
1269 
1270 	if (bio_op(bio) != REQ_OP_ZONE_APPEND)
1271 		return;
1272 
1273 	ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset);
1274 	if (WARN_ON(!ordered))
1275 		return;
1276 
1277 	ordered->physical = physical;
1278 	ordered->disk = bio->bi_bdev->bd_disk;
1279 	ordered->partno = bio->bi_bdev->bd_partno;
1280 
1281 	btrfs_put_ordered_extent(ordered);
1282 }
1283 
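/*
 * Rewrite the logical address of an ordered extent, as well as the cached
 * extent map and the checksum entries, to match the physical location the
 * device chose for the zone append write.
 */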
1284 void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
1285 {
1286 	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
1287 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1288 	struct extent_map_tree *em_tree;
1289 	struct extent_map *em;
1290 	struct btrfs_ordered_sum *sum;
1291 	struct block_device *bdev;
1292 	u64 orig_logical = ordered->disk_bytenr;
1293 	u64 *logical = NULL;
1294 	int nr, stripe_len;
1295 
1296 	/* Zoned devices should not have partitions, so partno must be 0 */
1297 	ASSERT(ordered->partno == 0);
1298 	bdev = bdgrab(ordered->disk->part0);
1299 	if (WARN_ON(!bdev))
1300 		return;
1301 
1302 	if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, bdev,
1303 				     ordered->physical, &logical, &nr,
1304 				     &stripe_len)))
1305 		goto out;
1306 
1307 	WARN_ON(nr != 1);
1308 
1309 	if (orig_logical == *logical)
1310 		goto out;
1311 
1312 	ordered->disk_bytenr = *logical;
1313 
1314 	em_tree = &inode->extent_tree;
1315 	write_lock(&em_tree->lock);
1316 	em = search_extent_mapping(em_tree, ordered->file_offset,
1317 				   ordered->num_bytes);
1318 	em->block_start = *logical;
1319 	free_extent_map(em);
1320 	write_unlock(&em_tree->lock);
1321 
1322 	list_for_each_entry(sum, &ordered->list, list) {
1323 		if (*logical < orig_logical)
1324 			sum->bytenr -= orig_logical - *logical;
1325 		else
1326 			sum->bytenr += *logical - orig_logical;
1327 	}
1328 
1329 out:
1330 	kfree(logical);
1331 	bdput(bdev);
1332 }
1333 
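/*
 * Check that @eb starts exactly at the metadata write pointer of its block
 * group and advance the pointer if so. Returns false when the buffer is not
 * at the write pointer and therefore must not be submitted yet.
 */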
1334 bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
1335 				    struct extent_buffer *eb,
1336 				    struct btrfs_block_group **cache_ret)
1337 {
1338 	struct btrfs_block_group *cache;
1339 	bool ret = true;
1340 
1341 	if (!btrfs_is_zoned(fs_info))
1342 		return true;
1343 
1344 	cache = *cache_ret;
1345 
1346 	if (cache && (eb->start < cache->start ||
1347 		      cache->start + cache->length <= eb->start)) {
1348 		btrfs_put_block_group(cache);
1349 		cache = NULL;
1350 		*cache_ret = NULL;
1351 	}
1352 
1353 	if (!cache)
1354 		cache = btrfs_lookup_block_group(fs_info, eb->start);
1355 
1356 	if (cache) {
1357 		if (cache->meta_write_pointer != eb->start) {
1358 			btrfs_put_block_group(cache);
1359 			cache = NULL;
1360 			ret = false;
1361 		} else {
1362 			cache->meta_write_pointer = eb->start + eb->len;
1363 		}
1364 
1365 		*cache_ret = cache;
1366 	}
1367 
1368 	return ret;
1369 }
1370 
1371 void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
1372 				     struct extent_buffer *eb)
1373 {
1374 	if (!btrfs_is_zoned(eb->fs_info) || !cache)
1375 		return;
1376 
1377 	ASSERT(cache->meta_write_pointer == eb->start + eb->len);
1378 	cache->meta_write_pointer = eb->start;
1379 }
1380 
1381 int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
1382 {
1383 	if (!btrfs_dev_is_sequential(device, physical))
1384 		return -EOPNOTSUPP;
1385 
1386 	return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
1387 				    length >> SECTOR_SHIFT, GFP_NOFS, 0);
1388 }
1389 
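/*
 * Read the zone descriptor covering @logical by looking up the mirrors
 * backing it and querying the first present, working device.
 */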
1390 static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
1391 			  struct blk_zone *zone)
1392 {
1393 	struct btrfs_bio *bbio = NULL;
1394 	u64 mapped_length = PAGE_SIZE;
1395 	unsigned int nofs_flag;
1396 	int nmirrors;
1397 	int i, ret;
1398 
1399 	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
1400 			       &mapped_length, &bbio);
1401 	if (ret || !bbio || mapped_length < PAGE_SIZE) {
1402 		btrfs_put_bbio(bbio);
1403 		return -EIO;
1404 	}
1405 
1406 	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* Release the bbio taken by btrfs_map_sblock() before bailing out */
		btrfs_put_bbio(bbio);
1407 		return -EINVAL;
	}
1408 
1409 	nofs_flag = memalloc_nofs_save();
1410 	nmirrors = (int)bbio->num_stripes;
1411 	for (i = 0; i < nmirrors; i++) {
1412 		u64 physical = bbio->stripes[i].physical;
1413 		struct btrfs_device *dev = bbio->stripes[i].dev;
1414 
1415 		/* Missing device */
1416 		if (!dev->bdev)
1417 			continue;
1418 
1419 		ret = btrfs_get_dev_zone(dev, physical, zone);
1420 		/* Failing device */
1421 		if (ret == -EIO || ret == -EOPNOTSUPP)
1422 			continue;
1423 		break;
1424 	}
1425 	memalloc_nofs_restore(nofs_flag);
	/* Done with the stripe mapping, drop our reference on the bbio */
	btrfs_put_bbio(bbio);
1426 
1427 	return ret;
1428 }
1429 
1430 /*
1431  * Synchronize the write pointer in a zone at @physical_start on @tgt_dev by
1432  * filling zeros from @physical_pos up to the write pointer of the dev-replace
1433  * source device.
1434  */
1435 int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
1436 				    u64 physical_start, u64 physical_pos)
1437 {
1438 	struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
1439 	struct blk_zone zone;
1440 	u64 length;
1441 	u64 wp;
1442 	int ret;
1443 
1444 	if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
1445 		return 0;
1446 
1447 	ret = read_zone_info(fs_info, logical, &zone);
1448 	if (ret)
1449 		return ret;
1450 
1451 	wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);
1452 
1453 	if (physical_pos == wp)
1454 		return 0;
1455 
1456 	if (physical_pos > wp)
1457 		return -EUCLEAN;
1458 
1459 	length = wp - physical_pos;
1460 	return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
1461 }
1462