xref: /openbmc/linux/fs/btrfs/scrub.c (revision 11976fe2)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
4  */
5 
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include <crypto/hash.h>
10 #include "ctree.h"
11 #include "discard.h"
12 #include "volumes.h"
13 #include "disk-io.h"
14 #include "ordered-data.h"
15 #include "transaction.h"
16 #include "backref.h"
17 #include "extent_io.h"
18 #include "dev-replace.h"
19 #include "check-integrity.h"
20 #include "raid56.h"
21 #include "block-group.h"
22 #include "zoned.h"
23 #include "fs.h"
24 #include "accessors.h"
25 #include "file-item.h"
26 #include "scrub.h"
27 
/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies their checksums. In case a bad
 * checksum is found or an extent cannot be read, good data will be written
 * back if any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */
40 
41 struct scrub_ctx;
42 
/*
 * Batch size: how many stripes are submitted in one go, i.e. 8 stripes of
 * 64K each per device.  This value only influences performance.
 */
#define SCRUB_STRIPES_PER_SCTX	8

/*
 * Upper bound for the number of sectors in one block, counted in 4K units of
 * the largest supported metadata block size.  It has to be large enough to
 * cover the largest node/leaf/sector size that shall be supported.
 */
#define SCRUB_MAX_SECTORS_PER_BLOCK	(BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
55 
56 /* Represent one sector and its needed info to verify the content. */
57 struct scrub_sector_verification {
58 	bool is_metadata;
59 
60 	union {
61 		/*
62 		 * Csum pointer for data csum verification.  Should point to a
63 		 * sector csum inside scrub_stripe::csums.
64 		 *
65 		 * NULL if this data sector has no csum.
66 		 */
67 		u8 *csum;
68 
69 		/*
70 		 * Extra info for metadata verification.  All sectors inside a
71 		 * tree block share the same generation.
72 		 */
73 		u64 generation;
74 	};
75 };
76 
/* Bit numbers used in scrub_stripe::state. */
enum scrub_stripe_flags {
	/* @mirror_num, @dev, @physical and @logical have been filled in. */
	SCRUB_STRIPE_FLAG_INITIALIZED = 0,

	/* The read-repair of the stripe has finished. */
	SCRUB_STRIPE_FLAG_REPAIR_DONE,

	/*
	 * The data stripe is scrubbed on behalf of a P/Q stripe.  Errors in
	 * such a data stripe must neither be reported nor be added to the
	 * accounting.
	 */
	SCRUB_STRIPE_FLAG_NO_REPORT,
};
91 
92 #define SCRUB_STRIPE_PAGES		(BTRFS_STRIPE_LEN / PAGE_SIZE)
93 
94 /*
95  * Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
96  */
97 struct scrub_stripe {
98 	struct scrub_ctx *sctx;
99 	struct btrfs_block_group *bg;
100 
101 	struct page *pages[SCRUB_STRIPE_PAGES];
102 	struct scrub_sector_verification *sectors;
103 
104 	struct btrfs_device *dev;
105 	u64 logical;
106 	u64 physical;
107 
108 	u16 mirror_num;
109 
110 	/* Should be BTRFS_STRIPE_LEN / sectorsize. */
111 	u16 nr_sectors;
112 
113 	/*
114 	 * How many data/meta extents are in this stripe.  Only for scrub status
115 	 * reporting purposes.
116 	 */
117 	u16 nr_data_extents;
118 	u16 nr_meta_extents;
119 
120 	atomic_t pending_io;
121 	wait_queue_head_t io_wait;
122 	wait_queue_head_t repair_wait;
123 
124 	/*
125 	 * Indicate the states of the stripe.  Bits are defined in
126 	 * scrub_stripe_flags enum.
127 	 */
128 	unsigned long state;
129 
130 	/* Indicate which sectors are covered by extent items. */
131 	unsigned long extent_sector_bitmap;
132 
133 	/*
134 	 * The errors hit during the initial read of the stripe.
135 	 *
136 	 * Would be utilized for error reporting and repair.
137 	 */
138 	unsigned long init_error_bitmap;
139 
140 	/*
141 	 * The following error bitmaps are all for the current status.
142 	 * Every time we submit a new read, these bitmaps may be updated.
143 	 *
144 	 * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap;
145 	 *
146 	 * IO and csum errors can happen for both metadata and data.
147 	 */
148 	unsigned long error_bitmap;
149 	unsigned long io_error_bitmap;
150 	unsigned long csum_error_bitmap;
151 	unsigned long meta_error_bitmap;
152 
153 	/* For writeback (repair or replace) error reporting. */
154 	unsigned long write_error_bitmap;
155 
156 	/* Writeback can be concurrent, thus we need to protect the bitmap. */
157 	spinlock_t write_error_lock;
158 
159 	/*
160 	 * Checksum for the whole stripe if this stripe is inside a data block
161 	 * group.
162 	 */
163 	u8 *csums;
164 
165 	struct work_struct work;
166 };
167 
168 struct scrub_ctx {
169 	struct scrub_stripe	stripes[SCRUB_STRIPES_PER_SCTX];
170 	struct scrub_stripe	*raid56_data_stripes;
171 	struct btrfs_fs_info	*fs_info;
172 	int			first_free;
173 	int			cur_stripe;
174 	struct list_head	csum_list;
175 	atomic_t		cancel_req;
176 	int			readonly;
177 	int			sectors_per_bio;
178 
179 	/* State of IO submission throttling affecting the associated device */
180 	ktime_t			throttle_deadline;
181 	u64			throttle_sent;
182 
183 	int			is_dev_replace;
184 	u64			write_pointer;
185 
186 	struct mutex            wr_lock;
187 	struct btrfs_device     *wr_tgtdev;
188 
189 	/*
190 	 * statistics
191 	 */
192 	struct btrfs_scrub_progress stat;
193 	spinlock_t		stat_lock;
194 
195 	/*
196 	 * Use a ref counter to avoid use-after-free issues. Scrub workers
197 	 * decrement bios_in_flight and workers_pending and then do a wakeup
198 	 * on the list_wait wait queue. We must ensure the main scrub task
199 	 * doesn't free the scrub context before or while the workers are
200 	 * doing the wakeup() call.
201 	 */
202 	refcount_t              refs;
203 };
204 
205 struct scrub_warning {
206 	struct btrfs_path	*path;
207 	u64			extent_item_size;
208 	const char		*errstr;
209 	u64			physical;
210 	u64			logical;
211 	struct btrfs_device	*dev;
212 };
213 
214 static void release_scrub_stripe(struct scrub_stripe *stripe)
215 {
216 	if (!stripe)
217 		return;
218 
219 	for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) {
220 		if (stripe->pages[i])
221 			__free_page(stripe->pages[i]);
222 		stripe->pages[i] = NULL;
223 	}
224 	kfree(stripe->sectors);
225 	kfree(stripe->csums);
226 	stripe->sectors = NULL;
227 	stripe->csums = NULL;
228 	stripe->sctx = NULL;
229 	stripe->state = 0;
230 }
231 
232 static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
233 			     struct scrub_stripe *stripe)
234 {
235 	int ret;
236 
237 	memset(stripe, 0, sizeof(*stripe));
238 
239 	stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
240 	stripe->state = 0;
241 
242 	init_waitqueue_head(&stripe->io_wait);
243 	init_waitqueue_head(&stripe->repair_wait);
244 	atomic_set(&stripe->pending_io, 0);
245 	spin_lock_init(&stripe->write_error_lock);
246 
247 	ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages);
248 	if (ret < 0)
249 		goto error;
250 
251 	stripe->sectors = kcalloc(stripe->nr_sectors,
252 				  sizeof(struct scrub_sector_verification),
253 				  GFP_KERNEL);
254 	if (!stripe->sectors)
255 		goto error;
256 
257 	stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits,
258 				fs_info->csum_size, GFP_KERNEL);
259 	if (!stripe->csums)
260 		goto error;
261 	return 0;
262 error:
263 	release_scrub_stripe(stripe);
264 	return -ENOMEM;
265 }
266 
267 static void wait_scrub_stripe_io(struct scrub_stripe *stripe)
268 {
269 	wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0);
270 }
271 
272 static void scrub_put_ctx(struct scrub_ctx *sctx);
273 
274 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
275 {
276 	while (atomic_read(&fs_info->scrub_pause_req)) {
277 		mutex_unlock(&fs_info->scrub_lock);
278 		wait_event(fs_info->scrub_pause_wait,
279 		   atomic_read(&fs_info->scrub_pause_req) == 0);
280 		mutex_lock(&fs_info->scrub_lock);
281 	}
282 }
283 
284 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
285 {
286 	atomic_inc(&fs_info->scrubs_paused);
287 	wake_up(&fs_info->scrub_pause_wait);
288 }
289 
290 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
291 {
292 	mutex_lock(&fs_info->scrub_lock);
293 	__scrub_blocked_if_needed(fs_info);
294 	atomic_dec(&fs_info->scrubs_paused);
295 	mutex_unlock(&fs_info->scrub_lock);
296 
297 	wake_up(&fs_info->scrub_pause_wait);
298 }
299 
/*
 * Pause point: mark the scrub as paused, then block until any pending pause
 * request has been withdrawn.
 */
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	scrub_pause_on(fs_info);

	scrub_pause_off(fs_info);
}
305 
306 static void scrub_free_csums(struct scrub_ctx *sctx)
307 {
308 	while (!list_empty(&sctx->csum_list)) {
309 		struct btrfs_ordered_sum *sum;
310 		sum = list_first_entry(&sctx->csum_list,
311 				       struct btrfs_ordered_sum, list);
312 		list_del(&sum->list);
313 		kfree(sum);
314 	}
315 }
316 
317 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
318 {
319 	int i;
320 
321 	if (!sctx)
322 		return;
323 
324 	for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++)
325 		release_scrub_stripe(&sctx->stripes[i]);
326 
327 	scrub_free_csums(sctx);
328 	kfree(sctx);
329 }
330 
331 static void scrub_put_ctx(struct scrub_ctx *sctx)
332 {
333 	if (refcount_dec_and_test(&sctx->refs))
334 		scrub_free_ctx(sctx);
335 }
336 
337 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
338 		struct btrfs_fs_info *fs_info, int is_dev_replace)
339 {
340 	struct scrub_ctx *sctx;
341 	int		i;
342 
343 	sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
344 	if (!sctx)
345 		goto nomem;
346 	refcount_set(&sctx->refs, 1);
347 	sctx->is_dev_replace = is_dev_replace;
348 	sctx->fs_info = fs_info;
349 	INIT_LIST_HEAD(&sctx->csum_list);
350 	for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) {
351 		int ret;
352 
353 		ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
354 		if (ret < 0)
355 			goto nomem;
356 		sctx->stripes[i].sctx = sctx;
357 	}
358 	sctx->first_free = 0;
359 	atomic_set(&sctx->cancel_req, 0);
360 
361 	spin_lock_init(&sctx->stat_lock);
362 	sctx->throttle_deadline = 0;
363 
364 	mutex_init(&sctx->wr_lock);
365 	if (is_dev_replace) {
366 		WARN_ON(!fs_info->dev_replace.tgtdev);
367 		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
368 	}
369 
370 	return sctx;
371 
372 nomem:
373 	scrub_free_ctx(sctx);
374 	return ERR_PTR(-ENOMEM);
375 }
376 
377 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
378 				     u64 root, void *warn_ctx)
379 {
380 	u32 nlink;
381 	int ret;
382 	int i;
383 	unsigned nofs_flag;
384 	struct extent_buffer *eb;
385 	struct btrfs_inode_item *inode_item;
386 	struct scrub_warning *swarn = warn_ctx;
387 	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
388 	struct inode_fs_paths *ipath = NULL;
389 	struct btrfs_root *local_root;
390 	struct btrfs_key key;
391 
392 	local_root = btrfs_get_fs_root(fs_info, root, true);
393 	if (IS_ERR(local_root)) {
394 		ret = PTR_ERR(local_root);
395 		goto err;
396 	}
397 
398 	/*
399 	 * this makes the path point to (inum INODE_ITEM ioff)
400 	 */
401 	key.objectid = inum;
402 	key.type = BTRFS_INODE_ITEM_KEY;
403 	key.offset = 0;
404 
405 	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
406 	if (ret) {
407 		btrfs_put_root(local_root);
408 		btrfs_release_path(swarn->path);
409 		goto err;
410 	}
411 
412 	eb = swarn->path->nodes[0];
413 	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
414 					struct btrfs_inode_item);
415 	nlink = btrfs_inode_nlink(eb, inode_item);
416 	btrfs_release_path(swarn->path);
417 
418 	/*
419 	 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
420 	 * uses GFP_NOFS in this context, so we keep it consistent but it does
421 	 * not seem to be strictly necessary.
422 	 */
423 	nofs_flag = memalloc_nofs_save();
424 	ipath = init_ipath(4096, local_root, swarn->path);
425 	memalloc_nofs_restore(nofs_flag);
426 	if (IS_ERR(ipath)) {
427 		btrfs_put_root(local_root);
428 		ret = PTR_ERR(ipath);
429 		ipath = NULL;
430 		goto err;
431 	}
432 	ret = paths_from_inode(inum, ipath);
433 
434 	if (ret < 0)
435 		goto err;
436 
437 	/*
438 	 * we deliberately ignore the bit ipath might have been too small to
439 	 * hold all of the paths here
440 	 */
441 	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
442 		btrfs_warn_in_rcu(fs_info,
443 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
444 				  swarn->errstr, swarn->logical,
445 				  btrfs_dev_name(swarn->dev),
446 				  swarn->physical,
447 				  root, inum, offset,
448 				  fs_info->sectorsize, nlink,
449 				  (char *)(unsigned long)ipath->fspath->val[i]);
450 
451 	btrfs_put_root(local_root);
452 	free_ipath(ipath);
453 	return 0;
454 
455 err:
456 	btrfs_warn_in_rcu(fs_info,
457 			  "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
458 			  swarn->errstr, swarn->logical,
459 			  btrfs_dev_name(swarn->dev),
460 			  swarn->physical,
461 			  root, inum, offset, ret);
462 
463 	free_ipath(ipath);
464 	return 0;
465 }
466 
467 static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev,
468 				       bool is_super, u64 logical, u64 physical)
469 {
470 	struct btrfs_fs_info *fs_info = dev->fs_info;
471 	struct btrfs_path *path;
472 	struct btrfs_key found_key;
473 	struct extent_buffer *eb;
474 	struct btrfs_extent_item *ei;
475 	struct scrub_warning swarn;
476 	unsigned long ptr = 0;
477 	u64 flags = 0;
478 	u64 ref_root;
479 	u32 item_size;
480 	u8 ref_level = 0;
481 	int ret;
482 
483 	/* Super block error, no need to search extent tree. */
484 	if (is_super) {
485 		btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
486 				  errstr, btrfs_dev_name(dev), physical);
487 		return;
488 	}
489 	path = btrfs_alloc_path();
490 	if (!path)
491 		return;
492 
493 	swarn.physical = physical;
494 	swarn.logical = logical;
495 	swarn.errstr = errstr;
496 	swarn.dev = NULL;
497 
498 	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
499 				  &flags);
500 	if (ret < 0)
501 		goto out;
502 
503 	swarn.extent_item_size = found_key.offset;
504 
505 	eb = path->nodes[0];
506 	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
507 	item_size = btrfs_item_size(eb, path->slots[0]);
508 
509 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
510 		do {
511 			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
512 						      item_size, &ref_root,
513 						      &ref_level);
514 			btrfs_warn_in_rcu(fs_info,
515 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
516 				errstr, swarn.logical,
517 				btrfs_dev_name(dev),
518 				swarn.physical,
519 				ref_level ? "node" : "leaf",
520 				ret < 0 ? -1 : ref_level,
521 				ret < 0 ? -1 : ref_root);
522 		} while (ret != 1);
523 		btrfs_release_path(path);
524 	} else {
525 		struct btrfs_backref_walk_ctx ctx = { 0 };
526 
527 		btrfs_release_path(path);
528 
529 		ctx.bytenr = found_key.objectid;
530 		ctx.extent_item_pos = swarn.logical - found_key.objectid;
531 		ctx.fs_info = fs_info;
532 
533 		swarn.path = path;
534 		swarn.dev = dev;
535 
536 		iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
537 	}
538 
539 out:
540 	btrfs_free_path(path);
541 }
542 
543 static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
544 {
545 	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
546 		return 2;
547 	else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
548 		return 3;
549 	else
550 		return (int)bioc->num_stripes;
551 }
552 
553 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
554 						 u64 full_stripe_logical,
555 						 int nstripes, int mirror,
556 						 int *stripe_index,
557 						 u64 *stripe_offset)
558 {
559 	int i;
560 
561 	if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
562 		const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ?
563 					    nstripes - 1 : nstripes - 2;
564 
565 		/* RAID5/6 */
566 		for (i = 0; i < nr_data_stripes; i++) {
567 			const u64 data_stripe_start = full_stripe_logical +
568 						(i * BTRFS_STRIPE_LEN);
569 
570 			if (logical >= data_stripe_start &&
571 			    logical < data_stripe_start + BTRFS_STRIPE_LEN)
572 				break;
573 		}
574 
575 		*stripe_index = i;
576 		*stripe_offset = (logical - full_stripe_logical) &
577 				 BTRFS_STRIPE_LEN_MASK;
578 	} else {
579 		/* The other RAID type */
580 		*stripe_index = mirror;
581 		*stripe_offset = 0;
582 	}
583 }
584 
585 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
586 {
587 	int ret = 0;
588 	u64 length;
589 
590 	if (!btrfs_is_zoned(sctx->fs_info))
591 		return 0;
592 
593 	if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
594 		return 0;
595 
596 	if (sctx->write_pointer < physical) {
597 		length = physical - sctx->write_pointer;
598 
599 		ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
600 						sctx->write_pointer, length);
601 		if (!ret)
602 			sctx->write_pointer = physical;
603 	}
604 	return ret;
605 }
606 
607 static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr)
608 {
609 	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
610 	int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT;
611 
612 	return stripe->pages[page_index];
613 }
614 
615 static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe,
616 						 int sector_nr)
617 {
618 	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
619 
620 	return offset_in_page(sector_nr << fs_info->sectorsize_bits);
621 }
622 
623 static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
624 {
625 	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
626 	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
627 	const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
628 	const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr);
629 	const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr);
630 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
631 	u8 on_disk_csum[BTRFS_CSUM_SIZE];
632 	u8 calculated_csum[BTRFS_CSUM_SIZE];
633 	struct btrfs_header *header;
634 
635 	/*
636 	 * Here we don't have a good way to attach the pages (and subpages)
637 	 * to a dummy extent buffer, thus we have to directly grab the members
638 	 * from pages.
639 	 */
640 	header = (struct btrfs_header *)(page_address(first_page) + first_off);
641 	memcpy(on_disk_csum, header->csum, fs_info->csum_size);
642 
643 	if (logical != btrfs_stack_header_bytenr(header)) {
644 		bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
645 		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
646 		btrfs_warn_rl(fs_info,
647 		"tree block %llu mirror %u has bad bytenr, has %llu want %llu",
648 			      logical, stripe->mirror_num,
649 			      btrfs_stack_header_bytenr(header), logical);
650 		return;
651 	}
652 	if (memcmp(header->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE) != 0) {
653 		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
654 		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
655 		btrfs_warn_rl(fs_info,
656 		"tree block %llu mirror %u has bad fsid, has %pU want %pU",
657 			      logical, stripe->mirror_num,
658 			      header->fsid, fs_info->fs_devices->fsid);
659 		return;
660 	}
661 	if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
662 		   BTRFS_UUID_SIZE) != 0) {
663 		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
664 		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
665 		btrfs_warn_rl(fs_info,
666 		"tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
667 			      logical, stripe->mirror_num,
668 			      header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
669 		return;
670 	}
671 
672 	/* Now check tree block csum. */
673 	shash->tfm = fs_info->csum_shash;
674 	crypto_shash_init(shash);
675 	crypto_shash_update(shash, page_address(first_page) + first_off +
676 			    BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE);
677 
678 	for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
679 		struct page *page = scrub_stripe_get_page(stripe, i);
680 		unsigned int page_off = scrub_stripe_get_page_offset(stripe, i);
681 
682 		crypto_shash_update(shash, page_address(page) + page_off,
683 				    fs_info->sectorsize);
684 	}
685 
686 	crypto_shash_final(shash, calculated_csum);
687 	if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
688 		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
689 		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
690 		btrfs_warn_rl(fs_info,
691 		"tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
692 			      logical, stripe->mirror_num,
693 			      CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
694 			      CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
695 		return;
696 	}
697 	if (stripe->sectors[sector_nr].generation !=
698 	    btrfs_stack_header_generation(header)) {
699 		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
700 		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
701 		btrfs_warn_rl(fs_info,
702 		"tree block %llu mirror %u has bad generation, has %llu want %llu",
703 			      logical, stripe->mirror_num,
704 			      btrfs_stack_header_generation(header),
705 			      stripe->sectors[sector_nr].generation);
706 		return;
707 	}
708 	bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
709 	bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
710 	bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
711 }
712 
713 static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
714 {
715 	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
716 	struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
717 	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
718 	struct page *page = scrub_stripe_get_page(stripe, sector_nr);
719 	unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
720 	u8 csum_buf[BTRFS_CSUM_SIZE];
721 	int ret;
722 
723 	ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);
724 
725 	/* Sector not utilized, skip it. */
726 	if (!test_bit(sector_nr, &stripe->extent_sector_bitmap))
727 		return;
728 
729 	/* IO error, no need to check. */
730 	if (test_bit(sector_nr, &stripe->io_error_bitmap))
731 		return;
732 
733 	/* Metadata, verify the full tree block. */
734 	if (sector->is_metadata) {
735 		/*
736 		 * Check if the tree block crosses the stripe boudary.  If
737 		 * crossed the boundary, we cannot verify it but only give a
738 		 * warning.
739 		 *
740 		 * This can only happen on a very old filesystem where chunks
741 		 * are not ensured to be stripe aligned.
742 		 */
743 		if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
744 			btrfs_warn_rl(fs_info,
745 			"tree block at %llu crosses stripe boundary %llu",
746 				      stripe->logical +
747 				      (sector_nr << fs_info->sectorsize_bits),
748 				      stripe->logical);
749 			return;
750 		}
751 		scrub_verify_one_metadata(stripe, sector_nr);
752 		return;
753 	}
754 
755 	/*
756 	 * Data is easier, we just verify the data csum (if we have it).  For
757 	 * cases without csum, we have no other choice but to trust it.
758 	 */
759 	if (!sector->csum) {
760 		clear_bit(sector_nr, &stripe->error_bitmap);
761 		return;
762 	}
763 
764 	ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum);
765 	if (ret < 0) {
766 		set_bit(sector_nr, &stripe->csum_error_bitmap);
767 		set_bit(sector_nr, &stripe->error_bitmap);
768 	} else {
769 		clear_bit(sector_nr, &stripe->csum_error_bitmap);
770 		clear_bit(sector_nr, &stripe->error_bitmap);
771 	}
772 }
773 
774 /* Verify specified sectors of a stripe. */
775 static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
776 {
777 	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
778 	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
779 	int sector_nr;
780 
781 	for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
782 		scrub_verify_one_sector(stripe, sector_nr);
783 		if (stripe->sectors[sector_nr].is_metadata)
784 			sector_nr += sectors_per_tree - 1;
785 	}
786 }
787 
788 static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec)
789 {
790 	int i;
791 
792 	for (i = 0; i < stripe->nr_sectors; i++) {
793 		if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page &&
794 		    scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset)
795 			break;
796 	}
797 	ASSERT(i < stripe->nr_sectors);
798 	return i;
799 }
800 
801 /*
802  * Repair read is different to the regular read:
803  *
804  * - Only reads the failed sectors
805  * - May have extra blocksize limits
806  */
807 static void scrub_repair_read_endio(struct btrfs_bio *bbio)
808 {
809 	struct scrub_stripe *stripe = bbio->private;
810 	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
811 	struct bio_vec *bvec;
812 	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
813 	u32 bio_size = 0;
814 	int i;
815 
816 	ASSERT(sector_nr < stripe->nr_sectors);
817 
818 	bio_for_each_bvec_all(bvec, &bbio->bio, i)
819 		bio_size += bvec->bv_len;
820 
821 	if (bbio->bio.bi_status) {
822 		bitmap_set(&stripe->io_error_bitmap, sector_nr,
823 			   bio_size >> fs_info->sectorsize_bits);
824 		bitmap_set(&stripe->error_bitmap, sector_nr,
825 			   bio_size >> fs_info->sectorsize_bits);
826 	} else {
827 		bitmap_clear(&stripe->io_error_bitmap, sector_nr,
828 			     bio_size >> fs_info->sectorsize_bits);
829 	}
830 	bio_put(&bbio->bio);
831 	if (atomic_dec_and_test(&stripe->pending_io))
832 		wake_up(&stripe->io_wait);
833 }
834 
/*
 * Return the mirror number following @mirror, wrapping around to 1 after
 * @num_copies.
 */
static int calc_next_mirror(int mirror, int num_copies)
{
	ASSERT(mirror <= num_copies);

	if (mirror + 1 > num_copies)
		return 1;

	return mirror + 1;
}
840 
841 static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
842 					    int mirror, int blocksize, bool wait)
843 {
844 	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
845 	struct btrfs_bio *bbio = NULL;
846 	const unsigned long old_error_bitmap = stripe->error_bitmap;
847 	int i;
848 
849 	ASSERT(stripe->mirror_num >= 1);
850 	ASSERT(atomic_read(&stripe->pending_io) == 0);
851 
852 	for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) {
853 		struct page *page;
854 		int pgoff;
855 		int ret;
856 
857 		page = scrub_stripe_get_page(stripe, i);
858 		pgoff = scrub_stripe_get_page_offset(stripe, i);
859 
860 		/* The current sector cannot be merged, submit the bio. */
861 		if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) ||
862 			     bbio->bio.bi_iter.bi_size >= blocksize)) {
863 			ASSERT(bbio->bio.bi_iter.bi_size);
864 			atomic_inc(&stripe->pending_io);
865 			btrfs_submit_bio(bbio, mirror);
866 			if (wait)
867 				wait_scrub_stripe_io(stripe);
868 			bbio = NULL;
869 		}
870 
871 		if (!bbio) {
872 			bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
873 				fs_info, scrub_repair_read_endio, stripe);
874 			bbio->bio.bi_iter.bi_sector = (stripe->logical +
875 				(i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT;
876 		}
877 
878 		ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
879 		ASSERT(ret == fs_info->sectorsize);
880 	}
881 	if (bbio) {
882 		ASSERT(bbio->bio.bi_iter.bi_size);
883 		atomic_inc(&stripe->pending_io);
884 		btrfs_submit_bio(bbio, mirror);
885 		if (wait)
886 			wait_scrub_stripe_io(stripe);
887 	}
888 }
889 
890 static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
891 				       struct scrub_stripe *stripe)
892 {
893 	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
894 				      DEFAULT_RATELIMIT_BURST);
895 	struct btrfs_fs_info *fs_info = sctx->fs_info;
896 	struct btrfs_device *dev = NULL;
897 	u64 physical = 0;
898 	int nr_data_sectors = 0;
899 	int nr_meta_sectors = 0;
900 	int nr_nodatacsum_sectors = 0;
901 	int nr_repaired_sectors = 0;
902 	int sector_nr;
903 
904 	if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state))
905 		return;
906 
907 	/*
908 	 * Init needed infos for error reporting.
909 	 *
910 	 * Although our scrub_stripe infrastucture is mostly based on btrfs_submit_bio()
911 	 * thus no need for dev/physical, error reporting still needs dev and physical.
912 	 */
913 	if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
914 		u64 mapped_len = fs_info->sectorsize;
915 		struct btrfs_io_context *bioc = NULL;
916 		int stripe_index = stripe->mirror_num - 1;
917 		int ret;
918 
919 		/* For scrub, our mirror_num should always start at 1. */
920 		ASSERT(stripe->mirror_num >= 1);
921 		ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
922 				       stripe->logical, &mapped_len, &bioc);
923 		/*
924 		 * If we failed, dev will be NULL, and later detailed reports
925 		 * will just be skipped.
926 		 */
927 		if (ret < 0)
928 			goto skip;
929 		physical = bioc->stripes[stripe_index].physical;
930 		dev = bioc->stripes[stripe_index].dev;
931 		btrfs_put_bioc(bioc);
932 	}
933 
934 skip:
935 	for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
936 		bool repaired = false;
937 
938 		if (stripe->sectors[sector_nr].is_metadata) {
939 			nr_meta_sectors++;
940 		} else {
941 			nr_data_sectors++;
942 			if (!stripe->sectors[sector_nr].csum)
943 				nr_nodatacsum_sectors++;
944 		}
945 
946 		if (test_bit(sector_nr, &stripe->init_error_bitmap) &&
947 		    !test_bit(sector_nr, &stripe->error_bitmap)) {
948 			nr_repaired_sectors++;
949 			repaired = true;
950 		}
951 
952 		/* Good sector from the beginning, nothing need to be done. */
953 		if (!test_bit(sector_nr, &stripe->init_error_bitmap))
954 			continue;
955 
956 		/*
957 		 * Report error for the corrupted sectors.  If repaired, just
958 		 * output the message of repaired message.
959 		 */
960 		if (repaired) {
961 			if (dev) {
962 				btrfs_err_rl_in_rcu(fs_info,
963 			"fixed up error at logical %llu on dev %s physical %llu",
964 					    stripe->logical, btrfs_dev_name(dev),
965 					    physical);
966 			} else {
967 				btrfs_err_rl_in_rcu(fs_info,
968 			"fixed up error at logical %llu on mirror %u",
969 					    stripe->logical, stripe->mirror_num);
970 			}
971 			continue;
972 		}
973 
974 		/* The remaining are all for unrepaired. */
975 		if (dev) {
976 			btrfs_err_rl_in_rcu(fs_info,
977 	"unable to fixup (regular) error at logical %llu on dev %s physical %llu",
978 					    stripe->logical, btrfs_dev_name(dev),
979 					    physical);
980 		} else {
981 			btrfs_err_rl_in_rcu(fs_info,
982 	"unable to fixup (regular) error at logical %llu on mirror %u",
983 					    stripe->logical, stripe->mirror_num);
984 		}
985 
986 		if (test_bit(sector_nr, &stripe->io_error_bitmap))
987 			if (__ratelimit(&rs) && dev)
988 				scrub_print_common_warning("i/o error", dev, false,
989 						     stripe->logical, physical);
990 		if (test_bit(sector_nr, &stripe->csum_error_bitmap))
991 			if (__ratelimit(&rs) && dev)
992 				scrub_print_common_warning("checksum error", dev, false,
993 						     stripe->logical, physical);
994 		if (test_bit(sector_nr, &stripe->meta_error_bitmap))
995 			if (__ratelimit(&rs) && dev)
996 				scrub_print_common_warning("header error", dev, false,
997 						     stripe->logical, physical);
998 	}
999 
1000 	spin_lock(&sctx->stat_lock);
1001 	sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
1002 	sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
1003 	sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
1004 	sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
1005 	sctx->stat.no_csum += nr_nodatacsum_sectors;
1006 	sctx->stat.read_errors +=
1007 		bitmap_weight(&stripe->io_error_bitmap, stripe->nr_sectors);
1008 	sctx->stat.csum_errors +=
1009 		bitmap_weight(&stripe->csum_error_bitmap, stripe->nr_sectors);
1010 	sctx->stat.verify_errors +=
1011 		bitmap_weight(&stripe->meta_error_bitmap, stripe->nr_sectors);
1012 	sctx->stat.uncorrectable_errors +=
1013 		bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors);
1014 	sctx->stat.corrected_errors += nr_repaired_sectors;
1015 	spin_unlock(&sctx->stat_lock);
1016 }
1017 
1018 /*
1019  * The main entrance for all read related scrub work, including:
1020  *
1021  * - Wait for the initial read to finish
1022  * - Verify and locate any bad sectors
1023  * - Go through the remaining mirrors and try to read as large blocksize as
1024  *   possible
1025  * - Go through all mirrors (including the failed mirror) sector-by-sector
1026  *
1027  * Writeback does not happen here, it needs extra synchronization.
1028  */
1029 static void scrub_stripe_read_repair_worker(struct work_struct *work)
1030 {
1031 	struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
1032 	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
1033 	int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
1034 					  stripe->bg->length);
1035 	int mirror;
1036 	int i;
1037 
1038 	ASSERT(stripe->mirror_num > 0);
1039 
1040 	wait_scrub_stripe_io(stripe);
1041 	scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap);
1042 	/* Save the initial failed bitmap for later repair and report usage. */
1043 	stripe->init_error_bitmap = stripe->error_bitmap;
1044 
1045 	if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors))
1046 		goto out;
1047 
1048 	/*
1049 	 * Try all remaining mirrors.
1050 	 *
1051 	 * Here we still try to read as large block as possible, as this is
1052 	 * faster and we have extra safety nets to rely on.
1053 	 */
1054 	for (mirror = calc_next_mirror(stripe->mirror_num, num_copies);
1055 	     mirror != stripe->mirror_num;
1056 	     mirror = calc_next_mirror(mirror, num_copies)) {
1057 		const unsigned long old_error_bitmap = stripe->error_bitmap;
1058 
1059 		scrub_stripe_submit_repair_read(stripe, mirror,
1060 						BTRFS_STRIPE_LEN, false);
1061 		wait_scrub_stripe_io(stripe);
1062 		scrub_verify_one_stripe(stripe, old_error_bitmap);
1063 		if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
1064 			goto out;
1065 	}
1066 
1067 	/*
1068 	 * Last safety net, try re-checking all mirrors, including the failed
1069 	 * one, sector-by-sector.
1070 	 *
1071 	 * As if one sector failed the drive's internal csum, the whole read
1072 	 * containing the offending sector would be marked as error.
1073 	 * Thus here we do sector-by-sector read.
1074 	 *
1075 	 * This can be slow, thus we only try it as the last resort.
1076 	 */
1077 
1078 	for (i = 0, mirror = stripe->mirror_num;
1079 	     i < num_copies;
1080 	     i++, mirror = calc_next_mirror(mirror, num_copies)) {
1081 		const unsigned long old_error_bitmap = stripe->error_bitmap;
1082 
1083 		scrub_stripe_submit_repair_read(stripe, mirror,
1084 						fs_info->sectorsize, true);
1085 		wait_scrub_stripe_io(stripe);
1086 		scrub_verify_one_stripe(stripe, old_error_bitmap);
1087 		if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
1088 			goto out;
1089 	}
1090 out:
1091 	scrub_stripe_report_errors(stripe->sctx, stripe);
1092 	set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
1093 	wake_up(&stripe->repair_wait);
1094 }
1095 
1096 static void scrub_read_endio(struct btrfs_bio *bbio)
1097 {
1098 	struct scrub_stripe *stripe = bbio->private;
1099 
1100 	if (bbio->bio.bi_status) {
1101 		bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
1102 		bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
1103 	} else {
1104 		bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
1105 	}
1106 	bio_put(&bbio->bio);
1107 	if (atomic_dec_and_test(&stripe->pending_io)) {
1108 		wake_up(&stripe->io_wait);
1109 		INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
1110 		queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
1111 	}
1112 }
1113 
1114 static void scrub_write_endio(struct btrfs_bio *bbio)
1115 {
1116 	struct scrub_stripe *stripe = bbio->private;
1117 	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
1118 	struct bio_vec *bvec;
1119 	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
1120 	u32 bio_size = 0;
1121 	int i;
1122 
1123 	bio_for_each_bvec_all(bvec, &bbio->bio, i)
1124 		bio_size += bvec->bv_len;
1125 
1126 	if (bbio->bio.bi_status) {
1127 		unsigned long flags;
1128 
1129 		spin_lock_irqsave(&stripe->write_error_lock, flags);
1130 		bitmap_set(&stripe->write_error_bitmap, sector_nr,
1131 			   bio_size >> fs_info->sectorsize_bits);
1132 		spin_unlock_irqrestore(&stripe->write_error_lock, flags);
1133 	}
1134 	bio_put(&bbio->bio);
1135 
1136 	if (atomic_dec_and_test(&stripe->pending_io))
1137 		wake_up(&stripe->io_wait);
1138 }
1139 
1140 static void scrub_submit_write_bio(struct scrub_ctx *sctx,
1141 				   struct scrub_stripe *stripe,
1142 				   struct btrfs_bio *bbio, bool dev_replace)
1143 {
1144 	struct btrfs_fs_info *fs_info = sctx->fs_info;
1145 	u32 bio_len = bbio->bio.bi_iter.bi_size;
1146 	u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
1147 		      stripe->logical;
1148 
1149 	fill_writer_pointer_gap(sctx, stripe->physical + bio_off);
1150 	atomic_inc(&stripe->pending_io);
1151 	btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
1152 	if (!btrfs_is_zoned(fs_info))
1153 		return;
1154 	/*
1155 	 * For zoned writeback, queue depth must be 1, thus we must wait for
1156 	 * the write to finish before the next write.
1157 	 */
1158 	wait_scrub_stripe_io(stripe);
1159 
1160 	/*
1161 	 * And also need to update the write pointer if write finished
1162 	 * successfully.
1163 	 */
1164 	if (!test_bit(bio_off >> fs_info->sectorsize_bits,
1165 		      &stripe->write_error_bitmap))
1166 		sctx->write_pointer += bio_len;
1167 }
1168 
1169 /*
1170  * Submit the write bio(s) for the sectors specified by @write_bitmap.
1171  *
1172  * Here we utilize btrfs_submit_repair_write(), which has some extra benefits:
1173  *
1174  * - Only needs logical bytenr and mirror_num
1175  *   Just like the scrub read path
1176  *
1177  * - Would only result in writes to the specified mirror
1178  *   Unlike the regular writeback path, which would write back to all stripes
1179  *
1180  * - Handle dev-replace and read-repair writeback differently
1181  */
1182 static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
1183 				unsigned long write_bitmap, bool dev_replace)
1184 {
1185 	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
1186 	struct btrfs_bio *bbio = NULL;
1187 	int sector_nr;
1188 
1189 	for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
1190 		struct page *page = scrub_stripe_get_page(stripe, sector_nr);
1191 		unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
1192 		int ret;
1193 
1194 		/* We should only writeback sectors covered by an extent. */
1195 		ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap));
1196 
1197 		/* Cannot merge with previous sector, submit the current one. */
1198 		if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
1199 			scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
1200 			bbio = NULL;
1201 		}
1202 		if (!bbio) {
1203 			bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE,
1204 					       fs_info, scrub_write_endio, stripe);
1205 			bbio->bio.bi_iter.bi_sector = (stripe->logical +
1206 				(sector_nr << fs_info->sectorsize_bits)) >>
1207 				SECTOR_SHIFT;
1208 		}
1209 		ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
1210 		ASSERT(ret == fs_info->sectorsize);
1211 	}
1212 	if (bbio)
1213 		scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
1214 }
1215 
1216 /*
1217  * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
1218  * second.  Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
1219  */
1220 static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device,
1221 				  unsigned int bio_size)
1222 {
1223 	const int time_slice = 1000;
1224 	s64 delta;
1225 	ktime_t now;
1226 	u32 div;
1227 	u64 bwlimit;
1228 
1229 	bwlimit = READ_ONCE(device->scrub_speed_max);
1230 	if (bwlimit == 0)
1231 		return;
1232 
1233 	/*
1234 	 * Slice is divided into intervals when the IO is submitted, adjust by
1235 	 * bwlimit and maximum of 64 intervals.
1236 	 */
1237 	div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
1238 	div = min_t(u32, 64, div);
1239 
1240 	/* Start new epoch, set deadline */
1241 	now = ktime_get();
1242 	if (sctx->throttle_deadline == 0) {
1243 		sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
1244 		sctx->throttle_sent = 0;
1245 	}
1246 
1247 	/* Still in the time to send? */
1248 	if (ktime_before(now, sctx->throttle_deadline)) {
1249 		/* If current bio is within the limit, send it */
1250 		sctx->throttle_sent += bio_size;
1251 		if (sctx->throttle_sent <= div_u64(bwlimit, div))
1252 			return;
1253 
1254 		/* We're over the limit, sleep until the rest of the slice */
1255 		delta = ktime_ms_delta(sctx->throttle_deadline, now);
1256 	} else {
1257 		/* New request after deadline, start new epoch */
1258 		delta = 0;
1259 	}
1260 
1261 	if (delta) {
1262 		long timeout;
1263 
1264 		timeout = div_u64(delta * HZ, 1000);
1265 		schedule_timeout_interruptible(timeout);
1266 	}
1267 
1268 	/* Next call will start the deadline period */
1269 	sctx->throttle_deadline = 0;
1270 }
1271 
1272 /*
1273  * Given a physical address, this will calculate it's
1274  * logical offset. if this is a parity stripe, it will return
1275  * the most left data stripe's logical offset.
1276  *
1277  * return 0 if it is a data stripe, 1 means parity stripe.
1278  */
1279 static int get_raid56_logic_offset(u64 physical, int num,
1280 				   struct map_lookup *map, u64 *offset,
1281 				   u64 *stripe_start)
1282 {
1283 	int i;
1284 	int j = 0;
1285 	u64 last_offset;
1286 	const int data_stripes = nr_data_stripes(map);
1287 
1288 	last_offset = (physical - map->stripes[num].physical) * data_stripes;
1289 	if (stripe_start)
1290 		*stripe_start = last_offset;
1291 
1292 	*offset = last_offset;
1293 	for (i = 0; i < data_stripes; i++) {
1294 		u32 stripe_nr;
1295 		u32 stripe_index;
1296 		u32 rot;
1297 
1298 		*offset = last_offset + (i << BTRFS_STRIPE_LEN_SHIFT);
1299 
1300 		stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes;
1301 
1302 		/* Work out the disk rotation on this stripe-set */
1303 		rot = stripe_nr % map->num_stripes;
1304 		stripe_nr /= map->num_stripes;
1305 		/* calculate which stripe this data locates */
1306 		rot += i;
1307 		stripe_index = rot % map->num_stripes;
1308 		if (stripe_index == num)
1309 			return 0;
1310 		if (stripe_index < num)
1311 			j++;
1312 	}
1313 	*offset = last_offset + (j << BTRFS_STRIPE_LEN_SHIFT);
1314 	return 1;
1315 }
1316 
1317 /*
1318  * Return 0 if the extent item range covers any byte of the range.
1319  * Return <0 if the extent item is before @search_start.
1320  * Return >0 if the extent item is after @start_start + @search_len.
1321  */
1322 static int compare_extent_item_range(struct btrfs_path *path,
1323 				     u64 search_start, u64 search_len)
1324 {
1325 	struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
1326 	u64 len;
1327 	struct btrfs_key key;
1328 
1329 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1330 	ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
1331 	       key.type == BTRFS_METADATA_ITEM_KEY);
1332 	if (key.type == BTRFS_METADATA_ITEM_KEY)
1333 		len = fs_info->nodesize;
1334 	else
1335 		len = key.offset;
1336 
1337 	if (key.objectid + len <= search_start)
1338 		return -1;
1339 	if (key.objectid >= search_start + search_len)
1340 		return 1;
1341 	return 0;
1342 }
1343 
1344 /*
1345  * Locate one extent item which covers any byte in range
1346  * [@search_start, @search_start + @search_length)
1347  *
1348  * If the path is not initialized, we will initialize the search by doing
1349  * a btrfs_search_slot().
1350  * If the path is already initialized, we will use the path as the initial
1351  * slot, to avoid duplicated btrfs_search_slot() calls.
1352  *
1353  * NOTE: If an extent item starts before @search_start, we will still
1354  * return the extent item. This is for data extent crossing stripe boundary.
1355  *
1356  * Return 0 if we found such extent item, and @path will point to the extent item.
1357  * Return >0 if no such extent item can be found, and @path will be released.
1358  * Return <0 if hit fatal error, and @path will be released.
1359  */
1360 static int find_first_extent_item(struct btrfs_root *extent_root,
1361 				  struct btrfs_path *path,
1362 				  u64 search_start, u64 search_len)
1363 {
1364 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
1365 	struct btrfs_key key;
1366 	int ret;
1367 
1368 	/* Continue using the existing path */
1369 	if (path->nodes[0])
1370 		goto search_forward;
1371 
1372 	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1373 		key.type = BTRFS_METADATA_ITEM_KEY;
1374 	else
1375 		key.type = BTRFS_EXTENT_ITEM_KEY;
1376 	key.objectid = search_start;
1377 	key.offset = (u64)-1;
1378 
1379 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1380 	if (ret < 0)
1381 		return ret;
1382 
1383 	ASSERT(ret > 0);
1384 	/*
1385 	 * Here we intentionally pass 0 as @min_objectid, as there could be
1386 	 * an extent item starting before @search_start.
1387 	 */
1388 	ret = btrfs_previous_extent_item(extent_root, path, 0);
1389 	if (ret < 0)
1390 		return ret;
1391 	/*
1392 	 * No matter whether we have found an extent item, the next loop will
1393 	 * properly do every check on the key.
1394 	 */
1395 search_forward:
1396 	while (true) {
1397 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1398 		if (key.objectid >= search_start + search_len)
1399 			break;
1400 		if (key.type != BTRFS_METADATA_ITEM_KEY &&
1401 		    key.type != BTRFS_EXTENT_ITEM_KEY)
1402 			goto next;
1403 
1404 		ret = compare_extent_item_range(path, search_start, search_len);
1405 		if (ret == 0)
1406 			return ret;
1407 		if (ret > 0)
1408 			break;
1409 next:
1410 		path->slots[0]++;
1411 		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
1412 			ret = btrfs_next_leaf(extent_root, path);
1413 			if (ret) {
1414 				/* Either no more item or fatal error */
1415 				btrfs_release_path(path);
1416 				return ret;
1417 			}
1418 		}
1419 	}
1420 	btrfs_release_path(path);
1421 	return 1;
1422 }
1423 
1424 static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
1425 			    u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
1426 {
1427 	struct btrfs_key key;
1428 	struct btrfs_extent_item *ei;
1429 
1430 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1431 	ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
1432 	       key.type == BTRFS_EXTENT_ITEM_KEY);
1433 	*extent_start_ret = key.objectid;
1434 	if (key.type == BTRFS_METADATA_ITEM_KEY)
1435 		*size_ret = path->nodes[0]->fs_info->nodesize;
1436 	else
1437 		*size_ret = key.offset;
1438 	ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
1439 	*flags_ret = btrfs_extent_flags(path->nodes[0], ei);
1440 	*generation_ret = btrfs_extent_generation(path->nodes[0], ei);
1441 }
1442 
1443 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
1444 					u64 physical, u64 physical_end)
1445 {
1446 	struct btrfs_fs_info *fs_info = sctx->fs_info;
1447 	int ret = 0;
1448 
1449 	if (!btrfs_is_zoned(fs_info))
1450 		return 0;
1451 
1452 	mutex_lock(&sctx->wr_lock);
1453 	if (sctx->write_pointer < physical_end) {
1454 		ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
1455 						    physical,
1456 						    sctx->write_pointer);
1457 		if (ret)
1458 			btrfs_err(fs_info,
1459 				  "zoned: failed to recover write pointer");
1460 	}
1461 	mutex_unlock(&sctx->wr_lock);
1462 	btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
1463 
1464 	return ret;
1465 }
1466 
1467 static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
1468 				 struct scrub_stripe *stripe,
1469 				 u64 extent_start, u64 extent_len,
1470 				 u64 extent_flags, u64 extent_gen)
1471 {
1472 	for (u64 cur_logical = max(stripe->logical, extent_start);
1473 	     cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN,
1474 			       extent_start + extent_len);
1475 	     cur_logical += fs_info->sectorsize) {
1476 		const int nr_sector = (cur_logical - stripe->logical) >>
1477 				      fs_info->sectorsize_bits;
1478 		struct scrub_sector_verification *sector =
1479 						&stripe->sectors[nr_sector];
1480 
1481 		set_bit(nr_sector, &stripe->extent_sector_bitmap);
1482 		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1483 			sector->is_metadata = true;
1484 			sector->generation = extent_gen;
1485 		}
1486 	}
1487 }
1488 
1489 static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
1490 {
1491 	stripe->extent_sector_bitmap = 0;
1492 	stripe->init_error_bitmap = 0;
1493 	stripe->error_bitmap = 0;
1494 	stripe->io_error_bitmap = 0;
1495 	stripe->csum_error_bitmap = 0;
1496 	stripe->meta_error_bitmap = 0;
1497 }
1498 
1499 /*
1500  * Locate one stripe which has at least one extent in its range.
1501  *
1502  * Return 0 if found such stripe, and store its info into @stripe.
1503  * Return >0 if there is no such stripe in the specified range.
1504  * Return <0 for error.
1505  */
1506 static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
1507 					struct btrfs_device *dev, u64 physical,
1508 					int mirror_num, u64 logical_start,
1509 					u32 logical_len,
1510 					struct scrub_stripe *stripe)
1511 {
1512 	struct btrfs_fs_info *fs_info = bg->fs_info;
1513 	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
1514 	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
1515 	const u64 logical_end = logical_start + logical_len;
1516 	struct btrfs_path path = { 0 };
1517 	u64 cur_logical = logical_start;
1518 	u64 stripe_end;
1519 	u64 extent_start;
1520 	u64 extent_len;
1521 	u64 extent_flags;
1522 	u64 extent_gen;
1523 	int ret;
1524 
1525 	memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
1526 				   stripe->nr_sectors);
1527 	scrub_stripe_reset_bitmaps(stripe);
1528 
1529 	/* The range must be inside the bg. */
1530 	ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
1531 
1532 	path.search_commit_root = 1;
1533 	path.skip_locking = 1;
1534 
1535 	ret = find_first_extent_item(extent_root, &path, logical_start, logical_len);
1536 	/* Either error or not found. */
1537 	if (ret)
1538 		goto out;
1539 	get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen);
1540 	if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1541 		stripe->nr_meta_extents++;
1542 	if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
1543 		stripe->nr_data_extents++;
1544 	cur_logical = max(extent_start, cur_logical);
1545 
1546 	/*
1547 	 * Round down to stripe boundary.
1548 	 *
1549 	 * The extra calculation against bg->start is to handle block groups
1550 	 * whose logical bytenr is not BTRFS_STRIPE_LEN aligned.
1551 	 */
1552 	stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) +
1553 			  bg->start;
1554 	stripe->physical = physical + stripe->logical - logical_start;
1555 	stripe->dev = dev;
1556 	stripe->bg = bg;
1557 	stripe->mirror_num = mirror_num;
1558 	stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1;
1559 
1560 	/* Fill the first extent info into stripe->sectors[] array. */
1561 	fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
1562 			     extent_flags, extent_gen);
1563 	cur_logical = extent_start + extent_len;
1564 
1565 	/* Fill the extent info for the remaining sectors. */
1566 	while (cur_logical <= stripe_end) {
1567 		ret = find_first_extent_item(extent_root, &path, cur_logical,
1568 					     stripe_end - cur_logical + 1);
1569 		if (ret < 0)
1570 			goto out;
1571 		if (ret > 0) {
1572 			ret = 0;
1573 			break;
1574 		}
1575 		get_extent_info(&path, &extent_start, &extent_len,
1576 				&extent_flags, &extent_gen);
1577 		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1578 			stripe->nr_meta_extents++;
1579 		if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
1580 			stripe->nr_data_extents++;
1581 		fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
1582 				     extent_flags, extent_gen);
1583 		cur_logical = extent_start + extent_len;
1584 	}
1585 
1586 	/* Now fill the data csum. */
1587 	if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
1588 		int sector_nr;
1589 		unsigned long csum_bitmap = 0;
1590 
1591 		/* Csum space should have already been allocated. */
1592 		ASSERT(stripe->csums);
1593 
1594 		/*
1595 		 * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN
1596 		 * should contain at most 16 sectors.
1597 		 */
1598 		ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
1599 
1600 		ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical,
1601 						stripe_end, stripe->csums,
1602 						&csum_bitmap, true);
1603 		if (ret < 0)
1604 			goto out;
1605 		if (ret > 0)
1606 			ret = 0;
1607 
1608 		for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) {
1609 			stripe->sectors[sector_nr].csum = stripe->csums +
1610 				sector_nr * fs_info->csum_size;
1611 		}
1612 	}
1613 	set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
1614 out:
1615 	btrfs_release_path(&path);
1616 	return ret;
1617 }
1618 
1619 static void scrub_reset_stripe(struct scrub_stripe *stripe)
1620 {
1621 	scrub_stripe_reset_bitmaps(stripe);
1622 
1623 	stripe->nr_meta_extents = 0;
1624 	stripe->nr_data_extents = 0;
1625 	stripe->state = 0;
1626 
1627 	for (int i = 0; i < stripe->nr_sectors; i++) {
1628 		stripe->sectors[i].is_metadata = false;
1629 		stripe->sectors[i].csum = NULL;
1630 		stripe->sectors[i].generation = 0;
1631 	}
1632 }
1633 
1634 static void scrub_submit_initial_read(struct scrub_ctx *sctx,
1635 				      struct scrub_stripe *stripe)
1636 {
1637 	struct btrfs_fs_info *fs_info = sctx->fs_info;
1638 	struct btrfs_bio *bbio;
1639 	int mirror = stripe->mirror_num;
1640 
1641 	ASSERT(stripe->bg);
1642 	ASSERT(stripe->mirror_num > 0);
1643 	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
1644 
1645 	bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
1646 			       scrub_read_endio, stripe);
1647 
1648 	/* Read the whole stripe. */
1649 	bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
1650 	for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) {
1651 		int ret;
1652 
1653 		ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0);
1654 		/* We should have allocated enough bio vectors. */
1655 		ASSERT(ret == PAGE_SIZE);
1656 	}
1657 	atomic_inc(&stripe->pending_io);
1658 
1659 	/*
1660 	 * For dev-replace, either user asks to avoid the source dev, or
1661 	 * the device is missing, we try the next mirror instead.
1662 	 */
1663 	if (sctx->is_dev_replace &&
1664 	    (fs_info->dev_replace.cont_reading_from_srcdev_mode ==
1665 	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID ||
1666 	     !stripe->dev->bdev)) {
1667 		int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
1668 						  stripe->bg->length);
1669 
1670 		mirror = calc_next_mirror(mirror, num_copies);
1671 	}
1672 	btrfs_submit_bio(bbio, mirror);
1673 }
1674 
1675 static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
1676 {
1677 	int i;
1678 
1679 	for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) {
1680 		if (stripe->sectors[i].is_metadata) {
1681 			struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
1682 
1683 			btrfs_err(fs_info,
1684 			"stripe %llu has unrepaired metadata sector at %llu",
1685 				  stripe->logical,
1686 				  stripe->logical + (i << fs_info->sectorsize_bits));
1687 			return true;
1688 		}
1689 	}
1690 	return false;
1691 }
1692 
1693 static int flush_scrub_stripes(struct scrub_ctx *sctx)
1694 {
1695 	struct btrfs_fs_info *fs_info = sctx->fs_info;
1696 	struct scrub_stripe *stripe;
1697 	const int nr_stripes = sctx->cur_stripe;
1698 	int ret = 0;
1699 
1700 	if (!nr_stripes)
1701 		return 0;
1702 
1703 	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));
1704 
1705 	scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
1706 			      nr_stripes << BTRFS_STRIPE_LEN_SHIFT);
1707 	for (int i = 0; i < nr_stripes; i++) {
1708 		stripe = &sctx->stripes[i];
1709 		scrub_submit_initial_read(sctx, stripe);
1710 	}
1711 
1712 	for (int i = 0; i < nr_stripes; i++) {
1713 		stripe = &sctx->stripes[i];
1714 
1715 		wait_event(stripe->repair_wait,
1716 			   test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
1717 	}
1718 
1719 	/*
1720 	 * Submit the repaired sectors.  For zoned case, we cannot do repair
1721 	 * in-place, but queue the bg to be relocated.
1722 	 */
1723 	if (btrfs_is_zoned(fs_info)) {
1724 		for (int i = 0; i < nr_stripes; i++) {
1725 			stripe = &sctx->stripes[i];
1726 
1727 			if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) {
1728 				btrfs_repair_one_zone(fs_info,
1729 						      sctx->stripes[0].bg->start);
1730 				break;
1731 			}
1732 		}
1733 	} else {
1734 		for (int i = 0; i < nr_stripes; i++) {
1735 			unsigned long repaired;
1736 
1737 			stripe = &sctx->stripes[i];
1738 
1739 			bitmap_andnot(&repaired, &stripe->init_error_bitmap,
1740 				      &stripe->error_bitmap, stripe->nr_sectors);
1741 			scrub_write_sectors(sctx, stripe, repaired, false);
1742 		}
1743 	}
1744 
1745 	/* Submit for dev-replace. */
1746 	if (sctx->is_dev_replace) {
1747 		/*
1748 		 * For dev-replace, if we know there is something wrong with
1749 		 * metadata, we should immedately abort.
1750 		 */
1751 		for (int i = 0; i < nr_stripes; i++) {
1752 			if (stripe_has_metadata_error(&sctx->stripes[i])) {
1753 				ret = -EIO;
1754 				goto out;
1755 			}
1756 		}
1757 		for (int i = 0; i < nr_stripes; i++) {
1758 			unsigned long good;
1759 
1760 			stripe = &sctx->stripes[i];
1761 
1762 			ASSERT(stripe->dev == fs_info->dev_replace.srcdev);
1763 
1764 			bitmap_andnot(&good, &stripe->extent_sector_bitmap,
1765 				      &stripe->error_bitmap, stripe->nr_sectors);
1766 			scrub_write_sectors(sctx, stripe, good, true);
1767 		}
1768 	}
1769 
1770 	/* Wait for the above writebacks to finish. */
1771 	for (int i = 0; i < nr_stripes; i++) {
1772 		stripe = &sctx->stripes[i];
1773 
1774 		wait_scrub_stripe_io(stripe);
1775 		scrub_reset_stripe(stripe);
1776 	}
1777 out:
1778 	sctx->cur_stripe = 0;
1779 	return ret;
1780 }
1781 
1782 static void raid56_scrub_wait_endio(struct bio *bio)
1783 {
1784 	complete(bio->bi_private);
1785 }
1786 
1787 static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg,
1788 			      struct btrfs_device *dev, int mirror_num,
1789 			      u64 logical, u32 length, u64 physical)
1790 {
1791 	struct scrub_stripe *stripe;
1792 	int ret;
1793 
1794 	/* No available slot, submit all stripes and wait for them. */
1795 	if (sctx->cur_stripe >= SCRUB_STRIPES_PER_SCTX) {
1796 		ret = flush_scrub_stripes(sctx);
1797 		if (ret < 0)
1798 			return ret;
1799 	}
1800 
1801 	stripe = &sctx->stripes[sctx->cur_stripe];
1802 
1803 	/* We can queue one stripe using the remaining slot. */
1804 	scrub_reset_stripe(stripe);
1805 	ret = scrub_find_fill_first_stripe(bg, dev, physical, mirror_num,
1806 					   logical, length, stripe);
1807 	/* Either >0 as no more extents or <0 for error. */
1808 	if (ret)
1809 		return ret;
1810 	sctx->cur_stripe++;
1811 	return 0;
1812 }
1813 
1814 static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
1815 				      struct btrfs_device *scrub_dev,
1816 				      struct btrfs_block_group *bg,
1817 				      struct map_lookup *map,
1818 				      u64 full_stripe_start)
1819 {
1820 	DECLARE_COMPLETION_ONSTACK(io_done);
1821 	struct btrfs_fs_info *fs_info = sctx->fs_info;
1822 	struct btrfs_raid_bio *rbio;
1823 	struct btrfs_io_context *bioc = NULL;
1824 	struct bio *bio;
1825 	struct scrub_stripe *stripe;
1826 	bool all_empty = true;
1827 	const int data_stripes = nr_data_stripes(map);
1828 	unsigned long extent_bitmap = 0;
1829 	u64 length = data_stripes << BTRFS_STRIPE_LEN_SHIFT;
1830 	int ret;
1831 
1832 	ASSERT(sctx->raid56_data_stripes);
1833 
1834 	for (int i = 0; i < data_stripes; i++) {
1835 		int stripe_index;
1836 		int rot;
1837 		u64 physical;
1838 
1839 		stripe = &sctx->raid56_data_stripes[i];
1840 		rot = div_u64(full_stripe_start - bg->start,
1841 			      data_stripes) >> BTRFS_STRIPE_LEN_SHIFT;
1842 		stripe_index = (i + rot) % map->num_stripes;
1843 		physical = map->stripes[stripe_index].physical +
1844 			   (rot << BTRFS_STRIPE_LEN_SHIFT);
1845 
1846 		scrub_reset_stripe(stripe);
1847 		set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state);
1848 		ret = scrub_find_fill_first_stripe(bg,
1849 				map->stripes[stripe_index].dev, physical, 1,
1850 				full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT),
1851 				BTRFS_STRIPE_LEN, stripe);
1852 		if (ret < 0)
1853 			goto out;
1854 		/*
1855 		 * No extent in this data stripe, need to manually mark them
1856 		 * initialized to make later read submission happy.
1857 		 */
1858 		if (ret > 0) {
1859 			stripe->logical = full_stripe_start +
1860 					  (i << BTRFS_STRIPE_LEN_SHIFT);
1861 			stripe->dev = map->stripes[stripe_index].dev;
1862 			stripe->mirror_num = 1;
1863 			set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
1864 		}
1865 	}
1866 
1867 	/* Check if all data stripes are empty. */
1868 	for (int i = 0; i < data_stripes; i++) {
1869 		stripe = &sctx->raid56_data_stripes[i];
1870 		if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) {
1871 			all_empty = false;
1872 			break;
1873 		}
1874 	}
1875 	if (all_empty) {
1876 		ret = 0;
1877 		goto out;
1878 	}
1879 
1880 	for (int i = 0; i < data_stripes; i++) {
1881 		stripe = &sctx->raid56_data_stripes[i];
1882 		scrub_submit_initial_read(sctx, stripe);
1883 	}
1884 	for (int i = 0; i < data_stripes; i++) {
1885 		stripe = &sctx->raid56_data_stripes[i];
1886 
1887 		wait_event(stripe->repair_wait,
1888 			   test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
1889 	}
1890 	/* For now, no zoned support for RAID56. */
1891 	ASSERT(!btrfs_is_zoned(sctx->fs_info));
1892 
1893 	/* Writeback for the repaired sectors. */
1894 	for (int i = 0; i < data_stripes; i++) {
1895 		unsigned long repaired;
1896 
1897 		stripe = &sctx->raid56_data_stripes[i];
1898 
1899 		bitmap_andnot(&repaired, &stripe->init_error_bitmap,
1900 			      &stripe->error_bitmap, stripe->nr_sectors);
1901 		scrub_write_sectors(sctx, stripe, repaired, false);
1902 	}
1903 
1904 	/* Wait for the above writebacks to finish. */
1905 	for (int i = 0; i < data_stripes; i++) {
1906 		stripe = &sctx->raid56_data_stripes[i];
1907 
1908 		wait_scrub_stripe_io(stripe);
1909 	}
1910 
1911 	/*
1912 	 * Now all data stripes are properly verified. Check if we have any
1913 	 * unrepaired, if so abort immediately or we could further corrupt the
1914 	 * P/Q stripes.
1915 	 *
1916 	 * During the loop, also populate extent_bitmap.
1917 	 */
1918 	for (int i = 0; i < data_stripes; i++) {
1919 		unsigned long error;
1920 
1921 		stripe = &sctx->raid56_data_stripes[i];
1922 
1923 		/*
1924 		 * We should only check the errors where there is an extent.
1925 		 * As we may hit an empty data stripe while it's missing.
1926 		 */
1927 		bitmap_and(&error, &stripe->error_bitmap,
1928 			   &stripe->extent_sector_bitmap, stripe->nr_sectors);
1929 		if (!bitmap_empty(&error, stripe->nr_sectors)) {
1930 			btrfs_err(fs_info,
1931 "unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
1932 				  full_stripe_start, i, stripe->nr_sectors,
1933 				  &error);
1934 			ret = -EIO;
1935 			goto out;
1936 		}
1937 		bitmap_or(&extent_bitmap, &extent_bitmap,
1938 			  &stripe->extent_sector_bitmap, stripe->nr_sectors);
1939 	}
1940 
1941 	/* Now we can check and regenerate the P/Q stripe. */
1942 	bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS);
1943 	bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT;
1944 	bio->bi_private = &io_done;
1945 	bio->bi_end_io = raid56_scrub_wait_endio;
1946 
1947 	btrfs_bio_counter_inc_blocked(fs_info);
1948 	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
1949 			       &length, &bioc);
1950 	if (ret < 0) {
1951 		btrfs_put_bioc(bioc);
1952 		btrfs_bio_counter_dec(fs_info);
1953 		goto out;
1954 	}
1955 	rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap,
1956 				BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
1957 	btrfs_put_bioc(bioc);
1958 	if (!rbio) {
1959 		ret = -ENOMEM;
1960 		btrfs_bio_counter_dec(fs_info);
1961 		goto out;
1962 	}
1963 	raid56_parity_submit_scrub_rbio(rbio);
1964 	wait_for_completion_io(&io_done);
1965 	ret = blk_status_to_errno(bio->bi_status);
1966 	bio_put(bio);
1967 	btrfs_bio_counter_dec(fs_info);
1968 
1969 out:
1970 	return ret;
1971 }
1972 
1973 /*
1974  * Scrub one range which can only has simple mirror based profile.
1975  * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
1976  *  RAID0/RAID10).
1977  *
1978  * Since we may need to handle a subset of block group, we need @logical_start
1979  * and @logical_length parameter.
1980  */
1981 static int scrub_simple_mirror(struct scrub_ctx *sctx,
1982 			       struct btrfs_block_group *bg,
1983 			       struct map_lookup *map,
1984 			       u64 logical_start, u64 logical_length,
1985 			       struct btrfs_device *device,
1986 			       u64 physical, int mirror_num)
1987 {
1988 	struct btrfs_fs_info *fs_info = sctx->fs_info;
1989 	const u64 logical_end = logical_start + logical_length;
1990 	/* An artificial limit, inherit from old scrub behavior */
1991 	struct btrfs_path path = { 0 };
1992 	u64 cur_logical = logical_start;
1993 	int ret;
1994 
1995 	/* The range must be inside the bg */
1996 	ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
1997 
1998 	path.search_commit_root = 1;
1999 	path.skip_locking = 1;
2000 	/* Go through each extent items inside the logical range */
2001 	while (cur_logical < logical_end) {
2002 		u64 cur_physical = physical + cur_logical - logical_start;
2003 
2004 		/* Canceled? */
2005 		if (atomic_read(&fs_info->scrub_cancel_req) ||
2006 		    atomic_read(&sctx->cancel_req)) {
2007 			ret = -ECANCELED;
2008 			break;
2009 		}
2010 		/* Paused? */
2011 		if (atomic_read(&fs_info->scrub_pause_req)) {
2012 			/* Push queued extents */
2013 			scrub_blocked_if_needed(fs_info);
2014 		}
2015 		/* Block group removed? */
2016 		spin_lock(&bg->lock);
2017 		if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
2018 			spin_unlock(&bg->lock);
2019 			ret = 0;
2020 			break;
2021 		}
2022 		spin_unlock(&bg->lock);
2023 
2024 		ret = queue_scrub_stripe(sctx, bg, device, mirror_num,
2025 					 cur_logical, logical_end - cur_logical,
2026 					 cur_physical);
2027 		if (ret > 0) {
2028 			/* No more extent, just update the accounting */
2029 			sctx->stat.last_physical = physical + logical_length;
2030 			ret = 0;
2031 			break;
2032 		}
2033 		if (ret < 0)
2034 			break;
2035 
2036 		ASSERT(sctx->cur_stripe > 0);
2037 		cur_logical = sctx->stripes[sctx->cur_stripe - 1].logical
2038 			      + BTRFS_STRIPE_LEN;
2039 
2040 		/* Don't hold CPU for too long time */
2041 		cond_resched();
2042 	}
2043 	btrfs_release_path(&path);
2044 	return ret;
2045 }
2046 
2047 /* Calculate the full stripe length for simple stripe based profiles */
2048 static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
2049 {
2050 	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2051 			    BTRFS_BLOCK_GROUP_RAID10));
2052 
2053 	return (map->num_stripes / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
2054 }
2055 
2056 /* Get the logical bytenr for the stripe */
2057 static u64 simple_stripe_get_logical(struct map_lookup *map,
2058 				     struct btrfs_block_group *bg,
2059 				     int stripe_index)
2060 {
2061 	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2062 			    BTRFS_BLOCK_GROUP_RAID10));
2063 	ASSERT(stripe_index < map->num_stripes);
2064 
2065 	/*
2066 	 * (stripe_index / sub_stripes) gives how many data stripes we need to
2067 	 * skip.
2068 	 */
2069 	return ((stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT) +
2070 	       bg->start;
2071 }
2072 
2073 /* Get the mirror number for the stripe */
2074 static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
2075 {
2076 	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2077 			    BTRFS_BLOCK_GROUP_RAID10));
2078 	ASSERT(stripe_index < map->num_stripes);
2079 
2080 	/* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... */
2081 	return stripe_index % map->sub_stripes + 1;
2082 }
2083 
2084 static int scrub_simple_stripe(struct scrub_ctx *sctx,
2085 			       struct btrfs_block_group *bg,
2086 			       struct map_lookup *map,
2087 			       struct btrfs_device *device,
2088 			       int stripe_index)
2089 {
2090 	const u64 logical_increment = simple_stripe_full_stripe_len(map);
2091 	const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
2092 	const u64 orig_physical = map->stripes[stripe_index].physical;
2093 	const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
2094 	u64 cur_logical = orig_logical;
2095 	u64 cur_physical = orig_physical;
2096 	int ret = 0;
2097 
2098 	while (cur_logical < bg->start + bg->length) {
2099 		/*
2100 		 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
2101 		 * just RAID1, so we can reuse scrub_simple_mirror() to scrub
2102 		 * this stripe.
2103 		 */
2104 		ret = scrub_simple_mirror(sctx, bg, map, cur_logical,
2105 					  BTRFS_STRIPE_LEN, device, cur_physical,
2106 					  mirror_num);
2107 		if (ret)
2108 			return ret;
2109 		/* Skip to next stripe which belongs to the target device */
2110 		cur_logical += logical_increment;
2111 		/* For physical offset, we just go to next stripe */
2112 		cur_physical += BTRFS_STRIPE_LEN;
2113 	}
2114 	return ret;
2115 }
2116 
2117 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2118 					   struct btrfs_block_group *bg,
2119 					   struct extent_map *em,
2120 					   struct btrfs_device *scrub_dev,
2121 					   int stripe_index)
2122 {
2123 	struct btrfs_fs_info *fs_info = sctx->fs_info;
2124 	struct map_lookup *map = em->map_lookup;
2125 	const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
2126 	const u64 chunk_logical = bg->start;
2127 	int ret;
2128 	int ret2;
2129 	u64 physical = map->stripes[stripe_index].physical;
2130 	const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
2131 	const u64 physical_end = physical + dev_stripe_len;
2132 	u64 logical;
2133 	u64 logic_end;
2134 	/* The logical increment after finishing one stripe */
2135 	u64 increment;
2136 	/* Offset inside the chunk */
2137 	u64 offset;
2138 	u64 stripe_logical;
2139 	int stop_loop = 0;
2140 
2141 	scrub_blocked_if_needed(fs_info);
2142 
2143 	if (sctx->is_dev_replace &&
2144 	    btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
2145 		mutex_lock(&sctx->wr_lock);
2146 		sctx->write_pointer = physical;
2147 		mutex_unlock(&sctx->wr_lock);
2148 	}
2149 
2150 	/* Prepare the extra data stripes used by RAID56. */
2151 	if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) {
2152 		ASSERT(sctx->raid56_data_stripes == NULL);
2153 
2154 		sctx->raid56_data_stripes = kcalloc(nr_data_stripes(map),
2155 						    sizeof(struct scrub_stripe),
2156 						    GFP_KERNEL);
2157 		if (!sctx->raid56_data_stripes) {
2158 			ret = -ENOMEM;
2159 			goto out;
2160 		}
2161 		for (int i = 0; i < nr_data_stripes(map); i++) {
2162 			ret = init_scrub_stripe(fs_info,
2163 						&sctx->raid56_data_stripes[i]);
2164 			if (ret < 0)
2165 				goto out;
2166 			sctx->raid56_data_stripes[i].bg = bg;
2167 			sctx->raid56_data_stripes[i].sctx = sctx;
2168 		}
2169 	}
2170 	/*
2171 	 * There used to be a big double loop to handle all profiles using the
2172 	 * same routine, which grows larger and more gross over time.
2173 	 *
2174 	 * So here we handle each profile differently, so simpler profiles
2175 	 * have simpler scrubbing function.
2176 	 */
2177 	if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
2178 			 BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2179 		/*
2180 		 * Above check rules out all complex profile, the remaining
2181 		 * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple
2182 		 * mirrored duplication without stripe.
2183 		 *
2184 		 * Only @physical and @mirror_num needs to calculated using
2185 		 * @stripe_index.
2186 		 */
2187 		ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length,
2188 				scrub_dev, map->stripes[stripe_index].physical,
2189 				stripe_index + 1);
2190 		offset = 0;
2191 		goto out;
2192 	}
2193 	if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
2194 		ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index);
2195 		offset = (stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
2196 		goto out;
2197 	}
2198 
2199 	/* Only RAID56 goes through the old code */
2200 	ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
2201 	ret = 0;
2202 
2203 	/* Calculate the logical end of the stripe */
2204 	get_raid56_logic_offset(physical_end, stripe_index,
2205 				map, &logic_end, NULL);
2206 	logic_end += chunk_logical;
2207 
2208 	/* Initialize @offset in case we need to go to out: label */
2209 	get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
2210 	increment = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;
2211 
2212 	/*
2213 	 * Due to the rotation, for RAID56 it's better to iterate each stripe
2214 	 * using their physical offset.
2215 	 */
2216 	while (physical < physical_end) {
2217 		ret = get_raid56_logic_offset(physical, stripe_index, map,
2218 					      &logical, &stripe_logical);
2219 		logical += chunk_logical;
2220 		if (ret) {
2221 			/* it is parity strip */
2222 			stripe_logical += chunk_logical;
2223 			ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg,
2224 							 map, stripe_logical);
2225 			if (ret)
2226 				goto out;
2227 			goto next;
2228 		}
2229 
2230 		/*
2231 		 * Now we're at a data stripe, scrub each extents in the range.
2232 		 *
2233 		 * At this stage, if we ignore the repair part, inside each data
2234 		 * stripe it is no different than SINGLE profile.
2235 		 * We can reuse scrub_simple_mirror() here, as the repair part
2236 		 * is still based on @mirror_num.
2237 		 */
2238 		ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN,
2239 					  scrub_dev, physical, 1);
2240 		if (ret < 0)
2241 			goto out;
2242 next:
2243 		logical += increment;
2244 		physical += BTRFS_STRIPE_LEN;
2245 		spin_lock(&sctx->stat_lock);
2246 		if (stop_loop)
2247 			sctx->stat.last_physical =
2248 				map->stripes[stripe_index].physical + dev_stripe_len;
2249 		else
2250 			sctx->stat.last_physical = physical;
2251 		spin_unlock(&sctx->stat_lock);
2252 		if (stop_loop)
2253 			break;
2254 	}
2255 out:
2256 	ret2 = flush_scrub_stripes(sctx);
2257 	if (!ret2)
2258 		ret = ret2;
2259 	if (sctx->raid56_data_stripes) {
2260 		for (int i = 0; i < nr_data_stripes(map); i++)
2261 			release_scrub_stripe(&sctx->raid56_data_stripes[i]);
2262 		kfree(sctx->raid56_data_stripes);
2263 		sctx->raid56_data_stripes = NULL;
2264 	}
2265 
2266 	if (sctx->is_dev_replace && ret >= 0) {
2267 		int ret2;
2268 
2269 		ret2 = sync_write_pointer_for_zoned(sctx,
2270 				chunk_logical + offset,
2271 				map->stripes[stripe_index].physical,
2272 				physical_end);
2273 		if (ret2)
2274 			ret = ret2;
2275 	}
2276 
2277 	return ret < 0 ? ret : 0;
2278 }
2279 
2280 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2281 					  struct btrfs_block_group *bg,
2282 					  struct btrfs_device *scrub_dev,
2283 					  u64 dev_offset,
2284 					  u64 dev_extent_len)
2285 {
2286 	struct btrfs_fs_info *fs_info = sctx->fs_info;
2287 	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
2288 	struct map_lookup *map;
2289 	struct extent_map *em;
2290 	int i;
2291 	int ret = 0;
2292 
2293 	read_lock(&map_tree->lock);
2294 	em = lookup_extent_mapping(map_tree, bg->start, bg->length);
2295 	read_unlock(&map_tree->lock);
2296 
2297 	if (!em) {
2298 		/*
2299 		 * Might have been an unused block group deleted by the cleaner
2300 		 * kthread or relocation.
2301 		 */
2302 		spin_lock(&bg->lock);
2303 		if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
2304 			ret = -EINVAL;
2305 		spin_unlock(&bg->lock);
2306 
2307 		return ret;
2308 	}
2309 	if (em->start != bg->start)
2310 		goto out;
2311 	if (em->len < dev_extent_len)
2312 		goto out;
2313 
2314 	map = em->map_lookup;
2315 	for (i = 0; i < map->num_stripes; ++i) {
2316 		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2317 		    map->stripes[i].physical == dev_offset) {
2318 			ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
2319 			if (ret)
2320 				goto out;
2321 		}
2322 	}
2323 out:
2324 	free_extent_map(em);
2325 
2326 	return ret;
2327 }
2328 
2329 static int finish_extent_writes_for_zoned(struct btrfs_root *root,
2330 					  struct btrfs_block_group *cache)
2331 {
2332 	struct btrfs_fs_info *fs_info = cache->fs_info;
2333 	struct btrfs_trans_handle *trans;
2334 
2335 	if (!btrfs_is_zoned(fs_info))
2336 		return 0;
2337 
2338 	btrfs_wait_block_group_reservations(cache);
2339 	btrfs_wait_nocow_writers(cache);
2340 	btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
2341 
2342 	trans = btrfs_join_transaction(root);
2343 	if (IS_ERR(trans))
2344 		return PTR_ERR(trans);
2345 	return btrfs_commit_transaction(trans);
2346 }
2347 
2348 static noinline_for_stack
2349 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2350 			   struct btrfs_device *scrub_dev, u64 start, u64 end)
2351 {
2352 	struct btrfs_dev_extent *dev_extent = NULL;
2353 	struct btrfs_path *path;
2354 	struct btrfs_fs_info *fs_info = sctx->fs_info;
2355 	struct btrfs_root *root = fs_info->dev_root;
2356 	u64 chunk_offset;
2357 	int ret = 0;
2358 	int ro_set;
2359 	int slot;
2360 	struct extent_buffer *l;
2361 	struct btrfs_key key;
2362 	struct btrfs_key found_key;
2363 	struct btrfs_block_group *cache;
2364 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2365 
2366 	path = btrfs_alloc_path();
2367 	if (!path)
2368 		return -ENOMEM;
2369 
2370 	path->reada = READA_FORWARD;
2371 	path->search_commit_root = 1;
2372 	path->skip_locking = 1;
2373 
2374 	key.objectid = scrub_dev->devid;
2375 	key.offset = 0ull;
2376 	key.type = BTRFS_DEV_EXTENT_KEY;
2377 
2378 	while (1) {
2379 		u64 dev_extent_len;
2380 
2381 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2382 		if (ret < 0)
2383 			break;
2384 		if (ret > 0) {
2385 			if (path->slots[0] >=
2386 			    btrfs_header_nritems(path->nodes[0])) {
2387 				ret = btrfs_next_leaf(root, path);
2388 				if (ret < 0)
2389 					break;
2390 				if (ret > 0) {
2391 					ret = 0;
2392 					break;
2393 				}
2394 			} else {
2395 				ret = 0;
2396 			}
2397 		}
2398 
2399 		l = path->nodes[0];
2400 		slot = path->slots[0];
2401 
2402 		btrfs_item_key_to_cpu(l, &found_key, slot);
2403 
2404 		if (found_key.objectid != scrub_dev->devid)
2405 			break;
2406 
2407 		if (found_key.type != BTRFS_DEV_EXTENT_KEY)
2408 			break;
2409 
2410 		if (found_key.offset >= end)
2411 			break;
2412 
2413 		if (found_key.offset < key.offset)
2414 			break;
2415 
2416 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2417 		dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
2418 
2419 		if (found_key.offset + dev_extent_len <= start)
2420 			goto skip;
2421 
2422 		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2423 
2424 		/*
2425 		 * get a reference on the corresponding block group to prevent
2426 		 * the chunk from going away while we scrub it
2427 		 */
2428 		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2429 
2430 		/* some chunks are removed but not committed to disk yet,
2431 		 * continue scrubbing */
2432 		if (!cache)
2433 			goto skip;
2434 
2435 		ASSERT(cache->start <= chunk_offset);
2436 		/*
2437 		 * We are using the commit root to search for device extents, so
2438 		 * that means we could have found a device extent item from a
2439 		 * block group that was deleted in the current transaction. The
2440 		 * logical start offset of the deleted block group, stored at
2441 		 * @chunk_offset, might be part of the logical address range of
2442 		 * a new block group (which uses different physical extents).
2443 		 * In this case btrfs_lookup_block_group() has returned the new
2444 		 * block group, and its start address is less than @chunk_offset.
2445 		 *
2446 		 * We skip such new block groups, because it's pointless to
2447 		 * process them, as we won't find their extents because we search
2448 		 * for them using the commit root of the extent tree. For a device
2449 		 * replace it's also fine to skip it, we won't miss copying them
2450 		 * to the target device because we have the write duplication
2451 		 * setup through the regular write path (by btrfs_map_block()),
2452 		 * and we have committed a transaction when we started the device
2453 		 * replace, right after setting up the device replace state.
2454 		 */
2455 		if (cache->start < chunk_offset) {
2456 			btrfs_put_block_group(cache);
2457 			goto skip;
2458 		}
2459 
2460 		if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
2461 			if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
2462 				btrfs_put_block_group(cache);
2463 				goto skip;
2464 			}
2465 		}
2466 
2467 		/*
2468 		 * Make sure that while we are scrubbing the corresponding block
2469 		 * group doesn't get its logical address and its device extents
2470 		 * reused for another block group, which can possibly be of a
2471 		 * different type and different profile. We do this to prevent
2472 		 * false error detections and crashes due to bogus attempts to
2473 		 * repair extents.
2474 		 */
2475 		spin_lock(&cache->lock);
2476 		if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
2477 			spin_unlock(&cache->lock);
2478 			btrfs_put_block_group(cache);
2479 			goto skip;
2480 		}
2481 		btrfs_freeze_block_group(cache);
2482 		spin_unlock(&cache->lock);
2483 
2484 		/*
2485 		 * we need call btrfs_inc_block_group_ro() with scrubs_paused,
2486 		 * to avoid deadlock caused by:
2487 		 * btrfs_inc_block_group_ro()
2488 		 * -> btrfs_wait_for_commit()
2489 		 * -> btrfs_commit_transaction()
2490 		 * -> btrfs_scrub_pause()
2491 		 */
2492 		scrub_pause_on(fs_info);
2493 
2494 		/*
2495 		 * Don't do chunk preallocation for scrub.
2496 		 *
2497 		 * This is especially important for SYSTEM bgs, or we can hit
2498 		 * -EFBIG from btrfs_finish_chunk_alloc() like:
2499 		 * 1. The only SYSTEM bg is marked RO.
2500 		 *    Since SYSTEM bg is small, that's pretty common.
2501 		 * 2. New SYSTEM bg will be allocated
2502 		 *    Due to regular version will allocate new chunk.
2503 		 * 3. New SYSTEM bg is empty and will get cleaned up
2504 		 *    Before cleanup really happens, it's marked RO again.
2505 		 * 4. Empty SYSTEM bg get scrubbed
2506 		 *    We go back to 2.
2507 		 *
2508 		 * This can easily boost the amount of SYSTEM chunks if cleaner
2509 		 * thread can't be triggered fast enough, and use up all space
2510 		 * of btrfs_super_block::sys_chunk_array
2511 		 *
2512 		 * While for dev replace, we need to try our best to mark block
2513 		 * group RO, to prevent race between:
2514 		 * - Write duplication
2515 		 *   Contains latest data
2516 		 * - Scrub copy
2517 		 *   Contains data from commit tree
2518 		 *
2519 		 * If target block group is not marked RO, nocow writes can
2520 		 * be overwritten by scrub copy, causing data corruption.
2521 		 * So for dev-replace, it's not allowed to continue if a block
2522 		 * group is not RO.
2523 		 */
2524 		ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
2525 		if (!ret && sctx->is_dev_replace) {
2526 			ret = finish_extent_writes_for_zoned(root, cache);
2527 			if (ret) {
2528 				btrfs_dec_block_group_ro(cache);
2529 				scrub_pause_off(fs_info);
2530 				btrfs_put_block_group(cache);
2531 				break;
2532 			}
2533 		}
2534 
2535 		if (ret == 0) {
2536 			ro_set = 1;
2537 		} else if (ret == -ENOSPC && !sctx->is_dev_replace &&
2538 			   !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
2539 			/*
2540 			 * btrfs_inc_block_group_ro return -ENOSPC when it
2541 			 * failed in creating new chunk for metadata.
2542 			 * It is not a problem for scrub, because
2543 			 * metadata are always cowed, and our scrub paused
2544 			 * commit_transactions.
2545 			 *
2546 			 * For RAID56 chunks, we have to mark them read-only
2547 			 * for scrub, as later we would use our own cache
2548 			 * out of RAID56 realm.
2549 			 * Thus we want the RAID56 bg to be marked RO to
2550 			 * prevent RMW from screwing up out cache.
2551 			 */
2552 			ro_set = 0;
2553 		} else if (ret == -ETXTBSY) {
2554 			btrfs_warn(fs_info,
2555 		   "skipping scrub of block group %llu due to active swapfile",
2556 				   cache->start);
2557 			scrub_pause_off(fs_info);
2558 			ret = 0;
2559 			goto skip_unfreeze;
2560 		} else {
2561 			btrfs_warn(fs_info,
2562 				   "failed setting block group ro: %d", ret);
2563 			btrfs_unfreeze_block_group(cache);
2564 			btrfs_put_block_group(cache);
2565 			scrub_pause_off(fs_info);
2566 			break;
2567 		}
2568 
2569 		/*
2570 		 * Now the target block is marked RO, wait for nocow writes to
2571 		 * finish before dev-replace.
2572 		 * COW is fine, as COW never overwrites extents in commit tree.
2573 		 */
2574 		if (sctx->is_dev_replace) {
2575 			btrfs_wait_nocow_writers(cache);
2576 			btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
2577 					cache->length);
2578 		}
2579 
2580 		scrub_pause_off(fs_info);
2581 		down_write(&dev_replace->rwsem);
2582 		dev_replace->cursor_right = found_key.offset + dev_extent_len;
2583 		dev_replace->cursor_left = found_key.offset;
2584 		dev_replace->item_needs_writeback = 1;
2585 		up_write(&dev_replace->rwsem);
2586 
2587 		ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
2588 				  dev_extent_len);
2589 		if (sctx->is_dev_replace &&
2590 		    !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
2591 						      cache, found_key.offset))
2592 			ro_set = 0;
2593 
2594 		down_write(&dev_replace->rwsem);
2595 		dev_replace->cursor_left = dev_replace->cursor_right;
2596 		dev_replace->item_needs_writeback = 1;
2597 		up_write(&dev_replace->rwsem);
2598 
2599 		if (ro_set)
2600 			btrfs_dec_block_group_ro(cache);
2601 
2602 		/*
2603 		 * We might have prevented the cleaner kthread from deleting
2604 		 * this block group if it was already unused because we raced
2605 		 * and set it to RO mode first. So add it back to the unused
2606 		 * list, otherwise it might not ever be deleted unless a manual
2607 		 * balance is triggered or it becomes used and unused again.
2608 		 */
2609 		spin_lock(&cache->lock);
2610 		if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
2611 		    !cache->ro && cache->reserved == 0 && cache->used == 0) {
2612 			spin_unlock(&cache->lock);
2613 			if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
2614 				btrfs_discard_queue_work(&fs_info->discard_ctl,
2615 							 cache);
2616 			else
2617 				btrfs_mark_bg_unused(cache);
2618 		} else {
2619 			spin_unlock(&cache->lock);
2620 		}
2621 skip_unfreeze:
2622 		btrfs_unfreeze_block_group(cache);
2623 		btrfs_put_block_group(cache);
2624 		if (ret)
2625 			break;
2626 		if (sctx->is_dev_replace &&
2627 		    atomic64_read(&dev_replace->num_write_errors) > 0) {
2628 			ret = -EIO;
2629 			break;
2630 		}
2631 		if (sctx->stat.malloc_errors > 0) {
2632 			ret = -ENOMEM;
2633 			break;
2634 		}
2635 skip:
2636 		key.offset = found_key.offset + dev_extent_len;
2637 		btrfs_release_path(path);
2638 	}
2639 
2640 	btrfs_free_path(path);
2641 
2642 	return ret;
2643 }
2644 
2645 static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
2646 			   struct page *page, u64 physical, u64 generation)
2647 {
2648 	struct btrfs_fs_info *fs_info = sctx->fs_info;
2649 	struct bio_vec bvec;
2650 	struct bio bio;
2651 	struct btrfs_super_block *sb = page_address(page);
2652 	int ret;
2653 
2654 	bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ);
2655 	bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT;
2656 	__bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0);
2657 	ret = submit_bio_wait(&bio);
2658 	bio_uninit(&bio);
2659 
2660 	if (ret < 0)
2661 		return ret;
2662 	ret = btrfs_check_super_csum(fs_info, sb);
2663 	if (ret != 0) {
2664 		btrfs_err_rl(fs_info,
2665 			"super block at physical %llu devid %llu has bad csum",
2666 			physical, dev->devid);
2667 		return -EIO;
2668 	}
2669 	if (btrfs_super_generation(sb) != generation) {
2670 		btrfs_err_rl(fs_info,
2671 "super block at physical %llu devid %llu has bad generation %llu expect %llu",
2672 			     physical, dev->devid,
2673 			     btrfs_super_generation(sb), generation);
2674 		return -EUCLEAN;
2675 	}
2676 
2677 	return btrfs_validate_super(fs_info, sb, -1);
2678 }
2679 
2680 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2681 					   struct btrfs_device *scrub_dev)
2682 {
2683 	int	i;
2684 	u64	bytenr;
2685 	u64	gen;
2686 	int ret = 0;
2687 	struct page *page;
2688 	struct btrfs_fs_info *fs_info = sctx->fs_info;
2689 
2690 	if (BTRFS_FS_ERROR(fs_info))
2691 		return -EROFS;
2692 
2693 	page = alloc_page(GFP_KERNEL);
2694 	if (!page) {
2695 		spin_lock(&sctx->stat_lock);
2696 		sctx->stat.malloc_errors++;
2697 		spin_unlock(&sctx->stat_lock);
2698 		return -ENOMEM;
2699 	}
2700 
2701 	/* Seed devices of a new filesystem has their own generation. */
2702 	if (scrub_dev->fs_devices != fs_info->fs_devices)
2703 		gen = scrub_dev->generation;
2704 	else
2705 		gen = fs_info->last_trans_committed;
2706 
2707 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2708 		bytenr = btrfs_sb_offset(i);
2709 		if (bytenr + BTRFS_SUPER_INFO_SIZE >
2710 		    scrub_dev->commit_total_bytes)
2711 			break;
2712 		if (!btrfs_check_super_location(scrub_dev, bytenr))
2713 			continue;
2714 
2715 		ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen);
2716 		if (ret) {
2717 			spin_lock(&sctx->stat_lock);
2718 			sctx->stat.super_errors++;
2719 			spin_unlock(&sctx->stat_lock);
2720 		}
2721 	}
2722 	__free_page(page);
2723 	return 0;
2724 }
2725 
2726 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
2727 {
2728 	if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
2729 					&fs_info->scrub_lock)) {
2730 		struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
2731 		struct workqueue_struct *scrub_wr_comp =
2732 						fs_info->scrub_wr_completion_workers;
2733 
2734 		fs_info->scrub_workers = NULL;
2735 		fs_info->scrub_wr_completion_workers = NULL;
2736 		mutex_unlock(&fs_info->scrub_lock);
2737 
2738 		if (scrub_workers)
2739 			destroy_workqueue(scrub_workers);
2740 		if (scrub_wr_comp)
2741 			destroy_workqueue(scrub_wr_comp);
2742 	}
2743 }
2744 
2745 /*
2746  * get a reference count on fs_info->scrub_workers. start worker if necessary
2747  */
2748 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2749 						int is_dev_replace)
2750 {
2751 	struct workqueue_struct *scrub_workers = NULL;
2752 	struct workqueue_struct *scrub_wr_comp = NULL;
2753 	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
2754 	int max_active = fs_info->thread_pool_size;
2755 	int ret = -ENOMEM;
2756 
2757 	if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
2758 		return 0;
2759 
2760 	scrub_workers = alloc_workqueue("btrfs-scrub", flags,
2761 					is_dev_replace ? 1 : max_active);
2762 	if (!scrub_workers)
2763 		goto fail_scrub_workers;
2764 
2765 	scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
2766 	if (!scrub_wr_comp)
2767 		goto fail_scrub_wr_completion_workers;
2768 
2769 	mutex_lock(&fs_info->scrub_lock);
2770 	if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
2771 		ASSERT(fs_info->scrub_workers == NULL &&
2772 		       fs_info->scrub_wr_completion_workers == NULL);
2773 		fs_info->scrub_workers = scrub_workers;
2774 		fs_info->scrub_wr_completion_workers = scrub_wr_comp;
2775 		refcount_set(&fs_info->scrub_workers_refcnt, 1);
2776 		mutex_unlock(&fs_info->scrub_lock);
2777 		return 0;
2778 	}
2779 	/* Other thread raced in and created the workers for us */
2780 	refcount_inc(&fs_info->scrub_workers_refcnt);
2781 	mutex_unlock(&fs_info->scrub_lock);
2782 
2783 	ret = 0;
2784 
2785 	destroy_workqueue(scrub_wr_comp);
2786 fail_scrub_wr_completion_workers:
2787 	destroy_workqueue(scrub_workers);
2788 fail_scrub_workers:
2789 	return ret;
2790 }
2791 
2792 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2793 		    u64 end, struct btrfs_scrub_progress *progress,
2794 		    int readonly, int is_dev_replace)
2795 {
2796 	struct btrfs_dev_lookup_args args = { .devid = devid };
2797 	struct scrub_ctx *sctx;
2798 	int ret;
2799 	struct btrfs_device *dev;
2800 	unsigned int nofs_flag;
2801 	bool need_commit = false;
2802 
2803 	if (btrfs_fs_closing(fs_info))
2804 		return -EAGAIN;
2805 
2806 	/* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
2807 	ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);
2808 
2809 	/*
2810 	 * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
2811 	 * value (max nodesize / min sectorsize), thus nodesize should always
2812 	 * be fine.
2813 	 */
2814 	ASSERT(fs_info->nodesize <=
2815 	       SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);
2816 
2817 	/* Allocate outside of device_list_mutex */
2818 	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
2819 	if (IS_ERR(sctx))
2820 		return PTR_ERR(sctx);
2821 
2822 	ret = scrub_workers_get(fs_info, is_dev_replace);
2823 	if (ret)
2824 		goto out_free_ctx;
2825 
2826 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
2827 	dev = btrfs_find_device(fs_info->fs_devices, &args);
2828 	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
2829 		     !is_dev_replace)) {
2830 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2831 		ret = -ENODEV;
2832 		goto out;
2833 	}
2834 
2835 	if (!is_dev_replace && !readonly &&
2836 	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
2837 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2838 		btrfs_err_in_rcu(fs_info,
2839 			"scrub on devid %llu: filesystem on %s is not writable",
2840 				 devid, btrfs_dev_name(dev));
2841 		ret = -EROFS;
2842 		goto out;
2843 	}
2844 
2845 	mutex_lock(&fs_info->scrub_lock);
2846 	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
2847 	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
2848 		mutex_unlock(&fs_info->scrub_lock);
2849 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2850 		ret = -EIO;
2851 		goto out;
2852 	}
2853 
2854 	down_read(&fs_info->dev_replace.rwsem);
2855 	if (dev->scrub_ctx ||
2856 	    (!is_dev_replace &&
2857 	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
2858 		up_read(&fs_info->dev_replace.rwsem);
2859 		mutex_unlock(&fs_info->scrub_lock);
2860 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2861 		ret = -EINPROGRESS;
2862 		goto out;
2863 	}
2864 	up_read(&fs_info->dev_replace.rwsem);
2865 
2866 	sctx->readonly = readonly;
2867 	dev->scrub_ctx = sctx;
2868 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2869 
2870 	/*
2871 	 * checking @scrub_pause_req here, we can avoid
2872 	 * race between committing transaction and scrubbing.
2873 	 */
2874 	__scrub_blocked_if_needed(fs_info);
2875 	atomic_inc(&fs_info->scrubs_running);
2876 	mutex_unlock(&fs_info->scrub_lock);
2877 
2878 	/*
2879 	 * In order to avoid deadlock with reclaim when there is a transaction
2880 	 * trying to pause scrub, make sure we use GFP_NOFS for all the
2881 	 * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity()
2882 	 * invoked by our callees. The pausing request is done when the
2883 	 * transaction commit starts, and it blocks the transaction until scrub
2884 	 * is paused (done at specific points at scrub_stripe() or right above
2885 	 * before incrementing fs_info->scrubs_running).
2886 	 */
2887 	nofs_flag = memalloc_nofs_save();
2888 	if (!is_dev_replace) {
2889 		u64 old_super_errors;
2890 
2891 		spin_lock(&sctx->stat_lock);
2892 		old_super_errors = sctx->stat.super_errors;
2893 		spin_unlock(&sctx->stat_lock);
2894 
2895 		btrfs_info(fs_info, "scrub: started on devid %llu", devid);
2896 		/*
2897 		 * by holding device list mutex, we can
2898 		 * kick off writing super in log tree sync.
2899 		 */
2900 		mutex_lock(&fs_info->fs_devices->device_list_mutex);
2901 		ret = scrub_supers(sctx, dev);
2902 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2903 
2904 		spin_lock(&sctx->stat_lock);
2905 		/*
2906 		 * Super block errors found, but we can not commit transaction
2907 		 * at current context, since btrfs_commit_transaction() needs
2908 		 * to pause the current running scrub (hold by ourselves).
2909 		 */
2910 		if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
2911 			need_commit = true;
2912 		spin_unlock(&sctx->stat_lock);
2913 	}
2914 
2915 	if (!ret)
2916 		ret = scrub_enumerate_chunks(sctx, dev, start, end);
2917 	memalloc_nofs_restore(nofs_flag);
2918 
2919 	atomic_dec(&fs_info->scrubs_running);
2920 	wake_up(&fs_info->scrub_pause_wait);
2921 
2922 	if (progress)
2923 		memcpy(progress, &sctx->stat, sizeof(*progress));
2924 
2925 	if (!is_dev_replace)
2926 		btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
2927 			ret ? "not finished" : "finished", devid, ret);
2928 
2929 	mutex_lock(&fs_info->scrub_lock);
2930 	dev->scrub_ctx = NULL;
2931 	mutex_unlock(&fs_info->scrub_lock);
2932 
2933 	scrub_workers_put(fs_info);
2934 	scrub_put_ctx(sctx);
2935 
2936 	/*
2937 	 * We found some super block errors before, now try to force a
2938 	 * transaction commit, as scrub has finished.
2939 	 */
2940 	if (need_commit) {
2941 		struct btrfs_trans_handle *trans;
2942 
2943 		trans = btrfs_start_transaction(fs_info->tree_root, 0);
2944 		if (IS_ERR(trans)) {
2945 			ret = PTR_ERR(trans);
2946 			btrfs_err(fs_info,
2947 	"scrub: failed to start transaction to fix super block errors: %d", ret);
2948 			return ret;
2949 		}
2950 		ret = btrfs_commit_transaction(trans);
2951 		if (ret < 0)
2952 			btrfs_err(fs_info,
2953 	"scrub: failed to commit transaction to fix super block errors: %d", ret);
2954 	}
2955 	return ret;
2956 out:
2957 	scrub_workers_put(fs_info);
2958 out_free_ctx:
2959 	scrub_free_ctx(sctx);
2960 
2961 	return ret;
2962 }
2963 
2964 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
2965 {
2966 	mutex_lock(&fs_info->scrub_lock);
2967 	atomic_inc(&fs_info->scrub_pause_req);
2968 	while (atomic_read(&fs_info->scrubs_paused) !=
2969 	       atomic_read(&fs_info->scrubs_running)) {
2970 		mutex_unlock(&fs_info->scrub_lock);
2971 		wait_event(fs_info->scrub_pause_wait,
2972 			   atomic_read(&fs_info->scrubs_paused) ==
2973 			   atomic_read(&fs_info->scrubs_running));
2974 		mutex_lock(&fs_info->scrub_lock);
2975 	}
2976 	mutex_unlock(&fs_info->scrub_lock);
2977 }
2978 
/*
 * Drop one pause request and wake everything sleeping on
 * fs_info->scrub_pause_wait.
 *
 * Counterpart of btrfs_scrub_pause(), which increments scrub_pause_req.
 * No lock is taken here; the counter is an atomic.
 */
void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
{
	/* Decrement first so woken waiters observe the updated request count. */
	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
}
2984 
2985 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2986 {
2987 	mutex_lock(&fs_info->scrub_lock);
2988 	if (!atomic_read(&fs_info->scrubs_running)) {
2989 		mutex_unlock(&fs_info->scrub_lock);
2990 		return -ENOTCONN;
2991 	}
2992 
2993 	atomic_inc(&fs_info->scrub_cancel_req);
2994 	while (atomic_read(&fs_info->scrubs_running)) {
2995 		mutex_unlock(&fs_info->scrub_lock);
2996 		wait_event(fs_info->scrub_pause_wait,
2997 			   atomic_read(&fs_info->scrubs_running) == 0);
2998 		mutex_lock(&fs_info->scrub_lock);
2999 	}
3000 	atomic_dec(&fs_info->scrub_cancel_req);
3001 	mutex_unlock(&fs_info->scrub_lock);
3002 
3003 	return 0;
3004 }
3005 
3006 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
3007 {
3008 	struct btrfs_fs_info *fs_info = dev->fs_info;
3009 	struct scrub_ctx *sctx;
3010 
3011 	mutex_lock(&fs_info->scrub_lock);
3012 	sctx = dev->scrub_ctx;
3013 	if (!sctx) {
3014 		mutex_unlock(&fs_info->scrub_lock);
3015 		return -ENOTCONN;
3016 	}
3017 	atomic_inc(&sctx->cancel_req);
3018 	while (dev->scrub_ctx) {
3019 		mutex_unlock(&fs_info->scrub_lock);
3020 		wait_event(fs_info->scrub_pause_wait,
3021 			   dev->scrub_ctx == NULL);
3022 		mutex_lock(&fs_info->scrub_lock);
3023 	}
3024 	mutex_unlock(&fs_info->scrub_lock);
3025 
3026 	return 0;
3027 }
3028 
3029 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
3030 			 struct btrfs_scrub_progress *progress)
3031 {
3032 	struct btrfs_dev_lookup_args args = { .devid = devid };
3033 	struct btrfs_device *dev;
3034 	struct scrub_ctx *sctx = NULL;
3035 
3036 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
3037 	dev = btrfs_find_device(fs_info->fs_devices, &args);
3038 	if (dev)
3039 		sctx = dev->scrub_ctx;
3040 	if (sctx)
3041 		memcpy(progress, &sctx->stat, sizeof(*progress));
3042 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3043 
3044 	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3045 }
3046