xref: /openbmc/linux/fs/btrfs/scrub.c (revision f7777dcc)
1 /*
2  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 
19 #include <linux/blkdev.h>
20 #include <linux/ratelimit.h>
21 #include "ctree.h"
22 #include "volumes.h"
23 #include "disk-io.h"
24 #include "ordered-data.h"
25 #include "transaction.h"
26 #include "backref.h"
27 #include "extent_io.h"
28 #include "dev-replace.h"
29 #include "check-integrity.h"
30 #include "rcu-string.h"
31 #include "raid56.h"
32 
33 /*
34  * This is only the first step towards a full-featured scrub. It reads all
35  * extents and super blocks and verifies the checksums. In case a bad checksum
36  * is found or the extent cannot be read, good data will be written back if
37  * any can be found.
38  *
39  * Future enhancements:
40  *  - In case an unrepairable extent is encountered, track which files are
41  *    affected and report them
42  *  - track and record media errors, throw out bad devices
43  *  - add a mode to also read unallocated space
44  */
45 
/*
 * Forward declarations: both types are referenced (by pointer) from the
 * structures and prototypes below before their full definitions appear.
 */
struct scrub_block;
struct scrub_ctx;

/*
 * The following three values only influence the performance.
 * The last one configures the number of parallel and outstanding I/O
 * operations. The first two values configure an upper limit for the number
 * of (dynamically allocated) pages that are added to a bio.
 *
 * The sizes quoted in the trailing comments assume 4 KiB pages
 * (32 pages = 128k, 64 bios * 128k = 8MB).
 */
#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */

/*
 * The following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 * Values larger than BTRFS_STRIPE_LEN are not supported.
 * It also sizes the pagev[] array of struct scrub_block.
 */
#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
65 
/*
 * Bookkeeping for a single page handled by scrub: the page itself, where it
 * lives (device, logical and physical address) and its checksum state.
 */
struct scrub_page {
	struct scrub_block	*sblock;	/* block this page belongs to */
	struct page		*page;
	struct btrfs_device	*dev;
	u64			flags;  /* extent flags */
	u64			generation;
	u64			logical;
	u64			physical;
	u64			physical_for_dev_replace;
	atomic_t		ref_count;
	struct {
		/* 1-based; scrub_handle_errored_block() BUG()s on values < 1 */
		unsigned int	mirror_num:8;
		/* presumably: csum[] below holds a valid checksum — TODO confirm */
		unsigned int	have_csum:1;
		unsigned int	io_error:1;
	};
	u8			csum[BTRFS_CSUM_SIZE];
};
83 
/*
 * Per-bio bookkeeping. scrub_setup_ctx() allocates SCRUB_BIOS_PER_SCTX of
 * these into scrub_ctx::bios[] and chains them through next_free.
 */
struct scrub_bio {
	int			index;		/* slot of this entry in scrub_ctx::bios[] */
	struct scrub_ctx	*sctx;		/* owning scrub context */
	struct btrfs_device	*dev;
	struct bio		*bio;
	int			err;
	u64			logical;
	u64			physical;
	/* pagev[] is sized for the larger of the read and write page limits */
#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
#else
	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
#endif
	int			page_count;	/* number of used entries in pagev[] */
	int			next_free;	/* index of next free bio, -1 ends the chain */
	struct btrfs_work	work;		/* scrub_setup_ctx() points this at scrub_bio_end_io_worker() */
};
101 
/*
 * A group of up to SCRUB_MAX_PAGES_PER_BLOCK scrub_pages that is examined
 * together; the bit-fields record which kind of problem was seen on it.
 */
struct scrub_block {
	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
	int			page_count;	/* number of used entries in pagev[] */
	atomic_t		outstanding_pages;
	atomic_t		ref_count; /* free mem on transition to zero */
	struct scrub_ctx	*sctx;		/* owning scrub context */
	struct {
		unsigned int	header_error:1;
		unsigned int	checksum_error:1;
		unsigned int	no_io_error_seen:1;
		unsigned int	generation_error:1; /* also sets header_error */
	};
};
115 
/*
 * Write-side state, embedded in struct scrub_ctx as wr_ctx. It is set up by
 * scrub_setup_wr_ctx() and released by scrub_free_wr_ctx().
 */
struct scrub_wr_ctx {
	struct scrub_bio *wr_curr_bio;
	/* presumably the dev-replace target device — TODO confirm in scrub_setup_wr_ctx() */
	struct btrfs_device *tgtdev;
	int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
	atomic_t flush_all_writes;
	struct mutex wr_lock;
};
123 
/*
 * State of one scrub (or dev-replace) run. Created by scrub_setup_ctx() and
 * torn down by scrub_free_ctx().
 */
struct scrub_ctx {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
	struct btrfs_root	*dev_root;
	int			first_free;	/* head of the free chain built from scrub_bio::next_free */
	int			curr;		/* index into bios[] of the current bio, -1 if none */
	atomic_t		bios_in_flight;	/* see scrub_pending_bio_inc()/_dec() */
	atomic_t		workers_pending; /* see scrub_pending_trans_workers_inc()/_dec() */
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;	/* woken whenever one of the two counters above drops */
	u16			csum_size;
	struct list_head	csum_list;	/* btrfs_ordered_sum entries, freed by scrub_free_csums() */
	atomic_t		cancel_req;
	int			readonly;	/* if set (and not dev-replace), errors are not repaired */
	int			pages_per_rd_bio;
	u32			sectorsize;
	u32			nodesize;
	u32			leafsize;

	int			is_dev_replace;
	struct scrub_wr_ctx	wr_ctx;

	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;	/* taken around every update of stat */
};
151 
/*
 * Work item context for the scrub_fixup_nodatasum() worker. It is allocated
 * and queued by scrub_handle_errored_block() and freed by the worker itself.
 */
struct scrub_fixup_nodatasum {
	struct scrub_ctx	*sctx;
	struct btrfs_device	*dev;		/* device named in the failure message */
	u64			logical;	/* logical address of the failed data */
	struct btrfs_root	*root;		/* root used to join a transaction */
	struct btrfs_work	work;
	int			mirror_num;	/* 1-based number of the failed mirror */
};
160 
/*
 * List element identifying one inode by (root, inum) plus a file offset.
 * NOTE(review): presumably linked into scrub_copy_nocow_ctx::inodes; the code
 * that fills and consumes the list is outside this chunk — confirm.
 */
struct scrub_nocow_inode {
	u64			inum;
	u64			offset;
	u64			root;
	struct list_head	list;
};
167 
/*
 * Work item context for copying a nocow range.
 * NOTE(review): presumably consumed by copy_nocow_pages_worker(), whose
 * definition is outside this chunk — confirm.
 */
struct scrub_copy_nocow_ctx {
	struct scrub_ctx	*sctx;
	u64			logical;
	u64			len;
	int			mirror_num;
	u64			physical_for_dev_replace;
	struct list_head	inodes;
	struct btrfs_work	work;
};
177 
/*
 * Context filled in by scrub_print_warning() and handed to
 * scrub_print_warning_inode() for every inode that references a bad extent.
 */
struct scrub_warning {
	struct btrfs_path	*path;		/* scratch path reused for the lookups */
	u64			extent_item_size;
	char			*scratch_buf;	/* allocated and freed by scrub_print_warning() */
	char			*msg_buf;	/* allocated and freed by scrub_print_warning() */
	const char		*errstr;	/* short description of the error, printed first */
	sector_t		sector;		/* physical byte address >> 9 */
	u64			logical;
	struct btrfs_device	*dev;
	int			msg_bufsize;
	int			scratch_bufsize;
};
190 
191 
/*
 * Forward declarations of the file-local helpers, so that the definitions
 * below may reference each other regardless of their order in the file.
 */
static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
				     struct btrfs_fs_info *fs_info,
				     struct scrub_block *original_sblock,
				     u64 length, u64 logical,
				     struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock, int is_metadata,
				int have_csum, u8 *csum, u64 generation,
				u16 csum_size);
static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
					 struct scrub_block *sblock,
					 int is_metadata, int have_csum,
					 const u8 *csum, u64 generation,
					 u16 csum_size);
static void scrub_complete_bio_end_io(struct bio *bio, int err);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good,
					     int force_write);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
					    struct scrub_block *sblock_good,
					    int page_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
					   int page_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_page_get(struct scrub_page *spage);
static void scrub_page_put(struct scrub_page *spage);
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
		       u64 physical, struct btrfs_device *dev, u64 flags,
		       u64 gen, int mirror_num, u8 *csum, int force,
		       u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio, int err);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
			       u64 extent_logical, u64 extent_len,
			       u64 *extent_physical,
			       struct btrfs_device **extent_dev,
			       int *extent_mirror_num);
static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
			      struct scrub_wr_ctx *wr_ctx,
			      struct btrfs_fs_info *fs_info,
			      struct btrfs_device *dev,
			      int is_dev_replace);
static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio, int err);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static int write_page_nocow(struct scrub_ctx *sctx,
			    u64 physical_for_dev_replace, struct page *page);
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
				      struct scrub_copy_nocow_ctx *ctx);
static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
			    int mirror_num, u64 physical_for_dev_replace);
static void copy_nocow_pages_worker(struct btrfs_work *work);
260 
261 
/* Account for one more bio in flight on @sctx. */
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
	atomic_inc(&sctx->bios_in_flight);
}
266 
/*
 * Account for one bio less in flight on @sctx, then wake up everybody
 * sleeping on sctx->list_wait so they can re-check the counter.
 */
static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
	atomic_dec(&sctx->bios_in_flight);
	wake_up(&sctx->list_wait);
}
272 
/*
 * Used for workers that require transaction commits (i.e., for the
 * NOCOW case).
 *
 * Registers one pending worker on @sctx and, under fs_info->scrub_lock,
 * bumps both the filesystem-wide scrubs_running and scrubs_paused counters.
 * Must be balanced by scrub_pending_trans_workers_dec().
 */
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

	/*
	 * increment scrubs_running to prevent cancel requests from
	 * completing as long as a worker is running. we must also
	 * increment scrubs_paused to prevent deadlocking on pause
	 * requests used for transactions commits (as the worker uses a
	 * transaction context). it is safe to regard the worker
	 * as paused for all matters practical. effectively, we only
	 * avoid cancellation requests from completing.
	 */
	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrubs_running);
	atomic_inc(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	/* per-context count, updated outside scrub_lock */
	atomic_inc(&sctx->workers_pending);
}
296 
/*
 * Used for workers that require transaction commits.
 *
 * Undoes scrub_pending_trans_workers_inc(): drops the two filesystem-wide
 * counters under fs_info->scrub_lock, drops the per-context worker count and
 * finally wakes up the waiters on both fs_info->scrub_pause_wait and
 * sctx->list_wait.
 */
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

	/*
	 * see scrub_pending_trans_workers_inc() why we're pretending
	 * to be paused in the scrub counters
	 */
	mutex_lock(&fs_info->scrub_lock);
	atomic_dec(&fs_info->scrubs_running);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	atomic_dec(&sctx->workers_pending);
	/* the wake-ups come last so that waiters observe the updated counters */
	wake_up(&fs_info->scrub_pause_wait);
	wake_up(&sctx->list_wait);
}
314 
315 static void scrub_free_csums(struct scrub_ctx *sctx)
316 {
317 	while (!list_empty(&sctx->csum_list)) {
318 		struct btrfs_ordered_sum *sum;
319 		sum = list_first_entry(&sctx->csum_list,
320 				       struct btrfs_ordered_sum, list);
321 		list_del(&sum->list);
322 		kfree(sum);
323 	}
324 }
325 
326 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
327 {
328 	int i;
329 
330 	if (!sctx)
331 		return;
332 
333 	scrub_free_wr_ctx(&sctx->wr_ctx);
334 
335 	/* this can happen when scrub is cancelled */
336 	if (sctx->curr != -1) {
337 		struct scrub_bio *sbio = sctx->bios[sctx->curr];
338 
339 		for (i = 0; i < sbio->page_count; i++) {
340 			WARN_ON(!sbio->pagev[i]->page);
341 			scrub_block_put(sbio->pagev[i]->sblock);
342 		}
343 		bio_put(sbio->bio);
344 	}
345 
346 	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
347 		struct scrub_bio *sbio = sctx->bios[i];
348 
349 		if (!sbio)
350 			break;
351 		kfree(sbio);
352 	}
353 
354 	scrub_free_csums(sctx);
355 	kfree(sctx);
356 }
357 
358 static noinline_for_stack
359 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
360 {
361 	struct scrub_ctx *sctx;
362 	int		i;
363 	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
364 	int pages_per_rd_bio;
365 	int ret;
366 
367 	/*
368 	 * the setting of pages_per_rd_bio is correct for scrub but might
369 	 * be wrong for the dev_replace code where we might read from
370 	 * different devices in the initial huge bios. However, that
371 	 * code is able to correctly handle the case when adding a page
372 	 * to a bio fails.
373 	 */
374 	if (dev->bdev)
375 		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
376 					 bio_get_nr_vecs(dev->bdev));
377 	else
378 		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
379 	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
380 	if (!sctx)
381 		goto nomem;
382 	sctx->is_dev_replace = is_dev_replace;
383 	sctx->pages_per_rd_bio = pages_per_rd_bio;
384 	sctx->curr = -1;
385 	sctx->dev_root = dev->dev_root;
386 	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
387 		struct scrub_bio *sbio;
388 
389 		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
390 		if (!sbio)
391 			goto nomem;
392 		sctx->bios[i] = sbio;
393 
394 		sbio->index = i;
395 		sbio->sctx = sctx;
396 		sbio->page_count = 0;
397 		sbio->work.func = scrub_bio_end_io_worker;
398 
399 		if (i != SCRUB_BIOS_PER_SCTX - 1)
400 			sctx->bios[i]->next_free = i + 1;
401 		else
402 			sctx->bios[i]->next_free = -1;
403 	}
404 	sctx->first_free = 0;
405 	sctx->nodesize = dev->dev_root->nodesize;
406 	sctx->leafsize = dev->dev_root->leafsize;
407 	sctx->sectorsize = dev->dev_root->sectorsize;
408 	atomic_set(&sctx->bios_in_flight, 0);
409 	atomic_set(&sctx->workers_pending, 0);
410 	atomic_set(&sctx->cancel_req, 0);
411 	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
412 	INIT_LIST_HEAD(&sctx->csum_list);
413 
414 	spin_lock_init(&sctx->list_lock);
415 	spin_lock_init(&sctx->stat_lock);
416 	init_waitqueue_head(&sctx->list_wait);
417 
418 	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
419 				 fs_info->dev_replace.tgtdev, is_dev_replace);
420 	if (ret) {
421 		scrub_free_ctx(sctx);
422 		return ERR_PTR(ret);
423 	}
424 	return sctx;
425 
426 nomem:
427 	scrub_free_ctx(sctx);
428 	return ERR_PTR(-ENOMEM);
429 }
430 
431 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
432 				     void *warn_ctx)
433 {
434 	u64 isize;
435 	u32 nlink;
436 	int ret;
437 	int i;
438 	struct extent_buffer *eb;
439 	struct btrfs_inode_item *inode_item;
440 	struct scrub_warning *swarn = warn_ctx;
441 	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
442 	struct inode_fs_paths *ipath = NULL;
443 	struct btrfs_root *local_root;
444 	struct btrfs_key root_key;
445 
446 	root_key.objectid = root;
447 	root_key.type = BTRFS_ROOT_ITEM_KEY;
448 	root_key.offset = (u64)-1;
449 	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
450 	if (IS_ERR(local_root)) {
451 		ret = PTR_ERR(local_root);
452 		goto err;
453 	}
454 
455 	ret = inode_item_info(inum, 0, local_root, swarn->path);
456 	if (ret) {
457 		btrfs_release_path(swarn->path);
458 		goto err;
459 	}
460 
461 	eb = swarn->path->nodes[0];
462 	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
463 					struct btrfs_inode_item);
464 	isize = btrfs_inode_size(eb, inode_item);
465 	nlink = btrfs_inode_nlink(eb, inode_item);
466 	btrfs_release_path(swarn->path);
467 
468 	ipath = init_ipath(4096, local_root, swarn->path);
469 	if (IS_ERR(ipath)) {
470 		ret = PTR_ERR(ipath);
471 		ipath = NULL;
472 		goto err;
473 	}
474 	ret = paths_from_inode(inum, ipath);
475 
476 	if (ret < 0)
477 		goto err;
478 
479 	/*
480 	 * we deliberately ignore the bit ipath might have been too small to
481 	 * hold all of the paths here
482 	 */
483 	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
484 		printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
485 			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
486 			"length %llu, links %u (path: %s)\n", swarn->errstr,
487 			swarn->logical, rcu_str_deref(swarn->dev->name),
488 			(unsigned long long)swarn->sector, root, inum, offset,
489 			min(isize - offset, (u64)PAGE_SIZE), nlink,
490 			(char *)(unsigned long)ipath->fspath->val[i]);
491 
492 	free_ipath(ipath);
493 	return 0;
494 
495 err:
496 	printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
497 		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
498 		"resolving failed with ret=%d\n", swarn->errstr,
499 		swarn->logical, rcu_str_deref(swarn->dev->name),
500 		(unsigned long long)swarn->sector, root, inum, offset, ret);
501 
502 	free_ipath(ipath);
503 	return 0;
504 }
505 
506 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
507 {
508 	struct btrfs_device *dev;
509 	struct btrfs_fs_info *fs_info;
510 	struct btrfs_path *path;
511 	struct btrfs_key found_key;
512 	struct extent_buffer *eb;
513 	struct btrfs_extent_item *ei;
514 	struct scrub_warning swarn;
515 	unsigned long ptr = 0;
516 	u64 extent_item_pos;
517 	u64 flags = 0;
518 	u64 ref_root;
519 	u32 item_size;
520 	u8 ref_level;
521 	const int bufsize = 4096;
522 	int ret;
523 
524 	WARN_ON(sblock->page_count < 1);
525 	dev = sblock->pagev[0]->dev;
526 	fs_info = sblock->sctx->dev_root->fs_info;
527 
528 	path = btrfs_alloc_path();
529 
530 	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
531 	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
532 	swarn.sector = (sblock->pagev[0]->physical) >> 9;
533 	swarn.logical = sblock->pagev[0]->logical;
534 	swarn.errstr = errstr;
535 	swarn.dev = NULL;
536 	swarn.msg_bufsize = bufsize;
537 	swarn.scratch_bufsize = bufsize;
538 
539 	if (!path || !swarn.scratch_buf || !swarn.msg_buf)
540 		goto out;
541 
542 	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
543 				  &flags);
544 	if (ret < 0)
545 		goto out;
546 
547 	extent_item_pos = swarn.logical - found_key.objectid;
548 	swarn.extent_item_size = found_key.offset;
549 
550 	eb = path->nodes[0];
551 	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
552 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
553 
554 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
555 		do {
556 			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
557 							&ref_root, &ref_level);
558 			printk_in_rcu(KERN_WARNING
559 				"btrfs: %s at logical %llu on dev %s, "
560 				"sector %llu: metadata %s (level %d) in tree "
561 				"%llu\n", errstr, swarn.logical,
562 				rcu_str_deref(dev->name),
563 				(unsigned long long)swarn.sector,
564 				ref_level ? "node" : "leaf",
565 				ret < 0 ? -1 : ref_level,
566 				ret < 0 ? -1 : ref_root);
567 		} while (ret != 1);
568 		btrfs_release_path(path);
569 	} else {
570 		btrfs_release_path(path);
571 		swarn.path = path;
572 		swarn.dev = dev;
573 		iterate_extent_inodes(fs_info, found_key.objectid,
574 					extent_item_pos, 1,
575 					scrub_print_warning_inode, &swarn);
576 	}
577 
578 out:
579 	btrfs_free_path(path);
580 	kfree(swarn.scratch_buf);
581 	kfree(swarn.msg_buf);
582 }
583 
584 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
585 {
586 	struct page *page = NULL;
587 	unsigned long index;
588 	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
589 	int ret;
590 	int corrected = 0;
591 	struct btrfs_key key;
592 	struct inode *inode = NULL;
593 	struct btrfs_fs_info *fs_info;
594 	u64 end = offset + PAGE_SIZE - 1;
595 	struct btrfs_root *local_root;
596 	int srcu_index;
597 
598 	key.objectid = root;
599 	key.type = BTRFS_ROOT_ITEM_KEY;
600 	key.offset = (u64)-1;
601 
602 	fs_info = fixup->root->fs_info;
603 	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
604 
605 	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
606 	if (IS_ERR(local_root)) {
607 		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
608 		return PTR_ERR(local_root);
609 	}
610 
611 	key.type = BTRFS_INODE_ITEM_KEY;
612 	key.objectid = inum;
613 	key.offset = 0;
614 	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
615 	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
616 	if (IS_ERR(inode))
617 		return PTR_ERR(inode);
618 
619 	index = offset >> PAGE_CACHE_SHIFT;
620 
621 	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
622 	if (!page) {
623 		ret = -ENOMEM;
624 		goto out;
625 	}
626 
627 	if (PageUptodate(page)) {
628 		if (PageDirty(page)) {
629 			/*
630 			 * we need to write the data to the defect sector. the
631 			 * data that was in that sector is not in memory,
632 			 * because the page was modified. we must not write the
633 			 * modified page to that sector.
634 			 *
635 			 * TODO: what could be done here: wait for the delalloc
636 			 *       runner to write out that page (might involve
637 			 *       COW) and see whether the sector is still
638 			 *       referenced afterwards.
639 			 *
640 			 * For the meantime, we'll treat this error
641 			 * incorrectable, although there is a chance that a
642 			 * later scrub will find the bad sector again and that
643 			 * there's no dirty page in memory, then.
644 			 */
645 			ret = -EIO;
646 			goto out;
647 		}
648 		fs_info = BTRFS_I(inode)->root->fs_info;
649 		ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
650 					fixup->logical, page,
651 					fixup->mirror_num);
652 		unlock_page(page);
653 		corrected = !ret;
654 	} else {
655 		/*
656 		 * we need to get good data first. the general readpage path
657 		 * will call repair_io_failure for us, we just have to make
658 		 * sure we read the bad mirror.
659 		 */
660 		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
661 					EXTENT_DAMAGED, GFP_NOFS);
662 		if (ret) {
663 			/* set_extent_bits should give proper error */
664 			WARN_ON(ret > 0);
665 			if (ret > 0)
666 				ret = -EFAULT;
667 			goto out;
668 		}
669 
670 		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
671 						btrfs_get_extent,
672 						fixup->mirror_num);
673 		wait_on_page_locked(page);
674 
675 		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
676 						end, EXTENT_DAMAGED, 0, NULL);
677 		if (!corrected)
678 			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
679 						EXTENT_DAMAGED, GFP_NOFS);
680 	}
681 
682 out:
683 	if (page)
684 		put_page(page);
685 	if (inode)
686 		iput(inode);
687 
688 	if (ret < 0)
689 		return ret;
690 
691 	if (ret == 0 && corrected) {
692 		/*
693 		 * we only need to call readpage for one of the inodes belonging
694 		 * to this extent. so make iterate_extent_inodes stop
695 		 */
696 		return 1;
697 	}
698 
699 	return -EIO;
700 }
701 
702 static void scrub_fixup_nodatasum(struct btrfs_work *work)
703 {
704 	int ret;
705 	struct scrub_fixup_nodatasum *fixup;
706 	struct scrub_ctx *sctx;
707 	struct btrfs_trans_handle *trans = NULL;
708 	struct btrfs_fs_info *fs_info;
709 	struct btrfs_path *path;
710 	int uncorrectable = 0;
711 
712 	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
713 	sctx = fixup->sctx;
714 	fs_info = fixup->root->fs_info;
715 
716 	path = btrfs_alloc_path();
717 	if (!path) {
718 		spin_lock(&sctx->stat_lock);
719 		++sctx->stat.malloc_errors;
720 		spin_unlock(&sctx->stat_lock);
721 		uncorrectable = 1;
722 		goto out;
723 	}
724 
725 	trans = btrfs_join_transaction(fixup->root);
726 	if (IS_ERR(trans)) {
727 		uncorrectable = 1;
728 		goto out;
729 	}
730 
731 	/*
732 	 * the idea is to trigger a regular read through the standard path. we
733 	 * read a page from the (failed) logical address by specifying the
734 	 * corresponding copynum of the failed sector. thus, that readpage is
735 	 * expected to fail.
736 	 * that is the point where on-the-fly error correction will kick in
737 	 * (once it's finished) and rewrite the failed sector if a good copy
738 	 * can be found.
739 	 */
740 	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
741 						path, scrub_fixup_readpage,
742 						fixup);
743 	if (ret < 0) {
744 		uncorrectable = 1;
745 		goto out;
746 	}
747 	WARN_ON(ret != 1);
748 
749 	spin_lock(&sctx->stat_lock);
750 	++sctx->stat.corrected_errors;
751 	spin_unlock(&sctx->stat_lock);
752 
753 out:
754 	if (trans && !IS_ERR(trans))
755 		btrfs_end_transaction(trans, fixup->root);
756 	if (uncorrectable) {
757 		spin_lock(&sctx->stat_lock);
758 		++sctx->stat.uncorrectable_errors;
759 		spin_unlock(&sctx->stat_lock);
760 		btrfs_dev_replace_stats_inc(
761 			&sctx->dev_root->fs_info->dev_replace.
762 			num_uncorrectable_read_errors);
763 		printk_ratelimited_in_rcu(KERN_ERR
764 			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
765 			fixup->logical, rcu_str_deref(fixup->dev->name));
766 	}
767 
768 	btrfs_free_path(path);
769 	kfree(fixup);
770 
771 	scrub_pending_trans_workers_dec(sctx);
772 }
773 
774 /*
775  * scrub_handle_errored_block gets called when either verification of the
776  * pages failed or the bio failed to read, e.g. with EIO. In the latter
777  * case, this function handles all pages in the bio, even though only one
778  * may be bad.
779  * The goal of this function is to repair the errored block by using the
780  * contents of one of the mirrors.
781  */
782 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
783 {
784 	struct scrub_ctx *sctx = sblock_to_check->sctx;
785 	struct btrfs_device *dev;
786 	struct btrfs_fs_info *fs_info;
787 	u64 length;
788 	u64 logical;
789 	u64 generation;
790 	unsigned int failed_mirror_index;
791 	unsigned int is_metadata;
792 	unsigned int have_csum;
793 	u8 *csum;
794 	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
795 	struct scrub_block *sblock_bad;
796 	int ret;
797 	int mirror_index;
798 	int page_num;
799 	int success;
800 	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
801 				      DEFAULT_RATELIMIT_BURST);
802 
803 	BUG_ON(sblock_to_check->page_count < 1);
804 	fs_info = sctx->dev_root->fs_info;
805 	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
806 		/*
807 		 * if we find an error in a super block, we just report it.
808 		 * They will get written with the next transaction commit
809 		 * anyway
810 		 */
811 		spin_lock(&sctx->stat_lock);
812 		++sctx->stat.super_errors;
813 		spin_unlock(&sctx->stat_lock);
814 		return 0;
815 	}
816 	length = sblock_to_check->page_count * PAGE_SIZE;
817 	logical = sblock_to_check->pagev[0]->logical;
818 	generation = sblock_to_check->pagev[0]->generation;
819 	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
820 	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
821 	is_metadata = !(sblock_to_check->pagev[0]->flags &
822 			BTRFS_EXTENT_FLAG_DATA);
823 	have_csum = sblock_to_check->pagev[0]->have_csum;
824 	csum = sblock_to_check->pagev[0]->csum;
825 	dev = sblock_to_check->pagev[0]->dev;
826 
827 	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
828 		sblocks_for_recheck = NULL;
829 		goto nodatasum_case;
830 	}
831 
832 	/*
833 	 * read all mirrors one after the other. This includes to
834 	 * re-read the extent or metadata block that failed (that was
835 	 * the cause that this fixup code is called) another time,
836 	 * page by page this time in order to know which pages
837 	 * caused I/O errors and which ones are good (for all mirrors).
838 	 * It is the goal to handle the situation when more than one
839 	 * mirror contains I/O errors, but the errors do not
840 	 * overlap, i.e. the data can be repaired by selecting the
841 	 * pages from those mirrors without I/O error on the
842 	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
843 	 * would be that mirror #1 has an I/O error on the first page,
844 	 * the second page is good, and mirror #2 has an I/O error on
845 	 * the second page, but the first page is good.
846 	 * Then the first page of the first mirror can be repaired by
847 	 * taking the first page of the second mirror, and the
848 	 * second page of the second mirror can be repaired by
849 	 * copying the contents of the 2nd page of the 1st mirror.
850 	 * One more note: if the pages of one mirror contain I/O
851 	 * errors, the checksum cannot be verified. In order to get
852 	 * the best data for repairing, the first attempt is to find
853 	 * a mirror without I/O errors and with a validated checksum.
854 	 * Only if this is not possible, the pages are picked from
855 	 * mirrors with I/O errors without considering the checksum.
856 	 * If the latter is the case, at the end, the checksum of the
857 	 * repaired area is verified in order to correctly maintain
858 	 * the statistics.
859 	 */
860 
861 	sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
862 				     sizeof(*sblocks_for_recheck),
863 				     GFP_NOFS);
864 	if (!sblocks_for_recheck) {
865 		spin_lock(&sctx->stat_lock);
866 		sctx->stat.malloc_errors++;
867 		sctx->stat.read_errors++;
868 		sctx->stat.uncorrectable_errors++;
869 		spin_unlock(&sctx->stat_lock);
870 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
871 		goto out;
872 	}
873 
874 	/* setup the context, map the logical blocks and alloc the pages */
875 	ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
876 					logical, sblocks_for_recheck);
877 	if (ret) {
878 		spin_lock(&sctx->stat_lock);
879 		sctx->stat.read_errors++;
880 		sctx->stat.uncorrectable_errors++;
881 		spin_unlock(&sctx->stat_lock);
882 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
883 		goto out;
884 	}
885 	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
886 	sblock_bad = sblocks_for_recheck + failed_mirror_index;
887 
888 	/* build and submit the bios for the failed mirror, check checksums */
889 	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
890 			    csum, generation, sctx->csum_size);
891 
892 	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
893 	    sblock_bad->no_io_error_seen) {
894 		/*
895 		 * the error disappeared after reading page by page, or
896 		 * the area was part of a huge bio and other parts of the
897 		 * bio caused I/O errors, or the block layer merged several
898 		 * read requests into one and the error is caused by a
899 		 * different bio (usually one of the two latter cases is
900 		 * the cause)
901 		 */
902 		spin_lock(&sctx->stat_lock);
903 		sctx->stat.unverified_errors++;
904 		spin_unlock(&sctx->stat_lock);
905 
906 		if (sctx->is_dev_replace)
907 			scrub_write_block_to_dev_replace(sblock_bad);
908 		goto out;
909 	}
910 
911 	if (!sblock_bad->no_io_error_seen) {
912 		spin_lock(&sctx->stat_lock);
913 		sctx->stat.read_errors++;
914 		spin_unlock(&sctx->stat_lock);
915 		if (__ratelimit(&_rs))
916 			scrub_print_warning("i/o error", sblock_to_check);
917 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
918 	} else if (sblock_bad->checksum_error) {
919 		spin_lock(&sctx->stat_lock);
920 		sctx->stat.csum_errors++;
921 		spin_unlock(&sctx->stat_lock);
922 		if (__ratelimit(&_rs))
923 			scrub_print_warning("checksum error", sblock_to_check);
924 		btrfs_dev_stat_inc_and_print(dev,
925 					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
926 	} else if (sblock_bad->header_error) {
927 		spin_lock(&sctx->stat_lock);
928 		sctx->stat.verify_errors++;
929 		spin_unlock(&sctx->stat_lock);
930 		if (__ratelimit(&_rs))
931 			scrub_print_warning("checksum/header error",
932 					    sblock_to_check);
933 		if (sblock_bad->generation_error)
934 			btrfs_dev_stat_inc_and_print(dev,
935 				BTRFS_DEV_STAT_GENERATION_ERRS);
936 		else
937 			btrfs_dev_stat_inc_and_print(dev,
938 				BTRFS_DEV_STAT_CORRUPTION_ERRS);
939 	}
940 
941 	if (sctx->readonly && !sctx->is_dev_replace)
942 		goto did_not_correct_error;
943 
944 	if (!is_metadata && !have_csum) {
945 		struct scrub_fixup_nodatasum *fixup_nodatasum;
946 
947 nodatasum_case:
948 		WARN_ON(sctx->is_dev_replace);
949 
950 		/*
951 		 * !is_metadata and !have_csum, this means that the data
952 		 * might not be COW'ed, that it might be modified
953 		 * concurrently. The general strategy to work on the
954 		 * commit root does not help in the case when COW is not
955 		 * used.
956 		 */
957 		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
958 		if (!fixup_nodatasum)
959 			goto did_not_correct_error;
960 		fixup_nodatasum->sctx = sctx;
961 		fixup_nodatasum->dev = dev;
962 		fixup_nodatasum->logical = logical;
963 		fixup_nodatasum->root = fs_info->extent_root;
964 		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
965 		scrub_pending_trans_workers_inc(sctx);
966 		fixup_nodatasum->work.func = scrub_fixup_nodatasum;
967 		btrfs_queue_worker(&fs_info->scrub_workers,
968 				   &fixup_nodatasum->work);
969 		goto out;
970 	}
971 
972 	/*
973 	 * now build and submit the bios for the other mirrors, check
974 	 * checksums.
975 	 * First try to pick the mirror which is completely without I/O
976 	 * errors and also does not have a checksum error.
977 	 * If one is found, and if a checksum is present, the full block
978 	 * that is known to contain an error is rewritten. Afterwards
979 	 * the block is known to be corrected.
980 	 * If a mirror is found which is completely correct, and no
981 	 * checksum is present, only those pages are rewritten that had
982 	 * an I/O error in the block to be repaired, since it cannot be
983 	 * determined, which copy of the other pages is better (and it
984 	 * could happen otherwise that a correct page would be
985 	 * overwritten by a bad one).
986 	 */
987 	for (mirror_index = 0;
988 	     mirror_index < BTRFS_MAX_MIRRORS &&
989 	     sblocks_for_recheck[mirror_index].page_count > 0;
990 	     mirror_index++) {
991 		struct scrub_block *sblock_other;
992 
993 		if (mirror_index == failed_mirror_index)
994 			continue;
995 		sblock_other = sblocks_for_recheck + mirror_index;
996 
997 		/* build and submit the bios, check checksums */
998 		scrub_recheck_block(fs_info, sblock_other, is_metadata,
999 				    have_csum, csum, generation,
1000 				    sctx->csum_size);
1001 
1002 		if (!sblock_other->header_error &&
1003 		    !sblock_other->checksum_error &&
1004 		    sblock_other->no_io_error_seen) {
1005 			if (sctx->is_dev_replace) {
1006 				scrub_write_block_to_dev_replace(sblock_other);
1007 			} else {
1008 				int force_write = is_metadata || have_csum;
1009 
1010 				ret = scrub_repair_block_from_good_copy(
1011 						sblock_bad, sblock_other,
1012 						force_write);
1013 			}
1014 			if (0 == ret)
1015 				goto corrected_error;
1016 		}
1017 	}
1018 
1019 	/*
1020 	 * for dev_replace, pick good pages and write to the target device.
1021 	 */
1022 	if (sctx->is_dev_replace) {
1023 		success = 1;
1024 		for (page_num = 0; page_num < sblock_bad->page_count;
1025 		     page_num++) {
1026 			int sub_success;
1027 
1028 			sub_success = 0;
1029 			for (mirror_index = 0;
1030 			     mirror_index < BTRFS_MAX_MIRRORS &&
1031 			     sblocks_for_recheck[mirror_index].page_count > 0;
1032 			     mirror_index++) {
1033 				struct scrub_block *sblock_other =
1034 					sblocks_for_recheck + mirror_index;
1035 				struct scrub_page *page_other =
1036 					sblock_other->pagev[page_num];
1037 
1038 				if (!page_other->io_error) {
1039 					ret = scrub_write_page_to_dev_replace(
1040 							sblock_other, page_num);
1041 					if (ret == 0) {
1042 						/* succeeded for this page */
1043 						sub_success = 1;
1044 						break;
1045 					} else {
1046 						btrfs_dev_replace_stats_inc(
1047 							&sctx->dev_root->
1048 							fs_info->dev_replace.
1049 							num_write_errors);
1050 					}
1051 				}
1052 			}
1053 
1054 			if (!sub_success) {
1055 				/*
1056 				 * did not find a mirror to fetch the page
1057 				 * from. scrub_write_page_to_dev_replace()
1058 				 * handles this case (page->io_error), by
1059 				 * filling the block with zeros before
1060 				 * submitting the write request
1061 				 */
1062 				success = 0;
1063 				ret = scrub_write_page_to_dev_replace(
1064 						sblock_bad, page_num);
1065 				if (ret)
1066 					btrfs_dev_replace_stats_inc(
1067 						&sctx->dev_root->fs_info->
1068 						dev_replace.num_write_errors);
1069 			}
1070 		}
1071 
1072 		goto out;
1073 	}
1074 
1075 	/*
1076 	 * for regular scrub, repair those pages that are errored.
1077 	 * In case of I/O errors in the area that is supposed to be
1078 	 * repaired, continue by picking good copies of those pages.
1079 	 * Select the good pages from mirrors to rewrite bad pages from
1080 	 * the area to fix. Afterwards verify the checksum of the block
1081 	 * that is supposed to be repaired. This verification step is
1082 	 * only done for the purpose of statistic counting and for the
1083 	 * final scrub report, whether errors remain.
1084 	 * A perfect algorithm could make use of the checksum and try
1085 	 * all possible combinations of pages from the different mirrors
1086 	 * until the checksum verification succeeds. For example, when
1087 	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1088 	 * of mirror #2 is readable but the final checksum test fails,
1089 	 * then the 2nd page of mirror #3 could be tried, whether now
1090 	 * the final checksum succeedes. But this would be a rare
1091 	 * exception and is therefore not implemented. At least it is
1092 	 * avoided that the good copy is overwritten.
1093 	 * A more useful improvement would be to pick the sectors
1094 	 * without I/O error based on sector sizes (512 bytes on legacy
1095 	 * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
1096 	 * mirror could be repaired by taking 512 byte of a different
1097 	 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1098 	 * area are unreadable.
1099 	 */
1100 
1101 	/* can only fix I/O errors from here on */
1102 	if (sblock_bad->no_io_error_seen)
1103 		goto did_not_correct_error;
1104 
1105 	success = 1;
1106 	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1107 		struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1108 
1109 		if (!page_bad->io_error)
1110 			continue;
1111 
1112 		for (mirror_index = 0;
1113 		     mirror_index < BTRFS_MAX_MIRRORS &&
1114 		     sblocks_for_recheck[mirror_index].page_count > 0;
1115 		     mirror_index++) {
1116 			struct scrub_block *sblock_other = sblocks_for_recheck +
1117 							   mirror_index;
1118 			struct scrub_page *page_other = sblock_other->pagev[
1119 							page_num];
1120 
1121 			if (!page_other->io_error) {
1122 				ret = scrub_repair_page_from_good_copy(
1123 					sblock_bad, sblock_other, page_num, 0);
1124 				if (0 == ret) {
1125 					page_bad->io_error = 0;
1126 					break; /* succeeded for this page */
1127 				}
1128 			}
1129 		}
1130 
1131 		if (page_bad->io_error) {
1132 			/* did not find a mirror to copy the page from */
1133 			success = 0;
1134 		}
1135 	}
1136 
1137 	if (success) {
1138 		if (is_metadata || have_csum) {
1139 			/*
1140 			 * need to verify the checksum now that all
1141 			 * sectors on disk are repaired (the write
1142 			 * request for data to be repaired is on its way).
1143 			 * Just be lazy and use scrub_recheck_block()
1144 			 * which re-reads the data before the checksum
1145 			 * is verified, but most likely the data comes out
1146 			 * of the page cache.
1147 			 */
1148 			scrub_recheck_block(fs_info, sblock_bad,
1149 					    is_metadata, have_csum, csum,
1150 					    generation, sctx->csum_size);
1151 			if (!sblock_bad->header_error &&
1152 			    !sblock_bad->checksum_error &&
1153 			    sblock_bad->no_io_error_seen)
1154 				goto corrected_error;
1155 			else
1156 				goto did_not_correct_error;
1157 		} else {
1158 corrected_error:
1159 			spin_lock(&sctx->stat_lock);
1160 			sctx->stat.corrected_errors++;
1161 			spin_unlock(&sctx->stat_lock);
1162 			printk_ratelimited_in_rcu(KERN_ERR
1163 				"btrfs: fixed up error at logical %llu on dev %s\n",
1164 				logical, rcu_str_deref(dev->name));
1165 		}
1166 	} else {
1167 did_not_correct_error:
1168 		spin_lock(&sctx->stat_lock);
1169 		sctx->stat.uncorrectable_errors++;
1170 		spin_unlock(&sctx->stat_lock);
1171 		printk_ratelimited_in_rcu(KERN_ERR
1172 			"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
1173 			logical, rcu_str_deref(dev->name));
1174 	}
1175 
1176 out:
1177 	if (sblocks_for_recheck) {
1178 		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1179 		     mirror_index++) {
1180 			struct scrub_block *sblock = sblocks_for_recheck +
1181 						     mirror_index;
1182 			int page_index;
1183 
1184 			for (page_index = 0; page_index < sblock->page_count;
1185 			     page_index++) {
1186 				sblock->pagev[page_index]->sblock = NULL;
1187 				scrub_page_put(sblock->pagev[page_index]);
1188 			}
1189 		}
1190 		kfree(sblocks_for_recheck);
1191 	}
1192 
1193 	return 0;
1194 }
1195 
1196 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1197 				     struct btrfs_fs_info *fs_info,
1198 				     struct scrub_block *original_sblock,
1199 				     u64 length, u64 logical,
1200 				     struct scrub_block *sblocks_for_recheck)
1201 {
1202 	int page_index;
1203 	int mirror_index;
1204 	int ret;
1205 
1206 	/*
1207 	 * note: the two members ref_count and outstanding_pages
1208 	 * are not used (and not set) in the blocks that are used for
1209 	 * the recheck procedure
1210 	 */
1211 
1212 	page_index = 0;
1213 	while (length > 0) {
1214 		u64 sublen = min_t(u64, length, PAGE_SIZE);
1215 		u64 mapped_length = sublen;
1216 		struct btrfs_bio *bbio = NULL;
1217 
1218 		/*
1219 		 * with a length of PAGE_SIZE, each returned stripe
1220 		 * represents one mirror
1221 		 */
1222 		ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
1223 				      &mapped_length, &bbio, 0);
1224 		if (ret || !bbio || mapped_length < sublen) {
1225 			kfree(bbio);
1226 			return -EIO;
1227 		}
1228 
1229 		BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1230 		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
1231 		     mirror_index++) {
1232 			struct scrub_block *sblock;
1233 			struct scrub_page *page;
1234 
1235 			if (mirror_index >= BTRFS_MAX_MIRRORS)
1236 				continue;
1237 
1238 			sblock = sblocks_for_recheck + mirror_index;
1239 			sblock->sctx = sctx;
1240 			page = kzalloc(sizeof(*page), GFP_NOFS);
1241 			if (!page) {
1242 leave_nomem:
1243 				spin_lock(&sctx->stat_lock);
1244 				sctx->stat.malloc_errors++;
1245 				spin_unlock(&sctx->stat_lock);
1246 				kfree(bbio);
1247 				return -ENOMEM;
1248 			}
1249 			scrub_page_get(page);
1250 			sblock->pagev[page_index] = page;
1251 			page->logical = logical;
1252 			page->physical = bbio->stripes[mirror_index].physical;
1253 			BUG_ON(page_index >= original_sblock->page_count);
1254 			page->physical_for_dev_replace =
1255 				original_sblock->pagev[page_index]->
1256 				physical_for_dev_replace;
1257 			/* for missing devices, dev->bdev is NULL */
1258 			page->dev = bbio->stripes[mirror_index].dev;
1259 			page->mirror_num = mirror_index + 1;
1260 			sblock->page_count++;
1261 			page->page = alloc_page(GFP_NOFS);
1262 			if (!page->page)
1263 				goto leave_nomem;
1264 		}
1265 		kfree(bbio);
1266 		length -= sublen;
1267 		logical += sublen;
1268 		page_index++;
1269 	}
1270 
1271 	return 0;
1272 }
1273 
1274 /*
1275  * this function will check the on disk data for checksum errors, header
1276  * errors and read I/O errors. If any I/O errors happen, the exact pages
1277  * which are errored are marked as being bad. The goal is to enable scrub
1278  * to take those pages that are not errored from all the mirrors so that
1279  * the pages that are errored in the just handled mirror can be repaired.
1280  */
1281 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1282 				struct scrub_block *sblock, int is_metadata,
1283 				int have_csum, u8 *csum, u64 generation,
1284 				u16 csum_size)
1285 {
1286 	int page_num;
1287 
1288 	sblock->no_io_error_seen = 1;
1289 	sblock->header_error = 0;
1290 	sblock->checksum_error = 0;
1291 
1292 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
1293 		struct bio *bio;
1294 		struct scrub_page *page = sblock->pagev[page_num];
1295 		DECLARE_COMPLETION_ONSTACK(complete);
1296 
1297 		if (page->dev->bdev == NULL) {
1298 			page->io_error = 1;
1299 			sblock->no_io_error_seen = 0;
1300 			continue;
1301 		}
1302 
1303 		WARN_ON(!page->page);
1304 		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1305 		if (!bio) {
1306 			page->io_error = 1;
1307 			sblock->no_io_error_seen = 0;
1308 			continue;
1309 		}
1310 		bio->bi_bdev = page->dev->bdev;
1311 		bio->bi_sector = page->physical >> 9;
1312 		bio->bi_end_io = scrub_complete_bio_end_io;
1313 		bio->bi_private = &complete;
1314 
1315 		bio_add_page(bio, page->page, PAGE_SIZE, 0);
1316 		btrfsic_submit_bio(READ, bio);
1317 
1318 		/* this will also unplug the queue */
1319 		wait_for_completion(&complete);
1320 
1321 		page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
1322 		if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1323 			sblock->no_io_error_seen = 0;
1324 		bio_put(bio);
1325 	}
1326 
1327 	if (sblock->no_io_error_seen)
1328 		scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1329 					     have_csum, csum, generation,
1330 					     csum_size);
1331 
1332 	return;
1333 }
1334 
1335 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1336 					 struct scrub_block *sblock,
1337 					 int is_metadata, int have_csum,
1338 					 const u8 *csum, u64 generation,
1339 					 u16 csum_size)
1340 {
1341 	int page_num;
1342 	u8 calculated_csum[BTRFS_CSUM_SIZE];
1343 	u32 crc = ~(u32)0;
1344 	void *mapped_buffer;
1345 
1346 	WARN_ON(!sblock->pagev[0]->page);
1347 	if (is_metadata) {
1348 		struct btrfs_header *h;
1349 
1350 		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1351 		h = (struct btrfs_header *)mapped_buffer;
1352 
1353 		if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
1354 		    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1355 		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1356 			   BTRFS_UUID_SIZE)) {
1357 			sblock->header_error = 1;
1358 		} else if (generation != btrfs_stack_header_generation(h)) {
1359 			sblock->header_error = 1;
1360 			sblock->generation_error = 1;
1361 		}
1362 		csum = h->csum;
1363 	} else {
1364 		if (!have_csum)
1365 			return;
1366 
1367 		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1368 	}
1369 
1370 	for (page_num = 0;;) {
1371 		if (page_num == 0 && is_metadata)
1372 			crc = btrfs_csum_data(
1373 				((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1374 				crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1375 		else
1376 			crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
1377 
1378 		kunmap_atomic(mapped_buffer);
1379 		page_num++;
1380 		if (page_num >= sblock->page_count)
1381 			break;
1382 		WARN_ON(!sblock->pagev[page_num]->page);
1383 
1384 		mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1385 	}
1386 
1387 	btrfs_csum_final(crc, calculated_csum);
1388 	if (memcmp(calculated_csum, csum, csum_size))
1389 		sblock->checksum_error = 1;
1390 }
1391 
1392 static void scrub_complete_bio_end_io(struct bio *bio, int err)
1393 {
1394 	complete((struct completion *)bio->bi_private);
1395 }
1396 
1397 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1398 					     struct scrub_block *sblock_good,
1399 					     int force_write)
1400 {
1401 	int page_num;
1402 	int ret = 0;
1403 
1404 	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1405 		int ret_sub;
1406 
1407 		ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1408 							   sblock_good,
1409 							   page_num,
1410 							   force_write);
1411 		if (ret_sub)
1412 			ret = ret_sub;
1413 	}
1414 
1415 	return ret;
1416 }
1417 
1418 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1419 					    struct scrub_block *sblock_good,
1420 					    int page_num, int force_write)
1421 {
1422 	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1423 	struct scrub_page *page_good = sblock_good->pagev[page_num];
1424 
1425 	BUG_ON(page_bad->page == NULL);
1426 	BUG_ON(page_good->page == NULL);
1427 	if (force_write || sblock_bad->header_error ||
1428 	    sblock_bad->checksum_error || page_bad->io_error) {
1429 		struct bio *bio;
1430 		int ret;
1431 		DECLARE_COMPLETION_ONSTACK(complete);
1432 
1433 		if (!page_bad->dev->bdev) {
1434 			printk_ratelimited(KERN_WARNING
1435 				"btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
1436 			return -EIO;
1437 		}
1438 
1439 		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1440 		if (!bio)
1441 			return -EIO;
1442 		bio->bi_bdev = page_bad->dev->bdev;
1443 		bio->bi_sector = page_bad->physical >> 9;
1444 		bio->bi_end_io = scrub_complete_bio_end_io;
1445 		bio->bi_private = &complete;
1446 
1447 		ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1448 		if (PAGE_SIZE != ret) {
1449 			bio_put(bio);
1450 			return -EIO;
1451 		}
1452 		btrfsic_submit_bio(WRITE, bio);
1453 
1454 		/* this will also unplug the queue */
1455 		wait_for_completion(&complete);
1456 		if (!bio_flagged(bio, BIO_UPTODATE)) {
1457 			btrfs_dev_stat_inc_and_print(page_bad->dev,
1458 				BTRFS_DEV_STAT_WRITE_ERRS);
1459 			btrfs_dev_replace_stats_inc(
1460 				&sblock_bad->sctx->dev_root->fs_info->
1461 				dev_replace.num_write_errors);
1462 			bio_put(bio);
1463 			return -EIO;
1464 		}
1465 		bio_put(bio);
1466 	}
1467 
1468 	return 0;
1469 }
1470 
1471 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1472 {
1473 	int page_num;
1474 
1475 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
1476 		int ret;
1477 
1478 		ret = scrub_write_page_to_dev_replace(sblock, page_num);
1479 		if (ret)
1480 			btrfs_dev_replace_stats_inc(
1481 				&sblock->sctx->dev_root->fs_info->dev_replace.
1482 				num_write_errors);
1483 	}
1484 }
1485 
1486 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1487 					   int page_num)
1488 {
1489 	struct scrub_page *spage = sblock->pagev[page_num];
1490 
1491 	BUG_ON(spage->page == NULL);
1492 	if (spage->io_error) {
1493 		void *mapped_buffer = kmap_atomic(spage->page);
1494 
1495 		memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1496 		flush_dcache_page(spage->page);
1497 		kunmap_atomic(mapped_buffer);
1498 	}
1499 	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1500 }
1501 
1502 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1503 				    struct scrub_page *spage)
1504 {
1505 	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1506 	struct scrub_bio *sbio;
1507 	int ret;
1508 
1509 	mutex_lock(&wr_ctx->wr_lock);
1510 again:
1511 	if (!wr_ctx->wr_curr_bio) {
1512 		wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1513 					      GFP_NOFS);
1514 		if (!wr_ctx->wr_curr_bio) {
1515 			mutex_unlock(&wr_ctx->wr_lock);
1516 			return -ENOMEM;
1517 		}
1518 		wr_ctx->wr_curr_bio->sctx = sctx;
1519 		wr_ctx->wr_curr_bio->page_count = 0;
1520 	}
1521 	sbio = wr_ctx->wr_curr_bio;
1522 	if (sbio->page_count == 0) {
1523 		struct bio *bio;
1524 
1525 		sbio->physical = spage->physical_for_dev_replace;
1526 		sbio->logical = spage->logical;
1527 		sbio->dev = wr_ctx->tgtdev;
1528 		bio = sbio->bio;
1529 		if (!bio) {
1530 			bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1531 			if (!bio) {
1532 				mutex_unlock(&wr_ctx->wr_lock);
1533 				return -ENOMEM;
1534 			}
1535 			sbio->bio = bio;
1536 		}
1537 
1538 		bio->bi_private = sbio;
1539 		bio->bi_end_io = scrub_wr_bio_end_io;
1540 		bio->bi_bdev = sbio->dev->bdev;
1541 		bio->bi_sector = sbio->physical >> 9;
1542 		sbio->err = 0;
1543 	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1544 		   spage->physical_for_dev_replace ||
1545 		   sbio->logical + sbio->page_count * PAGE_SIZE !=
1546 		   spage->logical) {
1547 		scrub_wr_submit(sctx);
1548 		goto again;
1549 	}
1550 
1551 	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1552 	if (ret != PAGE_SIZE) {
1553 		if (sbio->page_count < 1) {
1554 			bio_put(sbio->bio);
1555 			sbio->bio = NULL;
1556 			mutex_unlock(&wr_ctx->wr_lock);
1557 			return -EIO;
1558 		}
1559 		scrub_wr_submit(sctx);
1560 		goto again;
1561 	}
1562 
1563 	sbio->pagev[sbio->page_count] = spage;
1564 	scrub_page_get(spage);
1565 	sbio->page_count++;
1566 	if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1567 		scrub_wr_submit(sctx);
1568 	mutex_unlock(&wr_ctx->wr_lock);
1569 
1570 	return 0;
1571 }
1572 
1573 static void scrub_wr_submit(struct scrub_ctx *sctx)
1574 {
1575 	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1576 	struct scrub_bio *sbio;
1577 
1578 	if (!wr_ctx->wr_curr_bio)
1579 		return;
1580 
1581 	sbio = wr_ctx->wr_curr_bio;
1582 	wr_ctx->wr_curr_bio = NULL;
1583 	WARN_ON(!sbio->bio->bi_bdev);
1584 	scrub_pending_bio_inc(sctx);
1585 	/* process all writes in a single worker thread. Then the block layer
1586 	 * orders the requests before sending them to the driver which
1587 	 * doubled the write performance on spinning disks when measured
1588 	 * with Linux 3.5 */
1589 	btrfsic_submit_bio(WRITE, sbio->bio);
1590 }
1591 
1592 static void scrub_wr_bio_end_io(struct bio *bio, int err)
1593 {
1594 	struct scrub_bio *sbio = bio->bi_private;
1595 	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1596 
1597 	sbio->err = err;
1598 	sbio->bio = bio;
1599 
1600 	sbio->work.func = scrub_wr_bio_end_io_worker;
1601 	btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
1602 }
1603 
1604 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1605 {
1606 	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1607 	struct scrub_ctx *sctx = sbio->sctx;
1608 	int i;
1609 
1610 	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1611 	if (sbio->err) {
1612 		struct btrfs_dev_replace *dev_replace =
1613 			&sbio->sctx->dev_root->fs_info->dev_replace;
1614 
1615 		for (i = 0; i < sbio->page_count; i++) {
1616 			struct scrub_page *spage = sbio->pagev[i];
1617 
1618 			spage->io_error = 1;
1619 			btrfs_dev_replace_stats_inc(&dev_replace->
1620 						    num_write_errors);
1621 		}
1622 	}
1623 
1624 	for (i = 0; i < sbio->page_count; i++)
1625 		scrub_page_put(sbio->pagev[i]);
1626 
1627 	bio_put(sbio->bio);
1628 	kfree(sbio);
1629 	scrub_pending_bio_dec(sctx);
1630 }
1631 
1632 static int scrub_checksum(struct scrub_block *sblock)
1633 {
1634 	u64 flags;
1635 	int ret;
1636 
1637 	WARN_ON(sblock->page_count < 1);
1638 	flags = sblock->pagev[0]->flags;
1639 	ret = 0;
1640 	if (flags & BTRFS_EXTENT_FLAG_DATA)
1641 		ret = scrub_checksum_data(sblock);
1642 	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1643 		ret = scrub_checksum_tree_block(sblock);
1644 	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1645 		(void)scrub_checksum_super(sblock);
1646 	else
1647 		WARN_ON(1);
1648 	if (ret)
1649 		scrub_handle_errored_block(sblock);
1650 
1651 	return ret;
1652 }
1653 
1654 static int scrub_checksum_data(struct scrub_block *sblock)
1655 {
1656 	struct scrub_ctx *sctx = sblock->sctx;
1657 	u8 csum[BTRFS_CSUM_SIZE];
1658 	u8 *on_disk_csum;
1659 	struct page *page;
1660 	void *buffer;
1661 	u32 crc = ~(u32)0;
1662 	int fail = 0;
1663 	u64 len;
1664 	int index;
1665 
1666 	BUG_ON(sblock->page_count < 1);
1667 	if (!sblock->pagev[0]->have_csum)
1668 		return 0;
1669 
1670 	on_disk_csum = sblock->pagev[0]->csum;
1671 	page = sblock->pagev[0]->page;
1672 	buffer = kmap_atomic(page);
1673 
1674 	len = sctx->sectorsize;
1675 	index = 0;
1676 	for (;;) {
1677 		u64 l = min_t(u64, len, PAGE_SIZE);
1678 
1679 		crc = btrfs_csum_data(buffer, crc, l);
1680 		kunmap_atomic(buffer);
1681 		len -= l;
1682 		if (len == 0)
1683 			break;
1684 		index++;
1685 		BUG_ON(index >= sblock->page_count);
1686 		BUG_ON(!sblock->pagev[index]->page);
1687 		page = sblock->pagev[index]->page;
1688 		buffer = kmap_atomic(page);
1689 	}
1690 
1691 	btrfs_csum_final(crc, csum);
1692 	if (memcmp(csum, on_disk_csum, sctx->csum_size))
1693 		fail = 1;
1694 
1695 	return fail;
1696 }
1697 
1698 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1699 {
1700 	struct scrub_ctx *sctx = sblock->sctx;
1701 	struct btrfs_header *h;
1702 	struct btrfs_root *root = sctx->dev_root;
1703 	struct btrfs_fs_info *fs_info = root->fs_info;
1704 	u8 calculated_csum[BTRFS_CSUM_SIZE];
1705 	u8 on_disk_csum[BTRFS_CSUM_SIZE];
1706 	struct page *page;
1707 	void *mapped_buffer;
1708 	u64 mapped_size;
1709 	void *p;
1710 	u32 crc = ~(u32)0;
1711 	int fail = 0;
1712 	int crc_fail = 0;
1713 	u64 len;
1714 	int index;
1715 
1716 	BUG_ON(sblock->page_count < 1);
1717 	page = sblock->pagev[0]->page;
1718 	mapped_buffer = kmap_atomic(page);
1719 	h = (struct btrfs_header *)mapped_buffer;
1720 	memcpy(on_disk_csum, h->csum, sctx->csum_size);
1721 
1722 	/*
1723 	 * we don't use the getter functions here, as we
1724 	 * a) don't have an extent buffer and
1725 	 * b) the page is already kmapped
1726 	 */
1727 
1728 	if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
1729 		++fail;
1730 
1731 	if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
1732 		++fail;
1733 
1734 	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1735 		++fail;
1736 
1737 	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1738 		   BTRFS_UUID_SIZE))
1739 		++fail;
1740 
1741 	WARN_ON(sctx->nodesize != sctx->leafsize);
1742 	len = sctx->nodesize - BTRFS_CSUM_SIZE;
1743 	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1744 	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1745 	index = 0;
1746 	for (;;) {
1747 		u64 l = min_t(u64, len, mapped_size);
1748 
1749 		crc = btrfs_csum_data(p, crc, l);
1750 		kunmap_atomic(mapped_buffer);
1751 		len -= l;
1752 		if (len == 0)
1753 			break;
1754 		index++;
1755 		BUG_ON(index >= sblock->page_count);
1756 		BUG_ON(!sblock->pagev[index]->page);
1757 		page = sblock->pagev[index]->page;
1758 		mapped_buffer = kmap_atomic(page);
1759 		mapped_size = PAGE_SIZE;
1760 		p = mapped_buffer;
1761 	}
1762 
1763 	btrfs_csum_final(crc, calculated_csum);
1764 	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1765 		++crc_fail;
1766 
1767 	return fail || crc_fail;
1768 }
1769 
1770 static int scrub_checksum_super(struct scrub_block *sblock)
1771 {
1772 	struct btrfs_super_block *s;
1773 	struct scrub_ctx *sctx = sblock->sctx;
1774 	struct btrfs_root *root = sctx->dev_root;
1775 	struct btrfs_fs_info *fs_info = root->fs_info;
1776 	u8 calculated_csum[BTRFS_CSUM_SIZE];
1777 	u8 on_disk_csum[BTRFS_CSUM_SIZE];
1778 	struct page *page;
1779 	void *mapped_buffer;
1780 	u64 mapped_size;
1781 	void *p;
1782 	u32 crc = ~(u32)0;
1783 	int fail_gen = 0;
1784 	int fail_cor = 0;
1785 	u64 len;
1786 	int index;
1787 
1788 	BUG_ON(sblock->page_count < 1);
1789 	page = sblock->pagev[0]->page;
1790 	mapped_buffer = kmap_atomic(page);
1791 	s = (struct btrfs_super_block *)mapped_buffer;
1792 	memcpy(on_disk_csum, s->csum, sctx->csum_size);
1793 
1794 	if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
1795 		++fail_cor;
1796 
1797 	if (sblock->pagev[0]->generation != btrfs_super_generation(s))
1798 		++fail_gen;
1799 
1800 	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1801 		++fail_cor;
1802 
1803 	len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1804 	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1805 	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1806 	index = 0;
1807 	for (;;) {
1808 		u64 l = min_t(u64, len, mapped_size);
1809 
1810 		crc = btrfs_csum_data(p, crc, l);
1811 		kunmap_atomic(mapped_buffer);
1812 		len -= l;
1813 		if (len == 0)
1814 			break;
1815 		index++;
1816 		BUG_ON(index >= sblock->page_count);
1817 		BUG_ON(!sblock->pagev[index]->page);
1818 		page = sblock->pagev[index]->page;
1819 		mapped_buffer = kmap_atomic(page);
1820 		mapped_size = PAGE_SIZE;
1821 		p = mapped_buffer;
1822 	}
1823 
1824 	btrfs_csum_final(crc, calculated_csum);
1825 	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1826 		++fail_cor;
1827 
1828 	if (fail_cor + fail_gen) {
1829 		/*
1830 		 * if we find an error in a super block, we just report it.
1831 		 * They will get written with the next transaction commit
1832 		 * anyway
1833 		 */
1834 		spin_lock(&sctx->stat_lock);
1835 		++sctx->stat.super_errors;
1836 		spin_unlock(&sctx->stat_lock);
1837 		if (fail_cor)
1838 			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1839 				BTRFS_DEV_STAT_CORRUPTION_ERRS);
1840 		else
1841 			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1842 				BTRFS_DEV_STAT_GENERATION_ERRS);
1843 	}
1844 
1845 	return fail_cor + fail_gen;
1846 }
1847 
1848 static void scrub_block_get(struct scrub_block *sblock)
1849 {
1850 	atomic_inc(&sblock->ref_count);
1851 }
1852 
1853 static void scrub_block_put(struct scrub_block *sblock)
1854 {
1855 	if (atomic_dec_and_test(&sblock->ref_count)) {
1856 		int i;
1857 
1858 		for (i = 0; i < sblock->page_count; i++)
1859 			scrub_page_put(sblock->pagev[i]);
1860 		kfree(sblock);
1861 	}
1862 }
1863 
1864 static void scrub_page_get(struct scrub_page *spage)
1865 {
1866 	atomic_inc(&spage->ref_count);
1867 }
1868 
1869 static void scrub_page_put(struct scrub_page *spage)
1870 {
1871 	if (atomic_dec_and_test(&spage->ref_count)) {
1872 		if (spage->page)
1873 			__free_page(spage->page);
1874 		kfree(spage);
1875 	}
1876 }
1877 
1878 static void scrub_submit(struct scrub_ctx *sctx)
1879 {
1880 	struct scrub_bio *sbio;
1881 
1882 	if (sctx->curr == -1)
1883 		return;
1884 
1885 	sbio = sctx->bios[sctx->curr];
1886 	sctx->curr = -1;
1887 	scrub_pending_bio_inc(sctx);
1888 
1889 	if (!sbio->bio->bi_bdev) {
1890 		/*
1891 		 * this case should not happen. If btrfs_map_block() is
1892 		 * wrong, it could happen for dev-replace operations on
1893 		 * missing devices when no mirrors are available, but in
1894 		 * this case it should already fail the mount.
1895 		 * This case is handled correctly (but _very_ slowly).
1896 		 */
1897 		printk_ratelimited(KERN_WARNING
1898 			"btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
1899 		bio_endio(sbio->bio, -EIO);
1900 	} else {
1901 		btrfsic_submit_bio(READ, sbio->bio);
1902 	}
1903 }
1904 
1905 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1906 				    struct scrub_page *spage)
1907 {
1908 	struct scrub_block *sblock = spage->sblock;
1909 	struct scrub_bio *sbio;
1910 	int ret;
1911 
1912 again:
1913 	/*
1914 	 * grab a fresh bio or wait for one to become available
1915 	 */
1916 	while (sctx->curr == -1) {
1917 		spin_lock(&sctx->list_lock);
1918 		sctx->curr = sctx->first_free;
1919 		if (sctx->curr != -1) {
1920 			sctx->first_free = sctx->bios[sctx->curr]->next_free;
1921 			sctx->bios[sctx->curr]->next_free = -1;
1922 			sctx->bios[sctx->curr]->page_count = 0;
1923 			spin_unlock(&sctx->list_lock);
1924 		} else {
1925 			spin_unlock(&sctx->list_lock);
1926 			wait_event(sctx->list_wait, sctx->first_free != -1);
1927 		}
1928 	}
1929 	sbio = sctx->bios[sctx->curr];
1930 	if (sbio->page_count == 0) {
1931 		struct bio *bio;
1932 
1933 		sbio->physical = spage->physical;
1934 		sbio->logical = spage->logical;
1935 		sbio->dev = spage->dev;
1936 		bio = sbio->bio;
1937 		if (!bio) {
1938 			bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
1939 			if (!bio)
1940 				return -ENOMEM;
1941 			sbio->bio = bio;
1942 		}
1943 
1944 		bio->bi_private = sbio;
1945 		bio->bi_end_io = scrub_bio_end_io;
1946 		bio->bi_bdev = sbio->dev->bdev;
1947 		bio->bi_sector = sbio->physical >> 9;
1948 		sbio->err = 0;
1949 	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1950 		   spage->physical ||
1951 		   sbio->logical + sbio->page_count * PAGE_SIZE !=
1952 		   spage->logical ||
1953 		   sbio->dev != spage->dev) {
1954 		scrub_submit(sctx);
1955 		goto again;
1956 	}
1957 
1958 	sbio->pagev[sbio->page_count] = spage;
1959 	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1960 	if (ret != PAGE_SIZE) {
1961 		if (sbio->page_count < 1) {
1962 			bio_put(sbio->bio);
1963 			sbio->bio = NULL;
1964 			return -EIO;
1965 		}
1966 		scrub_submit(sctx);
1967 		goto again;
1968 	}
1969 
1970 	scrub_block_get(sblock); /* one for the page added to the bio */
1971 	atomic_inc(&sblock->outstanding_pages);
1972 	sbio->page_count++;
1973 	if (sbio->page_count == sctx->pages_per_rd_bio)
1974 		scrub_submit(sctx);
1975 
1976 	return 0;
1977 }
1978 
1979 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
1980 		       u64 physical, struct btrfs_device *dev, u64 flags,
1981 		       u64 gen, int mirror_num, u8 *csum, int force,
1982 		       u64 physical_for_dev_replace)
1983 {
1984 	struct scrub_block *sblock;
1985 	int index;
1986 
1987 	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1988 	if (!sblock) {
1989 		spin_lock(&sctx->stat_lock);
1990 		sctx->stat.malloc_errors++;
1991 		spin_unlock(&sctx->stat_lock);
1992 		return -ENOMEM;
1993 	}
1994 
1995 	/* one ref inside this function, plus one for each page added to
1996 	 * a bio later on */
1997 	atomic_set(&sblock->ref_count, 1);
1998 	sblock->sctx = sctx;
1999 	sblock->no_io_error_seen = 1;
2000 
2001 	for (index = 0; len > 0; index++) {
2002 		struct scrub_page *spage;
2003 		u64 l = min_t(u64, len, PAGE_SIZE);
2004 
2005 		spage = kzalloc(sizeof(*spage), GFP_NOFS);
2006 		if (!spage) {
2007 leave_nomem:
2008 			spin_lock(&sctx->stat_lock);
2009 			sctx->stat.malloc_errors++;
2010 			spin_unlock(&sctx->stat_lock);
2011 			scrub_block_put(sblock);
2012 			return -ENOMEM;
2013 		}
2014 		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2015 		scrub_page_get(spage);
2016 		sblock->pagev[index] = spage;
2017 		spage->sblock = sblock;
2018 		spage->dev = dev;
2019 		spage->flags = flags;
2020 		spage->generation = gen;
2021 		spage->logical = logical;
2022 		spage->physical = physical;
2023 		spage->physical_for_dev_replace = physical_for_dev_replace;
2024 		spage->mirror_num = mirror_num;
2025 		if (csum) {
2026 			spage->have_csum = 1;
2027 			memcpy(spage->csum, csum, sctx->csum_size);
2028 		} else {
2029 			spage->have_csum = 0;
2030 		}
2031 		sblock->page_count++;
2032 		spage->page = alloc_page(GFP_NOFS);
2033 		if (!spage->page)
2034 			goto leave_nomem;
2035 		len -= l;
2036 		logical += l;
2037 		physical += l;
2038 		physical_for_dev_replace += l;
2039 	}
2040 
2041 	WARN_ON(sblock->page_count == 0);
2042 	for (index = 0; index < sblock->page_count; index++) {
2043 		struct scrub_page *spage = sblock->pagev[index];
2044 		int ret;
2045 
2046 		ret = scrub_add_page_to_rd_bio(sctx, spage);
2047 		if (ret) {
2048 			scrub_block_put(sblock);
2049 			return ret;
2050 		}
2051 	}
2052 
2053 	if (force)
2054 		scrub_submit(sctx);
2055 
2056 	/* last one frees, either here or in bio completion for last page */
2057 	scrub_block_put(sblock);
2058 	return 0;
2059 }
2060 
2061 static void scrub_bio_end_io(struct bio *bio, int err)
2062 {
2063 	struct scrub_bio *sbio = bio->bi_private;
2064 	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
2065 
2066 	sbio->err = err;
2067 	sbio->bio = bio;
2068 
2069 	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
2070 }
2071 
2072 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2073 {
2074 	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2075 	struct scrub_ctx *sctx = sbio->sctx;
2076 	int i;
2077 
2078 	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2079 	if (sbio->err) {
2080 		for (i = 0; i < sbio->page_count; i++) {
2081 			struct scrub_page *spage = sbio->pagev[i];
2082 
2083 			spage->io_error = 1;
2084 			spage->sblock->no_io_error_seen = 0;
2085 		}
2086 	}
2087 
2088 	/* now complete the scrub_block items that have all pages completed */
2089 	for (i = 0; i < sbio->page_count; i++) {
2090 		struct scrub_page *spage = sbio->pagev[i];
2091 		struct scrub_block *sblock = spage->sblock;
2092 
2093 		if (atomic_dec_and_test(&sblock->outstanding_pages))
2094 			scrub_block_complete(sblock);
2095 		scrub_block_put(sblock);
2096 	}
2097 
2098 	bio_put(sbio->bio);
2099 	sbio->bio = NULL;
2100 	spin_lock(&sctx->list_lock);
2101 	sbio->next_free = sctx->first_free;
2102 	sctx->first_free = sbio->index;
2103 	spin_unlock(&sctx->list_lock);
2104 
2105 	if (sctx->is_dev_replace &&
2106 	    atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2107 		mutex_lock(&sctx->wr_ctx.wr_lock);
2108 		scrub_wr_submit(sctx);
2109 		mutex_unlock(&sctx->wr_ctx.wr_lock);
2110 	}
2111 
2112 	scrub_pending_bio_dec(sctx);
2113 }
2114 
2115 static void scrub_block_complete(struct scrub_block *sblock)
2116 {
2117 	if (!sblock->no_io_error_seen) {
2118 		scrub_handle_errored_block(sblock);
2119 	} else {
2120 		/*
2121 		 * if has checksum error, write via repair mechanism in
2122 		 * dev replace case, otherwise write here in dev replace
2123 		 * case.
2124 		 */
2125 		if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
2126 			scrub_write_block_to_dev_replace(sblock);
2127 	}
2128 }
2129 
2130 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2131 			   u8 *csum)
2132 {
2133 	struct btrfs_ordered_sum *sum = NULL;
2134 	unsigned long index;
2135 	unsigned long num_sectors;
2136 
2137 	while (!list_empty(&sctx->csum_list)) {
2138 		sum = list_first_entry(&sctx->csum_list,
2139 				       struct btrfs_ordered_sum, list);
2140 		if (sum->bytenr > logical)
2141 			return 0;
2142 		if (sum->bytenr + sum->len > logical)
2143 			break;
2144 
2145 		++sctx->stat.csum_discards;
2146 		list_del(&sum->list);
2147 		kfree(sum);
2148 		sum = NULL;
2149 	}
2150 	if (!sum)
2151 		return 0;
2152 
2153 	index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2154 	num_sectors = sum->len / sctx->sectorsize;
2155 	memcpy(csum, sum->sums + index, sctx->csum_size);
2156 	if (index == num_sectors - 1) {
2157 		list_del(&sum->list);
2158 		kfree(sum);
2159 	}
2160 	return 1;
2161 }
2162 
2163 /* scrub extent tries to collect up to 64 kB for each bio */
2164 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2165 			u64 physical, struct btrfs_device *dev, u64 flags,
2166 			u64 gen, int mirror_num, u64 physical_for_dev_replace)
2167 {
2168 	int ret;
2169 	u8 csum[BTRFS_CSUM_SIZE];
2170 	u32 blocksize;
2171 
2172 	if (flags & BTRFS_EXTENT_FLAG_DATA) {
2173 		blocksize = sctx->sectorsize;
2174 		spin_lock(&sctx->stat_lock);
2175 		sctx->stat.data_extents_scrubbed++;
2176 		sctx->stat.data_bytes_scrubbed += len;
2177 		spin_unlock(&sctx->stat_lock);
2178 	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2179 		WARN_ON(sctx->nodesize != sctx->leafsize);
2180 		blocksize = sctx->nodesize;
2181 		spin_lock(&sctx->stat_lock);
2182 		sctx->stat.tree_extents_scrubbed++;
2183 		sctx->stat.tree_bytes_scrubbed += len;
2184 		spin_unlock(&sctx->stat_lock);
2185 	} else {
2186 		blocksize = sctx->sectorsize;
2187 		WARN_ON(1);
2188 	}
2189 
2190 	while (len) {
2191 		u64 l = min_t(u64, len, blocksize);
2192 		int have_csum = 0;
2193 
2194 		if (flags & BTRFS_EXTENT_FLAG_DATA) {
2195 			/* push csums to sbio */
2196 			have_csum = scrub_find_csum(sctx, logical, l, csum);
2197 			if (have_csum == 0)
2198 				++sctx->stat.no_csum;
2199 			if (sctx->is_dev_replace && !have_csum) {
2200 				ret = copy_nocow_pages(sctx, logical, l,
2201 						       mirror_num,
2202 						      physical_for_dev_replace);
2203 				goto behind_scrub_pages;
2204 			}
2205 		}
2206 		ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2207 				  mirror_num, have_csum ? csum : NULL, 0,
2208 				  physical_for_dev_replace);
2209 behind_scrub_pages:
2210 		if (ret)
2211 			return ret;
2212 		len -= l;
2213 		logical += l;
2214 		physical += l;
2215 		physical_for_dev_replace += l;
2216 	}
2217 	return 0;
2218 }
2219 
2220 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2221 					   struct map_lookup *map,
2222 					   struct btrfs_device *scrub_dev,
2223 					   int num, u64 base, u64 length,
2224 					   int is_dev_replace)
2225 {
2226 	struct btrfs_path *path;
2227 	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2228 	struct btrfs_root *root = fs_info->extent_root;
2229 	struct btrfs_root *csum_root = fs_info->csum_root;
2230 	struct btrfs_extent_item *extent;
2231 	struct blk_plug plug;
2232 	u64 flags;
2233 	int ret;
2234 	int slot;
2235 	u64 nstripes;
2236 	struct extent_buffer *l;
2237 	struct btrfs_key key;
2238 	u64 physical;
2239 	u64 logical;
2240 	u64 logic_end;
2241 	u64 generation;
2242 	int mirror_num;
2243 	struct reada_control *reada1;
2244 	struct reada_control *reada2;
2245 	struct btrfs_key key_start;
2246 	struct btrfs_key key_end;
2247 	u64 increment = map->stripe_len;
2248 	u64 offset;
2249 	u64 extent_logical;
2250 	u64 extent_physical;
2251 	u64 extent_len;
2252 	struct btrfs_device *extent_dev;
2253 	int extent_mirror_num;
2254 	int stop_loop;
2255 
2256 	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2257 			 BTRFS_BLOCK_GROUP_RAID6)) {
2258 		if (num >= nr_data_stripes(map)) {
2259 			return 0;
2260 		}
2261 	}
2262 
2263 	nstripes = length;
2264 	offset = 0;
2265 	do_div(nstripes, map->stripe_len);
2266 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2267 		offset = map->stripe_len * num;
2268 		increment = map->stripe_len * map->num_stripes;
2269 		mirror_num = 1;
2270 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2271 		int factor = map->num_stripes / map->sub_stripes;
2272 		offset = map->stripe_len * (num / map->sub_stripes);
2273 		increment = map->stripe_len * factor;
2274 		mirror_num = num % map->sub_stripes + 1;
2275 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2276 		increment = map->stripe_len;
2277 		mirror_num = num % map->num_stripes + 1;
2278 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2279 		increment = map->stripe_len;
2280 		mirror_num = num % map->num_stripes + 1;
2281 	} else {
2282 		increment = map->stripe_len;
2283 		mirror_num = 1;
2284 	}
2285 
2286 	path = btrfs_alloc_path();
2287 	if (!path)
2288 		return -ENOMEM;
2289 
2290 	/*
2291 	 * work on commit root. The related disk blocks are static as
2292 	 * long as COW is applied. This means, it is save to rewrite
2293 	 * them to repair disk errors without any race conditions
2294 	 */
2295 	path->search_commit_root = 1;
2296 	path->skip_locking = 1;
2297 
2298 	/*
2299 	 * trigger the readahead for extent tree csum tree and wait for
2300 	 * completion. During readahead, the scrub is officially paused
2301 	 * to not hold off transaction commits
2302 	 */
2303 	logical = base + offset;
2304 
2305 	wait_event(sctx->list_wait,
2306 		   atomic_read(&sctx->bios_in_flight) == 0);
2307 	atomic_inc(&fs_info->scrubs_paused);
2308 	wake_up(&fs_info->scrub_pause_wait);
2309 
2310 	/* FIXME it might be better to start readahead at commit root */
2311 	key_start.objectid = logical;
2312 	key_start.type = BTRFS_EXTENT_ITEM_KEY;
2313 	key_start.offset = (u64)0;
2314 	key_end.objectid = base + offset + nstripes * increment;
2315 	key_end.type = BTRFS_METADATA_ITEM_KEY;
2316 	key_end.offset = (u64)-1;
2317 	reada1 = btrfs_reada_add(root, &key_start, &key_end);
2318 
2319 	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2320 	key_start.type = BTRFS_EXTENT_CSUM_KEY;
2321 	key_start.offset = logical;
2322 	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2323 	key_end.type = BTRFS_EXTENT_CSUM_KEY;
2324 	key_end.offset = base + offset + nstripes * increment;
2325 	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
2326 
2327 	if (!IS_ERR(reada1))
2328 		btrfs_reada_wait(reada1);
2329 	if (!IS_ERR(reada2))
2330 		btrfs_reada_wait(reada2);
2331 
2332 	mutex_lock(&fs_info->scrub_lock);
2333 	while (atomic_read(&fs_info->scrub_pause_req)) {
2334 		mutex_unlock(&fs_info->scrub_lock);
2335 		wait_event(fs_info->scrub_pause_wait,
2336 		   atomic_read(&fs_info->scrub_pause_req) == 0);
2337 		mutex_lock(&fs_info->scrub_lock);
2338 	}
2339 	atomic_dec(&fs_info->scrubs_paused);
2340 	mutex_unlock(&fs_info->scrub_lock);
2341 	wake_up(&fs_info->scrub_pause_wait);
2342 
2343 	/*
2344 	 * collect all data csums for the stripe to avoid seeking during
2345 	 * the scrub. This might currently (crc32) end up to be about 1MB
2346 	 */
2347 	blk_start_plug(&plug);
2348 
2349 	/*
2350 	 * now find all extents for each stripe and scrub them
2351 	 */
2352 	logical = base + offset;
2353 	physical = map->stripes[num].physical;
2354 	logic_end = logical + increment * nstripes;
2355 	ret = 0;
2356 	while (logical < logic_end) {
2357 		/*
2358 		 * canceled?
2359 		 */
2360 		if (atomic_read(&fs_info->scrub_cancel_req) ||
2361 		    atomic_read(&sctx->cancel_req)) {
2362 			ret = -ECANCELED;
2363 			goto out;
2364 		}
2365 		/*
2366 		 * check to see if we have to pause
2367 		 */
2368 		if (atomic_read(&fs_info->scrub_pause_req)) {
2369 			/* push queued extents */
2370 			atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2371 			scrub_submit(sctx);
2372 			mutex_lock(&sctx->wr_ctx.wr_lock);
2373 			scrub_wr_submit(sctx);
2374 			mutex_unlock(&sctx->wr_ctx.wr_lock);
2375 			wait_event(sctx->list_wait,
2376 				   atomic_read(&sctx->bios_in_flight) == 0);
2377 			atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2378 			atomic_inc(&fs_info->scrubs_paused);
2379 			wake_up(&fs_info->scrub_pause_wait);
2380 			mutex_lock(&fs_info->scrub_lock);
2381 			while (atomic_read(&fs_info->scrub_pause_req)) {
2382 				mutex_unlock(&fs_info->scrub_lock);
2383 				wait_event(fs_info->scrub_pause_wait,
2384 				   atomic_read(&fs_info->scrub_pause_req) == 0);
2385 				mutex_lock(&fs_info->scrub_lock);
2386 			}
2387 			atomic_dec(&fs_info->scrubs_paused);
2388 			mutex_unlock(&fs_info->scrub_lock);
2389 			wake_up(&fs_info->scrub_pause_wait);
2390 		}
2391 
2392 		key.objectid = logical;
2393 		key.type = BTRFS_EXTENT_ITEM_KEY;
2394 		key.offset = (u64)-1;
2395 
2396 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2397 		if (ret < 0)
2398 			goto out;
2399 
2400 		if (ret > 0) {
2401 			ret = btrfs_previous_item(root, path, 0,
2402 						  BTRFS_EXTENT_ITEM_KEY);
2403 			if (ret < 0)
2404 				goto out;
2405 			if (ret > 0) {
2406 				/* there's no smaller item, so stick with the
2407 				 * larger one */
2408 				btrfs_release_path(path);
2409 				ret = btrfs_search_slot(NULL, root, &key,
2410 							path, 0, 0);
2411 				if (ret < 0)
2412 					goto out;
2413 			}
2414 		}
2415 
2416 		stop_loop = 0;
2417 		while (1) {
2418 			u64 bytes;
2419 
2420 			l = path->nodes[0];
2421 			slot = path->slots[0];
2422 			if (slot >= btrfs_header_nritems(l)) {
2423 				ret = btrfs_next_leaf(root, path);
2424 				if (ret == 0)
2425 					continue;
2426 				if (ret < 0)
2427 					goto out;
2428 
2429 				stop_loop = 1;
2430 				break;
2431 			}
2432 			btrfs_item_key_to_cpu(l, &key, slot);
2433 
2434 			if (key.type == BTRFS_METADATA_ITEM_KEY)
2435 				bytes = root->leafsize;
2436 			else
2437 				bytes = key.offset;
2438 
2439 			if (key.objectid + bytes <= logical)
2440 				goto next;
2441 
2442 			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2443 			    key.type != BTRFS_METADATA_ITEM_KEY)
2444 				goto next;
2445 
2446 			if (key.objectid >= logical + map->stripe_len) {
2447 				/* out of this device extent */
2448 				if (key.objectid >= logic_end)
2449 					stop_loop = 1;
2450 				break;
2451 			}
2452 
2453 			extent = btrfs_item_ptr(l, slot,
2454 						struct btrfs_extent_item);
2455 			flags = btrfs_extent_flags(l, extent);
2456 			generation = btrfs_extent_generation(l, extent);
2457 
2458 			if (key.objectid < logical &&
2459 			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2460 				printk(KERN_ERR
2461 				       "btrfs scrub: tree block %llu spanning "
2462 				       "stripes, ignored. logical=%llu\n",
2463 				       key.objectid, logical);
2464 				goto next;
2465 			}
2466 
2467 again:
2468 			extent_logical = key.objectid;
2469 			extent_len = bytes;
2470 
2471 			/*
2472 			 * trim extent to this stripe
2473 			 */
2474 			if (extent_logical < logical) {
2475 				extent_len -= logical - extent_logical;
2476 				extent_logical = logical;
2477 			}
2478 			if (extent_logical + extent_len >
2479 			    logical + map->stripe_len) {
2480 				extent_len = logical + map->stripe_len -
2481 					     extent_logical;
2482 			}
2483 
2484 			extent_physical = extent_logical - logical + physical;
2485 			extent_dev = scrub_dev;
2486 			extent_mirror_num = mirror_num;
2487 			if (is_dev_replace)
2488 				scrub_remap_extent(fs_info, extent_logical,
2489 						   extent_len, &extent_physical,
2490 						   &extent_dev,
2491 						   &extent_mirror_num);
2492 
2493 			ret = btrfs_lookup_csums_range(csum_root, logical,
2494 						logical + map->stripe_len - 1,
2495 						&sctx->csum_list, 1);
2496 			if (ret)
2497 				goto out;
2498 
2499 			ret = scrub_extent(sctx, extent_logical, extent_len,
2500 					   extent_physical, extent_dev, flags,
2501 					   generation, extent_mirror_num,
2502 					   extent_logical - logical + physical);
2503 			if (ret)
2504 				goto out;
2505 
2506 			scrub_free_csums(sctx);
2507 			if (extent_logical + extent_len <
2508 			    key.objectid + bytes) {
2509 				logical += increment;
2510 				physical += map->stripe_len;
2511 
2512 				if (logical < key.objectid + bytes) {
2513 					cond_resched();
2514 					goto again;
2515 				}
2516 
2517 				if (logical >= logic_end) {
2518 					stop_loop = 1;
2519 					break;
2520 				}
2521 			}
2522 next:
2523 			path->slots[0]++;
2524 		}
2525 		btrfs_release_path(path);
2526 		logical += increment;
2527 		physical += map->stripe_len;
2528 		spin_lock(&sctx->stat_lock);
2529 		if (stop_loop)
2530 			sctx->stat.last_physical = map->stripes[num].physical +
2531 						   length;
2532 		else
2533 			sctx->stat.last_physical = physical;
2534 		spin_unlock(&sctx->stat_lock);
2535 		if (stop_loop)
2536 			break;
2537 	}
2538 out:
2539 	/* push queued extents */
2540 	scrub_submit(sctx);
2541 	mutex_lock(&sctx->wr_ctx.wr_lock);
2542 	scrub_wr_submit(sctx);
2543 	mutex_unlock(&sctx->wr_ctx.wr_lock);
2544 
2545 	blk_finish_plug(&plug);
2546 	btrfs_free_path(path);
2547 	return ret < 0 ? ret : 0;
2548 }
2549 
2550 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2551 					  struct btrfs_device *scrub_dev,
2552 					  u64 chunk_tree, u64 chunk_objectid,
2553 					  u64 chunk_offset, u64 length,
2554 					  u64 dev_offset, int is_dev_replace)
2555 {
2556 	struct btrfs_mapping_tree *map_tree =
2557 		&sctx->dev_root->fs_info->mapping_tree;
2558 	struct map_lookup *map;
2559 	struct extent_map *em;
2560 	int i;
2561 	int ret = 0;
2562 
2563 	read_lock(&map_tree->map_tree.lock);
2564 	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2565 	read_unlock(&map_tree->map_tree.lock);
2566 
2567 	if (!em)
2568 		return -EINVAL;
2569 
2570 	map = (struct map_lookup *)em->bdev;
2571 	if (em->start != chunk_offset)
2572 		goto out;
2573 
2574 	if (em->len < length)
2575 		goto out;
2576 
2577 	for (i = 0; i < map->num_stripes; ++i) {
2578 		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2579 		    map->stripes[i].physical == dev_offset) {
2580 			ret = scrub_stripe(sctx, map, scrub_dev, i,
2581 					   chunk_offset, length,
2582 					   is_dev_replace);
2583 			if (ret)
2584 				goto out;
2585 		}
2586 	}
2587 out:
2588 	free_extent_map(em);
2589 
2590 	return ret;
2591 }
2592 
2593 static noinline_for_stack
2594 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2595 			   struct btrfs_device *scrub_dev, u64 start, u64 end,
2596 			   int is_dev_replace)
2597 {
2598 	struct btrfs_dev_extent *dev_extent = NULL;
2599 	struct btrfs_path *path;
2600 	struct btrfs_root *root = sctx->dev_root;
2601 	struct btrfs_fs_info *fs_info = root->fs_info;
2602 	u64 length;
2603 	u64 chunk_tree;
2604 	u64 chunk_objectid;
2605 	u64 chunk_offset;
2606 	int ret;
2607 	int slot;
2608 	struct extent_buffer *l;
2609 	struct btrfs_key key;
2610 	struct btrfs_key found_key;
2611 	struct btrfs_block_group_cache *cache;
2612 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2613 
2614 	path = btrfs_alloc_path();
2615 	if (!path)
2616 		return -ENOMEM;
2617 
2618 	path->reada = 2;
2619 	path->search_commit_root = 1;
2620 	path->skip_locking = 1;
2621 
2622 	key.objectid = scrub_dev->devid;
2623 	key.offset = 0ull;
2624 	key.type = BTRFS_DEV_EXTENT_KEY;
2625 
2626 	while (1) {
2627 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2628 		if (ret < 0)
2629 			break;
2630 		if (ret > 0) {
2631 			if (path->slots[0] >=
2632 			    btrfs_header_nritems(path->nodes[0])) {
2633 				ret = btrfs_next_leaf(root, path);
2634 				if (ret)
2635 					break;
2636 			}
2637 		}
2638 
2639 		l = path->nodes[0];
2640 		slot = path->slots[0];
2641 
2642 		btrfs_item_key_to_cpu(l, &found_key, slot);
2643 
2644 		if (found_key.objectid != scrub_dev->devid)
2645 			break;
2646 
2647 		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
2648 			break;
2649 
2650 		if (found_key.offset >= end)
2651 			break;
2652 
2653 		if (found_key.offset < key.offset)
2654 			break;
2655 
2656 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2657 		length = btrfs_dev_extent_length(l, dev_extent);
2658 
2659 		if (found_key.offset + length <= start) {
2660 			key.offset = found_key.offset + length;
2661 			btrfs_release_path(path);
2662 			continue;
2663 		}
2664 
2665 		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2666 		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2667 		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2668 
2669 		/*
2670 		 * get a reference on the corresponding block group to prevent
2671 		 * the chunk from going away while we scrub it
2672 		 */
2673 		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2674 		if (!cache) {
2675 			ret = -ENOENT;
2676 			break;
2677 		}
2678 		dev_replace->cursor_right = found_key.offset + length;
2679 		dev_replace->cursor_left = found_key.offset;
2680 		dev_replace->item_needs_writeback = 1;
2681 		ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2682 				  chunk_offset, length, found_key.offset,
2683 				  is_dev_replace);
2684 
2685 		/*
2686 		 * flush, submit all pending read and write bios, afterwards
2687 		 * wait for them.
2688 		 * Note that in the dev replace case, a read request causes
2689 		 * write requests that are submitted in the read completion
2690 		 * worker. Therefore in the current situation, it is required
2691 		 * that all write requests are flushed, so that all read and
2692 		 * write requests are really completed when bios_in_flight
2693 		 * changes to 0.
2694 		 */
2695 		atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2696 		scrub_submit(sctx);
2697 		mutex_lock(&sctx->wr_ctx.wr_lock);
2698 		scrub_wr_submit(sctx);
2699 		mutex_unlock(&sctx->wr_ctx.wr_lock);
2700 
2701 		wait_event(sctx->list_wait,
2702 			   atomic_read(&sctx->bios_in_flight) == 0);
2703 		atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2704 		atomic_inc(&fs_info->scrubs_paused);
2705 		wake_up(&fs_info->scrub_pause_wait);
2706 		wait_event(sctx->list_wait,
2707 			   atomic_read(&sctx->workers_pending) == 0);
2708 
2709 		mutex_lock(&fs_info->scrub_lock);
2710 		while (atomic_read(&fs_info->scrub_pause_req)) {
2711 			mutex_unlock(&fs_info->scrub_lock);
2712 			wait_event(fs_info->scrub_pause_wait,
2713 			   atomic_read(&fs_info->scrub_pause_req) == 0);
2714 			mutex_lock(&fs_info->scrub_lock);
2715 		}
2716 		atomic_dec(&fs_info->scrubs_paused);
2717 		mutex_unlock(&fs_info->scrub_lock);
2718 		wake_up(&fs_info->scrub_pause_wait);
2719 
2720 		dev_replace->cursor_left = dev_replace->cursor_right;
2721 		dev_replace->item_needs_writeback = 1;
2722 		btrfs_put_block_group(cache);
2723 		if (ret)
2724 			break;
2725 		if (is_dev_replace &&
2726 		    atomic64_read(&dev_replace->num_write_errors) > 0) {
2727 			ret = -EIO;
2728 			break;
2729 		}
2730 		if (sctx->stat.malloc_errors > 0) {
2731 			ret = -ENOMEM;
2732 			break;
2733 		}
2734 
2735 		key.offset = found_key.offset + length;
2736 		btrfs_release_path(path);
2737 	}
2738 
2739 	btrfs_free_path(path);
2740 
2741 	/*
2742 	 * ret can still be 1 from search_slot or next_leaf,
2743 	 * that's not an error
2744 	 */
2745 	return ret < 0 ? ret : 0;
2746 }
2747 
2748 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2749 					   struct btrfs_device *scrub_dev)
2750 {
2751 	int	i;
2752 	u64	bytenr;
2753 	u64	gen;
2754 	int	ret;
2755 	struct btrfs_root *root = sctx->dev_root;
2756 
2757 	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
2758 		return -EIO;
2759 
2760 	gen = root->fs_info->last_trans_committed;
2761 
2762 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2763 		bytenr = btrfs_sb_offset(i);
2764 		if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2765 			break;
2766 
2767 		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2768 				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2769 				  NULL, 1, bytenr);
2770 		if (ret)
2771 			return ret;
2772 	}
2773 	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2774 
2775 	return 0;
2776 }
2777 
2778 /*
2779  * get a reference count on fs_info->scrub_workers. start worker if necessary
2780  */
2781 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2782 						int is_dev_replace)
2783 {
2784 	int ret = 0;
2785 
2786 	mutex_lock(&fs_info->scrub_lock);
2787 	if (fs_info->scrub_workers_refcnt == 0) {
2788 		if (is_dev_replace)
2789 			btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
2790 					&fs_info->generic_worker);
2791 		else
2792 			btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2793 					fs_info->thread_pool_size,
2794 					&fs_info->generic_worker);
2795 		fs_info->scrub_workers.idle_thresh = 4;
2796 		ret = btrfs_start_workers(&fs_info->scrub_workers);
2797 		if (ret)
2798 			goto out;
2799 		btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
2800 				   "scrubwrc",
2801 				   fs_info->thread_pool_size,
2802 				   &fs_info->generic_worker);
2803 		fs_info->scrub_wr_completion_workers.idle_thresh = 2;
2804 		ret = btrfs_start_workers(
2805 				&fs_info->scrub_wr_completion_workers);
2806 		if (ret)
2807 			goto out;
2808 		btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
2809 				   &fs_info->generic_worker);
2810 		ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
2811 		if (ret)
2812 			goto out;
2813 	}
2814 	++fs_info->scrub_workers_refcnt;
2815 out:
2816 	mutex_unlock(&fs_info->scrub_lock);
2817 
2818 	return ret;
2819 }
2820 
2821 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2822 {
2823 	mutex_lock(&fs_info->scrub_lock);
2824 	if (--fs_info->scrub_workers_refcnt == 0) {
2825 		btrfs_stop_workers(&fs_info->scrub_workers);
2826 		btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
2827 		btrfs_stop_workers(&fs_info->scrub_nocow_workers);
2828 	}
2829 	WARN_ON(fs_info->scrub_workers_refcnt < 0);
2830 	mutex_unlock(&fs_info->scrub_lock);
2831 }
2832 
2833 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2834 		    u64 end, struct btrfs_scrub_progress *progress,
2835 		    int readonly, int is_dev_replace)
2836 {
2837 	struct scrub_ctx *sctx;
2838 	int ret;
2839 	struct btrfs_device *dev;
2840 
2841 	if (btrfs_fs_closing(fs_info))
2842 		return -EINVAL;
2843 
2844 	/*
2845 	 * check some assumptions
2846 	 */
2847 	if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2848 		printk(KERN_ERR
2849 		       "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2850 		       fs_info->chunk_root->nodesize,
2851 		       fs_info->chunk_root->leafsize);
2852 		return -EINVAL;
2853 	}
2854 
2855 	if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2856 		/*
2857 		 * in this case scrub is unable to calculate the checksum
2858 		 * the way scrub is implemented. Do not handle this
2859 		 * situation at all because it won't ever happen.
2860 		 */
2861 		printk(KERN_ERR
2862 		       "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2863 		       fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2864 		return -EINVAL;
2865 	}
2866 
2867 	if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2868 		/* not supported for data w/o checksums */
2869 		printk(KERN_ERR
2870 		       "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails\n",
2871 		       fs_info->chunk_root->sectorsize, PAGE_SIZE);
2872 		return -EINVAL;
2873 	}
2874 
2875 	if (fs_info->chunk_root->nodesize >
2876 	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
2877 	    fs_info->chunk_root->sectorsize >
2878 	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
2879 		/*
2880 		 * would exhaust the array bounds of pagev member in
2881 		 * struct scrub_block
2882 		 */
2883 		pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
2884 		       fs_info->chunk_root->nodesize,
2885 		       SCRUB_MAX_PAGES_PER_BLOCK,
2886 		       fs_info->chunk_root->sectorsize,
2887 		       SCRUB_MAX_PAGES_PER_BLOCK);
2888 		return -EINVAL;
2889 	}
2890 
2891 	ret = scrub_workers_get(fs_info, is_dev_replace);
2892 	if (ret)
2893 		return ret;
2894 
2895 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
2896 	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2897 	if (!dev || (dev->missing && !is_dev_replace)) {
2898 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2899 		scrub_workers_put(fs_info);
2900 		return -ENODEV;
2901 	}
2902 	mutex_lock(&fs_info->scrub_lock);
2903 
2904 	if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2905 		mutex_unlock(&fs_info->scrub_lock);
2906 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2907 		scrub_workers_put(fs_info);
2908 		return -EIO;
2909 	}
2910 
2911 	btrfs_dev_replace_lock(&fs_info->dev_replace);
2912 	if (dev->scrub_device ||
2913 	    (!is_dev_replace &&
2914 	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
2915 		btrfs_dev_replace_unlock(&fs_info->dev_replace);
2916 		mutex_unlock(&fs_info->scrub_lock);
2917 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2918 		scrub_workers_put(fs_info);
2919 		return -EINPROGRESS;
2920 	}
2921 	btrfs_dev_replace_unlock(&fs_info->dev_replace);
2922 	sctx = scrub_setup_ctx(dev, is_dev_replace);
2923 	if (IS_ERR(sctx)) {
2924 		mutex_unlock(&fs_info->scrub_lock);
2925 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2926 		scrub_workers_put(fs_info);
2927 		return PTR_ERR(sctx);
2928 	}
2929 	sctx->readonly = readonly;
2930 	dev->scrub_device = sctx;
2931 
2932 	atomic_inc(&fs_info->scrubs_running);
2933 	mutex_unlock(&fs_info->scrub_lock);
2934 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2935 
2936 	if (!is_dev_replace) {
2937 		down_read(&fs_info->scrub_super_lock);
2938 		ret = scrub_supers(sctx, dev);
2939 		up_read(&fs_info->scrub_super_lock);
2940 	}
2941 
2942 	if (!ret)
2943 		ret = scrub_enumerate_chunks(sctx, dev, start, end,
2944 					     is_dev_replace);
2945 
2946 	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2947 	atomic_dec(&fs_info->scrubs_running);
2948 	wake_up(&fs_info->scrub_pause_wait);
2949 
2950 	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
2951 
2952 	if (progress)
2953 		memcpy(progress, &sctx->stat, sizeof(*progress));
2954 
2955 	mutex_lock(&fs_info->scrub_lock);
2956 	dev->scrub_device = NULL;
2957 	mutex_unlock(&fs_info->scrub_lock);
2958 
2959 	scrub_free_ctx(sctx);
2960 	scrub_workers_put(fs_info);
2961 
2962 	return ret;
2963 }
2964 
2965 void btrfs_scrub_pause(struct btrfs_root *root)
2966 {
2967 	struct btrfs_fs_info *fs_info = root->fs_info;
2968 
2969 	mutex_lock(&fs_info->scrub_lock);
2970 	atomic_inc(&fs_info->scrub_pause_req);
2971 	while (atomic_read(&fs_info->scrubs_paused) !=
2972 	       atomic_read(&fs_info->scrubs_running)) {
2973 		mutex_unlock(&fs_info->scrub_lock);
2974 		wait_event(fs_info->scrub_pause_wait,
2975 			   atomic_read(&fs_info->scrubs_paused) ==
2976 			   atomic_read(&fs_info->scrubs_running));
2977 		mutex_lock(&fs_info->scrub_lock);
2978 	}
2979 	mutex_unlock(&fs_info->scrub_lock);
2980 }
2981 
2982 void btrfs_scrub_continue(struct btrfs_root *root)
2983 {
2984 	struct btrfs_fs_info *fs_info = root->fs_info;
2985 
2986 	atomic_dec(&fs_info->scrub_pause_req);
2987 	wake_up(&fs_info->scrub_pause_wait);
2988 }
2989 
2990 void btrfs_scrub_pause_super(struct btrfs_root *root)
2991 {
2992 	down_write(&root->fs_info->scrub_super_lock);
2993 }
2994 
2995 void btrfs_scrub_continue_super(struct btrfs_root *root)
2996 {
2997 	up_write(&root->fs_info->scrub_super_lock);
2998 }
2999 
3000 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
3001 {
3002 	mutex_lock(&fs_info->scrub_lock);
3003 	if (!atomic_read(&fs_info->scrubs_running)) {
3004 		mutex_unlock(&fs_info->scrub_lock);
3005 		return -ENOTCONN;
3006 	}
3007 
3008 	atomic_inc(&fs_info->scrub_cancel_req);
3009 	while (atomic_read(&fs_info->scrubs_running)) {
3010 		mutex_unlock(&fs_info->scrub_lock);
3011 		wait_event(fs_info->scrub_pause_wait,
3012 			   atomic_read(&fs_info->scrubs_running) == 0);
3013 		mutex_lock(&fs_info->scrub_lock);
3014 	}
3015 	atomic_dec(&fs_info->scrub_cancel_req);
3016 	mutex_unlock(&fs_info->scrub_lock);
3017 
3018 	return 0;
3019 }
3020 
3021 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
3022 			   struct btrfs_device *dev)
3023 {
3024 	struct scrub_ctx *sctx;
3025 
3026 	mutex_lock(&fs_info->scrub_lock);
3027 	sctx = dev->scrub_device;
3028 	if (!sctx) {
3029 		mutex_unlock(&fs_info->scrub_lock);
3030 		return -ENOTCONN;
3031 	}
3032 	atomic_inc(&sctx->cancel_req);
3033 	while (dev->scrub_device) {
3034 		mutex_unlock(&fs_info->scrub_lock);
3035 		wait_event(fs_info->scrub_pause_wait,
3036 			   dev->scrub_device == NULL);
3037 		mutex_lock(&fs_info->scrub_lock);
3038 	}
3039 	mutex_unlock(&fs_info->scrub_lock);
3040 
3041 	return 0;
3042 }
3043 
3044 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3045 			 struct btrfs_scrub_progress *progress)
3046 {
3047 	struct btrfs_device *dev;
3048 	struct scrub_ctx *sctx = NULL;
3049 
3050 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
3051 	dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
3052 	if (dev)
3053 		sctx = dev->scrub_device;
3054 	if (sctx)
3055 		memcpy(progress, &sctx->stat, sizeof(*progress));
3056 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3057 
3058 	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3059 }
3060 
3061 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3062 			       u64 extent_logical, u64 extent_len,
3063 			       u64 *extent_physical,
3064 			       struct btrfs_device **extent_dev,
3065 			       int *extent_mirror_num)
3066 {
3067 	u64 mapped_length;
3068 	struct btrfs_bio *bbio = NULL;
3069 	int ret;
3070 
3071 	mapped_length = extent_len;
3072 	ret = btrfs_map_block(fs_info, READ, extent_logical,
3073 			      &mapped_length, &bbio, 0);
3074 	if (ret || !bbio || mapped_length < extent_len ||
3075 	    !bbio->stripes[0].dev->bdev) {
3076 		kfree(bbio);
3077 		return;
3078 	}
3079 
3080 	*extent_physical = bbio->stripes[0].physical;
3081 	*extent_mirror_num = bbio->mirror_num;
3082 	*extent_dev = bbio->stripes[0].dev;
3083 	kfree(bbio);
3084 }
3085 
3086 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3087 			      struct scrub_wr_ctx *wr_ctx,
3088 			      struct btrfs_fs_info *fs_info,
3089 			      struct btrfs_device *dev,
3090 			      int is_dev_replace)
3091 {
3092 	WARN_ON(wr_ctx->wr_curr_bio != NULL);
3093 
3094 	mutex_init(&wr_ctx->wr_lock);
3095 	wr_ctx->wr_curr_bio = NULL;
3096 	if (!is_dev_replace)
3097 		return 0;
3098 
3099 	WARN_ON(!dev->bdev);
3100 	wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3101 					 bio_get_nr_vecs(dev->bdev));
3102 	wr_ctx->tgtdev = dev;
3103 	atomic_set(&wr_ctx->flush_all_writes, 0);
3104 	return 0;
3105 }
3106 
3107 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3108 {
3109 	mutex_lock(&wr_ctx->wr_lock);
3110 	kfree(wr_ctx->wr_curr_bio);
3111 	wr_ctx->wr_curr_bio = NULL;
3112 	mutex_unlock(&wr_ctx->wr_lock);
3113 }
3114 
3115 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3116 			    int mirror_num, u64 physical_for_dev_replace)
3117 {
3118 	struct scrub_copy_nocow_ctx *nocow_ctx;
3119 	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3120 
3121 	nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3122 	if (!nocow_ctx) {
3123 		spin_lock(&sctx->stat_lock);
3124 		sctx->stat.malloc_errors++;
3125 		spin_unlock(&sctx->stat_lock);
3126 		return -ENOMEM;
3127 	}
3128 
3129 	scrub_pending_trans_workers_inc(sctx);
3130 
3131 	nocow_ctx->sctx = sctx;
3132 	nocow_ctx->logical = logical;
3133 	nocow_ctx->len = len;
3134 	nocow_ctx->mirror_num = mirror_num;
3135 	nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3136 	nocow_ctx->work.func = copy_nocow_pages_worker;
3137 	INIT_LIST_HEAD(&nocow_ctx->inodes);
3138 	btrfs_queue_worker(&fs_info->scrub_nocow_workers,
3139 			   &nocow_ctx->work);
3140 
3141 	return 0;
3142 }
3143 
3144 static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
3145 {
3146 	struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3147 	struct scrub_nocow_inode *nocow_inode;
3148 
3149 	nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
3150 	if (!nocow_inode)
3151 		return -ENOMEM;
3152 	nocow_inode->inum = inum;
3153 	nocow_inode->offset = offset;
3154 	nocow_inode->root = root;
3155 	list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
3156 	return 0;
3157 }
3158 
3159 #define COPY_COMPLETE 1
3160 
3161 static void copy_nocow_pages_worker(struct btrfs_work *work)
3162 {
3163 	struct scrub_copy_nocow_ctx *nocow_ctx =
3164 		container_of(work, struct scrub_copy_nocow_ctx, work);
3165 	struct scrub_ctx *sctx = nocow_ctx->sctx;
3166 	u64 logical = nocow_ctx->logical;
3167 	u64 len = nocow_ctx->len;
3168 	int mirror_num = nocow_ctx->mirror_num;
3169 	u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3170 	int ret;
3171 	struct btrfs_trans_handle *trans = NULL;
3172 	struct btrfs_fs_info *fs_info;
3173 	struct btrfs_path *path;
3174 	struct btrfs_root *root;
3175 	int not_written = 0;
3176 
3177 	fs_info = sctx->dev_root->fs_info;
3178 	root = fs_info->extent_root;
3179 
3180 	path = btrfs_alloc_path();
3181 	if (!path) {
3182 		spin_lock(&sctx->stat_lock);
3183 		sctx->stat.malloc_errors++;
3184 		spin_unlock(&sctx->stat_lock);
3185 		not_written = 1;
3186 		goto out;
3187 	}
3188 
3189 	trans = btrfs_join_transaction(root);
3190 	if (IS_ERR(trans)) {
3191 		not_written = 1;
3192 		goto out;
3193 	}
3194 
3195 	ret = iterate_inodes_from_logical(logical, fs_info, path,
3196 					  record_inode_for_nocow, nocow_ctx);
3197 	if (ret != 0 && ret != -ENOENT) {
3198 		pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n",
3199 			logical, physical_for_dev_replace, len, mirror_num,
3200 			ret);
3201 		not_written = 1;
3202 		goto out;
3203 	}
3204 
3205 	btrfs_end_transaction(trans, root);
3206 	trans = NULL;
3207 	while (!list_empty(&nocow_ctx->inodes)) {
3208 		struct scrub_nocow_inode *entry;
3209 		entry = list_first_entry(&nocow_ctx->inodes,
3210 					 struct scrub_nocow_inode,
3211 					 list);
3212 		list_del_init(&entry->list);
3213 		ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
3214 						 entry->root, nocow_ctx);
3215 		kfree(entry);
3216 		if (ret == COPY_COMPLETE) {
3217 			ret = 0;
3218 			break;
3219 		} else if (ret) {
3220 			break;
3221 		}
3222 	}
3223 out:
3224 	while (!list_empty(&nocow_ctx->inodes)) {
3225 		struct scrub_nocow_inode *entry;
3226 		entry = list_first_entry(&nocow_ctx->inodes,
3227 					 struct scrub_nocow_inode,
3228 					 list);
3229 		list_del_init(&entry->list);
3230 		kfree(entry);
3231 	}
3232 	if (trans && !IS_ERR(trans))
3233 		btrfs_end_transaction(trans, root);
3234 	if (not_written)
3235 		btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
3236 					    num_uncorrectable_read_errors);
3237 
3238 	btrfs_free_path(path);
3239 	kfree(nocow_ctx);
3240 
3241 	scrub_pending_trans_workers_dec(sctx);
3242 }
3243 
3244 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3245 				      struct scrub_copy_nocow_ctx *nocow_ctx)
3246 {
3247 	struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3248 	struct btrfs_key key;
3249 	struct inode *inode;
3250 	struct page *page;
3251 	struct btrfs_root *local_root;
3252 	struct btrfs_ordered_extent *ordered;
3253 	struct extent_map *em;
3254 	struct extent_state *cached_state = NULL;
3255 	struct extent_io_tree *io_tree;
3256 	u64 physical_for_dev_replace;
3257 	u64 len = nocow_ctx->len;
3258 	u64 lockstart = offset, lockend = offset + len - 1;
3259 	unsigned long index;
3260 	int srcu_index;
3261 	int ret = 0;
3262 	int err = 0;
3263 
3264 	key.objectid = root;
3265 	key.type = BTRFS_ROOT_ITEM_KEY;
3266 	key.offset = (u64)-1;
3267 
3268 	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
3269 
3270 	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3271 	if (IS_ERR(local_root)) {
3272 		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3273 		return PTR_ERR(local_root);
3274 	}
3275 
3276 	key.type = BTRFS_INODE_ITEM_KEY;
3277 	key.objectid = inum;
3278 	key.offset = 0;
3279 	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3280 	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3281 	if (IS_ERR(inode))
3282 		return PTR_ERR(inode);
3283 
3284 	/* Avoid truncate/dio/punch hole.. */
3285 	mutex_lock(&inode->i_mutex);
3286 	inode_dio_wait(inode);
3287 
3288 	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3289 	io_tree = &BTRFS_I(inode)->io_tree;
3290 
3291 	lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
3292 	ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
3293 	if (ordered) {
3294 		btrfs_put_ordered_extent(ordered);
3295 		goto out_unlock;
3296 	}
3297 
3298 	em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
3299 	if (IS_ERR(em)) {
3300 		ret = PTR_ERR(em);
3301 		goto out_unlock;
3302 	}
3303 
3304 	/*
3305 	 * This extent does not actually cover the logical extent anymore,
3306 	 * move on to the next inode.
3307 	 */
3308 	if (em->block_start > nocow_ctx->logical ||
3309 	    em->block_start + em->block_len < nocow_ctx->logical + len) {
3310 		free_extent_map(em);
3311 		goto out_unlock;
3312 	}
3313 	free_extent_map(em);
3314 
3315 	while (len >= PAGE_CACHE_SIZE) {
3316 		index = offset >> PAGE_CACHE_SHIFT;
3317 again:
3318 		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3319 		if (!page) {
3320 			pr_err("find_or_create_page() failed\n");
3321 			ret = -ENOMEM;
3322 			goto out;
3323 		}
3324 
3325 		if (PageUptodate(page)) {
3326 			if (PageDirty(page))
3327 				goto next_page;
3328 		} else {
3329 			ClearPageError(page);
3330 			err = extent_read_full_page_nolock(io_tree, page,
3331 							   btrfs_get_extent,
3332 							   nocow_ctx->mirror_num);
3333 			if (err) {
3334 				ret = err;
3335 				goto next_page;
3336 			}
3337 
3338 			lock_page(page);
3339 			/*
3340 			 * If the page has been remove from the page cache,
3341 			 * the data on it is meaningless, because it may be
3342 			 * old one, the new data may be written into the new
3343 			 * page in the page cache.
3344 			 */
3345 			if (page->mapping != inode->i_mapping) {
3346 				unlock_page(page);
3347 				page_cache_release(page);
3348 				goto again;
3349 			}
3350 			if (!PageUptodate(page)) {
3351 				ret = -EIO;
3352 				goto next_page;
3353 			}
3354 		}
3355 		err = write_page_nocow(nocow_ctx->sctx,
3356 				       physical_for_dev_replace, page);
3357 		if (err)
3358 			ret = err;
3359 next_page:
3360 		unlock_page(page);
3361 		page_cache_release(page);
3362 
3363 		if (ret)
3364 			break;
3365 
3366 		offset += PAGE_CACHE_SIZE;
3367 		physical_for_dev_replace += PAGE_CACHE_SIZE;
3368 		len -= PAGE_CACHE_SIZE;
3369 	}
3370 	ret = COPY_COMPLETE;
3371 out_unlock:
3372 	unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
3373 			     GFP_NOFS);
3374 out:
3375 	mutex_unlock(&inode->i_mutex);
3376 	iput(inode);
3377 	return ret;
3378 }
3379 
3380 static int write_page_nocow(struct scrub_ctx *sctx,
3381 			    u64 physical_for_dev_replace, struct page *page)
3382 {
3383 	struct bio *bio;
3384 	struct btrfs_device *dev;
3385 	int ret;
3386 	DECLARE_COMPLETION_ONSTACK(compl);
3387 
3388 	dev = sctx->wr_ctx.tgtdev;
3389 	if (!dev)
3390 		return -EIO;
3391 	if (!dev->bdev) {
3392 		printk_ratelimited(KERN_WARNING
3393 			"btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3394 		return -EIO;
3395 	}
3396 	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
3397 	if (!bio) {
3398 		spin_lock(&sctx->stat_lock);
3399 		sctx->stat.malloc_errors++;
3400 		spin_unlock(&sctx->stat_lock);
3401 		return -ENOMEM;
3402 	}
3403 	bio->bi_private = &compl;
3404 	bio->bi_end_io = scrub_complete_bio_end_io;
3405 	bio->bi_size = 0;
3406 	bio->bi_sector = physical_for_dev_replace >> 9;
3407 	bio->bi_bdev = dev->bdev;
3408 	ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3409 	if (ret != PAGE_CACHE_SIZE) {
3410 leave_with_eio:
3411 		bio_put(bio);
3412 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
3413 		return -EIO;
3414 	}
3415 	btrfsic_submit_bio(WRITE_SYNC, bio);
3416 	wait_for_completion(&compl);
3417 
3418 	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
3419 		goto leave_with_eio;
3420 
3421 	bio_put(bio);
3422 	return 0;
3423 }
3424