xref: /openbmc/linux/fs/btrfs/raid56.c (revision a13144e2)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2012 Fusion-io  All rights reserved.
4  * Copyright (C) 2012 Intel Corp. All rights reserved.
5  */
6 
7 #include <linux/sched.h>
8 #include <linux/bio.h>
9 #include <linux/slab.h>
10 #include <linux/blkdev.h>
11 #include <linux/raid/pq.h>
12 #include <linux/hash.h>
13 #include <linux/list_sort.h>
14 #include <linux/raid/xor.h>
15 #include <linux/mm.h>
16 #include "messages.h"
17 #include "misc.h"
18 #include "ctree.h"
19 #include "disk-io.h"
20 #include "volumes.h"
21 #include "raid56.h"
22 #include "async-thread.h"
23 #include "file-item.h"
24 #include "btrfs_inode.h"
25 
26 /* set when additional merges to this rbio are not allowed */
27 #define RBIO_RMW_LOCKED_BIT	1
28 
29 /*
30  * set when this rbio is sitting in the hash, but it is just a cache
31  * of past RMW
32  */
33 #define RBIO_CACHE_BIT		2
34 
35 /*
36  * set when it is safe to trust the stripe_pages for caching
37  */
38 #define RBIO_CACHE_READY_BIT	3
39 
40 #define RBIO_CACHE_SIZE 1024
41 
42 #define BTRFS_STRIPE_HASH_TABLE_BITS				11
43 
44 /* Used by the raid56 code to lock stripes for read/modify/write */
45 struct btrfs_stripe_hash {
46 	struct list_head hash_list;
47 	spinlock_t lock;
48 };
49 
50 /* Used by the raid56 code to lock stripes for read/modify/write */
51 struct btrfs_stripe_hash_table {
52 	struct list_head stripe_cache;
53 	spinlock_t cache_lock;
54 	int cache_size;
55 	struct btrfs_stripe_hash table[];
56 };
57 
58 /*
59  * A bvec like structure to present a sector inside a page.
60  *
61  * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
62  */
63 struct sector_ptr {
64 	struct page *page;
65 	unsigned int pgoff:24;
66 	unsigned int uptodate:8;
67 };
68 
69 static void rmw_rbio_work(struct work_struct *work);
70 static void rmw_rbio_work_locked(struct work_struct *work);
71 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
72 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
73 
74 static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check);
75 static void scrub_rbio_work_locked(struct work_struct *work);
76 
77 static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
78 {
79 	bitmap_free(rbio->error_bitmap);
80 	kfree(rbio->stripe_pages);
81 	kfree(rbio->bio_sectors);
82 	kfree(rbio->stripe_sectors);
83 	kfree(rbio->finish_pointers);
84 }
85 
86 static void free_raid_bio(struct btrfs_raid_bio *rbio)
87 {
88 	int i;
89 
90 	if (!refcount_dec_and_test(&rbio->refs))
91 		return;
92 
93 	WARN_ON(!list_empty(&rbio->stripe_cache));
94 	WARN_ON(!list_empty(&rbio->hash_list));
95 	WARN_ON(!bio_list_empty(&rbio->bio_list));
96 
97 	for (i = 0; i < rbio->nr_pages; i++) {
98 		if (rbio->stripe_pages[i]) {
99 			__free_page(rbio->stripe_pages[i]);
100 			rbio->stripe_pages[i] = NULL;
101 		}
102 	}
103 
104 	btrfs_put_bioc(rbio->bioc);
105 	free_raid_bio_pointers(rbio);
106 	kfree(rbio);
107 }
108 
109 static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
110 {
111 	INIT_WORK(&rbio->work, work_func);
112 	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
113 }
114 
115 /*
116  * the stripe hash table is used for locking, and to collect
117  * bios in hopes of making a full stripe
118  */
119 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
120 {
121 	struct btrfs_stripe_hash_table *table;
122 	struct btrfs_stripe_hash_table *x;
123 	struct btrfs_stripe_hash *cur;
124 	struct btrfs_stripe_hash *h;
125 	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
126 	int i;
127 
128 	if (info->stripe_hash_table)
129 		return 0;
130 
131 	/*
132 	 * The table is large, starting with order 4 and can go as high as
133 	 * order 7 in case lock debugging is turned on.
134 	 *
135 	 * Try harder to allocate and fallback to vmalloc to lower the chance
136 	 * of a failing mount.
137 	 */
138 	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
139 	if (!table)
140 		return -ENOMEM;
141 
142 	spin_lock_init(&table->cache_lock);
143 	INIT_LIST_HEAD(&table->stripe_cache);
144 
145 	h = table->table;
146 
147 	for (i = 0; i < num_entries; i++) {
148 		cur = h + i;
149 		INIT_LIST_HEAD(&cur->hash_list);
150 		spin_lock_init(&cur->lock);
151 	}
152 
153 	x = cmpxchg(&info->stripe_hash_table, NULL, table);
154 	kvfree(x);
155 	return 0;
156 }
157 
158 /*
159  * caching an rbio means to copy anything from the
160  * bio_sectors array into the stripe_pages array.  We
161  * use the sector uptodate bit in the stripe_sectors array
162  * to indicate if it has valid data.
163  *
164  * once the caching is done, we set the cache ready
165  * bit.
166  */
167 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
168 {
169 	int i;
170 	int ret;
171 
172 	ret = alloc_rbio_pages(rbio);
173 	if (ret)
174 		return;
175 
176 	for (i = 0; i < rbio->nr_sectors; i++) {
177 		/* Some range not covered by bio (partial write), skip it */
178 		if (!rbio->bio_sectors[i].page) {
179 			/*
180 			 * Even if the sector is not covered by bio, if it is
181 			 * a data sector it should still be uptodate as it is
182 			 * read from disk.
183 			 */
184 			if (i < rbio->nr_data * rbio->stripe_nsectors)
185 				ASSERT(rbio->stripe_sectors[i].uptodate);
186 			continue;
187 		}
188 
189 		ASSERT(rbio->stripe_sectors[i].page);
190 		memcpy_page(rbio->stripe_sectors[i].page,
191 			    rbio->stripe_sectors[i].pgoff,
192 			    rbio->bio_sectors[i].page,
193 			    rbio->bio_sectors[i].pgoff,
194 			    rbio->bioc->fs_info->sectorsize);
195 		rbio->stripe_sectors[i].uptodate = 1;
196 	}
197 	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
198 }
199 
200 /*
201  * we hash on the first logical address of the stripe
202  */
203 static int rbio_bucket(struct btrfs_raid_bio *rbio)
204 {
205 	u64 num = rbio->bioc->raid_map[0];
206 
207 	/*
208 	 * we shift down quite a bit.  We're using byte
209 	 * addressing, and most of the lower bits are zeros.
210 	 * This tends to upset hash_64, and it consistently
211 	 * returns just one or two different values.
212 	 *
213 	 * shifting off the lower bits fixes things.
214 	 */
215 	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
216 }
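
/*
 * Illustration only, not part of the build: a hedged sketch of the bucket
 * computation above with a made-up full stripe address.  The multiplier 123
 * is purely hypothetical; the point is that full stripe starts are (at least)
 * 64K aligned in practice, so without the ">> 16" the bits fed to hash_64()
 * are mostly zero and the buckets cluster badly.
 */
#if 0
static int rbio_bucket_example(void)
{
	u64 full_stripe_start = 123ULL * 64 * 1024;

	/* Same computation as rbio_bucket(), minus the rbio plumbing. */
	return hash_64(full_stripe_start >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
#endif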
217 
218 static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
219 				       unsigned int page_nr)
220 {
221 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
222 	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
223 	int i;
224 
225 	ASSERT(page_nr < rbio->nr_pages);
226 
227 	for (i = sectors_per_page * page_nr;
228 	     i < sectors_per_page * page_nr + sectors_per_page;
229 	     i++) {
230 		if (!rbio->stripe_sectors[i].uptodate)
231 			return false;
232 	}
233 	return true;
234 }
235 
236 /*
237  * Update the stripe_sectors[] array to use correct page and pgoff
238  *
239  * Should be called every time any page pointer in stripe_pages[] is modified.
240  */
241 static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
242 {
243 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
244 	u32 offset;
245 	int i;
246 
247 	for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
248 		int page_index = offset >> PAGE_SHIFT;
249 
250 		ASSERT(page_index < rbio->nr_pages);
251 		rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
252 		rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
253 	}
254 }
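
/*
 * Illustration only, not part of the build: a hypothetical helper repeating
 * the math of index_stripe_sectors() for a single sector, to show how the
 * flat sector index is split into a page index and an offset inside that
 * page.
 */
#if 0
static struct page *stripe_sector_page_example(struct btrfs_raid_bio *rbio,
					       int sector_nr,
					       unsigned int *pgoff)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 offset = sector_nr * sectorsize;

	/* e.g. 4K sectors on 4K pages: sector N maps to page N, offset 0. */
	*pgoff = offset_in_page(offset);
	return rbio->stripe_pages[offset >> PAGE_SHIFT];
}
#endif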
255 
256 static void steal_rbio_page(struct btrfs_raid_bio *src,
257 			    struct btrfs_raid_bio *dest, int page_nr)
258 {
259 	const u32 sectorsize = src->bioc->fs_info->sectorsize;
260 	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
261 	int i;
262 
263 	if (dest->stripe_pages[page_nr])
264 		__free_page(dest->stripe_pages[page_nr]);
265 	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
266 	src->stripe_pages[page_nr] = NULL;
267 
268 	/* Also update the sector->uptodate bits. */
269 	for (i = sectors_per_page * page_nr;
270 	     i < sectors_per_page * page_nr + sectors_per_page; i++)
271 		dest->stripe_sectors[i].uptodate = true;
272 }
273 
274 static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
275 {
276 	const int sector_nr = (page_nr << PAGE_SHIFT) >>
277 			      rbio->bioc->fs_info->sectorsize_bits;
278 
279 	/*
280 	 * We have ensured PAGE_SIZE is aligned with sectorsize, thus
281 	 * we won't have a page which is half data half parity.
282 	 *
283 	 * Thus if the first sector of the page belongs to data stripes, then
284 	 * the full page belongs to data stripes.
285 	 */
286 	return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
287 }
288 
289 /*
290  * Stealing an rbio means taking all the uptodate pages from the stripe array
291  * in the source rbio and putting them into the destination rbio.
292  *
293  * This will also update the involved stripe_sectors[] which are referring to
294  * the old pages.
295  */
296 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
297 {
298 	int i;
299 
300 	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
301 		return;
302 
303 	for (i = 0; i < dest->nr_pages; i++) {
304 		struct page *p = src->stripe_pages[i];
305 
306 		/*
307 		 * We don't need to steal P/Q pages as they will always be
308 		 * regenerated for RMW or full write anyway.
309 		 */
310 		if (!is_data_stripe_page(src, i))
311 			continue;
312 
313 		/*
314 		 * If @src already has RBIO_CACHE_READY_BIT, it should have
315 		 * all data stripe pages present and uptodate.
316 		 */
317 		ASSERT(p);
318 		ASSERT(full_page_sectors_uptodate(src, i));
319 		steal_rbio_page(src, dest, i);
320 	}
321 	index_stripe_sectors(dest);
322 	index_stripe_sectors(src);
323 }
324 
325 /*
326  * merging means we take the bio_list from the victim and
327  * splice it into the destination.  The victim should
328  * be discarded afterwards.
329  *
330  * must be called with dest->rbio_list_lock held
331  */
332 static void merge_rbio(struct btrfs_raid_bio *dest,
333 		       struct btrfs_raid_bio *victim)
334 {
335 	bio_list_merge(&dest->bio_list, &victim->bio_list);
336 	dest->bio_list_bytes += victim->bio_list_bytes;
337 	/* Also inherit the bitmaps from @victim. */
338 	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
339 		  dest->stripe_nsectors);
340 	bio_list_init(&victim->bio_list);
341 }
342 
343 /*
344  * used to prune items that are in the cache.  The caller
345  * must hold the hash table lock.
346  */
347 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
348 {
349 	int bucket = rbio_bucket(rbio);
350 	struct btrfs_stripe_hash_table *table;
351 	struct btrfs_stripe_hash *h;
352 	int freeit = 0;
353 
354 	/*
355 	 * check the bit again under the hash table lock.
356 	 */
357 	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
358 		return;
359 
360 	table = rbio->bioc->fs_info->stripe_hash_table;
361 	h = table->table + bucket;
362 
363 	/* hold the lock for the bucket because we may be
364 	 * removing it from the hash table
365 	 */
366 	spin_lock(&h->lock);
367 
368 	/*
369 	 * hold the lock for the bio list because we need
370 	 * to make sure the bio list is empty
371 	 */
372 	spin_lock(&rbio->bio_list_lock);
373 
374 	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
375 		list_del_init(&rbio->stripe_cache);
376 		table->cache_size -= 1;
377 		freeit = 1;
378 
379 		/* if the bio list isn't empty, this rbio is
380 		 * still involved in an IO.  We take it out
381 		 * of the cache list, and drop the ref that
382 		 * was held for the list.
383 		 *
384 		 * If the bio_list was empty, we also remove
385 		 * the rbio from the hash_table, and drop
386 		 * the corresponding ref
387 		 */
388 		if (bio_list_empty(&rbio->bio_list)) {
389 			if (!list_empty(&rbio->hash_list)) {
390 				list_del_init(&rbio->hash_list);
391 				refcount_dec(&rbio->refs);
392 				BUG_ON(!list_empty(&rbio->plug_list));
393 			}
394 		}
395 	}
396 
397 	spin_unlock(&rbio->bio_list_lock);
398 	spin_unlock(&h->lock);
399 
400 	if (freeit)
401 		free_raid_bio(rbio);
402 }
403 
404 /*
405  * prune a given rbio from the cache
406  */
407 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
408 {
409 	struct btrfs_stripe_hash_table *table;
410 	unsigned long flags;
411 
412 	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
413 		return;
414 
415 	table = rbio->bioc->fs_info->stripe_hash_table;
416 
417 	spin_lock_irqsave(&table->cache_lock, flags);
418 	__remove_rbio_from_cache(rbio);
419 	spin_unlock_irqrestore(&table->cache_lock, flags);
420 }
421 
422 /*
423  * remove everything in the cache
424  */
425 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
426 {
427 	struct btrfs_stripe_hash_table *table;
428 	unsigned long flags;
429 	struct btrfs_raid_bio *rbio;
430 
431 	table = info->stripe_hash_table;
432 
433 	spin_lock_irqsave(&table->cache_lock, flags);
434 	while (!list_empty(&table->stripe_cache)) {
435 		rbio = list_entry(table->stripe_cache.next,
436 				  struct btrfs_raid_bio,
437 				  stripe_cache);
438 		__remove_rbio_from_cache(rbio);
439 	}
440 	spin_unlock_irqrestore(&table->cache_lock, flags);
441 }
442 
443 /*
444  * remove all cached entries and free the hash table
445  * used by unmount
446  */
447 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
448 {
449 	if (!info->stripe_hash_table)
450 		return;
451 	btrfs_clear_rbio_cache(info);
452 	kvfree(info->stripe_hash_table);
453 	info->stripe_hash_table = NULL;
454 }
455 
456 /*
457  * insert an rbio into the stripe cache.  It
458  * must have already been prepared by calling
459  * cache_rbio_pages
460  *
461  * If this rbio was already cached, it gets
462  * moved to the front of the lru.
463  *
464  * If the size of the rbio cache is too big, we
465  * prune an item.
466  */
467 static void cache_rbio(struct btrfs_raid_bio *rbio)
468 {
469 	struct btrfs_stripe_hash_table *table;
470 	unsigned long flags;
471 
472 	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
473 		return;
474 
475 	table = rbio->bioc->fs_info->stripe_hash_table;
476 
477 	spin_lock_irqsave(&table->cache_lock, flags);
478 	spin_lock(&rbio->bio_list_lock);
479 
480 	/* bump our ref if we were not in the list before */
481 	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
482 		refcount_inc(&rbio->refs);
483 
484 	if (!list_empty(&rbio->stripe_cache)) {
485 		list_move(&rbio->stripe_cache, &table->stripe_cache);
486 	} else {
487 		list_add(&rbio->stripe_cache, &table->stripe_cache);
488 		table->cache_size += 1;
489 	}
490 
491 	spin_unlock(&rbio->bio_list_lock);
492 
493 	if (table->cache_size > RBIO_CACHE_SIZE) {
494 		struct btrfs_raid_bio *found;
495 
496 		found = list_entry(table->stripe_cache.prev,
497 				  struct btrfs_raid_bio,
498 				  stripe_cache);
499 
500 		if (found != rbio)
501 			__remove_rbio_from_cache(found);
502 	}
503 
504 	spin_unlock_irqrestore(&table->cache_lock, flags);
505 }
506 
507 /*
508  * helper function to run the xor_blocks api.  It is only
509  * able to do MAX_XOR_BLOCKS at a time, so we need to
510  * loop through.
511  */
512 static void run_xor(void **pages, int src_cnt, ssize_t len)
513 {
514 	int src_off = 0;
515 	int xor_src_cnt = 0;
516 	void *dest = pages[src_cnt];
517 
518 	while (src_cnt > 0) {
519 		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
520 		xor_blocks(xor_src_cnt, len, dest, pages + src_off);
521 
522 		src_cnt -= xor_src_cnt;
523 		src_off += xor_src_cnt;
524 	}
525 }
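
/*
 * Illustration only, not part of the build: run_xor() expects the
 * destination buffer at pages[src_cnt], one past the last source, and xors
 * the sources into whatever the destination already holds.  A hedged usage
 * sketch computing p = a ^ b over hypothetical sectorsize buffers:
 */
#if 0
static void run_xor_example(void *a, void *b, void *p, u32 sectorsize)
{
	void *ptrs[3];

	/* Seed the destination with the first source ... */
	memcpy(p, a, sectorsize);

	ptrs[0] = a;
	ptrs[1] = b;
	ptrs[2] = p;	/* destination sits at ptrs[src_cnt] */

	/* ... then xor the remaining source into it: p ^= b. */
	run_xor(ptrs + 1, 1, sectorsize);
}
#endif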
526 
527 /*
528  * Returns true if the bio list inside this rbio covers an entire stripe (no
529  * rmw required).
530  */
531 static int rbio_is_full(struct btrfs_raid_bio *rbio)
532 {
533 	unsigned long flags;
534 	unsigned long size = rbio->bio_list_bytes;
535 	int ret = 1;
536 
537 	spin_lock_irqsave(&rbio->bio_list_lock, flags);
538 	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
539 		ret = 0;
540 	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
541 	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
542 
543 	return ret;
544 }
545 
546 /*
547  * returns 1 if it is safe to merge two rbios together.
548  * The merging is safe if the two rbios correspond to
549  * the same stripe and if they are both going in the same
550  * direction (read vs write), and if neither one is
551  * locked for final IO
552  *
553  * The caller is responsible for locking such that
554  * rmw_locked is safe to test
555  */
556 static int rbio_can_merge(struct btrfs_raid_bio *last,
557 			  struct btrfs_raid_bio *cur)
558 {
559 	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
560 	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
561 		return 0;
562 
563 	/*
564 	 * we can't merge with cached rbios, since the
565 	 * idea is that when we merge the destination
566 	 * rbio is going to run our IO for us.  We can
567 	 * steal from cached rbios though, other functions
568 	 * handle that.
569 	 */
570 	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
571 	    test_bit(RBIO_CACHE_BIT, &cur->flags))
572 		return 0;
573 
574 	if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
575 		return 0;
576 
577 	/* we can't merge with different operations */
578 	if (last->operation != cur->operation)
579 		return 0;
580 	/*
581 	 * A parity scrub has to read the full stripe from the drive, then
582 	 * check and repair the parity and write the new results.
583 	 *
584 	 * We're not allowed to add any new bios to the
585 	 * bio list here, anyone else that wants to
586 	 * change this stripe needs to do their own rmw.
587 	 */
588 	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
589 		return 0;
590 
591 	if (last->operation == BTRFS_RBIO_REBUILD_MISSING ||
592 	    last->operation == BTRFS_RBIO_READ_REBUILD)
593 		return 0;
594 
595 	return 1;
596 }
597 
598 static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
599 					     unsigned int stripe_nr,
600 					     unsigned int sector_nr)
601 {
602 	ASSERT(stripe_nr < rbio->real_stripes);
603 	ASSERT(sector_nr < rbio->stripe_nsectors);
604 
605 	return stripe_nr * rbio->stripe_nsectors + sector_nr;
606 }
607 
608 /* Return a sector from rbio->stripe_sectors, not from the bio list */
609 static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
610 					     unsigned int stripe_nr,
611 					     unsigned int sector_nr)
612 {
613 	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
614 							      sector_nr)];
615 }
616 
617 /* Grab a sector inside P stripe */
618 static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
619 					      unsigned int sector_nr)
620 {
621 	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
622 }
623 
624 /* Grab a sector inside Q stripe, return NULL if not RAID6 */
625 static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
626 					      unsigned int sector_nr)
627 {
628 	if (rbio->nr_data + 1 == rbio->real_stripes)
629 		return NULL;
630 	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
631 }
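
/*
 * Illustration only, not part of the build: the stripe_sectors[] layout the
 * helpers above index into, for a hypothetical RAID6 full stripe with two
 * data stripes (nr_data = 2, real_stripes = 4) and 16 sectors per stripe:
 *
 *	index  0..15   data stripe 0
 *	index 16..31   data stripe 1
 *	index 32..47   P stripe  (rbio_pstripe_sector)
 *	index 48..63   Q stripe  (rbio_qstripe_sector)
 */
#if 0
static unsigned int qstripe_index_example(const struct btrfs_raid_bio *rbio,
					  unsigned int sector_nr)
{
	/* Same math as rbio_stripe_sector_index() applied to the Q stripe. */
	return (rbio->nr_data + 1) * rbio->stripe_nsectors + sector_nr;
}
#endif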
632 
633 /*
634  * The first stripe in the table for a logical address
635  * has the lock.  rbios are added in one of three ways:
636  *
637  * 1) Nobody has the stripe locked yet.  The rbio is given
638  * the lock and 0 is returned.  The caller must start the IO
639  * themselves.
640  *
641  * 2) Someone has the stripe locked, but we're able to merge
642  * with the lock owner.  The rbio is freed and the IO will
643  * start automatically along with the existing rbio.  1 is returned.
644  *
645  * 3) Someone has the stripe locked, but we're not able to merge.
646  * The rbio is added to the lock owner's plug list, or merged into
647  * an rbio already on the plug list.  When the lock owner unlocks,
648  * the next rbio on the list is run and the IO is started automatically.
649  * 1 is returned
650  *
651  * If we return 0, the caller still owns the rbio and must continue with
652  * IO submission.  If we return 1, the caller must assume the rbio has
653  * already been freed.
654  */
655 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
656 {
657 	struct btrfs_stripe_hash *h;
658 	struct btrfs_raid_bio *cur;
659 	struct btrfs_raid_bio *pending;
660 	unsigned long flags;
661 	struct btrfs_raid_bio *freeit = NULL;
662 	struct btrfs_raid_bio *cache_drop = NULL;
663 	int ret = 0;
664 
665 	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
666 
667 	spin_lock_irqsave(&h->lock, flags);
668 	list_for_each_entry(cur, &h->hash_list, hash_list) {
669 		if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
670 			continue;
671 
672 		spin_lock(&cur->bio_list_lock);
673 
674 		/* Can we steal this cached rbio's pages? */
675 		if (bio_list_empty(&cur->bio_list) &&
676 		    list_empty(&cur->plug_list) &&
677 		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
678 		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
679 			list_del_init(&cur->hash_list);
680 			refcount_dec(&cur->refs);
681 
682 			steal_rbio(cur, rbio);
683 			cache_drop = cur;
684 			spin_unlock(&cur->bio_list_lock);
685 
686 			goto lockit;
687 		}
688 
689 		/* Can we merge into the lock owner? */
690 		if (rbio_can_merge(cur, rbio)) {
691 			merge_rbio(cur, rbio);
692 			spin_unlock(&cur->bio_list_lock);
693 			freeit = rbio;
694 			ret = 1;
695 			goto out;
696 		}
697 
698 
699 		/*
700 		 * We couldn't merge with the running rbio, see if we can merge
701 		 * with the pending ones.  We don't have to check for rmw_locked
702 		 * because there is no way they are inside finish_rmw right now
703 		 */
704 		list_for_each_entry(pending, &cur->plug_list, plug_list) {
705 			if (rbio_can_merge(pending, rbio)) {
706 				merge_rbio(pending, rbio);
707 				spin_unlock(&cur->bio_list_lock);
708 				freeit = rbio;
709 				ret = 1;
710 				goto out;
711 			}
712 		}
713 
714 		/*
715 		 * No merging, put us on the tail of the plug list, our rbio
716 		 * will be started when the currently running rbio unlocks
717 		 */
718 		list_add_tail(&rbio->plug_list, &cur->plug_list);
719 		spin_unlock(&cur->bio_list_lock);
720 		ret = 1;
721 		goto out;
722 	}
723 lockit:
724 	refcount_inc(&rbio->refs);
725 	list_add(&rbio->hash_list, &h->hash_list);
726 out:
727 	spin_unlock_irqrestore(&h->lock, flags);
728 	if (cache_drop)
729 		remove_rbio_from_cache(cache_drop);
730 	if (freeit)
731 		free_raid_bio(freeit);
732 	return ret;
733 }
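
/*
 * Illustration only, not part of the build: a hedged sketch of how callers
 * in this file react to the return value of lock_stripe_add().  The queued
 * work function below is just a stand-in; the real callers pick rmw, recover
 * or scrub work as appropriate.
 */
#if 0
static void lock_stripe_add_usage_example(struct btrfs_raid_bio *rbio)
{
	if (lock_stripe_add(rbio) == 0) {
		/* We own the stripe lock and must drive the IO ourselves. */
		start_async_work(rbio, rmw_rbio_work_locked);
		return;
	}
	/*
	 * Non-zero: the rbio was merged into or plugged behind the lock
	 * owner and may already be freed, so it must not be touched here.
	 */
}
#endif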
734 
735 static void recover_rbio_work_locked(struct work_struct *work);
736 
737 /*
738  * called as rmw or parity rebuild is completed.  If the plug list has more
739  * rbios waiting for this stripe, the next one on the list will be started
740  */
741 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
742 {
743 	int bucket;
744 	struct btrfs_stripe_hash *h;
745 	unsigned long flags;
746 	int keep_cache = 0;
747 
748 	bucket = rbio_bucket(rbio);
749 	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
750 
751 	if (list_empty(&rbio->plug_list))
752 		cache_rbio(rbio);
753 
754 	spin_lock_irqsave(&h->lock, flags);
755 	spin_lock(&rbio->bio_list_lock);
756 
757 	if (!list_empty(&rbio->hash_list)) {
758 		/*
759 		 * if we're still cached and there is no other IO
760 		 * to perform, just leave this rbio here for others
761 		 * to steal from later
762 		 */
763 		if (list_empty(&rbio->plug_list) &&
764 		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
765 			keep_cache = 1;
766 			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
767 			BUG_ON(!bio_list_empty(&rbio->bio_list));
768 			goto done;
769 		}
770 
771 		list_del_init(&rbio->hash_list);
772 		refcount_dec(&rbio->refs);
773 
774 		/*
775 		 * we use the plug list to hold all the rbios
776 		 * waiting for the chance to lock this stripe.
777 		 * hand the lock over to one of them.
778 		 */
779 		if (!list_empty(&rbio->plug_list)) {
780 			struct btrfs_raid_bio *next;
781 			struct list_head *head = rbio->plug_list.next;
782 
783 			next = list_entry(head, struct btrfs_raid_bio,
784 					  plug_list);
785 
786 			list_del_init(&rbio->plug_list);
787 
788 			list_add(&next->hash_list, &h->hash_list);
789 			refcount_inc(&next->refs);
790 			spin_unlock(&rbio->bio_list_lock);
791 			spin_unlock_irqrestore(&h->lock, flags);
792 
793 			if (next->operation == BTRFS_RBIO_READ_REBUILD)
794 				start_async_work(next, recover_rbio_work_locked);
795 			else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
796 				steal_rbio(rbio, next);
797 				start_async_work(next, recover_rbio_work_locked);
798 			} else if (next->operation == BTRFS_RBIO_WRITE) {
799 				steal_rbio(rbio, next);
800 				start_async_work(next, rmw_rbio_work_locked);
801 			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
802 				steal_rbio(rbio, next);
803 				start_async_work(next, scrub_rbio_work_locked);
804 			}
805 
806 			goto done_nolock;
807 		}
808 	}
809 done:
810 	spin_unlock(&rbio->bio_list_lock);
811 	spin_unlock_irqrestore(&h->lock, flags);
812 
813 done_nolock:
814 	if (!keep_cache)
815 		remove_rbio_from_cache(rbio);
816 }
817 
818 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
819 {
820 	struct bio *next;
821 
822 	while (cur) {
823 		next = cur->bi_next;
824 		cur->bi_next = NULL;
825 		cur->bi_status = err;
826 		bio_endio(cur);
827 		cur = next;
828 	}
829 }
830 
831 /*
832  * this frees the rbio and runs through all the bios in the
833  * bio_list and calls end_io on them
834  */
835 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
836 {
837 	struct bio *cur = bio_list_get(&rbio->bio_list);
838 	struct bio *extra;
839 
840 	kfree(rbio->csum_buf);
841 	bitmap_free(rbio->csum_bitmap);
842 	rbio->csum_buf = NULL;
843 	rbio->csum_bitmap = NULL;
844 
845 	/*
846 	 * Clear the data bitmap, as the rbio may be cached for later usage.
847 	 * Do this before unlock_stripe() so there will be no new bio
848 	 * for this rbio.
849 	 */
850 	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
851 
852 	/*
853 	 * At this moment, rbio->bio_list is empty, however since rbio does not
854 	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
855 	 * hash list, rbio may be merged with others so that rbio->bio_list
856 	 * becomes non-empty.
857 	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
858 	 * more and we can call bio_endio() on all queued bios.
859 	 */
860 	unlock_stripe(rbio);
861 	extra = bio_list_get(&rbio->bio_list);
862 	free_raid_bio(rbio);
863 
864 	rbio_endio_bio_list(cur, err);
865 	if (extra)
866 		rbio_endio_bio_list(extra, err);
867 }
868 
869 /*
870  * Get a sector pointer specified by its @stripe_nr and @sector_nr.
871  *
872  * @rbio:               The raid bio
873  * @stripe_nr:          Stripe number, valid range [0, real_stripe)
874  * @sector_nr:		Sector number inside the stripe,
875  *			valid range [0, stripe_nsectors)
876  * @bio_list_only:      Whether to use sectors inside the bio list only.
877  *
878  * The read/modify/write code wants to reuse the original bio page as much
879  * as possible, and only use stripe_sectors as fallback.
880  */
881 static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
882 					 int stripe_nr, int sector_nr,
883 					 bool bio_list_only)
884 {
885 	struct sector_ptr *sector;
886 	int index;
887 
888 	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
889 	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
890 
891 	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
892 	ASSERT(index >= 0 && index < rbio->nr_sectors);
893 
894 	spin_lock_irq(&rbio->bio_list_lock);
895 	sector = &rbio->bio_sectors[index];
896 	if (sector->page || bio_list_only) {
897 		/* Don't return sector without a valid page pointer */
898 		if (!sector->page)
899 			sector = NULL;
900 		spin_unlock_irq(&rbio->bio_list_lock);
901 		return sector;
902 	}
903 	spin_unlock_irq(&rbio->bio_list_lock);
904 
905 	return &rbio->stripe_sectors[index];
906 }
907 
908 /*
909  * Allocation and initial setup for the btrfs_raid_bio.  Note that
910  * this does not allocate any pages for rbio->stripe_pages.
911  */
912 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
913 					 struct btrfs_io_context *bioc)
914 {
915 	const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
916 	const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
917 	const unsigned int num_pages = stripe_npages * real_stripes;
918 	const unsigned int stripe_nsectors =
919 		BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
920 	const unsigned int num_sectors = stripe_nsectors * real_stripes;
921 	struct btrfs_raid_bio *rbio;
922 
923 	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
924 	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
925 	/*
926 	 * Our current stripe len should be fixed to 64k thus stripe_nsectors
927 	 * (at most 16) should be no larger than BITS_PER_LONG.
928 	 */
929 	ASSERT(stripe_nsectors <= BITS_PER_LONG);
930 
931 	rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
932 	if (!rbio)
933 		return ERR_PTR(-ENOMEM);
934 	rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
935 				     GFP_NOFS);
936 	rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
937 				    GFP_NOFS);
938 	rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
939 				       GFP_NOFS);
940 	rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
941 	rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
942 
943 	if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
944 	    !rbio->finish_pointers || !rbio->error_bitmap) {
945 		free_raid_bio_pointers(rbio);
946 		kfree(rbio);
947 		return ERR_PTR(-ENOMEM);
948 	}
949 
950 	bio_list_init(&rbio->bio_list);
951 	init_waitqueue_head(&rbio->io_wait);
952 	INIT_LIST_HEAD(&rbio->plug_list);
953 	spin_lock_init(&rbio->bio_list_lock);
954 	INIT_LIST_HEAD(&rbio->stripe_cache);
955 	INIT_LIST_HEAD(&rbio->hash_list);
956 	btrfs_get_bioc(bioc);
957 	rbio->bioc = bioc;
958 	rbio->nr_pages = num_pages;
959 	rbio->nr_sectors = num_sectors;
960 	rbio->real_stripes = real_stripes;
961 	rbio->stripe_npages = stripe_npages;
962 	rbio->stripe_nsectors = stripe_nsectors;
963 	refcount_set(&rbio->refs, 1);
964 	atomic_set(&rbio->stripes_pending, 0);
965 
966 	ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
967 	rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
968 
969 	return rbio;
970 }
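
/*
 * Illustration only, not part of the build: the geometry alloc_rbio()
 * derives for a hypothetical RAID6 full stripe on 4 devices with 4K pages
 * and 4K sectors.  All numbers in the comments are just that worked example.
 */
#if 0
static void alloc_rbio_geometry_example(void)
{
	const unsigned int real_stripes = 4;				/* 2 data + P + Q */
	const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; /* 64K / 4K = 16 */
	const unsigned int num_pages = stripe_npages * real_stripes;	/* 64 pages */
	const unsigned int stripe_nsectors = BTRFS_STRIPE_LEN >> 12;	/* 16 sectors per stripe */
	const unsigned int num_sectors = stripe_nsectors * real_stripes; /* 64 sector_ptr entries */
	const unsigned int nr_data = real_stripes - 2;			/* RAID6 has 2 parity stripes */
}
#endif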
971 
972 /* allocate pages for all the stripes in the bio, including parity */
973 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
974 {
975 	int ret;
976 
977 	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
978 	if (ret < 0)
979 		return ret;
980 	/* Mapping all sectors */
981 	index_stripe_sectors(rbio);
982 	return 0;
983 }
984 
985 /* only allocate pages for p/q stripes */
986 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
987 {
988 	const int data_pages = rbio->nr_data * rbio->stripe_npages;
989 	int ret;
990 
991 	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
992 				     rbio->stripe_pages + data_pages);
993 	if (ret < 0)
994 		return ret;
995 
996 	index_stripe_sectors(rbio);
997 	return 0;
998 }
999 
1000 /*
1001  * Return the total number of errors found in the vertical stripe of @sector_nr.
1002  *
1003  * @faila and @failb will also be updated to the first and second stripe
1004  * number of the errors.
1005  */
1006 static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
1007 				     int *faila, int *failb)
1008 {
1009 	int stripe_nr;
1010 	int found_errors = 0;
1011 
1012 	if (faila || failb) {
1013 		/*
1014 		 * Both @faila and @failb should be valid pointers if any of
1015 		 * them is specified.
1016 		 */
1017 		ASSERT(faila && failb);
1018 		*faila = -1;
1019 		*failb = -1;
1020 	}
1021 
1022 	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1023 		int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;
1024 
1025 		if (test_bit(total_sector_nr, rbio->error_bitmap)) {
1026 			found_errors++;
1027 			if (faila) {
1028 				/* Update faila and failb. */
1029 				if (*faila < 0)
1030 					*faila = stripe_nr;
1031 				else if (*failb < 0)
1032 					*failb = stripe_nr;
1033 			}
1034 		}
1035 	}
1036 	return found_errors;
1037 }
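
/*
 * Illustration only, not part of the build: the tolerance check callers
 * build on top of the counter above.  max_errors is typically 1 for RAID5
 * and 2 for RAID6, so a vertical stripe stays recoverable only while the
 * error count does not exceed it.
 */
#if 0
static bool vertical_stripe_recoverable_example(struct btrfs_raid_bio *rbio,
						int sector_nr)
{
	int found_errors;

	found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
	return found_errors <= rbio->bioc->max_errors;
}
#endif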
1038 
1039 /*
1040  * Add a single sector @sector into our list of bios for IO.
1041  *
1042  * Return 0 if everything went well.
1043  * Return <0 for error.
1044  */
1045 static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
1046 			      struct bio_list *bio_list,
1047 			      struct sector_ptr *sector,
1048 			      unsigned int stripe_nr,
1049 			      unsigned int sector_nr,
1050 			      enum req_op op)
1051 {
1052 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1053 	struct bio *last = bio_list->tail;
1054 	int ret;
1055 	struct bio *bio;
1056 	struct btrfs_io_stripe *stripe;
1057 	u64 disk_start;
1058 
1059 	/*
1060 	 * Note: here stripe_nr has taken device replace into consideration,
1061 	 * thus it can be larger than rbio->real_stripes.
1062 	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
1063 	 */
1064 	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
1065 	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
1066 	ASSERT(sector->page);
1067 
1068 	stripe = &rbio->bioc->stripes[stripe_nr];
1069 	disk_start = stripe->physical + sector_nr * sectorsize;
1070 
1071 	/* if the device is missing, just fail this stripe */
1072 	if (!stripe->dev->bdev) {
1073 		int found_errors;
1074 
1075 		set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
1076 			rbio->error_bitmap);
1077 
1078 		/* Check if we have reached tolerance early. */
1079 		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
1080 							 NULL, NULL);
1081 		if (found_errors > rbio->bioc->max_errors)
1082 			return -EIO;
1083 		return 0;
1084 	}
1085 
1086 	/* see if we can add this page onto our existing bio */
1087 	if (last) {
1088 		u64 last_end = last->bi_iter.bi_sector << 9;
1089 		last_end += last->bi_iter.bi_size;
1090 
1091 		/*
1092 		 * we can't merge these if they are from different
1093 		 * devices or if they are not contiguous
1094 		 */
1095 		if (last_end == disk_start && !last->bi_status &&
1096 		    last->bi_bdev == stripe->dev->bdev) {
1097 			ret = bio_add_page(last, sector->page, sectorsize,
1098 					   sector->pgoff);
1099 			if (ret == sectorsize)
1100 				return 0;
1101 		}
1102 	}
1103 
1104 	/* put a new bio on the list */
1105 	bio = bio_alloc(stripe->dev->bdev,
1106 			max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
1107 			op, GFP_NOFS);
1108 	bio->bi_iter.bi_sector = disk_start >> 9;
1109 	bio->bi_private = rbio;
1110 
1111 	bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
1112 	bio_list_add(bio_list, bio);
1113 	return 0;
1114 }
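
/*
 * Illustration only, not part of the build: the "can we append to the tail
 * bio" test above, pulled out into a hypothetical stand-alone helper.  Two
 * sectors may share a bio only when they target the same block device and
 * the new sector starts exactly where the previous bio ends.
 */
#if 0
static bool can_append_to_last_bio_example(struct bio *last,
					   struct block_device *bdev,
					   u64 disk_start)
{
	u64 last_end = (last->bi_iter.bi_sector << SECTOR_SHIFT) +
		       last->bi_iter.bi_size;

	return last_end == disk_start && !last->bi_status &&
	       last->bi_bdev == bdev;
}
#endif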
1115 
1116 static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
1117 {
1118 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1119 	struct bio_vec bvec;
1120 	struct bvec_iter iter;
1121 	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1122 		     rbio->bioc->raid_map[0];
1123 
1124 	bio_for_each_segment(bvec, bio, iter) {
1125 		u32 bvec_offset;
1126 
1127 		for (bvec_offset = 0; bvec_offset < bvec.bv_len;
1128 		     bvec_offset += sectorsize, offset += sectorsize) {
1129 			int index = offset / sectorsize;
1130 			struct sector_ptr *sector = &rbio->bio_sectors[index];
1131 
1132 			sector->page = bvec.bv_page;
1133 			sector->pgoff = bvec.bv_offset + bvec_offset;
1134 			ASSERT(sector->pgoff < PAGE_SIZE);
1135 		}
1136 	}
1137 }
1138 
1139 /*
1140  * Helper function to walk our bio list and populate the bio_sectors array
1141  * with the result.  This seems expensive, but it is faster than constantly
1142  * searching through the bio list as we set up the IO for RMW or stripe
1143  * reconstruction.
1144  *
1145  * This must be called before you trust the answers from sector_in_rbio().
1146  */
1147 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1148 {
1149 	struct bio *bio;
1150 
1151 	spin_lock_irq(&rbio->bio_list_lock);
1152 	bio_list_for_each(bio, &rbio->bio_list)
1153 		index_one_bio(rbio, bio);
1154 
1155 	spin_unlock_irq(&rbio->bio_list_lock);
1156 }
1157 
1158 static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
1159 			       struct raid56_bio_trace_info *trace_info)
1160 {
1161 	const struct btrfs_io_context *bioc = rbio->bioc;
1162 	int i;
1163 
1164 	ASSERT(bioc);
1165 
1166 	/* We rely on bio->bi_bdev to find the stripe number. */
1167 	if (!bio->bi_bdev)
1168 		goto not_found;
1169 
1170 	for (i = 0; i < bioc->num_stripes; i++) {
1171 		if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
1172 			continue;
1173 		trace_info->stripe_nr = i;
1174 		trace_info->devid = bioc->stripes[i].dev->devid;
1175 		trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1176 				     bioc->stripes[i].physical;
1177 		return;
1178 	}
1179 
1180 not_found:
1181 	trace_info->devid = -1;
1182 	trace_info->offset = -1;
1183 	trace_info->stripe_nr = -1;
1184 }
1185 
1186 /* Generate PQ for one vertical stripe. */
1187 static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
1188 {
1189 	void **pointers = rbio->finish_pointers;
1190 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1191 	struct sector_ptr *sector;
1192 	int stripe;
1193 	const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
1194 
1195 	/* First collect one sector from each data stripe */
1196 	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1197 		sector = sector_in_rbio(rbio, stripe, sectornr, 0);
1198 		pointers[stripe] = kmap_local_page(sector->page) +
1199 				   sector->pgoff;
1200 	}
1201 
1202 	/* Then add the parity stripe */
1203 	sector = rbio_pstripe_sector(rbio, sectornr);
1204 	sector->uptodate = 1;
1205 	pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
1206 
1207 	if (has_qstripe) {
1208 		/*
1209 		 * RAID6, add the qstripe and call the library function
1210 		 * to fill in our p/q
1211 		 */
1212 		sector = rbio_qstripe_sector(rbio, sectornr);
1213 		sector->uptodate = 1;
1214 		pointers[stripe++] = kmap_local_page(sector->page) +
1215 				     sector->pgoff;
1216 
1217 		raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
1218 					pointers);
1219 	} else {
1220 		/* raid5 */
1221 		memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
1222 		run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
1223 	}
1224 	for (stripe = stripe - 1; stripe >= 0; stripe--)
1225 		kunmap_local(pointers[stripe]);
1226 }
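
/*
 * Illustration only, not part of the build: the RAID5 branch above reduces
 * to a byte-wise xor of every data sector in the vertical stripe.  A hedged
 * sketch over plain buffers (the RAID6 Q syndrome has no such simple form
 * and is left to raid6_call.gen_syndrome()):
 */
#if 0
static void raid5_parity_example(void **data, int nr_data, u8 *parity, u32 len)
{
	u32 i;
	int d;

	for (i = 0; i < len; i++) {
		u8 p = 0;

		for (d = 0; d < nr_data; d++)
			p ^= ((u8 *)data[d])[i];
		parity[i] = p;
	}
}
#endif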
1227 
1228 static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
1229 				   struct bio_list *bio_list)
1230 {
1231 	struct bio *bio;
1232 	/* The total sector number inside the full stripe. */
1233 	int total_sector_nr;
1234 	int sectornr;
1235 	int stripe;
1236 	int ret;
1237 
1238 	ASSERT(bio_list_size(bio_list) == 0);
1239 
1240 	/* We should have at least one data sector. */
1241 	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
1242 
1243 	/*
1244 	 * Reset errors, as we may have errors inherited from a degraded
1245 	 * write.
1246 	 */
1247 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
1248 
1249 	/*
1250 	 * Start assembly.  Make bios for everything from the higher layers (the
1251 	 * bio_list in our rbio) and our P/Q.  Ignore everything else.
1252 	 */
1253 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1254 	     total_sector_nr++) {
1255 		struct sector_ptr *sector;
1256 
1257 		stripe = total_sector_nr / rbio->stripe_nsectors;
1258 		sectornr = total_sector_nr % rbio->stripe_nsectors;
1259 
1260 		/* This vertical stripe has no data, skip it. */
1261 		if (!test_bit(sectornr, &rbio->dbitmap))
1262 			continue;
1263 
1264 		if (stripe < rbio->nr_data) {
1265 			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1266 			if (!sector)
1267 				continue;
1268 		} else {
1269 			sector = rbio_stripe_sector(rbio, stripe, sectornr);
1270 		}
1271 
1272 		ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
1273 					 sectornr, REQ_OP_WRITE);
1274 		if (ret)
1275 			goto error;
1276 	}
1277 
1278 	if (likely(!rbio->bioc->num_tgtdevs))
1279 		return 0;
1280 
1281 	/* Make a copy for the replace target device. */
1282 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1283 	     total_sector_nr++) {
1284 		struct sector_ptr *sector;
1285 
1286 		stripe = total_sector_nr / rbio->stripe_nsectors;
1287 		sectornr = total_sector_nr % rbio->stripe_nsectors;
1288 
1289 		if (!rbio->bioc->tgtdev_map[stripe]) {
1290 			/*
1291 			 * We can skip the whole stripe completely, note
1292 			 * total_sector_nr will be increased by one anyway.
1293 			 */
1294 			ASSERT(sectornr == 0);
1295 			total_sector_nr += rbio->stripe_nsectors - 1;
1296 			continue;
1297 		}
1298 
1299 		/* This vertical stripe has no data, skip it. */
1300 		if (!test_bit(sectornr, &rbio->dbitmap))
1301 			continue;
1302 
1303 		if (stripe < rbio->nr_data) {
1304 			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1305 			if (!sector)
1306 				continue;
1307 		} else {
1308 			sector = rbio_stripe_sector(rbio, stripe, sectornr);
1309 		}
1310 
1311 		ret = rbio_add_io_sector(rbio, bio_list, sector,
1312 					 rbio->bioc->tgtdev_map[stripe],
1313 					 sectornr, REQ_OP_WRITE);
1314 		if (ret)
1315 			goto error;
1316 	}
1317 
1318 	return 0;
1319 error:
1320 	while ((bio = bio_list_pop(bio_list)))
1321 		bio_put(bio);
1322 	return -EIO;
1323 }
1324 
1325 static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
1326 {
1327 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1328 	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1329 		     rbio->bioc->raid_map[0];
1330 	int total_nr_sector = offset >> fs_info->sectorsize_bits;
1331 
1332 	ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
1333 
1334 	bitmap_set(rbio->error_bitmap, total_nr_sector,
1335 		   bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
1336 
1337 	/*
1338 	 * Special handling for raid56_alloc_missing_rbio() used by
1339 	 * scrub/replace.  Unlike the call path in raid56_parity_recover(), they
1340 	 * pass an empty bio here.  Thus we have to find out the missing device
1341 	 * and mark the stripe error instead.
1342 	 */
1343 	if (bio->bi_iter.bi_size == 0) {
1344 		bool found_missing = false;
1345 		int stripe_nr;
1346 
1347 		for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1348 			if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
1349 				found_missing = true;
1350 				bitmap_set(rbio->error_bitmap,
1351 					   stripe_nr * rbio->stripe_nsectors,
1352 					   rbio->stripe_nsectors);
1353 			}
1354 		}
1355 		ASSERT(found_missing);
1356 	}
1357 }
1358 
1359 /*
1360  * For subpage case, we can no longer set page Uptodate directly for
1361  * stripe_pages[], thus we need to locate the sector.
1362  */
1363 static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
1364 					     struct page *page,
1365 					     unsigned int pgoff)
1366 {
1367 	int i;
1368 
1369 	for (i = 0; i < rbio->nr_sectors; i++) {
1370 		struct sector_ptr *sector = &rbio->stripe_sectors[i];
1371 
1372 		if (sector->page == page && sector->pgoff == pgoff)
1373 			return sector;
1374 	}
1375 	return NULL;
1376 }
1377 
1378 /*
1379  * This sets each sector in the bio uptodate.  It should only be used on
1380  * private rbio pages, nothing that comes in from the higher layers.
1381  */
1382 static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
1383 {
1384 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1385 	struct bio_vec *bvec;
1386 	struct bvec_iter_all iter_all;
1387 
1388 	ASSERT(!bio_flagged(bio, BIO_CLONED));
1389 
1390 	bio_for_each_segment_all(bvec, bio, iter_all) {
1391 		struct sector_ptr *sector;
1392 		int pgoff;
1393 
1394 		for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
1395 		     pgoff += sectorsize) {
1396 			sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
1397 			ASSERT(sector);
1398 			if (sector)
1399 				sector->uptodate = 1;
1400 		}
1401 	}
1402 }
1403 
1404 static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
1405 {
1406 	struct bio_vec *bv = bio_first_bvec_all(bio);
1407 	int i;
1408 
1409 	for (i = 0; i < rbio->nr_sectors; i++) {
1410 		struct sector_ptr *sector;
1411 
1412 		sector = &rbio->stripe_sectors[i];
1413 		if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
1414 			break;
1415 		sector = &rbio->bio_sectors[i];
1416 		if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
1417 			break;
1418 	}
1419 	ASSERT(i < rbio->nr_sectors);
1420 	return i;
1421 }
1422 
1423 static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
1424 {
1425 	int total_sector_nr = get_bio_sector_nr(rbio, bio);
1426 	u32 bio_size = 0;
1427 	struct bio_vec *bvec;
1428 	struct bvec_iter_all iter_all;
1429 
1430 	bio_for_each_segment_all(bvec, bio, iter_all)
1431 		bio_size += bvec->bv_len;
1432 
1433 	bitmap_set(rbio->error_bitmap, total_sector_nr,
1434 		   bio_size >> rbio->bioc->fs_info->sectorsize_bits);
1435 }
1436 
1437 /* Verify the data sectors at read time. */
1438 static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
1439 				    struct bio *bio)
1440 {
1441 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1442 	int total_sector_nr = get_bio_sector_nr(rbio, bio);
1443 	struct bio_vec *bvec;
1444 	struct bvec_iter_all iter_all;
1445 
1446 	/* No data csum for the whole stripe, no need to verify. */
1447 	if (!rbio->csum_bitmap || !rbio->csum_buf)
1448 		return;
1449 
1450 	/* P/Q stripes, they have no data csum to verify against. */
1451 	if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
1452 		return;
1453 
1454 	bio_for_each_segment_all(bvec, bio, iter_all) {
1455 		int bv_offset;
1456 
1457 		for (bv_offset = bvec->bv_offset;
1458 		     bv_offset < bvec->bv_offset + bvec->bv_len;
1459 		     bv_offset += fs_info->sectorsize, total_sector_nr++) {
1460 			u8 csum_buf[BTRFS_CSUM_SIZE];
1461 			u8 *expected_csum = rbio->csum_buf +
1462 					    total_sector_nr * fs_info->csum_size;
1463 			int ret;
1464 
1465 			/* No csum for this sector, skip to the next sector. */
1466 			if (!test_bit(total_sector_nr, rbio->csum_bitmap))
1467 				continue;
1468 
1469 			ret = btrfs_check_sector_csum(fs_info, bvec->bv_page,
1470 				bv_offset, csum_buf, expected_csum);
1471 			if (ret < 0)
1472 				set_bit(total_sector_nr, rbio->error_bitmap);
1473 		}
1474 	}
1475 }
1476 
1477 static void raid_wait_read_end_io(struct bio *bio)
1478 {
1479 	struct btrfs_raid_bio *rbio = bio->bi_private;
1480 
1481 	if (bio->bi_status) {
1482 		rbio_update_error_bitmap(rbio, bio);
1483 	} else {
1484 		set_bio_pages_uptodate(rbio, bio);
1485 		verify_bio_data_sectors(rbio, bio);
1486 	}
1487 
1488 	bio_put(bio);
1489 	if (atomic_dec_and_test(&rbio->stripes_pending))
1490 		wake_up(&rbio->io_wait);
1491 }
1492 
1493 static void submit_read_bios(struct btrfs_raid_bio *rbio,
1494 			     struct bio_list *bio_list)
1495 {
1496 	struct bio *bio;
1497 
1498 	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
1499 	while ((bio = bio_list_pop(bio_list))) {
1500 		bio->bi_end_io = raid_wait_read_end_io;
1501 
1502 		if (trace_raid56_scrub_read_recover_enabled()) {
1503 			struct raid56_bio_trace_info trace_info = { 0 };
1504 
1505 			bio_get_trace_info(rbio, bio, &trace_info);
1506 			trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
1507 		}
1508 		submit_bio(bio);
1509 	}
1510 }
1511 
1512 static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio,
1513 				  struct bio_list *bio_list)
1514 {
1515 	struct bio *bio;
1516 	int total_sector_nr;
1517 	int ret = 0;
1518 
1519 	ASSERT(bio_list_size(bio_list) == 0);
1520 
1521 	/*
1522 	 * Build a list of bios to read all sectors (including data and P/Q).
1523 	 *
1524 	 * This behavior is to compensate for the later csum verification and
1525 	 * recovery.
1526 	 */
1527 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1528 	     total_sector_nr++) {
1529 		struct sector_ptr *sector;
1530 		int stripe = total_sector_nr / rbio->stripe_nsectors;
1531 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
1532 
1533 		sector = rbio_stripe_sector(rbio, stripe, sectornr);
1534 		ret = rbio_add_io_sector(rbio, bio_list, sector,
1535 			       stripe, sectornr, REQ_OP_READ);
1536 		if (ret)
1537 			goto cleanup;
1538 	}
1539 	return 0;
1540 
1541 cleanup:
1542 	while ((bio = bio_list_pop(bio_list)))
1543 		bio_put(bio);
1544 	return ret;
1545 }
1546 
1547 static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
1548 {
1549 	const int data_pages = rbio->nr_data * rbio->stripe_npages;
1550 	int ret;
1551 
1552 	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages);
1553 	if (ret < 0)
1554 		return ret;
1555 
1556 	index_stripe_sectors(rbio);
1557 	return 0;
1558 }
1559 
1560 /*
1561  * We use plugging callbacks to collect full stripes.
1562  * Any time we get a partial stripe write while plugged
1563  * we collect it into a list.  When the unplug comes down,
1564  * we sort the list by logical block number and merge
1565  * everything we can into the same rbios
1566  */
1567 struct btrfs_plug_cb {
1568 	struct blk_plug_cb cb;
1569 	struct btrfs_fs_info *info;
1570 	struct list_head rbio_list;
1571 	struct work_struct work;
1572 };
1573 
1574 /*
1575  * rbios on the plug list are sorted for easier merging.
1576  */
1577 static int plug_cmp(void *priv, const struct list_head *a,
1578 		    const struct list_head *b)
1579 {
1580 	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1581 						       plug_list);
1582 	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1583 						       plug_list);
1584 	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1585 	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1586 
1587 	if (a_sector < b_sector)
1588 		return -1;
1589 	if (a_sector > b_sector)
1590 		return 1;
1591 	return 0;
1592 }
1593 
1594 static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1595 {
1596 	struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
1597 	struct btrfs_raid_bio *cur;
1598 	struct btrfs_raid_bio *last = NULL;
1599 
1600 	list_sort(NULL, &plug->rbio_list, plug_cmp);
1601 
1602 	while (!list_empty(&plug->rbio_list)) {
1603 		cur = list_entry(plug->rbio_list.next,
1604 				 struct btrfs_raid_bio, plug_list);
1605 		list_del_init(&cur->plug_list);
1606 
1607 		if (rbio_is_full(cur)) {
1608 			/* We have a full stripe, queue it down. */
1609 			start_async_work(cur, rmw_rbio_work);
1610 			continue;
1611 		}
1612 		if (last) {
1613 			if (rbio_can_merge(last, cur)) {
1614 				merge_rbio(last, cur);
1615 				free_raid_bio(cur);
1616 				continue;
1617 			}
1618 			start_async_work(last, rmw_rbio_work);
1619 		}
1620 		last = cur;
1621 	}
1622 	if (last)
1623 		start_async_work(last, rmw_rbio_work);
1624 	kfree(plug);
1625 }
1626 
1627 /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
1628 static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
1629 {
1630 	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1631 	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
1632 	const u64 full_stripe_start = rbio->bioc->raid_map[0];
1633 	const u32 orig_len = orig_bio->bi_iter.bi_size;
1634 	const u32 sectorsize = fs_info->sectorsize;
1635 	u64 cur_logical;
1636 
1637 	ASSERT(orig_logical >= full_stripe_start &&
1638 	       orig_logical + orig_len <= full_stripe_start +
1639 	       rbio->nr_data * BTRFS_STRIPE_LEN);
1640 
1641 	bio_list_add(&rbio->bio_list, orig_bio);
1642 	rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
1643 
1644 	/* Update the dbitmap. */
1645 	for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
1646 	     cur_logical += sectorsize) {
1647 		int bit = ((u32)(cur_logical - full_stripe_start) >>
1648 			   fs_info->sectorsize_bits) % rbio->stripe_nsectors;
1649 
1650 		set_bit(bit, &rbio->dbitmap);
1651 	}
1652 }
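
/*
 * Illustration only, not part of the build: the dbitmap math above as a
 * hypothetical helper.  Worked example, assuming 4K sectors and 16 sectors
 * per stripe: a bio starting 68K into the full stripe gives
 * (68K >> 12) % 16 = 17 % 16 = 1, i.e. bit 1.  The bitmap tracks which
 * vertical stripes (sector columns) contain data, not which data stripe the
 * bytes live in.
 */
#if 0
static int dbitmap_bit_example(const struct btrfs_raid_bio *rbio,
			       u64 logical, u64 full_stripe_start)
{
	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;

	return ((u32)(logical - full_stripe_start) >>
		fs_info->sectorsize_bits) % rbio->stripe_nsectors;
}
#endif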
1653 
1654 /*
1655  * our main entry point for writes from the rest of the FS.
1656  */
1657 void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
1658 {
1659 	struct btrfs_fs_info *fs_info = bioc->fs_info;
1660 	struct btrfs_raid_bio *rbio;
1661 	struct btrfs_plug_cb *plug = NULL;
1662 	struct blk_plug_cb *cb;
1663 	int ret = 0;
1664 
1665 	rbio = alloc_rbio(fs_info, bioc);
1666 	if (IS_ERR(rbio)) {
1667 		ret = PTR_ERR(rbio);
1668 		goto fail;
1669 	}
1670 	rbio->operation = BTRFS_RBIO_WRITE;
1671 	rbio_add_bio(rbio, bio);
1672 
1673 	/*
1674 	 * Don't plug on full rbios, just get them out the door
1675 	 * as quickly as we can
1676 	 */
1677 	if (rbio_is_full(rbio))
1678 		goto queue_rbio;
1679 
1680 	cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
1681 	if (cb) {
1682 		plug = container_of(cb, struct btrfs_plug_cb, cb);
1683 		if (!plug->info) {
1684 			plug->info = fs_info;
1685 			INIT_LIST_HEAD(&plug->rbio_list);
1686 		}
1687 		list_add_tail(&rbio->plug_list, &plug->rbio_list);
1688 		return;
1689 	}
1690 queue_rbio:
1691 	/*
1692 	 * Either we don't have any existing plug, or we're doing a full stripe,
1693 	 * so we can queue the rmw work now.
1694 	 */
1695 	start_async_work(rbio, rmw_rbio_work);
1696 
1697 	return;
1698 
1699 fail:
1700 	bio->bi_status = errno_to_blk_status(ret);
1701 	bio_endio(bio);
1702 }
1703 
1704 static int verify_one_sector(struct btrfs_raid_bio *rbio,
1705 			     int stripe_nr, int sector_nr)
1706 {
1707 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1708 	struct sector_ptr *sector;
1709 	u8 csum_buf[BTRFS_CSUM_SIZE];
1710 	u8 *csum_expected;
1711 	int ret;
1712 
1713 	if (!rbio->csum_bitmap || !rbio->csum_buf)
1714 		return 0;
1715 
1716 	/* No way to verify P/Q as they are not covered by data csum. */
1717 	if (stripe_nr >= rbio->nr_data)
1718 		return 0;
1719 	/*
1720 	 * If we're rebuilding a read, we have to use pages from the
1721 	 * bio list if possible.
1722 	 */
1723 	if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1724 	     rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
1725 		sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
1726 	} else {
1727 		sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
1728 	}
1729 
1730 	ASSERT(sector->page);
1731 
1732 	csum_expected = rbio->csum_buf +
1733 			(stripe_nr * rbio->stripe_nsectors + sector_nr) *
1734 			fs_info->csum_size;
1735 	ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff,
1736 				      csum_buf, csum_expected);
1737 	return ret;
1738 }
1739 
1740 /*
1741  * Recover a vertical stripe specified by @sector_nr.
1742  * @pointers are pre-allocated by the caller, so we don't
1743  * need to allocate/free the pointers again and again.
1744  */
1745 static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
1746 			    void **pointers, void **unmap_array)
1747 {
1748 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1749 	struct sector_ptr *sector;
1750 	const u32 sectorsize = fs_info->sectorsize;
1751 	int found_errors;
1752 	int faila;
1753 	int failb;
1754 	int stripe_nr;
1755 	int ret = 0;
1756 
1757 	/*
1758 	 * Now we just use bitmap to mark the horizontal stripes in
1759 	 * which we have data when doing parity scrub.
1760 	 */
1761 	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1762 	    !test_bit(sector_nr, &rbio->dbitmap))
1763 		return 0;
1764 
1765 	found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
1766 						 &failb);
1767 	/*
1768 	 * No errors in the vertical stripe, skip it.  Can happen for recovery
1769 	 * in which only part of a stripe failed the csum check.
1770 	 */
1771 	if (!found_errors)
1772 		return 0;
1773 
1774 	if (found_errors > rbio->bioc->max_errors)
1775 		return -EIO;
1776 
1777 	/*
1778 	 * Setup our array of pointers with sectors from each stripe
1779 	 *
1780 	 * NOTE: store a duplicate array of pointers to preserve the
1781 	 * pointer order.
1782 	 */
1783 	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1784 		/*
1785 		 * If we're rebuilding a read, we have to use pages from the
1786 		 * bio list if possible.
1787 		 */
1788 		if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1789 		     rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
1790 			sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
1791 		} else {
1792 			sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
1793 		}
1794 		ASSERT(sector->page);
1795 		pointers[stripe_nr] = kmap_local_page(sector->page) +
1796 				   sector->pgoff;
1797 		unmap_array[stripe_nr] = pointers[stripe_nr];
1798 	}
1799 
1800 	/* All raid6 handling here */
1801 	if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1802 		/* Single failure, rebuild from parity raid5 style */
1803 		if (failb < 0) {
1804 			if (faila == rbio->nr_data)
1805 				/*
1806 				 * Just the P stripe has failed, without
1807 				 * a bad data or Q stripe.
1808 				 * We have nothing to do, just skip the
1809 				 * recovery for this stripe.
1810 				 */
1811 				goto cleanup;
1812 			/*
1813 			 * a single failure in raid6 is rebuilt
1814 			 * in the pstripe code below
1815 			 */
1816 			goto pstripe;
1817 		}
1818 
1819 		/*
1820 		 * If the Q stripe has failed, do a P-stripe reconstruction from
1821 		 * the xors.
1822 		 * If both the Q stripe and the P stripe have failed, we're
1823 		 * here due to a crc mismatch and we can't give them the
1824 		 * data they want.
1825 		 */
1826 		if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
1827 			if (rbio->bioc->raid_map[faila] ==
1828 			    RAID5_P_STRIPE)
1829 				/*
1830 				 * Only P and Q are corrupted.
1831 				 * We only care about recovering the data
1832 				 * stripes, so we can skip this vertical stripe.
1833 				 */
1834 				goto cleanup;
1835 			/*
1836 			 * Otherwise we have one bad data stripe and
1837 			 * a good P stripe.  raid5!
1838 			 */
1839 			goto pstripe;
1840 		}
1841 
1842 		if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
1843 			raid6_datap_recov(rbio->real_stripes, sectorsize,
1844 					  faila, pointers);
1845 		} else {
1846 			raid6_2data_recov(rbio->real_stripes, sectorsize,
1847 					  faila, failb, pointers);
1848 		}
1849 	} else {
1850 		void *p;
1851 
1852 		/* Rebuild from P stripe here (raid5 or raid6). */
1853 		ASSERT(failb == -1);
1854 pstripe:
1855 		/* Copy parity block into failed block to start with */
1856 		memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
1857 
1858 		/* Rearrange the pointer array */
1859 		p = pointers[faila];
1860 		for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
1861 		     stripe_nr++)
1862 			pointers[stripe_nr] = pointers[stripe_nr + 1];
1863 		pointers[rbio->nr_data - 1] = p;
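		/*
		 * A worked example with hypothetical values, assuming
		 * run_xor() xors the first @src_cnt buffers into the buffer
		 * at pointers[src_cnt]: with nr_data = 4 and faila = 1, the
		 * memcpy and rotation above turn [d0, d1, d2, d3] into
		 * [d0, d2, d3, <d1's buffer holding a copy of P>], so the
		 * xor below leaves P ^ d0 ^ d2 ^ d3 = d1 in d1's buffer.
		 */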
1864 
1865 		/* Xor in the rest */
1866 		run_xor(pointers, rbio->nr_data - 1, sectorsize);
1867 
1868 	}
1869 
1870 	/*
1871 	 * No matter if this is RMW or recovery, we should have all failed
1872 	 * sectors in the vertical stripe repaired by now, thus they are
1873 	 * uptodate.
1874 	 * This matters especially if we decide to cache the rbio, as a cached
1875 	 * rbio needs at least all of its data sectors uptodate.
1876 	 *
1877 	 * If possible, also check if the repaired sector matches its data
1878 	 * checksum.
1879 	 */
1880 	if (faila >= 0) {
1881 		ret = verify_one_sector(rbio, faila, sector_nr);
1882 		if (ret < 0)
1883 			goto cleanup;
1884 
1885 		sector = rbio_stripe_sector(rbio, faila, sector_nr);
1886 		sector->uptodate = 1;
1887 	}
1888 	if (failb >= 0) {
1889 		ret = verify_one_sector(rbio, failb, sector_nr);
1890 		if (ret < 0)
1891 			goto cleanup;
1892 
1893 		sector = rbio_stripe_sector(rbio, failb, sector_nr);
1894 		sector->uptodate = 1;
1895 	}
1896 
1897 cleanup:
1898 	for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
1899 		kunmap_local(unmap_array[stripe_nr]);
1900 	return ret;
1901 }
1902 
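/*
 * Recover all vertical stripes of the rbio.
 *
 * This allocates the pointer arrays once, marks the rbio RMW-locked for
 * read-rebuild operations so no new bios can be merged in, then runs
 * recover_vertical() for every sector number in the stripe.
 */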
1903 static int recover_sectors(struct btrfs_raid_bio *rbio)
1904 {
1905 	void **pointers = NULL;
1906 	void **unmap_array = NULL;
1907 	int sectornr;
1908 	int ret = 0;
1909 
1910 	/*
1911 	 * @pointers array stores the pointer for each sector.
1912 	 *
1913 	 * @unmap_array stores copy of pointers that does not get reordered
1914 	 * during reconstruction so that kunmap_local works.
1915 	 */
1916 	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1917 	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1918 	if (!pointers || !unmap_array) {
1919 		ret = -ENOMEM;
1920 		goto out;
1921 	}
1922 
1923 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1924 	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1925 		spin_lock_irq(&rbio->bio_list_lock);
1926 		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1927 		spin_unlock_irq(&rbio->bio_list_lock);
1928 	}
1929 
1930 	index_rbio_pages(rbio);
1931 
1932 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1933 		ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
1934 		if (ret < 0)
1935 			break;
1936 	}
1937 
1938 out:
1939 	kfree(pointers);
1940 	kfree(unmap_array);
1941 	return ret;
1942 }
1943 
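/*
 * Build the list of read bios needed for recovery.
 *
 * Every sector that is not already marked in the error bitmap and whose
 * device is present gets a read queued; missing devices get their error
 * bits set so the recovery code treats them as failed.
 */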
1944 static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio,
1945 				      struct bio_list *bio_list)
1946 {
1947 	struct bio *bio;
1948 	int total_sector_nr;
1949 	int ret = 0;
1950 
1951 	ASSERT(bio_list_size(bio_list) == 0);
1952 	/*
1953 	 * Read everything that hasn't failed.  However, this time we will
1954 	 * not trust any cached sector.
1955 	 * The cache may contain stale data for parts of the stripe that the
1956 	 * higher layer is not reading.
1957 	 *
1958 	 * So in the recovery path we always re-read everything from disk.
1959 	 */
1960 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1961 	     total_sector_nr++) {
1962 		int stripe = total_sector_nr / rbio->stripe_nsectors;
1963 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
1964 		struct sector_ptr *sector;
1965 
1966 		/*
1967 		 * Skip any range that has an error.  It can be a range already
1968 		 * marked in the error bitmap (e.g. a csum mismatch), or it can
1969 		 * be a missing device.
1970 		 */
1971 		if (!rbio->bioc->stripes[stripe].dev->bdev ||
1972 		    test_bit(total_sector_nr, rbio->error_bitmap)) {
1973 			/*
1974 			 * Also set the error bit for missing device, which
1975 			 * may not yet have its error bit set.
1976 			 */
1977 			set_bit(total_sector_nr, rbio->error_bitmap);
1978 			continue;
1979 		}
1980 
1981 		sector = rbio_stripe_sector(rbio, stripe, sectornr);
1982 		ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
1983 					 sectornr, REQ_OP_READ);
1984 		if (ret < 0)
1985 			goto error;
1986 	}
1987 	return 0;
1988 error:
1989 	while ((bio = bio_list_pop(bio_list)))
1990 		bio_put(bio);
1991 
1992 	return -EIO;
1993 }
1994 
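/*
 * The main recovery routine: allocate all stripe pages (P/Q included),
 * re-read every sector that can be read, wait for the IO to finish, then
 * rebuild the failed sectors with recover_sectors().
 */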
1995 static int recover_rbio(struct btrfs_raid_bio *rbio)
1996 {
1997 	struct bio_list bio_list;
1998 	struct bio *bio;
1999 	int ret;
2000 
2001 	/*
2002 	 * Whether we're doing recovery for a read failure or a degraded write,
2003 	 * the caller should have set the error bitmap correctly.
2004 	 */
2005 	ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
2006 	bio_list_init(&bio_list);
2007 
2008 	/* For recovery, we need to read all sectors including P/Q. */
2009 	ret = alloc_rbio_pages(rbio);
2010 	if (ret < 0)
2011 		goto out;
2012 
2013 	index_rbio_pages(rbio);
2014 
2015 	ret = recover_assemble_read_bios(rbio, &bio_list);
2016 	if (ret < 0)
2017 		goto out;
2018 
2019 	submit_read_bios(rbio, &bio_list);
2020 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2021 
2022 	ret = recover_sectors(rbio);
2023 
2024 out:
2025 	while ((bio = bio_list_pop(&bio_list)))
2026 		bio_put(bio);
2027 
2028 	return ret;
2029 }
2030 
2031 static void recover_rbio_work(struct work_struct *work)
2032 {
2033 	struct btrfs_raid_bio *rbio;
2034 	int ret;
2035 
2036 	rbio = container_of(work, struct btrfs_raid_bio, work);
2037 
2038 	ret = lock_stripe_add(rbio);
2039 	if (ret == 0) {
2040 		ret = recover_rbio(rbio);
2041 		rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2042 	}
2043 }
2044 
2045 static void recover_rbio_work_locked(struct work_struct *work)
2046 {
2047 	struct btrfs_raid_bio *rbio;
2048 	int ret;
2049 
2050 	rbio = container_of(work, struct btrfs_raid_bio, work);
2051 
2052 	ret = recover_rbio(rbio);
2053 	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2054 }
2055 
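/*
 * A worked example of the failb selection below, assuming a RAID6 layout
 * with real_stripes = 4 (two data stripes plus P and Q) and faila = 0
 * already set by set_rbio_range_error():
 *
 *   mirror_num == 3: failb = 4 - 2 = 2, the P stripe is also marked bad,
 *                    so recovery must use the Q stripe.
 *   mirror_num == 4: failb = 4 - 3 = 1, the other data stripe is marked
 *                    bad, so recovery must use both P and Q.
 */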
2056 static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
2057 {
2058 	bool found = false;
2059 	int sector_nr;
2060 
2061 	/*
2062 	 * This is for RAID6 extra recovery tries, thus the mirror number
2063 	 * should be larger than 2.
2064 	 * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
2065 	 * RAID5 methods.
2066 	 */
2067 	ASSERT(mirror_num > 2);
2068 	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2069 		int found_errors;
2070 		int faila;
2071 		int failb;
2072 
2073 		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
2074 							 &faila, &failb);
2075 		/* This vertical stripe doesn't have errors. */
2076 		if (!found_errors)
2077 			continue;
2078 
2079 		/*
2080 		 * If we found errors, there should be only one error marked
2081 		 * by previous set_rbio_range_error().
2082 		 */
2083 		ASSERT(found_errors == 1);
2084 		found = true;
2085 
2086 		/* Now select another stripe to mark as error. */
2087 		failb = rbio->real_stripes - (mirror_num - 1);
2088 		if (failb <= faila)
2089 			failb--;
2090 
2091 		/* Set the extra bit in error bitmap. */
2092 		if (failb >= 0)
2093 			set_bit(failb * rbio->stripe_nsectors + sector_nr,
2094 				rbio->error_bitmap);
2095 	}
2096 
2097 	/* We should have found at least one vertical stripe with an error. */
2098 	ASSERT(found);
2099 }
2100 
2101 /*
2102  * the main entry point for reads from the higher layers.  This
2103  * is really only called when the normal read path had a failure,
2104  * so we assume the bio they send down corresponds to a failed part
2105  * of the drive.
2106  */
2107 void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2108 			   int mirror_num)
2109 {
2110 	struct btrfs_fs_info *fs_info = bioc->fs_info;
2111 	struct btrfs_raid_bio *rbio;
2112 
2113 	rbio = alloc_rbio(fs_info, bioc);
2114 	if (IS_ERR(rbio)) {
2115 		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
2116 		bio_endio(bio);
2117 		return;
2118 	}
2119 
2120 	rbio->operation = BTRFS_RBIO_READ_REBUILD;
2121 	rbio_add_bio(rbio, bio);
2122 
2123 	set_rbio_range_error(rbio, bio);
2124 
2125 	/*
2126 	 * Loop retry:
2127 	 * for 'mirror_num == 2', reconstruct from all other stripes.
2128 	 * for 'mirror_num > 2', select a stripe to fail on every retry.
2129 	 */
2130 	if (mirror_num > 2)
2131 		set_rbio_raid6_extra_error(rbio, mirror_num);
2132 
2133 	start_async_work(rbio, recover_rbio_work);
2134 }
2135 
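/*
 * Fill rbio->csum_buf and rbio->csum_bitmap with the data checksums that
 * cover the data stripes of this full stripe, so the read path can verify
 * the sectors it brings in.  Failure here is not fatal, we just lose the
 * extra verification (see the warning in the error path below).
 */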
2136 static void fill_data_csums(struct btrfs_raid_bio *rbio)
2137 {
2138 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
2139 	struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
2140 						       rbio->bioc->raid_map[0]);
2141 	const u64 start = rbio->bioc->raid_map[0];
2142 	const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
2143 			fs_info->sectorsize_bits;
2144 	int ret;
2145 
2146 	/* The rbio should not have its csum buffer initialized. */
2147 	ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
2148 
2149 	/*
2150 	 * Skip the csum search if:
2151 	 *
2152 	 * - The rbio doesn't belong to data block groups
2153 	 *   Then we are doing IO for tree blocks, no need to search csums.
2154 	 *
2155 	 * - The rbio belongs to mixed block groups
2156 	 *   This is to avoid deadlock, as we're already holding the full
2157 	 *   stripe lock, if we trigger a metadata read, and it needs to do
2158 	 *   raid56 recovery, we will deadlock.
2159 	 */
2160 	if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
2161 	    rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
2162 		return;
2163 
2164 	rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
2165 				 fs_info->csum_size, GFP_NOFS);
2166 	rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
2167 					  GFP_NOFS);
2168 	if (!rbio->csum_buf || !rbio->csum_bitmap) {
2169 		ret = -ENOMEM;
2170 		goto error;
2171 	}
2172 
2173 	ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1,
2174 					rbio->csum_buf, rbio->csum_bitmap);
2175 	if (ret < 0)
2176 		goto error;
2177 	if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
2178 		goto no_csum;
2179 	return;
2180 
2181 error:
2182 	/*
2183 	 * We failed to allocate memory or to look up the csums, but it's not
2184 	 * fatal, we can still continue.  However, warn users that RMW is no
2185 	 * longer safe for this particular sub-stripe write.
2186 	 */
2187 	btrfs_warn_rl(fs_info,
2188 "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
2189 			rbio->bioc->raid_map[0], ret);
2190 no_csum:
2191 	kfree(rbio->csum_buf);
2192 	bitmap_free(rbio->csum_bitmap);
2193 	rbio->csum_buf = NULL;
2194 	rbio->csum_bitmap = NULL;
2195 }
2196 
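/*
 * For a sub-stripe write: read the data sectors we are missing, wait for
 * the IO, then repair anything that failed (missing device or csum
 * mismatch) before the RMW path regenerates the parity.
 */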
2197 static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
2198 {
2199 	struct bio_list bio_list;
2200 	struct bio *bio;
2201 	int ret;
2202 
2203 	bio_list_init(&bio_list);
2204 
2205 	/*
2206 	 * Fill the data csums we need for data verification.  We need to fill
2207 	 * the csum_bitmap/csum_buf first, as our endio function will try to
2208 	 * verify the data sectors.
2209 	 */
2210 	fill_data_csums(rbio);
2211 
2212 	ret = rmw_assemble_read_bios(rbio, &bio_list);
2213 	if (ret < 0)
2214 		goto out;
2215 
2216 	submit_read_bios(rbio, &bio_list);
2217 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2218 
2219 	/*
2220 	 * We may or may not have any corrupted sectors (including missing dev
2221 	 * and csum mismatch), just let recover_sectors() handle them all.
2222 	 */
2223 	ret = recover_sectors(rbio);
2224 	return ret;
2225 out:
2226 	while ((bio = bio_list_pop(&bio_list)))
2227 		bio_put(bio);
2228 
2229 	return ret;
2230 }
2231 
2232 static void raid_wait_write_end_io(struct bio *bio)
2233 {
2234 	struct btrfs_raid_bio *rbio = bio->bi_private;
2235 	blk_status_t err = bio->bi_status;
2236 
2237 	if (err)
2238 		rbio_update_error_bitmap(rbio, bio);
2239 	bio_put(bio);
2240 	if (atomic_dec_and_test(&rbio->stripes_pending))
2241 		wake_up(&rbio->io_wait);
2242 }
2243 
2244 static void submit_write_bios(struct btrfs_raid_bio *rbio,
2245 			      struct bio_list *bio_list)
2246 {
2247 	struct bio *bio;
2248 
2249 	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
2250 	while ((bio = bio_list_pop(bio_list))) {
2251 		bio->bi_end_io = raid_wait_write_end_io;
2252 
2253 		if (trace_raid56_write_stripe_enabled()) {
2254 			struct raid56_bio_trace_info trace_info = { 0 };
2255 
2256 			bio_get_trace_info(rbio, bio, &trace_info);
2257 			trace_raid56_write_stripe(rbio, bio, &trace_info);
2258 		}
2259 		submit_bio(bio);
2260 	}
2261 }
2262 
2263 /*
2264  * Determine if we need to read any sector from the disk.
2265  * Should only be used in the RMW path, to skip reads for a cached rbio.
2266  */
2267 static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
2268 {
2269 	int i;
2270 
2271 	for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
2272 		struct sector_ptr *sector = &rbio->stripe_sectors[i];
2273 
2274 		/*
2275 		 * We have a sector which has neither a page nor the uptodate
2276 		 * flag, thus this rbio cannot be a cached one, as a cached one
2277 		 * must have all its data sectors present and uptodate.
2278 		 */
2279 		if (!sector->page || !sector->uptodate)
2280 			return true;
2281 	}
2282 	return false;
2283 }
2284 
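/*
 * The core of the write path.  A rough sketch of the flow below:
 *
 * 1) Allocate the P/Q pages, they are needed in all cases.
 * 2) For a full stripe write, or when every data sector is already cached,
 *    go straight to the write phase.
 * 3) Otherwise allocate the data pages and read/verify/repair the sectors
 *    not covered by the incoming bios (the "read" part of RMW).
 * 4) Lock out further merges, regenerate P/Q for each vertical stripe,
 *    submit the writes and fail if more vertical stripes have errors than
 *    the tolerance allows.
 */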
2285 static int rmw_rbio(struct btrfs_raid_bio *rbio)
2286 {
2287 	struct bio_list bio_list;
2288 	int sectornr;
2289 	int ret = 0;
2290 
2291 	/*
2292 	 * Allocate the pages for parity first, as P/Q pages will always be
2293 	 * needed for both full-stripe and sub-stripe writes.
2294 	 */
2295 	ret = alloc_rbio_parity_pages(rbio);
2296 	if (ret < 0)
2297 		return ret;
2298 
2299 	/*
2300 	 * Either this is a full stripe write, or we have every data sector
2301 	 * already cached, so we can go to the write path immediately.
2302 	 */
2303 	if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio))
2304 		goto write;
2305 
2306 	/*
2307 	 * Now we're doing a sub-stripe write, and we also need all the data
2308 	 * stripes to do the full RMW.
2309 	 */
2310 	ret = alloc_rbio_data_pages(rbio);
2311 	if (ret < 0)
2312 		return ret;
2313 
2314 	index_rbio_pages(rbio);
2315 
2316 	ret = rmw_read_wait_recover(rbio);
2317 	if (ret < 0)
2318 		return ret;
2319 
2320 write:
2321 	/*
2322 	 * At this stage we're not allowed to add any new bios to the bio
2323 	 * list any more; anyone else that wants to change this stripe needs
2324 	 * to do their own RMW.
2325 	 */
2326 	spin_lock_irq(&rbio->bio_list_lock);
2327 	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
2328 	spin_unlock_irq(&rbio->bio_list_lock);
2329 
2330 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2331 
2332 	index_rbio_pages(rbio);
2333 
2334 	/*
2335 	 * We don't cache full rbios because we're assuming
2336 	 * the higher layers are unlikely to use this area of
2337 	 * the disk again soon.  If they do use it again,
2338 	 * hopefully they will send another full bio.
2339 	 */
2340 	if (!rbio_is_full(rbio))
2341 		cache_rbio_pages(rbio);
2342 	else
2343 		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2344 
2345 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
2346 		generate_pq_vertical(rbio, sectornr);
2347 
2348 	bio_list_init(&bio_list);
2349 	ret = rmw_assemble_write_bios(rbio, &bio_list);
2350 	if (ret < 0)
2351 		return ret;
2352 
2353 	/* We should have at least one bio assembled. */
2354 	ASSERT(bio_list_size(&bio_list));
2355 	submit_write_bios(rbio, &bio_list);
2356 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2357 
2358 	/* We may have more errors than our tolerance during the read. */
2359 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2360 		int found_errors;
2361 
2362 		found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
2363 		if (found_errors > rbio->bioc->max_errors) {
2364 			ret = -EIO;
2365 			break;
2366 		}
2367 	}
2368 	return ret;
2369 }
2370 
2371 static void rmw_rbio_work(struct work_struct *work)
2372 {
2373 	struct btrfs_raid_bio *rbio;
2374 	int ret;
2375 
2376 	rbio = container_of(work, struct btrfs_raid_bio, work);
2377 
2378 	ret = lock_stripe_add(rbio);
2379 	if (ret == 0) {
2380 		ret = rmw_rbio(rbio);
2381 		rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2382 	}
2383 }
2384 
2385 static void rmw_rbio_work_locked(struct work_struct *work)
2386 {
2387 	struct btrfs_raid_bio *rbio;
2388 	int ret;
2389 
2390 	rbio = container_of(work, struct btrfs_raid_bio, work);
2391 
2392 	ret = rmw_rbio(rbio);
2393 	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2394 }
2395 
2396 /*
2397  * The following code is used to scrub/replace the parity stripe
2398  *
2399  * Caller must have already increased bio_counter for getting @bioc.
2400  *
2401  * Note: we must make sure all the pages added into the scrub/replace raid
2402  * bio are correct and are not changed during the scrub/replace, i.e. those
2403  * pages only hold metadata or file data protected by checksums.
2404  */
2405 
2406 struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2407 				struct btrfs_io_context *bioc,
2408 				struct btrfs_device *scrub_dev,
2409 				unsigned long *dbitmap, int stripe_nsectors)
2410 {
2411 	struct btrfs_fs_info *fs_info = bioc->fs_info;
2412 	struct btrfs_raid_bio *rbio;
2413 	int i;
2414 
2415 	rbio = alloc_rbio(fs_info, bioc);
2416 	if (IS_ERR(rbio))
2417 		return NULL;
2418 	bio_list_add(&rbio->bio_list, bio);
2419 	/*
2420 	 * This is a special bio which is used to hold the completion handler
2421 	 * and make the scrub rbio similar to the other rbio types.
2422 	 */
2423 	ASSERT(!bio->bi_iter.bi_size);
2424 	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2425 
2426 	/*
2427 	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
2428 	 * to the end position, so this search can start from the first parity
2429 	 * stripe.
2430 	 */
2431 	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2432 		if (bioc->stripes[i].dev == scrub_dev) {
2433 			rbio->scrubp = i;
2434 			break;
2435 		}
2436 	}
2437 	ASSERT(i < rbio->real_stripes);
2438 
2439 	bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
2440 	return rbio;
2441 }
2442 
2443 /* Used for both parity scrub and missing. */
2444 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2445 			    unsigned int pgoff, u64 logical)
2446 {
2447 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2448 	int stripe_offset;
2449 	int index;
2450 
2451 	ASSERT(logical >= rbio->bioc->raid_map[0]);
2452 	ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
2453 				       BTRFS_STRIPE_LEN * rbio->nr_data);
2454 	stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
2455 	index = stripe_offset / sectorsize;
2456 	rbio->bio_sectors[index].page = page;
2457 	rbio->bio_sectors[index].pgoff = pgoff;
2458 }
2459 
2460 /*
2461  * We only scrub the parity for which we have correct data on the same
2462  * horizontal stripe, so we don't need to allocate pages for all the stripes.
2463  */
2464 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2465 {
2466 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2467 	int total_sector_nr;
2468 
2469 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2470 	     total_sector_nr++) {
2471 		struct page *page;
2472 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2473 		int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
2474 
2475 		if (!test_bit(sectornr, &rbio->dbitmap))
2476 			continue;
2477 		if (rbio->stripe_pages[index])
2478 			continue;
2479 		page = alloc_page(GFP_NOFS);
2480 		if (!page)
2481 			return -ENOMEM;
2482 		rbio->stripe_pages[index] = page;
2483 	}
2484 	index_stripe_sectors(rbio);
2485 	return 0;
2486 }
2487 
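/*
 * With @need_check set, recompute the parity for every vertical stripe
 * marked in dbitmap, compare it with the parity currently on disk and only
 * keep the mismatching sectors in dbitmap.  Then write back the remaining
 * dbitmap sectors of the scrubbed parity stripe (plus the dev-replace
 * target if one is set up).
 */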
2488 static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
2489 {
2490 	struct btrfs_io_context *bioc = rbio->bioc;
2491 	const u32 sectorsize = bioc->fs_info->sectorsize;
2492 	void **pointers = rbio->finish_pointers;
2493 	unsigned long *pbitmap = &rbio->finish_pbitmap;
2494 	int nr_data = rbio->nr_data;
2495 	int stripe;
2496 	int sectornr;
2497 	bool has_qstripe;
2498 	struct sector_ptr p_sector = { 0 };
2499 	struct sector_ptr q_sector = { 0 };
2500 	struct bio_list bio_list;
2501 	struct bio *bio;
2502 	int is_replace = 0;
2503 	int ret;
2504 
2505 	bio_list_init(&bio_list);
2506 
2507 	if (rbio->real_stripes - rbio->nr_data == 1)
2508 		has_qstripe = false;
2509 	else if (rbio->real_stripes - rbio->nr_data == 2)
2510 		has_qstripe = true;
2511 	else
2512 		BUG();
2513 
2514 	if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
2515 		is_replace = 1;
2516 		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
2517 	}
2518 
2519 	/*
2520 	 * The higher layers (the scrubber) are unlikely to use this
2521 	 * area of the disk again soon, so there is no point in
2522 	 * caching it.
2523 	 */
2524 	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2525 
2526 	if (!need_check)
2527 		goto writeback;
2528 
2529 	p_sector.page = alloc_page(GFP_NOFS);
2530 	if (!p_sector.page)
2531 		return -ENOMEM;
2532 	p_sector.pgoff = 0;
2533 	p_sector.uptodate = 1;
2534 
2535 	if (has_qstripe) {
2536 		/* RAID6, allocate and map temp space for the Q stripe */
2537 		q_sector.page = alloc_page(GFP_NOFS);
2538 		if (!q_sector.page) {
2539 			__free_page(p_sector.page);
2540 			p_sector.page = NULL;
2541 			return -ENOMEM;
2542 		}
2543 		q_sector.pgoff = 0;
2544 		q_sector.uptodate = 1;
2545 		pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
2546 	}
2547 
2548 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2549 
2550 	/* Map the parity stripe just once */
2551 	pointers[nr_data] = kmap_local_page(p_sector.page);
2552 
2553 	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2554 		struct sector_ptr *sector;
2555 		void *parity;
2556 
2557 		/* first collect one page from each data stripe */
2558 		for (stripe = 0; stripe < nr_data; stripe++) {
2559 			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
2560 			pointers[stripe] = kmap_local_page(sector->page) +
2561 					   sector->pgoff;
2562 		}
2563 
2564 		if (has_qstripe) {
2565 			/* RAID6, call the library function to fill in our P/Q */
2566 			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
2567 						pointers);
2568 		} else {
2569 			/* raid5 */
2570 			memcpy(pointers[nr_data], pointers[0], sectorsize);
2571 			run_xor(pointers + 1, nr_data - 1, sectorsize);
2572 		}
2573 
2574 		/* Check scrubbing parity and repair it */
2575 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2576 		parity = kmap_local_page(sector->page) + sector->pgoff;
2577 		if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
2578 			memcpy(parity, pointers[rbio->scrubp], sectorsize);
2579 		else
2580 			/* Parity is right, no need to write it back */
2581 			bitmap_clear(&rbio->dbitmap, sectornr, 1);
2582 		kunmap_local(parity);
2583 
2584 		for (stripe = nr_data - 1; stripe >= 0; stripe--)
2585 			kunmap_local(pointers[stripe]);
2586 	}
2587 
2588 	kunmap_local(pointers[nr_data]);
2589 	__free_page(p_sector.page);
2590 	p_sector.page = NULL;
2591 	if (q_sector.page) {
2592 		kunmap_local(pointers[rbio->real_stripes - 1]);
2593 		__free_page(q_sector.page);
2594 		q_sector.page = NULL;
2595 	}
2596 
2597 writeback:
2598 	/*
2599 	 * Time to start writing.  Make bios for the sectors of the scrubbed
2600 	 * parity stripe that still need repair (marked in dbitmap), plus the
2601 	 * dev-replace target if there is one.  Ignore everything else.
2602 	 */
2603 	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2604 		struct sector_ptr *sector;
2605 
2606 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2607 		ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
2608 					 sectornr, REQ_OP_WRITE);
2609 		if (ret)
2610 			goto cleanup;
2611 	}
2612 
2613 	if (!is_replace)
2614 		goto submit_write;
2615 
2616 	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2617 		struct sector_ptr *sector;
2618 
2619 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2620 		ret = rbio_add_io_sector(rbio, &bio_list, sector,
2621 				       bioc->tgtdev_map[rbio->scrubp],
2622 				       sectornr, REQ_OP_WRITE);
2623 		if (ret)
2624 			goto cleanup;
2625 	}
2626 
2627 submit_write:
2628 	submit_write_bios(rbio, &bio_list);
2629 	return 0;
2630 
2631 cleanup:
2632 	while ((bio = bio_list_pop(&bio_list)))
2633 		bio_put(bio);
2634 	return ret;
2635 }
2636 
2637 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2638 {
2639 	if (stripe >= 0 && stripe < rbio->nr_data)
2640 		return 1;
2641 	return 0;
2642 }
2643 
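/*
 * Repair the sectors that failed during the scrub reads.  Unlike the plain
 * recovery path, the parity currently being scrubbed cannot be trusted as
 * a source, which lowers the number of failures we can tolerate per
 * vertical stripe.
 */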
2644 static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
2645 {
2646 	void **pointers = NULL;
2647 	void **unmap_array = NULL;
2648 	int sector_nr;
2649 	int ret;
2650 
2651 	/*
2652 	 * @pointers array stores the pointer for each sector.
2653 	 *
2654 	 * @unmap_array stores copy of pointers that does not get reordered
2655 	 * during reconstruction so that kunmap_local works.
2656 	 */
2657 	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2658 	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2659 	if (!pointers || !unmap_array) {
2660 		ret = -ENOMEM;
2661 		goto out;
2662 	}
2663 
2664 	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2665 		int dfail = 0, failp = -1;
2666 		int faila;
2667 		int failb;
2668 		int found_errors;
2669 
2670 		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
2671 							 &faila, &failb);
2672 		if (found_errors > rbio->bioc->max_errors) {
2673 			ret = -EIO;
2674 			goto out;
2675 		}
2676 		if (found_errors == 0)
2677 			continue;
2678 
2679 		/* We should have at least one error here. */
2680 		ASSERT(faila >= 0 || failb >= 0);
2681 
2682 		if (is_data_stripe(rbio, faila))
2683 			dfail++;
2684 		else if (is_parity_stripe(faila))
2685 			failp = faila;
2686 
2687 		if (is_data_stripe(rbio, failb))
2688 			dfail++;
2689 		else if (is_parity_stripe(failb))
2690 			failp = failb;
2691 		/*
2692 		 * Because we cannot use the parity being scrubbed to repair
2693 		 * the data, the repair capability is reduced by one.  (In the
2694 		 * case of RAID5, we cannot repair anything.)
2695 		 */
2696 		if (dfail > rbio->bioc->max_errors - 1) {
2697 			ret = -EIO;
2698 			goto out;
2699 		}
2700 		/*
2701 		 * If all the data stripes are good and only the parity is bad,
2702 		 * just repair the parity, no need to recover data stripes.
2703 		 */
2704 		if (dfail == 0)
2705 			continue;
2706 
2707 		/*
2708 		 * At this point we have one corrupted data stripe and one
2709 		 * corrupted parity on RAID6.  If the corrupted parity is the
2710 		 * one being scrubbed, we can luckily use the other parity to
2711 		 * repair the data; otherwise we cannot repair the data stripe.
2712 		 */
2713 		if (failp != rbio->scrubp) {
2714 			ret = -EIO;
2715 			goto out;
2716 		}
2717 
2718 		ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
2719 		if (ret < 0)
2720 			goto out;
2721 	}
2722 out:
2723 	kfree(pointers);
2724 	kfree(unmap_array);
2725 	return ret;
2726 }
2727 
2728 static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio,
2729 				    struct bio_list *bio_list)
2730 {
2731 	struct bio *bio;
2732 	int total_sector_nr;
2733 	int ret = 0;
2734 
2735 	ASSERT(bio_list_size(bio_list) == 0);
2736 
2737 	/* Build a list of bios to read all the missing parts. */
2738 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2739 	     total_sector_nr++) {
2740 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2741 		int stripe = total_sector_nr / rbio->stripe_nsectors;
2742 		struct sector_ptr *sector;
2743 
2744 		/* No data in the vertical stripe, no need to read. */
2745 		if (!test_bit(sectornr, &rbio->dbitmap))
2746 			continue;
2747 
2748 		/*
2749 		 * We want to find all the sectors missing from the rbio and
2750 		 * read them from the disk. If sector_in_rbio() finds a sector
2751 		 * in the bio list we don't need to read it off the stripe.
2752 		 */
2753 		sector = sector_in_rbio(rbio, stripe, sectornr, 1);
2754 		if (sector)
2755 			continue;
2756 
2757 		sector = rbio_stripe_sector(rbio, stripe, sectornr);
2758 		/*
2759 		 * The bio cache may have handed us an uptodate sector.  If so,
2760 		 * use it.
2761 		 */
2762 		if (sector->uptodate)
2763 			continue;
2764 
2765 		ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
2766 					 sectornr, REQ_OP_READ);
2767 		if (ret)
2768 			goto error;
2769 	}
2770 	return 0;
2771 error:
2772 	while ((bio = bio_list_pop(bio_list)))
2773 		bio_put(bio);
2774 	return ret;
2775 }
2776 
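/*
 * Top level scrub flow: allocate the pages we need, read every missing
 * sector, repair any read failures, then verify and rewrite the parity
 * via finish_parity_scrub().
 */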
2777 static int scrub_rbio(struct btrfs_raid_bio *rbio)
2778 {
2779 	bool need_check = false;
2780 	struct bio_list bio_list;
2781 	int sector_nr;
2782 	int ret;
2783 	struct bio *bio;
2784 
2785 	bio_list_init(&bio_list);
2786 
2787 	ret = alloc_rbio_essential_pages(rbio);
2788 	if (ret)
2789 		goto cleanup;
2790 
2791 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2792 
2793 	ret = scrub_assemble_read_bios(rbio, &bio_list);
2794 	if (ret < 0)
2795 		goto cleanup;
2796 
2797 	submit_read_bios(rbio, &bio_list);
2798 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2799 
2800 	/* We may have some failures, recover the failed sectors first. */
2801 	ret = recover_scrub_rbio(rbio);
2802 	if (ret < 0)
2803 		goto cleanup;
2804 
2805 	/*
2806 	 * We have every sector properly prepared.  We can now finish the
2807 	 * scrub and write back the good content.
2808 	 */
2809 	ret = finish_parity_scrub(rbio, need_check);
2810 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2811 	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2812 		int found_errors;
2813 
2814 		found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
2815 		if (found_errors > rbio->bioc->max_errors) {
2816 			ret = -EIO;
2817 			break;
2818 		}
2819 	}
2820 	return ret;
2821 
2822 cleanup:
2823 	while ((bio = bio_list_pop(&bio_list)))
2824 		bio_put(bio);
2825 
2826 	return ret;
2827 }
2828 
2829 static void scrub_rbio_work_locked(struct work_struct *work)
2830 {
2831 	struct btrfs_raid_bio *rbio;
2832 	int ret;
2833 
2834 	rbio = container_of(work, struct btrfs_raid_bio, work);
2835 	ret = scrub_rbio(rbio);
2836 	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2837 }
2838 
2839 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2840 {
2841 	if (!lock_stripe_add(rbio))
2842 		start_async_work(rbio, scrub_rbio_work_locked);
2843 }
2844 
2845 /* The following code is used for dev replace of a missing RAID 5/6 device. */
2846 
2847 struct btrfs_raid_bio *
2848 raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
2849 {
2850 	struct btrfs_fs_info *fs_info = bioc->fs_info;
2851 	struct btrfs_raid_bio *rbio;
2852 
2853 	rbio = alloc_rbio(fs_info, bioc);
2854 	if (IS_ERR(rbio))
2855 		return NULL;
2856 
2857 	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2858 	bio_list_add(&rbio->bio_list, bio);
2859 	/*
2860 	 * This is a special bio which is used to hold the completion handler
2861 	 * and make the missing rbio similar to the other rbio types.
2862 	 */
2863 	ASSERT(!bio->bi_iter.bi_size);
2864 
2865 	set_rbio_range_error(rbio, bio);
2866 
2867 	return rbio;
2868 }
2869 
2870 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2871 {
2872 	start_async_work(rbio, recover_rbio_work);
2873 }
2874