xref: /openbmc/linux/fs/btrfs/raid56.c (revision 31e67366)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2012 Fusion-io  All rights reserved.
4  * Copyright (C) 2012 Intel Corp. All rights reserved.
5  */
6 
7 #include <linux/sched.h>
8 #include <linux/bio.h>
9 #include <linux/slab.h>
10 #include <linux/blkdev.h>
11 #include <linux/raid/pq.h>
12 #include <linux/hash.h>
13 #include <linux/list_sort.h>
14 #include <linux/raid/xor.h>
15 #include <linux/mm.h>
16 #include "ctree.h"
17 #include "disk-io.h"
18 #include "volumes.h"
19 #include "raid56.h"
20 #include "async-thread.h"
21 
22 /* set when additional merges to this rbio are not allowed */
23 #define RBIO_RMW_LOCKED_BIT	1
24 
25 /*
26  * set when this rbio is sitting in the hash, but it is just a cache
27  * of past RMW
28  */
29 #define RBIO_CACHE_BIT		2
30 
31 /*
32  * set when it is safe to trust the stripe_pages for caching
33  */
34 #define RBIO_CACHE_READY_BIT	3
35 
36 #define RBIO_CACHE_SIZE 1024
37 
38 #define BTRFS_STRIPE_HASH_TABLE_BITS				11
39 
40 /* Used by the raid56 code to lock stripes for read/modify/write */
41 struct btrfs_stripe_hash {
42 	struct list_head hash_list;
43 	spinlock_t lock;
44 };
45 
46 /* Used by the raid56 code to lock stripes for read/modify/write */
47 struct btrfs_stripe_hash_table {
48 	struct list_head stripe_cache;
49 	spinlock_t cache_lock;
50 	int cache_size;
51 	struct btrfs_stripe_hash table[];
52 };
53 
54 enum btrfs_rbio_ops {
55 	BTRFS_RBIO_WRITE,
56 	BTRFS_RBIO_READ_REBUILD,
57 	BTRFS_RBIO_PARITY_SCRUB,
58 	BTRFS_RBIO_REBUILD_MISSING,
59 };
60 
61 struct btrfs_raid_bio {
62 	struct btrfs_fs_info *fs_info;
63 	struct btrfs_bio *bbio;
64 
65 	/* while we're doing rmw on a stripe
66 	 * we put it into a hash table so we can
67 	 * lock the stripe and merge more rbios
68 	 * into it.
69 	 */
70 	struct list_head hash_list;
71 
72 	/*
73 	 * LRU list for the stripe cache
74 	 */
75 	struct list_head stripe_cache;
76 
77 	/*
78 	 * for scheduling work in the helper threads
79 	 */
80 	struct btrfs_work work;
81 
82 	/*
83 	 * bio list and bio_list_lock are used
84 	 * to add more bios into the stripe
85 	 * in hopes of avoiding the full rmw
86 	 */
87 	struct bio_list bio_list;
88 	spinlock_t bio_list_lock;
89 
90 	/* also protected by the bio_list_lock, the
91 	 * plug list is used by the plugging code
92 	 * to collect partial bios while plugged.  The
93 	 * stripe locking code also uses it to hand off
94 	 * the stripe lock to the next pending IO
95 	 */
96 	struct list_head plug_list;
97 
98 	/*
99 	 * flags that tell us if it is safe to
100 	 * merge with this bio
101 	 */
102 	unsigned long flags;
103 
104 	/* size of each individual stripe on disk */
105 	int stripe_len;
106 
107 	/* number of data stripes (no p/q) */
108 	int nr_data;
109 
110 	int real_stripes;
111 
112 	int stripe_npages;
113 	/*
114 	 * set if we're doing a parity rebuild
115 	 * for a read from higher up, which is handled
116 	 * differently from a parity rebuild as part of
117 	 * rmw
118 	 */
119 	enum btrfs_rbio_ops operation;
120 
121 	/* first bad stripe */
122 	int faila;
123 
124 	/* second bad stripe (for raid6 use) */
125 	int failb;
126 
127 	int scrubp;
128 	/*
129 	 * number of pages needed to represent the full
130 	 * stripe
131 	 */
132 	int nr_pages;
133 
134 	/*
135 	 * size of all the bios in the bio_list.  This
136 	 * helps us decide if the rbio maps to a full
137 	 * stripe or not
138 	 */
139 	int bio_list_bytes;
140 
141 	int generic_bio_cnt;
142 
143 	refcount_t refs;
144 
145 	atomic_t stripes_pending;
146 
147 	atomic_t error;
148 	/*
149 	 * these are two arrays of pointers.  We allocate the
150 	 * rbio big enough to hold them both and setup their
151 	 * locations when the rbio is allocated
152 	 */
153 
154 	/* pointers to pages that we allocated for
155 	 * reading/writing stripes directly from the disk (including P/Q)
156 	 */
157 	struct page **stripe_pages;
158 
159 	/*
160 	 * pointers to the pages in the bio_list.  Stored
161 	 * here for faster lookup
162 	 */
163 	struct page **bio_pages;
164 
165 	/*
166 	 * bitmap to record which horizontal stripe has data
167 	 */
168 	unsigned long *dbitmap;
169 
170 	/* allocated with real_stripes-many pointers for finish_*() calls */
171 	void **finish_pointers;
172 
173 	/* allocated with stripe_npages-many bits for finish_*() calls */
174 	unsigned long *finish_pbitmap;
175 };
176 
177 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
178 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
179 static void rmw_work(struct btrfs_work *work);
180 static void read_rebuild_work(struct btrfs_work *work);
181 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
182 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
183 static void __free_raid_bio(struct btrfs_raid_bio *rbio);
184 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
185 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
186 
187 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
188 					 int need_check);
189 static void scrub_parity_work(struct btrfs_work *work);
190 
191 static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
192 {
193 	btrfs_init_work(&rbio->work, work_func, NULL, NULL);
194 	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
195 }
196 
197 /*
198  * the stripe hash table is used for locking, and to collect
199  * bios in hopes of making a full stripe
200  */
201 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
202 {
203 	struct btrfs_stripe_hash_table *table;
204 	struct btrfs_stripe_hash_table *x;
205 	struct btrfs_stripe_hash *cur;
206 	struct btrfs_stripe_hash *h;
207 	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
208 	int i;
209 
210 	if (info->stripe_hash_table)
211 		return 0;
212 
213 	/*
214 	 * The table is large, starting with order 4 and can go as high as
215 	 * order 7 in case lock debugging is turned on.
216 	 *
217 	 * Try harder to allocate and fallback to vmalloc to lower the chance
218 	 * of a failing mount.
219 	 */
220 	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
221 	if (!table)
222 		return -ENOMEM;
223 
224 	spin_lock_init(&table->cache_lock);
225 	INIT_LIST_HEAD(&table->stripe_cache);
226 
227 	h = table->table;
228 
229 	for (i = 0; i < num_entries; i++) {
230 		cur = h + i;
231 		INIT_LIST_HEAD(&cur->hash_list);
232 		spin_lock_init(&cur->lock);
233 	}
234 
235 	x = cmpxchg(&info->stripe_hash_table, NULL, table);
236 	kvfree(x);
237 	return 0;
238 }
239 
240 /*
241  * caching an rbio means to copy anything from the
242  * bio_pages array into the stripe_pages array.  We
243  * use the page uptodate bit in the stripe cache array
244  * to indicate if it has valid data
245  *
246  * once the caching is done, we set the cache ready
247  * bit.
248  */
249 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
250 {
251 	int i;
252 	int ret;
253 
254 	ret = alloc_rbio_pages(rbio);
255 	if (ret)
256 		return;
257 
258 	for (i = 0; i < rbio->nr_pages; i++) {
259 		if (!rbio->bio_pages[i])
260 			continue;
261 
262 		copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]);
263 		SetPageUptodate(rbio->stripe_pages[i]);
264 	}
265 	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
266 }
267 
268 /*
269  * we hash on the first logical address of the stripe
270  */
271 static int rbio_bucket(struct btrfs_raid_bio *rbio)
272 {
273 	u64 num = rbio->bbio->raid_map[0];
274 
275 	/*
276 	 * we shift down quite a bit.  We're using byte
277 	 * addressing, and most of the lower bits are zeros.
278 	 * This tends to upset hash_64, and it consistently
279 	 * returns just one or two different values.
280 	 *
281 	 * shifting off the lower bits fixes things.
282 	 */
283 	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
284 }
285 
286 /*
287  * stealing an rbio means taking all the uptodate pages from the stripe
288  * array in the source rbio and putting them into the destination rbio
289  */
290 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
291 {
292 	int i;
293 	struct page *s;
294 	struct page *d;
295 
296 	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
297 		return;
298 
299 	for (i = 0; i < dest->nr_pages; i++) {
300 		s = src->stripe_pages[i];
301 		if (!s || !PageUptodate(s)) {
302 			continue;
303 		}
304 
305 		d = dest->stripe_pages[i];
306 		if (d)
307 			__free_page(d);
308 
309 		dest->stripe_pages[i] = s;
310 		src->stripe_pages[i] = NULL;
311 	}
312 }
313 
314 /*
315  * merging means we take the bio_list from the victim and
316  * splice it into the destination.  The victim should
317  * be discarded afterwards.
318  *
319  * must be called with dest->rbio_list_lock held
320  */
321 static void merge_rbio(struct btrfs_raid_bio *dest,
322 		       struct btrfs_raid_bio *victim)
323 {
324 	bio_list_merge(&dest->bio_list, &victim->bio_list);
325 	dest->bio_list_bytes += victim->bio_list_bytes;
326 	dest->generic_bio_cnt += victim->generic_bio_cnt;
327 	bio_list_init(&victim->bio_list);
328 }
329 
330 /*
331  * used to prune items that are in the cache.  The caller
332  * must hold the hash table lock.
333  */
334 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
335 {
336 	int bucket = rbio_bucket(rbio);
337 	struct btrfs_stripe_hash_table *table;
338 	struct btrfs_stripe_hash *h;
339 	int freeit = 0;
340 
341 	/*
342 	 * check the bit again under the hash table lock.
343 	 */
344 	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
345 		return;
346 
347 	table = rbio->fs_info->stripe_hash_table;
348 	h = table->table + bucket;
349 
350 	/* hold the lock for the bucket because we may be
351 	 * removing it from the hash table
352 	 */
353 	spin_lock(&h->lock);
354 
355 	/*
356 	 * hold the lock for the bio list because we need
357 	 * to make sure the bio list is empty
358 	 */
359 	spin_lock(&rbio->bio_list_lock);
360 
361 	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
362 		list_del_init(&rbio->stripe_cache);
363 		table->cache_size -= 1;
364 		freeit = 1;
365 
366 		/* if the bio list isn't empty, this rbio is
367 		 * still involved in an IO.  We take it out
368 		 * of the cache list, and drop the ref that
369 		 * was held for the list.
370 		 *
371 		 * If the bio_list was empty, we also remove
372 		 * the rbio from the hash_table, and drop
373 		 * the corresponding ref
374 		 */
375 		if (bio_list_empty(&rbio->bio_list)) {
376 			if (!list_empty(&rbio->hash_list)) {
377 				list_del_init(&rbio->hash_list);
378 				refcount_dec(&rbio->refs);
379 				BUG_ON(!list_empty(&rbio->plug_list));
380 			}
381 		}
382 	}
383 
384 	spin_unlock(&rbio->bio_list_lock);
385 	spin_unlock(&h->lock);
386 
387 	if (freeit)
388 		__free_raid_bio(rbio);
389 }
390 
391 /*
392  * prune a given rbio from the cache
393  */
394 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
395 {
396 	struct btrfs_stripe_hash_table *table;
397 	unsigned long flags;
398 
399 	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
400 		return;
401 
402 	table = rbio->fs_info->stripe_hash_table;
403 
404 	spin_lock_irqsave(&table->cache_lock, flags);
405 	__remove_rbio_from_cache(rbio);
406 	spin_unlock_irqrestore(&table->cache_lock, flags);
407 }
408 
409 /*
410  * remove everything in the cache
411  */
412 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
413 {
414 	struct btrfs_stripe_hash_table *table;
415 	unsigned long flags;
416 	struct btrfs_raid_bio *rbio;
417 
418 	table = info->stripe_hash_table;
419 
420 	spin_lock_irqsave(&table->cache_lock, flags);
421 	while (!list_empty(&table->stripe_cache)) {
422 		rbio = list_entry(table->stripe_cache.next,
423 				  struct btrfs_raid_bio,
424 				  stripe_cache);
425 		__remove_rbio_from_cache(rbio);
426 	}
427 	spin_unlock_irqrestore(&table->cache_lock, flags);
428 }
429 
430 /*
431  * remove all cached entries and free the hash table
432  * used by unmount
433  */
434 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
435 {
436 	if (!info->stripe_hash_table)
437 		return;
438 	btrfs_clear_rbio_cache(info);
439 	kvfree(info->stripe_hash_table);
440 	info->stripe_hash_table = NULL;
441 }
442 
443 /*
444  * insert an rbio into the stripe cache.  It
445  * must have already been prepared by calling
446  * cache_rbio_pages
447  *
448  * If this rbio was already cached, it gets
449  * moved to the front of the lru.
450  *
451  * If the size of the rbio cache is too big, we
452  * prune an item.
453  */
454 static void cache_rbio(struct btrfs_raid_bio *rbio)
455 {
456 	struct btrfs_stripe_hash_table *table;
457 	unsigned long flags;
458 
459 	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
460 		return;
461 
462 	table = rbio->fs_info->stripe_hash_table;
463 
464 	spin_lock_irqsave(&table->cache_lock, flags);
465 	spin_lock(&rbio->bio_list_lock);
466 
467 	/* bump our ref if we were not in the list before */
468 	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
469 		refcount_inc(&rbio->refs);
470 
471 	if (!list_empty(&rbio->stripe_cache)){
472 		list_move(&rbio->stripe_cache, &table->stripe_cache);
473 	} else {
474 		list_add(&rbio->stripe_cache, &table->stripe_cache);
475 		table->cache_size += 1;
476 	}
477 
478 	spin_unlock(&rbio->bio_list_lock);
479 
480 	if (table->cache_size > RBIO_CACHE_SIZE) {
481 		struct btrfs_raid_bio *found;
482 
483 		found = list_entry(table->stripe_cache.prev,
484 				  struct btrfs_raid_bio,
485 				  stripe_cache);
486 
487 		if (found != rbio)
488 			__remove_rbio_from_cache(found);
489 	}
490 
491 	spin_unlock_irqrestore(&table->cache_lock, flags);
492 }
493 
494 /*
495  * helper function to run the xor_blocks api.  It is only
496  * able to do MAX_XOR_BLOCKS at a time, so we need to
497  * loop through.
498  */
499 static void run_xor(void **pages, int src_cnt, ssize_t len)
500 {
501 	int src_off = 0;
502 	int xor_src_cnt = 0;
503 	void *dest = pages[src_cnt];
504 
505 	while(src_cnt > 0) {
506 		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
507 		xor_blocks(xor_src_cnt, len, dest, pages + src_off);
508 
509 		src_cnt -= xor_src_cnt;
510 		src_off += xor_src_cnt;
511 	}
512 }
513 
514 /*
515  * Returns true if the bio list inside this rbio covers an entire stripe (no
516  * rmw required).
517  */
518 static int rbio_is_full(struct btrfs_raid_bio *rbio)
519 {
520 	unsigned long flags;
521 	unsigned long size = rbio->bio_list_bytes;
522 	int ret = 1;
523 
524 	spin_lock_irqsave(&rbio->bio_list_lock, flags);
525 	if (size != rbio->nr_data * rbio->stripe_len)
526 		ret = 0;
527 	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
528 	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
529 
530 	return ret;
531 }
532 
533 /*
534  * returns 1 if it is safe to merge two rbios together.
535  * The merging is safe if the two rbios correspond to
536  * the same stripe and if they are both going in the same
537  * direction (read vs write), and if neither one is
538  * locked for final IO
539  *
540  * The caller is responsible for locking such that
541  * rmw_locked is safe to test
542  */
543 static int rbio_can_merge(struct btrfs_raid_bio *last,
544 			  struct btrfs_raid_bio *cur)
545 {
546 	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
547 	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
548 		return 0;
549 
550 	/*
551 	 * we can't merge with cached rbios, since the
552 	 * idea is that when we merge the destination
553 	 * rbio is going to run our IO for us.  We can
554 	 * steal from cached rbios though, other functions
555 	 * handle that.
556 	 */
557 	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
558 	    test_bit(RBIO_CACHE_BIT, &cur->flags))
559 		return 0;
560 
561 	if (last->bbio->raid_map[0] !=
562 	    cur->bbio->raid_map[0])
563 		return 0;
564 
565 	/* we can't merge with different operations */
566 	if (last->operation != cur->operation)
567 		return 0;
568 	/*
569 	 * We've need read the full stripe from the drive.
570 	 * check and repair the parity and write the new results.
571 	 *
572 	 * We're not allowed to add any new bios to the
573 	 * bio list here, anyone else that wants to
574 	 * change this stripe needs to do their own rmw.
575 	 */
576 	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
577 		return 0;
578 
579 	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
580 		return 0;
581 
582 	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
583 		int fa = last->faila;
584 		int fb = last->failb;
585 		int cur_fa = cur->faila;
586 		int cur_fb = cur->failb;
587 
588 		if (last->faila >= last->failb) {
589 			fa = last->failb;
590 			fb = last->faila;
591 		}
592 
593 		if (cur->faila >= cur->failb) {
594 			cur_fa = cur->failb;
595 			cur_fb = cur->faila;
596 		}
597 
598 		if (fa != cur_fa || fb != cur_fb)
599 			return 0;
600 	}
601 	return 1;
602 }
603 
604 static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
605 				  int index)
606 {
607 	return stripe * rbio->stripe_npages + index;
608 }
609 
610 /*
611  * these are just the pages from the rbio array, not from anything
612  * the FS sent down to us
613  */
614 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
615 				     int index)
616 {
617 	return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
618 }
619 
620 /*
621  * helper to index into the pstripe
622  */
623 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
624 {
625 	return rbio_stripe_page(rbio, rbio->nr_data, index);
626 }
627 
628 /*
629  * helper to index into the qstripe, returns null
630  * if there is no qstripe
631  */
632 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
633 {
634 	if (rbio->nr_data + 1 == rbio->real_stripes)
635 		return NULL;
636 	return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
637 }
638 
639 /*
640  * The first stripe in the table for a logical address
641  * has the lock.  rbios are added in one of three ways:
642  *
643  * 1) Nobody has the stripe locked yet.  The rbio is given
644  * the lock and 0 is returned.  The caller must start the IO
645  * themselves.
646  *
647  * 2) Someone has the stripe locked, but we're able to merge
648  * with the lock owner.  The rbio is freed and the IO will
649  * start automatically along with the existing rbio.  1 is returned.
650  *
651  * 3) Someone has the stripe locked, but we're not able to merge.
652  * The rbio is added to the lock owner's plug list, or merged into
653  * an rbio already on the plug list.  When the lock owner unlocks,
654  * the next rbio on the list is run and the IO is started automatically.
655  * 1 is returned
656  *
657  * If we return 0, the caller still owns the rbio and must continue with
658  * IO submission.  If we return 1, the caller must assume the rbio has
659  * already been freed.
660  */
661 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
662 {
663 	struct btrfs_stripe_hash *h;
664 	struct btrfs_raid_bio *cur;
665 	struct btrfs_raid_bio *pending;
666 	unsigned long flags;
667 	struct btrfs_raid_bio *freeit = NULL;
668 	struct btrfs_raid_bio *cache_drop = NULL;
669 	int ret = 0;
670 
671 	h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
672 
673 	spin_lock_irqsave(&h->lock, flags);
674 	list_for_each_entry(cur, &h->hash_list, hash_list) {
675 		if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
676 			continue;
677 
678 		spin_lock(&cur->bio_list_lock);
679 
680 		/* Can we steal this cached rbio's pages? */
681 		if (bio_list_empty(&cur->bio_list) &&
682 		    list_empty(&cur->plug_list) &&
683 		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
684 		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
685 			list_del_init(&cur->hash_list);
686 			refcount_dec(&cur->refs);
687 
688 			steal_rbio(cur, rbio);
689 			cache_drop = cur;
690 			spin_unlock(&cur->bio_list_lock);
691 
692 			goto lockit;
693 		}
694 
695 		/* Can we merge into the lock owner? */
696 		if (rbio_can_merge(cur, rbio)) {
697 			merge_rbio(cur, rbio);
698 			spin_unlock(&cur->bio_list_lock);
699 			freeit = rbio;
700 			ret = 1;
701 			goto out;
702 		}
703 
704 
705 		/*
706 		 * We couldn't merge with the running rbio, see if we can merge
707 		 * with the pending ones.  We don't have to check for rmw_locked
708 		 * because there is no way they are inside finish_rmw right now
709 		 */
710 		list_for_each_entry(pending, &cur->plug_list, plug_list) {
711 			if (rbio_can_merge(pending, rbio)) {
712 				merge_rbio(pending, rbio);
713 				spin_unlock(&cur->bio_list_lock);
714 				freeit = rbio;
715 				ret = 1;
716 				goto out;
717 			}
718 		}
719 
720 		/*
721 		 * No merging, put us on the tail of the plug list, our rbio
722 		 * will be started with the currently running rbio unlocks
723 		 */
724 		list_add_tail(&rbio->plug_list, &cur->plug_list);
725 		spin_unlock(&cur->bio_list_lock);
726 		ret = 1;
727 		goto out;
728 	}
729 lockit:
730 	refcount_inc(&rbio->refs);
731 	list_add(&rbio->hash_list, &h->hash_list);
732 out:
733 	spin_unlock_irqrestore(&h->lock, flags);
734 	if (cache_drop)
735 		remove_rbio_from_cache(cache_drop);
736 	if (freeit)
737 		__free_raid_bio(freeit);
738 	return ret;
739 }
740 
741 /*
742  * called as rmw or parity rebuild is completed.  If the plug list has more
743  * rbios waiting for this stripe, the next one on the list will be started
744  */
745 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
746 {
747 	int bucket;
748 	struct btrfs_stripe_hash *h;
749 	unsigned long flags;
750 	int keep_cache = 0;
751 
752 	bucket = rbio_bucket(rbio);
753 	h = rbio->fs_info->stripe_hash_table->table + bucket;
754 
755 	if (list_empty(&rbio->plug_list))
756 		cache_rbio(rbio);
757 
758 	spin_lock_irqsave(&h->lock, flags);
759 	spin_lock(&rbio->bio_list_lock);
760 
761 	if (!list_empty(&rbio->hash_list)) {
762 		/*
763 		 * if we're still cached and there is no other IO
764 		 * to perform, just leave this rbio here for others
765 		 * to steal from later
766 		 */
767 		if (list_empty(&rbio->plug_list) &&
768 		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
769 			keep_cache = 1;
770 			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
771 			BUG_ON(!bio_list_empty(&rbio->bio_list));
772 			goto done;
773 		}
774 
775 		list_del_init(&rbio->hash_list);
776 		refcount_dec(&rbio->refs);
777 
778 		/*
779 		 * we use the plug list to hold all the rbios
780 		 * waiting for the chance to lock this stripe.
781 		 * hand the lock over to one of them.
782 		 */
783 		if (!list_empty(&rbio->plug_list)) {
784 			struct btrfs_raid_bio *next;
785 			struct list_head *head = rbio->plug_list.next;
786 
787 			next = list_entry(head, struct btrfs_raid_bio,
788 					  plug_list);
789 
790 			list_del_init(&rbio->plug_list);
791 
792 			list_add(&next->hash_list, &h->hash_list);
793 			refcount_inc(&next->refs);
794 			spin_unlock(&rbio->bio_list_lock);
795 			spin_unlock_irqrestore(&h->lock, flags);
796 
797 			if (next->operation == BTRFS_RBIO_READ_REBUILD)
798 				start_async_work(next, read_rebuild_work);
799 			else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
800 				steal_rbio(rbio, next);
801 				start_async_work(next, read_rebuild_work);
802 			} else if (next->operation == BTRFS_RBIO_WRITE) {
803 				steal_rbio(rbio, next);
804 				start_async_work(next, rmw_work);
805 			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
806 				steal_rbio(rbio, next);
807 				start_async_work(next, scrub_parity_work);
808 			}
809 
810 			goto done_nolock;
811 		}
812 	}
813 done:
814 	spin_unlock(&rbio->bio_list_lock);
815 	spin_unlock_irqrestore(&h->lock, flags);
816 
817 done_nolock:
818 	if (!keep_cache)
819 		remove_rbio_from_cache(rbio);
820 }
821 
822 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
823 {
824 	int i;
825 
826 	if (!refcount_dec_and_test(&rbio->refs))
827 		return;
828 
829 	WARN_ON(!list_empty(&rbio->stripe_cache));
830 	WARN_ON(!list_empty(&rbio->hash_list));
831 	WARN_ON(!bio_list_empty(&rbio->bio_list));
832 
833 	for (i = 0; i < rbio->nr_pages; i++) {
834 		if (rbio->stripe_pages[i]) {
835 			__free_page(rbio->stripe_pages[i]);
836 			rbio->stripe_pages[i] = NULL;
837 		}
838 	}
839 
840 	btrfs_put_bbio(rbio->bbio);
841 	kfree(rbio);
842 }
843 
844 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
845 {
846 	struct bio *next;
847 
848 	while (cur) {
849 		next = cur->bi_next;
850 		cur->bi_next = NULL;
851 		cur->bi_status = err;
852 		bio_endio(cur);
853 		cur = next;
854 	}
855 }
856 
857 /*
858  * this frees the rbio and runs through all the bios in the
859  * bio_list and calls end_io on them
860  */
861 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
862 {
863 	struct bio *cur = bio_list_get(&rbio->bio_list);
864 	struct bio *extra;
865 
866 	if (rbio->generic_bio_cnt)
867 		btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
868 
869 	/*
870 	 * At this moment, rbio->bio_list is empty, however since rbio does not
871 	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
872 	 * hash list, rbio may be merged with others so that rbio->bio_list
873 	 * becomes non-empty.
874 	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
875 	 * more and we can call bio_endio() on all queued bios.
876 	 */
877 	unlock_stripe(rbio);
878 	extra = bio_list_get(&rbio->bio_list);
879 	__free_raid_bio(rbio);
880 
881 	rbio_endio_bio_list(cur, err);
882 	if (extra)
883 		rbio_endio_bio_list(extra, err);
884 }
885 
886 /*
887  * end io function used by finish_rmw.  When we finally
888  * get here, we've written a full stripe
889  */
890 static void raid_write_end_io(struct bio *bio)
891 {
892 	struct btrfs_raid_bio *rbio = bio->bi_private;
893 	blk_status_t err = bio->bi_status;
894 	int max_errors;
895 
896 	if (err)
897 		fail_bio_stripe(rbio, bio);
898 
899 	bio_put(bio);
900 
901 	if (!atomic_dec_and_test(&rbio->stripes_pending))
902 		return;
903 
904 	err = BLK_STS_OK;
905 
906 	/* OK, we have read all the stripes we need to. */
907 	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
908 		     0 : rbio->bbio->max_errors;
909 	if (atomic_read(&rbio->error) > max_errors)
910 		err = BLK_STS_IOERR;
911 
912 	rbio_orig_end_io(rbio, err);
913 }
914 
915 /*
916  * the read/modify/write code wants to use the original bio for
917  * any pages it included, and then use the rbio for everything
918  * else.  This function decides if a given index (stripe number)
919  * and page number in that stripe fall inside the original bio
920  * or the rbio.
921  *
922  * if you set bio_list_only, you'll get a NULL back for any ranges
923  * that are outside the bio_list
924  *
925  * This doesn't take any refs on anything, you get a bare page pointer
926  * and the caller must bump refs as required.
927  *
928  * You must call index_rbio_pages once before you can trust
929  * the answers from this function.
930  */
931 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
932 				 int index, int pagenr, int bio_list_only)
933 {
934 	int chunk_page;
935 	struct page *p = NULL;
936 
937 	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
938 
939 	spin_lock_irq(&rbio->bio_list_lock);
940 	p = rbio->bio_pages[chunk_page];
941 	spin_unlock_irq(&rbio->bio_list_lock);
942 
943 	if (p || bio_list_only)
944 		return p;
945 
946 	return rbio->stripe_pages[chunk_page];
947 }
948 
949 /*
950  * number of pages we need for the entire stripe across all the
951  * drives
952  */
953 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
954 {
955 	return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
956 }
957 
958 /*
959  * allocation and initial setup for the btrfs_raid_bio.  Not
960  * this does not allocate any pages for rbio->pages.
961  */
962 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
963 					 struct btrfs_bio *bbio,
964 					 u64 stripe_len)
965 {
966 	struct btrfs_raid_bio *rbio;
967 	int nr_data = 0;
968 	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
969 	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
970 	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
971 	void *p;
972 
973 	rbio = kzalloc(sizeof(*rbio) +
974 		       sizeof(*rbio->stripe_pages) * num_pages +
975 		       sizeof(*rbio->bio_pages) * num_pages +
976 		       sizeof(*rbio->finish_pointers) * real_stripes +
977 		       sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
978 		       sizeof(*rbio->finish_pbitmap) *
979 				BITS_TO_LONGS(stripe_npages),
980 		       GFP_NOFS);
981 	if (!rbio)
982 		return ERR_PTR(-ENOMEM);
983 
984 	bio_list_init(&rbio->bio_list);
985 	INIT_LIST_HEAD(&rbio->plug_list);
986 	spin_lock_init(&rbio->bio_list_lock);
987 	INIT_LIST_HEAD(&rbio->stripe_cache);
988 	INIT_LIST_HEAD(&rbio->hash_list);
989 	rbio->bbio = bbio;
990 	rbio->fs_info = fs_info;
991 	rbio->stripe_len = stripe_len;
992 	rbio->nr_pages = num_pages;
993 	rbio->real_stripes = real_stripes;
994 	rbio->stripe_npages = stripe_npages;
995 	rbio->faila = -1;
996 	rbio->failb = -1;
997 	refcount_set(&rbio->refs, 1);
998 	atomic_set(&rbio->error, 0);
999 	atomic_set(&rbio->stripes_pending, 0);
1000 
1001 	/*
1002 	 * the stripe_pages, bio_pages, etc arrays point to the extra
1003 	 * memory we allocated past the end of the rbio
1004 	 */
1005 	p = rbio + 1;
1006 #define CONSUME_ALLOC(ptr, count)	do {				\
1007 		ptr = p;						\
1008 		p = (unsigned char *)p + sizeof(*(ptr)) * (count);	\
1009 	} while (0)
1010 	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
1011 	CONSUME_ALLOC(rbio->bio_pages, num_pages);
1012 	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
1013 	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
1014 	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
1015 #undef  CONSUME_ALLOC
1016 
1017 	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1018 		nr_data = real_stripes - 1;
1019 	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1020 		nr_data = real_stripes - 2;
1021 	else
1022 		BUG();
1023 
1024 	rbio->nr_data = nr_data;
1025 	return rbio;
1026 }
1027 
1028 /* allocate pages for all the stripes in the bio, including parity */
1029 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1030 {
1031 	int i;
1032 	struct page *page;
1033 
1034 	for (i = 0; i < rbio->nr_pages; i++) {
1035 		if (rbio->stripe_pages[i])
1036 			continue;
1037 		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1038 		if (!page)
1039 			return -ENOMEM;
1040 		rbio->stripe_pages[i] = page;
1041 	}
1042 	return 0;
1043 }
1044 
1045 /* only allocate pages for p/q stripes */
1046 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1047 {
1048 	int i;
1049 	struct page *page;
1050 
1051 	i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
1052 
1053 	for (; i < rbio->nr_pages; i++) {
1054 		if (rbio->stripe_pages[i])
1055 			continue;
1056 		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1057 		if (!page)
1058 			return -ENOMEM;
1059 		rbio->stripe_pages[i] = page;
1060 	}
1061 	return 0;
1062 }
1063 
1064 /*
1065  * add a single page from a specific stripe into our list of bios for IO
1066  * this will try to merge into existing bios if possible, and returns
1067  * zero if all went well.
1068  */
1069 static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1070 			    struct bio_list *bio_list,
1071 			    struct page *page,
1072 			    int stripe_nr,
1073 			    unsigned long page_index,
1074 			    unsigned long bio_max_len)
1075 {
1076 	struct bio *last = bio_list->tail;
1077 	int ret;
1078 	struct bio *bio;
1079 	struct btrfs_bio_stripe *stripe;
1080 	u64 disk_start;
1081 
1082 	stripe = &rbio->bbio->stripes[stripe_nr];
1083 	disk_start = stripe->physical + (page_index << PAGE_SHIFT);
1084 
1085 	/* if the device is missing, just fail this stripe */
1086 	if (!stripe->dev->bdev)
1087 		return fail_rbio_index(rbio, stripe_nr);
1088 
1089 	/* see if we can add this page onto our existing bio */
1090 	if (last) {
1091 		u64 last_end = last->bi_iter.bi_sector << 9;
1092 		last_end += last->bi_iter.bi_size;
1093 
1094 		/*
1095 		 * we can't merge these if they are from different
1096 		 * devices or if they are not contiguous
1097 		 */
1098 		if (last_end == disk_start && !last->bi_status &&
1099 		    last->bi_bdev == stripe->dev->bdev) {
1100 			ret = bio_add_page(last, page, PAGE_SIZE, 0);
1101 			if (ret == PAGE_SIZE)
1102 				return 0;
1103 		}
1104 	}
1105 
1106 	/* put a new bio on the list */
1107 	bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
1108 	btrfs_io_bio(bio)->device = stripe->dev;
1109 	bio->bi_iter.bi_size = 0;
1110 	bio_set_dev(bio, stripe->dev->bdev);
1111 	bio->bi_iter.bi_sector = disk_start >> 9;
1112 
1113 	bio_add_page(bio, page, PAGE_SIZE, 0);
1114 	bio_list_add(bio_list, bio);
1115 	return 0;
1116 }
1117 
1118 /*
1119  * while we're doing the read/modify/write cycle, we could
1120  * have errors in reading pages off the disk.  This checks
1121  * for errors and if we're not able to read the page it'll
1122  * trigger parity reconstruction.  The rmw will be finished
1123  * after we've reconstructed the failed stripes
1124  */
1125 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1126 {
1127 	if (rbio->faila >= 0 || rbio->failb >= 0) {
1128 		BUG_ON(rbio->faila == rbio->real_stripes - 1);
1129 		__raid56_parity_recover(rbio);
1130 	} else {
1131 		finish_rmw(rbio);
1132 	}
1133 }
1134 
1135 /*
1136  * helper function to walk our bio list and populate the bio_pages array with
1137  * the result.  This seems expensive, but it is faster than constantly
1138  * searching through the bio list as we setup the IO in finish_rmw or stripe
1139  * reconstruction.
1140  *
1141  * This must be called before you trust the answers from page_in_rbio
1142  */
1143 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1144 {
1145 	struct bio *bio;
1146 	u64 start;
1147 	unsigned long stripe_offset;
1148 	unsigned long page_index;
1149 
1150 	spin_lock_irq(&rbio->bio_list_lock);
1151 	bio_list_for_each(bio, &rbio->bio_list) {
1152 		struct bio_vec bvec;
1153 		struct bvec_iter iter;
1154 		int i = 0;
1155 
1156 		start = bio->bi_iter.bi_sector << 9;
1157 		stripe_offset = start - rbio->bbio->raid_map[0];
1158 		page_index = stripe_offset >> PAGE_SHIFT;
1159 
1160 		if (bio_flagged(bio, BIO_CLONED))
1161 			bio->bi_iter = btrfs_io_bio(bio)->iter;
1162 
1163 		bio_for_each_segment(bvec, bio, iter) {
1164 			rbio->bio_pages[page_index + i] = bvec.bv_page;
1165 			i++;
1166 		}
1167 	}
1168 	spin_unlock_irq(&rbio->bio_list_lock);
1169 }
1170 
1171 /*
1172  * this is called from one of two situations.  We either
1173  * have a full stripe from the higher layers, or we've read all
1174  * the missing bits off disk.
1175  *
1176  * This will calculate the parity and then send down any
1177  * changed blocks.
1178  */
1179 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1180 {
1181 	struct btrfs_bio *bbio = rbio->bbio;
1182 	void **pointers = rbio->finish_pointers;
1183 	int nr_data = rbio->nr_data;
1184 	int stripe;
1185 	int pagenr;
1186 	bool has_qstripe;
1187 	struct bio_list bio_list;
1188 	struct bio *bio;
1189 	int ret;
1190 
1191 	bio_list_init(&bio_list);
1192 
1193 	if (rbio->real_stripes - rbio->nr_data == 1)
1194 		has_qstripe = false;
1195 	else if (rbio->real_stripes - rbio->nr_data == 2)
1196 		has_qstripe = true;
1197 	else
1198 		BUG();
1199 
1200 	/* at this point we either have a full stripe,
1201 	 * or we've read the full stripe from the drive.
1202 	 * recalculate the parity and write the new results.
1203 	 *
1204 	 * We're not allowed to add any new bios to the
1205 	 * bio list here, anyone else that wants to
1206 	 * change this stripe needs to do their own rmw.
1207 	 */
1208 	spin_lock_irq(&rbio->bio_list_lock);
1209 	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1210 	spin_unlock_irq(&rbio->bio_list_lock);
1211 
1212 	atomic_set(&rbio->error, 0);
1213 
1214 	/*
1215 	 * now that we've set rmw_locked, run through the
1216 	 * bio list one last time and map the page pointers
1217 	 *
1218 	 * We don't cache full rbios because we're assuming
1219 	 * the higher layers are unlikely to use this area of
1220 	 * the disk again soon.  If they do use it again,
1221 	 * hopefully they will send another full bio.
1222 	 */
1223 	index_rbio_pages(rbio);
1224 	if (!rbio_is_full(rbio))
1225 		cache_rbio_pages(rbio);
1226 	else
1227 		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1228 
1229 	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1230 		struct page *p;
1231 		/* first collect one page from each data stripe */
1232 		for (stripe = 0; stripe < nr_data; stripe++) {
1233 			p = page_in_rbio(rbio, stripe, pagenr, 0);
1234 			pointers[stripe] = kmap(p);
1235 		}
1236 
1237 		/* then add the parity stripe */
1238 		p = rbio_pstripe_page(rbio, pagenr);
1239 		SetPageUptodate(p);
1240 		pointers[stripe++] = kmap(p);
1241 
1242 		if (has_qstripe) {
1243 
1244 			/*
1245 			 * raid6, add the qstripe and call the
1246 			 * library function to fill in our p/q
1247 			 */
1248 			p = rbio_qstripe_page(rbio, pagenr);
1249 			SetPageUptodate(p);
1250 			pointers[stripe++] = kmap(p);
1251 
1252 			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
1253 						pointers);
1254 		} else {
1255 			/* raid5 */
1256 			copy_page(pointers[nr_data], pointers[0]);
1257 			run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
1258 		}
1259 
1260 
1261 		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
1262 			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1263 	}
1264 
1265 	/*
1266 	 * time to start writing.  Make bios for everything from the
1267 	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
1268 	 * everything else.
1269 	 */
1270 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1271 		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1272 			struct page *page;
1273 			if (stripe < rbio->nr_data) {
1274 				page = page_in_rbio(rbio, stripe, pagenr, 1);
1275 				if (!page)
1276 					continue;
1277 			} else {
1278 			       page = rbio_stripe_page(rbio, stripe, pagenr);
1279 			}
1280 
1281 			ret = rbio_add_io_page(rbio, &bio_list,
1282 				       page, stripe, pagenr, rbio->stripe_len);
1283 			if (ret)
1284 				goto cleanup;
1285 		}
1286 	}
1287 
1288 	if (likely(!bbio->num_tgtdevs))
1289 		goto write_data;
1290 
1291 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1292 		if (!bbio->tgtdev_map[stripe])
1293 			continue;
1294 
1295 		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1296 			struct page *page;
1297 			if (stripe < rbio->nr_data) {
1298 				page = page_in_rbio(rbio, stripe, pagenr, 1);
1299 				if (!page)
1300 					continue;
1301 			} else {
1302 			       page = rbio_stripe_page(rbio, stripe, pagenr);
1303 			}
1304 
1305 			ret = rbio_add_io_page(rbio, &bio_list, page,
1306 					       rbio->bbio->tgtdev_map[stripe],
1307 					       pagenr, rbio->stripe_len);
1308 			if (ret)
1309 				goto cleanup;
1310 		}
1311 	}
1312 
1313 write_data:
1314 	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1315 	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1316 
1317 	while ((bio = bio_list_pop(&bio_list))) {
1318 		bio->bi_private = rbio;
1319 		bio->bi_end_io = raid_write_end_io;
1320 		bio->bi_opf = REQ_OP_WRITE;
1321 
1322 		submit_bio(bio);
1323 	}
1324 	return;
1325 
1326 cleanup:
1327 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
1328 
1329 	while ((bio = bio_list_pop(&bio_list)))
1330 		bio_put(bio);
1331 }
1332 
1333 /*
1334  * helper to find the stripe number for a given bio.  Used to figure out which
1335  * stripe has failed.  This expects the bio to correspond to a physical disk,
1336  * so it looks up based on physical sector numbers.
1337  */
1338 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1339 			   struct bio *bio)
1340 {
1341 	u64 physical = bio->bi_iter.bi_sector;
1342 	int i;
1343 	struct btrfs_bio_stripe *stripe;
1344 
1345 	physical <<= 9;
1346 
1347 	for (i = 0; i < rbio->bbio->num_stripes; i++) {
1348 		stripe = &rbio->bbio->stripes[i];
1349 		if (in_range(physical, stripe->physical, rbio->stripe_len) &&
1350 		    stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
1351 			return i;
1352 		}
1353 	}
1354 	return -1;
1355 }
1356 
1357 /*
1358  * helper to find the stripe number for a given
1359  * bio (before mapping).  Used to figure out which stripe has
1360  * failed.  This looks up based on logical block numbers.
1361  */
1362 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1363 				   struct bio *bio)
1364 {
1365 	u64 logical = bio->bi_iter.bi_sector << 9;
1366 	int i;
1367 
1368 	for (i = 0; i < rbio->nr_data; i++) {
1369 		u64 stripe_start = rbio->bbio->raid_map[i];
1370 
1371 		if (in_range(logical, stripe_start, rbio->stripe_len))
1372 			return i;
1373 	}
1374 	return -1;
1375 }
1376 
1377 /*
1378  * returns -EIO if we had too many failures
1379  */
1380 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1381 {
1382 	unsigned long flags;
1383 	int ret = 0;
1384 
1385 	spin_lock_irqsave(&rbio->bio_list_lock, flags);
1386 
1387 	/* we already know this stripe is bad, move on */
1388 	if (rbio->faila == failed || rbio->failb == failed)
1389 		goto out;
1390 
1391 	if (rbio->faila == -1) {
1392 		/* first failure on this rbio */
1393 		rbio->faila = failed;
1394 		atomic_inc(&rbio->error);
1395 	} else if (rbio->failb == -1) {
1396 		/* second failure on this rbio */
1397 		rbio->failb = failed;
1398 		atomic_inc(&rbio->error);
1399 	} else {
1400 		ret = -EIO;
1401 	}
1402 out:
1403 	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1404 
1405 	return ret;
1406 }
1407 
1408 /*
1409  * helper to fail a stripe based on a physical disk
1410  * bio.
1411  */
1412 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1413 			   struct bio *bio)
1414 {
1415 	int failed = find_bio_stripe(rbio, bio);
1416 
1417 	if (failed < 0)
1418 		return -EIO;
1419 
1420 	return fail_rbio_index(rbio, failed);
1421 }
1422 
1423 /*
1424  * this sets each page in the bio uptodate.  It should only be used on private
1425  * rbio pages, nothing that comes in from the higher layers
1426  */
1427 static void set_bio_pages_uptodate(struct bio *bio)
1428 {
1429 	struct bio_vec *bvec;
1430 	struct bvec_iter_all iter_all;
1431 
1432 	ASSERT(!bio_flagged(bio, BIO_CLONED));
1433 
1434 	bio_for_each_segment_all(bvec, bio, iter_all)
1435 		SetPageUptodate(bvec->bv_page);
1436 }
1437 
1438 /*
1439  * end io for the read phase of the rmw cycle.  All the bios here are physical
1440  * stripe bios we've read from the disk so we can recalculate the parity of the
1441  * stripe.
1442  *
1443  * This will usually kick off finish_rmw once all the bios are read in, but it
1444  * may trigger parity reconstruction if we had any errors along the way
1445  */
1446 static void raid_rmw_end_io(struct bio *bio)
1447 {
1448 	struct btrfs_raid_bio *rbio = bio->bi_private;
1449 
1450 	if (bio->bi_status)
1451 		fail_bio_stripe(rbio, bio);
1452 	else
1453 		set_bio_pages_uptodate(bio);
1454 
1455 	bio_put(bio);
1456 
1457 	if (!atomic_dec_and_test(&rbio->stripes_pending))
1458 		return;
1459 
1460 	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1461 		goto cleanup;
1462 
1463 	/*
1464 	 * this will normally call finish_rmw to start our write
1465 	 * but if there are any failed stripes we'll reconstruct
1466 	 * from parity first
1467 	 */
1468 	validate_rbio_for_rmw(rbio);
1469 	return;
1470 
1471 cleanup:
1472 
1473 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
1474 }
1475 
1476 /*
1477  * the stripe must be locked by the caller.  It will
1478  * unlock after all the writes are done
1479  */
1480 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1481 {
1482 	int bios_to_read = 0;
1483 	struct bio_list bio_list;
1484 	int ret;
1485 	int pagenr;
1486 	int stripe;
1487 	struct bio *bio;
1488 
1489 	bio_list_init(&bio_list);
1490 
1491 	ret = alloc_rbio_pages(rbio);
1492 	if (ret)
1493 		goto cleanup;
1494 
1495 	index_rbio_pages(rbio);
1496 
1497 	atomic_set(&rbio->error, 0);
1498 	/*
1499 	 * build a list of bios to read all the missing parts of this
1500 	 * stripe
1501 	 */
1502 	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1503 		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1504 			struct page *page;
1505 			/*
1506 			 * we want to find all the pages missing from
1507 			 * the rbio and read them from the disk.  If
1508 			 * page_in_rbio finds a page in the bio list
1509 			 * we don't need to read it off the stripe.
1510 			 */
1511 			page = page_in_rbio(rbio, stripe, pagenr, 1);
1512 			if (page)
1513 				continue;
1514 
1515 			page = rbio_stripe_page(rbio, stripe, pagenr);
1516 			/*
1517 			 * the bio cache may have handed us an uptodate
1518 			 * page.  If so, be happy and use it
1519 			 */
1520 			if (PageUptodate(page))
1521 				continue;
1522 
1523 			ret = rbio_add_io_page(rbio, &bio_list, page,
1524 				       stripe, pagenr, rbio->stripe_len);
1525 			if (ret)
1526 				goto cleanup;
1527 		}
1528 	}
1529 
1530 	bios_to_read = bio_list_size(&bio_list);
1531 	if (!bios_to_read) {
1532 		/*
1533 		 * this can happen if others have merged with
1534 		 * us, it means there is nothing left to read.
1535 		 * But if there are missing devices it may not be
1536 		 * safe to do the full stripe write yet.
1537 		 */
1538 		goto finish;
1539 	}
1540 
1541 	/*
1542 	 * the bbio may be freed once we submit the last bio.  Make sure
1543 	 * not to touch it after that
1544 	 */
1545 	atomic_set(&rbio->stripes_pending, bios_to_read);
1546 	while ((bio = bio_list_pop(&bio_list))) {
1547 		bio->bi_private = rbio;
1548 		bio->bi_end_io = raid_rmw_end_io;
1549 		bio->bi_opf = REQ_OP_READ;
1550 
1551 		btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
1552 
1553 		submit_bio(bio);
1554 	}
1555 	/* the actual write will happen once the reads are done */
1556 	return 0;
1557 
1558 cleanup:
1559 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
1560 
1561 	while ((bio = bio_list_pop(&bio_list)))
1562 		bio_put(bio);
1563 
1564 	return -EIO;
1565 
1566 finish:
1567 	validate_rbio_for_rmw(rbio);
1568 	return 0;
1569 }
1570 
1571 /*
1572  * if the upper layers pass in a full stripe, we thank them by only allocating
1573  * enough pages to hold the parity, and sending it all down quickly.
1574  */
1575 static int full_stripe_write(struct btrfs_raid_bio *rbio)
1576 {
1577 	int ret;
1578 
1579 	ret = alloc_rbio_parity_pages(rbio);
1580 	if (ret) {
1581 		__free_raid_bio(rbio);
1582 		return ret;
1583 	}
1584 
1585 	ret = lock_stripe_add(rbio);
1586 	if (ret == 0)
1587 		finish_rmw(rbio);
1588 	return 0;
1589 }
1590 
1591 /*
1592  * partial stripe writes get handed over to async helpers.
1593  * We're really hoping to merge a few more writes into this
1594  * rbio before calculating new parity
1595  */
1596 static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1597 {
1598 	int ret;
1599 
1600 	ret = lock_stripe_add(rbio);
1601 	if (ret == 0)
1602 		start_async_work(rbio, rmw_work);
1603 	return 0;
1604 }
1605 
1606 /*
1607  * sometimes while we were reading from the drive to
1608  * recalculate parity, enough new bios come into create
1609  * a full stripe.  So we do a check here to see if we can
1610  * go directly to finish_rmw
1611  */
1612 static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1613 {
1614 	/* head off into rmw land if we don't have a full stripe */
1615 	if (!rbio_is_full(rbio))
1616 		return partial_stripe_write(rbio);
1617 	return full_stripe_write(rbio);
1618 }
1619 
1620 /*
1621  * We use plugging call backs to collect full stripes.
1622  * Any time we get a partial stripe write while plugged
1623  * we collect it into a list.  When the unplug comes down,
1624  * we sort the list by logical block number and merge
1625  * everything we can into the same rbios
1626  */
1627 struct btrfs_plug_cb {
1628 	struct blk_plug_cb cb;
1629 	struct btrfs_fs_info *info;
1630 	struct list_head rbio_list;
1631 	struct btrfs_work work;
1632 };
1633 
1634 /*
1635  * rbios on the plug list are sorted for easier merging.
1636  */
1637 static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1638 {
1639 	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1640 						 plug_list);
1641 	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1642 						 plug_list);
1643 	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1644 	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1645 
1646 	if (a_sector < b_sector)
1647 		return -1;
1648 	if (a_sector > b_sector)
1649 		return 1;
1650 	return 0;
1651 }
1652 
1653 static void run_plug(struct btrfs_plug_cb *plug)
1654 {
1655 	struct btrfs_raid_bio *cur;
1656 	struct btrfs_raid_bio *last = NULL;
1657 
1658 	/*
1659 	 * sort our plug list then try to merge
1660 	 * everything we can in hopes of creating full
1661 	 * stripes.
1662 	 */
1663 	list_sort(NULL, &plug->rbio_list, plug_cmp);
1664 	while (!list_empty(&plug->rbio_list)) {
1665 		cur = list_entry(plug->rbio_list.next,
1666 				 struct btrfs_raid_bio, plug_list);
1667 		list_del_init(&cur->plug_list);
1668 
1669 		if (rbio_is_full(cur)) {
1670 			int ret;
1671 
1672 			/* we have a full stripe, send it down */
1673 			ret = full_stripe_write(cur);
1674 			BUG_ON(ret);
1675 			continue;
1676 		}
1677 		if (last) {
1678 			if (rbio_can_merge(last, cur)) {
1679 				merge_rbio(last, cur);
1680 				__free_raid_bio(cur);
1681 				continue;
1682 
1683 			}
1684 			__raid56_parity_write(last);
1685 		}
1686 		last = cur;
1687 	}
1688 	if (last) {
1689 		__raid56_parity_write(last);
1690 	}
1691 	kfree(plug);
1692 }
1693 
1694 /*
1695  * if the unplug comes from schedule, we have to push the
1696  * work off to a helper thread
1697  */
1698 static void unplug_work(struct btrfs_work *work)
1699 {
1700 	struct btrfs_plug_cb *plug;
1701 	plug = container_of(work, struct btrfs_plug_cb, work);
1702 	run_plug(plug);
1703 }
1704 
1705 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1706 {
1707 	struct btrfs_plug_cb *plug;
1708 	plug = container_of(cb, struct btrfs_plug_cb, cb);
1709 
1710 	if (from_schedule) {
1711 		btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
1712 		btrfs_queue_work(plug->info->rmw_workers,
1713 				 &plug->work);
1714 		return;
1715 	}
1716 	run_plug(plug);
1717 }
1718 
1719 /*
1720  * our main entry point for writes from the rest of the FS.
1721  */
1722 int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
1723 			struct btrfs_bio *bbio, u64 stripe_len)
1724 {
1725 	struct btrfs_raid_bio *rbio;
1726 	struct btrfs_plug_cb *plug = NULL;
1727 	struct blk_plug_cb *cb;
1728 	int ret;
1729 
1730 	rbio = alloc_rbio(fs_info, bbio, stripe_len);
1731 	if (IS_ERR(rbio)) {
1732 		btrfs_put_bbio(bbio);
1733 		return PTR_ERR(rbio);
1734 	}
1735 	bio_list_add(&rbio->bio_list, bio);
1736 	rbio->bio_list_bytes = bio->bi_iter.bi_size;
1737 	rbio->operation = BTRFS_RBIO_WRITE;
1738 
1739 	btrfs_bio_counter_inc_noblocked(fs_info);
1740 	rbio->generic_bio_cnt = 1;
1741 
1742 	/*
1743 	 * don't plug on full rbios, just get them out the door
1744 	 * as quickly as we can
1745 	 */
1746 	if (rbio_is_full(rbio)) {
1747 		ret = full_stripe_write(rbio);
1748 		if (ret)
1749 			btrfs_bio_counter_dec(fs_info);
1750 		return ret;
1751 	}
1752 
1753 	cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
1754 	if (cb) {
1755 		plug = container_of(cb, struct btrfs_plug_cb, cb);
1756 		if (!plug->info) {
1757 			plug->info = fs_info;
1758 			INIT_LIST_HEAD(&plug->rbio_list);
1759 		}
1760 		list_add_tail(&rbio->plug_list, &plug->rbio_list);
1761 		ret = 0;
1762 	} else {
1763 		ret = __raid56_parity_write(rbio);
1764 		if (ret)
1765 			btrfs_bio_counter_dec(fs_info);
1766 	}
1767 	return ret;
1768 }
1769 
1770 /*
1771  * all parity reconstruction happens here.  We've read in everything
1772  * we can find from the drives and this does the heavy lifting of
1773  * sorting the good from the bad.
1774  */
1775 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1776 {
1777 	int pagenr, stripe;
1778 	void **pointers;
1779 	int faila = -1, failb = -1;
1780 	struct page *page;
1781 	blk_status_t err;
1782 	int i;
1783 
1784 	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1785 	if (!pointers) {
1786 		err = BLK_STS_RESOURCE;
1787 		goto cleanup_io;
1788 	}
1789 
1790 	faila = rbio->faila;
1791 	failb = rbio->failb;
1792 
1793 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1794 	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1795 		spin_lock_irq(&rbio->bio_list_lock);
1796 		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1797 		spin_unlock_irq(&rbio->bio_list_lock);
1798 	}
1799 
1800 	index_rbio_pages(rbio);
1801 
1802 	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1803 		/*
1804 		 * Now we just use bitmap to mark the horizontal stripes in
1805 		 * which we have data when doing parity scrub.
1806 		 */
1807 		if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1808 		    !test_bit(pagenr, rbio->dbitmap))
1809 			continue;
1810 
1811 		/* setup our array of pointers with pages
1812 		 * from each stripe
1813 		 */
1814 		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1815 			/*
1816 			 * if we're rebuilding a read, we have to use
1817 			 * pages from the bio list
1818 			 */
1819 			if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1820 			     rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
1821 			    (stripe == faila || stripe == failb)) {
1822 				page = page_in_rbio(rbio, stripe, pagenr, 0);
1823 			} else {
1824 				page = rbio_stripe_page(rbio, stripe, pagenr);
1825 			}
1826 			pointers[stripe] = kmap(page);
1827 		}
1828 
1829 		/* all raid6 handling here */
1830 		if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1831 			/*
1832 			 * single failure, rebuild from parity raid5
1833 			 * style
1834 			 */
1835 			if (failb < 0) {
1836 				if (faila == rbio->nr_data) {
1837 					/*
1838 					 * Just the P stripe has failed, without
1839 					 * a bad data or Q stripe.
1840 					 * TODO, we should redo the xor here.
1841 					 */
1842 					err = BLK_STS_IOERR;
1843 					goto cleanup;
1844 				}
1845 				/*
1846 				 * a single failure in raid6 is rebuilt
1847 				 * in the pstripe code below
1848 				 */
1849 				goto pstripe;
1850 			}
1851 
1852 			/* make sure our ps and qs are in order */
1853 			if (faila > failb)
1854 				swap(faila, failb);
1855 
1856 			/* if the q stripe is failed, do a pstripe reconstruction
1857 			 * from the xors.
1858 			 * If both the q stripe and the P stripe are failed, we're
1859 			 * here due to a crc mismatch and we can't give them the
1860 			 * data they want
1861 			 */
1862 			if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
1863 				if (rbio->bbio->raid_map[faila] ==
1864 				    RAID5_P_STRIPE) {
1865 					err = BLK_STS_IOERR;
1866 					goto cleanup;
1867 				}
1868 				/*
1869 				 * otherwise we have one bad data stripe and
1870 				 * a good P stripe.  raid5!
1871 				 */
1872 				goto pstripe;
1873 			}
1874 
1875 			if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
1876 				raid6_datap_recov(rbio->real_stripes,
1877 						  PAGE_SIZE, faila, pointers);
1878 			} else {
1879 				raid6_2data_recov(rbio->real_stripes,
1880 						  PAGE_SIZE, faila, failb,
1881 						  pointers);
1882 			}
1883 		} else {
1884 			void *p;
1885 
1886 			/* rebuild from P stripe here (raid5 or raid6) */
1887 			BUG_ON(failb != -1);
1888 pstripe:
1889 			/* Copy parity block into failed block to start with */
1890 			copy_page(pointers[faila], pointers[rbio->nr_data]);
1891 
1892 			/* rearrange the pointer array */
1893 			p = pointers[faila];
1894 			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1895 				pointers[stripe] = pointers[stripe + 1];
1896 			pointers[rbio->nr_data - 1] = p;
1897 
1898 			/* xor in the rest */
1899 			run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
1900 		}
1901 		/* if we're doing this rebuild as part of an rmw, go through
1902 		 * and set all of our private rbio pages in the
1903 		 * failed stripes as uptodate.  This way finish_rmw will
1904 		 * know they can be trusted.  If this was a read reconstruction,
1905 		 * other endio functions will fiddle the uptodate bits
1906 		 */
1907 		if (rbio->operation == BTRFS_RBIO_WRITE) {
1908 			for (i = 0;  i < rbio->stripe_npages; i++) {
1909 				if (faila != -1) {
1910 					page = rbio_stripe_page(rbio, faila, i);
1911 					SetPageUptodate(page);
1912 				}
1913 				if (failb != -1) {
1914 					page = rbio_stripe_page(rbio, failb, i);
1915 					SetPageUptodate(page);
1916 				}
1917 			}
1918 		}
1919 		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1920 			/*
1921 			 * if we're rebuilding a read, we have to use
1922 			 * pages from the bio list
1923 			 */
1924 			if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1925 			     rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
1926 			    (stripe == faila || stripe == failb)) {
1927 				page = page_in_rbio(rbio, stripe, pagenr, 0);
1928 			} else {
1929 				page = rbio_stripe_page(rbio, stripe, pagenr);
1930 			}
1931 			kunmap(page);
1932 		}
1933 	}
1934 
1935 	err = BLK_STS_OK;
1936 cleanup:
1937 	kfree(pointers);
1938 
1939 cleanup_io:
1940 	/*
1941 	 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
1942 	 * valid rbio which is consistent with ondisk content, thus such a
1943 	 * valid rbio can be cached to avoid further disk reads.
1944 	 */
1945 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1946 	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1947 		/*
1948 		 * - In case of two failures, where rbio->failb != -1:
1949 		 *
1950 		 *   Do not cache this rbio since the above read reconstruction
1951 		 *   (raid6_datap_recov() or raid6_2data_recov()) may have
1952 		 *   changed some content of stripes which are not identical to
1953 		 *   on-disk content any more, otherwise, a later write/recover
1954 		 *   may steal stripe_pages from this rbio and end up with
1955 		 *   corruptions or rebuild failures.
1956 		 *
1957 		 * - In case of single failure, where rbio->failb == -1:
1958 		 *
1959 		 *   Cache this rbio iff the above read reconstruction is
1960 		 *   executed without problems.
1961 		 */
1962 		if (err == BLK_STS_OK && rbio->failb < 0)
1963 			cache_rbio_pages(rbio);
1964 		else
1965 			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1966 
1967 		rbio_orig_end_io(rbio, err);
1968 	} else if (err == BLK_STS_OK) {
1969 		rbio->faila = -1;
1970 		rbio->failb = -1;
1971 
1972 		if (rbio->operation == BTRFS_RBIO_WRITE)
1973 			finish_rmw(rbio);
1974 		else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
1975 			finish_parity_scrub(rbio, 0);
1976 		else
1977 			BUG();
1978 	} else {
1979 		rbio_orig_end_io(rbio, err);
1980 	}
1981 }
1982 
1983 /*
1984  * This is called only for stripes we've read from disk to
1985  * reconstruct the parity.
1986  */
1987 static void raid_recover_end_io(struct bio *bio)
1988 {
1989 	struct btrfs_raid_bio *rbio = bio->bi_private;
1990 
1991 	/*
1992 	 * we only read stripe pages off the disk, set them
1993 	 * up to date if there were no errors
1994 	 */
1995 	if (bio->bi_status)
1996 		fail_bio_stripe(rbio, bio);
1997 	else
1998 		set_bio_pages_uptodate(bio);
1999 	bio_put(bio);
2000 
2001 	if (!atomic_dec_and_test(&rbio->stripes_pending))
2002 		return;
2003 
2004 	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2005 		rbio_orig_end_io(rbio, BLK_STS_IOERR);
2006 	else
2007 		__raid_recover_end_io(rbio);
2008 }
2009 
2010 /*
2011  * reads everything we need off the disk to reconstruct
2012  * the parity. endio handlers trigger final reconstruction
2013  * when the IO is done.
2014  *
2015  * This is used both for reads from the higher layers and for
2016  * parity construction required to finish a rmw cycle.
2017  */
2018 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2019 {
2020 	int bios_to_read = 0;
2021 	struct bio_list bio_list;
2022 	int ret;
2023 	int pagenr;
2024 	int stripe;
2025 	struct bio *bio;
2026 
2027 	bio_list_init(&bio_list);
2028 
2029 	ret = alloc_rbio_pages(rbio);
2030 	if (ret)
2031 		goto cleanup;
2032 
2033 	atomic_set(&rbio->error, 0);
2034 
2035 	/*
2036 	 * read everything that hasn't failed.  Thanks to the
2037 	 * stripe cache, it is possible that some or all of these
2038 	 * pages are going to be uptodate.
2039 	 */
2040 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2041 		if (rbio->faila == stripe || rbio->failb == stripe) {
2042 			atomic_inc(&rbio->error);
2043 			continue;
2044 		}
2045 
2046 		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
2047 			struct page *p;
2048 
2049 			/*
2050 			 * the rmw code may have already read this
2051 			 * page in
2052 			 */
2053 			p = rbio_stripe_page(rbio, stripe, pagenr);
2054 			if (PageUptodate(p))
2055 				continue;
2056 
2057 			ret = rbio_add_io_page(rbio, &bio_list,
2058 				       rbio_stripe_page(rbio, stripe, pagenr),
2059 				       stripe, pagenr, rbio->stripe_len);
2060 			if (ret < 0)
2061 				goto cleanup;
2062 		}
2063 	}
2064 
2065 	bios_to_read = bio_list_size(&bio_list);
2066 	if (!bios_to_read) {
2067 		/*
2068 		 * we might have no bios to read just because the pages
2069 		 * were up to date, or we might have no bios to read because
2070 		 * the devices were gone.
2071 		 */
2072 		if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
2073 			__raid_recover_end_io(rbio);
2074 			return 0;
2075 		} else {
2076 			goto cleanup;
2077 		}
2078 	}
2079 
2080 	/*
2081 	 * the bbio may be freed once we submit the last bio.  Make sure
2082 	 * not to touch it after that
2083 	 */
2084 	atomic_set(&rbio->stripes_pending, bios_to_read);
2085 	while ((bio = bio_list_pop(&bio_list))) {
2086 		bio->bi_private = rbio;
2087 		bio->bi_end_io = raid_recover_end_io;
2088 		bio->bi_opf = REQ_OP_READ;
2089 
2090 		btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2091 
2092 		submit_bio(bio);
2093 	}
2094 
2095 	return 0;
2096 
2097 cleanup:
2098 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2099 	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
2100 		rbio_orig_end_io(rbio, BLK_STS_IOERR);
2101 
2102 	while ((bio = bio_list_pop(&bio_list)))
2103 		bio_put(bio);
2104 
2105 	return -EIO;
2106 }
2107 
2108 /*
2109  * the main entry point for reads from the higher layers.  This
2110  * is really only called when the normal read path had a failure,
2111  * so we assume the bio they send down corresponds to a failed part
2112  * of the drive.
2113  */
2114 int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
2115 			  struct btrfs_bio *bbio, u64 stripe_len,
2116 			  int mirror_num, int generic_io)
2117 {
2118 	struct btrfs_raid_bio *rbio;
2119 	int ret;
2120 
2121 	if (generic_io) {
2122 		ASSERT(bbio->mirror_num == mirror_num);
2123 		btrfs_io_bio(bio)->mirror_num = mirror_num;
2124 	}
2125 
2126 	rbio = alloc_rbio(fs_info, bbio, stripe_len);
2127 	if (IS_ERR(rbio)) {
2128 		if (generic_io)
2129 			btrfs_put_bbio(bbio);
2130 		return PTR_ERR(rbio);
2131 	}
2132 
2133 	rbio->operation = BTRFS_RBIO_READ_REBUILD;
2134 	bio_list_add(&rbio->bio_list, bio);
2135 	rbio->bio_list_bytes = bio->bi_iter.bi_size;
2136 
2137 	rbio->faila = find_logical_bio_stripe(rbio, bio);
2138 	if (rbio->faila == -1) {
2139 		btrfs_warn(fs_info,
2140 	"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
2141 			   __func__, bio->bi_iter.bi_sector << 9,
2142 			   (u64)bio->bi_iter.bi_size, bbio->map_type);
2143 		if (generic_io)
2144 			btrfs_put_bbio(bbio);
2145 		kfree(rbio);
2146 		return -EIO;
2147 	}
2148 
2149 	if (generic_io) {
2150 		btrfs_bio_counter_inc_noblocked(fs_info);
2151 		rbio->generic_bio_cnt = 1;
2152 	} else {
2153 		btrfs_get_bbio(bbio);
2154 	}
2155 
2156 	/*
2157 	 * Loop retry:
2158 	 * for 'mirror == 2', reconstruct from all other stripes.
2159 	 * for 'mirror_num > 2', select a stripe to fail on every retry.
2160 	 */
2161 	if (mirror_num > 2) {
2162 		/*
2163 		 * 'mirror == 3' is to fail the p stripe and
2164 		 * reconstruct from the q stripe.  'mirror > 3' is to
2165 		 * fail a data stripe and reconstruct from p+q stripe.
2166 		 */
2167 		rbio->failb = rbio->real_stripes - (mirror_num - 1);
2168 		ASSERT(rbio->failb > 0);
2169 		if (rbio->failb <= rbio->faila)
2170 			rbio->failb--;
2171 	}
2172 
2173 	ret = lock_stripe_add(rbio);
2174 
2175 	/*
2176 	 * __raid56_parity_recover will end the bio with
2177 	 * any errors it hits.  We don't want to return
2178 	 * its error value up the stack because our caller
2179 	 * will end up calling bio_endio with any nonzero
2180 	 * return
2181 	 */
2182 	if (ret == 0)
2183 		__raid56_parity_recover(rbio);
2184 	/*
2185 	 * our rbio has been added to the list of
2186 	 * rbios that will be handled after the
2187 	 * currently lock owner is done
2188 	 */
2189 	return 0;
2190 
2191 }
2192 
2193 static void rmw_work(struct btrfs_work *work)
2194 {
2195 	struct btrfs_raid_bio *rbio;
2196 
2197 	rbio = container_of(work, struct btrfs_raid_bio, work);
2198 	raid56_rmw_stripe(rbio);
2199 }
2200 
2201 static void read_rebuild_work(struct btrfs_work *work)
2202 {
2203 	struct btrfs_raid_bio *rbio;
2204 
2205 	rbio = container_of(work, struct btrfs_raid_bio, work);
2206 	__raid56_parity_recover(rbio);
2207 }
2208 
2209 /*
2210  * The following code is used to scrub/replace the parity stripe
2211  *
2212  * Caller must have already increased bio_counter for getting @bbio.
2213  *
2214  * Note: We need make sure all the pages that add into the scrub/replace
2215  * raid bio are correct and not be changed during the scrub/replace. That
2216  * is those pages just hold metadata or file data with checksum.
2217  */
2218 
2219 struct btrfs_raid_bio *
2220 raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
2221 			       struct btrfs_bio *bbio, u64 stripe_len,
2222 			       struct btrfs_device *scrub_dev,
2223 			       unsigned long *dbitmap, int stripe_nsectors)
2224 {
2225 	struct btrfs_raid_bio *rbio;
2226 	int i;
2227 
2228 	rbio = alloc_rbio(fs_info, bbio, stripe_len);
2229 	if (IS_ERR(rbio))
2230 		return NULL;
2231 	bio_list_add(&rbio->bio_list, bio);
2232 	/*
2233 	 * This is a special bio which is used to hold the completion handler
2234 	 * and make the scrub rbio is similar to the other types
2235 	 */
2236 	ASSERT(!bio->bi_iter.bi_size);
2237 	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2238 
2239 	/*
2240 	 * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
2241 	 * to the end position, so this search can start from the first parity
2242 	 * stripe.
2243 	 */
2244 	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2245 		if (bbio->stripes[i].dev == scrub_dev) {
2246 			rbio->scrubp = i;
2247 			break;
2248 		}
2249 	}
2250 	ASSERT(i < rbio->real_stripes);
2251 
2252 	/* Now we just support the sectorsize equals to page size */
2253 	ASSERT(fs_info->sectorsize == PAGE_SIZE);
2254 	ASSERT(rbio->stripe_npages == stripe_nsectors);
2255 	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2256 
2257 	/*
2258 	 * We have already increased bio_counter when getting bbio, record it
2259 	 * so we can free it at rbio_orig_end_io().
2260 	 */
2261 	rbio->generic_bio_cnt = 1;
2262 
2263 	return rbio;
2264 }
2265 
2266 /* Used for both parity scrub and missing. */
2267 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2268 			    u64 logical)
2269 {
2270 	int stripe_offset;
2271 	int index;
2272 
2273 	ASSERT(logical >= rbio->bbio->raid_map[0]);
2274 	ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
2275 				rbio->stripe_len * rbio->nr_data);
2276 	stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
2277 	index = stripe_offset >> PAGE_SHIFT;
2278 	rbio->bio_pages[index] = page;
2279 }
2280 
2281 /*
2282  * We just scrub the parity that we have correct data on the same horizontal,
2283  * so we needn't allocate all pages for all the stripes.
2284  */
2285 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2286 {
2287 	int i;
2288 	int bit;
2289 	int index;
2290 	struct page *page;
2291 
2292 	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2293 		for (i = 0; i < rbio->real_stripes; i++) {
2294 			index = i * rbio->stripe_npages + bit;
2295 			if (rbio->stripe_pages[index])
2296 				continue;
2297 
2298 			page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2299 			if (!page)
2300 				return -ENOMEM;
2301 			rbio->stripe_pages[index] = page;
2302 		}
2303 	}
2304 	return 0;
2305 }
2306 
2307 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2308 					 int need_check)
2309 {
2310 	struct btrfs_bio *bbio = rbio->bbio;
2311 	void **pointers = rbio->finish_pointers;
2312 	unsigned long *pbitmap = rbio->finish_pbitmap;
2313 	int nr_data = rbio->nr_data;
2314 	int stripe;
2315 	int pagenr;
2316 	bool has_qstripe;
2317 	struct page *p_page = NULL;
2318 	struct page *q_page = NULL;
2319 	struct bio_list bio_list;
2320 	struct bio *bio;
2321 	int is_replace = 0;
2322 	int ret;
2323 
2324 	bio_list_init(&bio_list);
2325 
2326 	if (rbio->real_stripes - rbio->nr_data == 1)
2327 		has_qstripe = false;
2328 	else if (rbio->real_stripes - rbio->nr_data == 2)
2329 		has_qstripe = true;
2330 	else
2331 		BUG();
2332 
2333 	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
2334 		is_replace = 1;
2335 		bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2336 	}
2337 
2338 	/*
2339 	 * Because the higher layers(scrubber) are unlikely to
2340 	 * use this area of the disk again soon, so don't cache
2341 	 * it.
2342 	 */
2343 	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2344 
2345 	if (!need_check)
2346 		goto writeback;
2347 
2348 	p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2349 	if (!p_page)
2350 		goto cleanup;
2351 	SetPageUptodate(p_page);
2352 
2353 	if (has_qstripe) {
2354 		/* RAID6, allocate and map temp space for the Q stripe */
2355 		q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2356 		if (!q_page) {
2357 			__free_page(p_page);
2358 			goto cleanup;
2359 		}
2360 		SetPageUptodate(q_page);
2361 		pointers[rbio->real_stripes - 1] = kmap(q_page);
2362 	}
2363 
2364 	atomic_set(&rbio->error, 0);
2365 
2366 	/* Map the parity stripe just once */
2367 	pointers[nr_data] = kmap(p_page);
2368 
2369 	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2370 		struct page *p;
2371 		void *parity;
2372 		/* first collect one page from each data stripe */
2373 		for (stripe = 0; stripe < nr_data; stripe++) {
2374 			p = page_in_rbio(rbio, stripe, pagenr, 0);
2375 			pointers[stripe] = kmap(p);
2376 		}
2377 
2378 		if (has_qstripe) {
2379 			/* RAID6, call the library function to fill in our P/Q */
2380 			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
2381 						pointers);
2382 		} else {
2383 			/* raid5 */
2384 			copy_page(pointers[nr_data], pointers[0]);
2385 			run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
2386 		}
2387 
2388 		/* Check scrubbing parity and repair it */
2389 		p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2390 		parity = kmap(p);
2391 		if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
2392 			copy_page(parity, pointers[rbio->scrubp]);
2393 		else
2394 			/* Parity is right, needn't writeback */
2395 			bitmap_clear(rbio->dbitmap, pagenr, 1);
2396 		kunmap(p);
2397 
2398 		for (stripe = 0; stripe < nr_data; stripe++)
2399 			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
2400 	}
2401 
2402 	kunmap(p_page);
2403 	__free_page(p_page);
2404 	if (q_page) {
2405 		kunmap(q_page);
2406 		__free_page(q_page);
2407 	}
2408 
2409 writeback:
2410 	/*
2411 	 * time to start writing.  Make bios for everything from the
2412 	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
2413 	 * everything else.
2414 	 */
2415 	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2416 		struct page *page;
2417 
2418 		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2419 		ret = rbio_add_io_page(rbio, &bio_list,
2420 			       page, rbio->scrubp, pagenr, rbio->stripe_len);
2421 		if (ret)
2422 			goto cleanup;
2423 	}
2424 
2425 	if (!is_replace)
2426 		goto submit_write;
2427 
2428 	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2429 		struct page *page;
2430 
2431 		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2432 		ret = rbio_add_io_page(rbio, &bio_list, page,
2433 				       bbio->tgtdev_map[rbio->scrubp],
2434 				       pagenr, rbio->stripe_len);
2435 		if (ret)
2436 			goto cleanup;
2437 	}
2438 
2439 submit_write:
2440 	nr_data = bio_list_size(&bio_list);
2441 	if (!nr_data) {
2442 		/* Every parity is right */
2443 		rbio_orig_end_io(rbio, BLK_STS_OK);
2444 		return;
2445 	}
2446 
2447 	atomic_set(&rbio->stripes_pending, nr_data);
2448 
2449 	while ((bio = bio_list_pop(&bio_list))) {
2450 		bio->bi_private = rbio;
2451 		bio->bi_end_io = raid_write_end_io;
2452 		bio->bi_opf = REQ_OP_WRITE;
2453 
2454 		submit_bio(bio);
2455 	}
2456 	return;
2457 
2458 cleanup:
2459 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
2460 
2461 	while ((bio = bio_list_pop(&bio_list)))
2462 		bio_put(bio);
2463 }
2464 
2465 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2466 {
2467 	if (stripe >= 0 && stripe < rbio->nr_data)
2468 		return 1;
2469 	return 0;
2470 }
2471 
2472 /*
2473  * While we're doing the parity check and repair, we could have errors
2474  * in reading pages off the disk.  This checks for errors and if we're
2475  * not able to read the page it'll trigger parity reconstruction.  The
2476  * parity scrub will be finished after we've reconstructed the failed
2477  * stripes
2478  */
2479 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2480 {
2481 	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2482 		goto cleanup;
2483 
2484 	if (rbio->faila >= 0 || rbio->failb >= 0) {
2485 		int dfail = 0, failp = -1;
2486 
2487 		if (is_data_stripe(rbio, rbio->faila))
2488 			dfail++;
2489 		else if (is_parity_stripe(rbio->faila))
2490 			failp = rbio->faila;
2491 
2492 		if (is_data_stripe(rbio, rbio->failb))
2493 			dfail++;
2494 		else if (is_parity_stripe(rbio->failb))
2495 			failp = rbio->failb;
2496 
2497 		/*
2498 		 * Because we can not use a scrubbing parity to repair
2499 		 * the data, so the capability of the repair is declined.
2500 		 * (In the case of RAID5, we can not repair anything)
2501 		 */
2502 		if (dfail > rbio->bbio->max_errors - 1)
2503 			goto cleanup;
2504 
2505 		/*
2506 		 * If all data is good, only parity is correctly, just
2507 		 * repair the parity.
2508 		 */
2509 		if (dfail == 0) {
2510 			finish_parity_scrub(rbio, 0);
2511 			return;
2512 		}
2513 
2514 		/*
2515 		 * Here means we got one corrupted data stripe and one
2516 		 * corrupted parity on RAID6, if the corrupted parity
2517 		 * is scrubbing parity, luckily, use the other one to repair
2518 		 * the data, or we can not repair the data stripe.
2519 		 */
2520 		if (failp != rbio->scrubp)
2521 			goto cleanup;
2522 
2523 		__raid_recover_end_io(rbio);
2524 	} else {
2525 		finish_parity_scrub(rbio, 1);
2526 	}
2527 	return;
2528 
2529 cleanup:
2530 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
2531 }
2532 
2533 /*
2534  * end io for the read phase of the rmw cycle.  All the bios here are physical
2535  * stripe bios we've read from the disk so we can recalculate the parity of the
2536  * stripe.
2537  *
2538  * This will usually kick off finish_rmw once all the bios are read in, but it
2539  * may trigger parity reconstruction if we had any errors along the way
2540  */
2541 static void raid56_parity_scrub_end_io(struct bio *bio)
2542 {
2543 	struct btrfs_raid_bio *rbio = bio->bi_private;
2544 
2545 	if (bio->bi_status)
2546 		fail_bio_stripe(rbio, bio);
2547 	else
2548 		set_bio_pages_uptodate(bio);
2549 
2550 	bio_put(bio);
2551 
2552 	if (!atomic_dec_and_test(&rbio->stripes_pending))
2553 		return;
2554 
2555 	/*
2556 	 * this will normally call finish_rmw to start our write
2557 	 * but if there are any failed stripes we'll reconstruct
2558 	 * from parity first
2559 	 */
2560 	validate_rbio_for_parity_scrub(rbio);
2561 }
2562 
2563 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2564 {
2565 	int bios_to_read = 0;
2566 	struct bio_list bio_list;
2567 	int ret;
2568 	int pagenr;
2569 	int stripe;
2570 	struct bio *bio;
2571 
2572 	bio_list_init(&bio_list);
2573 
2574 	ret = alloc_rbio_essential_pages(rbio);
2575 	if (ret)
2576 		goto cleanup;
2577 
2578 	atomic_set(&rbio->error, 0);
2579 	/*
2580 	 * build a list of bios to read all the missing parts of this
2581 	 * stripe
2582 	 */
2583 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2584 		for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2585 			struct page *page;
2586 			/*
2587 			 * we want to find all the pages missing from
2588 			 * the rbio and read them from the disk.  If
2589 			 * page_in_rbio finds a page in the bio list
2590 			 * we don't need to read it off the stripe.
2591 			 */
2592 			page = page_in_rbio(rbio, stripe, pagenr, 1);
2593 			if (page)
2594 				continue;
2595 
2596 			page = rbio_stripe_page(rbio, stripe, pagenr);
2597 			/*
2598 			 * the bio cache may have handed us an uptodate
2599 			 * page.  If so, be happy and use it
2600 			 */
2601 			if (PageUptodate(page))
2602 				continue;
2603 
2604 			ret = rbio_add_io_page(rbio, &bio_list, page,
2605 				       stripe, pagenr, rbio->stripe_len);
2606 			if (ret)
2607 				goto cleanup;
2608 		}
2609 	}
2610 
2611 	bios_to_read = bio_list_size(&bio_list);
2612 	if (!bios_to_read) {
2613 		/*
2614 		 * this can happen if others have merged with
2615 		 * us, it means there is nothing left to read.
2616 		 * But if there are missing devices it may not be
2617 		 * safe to do the full stripe write yet.
2618 		 */
2619 		goto finish;
2620 	}
2621 
2622 	/*
2623 	 * the bbio may be freed once we submit the last bio.  Make sure
2624 	 * not to touch it after that
2625 	 */
2626 	atomic_set(&rbio->stripes_pending, bios_to_read);
2627 	while ((bio = bio_list_pop(&bio_list))) {
2628 		bio->bi_private = rbio;
2629 		bio->bi_end_io = raid56_parity_scrub_end_io;
2630 		bio->bi_opf = REQ_OP_READ;
2631 
2632 		btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2633 
2634 		submit_bio(bio);
2635 	}
2636 	/* the actual write will happen once the reads are done */
2637 	return;
2638 
2639 cleanup:
2640 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
2641 
2642 	while ((bio = bio_list_pop(&bio_list)))
2643 		bio_put(bio);
2644 
2645 	return;
2646 
2647 finish:
2648 	validate_rbio_for_parity_scrub(rbio);
2649 }
2650 
2651 static void scrub_parity_work(struct btrfs_work *work)
2652 {
2653 	struct btrfs_raid_bio *rbio;
2654 
2655 	rbio = container_of(work, struct btrfs_raid_bio, work);
2656 	raid56_parity_scrub_stripe(rbio);
2657 }
2658 
2659 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2660 {
2661 	if (!lock_stripe_add(rbio))
2662 		start_async_work(rbio, scrub_parity_work);
2663 }
2664 
2665 /* The following code is used for dev replace of a missing RAID 5/6 device. */
2666 
2667 struct btrfs_raid_bio *
2668 raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
2669 			  struct btrfs_bio *bbio, u64 length)
2670 {
2671 	struct btrfs_raid_bio *rbio;
2672 
2673 	rbio = alloc_rbio(fs_info, bbio, length);
2674 	if (IS_ERR(rbio))
2675 		return NULL;
2676 
2677 	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2678 	bio_list_add(&rbio->bio_list, bio);
2679 	/*
2680 	 * This is a special bio which is used to hold the completion handler
2681 	 * and make the scrub rbio is similar to the other types
2682 	 */
2683 	ASSERT(!bio->bi_iter.bi_size);
2684 
2685 	rbio->faila = find_logical_bio_stripe(rbio, bio);
2686 	if (rbio->faila == -1) {
2687 		BUG();
2688 		kfree(rbio);
2689 		return NULL;
2690 	}
2691 
2692 	/*
2693 	 * When we get bbio, we have already increased bio_counter, record it
2694 	 * so we can free it at rbio_orig_end_io()
2695 	 */
2696 	rbio->generic_bio_cnt = 1;
2697 
2698 	return rbio;
2699 }
2700 
2701 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2702 {
2703 	if (!lock_stripe_add(rbio))
2704 		start_async_work(rbio, read_rebuild_work);
2705 }
2706