xref: /openbmc/linux/fs/xfs/xfs_buf.c (revision de3a9980)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include <linux/backing-dev.h>
8 
9 #include "xfs_shared.h"
10 #include "xfs_format.h"
11 #include "xfs_log_format.h"
12 #include "xfs_trans_resv.h"
13 #include "xfs_sb.h"
14 #include "xfs_mount.h"
15 #include "xfs_trace.h"
16 #include "xfs_log.h"
17 #include "xfs_log_recover.h"
18 #include "xfs_trans.h"
19 #include "xfs_buf_item.h"
20 #include "xfs_errortag.h"
21 #include "xfs_error.h"
22 
/* Slab cache from which every struct xfs_buf in this file is allocated. */
static kmem_zone_t *xfs_buf_zone;

/*
 * Translate buffer flags into page allocator flags: readahead requests use
 * __GFP_NORETRY, everything else uses GFP_NOFS, and __GFP_NOWARN is always
 * added.
 */
#define xb_to_gfp(flags) \
	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
27 
28 /*
29  * Locking orders
30  *
31  * xfs_buf_ioacct_inc:
32  * xfs_buf_ioacct_dec:
33  *	b_sema (caller holds)
34  *	  b_lock
35  *
36  * xfs_buf_stale:
37  *	b_sema (caller holds)
38  *	  b_lock
39  *	    lru_lock
40  *
41  * xfs_buf_rele:
42  *	b_lock
43  *	  pag_buf_lock
44  *	    lru_lock
45  *
46  * xfs_buftarg_drain_rele
47  *	lru_lock
48  *	  b_lock (trylock due to inversion)
49  *
50  * xfs_buftarg_isolate
51  *	lru_lock
52  *	  b_lock (trylock due to inversion)
53  */
54 
55 static int __xfs_buf_submit(struct xfs_buf *bp, bool wait);
56 
57 static inline int
58 xfs_buf_submit(
59 	struct xfs_buf		*bp)
60 {
61 	return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
62 }
63 
64 static inline int
65 xfs_buf_is_vmapped(
66 	struct xfs_buf	*bp)
67 {
68 	/*
69 	 * Return true if the buffer is vmapped.
70 	 *
71 	 * b_addr is null if the buffer is not mapped, but the code is clever
72 	 * enough to know it doesn't have to map a single page, so the check has
73 	 * to be both for b_addr and bp->b_page_count > 1.
74 	 */
75 	return bp->b_addr && bp->b_page_count > 1;
76 }
77 
78 static inline int
79 xfs_buf_vmap_len(
80 	struct xfs_buf	*bp)
81 {
82 	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
83 }
84 
85 /*
86  * Bump the I/O in flight count on the buftarg if we haven't yet done so for
87  * this buffer. The count is incremented once per buffer (per hold cycle)
88  * because the corresponding decrement is deferred to buffer release. Buffers
89  * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
90  * tracking adds unnecessary overhead. This is used for sychronization purposes
91  * with unmount (see xfs_buftarg_drain()), so all we really need is a count of
92  * in-flight buffers.
93  *
94  * Buffers that are never released (e.g., superblock, iclog buffers) must set
95  * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
96  * never reaches zero and unmount hangs indefinitely.
97  */
98 static inline void
99 xfs_buf_ioacct_inc(
100 	struct xfs_buf	*bp)
101 {
102 	if (bp->b_flags & XBF_NO_IOACCT)
103 		return;
104 
105 	ASSERT(bp->b_flags & XBF_ASYNC);
106 	spin_lock(&bp->b_lock);
107 	if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
108 		bp->b_state |= XFS_BSTATE_IN_FLIGHT;
109 		percpu_counter_inc(&bp->b_target->bt_io_count);
110 	}
111 	spin_unlock(&bp->b_lock);
112 }
113 
114 /*
115  * Clear the in-flight state on a buffer about to be released to the LRU or
116  * freed and unaccount from the buftarg.
117  */
118 static inline void
119 __xfs_buf_ioacct_dec(
120 	struct xfs_buf	*bp)
121 {
122 	lockdep_assert_held(&bp->b_lock);
123 
124 	if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
125 		bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
126 		percpu_counter_dec(&bp->b_target->bt_io_count);
127 	}
128 }
129 
/*
 * Locked wrapper around __xfs_buf_ioacct_dec(): takes b_lock, drops the
 * buffer's in-flight accounting (if any) and releases the lock again.
 */
static inline void
xfs_buf_ioacct_dec(
	struct xfs_buf	*bp)
{
	spin_lock(&bp->b_lock);
	__xfs_buf_ioacct_dec(bp);
	spin_unlock(&bp->b_lock);
}
138 
139 /*
140  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
141  * b_lru_ref count so that the buffer is freed immediately when the buffer
142  * reference count falls to zero. If the buffer is already on the LRU, we need
143  * to remove the reference that LRU holds on the buffer.
144  *
145  * This prevents build-up of stale buffers on the LRU.
146  */
147 void
148 xfs_buf_stale(
149 	struct xfs_buf	*bp)
150 {
151 	ASSERT(xfs_buf_islocked(bp));
152 
153 	bp->b_flags |= XBF_STALE;
154 
155 	/*
156 	 * Clear the delwri status so that a delwri queue walker will not
157 	 * flush this buffer to disk now that it is stale. The delwri queue has
158 	 * a reference to the buffer, so this is safe to do.
159 	 */
160 	bp->b_flags &= ~_XBF_DELWRI_Q;
161 
162 	/*
163 	 * Once the buffer is marked stale and unlocked, a subsequent lookup
164 	 * could reset b_flags. There is no guarantee that the buffer is
165 	 * unaccounted (released to LRU) before that occurs. Drop in-flight
166 	 * status now to preserve accounting consistency.
167 	 */
168 	spin_lock(&bp->b_lock);
169 	__xfs_buf_ioacct_dec(bp);
170 
171 	atomic_set(&bp->b_lru_ref, 0);
172 	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
173 	    (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
174 		atomic_dec(&bp->b_hold);
175 
176 	ASSERT(atomic_read(&bp->b_hold) >= 1);
177 	spin_unlock(&bp->b_lock);
178 }
179 
180 static int
181 xfs_buf_get_maps(
182 	struct xfs_buf		*bp,
183 	int			map_count)
184 {
185 	ASSERT(bp->b_maps == NULL);
186 	bp->b_map_count = map_count;
187 
188 	if (map_count == 1) {
189 		bp->b_maps = &bp->__b_map;
190 		return 0;
191 	}
192 
193 	bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
194 				KM_NOFS);
195 	if (!bp->b_maps)
196 		return -ENOMEM;
197 	return 0;
198 }
199 
200 /*
201  *	Frees b_pages if it was allocated.
202  */
203 static void
204 xfs_buf_free_maps(
205 	struct xfs_buf	*bp)
206 {
207 	if (bp->b_maps != &bp->__b_map) {
208 		kmem_free(bp->b_maps);
209 		bp->b_maps = NULL;
210 	}
211 }
212 
213 static int
214 _xfs_buf_alloc(
215 	struct xfs_buftarg	*target,
216 	struct xfs_buf_map	*map,
217 	int			nmaps,
218 	xfs_buf_flags_t		flags,
219 	struct xfs_buf		**bpp)
220 {
221 	struct xfs_buf		*bp;
222 	int			error;
223 	int			i;
224 
225 	*bpp = NULL;
226 	bp = kmem_cache_zalloc(xfs_buf_zone, GFP_NOFS | __GFP_NOFAIL);
227 
228 	/*
229 	 * We don't want certain flags to appear in b_flags unless they are
230 	 * specifically set by later operations on the buffer.
231 	 */
232 	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
233 
234 	atomic_set(&bp->b_hold, 1);
235 	atomic_set(&bp->b_lru_ref, 1);
236 	init_completion(&bp->b_iowait);
237 	INIT_LIST_HEAD(&bp->b_lru);
238 	INIT_LIST_HEAD(&bp->b_list);
239 	INIT_LIST_HEAD(&bp->b_li_list);
240 	sema_init(&bp->b_sema, 0); /* held, no waiters */
241 	spin_lock_init(&bp->b_lock);
242 	bp->b_target = target;
243 	bp->b_mount = target->bt_mount;
244 	bp->b_flags = flags;
245 
246 	/*
247 	 * Set length and io_length to the same value initially.
248 	 * I/O routines should use io_length, which will be the same in
249 	 * most cases but may be reset (e.g. XFS recovery).
250 	 */
251 	error = xfs_buf_get_maps(bp, nmaps);
252 	if (error)  {
253 		kmem_cache_free(xfs_buf_zone, bp);
254 		return error;
255 	}
256 
257 	bp->b_bn = map[0].bm_bn;
258 	bp->b_length = 0;
259 	for (i = 0; i < nmaps; i++) {
260 		bp->b_maps[i].bm_bn = map[i].bm_bn;
261 		bp->b_maps[i].bm_len = map[i].bm_len;
262 		bp->b_length += map[i].bm_len;
263 	}
264 
265 	atomic_set(&bp->b_pin_count, 0);
266 	init_waitqueue_head(&bp->b_waiters);
267 
268 	XFS_STATS_INC(bp->b_mount, xb_create);
269 	trace_xfs_buf_init(bp, _RET_IP_);
270 
271 	*bpp = bp;
272 	return 0;
273 }
274 
275 /*
276  *	Allocate a page array capable of holding a specified number
277  *	of pages, and point the page buf at it.
278  */
279 STATIC int
280 _xfs_buf_get_pages(
281 	struct xfs_buf		*bp,
282 	int			page_count)
283 {
284 	/* Make sure that we have a page list */
285 	if (bp->b_pages == NULL) {
286 		bp->b_page_count = page_count;
287 		if (page_count <= XB_PAGES) {
288 			bp->b_pages = bp->b_page_array;
289 		} else {
290 			bp->b_pages = kmem_alloc(sizeof(struct page *) *
291 						 page_count, KM_NOFS);
292 			if (bp->b_pages == NULL)
293 				return -ENOMEM;
294 		}
295 		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
296 	}
297 	return 0;
298 }
299 
300 /*
301  *	Frees b_pages if it was allocated.
302  */
303 STATIC void
304 _xfs_buf_free_pages(
305 	struct xfs_buf	*bp)
306 {
307 	if (bp->b_pages != bp->b_page_array) {
308 		kmem_free(bp->b_pages);
309 		bp->b_pages = NULL;
310 	}
311 }
312 
313 /*
314  *	Releases the specified buffer.
315  *
316  * 	The modification state of any associated pages is left unchanged.
317  * 	The buffer must not be on any hash - use xfs_buf_rele instead for
318  * 	hashed and refcounted buffers
319  */
320 static void
321 xfs_buf_free(
322 	struct xfs_buf		*bp)
323 {
324 	trace_xfs_buf_free(bp, _RET_IP_);
325 
326 	ASSERT(list_empty(&bp->b_lru));
327 
328 	if (bp->b_flags & _XBF_PAGES) {
329 		uint		i;
330 
331 		if (xfs_buf_is_vmapped(bp))
332 			vm_unmap_ram(bp->b_addr - bp->b_offset,
333 					bp->b_page_count);
334 
335 		for (i = 0; i < bp->b_page_count; i++) {
336 			struct page	*page = bp->b_pages[i];
337 
338 			__free_page(page);
339 		}
340 		if (current->reclaim_state)
341 			current->reclaim_state->reclaimed_slab +=
342 							bp->b_page_count;
343 	} else if (bp->b_flags & _XBF_KMEM)
344 		kmem_free(bp->b_addr);
345 	_xfs_buf_free_pages(bp);
346 	xfs_buf_free_maps(bp);
347 	kmem_cache_free(xfs_buf_zone, bp);
348 }
349 
350 /*
351  * Allocates all the pages for buffer in question and builds it's page list.
352  */
353 STATIC int
354 xfs_buf_allocate_memory(
355 	struct xfs_buf		*bp,
356 	uint			flags)
357 {
358 	size_t			size;
359 	size_t			nbytes, offset;
360 	gfp_t			gfp_mask = xb_to_gfp(flags);
361 	unsigned short		page_count, i;
362 	xfs_off_t		start, end;
363 	int			error;
364 	xfs_km_flags_t		kmflag_mask = 0;
365 
366 	/*
367 	 * assure zeroed buffer for non-read cases.
368 	 */
369 	if (!(flags & XBF_READ)) {
370 		kmflag_mask |= KM_ZERO;
371 		gfp_mask |= __GFP_ZERO;
372 	}
373 
374 	/*
375 	 * for buffers that are contained within a single page, just allocate
376 	 * the memory from the heap - there's no need for the complexity of
377 	 * page arrays to keep allocation down to order 0.
378 	 */
379 	size = BBTOB(bp->b_length);
380 	if (size < PAGE_SIZE) {
381 		int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
382 		bp->b_addr = kmem_alloc_io(size, align_mask,
383 					   KM_NOFS | kmflag_mask);
384 		if (!bp->b_addr) {
385 			/* low memory - use alloc_page loop instead */
386 			goto use_alloc_page;
387 		}
388 
389 		if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
390 		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
391 			/* b_addr spans two pages - use alloc_page instead */
392 			kmem_free(bp->b_addr);
393 			bp->b_addr = NULL;
394 			goto use_alloc_page;
395 		}
396 		bp->b_offset = offset_in_page(bp->b_addr);
397 		bp->b_pages = bp->b_page_array;
398 		bp->b_pages[0] = kmem_to_page(bp->b_addr);
399 		bp->b_page_count = 1;
400 		bp->b_flags |= _XBF_KMEM;
401 		return 0;
402 	}
403 
404 use_alloc_page:
405 	start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
406 	end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
407 								>> PAGE_SHIFT;
408 	page_count = end - start;
409 	error = _xfs_buf_get_pages(bp, page_count);
410 	if (unlikely(error))
411 		return error;
412 
413 	offset = bp->b_offset;
414 	bp->b_flags |= _XBF_PAGES;
415 
416 	for (i = 0; i < bp->b_page_count; i++) {
417 		struct page	*page;
418 		uint		retries = 0;
419 retry:
420 		page = alloc_page(gfp_mask);
421 		if (unlikely(page == NULL)) {
422 			if (flags & XBF_READ_AHEAD) {
423 				bp->b_page_count = i;
424 				error = -ENOMEM;
425 				goto out_free_pages;
426 			}
427 
428 			/*
429 			 * This could deadlock.
430 			 *
431 			 * But until all the XFS lowlevel code is revamped to
432 			 * handle buffer allocation failures we can't do much.
433 			 */
434 			if (!(++retries % 100))
435 				xfs_err(NULL,
436 		"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
437 					current->comm, current->pid,
438 					__func__, gfp_mask);
439 
440 			XFS_STATS_INC(bp->b_mount, xb_page_retries);
441 			congestion_wait(BLK_RW_ASYNC, HZ/50);
442 			goto retry;
443 		}
444 
445 		XFS_STATS_INC(bp->b_mount, xb_page_found);
446 
447 		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
448 		size -= nbytes;
449 		bp->b_pages[i] = page;
450 		offset = 0;
451 	}
452 	return 0;
453 
454 out_free_pages:
455 	for (i = 0; i < bp->b_page_count; i++)
456 		__free_page(bp->b_pages[i]);
457 	bp->b_flags &= ~_XBF_PAGES;
458 	return error;
459 }
460 
461 /*
462  *	Map buffer into kernel address-space if necessary.
463  */
464 STATIC int
465 _xfs_buf_map_pages(
466 	struct xfs_buf		*bp,
467 	uint			flags)
468 {
469 	ASSERT(bp->b_flags & _XBF_PAGES);
470 	if (bp->b_page_count == 1) {
471 		/* A single page buffer is always mappable */
472 		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
473 	} else if (flags & XBF_UNMAPPED) {
474 		bp->b_addr = NULL;
475 	} else {
476 		int retried = 0;
477 		unsigned nofs_flag;
478 
479 		/*
480 		 * vm_map_ram() will allocate auxiliary structures (e.g.
481 		 * pagetables) with GFP_KERNEL, yet we are likely to be under
482 		 * GFP_NOFS context here. Hence we need to tell memory reclaim
483 		 * that we are in such a context via PF_MEMALLOC_NOFS to prevent
484 		 * memory reclaim re-entering the filesystem here and
485 		 * potentially deadlocking.
486 		 */
487 		nofs_flag = memalloc_nofs_save();
488 		do {
489 			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
490 						-1);
491 			if (bp->b_addr)
492 				break;
493 			vm_unmap_aliases();
494 		} while (retried++ <= 1);
495 		memalloc_nofs_restore(nofs_flag);
496 
497 		if (!bp->b_addr)
498 			return -ENOMEM;
499 		bp->b_addr += bp->b_offset;
500 	}
501 
502 	return 0;
503 }
504 
505 /*
506  *	Finding and Reading Buffers
507  */
508 static int
509 _xfs_buf_obj_cmp(
510 	struct rhashtable_compare_arg	*arg,
511 	const void			*obj)
512 {
513 	const struct xfs_buf_map	*map = arg->key;
514 	const struct xfs_buf		*bp = obj;
515 
516 	/*
517 	 * The key hashing in the lookup path depends on the key being the
518 	 * first element of the compare_arg, make sure to assert this.
519 	 */
520 	BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
521 
522 	if (bp->b_bn != map->bm_bn)
523 		return 1;
524 
525 	if (unlikely(bp->b_length != map->bm_len)) {
526 		/*
527 		 * found a block number match. If the range doesn't
528 		 * match, the only way this is allowed is if the buffer
529 		 * in the cache is stale and the transaction that made
530 		 * it stale has not yet committed. i.e. we are
531 		 * reallocating a busy extent. Skip this buffer and
532 		 * continue searching for an exact match.
533 		 */
534 		ASSERT(bp->b_flags & XBF_STALE);
535 		return 1;
536 	}
537 	return 0;
538 }
539 
540 static const struct rhashtable_params xfs_buf_hash_params = {
541 	.min_size		= 32,	/* empty AGs have minimal footprint */
542 	.nelem_hint		= 16,
543 	.key_len		= sizeof(xfs_daddr_t),
544 	.key_offset		= offsetof(struct xfs_buf, b_bn),
545 	.head_offset		= offsetof(struct xfs_buf, b_rhash_head),
546 	.automatic_shrinking	= true,
547 	.obj_cmpfn		= _xfs_buf_obj_cmp,
548 };
549 
550 int
551 xfs_buf_hash_init(
552 	struct xfs_perag	*pag)
553 {
554 	spin_lock_init(&pag->pag_buf_lock);
555 	return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
556 }
557 
/* Tear down the buffer hash table of @pag set up by xfs_buf_hash_init(). */
void
xfs_buf_hash_destroy(
	struct xfs_perag	*pag)
{
	rhashtable_destroy(&pag->pag_buf_hash);
}
564 
/*
 * Look up a buffer in the buffer cache and return it referenced and locked
 * in @found_bp.
 *
 * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the
 * cache.
 *
 * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return
 * -EAGAIN if we fail to lock it.
 *
 * Return values are:
 *	-EFSCORRUPTED if have been supplied with an invalid address
 *	-EAGAIN on trylock failure
 *	-ENOENT if we fail to find a match and @new_bp was NULL
 *	0, with @found_bp:
 *		- @new_bp if we inserted it into the cache
 *		- the buffer we found and locked.
 */
static int
xfs_buf_find(
	struct xfs_buftarg	*btp,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	struct xfs_buf		*new_bp,
	struct xfs_buf		**found_bp)
{
	struct xfs_perag	*pag;
	struct xfs_buf		*bp;
	/* lookup key: first map's block number, total length of all maps */
	struct xfs_buf_map	cmap = { .bm_bn = map[0].bm_bn };
	xfs_daddr_t		eofs;
	int			i;

	*found_bp = NULL;

	for (i = 0; i < nmaps; i++)
		cmap.bm_len += map[i].bm_len;

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
	ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));

	/*
	 * Corrupted block numbers can get through to here, unfortunately, so we
	 * have to check that the buffer falls within the filesystem bounds.
	 */
	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
	if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
		xfs_alert(btp->bt_mount,
			  "%s: daddr 0x%llx out of range, EOFS 0x%llx",
			  __func__, cmap.bm_bn, eofs);
		WARN_ON(1);
		return -EFSCORRUPTED;
	}

	pag = xfs_perag_get(btp->bt_mount,
			    xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));

	/* the hold reference is taken while pag_buf_lock is still held */
	spin_lock(&pag->pag_buf_lock);
	bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
				    xfs_buf_hash_params);
	if (bp) {
		atomic_inc(&bp->b_hold);
		goto found;
	}

	/* No match found */
	if (!new_bp) {
		XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
		return -ENOENT;
	}

	/* the buffer keeps the perag reference until it is freed */
	new_bp->b_pag = pag;
	rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
			       xfs_buf_hash_params);
	spin_unlock(&pag->pag_buf_lock);
	*found_bp = new_bp;
	return 0;

found:
	/* cache hit: the perag reference taken for this lookup is dropped */
	spin_unlock(&pag->pag_buf_lock);
	xfs_perag_put(pag);

	if (!xfs_buf_trylock(bp)) {
		if (flags & XBF_TRYLOCK) {
			/* give back the hold taken above before bailing out */
			xfs_buf_rele(bp);
			XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
			return -EAGAIN;
		}
		xfs_buf_lock(bp);
		XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
	}

	/*
	 * if the buffer is stale, clear all the external state associated with
	 * it. We need to keep flags such as how we allocated the buffer memory
	 * intact here.
	 */
	if (bp->b_flags & XBF_STALE) {
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
		bp->b_ops = NULL;
	}

	trace_xfs_buf_find(bp, flags, _RET_IP_);
	XFS_STATS_INC(btp->bt_mount, xb_get_locked);
	*found_bp = bp;
	return 0;
}
677 
678 struct xfs_buf *
679 xfs_buf_incore(
680 	struct xfs_buftarg	*target,
681 	xfs_daddr_t		blkno,
682 	size_t			numblks,
683 	xfs_buf_flags_t		flags)
684 {
685 	struct xfs_buf		*bp;
686 	int			error;
687 	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
688 
689 	error = xfs_buf_find(target, &map, 1, flags, NULL, &bp);
690 	if (error)
691 		return NULL;
692 	return bp;
693 }
694 
695 /*
696  * Assembles a buffer covering the specified range. The code is optimised for
697  * cache hits, as metadata intensive workloads will see 3 orders of magnitude
698  * more hits than misses.
699  */
700 int
701 xfs_buf_get_map(
702 	struct xfs_buftarg	*target,
703 	struct xfs_buf_map	*map,
704 	int			nmaps,
705 	xfs_buf_flags_t		flags,
706 	struct xfs_buf		**bpp)
707 {
708 	struct xfs_buf		*bp;
709 	struct xfs_buf		*new_bp;
710 	int			error = 0;
711 
712 	*bpp = NULL;
713 	error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
714 	if (!error)
715 		goto found;
716 	if (error != -ENOENT)
717 		return error;
718 
719 	error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp);
720 	if (error)
721 		return error;
722 
723 	error = xfs_buf_allocate_memory(new_bp, flags);
724 	if (error) {
725 		xfs_buf_free(new_bp);
726 		return error;
727 	}
728 
729 	error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
730 	if (error) {
731 		xfs_buf_free(new_bp);
732 		return error;
733 	}
734 
735 	if (bp != new_bp)
736 		xfs_buf_free(new_bp);
737 
738 found:
739 	if (!bp->b_addr) {
740 		error = _xfs_buf_map_pages(bp, flags);
741 		if (unlikely(error)) {
742 			xfs_warn_ratelimited(target->bt_mount,
743 				"%s: failed to map %u pages", __func__,
744 				bp->b_page_count);
745 			xfs_buf_relse(bp);
746 			return error;
747 		}
748 	}
749 
750 	/*
751 	 * Clear b_error if this is a lookup from a caller that doesn't expect
752 	 * valid data to be found in the buffer.
753 	 */
754 	if (!(flags & XBF_READ))
755 		xfs_buf_ioerror(bp, 0);
756 
757 	XFS_STATS_INC(target->bt_mount, xb_get);
758 	trace_xfs_buf_get(bp, flags, _RET_IP_);
759 	*bpp = bp;
760 	return 0;
761 }
762 
763 int
764 _xfs_buf_read(
765 	struct xfs_buf		*bp,
766 	xfs_buf_flags_t		flags)
767 {
768 	ASSERT(!(flags & XBF_WRITE));
769 	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
770 
771 	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
772 	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
773 
774 	return xfs_buf_submit(bp);
775 }
776 
777 /*
778  * Reverify a buffer found in cache without an attached ->b_ops.
779  *
780  * If the caller passed an ops structure and the buffer doesn't have ops
781  * assigned, set the ops and use it to verify the contents. If verification
782  * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
783  * already in XBF_DONE state on entry.
784  *
785  * Under normal operations, every in-core buffer is verified on read I/O
786  * completion. There are two scenarios that can lead to in-core buffers without
787  * an assigned ->b_ops. The first is during log recovery of buffers on a V4
788  * filesystem, though these buffers are purged at the end of recovery. The
789  * other is online repair, which intentionally reads with a NULL buffer ops to
790  * run several verifiers across an in-core buffer in order to establish buffer
791  * type.  If repair can't establish that, the buffer will be left in memory
792  * with NULL buffer ops.
793  */
794 int
795 xfs_buf_reverify(
796 	struct xfs_buf		*bp,
797 	const struct xfs_buf_ops *ops)
798 {
799 	ASSERT(bp->b_flags & XBF_DONE);
800 	ASSERT(bp->b_error == 0);
801 
802 	if (!ops || bp->b_ops)
803 		return 0;
804 
805 	bp->b_ops = ops;
806 	bp->b_ops->verify_read(bp);
807 	if (bp->b_error)
808 		bp->b_flags &= ~XBF_DONE;
809 	return bp->b_error;
810 }
811 
812 int
813 xfs_buf_read_map(
814 	struct xfs_buftarg	*target,
815 	struct xfs_buf_map	*map,
816 	int			nmaps,
817 	xfs_buf_flags_t		flags,
818 	struct xfs_buf		**bpp,
819 	const struct xfs_buf_ops *ops,
820 	xfs_failaddr_t		fa)
821 {
822 	struct xfs_buf		*bp;
823 	int			error;
824 
825 	flags |= XBF_READ;
826 	*bpp = NULL;
827 
828 	error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
829 	if (error)
830 		return error;
831 
832 	trace_xfs_buf_read(bp, flags, _RET_IP_);
833 
834 	if (!(bp->b_flags & XBF_DONE)) {
835 		/* Initiate the buffer read and wait. */
836 		XFS_STATS_INC(target->bt_mount, xb_get_read);
837 		bp->b_ops = ops;
838 		error = _xfs_buf_read(bp, flags);
839 
840 		/* Readahead iodone already dropped the buffer, so exit. */
841 		if (flags & XBF_ASYNC)
842 			return 0;
843 	} else {
844 		/* Buffer already read; all we need to do is check it. */
845 		error = xfs_buf_reverify(bp, ops);
846 
847 		/* Readahead already finished; drop the buffer and exit. */
848 		if (flags & XBF_ASYNC) {
849 			xfs_buf_relse(bp);
850 			return 0;
851 		}
852 
853 		/* We do not want read in the flags */
854 		bp->b_flags &= ~XBF_READ;
855 		ASSERT(bp->b_ops != NULL || ops == NULL);
856 	}
857 
858 	/*
859 	 * If we've had a read error, then the contents of the buffer are
860 	 * invalid and should not be used. To ensure that a followup read tries
861 	 * to pull the buffer from disk again, we clear the XBF_DONE flag and
862 	 * mark the buffer stale. This ensures that anyone who has a current
863 	 * reference to the buffer will interpret it's contents correctly and
864 	 * future cache lookups will also treat it as an empty, uninitialised
865 	 * buffer.
866 	 */
867 	if (error) {
868 		if (!XFS_FORCED_SHUTDOWN(target->bt_mount))
869 			xfs_buf_ioerror_alert(bp, fa);
870 
871 		bp->b_flags &= ~XBF_DONE;
872 		xfs_buf_stale(bp);
873 		xfs_buf_relse(bp);
874 
875 		/* bad CRC means corrupted metadata */
876 		if (error == -EFSBADCRC)
877 			error = -EFSCORRUPTED;
878 		return error;
879 	}
880 
881 	*bpp = bp;
882 	return 0;
883 }
884 
885 /*
886  *	If we are not low on memory then do the readahead in a deadlock
887  *	safe manner.
888  */
889 void
890 xfs_buf_readahead_map(
891 	struct xfs_buftarg	*target,
892 	struct xfs_buf_map	*map,
893 	int			nmaps,
894 	const struct xfs_buf_ops *ops)
895 {
896 	struct xfs_buf		*bp;
897 
898 	if (bdi_read_congested(target->bt_bdev->bd_bdi))
899 		return;
900 
901 	xfs_buf_read_map(target, map, nmaps,
902 		     XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
903 		     __this_address);
904 }
905 
906 /*
907  * Read an uncached buffer from disk. Allocates and returns a locked
908  * buffer containing the disk contents or nothing.
909  */
910 int
911 xfs_buf_read_uncached(
912 	struct xfs_buftarg	*target,
913 	xfs_daddr_t		daddr,
914 	size_t			numblks,
915 	int			flags,
916 	struct xfs_buf		**bpp,
917 	const struct xfs_buf_ops *ops)
918 {
919 	struct xfs_buf		*bp;
920 	int			error;
921 
922 	*bpp = NULL;
923 
924 	error = xfs_buf_get_uncached(target, numblks, flags, &bp);
925 	if (error)
926 		return error;
927 
928 	/* set up the buffer for a read IO */
929 	ASSERT(bp->b_map_count == 1);
930 	bp->b_bn = XFS_BUF_DADDR_NULL;  /* always null for uncached buffers */
931 	bp->b_maps[0].bm_bn = daddr;
932 	bp->b_flags |= XBF_READ;
933 	bp->b_ops = ops;
934 
935 	xfs_buf_submit(bp);
936 	if (bp->b_error) {
937 		error = bp->b_error;
938 		xfs_buf_relse(bp);
939 		return error;
940 	}
941 
942 	*bpp = bp;
943 	return 0;
944 }
945 
946 int
947 xfs_buf_get_uncached(
948 	struct xfs_buftarg	*target,
949 	size_t			numblks,
950 	int			flags,
951 	struct xfs_buf		**bpp)
952 {
953 	unsigned long		page_count;
954 	int			error, i;
955 	struct xfs_buf		*bp;
956 	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
957 
958 	*bpp = NULL;
959 
960 	/* flags might contain irrelevant bits, pass only what we care about */
961 	error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
962 	if (error)
963 		goto fail;
964 
965 	page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
966 	error = _xfs_buf_get_pages(bp, page_count);
967 	if (error)
968 		goto fail_free_buf;
969 
970 	for (i = 0; i < page_count; i++) {
971 		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
972 		if (!bp->b_pages[i]) {
973 			error = -ENOMEM;
974 			goto fail_free_mem;
975 		}
976 	}
977 	bp->b_flags |= _XBF_PAGES;
978 
979 	error = _xfs_buf_map_pages(bp, 0);
980 	if (unlikely(error)) {
981 		xfs_warn(target->bt_mount,
982 			"%s: failed to map pages", __func__);
983 		goto fail_free_mem;
984 	}
985 
986 	trace_xfs_buf_get_uncached(bp, _RET_IP_);
987 	*bpp = bp;
988 	return 0;
989 
990  fail_free_mem:
991 	while (--i >= 0)
992 		__free_page(bp->b_pages[i]);
993 	_xfs_buf_free_pages(bp);
994  fail_free_buf:
995 	xfs_buf_free_maps(bp);
996 	kmem_cache_free(xfs_buf_zone, bp);
997  fail:
998 	return error;
999 }
1000 
/*
 *	Increment reference count on buffer, to hold the buffer concurrently
 *	with another thread which may release (free) the buffer asynchronously.
 *	Must hold the buffer already to call this function.
 *
 *	The reference is dropped again with xfs_buf_rele().
 */
void
xfs_buf_hold(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_hold(bp, _RET_IP_);
	atomic_inc(&bp->b_hold);
}
1013 
/*
 * Release a hold on the specified buffer. If the hold count is 1, the buffer is
 * placed on LRU or freed (depending on b_lru_ref).
 *
 * Buffers without a perag (b_pag == NULL) are simply freed when the last
 * reference goes away.
 */
void
xfs_buf_rele(
	struct xfs_buf		*bp)
{
	struct xfs_perag	*pag = bp->b_pag;
	bool			release;
	bool			freebuf = false;

	trace_xfs_buf_rele(bp, _RET_IP_);

	if (!pag) {
		/* no perag: not hashed, so no LRU or hash handling is needed */
		ASSERT(list_empty(&bp->b_lru));
		if (atomic_dec_and_test(&bp->b_hold)) {
			xfs_buf_ioacct_dec(bp);
			xfs_buf_free(bp);
		}
		return;
	}

	ASSERT(atomic_read(&bp->b_hold) > 0);

	/*
	 * We grab the b_lock here first to serialise racing xfs_buf_rele()
	 * calls. The pag_buf_lock being taken on the last reference only
	 * serialises against racing lookups in xfs_buf_find(). IOWs, the second
	 * to last reference we drop here is not serialised against the last
	 * reference until we take bp->b_lock. Hence if we don't grab b_lock
	 * first, the last "release" reference can win the race to the lock and
	 * free the buffer before the second-to-last reference is processed,
	 * leading to a use-after-free scenario.
	 */
	spin_lock(&bp->b_lock);
	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
	if (!release) {
		/*
		 * Drop the in-flight state if the buffer is already on the LRU
		 * and it holds the only reference. This is racy because we
		 * haven't acquired the pag lock, but the use of
		 * XFS_BSTATE_IN_FLIGHT ensures the decrement occurs only once
		 * per-buf.
		 */
		if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
			__xfs_buf_ioacct_dec(bp);
		goto out_unlock;
	}

	/* the last reference has been dropped ... */
	__xfs_buf_ioacct_dec(bp);
	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
		/*
		 * If the buffer is added to the LRU take a new reference to the
		 * buffer for the LRU and clear the (now stale) dispose list
		 * state flag
		 */
		if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
			bp->b_state &= ~XFS_BSTATE_DISPOSE;
			atomic_inc(&bp->b_hold);
		}
		spin_unlock(&pag->pag_buf_lock);
	} else {
		/*
		 * most of the time buffers will already be removed from the
		 * LRU, so optimise that case by checking for the
		 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
		 * was on was the disposal list
		 */
		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
			list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
		} else {
			ASSERT(list_empty(&bp->b_lru));
		}

		/* unhash the buffer and drop the perag reference it held */
		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
		rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
				       xfs_buf_hash_params);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
		freebuf = true;
	}

out_unlock:
	spin_unlock(&bp->b_lock);

	/* the buffer is freed only after b_lock has been released */
	if (freebuf)
		xfs_buf_free(bp);
}
1103 
1104 
1105 /*
1106  *	Lock a buffer object, if it is not already locked.
1107  *
1108  *	If we come across a stale, pinned, locked buffer, we know that we are
1109  *	being asked to lock a buffer that has been reallocated. Because it is
1110  *	pinned, we know that the log has not been pushed to disk and hence it
1111  *	will still be locked.  Rather than continuing to have trylock attempts
1112  *	fail until someone else pushes the log, push it ourselves before
1113  *	returning.  This means that the xfsaild will not get stuck trying
1114  *	to push on stale inode buffers.
1115  */
1116 int
1117 xfs_buf_trylock(
1118 	struct xfs_buf		*bp)
1119 {
1120 	int			locked;
1121 
1122 	locked = down_trylock(&bp->b_sema) == 0;
1123 	if (locked)
1124 		trace_xfs_buf_trylock(bp, _RET_IP_);
1125 	else
1126 		trace_xfs_buf_trylock_fail(bp, _RET_IP_);
1127 	return locked;
1128 }
1129 
1130 /*
1131  *	Lock a buffer object.
1132  *
1133  *	If we come across a stale, pinned, locked buffer, we know that we
1134  *	are being asked to lock a buffer that has been reallocated. Because
1135  *	it is pinned, we know that the log has not been pushed to disk and
1136  *	hence it will still be locked. Rather than sleeping until someone
1137  *	else pushes the log, push it ourselves before trying to get the lock.
1138  */
1139 void
1140 xfs_buf_lock(
1141 	struct xfs_buf		*bp)
1142 {
1143 	trace_xfs_buf_lock(bp, _RET_IP_);
1144 
1145 	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
1146 		xfs_log_force(bp->b_mount, 0);
1147 	down(&bp->b_sema);
1148 
1149 	trace_xfs_buf_lock_done(bp, _RET_IP_);
1150 }
1151 
1152 void
1153 xfs_buf_unlock(
1154 	struct xfs_buf		*bp)
1155 {
1156 	ASSERT(xfs_buf_islocked(bp));
1157 
1158 	up(&bp->b_sema);
1159 	trace_xfs_buf_unlock(bp, _RET_IP_);
1160 }
1161 
1162 STATIC void
1163 xfs_buf_wait_unpin(
1164 	struct xfs_buf		*bp)
1165 {
1166 	DECLARE_WAITQUEUE	(wait, current);
1167 
1168 	if (atomic_read(&bp->b_pin_count) == 0)
1169 		return;
1170 
1171 	add_wait_queue(&bp->b_waiters, &wait);
1172 	for (;;) {
1173 		set_current_state(TASK_UNINTERRUPTIBLE);
1174 		if (atomic_read(&bp->b_pin_count) == 0)
1175 			break;
1176 		io_schedule();
1177 	}
1178 	remove_wait_queue(&bp->b_waiters, &wait);
1179 	set_current_state(TASK_RUNNING);
1180 }
1181 
1182 static void
1183 xfs_buf_ioerror_alert_ratelimited(
1184 	struct xfs_buf		*bp)
1185 {
1186 	static unsigned long	lasttime;
1187 	static struct xfs_buftarg *lasttarg;
1188 
1189 	if (bp->b_target != lasttarg ||
1190 	    time_after(jiffies, (lasttime + 5*HZ))) {
1191 		lasttime = jiffies;
1192 		xfs_buf_ioerror_alert(bp, __this_address);
1193 	}
1194 	lasttarg = bp->b_target;
1195 }
1196 
1197 /*
1198  * Account for this latest trip around the retry handler, and decide if
1199  * we've failed enough times to constitute a permanent failure.
1200  */
1201 static bool
1202 xfs_buf_ioerror_permanent(
1203 	struct xfs_buf		*bp,
1204 	struct xfs_error_cfg	*cfg)
1205 {
1206 	struct xfs_mount	*mp = bp->b_mount;
1207 
1208 	if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
1209 	    ++bp->b_retries > cfg->max_retries)
1210 		return true;
1211 	if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
1212 	    time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
1213 		return true;
1214 
1215 	/* At unmount we may treat errors differently */
1216 	if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
1217 		return true;
1218 
1219 	return false;
1220 }
1221 
1222 /*
1223  * On a sync write or shutdown we just want to stale the buffer and let the
1224  * caller handle the error in bp->b_error appropriately.
1225  *
1226  * If the write was asynchronous then no one will be looking for the error.  If
1227  * this is the first failure of this type, clear the error state and write the
1228  * buffer out again. This means we always retry an async write failure at least
1229  * once, but we also need to set the buffer up to behave correctly now for
1230  * repeated failures.
1231  *
1232  * If we get repeated async write failures, then we take action according to the
1233  * error configuration we have been set up to use.
1234  *
1235  * Returns true if this function took care of error handling and the caller must
1236  * not touch the buffer again.  Return false if the caller should proceed with
1237  * normal I/O completion handling.
1238  */
1239 static bool
1240 xfs_buf_ioend_handle_error(
1241 	struct xfs_buf		*bp)
1242 {
1243 	struct xfs_mount	*mp = bp->b_mount;
1244 	struct xfs_error_cfg	*cfg;
1245 
1246 	/*
1247 	 * If we've already decided to shutdown the filesystem because of I/O
1248 	 * errors, there's no point in giving this a retry.
1249 	 */
1250 	if (XFS_FORCED_SHUTDOWN(mp))
1251 		goto out_stale;
1252 
1253 	xfs_buf_ioerror_alert_ratelimited(bp);
1254 
1255 	/*
1256 	 * We're not going to bother about retrying this during recovery.
1257 	 * One strike!
1258 	 */
1259 	if (bp->b_flags & _XBF_LOGRECOVERY) {
1260 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1261 		return false;
1262 	}
1263 
1264 	/*
1265 	 * Synchronous writes will have callers process the error.
1266 	 */
1267 	if (!(bp->b_flags & XBF_ASYNC))
1268 		goto out_stale;
1269 
1270 	trace_xfs_buf_iodone_async(bp, _RET_IP_);
1271 
1272 	cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
1273 	if (bp->b_last_error != bp->b_error ||
1274 	    !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
1275 		bp->b_last_error = bp->b_error;
1276 		if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
1277 		    !bp->b_first_retry_time)
1278 			bp->b_first_retry_time = jiffies;
1279 		goto resubmit;
1280 	}
1281 
1282 	/*
1283 	 * Permanent error - we need to trigger a shutdown if we haven't already
1284 	 * to indicate that inconsistency will result from this action.
1285 	 */
1286 	if (xfs_buf_ioerror_permanent(bp, cfg)) {
1287 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1288 		goto out_stale;
1289 	}
1290 
1291 	/* Still considered a transient error. Caller will schedule retries. */
1292 	if (bp->b_flags & _XBF_INODES)
1293 		xfs_buf_inode_io_fail(bp);
1294 	else if (bp->b_flags & _XBF_DQUOTS)
1295 		xfs_buf_dquot_io_fail(bp);
1296 	else
1297 		ASSERT(list_empty(&bp->b_li_list));
1298 	xfs_buf_ioerror(bp, 0);
1299 	xfs_buf_relse(bp);
1300 	return true;
1301 
1302 resubmit:
1303 	xfs_buf_ioerror(bp, 0);
1304 	bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
1305 	xfs_buf_submit(bp);
1306 	return true;
1307 out_stale:
1308 	xfs_buf_stale(bp);
1309 	bp->b_flags |= XBF_DONE;
1310 	bp->b_flags &= ~XBF_WRITE;
1311 	trace_xfs_buf_error_relse(bp, _RET_IP_);
1312 	return false;
1313 }
1314 
1315 static void
1316 xfs_buf_ioend(
1317 	struct xfs_buf	*bp)
1318 {
1319 	trace_xfs_buf_iodone(bp, _RET_IP_);
1320 
1321 	/*
1322 	 * Pull in IO completion errors now. We are guaranteed to be running
1323 	 * single threaded, so we don't need the lock to read b_io_error.
1324 	 */
1325 	if (!bp->b_error && bp->b_io_error)
1326 		xfs_buf_ioerror(bp, bp->b_io_error);
1327 
1328 	if (bp->b_flags & XBF_READ) {
1329 		if (!bp->b_error && bp->b_ops)
1330 			bp->b_ops->verify_read(bp);
1331 		if (!bp->b_error)
1332 			bp->b_flags |= XBF_DONE;
1333 	} else {
1334 		if (!bp->b_error) {
1335 			bp->b_flags &= ~XBF_WRITE_FAIL;
1336 			bp->b_flags |= XBF_DONE;
1337 		}
1338 
1339 		if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
1340 			return;
1341 
1342 		/* clear the retry state */
1343 		bp->b_last_error = 0;
1344 		bp->b_retries = 0;
1345 		bp->b_first_retry_time = 0;
1346 
1347 		/*
1348 		 * Note that for things like remote attribute buffers, there may
1349 		 * not be a buffer log item here, so processing the buffer log
1350 		 * item must remain optional.
1351 		 */
1352 		if (bp->b_log_item)
1353 			xfs_buf_item_done(bp);
1354 
1355 		if (bp->b_flags & _XBF_INODES)
1356 			xfs_buf_inode_iodone(bp);
1357 		else if (bp->b_flags & _XBF_DQUOTS)
1358 			xfs_buf_dquot_iodone(bp);
1359 
1360 	}
1361 
1362 	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
1363 			 _XBF_LOGRECOVERY);
1364 
1365 	if (bp->b_flags & XBF_ASYNC)
1366 		xfs_buf_relse(bp);
1367 	else
1368 		complete(&bp->b_iowait);
1369 }
1370 
1371 static void
1372 xfs_buf_ioend_work(
1373 	struct work_struct	*work)
1374 {
1375 	struct xfs_buf		*bp =
1376 		container_of(work, struct xfs_buf, b_ioend_work);
1377 
1378 	xfs_buf_ioend(bp);
1379 }
1380 
1381 static void
1382 xfs_buf_ioend_async(
1383 	struct xfs_buf	*bp)
1384 {
1385 	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
1386 	queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
1387 }
1388 
1389 void
1390 __xfs_buf_ioerror(
1391 	struct xfs_buf		*bp,
1392 	int			error,
1393 	xfs_failaddr_t		failaddr)
1394 {
1395 	ASSERT(error <= 0 && error >= -1000);
1396 	bp->b_error = error;
1397 	trace_xfs_buf_ioerror(bp, error, failaddr);
1398 }
1399 
1400 void
1401 xfs_buf_ioerror_alert(
1402 	struct xfs_buf		*bp,
1403 	xfs_failaddr_t		func)
1404 {
1405 	xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
1406 		"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
1407 				  func, (uint64_t)XFS_BUF_ADDR(bp),
1408 				  bp->b_length, -bp->b_error);
1409 }
1410 
1411 /*
1412  * To simulate an I/O failure, the buffer must be locked and held with at least
1413  * three references. The LRU reference is dropped by the stale call. The buf
1414  * item reference is dropped via ioend processing. The third reference is owned
1415  * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
1416  */
1417 void
1418 xfs_buf_ioend_fail(
1419 	struct xfs_buf	*bp)
1420 {
1421 	bp->b_flags &= ~XBF_DONE;
1422 	xfs_buf_stale(bp);
1423 	xfs_buf_ioerror(bp, -EIO);
1424 	xfs_buf_ioend(bp);
1425 }
1426 
1427 int
1428 xfs_bwrite(
1429 	struct xfs_buf		*bp)
1430 {
1431 	int			error;
1432 
1433 	ASSERT(xfs_buf_islocked(bp));
1434 
1435 	bp->b_flags |= XBF_WRITE;
1436 	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
1437 			 XBF_DONE);
1438 
1439 	error = xfs_buf_submit(bp);
1440 	if (error)
1441 		xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
1442 	return error;
1443 }
1444 
1445 static void
1446 xfs_buf_bio_end_io(
1447 	struct bio		*bio)
1448 {
1449 	struct xfs_buf		*bp = (struct xfs_buf *)bio->bi_private;
1450 
1451 	if (!bio->bi_status &&
1452 	    (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
1453 	    XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
1454 		bio->bi_status = BLK_STS_IOERR;
1455 
1456 	/*
1457 	 * don't overwrite existing errors - otherwise we can lose errors on
1458 	 * buffers that require multiple bios to complete.
1459 	 */
1460 	if (bio->bi_status) {
1461 		int error = blk_status_to_errno(bio->bi_status);
1462 
1463 		cmpxchg(&bp->b_io_error, 0, error);
1464 	}
1465 
1466 	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1467 		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1468 
1469 	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1470 		xfs_buf_ioend_async(bp);
1471 	bio_put(bio);
1472 }
1473 
1474 static void
1475 xfs_buf_ioapply_map(
1476 	struct xfs_buf	*bp,
1477 	int		map,
1478 	int		*buf_offset,
1479 	int		*count,
1480 	int		op)
1481 {
1482 	int		page_index;
1483 	unsigned int	total_nr_pages = bp->b_page_count;
1484 	int		nr_pages;
1485 	struct bio	*bio;
1486 	sector_t	sector =  bp->b_maps[map].bm_bn;
1487 	int		size;
1488 	int		offset;
1489 
1490 	/* skip the pages in the buffer before the start offset */
1491 	page_index = 0;
1492 	offset = *buf_offset;
1493 	while (offset >= PAGE_SIZE) {
1494 		page_index++;
1495 		offset -= PAGE_SIZE;
1496 	}
1497 
1498 	/*
1499 	 * Limit the IO size to the length of the current vector, and update the
1500 	 * remaining IO count for the next time around.
1501 	 */
1502 	size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
1503 	*count -= size;
1504 	*buf_offset += size;
1505 
1506 next_chunk:
1507 	atomic_inc(&bp->b_io_remaining);
1508 	nr_pages = bio_max_segs(total_nr_pages);
1509 
1510 	bio = bio_alloc(GFP_NOIO, nr_pages);
1511 	bio_set_dev(bio, bp->b_target->bt_bdev);
1512 	bio->bi_iter.bi_sector = sector;
1513 	bio->bi_end_io = xfs_buf_bio_end_io;
1514 	bio->bi_private = bp;
1515 	bio->bi_opf = op;
1516 
1517 	for (; size && nr_pages; nr_pages--, page_index++) {
1518 		int	rbytes, nbytes = PAGE_SIZE - offset;
1519 
1520 		if (nbytes > size)
1521 			nbytes = size;
1522 
1523 		rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
1524 				      offset);
1525 		if (rbytes < nbytes)
1526 			break;
1527 
1528 		offset = 0;
1529 		sector += BTOBB(nbytes);
1530 		size -= nbytes;
1531 		total_nr_pages--;
1532 	}
1533 
1534 	if (likely(bio->bi_iter.bi_size)) {
1535 		if (xfs_buf_is_vmapped(bp)) {
1536 			flush_kernel_vmap_range(bp->b_addr,
1537 						xfs_buf_vmap_len(bp));
1538 		}
1539 		submit_bio(bio);
1540 		if (size)
1541 			goto next_chunk;
1542 	} else {
1543 		/*
1544 		 * This is guaranteed not to be the last io reference count
1545 		 * because the caller (xfs_buf_submit) holds a count itself.
1546 		 */
1547 		atomic_dec(&bp->b_io_remaining);
1548 		xfs_buf_ioerror(bp, -EIO);
1549 		bio_put(bio);
1550 	}
1551 
1552 }
1553 
1554 STATIC void
1555 _xfs_buf_ioapply(
1556 	struct xfs_buf	*bp)
1557 {
1558 	struct blk_plug	plug;
1559 	int		op;
1560 	int		offset;
1561 	int		size;
1562 	int		i;
1563 
1564 	/*
1565 	 * Make sure we capture only current IO errors rather than stale errors
1566 	 * left over from previous use of the buffer (e.g. failed readahead).
1567 	 */
1568 	bp->b_error = 0;
1569 
1570 	if (bp->b_flags & XBF_WRITE) {
1571 		op = REQ_OP_WRITE;
1572 
1573 		/*
1574 		 * Run the write verifier callback function if it exists. If
1575 		 * this function fails it will mark the buffer with an error and
1576 		 * the IO should not be dispatched.
1577 		 */
1578 		if (bp->b_ops) {
1579 			bp->b_ops->verify_write(bp);
1580 			if (bp->b_error) {
1581 				xfs_force_shutdown(bp->b_mount,
1582 						   SHUTDOWN_CORRUPT_INCORE);
1583 				return;
1584 			}
1585 		} else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
1586 			struct xfs_mount *mp = bp->b_mount;
1587 
1588 			/*
1589 			 * non-crc filesystems don't attach verifiers during
1590 			 * log recovery, so don't warn for such filesystems.
1591 			 */
1592 			if (xfs_sb_version_hascrc(&mp->m_sb)) {
1593 				xfs_warn(mp,
1594 					"%s: no buf ops on daddr 0x%llx len %d",
1595 					__func__, bp->b_bn, bp->b_length);
1596 				xfs_hex_dump(bp->b_addr,
1597 						XFS_CORRUPTION_DUMP_LEN);
1598 				dump_stack();
1599 			}
1600 		}
1601 	} else {
1602 		op = REQ_OP_READ;
1603 		if (bp->b_flags & XBF_READ_AHEAD)
1604 			op |= REQ_RAHEAD;
1605 	}
1606 
1607 	/* we only use the buffer cache for meta-data */
1608 	op |= REQ_META;
1609 
1610 	/*
1611 	 * Walk all the vectors issuing IO on them. Set up the initial offset
1612 	 * into the buffer and the desired IO size before we start -
1613 	 * _xfs_buf_ioapply_vec() will modify them appropriately for each
1614 	 * subsequent call.
1615 	 */
1616 	offset = bp->b_offset;
1617 	size = BBTOB(bp->b_length);
1618 	blk_start_plug(&plug);
1619 	for (i = 0; i < bp->b_map_count; i++) {
1620 		xfs_buf_ioapply_map(bp, i, &offset, &size, op);
1621 		if (bp->b_error)
1622 			break;
1623 		if (size <= 0)
1624 			break;	/* all done */
1625 	}
1626 	blk_finish_plug(&plug);
1627 }
1628 
1629 /*
1630  * Wait for I/O completion of a sync buffer and return the I/O error code.
1631  */
1632 static int
1633 xfs_buf_iowait(
1634 	struct xfs_buf	*bp)
1635 {
1636 	ASSERT(!(bp->b_flags & XBF_ASYNC));
1637 
1638 	trace_xfs_buf_iowait(bp, _RET_IP_);
1639 	wait_for_completion(&bp->b_iowait);
1640 	trace_xfs_buf_iowait_done(bp, _RET_IP_);
1641 
1642 	return bp->b_error;
1643 }
1644 
1645 /*
1646  * Buffer I/O submission path, read or write. Asynchronous submission transfers
1647  * the buffer lock ownership and the current reference to the IO. It is not
1648  * safe to reference the buffer after a call to this function unless the caller
1649  * holds an additional reference itself.
1650  */
1651 static int
1652 __xfs_buf_submit(
1653 	struct xfs_buf	*bp,
1654 	bool		wait)
1655 {
1656 	int		error = 0;
1657 
1658 	trace_xfs_buf_submit(bp, _RET_IP_);
1659 
1660 	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
1661 
1662 	/* on shutdown we stale and complete the buffer immediately */
1663 	if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
1664 		xfs_buf_ioend_fail(bp);
1665 		return -EIO;
1666 	}
1667 
1668 	/*
1669 	 * Grab a reference so the buffer does not go away underneath us. For
1670 	 * async buffers, I/O completion drops the callers reference, which
1671 	 * could occur before submission returns.
1672 	 */
1673 	xfs_buf_hold(bp);
1674 
1675 	if (bp->b_flags & XBF_WRITE)
1676 		xfs_buf_wait_unpin(bp);
1677 
1678 	/* clear the internal error state to avoid spurious errors */
1679 	bp->b_io_error = 0;
1680 
1681 	/*
1682 	 * Set the count to 1 initially, this will stop an I/O completion
1683 	 * callout which happens before we have started all the I/O from calling
1684 	 * xfs_buf_ioend too early.
1685 	 */
1686 	atomic_set(&bp->b_io_remaining, 1);
1687 	if (bp->b_flags & XBF_ASYNC)
1688 		xfs_buf_ioacct_inc(bp);
1689 	_xfs_buf_ioapply(bp);
1690 
1691 	/*
1692 	 * If _xfs_buf_ioapply failed, we can get back here with only the IO
1693 	 * reference we took above. If we drop it to zero, run completion so
1694 	 * that we don't return to the caller with completion still pending.
1695 	 */
1696 	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
1697 		if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
1698 			xfs_buf_ioend(bp);
1699 		else
1700 			xfs_buf_ioend_async(bp);
1701 	}
1702 
1703 	if (wait)
1704 		error = xfs_buf_iowait(bp);
1705 
1706 	/*
1707 	 * Release the hold that keeps the buffer referenced for the entire
1708 	 * I/O. Note that if the buffer is async, it is not safe to reference
1709 	 * after this release.
1710 	 */
1711 	xfs_buf_rele(bp);
1712 	return error;
1713 }
1714 
1715 void *
1716 xfs_buf_offset(
1717 	struct xfs_buf		*bp,
1718 	size_t			offset)
1719 {
1720 	struct page		*page;
1721 
1722 	if (bp->b_addr)
1723 		return bp->b_addr + offset;
1724 
1725 	offset += bp->b_offset;
1726 	page = bp->b_pages[offset >> PAGE_SHIFT];
1727 	return page_address(page) + (offset & (PAGE_SIZE-1));
1728 }
1729 
1730 void
1731 xfs_buf_zero(
1732 	struct xfs_buf		*bp,
1733 	size_t			boff,
1734 	size_t			bsize)
1735 {
1736 	size_t			bend;
1737 
1738 	bend = boff + bsize;
1739 	while (boff < bend) {
1740 		struct page	*page;
1741 		int		page_index, page_offset, csize;
1742 
1743 		page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
1744 		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
1745 		page = bp->b_pages[page_index];
1746 		csize = min_t(size_t, PAGE_SIZE - page_offset,
1747 				      BBTOB(bp->b_length) - boff);
1748 
1749 		ASSERT((csize + page_offset) <= PAGE_SIZE);
1750 
1751 		memset(page_address(page) + page_offset, 0, csize);
1752 
1753 		boff += csize;
1754 	}
1755 }
1756 
1757 /*
1758  * Log a message about and stale a buffer that a caller has decided is corrupt.
1759  *
1760  * This function should be called for the kinds of metadata corruption that
1761  * cannot be detect from a verifier, such as incorrect inter-block relationship
1762  * data.  Do /not/ call this function from a verifier function.
1763  *
1764  * The buffer must be XBF_DONE prior to the call.  Afterwards, the buffer will
1765  * be marked stale, but b_error will not be set.  The caller is responsible for
1766  * releasing the buffer or fixing it.
1767  */
1768 void
1769 __xfs_buf_mark_corrupt(
1770 	struct xfs_buf		*bp,
1771 	xfs_failaddr_t		fa)
1772 {
1773 	ASSERT(bp->b_flags & XBF_DONE);
1774 
1775 	xfs_buf_corruption_error(bp, fa);
1776 	xfs_buf_stale(bp);
1777 }
1778 
1779 /*
1780  *	Handling of buffer targets (buftargs).
1781  */
1782 
1783 /*
1784  * Wait for any bufs with callbacks that have been submitted but have not yet
1785  * returned. These buffers will have an elevated hold count, so wait on those
1786  * while freeing all the buffers only held by the LRU.
1787  */
1788 static enum lru_status
1789 xfs_buftarg_drain_rele(
1790 	struct list_head	*item,
1791 	struct list_lru_one	*lru,
1792 	spinlock_t		*lru_lock,
1793 	void			*arg)
1794 
1795 {
1796 	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
1797 	struct list_head	*dispose = arg;
1798 
1799 	if (atomic_read(&bp->b_hold) > 1) {
1800 		/* need to wait, so skip it this pass */
1801 		trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
1802 		return LRU_SKIP;
1803 	}
1804 	if (!spin_trylock(&bp->b_lock))
1805 		return LRU_SKIP;
1806 
1807 	/*
1808 	 * clear the LRU reference count so the buffer doesn't get
1809 	 * ignored in xfs_buf_rele().
1810 	 */
1811 	atomic_set(&bp->b_lru_ref, 0);
1812 	bp->b_state |= XFS_BSTATE_DISPOSE;
1813 	list_lru_isolate_move(lru, item, dispose);
1814 	spin_unlock(&bp->b_lock);
1815 	return LRU_REMOVED;
1816 }
1817 
1818 /*
1819  * Wait for outstanding I/O on the buftarg to complete.
1820  */
1821 void
1822 xfs_buftarg_wait(
1823 	struct xfs_buftarg	*btp)
1824 {
1825 	/*
1826 	 * First wait on the buftarg I/O count for all in-flight buffers to be
1827 	 * released. This is critical as new buffers do not make the LRU until
1828 	 * they are released.
1829 	 *
1830 	 * Next, flush the buffer workqueue to ensure all completion processing
1831 	 * has finished. Just waiting on buffer locks is not sufficient for
1832 	 * async IO as the reference count held over IO is not released until
1833 	 * after the buffer lock is dropped. Hence we need to ensure here that
1834 	 * all reference counts have been dropped before we start walking the
1835 	 * LRU list.
1836 	 */
1837 	while (percpu_counter_sum(&btp->bt_io_count))
1838 		delay(100);
1839 	flush_workqueue(btp->bt_mount->m_buf_workqueue);
1840 }
1841 
1842 void
1843 xfs_buftarg_drain(
1844 	struct xfs_buftarg	*btp)
1845 {
1846 	LIST_HEAD(dispose);
1847 	int			loop = 0;
1848 	bool			write_fail = false;
1849 
1850 	xfs_buftarg_wait(btp);
1851 
1852 	/* loop until there is nothing left on the lru list. */
1853 	while (list_lru_count(&btp->bt_lru)) {
1854 		list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele,
1855 			      &dispose, LONG_MAX);
1856 
1857 		while (!list_empty(&dispose)) {
1858 			struct xfs_buf *bp;
1859 			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1860 			list_del_init(&bp->b_lru);
1861 			if (bp->b_flags & XBF_WRITE_FAIL) {
1862 				write_fail = true;
1863 				xfs_buf_alert_ratelimited(bp,
1864 					"XFS: Corruption Alert",
1865 "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
1866 					(long long)bp->b_bn);
1867 			}
1868 			xfs_buf_rele(bp);
1869 		}
1870 		if (loop++ != 0)
1871 			delay(100);
1872 	}
1873 
1874 	/*
1875 	 * If one or more failed buffers were freed, that means dirty metadata
1876 	 * was thrown away. This should only ever happen after I/O completion
1877 	 * handling has elevated I/O error(s) to permanent failures and shuts
1878 	 * down the fs.
1879 	 */
1880 	if (write_fail) {
1881 		ASSERT(XFS_FORCED_SHUTDOWN(btp->bt_mount));
1882 		xfs_alert(btp->bt_mount,
1883 	      "Please run xfs_repair to determine the extent of the problem.");
1884 	}
1885 }
1886 
1887 static enum lru_status
1888 xfs_buftarg_isolate(
1889 	struct list_head	*item,
1890 	struct list_lru_one	*lru,
1891 	spinlock_t		*lru_lock,
1892 	void			*arg)
1893 {
1894 	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
1895 	struct list_head	*dispose = arg;
1896 
1897 	/*
1898 	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
1899 	 * If we fail to get the lock, just skip it.
1900 	 */
1901 	if (!spin_trylock(&bp->b_lock))
1902 		return LRU_SKIP;
1903 	/*
1904 	 * Decrement the b_lru_ref count unless the value is already
1905 	 * zero. If the value is already zero, we need to reclaim the
1906 	 * buffer, otherwise it gets another trip through the LRU.
1907 	 */
1908 	if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1909 		spin_unlock(&bp->b_lock);
1910 		return LRU_ROTATE;
1911 	}
1912 
1913 	bp->b_state |= XFS_BSTATE_DISPOSE;
1914 	list_lru_isolate_move(lru, item, dispose);
1915 	spin_unlock(&bp->b_lock);
1916 	return LRU_REMOVED;
1917 }
1918 
1919 static unsigned long
1920 xfs_buftarg_shrink_scan(
1921 	struct shrinker		*shrink,
1922 	struct shrink_control	*sc)
1923 {
1924 	struct xfs_buftarg	*btp = container_of(shrink,
1925 					struct xfs_buftarg, bt_shrinker);
1926 	LIST_HEAD(dispose);
1927 	unsigned long		freed;
1928 
1929 	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
1930 				     xfs_buftarg_isolate, &dispose);
1931 
1932 	while (!list_empty(&dispose)) {
1933 		struct xfs_buf *bp;
1934 		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1935 		list_del_init(&bp->b_lru);
1936 		xfs_buf_rele(bp);
1937 	}
1938 
1939 	return freed;
1940 }
1941 
1942 static unsigned long
1943 xfs_buftarg_shrink_count(
1944 	struct shrinker		*shrink,
1945 	struct shrink_control	*sc)
1946 {
1947 	struct xfs_buftarg	*btp = container_of(shrink,
1948 					struct xfs_buftarg, bt_shrinker);
1949 	return list_lru_shrink_count(&btp->bt_lru, sc);
1950 }
1951 
1952 void
1953 xfs_free_buftarg(
1954 	struct xfs_buftarg	*btp)
1955 {
1956 	unregister_shrinker(&btp->bt_shrinker);
1957 	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
1958 	percpu_counter_destroy(&btp->bt_io_count);
1959 	list_lru_destroy(&btp->bt_lru);
1960 
1961 	xfs_blkdev_issue_flush(btp);
1962 
1963 	kmem_free(btp);
1964 }
1965 
1966 int
1967 xfs_setsize_buftarg(
1968 	xfs_buftarg_t		*btp,
1969 	unsigned int		sectorsize)
1970 {
1971 	/* Set up metadata sector size info */
1972 	btp->bt_meta_sectorsize = sectorsize;
1973 	btp->bt_meta_sectormask = sectorsize - 1;
1974 
1975 	if (set_blocksize(btp->bt_bdev, sectorsize)) {
1976 		xfs_warn(btp->bt_mount,
1977 			"Cannot set_blocksize to %u on device %pg",
1978 			sectorsize, btp->bt_bdev);
1979 		return -EINVAL;
1980 	}
1981 
1982 	/* Set up device logical sector size mask */
1983 	btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
1984 	btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
1985 
1986 	return 0;
1987 }
1988 
1989 /*
1990  * When allocating the initial buffer target we have not yet
1991  * read in the superblock, so don't know what sized sectors
1992  * are being used at this early stage.  Play safe.
1993  */
1994 STATIC int
1995 xfs_setsize_buftarg_early(
1996 	xfs_buftarg_t		*btp,
1997 	struct block_device	*bdev)
1998 {
1999 	return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
2000 }
2001 
2002 xfs_buftarg_t *
2003 xfs_alloc_buftarg(
2004 	struct xfs_mount	*mp,
2005 	struct block_device	*bdev,
2006 	struct dax_device	*dax_dev)
2007 {
2008 	xfs_buftarg_t		*btp;
2009 
2010 	btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
2011 
2012 	btp->bt_mount = mp;
2013 	btp->bt_dev =  bdev->bd_dev;
2014 	btp->bt_bdev = bdev;
2015 	btp->bt_daxdev = dax_dev;
2016 
2017 	/*
2018 	 * Buffer IO error rate limiting. Limit it to no more than 10 messages
2019 	 * per 30 seconds so as to not spam logs too much on repeated errors.
2020 	 */
2021 	ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
2022 			     DEFAULT_RATELIMIT_BURST);
2023 
2024 	if (xfs_setsize_buftarg_early(btp, bdev))
2025 		goto error_free;
2026 
2027 	if (list_lru_init(&btp->bt_lru))
2028 		goto error_free;
2029 
2030 	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
2031 		goto error_lru;
2032 
2033 	btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
2034 	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
2035 	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
2036 	btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
2037 	if (register_shrinker(&btp->bt_shrinker))
2038 		goto error_pcpu;
2039 	return btp;
2040 
2041 error_pcpu:
2042 	percpu_counter_destroy(&btp->bt_io_count);
2043 error_lru:
2044 	list_lru_destroy(&btp->bt_lru);
2045 error_free:
2046 	kmem_free(btp);
2047 	return NULL;
2048 }
2049 
2050 /*
2051  * Cancel a delayed write list.
2052  *
2053  * Remove each buffer from the list, clear the delwri queue flag and drop the
2054  * associated buffer reference.
2055  */
2056 void
2057 xfs_buf_delwri_cancel(
2058 	struct list_head	*list)
2059 {
2060 	struct xfs_buf		*bp;
2061 
2062 	while (!list_empty(list)) {
2063 		bp = list_first_entry(list, struct xfs_buf, b_list);
2064 
2065 		xfs_buf_lock(bp);
2066 		bp->b_flags &= ~_XBF_DELWRI_Q;
2067 		list_del_init(&bp->b_list);
2068 		xfs_buf_relse(bp);
2069 	}
2070 }
2071 
2072 /*
2073  * Add a buffer to the delayed write list.
2074  *
2075  * This queues a buffer for writeout if it hasn't already been.  Note that
2076  * neither this routine nor the buffer list submission functions perform
2077  * any internal synchronization.  It is expected that the lists are thread-local
2078  * to the callers.
2079  *
2080  * Returns true if we queued up the buffer, or false if it already had
2081  * been on the buffer list.
2082  */
2083 bool
2084 xfs_buf_delwri_queue(
2085 	struct xfs_buf		*bp,
2086 	struct list_head	*list)
2087 {
2088 	ASSERT(xfs_buf_islocked(bp));
2089 	ASSERT(!(bp->b_flags & XBF_READ));
2090 
2091 	/*
2092 	 * If the buffer is already marked delwri it already is queued up
2093 	 * by someone else for imediate writeout.  Just ignore it in that
2094 	 * case.
2095 	 */
2096 	if (bp->b_flags & _XBF_DELWRI_Q) {
2097 		trace_xfs_buf_delwri_queued(bp, _RET_IP_);
2098 		return false;
2099 	}
2100 
2101 	trace_xfs_buf_delwri_queue(bp, _RET_IP_);
2102 
2103 	/*
2104 	 * If a buffer gets written out synchronously or marked stale while it
2105 	 * is on a delwri list we lazily remove it. To do this, the other party
2106 	 * clears the  _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
2107 	 * It remains referenced and on the list.  In a rare corner case it
2108 	 * might get readded to a delwri list after the synchronous writeout, in
2109 	 * which case we need just need to re-add the flag here.
2110 	 */
2111 	bp->b_flags |= _XBF_DELWRI_Q;
2112 	if (list_empty(&bp->b_list)) {
2113 		atomic_inc(&bp->b_hold);
2114 		list_add_tail(&bp->b_list, list);
2115 	}
2116 
2117 	return true;
2118 }
2119 
2120 /*
2121  * Compare function is more complex than it needs to be because
2122  * the return value is only 32 bits and we are doing comparisons
2123  * on 64 bit values
2124  */
2125 static int
2126 xfs_buf_cmp(
2127 	void		*priv,
2128 	struct list_head *a,
2129 	struct list_head *b)
2130 {
2131 	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
2132 	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
2133 	xfs_daddr_t		diff;
2134 
2135 	diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
2136 	if (diff < 0)
2137 		return -1;
2138 	if (diff > 0)
2139 		return 1;
2140 	return 0;
2141 }
2142 
2143 /*
2144  * Submit buffers for write. If wait_list is specified, the buffers are
2145  * submitted using sync I/O and placed on the wait list such that the caller can
2146  * iowait each buffer. Otherwise async I/O is used and the buffers are released
2147  * at I/O completion time. In either case, buffers remain locked until I/O
2148  * completes and the buffer is released from the queue.
2149  */
2150 static int
2151 xfs_buf_delwri_submit_buffers(
2152 	struct list_head	*buffer_list,
2153 	struct list_head	*wait_list)
2154 {
2155 	struct xfs_buf		*bp, *n;
2156 	int			pinned = 0;
2157 	struct blk_plug		plug;
2158 
2159 	list_sort(NULL, buffer_list, xfs_buf_cmp);
2160 
2161 	blk_start_plug(&plug);
2162 	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
2163 		if (!wait_list) {
2164 			if (xfs_buf_ispinned(bp)) {
2165 				pinned++;
2166 				continue;
2167 			}
2168 			if (!xfs_buf_trylock(bp))
2169 				continue;
2170 		} else {
2171 			xfs_buf_lock(bp);
2172 		}
2173 
2174 		/*
2175 		 * Someone else might have written the buffer synchronously or
2176 		 * marked it stale in the meantime.  In that case only the
2177 		 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
2178 		 * reference and remove it from the list here.
2179 		 */
2180 		if (!(bp->b_flags & _XBF_DELWRI_Q)) {
2181 			list_del_init(&bp->b_list);
2182 			xfs_buf_relse(bp);
2183 			continue;
2184 		}
2185 
2186 		trace_xfs_buf_delwri_split(bp, _RET_IP_);
2187 
2188 		/*
2189 		 * If we have a wait list, each buffer (and associated delwri
2190 		 * queue reference) transfers to it and is submitted
2191 		 * synchronously. Otherwise, drop the buffer from the delwri
2192 		 * queue and submit async.
2193 		 */
2194 		bp->b_flags &= ~_XBF_DELWRI_Q;
2195 		bp->b_flags |= XBF_WRITE;
2196 		if (wait_list) {
2197 			bp->b_flags &= ~XBF_ASYNC;
2198 			list_move_tail(&bp->b_list, wait_list);
2199 		} else {
2200 			bp->b_flags |= XBF_ASYNC;
2201 			list_del_init(&bp->b_list);
2202 		}
2203 		__xfs_buf_submit(bp, false);
2204 	}
2205 	blk_finish_plug(&plug);
2206 
2207 	return pinned;
2208 }
2209 
2210 /*
2211  * Write out a buffer list asynchronously.
2212  *
2213  * This will take the @buffer_list, write all non-locked and non-pinned buffers
2214  * out and not wait for I/O completion on any of the buffers.  This interface
2215  * is only safely useable for callers that can track I/O completion by higher
2216  * level means, e.g. AIL pushing as the @buffer_list is consumed in this
2217  * function.
2218  *
2219  * Note: this function will skip buffers it would block on, and in doing so
2220  * leaves them on @buffer_list so they can be retried on a later pass. As such,
2221  * it is up to the caller to ensure that the buffer list is fully submitted or
2222  * cancelled appropriately when they are finished with the list. Failure to
2223  * cancel or resubmit the list until it is empty will result in leaked buffers
2224  * at unmount time.
2225  */
2226 int
2227 xfs_buf_delwri_submit_nowait(
2228 	struct list_head	*buffer_list)
2229 {
2230 	return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
2231 }
2232 
2233 /*
2234  * Write out a buffer list synchronously.
2235  *
2236  * This will take the @buffer_list, write all buffers out and wait for I/O
2237  * completion on all of the buffers. @buffer_list is consumed by the function,
2238  * so callers must have some other way of tracking buffers if they require such
2239  * functionality.
2240  */
2241 int
2242 xfs_buf_delwri_submit(
2243 	struct list_head	*buffer_list)
2244 {
2245 	LIST_HEAD		(wait_list);
2246 	int			error = 0, error2;
2247 	struct xfs_buf		*bp;
2248 
2249 	xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
2250 
2251 	/* Wait for IO to complete. */
2252 	while (!list_empty(&wait_list)) {
2253 		bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
2254 
2255 		list_del_init(&bp->b_list);
2256 
2257 		/*
2258 		 * Wait on the locked buffer, check for errors and unlock and
2259 		 * release the delwri queue reference.
2260 		 */
2261 		error2 = xfs_buf_iowait(bp);
2262 		xfs_buf_relse(bp);
2263 		if (!error)
2264 			error = error2;
2265 	}
2266 
2267 	return error;
2268 }
2269 
2270 /*
2271  * Push a single buffer on a delwri queue.
2272  *
2273  * The purpose of this function is to submit a single buffer of a delwri queue
2274  * and return with the buffer still on the original queue. The waiting delwri
2275  * buffer submission infrastructure guarantees transfer of the delwri queue
2276  * buffer reference to a temporary wait list. We reuse this infrastructure to
2277  * transfer the buffer back to the original queue.
2278  *
2279  * Note the buffer transitions from the queued state, to the submitted and wait
2280  * listed state and back to the queued state during this call. The buffer
2281  * locking and queue management logic between _delwri_pushbuf() and
2282  * _delwri_queue() guarantee that the buffer cannot be queued to another list
2283  * before returning.
2284  */
2285 int
2286 xfs_buf_delwri_pushbuf(
2287 	struct xfs_buf		*bp,
2288 	struct list_head	*buffer_list)
2289 {
2290 	LIST_HEAD		(submit_list);
2291 	int			error;
2292 
2293 	ASSERT(bp->b_flags & _XBF_DELWRI_Q);
2294 
2295 	trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
2296 
2297 	/*
2298 	 * Isolate the buffer to a new local list so we can submit it for I/O
2299 	 * independently from the rest of the original list.
2300 	 */
2301 	xfs_buf_lock(bp);
2302 	list_move(&bp->b_list, &submit_list);
2303 	xfs_buf_unlock(bp);
2304 
2305 	/*
2306 	 * Delwri submission clears the DELWRI_Q buffer flag and returns with
2307 	 * the buffer on the wait list with the original reference. Rather than
2308 	 * bounce the buffer from a local wait list back to the original list
2309 	 * after I/O completion, reuse the original list as the wait list.
2310 	 */
2311 	xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
2312 
2313 	/*
2314 	 * The buffer is now locked, under I/O and wait listed on the original
2315 	 * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and
2316 	 * return with the buffer unlocked and on the original queue.
2317 	 */
2318 	error = xfs_buf_iowait(bp);
2319 	bp->b_flags |= _XBF_DELWRI_Q;
2320 	xfs_buf_unlock(bp);
2321 
2322 	return error;
2323 }
2324 
2325 int __init
2326 xfs_buf_init(void)
2327 {
2328 	xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
2329 					 SLAB_HWCACHE_ALIGN |
2330 					 SLAB_RECLAIM_ACCOUNT |
2331 					 SLAB_MEM_SPREAD,
2332 					 NULL);
2333 	if (!xfs_buf_zone)
2334 		goto out;
2335 
2336 	return 0;
2337 
2338  out:
2339 	return -ENOMEM;
2340 }
2341 
2342 void
2343 xfs_buf_terminate(void)
2344 {
2345 	kmem_cache_destroy(xfs_buf_zone);
2346 }
2347 
2348 void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
2349 {
2350 	/*
2351 	 * Set the lru reference count to 0 based on the error injection tag.
2352 	 * This allows userspace to disrupt buffer caching for debug/testing
2353 	 * purposes.
2354 	 */
2355 	if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
2356 		lru_ref = 0;
2357 
2358 	atomic_set(&bp->b_lru_ref, lru_ref);
2359 }
2360 
2361 /*
2362  * Verify an on-disk magic value against the magic value specified in the
2363  * verifier structure. The verifier magic is in disk byte order so the caller is
2364  * expected to pass the value directly from disk.
2365  */
2366 bool
2367 xfs_verify_magic(
2368 	struct xfs_buf		*bp,
2369 	__be32			dmagic)
2370 {
2371 	struct xfs_mount	*mp = bp->b_mount;
2372 	int			idx;
2373 
2374 	idx = xfs_sb_version_hascrc(&mp->m_sb);
2375 	if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
2376 		return false;
2377 	return dmagic == bp->b_ops->magic[idx];
2378 }
2379 /*
2380  * Verify an on-disk magic value against the magic value specified in the
2381  * verifier structure. The verifier magic is in disk byte order so the caller is
2382  * expected to pass the value directly from disk.
2383  */
2384 bool
2385 xfs_verify_magic16(
2386 	struct xfs_buf		*bp,
2387 	__be16			dmagic)
2388 {
2389 	struct xfs_mount	*mp = bp->b_mount;
2390 	int			idx;
2391 
2392 	idx = xfs_sb_version_hascrc(&mp->m_sb);
2393 	if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
2394 		return false;
2395 	return dmagic == bp->b_ops->magic16[idx];
2396 }
2397