xref: /openbmc/linux/fs/xfs/xfs_buf.c (revision 930beb5a)
1 /*
2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include <linux/stddef.h>
20 #include <linux/errno.h>
21 #include <linux/gfp.h>
22 #include <linux/pagemap.h>
23 #include <linux/init.h>
24 #include <linux/vmalloc.h>
25 #include <linux/bio.h>
26 #include <linux/sysctl.h>
27 #include <linux/proc_fs.h>
28 #include <linux/workqueue.h>
29 #include <linux/percpu.h>
30 #include <linux/blkdev.h>
31 #include <linux/hash.h>
32 #include <linux/kthread.h>
33 #include <linux/migrate.h>
34 #include <linux/backing-dev.h>
35 #include <linux/freezer.h>
36 
37 #include "xfs_log_format.h"
38 #include "xfs_trans_resv.h"
39 #include "xfs_sb.h"
40 #include "xfs_ag.h"
41 #include "xfs_mount.h"
42 #include "xfs_trace.h"
43 #include "xfs_log.h"
44 
45 static kmem_zone_t *xfs_buf_zone;
46 
47 static struct workqueue_struct *xfslogd_workqueue;
48 
/*
 * Lock-owner bookkeeping helpers.  When XFS_BUF_LOCK_TRACKING is defined they
 * record, reset and read the pid kept in b_last_holder; otherwise they expand
 * to empty statements.
 */
#ifndef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp)	do { } while (0)
# define XB_CLEAR_OWNER(bp)	do { } while (0)
# define XB_GET_OWNER(bp)	do { } while (0)
#else
# define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
# define XB_CLEAR_OWNER(bp)	((bp)->b_last_holder = -1)
# define XB_GET_OWNER(bp)	((bp)->b_last_holder)
#endif
58 
/*
 * Turn buffer flags into a page allocation mask: readahead requests use
 * __GFP_NORETRY, all others GFP_NOFS, and __GFP_NOWARN is always included.
 */
#define xb_to_gfp(flags) \
	(__GFP_NOWARN | (((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS))
61 
62 
63 static inline int
64 xfs_buf_is_vmapped(
65 	struct xfs_buf	*bp)
66 {
67 	/*
68 	 * Return true if the buffer is vmapped.
69 	 *
70 	 * b_addr is null if the buffer is not mapped, but the code is clever
71 	 * enough to know it doesn't have to map a single page, so the check has
72 	 * to be both for b_addr and bp->b_page_count > 1.
73 	 */
74 	return bp->b_addr && bp->b_page_count > 1;
75 }
76 
77 static inline int
78 xfs_buf_vmap_len(
79 	struct xfs_buf	*bp)
80 {
81 	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
82 }
83 
84 /*
85  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
86  * b_lru_ref count so that the buffer is freed immediately when the buffer
87  * reference count falls to zero. If the buffer is already on the LRU, we need
88  * to remove the reference that LRU holds on the buffer.
89  *
90  * This prevents build-up of stale buffers on the LRU.
91  */
92 void
93 xfs_buf_stale(
94 	struct xfs_buf	*bp)
95 {
96 	ASSERT(xfs_buf_islocked(bp));
97 
98 	bp->b_flags |= XBF_STALE;
99 
100 	/*
101 	 * Clear the delwri status so that a delwri queue walker will not
102 	 * flush this buffer to disk now that it is stale. The delwri queue has
103 	 * a reference to the buffer, so this is safe to do.
104 	 */
105 	bp->b_flags &= ~_XBF_DELWRI_Q;
106 
107 	spin_lock(&bp->b_lock);
108 	atomic_set(&bp->b_lru_ref, 0);
109 	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
110 	    (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
111 		atomic_dec(&bp->b_hold);
112 
113 	ASSERT(atomic_read(&bp->b_hold) >= 1);
114 	spin_unlock(&bp->b_lock);
115 }
116 
117 static int
118 xfs_buf_get_maps(
119 	struct xfs_buf		*bp,
120 	int			map_count)
121 {
122 	ASSERT(bp->b_maps == NULL);
123 	bp->b_map_count = map_count;
124 
125 	if (map_count == 1) {
126 		bp->b_maps = &bp->__b_map;
127 		return 0;
128 	}
129 
130 	bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
131 				KM_NOFS);
132 	if (!bp->b_maps)
133 		return ENOMEM;
134 	return 0;
135 }
136 
137 /*
138  *	Frees b_pages if it was allocated.
139  */
140 static void
141 xfs_buf_free_maps(
142 	struct xfs_buf	*bp)
143 {
144 	if (bp->b_maps != &bp->__b_map) {
145 		kmem_free(bp->b_maps);
146 		bp->b_maps = NULL;
147 	}
148 }
149 
150 struct xfs_buf *
151 _xfs_buf_alloc(
152 	struct xfs_buftarg	*target,
153 	struct xfs_buf_map	*map,
154 	int			nmaps,
155 	xfs_buf_flags_t		flags)
156 {
157 	struct xfs_buf		*bp;
158 	int			error;
159 	int			i;
160 
161 	bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
162 	if (unlikely(!bp))
163 		return NULL;
164 
165 	/*
166 	 * We don't want certain flags to appear in b_flags unless they are
167 	 * specifically set by later operations on the buffer.
168 	 */
169 	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
170 
171 	atomic_set(&bp->b_hold, 1);
172 	atomic_set(&bp->b_lru_ref, 1);
173 	init_completion(&bp->b_iowait);
174 	INIT_LIST_HEAD(&bp->b_lru);
175 	INIT_LIST_HEAD(&bp->b_list);
176 	RB_CLEAR_NODE(&bp->b_rbnode);
177 	sema_init(&bp->b_sema, 0); /* held, no waiters */
178 	spin_lock_init(&bp->b_lock);
179 	XB_SET_OWNER(bp);
180 	bp->b_target = target;
181 	bp->b_flags = flags;
182 
183 	/*
184 	 * Set length and io_length to the same value initially.
185 	 * I/O routines should use io_length, which will be the same in
186 	 * most cases but may be reset (e.g. XFS recovery).
187 	 */
188 	error = xfs_buf_get_maps(bp, nmaps);
189 	if (error)  {
190 		kmem_zone_free(xfs_buf_zone, bp);
191 		return NULL;
192 	}
193 
194 	bp->b_bn = map[0].bm_bn;
195 	bp->b_length = 0;
196 	for (i = 0; i < nmaps; i++) {
197 		bp->b_maps[i].bm_bn = map[i].bm_bn;
198 		bp->b_maps[i].bm_len = map[i].bm_len;
199 		bp->b_length += map[i].bm_len;
200 	}
201 	bp->b_io_length = bp->b_length;
202 
203 	atomic_set(&bp->b_pin_count, 0);
204 	init_waitqueue_head(&bp->b_waiters);
205 
206 	XFS_STATS_INC(xb_create);
207 	trace_xfs_buf_init(bp, _RET_IP_);
208 
209 	return bp;
210 }
211 
212 /*
213  *	Allocate a page array capable of holding a specified number
214  *	of pages, and point the page buf at it.
215  */
216 STATIC int
217 _xfs_buf_get_pages(
218 	xfs_buf_t		*bp,
219 	int			page_count,
220 	xfs_buf_flags_t		flags)
221 {
222 	/* Make sure that we have a page list */
223 	if (bp->b_pages == NULL) {
224 		bp->b_page_count = page_count;
225 		if (page_count <= XB_PAGES) {
226 			bp->b_pages = bp->b_page_array;
227 		} else {
228 			bp->b_pages = kmem_alloc(sizeof(struct page *) *
229 						 page_count, KM_NOFS);
230 			if (bp->b_pages == NULL)
231 				return -ENOMEM;
232 		}
233 		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
234 	}
235 	return 0;
236 }
237 
238 /*
239  *	Frees b_pages if it was allocated.
240  */
241 STATIC void
242 _xfs_buf_free_pages(
243 	xfs_buf_t	*bp)
244 {
245 	if (bp->b_pages != bp->b_page_array) {
246 		kmem_free(bp->b_pages);
247 		bp->b_pages = NULL;
248 	}
249 }
250 
251 /*
252  *	Releases the specified buffer.
253  *
254  * 	The modification state of any associated pages is left unchanged.
255  * 	The buffer must not be on any hash - use xfs_buf_rele instead for
256  * 	hashed and refcounted buffers
257  */
258 void
259 xfs_buf_free(
260 	xfs_buf_t		*bp)
261 {
262 	trace_xfs_buf_free(bp, _RET_IP_);
263 
264 	ASSERT(list_empty(&bp->b_lru));
265 
266 	if (bp->b_flags & _XBF_PAGES) {
267 		uint		i;
268 
269 		if (xfs_buf_is_vmapped(bp))
270 			vm_unmap_ram(bp->b_addr - bp->b_offset,
271 					bp->b_page_count);
272 
273 		for (i = 0; i < bp->b_page_count; i++) {
274 			struct page	*page = bp->b_pages[i];
275 
276 			__free_page(page);
277 		}
278 	} else if (bp->b_flags & _XBF_KMEM)
279 		kmem_free(bp->b_addr);
280 	_xfs_buf_free_pages(bp);
281 	xfs_buf_free_maps(bp);
282 	kmem_zone_free(xfs_buf_zone, bp);
283 }
284 
285 /*
286  * Allocates all the pages for buffer in question and builds it's page list.
287  */
288 STATIC int
289 xfs_buf_allocate_memory(
290 	xfs_buf_t		*bp,
291 	uint			flags)
292 {
293 	size_t			size;
294 	size_t			nbytes, offset;
295 	gfp_t			gfp_mask = xb_to_gfp(flags);
296 	unsigned short		page_count, i;
297 	xfs_off_t		start, end;
298 	int			error;
299 
300 	/*
301 	 * for buffers that are contained within a single page, just allocate
302 	 * the memory from the heap - there's no need for the complexity of
303 	 * page arrays to keep allocation down to order 0.
304 	 */
305 	size = BBTOB(bp->b_length);
306 	if (size < PAGE_SIZE) {
307 		bp->b_addr = kmem_alloc(size, KM_NOFS);
308 		if (!bp->b_addr) {
309 			/* low memory - use alloc_page loop instead */
310 			goto use_alloc_page;
311 		}
312 
313 		if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
314 		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
315 			/* b_addr spans two pages - use alloc_page instead */
316 			kmem_free(bp->b_addr);
317 			bp->b_addr = NULL;
318 			goto use_alloc_page;
319 		}
320 		bp->b_offset = offset_in_page(bp->b_addr);
321 		bp->b_pages = bp->b_page_array;
322 		bp->b_pages[0] = virt_to_page(bp->b_addr);
323 		bp->b_page_count = 1;
324 		bp->b_flags |= _XBF_KMEM;
325 		return 0;
326 	}
327 
328 use_alloc_page:
329 	start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
330 	end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
331 								>> PAGE_SHIFT;
332 	page_count = end - start;
333 	error = _xfs_buf_get_pages(bp, page_count, flags);
334 	if (unlikely(error))
335 		return error;
336 
337 	offset = bp->b_offset;
338 	bp->b_flags |= _XBF_PAGES;
339 
340 	for (i = 0; i < bp->b_page_count; i++) {
341 		struct page	*page;
342 		uint		retries = 0;
343 retry:
344 		page = alloc_page(gfp_mask);
345 		if (unlikely(page == NULL)) {
346 			if (flags & XBF_READ_AHEAD) {
347 				bp->b_page_count = i;
348 				error = ENOMEM;
349 				goto out_free_pages;
350 			}
351 
352 			/*
353 			 * This could deadlock.
354 			 *
355 			 * But until all the XFS lowlevel code is revamped to
356 			 * handle buffer allocation failures we can't do much.
357 			 */
358 			if (!(++retries % 100))
359 				xfs_err(NULL,
360 		"possible memory allocation deadlock in %s (mode:0x%x)",
361 					__func__, gfp_mask);
362 
363 			XFS_STATS_INC(xb_page_retries);
364 			congestion_wait(BLK_RW_ASYNC, HZ/50);
365 			goto retry;
366 		}
367 
368 		XFS_STATS_INC(xb_page_found);
369 
370 		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
371 		size -= nbytes;
372 		bp->b_pages[i] = page;
373 		offset = 0;
374 	}
375 	return 0;
376 
377 out_free_pages:
378 	for (i = 0; i < bp->b_page_count; i++)
379 		__free_page(bp->b_pages[i]);
380 	return error;
381 }
382 
383 /*
384  *	Map buffer into kernel address-space if necessary.
385  */
386 STATIC int
387 _xfs_buf_map_pages(
388 	xfs_buf_t		*bp,
389 	uint			flags)
390 {
391 	ASSERT(bp->b_flags & _XBF_PAGES);
392 	if (bp->b_page_count == 1) {
393 		/* A single page buffer is always mappable */
394 		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
395 	} else if (flags & XBF_UNMAPPED) {
396 		bp->b_addr = NULL;
397 	} else {
398 		int retried = 0;
399 
400 		do {
401 			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
402 						-1, PAGE_KERNEL);
403 			if (bp->b_addr)
404 				break;
405 			vm_unmap_aliases();
406 		} while (retried++ <= 1);
407 
408 		if (!bp->b_addr)
409 			return -ENOMEM;
410 		bp->b_addr += bp->b_offset;
411 	}
412 
413 	return 0;
414 }
415 
416 /*
417  *	Finding and Reading Buffers
418  */
419 
420 /*
421  *	Look up, and creates if absent, a lockable buffer for
422  *	a given range of an inode.  The buffer is returned
423  *	locked.	No I/O is implied by this call.
424  */
425 xfs_buf_t *
426 _xfs_buf_find(
427 	struct xfs_buftarg	*btp,
428 	struct xfs_buf_map	*map,
429 	int			nmaps,
430 	xfs_buf_flags_t		flags,
431 	xfs_buf_t		*new_bp)
432 {
433 	size_t			numbytes;
434 	struct xfs_perag	*pag;
435 	struct rb_node		**rbp;
436 	struct rb_node		*parent;
437 	xfs_buf_t		*bp;
438 	xfs_daddr_t		blkno = map[0].bm_bn;
439 	xfs_daddr_t		eofs;
440 	int			numblks = 0;
441 	int			i;
442 
443 	for (i = 0; i < nmaps; i++)
444 		numblks += map[i].bm_len;
445 	numbytes = BBTOB(numblks);
446 
447 	/* Check for IOs smaller than the sector size / not sector aligned */
448 	ASSERT(!(numbytes < (1 << btp->bt_sshift)));
449 	ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));
450 
451 	/*
452 	 * Corrupted block numbers can get through to here, unfortunately, so we
453 	 * have to check that the buffer falls within the filesystem bounds.
454 	 */
455 	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
456 	if (blkno >= eofs) {
457 		/*
458 		 * XXX (dgc): we should really be returning EFSCORRUPTED here,
459 		 * but none of the higher level infrastructure supports
460 		 * returning a specific error on buffer lookup failures.
461 		 */
462 		xfs_alert(btp->bt_mount,
463 			  "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
464 			  __func__, blkno, eofs);
465 		WARN_ON(1);
466 		return NULL;
467 	}
468 
469 	/* get tree root */
470 	pag = xfs_perag_get(btp->bt_mount,
471 				xfs_daddr_to_agno(btp->bt_mount, blkno));
472 
473 	/* walk tree */
474 	spin_lock(&pag->pag_buf_lock);
475 	rbp = &pag->pag_buf_tree.rb_node;
476 	parent = NULL;
477 	bp = NULL;
478 	while (*rbp) {
479 		parent = *rbp;
480 		bp = rb_entry(parent, struct xfs_buf, b_rbnode);
481 
482 		if (blkno < bp->b_bn)
483 			rbp = &(*rbp)->rb_left;
484 		else if (blkno > bp->b_bn)
485 			rbp = &(*rbp)->rb_right;
486 		else {
487 			/*
488 			 * found a block number match. If the range doesn't
489 			 * match, the only way this is allowed is if the buffer
490 			 * in the cache is stale and the transaction that made
491 			 * it stale has not yet committed. i.e. we are
492 			 * reallocating a busy extent. Skip this buffer and
493 			 * continue searching to the right for an exact match.
494 			 */
495 			if (bp->b_length != numblks) {
496 				ASSERT(bp->b_flags & XBF_STALE);
497 				rbp = &(*rbp)->rb_right;
498 				continue;
499 			}
500 			atomic_inc(&bp->b_hold);
501 			goto found;
502 		}
503 	}
504 
505 	/* No match found */
506 	if (new_bp) {
507 		rb_link_node(&new_bp->b_rbnode, parent, rbp);
508 		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
509 		/* the buffer keeps the perag reference until it is freed */
510 		new_bp->b_pag = pag;
511 		spin_unlock(&pag->pag_buf_lock);
512 	} else {
513 		XFS_STATS_INC(xb_miss_locked);
514 		spin_unlock(&pag->pag_buf_lock);
515 		xfs_perag_put(pag);
516 	}
517 	return new_bp;
518 
519 found:
520 	spin_unlock(&pag->pag_buf_lock);
521 	xfs_perag_put(pag);
522 
523 	if (!xfs_buf_trylock(bp)) {
524 		if (flags & XBF_TRYLOCK) {
525 			xfs_buf_rele(bp);
526 			XFS_STATS_INC(xb_busy_locked);
527 			return NULL;
528 		}
529 		xfs_buf_lock(bp);
530 		XFS_STATS_INC(xb_get_locked_waited);
531 	}
532 
533 	/*
534 	 * if the buffer is stale, clear all the external state associated with
535 	 * it. We need to keep flags such as how we allocated the buffer memory
536 	 * intact here.
537 	 */
538 	if (bp->b_flags & XBF_STALE) {
539 		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
540 		ASSERT(bp->b_iodone == NULL);
541 		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
542 		bp->b_ops = NULL;
543 	}
544 
545 	trace_xfs_buf_find(bp, flags, _RET_IP_);
546 	XFS_STATS_INC(xb_get_locked);
547 	return bp;
548 }
549 
550 /*
551  * Assembles a buffer covering the specified range. The code is optimised for
552  * cache hits, as metadata intensive workloads will see 3 orders of magnitude
553  * more hits than misses.
554  */
555 struct xfs_buf *
556 xfs_buf_get_map(
557 	struct xfs_buftarg	*target,
558 	struct xfs_buf_map	*map,
559 	int			nmaps,
560 	xfs_buf_flags_t		flags)
561 {
562 	struct xfs_buf		*bp;
563 	struct xfs_buf		*new_bp;
564 	int			error = 0;
565 
566 	bp = _xfs_buf_find(target, map, nmaps, flags, NULL);
567 	if (likely(bp))
568 		goto found;
569 
570 	new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
571 	if (unlikely(!new_bp))
572 		return NULL;
573 
574 	error = xfs_buf_allocate_memory(new_bp, flags);
575 	if (error) {
576 		xfs_buf_free(new_bp);
577 		return NULL;
578 	}
579 
580 	bp = _xfs_buf_find(target, map, nmaps, flags, new_bp);
581 	if (!bp) {
582 		xfs_buf_free(new_bp);
583 		return NULL;
584 	}
585 
586 	if (bp != new_bp)
587 		xfs_buf_free(new_bp);
588 
589 found:
590 	if (!bp->b_addr) {
591 		error = _xfs_buf_map_pages(bp, flags);
592 		if (unlikely(error)) {
593 			xfs_warn(target->bt_mount,
594 				"%s: failed to map pagesn", __func__);
595 			xfs_buf_relse(bp);
596 			return NULL;
597 		}
598 	}
599 
600 	XFS_STATS_INC(xb_get);
601 	trace_xfs_buf_get(bp, flags, _RET_IP_);
602 	return bp;
603 }
604 
605 STATIC int
606 _xfs_buf_read(
607 	xfs_buf_t		*bp,
608 	xfs_buf_flags_t		flags)
609 {
610 	ASSERT(!(flags & XBF_WRITE));
611 	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
612 
613 	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
614 	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
615 
616 	xfs_buf_iorequest(bp);
617 	if (flags & XBF_ASYNC)
618 		return 0;
619 	return xfs_buf_iowait(bp);
620 }
621 
622 xfs_buf_t *
623 xfs_buf_read_map(
624 	struct xfs_buftarg	*target,
625 	struct xfs_buf_map	*map,
626 	int			nmaps,
627 	xfs_buf_flags_t		flags,
628 	const struct xfs_buf_ops *ops)
629 {
630 	struct xfs_buf		*bp;
631 
632 	flags |= XBF_READ;
633 
634 	bp = xfs_buf_get_map(target, map, nmaps, flags);
635 	if (bp) {
636 		trace_xfs_buf_read(bp, flags, _RET_IP_);
637 
638 		if (!XFS_BUF_ISDONE(bp)) {
639 			XFS_STATS_INC(xb_get_read);
640 			bp->b_ops = ops;
641 			_xfs_buf_read(bp, flags);
642 		} else if (flags & XBF_ASYNC) {
643 			/*
644 			 * Read ahead call which is already satisfied,
645 			 * drop the buffer
646 			 */
647 			xfs_buf_relse(bp);
648 			return NULL;
649 		} else {
650 			/* We do not want read in the flags */
651 			bp->b_flags &= ~XBF_READ;
652 		}
653 	}
654 
655 	return bp;
656 }
657 
658 /*
659  *	If we are not low on memory then do the readahead in a deadlock
660  *	safe manner.
661  */
662 void
663 xfs_buf_readahead_map(
664 	struct xfs_buftarg	*target,
665 	struct xfs_buf_map	*map,
666 	int			nmaps,
667 	const struct xfs_buf_ops *ops)
668 {
669 	if (bdi_read_congested(target->bt_bdi))
670 		return;
671 
672 	xfs_buf_read_map(target, map, nmaps,
673 		     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
674 }
675 
676 /*
677  * Read an uncached buffer from disk. Allocates and returns a locked
678  * buffer containing the disk contents or nothing.
679  */
680 struct xfs_buf *
681 xfs_buf_read_uncached(
682 	struct xfs_buftarg	*target,
683 	xfs_daddr_t		daddr,
684 	size_t			numblks,
685 	int			flags,
686 	const struct xfs_buf_ops *ops)
687 {
688 	struct xfs_buf		*bp;
689 
690 	bp = xfs_buf_get_uncached(target, numblks, flags);
691 	if (!bp)
692 		return NULL;
693 
694 	/* set up the buffer for a read IO */
695 	ASSERT(bp->b_map_count == 1);
696 	bp->b_bn = daddr;
697 	bp->b_maps[0].bm_bn = daddr;
698 	bp->b_flags |= XBF_READ;
699 	bp->b_ops = ops;
700 
701 	if (XFS_FORCED_SHUTDOWN(target->bt_mount)) {
702 		xfs_buf_relse(bp);
703 		return NULL;
704 	}
705 	xfs_buf_iorequest(bp);
706 	xfs_buf_iowait(bp);
707 	return bp;
708 }
709 
710 /*
711  * Return a buffer allocated as an empty buffer and associated to external
712  * memory via xfs_buf_associate_memory() back to it's empty state.
713  */
714 void
715 xfs_buf_set_empty(
716 	struct xfs_buf		*bp,
717 	size_t			numblks)
718 {
719 	if (bp->b_pages)
720 		_xfs_buf_free_pages(bp);
721 
722 	bp->b_pages = NULL;
723 	bp->b_page_count = 0;
724 	bp->b_addr = NULL;
725 	bp->b_length = numblks;
726 	bp->b_io_length = numblks;
727 
728 	ASSERT(bp->b_map_count == 1);
729 	bp->b_bn = XFS_BUF_DADDR_NULL;
730 	bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL;
731 	bp->b_maps[0].bm_len = bp->b_length;
732 }
733 
/*
 * Find the page behind a kernel virtual address, using the lookup that
 * matches whether or not the address lies in the vmalloc area.
 */
static inline struct page *
mem_to_page(
	void			*addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}
744 
745 int
746 xfs_buf_associate_memory(
747 	xfs_buf_t		*bp,
748 	void			*mem,
749 	size_t			len)
750 {
751 	int			rval;
752 	int			i = 0;
753 	unsigned long		pageaddr;
754 	unsigned long		offset;
755 	size_t			buflen;
756 	int			page_count;
757 
758 	pageaddr = (unsigned long)mem & PAGE_MASK;
759 	offset = (unsigned long)mem - pageaddr;
760 	buflen = PAGE_ALIGN(len + offset);
761 	page_count = buflen >> PAGE_SHIFT;
762 
763 	/* Free any previous set of page pointers */
764 	if (bp->b_pages)
765 		_xfs_buf_free_pages(bp);
766 
767 	bp->b_pages = NULL;
768 	bp->b_addr = mem;
769 
770 	rval = _xfs_buf_get_pages(bp, page_count, 0);
771 	if (rval)
772 		return rval;
773 
774 	bp->b_offset = offset;
775 
776 	for (i = 0; i < bp->b_page_count; i++) {
777 		bp->b_pages[i] = mem_to_page((void *)pageaddr);
778 		pageaddr += PAGE_SIZE;
779 	}
780 
781 	bp->b_io_length = BTOBB(len);
782 	bp->b_length = BTOBB(buflen);
783 
784 	return 0;
785 }
786 
787 xfs_buf_t *
788 xfs_buf_get_uncached(
789 	struct xfs_buftarg	*target,
790 	size_t			numblks,
791 	int			flags)
792 {
793 	unsigned long		page_count;
794 	int			error, i;
795 	struct xfs_buf		*bp;
796 	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
797 
798 	bp = _xfs_buf_alloc(target, &map, 1, 0);
799 	if (unlikely(bp == NULL))
800 		goto fail;
801 
802 	page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
803 	error = _xfs_buf_get_pages(bp, page_count, 0);
804 	if (error)
805 		goto fail_free_buf;
806 
807 	for (i = 0; i < page_count; i++) {
808 		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
809 		if (!bp->b_pages[i])
810 			goto fail_free_mem;
811 	}
812 	bp->b_flags |= _XBF_PAGES;
813 
814 	error = _xfs_buf_map_pages(bp, 0);
815 	if (unlikely(error)) {
816 		xfs_warn(target->bt_mount,
817 			"%s: failed to map pages", __func__);
818 		goto fail_free_mem;
819 	}
820 
821 	trace_xfs_buf_get_uncached(bp, _RET_IP_);
822 	return bp;
823 
824  fail_free_mem:
825 	while (--i >= 0)
826 		__free_page(bp->b_pages[i]);
827 	_xfs_buf_free_pages(bp);
828  fail_free_buf:
829 	xfs_buf_free_maps(bp);
830 	kmem_zone_free(xfs_buf_zone, bp);
831  fail:
832 	return NULL;
833 }
834 
835 /*
836  *	Increment reference count on buffer, to hold the buffer concurrently
837  *	with another thread which may release (free) the buffer asynchronously.
838  *	Must hold the buffer already to call this function.
839  */
840 void
841 xfs_buf_hold(
842 	xfs_buf_t		*bp)
843 {
844 	trace_xfs_buf_hold(bp, _RET_IP_);
845 	atomic_inc(&bp->b_hold);
846 }
847 
848 /*
849  *	Releases a hold on the specified buffer.  If the
850  *	the hold count is 1, calls xfs_buf_free.
851  */
852 void
853 xfs_buf_rele(
854 	xfs_buf_t		*bp)
855 {
856 	struct xfs_perag	*pag = bp->b_pag;
857 
858 	trace_xfs_buf_rele(bp, _RET_IP_);
859 
860 	if (!pag) {
861 		ASSERT(list_empty(&bp->b_lru));
862 		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
863 		if (atomic_dec_and_test(&bp->b_hold))
864 			xfs_buf_free(bp);
865 		return;
866 	}
867 
868 	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
869 
870 	ASSERT(atomic_read(&bp->b_hold) > 0);
871 	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
872 		spin_lock(&bp->b_lock);
873 		if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
874 			/*
875 			 * If the buffer is added to the LRU take a new
876 			 * reference to the buffer for the LRU and clear the
877 			 * (now stale) dispose list state flag
878 			 */
879 			if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
880 				bp->b_state &= ~XFS_BSTATE_DISPOSE;
881 				atomic_inc(&bp->b_hold);
882 			}
883 			spin_unlock(&bp->b_lock);
884 			spin_unlock(&pag->pag_buf_lock);
885 		} else {
886 			/*
887 			 * most of the time buffers will already be removed from
888 			 * the LRU, so optimise that case by checking for the
889 			 * XFS_BSTATE_DISPOSE flag indicating the last list the
890 			 * buffer was on was the disposal list
891 			 */
892 			if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
893 				list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
894 			} else {
895 				ASSERT(list_empty(&bp->b_lru));
896 			}
897 			spin_unlock(&bp->b_lock);
898 
899 			ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
900 			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
901 			spin_unlock(&pag->pag_buf_lock);
902 			xfs_perag_put(pag);
903 			xfs_buf_free(bp);
904 		}
905 	}
906 }
907 
908 
909 /*
910  *	Lock a buffer object, if it is not already locked.
911  *
912  *	If we come across a stale, pinned, locked buffer, we know that we are
913  *	being asked to lock a buffer that has been reallocated. Because it is
914  *	pinned, we know that the log has not been pushed to disk and hence it
915  *	will still be locked.  Rather than continuing to have trylock attempts
916  *	fail until someone else pushes the log, push it ourselves before
917  *	returning.  This means that the xfsaild will not get stuck trying
918  *	to push on stale inode buffers.
919  */
920 int
921 xfs_buf_trylock(
922 	struct xfs_buf		*bp)
923 {
924 	int			locked;
925 
926 	locked = down_trylock(&bp->b_sema) == 0;
927 	if (locked)
928 		XB_SET_OWNER(bp);
929 
930 	trace_xfs_buf_trylock(bp, _RET_IP_);
931 	return locked;
932 }
933 
934 /*
935  *	Lock a buffer object.
936  *
937  *	If we come across a stale, pinned, locked buffer, we know that we
938  *	are being asked to lock a buffer that has been reallocated. Because
939  *	it is pinned, we know that the log has not been pushed to disk and
940  *	hence it will still be locked. Rather than sleeping until someone
941  *	else pushes the log, push it ourselves before trying to get the lock.
942  */
943 void
944 xfs_buf_lock(
945 	struct xfs_buf		*bp)
946 {
947 	trace_xfs_buf_lock(bp, _RET_IP_);
948 
949 	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
950 		xfs_log_force(bp->b_target->bt_mount, 0);
951 	down(&bp->b_sema);
952 	XB_SET_OWNER(bp);
953 
954 	trace_xfs_buf_lock_done(bp, _RET_IP_);
955 }
956 
957 void
958 xfs_buf_unlock(
959 	struct xfs_buf		*bp)
960 {
961 	XB_CLEAR_OWNER(bp);
962 	up(&bp->b_sema);
963 
964 	trace_xfs_buf_unlock(bp, _RET_IP_);
965 }
966 
967 STATIC void
968 xfs_buf_wait_unpin(
969 	xfs_buf_t		*bp)
970 {
971 	DECLARE_WAITQUEUE	(wait, current);
972 
973 	if (atomic_read(&bp->b_pin_count) == 0)
974 		return;
975 
976 	add_wait_queue(&bp->b_waiters, &wait);
977 	for (;;) {
978 		set_current_state(TASK_UNINTERRUPTIBLE);
979 		if (atomic_read(&bp->b_pin_count) == 0)
980 			break;
981 		io_schedule();
982 	}
983 	remove_wait_queue(&bp->b_waiters, &wait);
984 	set_current_state(TASK_RUNNING);
985 }
986 
987 /*
988  *	Buffer Utility Routines
989  */
990 
991 STATIC void
992 xfs_buf_iodone_work(
993 	struct work_struct	*work)
994 {
995 	struct xfs_buf		*bp =
996 		container_of(work, xfs_buf_t, b_iodone_work);
997 	bool			read = !!(bp->b_flags & XBF_READ);
998 
999 	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1000 
1001 	/* only validate buffers that were read without errors */
1002 	if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE))
1003 		bp->b_ops->verify_read(bp);
1004 
1005 	if (bp->b_iodone)
1006 		(*(bp->b_iodone))(bp);
1007 	else if (bp->b_flags & XBF_ASYNC)
1008 		xfs_buf_relse(bp);
1009 	else {
1010 		ASSERT(read && bp->b_ops);
1011 		complete(&bp->b_iowait);
1012 	}
1013 }
1014 
1015 void
1016 xfs_buf_ioend(
1017 	struct xfs_buf	*bp,
1018 	int		schedule)
1019 {
1020 	bool		read = !!(bp->b_flags & XBF_READ);
1021 
1022 	trace_xfs_buf_iodone(bp, _RET_IP_);
1023 
1024 	if (bp->b_error == 0)
1025 		bp->b_flags |= XBF_DONE;
1026 
1027 	if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
1028 		if (schedule) {
1029 			INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
1030 			queue_work(xfslogd_workqueue, &bp->b_iodone_work);
1031 		} else {
1032 			xfs_buf_iodone_work(&bp->b_iodone_work);
1033 		}
1034 	} else {
1035 		bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1036 		complete(&bp->b_iowait);
1037 	}
1038 }
1039 
1040 void
1041 xfs_buf_ioerror(
1042 	xfs_buf_t		*bp,
1043 	int			error)
1044 {
1045 	ASSERT(error >= 0 && error <= 0xffff);
1046 	bp->b_error = (unsigned short)error;
1047 	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
1048 }
1049 
1050 void
1051 xfs_buf_ioerror_alert(
1052 	struct xfs_buf		*bp,
1053 	const char		*func)
1054 {
1055 	xfs_alert(bp->b_target->bt_mount,
1056 "metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
1057 		(__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length);
1058 }
1059 
1060 /*
1061  * Called when we want to stop a buffer from getting written or read.
1062  * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
1063  * so that the proper iodone callbacks get called.
1064  */
1065 STATIC int
1066 xfs_bioerror(
1067 	xfs_buf_t *bp)
1068 {
1069 #ifdef XFSERRORDEBUG
1070 	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
1071 #endif
1072 
1073 	/*
1074 	 * No need to wait until the buffer is unpinned, we aren't flushing it.
1075 	 */
1076 	xfs_buf_ioerror(bp, EIO);
1077 
1078 	/*
1079 	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
1080 	 */
1081 	XFS_BUF_UNREAD(bp);
1082 	XFS_BUF_UNDONE(bp);
1083 	xfs_buf_stale(bp);
1084 
1085 	xfs_buf_ioend(bp, 0);
1086 
1087 	return EIO;
1088 }
1089 
1090 /*
1091  * Same as xfs_bioerror, except that we are releasing the buffer
1092  * here ourselves, and avoiding the xfs_buf_ioend call.
1093  * This is meant for userdata errors; metadata bufs come with
1094  * iodone functions attached, so that we can track down errors.
1095  */
1096 int
1097 xfs_bioerror_relse(
1098 	struct xfs_buf	*bp)
1099 {
1100 	int64_t		fl = bp->b_flags;
1101 	/*
1102 	 * No need to wait until the buffer is unpinned.
1103 	 * We aren't flushing it.
1104 	 *
1105 	 * chunkhold expects B_DONE to be set, whether
1106 	 * we actually finish the I/O or not. We don't want to
1107 	 * change that interface.
1108 	 */
1109 	XFS_BUF_UNREAD(bp);
1110 	XFS_BUF_DONE(bp);
1111 	xfs_buf_stale(bp);
1112 	bp->b_iodone = NULL;
1113 	if (!(fl & XBF_ASYNC)) {
1114 		/*
1115 		 * Mark b_error and B_ERROR _both_.
1116 		 * Lot's of chunkcache code assumes that.
1117 		 * There's no reason to mark error for
1118 		 * ASYNC buffers.
1119 		 */
1120 		xfs_buf_ioerror(bp, EIO);
1121 		complete(&bp->b_iowait);
1122 	} else {
1123 		xfs_buf_relse(bp);
1124 	}
1125 
1126 	return EIO;
1127 }
1128 
1129 STATIC int
1130 xfs_bdstrat_cb(
1131 	struct xfs_buf	*bp)
1132 {
1133 	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
1134 		trace_xfs_bdstrat_shut(bp, _RET_IP_);
1135 		/*
1136 		 * Metadata write that didn't get logged but
1137 		 * written delayed anyway. These aren't associated
1138 		 * with a transaction, and can be ignored.
1139 		 */
1140 		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
1141 			return xfs_bioerror_relse(bp);
1142 		else
1143 			return xfs_bioerror(bp);
1144 	}
1145 
1146 	xfs_buf_iorequest(bp);
1147 	return 0;
1148 }
1149 
1150 int
1151 xfs_bwrite(
1152 	struct xfs_buf		*bp)
1153 {
1154 	int			error;
1155 
1156 	ASSERT(xfs_buf_islocked(bp));
1157 
1158 	bp->b_flags |= XBF_WRITE;
1159 	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL);
1160 
1161 	xfs_bdstrat_cb(bp);
1162 
1163 	error = xfs_buf_iowait(bp);
1164 	if (error) {
1165 		xfs_force_shutdown(bp->b_target->bt_mount,
1166 				   SHUTDOWN_META_IO_ERROR);
1167 	}
1168 	return error;
1169 }
1170 
1171 STATIC void
1172 _xfs_buf_ioend(
1173 	xfs_buf_t		*bp,
1174 	int			schedule)
1175 {
1176 	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1177 		xfs_buf_ioend(bp, schedule);
1178 }
1179 
1180 STATIC void
1181 xfs_buf_bio_end_io(
1182 	struct bio		*bio,
1183 	int			error)
1184 {
1185 	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;
1186 
1187 	/*
1188 	 * don't overwrite existing errors - otherwise we can lose errors on
1189 	 * buffers that require multiple bios to complete.
1190 	 */
1191 	if (!bp->b_error)
1192 		xfs_buf_ioerror(bp, -error);
1193 
1194 	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1195 		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1196 
1197 	_xfs_buf_ioend(bp, 1);
1198 	bio_put(bio);
1199 }
1200 
1201 static void
1202 xfs_buf_ioapply_map(
1203 	struct xfs_buf	*bp,
1204 	int		map,
1205 	int		*buf_offset,
1206 	int		*count,
1207 	int		rw)
1208 {
1209 	int		page_index;
1210 	int		total_nr_pages = bp->b_page_count;
1211 	int		nr_pages;
1212 	struct bio	*bio;
1213 	sector_t	sector =  bp->b_maps[map].bm_bn;
1214 	int		size;
1215 	int		offset;
1216 
1217 	total_nr_pages = bp->b_page_count;
1218 
1219 	/* skip the pages in the buffer before the start offset */
1220 	page_index = 0;
1221 	offset = *buf_offset;
1222 	while (offset >= PAGE_SIZE) {
1223 		page_index++;
1224 		offset -= PAGE_SIZE;
1225 	}
1226 
1227 	/*
1228 	 * Limit the IO size to the length of the current vector, and update the
1229 	 * remaining IO count for the next time around.
1230 	 */
1231 	size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
1232 	*count -= size;
1233 	*buf_offset += size;
1234 
1235 next_chunk:
1236 	atomic_inc(&bp->b_io_remaining);
1237 	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
1238 	if (nr_pages > total_nr_pages)
1239 		nr_pages = total_nr_pages;
1240 
1241 	bio = bio_alloc(GFP_NOIO, nr_pages);
1242 	bio->bi_bdev = bp->b_target->bt_bdev;
1243 	bio->bi_sector = sector;
1244 	bio->bi_end_io = xfs_buf_bio_end_io;
1245 	bio->bi_private = bp;
1246 
1247 
1248 	for (; size && nr_pages; nr_pages--, page_index++) {
1249 		int	rbytes, nbytes = PAGE_SIZE - offset;
1250 
1251 		if (nbytes > size)
1252 			nbytes = size;
1253 
1254 		rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
1255 				      offset);
1256 		if (rbytes < nbytes)
1257 			break;
1258 
1259 		offset = 0;
1260 		sector += BTOBB(nbytes);
1261 		size -= nbytes;
1262 		total_nr_pages--;
1263 	}
1264 
1265 	if (likely(bio->bi_size)) {
1266 		if (xfs_buf_is_vmapped(bp)) {
1267 			flush_kernel_vmap_range(bp->b_addr,
1268 						xfs_buf_vmap_len(bp));
1269 		}
1270 		submit_bio(rw, bio);
1271 		if (size)
1272 			goto next_chunk;
1273 	} else {
1274 		/*
1275 		 * This is guaranteed not to be the last io reference count
1276 		 * because the caller (xfs_buf_iorequest) holds a count itself.
1277 		 */
1278 		atomic_dec(&bp->b_io_remaining);
1279 		xfs_buf_ioerror(bp, EIO);
1280 		bio_put(bio);
1281 	}
1282 
1283 }
1284 
1285 STATIC void
1286 _xfs_buf_ioapply(
1287 	struct xfs_buf	*bp)
1288 {
1289 	struct blk_plug	plug;
1290 	int		rw;
1291 	int		offset;
1292 	int		size;
1293 	int		i;
1294 
1295 	/*
1296 	 * Make sure we capture only current IO errors rather than stale errors
1297 	 * left over from previous use of the buffer (e.g. failed readahead).
1298 	 */
1299 	bp->b_error = 0;
1300 
1301 	if (bp->b_flags & XBF_WRITE) {
1302 		if (bp->b_flags & XBF_SYNCIO)
1303 			rw = WRITE_SYNC;
1304 		else
1305 			rw = WRITE;
1306 		if (bp->b_flags & XBF_FUA)
1307 			rw |= REQ_FUA;
1308 		if (bp->b_flags & XBF_FLUSH)
1309 			rw |= REQ_FLUSH;
1310 
1311 		/*
1312 		 * Run the write verifier callback function if it exists. If
1313 		 * this function fails it will mark the buffer with an error and
1314 		 * the IO should not be dispatched.
1315 		 */
1316 		if (bp->b_ops) {
1317 			bp->b_ops->verify_write(bp);
1318 			if (bp->b_error) {
1319 				xfs_force_shutdown(bp->b_target->bt_mount,
1320 						   SHUTDOWN_CORRUPT_INCORE);
1321 				return;
1322 			}
1323 		}
1324 	} else if (bp->b_flags & XBF_READ_AHEAD) {
1325 		rw = READA;
1326 	} else {
1327 		rw = READ;
1328 	}
1329 
1330 	/* we only use the buffer cache for meta-data */
1331 	rw |= REQ_META;
1332 
1333 	/*
1334 	 * Walk all the vectors issuing IO on them. Set up the initial offset
1335 	 * into the buffer and the desired IO size before we start -
1336 	 * _xfs_buf_ioapply_vec() will modify them appropriately for each
1337 	 * subsequent call.
1338 	 */
1339 	offset = bp->b_offset;
1340 	size = BBTOB(bp->b_io_length);
1341 	blk_start_plug(&plug);
1342 	for (i = 0; i < bp->b_map_count; i++) {
1343 		xfs_buf_ioapply_map(bp, i, &offset, &size, rw);
1344 		if (bp->b_error)
1345 			break;
1346 		if (size <= 0)
1347 			break;	/* all done */
1348 	}
1349 	blk_finish_plug(&plug);
1350 }
1351 
1352 void
1353 xfs_buf_iorequest(
1354 	xfs_buf_t		*bp)
1355 {
1356 	trace_xfs_buf_iorequest(bp, _RET_IP_);
1357 
1358 	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
1359 
1360 	if (bp->b_flags & XBF_WRITE)
1361 		xfs_buf_wait_unpin(bp);
1362 	xfs_buf_hold(bp);
1363 
1364 	/* Set the count to 1 initially, this will stop an I/O
1365 	 * completion callout which happens before we have started
1366 	 * all the I/O from calling xfs_buf_ioend too early.
1367 	 */
1368 	atomic_set(&bp->b_io_remaining, 1);
1369 	_xfs_buf_ioapply(bp);
1370 	_xfs_buf_ioend(bp, 1);
1371 
1372 	xfs_buf_rele(bp);
1373 }
1374 
1375 /*
1376  * Waits for I/O to complete on the buffer supplied.  It returns immediately if
1377  * no I/O is pending or there is already a pending error on the buffer.  It
1378  * returns the I/O error code, if any, or 0 if there was no error.
1379  */
1380 int
1381 xfs_buf_iowait(
1382 	xfs_buf_t		*bp)
1383 {
1384 	trace_xfs_buf_iowait(bp, _RET_IP_);
1385 
1386 	if (!bp->b_error)
1387 		wait_for_completion(&bp->b_iowait);
1388 
1389 	trace_xfs_buf_iowait_done(bp, _RET_IP_);
1390 	return bp->b_error;
1391 }
1392 
1393 xfs_caddr_t
1394 xfs_buf_offset(
1395 	xfs_buf_t		*bp,
1396 	size_t			offset)
1397 {
1398 	struct page		*page;
1399 
1400 	if (bp->b_addr)
1401 		return bp->b_addr + offset;
1402 
1403 	offset += bp->b_offset;
1404 	page = bp->b_pages[offset >> PAGE_SHIFT];
1405 	return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
1406 }
1407 
1408 /*
1409  *	Move data into or out of a buffer.
1410  */
1411 void
1412 xfs_buf_iomove(
1413 	xfs_buf_t		*bp,	/* buffer to process		*/
1414 	size_t			boff,	/* starting buffer offset	*/
1415 	size_t			bsize,	/* length to copy		*/
1416 	void			*data,	/* data address			*/
1417 	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
1418 {
1419 	size_t			bend;
1420 
1421 	bend = boff + bsize;
1422 	while (boff < bend) {
1423 		struct page	*page;
1424 		int		page_index, page_offset, csize;
1425 
1426 		page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
1427 		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
1428 		page = bp->b_pages[page_index];
1429 		csize = min_t(size_t, PAGE_SIZE - page_offset,
1430 				      BBTOB(bp->b_io_length) - boff);
1431 
1432 		ASSERT((csize + page_offset) <= PAGE_SIZE);
1433 
1434 		switch (mode) {
1435 		case XBRW_ZERO:
1436 			memset(page_address(page) + page_offset, 0, csize);
1437 			break;
1438 		case XBRW_READ:
1439 			memcpy(data, page_address(page) + page_offset, csize);
1440 			break;
1441 		case XBRW_WRITE:
1442 			memcpy(page_address(page) + page_offset, data, csize);
1443 		}
1444 
1445 		boff += csize;
1446 		data += csize;
1447 	}
1448 }
1449 
1450 /*
1451  *	Handling of buffer targets (buftargs).
1452  */
1453 
1454 /*
1455  * Wait for any bufs with callbacks that have been submitted but have not yet
1456  * returned. These buffers will have an elevated hold count, so wait on those
1457  * while freeing all the buffers only held by the LRU.
1458  */
1459 static enum lru_status
1460 xfs_buftarg_wait_rele(
1461 	struct list_head	*item,
1462 	spinlock_t		*lru_lock,
1463 	void			*arg)
1464 
1465 {
1466 	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
1467 	struct list_head	*dispose = arg;
1468 
1469 	if (atomic_read(&bp->b_hold) > 1) {
1470 		/* need to wait, so skip it this pass */
1471 		trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
1472 		return LRU_SKIP;
1473 	}
1474 	if (!spin_trylock(&bp->b_lock))
1475 		return LRU_SKIP;
1476 
1477 	/*
1478 	 * clear the LRU reference count so the buffer doesn't get
1479 	 * ignored in xfs_buf_rele().
1480 	 */
1481 	atomic_set(&bp->b_lru_ref, 0);
1482 	bp->b_state |= XFS_BSTATE_DISPOSE;
1483 	list_move(item, dispose);
1484 	spin_unlock(&bp->b_lock);
1485 	return LRU_REMOVED;
1486 }
1487 
1488 void
1489 xfs_wait_buftarg(
1490 	struct xfs_buftarg	*btp)
1491 {
1492 	LIST_HEAD(dispose);
1493 	int loop = 0;
1494 
1495 	/* loop until there is nothing left on the lru list. */
1496 	while (list_lru_count(&btp->bt_lru)) {
1497 		list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
1498 			      &dispose, LONG_MAX);
1499 
1500 		while (!list_empty(&dispose)) {
1501 			struct xfs_buf *bp;
1502 			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1503 			list_del_init(&bp->b_lru);
1504 			if (bp->b_flags & XBF_WRITE_FAIL) {
1505 				xfs_alert(btp->bt_mount,
1506 "Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n"
1507 "Please run xfs_repair to determine the extent of the problem.",
1508 					(long long)bp->b_bn);
1509 			}
1510 			xfs_buf_rele(bp);
1511 		}
1512 		if (loop++ != 0)
1513 			delay(100);
1514 	}
1515 }
1516 
1517 static enum lru_status
1518 xfs_buftarg_isolate(
1519 	struct list_head	*item,
1520 	spinlock_t		*lru_lock,
1521 	void			*arg)
1522 {
1523 	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
1524 	struct list_head	*dispose = arg;
1525 
1526 	/*
1527 	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
1528 	 * If we fail to get the lock, just skip it.
1529 	 */
1530 	if (!spin_trylock(&bp->b_lock))
1531 		return LRU_SKIP;
1532 	/*
1533 	 * Decrement the b_lru_ref count unless the value is already
1534 	 * zero. If the value is already zero, we need to reclaim the
1535 	 * buffer, otherwise it gets another trip through the LRU.
1536 	 */
1537 	if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1538 		spin_unlock(&bp->b_lock);
1539 		return LRU_ROTATE;
1540 	}
1541 
1542 	bp->b_state |= XFS_BSTATE_DISPOSE;
1543 	list_move(item, dispose);
1544 	spin_unlock(&bp->b_lock);
1545 	return LRU_REMOVED;
1546 }
1547 
1548 static unsigned long
1549 xfs_buftarg_shrink_scan(
1550 	struct shrinker		*shrink,
1551 	struct shrink_control	*sc)
1552 {
1553 	struct xfs_buftarg	*btp = container_of(shrink,
1554 					struct xfs_buftarg, bt_shrinker);
1555 	LIST_HEAD(dispose);
1556 	unsigned long		freed;
1557 	unsigned long		nr_to_scan = sc->nr_to_scan;
1558 
1559 	freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate,
1560 				       &dispose, &nr_to_scan);
1561 
1562 	while (!list_empty(&dispose)) {
1563 		struct xfs_buf *bp;
1564 		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1565 		list_del_init(&bp->b_lru);
1566 		xfs_buf_rele(bp);
1567 	}
1568 
1569 	return freed;
1570 }
1571 
1572 static unsigned long
1573 xfs_buftarg_shrink_count(
1574 	struct shrinker		*shrink,
1575 	struct shrink_control	*sc)
1576 {
1577 	struct xfs_buftarg	*btp = container_of(shrink,
1578 					struct xfs_buftarg, bt_shrinker);
1579 	return list_lru_count_node(&btp->bt_lru, sc->nid);
1580 }
1581 
1582 void
1583 xfs_free_buftarg(
1584 	struct xfs_mount	*mp,
1585 	struct xfs_buftarg	*btp)
1586 {
1587 	unregister_shrinker(&btp->bt_shrinker);
1588 	list_lru_destroy(&btp->bt_lru);
1589 
1590 	if (mp->m_flags & XFS_MOUNT_BARRIER)
1591 		xfs_blkdev_issue_flush(btp);
1592 
1593 	kmem_free(btp);
1594 }
1595 
1596 STATIC int
1597 xfs_setsize_buftarg_flags(
1598 	xfs_buftarg_t		*btp,
1599 	unsigned int		blocksize,
1600 	unsigned int		sectorsize,
1601 	int			verbose)
1602 {
1603 	btp->bt_bsize = blocksize;
1604 	btp->bt_sshift = ffs(sectorsize) - 1;
1605 	btp->bt_smask = sectorsize - 1;
1606 
1607 	if (set_blocksize(btp->bt_bdev, sectorsize)) {
1608 		char name[BDEVNAME_SIZE];
1609 
1610 		bdevname(btp->bt_bdev, name);
1611 
1612 		xfs_warn(btp->bt_mount,
1613 			"Cannot set_blocksize to %u on device %s",
1614 			sectorsize, name);
1615 		return EINVAL;
1616 	}
1617 
1618 	return 0;
1619 }
1620 
1621 /*
1622  *	When allocating the initial buffer target we have not yet
1623  *	read in the superblock, so don't know what sized sectors
1624  *	are being used at this early stage.  Play safe.
1625  */
1626 STATIC int
1627 xfs_setsize_buftarg_early(
1628 	xfs_buftarg_t		*btp,
1629 	struct block_device	*bdev)
1630 {
1631 	return xfs_setsize_buftarg_flags(btp,
1632 			PAGE_SIZE, bdev_logical_block_size(bdev), 0);
1633 }
1634 
1635 int
1636 xfs_setsize_buftarg(
1637 	xfs_buftarg_t		*btp,
1638 	unsigned int		blocksize,
1639 	unsigned int		sectorsize)
1640 {
1641 	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
1642 }
1643 
1644 xfs_buftarg_t *
1645 xfs_alloc_buftarg(
1646 	struct xfs_mount	*mp,
1647 	struct block_device	*bdev,
1648 	int			external,
1649 	const char		*fsname)
1650 {
1651 	xfs_buftarg_t		*btp;
1652 
1653 	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);
1654 
1655 	btp->bt_mount = mp;
1656 	btp->bt_dev =  bdev->bd_dev;
1657 	btp->bt_bdev = bdev;
1658 	btp->bt_bdi = blk_get_backing_dev_info(bdev);
1659 	if (!btp->bt_bdi)
1660 		goto error;
1661 
1662 	if (xfs_setsize_buftarg_early(btp, bdev))
1663 		goto error;
1664 
1665 	if (list_lru_init(&btp->bt_lru))
1666 		goto error;
1667 
1668 	btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
1669 	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
1670 	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1671 	btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
1672 	register_shrinker(&btp->bt_shrinker);
1673 	return btp;
1674 
1675 error:
1676 	kmem_free(btp);
1677 	return NULL;
1678 }
1679 
1680 /*
1681  * Add a buffer to the delayed write list.
1682  *
1683  * This queues a buffer for writeout if it hasn't already been.  Note that
1684  * neither this routine nor the buffer list submission functions perform
1685  * any internal synchronization.  It is expected that the lists are thread-local
1686  * to the callers.
1687  *
1688  * Returns true if we queued up the buffer, or false if it already had
1689  * been on the buffer list.
1690  */
1691 bool
1692 xfs_buf_delwri_queue(
1693 	struct xfs_buf		*bp,
1694 	struct list_head	*list)
1695 {
1696 	ASSERT(xfs_buf_islocked(bp));
1697 	ASSERT(!(bp->b_flags & XBF_READ));
1698 
1699 	/*
1700 	 * If the buffer is already marked delwri it already is queued up
1701 	 * by someone else for imediate writeout.  Just ignore it in that
1702 	 * case.
1703 	 */
1704 	if (bp->b_flags & _XBF_DELWRI_Q) {
1705 		trace_xfs_buf_delwri_queued(bp, _RET_IP_);
1706 		return false;
1707 	}
1708 
1709 	trace_xfs_buf_delwri_queue(bp, _RET_IP_);
1710 
1711 	/*
1712 	 * If a buffer gets written out synchronously or marked stale while it
1713 	 * is on a delwri list we lazily remove it. To do this, the other party
1714 	 * clears the  _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
1715 	 * It remains referenced and on the list.  In a rare corner case it
1716 	 * might get readded to a delwri list after the synchronous writeout, in
1717 	 * which case we need just need to re-add the flag here.
1718 	 */
1719 	bp->b_flags |= _XBF_DELWRI_Q;
1720 	if (list_empty(&bp->b_list)) {
1721 		atomic_inc(&bp->b_hold);
1722 		list_add_tail(&bp->b_list, list);
1723 	}
1724 
1725 	return true;
1726 }
1727 
1728 /*
1729  * Compare function is more complex than it needs to be because
1730  * the return value is only 32 bits and we are doing comparisons
1731  * on 64 bit values
1732  */
1733 static int
1734 xfs_buf_cmp(
1735 	void		*priv,
1736 	struct list_head *a,
1737 	struct list_head *b)
1738 {
1739 	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
1740 	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
1741 	xfs_daddr_t		diff;
1742 
1743 	diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
1744 	if (diff < 0)
1745 		return -1;
1746 	if (diff > 0)
1747 		return 1;
1748 	return 0;
1749 }
1750 
1751 static int
1752 __xfs_buf_delwri_submit(
1753 	struct list_head	*buffer_list,
1754 	struct list_head	*io_list,
1755 	bool			wait)
1756 {
1757 	struct blk_plug		plug;
1758 	struct xfs_buf		*bp, *n;
1759 	int			pinned = 0;
1760 
1761 	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
1762 		if (!wait) {
1763 			if (xfs_buf_ispinned(bp)) {
1764 				pinned++;
1765 				continue;
1766 			}
1767 			if (!xfs_buf_trylock(bp))
1768 				continue;
1769 		} else {
1770 			xfs_buf_lock(bp);
1771 		}
1772 
1773 		/*
1774 		 * Someone else might have written the buffer synchronously or
1775 		 * marked it stale in the meantime.  In that case only the
1776 		 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
1777 		 * reference and remove it from the list here.
1778 		 */
1779 		if (!(bp->b_flags & _XBF_DELWRI_Q)) {
1780 			list_del_init(&bp->b_list);
1781 			xfs_buf_relse(bp);
1782 			continue;
1783 		}
1784 
1785 		list_move_tail(&bp->b_list, io_list);
1786 		trace_xfs_buf_delwri_split(bp, _RET_IP_);
1787 	}
1788 
1789 	list_sort(NULL, io_list, xfs_buf_cmp);
1790 
1791 	blk_start_plug(&plug);
1792 	list_for_each_entry_safe(bp, n, io_list, b_list) {
1793 		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
1794 		bp->b_flags |= XBF_WRITE;
1795 
1796 		if (!wait) {
1797 			bp->b_flags |= XBF_ASYNC;
1798 			list_del_init(&bp->b_list);
1799 		}
1800 		xfs_bdstrat_cb(bp);
1801 	}
1802 	blk_finish_plug(&plug);
1803 
1804 	return pinned;
1805 }
1806 
1807 /*
1808  * Write out a buffer list asynchronously.
1809  *
1810  * This will take the @buffer_list, write all non-locked and non-pinned buffers
1811  * out and not wait for I/O completion on any of the buffers.  This interface
1812  * is only safely useable for callers that can track I/O completion by higher
1813  * level means, e.g. AIL pushing as the @buffer_list is consumed in this
1814  * function.
1815  */
1816 int
1817 xfs_buf_delwri_submit_nowait(
1818 	struct list_head	*buffer_list)
1819 {
1820 	LIST_HEAD		(io_list);
1821 	return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
1822 }
1823 
1824 /*
1825  * Write out a buffer list synchronously.
1826  *
1827  * This will take the @buffer_list, write all buffers out and wait for I/O
1828  * completion on all of the buffers. @buffer_list is consumed by the function,
1829  * so callers must have some other way of tracking buffers if they require such
1830  * functionality.
1831  */
1832 int
1833 xfs_buf_delwri_submit(
1834 	struct list_head	*buffer_list)
1835 {
1836 	LIST_HEAD		(io_list);
1837 	int			error = 0, error2;
1838 	struct xfs_buf		*bp;
1839 
1840 	__xfs_buf_delwri_submit(buffer_list, &io_list, true);
1841 
1842 	/* Wait for IO to complete. */
1843 	while (!list_empty(&io_list)) {
1844 		bp = list_first_entry(&io_list, struct xfs_buf, b_list);
1845 
1846 		list_del_init(&bp->b_list);
1847 		error2 = xfs_buf_iowait(bp);
1848 		xfs_buf_relse(bp);
1849 		if (!error)
1850 			error = error2;
1851 	}
1852 
1853 	return error;
1854 }
1855 
1856 int __init
1857 xfs_buf_init(void)
1858 {
1859 	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
1860 						KM_ZONE_HWALIGN, NULL);
1861 	if (!xfs_buf_zone)
1862 		goto out;
1863 
1864 	xfslogd_workqueue = alloc_workqueue("xfslogd",
1865 					WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
1866 	if (!xfslogd_workqueue)
1867 		goto out_free_buf_zone;
1868 
1869 	return 0;
1870 
1871  out_free_buf_zone:
1872 	kmem_zone_destroy(xfs_buf_zone);
1873  out:
1874 	return -ENOMEM;
1875 }
1876 
1877 void
1878 xfs_buf_terminate(void)
1879 {
1880 	destroy_workqueue(xfslogd_workqueue);
1881 	kmem_zone_destroy(xfs_buf_zone);
1882 }
1883