// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include <linux/backing-dev.h>
#include <linux/dax.h>

#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_log_recover.h"
#include "xfs_log_priv.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_ag.h"

struct kmem_cache *xfs_buf_cache;

/*
 * Locking orders
 *
 * xfs_buf_ioacct_inc:
 * xfs_buf_ioacct_dec:
 *	b_sema (caller holds)
 *	  b_lock
 *
 * xfs_buf_stale:
 *	b_sema (caller holds)
 *	  b_lock
 *	    lru_lock
 *
 * xfs_buf_rele:
 *	b_lock
 *	  pag_buf_lock
 *	    lru_lock
 *
 * xfs_buftarg_drain_rele
 *	lru_lock
 *	  b_lock (trylock due to inversion)
 *
 * xfs_buftarg_isolate
 *	lru_lock
 *	  b_lock (trylock due to inversion)
 */

static int __xfs_buf_submit(struct xfs_buf *bp, bool wait);

static inline int
xfs_buf_submit(
	struct xfs_buf *bp)
{
	return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
}

static inline int
xfs_buf_is_vmapped(
	struct xfs_buf *bp)
{
	/*
	 * Return true if the buffer is vmapped.
	 *
	 * b_addr is null if the buffer is not mapped, but the code is clever
	 * enough to know it doesn't have to map a single page, so the check has
	 * to be both for b_addr and bp->b_page_count > 1.
	 */
	return bp->b_addr && bp->b_page_count > 1;
}

static inline int
xfs_buf_vmap_len(
	struct xfs_buf *bp)
{
	return (bp->b_page_count * PAGE_SIZE);
}

/*
 * Bump the I/O in flight count on the buftarg if we haven't yet done so for
 * this buffer. The count is incremented once per buffer (per hold cycle)
 * because the corresponding decrement is deferred to buffer release. Buffers
 * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
 * tracking adds unnecessary overhead. This is used for synchronization purposes
 * with unmount (see xfs_buftarg_drain()), so all we really need is a count of
 * in-flight buffers.
 *
 * Buffers that are never released (e.g., superblock, iclog buffers) must set
 * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
 * never reaches zero and unmount hangs indefinitely.
 */
static inline void
xfs_buf_ioacct_inc(
	struct xfs_buf *bp)
{
	if (bp->b_flags & XBF_NO_IOACCT)
		return;

	ASSERT(bp->b_flags & XBF_ASYNC);
	spin_lock(&bp->b_lock);
	if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
		bp->b_state |= XFS_BSTATE_IN_FLIGHT;
		percpu_counter_inc(&bp->b_target->bt_io_count);
	}
	spin_unlock(&bp->b_lock);
}

/*
 * Clear the in-flight state on a buffer about to be released to the LRU or
 * freed and unaccount from the buftarg.
 */
static inline void
__xfs_buf_ioacct_dec(
	struct xfs_buf *bp)
{
	lockdep_assert_held(&bp->b_lock);

	if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
		bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
		percpu_counter_dec(&bp->b_target->bt_io_count);
	}
}

static inline void
xfs_buf_ioacct_dec(
	struct xfs_buf *bp)
{
	spin_lock(&bp->b_lock);
	__xfs_buf_ioacct_dec(bp);
	spin_unlock(&bp->b_lock);
}
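
/*
 * Example (illustrative sketch, not taken from any particular caller): a
 * buffer that is held for the lifetime of the mount and never released back
 * to the LRU must opt out of I/O accounting before its first submission,
 * otherwise bt_io_count never drains and xfs_buftarg_drain() waits forever.
 * For a hypothetical long-lived buffer "bp":
 *
 *	bp->b_flags |= XBF_NO_IOACCT;
 *	...
 *	xfs_buf_submit(bp);
 *
 * With XBF_NO_IOACCT set, xfs_buf_ioacct_inc() returns early and the buffer
 * is never counted against the buftarg in-flight total.
 */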

/*
 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 * b_lru_ref count so that the buffer is freed immediately when the buffer
 * reference count falls to zero. If the buffer is already on the LRU, we need
 * to remove the reference that LRU holds on the buffer.
 *
 * This prevents build-up of stale buffers on the LRU.
 */
void
xfs_buf_stale(
	struct xfs_buf *bp)
{
	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_STALE;

	/*
	 * Clear the delwri status so that a delwri queue walker will not
	 * flush this buffer to disk now that it is stale. The delwri queue has
	 * a reference to the buffer, so this is safe to do.
	 */
	bp->b_flags &= ~_XBF_DELWRI_Q;

	/*
	 * Once the buffer is marked stale and unlocked, a subsequent lookup
	 * could reset b_flags. There is no guarantee that the buffer is
	 * unaccounted (released to LRU) before that occurs. Drop in-flight
	 * status now to preserve accounting consistency.
	 */
	spin_lock(&bp->b_lock);
	__xfs_buf_ioacct_dec(bp);

	atomic_set(&bp->b_lru_ref, 0);
	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
	    (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
		atomic_dec(&bp->b_hold);

	ASSERT(atomic_read(&bp->b_hold) >= 1);
	spin_unlock(&bp->b_lock);
}
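
/*
 * Example (illustrative sketch): xfs_buf_stale() expects the caller to hold
 * the buffer lock, so a typical invalidation sequence for a hypothetical
 * buffer "bp" looks roughly like:
 *
 *	xfs_buf_lock(bp);
 *	xfs_buf_stale(bp);
 *	xfs_buf_relse(bp);
 *
 * Marking the buffer stale zeroes b_lru_ref, so dropping the final hold in
 * xfs_buf_rele() frees the buffer instead of parking it on the LRU.
 */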

static int
xfs_buf_get_maps(
	struct xfs_buf *bp,
	int map_count)
{
	ASSERT(bp->b_maps == NULL);
	bp->b_map_count = map_count;

	if (map_count == 1) {
		bp->b_maps = &bp->__b_map;
		return 0;
	}

	bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
				KM_NOFS);
	if (!bp->b_maps)
		return -ENOMEM;
	return 0;
}

/*
 * Frees b_maps if it was allocated (i.e. it is not the embedded map).
 */
static void
xfs_buf_free_maps(
	struct xfs_buf *bp)
{
	if (bp->b_maps != &bp->__b_map) {
		kmem_free(bp->b_maps);
		bp->b_maps = NULL;
	}
}

static int
_xfs_buf_alloc(
	struct xfs_buftarg *target,
	struct xfs_buf_map *map,
	int nmaps,
	xfs_buf_flags_t flags,
	struct xfs_buf **bpp)
{
	struct xfs_buf *bp;
	int error;
	int i;

	*bpp = NULL;
	bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL);

	/*
	 * We don't want certain flags to appear in b_flags unless they are
	 * specifically set by later operations on the buffer.
	 */
	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);

	atomic_set(&bp->b_hold, 1);
	atomic_set(&bp->b_lru_ref, 1);
	init_completion(&bp->b_iowait);
	INIT_LIST_HEAD(&bp->b_lru);
	INIT_LIST_HEAD(&bp->b_list);
	INIT_LIST_HEAD(&bp->b_li_list);
	sema_init(&bp->b_sema, 0); /* held, no waiters */
	spin_lock_init(&bp->b_lock);
	bp->b_target = target;
	bp->b_mount = target->bt_mount;
	bp->b_flags = flags;

	/*
	 * Set length and io_length to the same value initially.
	 * I/O routines should use io_length, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	error = xfs_buf_get_maps(bp, nmaps);
	if (error) {
		kmem_cache_free(xfs_buf_cache, bp);
		return error;
	}

	bp->b_rhash_key = map[0].bm_bn;
	bp->b_length = 0;
	for (i = 0; i < nmaps; i++) {
		bp->b_maps[i].bm_bn = map[i].bm_bn;
		bp->b_maps[i].bm_len = map[i].bm_len;
		bp->b_length += map[i].bm_len;
	}

	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(bp->b_mount, xb_create);
	trace_xfs_buf_init(bp, _RET_IP_);

	*bpp = bp;
	return 0;
}

static void
xfs_buf_free_pages(
	struct xfs_buf *bp)
{
	uint i;

	ASSERT(bp->b_flags & _XBF_PAGES);

	if (xfs_buf_is_vmapped(bp))
		vm_unmap_ram(bp->b_addr, bp->b_page_count);

	for (i = 0; i < bp->b_page_count; i++) {
		if (bp->b_pages[i])
			__free_page(bp->b_pages[i]);
	}
	mm_account_reclaimed_pages(bp->b_page_count);

	if (bp->b_pages != bp->b_page_array)
		kmem_free(bp->b_pages);
	bp->b_pages = NULL;
	bp->b_flags &= ~_XBF_PAGES;
}

static void
xfs_buf_free_callback(
	struct callback_head *cb)
{
	struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu);

	xfs_buf_free_maps(bp);
	kmem_cache_free(xfs_buf_cache, bp);
}

static void
xfs_buf_free(
	struct xfs_buf *bp)
{
	trace_xfs_buf_free(bp, _RET_IP_);

	ASSERT(list_empty(&bp->b_lru));

	if (bp->b_flags & _XBF_PAGES)
		xfs_buf_free_pages(bp);
	else if (bp->b_flags & _XBF_KMEM)
		kmem_free(bp->b_addr);

	call_rcu(&bp->b_rcu, xfs_buf_free_callback);
}

static int
xfs_buf_alloc_kmem(
	struct xfs_buf *bp,
	xfs_buf_flags_t flags)
{
	xfs_km_flags_t kmflag_mask = KM_NOFS;
	size_t size = BBTOB(bp->b_length);

	/* Assure zeroed buffer for non-read cases. */
	if (!(flags & XBF_READ))
		kmflag_mask |= KM_ZERO;

	bp->b_addr = kmem_alloc(size, kmflag_mask);
	if (!bp->b_addr)
		return -ENOMEM;

	if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
	    ((unsigned long)bp->b_addr & PAGE_MASK)) {
		/* b_addr spans two pages - use alloc_page instead */
		kmem_free(bp->b_addr);
		bp->b_addr = NULL;
		return -ENOMEM;
	}
	bp->b_offset = offset_in_page(bp->b_addr);
	bp->b_pages = bp->b_page_array;
	bp->b_pages[0] = kmem_to_page(bp->b_addr);
	bp->b_page_count = 1;
	bp->b_flags |= _XBF_KMEM;
	return 0;
}

static int
xfs_buf_alloc_pages(
	struct xfs_buf *bp,
	xfs_buf_flags_t flags)
{
	gfp_t gfp_mask = __GFP_NOWARN;
	long filled = 0;

	if (flags & XBF_READ_AHEAD)
		gfp_mask |= __GFP_NORETRY;
	else
		gfp_mask |= GFP_NOFS;

	/* Make sure that we have a page list */
	bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
	if (bp->b_page_count <= XB_PAGES) {
		bp->b_pages = bp->b_page_array;
	} else {
		bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
					gfp_mask);
		if (!bp->b_pages)
			return -ENOMEM;
	}
	bp->b_flags |= _XBF_PAGES;

	/* Assure zeroed buffer for non-read cases. */
	if (!(flags & XBF_READ))
		gfp_mask |= __GFP_ZERO;

	/*
	 * Bulk filling of pages can take multiple calls. Not filling the entire
	 * array is not an allocation failure, so don't back off if we get at
	 * least one extra page.
	 */
	for (;;) {
		long last = filled;

		filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count,
						bp->b_pages);
		if (filled == bp->b_page_count) {
			XFS_STATS_INC(bp->b_mount, xb_page_found);
			break;
		}

		if (filled != last)
			continue;

		if (flags & XBF_READ_AHEAD) {
			xfs_buf_free_pages(bp);
			return -ENOMEM;
		}

		XFS_STATS_INC(bp->b_mount, xb_page_retries);
		memalloc_retry_wait(gfp_mask);
	}
	return 0;
}

/*
 * Map buffer into kernel address-space if necessary.
 */
STATIC int
_xfs_buf_map_pages(
	struct xfs_buf *bp,
	xfs_buf_flags_t flags)
{
	ASSERT(bp->b_flags & _XBF_PAGES);
	if (bp->b_page_count == 1) {
		/* A single page buffer is always mappable */
		bp->b_addr = page_address(bp->b_pages[0]);
	} else if (flags & XBF_UNMAPPED) {
		bp->b_addr = NULL;
	} else {
		int retried = 0;
		unsigned nofs_flag;

		/*
		 * vm_map_ram() will allocate auxiliary structures (e.g.
		 * pagetables) with GFP_KERNEL, yet we are likely to be under
		 * GFP_NOFS context here. Hence we need to tell memory reclaim
		 * that we are in such a context via PF_MEMALLOC_NOFS to prevent
		 * memory reclaim re-entering the filesystem here and
		 * potentially deadlocking.
		 */
		nofs_flag = memalloc_nofs_save();
		do {
			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
						-1);
			if (bp->b_addr)
				break;
			vm_unmap_aliases();
		} while (retried++ <= 1);
		memalloc_nofs_restore(nofs_flag);

		if (!bp->b_addr)
			return -ENOMEM;
	}

	return 0;
}

/*
 * Finding and Reading Buffers
 */
static int
_xfs_buf_obj_cmp(
	struct rhashtable_compare_arg *arg,
	const void *obj)
{
	const struct xfs_buf_map *map = arg->key;
	const struct xfs_buf *bp = obj;

	/*
	 * The key hashing in the lookup path depends on the key being the
	 * first element of the compare_arg, make sure to assert this.
	 */
	BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);

	if (bp->b_rhash_key != map->bm_bn)
		return 1;

	if (unlikely(bp->b_length != map->bm_len)) {
		/*
		 * found a block number match. If the range doesn't
		 * match, the only way this is allowed is if the buffer
		 * in the cache is stale and the transaction that made
		 * it stale has not yet committed. i.e. we are
		 * reallocating a busy extent. Skip this buffer and
		 * continue searching for an exact match.
		 */
		if (!(map->bm_flags & XBM_LIVESCAN))
			ASSERT(bp->b_flags & XBF_STALE);
		return 1;
	}
	return 0;
}

static const struct rhashtable_params xfs_buf_hash_params = {
	.min_size = 32,	/* empty AGs have minimal footprint */
	.nelem_hint = 16,
	.key_len = sizeof(xfs_daddr_t),
	.key_offset = offsetof(struct xfs_buf, b_rhash_key),
	.head_offset = offsetof(struct xfs_buf, b_rhash_head),
	.automatic_shrinking = true,
	.obj_cmpfn = _xfs_buf_obj_cmp,
};

int
xfs_buf_hash_init(
	struct xfs_perag *pag)
{
	spin_lock_init(&pag->pag_buf_lock);
	return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
}

void
xfs_buf_hash_destroy(
	struct xfs_perag *pag)
{
	rhashtable_destroy(&pag->pag_buf_hash);
}

static int
xfs_buf_map_verify(
	struct xfs_buftarg *btp,
	struct xfs_buf_map *map)
{
	xfs_daddr_t eofs;

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
	ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));

	/*
	 * Corrupted block numbers can get through to here, unfortunately, so we
	 * have to check that the buffer falls within the filesystem bounds.
	 */
	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
	if (map->bm_bn < 0 || map->bm_bn >= eofs) {
		xfs_alert(btp->bt_mount,
			"%s: daddr 0x%llx out of range, EOFS 0x%llx",
			__func__, map->bm_bn, eofs);
		WARN_ON(1);
		return -EFSCORRUPTED;
	}
	return 0;
}

static int
xfs_buf_find_lock(
	struct xfs_buf *bp,
	xfs_buf_flags_t flags)
{
	if (flags & XBF_TRYLOCK) {
		if (!xfs_buf_trylock(bp)) {
			XFS_STATS_INC(bp->b_mount, xb_busy_locked);
			return -EAGAIN;
		}
	} else {
		xfs_buf_lock(bp);
		XFS_STATS_INC(bp->b_mount, xb_get_locked_waited);
	}

	/*
	 * if the buffer is stale, clear all the external state associated with
	 * it. We need to keep flags such as how we allocated the buffer memory
	 * intact here.
	 */
	if (bp->b_flags & XBF_STALE) {
		if (flags & XBF_LIVESCAN) {
			xfs_buf_unlock(bp);
			return -ENOENT;
		}
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
		bp->b_ops = NULL;
	}
	return 0;
}

static inline int
xfs_buf_lookup(
	struct xfs_perag *pag,
	struct xfs_buf_map *map,
	xfs_buf_flags_t flags,
	struct xfs_buf **bpp)
{
	struct xfs_buf *bp;
	int error;

	rcu_read_lock();
	bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params);
	if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
		rcu_read_unlock();
		return -ENOENT;
	}
	rcu_read_unlock();

	error = xfs_buf_find_lock(bp, flags);
	if (error) {
		xfs_buf_rele(bp);
		return error;
	}

	trace_xfs_buf_find(bp, flags, _RET_IP_);
	*bpp = bp;
	return 0;
}

/*
 * Insert the new_bp into the hash table. This consumes the perag reference
 * taken for the lookup regardless of the result of the insert.
 */
static int
xfs_buf_find_insert(
	struct xfs_buftarg *btp,
	struct xfs_perag *pag,
	struct xfs_buf_map *cmap,
	struct xfs_buf_map *map,
	int nmaps,
	xfs_buf_flags_t flags,
	struct xfs_buf **bpp)
{
	struct xfs_buf *new_bp;
	struct xfs_buf *bp;
	int error;

	error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
	if (error)
		goto out_drop_pag;

	/*
	 * For buffers that fit entirely within a single page, first attempt to
	 * allocate the memory from the heap to minimise memory usage. If we
	 * can't get heap memory for these small buffers, we fall back to using
	 * the page allocator.
	 */
	if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
	    xfs_buf_alloc_kmem(new_bp, flags) < 0) {
		error = xfs_buf_alloc_pages(new_bp, flags);
		if (error)
			goto out_free_buf;
	}

	spin_lock(&pag->pag_buf_lock);
	bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash,
			&new_bp->b_rhash_head, xfs_buf_hash_params);
	if (IS_ERR(bp)) {
		error = PTR_ERR(bp);
		spin_unlock(&pag->pag_buf_lock);
		goto out_free_buf;
	}
	if (bp) {
		/* found an existing buffer */
		atomic_inc(&bp->b_hold);
		spin_unlock(&pag->pag_buf_lock);
		error = xfs_buf_find_lock(bp, flags);
		if (error)
			xfs_buf_rele(bp);
		else
			*bpp = bp;
		goto out_free_buf;
	}

	/* The new buffer keeps the perag reference until it is freed. */
	new_bp->b_pag = pag;
	spin_unlock(&pag->pag_buf_lock);
	*bpp = new_bp;
	return 0;

out_free_buf:
	xfs_buf_free(new_bp);
out_drop_pag:
	xfs_perag_put(pag);
	return error;
}

/*
 * Assembles a buffer covering the specified range. The code is optimised for
 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
 * more hits than misses.
 */
int
xfs_buf_get_map(
	struct xfs_buftarg *btp,
	struct xfs_buf_map *map,
	int nmaps,
	xfs_buf_flags_t flags,
	struct xfs_buf **bpp)
{
	struct xfs_perag *pag;
	struct xfs_buf *bp = NULL;
	struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
	int error;
	int i;

	if (flags & XBF_LIVESCAN)
		cmap.bm_flags |= XBM_LIVESCAN;
	for (i = 0; i < nmaps; i++)
		cmap.bm_len += map[i].bm_len;

	error = xfs_buf_map_verify(btp, &cmap);
	if (error)
		return error;

	pag = xfs_perag_get(btp->bt_mount,
			xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));

	error = xfs_buf_lookup(pag, &cmap, flags, &bp);
	if (error && error != -ENOENT)
		goto out_put_perag;

	/* cache hits always outnumber misses by at least 10:1 */
	if (unlikely(!bp)) {
		XFS_STATS_INC(btp->bt_mount, xb_miss_locked);

		if (flags & XBF_INCORE)
			goto out_put_perag;

		/* xfs_buf_find_insert() consumes the perag reference. */
		error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
				flags, &bp);
		if (error)
			return error;
	} else {
		XFS_STATS_INC(btp->bt_mount, xb_get_locked);
		xfs_perag_put(pag);
	}

	/* We do not hold a perag reference anymore. */
	if (!bp->b_addr) {
		error = _xfs_buf_map_pages(bp, flags);
		if (unlikely(error)) {
			xfs_warn_ratelimited(btp->bt_mount,
				"%s: failed to map %u pages", __func__,
				bp->b_page_count);
			xfs_buf_relse(bp);
			return error;
		}
	}

	/*
	 * Clear b_error if this is a lookup from a caller that doesn't expect
	 * valid data to be found in the buffer.
	 */
	if (!(flags & XBF_READ))
		xfs_buf_ioerror(bp, 0);

	XFS_STATS_INC(btp->bt_mount, xb_get);
	trace_xfs_buf_get(bp, flags, _RET_IP_);
	*bpp = bp;
	return 0;

out_put_perag:
	xfs_perag_put(pag);
	return error;
}
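
/*
 * Example (illustrative sketch): most callers go through the single-map
 * wrappers in xfs_buf.h, but the underlying call for one contiguous extent
 * is simply the following, where "btp", "blkno" and "numblks" stand in for
 * the caller's buftarg, disk address and length in basic blocks:
 *
 *	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
 *	struct xfs_buf *bp;
 *	int error;
 *
 *	error = xfs_buf_get_map(btp, &map, 1, 0, &bp);
 *	if (error)
 *		return error;
 *	...
 *	xfs_buf_relse(bp);
 *
 * On success the buffer is returned locked with a hold; xfs_buf_relse()
 * unlocks it and drops that hold.
 */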

int
_xfs_buf_read(
	struct xfs_buf *bp,
	xfs_buf_flags_t flags)
{
	ASSERT(!(flags & XBF_WRITE));
	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);

	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);

	return xfs_buf_submit(bp);
}

/*
 * Reverify a buffer found in cache without an attached ->b_ops.
 *
 * If the caller passed an ops structure and the buffer doesn't have ops
 * assigned, set the ops and use it to verify the contents. If verification
 * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
 * already in XBF_DONE state on entry.
 *
 * Under normal operations, every in-core buffer is verified on read I/O
 * completion. There are two scenarios that can lead to in-core buffers without
 * an assigned ->b_ops. The first is during log recovery of buffers on a V4
 * filesystem, though these buffers are purged at the end of recovery. The
 * other is online repair, which intentionally reads with a NULL buffer ops to
 * run several verifiers across an in-core buffer in order to establish buffer
 * type. If repair can't establish that, the buffer will be left in memory
 * with NULL buffer ops.
 */
int
xfs_buf_reverify(
	struct xfs_buf *bp,
	const struct xfs_buf_ops *ops)
{
	ASSERT(bp->b_flags & XBF_DONE);
	ASSERT(bp->b_error == 0);

	if (!ops || bp->b_ops)
		return 0;

	bp->b_ops = ops;
	bp->b_ops->verify_read(bp);
	if (bp->b_error)
		bp->b_flags &= ~XBF_DONE;
	return bp->b_error;
}

int
xfs_buf_read_map(
	struct xfs_buftarg *target,
	struct xfs_buf_map *map,
	int nmaps,
	xfs_buf_flags_t flags,
	struct xfs_buf **bpp,
	const struct xfs_buf_ops *ops,
	xfs_failaddr_t fa)
{
	struct xfs_buf *bp;
	int error;

	flags |= XBF_READ;
	*bpp = NULL;

	error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
	if (error)
		return error;

	trace_xfs_buf_read(bp, flags, _RET_IP_);

	if (!(bp->b_flags & XBF_DONE)) {
		/* Initiate the buffer read and wait. */
		XFS_STATS_INC(target->bt_mount, xb_get_read);
		bp->b_ops = ops;
		error = _xfs_buf_read(bp, flags);

		/* Readahead iodone already dropped the buffer, so exit. */
		if (flags & XBF_ASYNC)
			return 0;
	} else {
		/* Buffer already read; all we need to do is check it. */
		error = xfs_buf_reverify(bp, ops);

		/* Readahead already finished; drop the buffer and exit. */
		if (flags & XBF_ASYNC) {
			xfs_buf_relse(bp);
			return 0;
		}

		/* We do not want read in the flags */
		bp->b_flags &= ~XBF_READ;
		ASSERT(bp->b_ops != NULL || ops == NULL);
	}

	/*
	 * If we've had a read error, then the contents of the buffer are
	 * invalid and should not be used. To ensure that a followup read tries
	 * to pull the buffer from disk again, we clear the XBF_DONE flag and
	 * mark the buffer stale. This ensures that anyone who has a current
	 * reference to the buffer will interpret its contents correctly and
	 * future cache lookups will also treat it as an empty, uninitialised
	 * buffer.
	 */
	if (error) {
		/*
		 * Check against log shutdown for error reporting because
		 * metadata writeback may require a read first and we need to
		 * report errors in metadata writeback until the log is shut
		 * down. High level transaction read functions already check
		 * against mount shutdown, anyway, so we only need to be
		 * concerned about low level IO interactions here.
		 */
		if (!xlog_is_shutdown(target->bt_mount->m_log))
			xfs_buf_ioerror_alert(bp, fa);

		bp->b_flags &= ~XBF_DONE;
		xfs_buf_stale(bp);
		xfs_buf_relse(bp);

		/* bad CRC means corrupted metadata */
		if (error == -EFSBADCRC)
			error = -EFSCORRUPTED;
		return error;
	}

	*bpp = bp;
	return 0;
}
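
/*
 * Example (illustrative sketch): a synchronous, verified read of a single
 * extent through this interface, with hypothetical caller-supplied "btp",
 * "blkno", "numblks" and verifier "ops":
 *
 *	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
 *	struct xfs_buf *bp;
 *	int error;
 *
 *	error = xfs_buf_read_map(btp, &map, 1, 0, &bp, ops, __this_address);
 *	if (error)
 *		return error;
 *	...
 *	xfs_buf_relse(bp);
 *
 * On success the buffer comes back locked with XBF_DONE set; a verifier
 * failure is reported through bp->b_error and returned here (a bad CRC is
 * promoted to -EFSCORRUPTED), with no buffer handed back to the caller.
 */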

/*
 * If we are not low on memory then do the readahead in a deadlock
 * safe manner.
 */
void
xfs_buf_readahead_map(
	struct xfs_buftarg *target,
	struct xfs_buf_map *map,
	int nmaps,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf *bp;

	xfs_buf_read_map(target, map, nmaps,
			XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
			__this_address);
}

/*
 * Read an uncached buffer from disk. Allocates and returns a locked
 * buffer containing the disk contents or nothing. Uncached buffers always have
 * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer
 * is cached or uncached during fault diagnosis.
 */
int
xfs_buf_read_uncached(
	struct xfs_buftarg *target,
	xfs_daddr_t daddr,
	size_t numblks,
	xfs_buf_flags_t flags,
	struct xfs_buf **bpp,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf *bp;
	int error;

	*bpp = NULL;

	error = xfs_buf_get_uncached(target, numblks, flags, &bp);
	if (error)
		return error;

	/* set up the buffer for a read IO */
	ASSERT(bp->b_map_count == 1);
	bp->b_rhash_key = XFS_BUF_DADDR_NULL;
	bp->b_maps[0].bm_bn = daddr;
	bp->b_flags |= XBF_READ;
	bp->b_ops = ops;

	xfs_buf_submit(bp);
	if (bp->b_error) {
		error = bp->b_error;
		xfs_buf_relse(bp);
		return error;
	}

	*bpp = bp;
	return 0;
}

int
xfs_buf_get_uncached(
	struct xfs_buftarg *target,
	size_t numblks,
	xfs_buf_flags_t flags,
	struct xfs_buf **bpp)
{
	int error;
	struct xfs_buf *bp;
	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);

	*bpp = NULL;

	/* flags might contain irrelevant bits, pass only what we care about */
	error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
	if (error)
		return error;

	error = xfs_buf_alloc_pages(bp, flags);
	if (error)
		goto fail_free_buf;

	error = _xfs_buf_map_pages(bp, 0);
	if (unlikely(error)) {
		xfs_warn(target->bt_mount,
			"%s: failed to map pages", __func__);
		goto fail_free_buf;
	}

	trace_xfs_buf_get_uncached(bp, _RET_IP_);
	*bpp = bp;
	return 0;

fail_free_buf:
	xfs_buf_free(bp);
	return error;
}

/*
 * Increment reference count on buffer, to hold the buffer concurrently
 * with another thread which may release (free) the buffer asynchronously.
 * Must hold the buffer already to call this function.
 */
void
xfs_buf_hold(
	struct xfs_buf *bp)
{
	trace_xfs_buf_hold(bp, _RET_IP_);
	atomic_inc(&bp->b_hold);
}
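
/*
 * Example (illustrative sketch): holds and releases must balance. A caller
 * that stashes a pointer to an already-held buffer "bp" for later use takes
 * its own reference and drops it when done:
 *
 *	xfs_buf_hold(bp);
 *	...
 *	xfs_buf_rele(bp);
 *
 * The final xfs_buf_rele() either parks the buffer on the LRU (if b_lru_ref
 * is still non-zero) or frees it.
 */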

/*
 * Release a hold on the specified buffer. If the hold count is 1, the buffer is
 * placed on LRU or freed (depending on b_lru_ref).
 */
void
xfs_buf_rele(
	struct xfs_buf *bp)
{
	struct xfs_perag *pag = bp->b_pag;
	bool release;
	bool freebuf = false;

	trace_xfs_buf_rele(bp, _RET_IP_);

	if (!pag) {
		ASSERT(list_empty(&bp->b_lru));
		if (atomic_dec_and_test(&bp->b_hold)) {
			xfs_buf_ioacct_dec(bp);
			xfs_buf_free(bp);
		}
		return;
	}

	ASSERT(atomic_read(&bp->b_hold) > 0);

	/*
	 * We grab the b_lock here first to serialise racing xfs_buf_rele()
	 * calls. The pag_buf_lock being taken on the last reference only
	 * serialises against racing lookups in xfs_buf_find(). IOWs, the second
	 * to last reference we drop here is not serialised against the last
	 * reference until we take bp->b_lock. Hence if we don't grab b_lock
	 * first, the last "release" reference can win the race to the lock and
	 * free the buffer before the second-to-last reference is processed,
	 * leading to a use-after-free scenario.
	 */
	spin_lock(&bp->b_lock);
	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
	if (!release) {
		/*
		 * Drop the in-flight state if the buffer is already on the LRU
		 * and it holds the only reference. This is racy because we
		 * haven't acquired the pag lock, but the use of the
		 * XFS_BSTATE_IN_FLIGHT state flag ensures the decrement occurs
		 * only once per-buf.
10369c7504aaSBrian Foster */
10379c7504aaSBrian Foster if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
103863db7c81SBrian Foster __xfs_buf_ioacct_dec(bp);
10399c7504aaSBrian Foster goto out_unlock;
10409c7504aaSBrian Foster }
10419c7504aaSBrian Foster
10429c7504aaSBrian Foster /* the last reference has been dropped ... */
104363db7c81SBrian Foster __xfs_buf_ioacct_dec(bp);
1044a4082357SDave Chinner if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
1045a4082357SDave Chinner /*
10469c7504aaSBrian Foster * If the buffer is added to the LRU take a new reference to the
10479c7504aaSBrian Foster * buffer for the LRU and clear the (now stale) dispose list
10489c7504aaSBrian Foster * state flag
1049a4082357SDave Chinner */
1050a4082357SDave Chinner if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
1051a4082357SDave Chinner bp->b_state &= ~XFS_BSTATE_DISPOSE;
1052a4082357SDave Chinner atomic_inc(&bp->b_hold);
1053a4082357SDave Chinner }
1054c59d87c4SChristoph Hellwig spin_unlock(&pag->pag_buf_lock);
1055c59d87c4SChristoph Hellwig } else {
1056a4082357SDave Chinner /*
10579c7504aaSBrian Foster * most of the time buffers will already be removed from the
10589c7504aaSBrian Foster * LRU, so optimise that case by checking for the
10599c7504aaSBrian Foster * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
10609c7504aaSBrian Foster * was on was the disposal list
1061a4082357SDave Chinner */
1062a4082357SDave Chinner if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
1063a4082357SDave Chinner list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
1064a4082357SDave Chinner } else {
1065a4082357SDave Chinner ASSERT(list_empty(&bp->b_lru));
1066a4082357SDave Chinner }
1067a4082357SDave Chinner
106843ff2122SChristoph Hellwig ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
10696031e73aSLucas Stach rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
10706031e73aSLucas Stach xfs_buf_hash_params);
1071c59d87c4SChristoph Hellwig spin_unlock(&pag->pag_buf_lock);
1072c59d87c4SChristoph Hellwig xfs_perag_put(pag);
10739c7504aaSBrian Foster freebuf = true;
10749c7504aaSBrian Foster }
10759c7504aaSBrian Foster
10769c7504aaSBrian Foster out_unlock:
10779c7504aaSBrian Foster spin_unlock(&bp->b_lock);
10789c7504aaSBrian Foster
10799c7504aaSBrian Foster if (freebuf)
1080c59d87c4SChristoph Hellwig xfs_buf_free(bp);
1081c59d87c4SChristoph Hellwig }
1082c59d87c4SChristoph Hellwig
1083c59d87c4SChristoph Hellwig
1084c59d87c4SChristoph Hellwig /*
1085c59d87c4SChristoph Hellwig * Lock a buffer object, if it is not already locked.
1086c59d87c4SChristoph Hellwig *
1087c59d87c4SChristoph Hellwig * If we come across a stale, pinned, locked buffer, we know that we are
1088c59d87c4SChristoph Hellwig * being asked to lock a buffer that has been reallocated. Because it is
1089c59d87c4SChristoph Hellwig * pinned, we know that the log has not been pushed to disk and hence it
1090c59d87c4SChristoph Hellwig * will still be locked. Rather than continuing to have trylock attempts
1091c59d87c4SChristoph Hellwig * fail until someone else pushes the log, push it ourselves before
1092c59d87c4SChristoph Hellwig * returning. This means that the xfsaild will not get stuck trying
1093c59d87c4SChristoph Hellwig * to push on stale inode buffers.
1094c59d87c4SChristoph Hellwig */
1095c59d87c4SChristoph Hellwig int
xfs_buf_trylock(struct xfs_buf * bp)1096c59d87c4SChristoph Hellwig xfs_buf_trylock(
1097c59d87c4SChristoph Hellwig struct xfs_buf *bp)
1098c59d87c4SChristoph Hellwig {
1099c59d87c4SChristoph Hellwig int locked;
1100c59d87c4SChristoph Hellwig
1101c59d87c4SChristoph Hellwig locked = down_trylock(&bp->b_sema) == 0;
1102fa6c668dSEric Sandeen if (locked)
1103c59d87c4SChristoph Hellwig trace_xfs_buf_trylock(bp, _RET_IP_);
1104fa6c668dSEric Sandeen else
1105479c6412SDarrick J. Wong trace_xfs_buf_trylock_fail(bp, _RET_IP_);
1106c59d87c4SChristoph Hellwig return locked;
1107c59d87c4SChristoph Hellwig }
1108c59d87c4SChristoph Hellwig
1109c59d87c4SChristoph Hellwig /*
1110c59d87c4SChristoph Hellwig * Lock a buffer object.
1111c59d87c4SChristoph Hellwig *
1112c59d87c4SChristoph Hellwig * If we come across a stale, pinned, locked buffer, we know that we
1113c59d87c4SChristoph Hellwig * are being asked to lock a buffer that has been reallocated. Because
1114c59d87c4SChristoph Hellwig * it is pinned, we know that the log has not been pushed to disk and
1115c59d87c4SChristoph Hellwig * hence it will still be locked. Rather than sleeping until someone
1116c59d87c4SChristoph Hellwig * else pushes the log, push it ourselves before trying to get the lock.
1117c59d87c4SChristoph Hellwig */
1118c59d87c4SChristoph Hellwig void
1119c59d87c4SChristoph Hellwig xfs_buf_lock(
1120c59d87c4SChristoph Hellwig struct xfs_buf *bp)
1121c59d87c4SChristoph Hellwig {
1122c59d87c4SChristoph Hellwig trace_xfs_buf_lock(bp, _RET_IP_);
1123c59d87c4SChristoph Hellwig
1124c59d87c4SChristoph Hellwig if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
1125dbd329f1SChristoph Hellwig xfs_log_force(bp->b_mount, 0);
1126c59d87c4SChristoph Hellwig down(&bp->b_sema);
1127c59d87c4SChristoph Hellwig
1128c59d87c4SChristoph Hellwig trace_xfs_buf_lock_done(bp, _RET_IP_);
1129c59d87c4SChristoph Hellwig }
1130c59d87c4SChristoph Hellwig
1131c59d87c4SChristoph Hellwig void
1132c59d87c4SChristoph Hellwig xfs_buf_unlock(
1133c59d87c4SChristoph Hellwig struct xfs_buf *bp)
1134c59d87c4SChristoph Hellwig {
113520e8a063SBrian Foster ASSERT(xfs_buf_islocked(bp));
113620e8a063SBrian Foster
1137c59d87c4SChristoph Hellwig up(&bp->b_sema);
1138c59d87c4SChristoph Hellwig trace_xfs_buf_unlock(bp, _RET_IP_);
1139c59d87c4SChristoph Hellwig }
1140c59d87c4SChristoph Hellwig
1141c59d87c4SChristoph Hellwig STATIC void
1142c59d87c4SChristoph Hellwig xfs_buf_wait_unpin(
1143e8222613SDave Chinner struct xfs_buf *bp)
1144c59d87c4SChristoph Hellwig {
1145c59d87c4SChristoph Hellwig DECLARE_WAITQUEUE (wait, current);
1146c59d87c4SChristoph Hellwig
1147c59d87c4SChristoph Hellwig if (atomic_read(&bp->b_pin_count) == 0)
1148c59d87c4SChristoph Hellwig return;
1149c59d87c4SChristoph Hellwig
1150c59d87c4SChristoph Hellwig add_wait_queue(&bp->b_waiters, &wait);
1151c59d87c4SChristoph Hellwig for (;;) {
1152c59d87c4SChristoph Hellwig set_current_state(TASK_UNINTERRUPTIBLE);
1153c59d87c4SChristoph Hellwig if (atomic_read(&bp->b_pin_count) == 0)
1154c59d87c4SChristoph Hellwig break;
1155c59d87c4SChristoph Hellwig io_schedule();
1156c59d87c4SChristoph Hellwig }
1157c59d87c4SChristoph Hellwig remove_wait_queue(&bp->b_waiters, &wait);
1158c59d87c4SChristoph Hellwig set_current_state(TASK_RUNNING);
1159c59d87c4SChristoph Hellwig }
1160c59d87c4SChristoph Hellwig
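/*
 * Rate limit metadata I/O error alerts: only emit a new alert if this error
 * is on a different target than the last one reported, or at least five
 * seconds have passed since the previous alert.
 */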
1161f58d0ea9SChristoph Hellwig static void
1162f58d0ea9SChristoph Hellwig xfs_buf_ioerror_alert_ratelimited(
1163664ffb8aSChristoph Hellwig struct xfs_buf *bp)
1164664ffb8aSChristoph Hellwig {
1165664ffb8aSChristoph Hellwig static unsigned long lasttime;
1166664ffb8aSChristoph Hellwig static struct xfs_buftarg *lasttarg;
1167664ffb8aSChristoph Hellwig
1168664ffb8aSChristoph Hellwig if (bp->b_target != lasttarg ||
1169664ffb8aSChristoph Hellwig time_after(jiffies, (lasttime + 5*HZ))) {
1170664ffb8aSChristoph Hellwig lasttime = jiffies;
1171664ffb8aSChristoph Hellwig xfs_buf_ioerror_alert(bp, __this_address);
1172664ffb8aSChristoph Hellwig }
1173664ffb8aSChristoph Hellwig lasttarg = bp->b_target;
1174664ffb8aSChristoph Hellwig }
1175664ffb8aSChristoph Hellwig
1176664ffb8aSChristoph Hellwig /*
1177664ffb8aSChristoph Hellwig * Account for this latest trip around the retry handler, and decide if
1178664ffb8aSChristoph Hellwig * we've failed enough times to constitute a permanent failure.
1179664ffb8aSChristoph Hellwig */
1180664ffb8aSChristoph Hellwig static bool
1181664ffb8aSChristoph Hellwig xfs_buf_ioerror_permanent(
1182664ffb8aSChristoph Hellwig struct xfs_buf *bp,
1183664ffb8aSChristoph Hellwig struct xfs_error_cfg *cfg)
1184664ffb8aSChristoph Hellwig {
1185664ffb8aSChristoph Hellwig struct xfs_mount *mp = bp->b_mount;
1186664ffb8aSChristoph Hellwig
1187664ffb8aSChristoph Hellwig if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
1188664ffb8aSChristoph Hellwig ++bp->b_retries > cfg->max_retries)
1189664ffb8aSChristoph Hellwig return true;
1190664ffb8aSChristoph Hellwig if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
1191664ffb8aSChristoph Hellwig time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
1192664ffb8aSChristoph Hellwig return true;
1193664ffb8aSChristoph Hellwig
1194664ffb8aSChristoph Hellwig /* At unmount we may treat errors differently */
11952e973b2cSDave Chinner if (xfs_is_unmounting(mp) && mp->m_fail_unmount)
1196664ffb8aSChristoph Hellwig return true;
1197664ffb8aSChristoph Hellwig
1198664ffb8aSChristoph Hellwig return false;
1199664ffb8aSChristoph Hellwig }
1200664ffb8aSChristoph Hellwig
1201664ffb8aSChristoph Hellwig /*
1202664ffb8aSChristoph Hellwig * On a sync write or shutdown we just want to stale the buffer and let the
1203664ffb8aSChristoph Hellwig * caller handle the error in bp->b_error appropriately.
1204664ffb8aSChristoph Hellwig *
1205664ffb8aSChristoph Hellwig * If the write was asynchronous then no one will be looking for the error. If
1206664ffb8aSChristoph Hellwig * this is the first failure of this type, clear the error state and write the
1207664ffb8aSChristoph Hellwig * buffer out again. This means we always retry an async write failure at least
1208664ffb8aSChristoph Hellwig * once, but we also need to set the buffer up to behave correctly now for
1209664ffb8aSChristoph Hellwig * repeated failures.
1210664ffb8aSChristoph Hellwig *
1211664ffb8aSChristoph Hellwig * If we get repeated async write failures, then we take action according to the
1212664ffb8aSChristoph Hellwig * error configuration we have been set up to use.
1213664ffb8aSChristoph Hellwig *
121470796c6bSChristoph Hellwig * Returns true if this function took care of error handling and the caller must
121570796c6bSChristoph Hellwig  * not touch the buffer again. Returns false if the caller should proceed with
121670796c6bSChristoph Hellwig * normal I/O completion handling.
1217664ffb8aSChristoph Hellwig */
121870796c6bSChristoph Hellwig static bool
121970796c6bSChristoph Hellwig xfs_buf_ioend_handle_error(
1220664ffb8aSChristoph Hellwig struct xfs_buf *bp)
1221664ffb8aSChristoph Hellwig {
1222664ffb8aSChristoph Hellwig struct xfs_mount *mp = bp->b_mount;
1223664ffb8aSChristoph Hellwig struct xfs_error_cfg *cfg;
1224664ffb8aSChristoph Hellwig
1225f58d0ea9SChristoph Hellwig /*
122601728b44SDave Chinner * If we've already shutdown the journal because of I/O errors, there's
122701728b44SDave Chinner * no point in giving this a retry.
1228f58d0ea9SChristoph Hellwig */
122901728b44SDave Chinner if (xlog_is_shutdown(mp->m_log))
1230f58d0ea9SChristoph Hellwig goto out_stale;
1231f58d0ea9SChristoph Hellwig
1232f58d0ea9SChristoph Hellwig xfs_buf_ioerror_alert_ratelimited(bp);
1233f58d0ea9SChristoph Hellwig
1234f58d0ea9SChristoph Hellwig /*
123522c10589SChristoph Hellwig * We're not going to bother about retrying this during recovery.
123622c10589SChristoph Hellwig * One strike!
123722c10589SChristoph Hellwig */
123822c10589SChristoph Hellwig if (bp->b_flags & _XBF_LOGRECOVERY) {
123922c10589SChristoph Hellwig xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
124022c10589SChristoph Hellwig return false;
124122c10589SChristoph Hellwig }
124222c10589SChristoph Hellwig
124322c10589SChristoph Hellwig /*
1244f58d0ea9SChristoph Hellwig * Synchronous writes will have callers process the error.
1245f58d0ea9SChristoph Hellwig */
1246f58d0ea9SChristoph Hellwig if (!(bp->b_flags & XBF_ASYNC))
1247664ffb8aSChristoph Hellwig goto out_stale;
1248664ffb8aSChristoph Hellwig
1249664ffb8aSChristoph Hellwig trace_xfs_buf_iodone_async(bp, _RET_IP_);
1250664ffb8aSChristoph Hellwig
1251664ffb8aSChristoph Hellwig cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
12523cc49884SChristoph Hellwig if (bp->b_last_error != bp->b_error ||
12533cc49884SChristoph Hellwig !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
12543cc49884SChristoph Hellwig bp->b_last_error = bp->b_error;
12553cc49884SChristoph Hellwig if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
12563cc49884SChristoph Hellwig !bp->b_first_retry_time)
12573cc49884SChristoph Hellwig bp->b_first_retry_time = jiffies;
12583cc49884SChristoph Hellwig goto resubmit;
1259664ffb8aSChristoph Hellwig }
1260664ffb8aSChristoph Hellwig
1261664ffb8aSChristoph Hellwig /*
1262664ffb8aSChristoph Hellwig 	 * Permanent error - we need to trigger a shutdown if we haven't already
1263664ffb8aSChristoph Hellwig 	 * done so, to indicate that inconsistency will result from this action.
1264664ffb8aSChristoph Hellwig */
1265664ffb8aSChristoph Hellwig if (xfs_buf_ioerror_permanent(bp, cfg)) {
1266664ffb8aSChristoph Hellwig xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1267664ffb8aSChristoph Hellwig goto out_stale;
1268664ffb8aSChristoph Hellwig }
1269664ffb8aSChristoph Hellwig
1270664ffb8aSChristoph Hellwig /* Still considered a transient error. Caller will schedule retries. */
1271844c9358SChristoph Hellwig if (bp->b_flags & _XBF_INODES)
1272844c9358SChristoph Hellwig xfs_buf_inode_io_fail(bp);
1273844c9358SChristoph Hellwig else if (bp->b_flags & _XBF_DQUOTS)
1274844c9358SChristoph Hellwig xfs_buf_dquot_io_fail(bp);
1275844c9358SChristoph Hellwig else
1276844c9358SChristoph Hellwig ASSERT(list_empty(&bp->b_li_list));
1277844c9358SChristoph Hellwig xfs_buf_ioerror(bp, 0);
1278844c9358SChristoph Hellwig xfs_buf_relse(bp);
127970796c6bSChristoph Hellwig return true;
1280664ffb8aSChristoph Hellwig
12813cc49884SChristoph Hellwig resubmit:
12823cc49884SChristoph Hellwig xfs_buf_ioerror(bp, 0);
128355b7d711SChristoph Hellwig bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
12843cc49884SChristoph Hellwig xfs_buf_submit(bp);
128570796c6bSChristoph Hellwig return true;
1286664ffb8aSChristoph Hellwig out_stale:
1287664ffb8aSChristoph Hellwig xfs_buf_stale(bp);
1288664ffb8aSChristoph Hellwig bp->b_flags |= XBF_DONE;
128955b7d711SChristoph Hellwig bp->b_flags &= ~XBF_WRITE;
1290664ffb8aSChristoph Hellwig trace_xfs_buf_error_relse(bp, _RET_IP_);
129170796c6bSChristoph Hellwig return false;
1292664ffb8aSChristoph Hellwig }
1293c59d87c4SChristoph Hellwig
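/*
 * Complete I/O on a buffer: fold any deferred bio error into b_error, run the
 * read verifier or the write error handling as appropriate, and then either
 * release the buffer (async I/O) or wake up the waiter (sync I/O).
 */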
129476b2d323SChristoph Hellwig static void
1295e8aaba9aSDave Chinner xfs_buf_ioend(
1296e8aaba9aSDave Chinner struct xfs_buf *bp)
1297c59d87c4SChristoph Hellwig {
1298e8aaba9aSDave Chinner trace_xfs_buf_iodone(bp, _RET_IP_);
12991813dd64SDave Chinner
130061be9c52SDave Chinner /*
130161be9c52SDave Chinner * Pull in IO completion errors now. We are guaranteed to be running
130261be9c52SDave Chinner * single threaded, so we don't need the lock to read b_io_error.
130361be9c52SDave Chinner */
130461be9c52SDave Chinner if (!bp->b_error && bp->b_io_error)
130561be9c52SDave Chinner xfs_buf_ioerror(bp, bp->b_io_error);
130661be9c52SDave Chinner
130755b7d711SChristoph Hellwig if (bp->b_flags & XBF_READ) {
1308b01d1461SDave Chinner if (!bp->b_error && bp->b_ops)
13091813dd64SDave Chinner bp->b_ops->verify_read(bp);
1310b01d1461SDave Chinner if (!bp->b_error)
1311b01d1461SDave Chinner bp->b_flags |= XBF_DONE;
131223fb5a93SChristoph Hellwig } else {
1313b6983e80SBrian Foster if (!bp->b_error) {
1314b6983e80SBrian Foster bp->b_flags &= ~XBF_WRITE_FAIL;
1315e8aaba9aSDave Chinner bp->b_flags |= XBF_DONE;
1316b6983e80SBrian Foster }
1317c59d87c4SChristoph Hellwig
131870796c6bSChristoph Hellwig if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
1319664ffb8aSChristoph Hellwig return;
1320664ffb8aSChristoph Hellwig
1321664ffb8aSChristoph Hellwig /* clear the retry state */
1322664ffb8aSChristoph Hellwig bp->b_last_error = 0;
1323664ffb8aSChristoph Hellwig bp->b_retries = 0;
1324664ffb8aSChristoph Hellwig bp->b_first_retry_time = 0;
1325664ffb8aSChristoph Hellwig
1326664ffb8aSChristoph Hellwig /*
1327664ffb8aSChristoph Hellwig * Note that for things like remote attribute buffers, there may
1328664ffb8aSChristoph Hellwig * not be a buffer log item here, so processing the buffer log
1329664ffb8aSChristoph Hellwig * item must remain optional.
1330664ffb8aSChristoph Hellwig */
1331664ffb8aSChristoph Hellwig if (bp->b_log_item)
1332664ffb8aSChristoph Hellwig xfs_buf_item_done(bp);
1333664ffb8aSChristoph Hellwig
133423fb5a93SChristoph Hellwig if (bp->b_flags & _XBF_INODES)
1335f593bf14SDave Chinner xfs_buf_inode_iodone(bp);
133623fb5a93SChristoph Hellwig else if (bp->b_flags & _XBF_DQUOTS)
13370c7e5afbSDave Chinner xfs_buf_dquot_iodone(bp);
133822c10589SChristoph Hellwig
13391813dd64SDave Chinner }
13406a7584b1SChristoph Hellwig
134122c10589SChristoph Hellwig bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
134222c10589SChristoph Hellwig _XBF_LOGRECOVERY);
134355b7d711SChristoph Hellwig
13446a7584b1SChristoph Hellwig if (bp->b_flags & XBF_ASYNC)
13456a7584b1SChristoph Hellwig xfs_buf_relse(bp);
13466a7584b1SChristoph Hellwig else
13476a7584b1SChristoph Hellwig complete(&bp->b_iowait);
134823fb5a93SChristoph Hellwig }
1349c59d87c4SChristoph Hellwig
1350e8aaba9aSDave Chinner static void
1351e8aaba9aSDave Chinner xfs_buf_ioend_work(
1352e8aaba9aSDave Chinner struct work_struct *work)
1353c59d87c4SChristoph Hellwig {
1354e8aaba9aSDave Chinner struct xfs_buf *bp =
1355e8222613SDave Chinner container_of(work, struct xfs_buf, b_ioend_work);
13561813dd64SDave Chinner
1357e8aaba9aSDave Chinner xfs_buf_ioend(bp);
1358c59d87c4SChristoph Hellwig }
1359c59d87c4SChristoph Hellwig
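/*
 * Defer I/O completion processing to the per-mount buffer workqueue so that
 * it runs in process context rather than from bio completion context.
 */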
1360211fe1a4SAlexander Kuleshov static void
1361e8aaba9aSDave Chinner xfs_buf_ioend_async(
1362e8aaba9aSDave Chinner struct xfs_buf *bp)
1363c59d87c4SChristoph Hellwig {
1364b29c70f5SBrian Foster INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
1365dbd329f1SChristoph Hellwig queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
1366c59d87c4SChristoph Hellwig }
1367c59d87c4SChristoph Hellwig
1368c59d87c4SChristoph Hellwig void
136931ca03c9SDarrick J. Wong __xfs_buf_ioerror(
1370e8222613SDave Chinner struct xfs_buf *bp,
137131ca03c9SDarrick J. Wong int error,
137231ca03c9SDarrick J. Wong xfs_failaddr_t failaddr)
1373c59d87c4SChristoph Hellwig {
13742451337dSDave Chinner ASSERT(error <= 0 && error >= -1000);
13752451337dSDave Chinner bp->b_error = error;
137631ca03c9SDarrick J. Wong trace_xfs_buf_ioerror(bp, error, failaddr);
1377c59d87c4SChristoph Hellwig }
1378c59d87c4SChristoph Hellwig
1379901796afSChristoph Hellwig void
1380901796afSChristoph Hellwig xfs_buf_ioerror_alert(
1381901796afSChristoph Hellwig struct xfs_buf *bp,
1382cdbcf82bSDarrick J. Wong xfs_failaddr_t func)
1383901796afSChristoph Hellwig {
1384f9bccfccSBrian Foster xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
1385cdbcf82bSDarrick J. Wong "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
138604fcad80SDave Chinner func, (uint64_t)xfs_buf_daddr(bp),
1387f9bccfccSBrian Foster bp->b_length, -bp->b_error);
1388901796afSChristoph Hellwig }
1389901796afSChristoph Hellwig
139054b3b1f6SBrian Foster /*
139154b3b1f6SBrian Foster * To simulate an I/O failure, the buffer must be locked and held with at least
139254b3b1f6SBrian Foster * three references. The LRU reference is dropped by the stale call. The buf
139354b3b1f6SBrian Foster * item reference is dropped via ioend processing. The third reference is owned
139454b3b1f6SBrian Foster * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
139554b3b1f6SBrian Foster */
139654b3b1f6SBrian Foster void
139754b3b1f6SBrian Foster xfs_buf_ioend_fail(
139854b3b1f6SBrian Foster struct xfs_buf *bp)
139954b3b1f6SBrian Foster {
140054b3b1f6SBrian Foster bp->b_flags &= ~XBF_DONE;
140154b3b1f6SBrian Foster xfs_buf_stale(bp);
140254b3b1f6SBrian Foster xfs_buf_ioerror(bp, -EIO);
140354b3b1f6SBrian Foster xfs_buf_ioend(bp);
1404c59d87c4SChristoph Hellwig }
1405c59d87c4SChristoph Hellwig
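/*
 * Write a buffer synchronously. A failed write here forces a filesystem
 * shutdown with SHUTDOWN_META_IO_ERROR.
 */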
1406a2dcf5dfSChristoph Hellwig int
1407a2dcf5dfSChristoph Hellwig xfs_bwrite(
1408a2dcf5dfSChristoph Hellwig struct xfs_buf *bp)
1409a2dcf5dfSChristoph Hellwig {
1410a2dcf5dfSChristoph Hellwig int error;
1411a2dcf5dfSChristoph Hellwig
1412a2dcf5dfSChristoph Hellwig ASSERT(xfs_buf_islocked(bp));
1413a2dcf5dfSChristoph Hellwig
1414a2dcf5dfSChristoph Hellwig bp->b_flags |= XBF_WRITE;
141527187754SDave Chinner bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
1416b6983e80SBrian Foster XBF_DONE);
1417a2dcf5dfSChristoph Hellwig
14186af88cdaSBrian Foster error = xfs_buf_submit(bp);
1419dbd329f1SChristoph Hellwig if (error)
1420dbd329f1SChristoph Hellwig xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
1421a2dcf5dfSChristoph Hellwig return error;
1422a2dcf5dfSChristoph Hellwig }
1423a2dcf5dfSChristoph Hellwig
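/*
 * Per-bio completion handler. Any error is stashed in b_io_error without
 * overwriting an earlier one, and the buffer is completed once the last
 * outstanding bio for it has finished.
 */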
14249bdd9bd6SBrian Foster static void
1425c59d87c4SChristoph Hellwig xfs_buf_bio_end_io(
14264246a0b6SChristoph Hellwig struct bio *bio)
1427c59d87c4SChristoph Hellwig {
14289bdd9bd6SBrian Foster struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private;
1429c59d87c4SChristoph Hellwig
14307376d745SBrian Foster if (!bio->bi_status &&
14317376d745SBrian Foster (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
143243dc0aa8SBrian Foster XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
14337376d745SBrian Foster bio->bi_status = BLK_STS_IOERR;
1434c59d87c4SChristoph Hellwig
143537eb17e6SDave Chinner /*
143637eb17e6SDave Chinner * don't overwrite existing errors - otherwise we can lose errors on
143737eb17e6SDave Chinner * buffers that require multiple bios to complete.
143837eb17e6SDave Chinner */
14394e4cbee9SChristoph Hellwig if (bio->bi_status) {
14404e4cbee9SChristoph Hellwig int error = blk_status_to_errno(bio->bi_status);
14414e4cbee9SChristoph Hellwig
14424e4cbee9SChristoph Hellwig cmpxchg(&bp->b_io_error, 0, error);
14434e4cbee9SChristoph Hellwig }
1444c59d87c4SChristoph Hellwig
144537eb17e6SDave Chinner if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1446c59d87c4SChristoph Hellwig invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1447c59d87c4SChristoph Hellwig
1448e8aaba9aSDave Chinner if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1449e8aaba9aSDave Chinner xfs_buf_ioend_async(bp);
1450c59d87c4SChristoph Hellwig bio_put(bio);
1451c59d87c4SChristoph Hellwig }
1452c59d87c4SChristoph Hellwig
14533e85c868SDave Chinner static void
14543e85c868SDave Chinner xfs_buf_ioapply_map(
14553e85c868SDave Chinner struct xfs_buf *bp,
14563e85c868SDave Chinner int map,
14573e85c868SDave Chinner int *buf_offset,
14583e85c868SDave Chinner int *count,
1459d03025aeSBart Van Assche blk_opf_t op)
1460c59d87c4SChristoph Hellwig {
14613e85c868SDave Chinner int page_index;
14625f7136dbSMatthew Wilcox (Oracle) unsigned int total_nr_pages = bp->b_page_count;
14633e85c868SDave Chinner int nr_pages;
1464c59d87c4SChristoph Hellwig struct bio *bio;
14653e85c868SDave Chinner sector_t sector = bp->b_maps[map].bm_bn;
14663e85c868SDave Chinner int size;
14673e85c868SDave Chinner int offset;
1468c59d87c4SChristoph Hellwig
14693e85c868SDave Chinner /* skip the pages in the buffer before the start offset */
14703e85c868SDave Chinner page_index = 0;
14713e85c868SDave Chinner offset = *buf_offset;
14723e85c868SDave Chinner while (offset >= PAGE_SIZE) {
14733e85c868SDave Chinner page_index++;
14743e85c868SDave Chinner offset -= PAGE_SIZE;
1475c59d87c4SChristoph Hellwig }
1476c59d87c4SChristoph Hellwig
14773e85c868SDave Chinner /*
14783e85c868SDave Chinner * Limit the IO size to the length of the current vector, and update the
14793e85c868SDave Chinner * remaining IO count for the next time around.
14803e85c868SDave Chinner */
14813e85c868SDave Chinner size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
14823e85c868SDave Chinner *count -= size;
14833e85c868SDave Chinner *buf_offset += size;
1484c59d87c4SChristoph Hellwig
1485c59d87c4SChristoph Hellwig next_chunk:
1486c59d87c4SChristoph Hellwig atomic_inc(&bp->b_io_remaining);
14875f7136dbSMatthew Wilcox (Oracle) nr_pages = bio_max_segs(total_nr_pages);
1488c59d87c4SChristoph Hellwig
148907888c66SChristoph Hellwig bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
14904f024f37SKent Overstreet bio->bi_iter.bi_sector = sector;
1491c59d87c4SChristoph Hellwig bio->bi_end_io = xfs_buf_bio_end_io;
1492c59d87c4SChristoph Hellwig bio->bi_private = bp;
1493c59d87c4SChristoph Hellwig
14943e85c868SDave Chinner for (; size && nr_pages; nr_pages--, page_index++) {
1495c59d87c4SChristoph Hellwig int rbytes, nbytes = PAGE_SIZE - offset;
1496c59d87c4SChristoph Hellwig
1497c59d87c4SChristoph Hellwig if (nbytes > size)
1498c59d87c4SChristoph Hellwig nbytes = size;
1499c59d87c4SChristoph Hellwig
15003e85c868SDave Chinner rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
15013e85c868SDave Chinner offset);
1502c59d87c4SChristoph Hellwig if (rbytes < nbytes)
1503c59d87c4SChristoph Hellwig break;
1504c59d87c4SChristoph Hellwig
1505c59d87c4SChristoph Hellwig offset = 0;
1506aa0e8833SDave Chinner sector += BTOBB(nbytes);
1507c59d87c4SChristoph Hellwig size -= nbytes;
1508c59d87c4SChristoph Hellwig total_nr_pages--;
1509c59d87c4SChristoph Hellwig }
1510c59d87c4SChristoph Hellwig
15114f024f37SKent Overstreet if (likely(bio->bi_iter.bi_size)) {
1512c59d87c4SChristoph Hellwig if (xfs_buf_is_vmapped(bp)) {
1513c59d87c4SChristoph Hellwig flush_kernel_vmap_range(bp->b_addr,
1514c59d87c4SChristoph Hellwig xfs_buf_vmap_len(bp));
1515c59d87c4SChristoph Hellwig }
15164e49ea4aSMike Christie submit_bio(bio);
1517c59d87c4SChristoph Hellwig if (size)
1518c59d87c4SChristoph Hellwig goto next_chunk;
1519c59d87c4SChristoph Hellwig } else {
152037eb17e6SDave Chinner /*
152137eb17e6SDave Chinner * This is guaranteed not to be the last io reference count
1522595bff75SDave Chinner * because the caller (xfs_buf_submit) holds a count itself.
152337eb17e6SDave Chinner */
152437eb17e6SDave Chinner atomic_dec(&bp->b_io_remaining);
15252451337dSDave Chinner xfs_buf_ioerror(bp, -EIO);
1526c59d87c4SChristoph Hellwig bio_put(bio);
1527c59d87c4SChristoph Hellwig }
15283e85c868SDave Chinner
15293e85c868SDave Chinner }
15303e85c868SDave Chinner
15313e85c868SDave Chinner STATIC void
15323e85c868SDave Chinner _xfs_buf_ioapply(
15333e85c868SDave Chinner struct xfs_buf *bp)
15343e85c868SDave Chinner {
15353e85c868SDave Chinner struct blk_plug plug;
1536d03025aeSBart Van Assche blk_opf_t op;
15373e85c868SDave Chinner int offset;
15383e85c868SDave Chinner int size;
15393e85c868SDave Chinner int i;
15403e85c868SDave Chinner
1541c163f9a1SDave Chinner /*
1542c163f9a1SDave Chinner * Make sure we capture only current IO errors rather than stale errors
1543c163f9a1SDave Chinner * left over from previous use of the buffer (e.g. failed readahead).
1544c163f9a1SDave Chinner */
1545c163f9a1SDave Chinner bp->b_error = 0;
1546c163f9a1SDave Chinner
15473e85c868SDave Chinner if (bp->b_flags & XBF_WRITE) {
154850bfcd0cSMike Christie op = REQ_OP_WRITE;
15491813dd64SDave Chinner
15501813dd64SDave Chinner /*
15511813dd64SDave Chinner * Run the write verifier callback function if it exists. If
15521813dd64SDave Chinner * this function fails it will mark the buffer with an error and
15531813dd64SDave Chinner * the IO should not be dispatched.
15541813dd64SDave Chinner */
15551813dd64SDave Chinner if (bp->b_ops) {
15561813dd64SDave Chinner bp->b_ops->verify_write(bp);
15571813dd64SDave Chinner if (bp->b_error) {
1558dbd329f1SChristoph Hellwig xfs_force_shutdown(bp->b_mount,
15591813dd64SDave Chinner SHUTDOWN_CORRUPT_INCORE);
15601813dd64SDave Chinner return;
15611813dd64SDave Chinner }
15624c7f65aeSDave Chinner } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) {
1563dbd329f1SChristoph Hellwig struct xfs_mount *mp = bp->b_mount;
1564400b9d88SDave Chinner
1565400b9d88SDave Chinner /*
1566400b9d88SDave Chinner * non-crc filesystems don't attach verifiers during
1567400b9d88SDave Chinner * log recovery, so don't warn for such filesystems.
1568400b9d88SDave Chinner */
156938c26bfdSDave Chinner if (xfs_has_crc(mp)) {
1570400b9d88SDave Chinner xfs_warn(mp,
1571c219b015SDarrick J. Wong "%s: no buf ops on daddr 0x%llx len %d",
15724c7f65aeSDave Chinner __func__, xfs_buf_daddr(bp),
15734c7f65aeSDave Chinner bp->b_length);
15749c712a13SDarrick J. Wong xfs_hex_dump(bp->b_addr,
15759c712a13SDarrick J. Wong XFS_CORRUPTION_DUMP_LEN);
1576400b9d88SDave Chinner dump_stack();
1577400b9d88SDave Chinner }
15781813dd64SDave Chinner }
15793e85c868SDave Chinner } else {
158050bfcd0cSMike Christie op = REQ_OP_READ;
15812123ef85SChristoph Hellwig if (bp->b_flags & XBF_READ_AHEAD)
15822123ef85SChristoph Hellwig op |= REQ_RAHEAD;
15833e85c868SDave Chinner }
15843e85c868SDave Chinner
15853e85c868SDave Chinner /* we only use the buffer cache for meta-data */
15862123ef85SChristoph Hellwig op |= REQ_META;
15873e85c868SDave Chinner
15883e85c868SDave Chinner /*
15893e85c868SDave Chinner * Walk all the vectors issuing IO on them. Set up the initial offset
15903e85c868SDave Chinner * into the buffer and the desired IO size before we start -
15913e85c868SDave Chinner  * xfs_buf_ioapply_map() will modify them appropriately for each
15923e85c868SDave Chinner * subsequent call.
15933e85c868SDave Chinner */
15943e85c868SDave Chinner offset = bp->b_offset;
15958124b9b6SChristoph Hellwig size = BBTOB(bp->b_length);
15963e85c868SDave Chinner blk_start_plug(&plug);
15973e85c868SDave Chinner for (i = 0; i < bp->b_map_count; i++) {
15982123ef85SChristoph Hellwig xfs_buf_ioapply_map(bp, i, &offset, &size, op);
15993e85c868SDave Chinner if (bp->b_error)
16003e85c868SDave Chinner break;
16013e85c868SDave Chinner if (size <= 0)
16023e85c868SDave Chinner break; /* all done */
16033e85c868SDave Chinner }
16043e85c868SDave Chinner blk_finish_plug(&plug);
1605c59d87c4SChristoph Hellwig }
1606c59d87c4SChristoph Hellwig
1607595bff75SDave Chinner /*
1608bb00b6f1SBrian Foster * Wait for I/O completion of a sync buffer and return the I/O error code.
1609595bff75SDave Chinner */
1610eaebb515SBrian Foster static int
1611bb00b6f1SBrian Foster xfs_buf_iowait(
1612595bff75SDave Chinner struct xfs_buf *bp)
1613c59d87c4SChristoph Hellwig {
1614bb00b6f1SBrian Foster ASSERT(!(bp->b_flags & XBF_ASYNC));
1615bb00b6f1SBrian Foster
1616bb00b6f1SBrian Foster trace_xfs_buf_iowait(bp, _RET_IP_);
1617bb00b6f1SBrian Foster wait_for_completion(&bp->b_iowait);
1618bb00b6f1SBrian Foster trace_xfs_buf_iowait_done(bp, _RET_IP_);
1619bb00b6f1SBrian Foster
1620bb00b6f1SBrian Foster return bp->b_error;
1621bb00b6f1SBrian Foster }
1622bb00b6f1SBrian Foster
1623bb00b6f1SBrian Foster /*
1624bb00b6f1SBrian Foster * Buffer I/O submission path, read or write. Asynchronous submission transfers
1625bb00b6f1SBrian Foster * the buffer lock ownership and the current reference to the IO. It is not
1626bb00b6f1SBrian Foster * safe to reference the buffer after a call to this function unless the caller
1627bb00b6f1SBrian Foster * holds an additional reference itself.
1628bb00b6f1SBrian Foster */
162926e32875SChristoph Hellwig static int
1630bb00b6f1SBrian Foster __xfs_buf_submit(
1631bb00b6f1SBrian Foster struct xfs_buf *bp,
1632bb00b6f1SBrian Foster bool wait)
1633bb00b6f1SBrian Foster {
1634bb00b6f1SBrian Foster int error = 0;
1635bb00b6f1SBrian Foster
1636595bff75SDave Chinner trace_xfs_buf_submit(bp, _RET_IP_);
1637c59d87c4SChristoph Hellwig
163843ff2122SChristoph Hellwig ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
1639595bff75SDave Chinner
164001728b44SDave Chinner /*
164101728b44SDave Chinner * On log shutdown we stale and complete the buffer immediately. We can
164201728b44SDave Chinner * be called to read the superblock before the log has been set up, so
164301728b44SDave Chinner * be careful checking the log state.
164401728b44SDave Chinner *
164501728b44SDave Chinner * Checking the mount shutdown state here can result in the log tail
164601728b44SDave Chinner * moving inappropriately on disk as the log may not yet be shut down.
164701728b44SDave Chinner * i.e. failing this buffer on mount shutdown can remove it from the AIL
164801728b44SDave Chinner * and move the tail of the log forwards without having written this
164901728b44SDave Chinner * buffer to disk. This corrupts the log tail state in memory, and
165001728b44SDave Chinner * because the log may not be shut down yet, it can then be propagated
165101728b44SDave Chinner * to disk before the log is shutdown. Hence we check log shutdown
165201728b44SDave Chinner * state here rather than mount state to avoid corrupting the log tail
165301728b44SDave Chinner * on shutdown.
165401728b44SDave Chinner */
165501728b44SDave Chinner if (bp->b_mount->m_log &&
165601728b44SDave Chinner xlog_is_shutdown(bp->b_mount->m_log)) {
165754b3b1f6SBrian Foster xfs_buf_ioend_fail(bp);
1658eaebb515SBrian Foster return -EIO;
1659595bff75SDave Chinner }
1660c59d87c4SChristoph Hellwig
1661bb00b6f1SBrian Foster /*
1662bb00b6f1SBrian Foster * Grab a reference so the buffer does not go away underneath us. For
1663bb00b6f1SBrian Foster * async buffers, I/O completion drops the callers reference, which
1664bb00b6f1SBrian Foster * could occur before submission returns.
1665bb00b6f1SBrian Foster */
1666bb00b6f1SBrian Foster xfs_buf_hold(bp);
1667bb00b6f1SBrian Foster
1668375ec69dSChristoph Hellwig if (bp->b_flags & XBF_WRITE)
1669c59d87c4SChristoph Hellwig xfs_buf_wait_unpin(bp);
1670c59d87c4SChristoph Hellwig
167161be9c52SDave Chinner /* clear the internal error state to avoid spurious errors */
167261be9c52SDave Chinner bp->b_io_error = 0;
167361be9c52SDave Chinner
16748d6c1210SEric Sandeen /*
1675eaebb515SBrian Foster 	 * Set the count to 1 initially; this stops an I/O completion
1676eaebb515SBrian Foster 	 * callout that happens before we have started all the I/O from
1677eaebb515SBrian Foster 	 * calling xfs_buf_ioend too early.
1678eaebb515SBrian Foster */
1679eaebb515SBrian Foster atomic_set(&bp->b_io_remaining, 1);
1680eaebb515SBrian Foster if (bp->b_flags & XBF_ASYNC)
1681eaebb515SBrian Foster xfs_buf_ioacct_inc(bp);
1682eaebb515SBrian Foster _xfs_buf_ioapply(bp);
1683eaebb515SBrian Foster
1684eaebb515SBrian Foster /*
1685eaebb515SBrian Foster * If _xfs_buf_ioapply failed, we can get back here with only the IO
1686eaebb515SBrian Foster * reference we took above. If we drop it to zero, run completion so
1687eaebb515SBrian Foster * that we don't return to the caller with completion still pending.
1688eaebb515SBrian Foster */
1689eaebb515SBrian Foster if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
1690eaebb515SBrian Foster if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
1691eaebb515SBrian Foster xfs_buf_ioend(bp);
1692eaebb515SBrian Foster else
1693eaebb515SBrian Foster xfs_buf_ioend_async(bp);
1694eaebb515SBrian Foster }
1695eaebb515SBrian Foster
16966af88cdaSBrian Foster if (wait)
16976af88cdaSBrian Foster error = xfs_buf_iowait(bp);
1698bb00b6f1SBrian Foster
1699595bff75SDave Chinner /*
17006af88cdaSBrian Foster * Release the hold that keeps the buffer referenced for the entire
17016af88cdaSBrian Foster * I/O. Note that if the buffer is async, it is not safe to reference
17026af88cdaSBrian Foster * after this release.
1703595bff75SDave Chinner */
1704595bff75SDave Chinner xfs_buf_rele(bp);
1705595bff75SDave Chinner return error;
1706c59d87c4SChristoph Hellwig }
1707c59d87c4SChristoph Hellwig
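/*
 * Return a pointer to the data at the given byte offset into the buffer,
 * whether the buffer is contiguously mapped or made up of discontiguous
 * pages.
 */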
170888ee2df7SChristoph Hellwig void *
1709c59d87c4SChristoph Hellwig xfs_buf_offset(
171088ee2df7SChristoph Hellwig struct xfs_buf *bp,
1711c59d87c4SChristoph Hellwig size_t offset)
1712c59d87c4SChristoph Hellwig {
1713c59d87c4SChristoph Hellwig struct page *page;
1714c59d87c4SChristoph Hellwig
1715611c9946SDave Chinner if (bp->b_addr)
1716c59d87c4SChristoph Hellwig return bp->b_addr + offset;
1717c59d87c4SChristoph Hellwig
1718c59d87c4SChristoph Hellwig page = bp->b_pages[offset >> PAGE_SHIFT];
171988ee2df7SChristoph Hellwig return page_address(page) + (offset & (PAGE_SIZE-1));
1720c59d87c4SChristoph Hellwig }
1721c59d87c4SChristoph Hellwig
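/* Zero bsize bytes of the buffer, starting at byte offset boff. */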
1722c59d87c4SChristoph Hellwig void
1723f9a196eeSChristoph Hellwig xfs_buf_zero(
1724f9a196eeSChristoph Hellwig struct xfs_buf *bp,
1725f9a196eeSChristoph Hellwig size_t boff,
1726f9a196eeSChristoph Hellwig size_t bsize)
1727c59d87c4SChristoph Hellwig {
1728795cac72SDave Chinner size_t bend;
1729c59d87c4SChristoph Hellwig
1730c59d87c4SChristoph Hellwig bend = boff + bsize;
1731c59d87c4SChristoph Hellwig while (boff < bend) {
1732795cac72SDave Chinner struct page *page;
1733795cac72SDave Chinner int page_index, page_offset, csize;
1734c59d87c4SChristoph Hellwig
1735795cac72SDave Chinner page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
1736795cac72SDave Chinner page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
1737795cac72SDave Chinner page = bp->b_pages[page_index];
1738795cac72SDave Chinner csize = min_t(size_t, PAGE_SIZE - page_offset,
17398124b9b6SChristoph Hellwig BBTOB(bp->b_length) - boff);
1740795cac72SDave Chinner
1741795cac72SDave Chinner ASSERT((csize + page_offset) <= PAGE_SIZE);
1742c59d87c4SChristoph Hellwig
1743795cac72SDave Chinner memset(page_address(page) + page_offset, 0, csize);
1744c59d87c4SChristoph Hellwig
1745c59d87c4SChristoph Hellwig boff += csize;
1746c59d87c4SChristoph Hellwig }
1747c59d87c4SChristoph Hellwig }
1748c59d87c4SChristoph Hellwig
1749c59d87c4SChristoph Hellwig /*
17508d57c216SDarrick J. Wong * Log a message about and stale a buffer that a caller has decided is corrupt.
17518d57c216SDarrick J. Wong *
17528d57c216SDarrick J. Wong * This function should be called for the kinds of metadata corruption that
17538d57c216SDarrick J. Wong  * cannot be detected by a verifier, such as incorrect inter-block relationship
17548d57c216SDarrick J. Wong * data. Do /not/ call this function from a verifier function.
17558d57c216SDarrick J. Wong *
17568d57c216SDarrick J. Wong * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will
17578d57c216SDarrick J. Wong * be marked stale, but b_error will not be set. The caller is responsible for
17588d57c216SDarrick J. Wong * releasing the buffer or fixing it.
17598d57c216SDarrick J. Wong */
17608d57c216SDarrick J. Wong void
17618d57c216SDarrick J. Wong __xfs_buf_mark_corrupt(
17628d57c216SDarrick J. Wong struct xfs_buf *bp,
17638d57c216SDarrick J. Wong xfs_failaddr_t fa)
17648d57c216SDarrick J. Wong {
17658d57c216SDarrick J. Wong ASSERT(bp->b_flags & XBF_DONE);
17668d57c216SDarrick J. Wong
1767e83cf875SDarrick J. Wong xfs_buf_corruption_error(bp, fa);
17688d57c216SDarrick J. Wong xfs_buf_stale(bp);
17698d57c216SDarrick J. Wong }
17708d57c216SDarrick J. Wong
17718d57c216SDarrick J. Wong /*
1772c59d87c4SChristoph Hellwig * Handling of buffer targets (buftargs).
1773c59d87c4SChristoph Hellwig */
1774c59d87c4SChristoph Hellwig
1775c59d87c4SChristoph Hellwig /*
1776c59d87c4SChristoph Hellwig * Wait for any bufs with callbacks that have been submitted but have not yet
1777c59d87c4SChristoph Hellwig * returned. These buffers will have an elevated hold count, so wait on those
1778c59d87c4SChristoph Hellwig * while freeing all the buffers only held by the LRU.
1779c59d87c4SChristoph Hellwig */
1780e80dfa19SDave Chinner static enum lru_status
178110fb9ac1SBrian Foster xfs_buftarg_drain_rele(
1782e80dfa19SDave Chinner struct list_head *item,
17833f97b163SVladimir Davydov struct list_lru_one *lru,
1784e80dfa19SDave Chinner spinlock_t *lru_lock,
1785e80dfa19SDave Chinner void *arg)
1786c59d87c4SChristoph Hellwig
1787e80dfa19SDave Chinner {
1788e80dfa19SDave Chinner struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
1789a4082357SDave Chinner struct list_head *dispose = arg;
1790e80dfa19SDave Chinner
1791c59d87c4SChristoph Hellwig if (atomic_read(&bp->b_hold) > 1) {
1792a4082357SDave Chinner /* need to wait, so skip it this pass */
179310fb9ac1SBrian Foster trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
1794a4082357SDave Chinner return LRU_SKIP;
1795a4082357SDave Chinner }
1796a4082357SDave Chinner if (!spin_trylock(&bp->b_lock))
1797a4082357SDave Chinner return LRU_SKIP;
1798a4082357SDave Chinner
1799c59d87c4SChristoph Hellwig /*
180090802ed9SPaul Bolle * clear the LRU reference count so the buffer doesn't get
1801c59d87c4SChristoph Hellwig * ignored in xfs_buf_rele().
1802c59d87c4SChristoph Hellwig */
1803c59d87c4SChristoph Hellwig atomic_set(&bp->b_lru_ref, 0);
1804a4082357SDave Chinner bp->b_state |= XFS_BSTATE_DISPOSE;
18053f97b163SVladimir Davydov list_lru_isolate_move(lru, item, dispose);
1806a4082357SDave Chinner spin_unlock(&bp->b_lock);
1807a4082357SDave Chinner return LRU_REMOVED;
1808e80dfa19SDave Chinner }
1809e80dfa19SDave Chinner
18108321ddb2SBrian Foster /*
18118321ddb2SBrian Foster * Wait for outstanding I/O on the buftarg to complete.
18128321ddb2SBrian Foster */
1813e80dfa19SDave Chinner void
18148321ddb2SBrian Foster xfs_buftarg_wait(
1815e80dfa19SDave Chinner struct xfs_buftarg *btp)
1816c59d87c4SChristoph Hellwig {
181785bec546SDave Chinner /*
18189c7504aaSBrian Foster * First wait on the buftarg I/O count for all in-flight buffers to be
18199c7504aaSBrian Foster * released. This is critical as new buffers do not make the LRU until
18209c7504aaSBrian Foster * they are released.
18219c7504aaSBrian Foster *
18229c7504aaSBrian Foster * Next, flush the buffer workqueue to ensure all completion processing
18239c7504aaSBrian Foster * has finished. Just waiting on buffer locks is not sufficient for
18249c7504aaSBrian Foster * async IO as the reference count held over IO is not released until
18259c7504aaSBrian Foster * after the buffer lock is dropped. Hence we need to ensure here that
18269c7504aaSBrian Foster * all reference counts have been dropped before we start walking the
18279c7504aaSBrian Foster * LRU list.
182885bec546SDave Chinner */
18299c7504aaSBrian Foster while (percpu_counter_sum(&btp->bt_io_count))
18309c7504aaSBrian Foster delay(100);
1831800b2694SBrian Foster flush_workqueue(btp->bt_mount->m_buf_workqueue);
18328321ddb2SBrian Foster }
18338321ddb2SBrian Foster
18348321ddb2SBrian Foster void
18358321ddb2SBrian Foster xfs_buftarg_drain(
18368321ddb2SBrian Foster struct xfs_buftarg *btp)
18378321ddb2SBrian Foster {
18388321ddb2SBrian Foster LIST_HEAD(dispose);
18398321ddb2SBrian Foster int loop = 0;
18408321ddb2SBrian Foster bool write_fail = false;
18418321ddb2SBrian Foster
18428321ddb2SBrian Foster xfs_buftarg_wait(btp);
184385bec546SDave Chinner
1844a4082357SDave Chinner /* loop until there is nothing left on the lru list. */
1845a4082357SDave Chinner while (list_lru_count(&btp->bt_lru)) {
184610fb9ac1SBrian Foster list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele,
1847a4082357SDave Chinner &dispose, LONG_MAX);
1848a4082357SDave Chinner
1849a4082357SDave Chinner while (!list_empty(&dispose)) {
1850a4082357SDave Chinner struct xfs_buf *bp;
1851a4082357SDave Chinner bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1852a4082357SDave Chinner list_del_init(&bp->b_lru);
1853ac8809f9SDave Chinner if (bp->b_flags & XBF_WRITE_FAIL) {
185461948b6fSBrian Foster write_fail = true;
185561948b6fSBrian Foster xfs_buf_alert_ratelimited(bp,
185661948b6fSBrian Foster "XFS: Corruption Alert",
1857c219b015SDarrick J. Wong "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
18584c7f65aeSDave Chinner (long long)xfs_buf_daddr(bp));
1859ac8809f9SDave Chinner }
1860a4082357SDave Chinner xfs_buf_rele(bp);
1861a4082357SDave Chinner }
1862a4082357SDave Chinner if (loop++ != 0)
1863a4082357SDave Chinner delay(100);
1864a4082357SDave Chinner }
186561948b6fSBrian Foster
186661948b6fSBrian Foster /*
186761948b6fSBrian Foster * If one or more failed buffers were freed, that means dirty metadata
186861948b6fSBrian Foster * was thrown away. This should only ever happen after I/O completion
186961948b6fSBrian Foster * handling has elevated I/O error(s) to permanent failures and shuts
187001728b44SDave Chinner * down the journal.
187161948b6fSBrian Foster */
187261948b6fSBrian Foster if (write_fail) {
187301728b44SDave Chinner ASSERT(xlog_is_shutdown(btp->bt_mount->m_log));
187461948b6fSBrian Foster xfs_alert(btp->bt_mount,
187561948b6fSBrian Foster "Please run xfs_repair to determine the extent of the problem.");
187661948b6fSBrian Foster }
1877e80dfa19SDave Chinner }
1878c59d87c4SChristoph Hellwig
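/*
 * LRU walk callback for the buftarg shrinker: rotate buffers that still have
 * a non-zero b_lru_ref, otherwise mark them for disposal and move them to the
 * dispose list for freeing.
 */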
1879e80dfa19SDave Chinner static enum lru_status
1880e80dfa19SDave Chinner xfs_buftarg_isolate(
1881e80dfa19SDave Chinner struct list_head *item,
18823f97b163SVladimir Davydov struct list_lru_one *lru,
1883e80dfa19SDave Chinner spinlock_t *lru_lock,
1884e80dfa19SDave Chinner void *arg)
1885e80dfa19SDave Chinner {
1886e80dfa19SDave Chinner struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
1887e80dfa19SDave Chinner struct list_head *dispose = arg;
1888c59d87c4SChristoph Hellwig
1889c59d87c4SChristoph Hellwig /*
1890a4082357SDave Chinner * we are inverting the lru lock/bp->b_lock here, so use a trylock.
1891a4082357SDave Chinner * If we fail to get the lock, just skip it.
1892a4082357SDave Chinner */
1893a4082357SDave Chinner if (!spin_trylock(&bp->b_lock))
1894a4082357SDave Chinner return LRU_SKIP;
1895a4082357SDave Chinner /*
1896c59d87c4SChristoph Hellwig * Decrement the b_lru_ref count unless the value is already
1897c59d87c4SChristoph Hellwig * zero. If the value is already zero, we need to reclaim the
1898c59d87c4SChristoph Hellwig * buffer, otherwise it gets another trip through the LRU.
1899c59d87c4SChristoph Hellwig */
190019957a18SVratislav Bendel if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1901a4082357SDave Chinner spin_unlock(&bp->b_lock);
1902e80dfa19SDave Chinner return LRU_ROTATE;
1903a4082357SDave Chinner }
1904e80dfa19SDave Chinner
1905a4082357SDave Chinner bp->b_state |= XFS_BSTATE_DISPOSE;
19063f97b163SVladimir Davydov list_lru_isolate_move(lru, item, dispose);
1907a4082357SDave Chinner spin_unlock(&bp->b_lock);
1908e80dfa19SDave Chinner return LRU_REMOVED;
1909c59d87c4SChristoph Hellwig }
1910c59d87c4SChristoph Hellwig
1911addbda40SAndrew Morton static unsigned long
1912e80dfa19SDave Chinner xfs_buftarg_shrink_scan(
1913e80dfa19SDave Chinner struct shrinker *shrink,
1914e80dfa19SDave Chinner struct shrink_control *sc)
1915e80dfa19SDave Chinner {
1916e80dfa19SDave Chinner struct xfs_buftarg *btp = container_of(shrink,
1917e80dfa19SDave Chinner struct xfs_buftarg, bt_shrinker);
1918e80dfa19SDave Chinner LIST_HEAD(dispose);
1919addbda40SAndrew Morton unsigned long freed;
1920e80dfa19SDave Chinner
1921503c358cSVladimir Davydov freed = list_lru_shrink_walk(&btp->bt_lru, sc,
1922503c358cSVladimir Davydov xfs_buftarg_isolate, &dispose);
1923c59d87c4SChristoph Hellwig
1924c59d87c4SChristoph Hellwig while (!list_empty(&dispose)) {
1925e80dfa19SDave Chinner struct xfs_buf *bp;
1926c59d87c4SChristoph Hellwig bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1927c59d87c4SChristoph Hellwig list_del_init(&bp->b_lru);
1928c59d87c4SChristoph Hellwig xfs_buf_rele(bp);
1929c59d87c4SChristoph Hellwig }
1930c59d87c4SChristoph Hellwig
1931e80dfa19SDave Chinner return freed;
1932e80dfa19SDave Chinner }
1933e80dfa19SDave Chinner
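/* Report the number of buffers on this buftarg's LRU to the shrinker. */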
1934addbda40SAndrew Morton static unsigned long
1935e80dfa19SDave Chinner xfs_buftarg_shrink_count(
1936e80dfa19SDave Chinner struct shrinker *shrink,
1937e80dfa19SDave Chinner struct shrink_control *sc)
1938e80dfa19SDave Chinner {
1939e80dfa19SDave Chinner struct xfs_buftarg *btp = container_of(shrink,
1940e80dfa19SDave Chinner struct xfs_buftarg, bt_shrinker);
1941503c358cSVladimir Davydov return list_lru_shrink_count(&btp->bt_lru, sc);
1942c59d87c4SChristoph Hellwig }
1943c59d87c4SChristoph Hellwig
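/*
 * Tear down a buffer target: unregister the shrinker, destroy the LRU and
 * the I/O accounting counter, and release the block and DAX devices.
 */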
1944c59d87c4SChristoph Hellwig void
1945c59d87c4SChristoph Hellwig xfs_free_buftarg(
1946c59d87c4SChristoph Hellwig struct xfs_buftarg *btp)
1947c59d87c4SChristoph Hellwig {
194841233576SChristoph Hellwig struct block_device *bdev = btp->bt_bdev;
194941233576SChristoph Hellwig
1950c59d87c4SChristoph Hellwig unregister_shrinker(&btp->bt_shrinker);
19519c7504aaSBrian Foster ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
19529c7504aaSBrian Foster percpu_counter_destroy(&btp->bt_io_count);
1953f5e1dd34SGlauber Costa list_lru_destroy(&btp->bt_lru);
1954c59d87c4SChristoph Hellwig
19556f643c57SShiyang Ruan fs_put_dax(btp->bt_daxdev, btp->bt_mount);
195641233576SChristoph Hellwig /* the main block device is closed by kill_block_super */
195741233576SChristoph Hellwig if (bdev != btp->bt_mount->m_super->s_bdev)
19582ea6f689SChristoph Hellwig blkdev_put(bdev, btp->bt_mount->m_super);
1959c59d87c4SChristoph Hellwig
1960c59d87c4SChristoph Hellwig kmem_free(btp);
1961c59d87c4SChristoph Hellwig }
1962c59d87c4SChristoph Hellwig
19633fefdeeeSEric Sandeen int
19643fefdeeeSEric Sandeen xfs_setsize_buftarg(
1965c59d87c4SChristoph Hellwig xfs_buftarg_t *btp,
19663fefdeeeSEric Sandeen unsigned int sectorsize)
1967c59d87c4SChristoph Hellwig {
19687c71ee78SEric Sandeen /* Set up metadata sector size info */
19696da54179SEric Sandeen btp->bt_meta_sectorsize = sectorsize;
19706da54179SEric Sandeen btp->bt_meta_sectormask = sectorsize - 1;
1971c59d87c4SChristoph Hellwig
1972c59d87c4SChristoph Hellwig if (set_blocksize(btp->bt_bdev, sectorsize)) {
1973c59d87c4SChristoph Hellwig xfs_warn(btp->bt_mount,
1974a1c6f057SDmitry Monakhov "Cannot set_blocksize to %u on device %pg",
1975a1c6f057SDmitry Monakhov sectorsize, btp->bt_bdev);
19762451337dSDave Chinner return -EINVAL;
1977c59d87c4SChristoph Hellwig }
1978c59d87c4SChristoph Hellwig
19797c71ee78SEric Sandeen /* Set up device logical sector size mask */
19807c71ee78SEric Sandeen btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
19817c71ee78SEric Sandeen btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
19827c71ee78SEric Sandeen
1983c59d87c4SChristoph Hellwig return 0;
1984c59d87c4SChristoph Hellwig }
1985c59d87c4SChristoph Hellwig
1986c59d87c4SChristoph Hellwig /*
1987c59d87c4SChristoph Hellwig * When allocating the initial buffer target we have not yet
1988c59d87c4SChristoph Hellwig  * read in the superblock, so we don't know what size sectors
19898b4ad79cSZhi Yong Wu  * are being used at this early stage.  Play it safe.
1990c59d87c4SChristoph Hellwig */
1991c59d87c4SChristoph Hellwig STATIC int
1992c59d87c4SChristoph Hellwig xfs_setsize_buftarg_early(
1993c59d87c4SChristoph Hellwig xfs_buftarg_t *btp,
1994c59d87c4SChristoph Hellwig struct block_device *bdev)
1995c59d87c4SChristoph Hellwig {
1996a96c4151SEric Sandeen return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
1997c59d87c4SChristoph Hellwig }
1998c59d87c4SChristoph Hellwig
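/*
 * Allocate and initialise a buffer target for the given block device, setting
 * up the sector sizes, error rate limiting, LRU, I/O accounting counter and
 * memory shrinker.
 */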
19995b5abbefSChristoph Hellwig struct xfs_buftarg *
2000c59d87c4SChristoph Hellwig xfs_alloc_buftarg(
2001c59d87c4SChristoph Hellwig struct xfs_mount *mp,
20025b5abbefSChristoph Hellwig struct block_device *bdev)
2003c59d87c4SChristoph Hellwig {
2004c59d87c4SChristoph Hellwig xfs_buftarg_t *btp;
20056f643c57SShiyang Ruan const struct dax_holder_operations *ops = NULL;
2006c59d87c4SChristoph Hellwig
20076f643c57SShiyang Ruan #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
20086f643c57SShiyang Ruan ops = &xfs_dax_holder_operations;
20096f643c57SShiyang Ruan #endif
2010707e0ddaSTetsuo Handa btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
2011c59d87c4SChristoph Hellwig
2012c59d87c4SChristoph Hellwig btp->bt_mount = mp;
2013c59d87c4SChristoph Hellwig btp->bt_dev = bdev->bd_dev;
2014c59d87c4SChristoph Hellwig btp->bt_bdev = bdev;
20156f643c57SShiyang Ruan btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off,
20166f643c57SShiyang Ruan mp, ops);
2017c59d87c4SChristoph Hellwig
2018f9bccfccSBrian Foster /*
2019f9bccfccSBrian Foster * Buffer IO error rate limiting. Limit it to no more than 10 messages
2020f9bccfccSBrian Foster * per 30 seconds so as to not spam logs too much on repeated errors.
2021f9bccfccSBrian Foster */
2022f9bccfccSBrian Foster ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
2023f9bccfccSBrian Foster DEFAULT_RATELIMIT_BURST);
2024f9bccfccSBrian Foster
2025c59d87c4SChristoph Hellwig if (xfs_setsize_buftarg_early(btp, bdev))
2026d210a987SMichal Hocko goto error_free;
20275ca302c8SGlauber Costa
20285ca302c8SGlauber Costa if (list_lru_init(&btp->bt_lru))
2029d210a987SMichal Hocko goto error_free;
20305ca302c8SGlauber Costa
20319c7504aaSBrian Foster if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
2032d210a987SMichal Hocko goto error_lru;
20339c7504aaSBrian Foster
2034e80dfa19SDave Chinner btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
2035e80dfa19SDave Chinner btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
2036c59d87c4SChristoph Hellwig btp->bt_shrinker.seeks = DEFAULT_SEEKS;
2037e80dfa19SDave Chinner btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
2038e33c267aSRoman Gushchin if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s",
2039e33c267aSRoman Gushchin mp->m_super->s_id))
2040d210a987SMichal Hocko goto error_pcpu;
2041c59d87c4SChristoph Hellwig return btp;
2042c59d87c4SChristoph Hellwig
2043d210a987SMichal Hocko error_pcpu:
2044d210a987SMichal Hocko percpu_counter_destroy(&btp->bt_io_count);
2045d210a987SMichal Hocko error_lru:
2046d210a987SMichal Hocko list_lru_destroy(&btp->bt_lru);
2047d210a987SMichal Hocko error_free:
2048c59d87c4SChristoph Hellwig kmem_free(btp);
2049c59d87c4SChristoph Hellwig return NULL;
2050c59d87c4SChristoph Hellwig }
2051c59d87c4SChristoph Hellwig
2052*1a48327cSDarrick J. Wong static inline void
2053*1a48327cSDarrick J. Wong xfs_buf_list_del(
2054*1a48327cSDarrick J. Wong struct xfs_buf *bp)
2055*1a48327cSDarrick J. Wong {
2056*1a48327cSDarrick J. Wong list_del_init(&bp->b_list);
2057*1a48327cSDarrick J. Wong wake_up_var(&bp->b_list);
2058*1a48327cSDarrick J. Wong }
2059*1a48327cSDarrick J. Wong
206043ff2122SChristoph Hellwig /*
206120e8a063SBrian Foster * Cancel a delayed write list.
206220e8a063SBrian Foster *
206320e8a063SBrian Foster * Remove each buffer from the list, clear the delwri queue flag and drop the
206420e8a063SBrian Foster * associated buffer reference.
206520e8a063SBrian Foster */
206620e8a063SBrian Foster void
206720e8a063SBrian Foster xfs_buf_delwri_cancel(
206820e8a063SBrian Foster struct list_head *list)
206920e8a063SBrian Foster {
207020e8a063SBrian Foster struct xfs_buf *bp;
207120e8a063SBrian Foster
207220e8a063SBrian Foster while (!list_empty(list)) {
207320e8a063SBrian Foster bp = list_first_entry(list, struct xfs_buf, b_list);
207420e8a063SBrian Foster
207520e8a063SBrian Foster xfs_buf_lock(bp);
207620e8a063SBrian Foster bp->b_flags &= ~_XBF_DELWRI_Q;
2077*1a48327cSDarrick J. Wong xfs_buf_list_del(bp);
207820e8a063SBrian Foster xfs_buf_relse(bp);
207920e8a063SBrian Foster }
208020e8a063SBrian Foster }
208120e8a063SBrian Foster
208220e8a063SBrian Foster /*
208343ff2122SChristoph Hellwig * Add a buffer to the delayed write list.
208443ff2122SChristoph Hellwig *
208543ff2122SChristoph Hellwig * This queues a buffer for writeout if it hasn't already been. Note that
208643ff2122SChristoph Hellwig * neither this routine nor the buffer list submission functions perform
208743ff2122SChristoph Hellwig * any internal synchronization. It is expected that the lists are thread-local
208843ff2122SChristoph Hellwig * to the callers.
208943ff2122SChristoph Hellwig *
209043ff2122SChristoph Hellwig * Returns true if we queued up the buffer, or false if it already had
209143ff2122SChristoph Hellwig * been on the buffer list.
209243ff2122SChristoph Hellwig */
209343ff2122SChristoph Hellwig bool
209443ff2122SChristoph Hellwig xfs_buf_delwri_queue(
209543ff2122SChristoph Hellwig struct xfs_buf *bp,
209643ff2122SChristoph Hellwig struct list_head *list)
209743ff2122SChristoph Hellwig {
209843ff2122SChristoph Hellwig ASSERT(xfs_buf_islocked(bp));
209943ff2122SChristoph Hellwig ASSERT(!(bp->b_flags & XBF_READ));
2100c59d87c4SChristoph Hellwig
2101c59d87c4SChristoph Hellwig /*
210243ff2122SChristoph Hellwig 	 * If the buffer is already marked delwri it is already queued up
210343ff2122SChristoph Hellwig 	 * by someone else for immediate writeout.  Just ignore it in that
210443ff2122SChristoph Hellwig * case.
2105c59d87c4SChristoph Hellwig */
210643ff2122SChristoph Hellwig if (bp->b_flags & _XBF_DELWRI_Q) {
210743ff2122SChristoph Hellwig trace_xfs_buf_delwri_queued(bp, _RET_IP_);
210843ff2122SChristoph Hellwig return false;
210943ff2122SChristoph Hellwig }
2110c59d87c4SChristoph Hellwig
2111c59d87c4SChristoph Hellwig trace_xfs_buf_delwri_queue(bp, _RET_IP_);
2112c59d87c4SChristoph Hellwig
211343ff2122SChristoph Hellwig /*
211443ff2122SChristoph Hellwig * If a buffer gets written out synchronously or marked stale while it
211543ff2122SChristoph Hellwig * is on a delwri list we lazily remove it. To do this, the other party
211643ff2122SChristoph Hellwig * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
211743ff2122SChristoph Hellwig * It remains referenced and on the list. In a rare corner case it
211843ff2122SChristoph Hellwig * might get readded to a delwri list after the synchronous writeout, in
211943ff2122SChristoph Hellwig 	 * which case we just need to re-add the flag here.
212043ff2122SChristoph Hellwig */
212143ff2122SChristoph Hellwig bp->b_flags |= _XBF_DELWRI_Q;
212243ff2122SChristoph Hellwig if (list_empty(&bp->b_list)) {
21235a8ee6baSChristoph Hellwig atomic_inc(&bp->b_hold);
212443ff2122SChristoph Hellwig list_add_tail(&bp->b_list, list);
2125c59d87c4SChristoph Hellwig }
2126c59d87c4SChristoph Hellwig
212743ff2122SChristoph Hellwig return true;
2128c59d87c4SChristoph Hellwig }
2129c59d87c4SChristoph Hellwig
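/*
 * Illustrative use of the delwri queue (a sketch, not lifted from any single
 * call site): buffers are locked and queued onto a caller-private list, the
 * lock is dropped, and the whole list is later written out in one pass via
 * the list submission helper xfs_buf_delwri_submit():
 *
 *	LIST_HEAD(buffer_list);
 *
 *	xfs_buf_lock(bp);
 *	xfs_buf_delwri_queue(bp, &buffer_list);
 *	xfs_buf_unlock(bp);
 *	...
 *	error = xfs_buf_delwri_submit(&buffer_list);
 */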
2130c59d87c4SChristoph Hellwig /*
2131*1a48327cSDarrick J. Wong * Queue a buffer to this delwri list as part of a data integrity operation.
2132*1a48327cSDarrick J. Wong * If the buffer is on any other delwri list, we'll wait for that to clear
2133*1a48327cSDarrick J. Wong * so that the caller can submit the buffer for IO and wait for the result.
2134*1a48327cSDarrick J. Wong * Callers must ensure the buffer is not already on the list.
2135*1a48327cSDarrick J. Wong */
2136*1a48327cSDarrick J. Wong void
2137*1a48327cSDarrick J. Wong xfs_buf_delwri_queue_here(
2138*1a48327cSDarrick J. Wong struct xfs_buf *bp,
2139*1a48327cSDarrick J. Wong struct list_head *buffer_list)
2140*1a48327cSDarrick J. Wong {
2141*1a48327cSDarrick J. Wong /*
2142*1a48327cSDarrick J. Wong * We need this buffer to end up on the /caller's/ delwri list, not any
2143*1a48327cSDarrick J. Wong * old list. This can happen if the buffer is marked stale (which
2144*1a48327cSDarrick J. Wong * clears DELWRI_Q) after the AIL queues the buffer to its list but
2145*1a48327cSDarrick J. Wong * before the AIL has a chance to submit the list.
2146*1a48327cSDarrick J. Wong */
2147*1a48327cSDarrick J. Wong while (!list_empty(&bp->b_list)) {
2148*1a48327cSDarrick J. Wong xfs_buf_unlock(bp);
2149*1a48327cSDarrick J. Wong wait_var_event(&bp->b_list, list_empty(&bp->b_list));
2150*1a48327cSDarrick J. Wong xfs_buf_lock(bp);
2151*1a48327cSDarrick J. Wong }
2152*1a48327cSDarrick J. Wong
2153*1a48327cSDarrick J. Wong ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
2154*1a48327cSDarrick J. Wong
2155*1a48327cSDarrick J. Wong xfs_buf_delwri_queue(bp, buffer_list);
2156*1a48327cSDarrick J. Wong }
2157*1a48327cSDarrick J. Wong
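/*
 * Illustrative sketch, not part of the original source: write one buffer
 * synchronously through the caller's own delwri list, as a repair-style
 * caller of xfs_buf_delwri_queue_here() might do. The helper name is an
 * assumption; the lock cycling behaviour is described in the comment above.
 */
static inline int
xfs_example_write_one_buffer(
	struct xfs_buf		*bp)	/* locked, caller holds a reference */
{
	LIST_HEAD		(buffer_list);

	ASSERT(xfs_buf_islocked(bp));

	/* May cycle the buffer lock while another delwri list drains. */
	xfs_buf_delwri_queue_here(bp, &buffer_list);

	/* The list holds its own reference now; drop ours along with the lock. */
	xfs_buf_relse(bp);

	/* Write the buffer and wait for the I/O result. */
	return xfs_buf_delwri_submit(&buffer_list);
}
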
2158*1a48327cSDarrick J. Wong /*
2159c59d87c4SChristoph Hellwig * The compare function is more complex than it needs to be because
2160c59d87c4SChristoph Hellwig * the return value is only 32 bits wide and we are comparing 64-bit
2161c59d87c4SChristoph Hellwig * block numbers; returning the raw difference could truncate or flip sign.
2162c59d87c4SChristoph Hellwig */
2163c59d87c4SChristoph Hellwig static int
2164c59d87c4SChristoph Hellwig xfs_buf_cmp(
2165c59d87c4SChristoph Hellwig void *priv,
21664f0f586bSSami Tolvanen const struct list_head *a,
21674f0f586bSSami Tolvanen const struct list_head *b)
2168c59d87c4SChristoph Hellwig {
2169c59d87c4SChristoph Hellwig struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
2170c59d87c4SChristoph Hellwig struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
2171c59d87c4SChristoph Hellwig xfs_daddr_t diff;
2172c59d87c4SChristoph Hellwig
2173f4b42421SMark Tinguely diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
2174c59d87c4SChristoph Hellwig if (diff < 0)
2175c59d87c4SChristoph Hellwig return -1;
2176c59d87c4SChristoph Hellwig if (diff > 0)
2177c59d87c4SChristoph Hellwig return 1;
2178c59d87c4SChristoph Hellwig return 0;
2179c59d87c4SChristoph Hellwig }
2180c59d87c4SChristoph Hellwig
218126f1fe85SDave Chinner /*
2182e339dd8dSBrian Foster * Submit buffers for write. If wait_list is specified, the buffers are
2183e339dd8dSBrian Foster * submitted using sync I/O and placed on the wait list such that the caller can
2184e339dd8dSBrian Foster * iowait each buffer. Otherwise async I/O is used and the buffers are released
2185e339dd8dSBrian Foster * at I/O completion time. In either case, buffers remain locked until I/O
2186e339dd8dSBrian Foster * completes and the buffer is released from the queue.
218726f1fe85SDave Chinner */
218843ff2122SChristoph Hellwig static int
218926f1fe85SDave Chinner xfs_buf_delwri_submit_buffers(
219043ff2122SChristoph Hellwig struct list_head *buffer_list,
219126f1fe85SDave Chinner struct list_head *wait_list)
2192c59d87c4SChristoph Hellwig {
219343ff2122SChristoph Hellwig struct xfs_buf *bp, *n;
219443ff2122SChristoph Hellwig int pinned = 0;
219526f1fe85SDave Chinner struct blk_plug plug;
2196c59d87c4SChristoph Hellwig
219726f1fe85SDave Chinner list_sort(NULL, buffer_list, xfs_buf_cmp);
219826f1fe85SDave Chinner
219926f1fe85SDave Chinner blk_start_plug(&plug);
220043ff2122SChristoph Hellwig list_for_each_entry_safe(bp, n, buffer_list, b_list) {
220126f1fe85SDave Chinner if (!wait_list) {
2202dbd0f529SDave Chinner if (!xfs_buf_trylock(bp))
2203dbd0f529SDave Chinner continue;
220443ff2122SChristoph Hellwig if (xfs_buf_ispinned(bp)) {
2205dbd0f529SDave Chinner xfs_buf_unlock(bp);
220643ff2122SChristoph Hellwig pinned++;
220743ff2122SChristoph Hellwig continue;
2208c59d87c4SChristoph Hellwig }
220943ff2122SChristoph Hellwig } else {
221043ff2122SChristoph Hellwig xfs_buf_lock(bp);
2211c59d87c4SChristoph Hellwig }
2212c59d87c4SChristoph Hellwig
2213c59d87c4SChristoph Hellwig /*
221443ff2122SChristoph Hellwig * Someone else might have written the buffer synchronously or
221543ff2122SChristoph Hellwig * marked it stale in the meantime. In that case only the
221643ff2122SChristoph Hellwig * _XBF_DELWRI_Q flag got cleared, and we have to drop the
221743ff2122SChristoph Hellwig * reference and remove it from the list here.
221843ff2122SChristoph Hellwig */
221943ff2122SChristoph Hellwig if (!(bp->b_flags & _XBF_DELWRI_Q)) {
2220*1a48327cSDarrick J. Wong xfs_buf_list_del(bp);
222143ff2122SChristoph Hellwig xfs_buf_relse(bp);
222243ff2122SChristoph Hellwig continue;
222343ff2122SChristoph Hellwig }
222443ff2122SChristoph Hellwig
222543ff2122SChristoph Hellwig trace_xfs_buf_delwri_split(bp, _RET_IP_);
222643ff2122SChristoph Hellwig
2227cf53e99dSDave Chinner /*
2228e339dd8dSBrian Foster * If we have a wait list, each buffer (and associated delwri
2229e339dd8dSBrian Foster * queue reference) transfers to it and is submitted
2230e339dd8dSBrian Foster * synchronously. Otherwise, drop the buffer from the delwri
2231e339dd8dSBrian Foster * queue and submit async.
2232cf53e99dSDave Chinner */
2233b6983e80SBrian Foster bp->b_flags &= ~_XBF_DELWRI_Q;
2234e339dd8dSBrian Foster bp->b_flags |= XBF_WRITE;
223526f1fe85SDave Chinner if (wait_list) {
2236e339dd8dSBrian Foster bp->b_flags &= ~XBF_ASYNC;
223726f1fe85SDave Chinner list_move_tail(&bp->b_list, wait_list);
2238e339dd8dSBrian Foster } else {
2239e339dd8dSBrian Foster bp->b_flags |= XBF_ASYNC;
2240*1a48327cSDarrick J. Wong xfs_buf_list_del(bp);
224143ff2122SChristoph Hellwig }
22426af88cdaSBrian Foster __xfs_buf_submit(bp, false);
2243e339dd8dSBrian Foster }
224443ff2122SChristoph Hellwig blk_finish_plug(&plug);
224543ff2122SChristoph Hellwig
224643ff2122SChristoph Hellwig return pinned;
224743ff2122SChristoph Hellwig }
224843ff2122SChristoph Hellwig
224943ff2122SChristoph Hellwig /*
225043ff2122SChristoph Hellwig * Write out a buffer list asynchronously.
225143ff2122SChristoph Hellwig *
225243ff2122SChristoph Hellwig * This will take the @buffer_list, write all non-locked and non-pinned buffers
225343ff2122SChristoph Hellwig * out and not wait for I/O completion on any of the buffers. This interface
225443ff2122SChristoph Hellwig * is only safely usable by callers that can track I/O completion by higher
225543ff2122SChristoph Hellwig * level means, e.g. AIL pushing, as the @buffer_list is consumed in this
225643ff2122SChristoph Hellwig * function.
2257efc3289cSBrian Foster *
2258efc3289cSBrian Foster * Note: this function will skip buffers it would block on, and in doing so
2259efc3289cSBrian Foster * leaves them on @buffer_list so they can be retried on a later pass. As such,
2260efc3289cSBrian Foster * it is up to the caller to ensure that the buffer list is fully submitted or
2261efc3289cSBrian Foster * cancelled appropriately when they are finished with the list. Failure to
2262efc3289cSBrian Foster * cancel or resubmit the list until it is empty will result in leaked buffers
2263efc3289cSBrian Foster * at unmount time.
2264c59d87c4SChristoph Hellwig */
2265c59d87c4SChristoph Hellwig int
226643ff2122SChristoph Hellwig xfs_buf_delwri_submit_nowait(
226743ff2122SChristoph Hellwig struct list_head *buffer_list)
2268c59d87c4SChristoph Hellwig {
226926f1fe85SDave Chinner return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
227043ff2122SChristoph Hellwig }
2271c59d87c4SChristoph Hellwig
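/*
 * Illustrative sketch, not part of the original source: an AIL-push style
 * loop that keeps resubmitting whatever xfs_buf_delwri_submit_nowait() had
 * to skip. The helper name and the bare retry loop are assumptions for
 * illustration; the rule that the list must be drained or cancelled comes
 * from the comment above.
 */
static inline void
xfs_example_push_until_empty(
	struct list_head	*buffer_list)
{
	while (!list_empty(buffer_list)) {
		/* Locked or pinned buffers are skipped and stay on the list. */
		xfs_buf_delwri_submit_nowait(buffer_list);

		/*
		 * A real pusher (e.g. the AIL) would force the log and back
		 * off here so that pinned buffers can become writeable on a
		 * later pass.
		 */
		cond_resched();
	}
}
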
2272c59d87c4SChristoph Hellwig /*
227343ff2122SChristoph Hellwig * Write out a buffer list synchronously.
227443ff2122SChristoph Hellwig *
227543ff2122SChristoph Hellwig * This will take the @buffer_list, write all buffers out and wait for I/O
227643ff2122SChristoph Hellwig * completion on all of the buffers. @buffer_list is consumed by the function,
227743ff2122SChristoph Hellwig * so callers must have some other way of tracking buffers if they require such
227843ff2122SChristoph Hellwig * functionality.
2279c59d87c4SChristoph Hellwig */
228043ff2122SChristoph Hellwig int
228143ff2122SChristoph Hellwig xfs_buf_delwri_submit(
228243ff2122SChristoph Hellwig struct list_head *buffer_list)
228343ff2122SChristoph Hellwig {
228426f1fe85SDave Chinner LIST_HEAD (wait_list);
228543ff2122SChristoph Hellwig int error = 0, error2;
228643ff2122SChristoph Hellwig struct xfs_buf *bp;
2287c59d87c4SChristoph Hellwig
228826f1fe85SDave Chinner xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
2289c59d87c4SChristoph Hellwig
2290c59d87c4SChristoph Hellwig /* Wait for IO to complete. */
229126f1fe85SDave Chinner while (!list_empty(&wait_list)) {
229226f1fe85SDave Chinner bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
2293c59d87c4SChristoph Hellwig
2294*1a48327cSDarrick J. Wong xfs_buf_list_del(bp);
2295cf53e99dSDave Chinner
2296e339dd8dSBrian Foster /*
2297e339dd8dSBrian Foster * Wait on the locked buffer, check for errors and unlock and
2298e339dd8dSBrian Foster * release the delwri queue reference.
2299e339dd8dSBrian Foster */
2300e339dd8dSBrian Foster error2 = xfs_buf_iowait(bp);
2301c59d87c4SChristoph Hellwig xfs_buf_relse(bp);
230243ff2122SChristoph Hellwig if (!error)
230343ff2122SChristoph Hellwig error = error2;
2304c59d87c4SChristoph Hellwig }
2305c59d87c4SChristoph Hellwig
230643ff2122SChristoph Hellwig return error;
2307c59d87c4SChristoph Hellwig }
2308c59d87c4SChristoph Hellwig
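/*
 * Illustrative sketch, not part of the original source: flush a caller-
 * supplied set of locked, referenced buffers and wait for the result. The
 * helper name and the way the buffers were obtained are assumptions; the
 * queue/relse/submit pattern mirrors the interfaces above.
 */
static inline int
xfs_example_flush_buffers(
	struct xfs_buf		**bps,	/* locked, referenced buffers */
	int			nr_bufs)
{
	LIST_HEAD		(buffer_list);
	int			i;

	for (i = 0; i < nr_bufs; i++) {
		/* The queue takes its own hold, so we can drop ours. */
		xfs_buf_delwri_queue(bps[i], &buffer_list);
		xfs_buf_relse(bps[i]);
	}

	/* Sorts by disk address, writes everything and waits for the I/O. */
	return xfs_buf_delwri_submit(&buffer_list);
}
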
23097912e7feSBrian Foster /*
23107912e7feSBrian Foster * Push a single buffer on a delwri queue.
23117912e7feSBrian Foster *
23127912e7feSBrian Foster * The purpose of this function is to submit a single buffer of a delwri queue
23137912e7feSBrian Foster * and return with the buffer still on the original queue. The waiting delwri
23147912e7feSBrian Foster * buffer submission infrastructure guarantees transfer of the delwri queue
23157912e7feSBrian Foster * buffer reference to a temporary wait list. We reuse this infrastructure to
23167912e7feSBrian Foster * transfer the buffer back to the original queue.
23177912e7feSBrian Foster *
23187912e7feSBrian Foster * Note the buffer transitions from the queued state, to the submitted and wait
23197912e7feSBrian Foster * listed state and back to the queued state during this call. The buffer
23207912e7feSBrian Foster * locking and queue management logic between _delwri_pushbuf() and
23217912e7feSBrian Foster * _delwri_queue() guarantee that the buffer cannot be queued to another list
23227912e7feSBrian Foster * _delwri_queue() guarantees that the buffer cannot be queued to another list
23237912e7feSBrian Foster */
23247912e7feSBrian Foster int
23257912e7feSBrian Foster xfs_buf_delwri_pushbuf(
23267912e7feSBrian Foster struct xfs_buf *bp,
23277912e7feSBrian Foster struct list_head *buffer_list)
23287912e7feSBrian Foster {
23297912e7feSBrian Foster LIST_HEAD (submit_list);
23307912e7feSBrian Foster int error;
23317912e7feSBrian Foster
23327912e7feSBrian Foster ASSERT(bp->b_flags & _XBF_DELWRI_Q);
23337912e7feSBrian Foster
23347912e7feSBrian Foster trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
23357912e7feSBrian Foster
23367912e7feSBrian Foster /*
23377912e7feSBrian Foster * Isolate the buffer to a new local list so we can submit it for I/O
23387912e7feSBrian Foster * independently from the rest of the original list.
23397912e7feSBrian Foster */
23407912e7feSBrian Foster xfs_buf_lock(bp);
23417912e7feSBrian Foster list_move(&bp->b_list, &submit_list);
23427912e7feSBrian Foster xfs_buf_unlock(bp);
23437912e7feSBrian Foster
23447912e7feSBrian Foster /*
23457912e7feSBrian Foster * Delwri submission clears the DELWRI_Q buffer flag and returns with
2346e339dd8dSBrian Foster * the buffer on the wait list with the original reference. Rather than
23477912e7feSBrian Foster * bounce the buffer from a local wait list back to the original list
23487912e7feSBrian Foster * after I/O completion, reuse the original list as the wait list.
23497912e7feSBrian Foster */
23507912e7feSBrian Foster xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
23517912e7feSBrian Foster
23527912e7feSBrian Foster /*
2353e339dd8dSBrian Foster * The buffer is now locked, under I/O and wait listed on the original
2354e339dd8dSBrian Foster * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and
2355e339dd8dSBrian Foster * return with the buffer unlocked and on the original queue.
23567912e7feSBrian Foster */
2357e339dd8dSBrian Foster error = xfs_buf_iowait(bp);
23587912e7feSBrian Foster bp->b_flags |= _XBF_DELWRI_Q;
23597912e7feSBrian Foster xfs_buf_unlock(bp);
23607912e7feSBrian Foster
23617912e7feSBrian Foster return error;
23627912e7feSBrian Foster }
23637912e7feSBrian Foster
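/*
 * Illustrative sketch, not part of the original source: force out a single
 * queued buffer, e.g. to resolve a dependency, without disturbing the rest
 * of the caller's delwri list. The helper name is an assumption; the
 * requeueing behaviour is described in the comment above.
 */
static inline int
xfs_example_push_one(
	struct xfs_buf		*bp,	/* unlocked, already on @buffer_list */
	struct list_head	*buffer_list)
{
	/*
	 * Writes @bp synchronously and returns with it re-queued on
	 * @buffer_list, so the caller is still responsible for submitting
	 * or cancelling the list later.
	 */
	return xfs_buf_delwri_pushbuf(bp, buffer_list);
}
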
23647561d27eSBrian Foster void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
23657561d27eSBrian Foster {
23667561d27eSBrian Foster /*
23677561d27eSBrian Foster * Set the lru reference count to 0 based on the error injection tag.
23687561d27eSBrian Foster * This allows userspace to disrupt buffer caching for debug/testing
23697561d27eSBrian Foster * purposes.
23707561d27eSBrian Foster */
2371dbd329f1SChristoph Hellwig if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
23727561d27eSBrian Foster lru_ref = 0;
23737561d27eSBrian Foster
23747561d27eSBrian Foster atomic_set(&bp->b_lru_ref, lru_ref);
23757561d27eSBrian Foster }
23768473fee3SBrian Foster
23778473fee3SBrian Foster /*
23788473fee3SBrian Foster * Verify an on-disk magic value against the magic value specified in the
23798473fee3SBrian Foster * verifier structure. The verifier magic is in disk byte order so the caller is
23808473fee3SBrian Foster * expected to pass the value directly from disk.
23818473fee3SBrian Foster */
23828473fee3SBrian Foster bool
23838473fee3SBrian Foster xfs_verify_magic(
23848473fee3SBrian Foster struct xfs_buf *bp,
238515baadf7SDarrick J. Wong __be32 dmagic)
23868473fee3SBrian Foster {
2387dbd329f1SChristoph Hellwig struct xfs_mount *mp = bp->b_mount;
23888473fee3SBrian Foster int idx;
23898473fee3SBrian Foster
239038c26bfdSDave Chinner idx = xfs_has_crc(mp);
239114ed8688SDenis Efremov if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
23928473fee3SBrian Foster return false;
23938473fee3SBrian Foster return dmagic == bp->b_ops->magic[idx];
23948473fee3SBrian Foster }
239515baadf7SDarrick J. Wong /*
239615baadf7SDarrick J. Wong * Verify an on-disk magic value against the magic value specified in the
239715baadf7SDarrick J. Wong * verifier structure. The verifier magic is in disk byte order so the caller is
239815baadf7SDarrick J. Wong * expected to pass the value directly from disk.
239915baadf7SDarrick J. Wong */
240015baadf7SDarrick J. Wong bool
240115baadf7SDarrick J. Wong xfs_verify_magic16(
240215baadf7SDarrick J. Wong struct xfs_buf *bp,
240315baadf7SDarrick J. Wong __be16 dmagic)
240415baadf7SDarrick J. Wong {
2405dbd329f1SChristoph Hellwig struct xfs_mount *mp = bp->b_mount;
240615baadf7SDarrick J. Wong int idx;
240715baadf7SDarrick J. Wong
240838c26bfdSDave Chinner idx = xfs_has_crc(mp);
240914ed8688SDenis Efremov if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
241015baadf7SDarrick J. Wong return false;
241115baadf7SDarrick J. Wong return dmagic == bp->b_ops->magic16[idx];
241215baadf7SDarrick J. Wong }
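
/*
 * Illustrative sketch, not part of the original source: how a read verifier
 * might use xfs_verify_magic(). The on-disk structure, magic value and
 * xfs_foo_* names are hypothetical; real verifiers such as the AG header
 * buffer ops follow this shape.
 */
#define XFS_FOO_MAGIC	0x464f4f21	/* hypothetical magic, "FOO!" */

struct xfs_foo_hdr {
	__be32			foo_magic;	/* stored in disk byte order */
};

static void
xfs_foo_read_verify(
	struct xfs_buf		*bp)
{
	struct xfs_foo_hdr	*hdr = bp->b_addr;

	/* Pass the big-endian value straight from disk. */
	if (!xfs_verify_magic(bp, hdr->foo_magic))
		xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
}

const struct xfs_buf_ops xfs_foo_buf_ops = {
	.name		= "xfs_foo",
	.magic		= { cpu_to_be32(XFS_FOO_MAGIC),
			    cpu_to_be32(XFS_FOO_MAGIC) },
	.verify_read	= xfs_foo_read_verify,
};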
2413