xref: /openbmc/linux/fs/xfs/xfs_aops.c (revision 47010c04)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4  * Copyright (c) 2016-2018 Christoph Hellwig.
5  * All Rights Reserved.
6  */
7 #include "xfs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_iomap.h"
16 #include "xfs_trace.h"
17 #include "xfs_bmap.h"
18 #include "xfs_bmap_util.h"
19 #include "xfs_reflink.h"
20 
21 struct xfs_writepage_ctx {
22 	struct iomap_writepage_ctx ctx;
23 	unsigned int		data_seq;
24 	unsigned int		cow_seq;
25 };
26 
27 static inline struct xfs_writepage_ctx *
28 XFS_WPC(struct iomap_writepage_ctx *ctx)
29 {
30 	return container_of(ctx, struct xfs_writepage_ctx, ctx);
31 }
32 
33 /*
34  * Fast and loose check if this write could update the on-disk inode size.
35  */
36 static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
37 {
38 	return ioend->io_offset + ioend->io_size >
39 		XFS_I(ioend->io_inode)->i_disk_size;
40 }
41 
42 /*
43  * Update on-disk file size now that data has been written to disk.
44  */
45 int
46 xfs_setfilesize(
47 	struct xfs_inode	*ip,
48 	xfs_off_t		offset,
49 	size_t			size)
50 {
51 	struct xfs_mount	*mp = ip->i_mount;
52 	struct xfs_trans	*tp;
53 	xfs_fsize_t		isize;
54 	int			error;
55 
56 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
57 	if (error)
58 		return error;
59 
60 	xfs_ilock(ip, XFS_ILOCK_EXCL);
61 	isize = xfs_new_eof(ip, offset + size);
62 	if (!isize) {
63 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
64 		xfs_trans_cancel(tp);
65 		return 0;
66 	}
67 
68 	trace_xfs_setfilesize(ip, offset, size);
69 
70 	ip->i_disk_size = isize;
71 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
72 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
73 
74 	return xfs_trans_commit(tp);
75 }
76 
77 /*
78  * IO write completion.
79  */
80 STATIC void
81 xfs_end_ioend(
82 	struct iomap_ioend	*ioend)
83 {
84 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
85 	struct xfs_mount	*mp = ip->i_mount;
86 	xfs_off_t		offset = ioend->io_offset;
87 	size_t			size = ioend->io_size;
88 	unsigned int		nofs_flag;
89 	int			error;
90 
91 	/*
92 	 * We can allocate memory here while doing writeback on behalf of
93 	 * memory reclaim.  To avoid memory allocation deadlocks set the
94 	 * task-wide nofs context for the following operations.
95 	 */
96 	nofs_flag = memalloc_nofs_save();
97 
98 	/*
99 	 * Just clean up the in-memory structures if the fs has been shut down.
100 	 */
101 	if (xfs_is_shutdown(mp)) {
102 		error = -EIO;
103 		goto done;
104 	}
105 
106 	/*
107 	 * Clean up all COW blocks and underlying data fork delalloc blocks on
108 	 * I/O error. The delalloc punch is required because this ioend was
109 	 * mapped to blocks in the COW fork and the associated pages are no
110 	 * longer dirty. If we don't remove delalloc blocks here, they become
111 	 * stale and can corrupt free space accounting on unmount.
112 	 */
113 	error = blk_status_to_errno(ioend->io_bio->bi_status);
114 	if (unlikely(error)) {
115 		if (ioend->io_flags & IOMAP_F_SHARED) {
116 			xfs_reflink_cancel_cow_range(ip, offset, size, true);
117 			xfs_bmap_punch_delalloc_range(ip,
118 						      XFS_B_TO_FSBT(mp, offset),
119 						      XFS_B_TO_FSB(mp, size));
120 		}
121 		goto done;
122 	}
123 
124 	/*
125 	 * Success: commit the COW or unwritten blocks if needed.
126 	 */
127 	if (ioend->io_flags & IOMAP_F_SHARED)
128 		error = xfs_reflink_end_cow(ip, offset, size);
129 	else if (ioend->io_type == IOMAP_UNWRITTEN)
130 		error = xfs_iomap_write_unwritten(ip, offset, size, false);
131 
132 	if (!error && xfs_ioend_is_append(ioend))
133 		error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
134 done:
135 	iomap_finish_ioends(ioend, error);
136 	memalloc_nofs_restore(nofs_flag);
137 }
138 
139 /*
140  * Finish all pending IO completions that require transactional modifications.
141  *
142  * We try to merge physical and logically contiguous ioends before completion to
143  * minimise the number of transactions we need to perform during IO completion.
144  * Both unwritten extent conversion and COW remapping need to iterate and modify
145  * one physical extent at a time, so we gain nothing by merging physically
146  * discontiguous extents here.
147  *
148  * The ioend chain length that we can be processing here is largely unbound in
149  * length and we may have to perform significant amounts of work on each ioend
150  * to complete it. Hence we have to be careful about holding the CPU for too
151  * long in this loop.
152  */
153 void
154 xfs_end_io(
155 	struct work_struct	*work)
156 {
157 	struct xfs_inode	*ip =
158 		container_of(work, struct xfs_inode, i_ioend_work);
159 	struct iomap_ioend	*ioend;
160 	struct list_head	tmp;
161 	unsigned long		flags;
162 
163 	spin_lock_irqsave(&ip->i_ioend_lock, flags);
164 	list_replace_init(&ip->i_ioend_list, &tmp);
165 	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
166 
167 	iomap_sort_ioends(&tmp);
168 	while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
169 			io_list))) {
170 		list_del_init(&ioend->io_list);
171 		iomap_ioend_try_merge(ioend, &tmp);
172 		xfs_end_ioend(ioend);
173 		cond_resched();
174 	}
175 }
176 
177 STATIC void
178 xfs_end_bio(
179 	struct bio		*bio)
180 {
181 	struct iomap_ioend	*ioend = bio->bi_private;
182 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
183 	unsigned long		flags;
184 
185 	spin_lock_irqsave(&ip->i_ioend_lock, flags);
186 	if (list_empty(&ip->i_ioend_list))
187 		WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
188 					 &ip->i_ioend_work));
189 	list_add_tail(&ioend->io_list, &ip->i_ioend_list);
190 	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
191 }
192 
193 /*
194  * Fast revalidation of the cached writeback mapping. Return true if the current
195  * mapping is valid, false otherwise.
196  */
197 static bool
198 xfs_imap_valid(
199 	struct iomap_writepage_ctx	*wpc,
200 	struct xfs_inode		*ip,
201 	loff_t				offset)
202 {
203 	if (offset < wpc->iomap.offset ||
204 	    offset >= wpc->iomap.offset + wpc->iomap.length)
205 		return false;
206 	/*
207 	 * If this is a COW mapping, it is sufficient to check that the mapping
208 	 * covers the offset. Be careful to check this first because the caller
209 	 * can revalidate a COW mapping without updating the data seqno.
210 	 */
211 	if (wpc->iomap.flags & IOMAP_F_SHARED)
212 		return true;
213 
214 	/*
215 	 * This is not a COW mapping. Check the sequence number of the data fork
216 	 * because concurrent changes could have invalidated the extent. Check
217 	 * the COW fork because concurrent changes since the last time we
218 	 * checked (and found nothing at this offset) could have added
219 	 * overlapping blocks.
220 	 */
221 	if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq))
222 		return false;
223 	if (xfs_inode_has_cow_data(ip) &&
224 	    XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
225 		return false;
226 	return true;
227 }
228 
229 /*
230  * Pass in a dellalloc extent and convert it to real extents, return the real
231  * extent that maps offset_fsb in wpc->iomap.
232  *
233  * The current page is held locked so nothing could have removed the block
234  * backing offset_fsb, although it could have moved from the COW to the data
235  * fork by another thread.
236  */
237 static int
238 xfs_convert_blocks(
239 	struct iomap_writepage_ctx *wpc,
240 	struct xfs_inode	*ip,
241 	int			whichfork,
242 	loff_t			offset)
243 {
244 	int			error;
245 	unsigned		*seq;
246 
247 	if (whichfork == XFS_COW_FORK)
248 		seq = &XFS_WPC(wpc)->cow_seq;
249 	else
250 		seq = &XFS_WPC(wpc)->data_seq;
251 
252 	/*
253 	 * Attempt to allocate whatever delalloc extent currently backs offset
254 	 * and put the result into wpc->iomap.  Allocate in a loop because it
255 	 * may take several attempts to allocate real blocks for a contiguous
256 	 * delalloc extent if free space is sufficiently fragmented.
257 	 */
258 	do {
259 		error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
260 				&wpc->iomap, seq);
261 		if (error)
262 			return error;
263 	} while (wpc->iomap.offset + wpc->iomap.length <= offset);
264 
265 	return 0;
266 }
267 
268 static int
269 xfs_map_blocks(
270 	struct iomap_writepage_ctx *wpc,
271 	struct inode		*inode,
272 	loff_t			offset)
273 {
274 	struct xfs_inode	*ip = XFS_I(inode);
275 	struct xfs_mount	*mp = ip->i_mount;
276 	ssize_t			count = i_blocksize(inode);
277 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
278 	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
279 	xfs_fileoff_t		cow_fsb;
280 	int			whichfork;
281 	struct xfs_bmbt_irec	imap;
282 	struct xfs_iext_cursor	icur;
283 	int			retries = 0;
284 	int			error = 0;
285 
286 	if (xfs_is_shutdown(mp))
287 		return -EIO;
288 
289 	/*
290 	 * COW fork blocks can overlap data fork blocks even if the blocks
291 	 * aren't shared.  COW I/O always takes precedent, so we must always
292 	 * check for overlap on reflink inodes unless the mapping is already a
293 	 * COW one, or the COW fork hasn't changed from the last time we looked
294 	 * at it.
295 	 *
296 	 * It's safe to check the COW fork if_seq here without the ILOCK because
297 	 * we've indirectly protected against concurrent updates: writeback has
298 	 * the page locked, which prevents concurrent invalidations by reflink
299 	 * and directio and prevents concurrent buffered writes to the same
300 	 * page.  Changes to if_seq always happen under i_lock, which protects
301 	 * against concurrent updates and provides a memory barrier on the way
302 	 * out that ensures that we always see the current value.
303 	 */
304 	if (xfs_imap_valid(wpc, ip, offset))
305 		return 0;
306 
307 	/*
308 	 * If we don't have a valid map, now it's time to get a new one for this
309 	 * offset.  This will convert delayed allocations (including COW ones)
310 	 * into real extents.  If we return without a valid map, it means we
311 	 * landed in a hole and we skip the block.
312 	 */
313 retry:
314 	cow_fsb = NULLFILEOFF;
315 	whichfork = XFS_DATA_FORK;
316 	xfs_ilock(ip, XFS_ILOCK_SHARED);
317 	ASSERT(!xfs_need_iread_extents(&ip->i_df));
318 
319 	/*
320 	 * Check if this is offset is covered by a COW extents, and if yes use
321 	 * it directly instead of looking up anything in the data fork.
322 	 */
323 	if (xfs_inode_has_cow_data(ip) &&
324 	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
325 		cow_fsb = imap.br_startoff;
326 	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
327 		XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
328 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
329 
330 		whichfork = XFS_COW_FORK;
331 		goto allocate_blocks;
332 	}
333 
334 	/*
335 	 * No COW extent overlap. Revalidate now that we may have updated
336 	 * ->cow_seq. If the data mapping is still valid, we're done.
337 	 */
338 	if (xfs_imap_valid(wpc, ip, offset)) {
339 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
340 		return 0;
341 	}
342 
343 	/*
344 	 * If we don't have a valid map, now it's time to get a new one for this
345 	 * offset.  This will convert delayed allocations (including COW ones)
346 	 * into real extents.
347 	 */
348 	if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
349 		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
350 	XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
351 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
352 
353 	/* landed in a hole or beyond EOF? */
354 	if (imap.br_startoff > offset_fsb) {
355 		imap.br_blockcount = imap.br_startoff - offset_fsb;
356 		imap.br_startoff = offset_fsb;
357 		imap.br_startblock = HOLESTARTBLOCK;
358 		imap.br_state = XFS_EXT_NORM;
359 	}
360 
361 	/*
362 	 * Truncate to the next COW extent if there is one.  This is the only
363 	 * opportunity to do this because we can skip COW fork lookups for the
364 	 * subsequent blocks in the mapping; however, the requirement to treat
365 	 * the COW range separately remains.
366 	 */
367 	if (cow_fsb != NULLFILEOFF &&
368 	    cow_fsb < imap.br_startoff + imap.br_blockcount)
369 		imap.br_blockcount = cow_fsb - imap.br_startoff;
370 
371 	/* got a delalloc extent? */
372 	if (imap.br_startblock != HOLESTARTBLOCK &&
373 	    isnullstartblock(imap.br_startblock))
374 		goto allocate_blocks;
375 
376 	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0);
377 	trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
378 	return 0;
379 allocate_blocks:
380 	error = xfs_convert_blocks(wpc, ip, whichfork, offset);
381 	if (error) {
382 		/*
383 		 * If we failed to find the extent in the COW fork we might have
384 		 * raced with a COW to data fork conversion or truncate.
385 		 * Restart the lookup to catch the extent in the data fork for
386 		 * the former case, but prevent additional retries to avoid
387 		 * looping forever for the latter case.
388 		 */
389 		if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
390 			goto retry;
391 		ASSERT(error != -EAGAIN);
392 		return error;
393 	}
394 
395 	/*
396 	 * Due to merging the return real extent might be larger than the
397 	 * original delalloc one.  Trim the return extent to the next COW
398 	 * boundary again to force a re-lookup.
399 	 */
400 	if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
401 		loff_t		cow_offset = XFS_FSB_TO_B(mp, cow_fsb);
402 
403 		if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
404 			wpc->iomap.length = cow_offset - wpc->iomap.offset;
405 	}
406 
407 	ASSERT(wpc->iomap.offset <= offset);
408 	ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
409 	trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
410 	return 0;
411 }
412 
413 static int
414 xfs_prepare_ioend(
415 	struct iomap_ioend	*ioend,
416 	int			status)
417 {
418 	unsigned int		nofs_flag;
419 
420 	/*
421 	 * We can allocate memory here while doing writeback on behalf of
422 	 * memory reclaim.  To avoid memory allocation deadlocks set the
423 	 * task-wide nofs context for the following operations.
424 	 */
425 	nofs_flag = memalloc_nofs_save();
426 
427 	/* Convert CoW extents to regular */
428 	if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
429 		status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
430 				ioend->io_offset, ioend->io_size);
431 	}
432 
433 	memalloc_nofs_restore(nofs_flag);
434 
435 	/* send ioends that might require a transaction to the completion wq */
436 	if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
437 	    (ioend->io_flags & IOMAP_F_SHARED))
438 		ioend->io_bio->bi_end_io = xfs_end_bio;
439 	return status;
440 }
441 
442 /*
443  * If the page has delalloc blocks on it, we need to punch them out before we
444  * invalidate the page.  If we don't, we leave a stale delalloc mapping on the
445  * inode that can trip up a later direct I/O read operation on the same region.
446  *
447  * We prevent this by truncating away the delalloc regions on the page.  Because
448  * they are delalloc, we can do this without needing a transaction. Indeed - if
449  * we get ENOSPC errors, we have to be able to do this truncation without a
450  * transaction as there is no space left for block reservation (typically why we
451  * see a ENOSPC in writeback).
452  */
453 static void
454 xfs_discard_folio(
455 	struct folio		*folio,
456 	loff_t			pos)
457 {
458 	struct inode		*inode = folio->mapping->host;
459 	struct xfs_inode	*ip = XFS_I(inode);
460 	struct xfs_mount	*mp = ip->i_mount;
461 	size_t			offset = offset_in_folio(folio, pos);
462 	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, pos);
463 	xfs_fileoff_t		pageoff_fsb = XFS_B_TO_FSBT(mp, offset);
464 	int			error;
465 
466 	if (xfs_is_shutdown(mp))
467 		goto out_invalidate;
468 
469 	xfs_alert_ratelimited(mp,
470 		"page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
471 			folio, ip->i_ino, pos);
472 
473 	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
474 			i_blocks_per_folio(inode, folio) - pageoff_fsb);
475 	if (error && !xfs_is_shutdown(mp))
476 		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
477 out_invalidate:
478 	iomap_invalidate_folio(folio, offset, folio_size(folio) - offset);
479 }
480 
481 static const struct iomap_writeback_ops xfs_writeback_ops = {
482 	.map_blocks		= xfs_map_blocks,
483 	.prepare_ioend		= xfs_prepare_ioend,
484 	.discard_folio		= xfs_discard_folio,
485 };
486 
487 STATIC int
488 xfs_vm_writepages(
489 	struct address_space	*mapping,
490 	struct writeback_control *wbc)
491 {
492 	struct xfs_writepage_ctx wpc = { };
493 
494 	/*
495 	 * Writing back data in a transaction context can result in recursive
496 	 * transactions. This is bad, so issue a warning and get out of here.
497 	 */
498 	if (WARN_ON_ONCE(current->journal_info))
499 		return 0;
500 
501 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
502 	return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
503 }
504 
505 STATIC int
506 xfs_dax_writepages(
507 	struct address_space	*mapping,
508 	struct writeback_control *wbc)
509 {
510 	struct xfs_inode	*ip = XFS_I(mapping->host);
511 
512 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
513 	return dax_writeback_mapping_range(mapping,
514 			xfs_inode_buftarg(ip)->bt_daxdev, wbc);
515 }
516 
517 STATIC sector_t
518 xfs_vm_bmap(
519 	struct address_space	*mapping,
520 	sector_t		block)
521 {
522 	struct xfs_inode	*ip = XFS_I(mapping->host);
523 
524 	trace_xfs_vm_bmap(ip);
525 
526 	/*
527 	 * The swap code (ab-)uses ->bmap to get a block mapping and then
528 	 * bypasses the file system for actual I/O.  We really can't allow
529 	 * that on reflinks inodes, so we have to skip out here.  And yes,
530 	 * 0 is the magic code for a bmap error.
531 	 *
532 	 * Since we don't pass back blockdev info, we can't return bmap
533 	 * information for rt files either.
534 	 */
535 	if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
536 		return 0;
537 	return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
538 }
539 
540 STATIC int
541 xfs_vm_readpage(
542 	struct file		*unused,
543 	struct page		*page)
544 {
545 	return iomap_readpage(page, &xfs_read_iomap_ops);
546 }
547 
548 STATIC void
549 xfs_vm_readahead(
550 	struct readahead_control	*rac)
551 {
552 	iomap_readahead(rac, &xfs_read_iomap_ops);
553 }
554 
555 static int
556 xfs_iomap_swapfile_activate(
557 	struct swap_info_struct		*sis,
558 	struct file			*swap_file,
559 	sector_t			*span)
560 {
561 	sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
562 	return iomap_swapfile_activate(sis, swap_file, span,
563 			&xfs_read_iomap_ops);
564 }
565 
566 const struct address_space_operations xfs_address_space_operations = {
567 	.readpage		= xfs_vm_readpage,
568 	.readahead		= xfs_vm_readahead,
569 	.writepages		= xfs_vm_writepages,
570 	.dirty_folio		= filemap_dirty_folio,
571 	.releasepage		= iomap_releasepage,
572 	.invalidate_folio	= iomap_invalidate_folio,
573 	.bmap			= xfs_vm_bmap,
574 	.direct_IO		= noop_direct_IO,
575 	.migratepage		= iomap_migrate_page,
576 	.is_partially_uptodate  = iomap_is_partially_uptodate,
577 	.error_remove_page	= generic_error_remove_page,
578 	.swap_activate		= xfs_iomap_swapfile_activate,
579 };
580 
581 const struct address_space_operations xfs_dax_aops = {
582 	.writepages		= xfs_dax_writepages,
583 	.direct_IO		= noop_direct_IO,
584 	.dirty_folio		= noop_dirty_folio,
585 	.swap_activate		= xfs_iomap_swapfile_activate,
586 };
587