xref: /openbmc/linux/fs/xfs/xfs_file.c (revision 2f63296578cad1ae681152d5b2122a4595195f16)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode_item.h"
16 #include "xfs_bmap.h"
17 #include "xfs_bmap_util.h"
18 #include "xfs_dir2.h"
19 #include "xfs_dir2_priv.h"
20 #include "xfs_ioctl.h"
21 #include "xfs_trace.h"
22 #include "xfs_log.h"
23 #include "xfs_icache.h"
24 #include "xfs_pnfs.h"
25 #include "xfs_iomap.h"
26 #include "xfs_reflink.h"
27 
28 #include <linux/falloc.h>
29 #include <linux/backing-dev.h>
30 #include <linux/mman.h>
31 #include <linux/fadvise.h>
32 
33 static const struct vm_operations_struct xfs_file_vm_ops;
34 
35 /*
36  * Decide if the given file range is aligned to the size of the fundamental
37  * allocation unit for the file.
38  */
39 static bool
40 xfs_is_falloc_aligned(
41 	struct xfs_inode	*ip,
42 	loff_t			pos,
43 	long long int		len)
44 {
45 	struct xfs_mount	*mp = ip->i_mount;
46 	uint64_t		mask;
47 
48 	if (XFS_IS_REALTIME_INODE(ip)) {
49 		if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
50 			u64	rextbytes;
51 			u32	mod;
52 
53 			rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
54 			div_u64_rem(pos, rextbytes, &mod);
55 			if (mod)
56 				return false;
57 			div_u64_rem(len, rextbytes, &mod);
58 			return mod == 0;
59 		}
60 		mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
61 	} else {
62 		mask = mp->m_sb.sb_blocksize - 1;
63 	}
64 
65 	return !((pos | len) & mask);
66 }
67 
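/*
 * Set or clear the preallocation flag on an inode in a small transaction,
 * optionally stripping the SUID/SGID bits and bumping the timestamps the
 * way a normal write would.
 */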
68 int
69 xfs_update_prealloc_flags(
70 	struct xfs_inode	*ip,
71 	enum xfs_prealloc_flags	flags)
72 {
73 	struct xfs_trans	*tp;
74 	int			error;
75 
76 	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
77 			0, 0, 0, &tp);
78 	if (error)
79 		return error;
80 
81 	xfs_ilock(ip, XFS_ILOCK_EXCL);
82 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
83 
84 	if (!(flags & XFS_PREALLOC_INVISIBLE)) {
85 		VFS_I(ip)->i_mode &= ~S_ISUID;
86 		if (VFS_I(ip)->i_mode & S_IXGRP)
87 			VFS_I(ip)->i_mode &= ~S_ISGID;
88 		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
89 	}
90 
91 	if (flags & XFS_PREALLOC_SET)
92 		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
93 	if (flags & XFS_PREALLOC_CLEAR)
94 		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
95 
96 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
97 	if (flags & XFS_PREALLOC_SYNC)
98 		xfs_trans_set_sync(tp);
99 	return xfs_trans_commit(tp);
100 }
101 
102 /*
103  * Fsync operations on directories are much simpler than on regular files,
104  * as there is no file data to flush and thus no need for explicit cache
105  * flush operations; nor are there any non-transaction metadata updates
106  * on directories.
107  */
108 STATIC int
109 xfs_dir_fsync(
110 	struct file		*file,
111 	loff_t			start,
112 	loff_t			end,
113 	int			datasync)
114 {
115 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
116 
117 	trace_xfs_dir_fsync(ip);
118 	return xfs_log_force_inode(ip);
119 }
120 
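/*
 * Flush all dirty data for this file to stable storage: write back and wait
 * on the page cache, flush the relevant volatile write caches, and force the
 * log up to the last LSN that touched the inode.
 */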
121 STATIC int
122 xfs_file_fsync(
123 	struct file		*file,
124 	loff_t			start,
125 	loff_t			end,
126 	int			datasync)
127 {
128 	struct inode		*inode = file->f_mapping->host;
129 	struct xfs_inode	*ip = XFS_I(inode);
130 	struct xfs_inode_log_item *iip = ip->i_itemp;
131 	struct xfs_mount	*mp = ip->i_mount;
132 	int			error = 0;
133 	int			log_flushed = 0;
134 	xfs_lsn_t		lsn = 0;
135 
136 	trace_xfs_file_fsync(ip);
137 
138 	error = file_write_and_wait_range(file, start, end);
139 	if (error)
140 		return error;
141 
142 	if (XFS_FORCED_SHUTDOWN(mp))
143 		return -EIO;
144 
145 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
146 
147 	/*
148 	 * If we have an RT and/or log subvolume we need to make sure to flush
149 	 * the write cache of the device used for file data first.  This is to
150 	 * ensure newly written file data makes it to disk before logging the
151 	 * new inode size in case of an extending write.
152 	 */
153 	if (XFS_IS_REALTIME_INODE(ip))
154 		xfs_blkdev_issue_flush(mp->m_rtdev_targp);
155 	else if (mp->m_logdev_targp != mp->m_ddev_targp)
156 		xfs_blkdev_issue_flush(mp->m_ddev_targp);
157 
158 	/*
159 	 * All metadata updates are logged, which means that we just have to
160 	 * flush the log up to the latest LSN that touched the inode. If we have
161 	 * concurrent fsync/fdatasync() calls, we need them to all block on the
162 	 * log force before we clear the ili_fsync_fields field. This ensures
163 	 * that we don't get a racing sync operation that does not wait for the
164 	 * metadata to hit the journal before returning. If we race with
165 	 * clearing the ili_fsync_fields, then all that will happen is the log
166 	 * force will do nothing as the lsn will already be on disk. We can't
167 	 * race with setting ili_fsync_fields because that is done under
168 	 * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
169 	 * until after the ili_fsync_fields is cleared.
170 	 */
171 	xfs_ilock(ip, XFS_ILOCK_SHARED);
172 	if (xfs_ipincount(ip)) {
173 		if (!datasync ||
174 		    (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
175 			lsn = iip->ili_last_lsn;
176 	}
177 
178 	if (lsn) {
179 		error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
180 		spin_lock(&iip->ili_lock);
181 		iip->ili_fsync_fields = 0;
182 		spin_unlock(&iip->ili_lock);
183 	}
184 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
185 
186 	/*
187 	 * If we only have a single device, and the log force above was
188 	 * a no-op, we might have to flush the data device cache here.
189 	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
190 	 * an already allocated file and thus do not have any metadata to
191 	 * commit.
192 	 */
193 	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
194 	    mp->m_logdev_targp == mp->m_ddev_targp)
195 		xfs_blkdev_issue_flush(mp->m_ddev_targp);
196 
197 	return error;
198 }
199 
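/*
 * Direct I/O reads are issued under the shared iolock (trylock only for
 * IOCB_NOWAIT) and handed straight to the iomap direct I/O machinery.
 */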
200 STATIC ssize_t
201 xfs_file_dio_aio_read(
202 	struct kiocb		*iocb,
203 	struct iov_iter		*to)
204 {
205 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
206 	size_t			count = iov_iter_count(to);
207 	ssize_t			ret;
208 
209 	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
210 
211 	if (!count)
212 		return 0; /* skip atime */
213 
214 	file_accessed(iocb->ki_filp);
215 
216 	if (iocb->ki_flags & IOCB_NOWAIT) {
217 		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
218 			return -EAGAIN;
219 	} else {
220 		xfs_ilock(ip, XFS_IOLOCK_SHARED);
221 	}
222 	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
223 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
224 
225 	return ret;
226 }
227 
228 static noinline ssize_t
229 xfs_file_dax_read(
230 	struct kiocb		*iocb,
231 	struct iov_iter		*to)
232 {
233 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
234 	size_t			count = iov_iter_count(to);
235 	ssize_t			ret = 0;
236 
237 	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
238 
239 	if (!count)
240 		return 0; /* skip atime */
241 
242 	if (iocb->ki_flags & IOCB_NOWAIT) {
243 		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
244 			return -EAGAIN;
245 	} else {
246 		xfs_ilock(ip, XFS_IOLOCK_SHARED);
247 	}
248 
249 	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
250 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
251 
252 	file_accessed(iocb->ki_filp);
253 	return ret;
254 }
255 
256 STATIC ssize_t
257 xfs_file_buffered_aio_read(
258 	struct kiocb		*iocb,
259 	struct iov_iter		*to)
260 {
261 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
262 	ssize_t			ret;
263 
264 	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
265 
266 	if (iocb->ki_flags & IOCB_NOWAIT) {
267 		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
268 			return -EAGAIN;
269 	} else {
270 		xfs_ilock(ip, XFS_IOLOCK_SHARED);
271 	}
272 	ret = generic_file_read_iter(iocb, to);
273 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
274 
275 	return ret;
276 }
277 
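/*
 * Dispatch a read to the DAX, direct I/O or buffered path and account the
 * bytes read.
 */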
278 STATIC ssize_t
279 xfs_file_read_iter(
280 	struct kiocb		*iocb,
281 	struct iov_iter		*to)
282 {
283 	struct inode		*inode = file_inode(iocb->ki_filp);
284 	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
285 	ssize_t			ret = 0;
286 
287 	XFS_STATS_INC(mp, xs_read_calls);
288 
289 	if (XFS_FORCED_SHUTDOWN(mp))
290 		return -EIO;
291 
292 	if (IS_DAX(inode))
293 		ret = xfs_file_dax_read(iocb, to);
294 	else if (iocb->ki_flags & IOCB_DIRECT)
295 		ret = xfs_file_dio_aio_read(iocb, to);
296 	else
297 		ret = xfs_file_buffered_aio_read(iocb, to);
298 
299 	if (ret > 0)
300 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
301 	return ret;
302 }
303 
304 /*
305  * Common pre-write limit and setup checks.
306  *
307  * Called with the iolock held either shared or exclusive according to
308  * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
309  * if called for a direct write beyond i_size.
310  */
311 STATIC ssize_t
312 xfs_file_aio_write_checks(
313 	struct kiocb		*iocb,
314 	struct iov_iter		*from,
315 	int			*iolock)
316 {
317 	struct file		*file = iocb->ki_filp;
318 	struct inode		*inode = file->f_mapping->host;
319 	struct xfs_inode	*ip = XFS_I(inode);
320 	ssize_t			error = 0;
321 	size_t			count = iov_iter_count(from);
322 	bool			drained_dio = false;
323 	loff_t			isize;
324 
325 restart:
326 	error = generic_write_checks(iocb, from);
327 	if (error <= 0)
328 		return error;
329 
330 	error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
331 	if (error)
332 		return error;
333 
334 	/*
335 	 * For changing security info in file_remove_privs() we need i_rwsem
336 	 * exclusively.
337 	 */
338 	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
339 		xfs_iunlock(ip, *iolock);
340 		*iolock = XFS_IOLOCK_EXCL;
341 		xfs_ilock(ip, *iolock);
342 		goto restart;
343 	}
344 	/*
345 	 * If the offset is beyond the size of the file, we need to zero any
346 	 * blocks that fall between the existing EOF and the start of this
347 	 * write.  If zeroing is needed and we are currently holding the
348 	 * iolock shared, we need to upgrade it to exclusive, which implies
349 	 * having to redo all the checks done so far.
350 	 *
351 	 * We need to serialise against EOF updates that occur in IO
352 	 * completions here. We want to make sure that nobody is changing the
353 	 * size while we do this check until we have placed an IO barrier (i.e.
354 	 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
355 	 * The spinlock effectively forms a memory barrier once we have the
356 	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
357 	 * and hence be able to correctly determine if we need to run zeroing.
358 	 */
359 	spin_lock(&ip->i_flags_lock);
360 	isize = i_size_read(inode);
361 	if (iocb->ki_pos > isize) {
362 		spin_unlock(&ip->i_flags_lock);
363 		if (!drained_dio) {
364 			if (*iolock == XFS_IOLOCK_SHARED) {
365 				xfs_iunlock(ip, *iolock);
366 				*iolock = XFS_IOLOCK_EXCL;
367 				xfs_ilock(ip, *iolock);
368 				iov_iter_reexpand(from, count);
369 			}
370 			/*
371 			 * We now have an IO submission barrier in place, but
372 			 * AIO can do EOF updates during IO completion and hence
373 			 * we now need to wait for all of them to drain. Non-AIO
374 			 * DIO will have drained before we are given the
375 			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
376 			 * no-op.
377 			 */
378 			inode_dio_wait(inode);
379 			drained_dio = true;
380 			goto restart;
381 		}
382 
383 		trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
384 		error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
385 				NULL, &xfs_buffered_write_iomap_ops);
386 		if (error)
387 			return error;
388 	} else
389 		spin_unlock(&ip->i_flags_lock);
390 
391 	/*
392 	 * Updating the timestamps will grab the ilock again from
393 	 * xfs_fs_dirty_inode, so we have to call it after dropping the
394 	 * lock above.  Eventually we should look into a way to avoid
395 	 * the pointless lock roundtrip.
396 	 */
397 	return file_modified(file);
398 }
399 
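/*
 * Direct write completion handler: finish any copy-on-write remapping,
 * convert unwritten extents, and update the in-core and on-disk inode size
 * for writes that extend the file.
 */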
400 static int
401 xfs_dio_write_end_io(
402 	struct kiocb		*iocb,
403 	ssize_t			size,
404 	int			error,
405 	unsigned		flags)
406 {
407 	struct inode		*inode = file_inode(iocb->ki_filp);
408 	struct xfs_inode	*ip = XFS_I(inode);
409 	loff_t			offset = iocb->ki_pos;
410 	unsigned int		nofs_flag;
411 
412 	trace_xfs_end_io_direct_write(ip, offset, size);
413 
414 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
415 		return -EIO;
416 
417 	if (error)
418 		return error;
419 	if (!size)
420 		return 0;
421 
422 	/*
423 	 * Capture amount written on completion as we can't reliably account
424 	 * for it on submission.
425 	 */
426 	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
427 
428 	/*
429 	 * We can allocate memory here while doing writeback on behalf of
430 	 * memory reclaim.  To avoid memory allocation deadlocks set the
431 	 * task-wide nofs context for the following operations.
432 	 */
433 	nofs_flag = memalloc_nofs_save();
434 
435 	if (flags & IOMAP_DIO_COW) {
436 		error = xfs_reflink_end_cow(ip, offset, size);
437 		if (error)
438 			goto out;
439 	}
440 
441 	/*
442 	 * Unwritten conversion updates the in-core isize after extent
443 	 * conversion but before updating the on-disk size. Updating isize any
444 	 * earlier allows a racing dio read to find unwritten extents before
445 	 * they are converted.
446 	 */
447 	if (flags & IOMAP_DIO_UNWRITTEN) {
448 		error = xfs_iomap_write_unwritten(ip, offset, size, true);
449 		goto out;
450 	}
451 
452 	/*
453 	 * We need to update the in-core inode size here so that we don't end up
454 	 * with the on-disk inode size being outside the in-core inode size. We
455 	 * have no other method of updating EOF for AIO, so always do it here
456 	 * if necessary.
457 	 *
458 	 * We need to lock the test/set EOF update as we can be racing with
459 	 * other IO completions here to update the EOF. Failing to serialise
460 	 * here can result in EOF moving backwards and Bad Things Happen when
461 	 * that occurs.
462 	 */
463 	spin_lock(&ip->i_flags_lock);
464 	if (offset + size > i_size_read(inode)) {
465 		i_size_write(inode, offset + size);
466 		spin_unlock(&ip->i_flags_lock);
467 		error = xfs_setfilesize(ip, offset, size);
468 	} else {
469 		spin_unlock(&ip->i_flags_lock);
470 	}
471 
472 out:
473 	memalloc_nofs_restore(nofs_flag);
474 	return error;
475 }
476 
477 static const struct iomap_dio_ops xfs_dio_write_ops = {
478 	.end_io		= xfs_dio_write_end_io,
479 };
480 
481 /*
482  * xfs_file_dio_aio_write - handle direct IO writes
483  *
484  * Lock the inode appropriately to prepare for and issue a direct IO write.
485  * By separating it from the buffered write path we remove all the
486  * tricky-to-follow locking changes and looping.
487  *
488  * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
489  * until we're sure the bytes at the new EOF have been zeroed and/or the cached
490  * pages are flushed out.
491  *
492  * In most cases the direct IO writes will be done holding IOLOCK_SHARED
493  * allowing them to be done in parallel with reads and other direct IO writes.
494  * However, if the IO is not aligned to filesystem blocks, the direct IO layer
495  * needs to do sub-block zeroing and that requires serialisation against other
496  * direct IOs to the same block. In this case we need to serialise the
497  * submission of the unaligned IOs so that we don't get racing block zeroing in
498  * the dio layer.  To avoid the problem with aio, we also need to wait for
499  * outstanding IOs to complete so that unwritten extent conversion is completed
500  * before we try to map the overlapping block. This is currently implemented by
501  * hitting it with a big hammer (i.e. inode_dio_wait()).
502  *
503  * The iolock is dropped again before returning; errors are indicated by
504  * negative return values.
505  */
506 STATIC ssize_t
507 xfs_file_dio_aio_write(
508 	struct kiocb		*iocb,
509 	struct iov_iter		*from)
510 {
511 	struct file		*file = iocb->ki_filp;
512 	struct address_space	*mapping = file->f_mapping;
513 	struct inode		*inode = mapping->host;
514 	struct xfs_inode	*ip = XFS_I(inode);
515 	struct xfs_mount	*mp = ip->i_mount;
516 	ssize_t			ret = 0;
517 	int			unaligned_io = 0;
518 	int			iolock;
519 	size_t			count = iov_iter_count(from);
520 	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
521 
522 	/* DIO must be aligned to device logical sector size */
523 	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
524 		return -EINVAL;
525 
526 	/*
527 	 * Don't take the exclusive iolock here unless the I/O is unaligned to
528 	 * the file system block size.  We don't need to consider the EOF
529 	 * extension case here because xfs_file_aio_write_checks() will relock
530 	 * the inode as necessary for EOF zeroing cases and fill out the new
531 	 * inode size as appropriate.
532 	 */
533 	if ((iocb->ki_pos & mp->m_blockmask) ||
534 	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
535 		unaligned_io = 1;
536 
537 		/*
538 		 * We can't properly handle unaligned direct I/O to reflink
539 		 * files yet, as we can't unshare a partial block.
540 		 */
541 		if (xfs_is_cow_inode(ip)) {
542 			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
543 			return -ENOTBLK;
544 		}
545 		iolock = XFS_IOLOCK_EXCL;
546 	} else {
547 		iolock = XFS_IOLOCK_SHARED;
548 	}
549 
550 	if (iocb->ki_flags & IOCB_NOWAIT) {
551 		/* unaligned dio always waits, bail */
552 		if (unaligned_io)
553 			return -EAGAIN;
554 		if (!xfs_ilock_nowait(ip, iolock))
555 			return -EAGAIN;
556 	} else {
557 		xfs_ilock(ip, iolock);
558 	}
559 
560 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
561 	if (ret)
562 		goto out;
563 	count = iov_iter_count(from);
564 
565 	/*
566 	 * If we are doing unaligned IO, we can't allow any other overlapping IO
567 	 * in-flight at the same time or we risk data corruption. Wait for all
568 	 * other IO to drain before we submit. If the IO is aligned, demote the
569 	 * iolock if we had to take the exclusive lock in
570 	 * xfs_file_aio_write_checks() for other reasons.
571 	 */
572 	if (unaligned_io) {
573 		inode_dio_wait(inode);
574 	} else if (iolock == XFS_IOLOCK_EXCL) {
575 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
576 		iolock = XFS_IOLOCK_SHARED;
577 	}
578 
579 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
580 	/*
581 	 * If unaligned, this is the only IO in-flight. Wait on it before we
582 	 * release the iolock to prevent subsequent overlapping IO.
583 	 */
584 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
585 			   &xfs_dio_write_ops,
586 			   unaligned_io ? IOMAP_DIO_FORCE_WAIT : 0);
587 out:
588 	xfs_iunlock(ip, iolock);
589 
590 	/*
591 	 * No fallback to buffered IO after short writes for XFS, direct I/O
592 	 * will either complete fully or return an error.
593 	 */
594 	ASSERT(ret < 0 || ret == count);
595 	return ret;
596 }
597 
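/*
 * DAX writes are serialised with the exclusive iolock; extend the in-core
 * and on-disk inode size if the write went past EOF.
 */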
598 static noinline ssize_t
599 xfs_file_dax_write(
600 	struct kiocb		*iocb,
601 	struct iov_iter		*from)
602 {
603 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
604 	struct xfs_inode	*ip = XFS_I(inode);
605 	int			iolock = XFS_IOLOCK_EXCL;
606 	ssize_t			ret, error = 0;
607 	size_t			count;
608 	loff_t			pos;
609 
610 	if (iocb->ki_flags & IOCB_NOWAIT) {
611 		if (!xfs_ilock_nowait(ip, iolock))
612 			return -EAGAIN;
613 	} else {
614 		xfs_ilock(ip, iolock);
615 	}
616 
617 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
618 	if (ret)
619 		goto out;
620 
621 	pos = iocb->ki_pos;
622 	count = iov_iter_count(from);
623 
624 	trace_xfs_file_dax_write(ip, count, pos);
625 	ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
626 	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
627 		i_size_write(inode, iocb->ki_pos);
628 		error = xfs_setfilesize(ip, pos, ret);
629 	}
630 out:
631 	xfs_iunlock(ip, iolock);
632 	if (error)
633 		return error;
634 
635 	if (ret > 0) {
636 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
637 
638 		/* Handle various SYNC-type writes */
639 		ret = generic_write_sync(iocb, ret);
640 	}
641 	return ret;
642 }
643 
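/*
 * Buffered writes take the exclusive iolock.  On EDQUOT or ENOSPC, try to
 * free speculative preallocations (EOF blocks and CoW blocks) and retry the
 * write once.
 */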
644 STATIC ssize_t
645 xfs_file_buffered_aio_write(
646 	struct kiocb		*iocb,
647 	struct iov_iter		*from)
648 {
649 	struct file		*file = iocb->ki_filp;
650 	struct address_space	*mapping = file->f_mapping;
651 	struct inode		*inode = mapping->host;
652 	struct xfs_inode	*ip = XFS_I(inode);
653 	ssize_t			ret;
654 	int			enospc = 0;
655 	int			iolock;
656 
657 	if (iocb->ki_flags & IOCB_NOWAIT)
658 		return -EOPNOTSUPP;
659 
660 write_retry:
661 	iolock = XFS_IOLOCK_EXCL;
662 	xfs_ilock(ip, iolock);
663 
664 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
665 	if (ret)
666 		goto out;
667 
668 	/* We can write back this queue in page reclaim */
669 	current->backing_dev_info = inode_to_bdi(inode);
670 
671 	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
672 	ret = iomap_file_buffered_write(iocb, from,
673 			&xfs_buffered_write_iomap_ops);
674 	if (likely(ret >= 0))
675 		iocb->ki_pos += ret;
676 
677 	/*
678 	 * If we hit a space limit, try to free up some lingering preallocated
679 	 * space before returning an error. In the case of ENOSPC, first try to
680 	 * write back all dirty inodes to free up some of the excess reserved
681 	 * metadata space. This reduces the chances that the eofblocks scan
682 	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
683 	 * also behaves as a filter to prevent too many eofblocks scans from
684 	 * running at the same time.
685 	 */
686 	if (ret == -EDQUOT && !enospc) {
687 		xfs_iunlock(ip, iolock);
688 		enospc = xfs_inode_free_quota_eofblocks(ip);
689 		if (enospc)
690 			goto write_retry;
691 		enospc = xfs_inode_free_quota_cowblocks(ip);
692 		if (enospc)
693 			goto write_retry;
694 		iolock = 0;
695 	} else if (ret == -ENOSPC && !enospc) {
696 		struct xfs_eofblocks eofb = {0};
697 
698 		enospc = 1;
699 		xfs_flush_inodes(ip->i_mount);
700 
701 		xfs_iunlock(ip, iolock);
702 		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
703 		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
704 		xfs_icache_free_cowblocks(ip->i_mount, &eofb);
705 		goto write_retry;
706 	}
707 
708 	current->backing_dev_info = NULL;
709 out:
710 	if (iolock)
711 		xfs_iunlock(ip, iolock);
712 
713 	if (ret > 0) {
714 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
715 		/* Handle various SYNC-type writes */
716 		ret = generic_write_sync(iocb, ret);
717 	}
718 	return ret;
719 }
720 
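/*
 * Top-level write path: dispatch to the DAX, direct I/O or buffered write
 * routine.  A direct write falls back to buffered only for the reflink CoW
 * case (-ENOTBLK).
 */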
721 STATIC ssize_t
722 xfs_file_write_iter(
723 	struct kiocb		*iocb,
724 	struct iov_iter		*from)
725 {
726 	struct file		*file = iocb->ki_filp;
727 	struct address_space	*mapping = file->f_mapping;
728 	struct inode		*inode = mapping->host;
729 	struct xfs_inode	*ip = XFS_I(inode);
730 	ssize_t			ret;
731 	size_t			ocount = iov_iter_count(from);
732 
733 	XFS_STATS_INC(ip->i_mount, xs_write_calls);
734 
735 	if (ocount == 0)
736 		return 0;
737 
738 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
739 		return -EIO;
740 
741 	if (IS_DAX(inode))
742 		return xfs_file_dax_write(iocb, from);
743 
744 	if (iocb->ki_flags & IOCB_DIRECT) {
745 		/*
746 		 * Allow a directio write to fall back to a buffered
747 		 * write *only* in the case that we're doing a reflink
748 		 * CoW.  In all other directio scenarios we do not
749 		 * allow an operation to fall back to buffered mode.
750 		 */
751 		ret = xfs_file_dio_aio_write(iocb, from);
752 		if (ret != -ENOTBLK)
753 			return ret;
754 	}
755 
756 	return xfs_file_buffered_aio_write(iocb, from);
757 }
758 
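/*
 * Sleep helper for xfs_break_dax_layouts(): drop MMAPLOCK_EXCL while we
 * wait so the holder of the DAX page reference can make progress, then
 * retake it for the retry.
 */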
759 static void
760 xfs_wait_dax_page(
761 	struct inode		*inode)
762 {
763 	struct xfs_inode        *ip = XFS_I(inode);
764 
765 	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
766 	schedule();
767 	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
768 }
769 
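/*
 * If a page in the DAX mapping is still pinned (e.g. for DMA via
 * get_user_pages()), wait for its reference count to drop and tell the
 * caller to retry the layout break.
 */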
770 static int
771 xfs_break_dax_layouts(
772 	struct inode		*inode,
773 	bool			*retry)
774 {
775 	struct page		*page;
776 
777 	ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
778 
779 	page = dax_layout_busy_page(inode->i_mapping);
780 	if (!page)
781 		return 0;
782 
783 	*retry = true;
784 	return ___wait_var_event(&page->_refcount,
785 			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
786 			0, 0, xfs_wait_dax_page(inode));
787 }
788 
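/*
 * Break anything that grants third parties access to the file's block map
 * before we change it: pNFS layout leases for both break reasons, and busy
 * DAX pages as well for BREAK_UNMAP.
 */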
789 int
790 xfs_break_layouts(
791 	struct inode		*inode,
792 	uint			*iolock,
793 	enum layout_break_reason reason)
794 {
795 	bool			retry;
796 	int			error;
797 
798 	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
799 
800 	do {
801 		retry = false;
802 		switch (reason) {
803 		case BREAK_UNMAP:
804 			error = xfs_break_dax_layouts(inode, &retry);
805 			if (error || retry)
806 				break;
807 			/* fall through */
808 		case BREAK_WRITE:
809 			error = xfs_break_leased_layouts(inode, iolock, &retry);
810 			break;
811 		default:
812 			WARN_ON_ONCE(1);
813 			error = -EINVAL;
814 		}
815 	} while (error == 0 && retry);
816 
817 	return error;
818 }
819 
820 #define	XFS_FALLOC_FL_SUPPORTED						\
821 		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
822 		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
823 		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
824 
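/*
 * fallocate() entry point: punch holes, zero, collapse, insert, unshare or
 * preallocate space, updating the file size where the mode requires it.
 */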
825 STATIC long
826 xfs_file_fallocate(
827 	struct file		*file,
828 	int			mode,
829 	loff_t			offset,
830 	loff_t			len)
831 {
832 	struct inode		*inode = file_inode(file);
833 	struct xfs_inode	*ip = XFS_I(inode);
834 	long			error;
835 	enum xfs_prealloc_flags	flags = 0;
836 	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
837 	loff_t			new_size = 0;
838 	bool			do_file_insert = false;
839 
840 	if (!S_ISREG(inode->i_mode))
841 		return -EINVAL;
842 	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
843 		return -EOPNOTSUPP;
844 
845 	xfs_ilock(ip, iolock);
846 	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
847 	if (error)
848 		goto out_unlock;
849 
850 	/*
851 	 * Must wait for all AIO to complete before we continue as AIO can
852 	 * change the file size on completion without holding any locks we
853 	 * currently hold. We must do this first because AIO can update both
854 	 * the on-disk and in-memory inode sizes, and the operations that follow
855 	 * require the in-memory size to be fully up-to-date.
856 	 */
857 	inode_dio_wait(inode);
858 
859 	/*
860 	 * Now that AIO and DIO have drained, we flush and (if necessary)
861 	 * invalidate the cached range over the first operation we are about to run.
862 	 *
863 	 * We care about zero and collapse here because they both run a hole
864 	 * punch over the range first. Because that can zero data, and the range
865 	 * of invalidation for the shift operations is much larger, we still do
866 	 * the required flush for collapse in xfs_prepare_shift().
867 	 *
868 	 * Insert has the same range requirements as collapse, and we extend the
869 	 * file first which can zero data. Hence insert has the same
870 	 * flush/invalidate requirements as collapse and so they are both
871 	 * handled at the right time by xfs_prepare_shift().
872 	 */
873 	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
874 		    FALLOC_FL_COLLAPSE_RANGE)) {
875 		error = xfs_flush_unmap_range(ip, offset, len);
876 		if (error)
877 			goto out_unlock;
878 	}
879 
880 	if (mode & FALLOC_FL_PUNCH_HOLE) {
881 		error = xfs_free_file_space(ip, offset, len);
882 		if (error)
883 			goto out_unlock;
884 	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
885 		if (!xfs_is_falloc_aligned(ip, offset, len)) {
886 			error = -EINVAL;
887 			goto out_unlock;
888 		}
889 
890 		/*
891 		 * The collapse range must not reach or extend beyond EOF, as
892 		 * that would effectively be a truncate operation.
893 		 */
894 		if (offset + len >= i_size_read(inode)) {
895 			error = -EINVAL;
896 			goto out_unlock;
897 		}
898 
899 		new_size = i_size_read(inode) - len;
900 
901 		error = xfs_collapse_file_space(ip, offset, len);
902 		if (error)
903 			goto out_unlock;
904 	} else if (mode & FALLOC_FL_INSERT_RANGE) {
905 		loff_t		isize = i_size_read(inode);
906 
907 		if (!xfs_is_falloc_aligned(ip, offset, len)) {
908 			error = -EINVAL;
909 			goto out_unlock;
910 		}
911 
912 		/*
913 		 * New inode size must not exceed ->s_maxbytes, accounting for
914 		 * possible signed overflow.
915 		 */
916 		if (inode->i_sb->s_maxbytes - isize < len) {
917 			error = -EFBIG;
918 			goto out_unlock;
919 		}
920 		new_size = isize + len;
921 
922 		/* Offset should be less than i_size */
923 		if (offset >= isize) {
924 			error = -EINVAL;
925 			goto out_unlock;
926 		}
927 		do_file_insert = true;
928 	} else {
929 		flags |= XFS_PREALLOC_SET;
930 
931 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
932 		    offset + len > i_size_read(inode)) {
933 			new_size = offset + len;
934 			error = inode_newsize_ok(inode, new_size);
935 			if (error)
936 				goto out_unlock;
937 		}
938 
939 		if (mode & FALLOC_FL_ZERO_RANGE) {
940 			/*
941 			 * Punch a hole and prealloc the range.  We use a hole
942 			 * punch rather than unwritten extent conversion for two
943 			 * reasons:
944 			 *
945 			 *   1.) Hole punch handles partial block zeroing for us.
946 			 *   2.) If prealloc returns ENOSPC, the file range is
947 			 *       still zero-valued by virtue of the hole punch.
948 			 */
949 			unsigned int blksize = i_blocksize(inode);
950 
951 			trace_xfs_zero_file_space(ip);
952 
953 			error = xfs_free_file_space(ip, offset, len);
954 			if (error)
955 				goto out_unlock;
956 
957 			len = round_up(offset + len, blksize) -
958 			      round_down(offset, blksize);
959 			offset = round_down(offset, blksize);
960 		} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
961 			error = xfs_reflink_unshare(ip, offset, len);
962 			if (error)
963 				goto out_unlock;
964 		} else {
965 			/*
966 			 * In always_cow mode we can't use preallocations and
967 			 * thus should not create them.
968 			 */
969 			if (xfs_is_always_cow_inode(ip)) {
970 				error = -EOPNOTSUPP;
971 				goto out_unlock;
972 			}
973 		}
974 
975 		if (!xfs_is_always_cow_inode(ip)) {
976 			error = xfs_alloc_file_space(ip, offset, len,
977 						     XFS_BMAPI_PREALLOC);
978 			if (error)
979 				goto out_unlock;
980 		}
981 	}
982 
983 	if (file->f_flags & O_DSYNC)
984 		flags |= XFS_PREALLOC_SYNC;
985 
986 	error = xfs_update_prealloc_flags(ip, flags);
987 	if (error)
988 		goto out_unlock;
989 
990 	/* Change file size if needed */
991 	if (new_size) {
992 		struct iattr iattr;
993 
994 		iattr.ia_valid = ATTR_SIZE;
995 		iattr.ia_size = new_size;
996 		error = xfs_vn_setattr_size(file_dentry(file), &iattr);
997 		if (error)
998 			goto out_unlock;
999 	}
1000 
1001 	/*
1002 	 * Perform hole insertion now that the file size has been
1003 	 * updated so that if we crash during the operation we don't
1004 	 * leave shifted extents past EOF and hence lose access to
1005 	 * the data that is contained within them.
1006 	 */
1007 	if (do_file_insert)
1008 		error = xfs_insert_file_space(ip, offset, len);
1009 
1010 out_unlock:
1011 	xfs_iunlock(ip, iolock);
1012 	return error;
1013 }
1014 
1015 STATIC int
1016 xfs_file_fadvise(
1017 	struct file	*file,
1018 	loff_t		start,
1019 	loff_t		end,
1020 	int		advice)
1021 {
1022 	struct xfs_inode *ip = XFS_I(file_inode(file));
1023 	int ret;
1024 	int lockflags = 0;
1025 
1026 	/*
1027 	 * Operations creating pages in page cache need protection from hole
1028 	 * punching and similar ops
1029 	 */
1030 	if (advice == POSIX_FADV_WILLNEED) {
1031 		lockflags = XFS_IOLOCK_SHARED;
1032 		xfs_ilock(ip, lockflags);
1033 	}
1034 	ret = generic_fadvise(file, start, end, advice);
1035 	if (lockflags)
1036 		xfs_iunlock(ip, lockflags);
1037 	return ret;
1038 }
1039 
1040 /* Does this file, inode, or mount want synchronous writes? */
1041 static inline bool xfs_file_sync_writes(struct file *filp)
1042 {
1043 	struct xfs_inode	*ip = XFS_I(file_inode(filp));
1044 
1045 	if (ip->i_mount->m_flags & XFS_MOUNT_WSYNC)
1046 		return true;
1047 	if (filp->f_flags & (__O_SYNC | O_DSYNC))
1048 		return true;
1049 	if (IS_SYNC(file_inode(filp)))
1050 		return true;
1051 
1052 	return false;
1053 }
1054 
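/*
 * Remap (clone or dedupe) blocks from file_in to file_out, propagating the
 * CoW extent size hint when the entire source file is shared into the
 * entire destination file.
 */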
1055 STATIC loff_t
1056 xfs_file_remap_range(
1057 	struct file		*file_in,
1058 	loff_t			pos_in,
1059 	struct file		*file_out,
1060 	loff_t			pos_out,
1061 	loff_t			len,
1062 	unsigned int		remap_flags)
1063 {
1064 	struct inode		*inode_in = file_inode(file_in);
1065 	struct xfs_inode	*src = XFS_I(inode_in);
1066 	struct inode		*inode_out = file_inode(file_out);
1067 	struct xfs_inode	*dest = XFS_I(inode_out);
1068 	struct xfs_mount	*mp = src->i_mount;
1069 	loff_t			remapped = 0;
1070 	xfs_extlen_t		cowextsize;
1071 	int			ret;
1072 
1073 	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1074 		return -EINVAL;
1075 
1076 	if (!xfs_sb_version_hasreflink(&mp->m_sb))
1077 		return -EOPNOTSUPP;
1078 
1079 	if (XFS_FORCED_SHUTDOWN(mp))
1080 		return -EIO;
1081 
1082 	/* Prepare and then clone file data. */
1083 	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1084 			&len, remap_flags);
1085 	if (ret || len == 0)
1086 		return ret;
1087 
1088 	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1089 
1090 	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1091 			&remapped);
1092 	if (ret)
1093 		goto out_unlock;
1094 
1095 	/*
1096 	 * Carry the cowextsize hint from src to dest if we're sharing the
1097 	 * entire source file to the entire destination file, the source file
1098 	 * has a cowextsize hint, and the destination file does not.
1099 	 */
1100 	cowextsize = 0;
1101 	if (pos_in == 0 && len == i_size_read(inode_in) &&
1102 	    (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1103 	    pos_out == 0 && len >= i_size_read(inode_out) &&
1104 	    !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
1105 		cowextsize = src->i_d.di_cowextsize;
1106 
1107 	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1108 			remap_flags);
1109 	if (ret)
1110 		goto out_unlock;
1111 
1112 	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1113 		xfs_log_force_inode(dest);
1114 out_unlock:
1115 	xfs_iunlock2_io_mmap(src, dest);
1116 	if (ret)
1117 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1118 	return remapped > 0 ? remapped : ret;
1119 }
1120 
1121 STATIC int
1122 xfs_file_open(
1123 	struct inode	*inode,
1124 	struct file	*file)
1125 {
1126 	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
1127 		return -EFBIG;
1128 	if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
1129 		return -EIO;
1130 	file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
1131 	return 0;
1132 }
1133 
1134 STATIC int
1135 xfs_dir_open(
1136 	struct inode	*inode,
1137 	struct file	*file)
1138 {
1139 	struct xfs_inode *ip = XFS_I(inode);
1140 	int		mode;
1141 	int		error;
1142 
1143 	error = xfs_file_open(inode, file);
1144 	if (error)
1145 		return error;
1146 
1147 	/*
1148 	 * If there are any blocks, read-ahead block 0 as we're almost
1149 	 * certain to have the next operation be a read there.
1150 	 */
1151 	mode = xfs_ilock_data_map_shared(ip);
1152 	if (ip->i_df.if_nextents > 0)
1153 		error = xfs_dir3_data_readahead(ip, 0, 0);
1154 	xfs_iunlock(ip, mode);
1155 	return error;
1156 }
1157 
1158 STATIC int
1159 xfs_file_release(
1160 	struct inode	*inode,
1161 	struct file	*filp)
1162 {
1163 	return xfs_release(XFS_I(inode));
1164 }
1165 
1166 STATIC int
1167 xfs_file_readdir(
1168 	struct file	*file,
1169 	struct dir_context *ctx)
1170 {
1171 	struct inode	*inode = file_inode(file);
1172 	xfs_inode_t	*ip = XFS_I(inode);
1173 	size_t		bufsize;
1174 
1175 	/*
1176 	 * The Linux API doesn't pass the total size of the buffer we
1177 	 * read into down to the filesystem.  With the filldir concept
1178 	 * it's not needed for correct information, but the XFS dir2 leaf
1179 	 * code wants an estimate of the buffer size to calculate its
1180 	 * readahead window and size the buffers used for mapping to
1181 	 * physical blocks.
1182 	 *
1183 	 * Try to give it an estimate that's good enough, maybe at some
1184 	 * point we can change the ->readdir prototype to include the
1185 	 * buffer size.  For now we use the current glibc buffer size.
1186 	 */
1187 	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_d.di_size);
1188 
1189 	return xfs_readdir(NULL, ip, ctx, bufsize);
1190 }
1191 
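/*
 * SEEK_HOLE and SEEK_DATA are implemented on top of the iomap seek helpers;
 * all other whence values use generic_file_llseek().
 */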
1192 STATIC loff_t
1193 xfs_file_llseek(
1194 	struct file	*file,
1195 	loff_t		offset,
1196 	int		whence)
1197 {
1198 	struct inode		*inode = file->f_mapping->host;
1199 
1200 	if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount))
1201 		return -EIO;
1202 
1203 	switch (whence) {
1204 	default:
1205 		return generic_file_llseek(file, offset, whence);
1206 	case SEEK_HOLE:
1207 		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1208 		break;
1209 	case SEEK_DATA:
1210 		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1211 		break;
1212 	}
1213 
1214 	if (offset < 0)
1215 		return offset;
1216 	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1217 }
1218 
1219 /*
1220  * Locking for serialisation of IO during page faults. This results in a lock
1221  * ordering of:
1222  *
1223  * mmap_lock (MM)
1224  *   sb_start_pagefault(vfs, freeze)
1225  *     i_mmaplock (XFS - truncate serialisation)
1226  *       page_lock (MM)
1227  *         i_lock (XFS - extent map serialisation)
1228  */
1229 static vm_fault_t
1230 __xfs_filemap_fault(
1231 	struct vm_fault		*vmf,
1232 	enum page_entry_size	pe_size,
1233 	bool			write_fault)
1234 {
1235 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1236 	struct xfs_inode	*ip = XFS_I(inode);
1237 	vm_fault_t		ret;
1238 
1239 	trace_xfs_filemap_fault(ip, pe_size, write_fault);
1240 
1241 	if (write_fault) {
1242 		sb_start_pagefault(inode->i_sb);
1243 		file_update_time(vmf->vma->vm_file);
1244 	}
1245 
1246 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1247 	if (IS_DAX(inode)) {
1248 		pfn_t pfn;
1249 
1250 		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
1251 				(write_fault && !vmf->cow_page) ?
1252 				 &xfs_direct_write_iomap_ops :
1253 				 &xfs_read_iomap_ops);
1254 		if (ret & VM_FAULT_NEEDDSYNC)
1255 			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
1256 	} else {
1257 		if (write_fault)
1258 			ret = iomap_page_mkwrite(vmf,
1259 					&xfs_buffered_write_iomap_ops);
1260 		else
1261 			ret = filemap_fault(vmf);
1262 	}
1263 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1264 
1265 	if (write_fault)
1266 		sb_end_pagefault(inode->i_sb);
1267 	return ret;
1268 }
1269 
1270 static inline bool
1271 xfs_is_write_fault(
1272 	struct vm_fault		*vmf)
1273 {
1274 	return (vmf->flags & FAULT_FLAG_WRITE) &&
1275 	       (vmf->vma->vm_flags & VM_SHARED);
1276 }
1277 
1278 static vm_fault_t
1279 xfs_filemap_fault(
1280 	struct vm_fault		*vmf)
1281 {
1282 	/* DAX can shortcut the normal fault path on write faults! */
1283 	return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
1284 			IS_DAX(file_inode(vmf->vma->vm_file)) &&
1285 			xfs_is_write_fault(vmf));
1286 }
1287 
1288 static vm_fault_t
1289 xfs_filemap_huge_fault(
1290 	struct vm_fault		*vmf,
1291 	enum page_entry_size	pe_size)
1292 {
1293 	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1294 		return VM_FAULT_FALLBACK;
1295 
1296 	/* DAX can shortcut the normal fault path on write faults! */
1297 	return __xfs_filemap_fault(vmf, pe_size,
1298 			xfs_is_write_fault(vmf));
1299 }
1300 
1301 static vm_fault_t
1302 xfs_filemap_page_mkwrite(
1303 	struct vm_fault		*vmf)
1304 {
1305 	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1306 }
1307 
1308 /*
1309  * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1310  * on write faults. In reality, it needs to serialise against truncate and
1311  * prepare memory for writing, so handle it as a standard write fault.
1312  */
1313 static vm_fault_t
1314 xfs_filemap_pfn_mkwrite(
1315 	struct vm_fault		*vmf)
1316 {
1317 
1318 	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1319 }
1320 
1321 static void
1322 xfs_filemap_map_pages(
1323 	struct vm_fault		*vmf,
1324 	pgoff_t			start_pgoff,
1325 	pgoff_t			end_pgoff)
1326 {
1327 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1328 
1329 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1330 	filemap_map_pages(vmf, start_pgoff, end_pgoff);
1331 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1332 }
1333 
1334 static const struct vm_operations_struct xfs_file_vm_ops = {
1335 	.fault		= xfs_filemap_fault,
1336 	.huge_fault	= xfs_filemap_huge_fault,
1337 	.map_pages	= xfs_filemap_map_pages,
1338 	.page_mkwrite	= xfs_filemap_page_mkwrite,
1339 	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
1340 };
1341 
1342 STATIC int
1343 xfs_file_mmap(
1344 	struct file		*file,
1345 	struct vm_area_struct	*vma)
1346 {
1347 	struct inode		*inode = file_inode(file);
1348 	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));
1349 
1350 	/*
1351 	 * We don't support synchronous mappings for non-DAX files and
1352 	 * for DAX files if the underlying dax_device is not synchronous.
1353 	 */
1354 	if (!daxdev_mapping_supported(vma, target->bt_daxdev))
1355 		return -EOPNOTSUPP;
1356 
1357 	file_accessed(file);
1358 	vma->vm_ops = &xfs_file_vm_ops;
1359 	if (IS_DAX(inode))
1360 		vma->vm_flags |= VM_HUGEPAGE;
1361 	return 0;
1362 }
1363 
1364 const struct file_operations xfs_file_operations = {
1365 	.llseek		= xfs_file_llseek,
1366 	.read_iter	= xfs_file_read_iter,
1367 	.write_iter	= xfs_file_write_iter,
1368 	.splice_read	= generic_file_splice_read,
1369 	.splice_write	= iter_file_splice_write,
1370 	.iopoll		= iomap_dio_iopoll,
1371 	.unlocked_ioctl	= xfs_file_ioctl,
1372 #ifdef CONFIG_COMPAT
1373 	.compat_ioctl	= xfs_file_compat_ioctl,
1374 #endif
1375 	.mmap		= xfs_file_mmap,
1376 	.mmap_supported_flags = MAP_SYNC,
1377 	.open		= xfs_file_open,
1378 	.release	= xfs_file_release,
1379 	.fsync		= xfs_file_fsync,
1380 	.get_unmapped_area = thp_get_unmapped_area,
1381 	.fallocate	= xfs_file_fallocate,
1382 	.fadvise	= xfs_file_fadvise,
1383 	.remap_file_range = xfs_file_remap_range,
1384 };
1385 
1386 const struct file_operations xfs_dir_file_operations = {
1387 	.open		= xfs_dir_open,
1388 	.read		= generic_read_dir,
1389 	.iterate_shared	= xfs_file_readdir,
1390 	.llseek		= generic_file_llseek,
1391 	.unlocked_ioctl	= xfs_file_ioctl,
1392 #ifdef CONFIG_COMPAT
1393 	.compat_ioctl	= xfs_file_compat_ioctl,
1394 #endif
1395 	.fsync		= xfs_dir_fsync,
1396 };
1397