1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode_item.h"
16 #include "xfs_bmap.h"
17 #include "xfs_bmap_util.h"
18 #include "xfs_dir2.h"
19 #include "xfs_dir2_priv.h"
20 #include "xfs_ioctl.h"
21 #include "xfs_trace.h"
22 #include "xfs_log.h"
23 #include "xfs_icache.h"
24 #include "xfs_pnfs.h"
25 #include "xfs_iomap.h"
26 #include "xfs_reflink.h"
27 
28 #include <linux/falloc.h>
29 #include <linux/backing-dev.h>
30 #include <linux/mman.h>
31 #include <linux/fadvise.h>
32 
33 static const struct vm_operations_struct xfs_file_vm_ops;
34 
35 /*
36  * Decide if the given file range is aligned to the size of the fundamental
37  * allocation unit for the file.
38  */
39 static bool
40 xfs_is_falloc_aligned(
41 	struct xfs_inode	*ip,
42 	loff_t			pos,
43 	long long int		len)
44 {
45 	struct xfs_mount	*mp = ip->i_mount;
46 	uint64_t		mask;
47 
48 	if (XFS_IS_REALTIME_INODE(ip)) {
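		/*
		 * Realtime files allocate in units of the realtime extent
		 * size.  If that is a power of two we can use the simple mask
		 * test below; otherwise fall back to 64-bit remainders.
		 */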
49 		if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
50 			u64	rextbytes;
51 			u32	mod;
52 
53 			rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
54 			div_u64_rem(pos, rextbytes, &mod);
55 			if (mod)
56 				return false;
57 			div_u64_rem(len, rextbytes, &mod);
58 			return mod == 0;
59 		}
60 		mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
61 	} else {
62 		mask = mp->m_sb.sb_blocksize - 1;
63 	}
64 
65 	return !((pos | len) & mask);
66 }
67 
68 int
69 xfs_update_prealloc_flags(
70 	struct xfs_inode	*ip,
71 	enum xfs_prealloc_flags	flags)
72 {
73 	struct xfs_trans	*tp;
74 	int			error;
75 
76 	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
77 			0, 0, 0, &tp);
78 	if (error)
79 		return error;
80 
81 	xfs_ilock(ip, XFS_ILOCK_EXCL);
82 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
83 
84 	if (!(flags & XFS_PREALLOC_INVISIBLE)) {
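	/*
	 * Unless the caller asked for an invisible change, treat the
	 * preallocation like a regular write: strip the setuid bit (and the
	 * setgid bit if group-execute is set) and update the timestamps.
	 */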
85 		VFS_I(ip)->i_mode &= ~S_ISUID;
86 		if (VFS_I(ip)->i_mode & S_IXGRP)
87 			VFS_I(ip)->i_mode &= ~S_ISGID;
88 		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
89 	}
90 
91 	if (flags & XFS_PREALLOC_SET)
92 		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
93 	if (flags & XFS_PREALLOC_CLEAR)
94 		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
95 
96 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
97 	if (flags & XFS_PREALLOC_SYNC)
98 		xfs_trans_set_sync(tp);
99 	return xfs_trans_commit(tp);
100 }
101 
102 /*
103  * Fsync operations on directories are much simpler than on regular files,
104  * as there is no file data to flush, and thus also no need for explicit
105  * cache flush operations, and there are no non-transaction metadata updates
106  * on directories either.
107  */
108 STATIC int
109 xfs_dir_fsync(
110 	struct file		*file,
111 	loff_t			start,
112 	loff_t			end,
113 	int			datasync)
114 {
115 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
116 
117 	trace_xfs_dir_fsync(ip);
118 	return xfs_log_force_inode(ip);
119 }
120 
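/*
 * Return the LSN the log needs to be forced to in order to persist this
 * inode's logged changes, or 0 if no log force is required (the inode is
 * not pinned, or an fdatasync only has timestamp updates outstanding).
 */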
121 static xfs_lsn_t
122 xfs_fsync_lsn(
123 	struct xfs_inode	*ip,
124 	bool			datasync)
125 {
126 	if (!xfs_ipincount(ip))
127 		return 0;
128 	if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
129 		return 0;
130 	return ip->i_itemp->ili_last_lsn;
131 }
132 
133 /*
134  * All metadata updates are logged, which means that we just have to flush the
135  * log up to the latest LSN that touched the inode.
136  *
137  * If we have concurrent fsync/fdatasync() calls, we need them to all block on
138  * the log force before we clear the ili_fsync_fields field. This ensures that
139  * we don't get a racing sync operation that does not wait for the metadata to
140  * hit the journal before returning.  If we race with clearing ili_fsync_fields,
141  * then all that will happen is the log force will do nothing as the lsn will
142  * already be on disk.  We can't race with setting ili_fsync_fields because that
143  * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
144  * shared until after ili_fsync_fields has been cleared.
145  */
146 static int
147 xfs_fsync_flush_log(
148 	struct xfs_inode	*ip,
149 	bool			datasync,
150 	int			*log_flushed)
151 {
152 	int			error = 0;
153 	xfs_lsn_t		lsn;
154 
155 	xfs_ilock(ip, XFS_ILOCK_SHARED);
156 	lsn = xfs_fsync_lsn(ip, datasync);
157 	if (lsn) {
158 		error = xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC,
159 					  log_flushed);
160 
161 		spin_lock(&ip->i_itemp->ili_lock);
162 		ip->i_itemp->ili_fsync_fields = 0;
163 		spin_unlock(&ip->i_itemp->ili_lock);
164 	}
165 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
166 	return error;
167 }
168 
169 STATIC int
170 xfs_file_fsync(
171 	struct file		*file,
172 	loff_t			start,
173 	loff_t			end,
174 	int			datasync)
175 {
176 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
177 	struct xfs_mount	*mp = ip->i_mount;
178 	int			error = 0;
179 	int			log_flushed = 0;
180 
181 	trace_xfs_file_fsync(ip);
182 
183 	error = file_write_and_wait_range(file, start, end);
184 	if (error)
185 		return error;
186 
187 	if (XFS_FORCED_SHUTDOWN(mp))
188 		return -EIO;
189 
190 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
191 
192 	/*
193 	 * If we have an RT and/or log subvolume we need to make sure to flush
194 	 * the write cache of the device used for file data first.  This is to
195 	 * ensure newly written file data makes it to disk before logging the new
196 	 * inode size in case of an extending write.
197 	 */
198 	if (XFS_IS_REALTIME_INODE(ip))
199 		xfs_blkdev_issue_flush(mp->m_rtdev_targp);
200 	else if (mp->m_logdev_targp != mp->m_ddev_targp)
201 		xfs_blkdev_issue_flush(mp->m_ddev_targp);
202 
203 	/*
204 	 * Any inode that has dirty modifications in the log is pinned.  The
205 	 * racy check here for a pinned inode will not catch modifications
206 	 * that happen concurrently to the fsync call, but fsync semantics
207 	 * only require syncing previously completed I/O.
208 	 */
209 	if (xfs_ipincount(ip))
210 		error = xfs_fsync_flush_log(ip, datasync, &log_flushed);
211 
212 	/*
213 	 * If we only have a single device, and the log force above was
214 	 * a no-op, we might have to flush the data device cache here.
215 	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
216 	 * an already allocated file and thus do not have any metadata to
217 	 * commit.
218 	 */
219 	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
220 	    mp->m_logdev_targp == mp->m_ddev_targp)
221 		xfs_blkdev_issue_flush(mp->m_ddev_targp);
222 
223 	return error;
224 }
225 
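/*
 * Take the inode lock in the requested mode, but return -EAGAIN instead of
 * sleeping if the iocb is marked IOCB_NOWAIT.
 */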
226 static int
227 xfs_ilock_iocb(
228 	struct kiocb		*iocb,
229 	unsigned int		lock_mode)
230 {
231 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
232 
233 	if (iocb->ki_flags & IOCB_NOWAIT) {
234 		if (!xfs_ilock_nowait(ip, lock_mode))
235 			return -EAGAIN;
236 	} else {
237 		xfs_ilock(ip, lock_mode);
238 	}
239 
240 	return 0;
241 }
242 
243 STATIC ssize_t
244 xfs_file_dio_read(
245 	struct kiocb		*iocb,
246 	struct iov_iter		*to)
247 {
248 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
249 	ssize_t			ret;
250 
251 	trace_xfs_file_direct_read(iocb, to);
252 
253 	if (!iov_iter_count(to))
254 		return 0; /* skip atime */
255 
256 	file_accessed(iocb->ki_filp);
257 
258 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
259 	if (ret)
260 		return ret;
261 	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
262 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
263 
264 	return ret;
265 }
266 
267 static noinline ssize_t
268 xfs_file_dax_read(
269 	struct kiocb		*iocb,
270 	struct iov_iter		*to)
271 {
272 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
273 	ssize_t			ret = 0;
274 
275 	trace_xfs_file_dax_read(iocb, to);
276 
277 	if (!iov_iter_count(to))
278 		return 0; /* skip atime */
279 
280 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
281 	if (ret)
282 		return ret;
283 	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
284 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
285 
286 	file_accessed(iocb->ki_filp);
287 	return ret;
288 }
289 
290 STATIC ssize_t
291 xfs_file_buffered_read(
292 	struct kiocb		*iocb,
293 	struct iov_iter		*to)
294 {
295 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
296 	ssize_t			ret;
297 
298 	trace_xfs_file_buffered_read(iocb, to);
299 
300 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
301 	if (ret)
302 		return ret;
303 	ret = generic_file_read_iter(iocb, to);
304 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
305 
306 	return ret;
307 }
308 
309 STATIC ssize_t
310 xfs_file_read_iter(
311 	struct kiocb		*iocb,
312 	struct iov_iter		*to)
313 {
314 	struct inode		*inode = file_inode(iocb->ki_filp);
315 	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
316 	ssize_t			ret = 0;
317 
318 	XFS_STATS_INC(mp, xs_read_calls);
319 
320 	if (XFS_FORCED_SHUTDOWN(mp))
321 		return -EIO;
322 
323 	if (IS_DAX(inode))
324 		ret = xfs_file_dax_read(iocb, to);
325 	else if (iocb->ki_flags & IOCB_DIRECT)
326 		ret = xfs_file_dio_read(iocb, to);
327 	else
328 		ret = xfs_file_buffered_read(iocb, to);
329 
330 	if (ret > 0)
331 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
332 	return ret;
333 }
334 
335 /*
336  * Common pre-write limit and setup checks.
337  *
338  * Called with the iolock held either shared or exclusive according to
339  * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
340  * if called for a direct write beyond i_size.
341  */
342 STATIC ssize_t
343 xfs_file_write_checks(
344 	struct kiocb		*iocb,
345 	struct iov_iter		*from,
346 	int			*iolock)
347 {
348 	struct file		*file = iocb->ki_filp;
349 	struct inode		*inode = file->f_mapping->host;
350 	struct xfs_inode	*ip = XFS_I(inode);
351 	ssize_t			error = 0;
352 	size_t			count = iov_iter_count(from);
353 	bool			drained_dio = false;
354 	loff_t			isize;
355 
356 restart:
357 	error = generic_write_checks(iocb, from);
358 	if (error <= 0)
359 		return error;
360 
361 	if (iocb->ki_flags & IOCB_NOWAIT) {
362 		error = break_layout(inode, false);
363 		if (error == -EWOULDBLOCK)
364 			error = -EAGAIN;
365 	} else {
366 		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
367 	}
368 
369 	if (error)
370 		return error;
371 
372 	/*
373 	 * For changing security info in file_remove_privs() we need i_rwsem
374 	 * exclusively.
375 	 */
376 	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
377 		xfs_iunlock(ip, *iolock);
378 		*iolock = XFS_IOLOCK_EXCL;
379 		error = xfs_ilock_iocb(iocb, *iolock);
380 		if (error) {
381 			*iolock = 0;
382 			return error;
383 		}
384 		goto restart;
385 	}
386 	/*
387 	 * If the offset is beyond the size of the file, we need to zero any
388 	 * blocks that fall between the existing EOF and the start of this
389 	 * write.  If zeroing is needed and we are currently holding the
390 	 * iolock shared, we need to update it to exclusive which implies
391 	 * having to redo all of the checks above.
392 	 *
393 	 * We need to serialise against EOF updates that occur in IO
394 	 * completions here. We want to make sure that nobody is changing the
395 	 * size while we do this check until we have placed an IO barrier (i.e.
396 	 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
397 	 * The spinlock effectively forms a memory barrier once we have the
398 	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
399 	 * and hence be able to correctly determine if we need to run zeroing.
400 	 */
401 	spin_lock(&ip->i_flags_lock);
402 	isize = i_size_read(inode);
403 	if (iocb->ki_pos > isize) {
404 		spin_unlock(&ip->i_flags_lock);
405 
406 		if (iocb->ki_flags & IOCB_NOWAIT)
407 			return -EAGAIN;
408 
409 		if (!drained_dio) {
410 			if (*iolock == XFS_IOLOCK_SHARED) {
411 				xfs_iunlock(ip, *iolock);
412 				*iolock = XFS_IOLOCK_EXCL;
413 				xfs_ilock(ip, *iolock);
414 				iov_iter_reexpand(from, count);
415 			}
416 			/*
417 			 * We now have an IO submission barrier in place, but
418 			 * AIO can do EOF updates during IO completion and hence
419 			 * we now need to wait for all of them to drain. Non-AIO
420 			 * DIO will have drained before we are given the
421 			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
422 			 * no-op.
423 			 */
424 			inode_dio_wait(inode);
425 			drained_dio = true;
426 			goto restart;
427 		}
428 
429 		trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
430 		error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
431 				NULL, &xfs_buffered_write_iomap_ops);
432 		if (error)
433 			return error;
434 	} else
435 		spin_unlock(&ip->i_flags_lock);
436 
437 	return file_modified(file);
438 }
439 
440 static int
441 xfs_dio_write_end_io(
442 	struct kiocb		*iocb,
443 	ssize_t			size,
444 	int			error,
445 	unsigned		flags)
446 {
447 	struct inode		*inode = file_inode(iocb->ki_filp);
448 	struct xfs_inode	*ip = XFS_I(inode);
449 	loff_t			offset = iocb->ki_pos;
450 	unsigned int		nofs_flag;
451 
452 	trace_xfs_end_io_direct_write(ip, offset, size);
453 
454 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
455 		return -EIO;
456 
457 	if (error)
458 		return error;
459 	if (!size)
460 		return 0;
461 
462 	/*
463 	 * Capture amount written on completion as we can't reliably account
464 	 * for it on submission.
465 	 */
466 	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
467 
468 	/*
469 	 * We can allocate memory here while doing writeback on behalf of
470 	 * memory reclaim.  To avoid memory allocation deadlocks set the
471 	 * task-wide nofs context for the following operations.
472 	 */
473 	nofs_flag = memalloc_nofs_save();
474 
475 	if (flags & IOMAP_DIO_COW) {
476 		error = xfs_reflink_end_cow(ip, offset, size);
477 		if (error)
478 			goto out;
479 	}
480 
481 	/*
482 	 * Unwritten conversion updates the in-core isize after extent
483 	 * conversion but before updating the on-disk size. Updating isize any
484 	 * earlier allows a racing dio read to find unwritten extents before
485 	 * they are converted.
486 	 */
487 	if (flags & IOMAP_DIO_UNWRITTEN) {
488 		error = xfs_iomap_write_unwritten(ip, offset, size, true);
489 		goto out;
490 	}
491 
492 	/*
493 	 * We need to update the in-core inode size here so that we don't end up
494 	 * with the on-disk inode size being outside the in-core inode size. We
495 	 * have no other method of updating EOF for AIO, so always do it here
496 	 * if necessary.
497 	 *
498 	 * We need to lock the test/set EOF update as we can be racing with
499 	 * other IO completions here to update the EOF. Failing to serialise
500 	 * here can result in EOF moving backwards and Bad Things Happen when
501 	 * that occurs.
502 	 */
503 	spin_lock(&ip->i_flags_lock);
504 	if (offset + size > i_size_read(inode)) {
505 		i_size_write(inode, offset + size);
506 		spin_unlock(&ip->i_flags_lock);
507 		error = xfs_setfilesize(ip, offset, size);
508 	} else {
509 		spin_unlock(&ip->i_flags_lock);
510 	}
511 
512 out:
513 	memalloc_nofs_restore(nofs_flag);
514 	return error;
515 }
516 
517 static const struct iomap_dio_ops xfs_dio_write_ops = {
518 	.end_io		= xfs_dio_write_end_io,
519 };
520 
521 /*
522  * Handle block aligned direct I/O writes
523  * Handle block-aligned direct I/O writes
524 static noinline ssize_t
525 xfs_file_dio_write_aligned(
526 	struct xfs_inode	*ip,
527 	struct kiocb		*iocb,
528 	struct iov_iter		*from)
529 {
530 	int			iolock = XFS_IOLOCK_SHARED;
531 	ssize_t			ret;
532 
533 	ret = xfs_ilock_iocb(iocb, iolock);
534 	if (ret)
535 		return ret;
536 	ret = xfs_file_write_checks(iocb, from, &iolock);
537 	if (ret)
538 		goto out_unlock;
539 
540 	/*
541 	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
542 	 * the iolock back to shared if we had to take the exclusive lock in
543 	 * xfs_file_write_checks() for other reasons.
544 	 */
545 	if (iolock == XFS_IOLOCK_EXCL) {
546 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
547 		iolock = XFS_IOLOCK_SHARED;
548 	}
549 	trace_xfs_file_direct_write(iocb, from);
550 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
551 			   &xfs_dio_write_ops, 0);
552 out_unlock:
553 	if (iolock)
554 		xfs_iunlock(ip, iolock);
555 	return ret;
556 }
557 
558 /*
559  * Handle block-unaligned direct I/O writes
560  *
561  * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
562  * them to be done in parallel with reads and other direct I/O writes.  However,
563  * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
564  * to do sub-block zeroing and that requires serialisation against other direct
565  * I/O to the same block.  In this case we need to serialise the submission of
566  * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
567  * In the case where sub-block zeroing is not required, we can do concurrent
568  * sub-block dios to the same block successfully.
569  *
570  * Optimistically submit the I/O using the shared lock first, but use the
571  * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
572  * if block allocation or partial block zeroing would be required.  In that case
573  * we try again with the exclusive lock.
574  */
575 static noinline ssize_t
576 xfs_file_dio_write_unaligned(
577 	struct xfs_inode	*ip,
578 	struct kiocb		*iocb,
579 	struct iov_iter		*from)
580 {
581 	size_t			isize = i_size_read(VFS_I(ip));
582 	size_t			count = iov_iter_count(from);
583 	int			iolock = XFS_IOLOCK_SHARED;
584 	unsigned int		flags = IOMAP_DIO_OVERWRITE_ONLY;
585 	ssize_t			ret;
586 
587 	/*
588 	 * Extending writes need exclusivity because of the sub-block zeroing
589 	 * that the DIO code always does for partial tail blocks beyond EOF, so
590 	 * don't even bother trying the fast path in this case.
591 	 */
592 	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
593 retry_exclusive:
594 		if (iocb->ki_flags & IOCB_NOWAIT)
595 			return -EAGAIN;
596 		iolock = XFS_IOLOCK_EXCL;
597 		flags = IOMAP_DIO_FORCE_WAIT;
598 	}
599 
600 	ret = xfs_ilock_iocb(iocb, iolock);
601 	if (ret)
602 		return ret;
603 
604 	/*
605 	 * We can't properly handle unaligned direct I/O to reflink files yet,
606 	 * as we can't unshare a partial block.
607 	 */
608 	if (xfs_is_cow_inode(ip)) {
609 		trace_xfs_reflink_bounce_dio_write(iocb, from);
610 		ret = -ENOTBLK;
611 		goto out_unlock;
612 	}
613 
614 	ret = xfs_file_write_checks(iocb, from, &iolock);
615 	if (ret)
616 		goto out_unlock;
617 
618 	/*
619 	 * If we are doing exclusive unaligned I/O, this must be the only I/O
620 	 * in-flight.  Otherwise we risk data corruption due to unwritten extent
621 	 * conversions from the AIO end_io handler.  Wait for all other I/O to
622 	 * drain first.
623 	 */
624 	if (flags & IOMAP_DIO_FORCE_WAIT)
625 		inode_dio_wait(VFS_I(ip));
626 
627 	trace_xfs_file_direct_write(iocb, from);
628 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
629 			   &xfs_dio_write_ops, flags);
630 
631 	/*
632 	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
633 	 * layer rejected it for mapping or locking reasons. If we are doing
634 	 * nonblocking user I/O, propagate the error.
635 	 */
636 	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
637 		ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
638 		xfs_iunlock(ip, iolock);
639 		goto retry_exclusive;
640 	}
641 
642 out_unlock:
643 	if (iolock)
644 		xfs_iunlock(ip, iolock);
645 	return ret;
646 }
647 
648 static ssize_t
649 xfs_file_dio_write(
650 	struct kiocb		*iocb,
651 	struct iov_iter		*from)
652 {
653 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
654 	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
655 	size_t			count = iov_iter_count(from);
656 
657 	/* direct I/O must be aligned to device logical sector size */
658 	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
659 		return -EINVAL;
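	/*
	 * Writes that are not aligned to the filesystem block size may require
	 * sub-block zeroing and hence take the serialised unaligned path.
	 */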
660 	if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
661 		return xfs_file_dio_write_unaligned(ip, iocb, from);
662 	return xfs_file_dio_write_aligned(ip, iocb, from);
663 }
664 
665 static noinline ssize_t
666 xfs_file_dax_write(
667 	struct kiocb		*iocb,
668 	struct iov_iter		*from)
669 {
670 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
671 	struct xfs_inode	*ip = XFS_I(inode);
672 	int			iolock = XFS_IOLOCK_EXCL;
673 	ssize_t			ret, error = 0;
674 	loff_t			pos;
675 
676 	ret = xfs_ilock_iocb(iocb, iolock);
677 	if (ret)
678 		return ret;
679 	ret = xfs_file_write_checks(iocb, from, &iolock);
680 	if (ret)
681 		goto out;
682 
683 	pos = iocb->ki_pos;
684 
685 	trace_xfs_file_dax_write(iocb, from);
686 	ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
687 	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
688 		i_size_write(inode, iocb->ki_pos);
689 		error = xfs_setfilesize(ip, pos, ret);
690 	}
691 out:
692 	if (iolock)
693 		xfs_iunlock(ip, iolock);
694 	if (error)
695 		return error;
696 
697 	if (ret > 0) {
698 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
699 
700 		/* Handle various SYNC-type writes */
701 		ret = generic_write_sync(iocb, ret);
702 	}
703 	return ret;
704 }
705 
706 STATIC ssize_t
707 xfs_file_buffered_write(
708 	struct kiocb		*iocb,
709 	struct iov_iter		*from)
710 {
711 	struct file		*file = iocb->ki_filp;
712 	struct address_space	*mapping = file->f_mapping;
713 	struct inode		*inode = mapping->host;
714 	struct xfs_inode	*ip = XFS_I(inode);
715 	ssize_t			ret;
716 	bool			cleared_space = false;
717 	int			iolock;
718 
719 	if (iocb->ki_flags & IOCB_NOWAIT)
720 		return -EOPNOTSUPP;
721 
722 write_retry:
723 	iolock = XFS_IOLOCK_EXCL;
724 	xfs_ilock(ip, iolock);
725 
726 	ret = xfs_file_write_checks(iocb, from, &iolock);
727 	if (ret)
728 		goto out;
729 
730 	/* We can write back this queue in page reclaim */
731 	current->backing_dev_info = inode_to_bdi(inode);
732 
733 	trace_xfs_file_buffered_write(iocb, from);
734 	ret = iomap_file_buffered_write(iocb, from,
735 			&xfs_buffered_write_iomap_ops);
736 	if (likely(ret >= 0))
737 		iocb->ki_pos += ret;
738 
739 	/*
740 	 * If we hit a space limit, try to free up some lingering preallocated
741 	 * space before returning an error. In the case of ENOSPC, first try to
742 	 * write back all dirty inodes to free up some of the excess reserved
743 	 * metadata space. This reduces the chances that the eofblocks scan
744 	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
745 	 * also behaves as a filter to prevent too many eofblocks scans from
746 	 * running at the same time.  Use a synchronous scan to increase the
747 	 * effectiveness of the scan.
748 	 */
749 	if (ret == -EDQUOT && !cleared_space) {
750 		xfs_iunlock(ip, iolock);
751 		xfs_blockgc_free_quota(ip, XFS_EOF_FLAGS_SYNC);
752 		cleared_space = true;
753 		goto write_retry;
754 	} else if (ret == -ENOSPC && !cleared_space) {
755 		struct xfs_eofblocks eofb = {0};
756 
757 		cleared_space = true;
758 		xfs_flush_inodes(ip->i_mount);
759 
760 		xfs_iunlock(ip, iolock);
761 		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
762 		xfs_blockgc_free_space(ip->i_mount, &eofb);
763 		goto write_retry;
764 	}
765 
766 	current->backing_dev_info = NULL;
767 out:
768 	if (iolock)
769 		xfs_iunlock(ip, iolock);
770 
771 	if (ret > 0) {
772 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
773 		/* Handle various SYNC-type writes */
774 		ret = generic_write_sync(iocb, ret);
775 	}
776 	return ret;
777 }
778 
779 STATIC ssize_t
780 xfs_file_write_iter(
781 	struct kiocb		*iocb,
782 	struct iov_iter		*from)
783 {
784 	struct file		*file = iocb->ki_filp;
785 	struct address_space	*mapping = file->f_mapping;
786 	struct inode		*inode = mapping->host;
787 	struct xfs_inode	*ip = XFS_I(inode);
788 	ssize_t			ret;
789 	size_t			ocount = iov_iter_count(from);
790 
791 	XFS_STATS_INC(ip->i_mount, xs_write_calls);
792 
793 	if (ocount == 0)
794 		return 0;
795 
796 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
797 		return -EIO;
798 
799 	if (IS_DAX(inode))
800 		return xfs_file_dax_write(iocb, from);
801 
802 	if (iocb->ki_flags & IOCB_DIRECT) {
803 		/*
804 		 * Allow a directio write to fall back to a buffered
805 		 * write *only* in the case that we're doing a reflink
806 		 * CoW.  In all other directio scenarios we do not
807 		 * allow an operation to fall back to buffered mode.
808 		 */
809 		ret = xfs_file_dio_write(iocb, from);
810 		if (ret != -ENOTBLK)
811 			return ret;
812 	}
813 
814 	return xfs_file_buffered_write(iocb, from);
815 }
816 
817 static void
818 xfs_wait_dax_page(
819 	struct inode		*inode)
820 {
821 	struct xfs_inode        *ip = XFS_I(inode);
822 
823 	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
824 	schedule();
825 	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
826 }
827 
828 static int
829 xfs_break_dax_layouts(
830 	struct inode		*inode,
831 	bool			*retry)
832 {
833 	struct page		*page;
834 
835 	ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
836 
837 	page = dax_layout_busy_page(inode->i_mapping);
838 	if (!page)
839 		return 0;
840 
841 	*retry = true;
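	/*
	 * Wait for the page's refcount to drop back to one, i.e. for all
	 * get_user_pages()/DMA references to go away.  xfs_wait_dax_page()
	 * drops and retakes the MMAPLOCK around each sleep.
	 */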
842 	return ___wait_var_event(&page->_refcount,
843 			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
844 			0, 0, xfs_wait_dax_page(inode));
845 }
846 
847 int
848 xfs_break_layouts(
849 	struct inode		*inode,
850 	uint			*iolock,
851 	enum layout_break_reason reason)
852 {
853 	bool			retry;
854 	int			error;
855 
856 	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
857 
858 	do {
859 		retry = false;
860 		switch (reason) {
861 		case BREAK_UNMAP:
862 			error = xfs_break_dax_layouts(inode, &retry);
863 			if (error || retry)
864 				break;
865 			/* fall through */
866 		case BREAK_WRITE:
867 			error = xfs_break_leased_layouts(inode, iolock, &retry);
868 			break;
869 		default:
870 			WARN_ON_ONCE(1);
871 			error = -EINVAL;
872 		}
873 	} while (error == 0 && retry);
874 
875 	return error;
876 }
877 
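/* The set of fallocate modes that XFS implements. */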
878 #define	XFS_FALLOC_FL_SUPPORTED						\
879 		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
880 		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
881 		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
882 
883 STATIC long
884 xfs_file_fallocate(
885 	struct file		*file,
886 	int			mode,
887 	loff_t			offset,
888 	loff_t			len)
889 {
890 	struct inode		*inode = file_inode(file);
891 	struct xfs_inode	*ip = XFS_I(inode);
892 	long			error;
893 	enum xfs_prealloc_flags	flags = 0;
894 	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
895 	loff_t			new_size = 0;
896 	bool			do_file_insert = false;
897 
898 	if (!S_ISREG(inode->i_mode))
899 		return -EINVAL;
900 	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
901 		return -EOPNOTSUPP;
902 
903 	xfs_ilock(ip, iolock);
904 	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
905 	if (error)
906 		goto out_unlock;
907 
908 	/*
909 	 * Must wait for all AIO to complete before we continue as AIO can
910 	 * change the file size on completion without holding any locks we
911 	 * currently hold. We must do this first because AIO can update both
912 	 * the on-disk and in-memory inode sizes, and the operations that follow
913 	 * require the in-memory size to be fully up-to-date.
914 	 */
915 	inode_dio_wait(inode);
916 
917 	/*
918 	 * Now that AIO and DIO have drained, we flush and (if necessary) invalidate
919 	 * the cached range over the first operation we are about to run.
920 	 *
921 	 * We care about zero and collapse here because they both run a hole
922 	 * punch over the range first. Because that can zero data, and the range
923 	 * of invalidation for the shift operations is much larger, we still do
924 	 * the required flush for collapse in xfs_prepare_shift().
925 	 *
926 	 * Insert has the same range requirements as collapse, and we extend the
927 	 * file first which can zero data. Hence insert has the same
928 	 * flush/invalidate requirements as collapse and so they are both
929 	 * handled at the right time by xfs_prepare_shift().
930 	 */
931 	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
932 		    FALLOC_FL_COLLAPSE_RANGE)) {
933 		error = xfs_flush_unmap_range(ip, offset, len);
934 		if (error)
935 			goto out_unlock;
936 	}
937 
938 	if (mode & FALLOC_FL_PUNCH_HOLE) {
939 		error = xfs_free_file_space(ip, offset, len);
940 		if (error)
941 			goto out_unlock;
942 	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
943 		if (!xfs_is_falloc_aligned(ip, offset, len)) {
944 			error = -EINVAL;
945 			goto out_unlock;
946 		}
947 
948 		/*
949 		 * The collapse range must not reach or cross EOF; that case
950 		 * would effectively be a truncate operation instead.
951 		 */
952 		if (offset + len >= i_size_read(inode)) {
953 			error = -EINVAL;
954 			goto out_unlock;
955 		}
956 
957 		new_size = i_size_read(inode) - len;
958 
959 		error = xfs_collapse_file_space(ip, offset, len);
960 		if (error)
961 			goto out_unlock;
962 	} else if (mode & FALLOC_FL_INSERT_RANGE) {
963 		loff_t		isize = i_size_read(inode);
964 
965 		if (!xfs_is_falloc_aligned(ip, offset, len)) {
966 			error = -EINVAL;
967 			goto out_unlock;
968 		}
969 
970 		/*
971 		 * New inode size must not exceed ->s_maxbytes, accounting for
972 		 * possible signed overflow.
973 		 */
974 		if (inode->i_sb->s_maxbytes - isize < len) {
975 			error = -EFBIG;
976 			goto out_unlock;
977 		}
978 		new_size = isize + len;
979 
980 		/* Offset should be less than i_size */
981 		if (offset >= isize) {
982 			error = -EINVAL;
983 			goto out_unlock;
984 		}
985 		do_file_insert = true;
986 	} else {
987 		flags |= XFS_PREALLOC_SET;
988 
989 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
990 		    offset + len > i_size_read(inode)) {
991 			new_size = offset + len;
992 			error = inode_newsize_ok(inode, new_size);
993 			if (error)
994 				goto out_unlock;
995 		}
996 
997 		if (mode & FALLOC_FL_ZERO_RANGE) {
998 			/*
999 			 * Punch a hole and prealloc the range.  We use a hole
1000 			 * punch rather than unwritten extent conversion for two
1001 			 * reasons:
1002 			 *
1003 			 *   1.) Hole punch handles partial block zeroing for us.
1004 			 *   2.) If prealloc returns ENOSPC, the file range is
1005 			 *       still zero-valued by virtue of the hole punch.
1006 			 */
1007 			unsigned int blksize = i_blocksize(inode);
1008 
1009 			trace_xfs_zero_file_space(ip);
1010 
1011 			error = xfs_free_file_space(ip, offset, len);
1012 			if (error)
1013 				goto out_unlock;
1014 
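			/*
			 * Round the range out to whole filesystem blocks so
			 * that the preallocation below also covers the
			 * partial blocks the hole punch just zeroed.
			 */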
1015 			len = round_up(offset + len, blksize) -
1016 			      round_down(offset, blksize);
1017 			offset = round_down(offset, blksize);
1018 		} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
1019 			error = xfs_reflink_unshare(ip, offset, len);
1020 			if (error)
1021 				goto out_unlock;
1022 		} else {
1023 			/*
1024 			 * In always_cow mode we can't use preallocations and
1025 			 * thus should not create them.
1026 			 */
1027 			if (xfs_is_always_cow_inode(ip)) {
1028 				error = -EOPNOTSUPP;
1029 				goto out_unlock;
1030 			}
1031 		}
1032 
1033 		if (!xfs_is_always_cow_inode(ip)) {
1034 			error = xfs_alloc_file_space(ip, offset, len,
1035 						     XFS_BMAPI_PREALLOC);
1036 			if (error)
1037 				goto out_unlock;
1038 		}
1039 	}
1040 
1041 	if (file->f_flags & O_DSYNC)
1042 		flags |= XFS_PREALLOC_SYNC;
1043 
1044 	error = xfs_update_prealloc_flags(ip, flags);
1045 	if (error)
1046 		goto out_unlock;
1047 
1048 	/* Change file size if needed */
1049 	if (new_size) {
1050 		struct iattr iattr;
1051 
1052 		iattr.ia_valid = ATTR_SIZE;
1053 		iattr.ia_size = new_size;
1054 		error = xfs_vn_setattr_size(file_dentry(file), &iattr);
1055 		if (error)
1056 			goto out_unlock;
1057 	}
1058 
1059 	/*
1060 	 * Perform hole insertion now that the file size has been
1061 	 * updated so that if we crash during the operation we don't
1062 	 * leave shifted extents past EOF and hence lose access to
1063 	 * the data that is contained within them.
1064 	 */
1065 	if (do_file_insert)
1066 		error = xfs_insert_file_space(ip, offset, len);
1067 
1068 out_unlock:
1069 	xfs_iunlock(ip, iolock);
1070 	return error;
1071 }
1072 
1073 STATIC int
1074 xfs_file_fadvise(
1075 	struct file	*file,
1076 	loff_t		start,
1077 	loff_t		end,
1078 	int		advice)
1079 {
1080 	struct xfs_inode *ip = XFS_I(file_inode(file));
1081 	int ret;
1082 	int lockflags = 0;
1083 
1084 	/*
1085 	 * Operations creating pages in page cache need protection from hole
1086 	 * punching and similar ops
1087 	 */
1088 	if (advice == POSIX_FADV_WILLNEED) {
1089 		lockflags = XFS_IOLOCK_SHARED;
1090 		xfs_ilock(ip, lockflags);
1091 	}
1092 	ret = generic_fadvise(file, start, end, advice);
1093 	if (lockflags)
1094 		xfs_iunlock(ip, lockflags);
1095 	return ret;
1096 }
1097 
1098 /* Does this file, inode, or mount want synchronous writes? */
1099 static inline bool xfs_file_sync_writes(struct file *filp)
1100 {
1101 	struct xfs_inode	*ip = XFS_I(file_inode(filp));
1102 
1103 	if (ip->i_mount->m_flags & XFS_MOUNT_WSYNC)
1104 		return true;
1105 	if (filp->f_flags & (__O_SYNC | O_DSYNC))
1106 		return true;
1107 	if (IS_SYNC(file_inode(filp)))
1108 		return true;
1109 
1110 	return false;
1111 }
1112 
1113 STATIC loff_t
1114 xfs_file_remap_range(
1115 	struct file		*file_in,
1116 	loff_t			pos_in,
1117 	struct file		*file_out,
1118 	loff_t			pos_out,
1119 	loff_t			len,
1120 	unsigned int		remap_flags)
1121 {
1122 	struct inode		*inode_in = file_inode(file_in);
1123 	struct xfs_inode	*src = XFS_I(inode_in);
1124 	struct inode		*inode_out = file_inode(file_out);
1125 	struct xfs_inode	*dest = XFS_I(inode_out);
1126 	struct xfs_mount	*mp = src->i_mount;
1127 	loff_t			remapped = 0;
1128 	xfs_extlen_t		cowextsize;
1129 	int			ret;
1130 
1131 	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1132 		return -EINVAL;
1133 
1134 	if (!xfs_sb_version_hasreflink(&mp->m_sb))
1135 		return -EOPNOTSUPP;
1136 
1137 	if (XFS_FORCED_SHUTDOWN(mp))
1138 		return -EIO;
1139 
1140 	/* Prepare and then clone file data. */
1141 	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1142 			&len, remap_flags);
1143 	if (ret || len == 0)
1144 		return ret;
1145 
1146 	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1147 
1148 	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1149 			&remapped);
1150 	if (ret)
1151 		goto out_unlock;
1152 
1153 	/*
1154 	 * Carry the cowextsize hint from src to dest if we're sharing the
1155 	 * entire source file to the entire destination file, the source file
1156 	 * has a cowextsize hint, and the destination file does not.
1157 	 */
1158 	cowextsize = 0;
1159 	if (pos_in == 0 && len == i_size_read(inode_in) &&
1160 	    (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1161 	    pos_out == 0 && len >= i_size_read(inode_out) &&
1162 	    !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
1163 		cowextsize = src->i_d.di_cowextsize;
1164 
1165 	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1166 			remap_flags);
1167 	if (ret)
1168 		goto out_unlock;
1169 
1170 	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1171 		xfs_log_force_inode(dest);
1172 out_unlock:
1173 	xfs_iunlock2_io_mmap(src, dest);
1174 	if (ret)
1175 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1176 	return remapped > 0 ? remapped : ret;
1177 }
1178 
1179 STATIC int
1180 xfs_file_open(
1181 	struct inode	*inode,
1182 	struct file	*file)
1183 {
1184 	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
1185 		return -EFBIG;
1186 	if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
1187 		return -EIO;
1188 	file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
1189 	return 0;
1190 }
1191 
1192 STATIC int
1193 xfs_dir_open(
1194 	struct inode	*inode,
1195 	struct file	*file)
1196 {
1197 	struct xfs_inode *ip = XFS_I(inode);
1198 	int		mode;
1199 	int		error;
1200 
1201 	error = xfs_file_open(inode, file);
1202 	if (error)
1203 		return error;
1204 
1205 	/*
1206 	 * If there are any blocks, read-ahead block 0 as we're almost
1207 	 * certain to have the next operation be a read there.
1208 	 */
1209 	mode = xfs_ilock_data_map_shared(ip);
1210 	if (ip->i_df.if_nextents > 0)
1211 		error = xfs_dir3_data_readahead(ip, 0, 0);
1212 	xfs_iunlock(ip, mode);
1213 	return error;
1214 }
1215 
1216 STATIC int
1217 xfs_file_release(
1218 	struct inode	*inode,
1219 	struct file	*filp)
1220 {
1221 	return xfs_release(XFS_I(inode));
1222 }
1223 
1224 STATIC int
1225 xfs_file_readdir(
1226 	struct file	*file,
1227 	struct dir_context *ctx)
1228 {
1229 	struct inode	*inode = file_inode(file);
1230 	xfs_inode_t	*ip = XFS_I(inode);
1231 	size_t		bufsize;
1232 
1233 	/*
1234 	 * The Linux API doesn't pass the total size of the buffer
1235 	 * we read into down to the filesystem.  With the filldir concept
1236 	 * it's not needed for correct information, but the XFS dir2 leaf
1237 	 * code wants an estimate of the buffer size to calculate its
1238 	 * readahead window and size the buffers used for mapping to
1239 	 * physical blocks.
1240 	 *
1241 	 * Try to give it an estimate that's good enough, maybe at some
1242 	 * point we can change the ->readdir prototype to include the
1243 	 * buffer size.  For now we use the current glibc buffer size.
1244 	 */
1245 	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_d.di_size);
1246 
1247 	return xfs_readdir(NULL, ip, ctx, bufsize);
1248 }
1249 
1250 STATIC loff_t
1251 xfs_file_llseek(
1252 	struct file	*file,
1253 	loff_t		offset,
1254 	int		whence)
1255 {
1256 	struct inode		*inode = file->f_mapping->host;
1257 
1258 	if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount))
1259 		return -EIO;
1260 
1261 	switch (whence) {
1262 	default:
1263 		return generic_file_llseek(file, offset, whence);
1264 	case SEEK_HOLE:
1265 		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1266 		break;
1267 	case SEEK_DATA:
1268 		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1269 		break;
1270 	}
1271 
1272 	if (offset < 0)
1273 		return offset;
1274 	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1275 }
1276 
1277 /*
1278  * Locking for serialisation of IO during page faults. This results in a lock
1279  * ordering of:
1280  *
1281  * mmap_lock (MM)
1282  *   sb_start_pagefault(vfs, freeze)
1283  *     i_mmaplock (XFS - truncate serialisation)
1284  *       page_lock (MM)
1285  *         i_lock (XFS - extent map serialisation)
1286  */
1287 static vm_fault_t
1288 __xfs_filemap_fault(
1289 	struct vm_fault		*vmf,
1290 	enum page_entry_size	pe_size,
1291 	bool			write_fault)
1292 {
1293 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1294 	struct xfs_inode	*ip = XFS_I(inode);
1295 	vm_fault_t		ret;
1296 
1297 	trace_xfs_filemap_fault(ip, pe_size, write_fault);
1298 
1299 	if (write_fault) {
1300 		sb_start_pagefault(inode->i_sb);
1301 		file_update_time(vmf->vma->vm_file);
1302 	}
1303 
1304 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1305 	if (IS_DAX(inode)) {
1306 		pfn_t pfn;
1307 
1308 		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
1309 				(write_fault && !vmf->cow_page) ?
1310 				 &xfs_direct_write_iomap_ops :
1311 				 &xfs_read_iomap_ops);
1312 		if (ret & VM_FAULT_NEEDDSYNC)
1313 			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
1314 	} else {
1315 		if (write_fault)
1316 			ret = iomap_page_mkwrite(vmf,
1317 					&xfs_buffered_write_iomap_ops);
1318 		else
1319 			ret = filemap_fault(vmf);
1320 	}
1321 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1322 
1323 	if (write_fault)
1324 		sb_end_pagefault(inode->i_sb);
1325 	return ret;
1326 }
1327 
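/*
 * A fault only counts as a write fault here if it is a write to a shared
 * mapping; writes to private mappings hit a CoW page and never modify the
 * file itself.
 */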
1328 static inline bool
1329 xfs_is_write_fault(
1330 	struct vm_fault		*vmf)
1331 {
1332 	return (vmf->flags & FAULT_FLAG_WRITE) &&
1333 	       (vmf->vma->vm_flags & VM_SHARED);
1334 }
1335 
1336 static vm_fault_t
1337 xfs_filemap_fault(
1338 	struct vm_fault		*vmf)
1339 {
1340 	/* DAX can shortcut the normal fault path on write faults! */
1341 	return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
1342 			IS_DAX(file_inode(vmf->vma->vm_file)) &&
1343 			xfs_is_write_fault(vmf));
1344 }
1345 
1346 static vm_fault_t
1347 xfs_filemap_huge_fault(
1348 	struct vm_fault		*vmf,
1349 	enum page_entry_size	pe_size)
1350 {
1351 	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1352 		return VM_FAULT_FALLBACK;
1353 
1354 	/* DAX can shortcut the normal fault path on write faults! */
1355 	return __xfs_filemap_fault(vmf, pe_size,
1356 			xfs_is_write_fault(vmf));
1357 }
1358 
1359 static vm_fault_t
1360 xfs_filemap_page_mkwrite(
1361 	struct vm_fault		*vmf)
1362 {
1363 	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1364 }
1365 
1366 /*
1367  * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1368  * on write faults. In reality, it needs to serialise against truncate and
1369  * prepare memory for writing so handle is as standard write fault.
1370  * prepare memory for writing, so handle it as a standard write fault.
1371 static vm_fault_t
1372 xfs_filemap_pfn_mkwrite(
1373 	struct vm_fault		*vmf)
1374 {
1375 
1376 	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1377 }
1378 
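/*
 * Map already-cached pages into the faulting VMA.  Hold the MMAPLOCK shared
 * so truncate and hole punch cannot remove the pages while we map them.
 */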
1379 static vm_fault_t
1380 xfs_filemap_map_pages(
1381 	struct vm_fault		*vmf,
1382 	pgoff_t			start_pgoff,
1383 	pgoff_t			end_pgoff)
1384 {
1385 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1386 	vm_fault_t ret;
1387 
1388 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1389 	ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
1390 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1391 	return ret;
1392 }
1393 
1394 static const struct vm_operations_struct xfs_file_vm_ops = {
1395 	.fault		= xfs_filemap_fault,
1396 	.huge_fault	= xfs_filemap_huge_fault,
1397 	.map_pages	= xfs_filemap_map_pages,
1398 	.page_mkwrite	= xfs_filemap_page_mkwrite,
1399 	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
1400 };
1401 
1402 STATIC int
1403 xfs_file_mmap(
1404 	struct file		*file,
1405 	struct vm_area_struct	*vma)
1406 {
1407 	struct inode		*inode = file_inode(file);
1408 	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));
1409 
1410 	/*
1411 	 * We don't support synchronous mappings for non-DAX files and
1412 	 * for DAX files if the underlying dax_device is not synchronous.
1413 	 */
1414 	if (!daxdev_mapping_supported(vma, target->bt_daxdev))
1415 		return -EOPNOTSUPP;
1416 
1417 	file_accessed(file);
1418 	vma->vm_ops = &xfs_file_vm_ops;
1419 	if (IS_DAX(inode))
1420 		vma->vm_flags |= VM_HUGEPAGE;
1421 	return 0;
1422 }
1423 
1424 const struct file_operations xfs_file_operations = {
1425 	.llseek		= xfs_file_llseek,
1426 	.read_iter	= xfs_file_read_iter,
1427 	.write_iter	= xfs_file_write_iter,
1428 	.splice_read	= generic_file_splice_read,
1429 	.splice_write	= iter_file_splice_write,
1430 	.iopoll		= iomap_dio_iopoll,
1431 	.unlocked_ioctl	= xfs_file_ioctl,
1432 #ifdef CONFIG_COMPAT
1433 	.compat_ioctl	= xfs_file_compat_ioctl,
1434 #endif
1435 	.mmap		= xfs_file_mmap,
1436 	.mmap_supported_flags = MAP_SYNC,
1437 	.open		= xfs_file_open,
1438 	.release	= xfs_file_release,
1439 	.fsync		= xfs_file_fsync,
1440 	.get_unmapped_area = thp_get_unmapped_area,
1441 	.fallocate	= xfs_file_fallocate,
1442 	.fadvise	= xfs_file_fadvise,
1443 	.remap_file_range = xfs_file_remap_range,
1444 };
1445 
1446 const struct file_operations xfs_dir_file_operations = {
1447 	.open		= xfs_dir_open,
1448 	.read		= generic_read_dir,
1449 	.iterate_shared	= xfs_file_readdir,
1450 	.llseek		= generic_file_llseek,
1451 	.unlocked_ioctl	= xfs_file_ioctl,
1452 #ifdef CONFIG_COMPAT
1453 	.compat_ioctl	= xfs_file_compat_ioctl,
1454 #endif
1455 	.fsync		= xfs_dir_fsync,
1456 };
1457