xref: /openbmc/linux/fs/xfs/xfs_file.c (revision a2d581675d485eb7188f521f36efc114639a3096)
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_error.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"

#include <linux/dcache.h>
#include <linux/falloc.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>

static const struct vm_operations_struct xfs_file_vm_ops;

/*
 * Clear the specified range to zero through either the pagecache or DAX.
 * Holes and unwritten extents are left as-is since they are already zeroed.
 */
int
xfs_zero_range(
	struct xfs_inode	*ip,
	xfs_off_t		pos,
	xfs_off_t		count,
	bool			*did_zero)
{
	return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
}

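/*
 * Set or clear the XFS_DIFLAG_PREALLOC flag on an inode in its own
 * transaction.  Unless an "invisible" update was requested, this also
 * clears the setuid/setgid bits and bumps the timestamps, matching what
 * an ordinary write would do.  The commit is made synchronous when
 * XFS_PREALLOC_SYNC is set.
 */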
int
xfs_update_prealloc_flags(
	struct xfs_inode	*ip,
	enum xfs_prealloc_flags	flags)
{
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
			0, 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	if (!(flags & XFS_PREALLOC_INVISIBLE)) {
		VFS_I(ip)->i_mode &= ~S_ISUID;
		if (VFS_I(ip)->i_mode & S_IXGRP)
			VFS_I(ip)->i_mode &= ~S_ISGID;
		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	}

	if (flags & XFS_PREALLOC_SET)
		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
	if (flags & XFS_PREALLOC_CLEAR)
		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	if (flags & XFS_PREALLOC_SYNC)
		xfs_trans_set_sync(tp);
	return xfs_trans_commit(tp);
}

/*
 * Fsync operations on directories are much simpler than on regular files,
 * as there is no file data to flush and thus no need for explicit cache
 * flush operations.  Directories also have no non-transactional metadata
 * updates, so forcing the log is all that is required.
 */
STATIC int
xfs_dir_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_lsn_t		lsn = 0;

	trace_xfs_dir_fsync(ip);

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_ipincount(ip))
		lsn = ip->i_itemp->ili_last_lsn;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (!lsn)
		return 0;
	return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
}

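/*
 * Fsync a regular file: write back dirty pagecache data, flush the disk
 * write caches where needed, and force the log up to the last LSN that
 * touched the inode.  For pure fdatasync calls the log force is skipped
 * if only the timestamps are dirty.
 */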
STATIC int
xfs_file_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	int			error = 0;
	int			log_flushed = 0;
	xfs_lsn_t		lsn = 0;

	trace_xfs_file_fsync(ip);

	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (error)
		return error;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	/*
	 * If we have an RT and/or log subvolume we need to make sure to flush
	 * the write cache of the device used for file data first.  This
	 * ensures newly written file data makes it to disk before we log the
	 * new inode size in case of an extending write.
	 */
	if (XFS_IS_REALTIME_INODE(ip))
		xfs_blkdev_issue_flush(mp->m_rtdev_targp);
	else if (mp->m_logdev_targp != mp->m_ddev_targp)
		xfs_blkdev_issue_flush(mp->m_ddev_targp);

	/*
	 * All metadata updates are logged, which means that we just have to
	 * flush the log up to the latest LSN that touched the inode. If we have
	 * concurrent fsync/fdatasync() calls, we need them to all block on the
	 * log force before we clear the ili_fsync_fields field. This ensures
	 * that we don't get a racing sync operation that does not wait for the
	 * metadata to hit the journal before returning. If we race with
	 * clearing the ili_fsync_fields, then all that will happen is the log
	 * force will do nothing as the lsn will already be on disk. We can't
	 * race with setting ili_fsync_fields because that is done under
	 * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
	 * until after the ili_fsync_fields is cleared.
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_ipincount(ip)) {
		if (!datasync ||
		    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
			lsn = ip->i_itemp->ili_last_lsn;
	}

	if (lsn) {
		error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
		ip->i_itemp->ili_fsync_fields = 0;
	}
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	/*
	 * If we only have a single device, and the log force above was
	 * a no-op, we might have to flush the data device cache here.
	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
	 * an already allocated file and thus do not have any metadata to
	 * commit.
	 */
	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
	    mp->m_logdev_targp == mp->m_ddev_targp)
		xfs_blkdev_issue_flush(mp->m_ddev_targp);

	return error;
}

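/*
 * Direct I/O read path.  The iolock is taken shared, so direct reads can
 * run concurrently with other direct I/O on the same inode.
 */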
STATIC ssize_t
xfs_file_dio_aio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	size_t			count = iov_iter_count(to);
	ssize_t			ret;

	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);

	if (!count)
		return 0; /* skip atime */

	file_accessed(iocb->ki_filp);

	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

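/*
 * Read from a DAX inode: data is copied straight from the backing store
 * via dax_iomap_rw(), with no page cache involved.
 */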
static noinline ssize_t
xfs_file_dax_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	size_t			count = iov_iter_count(to);
	ssize_t			ret = 0;

	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);

	if (!count)
		return 0; /* skip atime */

	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	file_accessed(iocb->ki_filp);
	return ret;
}

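/*
 * Buffered read through the page cache, with the iolock held shared to
 * serialise against operations that take it exclusive, such as truncate.
 */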
STATIC ssize_t
xfs_file_buffered_aio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);

	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	ret = generic_file_read_iter(iocb, to);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

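/*
 * Top-level read entry point: dispatch to the DAX, direct or buffered
 * read path and account the bytes read in the filesystem statistics.
 */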
STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (IS_DAX(inode))
		ret = xfs_file_dax_read(iocb, to);
	else if (iocb->ki_flags & IOCB_DIRECT)
		ret = xfs_file_dio_aio_read(iocb, to);
	else
		ret = xfs_file_buffered_aio_read(iocb, to);

	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}

/*
 * Zero any on-disk space between the current EOF and the new, larger EOF.
 *
 * This handles the normal case of zeroing the remainder of the last block in
 * the file and the unusual case of zeroing blocks out beyond the size of the
 * file.  This second case only happens with fixed size extents and when the
 * system crashes before the inode size was updated but after blocks were
 * allocated.
 *
 * Expects the iolock to be held exclusive, and will take the ilock internally.
 */
int					/* error (positive) */
xfs_zero_eof(
	struct xfs_inode	*ip,
	xfs_off_t		offset,		/* starting I/O offset */
	xfs_fsize_t		isize,		/* current inode size */
	bool			*did_zeroing)
{
	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
	ASSERT(offset > isize);

	trace_xfs_zero_eof(ip, isize, offset - isize);
	return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
}

/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_aio_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	int			*iolock)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			error = 0;
	size_t			count = iov_iter_count(from);
	bool			drained_dio = false;

restart:
	error = generic_write_checks(iocb, from);
	if (error <= 0)
		return error;

	error = xfs_break_layouts(inode, iolock);
	if (error)
		return error;

	/*
	 * For changing security info in file_remove_privs() we need i_rwsem
	 * exclusively.
	 */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_iunlock(ip, *iolock);
		*iolock = XFS_IOLOCK_EXCL;
		xfs_ilock(ip, *iolock);
		goto restart;
	}
	/*
	 * If the offset is beyond the size of the file, we need to zero any
	 * blocks that fall between the existing EOF and the start of this
	 * write.  If zeroing is needed and we are currently holding the
	 * iolock shared, we need to update it to exclusive, which implies
	 * having to redo all the checks done before.
	 *
	 * We need to serialise against EOF updates that occur in IO
	 * completions here. We want to make sure that nobody is changing the
	 * size while we do this check until we have placed an IO barrier (i.e.
	 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
	 * The spinlock effectively forms a memory barrier once we have the
	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
	 * and hence be able to correctly determine if we need to run zeroing.
	 */
	spin_lock(&ip->i_flags_lock);
	if (iocb->ki_pos > i_size_read(inode)) {
		bool	zero = false;

		spin_unlock(&ip->i_flags_lock);
		if (!drained_dio) {
			if (*iolock == XFS_IOLOCK_SHARED) {
				xfs_iunlock(ip, *iolock);
				*iolock = XFS_IOLOCK_EXCL;
				xfs_ilock(ip, *iolock);
				iov_iter_reexpand(from, count);
			}
			/*
			 * We now have an IO submission barrier in place, but
			 * AIO can do EOF updates during IO completion and hence
			 * we now need to wait for all of them to drain. Non-AIO
			 * DIO will have drained before we are given the
			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
			 * no-op.
			 */
			inode_dio_wait(inode);
			drained_dio = true;
			goto restart;
		}
		error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
		if (error)
			return error;
	} else
		spin_unlock(&ip->i_flags_lock);

	/*
	 * Updating the timestamps will grab the ilock again from
	 * xfs_fs_dirty_inode, so we have to call it after dropping the
	 * lock above.  Eventually we should look into a way to avoid
	 * the pointless lock roundtrip.
	 */
	if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
		error = file_update_time(file);
		if (error)
			return error;
	}

	/*
	 * If we're writing the file then make sure to clear the setuid and
	 * setgid bits if the process is not being run by root.  This keeps
	 * people from modifying setuid and setgid binaries.
	 */
	if (!IS_NOSEC(inode))
		return file_remove_privs(file);
	return 0;
}

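/*
 * Direct write completion handler: update the in-core file size for
 * extending writes, finish any pending copy-on-write remapping, and
 * convert unwritten extents or log the new on-disk size as needed.
 */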
static int
xfs_dio_write_end_io(
	struct kiocb		*iocb,
	ssize_t			size,
	unsigned		flags)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	loff_t			offset = iocb->ki_pos;
	bool			update_size = false;
	int			error = 0;

	trace_xfs_end_io_direct_write(ip, offset, size);

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	if (size <= 0)
		return size;

	/*
	 * We need to update the in-core inode size here so that we don't end up
	 * with the on-disk inode size being outside the in-core inode size. We
	 * have no other method of updating EOF for AIO, so always do it here
	 * if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF. Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 */
	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode)) {
		i_size_write(inode, offset + size);
		update_size = true;
	}
	spin_unlock(&ip->i_flags_lock);

	if (flags & IOMAP_DIO_COW) {
		error = xfs_reflink_end_cow(ip, offset, size);
		if (error)
			return error;
	}

	if (flags & IOMAP_DIO_UNWRITTEN)
		error = xfs_iomap_write_unwritten(ip, offset, size);
	else if (update_size)
		error = xfs_setfilesize(ip, offset, size);

	return error;
}

/*
 * xfs_file_dio_aio_write - handle direct IO writes
 *
 * Lock the inode appropriately to prepare for and issue a direct IO write.
 * By separating it from the buffered write path we remove all the
 * tricky-to-follow locking changes and looping.
 *
 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
 * pages are flushed out.
 *
 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
 * allowing them to be done in parallel with reads and other direct IO writes.
 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
 * needs to do sub-block zeroing and that requires serialisation against other
 * direct IOs to the same block. In this case we need to serialise the
 * submission of the unaligned IOs so that we don't get racing block zeroing in
 * the dio layer.  To avoid the problem with aio, we also need to wait for
 * outstanding IOs to complete so that unwritten extent conversion is completed
 * before we try to map the overlapping block. This is currently implemented by
 * hitting it with a big hammer (i.e. inode_dio_wait()).
 *
 * Returns with locks held indicated by @iolock and errors indicated by
 * negative return values.
 */
STATIC ssize_t
xfs_file_dio_aio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0;
	int			unaligned_io = 0;
	int			iolock;
	size_t			count = iov_iter_count(from);
	struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
					mp->m_rtdev_targp : mp->m_ddev_targp;

	/* DIO must be aligned to device logical sector size */
	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
		return -EINVAL;

	/*
	 * Don't take the exclusive iolock here unless the I/O is unaligned to
	 * the file system block size.  We don't need to consider the EOF
	 * extension case here because xfs_file_aio_write_checks() will relock
	 * the inode as necessary for EOF zeroing cases and fill out the new
	 * inode size as appropriate.
	 */
	if ((iocb->ki_pos & mp->m_blockmask) ||
	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
		unaligned_io = 1;

		/*
		 * We can't properly handle unaligned direct I/O to reflink
		 * files yet, as we can't unshare a partial block.
		 */
		if (xfs_is_reflink_inode(ip)) {
			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
			return -EREMCHG;
		}
		iolock = XFS_IOLOCK_EXCL;
	} else {
		iolock = XFS_IOLOCK_SHARED;
	}

	xfs_ilock(ip, iolock);

	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;
	count = iov_iter_count(from);

	/*
	 * If we are doing unaligned IO, wait for all other IO to drain,
	 * otherwise demote the lock if we had to take the exclusive lock
	 * for other reasons in xfs_file_aio_write_checks.
	 */
	if (unaligned_io)
		inode_dio_wait(inode);
	else if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
	ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
out:
	xfs_iunlock(ip, iolock);

	/*
	 * No fallback to buffered IO on errors for XFS; direct IO will either
	 * complete fully or fail.
	 */
	ASSERT(ret < 0 || ret == count);
	return ret;
}

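/*
 * Write to a DAX inode.  The iolock is always taken exclusive here; if
 * the write extended the file, the new size is recorded and logged via
 * xfs_setfilesize().
 */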
static noinline ssize_t
xfs_file_dax_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	int			iolock = XFS_IOLOCK_EXCL;
	ssize_t			ret, error = 0;
	size_t			count;
	loff_t			pos;

	xfs_ilock(ip, iolock);
	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

	pos = iocb->ki_pos;
	count = iov_iter_count(from);

	trace_xfs_file_dax_write(ip, count, pos);
	ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
		i_size_write(inode, iocb->ki_pos);
		error = xfs_setfilesize(ip, pos, ret);
	}
out:
	xfs_iunlock(ip, iolock);
	return error ? error : ret;
}

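/*
 * Buffered write through the page cache.  On EDQUOT or ENOSPC we try to
 * free lingering preallocated and CoW blocks (and, for ENOSPC, flush
 * dirty inodes) and then retry the write before giving up.
 */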
STATIC ssize_t
xfs_file_buffered_aio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	int			enospc = 0;
	int			iolock;

write_retry:
	iolock = XFS_IOLOCK_EXCL;
	xfs_ilock(ip, iolock);

	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);

	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
	ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
	if (likely(ret >= 0))
		iocb->ki_pos += ret;

	/*
	 * If we hit a space limit, try to free up some lingering preallocated
	 * space before returning an error. In the case of ENOSPC, first try to
	 * write back all dirty inodes to free up some of the excess reserved
	 * metadata space. This reduces the chances that the eofblocks scan
	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
	 * also behaves as a filter to prevent too many eofblocks scans from
	 * running at the same time.
	 */
	if (ret == -EDQUOT && !enospc) {
		xfs_iunlock(ip, iolock);
		enospc = xfs_inode_free_quota_eofblocks(ip);
		if (enospc)
			goto write_retry;
		enospc = xfs_inode_free_quota_cowblocks(ip);
		if (enospc)
			goto write_retry;
		iolock = 0;
	} else if (ret == -ENOSPC && !enospc) {
		struct xfs_eofblocks eofb = {0};

		enospc = 1;
		xfs_flush_inodes(ip->i_mount);

		xfs_iunlock(ip, iolock);
		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
		goto write_retry;
	}

	current->backing_dev_info = NULL;
out:
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}

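/*
 * Top-level write entry point: dispatch to the DAX, direct or buffered
 * write path.  A direct write that needs reflink copy-on-write falls
 * back to the buffered path; successful writes finish with
 * generic_write_sync() to honour O_SYNC/O_DSYNC semantics.
 */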
STATIC ssize_t
xfs_file_write_iter(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	size_t			ocount = iov_iter_count(from);

	XFS_STATS_INC(ip->i_mount, xs_write_calls);

	if (ocount == 0)
		return 0;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	if (IS_DAX(inode))
		ret = xfs_file_dax_write(iocb, from);
	else if (iocb->ki_flags & IOCB_DIRECT) {
		/*
		 * Allow a directio write to fall back to a buffered
		 * write *only* in the case that we're doing a reflink
		 * CoW.  In all other directio scenarios we do not
		 * allow an operation to fall back to buffered mode.
		 */
		ret = xfs_file_dio_aio_write(iocb, from);
		if (ret == -EREMCHG)
			goto buffered;
	} else {
buffered:
		ret = xfs_file_buffered_aio_write(iocb, from);
	}

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

#define	XFS_FALLOC_FL_SUPPORTED						\
		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)

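/*
 * Implement fallocate() for the modes in XFS_FALLOC_FL_SUPPORTED: hole
 * punching, range collapsing, range insertion, zeroing, unsharing and
 * plain preallocation.  The iolock and mmaplock are held exclusive for
 * the duration, and the file size is updated where required.
 */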
STATIC long
xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	long			error;
	enum xfs_prealloc_flags	flags = 0;
	uint			iolock = XFS_IOLOCK_EXCL;
	loff_t			new_size = 0;
	bool			do_file_insert = false;

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;
	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(inode, &iolock);
	if (error)
		goto out_unlock;

	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
	iolock |= XFS_MMAPLOCK_EXCL;

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		error = xfs_free_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;

		if (offset & blksize_mask || len & blksize_mask) {
			error = -EINVAL;
			goto out_unlock;
		}

		/*
		 * A collapse range that overlaps or reaches EOF is not
		 * allowed; it would effectively be a truncate operation.
		 */
		if (offset + len >= i_size_read(inode)) {
			error = -EINVAL;
			goto out_unlock;
		}

		new_size = i_size_read(inode) - len;

		error = xfs_collapse_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	} else if (mode & FALLOC_FL_INSERT_RANGE) {
		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;

		new_size = i_size_read(inode) + len;
		if (offset & blksize_mask || len & blksize_mask) {
			error = -EINVAL;
			goto out_unlock;
		}

		/* check the new inode size does not wrap through zero */
		if (new_size > inode->i_sb->s_maxbytes) {
			error = -EFBIG;
			goto out_unlock;
		}

		/* Offset should be less than i_size */
		if (offset >= i_size_read(inode)) {
			error = -EINVAL;
			goto out_unlock;
		}
		do_file_insert = true;
	} else {
		flags |= XFS_PREALLOC_SET;

		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    offset + len > i_size_read(inode)) {
			new_size = offset + len;
			error = inode_newsize_ok(inode, new_size);
			if (error)
				goto out_unlock;
		}

		if (mode & FALLOC_FL_ZERO_RANGE)
			error = xfs_zero_file_space(ip, offset, len);
		else {
			if (mode & FALLOC_FL_UNSHARE_RANGE) {
				error = xfs_reflink_unshare(ip, offset, len);
				if (error)
					goto out_unlock;
			}
			error = xfs_alloc_file_space(ip, offset, len,
						     XFS_BMAPI_PREALLOC);
		}
		if (error)
			goto out_unlock;
	}

	if (file->f_flags & O_DSYNC)
		flags |= XFS_PREALLOC_SYNC;

	error = xfs_update_prealloc_flags(ip, flags);
	if (error)
		goto out_unlock;

	/* Change file size if needed */
	if (new_size) {
		struct iattr iattr;

		iattr.ia_valid = ATTR_SIZE;
		iattr.ia_size = new_size;
		error = xfs_vn_setattr_size(file_dentry(file), &iattr);
		if (error)
			goto out_unlock;
	}

	/*
	 * Perform hole insertion now that the file size has been updated, so
	 * that if we crash during the operation we don't leave shifted
	 * extents past EOF and hence lose access to the data that is
	 * contained within them.
	 */
	if (do_file_insert)
		error = xfs_insert_file_space(ip, offset, len);

out_unlock:
	xfs_iunlock(ip, iolock);
	return error;
}

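/*
 * Clone (reflink) a range of blocks from file_in into file_out by
 * sharing extents rather than copying data.
 */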
STATIC int
xfs_file_clone_range(
	struct file	*file_in,
	loff_t		pos_in,
	struct file	*file_out,
	loff_t		pos_out,
	u64		len)
{
	return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
				     len, false);
}

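/*
 * Deduplicate a range between two files: the extents are only remapped
 * to shared blocks when the contents already match, and the full
 * requested length is returned on success.
 */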
STATIC ssize_t
xfs_file_dedupe_range(
	struct file	*src_file,
	u64		loff,
	u64		len,
	struct file	*dst_file,
	u64		dst_loff)
{
	int		error;

	error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff,
				     len, true);
	if (error)
		return error;
	return len;
}

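/*
 * Reject opens of large files when O_LARGEFILE is not set, and any open
 * on a filesystem that has been shut down.
 */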
STATIC int
xfs_file_open(
	struct inode	*inode,
	struct file	*file)
{
	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
		return -EFBIG;
	if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
		return -EIO;
	return 0;
}

STATIC int
xfs_dir_open(
	struct inode	*inode,
	struct file	*file)
{
	struct xfs_inode *ip = XFS_I(inode);
	int		mode;
	int		error;

	error = xfs_file_open(inode, file);
	if (error)
		return error;

	/*
	 * If there are any blocks, read-ahead block 0 as we're almost
	 * certain to have the next operation be a read there.
	 */
	mode = xfs_ilock_data_map_shared(ip);
	if (ip->i_d.di_nextents > 0)
		error = xfs_dir3_data_readahead(ip, 0, -1);
	xfs_iunlock(ip, mode);
	return error;
}

STATIC int
xfs_file_release(
	struct inode	*inode,
	struct file	*filp)
{
	return xfs_release(XFS_I(inode));
}

STATIC int
xfs_file_readdir(
	struct file	*file,
	struct dir_context *ctx)
{
	struct inode	*inode = file_inode(file);
	xfs_inode_t	*ip = XFS_I(inode);
	size_t		bufsize;

	/*
	 * The Linux API doesn't pass the total size of the buffer we read
	 * into down to the filesystem.  With the filldir concept it's not
	 * needed for correct information, but the XFS dir2 leaf code wants
	 * an estimate of the buffer size to calculate its readahead window
	 * and size the buffers used for mapping to physical blocks.
	 *
	 * Try to give it an estimate that's good enough, maybe at some
	 * point we can change the ->readdir prototype to include the
	 * buffer size.  For now we use the current glibc buffer size.
	 */
	bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);

	return xfs_readdir(ip, ctx, bufsize);
}

/*
 * This enum indicates the type of offset we would like to search for in
 * the page cache for xfs_seek_hole_data().
 */
enum {
	HOLE_OFF = 0,
	DATA_OFF,
};

/*
 * Lookup the desired type of offset from the given page.
 *
 * On success, return true and the offset argument will point to the
 * start of the region that was found.  Otherwise this function will
 * return false and keep the offset argument unchanged.
 */
STATIC bool
xfs_lookup_buffer_offset(
	struct page		*page,
	loff_t			*offset,
	unsigned int		type)
{
	loff_t			lastoff = page_offset(page);
	bool			found = false;
	struct buffer_head	*bh, *head;

	bh = head = page_buffers(page);
	do {
		/*
		 * Unwritten extents that have data in the page
		 * cache covering them can be identified by the
		 * BH_Unwritten state flag.  Pages with multiple
		 * buffers might have a mix of holes, data and
		 * unwritten extents - any buffer with valid
		 * data in it should have BH_Uptodate flag set
		 * on it.
		 */
		if (buffer_unwritten(bh) ||
		    buffer_uptodate(bh)) {
			if (type == DATA_OFF)
				found = true;
		} else {
			if (type == HOLE_OFF)
				found = true;
		}

		if (found) {
			*offset = lastoff;
			break;
		}
		lastoff += bh->b_size;
	} while ((bh = bh->b_this_page) != head);

	return found;
}

/*
 * This routine is called to find out and return a data or hole offset
 * from the page cache for unwritten extents according to the desired
 * type for xfs_seek_hole_data().
 *
 * The offset argument tells where to start searching in the page cache.
 * The map argument is used to figure out the end points of the range in
 * which to look up pages.
 *
 * Return true if the desired type of offset was found, with the offset
 * argument filled in with that address.  Otherwise, return false and
 * keep offset unchanged.
 */
STATIC bool
xfs_find_get_desired_pgoff(
	struct inode		*inode,
	struct xfs_bmbt_irec	*map,
	unsigned int		type,
	loff_t			*offset)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	struct pagevec		pvec;
	pgoff_t			index;
	pgoff_t			end;
	loff_t			endoff;
	loff_t			startoff = *offset;
	loff_t			lastoff = startoff;
	bool			found = false;

	pagevec_init(&pvec, 0);

	index = startoff >> PAGE_SHIFT;
	endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount);
	end = endoff >> PAGE_SHIFT;
	do {
		int		want;
		unsigned	nr_pages;
		unsigned int	i;

		want = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
					  want);
1055d126d43fSJeff Liu 		/*
1056d126d43fSJeff Liu 		 * No pages are mapped into the given range.  If we are
1057d126d43fSJeff Liu 		 * searching for holes and this is the first pass through the
1058d126d43fSJeff Liu 		 * loop, the given offset lands in a hole, so return it.
1059d126d43fSJeff Liu 		 *
1060d126d43fSJeff Liu 		 * If we have already stepped through some block buffers looking
1061d126d43fSJeff Liu 		 * for holes but they all contained data, the last offset has
1062d126d43fSJeff Liu 		 * already been updated to point past the end of the last mapped
1063d126d43fSJeff Liu 		 * page.  If it has not reached the end of the search range,
1064d126d43fSJeff Liu 		 * there must be a hole between them.
1065d126d43fSJeff Liu 		 */
1066d126d43fSJeff Liu 		if (nr_pages == 0) {
1067d126d43fSJeff Liu 			/* Data search found nothing */
1068d126d43fSJeff Liu 			if (type == DATA_OFF)
1069d126d43fSJeff Liu 				break;
1070d126d43fSJeff Liu 
1071d126d43fSJeff Liu 			ASSERT(type == HOLE_OFF);
1072d126d43fSJeff Liu 			if (lastoff == startoff || lastoff < endoff) {
1073d126d43fSJeff Liu 				found = true;
1074d126d43fSJeff Liu 				*offset = lastoff;
1075d126d43fSJeff Liu 			}
1076d126d43fSJeff Liu 			break;
1077d126d43fSJeff Liu 		}
1078d126d43fSJeff Liu 
1079d126d43fSJeff Liu 		/*
1080d126d43fSJeff Liu 		 * At least one page was found.  If this is the first pass
1081d126d43fSJeff Liu 		 * through the loop and the offset of the first page is greater
1082d126d43fSJeff Liu 		 * than the given search offset, we have found a hole.
1083d126d43fSJeff Liu 		 */
1084d126d43fSJeff Liu 		if (type == HOLE_OFF && lastoff == startoff &&
1085d126d43fSJeff Liu 		    lastoff < page_offset(pvec.pages[0])) {
1086d126d43fSJeff Liu 			found = true;
1087d126d43fSJeff Liu 			break;
1088d126d43fSJeff Liu 		}
1089d126d43fSJeff Liu 
1090d126d43fSJeff Liu 		for (i = 0; i < nr_pages; i++) {
1091d126d43fSJeff Liu 			struct page	*page = pvec.pages[i];
1092d126d43fSJeff Liu 			loff_t		b_offset;
1093d126d43fSJeff Liu 
1094d126d43fSJeff Liu 			/*
1095d126d43fSJeff Liu 			 * At this point, the page may be truncated or
1096d126d43fSJeff Liu 			 * invalidated (changing page->mapping to NULL),
1097d126d43fSJeff Liu 			 * or even swizzled back from swapper_space to tmpfs
1098d126d43fSJeff Liu 			 * file mapping. However, page->index will not change
1099d126d43fSJeff Liu 			 * because we have a reference on the page.
1100d126d43fSJeff Liu 			 *
1101d126d43fSJeff Liu 			 * The search is done once the page index is out of
1102d126d43fSJeff Liu 			 * range.  If the current offset has not reached the end
1103d126d43fSJeff Liu 			 * of the specified search range, there must be a hole
1104d126d43fSJeff Liu 			 * between them.
1105d126d43fSJeff Liu 			 */
1106d126d43fSJeff Liu 			if (page->index > end) {
1107d126d43fSJeff Liu 				if (type == HOLE_OFF && lastoff < endoff) {
1108d126d43fSJeff Liu 					*offset = lastoff;
1109d126d43fSJeff Liu 					found = true;
1110d126d43fSJeff Liu 				}
1111d126d43fSJeff Liu 				goto out;
1112d126d43fSJeff Liu 			}
1113d126d43fSJeff Liu 
1114d126d43fSJeff Liu 			lock_page(page);
1115d126d43fSJeff Liu 			/*
1116d126d43fSJeff Liu 			 * The page was truncated or invalidated
1117d126d43fSJeff Liu 			 * (page->mapping == NULL).  We can safely skip it and
1118d126d43fSJeff Liu 			 * proceed to check the next page.
1119d126d43fSJeff Liu 			 */
1120d126d43fSJeff Liu 			if (unlikely(page->mapping != inode->i_mapping)) {
1121d126d43fSJeff Liu 				unlock_page(page);
1122d126d43fSJeff Liu 				continue;
1123d126d43fSJeff Liu 			}
1124d126d43fSJeff Liu 
1125d126d43fSJeff Liu 			if (!page_has_buffers(page)) {
1126d126d43fSJeff Liu 				unlock_page(page);
1127d126d43fSJeff Liu 				continue;
1128d126d43fSJeff Liu 			}
1129d126d43fSJeff Liu 
1130d126d43fSJeff Liu 			found = xfs_lookup_buffer_offset(page, &b_offset, type);
1131d126d43fSJeff Liu 			if (found) {
1132d126d43fSJeff Liu 				/*
1133d126d43fSJeff Liu 				 * The offset we found may be less than the
1134d126d43fSJeff Liu 				 * start of the search range if this is the
1135d126d43fSJeff Liu 				 * first pass through the loop.
1136d126d43fSJeff Liu 				 */
1137d126d43fSJeff Liu 				*offset = max_t(loff_t, startoff, b_offset);
1138d126d43fSJeff Liu 				unlock_page(page);
1139d126d43fSJeff Liu 				goto out;
1140d126d43fSJeff Liu 			}
1141d126d43fSJeff Liu 
1142d126d43fSJeff Liu 			/*
1143d126d43fSJeff Liu 			 * We were either searching for data and found nothing,
1144d126d43fSJeff Liu 			 * or searching for a hole and found a data buffer.  In
1145d126d43fSJeff Liu 			 * either case the next page may contain what we want,
1146d126d43fSJeff Liu 			 * so update the last offset accordingly.
1147d126d43fSJeff Liu 			 */
1148d126d43fSJeff Liu 			lastoff = page_offset(page) + PAGE_SIZE;
1149d126d43fSJeff Liu 			unlock_page(page);
1150d126d43fSJeff Liu 		}
1151d126d43fSJeff Liu 
1152d126d43fSJeff Liu 		/*
1153d126d43fSJeff Liu 		 * Fewer pages were returned than we asked for, so the search
1154d126d43fSJeff Liu 		 * is done.  No data was found in that case, but when searching
1155d126d43fSJeff Liu 		 * for a hole we found one behind the last offset.
1156d126d43fSJeff Liu 		 */
1157d126d43fSJeff Liu 		if (nr_pages < want) {
1158d126d43fSJeff Liu 			if (type == HOLE_OFF) {
1159d126d43fSJeff Liu 				*offset = lastoff;
1160d126d43fSJeff Liu 				found = true;
1161d126d43fSJeff Liu 			}
1162d126d43fSJeff Liu 			break;
1163d126d43fSJeff Liu 		}
1164d126d43fSJeff Liu 
1165d126d43fSJeff Liu 		index = pvec.pages[i - 1]->index + 1;
1166d126d43fSJeff Liu 		pagevec_release(&pvec);
1167d126d43fSJeff Liu 	} while (index <= end);
1168d126d43fSJeff Liu 
1169d126d43fSJeff Liu out:
1170d126d43fSJeff Liu 	pagevec_release(&pvec);
1171d126d43fSJeff Liu 	return found;
1172d126d43fSJeff Liu }
1173d126d43fSJeff Liu 
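/*
 * Illustrative userspace sketch (file name and sizes are arbitrary): a
 * page dirtied over a preallocated, still-unwritten extent is reported by
 * SEEK_DATA via the page cache walk above, even though the on-disk extent
 * state alone says nothing has been written yet:
 *
 *	int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);
 *	char buf[4096];
 *
 *	memset(buf, 0xab, sizeof(buf));
 *	posix_fallocate(fd, 0, 1024 * 1024);	allocates unwritten extents
 *	pwrite(fd, buf, sizeof(buf), 64 * 1024);
 *	lseek(fd, 0, SEEK_DATA);		expected to return 64k via
 *						the cached page, not ENXIO
 */
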
11748aa7d37eSEric Sandeen /*
11758aa7d37eSEric Sandeen  * The caller must have locked the inode with xfs_ilock_data_map_shared();
11768aa7d37eSEric Sandeen  * can we craft an appropriate ASSERT for that?
11778aa7d37eSEric Sandeen  *
11788aa7d37eSEric Sandeen  * The end argument exists because the VFS-level lseek interface is defined
11798aa7d37eSEric Sandeen  * such that any offset past i_size shall return -ENXIO, but the quota code
11808aa7d37eSEric Sandeen  * uses this without maintaining i_size and wants SEEK_DATA past i_size.
11818aa7d37eSEric Sandeen  */
11828aa7d37eSEric Sandeen loff_t
11838aa7d37eSEric Sandeen __xfs_seek_hole_data(
11848aa7d37eSEric Sandeen 	struct inode		*inode,
118549c69591SEric Sandeen 	loff_t			start,
11868aa7d37eSEric Sandeen 	loff_t			end,
118749c69591SEric Sandeen 	int			whence)
11883fe3e6b1SJeff Liu {
11893fe3e6b1SJeff Liu 	struct xfs_inode	*ip = XFS_I(inode);
11903fe3e6b1SJeff Liu 	struct xfs_mount	*mp = ip->i_mount;
11913fe3e6b1SJeff Liu 	loff_t			uninitialized_var(offset);
11923fe3e6b1SJeff Liu 	xfs_fileoff_t		fsbno;
11938aa7d37eSEric Sandeen 	xfs_filblks_t		lastbno;
11943fe3e6b1SJeff Liu 	int			error;
11953fe3e6b1SJeff Liu 
11968aa7d37eSEric Sandeen 	if (start >= end) {
11972451337dSDave Chinner 		error = -ENXIO;
11988aa7d37eSEric Sandeen 		goto out_error;
11993fe3e6b1SJeff Liu 	}
12003fe3e6b1SJeff Liu 
120149c69591SEric Sandeen 	/*
120249c69591SEric Sandeen 	 * Try to read extents from the first block indicated by fsbno
120349c69591SEric Sandeen 	 * up to the last block of the range (lastbno).
120449c69591SEric Sandeen 	 */
12053fe3e6b1SJeff Liu 	fsbno = XFS_B_TO_FSBT(mp, start);
12068aa7d37eSEric Sandeen 	lastbno = XFS_B_TO_FSB(mp, end);
1207b686d1f7SJeff Liu 
1208b686d1f7SJeff Liu 	for (;;) {
1209b686d1f7SJeff Liu 		struct xfs_bmbt_irec	map[2];
1210b686d1f7SJeff Liu 		int			nmap = 2;
1211b686d1f7SJeff Liu 		unsigned int		i;
1212b686d1f7SJeff Liu 
12138aa7d37eSEric Sandeen 		error = xfs_bmapi_read(ip, fsbno, lastbno - fsbno, map, &nmap,
1214b686d1f7SJeff Liu 				       XFS_BMAPI_ENTIRE);
12153fe3e6b1SJeff Liu 		if (error)
12168aa7d37eSEric Sandeen 			goto out_error;
12173fe3e6b1SJeff Liu 
1218b686d1f7SJeff Liu 		/* No extents at given offset, must be beyond EOF */
1219b686d1f7SJeff Liu 		if (nmap == 0) {
12202451337dSDave Chinner 			error = -ENXIO;
12218aa7d37eSEric Sandeen 			goto out_error;
12223fe3e6b1SJeff Liu 		}
12233fe3e6b1SJeff Liu 
1224b686d1f7SJeff Liu 		for (i = 0; i < nmap; i++) {
1225b686d1f7SJeff Liu 			offset = max_t(loff_t, start,
1226b686d1f7SJeff Liu 				       XFS_FSB_TO_B(mp, map[i].br_startoff));
1227b686d1f7SJeff Liu 
122849c69591SEric Sandeen 			/* Landed in the hole we wanted? */
122949c69591SEric Sandeen 			if (whence == SEEK_HOLE &&
123049c69591SEric Sandeen 			    map[i].br_startblock == HOLESTARTBLOCK)
123149c69591SEric Sandeen 				goto out;
123249c69591SEric Sandeen 
123349c69591SEric Sandeen 			/* Landed in the data extent we wanted? */
123449c69591SEric Sandeen 			if (whence == SEEK_DATA &&
123549c69591SEric Sandeen 			    (map[i].br_startblock == DELAYSTARTBLOCK ||
123649c69591SEric Sandeen 			     (map[i].br_state == XFS_EXT_NORM &&
123749c69591SEric Sandeen 			      !isnullstartblock(map[i].br_startblock))))
1238b686d1f7SJeff Liu 				goto out;
1239b686d1f7SJeff Liu 
1240b686d1f7SJeff Liu 			/*
124149c69591SEric Sandeen 			 * Landed in an unwritten extent, try to search
124249c69591SEric Sandeen 			 * for hole or data from page cache.
1243b686d1f7SJeff Liu 			 */
1244b686d1f7SJeff Liu 			if (map[i].br_state == XFS_EXT_UNWRITTEN) {
1245b686d1f7SJeff Liu 				if (xfs_find_get_desired_pgoff(inode, &map[i],
124649c69591SEric Sandeen 				      whence == SEEK_HOLE ? HOLE_OFF : DATA_OFF,
124749c69591SEric Sandeen 							&offset))
1248b686d1f7SJeff Liu 					goto out;
1249b686d1f7SJeff Liu 			}
1250b686d1f7SJeff Liu 		}
1251b686d1f7SJeff Liu 
1252b686d1f7SJeff Liu 		/*
125349c69591SEric Sandeen 		 * We only received one extent out of the two requested. This
125449c69591SEric Sandeen 		 * means we've hit EOF and didn't find what we are looking for.
1255b686d1f7SJeff Liu 		 */
1256b686d1f7SJeff Liu 		if (nmap == 1) {
125749c69591SEric Sandeen 			/*
125849c69591SEric Sandeen 			 * If we were looking for a hole, set offset to
125949c69591SEric Sandeen 			 * the end of the file (i.e., there is an implicit
126049c69591SEric Sandeen 			 * hole at the end of any file).
126149c69591SEric Sandeen 			 */
126249c69591SEric Sandeen 			if (whence == SEEK_HOLE) {
12638aa7d37eSEric Sandeen 				offset = end;
1264b686d1f7SJeff Liu 				break;
1265b686d1f7SJeff Liu 			}
126649c69591SEric Sandeen 			/*
126749c69591SEric Sandeen 			 * If we were looking for data, it's nowhere to be found
126849c69591SEric Sandeen 			 */
126949c69591SEric Sandeen 			ASSERT(whence == SEEK_DATA);
127049c69591SEric Sandeen 			error = -ENXIO;
12718aa7d37eSEric Sandeen 			goto out_error;
127249c69591SEric Sandeen 		}
1273b686d1f7SJeff Liu 
1274b686d1f7SJeff Liu 		ASSERT(i > 1);
1275b686d1f7SJeff Liu 
1276b686d1f7SJeff Liu 		/*
127749c69591SEric Sandeen 		 * Nothing was found; proceed to the next round of the search
127849c69591SEric Sandeen 		 * if the next reading offset is not at or beyond EOF.
1279b686d1f7SJeff Liu 		 */
1280b686d1f7SJeff Liu 		fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
1281b686d1f7SJeff Liu 		start = XFS_FSB_TO_B(mp, fsbno);
12828aa7d37eSEric Sandeen 		if (start >= end) {
128349c69591SEric Sandeen 			if (whence == SEEK_HOLE) {
12848aa7d37eSEric Sandeen 				offset = end;
1285b686d1f7SJeff Liu 				break;
1286b686d1f7SJeff Liu 			}
128749c69591SEric Sandeen 			ASSERT(whence == SEEK_DATA);
128849c69591SEric Sandeen 			error = -ENXIO;
12898aa7d37eSEric Sandeen 			goto out_error;
129049c69591SEric Sandeen 		}
1291b686d1f7SJeff Liu 	}
1292b686d1f7SJeff Liu 
1293b686d1f7SJeff Liu out:
1294b686d1f7SJeff Liu 	/*
129549c69591SEric Sandeen 	 * If at this point we have found the hole we wanted, the returned
1296b686d1f7SJeff Liu 	 * offset may be bigger than the file size, as it may be aligned to a
129749c69591SEric Sandeen 	 * page boundary for unwritten extents.  Deal with that case here by
1298b686d1f7SJeff Liu 	 * clamping the offset to the end of the range.
1299b686d1f7SJeff Liu 	 */
130049c69591SEric Sandeen 	if (whence == SEEK_HOLE)
13018aa7d37eSEric Sandeen 		offset = min_t(loff_t, offset, end);
13028aa7d37eSEric Sandeen 
13038aa7d37eSEric Sandeen 	return offset;
13048aa7d37eSEric Sandeen 
13058aa7d37eSEric Sandeen out_error:
13068aa7d37eSEric Sandeen 	return error;
13078aa7d37eSEric Sandeen }
13088aa7d37eSEric Sandeen 
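/*
 * A condensed view of how the loop above classifies each extent record:
 *
 *	br_startblock == HOLESTARTBLOCK		-> hole
 *	br_startblock == DELAYSTARTBLOCK	-> data (delalloc)
 *	XFS_EXT_NORM, real startblock		-> data
 *	XFS_EXT_UNWRITTEN			-> data only where the page
 *						   cache holds pages over it
 */
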
13098aa7d37eSEric Sandeen STATIC loff_t
13108aa7d37eSEric Sandeen xfs_seek_hole_data(
13118aa7d37eSEric Sandeen 	struct file		*file,
13128aa7d37eSEric Sandeen 	loff_t			start,
13138aa7d37eSEric Sandeen 	int			whence)
13148aa7d37eSEric Sandeen {
13158aa7d37eSEric Sandeen 	struct inode		*inode = file->f_mapping->host;
13168aa7d37eSEric Sandeen 	struct xfs_inode	*ip = XFS_I(inode);
13178aa7d37eSEric Sandeen 	struct xfs_mount	*mp = ip->i_mount;
13188aa7d37eSEric Sandeen 	uint			lock;
13198aa7d37eSEric Sandeen 	loff_t			offset, end;
13208aa7d37eSEric Sandeen 	int			error = 0;
13218aa7d37eSEric Sandeen 
13228aa7d37eSEric Sandeen 	if (XFS_FORCED_SHUTDOWN(mp))
13238aa7d37eSEric Sandeen 		return -EIO;
13248aa7d37eSEric Sandeen 
13258aa7d37eSEric Sandeen 	lock = xfs_ilock_data_map_shared(ip);
13268aa7d37eSEric Sandeen 
13278aa7d37eSEric Sandeen 	end = i_size_read(inode);
13288aa7d37eSEric Sandeen 	offset = __xfs_seek_hole_data(inode, start, end, whence);
13298aa7d37eSEric Sandeen 	if (offset < 0) {
13308aa7d37eSEric Sandeen 		error = offset;
13318aa7d37eSEric Sandeen 		goto out_unlock;
13328aa7d37eSEric Sandeen 	}
13338aa7d37eSEric Sandeen 
133446a1c2c7SJie Liu 	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
13353fe3e6b1SJeff Liu 
13363fe3e6b1SJeff Liu out_unlock:
133701f4f327SChristoph Hellwig 	xfs_iunlock(ip, lock);
13383fe3e6b1SJeff Liu 
13393fe3e6b1SJeff Liu 	if (error)
13402451337dSDave Chinner 		return error;
13413fe3e6b1SJeff Liu 	return offset;
13423fe3e6b1SJeff Liu }
13433fe3e6b1SJeff Liu 
13443fe3e6b1SJeff Liu STATIC loff_t
13453fe3e6b1SJeff Liu xfs_file_llseek(
13463fe3e6b1SJeff Liu 	struct file	*file,
13473fe3e6b1SJeff Liu 	loff_t		offset,
134859f9c004SEric Sandeen 	int		whence)
13493fe3e6b1SJeff Liu {
135059f9c004SEric Sandeen 	switch (whence) {
13513fe3e6b1SJeff Liu 	case SEEK_END:
13523fe3e6b1SJeff Liu 	case SEEK_CUR:
13533fe3e6b1SJeff Liu 	case SEEK_SET:
135459f9c004SEric Sandeen 		return generic_file_llseek(file, offset, whence);
13553fe3e6b1SJeff Liu 	case SEEK_HOLE:
135649c69591SEric Sandeen 	case SEEK_DATA:
135759f9c004SEric Sandeen 		return xfs_seek_hole_data(file, offset, whence);
13583fe3e6b1SJeff Liu 	default:
13593fe3e6b1SJeff Liu 		return -EINVAL;
13603fe3e6b1SJeff Liu 	}
13613fe3e6b1SJeff Liu }
13623fe3e6b1SJeff Liu 
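/*
 * Illustrative userspace sketch of the SEEK_HOLE/SEEK_DATA semantics
 * implemented above (the path and offsets are arbitrary):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		char buf[4096];
 *		int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);
 *
 *		memset(buf, 0xab, sizeof(buf));
 *		pwrite(fd, buf, sizeof(buf), 1024 * 1024);
 *
 *		printf("data: %lld\n", (long long)lseek(fd, 0, SEEK_DATA));
 *		printf("hole: %lld\n", (long long)lseek(fd, 0, SEEK_HOLE));
 *		close(fd);
 *		return 0;
 *	}
 *
 * The pwrite() leaves a 1MB hole at the start of the file, so SEEK_DATA
 * from offset 0 is expected to return 1MB while SEEK_HOLE returns 0.
 */
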
1363de0e8c20SDave Chinner /*
1364de0e8c20SDave Chinner  * Locking for serialisation of IO during page faults. This results in a lock
1365de0e8c20SDave Chinner  * ordering of:
1366de0e8c20SDave Chinner  *
1367de0e8c20SDave Chinner  * mmap_sem (MM)
13686b698edeSDave Chinner  *   sb_start_pagefault(vfs, freeze)
136913ad4fe3SDave Chinner  *     i_mmaplock (XFS - truncate serialisation)
1370de0e8c20SDave Chinner  *       page_lock (MM)
1371de0e8c20SDave Chinner  *         i_lock (XFS - extent map serialisation)
1372de0e8c20SDave Chinner  */
1373de0e8c20SDave Chinner 
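/*
 * The same ordering, sketched as the sequence a write fault on a normal
 * (non-DAX) page would take:
 *
 *	down_read(&mm->mmap_sem);
 *	  sb_start_pagefault(inode->i_sb);
 *	    xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
 *	      lock_page(page);
 *	        xfs_ilock(ip, XFS_ILOCK_*);	(extent mapping)
 */
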
1374075a924dSDave Chinner /*
1375075a924dSDave Chinner  * mmap()d file has taken write protection fault and is being made writable. We
1376075a924dSDave Chinner  * can set the page state up correctly for a writable page, which means we can
1377075a924dSDave Chinner  * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
1378075a924dSDave Chinner  * mapping.
1379075a924dSDave Chinner  */
1380075a924dSDave Chinner STATIC int
1381075a924dSDave Chinner xfs_filemap_page_mkwrite(
1382075a924dSDave Chinner 	struct vm_fault		*vmf)
1383075a924dSDave Chinner {
138411bac800SDave Jiang 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1385ec56b1f1SDave Chinner 	int			ret;
1386075a924dSDave Chinner 
13876b698edeSDave Chinner 	trace_xfs_filemap_page_mkwrite(XFS_I(inode));
1388075a924dSDave Chinner 
13896b698edeSDave Chinner 	sb_start_pagefault(inode->i_sb);
139011bac800SDave Jiang 	file_update_time(vmf->vma->vm_file);
13916b698edeSDave Chinner 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1392075a924dSDave Chinner 
13936b698edeSDave Chinner 	if (IS_DAX(inode)) {
139411bac800SDave Jiang 		ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
13956b698edeSDave Chinner 	} else {
139611bac800SDave Jiang 		ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
13976b698edeSDave Chinner 		ret = block_page_mkwrite_return(ret);
13986b698edeSDave Chinner 	}
1399ec56b1f1SDave Chinner 
14006b698edeSDave Chinner 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
14016b698edeSDave Chinner 	sb_end_pagefault(inode->i_sb);
14026b698edeSDave Chinner 
14036b698edeSDave Chinner 	return ret;
14046b698edeSDave Chinner }
14056b698edeSDave Chinner 
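/*
 * Illustrative trigger from userspace: the first store through a shared
 * writable mapping takes a write-protection fault and lands in the
 * ->page_mkwrite handler above:
 *
 *	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_SHARED, fd, 0);
 *	p[0] = 1;	first write fault -> xfs_filemap_page_mkwrite()
 */
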
14066b698edeSDave Chinner STATIC int
14076b698edeSDave Chinner xfs_filemap_fault(
14086b698edeSDave Chinner 	struct vm_fault		*vmf)
14096b698edeSDave Chinner {
141011bac800SDave Jiang 	struct inode		*inode = file_inode(vmf->vma->vm_file);
14116b698edeSDave Chinner 	int			ret;
14126b698edeSDave Chinner 
1413b2442c5aSDave Chinner 	trace_xfs_filemap_fault(XFS_I(inode));
14146b698edeSDave Chinner 
14156b698edeSDave Chinner 	/* DAX can shortcut the normal fault path on write faults! */
1416b2442c5aSDave Chinner 	if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode))
141711bac800SDave Jiang 		return xfs_filemap_page_mkwrite(vmf);
1418075a924dSDave Chinner 
1419b2442c5aSDave Chinner 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1420acdda3aaSChristoph Hellwig 	if (IS_DAX(inode))
142111bac800SDave Jiang 		ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
1422acdda3aaSChristoph Hellwig 	else
142311bac800SDave Jiang 		ret = filemap_fault(vmf);
1424b2442c5aSDave Chinner 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1425075a924dSDave Chinner 
14266b698edeSDave Chinner 	return ret;
14276b698edeSDave Chinner }
14286b698edeSDave Chinner 
142913ad4fe3SDave Chinner /*
143013ad4fe3SDave Chinner  * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
143113ad4fe3SDave Chinner  * both read and write faults. Hence we need to handle both cases. There is no
1432*a2d58167SDave Jiang  * ->huge_mkwrite callout for huge pages, so we have a single function here to
143313ad4fe3SDave Chinner  * handle both cases. vmf->flags carries the information on the type of fault
143413ad4fe3SDave Chinner  * occurring.
143513ad4fe3SDave Chinner  */
1436acd76e74SMatthew Wilcox STATIC int
1437*a2d58167SDave Jiang xfs_filemap_huge_fault(
1438d8a849e1SDave Jiang 	struct vm_fault		*vmf)
1439acd76e74SMatthew Wilcox {
1440f4200391SDave Jiang 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1441acd76e74SMatthew Wilcox 	struct xfs_inode	*ip = XFS_I(inode);
1442acd76e74SMatthew Wilcox 	int			ret;
1443acd76e74SMatthew Wilcox 
1444acd76e74SMatthew Wilcox 	if (!IS_DAX(inode))
1445acd76e74SMatthew Wilcox 		return VM_FAULT_FALLBACK;
1446acd76e74SMatthew Wilcox 
1447*a2d58167SDave Jiang 	trace_xfs_filemap_huge_fault(ip);
1448acd76e74SMatthew Wilcox 
1449d8a849e1SDave Jiang 	if (vmf->flags & FAULT_FLAG_WRITE) {
1450acd76e74SMatthew Wilcox 		sb_start_pagefault(inode->i_sb);
1451f4200391SDave Jiang 		file_update_time(vmf->vma->vm_file);
145213ad4fe3SDave Chinner 	}
145313ad4fe3SDave Chinner 
1454acd76e74SMatthew Wilcox 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1455*a2d58167SDave Jiang 	ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
1456acd76e74SMatthew Wilcox 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
145713ad4fe3SDave Chinner 
1458d8a849e1SDave Jiang 	if (vmf->flags & FAULT_FLAG_WRITE)
1459acd76e74SMatthew Wilcox 		sb_end_pagefault(inode->i_sb);
1460acd76e74SMatthew Wilcox 
1461acd76e74SMatthew Wilcox 	return ret;
1462acd76e74SMatthew Wilcox }
1463acd76e74SMatthew Wilcox 
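/*
 * Illustrative trigger (assumes a DAX mount and suitably aligned extents):
 * an aligned mapping can fault in PMD-sized chunks through the handler
 * above rather than page by page:
 *
 *	void *p = mmap(NULL, 2 * 1024 * 1024, PROT_READ | PROT_WRITE,
 *		       MAP_SHARED, fd, 0);
 *	touching *p may be served by xfs_filemap_huge_fault() when the
 *	virtual address, file offset and extent are all 2MB aligned
 */
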
14643af49285SDave Chinner /*
14653af49285SDave Chinner  * pfn_mkwrite was originally intended to ensure we capture time stamp
14663af49285SDave Chinner  * updates on write faults. In reality, it is needed to serialise against
14675eb88dcaSRoss Zwisler  * truncate, similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
14685eb88dcaSRoss Zwisler  * lock to ensure the fault serialises against any truncate in progress.
14693af49285SDave Chinner  */
14703af49285SDave Chinner static int
14713af49285SDave Chinner xfs_filemap_pfn_mkwrite(
14723af49285SDave Chinner 	struct vm_fault		*vmf)
14733af49285SDave Chinner {
14743af49285SDave Chinner 
147511bac800SDave Jiang 	struct inode		*inode = file_inode(vmf->vma->vm_file);
14763af49285SDave Chinner 	struct xfs_inode	*ip = XFS_I(inode);
14773af49285SDave Chinner 	int			ret = VM_FAULT_NOPAGE;
14783af49285SDave Chinner 	loff_t			size;
14793af49285SDave Chinner 
14803af49285SDave Chinner 	trace_xfs_filemap_pfn_mkwrite(ip);
14813af49285SDave Chinner 
14823af49285SDave Chinner 	sb_start_pagefault(inode->i_sb);
148311bac800SDave Jiang 	file_update_time(vmf->vma->vm_file);
14843af49285SDave Chinner 
14853af49285SDave Chinner 	/* check if the faulting page hasn't raced with truncate */
14863af49285SDave Chinner 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
14873af49285SDave Chinner 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
14883af49285SDave Chinner 	if (vmf->pgoff >= size)
14893af49285SDave Chinner 		ret = VM_FAULT_SIGBUS;
14905eb88dcaSRoss Zwisler 	else if (IS_DAX(inode))
149111bac800SDave Jiang 		ret = dax_pfn_mkwrite(vmf);
14923af49285SDave Chinner 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
14933af49285SDave Chinner 	sb_end_pagefault(inode->i_sb);
14943af49285SDave Chinner 	return ret;
14953af49285SDave Chinner 
14963af49285SDave Chinner }
14973af49285SDave Chinner 
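/*
 * Usage note: DAX mappings point page table entries at device memory with
 * no struct page behind them, so a write to an already-mapped but
 * write-protected pfn arrives via ->pfn_mkwrite above rather than
 * ->page_mkwrite.  All the handler needs to do is the timestamp update
 * and the truncate race check.
 */
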
14986b698edeSDave Chinner static const struct vm_operations_struct xfs_file_vm_ops = {
14996b698edeSDave Chinner 	.fault		= xfs_filemap_fault,
1500*a2d58167SDave Jiang 	.huge_fault	= xfs_filemap_huge_fault,
15016b698edeSDave Chinner 	.map_pages	= filemap_map_pages,
15026b698edeSDave Chinner 	.page_mkwrite	= xfs_filemap_page_mkwrite,
15033af49285SDave Chinner 	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
15046b698edeSDave Chinner };
15056b698edeSDave Chinner 
15066b698edeSDave Chinner STATIC int
15076b698edeSDave Chinner xfs_file_mmap(
15086b698edeSDave Chinner 	struct file	*filp,
15096b698edeSDave Chinner 	struct vm_area_struct *vma)
15106b698edeSDave Chinner {
15116b698edeSDave Chinner 	file_accessed(filp);
15126b698edeSDave Chinner 	vma->vm_ops = &xfs_file_vm_ops;
15136b698edeSDave Chinner 	if (IS_DAX(file_inode(filp)))
1514acd76e74SMatthew Wilcox 		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
15156b698edeSDave Chinner 	return 0;
1516075a924dSDave Chinner }
1517075a924dSDave Chinner 
1518c59d87c4SChristoph Hellwig const struct file_operations xfs_file_operations = {
15193fe3e6b1SJeff Liu 	.llseek		= xfs_file_llseek,
1520b4f5d2c6SAl Viro 	.read_iter	= xfs_file_read_iter,
1521bf97f3bcSAl Viro 	.write_iter	= xfs_file_write_iter,
152282c156f8SAl Viro 	.splice_read	= generic_file_splice_read,
15238d020765SAl Viro 	.splice_write	= iter_file_splice_write,
1524c59d87c4SChristoph Hellwig 	.unlocked_ioctl	= xfs_file_ioctl,
1525c59d87c4SChristoph Hellwig #ifdef CONFIG_COMPAT
1526c59d87c4SChristoph Hellwig 	.compat_ioctl	= xfs_file_compat_ioctl,
1527c59d87c4SChristoph Hellwig #endif
1528c59d87c4SChristoph Hellwig 	.mmap		= xfs_file_mmap,
1529c59d87c4SChristoph Hellwig 	.open		= xfs_file_open,
1530c59d87c4SChristoph Hellwig 	.release	= xfs_file_release,
1531c59d87c4SChristoph Hellwig 	.fsync		= xfs_file_fsync,
1532dbe6ec81SToshi Kani 	.get_unmapped_area = thp_get_unmapped_area,
1533c59d87c4SChristoph Hellwig 	.fallocate	= xfs_file_fallocate,
15349fe26045SDarrick J. Wong 	.clone_file_range = xfs_file_clone_range,
1535cc714660SDarrick J. Wong 	.dedupe_file_range = xfs_file_dedupe_range,
1536c59d87c4SChristoph Hellwig };
1537c59d87c4SChristoph Hellwig 
1538c59d87c4SChristoph Hellwig const struct file_operations xfs_dir_file_operations = {
1539c59d87c4SChristoph Hellwig 	.open		= xfs_dir_open,
1540c59d87c4SChristoph Hellwig 	.read		= generic_read_dir,
15413b0a3c1aSAl Viro 	.iterate_shared	= xfs_file_readdir,
1542c59d87c4SChristoph Hellwig 	.llseek		= generic_file_llseek,
1543c59d87c4SChristoph Hellwig 	.unlocked_ioctl	= xfs_file_ioctl,
1544c59d87c4SChristoph Hellwig #ifdef CONFIG_COMPAT
1545c59d87c4SChristoph Hellwig 	.compat_ioctl	= xfs_file_compat_ioctl,
1546c59d87c4SChristoph Hellwig #endif
15471da2f2dbSChristoph Hellwig 	.fsync		= xfs_dir_fsync,
1548c59d87c4SChristoph Hellwig };
1549