xref: /openbmc/linux/fs/dax.c (revision 34c0fd54)
1d475c634SMatthew Wilcox /*
2d475c634SMatthew Wilcox  * fs/dax.c - Direct Access filesystem code
3d475c634SMatthew Wilcox  * Copyright (c) 2013-2014 Intel Corporation
4d475c634SMatthew Wilcox  * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
5d475c634SMatthew Wilcox  * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
6d475c634SMatthew Wilcox  *
7d475c634SMatthew Wilcox  * This program is free software; you can redistribute it and/or modify it
8d475c634SMatthew Wilcox  * under the terms and conditions of the GNU General Public License,
9d475c634SMatthew Wilcox  * version 2, as published by the Free Software Foundation.
10d475c634SMatthew Wilcox  *
11d475c634SMatthew Wilcox  * This program is distributed in the hope it will be useful, but WITHOUT
12d475c634SMatthew Wilcox  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13d475c634SMatthew Wilcox  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
14d475c634SMatthew Wilcox  * more details.
15d475c634SMatthew Wilcox  */
16d475c634SMatthew Wilcox 
17d475c634SMatthew Wilcox #include <linux/atomic.h>
18d475c634SMatthew Wilcox #include <linux/blkdev.h>
19d475c634SMatthew Wilcox #include <linux/buffer_head.h>
20d77e92e2SRoss Zwisler #include <linux/dax.h>
21d475c634SMatthew Wilcox #include <linux/fs.h>
22d475c634SMatthew Wilcox #include <linux/genhd.h>
23f7ca90b1SMatthew Wilcox #include <linux/highmem.h>
24f7ca90b1SMatthew Wilcox #include <linux/memcontrol.h>
25f7ca90b1SMatthew Wilcox #include <linux/mm.h>
26d475c634SMatthew Wilcox #include <linux/mutex.h>
272765cfbbSRoss Zwisler #include <linux/pmem.h>
28289c6aedSMatthew Wilcox #include <linux/sched.h>
29d475c634SMatthew Wilcox #include <linux/uio.h>
30f7ca90b1SMatthew Wilcox #include <linux/vmstat.h>
3134c0fd54SDan Williams #include <linux/pfn_t.h>
320e749e54SDan Williams #include <linux/sizes.h>
33d475c634SMatthew Wilcox 
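/*
 * dax_map_atomic() translates the sector/size in @dax into a kernel virtual
 * address and pfn via bdev_direct_access(), holding a reference on the
 * request queue for the duration of the access.  Every successful map must
 * be paired with dax_unmap_atomic() to drop that reference; on failure,
 * dax->addr carries the error as an ERR_PTR().
 */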
34b2e0d162SDan Williams static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
35b2e0d162SDan Williams {
36b2e0d162SDan Williams 	struct request_queue *q = bdev->bd_queue;
37b2e0d162SDan Williams 	long rc = -EIO;
38b2e0d162SDan Williams 
39b2e0d162SDan Williams 	dax->addr = (void __pmem *) ERR_PTR(-EIO);
40b2e0d162SDan Williams 	if (blk_queue_enter(q, true) != 0)
41b2e0d162SDan Williams 		return rc;
42b2e0d162SDan Williams 
43b2e0d162SDan Williams 	rc = bdev_direct_access(bdev, dax);
44b2e0d162SDan Williams 	if (rc < 0) {
45b2e0d162SDan Williams 		dax->addr = (void __pmem *) ERR_PTR(rc);
46b2e0d162SDan Williams 		blk_queue_exit(q);
47b2e0d162SDan Williams 		return rc;
48b2e0d162SDan Williams 	}
49b2e0d162SDan Williams 	return rc;
50b2e0d162SDan Williams }
51b2e0d162SDan Williams 
52b2e0d162SDan Williams static void dax_unmap_atomic(struct block_device *bdev,
53b2e0d162SDan Williams 		const struct blk_dax_ctl *dax)
54b2e0d162SDan Williams {
55b2e0d162SDan Williams 	if (IS_ERR(dax->addr))
56b2e0d162SDan Williams 		return;
57b2e0d162SDan Williams 	blk_queue_exit(bdev->bd_queue);
58b2e0d162SDan Williams }
59b2e0d162SDan Williams 
601ca19157SDave Chinner /*
611ca19157SDave Chinner  * dax_clear_blocks() is called from within transaction context from XFS,
621ca19157SDave Chinner  * and hence the stack from this point must follow GFP_NOFS
631ca19157SDave Chinner  * semantics for all operations.
641ca19157SDave Chinner  */
65b2e0d162SDan Williams int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
66289c6aedSMatthew Wilcox {
67289c6aedSMatthew Wilcox 	struct block_device *bdev = inode->i_sb->s_bdev;
68b2e0d162SDan Williams 	struct blk_dax_ctl dax = {
69b2e0d162SDan Williams 		.sector = block << (inode->i_blkbits - 9),
70b2e0d162SDan Williams 		.size = _size,
71b2e0d162SDan Williams 	};
72289c6aedSMatthew Wilcox 
73289c6aedSMatthew Wilcox 	might_sleep();
74289c6aedSMatthew Wilcox 	do {
750e749e54SDan Williams 		long count, sz;
76289c6aedSMatthew Wilcox 
77b2e0d162SDan Williams 		count = dax_map_atomic(bdev, &dax);
78289c6aedSMatthew Wilcox 		if (count < 0)
79289c6aedSMatthew Wilcox 			return count;
800e749e54SDan Williams 		sz = min_t(long, count, SZ_128K);
81b2e0d162SDan Williams 		clear_pmem(dax.addr, sz);
82b2e0d162SDan Williams 		dax.size -= sz;
83b2e0d162SDan Williams 		dax.sector += sz / 512;
84b2e0d162SDan Williams 		dax_unmap_atomic(bdev, &dax);
85289c6aedSMatthew Wilcox 		cond_resched();
86b2e0d162SDan Williams 	} while (dax.size);
87289c6aedSMatthew Wilcox 
882765cfbbSRoss Zwisler 	wmb_pmem();
89289c6aedSMatthew Wilcox 	return 0;
90289c6aedSMatthew Wilcox }
91289c6aedSMatthew Wilcox EXPORT_SYMBOL_GPL(dax_clear_blocks);
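/*
 * A minimal usage sketch (hypothetical helper, not a kernel API): a
 * filesystem that maps freshly allocated blocks into a DAX file is expected
 * to zero them first so stale media contents are never exposed to
 * userspace.  Note that dax_clear_blocks() takes the length in bytes.
 */
static int example_zero_new_extent(struct inode *inode, sector_t first_block,
		unsigned long nr_blocks)
{
	return dax_clear_blocks(inode, first_block,
			(long)nr_blocks << inode->i_blkbits);
}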
92289c6aedSMatthew Wilcox 
932765cfbbSRoss Zwisler /* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
94e2e05394SRoss Zwisler static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
95e2e05394SRoss Zwisler 		loff_t pos, loff_t end)
96d475c634SMatthew Wilcox {
97d475c634SMatthew Wilcox 	loff_t final = end - pos + first; /* The final byte of the buffer */
98d475c634SMatthew Wilcox 
99d475c634SMatthew Wilcox 	if (first > 0)
100e2e05394SRoss Zwisler 		clear_pmem(addr, first);
101d475c634SMatthew Wilcox 	if (final < size)
102e2e05394SRoss Zwisler 		clear_pmem(addr + final, size - final);
103d475c634SMatthew Wilcox }
104d475c634SMatthew Wilcox 
105d475c634SMatthew Wilcox static bool buffer_written(struct buffer_head *bh)
106d475c634SMatthew Wilcox {
107d475c634SMatthew Wilcox 	return buffer_mapped(bh) && !buffer_unwritten(bh);
108d475c634SMatthew Wilcox }
109d475c634SMatthew Wilcox 
110d475c634SMatthew Wilcox /*
111d475c634SMatthew Wilcox  * When ext4 encounters a hole, it returns without modifying the buffer_head
112d475c634SMatthew Wilcox  * which means that we can't trust b_size.  To cope with this, we set b_state
113d475c634SMatthew Wilcox  * to 0 before calling get_block and, if any bit is set, we know we can trust
114d475c634SMatthew Wilcox  * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
115d475c634SMatthew Wilcox  * and would save us time calling get_block repeatedly.
116d475c634SMatthew Wilcox  */
117d475c634SMatthew Wilcox static bool buffer_size_valid(struct buffer_head *bh)
118d475c634SMatthew Wilcox {
119d475c634SMatthew Wilcox 	return bh->b_state != 0;
120d475c634SMatthew Wilcox }
121d475c634SMatthew Wilcox 
122b2e0d162SDan Williams 
123b2e0d162SDan Williams static sector_t to_sector(const struct buffer_head *bh,
124b2e0d162SDan Williams 		const struct inode *inode)
125b2e0d162SDan Williams {
126b2e0d162SDan Williams 	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
127b2e0d162SDan Williams 
128b2e0d162SDan Williams 	return sector;
129b2e0d162SDan Williams }
130b2e0d162SDan Williams 
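/*
 * The core I/O loop: walk the requested range, calling get_block() to map
 * each extent and dax_map_atomic() to obtain a kernel address for it, then
 * copy to or from the iterator (or zero-fill the iterator for read holes).
 * Returns the number of bytes transferred, or a negative errno if nothing
 * was transferred.
 */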
131a95cd631SOmar Sandoval static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
132d475c634SMatthew Wilcox 		      loff_t start, loff_t end, get_block_t get_block,
133d475c634SMatthew Wilcox 		      struct buffer_head *bh)
134d475c634SMatthew Wilcox {
135b2e0d162SDan Williams 	loff_t pos = start, max = start, bh_max = start;
136b2e0d162SDan Williams 	bool hole = false, need_wmb = false;
137b2e0d162SDan Williams 	struct block_device *bdev = NULL;
138b2e0d162SDan Williams 	int rw = iov_iter_rw(iter), rc = 0;
139b2e0d162SDan Williams 	long map_len = 0;
140b2e0d162SDan Williams 	struct blk_dax_ctl dax = {
141b2e0d162SDan Williams 		.addr = (void __pmem *) ERR_PTR(-EIO),
142b2e0d162SDan Williams 	};
143d475c634SMatthew Wilcox 
144b2e0d162SDan Williams 	if (rw == READ)
145d475c634SMatthew Wilcox 		end = min(end, i_size_read(inode));
146d475c634SMatthew Wilcox 
147d475c634SMatthew Wilcox 	while (pos < end) {
1482765cfbbSRoss Zwisler 		size_t len;
149d475c634SMatthew Wilcox 		if (pos == max) {
150d475c634SMatthew Wilcox 			unsigned blkbits = inode->i_blkbits;
151e94f5a22SJeff Moyer 			long page = pos >> PAGE_SHIFT;
152e94f5a22SJeff Moyer 			sector_t block = page << (PAGE_SHIFT - blkbits);
153d475c634SMatthew Wilcox 			unsigned first = pos - (block << blkbits);
154d475c634SMatthew Wilcox 			long size;
155d475c634SMatthew Wilcox 
156d475c634SMatthew Wilcox 			if (pos == bh_max) {
157d475c634SMatthew Wilcox 				bh->b_size = PAGE_ALIGN(end - pos);
158d475c634SMatthew Wilcox 				bh->b_state = 0;
159b2e0d162SDan Williams 				rc = get_block(inode, block, bh, rw == WRITE);
160b2e0d162SDan Williams 				if (rc)
161d475c634SMatthew Wilcox 					break;
162d475c634SMatthew Wilcox 				if (!buffer_size_valid(bh))
163d475c634SMatthew Wilcox 					bh->b_size = 1 << blkbits;
164d475c634SMatthew Wilcox 				bh_max = pos - first + bh->b_size;
165b2e0d162SDan Williams 				bdev = bh->b_bdev;
166d475c634SMatthew Wilcox 			} else {
167d475c634SMatthew Wilcox 				unsigned done = bh->b_size -
168d475c634SMatthew Wilcox 						(bh_max - (pos - first));
169d475c634SMatthew Wilcox 				bh->b_blocknr += done >> blkbits;
170d475c634SMatthew Wilcox 				bh->b_size -= done;
171d475c634SMatthew Wilcox 			}
172d475c634SMatthew Wilcox 
173b2e0d162SDan Williams 			hole = rw == READ && !buffer_written(bh);
174d475c634SMatthew Wilcox 			if (hole) {
175d475c634SMatthew Wilcox 				size = bh->b_size - first;
176d475c634SMatthew Wilcox 			} else {
177b2e0d162SDan Williams 				dax_unmap_atomic(bdev, &dax);
178b2e0d162SDan Williams 				dax.sector = to_sector(bh, inode);
179b2e0d162SDan Williams 				dax.size = bh->b_size;
180b2e0d162SDan Williams 				map_len = dax_map_atomic(bdev, &dax);
181b2e0d162SDan Williams 				if (map_len < 0) {
182b2e0d162SDan Williams 					rc = map_len;
183d475c634SMatthew Wilcox 					break;
184b2e0d162SDan Williams 				}
1852765cfbbSRoss Zwisler 				if (buffer_unwritten(bh) || buffer_new(bh)) {
186b2e0d162SDan Williams 					dax_new_buf(dax.addr, map_len, first,
187b2e0d162SDan Williams 							pos, end);
1882765cfbbSRoss Zwisler 					need_wmb = true;
1892765cfbbSRoss Zwisler 				}
190b2e0d162SDan Williams 				dax.addr += first;
191b2e0d162SDan Williams 				size = map_len - first;
192d475c634SMatthew Wilcox 			}
193d475c634SMatthew Wilcox 			max = min(pos + size, end);
194d475c634SMatthew Wilcox 		}
195d475c634SMatthew Wilcox 
1962765cfbbSRoss Zwisler 		if (iov_iter_rw(iter) == WRITE) {
197b2e0d162SDan Williams 			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
1982765cfbbSRoss Zwisler 			need_wmb = true;
1992765cfbbSRoss Zwisler 		} else if (!hole)
200b2e0d162SDan Williams 			len = copy_to_iter((void __force *) dax.addr, max - pos,
201e2e05394SRoss Zwisler 					iter);
202d475c634SMatthew Wilcox 		else
203d475c634SMatthew Wilcox 			len = iov_iter_zero(max - pos, iter);
204d475c634SMatthew Wilcox 
205cadfbb6eSAl Viro 		if (!len) {
206b2e0d162SDan Williams 			rc = -EFAULT;
207d475c634SMatthew Wilcox 			break;
208cadfbb6eSAl Viro 		}
209d475c634SMatthew Wilcox 
210d475c634SMatthew Wilcox 		pos += len;
211b2e0d162SDan Williams 		if (!IS_ERR(dax.addr))
212b2e0d162SDan Williams 			dax.addr += len;
213d475c634SMatthew Wilcox 	}
214d475c634SMatthew Wilcox 
2152765cfbbSRoss Zwisler 	if (need_wmb)
2162765cfbbSRoss Zwisler 		wmb_pmem();
217b2e0d162SDan Williams 	dax_unmap_atomic(bdev, &dax);
2182765cfbbSRoss Zwisler 
219b2e0d162SDan Williams 	return (pos == start) ? rc : pos - start;
220d475c634SMatthew Wilcox }
221d475c634SMatthew Wilcox 
222d475c634SMatthew Wilcox /**
223d475c634SMatthew Wilcox  * dax_do_io - Perform I/O to a DAX file
224d475c634SMatthew Wilcox  * @iocb: The control block for this I/O
225d475c634SMatthew Wilcox  * @inode: The file which the I/O is directed at
226d475c634SMatthew Wilcox  * @iter: The addresses to do I/O from or to
227d475c634SMatthew Wilcox  * @pos: The file offset where the I/O starts
228d475c634SMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
229d475c634SMatthew Wilcox  * @end_io: A filesystem callback for I/O completion
230d475c634SMatthew Wilcox  * @flags: See below
231d475c634SMatthew Wilcox  *
232d475c634SMatthew Wilcox  * This function uses the same locking scheme as do_blockdev_direct_IO:
233d475c634SMatthew Wilcox  * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
234d475c634SMatthew Wilcox  * caller for writes.  For reads, we take and release the i_mutex ourselves.
235d475c634SMatthew Wilcox  * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
236d475c634SMatthew Wilcox  * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
237d475c634SMatthew Wilcox  * is in progress.
238d475c634SMatthew Wilcox  */
239a95cd631SOmar Sandoval ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
240a95cd631SOmar Sandoval 		  struct iov_iter *iter, loff_t pos, get_block_t get_block,
241a95cd631SOmar Sandoval 		  dio_iodone_t end_io, int flags)
242d475c634SMatthew Wilcox {
243d475c634SMatthew Wilcox 	struct buffer_head bh;
244d475c634SMatthew Wilcox 	ssize_t retval = -EINVAL;
245d475c634SMatthew Wilcox 	loff_t end = pos + iov_iter_count(iter);
246d475c634SMatthew Wilcox 
247d475c634SMatthew Wilcox 	memset(&bh, 0, sizeof(bh));
248d475c634SMatthew Wilcox 
249a95cd631SOmar Sandoval 	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
250d475c634SMatthew Wilcox 		struct address_space *mapping = inode->i_mapping;
251d475c634SMatthew Wilcox 		mutex_lock(&inode->i_mutex);
252d475c634SMatthew Wilcox 		retval = filemap_write_and_wait_range(mapping, pos, end - 1);
253d475c634SMatthew Wilcox 		if (retval) {
254d475c634SMatthew Wilcox 			mutex_unlock(&inode->i_mutex);
255d475c634SMatthew Wilcox 			goto out;
256d475c634SMatthew Wilcox 		}
257d475c634SMatthew Wilcox 	}
258d475c634SMatthew Wilcox 
259d475c634SMatthew Wilcox 	/* Protects against truncate */
260bbab37ddSMatthew Wilcox 	if (!(flags & DIO_SKIP_DIO_COUNT))
261fe0f07d0SJens Axboe 		inode_dio_begin(inode);
262d475c634SMatthew Wilcox 
263a95cd631SOmar Sandoval 	retval = dax_io(inode, iter, pos, end, get_block, &bh);
264d475c634SMatthew Wilcox 
265a95cd631SOmar Sandoval 	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
266d475c634SMatthew Wilcox 		mutex_unlock(&inode->i_mutex);
267d475c634SMatthew Wilcox 
268d475c634SMatthew Wilcox 	if ((retval > 0) && end_io)
269d475c634SMatthew Wilcox 		end_io(iocb, pos, retval, bh.b_private);
270d475c634SMatthew Wilcox 
271bbab37ddSMatthew Wilcox 	if (!(flags & DIO_SKIP_DIO_COUNT))
272fe0f07d0SJens Axboe 		inode_dio_end(inode);
273d475c634SMatthew Wilcox  out:
274d475c634SMatthew Wilcox 	return retval;
275d475c634SMatthew Wilcox }
276d475c634SMatthew Wilcox EXPORT_SYMBOL_GPL(dax_do_io);
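/*
 * A minimal usage sketch, assuming a hypothetical filesystem with an
 * example_get_block() helper: the ->direct_IO address_space operation of
 * this era can simply hand the request to dax_do_io().  DIO_LOCKING asks
 * dax_do_io() to take i_mutex for reads, matching do_blockdev_direct_IO().
 */
static int example_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh_result, int create);

static ssize_t example_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
		loff_t offset)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	return dax_do_io(iocb, inode, iter, offset, example_get_block,
			NULL, DIO_LOCKING);
}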
277f7ca90b1SMatthew Wilcox 
278f7ca90b1SMatthew Wilcox /*
279f7ca90b1SMatthew Wilcox  * The user has performed a load from a hole in the file.  Allocating
280f7ca90b1SMatthew Wilcox  * a new page in the file would cause excessive storage usage for
281f7ca90b1SMatthew Wilcox  * workloads with sparse files.  We allocate a page cache page instead.
282f7ca90b1SMatthew Wilcox  * We'll kick it out of the page cache if it's ever written to,
283f7ca90b1SMatthew Wilcox  * otherwise it will simply fall out of the page cache under memory
284f7ca90b1SMatthew Wilcox  * pressure without ever having been dirtied.
285f7ca90b1SMatthew Wilcox  */
286f7ca90b1SMatthew Wilcox static int dax_load_hole(struct address_space *mapping, struct page *page,
287f7ca90b1SMatthew Wilcox 							struct vm_fault *vmf)
288f7ca90b1SMatthew Wilcox {
289f7ca90b1SMatthew Wilcox 	unsigned long size;
290f7ca90b1SMatthew Wilcox 	struct inode *inode = mapping->host;
291f7ca90b1SMatthew Wilcox 	if (!page)
292f7ca90b1SMatthew Wilcox 		page = find_or_create_page(mapping, vmf->pgoff,
293f7ca90b1SMatthew Wilcox 						GFP_KERNEL | __GFP_ZERO);
294f7ca90b1SMatthew Wilcox 	if (!page)
295f7ca90b1SMatthew Wilcox 		return VM_FAULT_OOM;
296f7ca90b1SMatthew Wilcox 	/* Recheck i_size under page lock to avoid truncate race */
297f7ca90b1SMatthew Wilcox 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
298f7ca90b1SMatthew Wilcox 	if (vmf->pgoff >= size) {
299f7ca90b1SMatthew Wilcox 		unlock_page(page);
300f7ca90b1SMatthew Wilcox 		page_cache_release(page);
301f7ca90b1SMatthew Wilcox 		return VM_FAULT_SIGBUS;
302f7ca90b1SMatthew Wilcox 	}
303f7ca90b1SMatthew Wilcox 
304f7ca90b1SMatthew Wilcox 	vmf->page = page;
305f7ca90b1SMatthew Wilcox 	return VM_FAULT_LOCKED;
306f7ca90b1SMatthew Wilcox }
307f7ca90b1SMatthew Wilcox 
308b2e0d162SDan Williams static int copy_user_bh(struct page *to, struct inode *inode,
309b2e0d162SDan Williams 		struct buffer_head *bh, unsigned long vaddr)
310f7ca90b1SMatthew Wilcox {
311b2e0d162SDan Williams 	struct blk_dax_ctl dax = {
312b2e0d162SDan Williams 		.sector = to_sector(bh, inode),
313b2e0d162SDan Williams 		.size = bh->b_size,
314b2e0d162SDan Williams 	};
315b2e0d162SDan Williams 	struct block_device *bdev = bh->b_bdev;
316e2e05394SRoss Zwisler 	void *vto;
317e2e05394SRoss Zwisler 
318b2e0d162SDan Williams 	if (dax_map_atomic(bdev, &dax) < 0)
319b2e0d162SDan Williams 		return PTR_ERR(dax.addr);
320f7ca90b1SMatthew Wilcox 	vto = kmap_atomic(to);
321b2e0d162SDan Williams 	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
322f7ca90b1SMatthew Wilcox 	kunmap_atomic(vto);
323b2e0d162SDan Williams 	dax_unmap_atomic(bdev, &dax);
324f7ca90b1SMatthew Wilcox 	return 0;
325f7ca90b1SMatthew Wilcox }
326f7ca90b1SMatthew Wilcox 
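/*
 * Map the block returned by get_block directly into the faulting process:
 * re-check i_size under i_mmap_lock to guard against truncate, zero the
 * block if it is new or unwritten, then install the pfn with
 * vm_insert_mixed().
 */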
327f7ca90b1SMatthew Wilcox static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
328f7ca90b1SMatthew Wilcox 			struct vm_area_struct *vma, struct vm_fault *vmf)
329f7ca90b1SMatthew Wilcox {
330f7ca90b1SMatthew Wilcox 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
331b2e0d162SDan Williams 	struct address_space *mapping = inode->i_mapping;
332b2e0d162SDan Williams 	struct block_device *bdev = bh->b_bdev;
333b2e0d162SDan Williams 	struct blk_dax_ctl dax = {
334b2e0d162SDan Williams 		.sector = to_sector(bh, inode),
335b2e0d162SDan Williams 		.size = bh->b_size,
336b2e0d162SDan Williams 	};
337f7ca90b1SMatthew Wilcox 	pgoff_t size;
338f7ca90b1SMatthew Wilcox 	int error;
339f7ca90b1SMatthew Wilcox 
3400f90cc66SRoss Zwisler 	i_mmap_lock_read(mapping);
3410f90cc66SRoss Zwisler 
342f7ca90b1SMatthew Wilcox 	/*
343f7ca90b1SMatthew Wilcox 	 * Check truncate didn't happen while we were allocating a block.
344f7ca90b1SMatthew Wilcox 	 * If it did, this block may or may not be still allocated to the
345f7ca90b1SMatthew Wilcox 	 * file.  We can't tell the filesystem to free it because we can't
346f7ca90b1SMatthew Wilcox 	 * take i_mutex here.  In the worst case, the file still has blocks
347f7ca90b1SMatthew Wilcox 	 * allocated past the end of the file.
348f7ca90b1SMatthew Wilcox 	 */
349f7ca90b1SMatthew Wilcox 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
350f7ca90b1SMatthew Wilcox 	if (unlikely(vmf->pgoff >= size)) {
351f7ca90b1SMatthew Wilcox 		error = -EIO;
352f7ca90b1SMatthew Wilcox 		goto out;
353f7ca90b1SMatthew Wilcox 	}
354f7ca90b1SMatthew Wilcox 
355b2e0d162SDan Williams 	if (dax_map_atomic(bdev, &dax) < 0) {
356b2e0d162SDan Williams 		error = PTR_ERR(dax.addr);
357f7ca90b1SMatthew Wilcox 		goto out;
358f7ca90b1SMatthew Wilcox 	}
359f7ca90b1SMatthew Wilcox 
3602765cfbbSRoss Zwisler 	if (buffer_unwritten(bh) || buffer_new(bh)) {
361b2e0d162SDan Williams 		clear_pmem(dax.addr, PAGE_SIZE);
3622765cfbbSRoss Zwisler 		wmb_pmem();
3632765cfbbSRoss Zwisler 	}
364b2e0d162SDan Williams 	dax_unmap_atomic(bdev, &dax);
365f7ca90b1SMatthew Wilcox 
36634c0fd54SDan Williams 	error = vm_insert_mixed(vma, vaddr, pfn_t_to_pfn(dax.pfn));
367f7ca90b1SMatthew Wilcox 
368f7ca90b1SMatthew Wilcox  out:
3690f90cc66SRoss Zwisler 	i_mmap_unlock_read(mapping);
3700f90cc66SRoss Zwisler 
371f7ca90b1SMatthew Wilcox 	return error;
372f7ca90b1SMatthew Wilcox }
373f7ca90b1SMatthew Wilcox 
374ce5c5d55SDave Chinner /**
375ce5c5d55SDave Chinner  * __dax_fault - handle a page fault on a DAX file
376ce5c5d55SDave Chinner  * @vma: The virtual memory area where the fault occurred
377ce5c5d55SDave Chinner  * @vmf: The description of the fault
378ce5c5d55SDave Chinner  * @get_block: The filesystem method used to translate file offsets to blocks
379b2442c5aSDave Chinner  * @complete_unwritten: The filesystem method used to convert unwritten blocks
380b2442c5aSDave Chinner  *	to written so the data written to them is exposed. This is required
381b2442c5aSDave Chinner  *	by write faults for filesystems that will return unwritten
382b2442c5aSDave Chinner  *	extent mappings from @get_block, but it is optional for reads as
383b2442c5aSDave Chinner  *	dax_insert_mapping() will always zero unwritten blocks. If the fs does
384b2442c5aSDave Chinner  *	not support unwritten extents, then it should pass NULL.
385ce5c5d55SDave Chinner  *
386ce5c5d55SDave Chinner  * When a page fault occurs, filesystems may call this helper in their
387ce5c5d55SDave Chinner  * fault handler for DAX files. __dax_fault() assumes the caller has done all
388ce5c5d55SDave Chinner  * the necessary locking for the page fault to proceed successfully.
389ce5c5d55SDave Chinner  */
390ce5c5d55SDave Chinner int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
391e842f290SDave Chinner 			get_block_t get_block, dax_iodone_t complete_unwritten)
392f7ca90b1SMatthew Wilcox {
393f7ca90b1SMatthew Wilcox 	struct file *file = vma->vm_file;
394f7ca90b1SMatthew Wilcox 	struct address_space *mapping = file->f_mapping;
395f7ca90b1SMatthew Wilcox 	struct inode *inode = mapping->host;
396f7ca90b1SMatthew Wilcox 	struct page *page;
397f7ca90b1SMatthew Wilcox 	struct buffer_head bh;
398f7ca90b1SMatthew Wilcox 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
399f7ca90b1SMatthew Wilcox 	unsigned blkbits = inode->i_blkbits;
400f7ca90b1SMatthew Wilcox 	sector_t block;
401f7ca90b1SMatthew Wilcox 	pgoff_t size;
402f7ca90b1SMatthew Wilcox 	int error;
403f7ca90b1SMatthew Wilcox 	int major = 0;
404f7ca90b1SMatthew Wilcox 
405f7ca90b1SMatthew Wilcox 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
406f7ca90b1SMatthew Wilcox 	if (vmf->pgoff >= size)
407f7ca90b1SMatthew Wilcox 		return VM_FAULT_SIGBUS;
408f7ca90b1SMatthew Wilcox 
409f7ca90b1SMatthew Wilcox 	memset(&bh, 0, sizeof(bh));
410f7ca90b1SMatthew Wilcox 	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
411f7ca90b1SMatthew Wilcox 	bh.b_size = PAGE_SIZE;
412f7ca90b1SMatthew Wilcox 
413f7ca90b1SMatthew Wilcox  repeat:
414f7ca90b1SMatthew Wilcox 	page = find_get_page(mapping, vmf->pgoff);
415f7ca90b1SMatthew Wilcox 	if (page) {
416f7ca90b1SMatthew Wilcox 		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
417f7ca90b1SMatthew Wilcox 			page_cache_release(page);
418f7ca90b1SMatthew Wilcox 			return VM_FAULT_RETRY;
419f7ca90b1SMatthew Wilcox 		}
420f7ca90b1SMatthew Wilcox 		if (unlikely(page->mapping != mapping)) {
421f7ca90b1SMatthew Wilcox 			unlock_page(page);
422f7ca90b1SMatthew Wilcox 			page_cache_release(page);
423f7ca90b1SMatthew Wilcox 			goto repeat;
424f7ca90b1SMatthew Wilcox 		}
425f7ca90b1SMatthew Wilcox 		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
426f7ca90b1SMatthew Wilcox 		if (unlikely(vmf->pgoff >= size)) {
427f7ca90b1SMatthew Wilcox 			/*
428f7ca90b1SMatthew Wilcox 			 * We have a struct page covering a hole in the file
429f7ca90b1SMatthew Wilcox 			 * from a read fault and we've raced with a truncate
430f7ca90b1SMatthew Wilcox 			 */
431f7ca90b1SMatthew Wilcox 			error = -EIO;
4320f90cc66SRoss Zwisler 			goto unlock_page;
433f7ca90b1SMatthew Wilcox 		}
434f7ca90b1SMatthew Wilcox 	}
435f7ca90b1SMatthew Wilcox 
436f7ca90b1SMatthew Wilcox 	error = get_block(inode, block, &bh, 0);
437f7ca90b1SMatthew Wilcox 	if (!error && (bh.b_size < PAGE_SIZE))
438f7ca90b1SMatthew Wilcox 		error = -EIO;		/* fs corruption? */
439f7ca90b1SMatthew Wilcox 	if (error)
4400f90cc66SRoss Zwisler 		goto unlock_page;
441f7ca90b1SMatthew Wilcox 
442f7ca90b1SMatthew Wilcox 	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
443f7ca90b1SMatthew Wilcox 		if (vmf->flags & FAULT_FLAG_WRITE) {
444f7ca90b1SMatthew Wilcox 			error = get_block(inode, block, &bh, 1);
445f7ca90b1SMatthew Wilcox 			count_vm_event(PGMAJFAULT);
446f7ca90b1SMatthew Wilcox 			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
447f7ca90b1SMatthew Wilcox 			major = VM_FAULT_MAJOR;
448f7ca90b1SMatthew Wilcox 			if (!error && (bh.b_size < PAGE_SIZE))
449f7ca90b1SMatthew Wilcox 				error = -EIO;
450f7ca90b1SMatthew Wilcox 			if (error)
4510f90cc66SRoss Zwisler 				goto unlock_page;
452f7ca90b1SMatthew Wilcox 		} else {
453f7ca90b1SMatthew Wilcox 			return dax_load_hole(mapping, page, vmf);
454f7ca90b1SMatthew Wilcox 		}
455f7ca90b1SMatthew Wilcox 	}
456f7ca90b1SMatthew Wilcox 
457f7ca90b1SMatthew Wilcox 	if (vmf->cow_page) {
458f7ca90b1SMatthew Wilcox 		struct page *new_page = vmf->cow_page;
459f7ca90b1SMatthew Wilcox 		if (buffer_written(&bh))
460b2e0d162SDan Williams 			error = copy_user_bh(new_page, inode, &bh, vaddr);
461f7ca90b1SMatthew Wilcox 		else
462f7ca90b1SMatthew Wilcox 			clear_user_highpage(new_page, vaddr);
463f7ca90b1SMatthew Wilcox 		if (error)
4640f90cc66SRoss Zwisler 			goto unlock_page;
465f7ca90b1SMatthew Wilcox 		vmf->page = page;
466f7ca90b1SMatthew Wilcox 		if (!page) {
4670f90cc66SRoss Zwisler 			i_mmap_lock_read(mapping);
468f7ca90b1SMatthew Wilcox 			/* Check we didn't race with truncate */
469f7ca90b1SMatthew Wilcox 			size = (i_size_read(inode) + PAGE_SIZE - 1) >>
470f7ca90b1SMatthew Wilcox 								PAGE_SHIFT;
471f7ca90b1SMatthew Wilcox 			if (vmf->pgoff >= size) {
4720f90cc66SRoss Zwisler 				i_mmap_unlock_read(mapping);
473f7ca90b1SMatthew Wilcox 				error = -EIO;
4740f90cc66SRoss Zwisler 				goto out;
475f7ca90b1SMatthew Wilcox 			}
476f7ca90b1SMatthew Wilcox 		}
477f7ca90b1SMatthew Wilcox 		return VM_FAULT_LOCKED;
478f7ca90b1SMatthew Wilcox 	}
479f7ca90b1SMatthew Wilcox 
480f7ca90b1SMatthew Wilcox 	/* Check we didn't race with a read fault installing a new page */
481f7ca90b1SMatthew Wilcox 	if (!page && major)
482f7ca90b1SMatthew Wilcox 		page = find_lock_page(mapping, vmf->pgoff);
483f7ca90b1SMatthew Wilcox 
484f7ca90b1SMatthew Wilcox 	if (page) {
485f7ca90b1SMatthew Wilcox 		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
486f7ca90b1SMatthew Wilcox 							PAGE_CACHE_SIZE, 0);
487f7ca90b1SMatthew Wilcox 		delete_from_page_cache(page);
488f7ca90b1SMatthew Wilcox 		unlock_page(page);
489f7ca90b1SMatthew Wilcox 		page_cache_release(page);
490f7ca90b1SMatthew Wilcox 	}
491f7ca90b1SMatthew Wilcox 
492e842f290SDave Chinner 	/*
493e842f290SDave Chinner 	 * If we successfully insert the new mapping over an unwritten extent,
494e842f290SDave Chinner 	 * we need to ensure we convert the unwritten extent. If there is an
495e842f290SDave Chinner 	 * error inserting the mapping, the filesystem needs to leave it as
496e842f290SDave Chinner 	 * unwritten to prevent exposure of the stale underlying data to
497e842f290SDave Chinner 	 * userspace, but we still need to call the completion function so
498e842f290SDave Chinner 	 * the private resources on the mapping buffer can be released. We
499e842f290SDave Chinner 	 * indicate what the callback should do via the uptodate variable, same
500e842f290SDave Chinner 	 * as for normal BH based IO completions.
501e842f290SDave Chinner 	 */
502f7ca90b1SMatthew Wilcox 	error = dax_insert_mapping(inode, &bh, vma, vmf);
503b2442c5aSDave Chinner 	if (buffer_unwritten(&bh)) {
504b2442c5aSDave Chinner 		if (complete_unwritten)
505e842f290SDave Chinner 			complete_unwritten(&bh, !error);
506b2442c5aSDave Chinner 		else
507b2442c5aSDave Chinner 			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
508b2442c5aSDave Chinner 	}
509f7ca90b1SMatthew Wilcox 
510f7ca90b1SMatthew Wilcox  out:
511f7ca90b1SMatthew Wilcox 	if (error == -ENOMEM)
512f7ca90b1SMatthew Wilcox 		return VM_FAULT_OOM | major;
513f7ca90b1SMatthew Wilcox 	/* -EBUSY is fine, somebody else faulted on the same PTE */
514f7ca90b1SMatthew Wilcox 	if ((error < 0) && (error != -EBUSY))
515f7ca90b1SMatthew Wilcox 		return VM_FAULT_SIGBUS | major;
516f7ca90b1SMatthew Wilcox 	return VM_FAULT_NOPAGE | major;
517f7ca90b1SMatthew Wilcox 
5180f90cc66SRoss Zwisler  unlock_page:
519f7ca90b1SMatthew Wilcox 	if (page) {
520f7ca90b1SMatthew Wilcox 		unlock_page(page);
521f7ca90b1SMatthew Wilcox 		page_cache_release(page);
522f7ca90b1SMatthew Wilcox 	}
523f7ca90b1SMatthew Wilcox 	goto out;
524f7ca90b1SMatthew Wilcox }
525ce5c5d55SDave Chinner EXPORT_SYMBOL(__dax_fault);
526f7ca90b1SMatthew Wilcox 
527f7ca90b1SMatthew Wilcox /**
528f7ca90b1SMatthew Wilcox  * dax_fault - handle a page fault on a DAX file
529f7ca90b1SMatthew Wilcox  * @vma: The virtual memory area where the fault occurred
530f7ca90b1SMatthew Wilcox  * @vmf: The description of the fault
531f7ca90b1SMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
532f7ca90b1SMatthew Wilcox  *
533f7ca90b1SMatthew Wilcox  * When a page fault occurs, filesystems may call this helper in their
534f7ca90b1SMatthew Wilcox  * fault handler for DAX files.
535f7ca90b1SMatthew Wilcox  */
536f7ca90b1SMatthew Wilcox int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
537e842f290SDave Chinner 	      get_block_t get_block, dax_iodone_t complete_unwritten)
538f7ca90b1SMatthew Wilcox {
539f7ca90b1SMatthew Wilcox 	int result;
540f7ca90b1SMatthew Wilcox 	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
541f7ca90b1SMatthew Wilcox 
542f7ca90b1SMatthew Wilcox 	if (vmf->flags & FAULT_FLAG_WRITE) {
543f7ca90b1SMatthew Wilcox 		sb_start_pagefault(sb);
544f7ca90b1SMatthew Wilcox 		file_update_time(vma->vm_file);
545f7ca90b1SMatthew Wilcox 	}
546ce5c5d55SDave Chinner 	result = __dax_fault(vma, vmf, get_block, complete_unwritten);
547f7ca90b1SMatthew Wilcox 	if (vmf->flags & FAULT_FLAG_WRITE)
548f7ca90b1SMatthew Wilcox 		sb_end_pagefault(sb);
549f7ca90b1SMatthew Wilcox 
550f7ca90b1SMatthew Wilcox 	return result;
551f7ca90b1SMatthew Wilcox }
552f7ca90b1SMatthew Wilcox EXPORT_SYMBOL_GPL(dax_fault);
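/*
 * A minimal usage sketch: a filesystem's .fault handler can forward
 * directly to dax_fault().  example_dax_fault() is hypothetical and reuses
 * the example_get_block() declared in the sketch above; passing NULL for
 * complete_unwritten assumes that helper never returns unwritten extents.
 */
static int example_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return dax_fault(vma, vmf, example_get_block, NULL);
}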
5534c0ccfefSMatthew Wilcox 
554844f35dbSMatthew Wilcox #ifdef CONFIG_TRANSPARENT_HUGEPAGE
555844f35dbSMatthew Wilcox /*
556844f35dbSMatthew Wilcox  * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
557844f35dbSMatthew Wilcox  * more often than one might expect in the below function.
558844f35dbSMatthew Wilcox  */
559844f35dbSMatthew Wilcox #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
560844f35dbSMatthew Wilcox 
561844f35dbSMatthew Wilcox int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
562844f35dbSMatthew Wilcox 		pmd_t *pmd, unsigned int flags, get_block_t get_block,
563844f35dbSMatthew Wilcox 		dax_iodone_t complete_unwritten)
564844f35dbSMatthew Wilcox {
565844f35dbSMatthew Wilcox 	struct file *file = vma->vm_file;
566844f35dbSMatthew Wilcox 	struct address_space *mapping = file->f_mapping;
567844f35dbSMatthew Wilcox 	struct inode *inode = mapping->host;
568844f35dbSMatthew Wilcox 	struct buffer_head bh;
569844f35dbSMatthew Wilcox 	unsigned blkbits = inode->i_blkbits;
570844f35dbSMatthew Wilcox 	unsigned long pmd_addr = address & PMD_MASK;
571844f35dbSMatthew Wilcox 	bool write = flags & FAULT_FLAG_WRITE;
572b2e0d162SDan Williams 	struct block_device *bdev;
573844f35dbSMatthew Wilcox 	pgoff_t size, pgoff;
574b2e0d162SDan Williams 	sector_t block;
575844f35dbSMatthew Wilcox 	int result = 0;
576844f35dbSMatthew Wilcox 
577ee82c9edSDan Williams 	/* dax pmd mappings are broken wrt gup and fork */
578ee82c9edSDan Williams 	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
579ee82c9edSDan Williams 		return VM_FAULT_FALLBACK;
580ee82c9edSDan Williams 
581844f35dbSMatthew Wilcox 	/* Fall back to PTEs if we're going to COW */
58259bf4fb9SToshi Kani 	if (write && !(vma->vm_flags & VM_SHARED)) {
58359bf4fb9SToshi Kani 		split_huge_pmd(vma, pmd, address);
584844f35dbSMatthew Wilcox 		return VM_FAULT_FALLBACK;
58559bf4fb9SToshi Kani 	}
586844f35dbSMatthew Wilcox 	/* If the PMD would extend outside the VMA */
587844f35dbSMatthew Wilcox 	if (pmd_addr < vma->vm_start)
588844f35dbSMatthew Wilcox 		return VM_FAULT_FALLBACK;
589844f35dbSMatthew Wilcox 	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
590844f35dbSMatthew Wilcox 		return VM_FAULT_FALLBACK;
591844f35dbSMatthew Wilcox 
5923fdd1b47SMatthew Wilcox 	pgoff = linear_page_index(vma, pmd_addr);
593844f35dbSMatthew Wilcox 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
594844f35dbSMatthew Wilcox 	if (pgoff >= size)
595844f35dbSMatthew Wilcox 		return VM_FAULT_SIGBUS;
596844f35dbSMatthew Wilcox 	/* If the PMD would cover blocks out of the file */
597844f35dbSMatthew Wilcox 	if ((pgoff | PG_PMD_COLOUR) >= size)
598844f35dbSMatthew Wilcox 		return VM_FAULT_FALLBACK;
599844f35dbSMatthew Wilcox 
600844f35dbSMatthew Wilcox 	memset(&bh, 0, sizeof(bh));
601844f35dbSMatthew Wilcox 	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
602844f35dbSMatthew Wilcox 
603844f35dbSMatthew Wilcox 	bh.b_size = PMD_SIZE;
604b2e0d162SDan Williams 	if (get_block(inode, block, &bh, write) != 0)
605844f35dbSMatthew Wilcox 		return VM_FAULT_SIGBUS;
606b2e0d162SDan Williams 	bdev = bh.b_bdev;
6070f90cc66SRoss Zwisler 	i_mmap_lock_read(mapping);
608844f35dbSMatthew Wilcox 
609844f35dbSMatthew Wilcox 	/*
610844f35dbSMatthew Wilcox 	 * If the filesystem isn't willing to tell us the length of a hole,
611844f35dbSMatthew Wilcox 	 * just fall back to PTEs.  Calling get_block 512 times in a loop
612844f35dbSMatthew Wilcox 	 * would be silly.
613844f35dbSMatthew Wilcox 	 */
614844f35dbSMatthew Wilcox 	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
615844f35dbSMatthew Wilcox 		goto fallback;
616844f35dbSMatthew Wilcox 
61746c043edSKirill A. Shutemov 	/*
61846c043edSKirill A. Shutemov 	 * If we allocated new storage, make sure no process has any
61946c043edSKirill A. Shutemov 	 * zero pages covering this hole
62046c043edSKirill A. Shutemov 	 */
62146c043edSKirill A. Shutemov 	if (buffer_new(&bh)) {
6220f90cc66SRoss Zwisler 		i_mmap_unlock_read(mapping);
62346c043edSKirill A. Shutemov 		unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
6240f90cc66SRoss Zwisler 		i_mmap_lock_read(mapping);
62546c043edSKirill A. Shutemov 	}
62646c043edSKirill A. Shutemov 
62784c4e5e6SMatthew Wilcox 	/*
62884c4e5e6SMatthew Wilcox 	 * If a truncate happened while we were allocating blocks, we may
62984c4e5e6SMatthew Wilcox 	 * leave blocks allocated to the file that are beyond EOF.  We can't
63084c4e5e6SMatthew Wilcox 	 * take i_mutex here, so just leave them hanging; they'll be freed
63184c4e5e6SMatthew Wilcox 	 * when the file is deleted.
63284c4e5e6SMatthew Wilcox 	 */
633844f35dbSMatthew Wilcox 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
634844f35dbSMatthew Wilcox 	if (pgoff >= size) {
635844f35dbSMatthew Wilcox 		result = VM_FAULT_SIGBUS;
636844f35dbSMatthew Wilcox 		goto out;
637844f35dbSMatthew Wilcox 	}
638844f35dbSMatthew Wilcox 	if ((pgoff | PG_PMD_COLOUR) >= size)
639844f35dbSMatthew Wilcox 		goto fallback;
640844f35dbSMatthew Wilcox 
641844f35dbSMatthew Wilcox 	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
642844f35dbSMatthew Wilcox 		spinlock_t *ptl;
643d295e341SKirill A. Shutemov 		pmd_t entry;
644844f35dbSMatthew Wilcox 		struct page *zero_page = get_huge_zero_page();
645d295e341SKirill A. Shutemov 
646844f35dbSMatthew Wilcox 		if (unlikely(!zero_page))
647844f35dbSMatthew Wilcox 			goto fallback;
648844f35dbSMatthew Wilcox 
649d295e341SKirill A. Shutemov 		ptl = pmd_lock(vma->vm_mm, pmd);
650d295e341SKirill A. Shutemov 		if (!pmd_none(*pmd)) {
651844f35dbSMatthew Wilcox 			spin_unlock(ptl);
652d295e341SKirill A. Shutemov 			goto fallback;
653d295e341SKirill A. Shutemov 		}
654d295e341SKirill A. Shutemov 
655d295e341SKirill A. Shutemov 		entry = mk_pmd(zero_page, vma->vm_page_prot);
656d295e341SKirill A. Shutemov 		entry = pmd_mkhuge(entry);
657d295e341SKirill A. Shutemov 		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
658844f35dbSMatthew Wilcox 		result = VM_FAULT_NOPAGE;
659d295e341SKirill A. Shutemov 		spin_unlock(ptl);
660844f35dbSMatthew Wilcox 	} else {
661b2e0d162SDan Williams 		struct blk_dax_ctl dax = {
662b2e0d162SDan Williams 			.sector = to_sector(&bh, inode),
663b2e0d162SDan Williams 			.size = PMD_SIZE,
664b2e0d162SDan Williams 		};
665b2e0d162SDan Williams 		long length = dax_map_atomic(bdev, &dax);
666b2e0d162SDan Williams 
667844f35dbSMatthew Wilcox 		if (length < 0) {
668844f35dbSMatthew Wilcox 			result = VM_FAULT_SIGBUS;
669844f35dbSMatthew Wilcox 			goto out;
670844f35dbSMatthew Wilcox 		}
67134c0fd54SDan Williams 		if (length < PMD_SIZE
67234c0fd54SDan Williams 				|| (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)) {
673b2e0d162SDan Williams 			dax_unmap_atomic(bdev, &dax);
674844f35dbSMatthew Wilcox 			goto fallback;
675b2e0d162SDan Williams 		}
676844f35dbSMatthew Wilcox 
677152d7bd8SDan Williams 		/*
678152d7bd8SDan Williams 		 * TODO: teach vmf_insert_pfn_pmd() to support
679152d7bd8SDan Williams 		 * 'pte_special' for pmds
680152d7bd8SDan Williams 		 */
68134c0fd54SDan Williams 		if (pfn_t_has_page(dax.pfn)) {
682b2e0d162SDan Williams 			dax_unmap_atomic(bdev, &dax);
683152d7bd8SDan Williams 			goto fallback;
684b2e0d162SDan Williams 		}
685152d7bd8SDan Williams 
6860f90cc66SRoss Zwisler 		if (buffer_unwritten(&bh) || buffer_new(&bh)) {
687b2e0d162SDan Williams 			clear_pmem(dax.addr, PMD_SIZE);
6880f90cc66SRoss Zwisler 			wmb_pmem();
6890f90cc66SRoss Zwisler 			count_vm_event(PGMAJFAULT);
6900f90cc66SRoss Zwisler 			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
6910f90cc66SRoss Zwisler 			result |= VM_FAULT_MAJOR;
6920f90cc66SRoss Zwisler 		}
693b2e0d162SDan Williams 		dax_unmap_atomic(bdev, &dax);
6940f90cc66SRoss Zwisler 
69534c0fd54SDan Williams 		result |= vmf_insert_pfn_pmd(vma, address, pmd,
69634c0fd54SDan Williams 				pfn_t_to_pfn(dax.pfn), write);
697844f35dbSMatthew Wilcox 	}
698844f35dbSMatthew Wilcox 
699844f35dbSMatthew Wilcox  out:
7000f90cc66SRoss Zwisler 	i_mmap_unlock_read(mapping);
7010f90cc66SRoss Zwisler 
702844f35dbSMatthew Wilcox 	if (buffer_unwritten(&bh))
703844f35dbSMatthew Wilcox 		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
704844f35dbSMatthew Wilcox 
705844f35dbSMatthew Wilcox 	return result;
706844f35dbSMatthew Wilcox 
707844f35dbSMatthew Wilcox  fallback:
708844f35dbSMatthew Wilcox 	count_vm_event(THP_FAULT_FALLBACK);
709844f35dbSMatthew Wilcox 	result = VM_FAULT_FALLBACK;
710844f35dbSMatthew Wilcox 	goto out;
711844f35dbSMatthew Wilcox }
712844f35dbSMatthew Wilcox EXPORT_SYMBOL_GPL(__dax_pmd_fault);
713844f35dbSMatthew Wilcox 
714844f35dbSMatthew Wilcox /**
715844f35dbSMatthew Wilcox  * dax_pmd_fault - handle a PMD fault on a DAX file
716844f35dbSMatthew Wilcox  * @vma: The virtual memory area where the fault occurred
717844f35dbSMatthew Wilcox  * @vmf: The description of the fault
718844f35dbSMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
719844f35dbSMatthew Wilcox  *
720844f35dbSMatthew Wilcox  * When a page fault occurs, filesystems may call this helper in their
721844f35dbSMatthew Wilcox  * pmd_fault handler for DAX files.
722844f35dbSMatthew Wilcox  */
723844f35dbSMatthew Wilcox int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
724844f35dbSMatthew Wilcox 			pmd_t *pmd, unsigned int flags, get_block_t get_block,
725844f35dbSMatthew Wilcox 			dax_iodone_t complete_unwritten)
726844f35dbSMatthew Wilcox {
727844f35dbSMatthew Wilcox 	int result;
728844f35dbSMatthew Wilcox 	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
729844f35dbSMatthew Wilcox 
730844f35dbSMatthew Wilcox 	if (flags & FAULT_FLAG_WRITE) {
731844f35dbSMatthew Wilcox 		sb_start_pagefault(sb);
732844f35dbSMatthew Wilcox 		file_update_time(vma->vm_file);
733844f35dbSMatthew Wilcox 	}
734844f35dbSMatthew Wilcox 	result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
735844f35dbSMatthew Wilcox 				complete_unwritten);
736844f35dbSMatthew Wilcox 	if (flags & FAULT_FLAG_WRITE)
737844f35dbSMatthew Wilcox 		sb_end_pagefault(sb);
738844f35dbSMatthew Wilcox 
739844f35dbSMatthew Wilcox 	return result;
740844f35dbSMatthew Wilcox }
741844f35dbSMatthew Wilcox EXPORT_SYMBOL_GPL(dax_pmd_fault);
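/*
 * A minimal usage sketch mirroring the PTE case above: the .pmd_fault
 * handler forwards to dax_pmd_fault().  As before, example_get_block() is
 * hypothetical and is assumed never to return unwritten extents, since
 * __dax_pmd_fault() calls complete_unwritten without a NULL check.
 */
static int example_dax_pmd_fault(struct vm_area_struct *vma,
		unsigned long addr, pmd_t *pmd, unsigned int flags)
{
	return dax_pmd_fault(vma, addr, pmd, flags, example_get_block, NULL);
}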
742dd8a2b6cSValentin Rothberg #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
743844f35dbSMatthew Wilcox 
7444c0ccfefSMatthew Wilcox /**
7450e3b210cSBoaz Harrosh  * dax_pfn_mkwrite - handle first write to DAX page
7460e3b210cSBoaz Harrosh  * @vma: The virtual memory area where the fault occurred
7470e3b210cSBoaz Harrosh  * @vmf: The description of the fault
7480e3b210cSBoaz Harrosh  *
7490e3b210cSBoaz Harrosh  */
7500e3b210cSBoaz Harrosh int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
7510e3b210cSBoaz Harrosh {
7520e3b210cSBoaz Harrosh 	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
7530e3b210cSBoaz Harrosh 
7540e3b210cSBoaz Harrosh 	sb_start_pagefault(sb);
7550e3b210cSBoaz Harrosh 	file_update_time(vma->vm_file);
7560e3b210cSBoaz Harrosh 	sb_end_pagefault(sb);
7570e3b210cSBoaz Harrosh 	return VM_FAULT_NOPAGE;
7580e3b210cSBoaz Harrosh }
7590e3b210cSBoaz Harrosh EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
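/*
 * A minimal sketch of how the helpers above are typically wired together
 * (hypothetical names).  A real filesystem's ->mmap handler would also set
 * VM_MIXEDMAP on the vma, and a THP-capable one would add a .pmd_fault
 * handler like the sketch above.
 */
static const struct vm_operations_struct example_dax_vm_ops = {
	.fault		= example_dax_fault,
	.pfn_mkwrite	= dax_pfn_mkwrite,
};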
7600e3b210cSBoaz Harrosh 
7610e3b210cSBoaz Harrosh /**
76225726bc1SMatthew Wilcox  * dax_zero_page_range - zero a range within a page of a DAX file
7634c0ccfefSMatthew Wilcox  * @inode: The file being truncated
7644c0ccfefSMatthew Wilcox  * @from: The file offset that is being truncated to
76525726bc1SMatthew Wilcox  * @length: The number of bytes to zero
7664c0ccfefSMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
7674c0ccfefSMatthew Wilcox  *
76825726bc1SMatthew Wilcox  * This function can be called by a filesystem when it is zeroing part of a
76925726bc1SMatthew Wilcox  * page in a DAX file.  This is intended for hole-punch operations.  If
77025726bc1SMatthew Wilcox  * you are truncating a file, the helper function dax_truncate_page() may be
77125726bc1SMatthew Wilcox  * more convenient.
7724c0ccfefSMatthew Wilcox  *
7734c0ccfefSMatthew Wilcox  * We work in terms of PAGE_CACHE_SIZE here for commonality with
7744c0ccfefSMatthew Wilcox  * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
7754c0ccfefSMatthew Wilcox  * took care of disposing of the unnecessary blocks.  Even if the filesystem
7764c0ccfefSMatthew Wilcox  * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
77725726bc1SMatthew Wilcox  * since the file might be mmapped.
7784c0ccfefSMatthew Wilcox  */
77925726bc1SMatthew Wilcox int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
78025726bc1SMatthew Wilcox 							get_block_t get_block)
7814c0ccfefSMatthew Wilcox {
7824c0ccfefSMatthew Wilcox 	struct buffer_head bh;
7834c0ccfefSMatthew Wilcox 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
7844c0ccfefSMatthew Wilcox 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
7854c0ccfefSMatthew Wilcox 	int err;
7864c0ccfefSMatthew Wilcox 
7874c0ccfefSMatthew Wilcox 	/* Block boundary? Nothing to do */
7884c0ccfefSMatthew Wilcox 	if (!length)
7894c0ccfefSMatthew Wilcox 		return 0;
79025726bc1SMatthew Wilcox 	BUG_ON((offset + length) > PAGE_CACHE_SIZE);
7914c0ccfefSMatthew Wilcox 
7924c0ccfefSMatthew Wilcox 	memset(&bh, 0, sizeof(bh));
7934c0ccfefSMatthew Wilcox 	bh.b_size = PAGE_CACHE_SIZE;
7944c0ccfefSMatthew Wilcox 	err = get_block(inode, index, &bh, 0);
7954c0ccfefSMatthew Wilcox 	if (err < 0)
7964c0ccfefSMatthew Wilcox 		return err;
7974c0ccfefSMatthew Wilcox 	if (buffer_written(&bh)) {
798b2e0d162SDan Williams 		struct block_device *bdev = bh.b_bdev;
799b2e0d162SDan Williams 		struct blk_dax_ctl dax = {
800b2e0d162SDan Williams 			.sector = to_sector(&bh, inode),
801b2e0d162SDan Williams 			.size = PAGE_CACHE_SIZE,
802b2e0d162SDan Williams 		};
803b2e0d162SDan Williams 
804b2e0d162SDan Williams 		if (dax_map_atomic(bdev, &dax) < 0)
805b2e0d162SDan Williams 			return PTR_ERR(dax.addr);
806b2e0d162SDan Williams 		clear_pmem(dax.addr + offset, length);
8072765cfbbSRoss Zwisler 		wmb_pmem();
808b2e0d162SDan Williams 		dax_unmap_atomic(bdev, &dax);
8094c0ccfefSMatthew Wilcox 	}
8104c0ccfefSMatthew Wilcox 
8114c0ccfefSMatthew Wilcox 	return 0;
8124c0ccfefSMatthew Wilcox }
81325726bc1SMatthew Wilcox EXPORT_SYMBOL_GPL(dax_zero_page_range);
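/*
 * A minimal sketch (hypothetical helper) of the hole-punch usage described
 * above: only the partial pages at each end of the range need zeroing, and
 * each call stays within a single page as dax_zero_page_range() requires.
 * example_get_block() is the hypothetical helper from the earlier sketches.
 */
static int example_punch_hole_edges(struct inode *inode, loff_t start,
		loff_t end)
{
	unsigned head = start & (PAGE_CACHE_SIZE - 1);
	unsigned tail = end & (PAGE_CACHE_SIZE - 1);
	int err = 0;

	/* both ends fall in the same page: one call covers [start, end) */
	if (start >> PAGE_CACHE_SHIFT == (end - 1) >> PAGE_CACHE_SHIFT)
		return dax_zero_page_range(inode, start, end - start,
				example_get_block);

	/* zero from 'start' to the end of its page */
	if (head)
		err = dax_zero_page_range(inode, start,
				PAGE_CACHE_SIZE - head, example_get_block);
	/* zero from the start of the final page up to 'end' */
	if (!err && tail)
		err = dax_zero_page_range(inode, end - tail, tail,
				example_get_block);
	return err;
}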
81425726bc1SMatthew Wilcox 
81525726bc1SMatthew Wilcox /**
81625726bc1SMatthew Wilcox  * dax_truncate_page - handle a partial page being truncated in a DAX file
81725726bc1SMatthew Wilcox  * @inode: The file being truncated
81825726bc1SMatthew Wilcox  * @from: The file offset that is being truncated to
81925726bc1SMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
82025726bc1SMatthew Wilcox  *
82125726bc1SMatthew Wilcox  * Similar to block_truncate_page(), this function can be called by a
82225726bc1SMatthew Wilcox  * filesystem when it is truncating a DAX file to handle the partial page.
82325726bc1SMatthew Wilcox  *
82425726bc1SMatthew Wilcox  * We work in terms of PAGE_CACHE_SIZE here for commonality with
82525726bc1SMatthew Wilcox  * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
82625726bc1SMatthew Wilcox  * took care of disposing of the unnecessary blocks.  Even if the filesystem
82725726bc1SMatthew Wilcox  * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
82825726bc1SMatthew Wilcox  * since the file might be mmapped.
82925726bc1SMatthew Wilcox  */
83025726bc1SMatthew Wilcox int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
83125726bc1SMatthew Wilcox {
83225726bc1SMatthew Wilcox 	unsigned length = PAGE_CACHE_ALIGN(from) - from;
83325726bc1SMatthew Wilcox 	return dax_zero_page_range(inode, from, length, get_block);
83425726bc1SMatthew Wilcox }
8354c0ccfefSMatthew Wilcox EXPORT_SYMBOL_GPL(dax_truncate_page);
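/*
 * A minimal sketch of the truncate usage (hypothetical helpers): a DAX
 * filesystem substitutes dax_truncate_page() for block_truncate_page()
 * to zero the partial tail page at the new end of file.
 */
static int example_setsize(struct inode *inode, loff_t newsize)
{
	int error;

	if (IS_DAX(inode))
		error = dax_truncate_page(inode, newsize, example_get_block);
	else
		error = block_truncate_page(inode->i_mapping, newsize,
				example_get_block);
	if (error)
		return error;

	truncate_setsize(inode, newsize);
	return 0;
}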
836