xref: /openbmc/linux/fs/dax.c (revision 679c8bd3)
1d475c634SMatthew Wilcox /*
2d475c634SMatthew Wilcox  * fs/dax.c - Direct Access filesystem code
3d475c634SMatthew Wilcox  * Copyright (c) 2013-2014 Intel Corporation
4d475c634SMatthew Wilcox  * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
5d475c634SMatthew Wilcox  * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
6d475c634SMatthew Wilcox  *
7d475c634SMatthew Wilcox  * This program is free software; you can redistribute it and/or modify it
8d475c634SMatthew Wilcox  * under the terms and conditions of the GNU General Public License,
9d475c634SMatthew Wilcox  * version 2, as published by the Free Software Foundation.
10d475c634SMatthew Wilcox  *
11d475c634SMatthew Wilcox  * This program is distributed in the hope it will be useful, but WITHOUT
12d475c634SMatthew Wilcox  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13d475c634SMatthew Wilcox  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
14d475c634SMatthew Wilcox  * more details.
15d475c634SMatthew Wilcox  */
16d475c634SMatthew Wilcox 
17d475c634SMatthew Wilcox #include <linux/atomic.h>
18d475c634SMatthew Wilcox #include <linux/blkdev.h>
19d475c634SMatthew Wilcox #include <linux/buffer_head.h>
20d77e92e2SRoss Zwisler #include <linux/dax.h>
21d475c634SMatthew Wilcox #include <linux/fs.h>
22d475c634SMatthew Wilcox #include <linux/genhd.h>
23f7ca90b1SMatthew Wilcox #include <linux/highmem.h>
24f7ca90b1SMatthew Wilcox #include <linux/memcontrol.h>
25f7ca90b1SMatthew Wilcox #include <linux/mm.h>
26d475c634SMatthew Wilcox #include <linux/mutex.h>
279973c98eSRoss Zwisler #include <linux/pagevec.h>
282765cfbbSRoss Zwisler #include <linux/pmem.h>
29289c6aedSMatthew Wilcox #include <linux/sched.h>
30d475c634SMatthew Wilcox #include <linux/uio.h>
31f7ca90b1SMatthew Wilcox #include <linux/vmstat.h>
3234c0fd54SDan Williams #include <linux/pfn_t.h>
330e749e54SDan Williams #include <linux/sizes.h>
34d475c634SMatthew Wilcox 
35e4b27491SNeilBrown #define RADIX_DAX_MASK	0xf
36e4b27491SNeilBrown #define RADIX_DAX_SHIFT	4
37e4b27491SNeilBrown #define RADIX_DAX_PTE  (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
38e4b27491SNeilBrown #define RADIX_DAX_PMD  (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
39e4b27491SNeilBrown #define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
40e4b27491SNeilBrown #define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
41e4b27491SNeilBrown #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
42e4b27491SNeilBrown 		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
43e4b27491SNeilBrown 
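/*
 * A worked illustration of the encoding above (a hypothetical example,
 * not a definition used elsewhere): RADIX_DAX_ENTRY(0x100, true)
 * evaluates to (0x100 << 4) | RADIX_DAX_PMD, i.e. 0x100a, since
 * RADIX_TREE_EXCEPTIONAL_ENTRY is bit 1.  RADIX_DAX_TYPE() masks the
 * low four bits back out and yields RADIX_DAX_PMD, while
 * RADIX_DAX_SECTOR() shifts the entry right by RADIX_DAX_SHIFT to
 * recover sector 0x100.
 */
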
44b2e0d162SDan Williams static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
45b2e0d162SDan Williams {
46b2e0d162SDan Williams 	struct request_queue *q = bdev->bd_queue;
47b2e0d162SDan Williams 	long rc = -EIO;
48b2e0d162SDan Williams 
49b2e0d162SDan Williams 	dax->addr = (void __pmem *) ERR_PTR(-EIO);
50b2e0d162SDan Williams 	if (blk_queue_enter(q, true) != 0)
51b2e0d162SDan Williams 		return rc;
52b2e0d162SDan Williams 
53b2e0d162SDan Williams 	rc = bdev_direct_access(bdev, dax);
54b2e0d162SDan Williams 	if (rc < 0) {
55b2e0d162SDan Williams 		dax->addr = (void __pmem *) ERR_PTR(rc);
56b2e0d162SDan Williams 		blk_queue_exit(q);
57b2e0d162SDan Williams 		return rc;
58b2e0d162SDan Williams 	}
59b2e0d162SDan Williams 	return rc;
60b2e0d162SDan Williams }
61b2e0d162SDan Williams 
62b2e0d162SDan Williams static void dax_unmap_atomic(struct block_device *bdev,
63b2e0d162SDan Williams 		const struct blk_dax_ctl *dax)
64b2e0d162SDan Williams {
65b2e0d162SDan Williams 	if (IS_ERR(dax->addr))
66b2e0d162SDan Williams 		return;
67b2e0d162SDan Williams 	blk_queue_exit(bdev->bd_queue);
68b2e0d162SDan Williams }
69b2e0d162SDan Williams 
70d1a5f2b4SDan Williams struct page *read_dax_sector(struct block_device *bdev, sector_t n)
71d1a5f2b4SDan Williams {
72d1a5f2b4SDan Williams 	struct page *page = alloc_pages(GFP_KERNEL, 0);
73d1a5f2b4SDan Williams 	struct blk_dax_ctl dax = {
74d1a5f2b4SDan Williams 		.size = PAGE_SIZE,
75d1a5f2b4SDan Williams 		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
76d1a5f2b4SDan Williams 	};
77d1a5f2b4SDan Williams 	long rc;
78d1a5f2b4SDan Williams 
79d1a5f2b4SDan Williams 	if (!page)
80d1a5f2b4SDan Williams 		return ERR_PTR(-ENOMEM);
81d1a5f2b4SDan Williams 
82d1a5f2b4SDan Williams 	rc = dax_map_atomic(bdev, &dax);
83d1a5f2b4SDan Williams 	if (rc < 0) {
		__free_page(page);	/* don't leak the page on mapping failure */
		return ERR_PTR(rc);
	}
85d1a5f2b4SDan Williams 	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
86d1a5f2b4SDan Williams 	dax_unmap_atomic(bdev, &dax);
87d1a5f2b4SDan Williams 	return page;
88d1a5f2b4SDan Williams }
89d1a5f2b4SDan Williams 
90d475c634SMatthew Wilcox static bool buffer_written(struct buffer_head *bh)
91d475c634SMatthew Wilcox {
92d475c634SMatthew Wilcox 	return buffer_mapped(bh) && !buffer_unwritten(bh);
93d475c634SMatthew Wilcox }
94d475c634SMatthew Wilcox 
95d475c634SMatthew Wilcox /*
96d475c634SMatthew Wilcox  * When ext4 encounters a hole, it returns without modifying the buffer_head
97d475c634SMatthew Wilcox  * which means that we can't trust b_size.  To cope with this, we set b_state
98d475c634SMatthew Wilcox  * to 0 before calling get_block and, if any bit is set, we know we can trust
99d475c634SMatthew Wilcox  * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
100d475c634SMatthew Wilcox  * and could save us the repeated calls to get_block.
101d475c634SMatthew Wilcox  */
102d475c634SMatthew Wilcox static bool buffer_size_valid(struct buffer_head *bh)
103d475c634SMatthew Wilcox {
104d475c634SMatthew Wilcox 	return bh->b_state != 0;
105d475c634SMatthew Wilcox }
106d475c634SMatthew Wilcox 
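/*
 * A minimal sketch of the calling convention described above: zero
 * b_state before calling get_block() so that buffer_size_valid() can
 * tell whether b_size was filled in.  myfs_get_block() is a
 * hypothetical get_block_t (standing in for helpers such as
 * ext2_get_block()) and is reused by the other sketches below.
 */
int myfs_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh_result, int create);

static long myfs_probe_extent(struct inode *inode, sector_t block,
		struct buffer_head *bh)
{
	bh->b_size = PAGE_SIZE;		/* upper bound we care about */
	bh->b_state = 0;		/* lets buffer_size_valid() spot a hole */
	if (myfs_get_block(inode, block, bh, 0))
		return -EIO;
	return buffer_size_valid(bh) ? bh->b_size : 1 << inode->i_blkbits;
}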
107b2e0d162SDan Williams 
108b2e0d162SDan Williams static sector_t to_sector(const struct buffer_head *bh,
109b2e0d162SDan Williams 		const struct inode *inode)
110b2e0d162SDan Williams {
111b2e0d162SDan Williams 	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
112b2e0d162SDan Williams 
113b2e0d162SDan Williams 	return sector;
114b2e0d162SDan Williams }
115b2e0d162SDan Williams 
116a95cd631SOmar Sandoval static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
117d475c634SMatthew Wilcox 		      loff_t start, loff_t end, get_block_t get_block,
118d475c634SMatthew Wilcox 		      struct buffer_head *bh)
119d475c634SMatthew Wilcox {
120b2e0d162SDan Williams 	loff_t pos = start, max = start, bh_max = start;
121b2e0d162SDan Williams 	bool hole = false, need_wmb = false;
122b2e0d162SDan Williams 	struct block_device *bdev = NULL;
123b2e0d162SDan Williams 	int rw = iov_iter_rw(iter), rc;
124b2e0d162SDan Williams 	long map_len = 0;
125b2e0d162SDan Williams 	struct blk_dax_ctl dax = {
126b2e0d162SDan Williams 		.addr = (void __pmem *) ERR_PTR(-EIO),
127b2e0d162SDan Williams 	};
128069c77bcSJan Kara 	unsigned blkbits = inode->i_blkbits;
129069c77bcSJan Kara 	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
130069c77bcSJan Kara 								>> blkbits;
131d475c634SMatthew Wilcox 
132b2e0d162SDan Williams 	if (rw == READ)
133d475c634SMatthew Wilcox 		end = min(end, i_size_read(inode));
134d475c634SMatthew Wilcox 
135d475c634SMatthew Wilcox 	while (pos < end) {
1362765cfbbSRoss Zwisler 		size_t len;
137d475c634SMatthew Wilcox 		if (pos == max) {
138e94f5a22SJeff Moyer 			long page = pos >> PAGE_SHIFT;
139e94f5a22SJeff Moyer 			sector_t block = page << (PAGE_SHIFT - blkbits);
140d475c634SMatthew Wilcox 			unsigned first = pos - (block << blkbits);
141d475c634SMatthew Wilcox 			long size;
142d475c634SMatthew Wilcox 
143d475c634SMatthew Wilcox 			if (pos == bh_max) {
144d475c634SMatthew Wilcox 				bh->b_size = PAGE_ALIGN(end - pos);
145d475c634SMatthew Wilcox 				bh->b_state = 0;
146b2e0d162SDan Williams 				rc = get_block(inode, block, bh, rw == WRITE);
147b2e0d162SDan Williams 				if (rc)
148d475c634SMatthew Wilcox 					break;
149d475c634SMatthew Wilcox 				if (!buffer_size_valid(bh))
150d475c634SMatthew Wilcox 					bh->b_size = 1 << blkbits;
151d475c634SMatthew Wilcox 				bh_max = pos - first + bh->b_size;
152b2e0d162SDan Williams 				bdev = bh->b_bdev;
153069c77bcSJan Kara 				/*
154069c77bcSJan Kara 				 * We allow uninitialized buffers for writes
155069c77bcSJan Kara 				 * beyond EOF as those cannot race with faults
156069c77bcSJan Kara 				 */
157069c77bcSJan Kara 				WARN_ON_ONCE(
158069c77bcSJan Kara 					(buffer_new(bh) && block < file_blks) ||
159069c77bcSJan Kara 					(rw == WRITE && buffer_unwritten(bh)));
160d475c634SMatthew Wilcox 			} else {
161d475c634SMatthew Wilcox 				unsigned done = bh->b_size -
162d475c634SMatthew Wilcox 						(bh_max - (pos - first));
163d475c634SMatthew Wilcox 				bh->b_blocknr += done >> blkbits;
164d475c634SMatthew Wilcox 				bh->b_size -= done;
165d475c634SMatthew Wilcox 			}
166d475c634SMatthew Wilcox 
167b2e0d162SDan Williams 			hole = rw == READ && !buffer_written(bh);
168d475c634SMatthew Wilcox 			if (hole) {
169d475c634SMatthew Wilcox 				size = bh->b_size - first;
170d475c634SMatthew Wilcox 			} else {
171b2e0d162SDan Williams 				dax_unmap_atomic(bdev, &dax);
172b2e0d162SDan Williams 				dax.sector = to_sector(bh, inode);
173b2e0d162SDan Williams 				dax.size = bh->b_size;
174b2e0d162SDan Williams 				map_len = dax_map_atomic(bdev, &dax);
175b2e0d162SDan Williams 				if (map_len < 0) {
176b2e0d162SDan Williams 					rc = map_len;
177d475c634SMatthew Wilcox 					break;
178b2e0d162SDan Williams 				}
179b2e0d162SDan Williams 				dax.addr += first;
180b2e0d162SDan Williams 				size = map_len - first;
181d475c634SMatthew Wilcox 			}
182d475c634SMatthew Wilcox 			max = min(pos + size, end);
183d475c634SMatthew Wilcox 		}
184d475c634SMatthew Wilcox 
1852765cfbbSRoss Zwisler 		if (iov_iter_rw(iter) == WRITE) {
186b2e0d162SDan Williams 			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
1872765cfbbSRoss Zwisler 			need_wmb = true;
1882765cfbbSRoss Zwisler 		} else if (!hole)
189b2e0d162SDan Williams 			len = copy_to_iter((void __force *) dax.addr, max - pos,
190e2e05394SRoss Zwisler 					iter);
191d475c634SMatthew Wilcox 		else
192d475c634SMatthew Wilcox 			len = iov_iter_zero(max - pos, iter);
193d475c634SMatthew Wilcox 
194cadfbb6eSAl Viro 		if (!len) {
195b2e0d162SDan Williams 			rc = -EFAULT;
196d475c634SMatthew Wilcox 			break;
197cadfbb6eSAl Viro 		}
198d475c634SMatthew Wilcox 
199d475c634SMatthew Wilcox 		pos += len;
200b2e0d162SDan Williams 		if (!IS_ERR(dax.addr))
201b2e0d162SDan Williams 			dax.addr += len;
202d475c634SMatthew Wilcox 	}
203d475c634SMatthew Wilcox 
2042765cfbbSRoss Zwisler 	if (need_wmb)
2052765cfbbSRoss Zwisler 		wmb_pmem();
206b2e0d162SDan Williams 	dax_unmap_atomic(bdev, &dax);
2072765cfbbSRoss Zwisler 
208b2e0d162SDan Williams 	return (pos == start) ? rc : pos - start;
209d475c634SMatthew Wilcox }
210d475c634SMatthew Wilcox 
211d475c634SMatthew Wilcox /**
212d475c634SMatthew Wilcox  * dax_do_io - Perform I/O to a DAX file
213d475c634SMatthew Wilcox  * @iocb: The control block for this I/O
214d475c634SMatthew Wilcox  * @inode: The file which the I/O is directed at
215d475c634SMatthew Wilcox  * @iter: The addresses to do I/O from or to
216d475c634SMatthew Wilcox  * @pos: The file offset where the I/O starts
217d475c634SMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
218d475c634SMatthew Wilcox  * @end_io: A filesystem callback for I/O completion
219d475c634SMatthew Wilcox  * @flags: See below
220d475c634SMatthew Wilcox  *
221d475c634SMatthew Wilcox  * This function uses the same locking scheme as do_blockdev_direct_IO:
222d475c634SMatthew Wilcox  * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
223d475c634SMatthew Wilcox  * caller for writes.  For reads, we take and release the i_mutex ourselves.
224d475c634SMatthew Wilcox  * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
225d475c634SMatthew Wilcox  * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
226d475c634SMatthew Wilcox  * is in progress.
227d475c634SMatthew Wilcox  */
228a95cd631SOmar Sandoval ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
229a95cd631SOmar Sandoval 		  struct iov_iter *iter, loff_t pos, get_block_t get_block,
230a95cd631SOmar Sandoval 		  dio_iodone_t end_io, int flags)
231d475c634SMatthew Wilcox {
232d475c634SMatthew Wilcox 	struct buffer_head bh;
233d475c634SMatthew Wilcox 	ssize_t retval = -EINVAL;
234d475c634SMatthew Wilcox 	loff_t end = pos + iov_iter_count(iter);
235d475c634SMatthew Wilcox 
236d475c634SMatthew Wilcox 	memset(&bh, 0, sizeof(bh));
237eab95db6SRoss Zwisler 	bh.b_bdev = inode->i_sb->s_bdev;
238d475c634SMatthew Wilcox 
239c3d98e39SJan Kara 	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
2405955102cSAl Viro 		inode_lock(inode);
241d475c634SMatthew Wilcox 
242d475c634SMatthew Wilcox 	/* Protects against truncate */
243bbab37ddSMatthew Wilcox 	if (!(flags & DIO_SKIP_DIO_COUNT))
244fe0f07d0SJens Axboe 		inode_dio_begin(inode);
245d475c634SMatthew Wilcox 
246a95cd631SOmar Sandoval 	retval = dax_io(inode, iter, pos, end, get_block, &bh);
247d475c634SMatthew Wilcox 
248a95cd631SOmar Sandoval 	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
2495955102cSAl Viro 		inode_unlock(inode);
250d475c634SMatthew Wilcox 
251187372a3SChristoph Hellwig 	if (end_io) {
252187372a3SChristoph Hellwig 		int err;
253187372a3SChristoph Hellwig 
254187372a3SChristoph Hellwig 		err = end_io(iocb, pos, retval, bh.b_private);
255187372a3SChristoph Hellwig 		if (err)
256187372a3SChristoph Hellwig 			retval = err;
257187372a3SChristoph Hellwig 	}
258d475c634SMatthew Wilcox 
259bbab37ddSMatthew Wilcox 	if (!(flags & DIO_SKIP_DIO_COUNT))
260fe0f07d0SJens Axboe 		inode_dio_end(inode);
261d475c634SMatthew Wilcox 	return retval;
262d475c634SMatthew Wilcox }
263d475c634SMatthew Wilcox EXPORT_SYMBOL_GPL(dax_do_io);
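
/*
 * A minimal sketch of how a filesystem's direct I/O path might hand
 * DAX I/O to dax_do_io(); myfs_direct_IO() is hypothetical, and real
 * callers such as ext2 add their own write-error handling around it.
 */
static ssize_t myfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
		loff_t pos)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	/* non-DAX fallback (blockdev_direct_IO()) omitted from this sketch */
	if (!IS_DAX(inode))
		return -EINVAL;

	return dax_do_io(iocb, inode, iter, pos, myfs_get_block, NULL,
			DIO_LOCKING);
}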
264f7ca90b1SMatthew Wilcox 
265f7ca90b1SMatthew Wilcox /*
266f7ca90b1SMatthew Wilcox  * The user has performed a load from a hole in the file.  Allocating
267f7ca90b1SMatthew Wilcox  * a new page in the file would cause excessive storage usage for
268f7ca90b1SMatthew Wilcox  * workloads with sparse files.  We allocate a page cache page instead.
269f7ca90b1SMatthew Wilcox  * We'll kick it out of the page cache if it's ever written to,
270f7ca90b1SMatthew Wilcox  * otherwise it will simply fall out of the page cache under memory
271f7ca90b1SMatthew Wilcox  * pressure without ever having been dirtied.
272f7ca90b1SMatthew Wilcox  */
273f7ca90b1SMatthew Wilcox static int dax_load_hole(struct address_space *mapping, struct page *page,
274f7ca90b1SMatthew Wilcox 							struct vm_fault *vmf)
275f7ca90b1SMatthew Wilcox {
276f7ca90b1SMatthew Wilcox 	if (!page)
277f7ca90b1SMatthew Wilcox 		page = find_or_create_page(mapping, vmf->pgoff,
278f7ca90b1SMatthew Wilcox 						GFP_KERNEL | __GFP_ZERO);
279f7ca90b1SMatthew Wilcox 	if (!page)
280f7ca90b1SMatthew Wilcox 		return VM_FAULT_OOM;
281f7ca90b1SMatthew Wilcox 
282f7ca90b1SMatthew Wilcox 	vmf->page = page;
283f7ca90b1SMatthew Wilcox 	return VM_FAULT_LOCKED;
284f7ca90b1SMatthew Wilcox }
285f7ca90b1SMatthew Wilcox 
286b2e0d162SDan Williams static int copy_user_bh(struct page *to, struct inode *inode,
287b2e0d162SDan Williams 		struct buffer_head *bh, unsigned long vaddr)
288f7ca90b1SMatthew Wilcox {
289b2e0d162SDan Williams 	struct blk_dax_ctl dax = {
290b2e0d162SDan Williams 		.sector = to_sector(bh, inode),
291b2e0d162SDan Williams 		.size = bh->b_size,
292b2e0d162SDan Williams 	};
293b2e0d162SDan Williams 	struct block_device *bdev = bh->b_bdev;
294e2e05394SRoss Zwisler 	void *vto;
295e2e05394SRoss Zwisler 
296b2e0d162SDan Williams 	if (dax_map_atomic(bdev, &dax) < 0)
297b2e0d162SDan Williams 		return PTR_ERR(dax.addr);
298f7ca90b1SMatthew Wilcox 	vto = kmap_atomic(to);
299b2e0d162SDan Williams 	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
300f7ca90b1SMatthew Wilcox 	kunmap_atomic(vto);
301b2e0d162SDan Williams 	dax_unmap_atomic(bdev, &dax);
302f7ca90b1SMatthew Wilcox 	return 0;
303f7ca90b1SMatthew Wilcox }
304f7ca90b1SMatthew Wilcox 
3059973c98eSRoss Zwisler #define NO_SECTOR -1
30609cbfeafSKirill A. Shutemov #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
3079973c98eSRoss Zwisler 
3089973c98eSRoss Zwisler static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
3099973c98eSRoss Zwisler 		sector_t sector, bool pmd_entry, bool dirty)
3109973c98eSRoss Zwisler {
3119973c98eSRoss Zwisler 	struct radix_tree_root *page_tree = &mapping->page_tree;
3129973c98eSRoss Zwisler 	pgoff_t pmd_index = DAX_PMD_INDEX(index);
3139973c98eSRoss Zwisler 	int type, error = 0;
3149973c98eSRoss Zwisler 	void *entry;
3159973c98eSRoss Zwisler 
3169973c98eSRoss Zwisler 	WARN_ON_ONCE(pmd_entry && !dirty);
317d2b2a28eSDmitry Monakhov 	if (dirty)
3189973c98eSRoss Zwisler 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
3199973c98eSRoss Zwisler 
3209973c98eSRoss Zwisler 	spin_lock_irq(&mapping->tree_lock);
3219973c98eSRoss Zwisler 
3229973c98eSRoss Zwisler 	entry = radix_tree_lookup(page_tree, pmd_index);
3239973c98eSRoss Zwisler 	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
3249973c98eSRoss Zwisler 		index = pmd_index;
3259973c98eSRoss Zwisler 		goto dirty;
3269973c98eSRoss Zwisler 	}
3279973c98eSRoss Zwisler 
3289973c98eSRoss Zwisler 	entry = radix_tree_lookup(page_tree, index);
3299973c98eSRoss Zwisler 	if (entry) {
3309973c98eSRoss Zwisler 		type = RADIX_DAX_TYPE(entry);
3319973c98eSRoss Zwisler 		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
3329973c98eSRoss Zwisler 					type != RADIX_DAX_PMD)) {
3339973c98eSRoss Zwisler 			error = -EIO;
3349973c98eSRoss Zwisler 			goto unlock;
3359973c98eSRoss Zwisler 		}
3369973c98eSRoss Zwisler 
3379973c98eSRoss Zwisler 		if (!pmd_entry || type == RADIX_DAX_PMD)
3389973c98eSRoss Zwisler 			goto dirty;
3399973c98eSRoss Zwisler 
3409973c98eSRoss Zwisler 		/*
3419973c98eSRoss Zwisler 		 * We only insert dirty PMD entries into the radix tree.  This
3429973c98eSRoss Zwisler 		 * means we don't need to worry about removing a dirty PTE
3439973c98eSRoss Zwisler 		 * entry and inserting a clean PMD entry, thus reducing the
3449973c98eSRoss Zwisler 		 * range we would flush with a follow-up fsync/msync call.
3459973c98eSRoss Zwisler 		 */
3469973c98eSRoss Zwisler 		radix_tree_delete(&mapping->page_tree, index);
3479973c98eSRoss Zwisler 		mapping->nrexceptional--;
3489973c98eSRoss Zwisler 	}
3499973c98eSRoss Zwisler 
3509973c98eSRoss Zwisler 	if (sector == NO_SECTOR) {
3519973c98eSRoss Zwisler 		/*
3529973c98eSRoss Zwisler 		 * This can happen during correct operation if our pfn_mkwrite
3539973c98eSRoss Zwisler 		 * fault raced against a hole punch operation.  If this
3549973c98eSRoss Zwisler 		 * happens the pte that was hole punched will have been
3559973c98eSRoss Zwisler 		 * unmapped and the radix tree entry will have been removed by
3569973c98eSRoss Zwisler 		 * the time we are called, but the call will still happen.  We
3579973c98eSRoss Zwisler 		 * will return all the way up to wp_pfn_shared(), where the
3589973c98eSRoss Zwisler 		 * pte_same() check will fail, eventually causing page fault
3599973c98eSRoss Zwisler 		 * to be retried by the CPU.
3609973c98eSRoss Zwisler 		 */
3619973c98eSRoss Zwisler 		goto unlock;
3629973c98eSRoss Zwisler 	}
3639973c98eSRoss Zwisler 
3649973c98eSRoss Zwisler 	error = radix_tree_insert(page_tree, index,
3659973c98eSRoss Zwisler 			RADIX_DAX_ENTRY(sector, pmd_entry));
3669973c98eSRoss Zwisler 	if (error)
3679973c98eSRoss Zwisler 		goto unlock;
3689973c98eSRoss Zwisler 
3699973c98eSRoss Zwisler 	mapping->nrexceptional++;
3709973c98eSRoss Zwisler  dirty:
3719973c98eSRoss Zwisler 	if (dirty)
3729973c98eSRoss Zwisler 		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
3739973c98eSRoss Zwisler  unlock:
3749973c98eSRoss Zwisler 	spin_unlock_irq(&mapping->tree_lock);
3759973c98eSRoss Zwisler 	return error;
3769973c98eSRoss Zwisler }
3779973c98eSRoss Zwisler 
3789973c98eSRoss Zwisler static int dax_writeback_one(struct block_device *bdev,
3799973c98eSRoss Zwisler 		struct address_space *mapping, pgoff_t index, void *entry)
3809973c98eSRoss Zwisler {
3819973c98eSRoss Zwisler 	struct radix_tree_root *page_tree = &mapping->page_tree;
3829973c98eSRoss Zwisler 	int type = RADIX_DAX_TYPE(entry);
3839973c98eSRoss Zwisler 	struct radix_tree_node *node;
3849973c98eSRoss Zwisler 	struct blk_dax_ctl dax;
3859973c98eSRoss Zwisler 	void **slot;
3869973c98eSRoss Zwisler 	int ret = 0;
3879973c98eSRoss Zwisler 
3889973c98eSRoss Zwisler 	spin_lock_irq(&mapping->tree_lock);
3899973c98eSRoss Zwisler 	/*
3909973c98eSRoss Zwisler 	 * Regular page slots are stabilized by the page lock even
3919973c98eSRoss Zwisler 	 * without the tree itself locked.  These unlocked entries
3929973c98eSRoss Zwisler 	 * need verification under the tree lock.
3939973c98eSRoss Zwisler 	 */
3949973c98eSRoss Zwisler 	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
3959973c98eSRoss Zwisler 		goto unlock;
3969973c98eSRoss Zwisler 	if (*slot != entry)
3979973c98eSRoss Zwisler 		goto unlock;
3989973c98eSRoss Zwisler 
3999973c98eSRoss Zwisler 	/* another fsync thread may have already written back this entry */
4009973c98eSRoss Zwisler 	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
4019973c98eSRoss Zwisler 		goto unlock;
4029973c98eSRoss Zwisler 
4039973c98eSRoss Zwisler 	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
4049973c98eSRoss Zwisler 		ret = -EIO;
4059973c98eSRoss Zwisler 		goto unlock;
4069973c98eSRoss Zwisler 	}
4079973c98eSRoss Zwisler 
4089973c98eSRoss Zwisler 	dax.sector = RADIX_DAX_SECTOR(entry);
4099973c98eSRoss Zwisler 	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
4109973c98eSRoss Zwisler 	spin_unlock_irq(&mapping->tree_lock);
4119973c98eSRoss Zwisler 
4129973c98eSRoss Zwisler 	/*
4139973c98eSRoss Zwisler 	 * We cannot hold tree_lock while calling dax_map_atomic() because it
4149973c98eSRoss Zwisler 	 * eventually calls cond_resched().
4159973c98eSRoss Zwisler 	 */
4169973c98eSRoss Zwisler 	ret = dax_map_atomic(bdev, &dax);
4179973c98eSRoss Zwisler 	if (ret < 0)
4189973c98eSRoss Zwisler 		return ret;
4199973c98eSRoss Zwisler 
4209973c98eSRoss Zwisler 	if (WARN_ON_ONCE(ret < dax.size)) {
4219973c98eSRoss Zwisler 		ret = -EIO;
4229973c98eSRoss Zwisler 		goto unmap;
4239973c98eSRoss Zwisler 	}
4249973c98eSRoss Zwisler 
4259973c98eSRoss Zwisler 	wb_cache_pmem(dax.addr, dax.size);
4269973c98eSRoss Zwisler 
4279973c98eSRoss Zwisler 	spin_lock_irq(&mapping->tree_lock);
4289973c98eSRoss Zwisler 	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
4299973c98eSRoss Zwisler 	spin_unlock_irq(&mapping->tree_lock);
4309973c98eSRoss Zwisler  unmap:
4319973c98eSRoss Zwisler 	dax_unmap_atomic(bdev, &dax);
4329973c98eSRoss Zwisler 	return ret;
4339973c98eSRoss Zwisler 
4349973c98eSRoss Zwisler  unlock:
4359973c98eSRoss Zwisler 	spin_unlock_irq(&mapping->tree_lock);
4369973c98eSRoss Zwisler 	return ret;
4379973c98eSRoss Zwisler }
4389973c98eSRoss Zwisler 
4399973c98eSRoss Zwisler /*
4409973c98eSRoss Zwisler  * Flush the mapping to the persistent domain within the byte range of [start,
4419973c98eSRoss Zwisler  * end]. This is required by data integrity operations to ensure file data is
4429973c98eSRoss Zwisler  * on persistent storage prior to completion of the operation.
4439973c98eSRoss Zwisler  */
4447f6d5b52SRoss Zwisler int dax_writeback_mapping_range(struct address_space *mapping,
4457f6d5b52SRoss Zwisler 		struct block_device *bdev, struct writeback_control *wbc)
4469973c98eSRoss Zwisler {
4479973c98eSRoss Zwisler 	struct inode *inode = mapping->host;
4489973c98eSRoss Zwisler 	pgoff_t start_index, end_index, pmd_index;
4499973c98eSRoss Zwisler 	pgoff_t indices[PAGEVEC_SIZE];
4509973c98eSRoss Zwisler 	struct pagevec pvec;
4519973c98eSRoss Zwisler 	bool done = false;
4529973c98eSRoss Zwisler 	int i, ret = 0;
4539973c98eSRoss Zwisler 	void *entry;
4549973c98eSRoss Zwisler 
4559973c98eSRoss Zwisler 	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
4569973c98eSRoss Zwisler 		return -EIO;
4579973c98eSRoss Zwisler 
4587f6d5b52SRoss Zwisler 	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
4597f6d5b52SRoss Zwisler 		return 0;
4607f6d5b52SRoss Zwisler 
46109cbfeafSKirill A. Shutemov 	start_index = wbc->range_start >> PAGE_SHIFT;
46209cbfeafSKirill A. Shutemov 	end_index = wbc->range_end >> PAGE_SHIFT;
4639973c98eSRoss Zwisler 	pmd_index = DAX_PMD_INDEX(start_index);
4649973c98eSRoss Zwisler 
4659973c98eSRoss Zwisler 	rcu_read_lock();
4669973c98eSRoss Zwisler 	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
4679973c98eSRoss Zwisler 	rcu_read_unlock();
4689973c98eSRoss Zwisler 
4699973c98eSRoss Zwisler 	/* see if the start of our range is covered by a PMD entry */
4709973c98eSRoss Zwisler 	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
4719973c98eSRoss Zwisler 		start_index = pmd_index;
4729973c98eSRoss Zwisler 
4739973c98eSRoss Zwisler 	tag_pages_for_writeback(mapping, start_index, end_index);
4749973c98eSRoss Zwisler 
4759973c98eSRoss Zwisler 	pagevec_init(&pvec, 0);
4769973c98eSRoss Zwisler 	while (!done) {
4779973c98eSRoss Zwisler 		pvec.nr = find_get_entries_tag(mapping, start_index,
4789973c98eSRoss Zwisler 				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
4799973c98eSRoss Zwisler 				pvec.pages, indices);
4809973c98eSRoss Zwisler 
4819973c98eSRoss Zwisler 		if (pvec.nr == 0)
4829973c98eSRoss Zwisler 			break;
4839973c98eSRoss Zwisler 
4849973c98eSRoss Zwisler 		for (i = 0; i < pvec.nr; i++) {
4859973c98eSRoss Zwisler 			if (indices[i] > end_index) {
4869973c98eSRoss Zwisler 				done = true;
4879973c98eSRoss Zwisler 				break;
4889973c98eSRoss Zwisler 			}
4899973c98eSRoss Zwisler 
4909973c98eSRoss Zwisler 			ret = dax_writeback_one(bdev, mapping, indices[i],
4919973c98eSRoss Zwisler 					pvec.pages[i]);
4929973c98eSRoss Zwisler 			if (ret < 0)
4939973c98eSRoss Zwisler 				return ret;
4949973c98eSRoss Zwisler 		}
4959973c98eSRoss Zwisler 	}
4969973c98eSRoss Zwisler 	wmb_pmem();
4979973c98eSRoss Zwisler 	return 0;
4989973c98eSRoss Zwisler }
4999973c98eSRoss Zwisler EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
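
/*
 * A minimal sketch of how a filesystem might drive this from its
 * ->writepages method so that fsync()/msync() flush the radix tree
 * entries tagged above; myfs_writepages() is hypothetical.
 */
static int myfs_writepages(struct address_space *mapping,
		struct writeback_control *wbc)
{
	/*
	 * A real implementation falls back to its normal writepages
	 * path for non-DAX mappings; that branch is omitted here.
	 */
	if (!dax_mapping(mapping))
		return 0;

	return dax_writeback_mapping_range(mapping,
			mapping->host->i_sb->s_bdev, wbc);
}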
5009973c98eSRoss Zwisler 
501f7ca90b1SMatthew Wilcox static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
502f7ca90b1SMatthew Wilcox 			struct vm_area_struct *vma, struct vm_fault *vmf)
503f7ca90b1SMatthew Wilcox {
504f7ca90b1SMatthew Wilcox 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
505b2e0d162SDan Williams 	struct address_space *mapping = inode->i_mapping;
506b2e0d162SDan Williams 	struct block_device *bdev = bh->b_bdev;
507b2e0d162SDan Williams 	struct blk_dax_ctl dax = {
508b2e0d162SDan Williams 		.sector = to_sector(bh, inode),
509b2e0d162SDan Williams 		.size = bh->b_size,
510b2e0d162SDan Williams 	};
511f7ca90b1SMatthew Wilcox 	int error;
512f7ca90b1SMatthew Wilcox 
5130f90cc66SRoss Zwisler 	i_mmap_lock_read(mapping);
5140f90cc66SRoss Zwisler 
515b2e0d162SDan Williams 	if (dax_map_atomic(bdev, &dax) < 0) {
516b2e0d162SDan Williams 		error = PTR_ERR(dax.addr);
517f7ca90b1SMatthew Wilcox 		goto out;
518f7ca90b1SMatthew Wilcox 	}
519b2e0d162SDan Williams 	dax_unmap_atomic(bdev, &dax);
520f7ca90b1SMatthew Wilcox 
5219973c98eSRoss Zwisler 	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
5229973c98eSRoss Zwisler 			vmf->flags & FAULT_FLAG_WRITE);
5239973c98eSRoss Zwisler 	if (error)
5249973c98eSRoss Zwisler 		goto out;
5259973c98eSRoss Zwisler 
52601c8f1c4SDan Williams 	error = vm_insert_mixed(vma, vaddr, dax.pfn);
527f7ca90b1SMatthew Wilcox 
528f7ca90b1SMatthew Wilcox  out:
5290f90cc66SRoss Zwisler 	i_mmap_unlock_read(mapping);
5300f90cc66SRoss Zwisler 
531f7ca90b1SMatthew Wilcox 	return error;
532f7ca90b1SMatthew Wilcox }
533f7ca90b1SMatthew Wilcox 
534ce5c5d55SDave Chinner /**
535ce5c5d55SDave Chinner  * __dax_fault - handle a page fault on a DAX file
536ce5c5d55SDave Chinner  * @vma: The virtual memory area where the fault occurred
537ce5c5d55SDave Chinner  * @vmf: The description of the fault
538ce5c5d55SDave Chinner  * @get_block: The filesystem method used to translate file offsets to blocks
539ce5c5d55SDave Chinner  *
540ce5c5d55SDave Chinner  * When a page fault occurs, filesystems may call this helper in their
541ce5c5d55SDave Chinner  * fault handler for DAX files. __dax_fault() assumes the caller has done all
542ce5c5d55SDave Chinner  * the necessary locking for the page fault to proceed successfully.
543ce5c5d55SDave Chinner  */
544ce5c5d55SDave Chinner int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
54502fbd139SJan Kara 			get_block_t get_block)
546f7ca90b1SMatthew Wilcox {
547f7ca90b1SMatthew Wilcox 	struct file *file = vma->vm_file;
548f7ca90b1SMatthew Wilcox 	struct address_space *mapping = file->f_mapping;
549f7ca90b1SMatthew Wilcox 	struct inode *inode = mapping->host;
550f7ca90b1SMatthew Wilcox 	struct page *page;
551f7ca90b1SMatthew Wilcox 	struct buffer_head bh;
552f7ca90b1SMatthew Wilcox 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
553f7ca90b1SMatthew Wilcox 	unsigned blkbits = inode->i_blkbits;
554f7ca90b1SMatthew Wilcox 	sector_t block;
555f7ca90b1SMatthew Wilcox 	pgoff_t size;
556f7ca90b1SMatthew Wilcox 	int error;
557f7ca90b1SMatthew Wilcox 	int major = 0;
558f7ca90b1SMatthew Wilcox 
559f7ca90b1SMatthew Wilcox 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
560f7ca90b1SMatthew Wilcox 	if (vmf->pgoff >= size)
561f7ca90b1SMatthew Wilcox 		return VM_FAULT_SIGBUS;
562f7ca90b1SMatthew Wilcox 
563f7ca90b1SMatthew Wilcox 	memset(&bh, 0, sizeof(bh));
564f7ca90b1SMatthew Wilcox 	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
565eab95db6SRoss Zwisler 	bh.b_bdev = inode->i_sb->s_bdev;
566f7ca90b1SMatthew Wilcox 	bh.b_size = PAGE_SIZE;
567f7ca90b1SMatthew Wilcox 
568f7ca90b1SMatthew Wilcox  repeat:
569f7ca90b1SMatthew Wilcox 	page = find_get_page(mapping, vmf->pgoff);
570f7ca90b1SMatthew Wilcox 	if (page) {
571f7ca90b1SMatthew Wilcox 		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
57209cbfeafSKirill A. Shutemov 			put_page(page);
573f7ca90b1SMatthew Wilcox 			return VM_FAULT_RETRY;
574f7ca90b1SMatthew Wilcox 		}
575f7ca90b1SMatthew Wilcox 		if (unlikely(page->mapping != mapping)) {
576f7ca90b1SMatthew Wilcox 			unlock_page(page);
57709cbfeafSKirill A. Shutemov 			put_page(page);
578f7ca90b1SMatthew Wilcox 			goto repeat;
579f7ca90b1SMatthew Wilcox 		}
580f7ca90b1SMatthew Wilcox 	}
581f7ca90b1SMatthew Wilcox 
582f7ca90b1SMatthew Wilcox 	error = get_block(inode, block, &bh, 0);
583f7ca90b1SMatthew Wilcox 	if (!error && (bh.b_size < PAGE_SIZE))
584f7ca90b1SMatthew Wilcox 		error = -EIO;		/* fs corruption? */
585f7ca90b1SMatthew Wilcox 	if (error)
5860f90cc66SRoss Zwisler 		goto unlock_page;
587f7ca90b1SMatthew Wilcox 
588aef39ab1SJan Kara 	if (!buffer_mapped(&bh) && !vmf->cow_page) {
589f7ca90b1SMatthew Wilcox 		if (vmf->flags & FAULT_FLAG_WRITE) {
590f7ca90b1SMatthew Wilcox 			error = get_block(inode, block, &bh, 1);
591f7ca90b1SMatthew Wilcox 			count_vm_event(PGMAJFAULT);
592f7ca90b1SMatthew Wilcox 			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
593f7ca90b1SMatthew Wilcox 			major = VM_FAULT_MAJOR;
594f7ca90b1SMatthew Wilcox 			if (!error && (bh.b_size < PAGE_SIZE))
595f7ca90b1SMatthew Wilcox 				error = -EIO;
596f7ca90b1SMatthew Wilcox 			if (error)
5970f90cc66SRoss Zwisler 				goto unlock_page;
598f7ca90b1SMatthew Wilcox 		} else {
599f7ca90b1SMatthew Wilcox 			return dax_load_hole(mapping, page, vmf);
600f7ca90b1SMatthew Wilcox 		}
601f7ca90b1SMatthew Wilcox 	}
602f7ca90b1SMatthew Wilcox 
603f7ca90b1SMatthew Wilcox 	if (vmf->cow_page) {
604f7ca90b1SMatthew Wilcox 		struct page *new_page = vmf->cow_page;
605f7ca90b1SMatthew Wilcox 		if (buffer_written(&bh))
606b2e0d162SDan Williams 			error = copy_user_bh(new_page, inode, &bh, vaddr);
607f7ca90b1SMatthew Wilcox 		else
608f7ca90b1SMatthew Wilcox 			clear_user_highpage(new_page, vaddr);
609f7ca90b1SMatthew Wilcox 		if (error)
6100f90cc66SRoss Zwisler 			goto unlock_page;
611f7ca90b1SMatthew Wilcox 		vmf->page = page;
6127795bec8SJan Kara 		if (!page)
6130f90cc66SRoss Zwisler 			i_mmap_lock_read(mapping);
614f7ca90b1SMatthew Wilcox 		return VM_FAULT_LOCKED;
615f7ca90b1SMatthew Wilcox 	}
616f7ca90b1SMatthew Wilcox 
617f7ca90b1SMatthew Wilcox 	/* Check we didn't race with a read fault installing a new page */
618f7ca90b1SMatthew Wilcox 	if (!page && major)
619f7ca90b1SMatthew Wilcox 		page = find_lock_page(mapping, vmf->pgoff);
620f7ca90b1SMatthew Wilcox 
621f7ca90b1SMatthew Wilcox 	if (page) {
622f7ca90b1SMatthew Wilcox 		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
62309cbfeafSKirill A. Shutemov 							PAGE_SIZE, 0);
624f7ca90b1SMatthew Wilcox 		delete_from_page_cache(page);
625f7ca90b1SMatthew Wilcox 		unlock_page(page);
62609cbfeafSKirill A. Shutemov 		put_page(page);
6279973c98eSRoss Zwisler 		page = NULL;
628f7ca90b1SMatthew Wilcox 	}
629f7ca90b1SMatthew Wilcox 
63002fbd139SJan Kara 	/* Filesystem should not return unwritten buffers to us! */
6312b10945cSJan Kara 	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
632f7ca90b1SMatthew Wilcox 	error = dax_insert_mapping(inode, &bh, vma, vmf);
633f7ca90b1SMatthew Wilcox 
634f7ca90b1SMatthew Wilcox  out:
635f7ca90b1SMatthew Wilcox 	if (error == -ENOMEM)
636f7ca90b1SMatthew Wilcox 		return VM_FAULT_OOM | major;
637f7ca90b1SMatthew Wilcox 	/* -EBUSY is fine, somebody else faulted on the same PTE */
638f7ca90b1SMatthew Wilcox 	if ((error < 0) && (error != -EBUSY))
639f7ca90b1SMatthew Wilcox 		return VM_FAULT_SIGBUS | major;
640f7ca90b1SMatthew Wilcox 	return VM_FAULT_NOPAGE | major;
641f7ca90b1SMatthew Wilcox 
6420f90cc66SRoss Zwisler  unlock_page:
643f7ca90b1SMatthew Wilcox 	if (page) {
644f7ca90b1SMatthew Wilcox 		unlock_page(page);
64509cbfeafSKirill A. Shutemov 		put_page(page);
646f7ca90b1SMatthew Wilcox 	}
647f7ca90b1SMatthew Wilcox 	goto out;
648f7ca90b1SMatthew Wilcox }
649ce5c5d55SDave Chinner EXPORT_SYMBOL(__dax_fault);
650f7ca90b1SMatthew Wilcox 
651f7ca90b1SMatthew Wilcox /**
652f7ca90b1SMatthew Wilcox  * dax_fault - handle a page fault on a DAX file
653f7ca90b1SMatthew Wilcox  * @vma: The virtual memory area where the fault occurred
654f7ca90b1SMatthew Wilcox  * @vmf: The description of the fault
655f7ca90b1SMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
656f7ca90b1SMatthew Wilcox  *
657f7ca90b1SMatthew Wilcox  * When a page fault occurs, filesystems may call this helper in their
658f7ca90b1SMatthew Wilcox  * fault handler for DAX files.
659f7ca90b1SMatthew Wilcox  */
660f7ca90b1SMatthew Wilcox int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
66102fbd139SJan Kara 	      get_block_t get_block)
662f7ca90b1SMatthew Wilcox {
663f7ca90b1SMatthew Wilcox 	int result;
664f7ca90b1SMatthew Wilcox 	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
665f7ca90b1SMatthew Wilcox 
666f7ca90b1SMatthew Wilcox 	if (vmf->flags & FAULT_FLAG_WRITE) {
667f7ca90b1SMatthew Wilcox 		sb_start_pagefault(sb);
668f7ca90b1SMatthew Wilcox 		file_update_time(vma->vm_file);
669f7ca90b1SMatthew Wilcox 	}
67002fbd139SJan Kara 	result = __dax_fault(vma, vmf, get_block);
671f7ca90b1SMatthew Wilcox 	if (vmf->flags & FAULT_FLAG_WRITE)
672f7ca90b1SMatthew Wilcox 		sb_end_pagefault(sb);
673f7ca90b1SMatthew Wilcox 
674f7ca90b1SMatthew Wilcox 	return result;
675f7ca90b1SMatthew Wilcox }
676f7ca90b1SMatthew Wilcox EXPORT_SYMBOL_GPL(dax_fault);
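
/*
 * A minimal sketch of wiring the fault helpers into vm_operations;
 * myfs_dax_fault() and myfs_dax_vm_ops are hypothetical, and a real
 * filesystem typically takes its own locks around the call (ext4, for
 * instance, also points .page_mkwrite at its DAX fault handler).
 */
static int myfs_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return dax_fault(vma, vmf, myfs_get_block);
}

static const struct vm_operations_struct myfs_dax_vm_ops = {
	.fault		= myfs_dax_fault,
	.page_mkwrite	= myfs_dax_fault,	/* first write to a mapped page */
	.pfn_mkwrite	= dax_pfn_mkwrite,	/* later writes to a clean pfn */
};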
6774c0ccfefSMatthew Wilcox 
678844f35dbSMatthew Wilcox #ifdef CONFIG_TRANSPARENT_HUGEPAGE
679844f35dbSMatthew Wilcox /*
680844f35dbSMatthew Wilcox  * The 'colour' (i.e. the low bits) of a page offset within a PMD.  This comes
681844f35dbSMatthew Wilcox  * up more often than one might expect in the function below.
682844f35dbSMatthew Wilcox  */
683844f35dbSMatthew Wilcox #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
684844f35dbSMatthew Wilcox 
685cbb38e41SDan Williams static void __dax_dbg(struct buffer_head *bh, unsigned long address,
686cbb38e41SDan Williams 		const char *reason, const char *fn)
687cbb38e41SDan Williams {
688cbb38e41SDan Williams 	if (bh) {
689cbb38e41SDan Williams 		char bname[BDEVNAME_SIZE];
690cbb38e41SDan Williams 		bdevname(bh->b_bdev, bname);
691cbb38e41SDan Williams 		pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
692cbb38e41SDan Williams 			"length %zd fallback: %s\n", fn, current->comm,
693cbb38e41SDan Williams 			address, bname, bh->b_state, (u64)bh->b_blocknr,
694cbb38e41SDan Williams 			bh->b_size, reason);
695cbb38e41SDan Williams 	} else {
696cbb38e41SDan Williams 		pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
697cbb38e41SDan Williams 			current->comm, address, reason);
698cbb38e41SDan Williams 	}
699cbb38e41SDan Williams }
700cbb38e41SDan Williams 
701cbb38e41SDan Williams #define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")
702cbb38e41SDan Williams 
703844f35dbSMatthew Wilcox int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
70402fbd139SJan Kara 		pmd_t *pmd, unsigned int flags, get_block_t get_block)
705844f35dbSMatthew Wilcox {
706844f35dbSMatthew Wilcox 	struct file *file = vma->vm_file;
707844f35dbSMatthew Wilcox 	struct address_space *mapping = file->f_mapping;
708844f35dbSMatthew Wilcox 	struct inode *inode = mapping->host;
709844f35dbSMatthew Wilcox 	struct buffer_head bh;
710844f35dbSMatthew Wilcox 	unsigned blkbits = inode->i_blkbits;
711844f35dbSMatthew Wilcox 	unsigned long pmd_addr = address & PMD_MASK;
712844f35dbSMatthew Wilcox 	bool write = flags & FAULT_FLAG_WRITE;
713b2e0d162SDan Williams 	struct block_device *bdev;
714844f35dbSMatthew Wilcox 	pgoff_t size, pgoff;
715b2e0d162SDan Williams 	sector_t block;
7169973c98eSRoss Zwisler 	int error, result = 0;
7179973c98eSRoss Zwisler 	bool alloc = false;
718844f35dbSMatthew Wilcox 
719c046c321SDan Williams 	/* dax pmd mappings require pfn_t_devmap() */
720ee82c9edSDan Williams 	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
721ee82c9edSDan Williams 		return VM_FAULT_FALLBACK;
722ee82c9edSDan Williams 
723844f35dbSMatthew Wilcox 	/* Fall back to PTEs if we're going to COW */
72459bf4fb9SToshi Kani 	if (write && !(vma->vm_flags & VM_SHARED)) {
72559bf4fb9SToshi Kani 		split_huge_pmd(vma, pmd, address);
726cbb38e41SDan Williams 		dax_pmd_dbg(NULL, address, "cow write");
727844f35dbSMatthew Wilcox 		return VM_FAULT_FALLBACK;
72859bf4fb9SToshi Kani 	}
729844f35dbSMatthew Wilcox 	/* If the PMD would extend outside the VMA */
730cbb38e41SDan Williams 	if (pmd_addr < vma->vm_start) {
731cbb38e41SDan Williams 		dax_pmd_dbg(NULL, address, "vma start unaligned");
732844f35dbSMatthew Wilcox 		return VM_FAULT_FALLBACK;
733cbb38e41SDan Williams 	}
734cbb38e41SDan Williams 	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
735cbb38e41SDan Williams 		dax_pmd_dbg(NULL, address, "vma end unaligned");
736844f35dbSMatthew Wilcox 		return VM_FAULT_FALLBACK;
737cbb38e41SDan Williams 	}
738844f35dbSMatthew Wilcox 
7393fdd1b47SMatthew Wilcox 	pgoff = linear_page_index(vma, pmd_addr);
740844f35dbSMatthew Wilcox 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
741844f35dbSMatthew Wilcox 	if (pgoff >= size)
742844f35dbSMatthew Wilcox 		return VM_FAULT_SIGBUS;
743844f35dbSMatthew Wilcox 	/* If the PMD would cover blocks out of the file */
744cbb38e41SDan Williams 	if ((pgoff | PG_PMD_COLOUR) >= size) {
745cbb38e41SDan Williams 		dax_pmd_dbg(NULL, address,
746cbb38e41SDan Williams 				"offset + huge page size > file size");
747844f35dbSMatthew Wilcox 		return VM_FAULT_FALLBACK;
748cbb38e41SDan Williams 	}
749844f35dbSMatthew Wilcox 
750844f35dbSMatthew Wilcox 	memset(&bh, 0, sizeof(bh));
751d4bbe706SRoss Zwisler 	bh.b_bdev = inode->i_sb->s_bdev;
752844f35dbSMatthew Wilcox 	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
753844f35dbSMatthew Wilcox 
754844f35dbSMatthew Wilcox 	bh.b_size = PMD_SIZE;
7559973c98eSRoss Zwisler 
7569973c98eSRoss Zwisler 	if (get_block(inode, block, &bh, 0) != 0)
757844f35dbSMatthew Wilcox 		return VM_FAULT_SIGBUS;
7589973c98eSRoss Zwisler 
7599973c98eSRoss Zwisler 	if (!buffer_mapped(&bh) && write) {
7609973c98eSRoss Zwisler 		if (get_block(inode, block, &bh, 1) != 0)
7619973c98eSRoss Zwisler 			return VM_FAULT_SIGBUS;
7629973c98eSRoss Zwisler 		alloc = true;
7632b10945cSJan Kara 		WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
7649973c98eSRoss Zwisler 	}
7659973c98eSRoss Zwisler 
766b2e0d162SDan Williams 	bdev = bh.b_bdev;
767844f35dbSMatthew Wilcox 
768844f35dbSMatthew Wilcox 	/*
769844f35dbSMatthew Wilcox 	 * If the filesystem isn't willing to tell us the length of a hole,
770844f35dbSMatthew Wilcox 	 * just fall back to PTEs.  Calling get_block 512 times in a loop
771844f35dbSMatthew Wilcox 	 * would be silly.
772844f35dbSMatthew Wilcox 	 */
773cbb38e41SDan Williams 	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
774cbb38e41SDan Williams 		dax_pmd_dbg(&bh, address, "allocated block too small");
7759973c98eSRoss Zwisler 		return VM_FAULT_FALLBACK;
776cbb38e41SDan Williams 	}
777844f35dbSMatthew Wilcox 
7789973c98eSRoss Zwisler 	/*
7799973c98eSRoss Zwisler 	 * If we allocated new storage, make sure no process has any
7809973c98eSRoss Zwisler 	 * zero pages covering this hole
7819973c98eSRoss Zwisler 	 */
7829973c98eSRoss Zwisler 	if (alloc) {
7839973c98eSRoss Zwisler 		loff_t lstart = pgoff << PAGE_SHIFT;
7849973c98eSRoss Zwisler 		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
7859973c98eSRoss Zwisler 
7869973c98eSRoss Zwisler 		truncate_pagecache_range(inode, lstart, lend);
7879973c98eSRoss Zwisler 	}
7889973c98eSRoss Zwisler 
7890f90cc66SRoss Zwisler 	i_mmap_lock_read(mapping);
79046c043edSKirill A. Shutemov 
791844f35dbSMatthew Wilcox 	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
792844f35dbSMatthew Wilcox 		spinlock_t *ptl;
793d295e341SKirill A. Shutemov 		pmd_t entry;
794844f35dbSMatthew Wilcox 		struct page *zero_page = get_huge_zero_page();
795d295e341SKirill A. Shutemov 
796cbb38e41SDan Williams 		if (unlikely(!zero_page)) {
797cbb38e41SDan Williams 			dax_pmd_dbg(&bh, address, "no zero page");
798844f35dbSMatthew Wilcox 			goto fallback;
799cbb38e41SDan Williams 		}
800844f35dbSMatthew Wilcox 
801d295e341SKirill A. Shutemov 		ptl = pmd_lock(vma->vm_mm, pmd);
802d295e341SKirill A. Shutemov 		if (!pmd_none(*pmd)) {
803844f35dbSMatthew Wilcox 			spin_unlock(ptl);
804cbb38e41SDan Williams 			dax_pmd_dbg(&bh, address, "pmd already present");
805d295e341SKirill A. Shutemov 			goto fallback;
806d295e341SKirill A. Shutemov 		}
807d295e341SKirill A. Shutemov 
808cbb38e41SDan Williams 		dev_dbg(part_to_dev(bdev->bd_part),
809cbb38e41SDan Williams 				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
810cbb38e41SDan Williams 				__func__, current->comm, address,
811cbb38e41SDan Williams 				(unsigned long long) to_sector(&bh, inode));
812cbb38e41SDan Williams 
813d295e341SKirill A. Shutemov 		entry = mk_pmd(zero_page, vma->vm_page_prot);
814d295e341SKirill A. Shutemov 		entry = pmd_mkhuge(entry);
815d295e341SKirill A. Shutemov 		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
816844f35dbSMatthew Wilcox 		result = VM_FAULT_NOPAGE;
817d295e341SKirill A. Shutemov 		spin_unlock(ptl);
818844f35dbSMatthew Wilcox 	} else {
819b2e0d162SDan Williams 		struct blk_dax_ctl dax = {
820b2e0d162SDan Williams 			.sector = to_sector(&bh, inode),
821b2e0d162SDan Williams 			.size = PMD_SIZE,
822b2e0d162SDan Williams 		};
823b2e0d162SDan Williams 		long length = dax_map_atomic(bdev, &dax);
824b2e0d162SDan Williams 
825844f35dbSMatthew Wilcox 		if (length < 0) {
8268b3db979SDan Williams 			dax_pmd_dbg(&bh, address, "dax-error fallback");
8278b3db979SDan Williams 			goto fallback;
828844f35dbSMatthew Wilcox 		}
829cbb38e41SDan Williams 		if (length < PMD_SIZE) {
830cbb38e41SDan Williams 			dax_pmd_dbg(&bh, address, "dax-length too small");
831cbb38e41SDan Williams 			dax_unmap_atomic(bdev, &dax);
832cbb38e41SDan Williams 			goto fallback;
833cbb38e41SDan Williams 		}
834cbb38e41SDan Williams 		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
835cbb38e41SDan Williams 			dax_pmd_dbg(&bh, address, "pfn unaligned");
836b2e0d162SDan Williams 			dax_unmap_atomic(bdev, &dax);
837844f35dbSMatthew Wilcox 			goto fallback;
838b2e0d162SDan Williams 		}
839844f35dbSMatthew Wilcox 
840c046c321SDan Williams 		if (!pfn_t_devmap(dax.pfn)) {
841b2e0d162SDan Williams 			dax_unmap_atomic(bdev, &dax);
842cbb38e41SDan Williams 			dax_pmd_dbg(&bh, address, "pfn not in memmap");
843152d7bd8SDan Williams 			goto fallback;
844b2e0d162SDan Williams 		}
845b2e0d162SDan Williams 		dax_unmap_atomic(bdev, &dax);
8460f90cc66SRoss Zwisler 
8479973c98eSRoss Zwisler 		/*
8489973c98eSRoss Zwisler 		 * For PTE faults we insert a radix tree entry for reads, and
8499973c98eSRoss Zwisler 		 * leave it clean.  Then on the first write we dirty the radix
8509973c98eSRoss Zwisler 		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
8519973c98eSRoss Zwisler 		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
8529973c98eSRoss Zwisler 		 * call into get_block() to translate the pgoff to a sector in
8539973c98eSRoss Zwisler 		 * order to be able to create a new radix tree entry.
8549973c98eSRoss Zwisler 		 *
8559973c98eSRoss Zwisler 		 * The PMD path doesn't have an equivalent to
8569973c98eSRoss Zwisler 		 * dax_pfn_mkwrite(), though, so for a read followed by a
8579973c98eSRoss Zwisler 		 * write we traverse all the way through __dax_pmd_fault()
8589973c98eSRoss Zwisler 		 * twice.  This means we can just skip inserting a radix tree
8599973c98eSRoss Zwisler 		 * entry completely on the initial read and just wait until
8609973c98eSRoss Zwisler 		 * the write to insert a dirty entry.
8619973c98eSRoss Zwisler 		 */
8629973c98eSRoss Zwisler 		if (write) {
8639973c98eSRoss Zwisler 			error = dax_radix_entry(mapping, pgoff, dax.sector,
8649973c98eSRoss Zwisler 					true, true);
8659973c98eSRoss Zwisler 			if (error) {
8669973c98eSRoss Zwisler 				dax_pmd_dbg(&bh, address,
8679973c98eSRoss Zwisler 						"PMD radix insertion failed");
8689973c98eSRoss Zwisler 				goto fallback;
8699973c98eSRoss Zwisler 			}
8709973c98eSRoss Zwisler 		}
8719973c98eSRoss Zwisler 
872cbb38e41SDan Williams 		dev_dbg(part_to_dev(bdev->bd_part),
873cbb38e41SDan Williams 				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
874cbb38e41SDan Williams 				__func__, current->comm, address,
875cbb38e41SDan Williams 				pfn_t_to_pfn(dax.pfn),
876cbb38e41SDan Williams 				(unsigned long long) dax.sector);
87734c0fd54SDan Williams 		result |= vmf_insert_pfn_pmd(vma, address, pmd,
878f25748e3SDan Williams 				dax.pfn, write);
879844f35dbSMatthew Wilcox 	}
880844f35dbSMatthew Wilcox 
881844f35dbSMatthew Wilcox  out:
8820f90cc66SRoss Zwisler 	i_mmap_unlock_read(mapping);
8830f90cc66SRoss Zwisler 
884844f35dbSMatthew Wilcox 	return result;
885844f35dbSMatthew Wilcox 
886844f35dbSMatthew Wilcox  fallback:
887844f35dbSMatthew Wilcox 	count_vm_event(THP_FAULT_FALLBACK);
888844f35dbSMatthew Wilcox 	result = VM_FAULT_FALLBACK;
889844f35dbSMatthew Wilcox 	goto out;
890844f35dbSMatthew Wilcox }
891844f35dbSMatthew Wilcox EXPORT_SYMBOL_GPL(__dax_pmd_fault);
892844f35dbSMatthew Wilcox 
893844f35dbSMatthew Wilcox /**
894844f35dbSMatthew Wilcox  * dax_pmd_fault - handle a PMD fault on a DAX file
895844f35dbSMatthew Wilcox  * @vma: The virtual memory area where the fault occurred
896844f35dbSMatthew Wilcox  * @vmf: The description of the fault
897844f35dbSMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
898844f35dbSMatthew Wilcox  *
899844f35dbSMatthew Wilcox  * When a page fault occurs, filesystems may call this helper in their
900844f35dbSMatthew Wilcox  * pmd_fault handler for DAX files.
901844f35dbSMatthew Wilcox  */
902844f35dbSMatthew Wilcox int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
90302fbd139SJan Kara 			pmd_t *pmd, unsigned int flags, get_block_t get_block)
904844f35dbSMatthew Wilcox {
905844f35dbSMatthew Wilcox 	int result;
906844f35dbSMatthew Wilcox 	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
907844f35dbSMatthew Wilcox 
908844f35dbSMatthew Wilcox 	if (flags & FAULT_FLAG_WRITE) {
909844f35dbSMatthew Wilcox 		sb_start_pagefault(sb);
910844f35dbSMatthew Wilcox 		file_update_time(vma->vm_file);
911844f35dbSMatthew Wilcox 	}
91202fbd139SJan Kara 	result = __dax_pmd_fault(vma, address, pmd, flags, get_block);
913844f35dbSMatthew Wilcox 	if (flags & FAULT_FLAG_WRITE)
914844f35dbSMatthew Wilcox 		sb_end_pagefault(sb);
915844f35dbSMatthew Wilcox 
916844f35dbSMatthew Wilcox 	return result;
917844f35dbSMatthew Wilcox }
918844f35dbSMatthew Wilcox EXPORT_SYMBOL_GPL(dax_pmd_fault);
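
/*
 * A minimal sketch of the matching ->pmd_fault handler;
 * myfs_dax_pmd_fault() is hypothetical.
 */
static int myfs_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, unsigned int flags)
{
	return dax_pmd_fault(vma, addr, pmd, flags, myfs_get_block);
}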
919dd8a2b6cSValentin Rothberg #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
920844f35dbSMatthew Wilcox 
9214c0ccfefSMatthew Wilcox /**
9220e3b210cSBoaz Harrosh  * dax_pfn_mkwrite - handle first write to DAX page
9230e3b210cSBoaz Harrosh  * @vma: The virtual memory area where the fault occurred
9240e3b210cSBoaz Harrosh  * @vmf: The description of the fault
9250e3b210cSBoaz Harrosh  */
9260e3b210cSBoaz Harrosh int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
9270e3b210cSBoaz Harrosh {
9289973c98eSRoss Zwisler 	struct file *file = vma->vm_file;
92930f471fdSRoss Zwisler 	int error;
9300e3b210cSBoaz Harrosh 
9319973c98eSRoss Zwisler 	/*
9329973c98eSRoss Zwisler 	 * We pass NO_SECTOR to dax_radix_entry() because we expect that a
9339973c98eSRoss Zwisler 	 * RADIX_DAX_PTE entry already exists in the radix tree from a
9349973c98eSRoss Zwisler 	 * previous call to __dax_fault().  We just want to look up that PTE
9359973c98eSRoss Zwisler 	 * entry using vmf->pgoff and make sure the dirty tag is set.  This
9369973c98eSRoss Zwisler 	 * saves us from having to make a call to get_block() here to look
9379973c98eSRoss Zwisler 	 * up the sector.
9389973c98eSRoss Zwisler 	 */
93930f471fdSRoss Zwisler 	error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
94030f471fdSRoss Zwisler 			true);
94130f471fdSRoss Zwisler 
94230f471fdSRoss Zwisler 	if (error == -ENOMEM)
94330f471fdSRoss Zwisler 		return VM_FAULT_OOM;
94430f471fdSRoss Zwisler 	if (error)
94530f471fdSRoss Zwisler 		return VM_FAULT_SIGBUS;
9460e3b210cSBoaz Harrosh 	return VM_FAULT_NOPAGE;
9470e3b210cSBoaz Harrosh }
9480e3b210cSBoaz Harrosh EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
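
/*
 * A minimal sketch of an ->mmap method installing DAX-aware vm_ops
 * (such as the hypothetical myfs_dax_vm_ops above).  VM_MIXEDMAP is
 * needed because DAX installs raw pfn mappings with vm_insert_mixed().
 */
static int myfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (!IS_DAX(file_inode(file)))
		return generic_file_mmap(file, vma);

	file_accessed(file);
	vma->vm_ops = &myfs_dax_vm_ops;
	vma->vm_flags |= VM_MIXEDMAP;
	return 0;
}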
9490e3b210cSBoaz Harrosh 
950679c8bd3SChristoph Hellwig int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
951679c8bd3SChristoph Hellwig 		unsigned int offset, unsigned int length)
952679c8bd3SChristoph Hellwig {
953679c8bd3SChristoph Hellwig 	struct blk_dax_ctl dax = {
954679c8bd3SChristoph Hellwig 		.sector		= sector,
955679c8bd3SChristoph Hellwig 		.size		= PAGE_SIZE,
956679c8bd3SChristoph Hellwig 	};
957679c8bd3SChristoph Hellwig 
958679c8bd3SChristoph Hellwig 	if (dax_map_atomic(bdev, &dax) < 0)
959679c8bd3SChristoph Hellwig 		return PTR_ERR(dax.addr);
960679c8bd3SChristoph Hellwig 	clear_pmem(dax.addr + offset, length);
961679c8bd3SChristoph Hellwig 	wmb_pmem();
962679c8bd3SChristoph Hellwig 	dax_unmap_atomic(bdev, &dax);
963679c8bd3SChristoph Hellwig 	return 0;
964679c8bd3SChristoph Hellwig }
965679c8bd3SChristoph Hellwig EXPORT_SYMBOL_GPL(__dax_zero_page_range);
966679c8bd3SChristoph Hellwig 
9670e3b210cSBoaz Harrosh /**
96825726bc1SMatthew Wilcox  * dax_zero_page_range - zero a range within a page of a DAX file
9694c0ccfefSMatthew Wilcox  * @inode: The file being truncated
9704c0ccfefSMatthew Wilcox  * @from: The file offset that is being truncated to
97125726bc1SMatthew Wilcox  * @length: The number of bytes to zero
9724c0ccfefSMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
9734c0ccfefSMatthew Wilcox  *
97425726bc1SMatthew Wilcox  * This function can be called by a filesystem when it is zeroing part of a
97525726bc1SMatthew Wilcox  * page in a DAX file.  This is intended for hole-punch operations.  If
97625726bc1SMatthew Wilcox  * you are truncating a file, the helper function dax_truncate_page() may be
97725726bc1SMatthew Wilcox  * more convenient.
9784c0ccfefSMatthew Wilcox  *
979ea1754a0SKirill A. Shutemov  * We work in terms of PAGE_SIZE here for commonality with
9804c0ccfefSMatthew Wilcox  * block_truncate_page(), but we could go down to the filesystem block size if it
9814c0ccfefSMatthew Wilcox  * took care of disposing of the unnecessary blocks.  Even if the filesystem
9824c0ccfefSMatthew Wilcox  * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
98325726bc1SMatthew Wilcox  * since the file might be mmapped.
9844c0ccfefSMatthew Wilcox  */
98525726bc1SMatthew Wilcox int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
98625726bc1SMatthew Wilcox 							get_block_t get_block)
9874c0ccfefSMatthew Wilcox {
9884c0ccfefSMatthew Wilcox 	struct buffer_head bh;
98909cbfeafSKirill A. Shutemov 	pgoff_t index = from >> PAGE_SHIFT;
99009cbfeafSKirill A. Shutemov 	unsigned offset = from & (PAGE_SIZE-1);
9914c0ccfefSMatthew Wilcox 	int err;
9924c0ccfefSMatthew Wilcox 
9934c0ccfefSMatthew Wilcox 	/* Block boundary? Nothing to do */
9944c0ccfefSMatthew Wilcox 	if (!length)
9954c0ccfefSMatthew Wilcox 		return 0;
99609cbfeafSKirill A. Shutemov 	BUG_ON((offset + length) > PAGE_SIZE);
9974c0ccfefSMatthew Wilcox 
9984c0ccfefSMatthew Wilcox 	memset(&bh, 0, sizeof(bh));
999eab95db6SRoss Zwisler 	bh.b_bdev = inode->i_sb->s_bdev;
100009cbfeafSKirill A. Shutemov 	bh.b_size = PAGE_SIZE;
10014c0ccfefSMatthew Wilcox 	err = get_block(inode, index, &bh, 0);
1002679c8bd3SChristoph Hellwig 	if (err < 0 || !buffer_written(&bh))
10034c0ccfefSMatthew Wilcox 		return err;
1004b2e0d162SDan Williams 
1005679c8bd3SChristoph Hellwig 	return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
1006679c8bd3SChristoph Hellwig 			offset, length);
10074c0ccfefSMatthew Wilcox }
100825726bc1SMatthew Wilcox EXPORT_SYMBOL_GPL(dax_zero_page_range);
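
/*
 * A minimal sketch of zeroing the partial pages at each end of a
 * punched hole; myfs_punch_hole_zero() is hypothetical, and the
 * unmapping and block-freeing steps of a real implementation are
 * omitted.
 */
static int myfs_punch_hole_zero(struct inode *inode, loff_t offset, loff_t len)
{
	loff_t end = offset + len;
	unsigned partial_start = offset & (PAGE_SIZE - 1);
	unsigned partial_end = end & (PAGE_SIZE - 1);
	int error = 0;

	/* the entire range lives in one page: a single call suffices */
	if ((offset >> PAGE_SHIFT) == ((end - 1) >> PAGE_SHIFT))
		return dax_zero_page_range(inode, offset, len, myfs_get_block);

	if (partial_start)
		error = dax_zero_page_range(inode, offset,
				PAGE_SIZE - partial_start, myfs_get_block);
	if (!error && partial_end)
		error = dax_zero_page_range(inode, end - partial_end,
				partial_end, myfs_get_block);
	return error;
}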
100925726bc1SMatthew Wilcox 
101025726bc1SMatthew Wilcox /**
101125726bc1SMatthew Wilcox  * dax_truncate_page - handle a partial page being truncated in a DAX file
101225726bc1SMatthew Wilcox  * @inode: The file being truncated
101325726bc1SMatthew Wilcox  * @from: The file offset that is being truncated to
101425726bc1SMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
101525726bc1SMatthew Wilcox  *
101625726bc1SMatthew Wilcox  * Similar to block_truncate_page(), this function can be called by a
101725726bc1SMatthew Wilcox  * filesystem when it is truncating a DAX file to handle the partial page.
101825726bc1SMatthew Wilcox  *
1019ea1754a0SKirill A. Shutemov  * We work in terms of PAGE_SIZE here for commonality with
102025726bc1SMatthew Wilcox  * block_truncate_page(), but we could go down to the filesystem block size if it
102125726bc1SMatthew Wilcox  * took care of disposing of the unnecessary blocks.  Even if the filesystem
102225726bc1SMatthew Wilcox  * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
102325726bc1SMatthew Wilcox  * since the file might be mmapped.
102425726bc1SMatthew Wilcox  */
102525726bc1SMatthew Wilcox int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
102625726bc1SMatthew Wilcox {
102709cbfeafSKirill A. Shutemov 	unsigned length = PAGE_ALIGN(from) - from;
102825726bc1SMatthew Wilcox 	return dax_zero_page_range(inode, from, length, get_block);
102925726bc1SMatthew Wilcox }
10304c0ccfefSMatthew Wilcox EXPORT_SYMBOL_GPL(dax_truncate_page);
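
/*
 * A minimal sketch of a truncate path using the helper above;
 * myfs_setsize() is hypothetical, and a real filesystem also trims its
 * block mappings and updates timestamps after this point.
 */
static int myfs_setsize(struct inode *inode, loff_t newsize)
{
	int error;

	if (IS_DAX(inode))
		error = dax_truncate_page(inode, newsize, myfs_get_block);
	else
		error = block_truncate_page(inode->i_mapping, newsize,
				myfs_get_block);
	if (error)
		return error;

	truncate_setsize(inode, newsize);
	return 0;
}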
1031