xref: /openbmc/linux/fs/dax.c (revision ac401cc7)
1d475c634SMatthew Wilcox /*
2d475c634SMatthew Wilcox  * fs/dax.c - Direct Access filesystem code
3d475c634SMatthew Wilcox  * Copyright (c) 2013-2014 Intel Corporation
4d475c634SMatthew Wilcox  * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
5d475c634SMatthew Wilcox  * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
6d475c634SMatthew Wilcox  *
7d475c634SMatthew Wilcox  * This program is free software; you can redistribute it and/or modify it
8d475c634SMatthew Wilcox  * under the terms and conditions of the GNU General Public License,
9d475c634SMatthew Wilcox  * version 2, as published by the Free Software Foundation.
10d475c634SMatthew Wilcox  *
11d475c634SMatthew Wilcox  * This program is distributed in the hope it will be useful, but WITHOUT
12d475c634SMatthew Wilcox  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13d475c634SMatthew Wilcox  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
14d475c634SMatthew Wilcox  * more details.
15d475c634SMatthew Wilcox  */
16d475c634SMatthew Wilcox 
17d475c634SMatthew Wilcox #include <linux/atomic.h>
18d475c634SMatthew Wilcox #include <linux/blkdev.h>
19d475c634SMatthew Wilcox #include <linux/buffer_head.h>
20d77e92e2SRoss Zwisler #include <linux/dax.h>
21d475c634SMatthew Wilcox #include <linux/fs.h>
22d475c634SMatthew Wilcox #include <linux/genhd.h>
23f7ca90b1SMatthew Wilcox #include <linux/highmem.h>
24f7ca90b1SMatthew Wilcox #include <linux/memcontrol.h>
25f7ca90b1SMatthew Wilcox #include <linux/mm.h>
26d475c634SMatthew Wilcox #include <linux/mutex.h>
279973c98eSRoss Zwisler #include <linux/pagevec.h>
282765cfbbSRoss Zwisler #include <linux/pmem.h>
29289c6aedSMatthew Wilcox #include <linux/sched.h>
30d475c634SMatthew Wilcox #include <linux/uio.h>
31f7ca90b1SMatthew Wilcox #include <linux/vmstat.h>
3234c0fd54SDan Williams #include <linux/pfn_t.h>
330e749e54SDan Williams #include <linux/sizes.h>
34d475c634SMatthew Wilcox 
/*
 * Encoding of a DAX exceptional radix tree entry.
 *
 * We use lowest available bit in exceptional entry for locking, other two
 * bits to determine entry type. In total 3 special bits, so the sector
 * number is stored starting at RADIX_DAX_SHIFT.
 *
 * All macro arguments are parenthesized so that compound expressions
 * (e.g. "a | b" or "x ? y : z") are evaluated as a unit.
 */
#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
/* Extract the type bits (RADIX_DAX_PTE or RADIX_DAX_PMD) of an entry. */
#define RADIX_DAX_TYPE(entry) ((unsigned long)(entry) & RADIX_DAX_TYPE_MASK)
/* Extract the sector number stored above the special bits. */
#define RADIX_DAX_SECTOR(entry) ((unsigned long)(entry) >> RADIX_DAX_SHIFT)
/* Build an (unlocked) exceptional entry for @sector; @pmd selects the type. */
#define RADIX_DAX_ENTRY(sector, pmd) \
	((void *)(((unsigned long)(sector) << RADIX_DAX_SHIFT) | \
		  ((pmd) ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
		  RADIX_TREE_EXCEPTIONAL_ENTRY))
48e4b27491SNeilBrown 
49ac401cc7SJan Kara /* We choose 4096 entries - same as per-zone page wait tables */
50ac401cc7SJan Kara #define DAX_WAIT_TABLE_BITS 12
51ac401cc7SJan Kara #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
52ac401cc7SJan Kara 
53ac401cc7SJan Kara wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
54ac401cc7SJan Kara 
55ac401cc7SJan Kara static int __init init_dax_wait_table(void)
56ac401cc7SJan Kara {
57ac401cc7SJan Kara 	int i;
58ac401cc7SJan Kara 
59ac401cc7SJan Kara 	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
60ac401cc7SJan Kara 		init_waitqueue_head(wait_table + i);
61ac401cc7SJan Kara 	return 0;
62ac401cc7SJan Kara }
63ac401cc7SJan Kara fs_initcall(init_dax_wait_table);
64ac401cc7SJan Kara 
65ac401cc7SJan Kara static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
66ac401cc7SJan Kara 					      pgoff_t index)
67ac401cc7SJan Kara {
68ac401cc7SJan Kara 	unsigned long hash = hash_long((unsigned long)mapping ^ index,
69ac401cc7SJan Kara 				       DAX_WAIT_TABLE_BITS);
70ac401cc7SJan Kara 	return wait_table + hash;
71ac401cc7SJan Kara }
72ac401cc7SJan Kara 
73b2e0d162SDan Williams static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
74b2e0d162SDan Williams {
75b2e0d162SDan Williams 	struct request_queue *q = bdev->bd_queue;
76b2e0d162SDan Williams 	long rc = -EIO;
77b2e0d162SDan Williams 
78b2e0d162SDan Williams 	dax->addr = (void __pmem *) ERR_PTR(-EIO);
79b2e0d162SDan Williams 	if (blk_queue_enter(q, true) != 0)
80b2e0d162SDan Williams 		return rc;
81b2e0d162SDan Williams 
82b2e0d162SDan Williams 	rc = bdev_direct_access(bdev, dax);
83b2e0d162SDan Williams 	if (rc < 0) {
84b2e0d162SDan Williams 		dax->addr = (void __pmem *) ERR_PTR(rc);
85b2e0d162SDan Williams 		blk_queue_exit(q);
86b2e0d162SDan Williams 		return rc;
87b2e0d162SDan Williams 	}
88b2e0d162SDan Williams 	return rc;
89b2e0d162SDan Williams }
90b2e0d162SDan Williams 
91b2e0d162SDan Williams static void dax_unmap_atomic(struct block_device *bdev,
92b2e0d162SDan Williams 		const struct blk_dax_ctl *dax)
93b2e0d162SDan Williams {
94b2e0d162SDan Williams 	if (IS_ERR(dax->addr))
95b2e0d162SDan Williams 		return;
96b2e0d162SDan Williams 	blk_queue_exit(bdev->bd_queue);
97b2e0d162SDan Williams }
98b2e0d162SDan Williams 
99d1a5f2b4SDan Williams struct page *read_dax_sector(struct block_device *bdev, sector_t n)
100d1a5f2b4SDan Williams {
101d1a5f2b4SDan Williams 	struct page *page = alloc_pages(GFP_KERNEL, 0);
102d1a5f2b4SDan Williams 	struct blk_dax_ctl dax = {
103d1a5f2b4SDan Williams 		.size = PAGE_SIZE,
104d1a5f2b4SDan Williams 		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
105d1a5f2b4SDan Williams 	};
106d1a5f2b4SDan Williams 	long rc;
107d1a5f2b4SDan Williams 
108d1a5f2b4SDan Williams 	if (!page)
109d1a5f2b4SDan Williams 		return ERR_PTR(-ENOMEM);
110d1a5f2b4SDan Williams 
111d1a5f2b4SDan Williams 	rc = dax_map_atomic(bdev, &dax);
112d1a5f2b4SDan Williams 	if (rc < 0)
113d1a5f2b4SDan Williams 		return ERR_PTR(rc);
114d1a5f2b4SDan Williams 	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
115d1a5f2b4SDan Williams 	dax_unmap_atomic(bdev, &dax);
116d1a5f2b4SDan Williams 	return page;
117d1a5f2b4SDan Williams }
118d1a5f2b4SDan Williams 
119d475c634SMatthew Wilcox static bool buffer_written(struct buffer_head *bh)
120d475c634SMatthew Wilcox {
121d475c634SMatthew Wilcox 	return buffer_mapped(bh) && !buffer_unwritten(bh);
122d475c634SMatthew Wilcox }
123d475c634SMatthew Wilcox 
124d475c634SMatthew Wilcox /*
125d475c634SMatthew Wilcox  * When ext4 encounters a hole, it returns without modifying the buffer_head
126d475c634SMatthew Wilcox  * which means that we can't trust b_size.  To cope with this, we set b_state
127d475c634SMatthew Wilcox  * to 0 before calling get_block and, if any bit is set, we know we can trust
128d475c634SMatthew Wilcox  * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
129d475c634SMatthew Wilcox  * and would save us time calling get_block repeatedly.
130d475c634SMatthew Wilcox  */
131d475c634SMatthew Wilcox static bool buffer_size_valid(struct buffer_head *bh)
132d475c634SMatthew Wilcox {
133d475c634SMatthew Wilcox 	return bh->b_state != 0;
134d475c634SMatthew Wilcox }
135d475c634SMatthew Wilcox 
136b2e0d162SDan Williams 
137b2e0d162SDan Williams static sector_t to_sector(const struct buffer_head *bh,
138b2e0d162SDan Williams 		const struct inode *inode)
139b2e0d162SDan Williams {
140b2e0d162SDan Williams 	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
141b2e0d162SDan Williams 
142b2e0d162SDan Williams 	return sector;
143b2e0d162SDan Williams }
144b2e0d162SDan Williams 
145a95cd631SOmar Sandoval static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
146d475c634SMatthew Wilcox 		      loff_t start, loff_t end, get_block_t get_block,
147d475c634SMatthew Wilcox 		      struct buffer_head *bh)
148d475c634SMatthew Wilcox {
149b2e0d162SDan Williams 	loff_t pos = start, max = start, bh_max = start;
150b2e0d162SDan Williams 	bool hole = false, need_wmb = false;
151b2e0d162SDan Williams 	struct block_device *bdev = NULL;
152b2e0d162SDan Williams 	int rw = iov_iter_rw(iter), rc;
153b2e0d162SDan Williams 	long map_len = 0;
154b2e0d162SDan Williams 	struct blk_dax_ctl dax = {
155b2e0d162SDan Williams 		.addr = (void __pmem *) ERR_PTR(-EIO),
156b2e0d162SDan Williams 	};
157069c77bcSJan Kara 	unsigned blkbits = inode->i_blkbits;
158069c77bcSJan Kara 	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
159069c77bcSJan Kara 								>> blkbits;
160d475c634SMatthew Wilcox 
161b2e0d162SDan Williams 	if (rw == READ)
162d475c634SMatthew Wilcox 		end = min(end, i_size_read(inode));
163d475c634SMatthew Wilcox 
164d475c634SMatthew Wilcox 	while (pos < end) {
1652765cfbbSRoss Zwisler 		size_t len;
166d475c634SMatthew Wilcox 		if (pos == max) {
167e94f5a22SJeff Moyer 			long page = pos >> PAGE_SHIFT;
168e94f5a22SJeff Moyer 			sector_t block = page << (PAGE_SHIFT - blkbits);
169d475c634SMatthew Wilcox 			unsigned first = pos - (block << blkbits);
170d475c634SMatthew Wilcox 			long size;
171d475c634SMatthew Wilcox 
172d475c634SMatthew Wilcox 			if (pos == bh_max) {
173d475c634SMatthew Wilcox 				bh->b_size = PAGE_ALIGN(end - pos);
174d475c634SMatthew Wilcox 				bh->b_state = 0;
175b2e0d162SDan Williams 				rc = get_block(inode, block, bh, rw == WRITE);
176b2e0d162SDan Williams 				if (rc)
177d475c634SMatthew Wilcox 					break;
178d475c634SMatthew Wilcox 				if (!buffer_size_valid(bh))
179d475c634SMatthew Wilcox 					bh->b_size = 1 << blkbits;
180d475c634SMatthew Wilcox 				bh_max = pos - first + bh->b_size;
181b2e0d162SDan Williams 				bdev = bh->b_bdev;
182069c77bcSJan Kara 				/*
183069c77bcSJan Kara 				 * We allow uninitialized buffers for writes
184069c77bcSJan Kara 				 * beyond EOF as those cannot race with faults
185069c77bcSJan Kara 				 */
186069c77bcSJan Kara 				WARN_ON_ONCE(
187069c77bcSJan Kara 					(buffer_new(bh) && block < file_blks) ||
188069c77bcSJan Kara 					(rw == WRITE && buffer_unwritten(bh)));
189d475c634SMatthew Wilcox 			} else {
190d475c634SMatthew Wilcox 				unsigned done = bh->b_size -
191d475c634SMatthew Wilcox 						(bh_max - (pos - first));
192d475c634SMatthew Wilcox 				bh->b_blocknr += done >> blkbits;
193d475c634SMatthew Wilcox 				bh->b_size -= done;
194d475c634SMatthew Wilcox 			}
195d475c634SMatthew Wilcox 
196b2e0d162SDan Williams 			hole = rw == READ && !buffer_written(bh);
197d475c634SMatthew Wilcox 			if (hole) {
198d475c634SMatthew Wilcox 				size = bh->b_size - first;
199d475c634SMatthew Wilcox 			} else {
200b2e0d162SDan Williams 				dax_unmap_atomic(bdev, &dax);
201b2e0d162SDan Williams 				dax.sector = to_sector(bh, inode);
202b2e0d162SDan Williams 				dax.size = bh->b_size;
203b2e0d162SDan Williams 				map_len = dax_map_atomic(bdev, &dax);
204b2e0d162SDan Williams 				if (map_len < 0) {
205b2e0d162SDan Williams 					rc = map_len;
206d475c634SMatthew Wilcox 					break;
207b2e0d162SDan Williams 				}
208b2e0d162SDan Williams 				dax.addr += first;
209b2e0d162SDan Williams 				size = map_len - first;
210d475c634SMatthew Wilcox 			}
211d475c634SMatthew Wilcox 			max = min(pos + size, end);
212d475c634SMatthew Wilcox 		}
213d475c634SMatthew Wilcox 
2142765cfbbSRoss Zwisler 		if (iov_iter_rw(iter) == WRITE) {
215b2e0d162SDan Williams 			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
2162765cfbbSRoss Zwisler 			need_wmb = true;
2172765cfbbSRoss Zwisler 		} else if (!hole)
218b2e0d162SDan Williams 			len = copy_to_iter((void __force *) dax.addr, max - pos,
219e2e05394SRoss Zwisler 					iter);
220d475c634SMatthew Wilcox 		else
221d475c634SMatthew Wilcox 			len = iov_iter_zero(max - pos, iter);
222d475c634SMatthew Wilcox 
223cadfbb6eSAl Viro 		if (!len) {
224b2e0d162SDan Williams 			rc = -EFAULT;
225d475c634SMatthew Wilcox 			break;
226cadfbb6eSAl Viro 		}
227d475c634SMatthew Wilcox 
228d475c634SMatthew Wilcox 		pos += len;
229b2e0d162SDan Williams 		if (!IS_ERR(dax.addr))
230b2e0d162SDan Williams 			dax.addr += len;
231d475c634SMatthew Wilcox 	}
232d475c634SMatthew Wilcox 
2332765cfbbSRoss Zwisler 	if (need_wmb)
2342765cfbbSRoss Zwisler 		wmb_pmem();
235b2e0d162SDan Williams 	dax_unmap_atomic(bdev, &dax);
2362765cfbbSRoss Zwisler 
237b2e0d162SDan Williams 	return (pos == start) ? rc : pos - start;
238d475c634SMatthew Wilcox }
239d475c634SMatthew Wilcox 
240d475c634SMatthew Wilcox /**
241d475c634SMatthew Wilcox  * dax_do_io - Perform I/O to a DAX file
242d475c634SMatthew Wilcox  * @iocb: The control block for this I/O
243d475c634SMatthew Wilcox  * @inode: The file which the I/O is directed at
244d475c634SMatthew Wilcox  * @iter: The addresses to do I/O from or to
245d475c634SMatthew Wilcox  * @pos: The file offset where the I/O starts
246d475c634SMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
247d475c634SMatthew Wilcox  * @end_io: A filesystem callback for I/O completion
248d475c634SMatthew Wilcox  * @flags: See below
249d475c634SMatthew Wilcox  *
250d475c634SMatthew Wilcox  * This function uses the same locking scheme as do_blockdev_direct_IO:
251d475c634SMatthew Wilcox  * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
252d475c634SMatthew Wilcox  * caller for writes.  For reads, we take and release the i_mutex ourselves.
253d475c634SMatthew Wilcox  * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
254d475c634SMatthew Wilcox  * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
255d475c634SMatthew Wilcox  * is in progress.
256d475c634SMatthew Wilcox  */
257a95cd631SOmar Sandoval ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
258a95cd631SOmar Sandoval 		  struct iov_iter *iter, loff_t pos, get_block_t get_block,
259a95cd631SOmar Sandoval 		  dio_iodone_t end_io, int flags)
260d475c634SMatthew Wilcox {
261d475c634SMatthew Wilcox 	struct buffer_head bh;
262d475c634SMatthew Wilcox 	ssize_t retval = -EINVAL;
263d475c634SMatthew Wilcox 	loff_t end = pos + iov_iter_count(iter);
264d475c634SMatthew Wilcox 
265d475c634SMatthew Wilcox 	memset(&bh, 0, sizeof(bh));
266eab95db6SRoss Zwisler 	bh.b_bdev = inode->i_sb->s_bdev;
267d475c634SMatthew Wilcox 
268c3d98e39SJan Kara 	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
2695955102cSAl Viro 		inode_lock(inode);
270d475c634SMatthew Wilcox 
271d475c634SMatthew Wilcox 	/* Protects against truncate */
272bbab37ddSMatthew Wilcox 	if (!(flags & DIO_SKIP_DIO_COUNT))
273fe0f07d0SJens Axboe 		inode_dio_begin(inode);
274d475c634SMatthew Wilcox 
275a95cd631SOmar Sandoval 	retval = dax_io(inode, iter, pos, end, get_block, &bh);
276d475c634SMatthew Wilcox 
277a95cd631SOmar Sandoval 	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
2785955102cSAl Viro 		inode_unlock(inode);
279d475c634SMatthew Wilcox 
280187372a3SChristoph Hellwig 	if (end_io) {
281187372a3SChristoph Hellwig 		int err;
282187372a3SChristoph Hellwig 
283187372a3SChristoph Hellwig 		err = end_io(iocb, pos, retval, bh.b_private);
284187372a3SChristoph Hellwig 		if (err)
285187372a3SChristoph Hellwig 			retval = err;
286187372a3SChristoph Hellwig 	}
287d475c634SMatthew Wilcox 
288bbab37ddSMatthew Wilcox 	if (!(flags & DIO_SKIP_DIO_COUNT))
289fe0f07d0SJens Axboe 		inode_dio_end(inode);
290d475c634SMatthew Wilcox 	return retval;
291d475c634SMatthew Wilcox }
292d475c634SMatthew Wilcox EXPORT_SYMBOL_GPL(dax_do_io);
293f7ca90b1SMatthew Wilcox 
/*
 * DAX radix tree locking
 */

/*
 * Identifies one radix tree entry. Passed as the wake-up key so that
 * wake_exceptional_entry_func() only wakes waiters of that entry.
 */
struct exceptional_entry_key {
	struct address_space *mapping;
	unsigned long index;
};

/*
 * Wait queue element of a task waiting for a locked entry: the generic
 * wait queue entry plus the key of the entry being waited for.
 */
struct wait_exceptional_entry_queue {
	wait_queue_t wait;
	struct exceptional_entry_key key;
};
306ac401cc7SJan Kara 
307ac401cc7SJan Kara static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
308ac401cc7SJan Kara 				       int sync, void *keyp)
309ac401cc7SJan Kara {
310ac401cc7SJan Kara 	struct exceptional_entry_key *key = keyp;
311ac401cc7SJan Kara 	struct wait_exceptional_entry_queue *ewait =
312ac401cc7SJan Kara 		container_of(wait, struct wait_exceptional_entry_queue, wait);
313ac401cc7SJan Kara 
314ac401cc7SJan Kara 	if (key->mapping != ewait->key.mapping ||
315ac401cc7SJan Kara 	    key->index != ewait->key.index)
316ac401cc7SJan Kara 		return 0;
317ac401cc7SJan Kara 	return autoremove_wake_function(wait, mode, sync, NULL);
318ac401cc7SJan Kara }
319ac401cc7SJan Kara 
320ac401cc7SJan Kara /*
321ac401cc7SJan Kara  * Check whether the given slot is locked. The function must be called with
322ac401cc7SJan Kara  * mapping->tree_lock held
323ac401cc7SJan Kara  */
324ac401cc7SJan Kara static inline int slot_locked(struct address_space *mapping, void **slot)
325ac401cc7SJan Kara {
326ac401cc7SJan Kara 	unsigned long entry = (unsigned long)
327ac401cc7SJan Kara 		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
328ac401cc7SJan Kara 	return entry & RADIX_DAX_ENTRY_LOCK;
329ac401cc7SJan Kara }
330ac401cc7SJan Kara 
331ac401cc7SJan Kara /*
332ac401cc7SJan Kara  * Mark the given slot is locked. The function must be called with
333ac401cc7SJan Kara  * mapping->tree_lock held
334ac401cc7SJan Kara  */
335ac401cc7SJan Kara static inline void *lock_slot(struct address_space *mapping, void **slot)
336ac401cc7SJan Kara {
337ac401cc7SJan Kara 	unsigned long entry = (unsigned long)
338ac401cc7SJan Kara 		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
339ac401cc7SJan Kara 
340ac401cc7SJan Kara 	entry |= RADIX_DAX_ENTRY_LOCK;
341ac401cc7SJan Kara 	radix_tree_replace_slot(slot, (void *)entry);
342ac401cc7SJan Kara 	return (void *)entry;
343ac401cc7SJan Kara }
344ac401cc7SJan Kara 
345ac401cc7SJan Kara /*
346ac401cc7SJan Kara  * Mark the given slot is unlocked. The function must be called with
347ac401cc7SJan Kara  * mapping->tree_lock held
348ac401cc7SJan Kara  */
349ac401cc7SJan Kara static inline void *unlock_slot(struct address_space *mapping, void **slot)
350ac401cc7SJan Kara {
351ac401cc7SJan Kara 	unsigned long entry = (unsigned long)
352ac401cc7SJan Kara 		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
353ac401cc7SJan Kara 
354ac401cc7SJan Kara 	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
355ac401cc7SJan Kara 	radix_tree_replace_slot(slot, (void *)entry);
356ac401cc7SJan Kara 	return (void *)entry;
357ac401cc7SJan Kara }
358ac401cc7SJan Kara 
359ac401cc7SJan Kara /*
360ac401cc7SJan Kara  * Lookup entry in radix tree, wait for it to become unlocked if it is
361ac401cc7SJan Kara  * exceptional entry and return it. The caller must call
362ac401cc7SJan Kara  * put_unlocked_mapping_entry() when he decided not to lock the entry or
363ac401cc7SJan Kara  * put_locked_mapping_entry() when he locked the entry and now wants to
364ac401cc7SJan Kara  * unlock it.
365ac401cc7SJan Kara  *
366ac401cc7SJan Kara  * The function must be called with mapping->tree_lock held.
367ac401cc7SJan Kara  */
368ac401cc7SJan Kara static void *get_unlocked_mapping_entry(struct address_space *mapping,
369ac401cc7SJan Kara 					pgoff_t index, void ***slotp)
370ac401cc7SJan Kara {
371ac401cc7SJan Kara 	void *ret, **slot;
372ac401cc7SJan Kara 	struct wait_exceptional_entry_queue ewait;
373ac401cc7SJan Kara 	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
374ac401cc7SJan Kara 
375ac401cc7SJan Kara 	init_wait(&ewait.wait);
376ac401cc7SJan Kara 	ewait.wait.func = wake_exceptional_entry_func;
377ac401cc7SJan Kara 	ewait.key.mapping = mapping;
378ac401cc7SJan Kara 	ewait.key.index = index;
379ac401cc7SJan Kara 
380ac401cc7SJan Kara 	for (;;) {
381ac401cc7SJan Kara 		ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
382ac401cc7SJan Kara 					  &slot);
383ac401cc7SJan Kara 		if (!ret || !radix_tree_exceptional_entry(ret) ||
384ac401cc7SJan Kara 		    !slot_locked(mapping, slot)) {
385ac401cc7SJan Kara 			if (slotp)
386ac401cc7SJan Kara 				*slotp = slot;
387ac401cc7SJan Kara 			return ret;
388ac401cc7SJan Kara 		}
389ac401cc7SJan Kara 		prepare_to_wait_exclusive(wq, &ewait.wait,
390ac401cc7SJan Kara 					  TASK_UNINTERRUPTIBLE);
391ac401cc7SJan Kara 		spin_unlock_irq(&mapping->tree_lock);
392ac401cc7SJan Kara 		schedule();
393ac401cc7SJan Kara 		finish_wait(wq, &ewait.wait);
394ac401cc7SJan Kara 		spin_lock_irq(&mapping->tree_lock);
395ac401cc7SJan Kara 	}
396ac401cc7SJan Kara }
397ac401cc7SJan Kara 
398ac401cc7SJan Kara /*
399ac401cc7SJan Kara  * Find radix tree entry at given index. If it points to a page, return with
400ac401cc7SJan Kara  * the page locked. If it points to the exceptional entry, return with the
401ac401cc7SJan Kara  * radix tree entry locked. If the radix tree doesn't contain given index,
402ac401cc7SJan Kara  * create empty exceptional entry for the index and return with it locked.
403ac401cc7SJan Kara  *
404ac401cc7SJan Kara  * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
405ac401cc7SJan Kara  * persistent memory the benefit is doubtful. We can add that later if we can
406ac401cc7SJan Kara  * show it helps.
407ac401cc7SJan Kara  */
408ac401cc7SJan Kara static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
409ac401cc7SJan Kara {
410ac401cc7SJan Kara 	void *ret, **slot;
411ac401cc7SJan Kara 
412ac401cc7SJan Kara restart:
413ac401cc7SJan Kara 	spin_lock_irq(&mapping->tree_lock);
414ac401cc7SJan Kara 	ret = get_unlocked_mapping_entry(mapping, index, &slot);
415ac401cc7SJan Kara 	/* No entry for given index? Make sure radix tree is big enough. */
416ac401cc7SJan Kara 	if (!ret) {
417ac401cc7SJan Kara 		int err;
418ac401cc7SJan Kara 
419ac401cc7SJan Kara 		spin_unlock_irq(&mapping->tree_lock);
420ac401cc7SJan Kara 		err = radix_tree_preload(
421ac401cc7SJan Kara 				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
422ac401cc7SJan Kara 		if (err)
423ac401cc7SJan Kara 			return ERR_PTR(err);
424ac401cc7SJan Kara 		ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
425ac401cc7SJan Kara 			       RADIX_DAX_ENTRY_LOCK);
426ac401cc7SJan Kara 		spin_lock_irq(&mapping->tree_lock);
427ac401cc7SJan Kara 		err = radix_tree_insert(&mapping->page_tree, index, ret);
428ac401cc7SJan Kara 		radix_tree_preload_end();
429ac401cc7SJan Kara 		if (err) {
430ac401cc7SJan Kara 			spin_unlock_irq(&mapping->tree_lock);
431ac401cc7SJan Kara 			/* Someone already created the entry? */
432ac401cc7SJan Kara 			if (err == -EEXIST)
433ac401cc7SJan Kara 				goto restart;
434ac401cc7SJan Kara 			return ERR_PTR(err);
435ac401cc7SJan Kara 		}
436ac401cc7SJan Kara 		/* Good, we have inserted empty locked entry into the tree. */
437ac401cc7SJan Kara 		mapping->nrexceptional++;
438ac401cc7SJan Kara 		spin_unlock_irq(&mapping->tree_lock);
439ac401cc7SJan Kara 		return ret;
440ac401cc7SJan Kara 	}
441ac401cc7SJan Kara 	/* Normal page in radix tree? */
442ac401cc7SJan Kara 	if (!radix_tree_exceptional_entry(ret)) {
443ac401cc7SJan Kara 		struct page *page = ret;
444ac401cc7SJan Kara 
445ac401cc7SJan Kara 		get_page(page);
446ac401cc7SJan Kara 		spin_unlock_irq(&mapping->tree_lock);
447ac401cc7SJan Kara 		lock_page(page);
448ac401cc7SJan Kara 		/* Page got truncated? Retry... */
449ac401cc7SJan Kara 		if (unlikely(page->mapping != mapping)) {
450ac401cc7SJan Kara 			unlock_page(page);
451ac401cc7SJan Kara 			put_page(page);
452ac401cc7SJan Kara 			goto restart;
453ac401cc7SJan Kara 		}
454ac401cc7SJan Kara 		return page;
455ac401cc7SJan Kara 	}
456ac401cc7SJan Kara 	ret = lock_slot(mapping, slot);
457ac401cc7SJan Kara 	spin_unlock_irq(&mapping->tree_lock);
458ac401cc7SJan Kara 	return ret;
459ac401cc7SJan Kara }
460ac401cc7SJan Kara 
461ac401cc7SJan Kara void dax_wake_mapping_entry_waiter(struct address_space *mapping,
462ac401cc7SJan Kara 				   pgoff_t index, bool wake_all)
463ac401cc7SJan Kara {
464ac401cc7SJan Kara 	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
465ac401cc7SJan Kara 
466ac401cc7SJan Kara 	/*
467ac401cc7SJan Kara 	 * Checking for locked entry and prepare_to_wait_exclusive() happens
468ac401cc7SJan Kara 	 * under mapping->tree_lock, ditto for entry handling in our callers.
469ac401cc7SJan Kara 	 * So at this point all tasks that could have seen our entry locked
470ac401cc7SJan Kara 	 * must be in the waitqueue and the following check will see them.
471ac401cc7SJan Kara 	 */
472ac401cc7SJan Kara 	if (waitqueue_active(wq)) {
473ac401cc7SJan Kara 		struct exceptional_entry_key key;
474ac401cc7SJan Kara 
475ac401cc7SJan Kara 		key.mapping = mapping;
476ac401cc7SJan Kara 		key.index = index;
477ac401cc7SJan Kara 		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
478ac401cc7SJan Kara 	}
479ac401cc7SJan Kara }
480ac401cc7SJan Kara 
481ac401cc7SJan Kara static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
482ac401cc7SJan Kara {
483ac401cc7SJan Kara 	void *ret, **slot;
484ac401cc7SJan Kara 
485ac401cc7SJan Kara 	spin_lock_irq(&mapping->tree_lock);
486ac401cc7SJan Kara 	ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
487ac401cc7SJan Kara 	if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
488ac401cc7SJan Kara 			 !slot_locked(mapping, slot))) {
489ac401cc7SJan Kara 		spin_unlock_irq(&mapping->tree_lock);
490ac401cc7SJan Kara 		return;
491ac401cc7SJan Kara 	}
492ac401cc7SJan Kara 	unlock_slot(mapping, slot);
493ac401cc7SJan Kara 	spin_unlock_irq(&mapping->tree_lock);
494ac401cc7SJan Kara 	dax_wake_mapping_entry_waiter(mapping, index, false);
495ac401cc7SJan Kara }
496ac401cc7SJan Kara 
497ac401cc7SJan Kara static void put_locked_mapping_entry(struct address_space *mapping,
498ac401cc7SJan Kara 				     pgoff_t index, void *entry)
499ac401cc7SJan Kara {
500ac401cc7SJan Kara 	if (!radix_tree_exceptional_entry(entry)) {
501ac401cc7SJan Kara 		unlock_page(entry);
502ac401cc7SJan Kara 		put_page(entry);
503ac401cc7SJan Kara 	} else {
504ac401cc7SJan Kara 		unlock_mapping_entry(mapping, index);
505ac401cc7SJan Kara 	}
506ac401cc7SJan Kara }
507ac401cc7SJan Kara 
508ac401cc7SJan Kara /*
509ac401cc7SJan Kara  * Called when we are done with radix tree entry we looked up via
510ac401cc7SJan Kara  * get_unlocked_mapping_entry() and which we didn't lock in the end.
511ac401cc7SJan Kara  */
512ac401cc7SJan Kara static void put_unlocked_mapping_entry(struct address_space *mapping,
513ac401cc7SJan Kara 				       pgoff_t index, void *entry)
514ac401cc7SJan Kara {
515ac401cc7SJan Kara 	if (!radix_tree_exceptional_entry(entry))
516ac401cc7SJan Kara 		return;
517ac401cc7SJan Kara 
518ac401cc7SJan Kara 	/* We have to wake up next waiter for the radix tree entry lock */
519ac401cc7SJan Kara 	dax_wake_mapping_entry_waiter(mapping, index, false);
520ac401cc7SJan Kara }
521ac401cc7SJan Kara 
522ac401cc7SJan Kara /*
523ac401cc7SJan Kara  * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
524ac401cc7SJan Kara  * entry to get unlocked before deleting it.
525ac401cc7SJan Kara  */
526ac401cc7SJan Kara int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
527ac401cc7SJan Kara {
528ac401cc7SJan Kara 	void *entry;
529ac401cc7SJan Kara 
530ac401cc7SJan Kara 	spin_lock_irq(&mapping->tree_lock);
531ac401cc7SJan Kara 	entry = get_unlocked_mapping_entry(mapping, index, NULL);
532ac401cc7SJan Kara 	/*
533ac401cc7SJan Kara 	 * This gets called from truncate / punch_hole path. As such, the caller
534ac401cc7SJan Kara 	 * must hold locks protecting against concurrent modifications of the
535ac401cc7SJan Kara 	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
536ac401cc7SJan Kara 	 * caller has seen exceptional entry for this index, we better find it
537ac401cc7SJan Kara 	 * at that index as well...
538ac401cc7SJan Kara 	 */
539ac401cc7SJan Kara 	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
540ac401cc7SJan Kara 		spin_unlock_irq(&mapping->tree_lock);
541ac401cc7SJan Kara 		return 0;
542ac401cc7SJan Kara 	}
543ac401cc7SJan Kara 	radix_tree_delete(&mapping->page_tree, index);
544ac401cc7SJan Kara 	mapping->nrexceptional--;
545ac401cc7SJan Kara 	spin_unlock_irq(&mapping->tree_lock);
546ac401cc7SJan Kara 	dax_wake_mapping_entry_waiter(mapping, index, true);
547ac401cc7SJan Kara 
548ac401cc7SJan Kara 	return 1;
549ac401cc7SJan Kara }
550ac401cc7SJan Kara 
551ac401cc7SJan Kara /*
552f7ca90b1SMatthew Wilcox  * The user has performed a load from a hole in the file.  Allocating
553f7ca90b1SMatthew Wilcox  * a new page in the file would cause excessive storage usage for
554f7ca90b1SMatthew Wilcox  * workloads with sparse files.  We allocate a page cache page instead.
555f7ca90b1SMatthew Wilcox  * We'll kick it out of the page cache if it's ever written to,
556f7ca90b1SMatthew Wilcox  * otherwise it will simply fall out of the page cache under memory
557f7ca90b1SMatthew Wilcox  * pressure without ever having been dirtied.
558f7ca90b1SMatthew Wilcox  */
559ac401cc7SJan Kara static int dax_load_hole(struct address_space *mapping, void *entry,
560f7ca90b1SMatthew Wilcox 			 struct vm_fault *vmf)
561f7ca90b1SMatthew Wilcox {
562ac401cc7SJan Kara 	struct page *page;
563f7ca90b1SMatthew Wilcox 
564ac401cc7SJan Kara 	/* Hole page already exists? Return it...  */
565ac401cc7SJan Kara 	if (!radix_tree_exceptional_entry(entry)) {
566ac401cc7SJan Kara 		vmf->page = entry;
567ac401cc7SJan Kara 		return VM_FAULT_LOCKED;
568ac401cc7SJan Kara 	}
569ac401cc7SJan Kara 
570ac401cc7SJan Kara 	/* This will replace locked radix tree entry with a hole page */
571ac401cc7SJan Kara 	page = find_or_create_page(mapping, vmf->pgoff,
572ac401cc7SJan Kara 				   vmf->gfp_mask | __GFP_ZERO);
573ac401cc7SJan Kara 	if (!page) {
574ac401cc7SJan Kara 		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
575ac401cc7SJan Kara 		return VM_FAULT_OOM;
576ac401cc7SJan Kara 	}
577f7ca90b1SMatthew Wilcox 	vmf->page = page;
578f7ca90b1SMatthew Wilcox 	return VM_FAULT_LOCKED;
579f7ca90b1SMatthew Wilcox }
580f7ca90b1SMatthew Wilcox 
581b2e0d162SDan Williams static int copy_user_bh(struct page *to, struct inode *inode,
582b2e0d162SDan Williams 		struct buffer_head *bh, unsigned long vaddr)
583f7ca90b1SMatthew Wilcox {
584b2e0d162SDan Williams 	struct blk_dax_ctl dax = {
585b2e0d162SDan Williams 		.sector = to_sector(bh, inode),
586b2e0d162SDan Williams 		.size = bh->b_size,
587b2e0d162SDan Williams 	};
588b2e0d162SDan Williams 	struct block_device *bdev = bh->b_bdev;
589e2e05394SRoss Zwisler 	void *vto;
590e2e05394SRoss Zwisler 
591b2e0d162SDan Williams 	if (dax_map_atomic(bdev, &dax) < 0)
592b2e0d162SDan Williams 		return PTR_ERR(dax.addr);
593f7ca90b1SMatthew Wilcox 	vto = kmap_atomic(to);
594b2e0d162SDan Williams 	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
595f7ca90b1SMatthew Wilcox 	kunmap_atomic(vto);
596b2e0d162SDan Williams 	dax_unmap_atomic(bdev, &dax);
597f7ca90b1SMatthew Wilcox 	return 0;
598f7ca90b1SMatthew Wilcox }
599f7ca90b1SMatthew Wilcox 
/*
 * Round a page index down to the first page index of its PMD-sized range.
 * The argument is parenthesized so compound expressions bind correctly.
 */
#define DAX_PMD_INDEX(page_index) ((page_index) & (PMD_MASK >> PAGE_SHIFT))
6019973c98eSRoss Zwisler 
602ac401cc7SJan Kara static void *dax_insert_mapping_entry(struct address_space *mapping,
603ac401cc7SJan Kara 				      struct vm_fault *vmf,
604ac401cc7SJan Kara 				      void *entry, sector_t sector)
6059973c98eSRoss Zwisler {
6069973c98eSRoss Zwisler 	struct radix_tree_root *page_tree = &mapping->page_tree;
607ac401cc7SJan Kara 	int error = 0;
608ac401cc7SJan Kara 	bool hole_fill = false;
609ac401cc7SJan Kara 	void *new_entry;
610ac401cc7SJan Kara 	pgoff_t index = vmf->pgoff;
6119973c98eSRoss Zwisler 
612ac401cc7SJan Kara 	if (vmf->flags & FAULT_FLAG_WRITE)
6139973c98eSRoss Zwisler 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
6149973c98eSRoss Zwisler 
615ac401cc7SJan Kara 	/* Replacing hole page with block mapping? */
616ac401cc7SJan Kara 	if (!radix_tree_exceptional_entry(entry)) {
617ac401cc7SJan Kara 		hole_fill = true;
6189973c98eSRoss Zwisler 		/*
619ac401cc7SJan Kara 		 * Unmap the page now before we remove it from page cache below.
620ac401cc7SJan Kara 		 * The page is locked so it cannot be faulted in again.
6219973c98eSRoss Zwisler 		 */
622ac401cc7SJan Kara 		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
623ac401cc7SJan Kara 				    PAGE_SIZE, 0);
624ac401cc7SJan Kara 		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
6259973c98eSRoss Zwisler 		if (error)
626ac401cc7SJan Kara 			return ERR_PTR(error);
627ac401cc7SJan Kara 	}
6289973c98eSRoss Zwisler 
629ac401cc7SJan Kara 	spin_lock_irq(&mapping->tree_lock);
630ac401cc7SJan Kara 	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
631ac401cc7SJan Kara 		       RADIX_DAX_ENTRY_LOCK);
632ac401cc7SJan Kara 	if (hole_fill) {
633ac401cc7SJan Kara 		__delete_from_page_cache(entry, NULL);
634ac401cc7SJan Kara 		/* Drop pagecache reference */
635ac401cc7SJan Kara 		put_page(entry);
636ac401cc7SJan Kara 		error = radix_tree_insert(page_tree, index, new_entry);
637ac401cc7SJan Kara 		if (error) {
638ac401cc7SJan Kara 			new_entry = ERR_PTR(error);
639ac401cc7SJan Kara 			goto unlock;
640ac401cc7SJan Kara 		}
6419973c98eSRoss Zwisler 		mapping->nrexceptional++;
642ac401cc7SJan Kara 	} else {
643ac401cc7SJan Kara 		void **slot;
644ac401cc7SJan Kara 		void *ret;
645ac401cc7SJan Kara 
646ac401cc7SJan Kara 		ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
647ac401cc7SJan Kara 		WARN_ON_ONCE(ret != entry);
648ac401cc7SJan Kara 		radix_tree_replace_slot(slot, new_entry);
649ac401cc7SJan Kara 	}
650ac401cc7SJan Kara 	if (vmf->flags & FAULT_FLAG_WRITE)
6519973c98eSRoss Zwisler 		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
6529973c98eSRoss Zwisler  unlock:
6539973c98eSRoss Zwisler 	spin_unlock_irq(&mapping->tree_lock);
654ac401cc7SJan Kara 	if (hole_fill) {
655ac401cc7SJan Kara 		radix_tree_preload_end();
656ac401cc7SJan Kara 		/*
657ac401cc7SJan Kara 		 * We don't need hole page anymore, it has been replaced with
658ac401cc7SJan Kara 		 * locked radix tree entry now.
659ac401cc7SJan Kara 		 */
660ac401cc7SJan Kara 		if (mapping->a_ops->freepage)
661ac401cc7SJan Kara 			mapping->a_ops->freepage(entry);
662ac401cc7SJan Kara 		unlock_page(entry);
663ac401cc7SJan Kara 		put_page(entry);
664ac401cc7SJan Kara 	}
665ac401cc7SJan Kara 	return new_entry;
6669973c98eSRoss Zwisler }
6679973c98eSRoss Zwisler 
6689973c98eSRoss Zwisler static int dax_writeback_one(struct block_device *bdev,
6699973c98eSRoss Zwisler 		struct address_space *mapping, pgoff_t index, void *entry)
6709973c98eSRoss Zwisler {
6719973c98eSRoss Zwisler 	struct radix_tree_root *page_tree = &mapping->page_tree;
6729973c98eSRoss Zwisler 	int type = RADIX_DAX_TYPE(entry);
6739973c98eSRoss Zwisler 	struct radix_tree_node *node;
6749973c98eSRoss Zwisler 	struct blk_dax_ctl dax;
6759973c98eSRoss Zwisler 	void **slot;
6769973c98eSRoss Zwisler 	int ret = 0;
6779973c98eSRoss Zwisler 
6789973c98eSRoss Zwisler 	spin_lock_irq(&mapping->tree_lock);
6799973c98eSRoss Zwisler 	/*
6809973c98eSRoss Zwisler 	 * Regular page slots are stabilized by the page lock even
6819973c98eSRoss Zwisler 	 * without the tree itself locked.  These unlocked entries
6829973c98eSRoss Zwisler 	 * need verification under the tree lock.
6839973c98eSRoss Zwisler 	 */
6849973c98eSRoss Zwisler 	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
6859973c98eSRoss Zwisler 		goto unlock;
6869973c98eSRoss Zwisler 	if (*slot != entry)
6879973c98eSRoss Zwisler 		goto unlock;
6889973c98eSRoss Zwisler 
6899973c98eSRoss Zwisler 	/* another fsync thread may have already written back this entry */
6909973c98eSRoss Zwisler 	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
6919973c98eSRoss Zwisler 		goto unlock;
6929973c98eSRoss Zwisler 
6939973c98eSRoss Zwisler 	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
6949973c98eSRoss Zwisler 		ret = -EIO;
6959973c98eSRoss Zwisler 		goto unlock;
6969973c98eSRoss Zwisler 	}
6979973c98eSRoss Zwisler 
6989973c98eSRoss Zwisler 	dax.sector = RADIX_DAX_SECTOR(entry);
6999973c98eSRoss Zwisler 	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
7009973c98eSRoss Zwisler 	spin_unlock_irq(&mapping->tree_lock);
7019973c98eSRoss Zwisler 
7029973c98eSRoss Zwisler 	/*
7039973c98eSRoss Zwisler 	 * We cannot hold tree_lock while calling dax_map_atomic() because it
7049973c98eSRoss Zwisler 	 * eventually calls cond_resched().
7059973c98eSRoss Zwisler 	 */
7069973c98eSRoss Zwisler 	ret = dax_map_atomic(bdev, &dax);
7079973c98eSRoss Zwisler 	if (ret < 0)
7089973c98eSRoss Zwisler 		return ret;
7099973c98eSRoss Zwisler 
7109973c98eSRoss Zwisler 	if (WARN_ON_ONCE(ret < dax.size)) {
7119973c98eSRoss Zwisler 		ret = -EIO;
7129973c98eSRoss Zwisler 		goto unmap;
7139973c98eSRoss Zwisler 	}
7149973c98eSRoss Zwisler 
7159973c98eSRoss Zwisler 	wb_cache_pmem(dax.addr, dax.size);
7169973c98eSRoss Zwisler 
7179973c98eSRoss Zwisler 	spin_lock_irq(&mapping->tree_lock);
7189973c98eSRoss Zwisler 	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
7199973c98eSRoss Zwisler 	spin_unlock_irq(&mapping->tree_lock);
7209973c98eSRoss Zwisler  unmap:
7219973c98eSRoss Zwisler 	dax_unmap_atomic(bdev, &dax);
7229973c98eSRoss Zwisler 	return ret;
7239973c98eSRoss Zwisler 
7249973c98eSRoss Zwisler  unlock:
7259973c98eSRoss Zwisler 	spin_unlock_irq(&mapping->tree_lock);
7269973c98eSRoss Zwisler 	return ret;
7279973c98eSRoss Zwisler }
7289973c98eSRoss Zwisler 
7299973c98eSRoss Zwisler /*
7309973c98eSRoss Zwisler  * Flush the mapping to the persistent domain within the byte range of [start,
7319973c98eSRoss Zwisler  * end]. This is required by data integrity operations to ensure file data is
7329973c98eSRoss Zwisler  * on persistent storage prior to completion of the operation.
7339973c98eSRoss Zwisler  */
7347f6d5b52SRoss Zwisler int dax_writeback_mapping_range(struct address_space *mapping,
7357f6d5b52SRoss Zwisler 		struct block_device *bdev, struct writeback_control *wbc)
7369973c98eSRoss Zwisler {
7379973c98eSRoss Zwisler 	struct inode *inode = mapping->host;
7389973c98eSRoss Zwisler 	pgoff_t start_index, end_index, pmd_index;
7399973c98eSRoss Zwisler 	pgoff_t indices[PAGEVEC_SIZE];
7409973c98eSRoss Zwisler 	struct pagevec pvec;
7419973c98eSRoss Zwisler 	bool done = false;
7429973c98eSRoss Zwisler 	int i, ret = 0;
7439973c98eSRoss Zwisler 	void *entry;
7449973c98eSRoss Zwisler 
7459973c98eSRoss Zwisler 	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
7469973c98eSRoss Zwisler 		return -EIO;
7479973c98eSRoss Zwisler 
7487f6d5b52SRoss Zwisler 	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
7497f6d5b52SRoss Zwisler 		return 0;
7507f6d5b52SRoss Zwisler 
75109cbfeafSKirill A. Shutemov 	start_index = wbc->range_start >> PAGE_SHIFT;
75209cbfeafSKirill A. Shutemov 	end_index = wbc->range_end >> PAGE_SHIFT;
7539973c98eSRoss Zwisler 	pmd_index = DAX_PMD_INDEX(start_index);
7549973c98eSRoss Zwisler 
7559973c98eSRoss Zwisler 	rcu_read_lock();
7569973c98eSRoss Zwisler 	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
7579973c98eSRoss Zwisler 	rcu_read_unlock();
7589973c98eSRoss Zwisler 
7599973c98eSRoss Zwisler 	/* see if the start of our range is covered by a PMD entry */
7609973c98eSRoss Zwisler 	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
7619973c98eSRoss Zwisler 		start_index = pmd_index;
7629973c98eSRoss Zwisler 
7639973c98eSRoss Zwisler 	tag_pages_for_writeback(mapping, start_index, end_index);
7649973c98eSRoss Zwisler 
7659973c98eSRoss Zwisler 	pagevec_init(&pvec, 0);
7669973c98eSRoss Zwisler 	while (!done) {
7679973c98eSRoss Zwisler 		pvec.nr = find_get_entries_tag(mapping, start_index,
7689973c98eSRoss Zwisler 				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
7699973c98eSRoss Zwisler 				pvec.pages, indices);
7709973c98eSRoss Zwisler 
7719973c98eSRoss Zwisler 		if (pvec.nr == 0)
7729973c98eSRoss Zwisler 			break;
7739973c98eSRoss Zwisler 
7749973c98eSRoss Zwisler 		for (i = 0; i < pvec.nr; i++) {
7759973c98eSRoss Zwisler 			if (indices[i] > end_index) {
7769973c98eSRoss Zwisler 				done = true;
7779973c98eSRoss Zwisler 				break;
7789973c98eSRoss Zwisler 			}
7799973c98eSRoss Zwisler 
7809973c98eSRoss Zwisler 			ret = dax_writeback_one(bdev, mapping, indices[i],
7819973c98eSRoss Zwisler 					pvec.pages[i]);
7829973c98eSRoss Zwisler 			if (ret < 0)
7839973c98eSRoss Zwisler 				return ret;
7849973c98eSRoss Zwisler 		}
7859973c98eSRoss Zwisler 	}
7869973c98eSRoss Zwisler 	wmb_pmem();
7879973c98eSRoss Zwisler 	return 0;
7889973c98eSRoss Zwisler }
7899973c98eSRoss Zwisler EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
7909973c98eSRoss Zwisler 
791ac401cc7SJan Kara static int dax_insert_mapping(struct address_space *mapping,
792ac401cc7SJan Kara 			struct buffer_head *bh, void **entryp,
793f7ca90b1SMatthew Wilcox 			struct vm_area_struct *vma, struct vm_fault *vmf)
794f7ca90b1SMatthew Wilcox {
795f7ca90b1SMatthew Wilcox 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
796b2e0d162SDan Williams 	struct block_device *bdev = bh->b_bdev;
797b2e0d162SDan Williams 	struct blk_dax_ctl dax = {
798ac401cc7SJan Kara 		.sector = to_sector(bh, mapping->host),
799b2e0d162SDan Williams 		.size = bh->b_size,
800b2e0d162SDan Williams 	};
801f7ca90b1SMatthew Wilcox 	int error;
802ac401cc7SJan Kara 	void *ret;
803ac401cc7SJan Kara 	void *entry = *entryp;
804f7ca90b1SMatthew Wilcox 
8050f90cc66SRoss Zwisler 	i_mmap_lock_read(mapping);
8060f90cc66SRoss Zwisler 
807b2e0d162SDan Williams 	if (dax_map_atomic(bdev, &dax) < 0) {
808b2e0d162SDan Williams 		error = PTR_ERR(dax.addr);
809f7ca90b1SMatthew Wilcox 		goto out;
810f7ca90b1SMatthew Wilcox 	}
811b2e0d162SDan Williams 	dax_unmap_atomic(bdev, &dax);
812f7ca90b1SMatthew Wilcox 
813ac401cc7SJan Kara 	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
814ac401cc7SJan Kara 	if (IS_ERR(ret)) {
815ac401cc7SJan Kara 		error = PTR_ERR(ret);
8169973c98eSRoss Zwisler 		goto out;
817ac401cc7SJan Kara 	}
818ac401cc7SJan Kara 	*entryp = ret;
8199973c98eSRoss Zwisler 
82001c8f1c4SDan Williams 	error = vm_insert_mixed(vma, vaddr, dax.pfn);
821f7ca90b1SMatthew Wilcox  out:
8220f90cc66SRoss Zwisler 	i_mmap_unlock_read(mapping);
823f7ca90b1SMatthew Wilcox 	return error;
824f7ca90b1SMatthew Wilcox }
825f7ca90b1SMatthew Wilcox 
826ce5c5d55SDave Chinner /**
827ce5c5d55SDave Chinner  * __dax_fault - handle a page fault on a DAX file
828ce5c5d55SDave Chinner  * @vma: The virtual memory area where the fault occurred
829ce5c5d55SDave Chinner  * @vmf: The description of the fault
830ce5c5d55SDave Chinner  * @get_block: The filesystem method used to translate file offsets to blocks
831ce5c5d55SDave Chinner  *
832ce5c5d55SDave Chinner  * When a page fault occurs, filesystems may call this helper in their
833ce5c5d55SDave Chinner  * fault handler for DAX files. __dax_fault() assumes the caller has done all
834ce5c5d55SDave Chinner  * the necessary locking for the page fault to proceed successfully.
835ce5c5d55SDave Chinner  */
836ce5c5d55SDave Chinner int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
83702fbd139SJan Kara 			get_block_t get_block)
838f7ca90b1SMatthew Wilcox {
839f7ca90b1SMatthew Wilcox 	struct file *file = vma->vm_file;
840f7ca90b1SMatthew Wilcox 	struct address_space *mapping = file->f_mapping;
841f7ca90b1SMatthew Wilcox 	struct inode *inode = mapping->host;
842ac401cc7SJan Kara 	void *entry;
843f7ca90b1SMatthew Wilcox 	struct buffer_head bh;
844f7ca90b1SMatthew Wilcox 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
845f7ca90b1SMatthew Wilcox 	unsigned blkbits = inode->i_blkbits;
846f7ca90b1SMatthew Wilcox 	sector_t block;
847f7ca90b1SMatthew Wilcox 	pgoff_t size;
848f7ca90b1SMatthew Wilcox 	int error;
849f7ca90b1SMatthew Wilcox 	int major = 0;
850f7ca90b1SMatthew Wilcox 
851ac401cc7SJan Kara 	/*
852ac401cc7SJan Kara 	 * Check whether offset isn't beyond end of file now. Caller is supposed
853ac401cc7SJan Kara 	 * to hold locks serializing us with truncate / punch hole so this is
854ac401cc7SJan Kara 	 * a reliable test.
855ac401cc7SJan Kara 	 */
856f7ca90b1SMatthew Wilcox 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
857f7ca90b1SMatthew Wilcox 	if (vmf->pgoff >= size)
858f7ca90b1SMatthew Wilcox 		return VM_FAULT_SIGBUS;
859f7ca90b1SMatthew Wilcox 
860f7ca90b1SMatthew Wilcox 	memset(&bh, 0, sizeof(bh));
861f7ca90b1SMatthew Wilcox 	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
862eab95db6SRoss Zwisler 	bh.b_bdev = inode->i_sb->s_bdev;
863f7ca90b1SMatthew Wilcox 	bh.b_size = PAGE_SIZE;
864f7ca90b1SMatthew Wilcox 
865ac401cc7SJan Kara 	entry = grab_mapping_entry(mapping, vmf->pgoff);
866ac401cc7SJan Kara 	if (IS_ERR(entry)) {
867ac401cc7SJan Kara 		error = PTR_ERR(entry);
868ac401cc7SJan Kara 		goto out;
869f7ca90b1SMatthew Wilcox 	}
870f7ca90b1SMatthew Wilcox 
871f7ca90b1SMatthew Wilcox 	error = get_block(inode, block, &bh, 0);
872f7ca90b1SMatthew Wilcox 	if (!error && (bh.b_size < PAGE_SIZE))
873f7ca90b1SMatthew Wilcox 		error = -EIO;		/* fs corruption? */
874f7ca90b1SMatthew Wilcox 	if (error)
875ac401cc7SJan Kara 		goto unlock_entry;
876f7ca90b1SMatthew Wilcox 
877f7ca90b1SMatthew Wilcox 	if (vmf->cow_page) {
878f7ca90b1SMatthew Wilcox 		struct page *new_page = vmf->cow_page;
879f7ca90b1SMatthew Wilcox 		if (buffer_written(&bh))
880b2e0d162SDan Williams 			error = copy_user_bh(new_page, inode, &bh, vaddr);
881f7ca90b1SMatthew Wilcox 		else
882f7ca90b1SMatthew Wilcox 			clear_user_highpage(new_page, vaddr);
883f7ca90b1SMatthew Wilcox 		if (error)
884ac401cc7SJan Kara 			goto unlock_entry;
885ac401cc7SJan Kara 		if (!radix_tree_exceptional_entry(entry)) {
886ac401cc7SJan Kara 			vmf->page = entry;
887ac401cc7SJan Kara 		} else {
888ac401cc7SJan Kara 			unlock_mapping_entry(mapping, vmf->pgoff);
8890f90cc66SRoss Zwisler 			i_mmap_lock_read(mapping);
890ac401cc7SJan Kara 			vmf->page = NULL;
891ac401cc7SJan Kara 		}
892f7ca90b1SMatthew Wilcox 		return VM_FAULT_LOCKED;
893f7ca90b1SMatthew Wilcox 	}
894f7ca90b1SMatthew Wilcox 
895ac401cc7SJan Kara 	if (!buffer_mapped(&bh)) {
896ac401cc7SJan Kara 		if (vmf->flags & FAULT_FLAG_WRITE) {
897ac401cc7SJan Kara 			error = get_block(inode, block, &bh, 1);
898ac401cc7SJan Kara 			count_vm_event(PGMAJFAULT);
899ac401cc7SJan Kara 			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
900ac401cc7SJan Kara 			major = VM_FAULT_MAJOR;
901ac401cc7SJan Kara 			if (!error && (bh.b_size < PAGE_SIZE))
902ac401cc7SJan Kara 				error = -EIO;
903ac401cc7SJan Kara 			if (error)
904ac401cc7SJan Kara 				goto unlock_entry;
905ac401cc7SJan Kara 		} else {
906ac401cc7SJan Kara 			return dax_load_hole(mapping, entry, vmf);
907ac401cc7SJan Kara 		}
908f7ca90b1SMatthew Wilcox 	}
909f7ca90b1SMatthew Wilcox 
91002fbd139SJan Kara 	/* Filesystem should not return unwritten buffers to us! */
9112b10945cSJan Kara 	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
912ac401cc7SJan Kara 	error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
913ac401cc7SJan Kara  unlock_entry:
914ac401cc7SJan Kara 	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
915f7ca90b1SMatthew Wilcox  out:
916f7ca90b1SMatthew Wilcox 	if (error == -ENOMEM)
917f7ca90b1SMatthew Wilcox 		return VM_FAULT_OOM | major;
918f7ca90b1SMatthew Wilcox 	/* -EBUSY is fine, somebody else faulted on the same PTE */
919f7ca90b1SMatthew Wilcox 	if ((error < 0) && (error != -EBUSY))
920f7ca90b1SMatthew Wilcox 		return VM_FAULT_SIGBUS | major;
921f7ca90b1SMatthew Wilcox 	return VM_FAULT_NOPAGE | major;
922f7ca90b1SMatthew Wilcox }
923ce5c5d55SDave Chinner EXPORT_SYMBOL(__dax_fault);
924f7ca90b1SMatthew Wilcox 
925f7ca90b1SMatthew Wilcox /**
926f7ca90b1SMatthew Wilcox  * dax_fault - handle a page fault on a DAX file
927f7ca90b1SMatthew Wilcox  * @vma: The virtual memory area where the fault occurred
928f7ca90b1SMatthew Wilcox  * @vmf: The description of the fault
929f7ca90b1SMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
930f7ca90b1SMatthew Wilcox  *
931f7ca90b1SMatthew Wilcox  * When a page fault occurs, filesystems may call this helper in their
932f7ca90b1SMatthew Wilcox  * fault handler for DAX files.
933f7ca90b1SMatthew Wilcox  */
934f7ca90b1SMatthew Wilcox int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
93502fbd139SJan Kara 	      get_block_t get_block)
936f7ca90b1SMatthew Wilcox {
937f7ca90b1SMatthew Wilcox 	int result;
938f7ca90b1SMatthew Wilcox 	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
939f7ca90b1SMatthew Wilcox 
940f7ca90b1SMatthew Wilcox 	if (vmf->flags & FAULT_FLAG_WRITE) {
941f7ca90b1SMatthew Wilcox 		sb_start_pagefault(sb);
942f7ca90b1SMatthew Wilcox 		file_update_time(vma->vm_file);
943f7ca90b1SMatthew Wilcox 	}
94402fbd139SJan Kara 	result = __dax_fault(vma, vmf, get_block);
945f7ca90b1SMatthew Wilcox 	if (vmf->flags & FAULT_FLAG_WRITE)
946f7ca90b1SMatthew Wilcox 		sb_end_pagefault(sb);
947f7ca90b1SMatthew Wilcox 
948f7ca90b1SMatthew Wilcox 	return result;
949f7ca90b1SMatthew Wilcox }
950f7ca90b1SMatthew Wilcox EXPORT_SYMBOL_GPL(dax_fault);
9514c0ccfefSMatthew Wilcox 
952348e967aSJan Kara #if defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
 * Mask selecting the 'colour' of a page offset, i.e. its low bits giving the
 * position of the page within a PMD-sized range.  It is used repeatedly by
 * the PMD fault handler below.
 */
#define PG_PMD_COLOUR	((PMD_SIZE / PAGE_SIZE) - 1)
958844f35dbSMatthew Wilcox 
959cbb38e41SDan Williams static void __dax_dbg(struct buffer_head *bh, unsigned long address,
960cbb38e41SDan Williams 		const char *reason, const char *fn)
961cbb38e41SDan Williams {
962cbb38e41SDan Williams 	if (bh) {
963cbb38e41SDan Williams 		char bname[BDEVNAME_SIZE];
964cbb38e41SDan Williams 		bdevname(bh->b_bdev, bname);
965cbb38e41SDan Williams 		pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
966cbb38e41SDan Williams 			"length %zd fallback: %s\n", fn, current->comm,
967cbb38e41SDan Williams 			address, bname, bh->b_state, (u64)bh->b_blocknr,
968cbb38e41SDan Williams 			bh->b_size, reason);
969cbb38e41SDan Williams 	} else {
970cbb38e41SDan Williams 		pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
971cbb38e41SDan Williams 			current->comm, address, reason);
972cbb38e41SDan Williams 	}
973cbb38e41SDan Williams }
974cbb38e41SDan Williams 
975cbb38e41SDan Williams #define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")
976cbb38e41SDan Williams 
977844f35dbSMatthew Wilcox int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
97802fbd139SJan Kara 		pmd_t *pmd, unsigned int flags, get_block_t get_block)
979844f35dbSMatthew Wilcox {
980844f35dbSMatthew Wilcox 	struct file *file = vma->vm_file;
981844f35dbSMatthew Wilcox 	struct address_space *mapping = file->f_mapping;
982844f35dbSMatthew Wilcox 	struct inode *inode = mapping->host;
983844f35dbSMatthew Wilcox 	struct buffer_head bh;
984844f35dbSMatthew Wilcox 	unsigned blkbits = inode->i_blkbits;
985844f35dbSMatthew Wilcox 	unsigned long pmd_addr = address & PMD_MASK;
986844f35dbSMatthew Wilcox 	bool write = flags & FAULT_FLAG_WRITE;
987b2e0d162SDan Williams 	struct block_device *bdev;
988844f35dbSMatthew Wilcox 	pgoff_t size, pgoff;
989b2e0d162SDan Williams 	sector_t block;
990ac401cc7SJan Kara 	int result = 0;
9919973c98eSRoss Zwisler 	bool alloc = false;
992844f35dbSMatthew Wilcox 
993c046c321SDan Williams 	/* dax pmd mappings require pfn_t_devmap() */
994ee82c9edSDan Williams 	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
995ee82c9edSDan Williams 		return VM_FAULT_FALLBACK;
996ee82c9edSDan Williams 
997844f35dbSMatthew Wilcox 	/* Fall back to PTEs if we're going to COW */
99859bf4fb9SToshi Kani 	if (write && !(vma->vm_flags & VM_SHARED)) {
99959bf4fb9SToshi Kani 		split_huge_pmd(vma, pmd, address);
1000cbb38e41SDan Williams 		dax_pmd_dbg(NULL, address, "cow write");
1001844f35dbSMatthew Wilcox 		return VM_FAULT_FALLBACK;
100259bf4fb9SToshi Kani 	}
1003844f35dbSMatthew Wilcox 	/* If the PMD would extend outside the VMA */
1004cbb38e41SDan Williams 	if (pmd_addr < vma->vm_start) {
1005cbb38e41SDan Williams 		dax_pmd_dbg(NULL, address, "vma start unaligned");
1006844f35dbSMatthew Wilcox 		return VM_FAULT_FALLBACK;
1007cbb38e41SDan Williams 	}
1008cbb38e41SDan Williams 	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
1009cbb38e41SDan Williams 		dax_pmd_dbg(NULL, address, "vma end unaligned");
1010844f35dbSMatthew Wilcox 		return VM_FAULT_FALLBACK;
1011cbb38e41SDan Williams 	}
1012844f35dbSMatthew Wilcox 
10133fdd1b47SMatthew Wilcox 	pgoff = linear_page_index(vma, pmd_addr);
1014844f35dbSMatthew Wilcox 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
1015844f35dbSMatthew Wilcox 	if (pgoff >= size)
1016844f35dbSMatthew Wilcox 		return VM_FAULT_SIGBUS;
1017844f35dbSMatthew Wilcox 	/* If the PMD would cover blocks out of the file */
1018cbb38e41SDan Williams 	if ((pgoff | PG_PMD_COLOUR) >= size) {
1019cbb38e41SDan Williams 		dax_pmd_dbg(NULL, address,
1020cbb38e41SDan Williams 				"offset + huge page size > file size");
1021844f35dbSMatthew Wilcox 		return VM_FAULT_FALLBACK;
1022cbb38e41SDan Williams 	}
1023844f35dbSMatthew Wilcox 
1024844f35dbSMatthew Wilcox 	memset(&bh, 0, sizeof(bh));
1025d4bbe706SRoss Zwisler 	bh.b_bdev = inode->i_sb->s_bdev;
1026844f35dbSMatthew Wilcox 	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
1027844f35dbSMatthew Wilcox 
1028844f35dbSMatthew Wilcox 	bh.b_size = PMD_SIZE;
10299973c98eSRoss Zwisler 
10309973c98eSRoss Zwisler 	if (get_block(inode, block, &bh, 0) != 0)
1031844f35dbSMatthew Wilcox 		return VM_FAULT_SIGBUS;
10329973c98eSRoss Zwisler 
10339973c98eSRoss Zwisler 	if (!buffer_mapped(&bh) && write) {
10349973c98eSRoss Zwisler 		if (get_block(inode, block, &bh, 1) != 0)
10359973c98eSRoss Zwisler 			return VM_FAULT_SIGBUS;
10369973c98eSRoss Zwisler 		alloc = true;
10372b10945cSJan Kara 		WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
10389973c98eSRoss Zwisler 	}
10399973c98eSRoss Zwisler 
1040b2e0d162SDan Williams 	bdev = bh.b_bdev;
1041844f35dbSMatthew Wilcox 
1042844f35dbSMatthew Wilcox 	/*
1043844f35dbSMatthew Wilcox 	 * If the filesystem isn't willing to tell us the length of a hole,
1044844f35dbSMatthew Wilcox 	 * just fall back to PTEs.  Calling get_block 512 times in a loop
1045844f35dbSMatthew Wilcox 	 * would be silly.
1046844f35dbSMatthew Wilcox 	 */
1047cbb38e41SDan Williams 	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
1048cbb38e41SDan Williams 		dax_pmd_dbg(&bh, address, "allocated block too small");
10499973c98eSRoss Zwisler 		return VM_FAULT_FALLBACK;
1050cbb38e41SDan Williams 	}
1051844f35dbSMatthew Wilcox 
10529973c98eSRoss Zwisler 	/*
10539973c98eSRoss Zwisler 	 * If we allocated new storage, make sure no process has any
10549973c98eSRoss Zwisler 	 * zero pages covering this hole
10559973c98eSRoss Zwisler 	 */
10569973c98eSRoss Zwisler 	if (alloc) {
10579973c98eSRoss Zwisler 		loff_t lstart = pgoff << PAGE_SHIFT;
10589973c98eSRoss Zwisler 		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
10599973c98eSRoss Zwisler 
10609973c98eSRoss Zwisler 		truncate_pagecache_range(inode, lstart, lend);
10619973c98eSRoss Zwisler 	}
10629973c98eSRoss Zwisler 
10630f90cc66SRoss Zwisler 	i_mmap_lock_read(mapping);
106446c043edSKirill A. Shutemov 
1065b9953536SJan Kara 	if (!write && !buffer_mapped(&bh)) {
1066844f35dbSMatthew Wilcox 		spinlock_t *ptl;
1067d295e341SKirill A. Shutemov 		pmd_t entry;
1068844f35dbSMatthew Wilcox 		struct page *zero_page = get_huge_zero_page();
1069d295e341SKirill A. Shutemov 
1070cbb38e41SDan Williams 		if (unlikely(!zero_page)) {
1071cbb38e41SDan Williams 			dax_pmd_dbg(&bh, address, "no zero page");
1072844f35dbSMatthew Wilcox 			goto fallback;
1073cbb38e41SDan Williams 		}
1074844f35dbSMatthew Wilcox 
1075d295e341SKirill A. Shutemov 		ptl = pmd_lock(vma->vm_mm, pmd);
1076d295e341SKirill A. Shutemov 		if (!pmd_none(*pmd)) {
1077844f35dbSMatthew Wilcox 			spin_unlock(ptl);
1078cbb38e41SDan Williams 			dax_pmd_dbg(&bh, address, "pmd already present");
1079d295e341SKirill A. Shutemov 			goto fallback;
1080d295e341SKirill A. Shutemov 		}
1081d295e341SKirill A. Shutemov 
1082cbb38e41SDan Williams 		dev_dbg(part_to_dev(bdev->bd_part),
1083cbb38e41SDan Williams 				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
1084cbb38e41SDan Williams 				__func__, current->comm, address,
1085cbb38e41SDan Williams 				(unsigned long long) to_sector(&bh, inode));
1086cbb38e41SDan Williams 
1087d295e341SKirill A. Shutemov 		entry = mk_pmd(zero_page, vma->vm_page_prot);
1088d295e341SKirill A. Shutemov 		entry = pmd_mkhuge(entry);
1089d295e341SKirill A. Shutemov 		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
1090844f35dbSMatthew Wilcox 		result = VM_FAULT_NOPAGE;
1091d295e341SKirill A. Shutemov 		spin_unlock(ptl);
1092844f35dbSMatthew Wilcox 	} else {
1093b2e0d162SDan Williams 		struct blk_dax_ctl dax = {
1094b2e0d162SDan Williams 			.sector = to_sector(&bh, inode),
1095b2e0d162SDan Williams 			.size = PMD_SIZE,
1096b2e0d162SDan Williams 		};
1097b2e0d162SDan Williams 		long length = dax_map_atomic(bdev, &dax);
1098b2e0d162SDan Williams 
1099844f35dbSMatthew Wilcox 		if (length < 0) {
11008b3db979SDan Williams 			dax_pmd_dbg(&bh, address, "dax-error fallback");
11018b3db979SDan Williams 			goto fallback;
1102844f35dbSMatthew Wilcox 		}
1103cbb38e41SDan Williams 		if (length < PMD_SIZE) {
1104cbb38e41SDan Williams 			dax_pmd_dbg(&bh, address, "dax-length too small");
1105cbb38e41SDan Williams 			dax_unmap_atomic(bdev, &dax);
1106cbb38e41SDan Williams 			goto fallback;
1107cbb38e41SDan Williams 		}
1108cbb38e41SDan Williams 		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
1109cbb38e41SDan Williams 			dax_pmd_dbg(&bh, address, "pfn unaligned");
1110b2e0d162SDan Williams 			dax_unmap_atomic(bdev, &dax);
1111844f35dbSMatthew Wilcox 			goto fallback;
1112b2e0d162SDan Williams 		}
1113844f35dbSMatthew Wilcox 
1114c046c321SDan Williams 		if (!pfn_t_devmap(dax.pfn)) {
1115b2e0d162SDan Williams 			dax_unmap_atomic(bdev, &dax);
1116cbb38e41SDan Williams 			dax_pmd_dbg(&bh, address, "pfn not in memmap");
1117152d7bd8SDan Williams 			goto fallback;
1118b2e0d162SDan Williams 		}
1119b2e0d162SDan Williams 		dax_unmap_atomic(bdev, &dax);
11200f90cc66SRoss Zwisler 
11219973c98eSRoss Zwisler 		/*
11229973c98eSRoss Zwisler 		 * For PTE faults we insert a radix tree entry for reads, and
11239973c98eSRoss Zwisler 		 * leave it clean.  Then on the first write we dirty the radix
11249973c98eSRoss Zwisler 		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
11259973c98eSRoss Zwisler 		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
11269973c98eSRoss Zwisler 		 * call into get_block() to translate the pgoff to a sector in
11279973c98eSRoss Zwisler 		 * order to be able to create a new radix tree entry.
11289973c98eSRoss Zwisler 		 *
11299973c98eSRoss Zwisler 		 * The PMD path doesn't have an equivalent to
11309973c98eSRoss Zwisler 		 * dax_pfn_mkwrite(), though, so for a read followed by a
11319973c98eSRoss Zwisler 		 * write we traverse all the way through __dax_pmd_fault()
11329973c98eSRoss Zwisler 		 * twice.  This means we can just skip inserting a radix tree
11339973c98eSRoss Zwisler 		 * entry completely on the initial read and just wait until
11349973c98eSRoss Zwisler 		 * the write to insert a dirty entry.
11359973c98eSRoss Zwisler 		 */
11369973c98eSRoss Zwisler 		if (write) {
1137ac401cc7SJan Kara 			/*
1138ac401cc7SJan Kara 			 * We should insert radix-tree entry and dirty it here.
1139ac401cc7SJan Kara 			 * For now this is broken...
1140ac401cc7SJan Kara 			 */
11419973c98eSRoss Zwisler 		}
11429973c98eSRoss Zwisler 
1143cbb38e41SDan Williams 		dev_dbg(part_to_dev(bdev->bd_part),
1144cbb38e41SDan Williams 				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
1145cbb38e41SDan Williams 				__func__, current->comm, address,
1146cbb38e41SDan Williams 				pfn_t_to_pfn(dax.pfn),
1147cbb38e41SDan Williams 				(unsigned long long) dax.sector);
114834c0fd54SDan Williams 		result |= vmf_insert_pfn_pmd(vma, address, pmd,
1149f25748e3SDan Williams 				dax.pfn, write);
1150844f35dbSMatthew Wilcox 	}
1151844f35dbSMatthew Wilcox 
1152844f35dbSMatthew Wilcox  out:
11530f90cc66SRoss Zwisler 	i_mmap_unlock_read(mapping);
11540f90cc66SRoss Zwisler 
1155844f35dbSMatthew Wilcox 	return result;
1156844f35dbSMatthew Wilcox 
1157844f35dbSMatthew Wilcox  fallback:
1158844f35dbSMatthew Wilcox 	count_vm_event(THP_FAULT_FALLBACK);
1159844f35dbSMatthew Wilcox 	result = VM_FAULT_FALLBACK;
1160844f35dbSMatthew Wilcox 	goto out;
1161844f35dbSMatthew Wilcox }
1162844f35dbSMatthew Wilcox EXPORT_SYMBOL_GPL(__dax_pmd_fault);
1163844f35dbSMatthew Wilcox 
1164844f35dbSMatthew Wilcox /**
1165844f35dbSMatthew Wilcox  * dax_pmd_fault - handle a PMD fault on a DAX file
1166844f35dbSMatthew Wilcox  * @vma: The virtual memory area where the fault occurred
1167844f35dbSMatthew Wilcox  * @vmf: The description of the fault
1168844f35dbSMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
1169844f35dbSMatthew Wilcox  *
1170844f35dbSMatthew Wilcox  * When a page fault occurs, filesystems may call this helper in their
1171844f35dbSMatthew Wilcox  * pmd_fault handler for DAX files.
1172844f35dbSMatthew Wilcox  */
1173844f35dbSMatthew Wilcox int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
117402fbd139SJan Kara 			pmd_t *pmd, unsigned int flags, get_block_t get_block)
1175844f35dbSMatthew Wilcox {
1176844f35dbSMatthew Wilcox 	int result;
1177844f35dbSMatthew Wilcox 	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
1178844f35dbSMatthew Wilcox 
1179844f35dbSMatthew Wilcox 	if (flags & FAULT_FLAG_WRITE) {
1180844f35dbSMatthew Wilcox 		sb_start_pagefault(sb);
1181844f35dbSMatthew Wilcox 		file_update_time(vma->vm_file);
1182844f35dbSMatthew Wilcox 	}
118302fbd139SJan Kara 	result = __dax_pmd_fault(vma, address, pmd, flags, get_block);
1184844f35dbSMatthew Wilcox 	if (flags & FAULT_FLAG_WRITE)
1185844f35dbSMatthew Wilcox 		sb_end_pagefault(sb);
1186844f35dbSMatthew Wilcox 
1187844f35dbSMatthew Wilcox 	return result;
1188844f35dbSMatthew Wilcox }
1189844f35dbSMatthew Wilcox EXPORT_SYMBOL_GPL(dax_pmd_fault);
1190dd8a2b6cSValentin Rothberg #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1191844f35dbSMatthew Wilcox 
11924c0ccfefSMatthew Wilcox /**
11930e3b210cSBoaz Harrosh  * dax_pfn_mkwrite - handle first write to DAX page
11940e3b210cSBoaz Harrosh  * @vma: The virtual memory area where the fault occurred
11950e3b210cSBoaz Harrosh  * @vmf: The description of the fault
11960e3b210cSBoaz Harrosh  */
11970e3b210cSBoaz Harrosh int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
11980e3b210cSBoaz Harrosh {
11999973c98eSRoss Zwisler 	struct file *file = vma->vm_file;
1200ac401cc7SJan Kara 	struct address_space *mapping = file->f_mapping;
1201ac401cc7SJan Kara 	void *entry;
1202ac401cc7SJan Kara 	pgoff_t index = vmf->pgoff;
12030e3b210cSBoaz Harrosh 
1204ac401cc7SJan Kara 	spin_lock_irq(&mapping->tree_lock);
1205ac401cc7SJan Kara 	entry = get_unlocked_mapping_entry(mapping, index, NULL);
1206ac401cc7SJan Kara 	if (!entry || !radix_tree_exceptional_entry(entry))
1207ac401cc7SJan Kara 		goto out;
1208ac401cc7SJan Kara 	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
1209ac401cc7SJan Kara 	put_unlocked_mapping_entry(mapping, index, entry);
1210ac401cc7SJan Kara out:
1211ac401cc7SJan Kara 	spin_unlock_irq(&mapping->tree_lock);
12120e3b210cSBoaz Harrosh 	return VM_FAULT_NOPAGE;
12130e3b210cSBoaz Harrosh }
12140e3b210cSBoaz Harrosh EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
12150e3b210cSBoaz Harrosh 
12164b0228faSVishal Verma static bool dax_range_is_aligned(struct block_device *bdev,
12174b0228faSVishal Verma 				 unsigned int offset, unsigned int length)
12184b0228faSVishal Verma {
12194b0228faSVishal Verma 	unsigned short sector_size = bdev_logical_block_size(bdev);
12204b0228faSVishal Verma 
12214b0228faSVishal Verma 	if (!IS_ALIGNED(offset, sector_size))
12224b0228faSVishal Verma 		return false;
12234b0228faSVishal Verma 	if (!IS_ALIGNED(length, sector_size))
12244b0228faSVishal Verma 		return false;
12254b0228faSVishal Verma 
12264b0228faSVishal Verma 	return true;
12274b0228faSVishal Verma }
12284b0228faSVishal Verma 
1229679c8bd3SChristoph Hellwig int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
1230679c8bd3SChristoph Hellwig 		unsigned int offset, unsigned int length)
1231679c8bd3SChristoph Hellwig {
1232679c8bd3SChristoph Hellwig 	struct blk_dax_ctl dax = {
1233679c8bd3SChristoph Hellwig 		.sector		= sector,
1234679c8bd3SChristoph Hellwig 		.size		= PAGE_SIZE,
1235679c8bd3SChristoph Hellwig 	};
1236679c8bd3SChristoph Hellwig 
12374b0228faSVishal Verma 	if (dax_range_is_aligned(bdev, offset, length)) {
12384b0228faSVishal Verma 		sector_t start_sector = dax.sector + (offset >> 9);
12394b0228faSVishal Verma 
12404b0228faSVishal Verma 		return blkdev_issue_zeroout(bdev, start_sector,
12414b0228faSVishal Verma 				length >> 9, GFP_NOFS, true);
12424b0228faSVishal Verma 	} else {
1243679c8bd3SChristoph Hellwig 		if (dax_map_atomic(bdev, &dax) < 0)
1244679c8bd3SChristoph Hellwig 			return PTR_ERR(dax.addr);
1245679c8bd3SChristoph Hellwig 		clear_pmem(dax.addr + offset, length);
1246679c8bd3SChristoph Hellwig 		wmb_pmem();
1247679c8bd3SChristoph Hellwig 		dax_unmap_atomic(bdev, &dax);
12484b0228faSVishal Verma 	}
1249679c8bd3SChristoph Hellwig 	return 0;
1250679c8bd3SChristoph Hellwig }
1251679c8bd3SChristoph Hellwig EXPORT_SYMBOL_GPL(__dax_zero_page_range);
1252679c8bd3SChristoph Hellwig 
12530e3b210cSBoaz Harrosh /**
125425726bc1SMatthew Wilcox  * dax_zero_page_range - zero a range within a page of a DAX file
12554c0ccfefSMatthew Wilcox  * @inode: The file being truncated
12564c0ccfefSMatthew Wilcox  * @from: The file offset that is being truncated to
125725726bc1SMatthew Wilcox  * @length: The number of bytes to zero
12584c0ccfefSMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
12594c0ccfefSMatthew Wilcox  *
126025726bc1SMatthew Wilcox  * This function can be called by a filesystem when it is zeroing part of a
126125726bc1SMatthew Wilcox  * page in a DAX file.  This is intended for hole-punch operations.  If
126225726bc1SMatthew Wilcox  * you are truncating a file, the helper function dax_truncate_page() may be
126325726bc1SMatthew Wilcox  * more convenient.
12644c0ccfefSMatthew Wilcox  */
126525726bc1SMatthew Wilcox int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
126625726bc1SMatthew Wilcox 							get_block_t get_block)
12674c0ccfefSMatthew Wilcox {
12684c0ccfefSMatthew Wilcox 	struct buffer_head bh;
126909cbfeafSKirill A. Shutemov 	pgoff_t index = from >> PAGE_SHIFT;
127009cbfeafSKirill A. Shutemov 	unsigned offset = from & (PAGE_SIZE-1);
12714c0ccfefSMatthew Wilcox 	int err;
12724c0ccfefSMatthew Wilcox 
12734c0ccfefSMatthew Wilcox 	/* Block boundary? Nothing to do */
12744c0ccfefSMatthew Wilcox 	if (!length)
12754c0ccfefSMatthew Wilcox 		return 0;
127609cbfeafSKirill A. Shutemov 	BUG_ON((offset + length) > PAGE_SIZE);
12774c0ccfefSMatthew Wilcox 
12784c0ccfefSMatthew Wilcox 	memset(&bh, 0, sizeof(bh));
1279eab95db6SRoss Zwisler 	bh.b_bdev = inode->i_sb->s_bdev;
128009cbfeafSKirill A. Shutemov 	bh.b_size = PAGE_SIZE;
12814c0ccfefSMatthew Wilcox 	err = get_block(inode, index, &bh, 0);
1282679c8bd3SChristoph Hellwig 	if (err < 0 || !buffer_written(&bh))
12834c0ccfefSMatthew Wilcox 		return err;
1284b2e0d162SDan Williams 
1285679c8bd3SChristoph Hellwig 	return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
1286679c8bd3SChristoph Hellwig 			offset, length);
12874c0ccfefSMatthew Wilcox }
128825726bc1SMatthew Wilcox EXPORT_SYMBOL_GPL(dax_zero_page_range);
128925726bc1SMatthew Wilcox 
129025726bc1SMatthew Wilcox /**
129125726bc1SMatthew Wilcox  * dax_truncate_page - handle a partial page being truncated in a DAX file
129225726bc1SMatthew Wilcox  * @inode: The file being truncated
129325726bc1SMatthew Wilcox  * @from: The file offset that is being truncated to
129425726bc1SMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
129525726bc1SMatthew Wilcox  *
129625726bc1SMatthew Wilcox  * Similar to block_truncate_page(), this function can be called by a
129725726bc1SMatthew Wilcox  * filesystem when it is truncating a DAX file to handle the partial page.
129825726bc1SMatthew Wilcox  */
129925726bc1SMatthew Wilcox int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
130025726bc1SMatthew Wilcox {
130109cbfeafSKirill A. Shutemov 	unsigned length = PAGE_ALIGN(from) - from;
130225726bc1SMatthew Wilcox 	return dax_zero_page_range(inode, from, length, get_block);
130325726bc1SMatthew Wilcox }
13044c0ccfefSMatthew Wilcox EXPORT_SYMBOL_GPL(dax_truncate_page);
1305