xref: /openbmc/linux/fs/dax.c (revision 642261ac)
1d475c634SMatthew Wilcox /*
2d475c634SMatthew Wilcox  * fs/dax.c - Direct Access filesystem code
3d475c634SMatthew Wilcox  * Copyright (c) 2013-2014 Intel Corporation
4d475c634SMatthew Wilcox  * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
5d475c634SMatthew Wilcox  * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
6d475c634SMatthew Wilcox  *
7d475c634SMatthew Wilcox  * This program is free software; you can redistribute it and/or modify it
8d475c634SMatthew Wilcox  * under the terms and conditions of the GNU General Public License,
9d475c634SMatthew Wilcox  * version 2, as published by the Free Software Foundation.
10d475c634SMatthew Wilcox  *
11d475c634SMatthew Wilcox  * This program is distributed in the hope it will be useful, but WITHOUT
12d475c634SMatthew Wilcox  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13d475c634SMatthew Wilcox  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
14d475c634SMatthew Wilcox  * more details.
15d475c634SMatthew Wilcox  */
16d475c634SMatthew Wilcox 
17d475c634SMatthew Wilcox #include <linux/atomic.h>
18d475c634SMatthew Wilcox #include <linux/blkdev.h>
19d475c634SMatthew Wilcox #include <linux/buffer_head.h>
20d77e92e2SRoss Zwisler #include <linux/dax.h>
21d475c634SMatthew Wilcox #include <linux/fs.h>
22d475c634SMatthew Wilcox #include <linux/genhd.h>
23f7ca90b1SMatthew Wilcox #include <linux/highmem.h>
24f7ca90b1SMatthew Wilcox #include <linux/memcontrol.h>
25f7ca90b1SMatthew Wilcox #include <linux/mm.h>
26d475c634SMatthew Wilcox #include <linux/mutex.h>
279973c98eSRoss Zwisler #include <linux/pagevec.h>
282765cfbbSRoss Zwisler #include <linux/pmem.h>
29289c6aedSMatthew Wilcox #include <linux/sched.h>
30d475c634SMatthew Wilcox #include <linux/uio.h>
31f7ca90b1SMatthew Wilcox #include <linux/vmstat.h>
3234c0fd54SDan Williams #include <linux/pfn_t.h>
330e749e54SDan Williams #include <linux/sizes.h>
34a254e568SChristoph Hellwig #include <linux/iomap.h>
35a254e568SChristoph Hellwig #include "internal.h"
36d475c634SMatthew Wilcox 
37ac401cc7SJan Kara /* We choose 4096 entries - same as per-zone page wait tables */
38ac401cc7SJan Kara #define DAX_WAIT_TABLE_BITS 12
39ac401cc7SJan Kara #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
40ac401cc7SJan Kara 
41ce95ab0fSRoss Zwisler static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
42ac401cc7SJan Kara 
43ac401cc7SJan Kara static int __init init_dax_wait_table(void)
44ac401cc7SJan Kara {
45ac401cc7SJan Kara 	int i;
46ac401cc7SJan Kara 
47ac401cc7SJan Kara 	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
48ac401cc7SJan Kara 		init_waitqueue_head(wait_table + i);
49ac401cc7SJan Kara 	return 0;
50ac401cc7SJan Kara }
51ac401cc7SJan Kara fs_initcall(init_dax_wait_table);
52ac401cc7SJan Kara 
53b2e0d162SDan Williams static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
54b2e0d162SDan Williams {
55b2e0d162SDan Williams 	struct request_queue *q = bdev->bd_queue;
56b2e0d162SDan Williams 	long rc = -EIO;
57b2e0d162SDan Williams 
587a9eb206SDan Williams 	dax->addr = ERR_PTR(-EIO);
59b2e0d162SDan Williams 	if (blk_queue_enter(q, true) != 0)
60b2e0d162SDan Williams 		return rc;
61b2e0d162SDan Williams 
62b2e0d162SDan Williams 	rc = bdev_direct_access(bdev, dax);
63b2e0d162SDan Williams 	if (rc < 0) {
647a9eb206SDan Williams 		dax->addr = ERR_PTR(rc);
65b2e0d162SDan Williams 		blk_queue_exit(q);
66b2e0d162SDan Williams 		return rc;
67b2e0d162SDan Williams 	}
68b2e0d162SDan Williams 	return rc;
69b2e0d162SDan Williams }
70b2e0d162SDan Williams 
71b2e0d162SDan Williams static void dax_unmap_atomic(struct block_device *bdev,
72b2e0d162SDan Williams 		const struct blk_dax_ctl *dax)
73b2e0d162SDan Williams {
74b2e0d162SDan Williams 	if (IS_ERR(dax->addr))
75b2e0d162SDan Williams 		return;
76b2e0d162SDan Williams 	blk_queue_exit(bdev->bd_queue);
77b2e0d162SDan Williams }
78b2e0d162SDan Williams 
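/*
 * Editorial sketch (not part of the original file): dax_map_atomic() and
 * dax_unmap_atomic() are used as a pair.  A successful map takes a reference
 * on the request queue and fills dax->addr and dax->pfn; unmap is safe to
 * call even after a failed map because it checks IS_ERR(dax->addr).  A
 * caller looks roughly like this ("sector" and "size" describe whatever
 * region the caller needs):
 *
 *	struct blk_dax_ctl dax = { .sector = sector, .size = size };
 *	long len = dax_map_atomic(bdev, &dax);
 *
 *	if (len < 0)
 *		return len;
 *	... access up to len bytes through dax.addr ...
 *	dax_unmap_atomic(bdev, &dax);
 */
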
79642261acSRoss Zwisler static int dax_is_pmd_entry(void *entry)
80642261acSRoss Zwisler {
81642261acSRoss Zwisler 	return (unsigned long)entry & RADIX_DAX_PMD;
82642261acSRoss Zwisler }
83642261acSRoss Zwisler 
84642261acSRoss Zwisler static int dax_is_pte_entry(void *entry)
85642261acSRoss Zwisler {
86642261acSRoss Zwisler 	return !((unsigned long)entry & RADIX_DAX_PMD);
87642261acSRoss Zwisler }
88642261acSRoss Zwisler 
89642261acSRoss Zwisler static int dax_is_zero_entry(void *entry)
90642261acSRoss Zwisler {
91642261acSRoss Zwisler 	return (unsigned long)entry & RADIX_DAX_HZP;
92642261acSRoss Zwisler }
93642261acSRoss Zwisler 
94642261acSRoss Zwisler static int dax_is_empty_entry(void *entry)
95642261acSRoss Zwisler {
96642261acSRoss Zwisler 	return (unsigned long)entry & RADIX_DAX_EMPTY;
97642261acSRoss Zwisler }
98642261acSRoss Zwisler 
99d1a5f2b4SDan Williams struct page *read_dax_sector(struct block_device *bdev, sector_t n)
100d1a5f2b4SDan Williams {
101d1a5f2b4SDan Williams 	struct page *page = alloc_pages(GFP_KERNEL, 0);
102d1a5f2b4SDan Williams 	struct blk_dax_ctl dax = {
103d1a5f2b4SDan Williams 		.size = PAGE_SIZE,
104d1a5f2b4SDan Williams 		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
105d1a5f2b4SDan Williams 	};
106d1a5f2b4SDan Williams 	long rc;
107d1a5f2b4SDan Williams 
108d1a5f2b4SDan Williams 	if (!page)
109d1a5f2b4SDan Williams 		return ERR_PTR(-ENOMEM);
110d1a5f2b4SDan Williams 
111d1a5f2b4SDan Williams 	rc = dax_map_atomic(bdev, &dax);
112d1a5f2b4SDan Williams 	if (rc < 0)
113d1a5f2b4SDan Williams 		return ERR_PTR(rc);
114d1a5f2b4SDan Williams 	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
115d1a5f2b4SDan Williams 	dax_unmap_atomic(bdev, &dax);
116d1a5f2b4SDan Williams 	return page;
117d1a5f2b4SDan Williams }
118d1a5f2b4SDan Williams 
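/*
 * Editorial note (not from the original source): the .sector initializer in
 * read_dax_sector() rounds 'n' down to the first sector of the containing
 * page, since the mapping above is page sized.  With 4K pages
 * (PAGE_SIZE / 512 == 8), sector 11 rounds down to 8 and sector 16 stays 16.
 */
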
119d475c634SMatthew Wilcox static bool buffer_written(struct buffer_head *bh)
120d475c634SMatthew Wilcox {
121d475c634SMatthew Wilcox 	return buffer_mapped(bh) && !buffer_unwritten(bh);
122d475c634SMatthew Wilcox }
123d475c634SMatthew Wilcox 
124b2e0d162SDan Williams static sector_t to_sector(const struct buffer_head *bh,
125b2e0d162SDan Williams 		const struct inode *inode)
126b2e0d162SDan Williams {
127b2e0d162SDan Williams 	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
128b2e0d162SDan Williams 
129b2e0d162SDan Williams 	return sector;
130b2e0d162SDan Williams }
131b2e0d162SDan Williams 
132a95cd631SOmar Sandoval static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
133d475c634SMatthew Wilcox 		      loff_t start, loff_t end, get_block_t get_block,
134d475c634SMatthew Wilcox 		      struct buffer_head *bh)
135d475c634SMatthew Wilcox {
136b2e0d162SDan Williams 	loff_t pos = start, max = start, bh_max = start;
13714df6a4eSDan Williams 	bool hole = false;
138b2e0d162SDan Williams 	struct block_device *bdev = NULL;
139b2e0d162SDan Williams 	int rw = iov_iter_rw(iter), rc;
140b2e0d162SDan Williams 	long map_len = 0;
141b2e0d162SDan Williams 	struct blk_dax_ctl dax = {
1427a9eb206SDan Williams 		.addr = ERR_PTR(-EIO),
143b2e0d162SDan Williams 	};
144069c77bcSJan Kara 	unsigned blkbits = inode->i_blkbits;
145069c77bcSJan Kara 	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
146069c77bcSJan Kara 								>> blkbits;
147d475c634SMatthew Wilcox 
148b2e0d162SDan Williams 	if (rw == READ)
149d475c634SMatthew Wilcox 		end = min(end, i_size_read(inode));
150d475c634SMatthew Wilcox 
151d475c634SMatthew Wilcox 	while (pos < end) {
1522765cfbbSRoss Zwisler 		size_t len;
153d475c634SMatthew Wilcox 		if (pos == max) {
154e94f5a22SJeff Moyer 			long page = pos >> PAGE_SHIFT;
155e94f5a22SJeff Moyer 			sector_t block = page << (PAGE_SHIFT - blkbits);
156d475c634SMatthew Wilcox 			unsigned first = pos - (block << blkbits);
157d475c634SMatthew Wilcox 			long size;
158d475c634SMatthew Wilcox 
159d475c634SMatthew Wilcox 			if (pos == bh_max) {
160d475c634SMatthew Wilcox 				bh->b_size = PAGE_ALIGN(end - pos);
161d475c634SMatthew Wilcox 				bh->b_state = 0;
162b2e0d162SDan Williams 				rc = get_block(inode, block, bh, rw == WRITE);
163b2e0d162SDan Williams 				if (rc)
164d475c634SMatthew Wilcox 					break;
165d475c634SMatthew Wilcox 				bh_max = pos - first + bh->b_size;
166b2e0d162SDan Williams 				bdev = bh->b_bdev;
167069c77bcSJan Kara 				/*
168069c77bcSJan Kara 				 * We allow uninitialized buffers for writes
169069c77bcSJan Kara 				 * beyond EOF as those cannot race with faults
170069c77bcSJan Kara 				 */
171069c77bcSJan Kara 				WARN_ON_ONCE(
172069c77bcSJan Kara 					(buffer_new(bh) && block < file_blks) ||
173069c77bcSJan Kara 					(rw == WRITE && buffer_unwritten(bh)));
174d475c634SMatthew Wilcox 			} else {
175d475c634SMatthew Wilcox 				unsigned done = bh->b_size -
176d475c634SMatthew Wilcox 						(bh_max - (pos - first));
177d475c634SMatthew Wilcox 				bh->b_blocknr += done >> blkbits;
178d475c634SMatthew Wilcox 				bh->b_size -= done;
179d475c634SMatthew Wilcox 			}
180d475c634SMatthew Wilcox 
181b2e0d162SDan Williams 			hole = rw == READ && !buffer_written(bh);
182d475c634SMatthew Wilcox 			if (hole) {
183d475c634SMatthew Wilcox 				size = bh->b_size - first;
184d475c634SMatthew Wilcox 			} else {
185b2e0d162SDan Williams 				dax_unmap_atomic(bdev, &dax);
186b2e0d162SDan Williams 				dax.sector = to_sector(bh, inode);
187b2e0d162SDan Williams 				dax.size = bh->b_size;
188b2e0d162SDan Williams 				map_len = dax_map_atomic(bdev, &dax);
189b2e0d162SDan Williams 				if (map_len < 0) {
190b2e0d162SDan Williams 					rc = map_len;
191d475c634SMatthew Wilcox 					break;
192b2e0d162SDan Williams 				}
193b2e0d162SDan Williams 				dax.addr += first;
194b2e0d162SDan Williams 				size = map_len - first;
195d475c634SMatthew Wilcox 			}
19602395435SEric Sandeen 			/*
19702395435SEric Sandeen 			 * pos + size is one past the last offset for IO,
19802395435SEric Sandeen 			 * so pos + size can overflow loff_t at extreme offsets.
19902395435SEric Sandeen 			 * Cast to u64 to catch this and get the true minimum.
20002395435SEric Sandeen 			 */
20102395435SEric Sandeen 			max = min_t(u64, pos + size, end);
202d475c634SMatthew Wilcox 		}
203d475c634SMatthew Wilcox 
2042765cfbbSRoss Zwisler 		if (iov_iter_rw(iter) == WRITE) {
205b2e0d162SDan Williams 			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
2062765cfbbSRoss Zwisler 		} else if (!hole)
207b2e0d162SDan Williams 			len = copy_to_iter((void __force *) dax.addr, max - pos,
208e2e05394SRoss Zwisler 					iter);
209d475c634SMatthew Wilcox 		else
210d475c634SMatthew Wilcox 			len = iov_iter_zero(max - pos, iter);
211d475c634SMatthew Wilcox 
212cadfbb6eSAl Viro 		if (!len) {
213b2e0d162SDan Williams 			rc = -EFAULT;
214d475c634SMatthew Wilcox 			break;
215cadfbb6eSAl Viro 		}
216d475c634SMatthew Wilcox 
217d475c634SMatthew Wilcox 		pos += len;
218b2e0d162SDan Williams 		if (!IS_ERR(dax.addr))
219b2e0d162SDan Williams 			dax.addr += len;
220d475c634SMatthew Wilcox 	}
221d475c634SMatthew Wilcox 
222b2e0d162SDan Williams 	dax_unmap_atomic(bdev, &dax);
2232765cfbbSRoss Zwisler 
224b2e0d162SDan Williams 	return (pos == start) ? rc : pos - start;
225d475c634SMatthew Wilcox }
226d475c634SMatthew Wilcox 
227d475c634SMatthew Wilcox /**
228d475c634SMatthew Wilcox  * dax_do_io - Perform I/O to a DAX file
229d475c634SMatthew Wilcox  * @iocb: The control block for this I/O
230d475c634SMatthew Wilcox  * @inode: The file which the I/O is directed at
231d475c634SMatthew Wilcox  * @iter: The addresses to do I/O from or to
232d475c634SMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
233d475c634SMatthew Wilcox  * @end_io: A filesystem callback for I/O completion
234d475c634SMatthew Wilcox  * @flags: See below
235d475c634SMatthew Wilcox  *
236d475c634SMatthew Wilcox  * This function uses the same locking scheme as do_blockdev_direct_IO:
237d475c634SMatthew Wilcox  * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
238d475c634SMatthew Wilcox  * caller for writes.  For reads, we take and release the i_mutex ourselves.
239d475c634SMatthew Wilcox  * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
240d475c634SMatthew Wilcox  * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
241d475c634SMatthew Wilcox  * is in progress.
242d475c634SMatthew Wilcox  */
243a95cd631SOmar Sandoval ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
244c8b8e32dSChristoph Hellwig 		  struct iov_iter *iter, get_block_t get_block,
245a95cd631SOmar Sandoval 		  dio_iodone_t end_io, int flags)
246d475c634SMatthew Wilcox {
247d475c634SMatthew Wilcox 	struct buffer_head bh;
248d475c634SMatthew Wilcox 	ssize_t retval = -EINVAL;
249c8b8e32dSChristoph Hellwig 	loff_t pos = iocb->ki_pos;
250d475c634SMatthew Wilcox 	loff_t end = pos + iov_iter_count(iter);
251d475c634SMatthew Wilcox 
252d475c634SMatthew Wilcox 	memset(&bh, 0, sizeof(bh));
253eab95db6SRoss Zwisler 	bh.b_bdev = inode->i_sb->s_bdev;
254d475c634SMatthew Wilcox 
255c3d98e39SJan Kara 	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
2565955102cSAl Viro 		inode_lock(inode);
257d475c634SMatthew Wilcox 
258d475c634SMatthew Wilcox 	/* Protects against truncate */
259bbab37ddSMatthew Wilcox 	if (!(flags & DIO_SKIP_DIO_COUNT))
260fe0f07d0SJens Axboe 		inode_dio_begin(inode);
261d475c634SMatthew Wilcox 
262a95cd631SOmar Sandoval 	retval = dax_io(inode, iter, pos, end, get_block, &bh);
263d475c634SMatthew Wilcox 
264a95cd631SOmar Sandoval 	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
2655955102cSAl Viro 		inode_unlock(inode);
266d475c634SMatthew Wilcox 
267187372a3SChristoph Hellwig 	if (end_io) {
268187372a3SChristoph Hellwig 		int err;
269187372a3SChristoph Hellwig 
270187372a3SChristoph Hellwig 		err = end_io(iocb, pos, retval, bh.b_private);
271187372a3SChristoph Hellwig 		if (err)
272187372a3SChristoph Hellwig 			retval = err;
273187372a3SChristoph Hellwig 	}
274d475c634SMatthew Wilcox 
275bbab37ddSMatthew Wilcox 	if (!(flags & DIO_SKIP_DIO_COUNT))
276fe0f07d0SJens Axboe 		inode_dio_end(inode);
277d475c634SMatthew Wilcox 	return retval;
278d475c634SMatthew Wilcox }
279d475c634SMatthew Wilcox EXPORT_SYMBOL_GPL(dax_do_io);
280f7ca90b1SMatthew Wilcox 
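/*
 * Editorial sketch (not part of the original file): a filesystem typically
 * calls dax_do_io() from its ->direct_IO() address_space operation, passing
 * its own block-mapping callback.  "example_get_block" below is a
 * hypothetical stand-in for that callback:
 *
 *	static ssize_t example_dax_direct_IO(struct kiocb *iocb,
 *			struct iov_iter *iter)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *
 *		return dax_do_io(iocb, inode, iter, example_get_block,
 *				 NULL, DIO_LOCKING);
 *	}
 */
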
281f7ca90b1SMatthew Wilcox /*
282ac401cc7SJan Kara  * DAX radix tree locking
283ac401cc7SJan Kara  */
284ac401cc7SJan Kara struct exceptional_entry_key {
285ac401cc7SJan Kara 	struct address_space *mapping;
28663e95b5cSRoss Zwisler 	pgoff_t entry_start;
287ac401cc7SJan Kara };
288ac401cc7SJan Kara 
289ac401cc7SJan Kara struct wait_exceptional_entry_queue {
290ac401cc7SJan Kara 	wait_queue_t wait;
291ac401cc7SJan Kara 	struct exceptional_entry_key key;
292ac401cc7SJan Kara };
293ac401cc7SJan Kara 
29463e95b5cSRoss Zwisler static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
29563e95b5cSRoss Zwisler 		pgoff_t index, void *entry, struct exceptional_entry_key *key)
29663e95b5cSRoss Zwisler {
29763e95b5cSRoss Zwisler 	unsigned long hash;
29863e95b5cSRoss Zwisler 
29963e95b5cSRoss Zwisler 	/*
30063e95b5cSRoss Zwisler 	 * If 'entry' is a PMD, align the 'index' that we use for the wait
30163e95b5cSRoss Zwisler 	 * queue to the start of that PMD.  This ensures that all offsets in
30263e95b5cSRoss Zwisler 	 * the range covered by the PMD map to the same bit lock.
30363e95b5cSRoss Zwisler 	 */
304642261acSRoss Zwisler 	if (dax_is_pmd_entry(entry))
30563e95b5cSRoss Zwisler 		index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
30663e95b5cSRoss Zwisler 
30763e95b5cSRoss Zwisler 	key->mapping = mapping;
30863e95b5cSRoss Zwisler 	key->entry_start = index;
30963e95b5cSRoss Zwisler 
31063e95b5cSRoss Zwisler 	hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
31163e95b5cSRoss Zwisler 	return wait_table + hash;
31263e95b5cSRoss Zwisler }
31363e95b5cSRoss Zwisler 
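/*
 * Editorial note (not from the original source): with 4K pages and 2MiB PMDs
 * (PMD_SHIFT - PAGE_SHIFT == 9 on x86_64), the masking above clears the low
 * nine bits of 'index', so for a PMD entry the 512 indexes 0x200..0x3ff all
 * share the waitqueue keyed on entry_start 0x200, while PTE entries keep
 * their own index.
 */
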
314ac401cc7SJan Kara static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
315ac401cc7SJan Kara 				       int sync, void *keyp)
316ac401cc7SJan Kara {
317ac401cc7SJan Kara 	struct exceptional_entry_key *key = keyp;
318ac401cc7SJan Kara 	struct wait_exceptional_entry_queue *ewait =
319ac401cc7SJan Kara 		container_of(wait, struct wait_exceptional_entry_queue, wait);
320ac401cc7SJan Kara 
321ac401cc7SJan Kara 	if (key->mapping != ewait->key.mapping ||
32263e95b5cSRoss Zwisler 	    key->entry_start != ewait->key.entry_start)
323ac401cc7SJan Kara 		return 0;
324ac401cc7SJan Kara 	return autoremove_wake_function(wait, mode, sync, NULL);
325ac401cc7SJan Kara }
326ac401cc7SJan Kara 
327ac401cc7SJan Kara /*
328ac401cc7SJan Kara  * Check whether the given slot is locked. The function must be called with
329ac401cc7SJan Kara  * mapping->tree_lock held
330ac401cc7SJan Kara  */
331ac401cc7SJan Kara static inline int slot_locked(struct address_space *mapping, void **slot)
332ac401cc7SJan Kara {
333ac401cc7SJan Kara 	unsigned long entry = (unsigned long)
334ac401cc7SJan Kara 		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
335ac401cc7SJan Kara 	return entry & RADIX_DAX_ENTRY_LOCK;
336ac401cc7SJan Kara }
337ac401cc7SJan Kara 
338ac401cc7SJan Kara /*
339ac401cc7SJan Kara  * Mark the given slot as locked. The function must be called with
340ac401cc7SJan Kara  * mapping->tree_lock held
341ac401cc7SJan Kara  */
342ac401cc7SJan Kara static inline void *lock_slot(struct address_space *mapping, void **slot)
343ac401cc7SJan Kara {
344ac401cc7SJan Kara 	unsigned long entry = (unsigned long)
345ac401cc7SJan Kara 		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
346ac401cc7SJan Kara 
347ac401cc7SJan Kara 	entry |= RADIX_DAX_ENTRY_LOCK;
348ac401cc7SJan Kara 	radix_tree_replace_slot(slot, (void *)entry);
349ac401cc7SJan Kara 	return (void *)entry;
350ac401cc7SJan Kara }
351ac401cc7SJan Kara 
352ac401cc7SJan Kara /*
353ac401cc7SJan Kara  * Mark the given slot as unlocked. The function must be called with
354ac401cc7SJan Kara  * mapping->tree_lock held
355ac401cc7SJan Kara  */
356ac401cc7SJan Kara static inline void *unlock_slot(struct address_space *mapping, void **slot)
357ac401cc7SJan Kara {
358ac401cc7SJan Kara 	unsigned long entry = (unsigned long)
359ac401cc7SJan Kara 		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
360ac401cc7SJan Kara 
361ac401cc7SJan Kara 	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
362ac401cc7SJan Kara 	radix_tree_replace_slot(slot, (void *)entry);
363ac401cc7SJan Kara 	return (void *)entry;
364ac401cc7SJan Kara }
365ac401cc7SJan Kara 
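/*
 * Editorial sketch (not part of the original file): the helpers above encode
 * a per-entry lock in the entry value itself, manipulated only under
 * mapping->tree_lock.  A locking round trip looks roughly like:
 *
 *	spin_lock_irq(&mapping->tree_lock);
 *	... find an unlocked slot, e.g. via get_unlocked_mapping_entry() ...
 *	entry = lock_slot(mapping, slot);
 *	spin_unlock_irq(&mapping->tree_lock);
 *	... work with only the entry lock held ...
 *	dax_unlock_mapping_entry(mapping, index);
 */
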
366ac401cc7SJan Kara /*
367ac401cc7SJan Kara  * Look up the entry in the radix tree, wait for it to become unlocked if it
368ac401cc7SJan Kara  * is an exceptional entry, and return it. The caller must call
369ac401cc7SJan Kara  * put_unlocked_mapping_entry() if it decides not to lock the entry, or
370ac401cc7SJan Kara  * put_locked_mapping_entry() once it has locked the entry and later wants
371ac401cc7SJan Kara  * to unlock it.
372ac401cc7SJan Kara  *
373ac401cc7SJan Kara  * The function must be called with mapping->tree_lock held.
374ac401cc7SJan Kara  */
375ac401cc7SJan Kara static void *get_unlocked_mapping_entry(struct address_space *mapping,
376ac401cc7SJan Kara 					pgoff_t index, void ***slotp)
377ac401cc7SJan Kara {
378e3ad61c6SRoss Zwisler 	void *entry, **slot;
379ac401cc7SJan Kara 	struct wait_exceptional_entry_queue ewait;
38063e95b5cSRoss Zwisler 	wait_queue_head_t *wq;
381ac401cc7SJan Kara 
382ac401cc7SJan Kara 	init_wait(&ewait.wait);
383ac401cc7SJan Kara 	ewait.wait.func = wake_exceptional_entry_func;
384ac401cc7SJan Kara 
385ac401cc7SJan Kara 	for (;;) {
386e3ad61c6SRoss Zwisler 		entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
387ac401cc7SJan Kara 					  &slot);
388e3ad61c6SRoss Zwisler 		if (!entry || !radix_tree_exceptional_entry(entry) ||
389ac401cc7SJan Kara 		    !slot_locked(mapping, slot)) {
390ac401cc7SJan Kara 			if (slotp)
391ac401cc7SJan Kara 				*slotp = slot;
392e3ad61c6SRoss Zwisler 			return entry;
393ac401cc7SJan Kara 		}
39463e95b5cSRoss Zwisler 
39563e95b5cSRoss Zwisler 		wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
396ac401cc7SJan Kara 		prepare_to_wait_exclusive(wq, &ewait.wait,
397ac401cc7SJan Kara 					  TASK_UNINTERRUPTIBLE);
398ac401cc7SJan Kara 		spin_unlock_irq(&mapping->tree_lock);
399ac401cc7SJan Kara 		schedule();
400ac401cc7SJan Kara 		finish_wait(wq, &ewait.wait);
401ac401cc7SJan Kara 		spin_lock_irq(&mapping->tree_lock);
402ac401cc7SJan Kara 	}
403ac401cc7SJan Kara }
404ac401cc7SJan Kara 
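/*
 * Editorial sketch (not part of the original file): callers pair
 * get_unlocked_mapping_entry() with one of the two put helpers below,
 * depending on whether they went on to lock the entry ("want_it" is a
 * hypothetical predicate used only for illustration):
 *
 *	spin_lock_irq(&mapping->tree_lock);
 *	entry = get_unlocked_mapping_entry(mapping, index, &slot);
 *	if (!want_it(entry)) {
 *		put_unlocked_mapping_entry(mapping, index, entry);
 *		spin_unlock_irq(&mapping->tree_lock);
 *	} else {
 *		entry = lock_slot(mapping, slot);
 *		spin_unlock_irq(&mapping->tree_lock);
 *		... use the locked entry ...
 *		put_locked_mapping_entry(mapping, index, entry);
 *	}
 */
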
405422476c4SRoss Zwisler static void put_locked_mapping_entry(struct address_space *mapping,
406422476c4SRoss Zwisler 				     pgoff_t index, void *entry)
407422476c4SRoss Zwisler {
408422476c4SRoss Zwisler 	if (!radix_tree_exceptional_entry(entry)) {
409422476c4SRoss Zwisler 		unlock_page(entry);
410422476c4SRoss Zwisler 		put_page(entry);
411422476c4SRoss Zwisler 	} else {
412422476c4SRoss Zwisler 		dax_unlock_mapping_entry(mapping, index);
413422476c4SRoss Zwisler 	}
414422476c4SRoss Zwisler }
415422476c4SRoss Zwisler 
416422476c4SRoss Zwisler /*
417422476c4SRoss Zwisler  * Called when we are done with radix tree entry we looked up via
418422476c4SRoss Zwisler  * get_unlocked_mapping_entry() and which we didn't lock in the end.
419422476c4SRoss Zwisler  */
420422476c4SRoss Zwisler static void put_unlocked_mapping_entry(struct address_space *mapping,
421422476c4SRoss Zwisler 				       pgoff_t index, void *entry)
422422476c4SRoss Zwisler {
423422476c4SRoss Zwisler 	if (!radix_tree_exceptional_entry(entry))
424422476c4SRoss Zwisler 		return;
425422476c4SRoss Zwisler 
426422476c4SRoss Zwisler 	/* We have to wake up next waiter for the radix tree entry lock */
427422476c4SRoss Zwisler 	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
428422476c4SRoss Zwisler }
429422476c4SRoss Zwisler 
430ac401cc7SJan Kara /*
431ac401cc7SJan Kara  * Find the radix tree entry at the given index. If it points to a page, return
432ac401cc7SJan Kara  * with the page locked. If it points to an exceptional entry, return with the
433ac401cc7SJan Kara  * radix tree entry locked. If the radix tree doesn't contain the given index,
434ac401cc7SJan Kara  * create an empty exceptional entry for the index and return with it locked.
435ac401cc7SJan Kara  *
436642261acSRoss Zwisler  * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
437642261acSRoss Zwisler  * either return that locked entry or will return an error.  This error will
438642261acSRoss Zwisler  * happen if there are any 4k entries (either zero pages or DAX entries)
439642261acSRoss Zwisler  * within the 2MiB range that we are requesting.
440642261acSRoss Zwisler  *
441642261acSRoss Zwisler  * We always favor 4k entries over 2MiB entries. There isn't a flow where we
442642261acSRoss Zwisler  * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
443642261acSRoss Zwisler  * insertion will fail if it finds any 4k entries already in the tree, and a
444642261acSRoss Zwisler  * 4k insertion will cause an existing 2MiB entry to be unmapped and
445642261acSRoss Zwisler  * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
446642261acSRoss Zwisler  * well as 2MiB empty entries.
447642261acSRoss Zwisler  *
448642261acSRoss Zwisler  * The exception to this downgrade path is for 2MiB DAX PMD entries that have
449642261acSRoss Zwisler  * real storage backing them.  We will leave these real 2MiB DAX entries in
450642261acSRoss Zwisler  * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
451642261acSRoss Zwisler  *
452ac401cc7SJan Kara  * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
453ac401cc7SJan Kara  * persistent memory the benefit is doubtful. We can add that later if we can
454ac401cc7SJan Kara  * show it helps.
455ac401cc7SJan Kara  */
456642261acSRoss Zwisler static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
457642261acSRoss Zwisler 		unsigned long size_flag)
458ac401cc7SJan Kara {
459642261acSRoss Zwisler 	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
460e3ad61c6SRoss Zwisler 	void *entry, **slot;
461ac401cc7SJan Kara 
462ac401cc7SJan Kara restart:
463ac401cc7SJan Kara 	spin_lock_irq(&mapping->tree_lock);
464e3ad61c6SRoss Zwisler 	entry = get_unlocked_mapping_entry(mapping, index, &slot);
465642261acSRoss Zwisler 
466642261acSRoss Zwisler 	if (entry) {
467642261acSRoss Zwisler 		if (size_flag & RADIX_DAX_PMD) {
468642261acSRoss Zwisler 			if (!radix_tree_exceptional_entry(entry) ||
469642261acSRoss Zwisler 			    dax_is_pte_entry(entry)) {
470642261acSRoss Zwisler 				put_unlocked_mapping_entry(mapping, index,
471642261acSRoss Zwisler 						entry);
472642261acSRoss Zwisler 				entry = ERR_PTR(-EEXIST);
473642261acSRoss Zwisler 				goto out_unlock;
474642261acSRoss Zwisler 			}
475642261acSRoss Zwisler 		} else { /* trying to grab a PTE entry */
476642261acSRoss Zwisler 			if (radix_tree_exceptional_entry(entry) &&
477642261acSRoss Zwisler 			    dax_is_pmd_entry(entry) &&
478642261acSRoss Zwisler 			    (dax_is_zero_entry(entry) ||
479642261acSRoss Zwisler 			     dax_is_empty_entry(entry))) {
480642261acSRoss Zwisler 				pmd_downgrade = true;
481642261acSRoss Zwisler 			}
482642261acSRoss Zwisler 		}
483642261acSRoss Zwisler 	}
484642261acSRoss Zwisler 
485ac401cc7SJan Kara 	/* No entry for given index? Make sure radix tree is big enough. */
486642261acSRoss Zwisler 	if (!entry || pmd_downgrade) {
487ac401cc7SJan Kara 		int err;
488ac401cc7SJan Kara 
489642261acSRoss Zwisler 		if (pmd_downgrade) {
490642261acSRoss Zwisler 			/*
491642261acSRoss Zwisler 			 * Make sure 'entry' remains valid while we drop
492642261acSRoss Zwisler 			 * mapping->tree_lock.
493642261acSRoss Zwisler 			 */
494642261acSRoss Zwisler 			entry = lock_slot(mapping, slot);
495642261acSRoss Zwisler 		}
496642261acSRoss Zwisler 
497ac401cc7SJan Kara 		spin_unlock_irq(&mapping->tree_lock);
498ac401cc7SJan Kara 		err = radix_tree_preload(
499ac401cc7SJan Kara 				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
500642261acSRoss Zwisler 		if (err) {
501642261acSRoss Zwisler 			if (pmd_downgrade)
502642261acSRoss Zwisler 				put_locked_mapping_entry(mapping, index, entry);
503ac401cc7SJan Kara 			return ERR_PTR(err);
504642261acSRoss Zwisler 		}
505642261acSRoss Zwisler 
506642261acSRoss Zwisler 		/*
507642261acSRoss Zwisler 		 * Besides huge zero pages the only other thing that gets
508642261acSRoss Zwisler 		 * downgraded are empty entries which don't need to be
509642261acSRoss Zwisler 		 * unmapped.
510642261acSRoss Zwisler 		 */
511642261acSRoss Zwisler 		if (pmd_downgrade && dax_is_zero_entry(entry))
512642261acSRoss Zwisler 			unmap_mapping_range(mapping,
513642261acSRoss Zwisler 				(index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
514642261acSRoss Zwisler 
515ac401cc7SJan Kara 		spin_lock_irq(&mapping->tree_lock);
516642261acSRoss Zwisler 
517642261acSRoss Zwisler 		if (pmd_downgrade) {
518642261acSRoss Zwisler 			radix_tree_delete(&mapping->page_tree, index);
519642261acSRoss Zwisler 			mapping->nrexceptional--;
520642261acSRoss Zwisler 			dax_wake_mapping_entry_waiter(mapping, index, entry,
521642261acSRoss Zwisler 					true);
522642261acSRoss Zwisler 		}
523642261acSRoss Zwisler 
524642261acSRoss Zwisler 		entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
525642261acSRoss Zwisler 
526642261acSRoss Zwisler 		err = __radix_tree_insert(&mapping->page_tree, index,
527642261acSRoss Zwisler 				dax_radix_order(entry), entry);
528ac401cc7SJan Kara 		radix_tree_preload_end();
529ac401cc7SJan Kara 		if (err) {
530ac401cc7SJan Kara 			spin_unlock_irq(&mapping->tree_lock);
531642261acSRoss Zwisler 			/*
532642261acSRoss Zwisler 			 * Someone already created the entry?  This is a
533642261acSRoss Zwisler 			 * normal failure when inserting PMDs in a range
534642261acSRoss Zwisler 			 * that already contains PTEs.  In that case we want
535642261acSRoss Zwisler 			 * to return -EEXIST immediately.
536642261acSRoss Zwisler 			 */
537642261acSRoss Zwisler 			if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD))
538ac401cc7SJan Kara 				goto restart;
539642261acSRoss Zwisler 			/*
540642261acSRoss Zwisler 			 * Our insertion of a DAX PMD entry failed, most
541642261acSRoss Zwisler 			 * likely because it collided with a PTE sized entry
542642261acSRoss Zwisler 			 * at a different index in the PMD range.  We haven't
543642261acSRoss Zwisler 			 * inserted anything into the radix tree and have no
544642261acSRoss Zwisler 			 * waiters to wake.
545642261acSRoss Zwisler 			 */
546ac401cc7SJan Kara 			return ERR_PTR(err);
547ac401cc7SJan Kara 		}
548ac401cc7SJan Kara 		/* Good, we have inserted empty locked entry into the tree. */
549ac401cc7SJan Kara 		mapping->nrexceptional++;
550ac401cc7SJan Kara 		spin_unlock_irq(&mapping->tree_lock);
551e3ad61c6SRoss Zwisler 		return entry;
552ac401cc7SJan Kara 	}
553ac401cc7SJan Kara 	/* Normal page in radix tree? */
554e3ad61c6SRoss Zwisler 	if (!radix_tree_exceptional_entry(entry)) {
555e3ad61c6SRoss Zwisler 		struct page *page = entry;
556ac401cc7SJan Kara 
557ac401cc7SJan Kara 		get_page(page);
558ac401cc7SJan Kara 		spin_unlock_irq(&mapping->tree_lock);
559ac401cc7SJan Kara 		lock_page(page);
560ac401cc7SJan Kara 		/* Page got truncated? Retry... */
561ac401cc7SJan Kara 		if (unlikely(page->mapping != mapping)) {
562ac401cc7SJan Kara 			unlock_page(page);
563ac401cc7SJan Kara 			put_page(page);
564ac401cc7SJan Kara 			goto restart;
565ac401cc7SJan Kara 		}
566ac401cc7SJan Kara 		return page;
567ac401cc7SJan Kara 	}
568e3ad61c6SRoss Zwisler 	entry = lock_slot(mapping, slot);
569642261acSRoss Zwisler  out_unlock:
570ac401cc7SJan Kara 	spin_unlock_irq(&mapping->tree_lock);
571e3ad61c6SRoss Zwisler 	return entry;
572ac401cc7SJan Kara }
573ac401cc7SJan Kara 
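/*
 * Editorial sketch (not part of the original file): the fault paths below
 * use grab_mapping_entry() roughly as follows; size_flag is 0 for a
 * PTE-sized entry and RADIX_DAX_PMD for a 2MiB entry, and a PMD request
 * returns -EEXIST if conflicting PTE entries already exist in the range:
 *
 *	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
 *	if (IS_ERR(entry))
 *		return PTR_ERR(entry);
 *	... install the mapping, e.g. via dax_insert_mapping_entry() ...
 *	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 */
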
57463e95b5cSRoss Zwisler /*
57563e95b5cSRoss Zwisler  * We do not necessarily hold the mapping->tree_lock when we call this
57663e95b5cSRoss Zwisler  * function so it is possible that 'entry' is no longer a valid item in the
577642261acSRoss Zwisler  * radix tree.  This is okay because all we really need to do is to find the
578642261acSRoss Zwisler  * correct waitqueue where tasks might be waiting for that old 'entry' and
579642261acSRoss Zwisler  * wake them.
58063e95b5cSRoss Zwisler  */
581ac401cc7SJan Kara void dax_wake_mapping_entry_waiter(struct address_space *mapping,
58263e95b5cSRoss Zwisler 		pgoff_t index, void *entry, bool wake_all)
583ac401cc7SJan Kara {
58463e95b5cSRoss Zwisler 	struct exceptional_entry_key key;
58563e95b5cSRoss Zwisler 	wait_queue_head_t *wq;
58663e95b5cSRoss Zwisler 
58763e95b5cSRoss Zwisler 	wq = dax_entry_waitqueue(mapping, index, entry, &key);
588ac401cc7SJan Kara 
589ac401cc7SJan Kara 	/*
590ac401cc7SJan Kara 	 * Checking for locked entry and prepare_to_wait_exclusive() happens
591ac401cc7SJan Kara 	 * under mapping->tree_lock, ditto for entry handling in our callers.
592ac401cc7SJan Kara 	 * So at this point all tasks that could have seen our entry locked
593ac401cc7SJan Kara 	 * must be in the waitqueue and the following check will see them.
594ac401cc7SJan Kara 	 */
59563e95b5cSRoss Zwisler 	if (waitqueue_active(wq))
596ac401cc7SJan Kara 		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
597ac401cc7SJan Kara }
598ac401cc7SJan Kara 
599bc2466e4SJan Kara void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
600ac401cc7SJan Kara {
601e3ad61c6SRoss Zwisler 	void *entry, **slot;
602ac401cc7SJan Kara 
603ac401cc7SJan Kara 	spin_lock_irq(&mapping->tree_lock);
604e3ad61c6SRoss Zwisler 	entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
605e3ad61c6SRoss Zwisler 	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
606ac401cc7SJan Kara 			 !slot_locked(mapping, slot))) {
607ac401cc7SJan Kara 		spin_unlock_irq(&mapping->tree_lock);
608ac401cc7SJan Kara 		return;
609ac401cc7SJan Kara 	}
610ac401cc7SJan Kara 	unlock_slot(mapping, slot);
611ac401cc7SJan Kara 	spin_unlock_irq(&mapping->tree_lock);
61263e95b5cSRoss Zwisler 	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
613ac401cc7SJan Kara }
614ac401cc7SJan Kara 
615ac401cc7SJan Kara /*
616ac401cc7SJan Kara  * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
617ac401cc7SJan Kara  * entry to get unlocked before deleting it.
618ac401cc7SJan Kara  */
619ac401cc7SJan Kara int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
620ac401cc7SJan Kara {
621ac401cc7SJan Kara 	void *entry;
622ac401cc7SJan Kara 
623ac401cc7SJan Kara 	spin_lock_irq(&mapping->tree_lock);
624ac401cc7SJan Kara 	entry = get_unlocked_mapping_entry(mapping, index, NULL);
625ac401cc7SJan Kara 	/*
626ac401cc7SJan Kara 	 * This gets called from the truncate / punch_hole path. As such, the
627ac401cc7SJan Kara 	 * caller must hold locks protecting against concurrent modifications
628ac401cc7SJan Kara 	 * of the radix tree (usually the fs-private i_mmap_sem held for
629ac401cc7SJan Kara 	 * writing). Since the caller has seen an exceptional entry for this
630ac401cc7SJan Kara 	 * index, we had better find it at that index as well...
631ac401cc7SJan Kara 	 */
632ac401cc7SJan Kara 	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
633ac401cc7SJan Kara 		spin_unlock_irq(&mapping->tree_lock);
634ac401cc7SJan Kara 		return 0;
635ac401cc7SJan Kara 	}
636ac401cc7SJan Kara 	radix_tree_delete(&mapping->page_tree, index);
637ac401cc7SJan Kara 	mapping->nrexceptional--;
638ac401cc7SJan Kara 	spin_unlock_irq(&mapping->tree_lock);
63963e95b5cSRoss Zwisler 	dax_wake_mapping_entry_waiter(mapping, index, entry, true);
640ac401cc7SJan Kara 
641ac401cc7SJan Kara 	return 1;
642ac401cc7SJan Kara }
643ac401cc7SJan Kara 
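/*
 * Editorial sketch (not part of the original file): this is invoked from the
 * generic truncate / invalidate path when a radix tree scan finds an
 * exceptional entry in a DAX mapping, conceptually:
 *
 *	if (dax_mapping(mapping) && radix_tree_exceptional_entry(entry))
 *		dax_delete_mapping_entry(mapping, index);
 */
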
644ac401cc7SJan Kara /*
645f7ca90b1SMatthew Wilcox  * The user has performed a load from a hole in the file.  Allocating
646f7ca90b1SMatthew Wilcox  * a new page in the file would cause excessive storage usage for
647f7ca90b1SMatthew Wilcox  * workloads with sparse files.  We allocate a page cache page instead.
648f7ca90b1SMatthew Wilcox  * We'll kick it out of the page cache if it's ever written to,
649f7ca90b1SMatthew Wilcox  * otherwise it will simply fall out of the page cache under memory
650f7ca90b1SMatthew Wilcox  * pressure without ever having been dirtied.
651f7ca90b1SMatthew Wilcox  */
652ac401cc7SJan Kara static int dax_load_hole(struct address_space *mapping, void *entry,
653f7ca90b1SMatthew Wilcox 			 struct vm_fault *vmf)
654f7ca90b1SMatthew Wilcox {
655ac401cc7SJan Kara 	struct page *page;
656f7ca90b1SMatthew Wilcox 
657ac401cc7SJan Kara 	/* Hole page already exists? Return it...  */
658ac401cc7SJan Kara 	if (!radix_tree_exceptional_entry(entry)) {
659ac401cc7SJan Kara 		vmf->page = entry;
660ac401cc7SJan Kara 		return VM_FAULT_LOCKED;
661ac401cc7SJan Kara 	}
662ac401cc7SJan Kara 
663ac401cc7SJan Kara 	/* This will replace locked radix tree entry with a hole page */
664ac401cc7SJan Kara 	page = find_or_create_page(mapping, vmf->pgoff,
665ac401cc7SJan Kara 				   vmf->gfp_mask | __GFP_ZERO);
666ac401cc7SJan Kara 	if (!page) {
667ac401cc7SJan Kara 		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
668ac401cc7SJan Kara 		return VM_FAULT_OOM;
669ac401cc7SJan Kara 	}
670f7ca90b1SMatthew Wilcox 	vmf->page = page;
671f7ca90b1SMatthew Wilcox 	return VM_FAULT_LOCKED;
672f7ca90b1SMatthew Wilcox }
673f7ca90b1SMatthew Wilcox 
674b0d5e82fSChristoph Hellwig static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
675b0d5e82fSChristoph Hellwig 		struct page *to, unsigned long vaddr)
676f7ca90b1SMatthew Wilcox {
677b2e0d162SDan Williams 	struct blk_dax_ctl dax = {
678b0d5e82fSChristoph Hellwig 		.sector = sector,
679b0d5e82fSChristoph Hellwig 		.size = size,
680b2e0d162SDan Williams 	};
681e2e05394SRoss Zwisler 	void *vto;
682e2e05394SRoss Zwisler 
683b2e0d162SDan Williams 	if (dax_map_atomic(bdev, &dax) < 0)
684b2e0d162SDan Williams 		return PTR_ERR(dax.addr);
685f7ca90b1SMatthew Wilcox 	vto = kmap_atomic(to);
686b2e0d162SDan Williams 	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
687f7ca90b1SMatthew Wilcox 	kunmap_atomic(vto);
688b2e0d162SDan Williams 	dax_unmap_atomic(bdev, &dax);
689f7ca90b1SMatthew Wilcox 	return 0;
690f7ca90b1SMatthew Wilcox }
691f7ca90b1SMatthew Wilcox 
692642261acSRoss Zwisler /*
693642261acSRoss Zwisler  * By this point grab_mapping_entry() has ensured that we have a locked entry
694642261acSRoss Zwisler  * of the appropriate size so we don't have to worry about downgrading PMDs to
695642261acSRoss Zwisler  * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
696642261acSRoss Zwisler  * already in the tree, we will skip the insertion and just dirty the PMD as
697642261acSRoss Zwisler  * appropriate.
698642261acSRoss Zwisler  */
699ac401cc7SJan Kara static void *dax_insert_mapping_entry(struct address_space *mapping,
700ac401cc7SJan Kara 				      struct vm_fault *vmf,
701642261acSRoss Zwisler 				      void *entry, sector_t sector,
702642261acSRoss Zwisler 				      unsigned long flags)
7039973c98eSRoss Zwisler {
7049973c98eSRoss Zwisler 	struct radix_tree_root *page_tree = &mapping->page_tree;
705ac401cc7SJan Kara 	int error = 0;
706ac401cc7SJan Kara 	bool hole_fill = false;
707ac401cc7SJan Kara 	void *new_entry;
708ac401cc7SJan Kara 	pgoff_t index = vmf->pgoff;
7099973c98eSRoss Zwisler 
710ac401cc7SJan Kara 	if (vmf->flags & FAULT_FLAG_WRITE)
7119973c98eSRoss Zwisler 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
7129973c98eSRoss Zwisler 
713ac401cc7SJan Kara 	/* Replacing hole page with block mapping? */
714ac401cc7SJan Kara 	if (!radix_tree_exceptional_entry(entry)) {
715ac401cc7SJan Kara 		hole_fill = true;
7169973c98eSRoss Zwisler 		/*
717ac401cc7SJan Kara 		 * Unmap the page now before we remove it from page cache below.
718ac401cc7SJan Kara 		 * The page is locked so it cannot be faulted in again.
7199973c98eSRoss Zwisler 		 */
720ac401cc7SJan Kara 		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
721ac401cc7SJan Kara 				    PAGE_SIZE, 0);
722ac401cc7SJan Kara 		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
7239973c98eSRoss Zwisler 		if (error)
724ac401cc7SJan Kara 			return ERR_PTR(error);
725642261acSRoss Zwisler 	} else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
726642261acSRoss Zwisler 		/* replacing huge zero page with PMD block mapping */
727642261acSRoss Zwisler 		unmap_mapping_range(mapping,
728642261acSRoss Zwisler 			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
729ac401cc7SJan Kara 	}
7309973c98eSRoss Zwisler 
731ac401cc7SJan Kara 	spin_lock_irq(&mapping->tree_lock);
732642261acSRoss Zwisler 	new_entry = dax_radix_locked_entry(sector, flags);
733642261acSRoss Zwisler 
734ac401cc7SJan Kara 	if (hole_fill) {
735ac401cc7SJan Kara 		__delete_from_page_cache(entry, NULL);
736ac401cc7SJan Kara 		/* Drop pagecache reference */
737ac401cc7SJan Kara 		put_page(entry);
738642261acSRoss Zwisler 		error = __radix_tree_insert(page_tree, index,
739642261acSRoss Zwisler 				dax_radix_order(new_entry), new_entry);
740ac401cc7SJan Kara 		if (error) {
741ac401cc7SJan Kara 			new_entry = ERR_PTR(error);
742ac401cc7SJan Kara 			goto unlock;
743ac401cc7SJan Kara 		}
7449973c98eSRoss Zwisler 		mapping->nrexceptional++;
745642261acSRoss Zwisler 	} else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
746642261acSRoss Zwisler 		/*
747642261acSRoss Zwisler 		 * Only swap our new entry into the radix tree if the current
748642261acSRoss Zwisler 		 * entry is a zero page or an empty entry.  If a normal PTE or
749642261acSRoss Zwisler 		 * PMD entry is already in the tree, we leave it alone.  This
750642261acSRoss Zwisler 		 * means that if we are trying to insert a PTE and the
751642261acSRoss Zwisler 		 * existing entry is a PMD, we will just leave the PMD in the
752642261acSRoss Zwisler 		 * tree and dirty it if necessary.
753642261acSRoss Zwisler 		 */
754ac401cc7SJan Kara 		void **slot;
755ac401cc7SJan Kara 		void *ret;
756ac401cc7SJan Kara 
757ac401cc7SJan Kara 		ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
758ac401cc7SJan Kara 		WARN_ON_ONCE(ret != entry);
759ac401cc7SJan Kara 		radix_tree_replace_slot(slot, new_entry);
760ac401cc7SJan Kara 	}
761ac401cc7SJan Kara 	if (vmf->flags & FAULT_FLAG_WRITE)
7629973c98eSRoss Zwisler 		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
7639973c98eSRoss Zwisler  unlock:
7649973c98eSRoss Zwisler 	spin_unlock_irq(&mapping->tree_lock);
765ac401cc7SJan Kara 	if (hole_fill) {
766ac401cc7SJan Kara 		radix_tree_preload_end();
767ac401cc7SJan Kara 		/*
768ac401cc7SJan Kara 		 * We don't need the hole page anymore; it has been replaced
769ac401cc7SJan Kara 		 * with a locked radix tree entry now.
770ac401cc7SJan Kara 		 */
771ac401cc7SJan Kara 		if (mapping->a_ops->freepage)
772ac401cc7SJan Kara 			mapping->a_ops->freepage(entry);
773ac401cc7SJan Kara 		unlock_page(entry);
774ac401cc7SJan Kara 		put_page(entry);
775ac401cc7SJan Kara 	}
776ac401cc7SJan Kara 	return new_entry;
7779973c98eSRoss Zwisler }
7789973c98eSRoss Zwisler 
7799973c98eSRoss Zwisler static int dax_writeback_one(struct block_device *bdev,
7809973c98eSRoss Zwisler 		struct address_space *mapping, pgoff_t index, void *entry)
7819973c98eSRoss Zwisler {
7829973c98eSRoss Zwisler 	struct radix_tree_root *page_tree = &mapping->page_tree;
7839973c98eSRoss Zwisler 	struct radix_tree_node *node;
7849973c98eSRoss Zwisler 	struct blk_dax_ctl dax;
7859973c98eSRoss Zwisler 	void **slot;
7869973c98eSRoss Zwisler 	int ret = 0;
7879973c98eSRoss Zwisler 
7889973c98eSRoss Zwisler 	spin_lock_irq(&mapping->tree_lock);
7899973c98eSRoss Zwisler 	/*
7909973c98eSRoss Zwisler 	 * Regular page slots are stabilized by the page lock even
7919973c98eSRoss Zwisler 	 * without the tree itself locked.  These unlocked entries
7929973c98eSRoss Zwisler 	 * need verification under the tree lock.
7939973c98eSRoss Zwisler 	 */
7949973c98eSRoss Zwisler 	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
7959973c98eSRoss Zwisler 		goto unlock;
7969973c98eSRoss Zwisler 	if (*slot != entry)
7979973c98eSRoss Zwisler 		goto unlock;
7989973c98eSRoss Zwisler 
7999973c98eSRoss Zwisler 	/* another fsync thread may have already written back this entry */
8009973c98eSRoss Zwisler 	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
8019973c98eSRoss Zwisler 		goto unlock;
8029973c98eSRoss Zwisler 
803642261acSRoss Zwisler 	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
804642261acSRoss Zwisler 				dax_is_zero_entry(entry))) {
8059973c98eSRoss Zwisler 		ret = -EIO;
8069973c98eSRoss Zwisler 		goto unlock;
8079973c98eSRoss Zwisler 	}
8089973c98eSRoss Zwisler 
809642261acSRoss Zwisler 	/*
810642261acSRoss Zwisler 	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
811642261acSRoss Zwisler 	 * in the middle of a PMD, the 'index' we are given will be aligned to
812642261acSRoss Zwisler 	 * the start index of the PMD, as will the sector we pull from
813642261acSRoss Zwisler 	 * 'entry'.  This allows us to flush for PMD_SIZE and not have to
814642261acSRoss Zwisler 	 * worry about partial PMD writebacks.
815642261acSRoss Zwisler 	 */
816642261acSRoss Zwisler 	dax.sector = dax_radix_sector(entry);
817642261acSRoss Zwisler 	dax.size = PAGE_SIZE << dax_radix_order(entry);
8189973c98eSRoss Zwisler 	spin_unlock_irq(&mapping->tree_lock);
8199973c98eSRoss Zwisler 
8209973c98eSRoss Zwisler 	/*
8219973c98eSRoss Zwisler 	 * We cannot hold tree_lock while calling dax_map_atomic() because it
8229973c98eSRoss Zwisler 	 * eventually calls cond_resched().
8239973c98eSRoss Zwisler 	 */
8249973c98eSRoss Zwisler 	ret = dax_map_atomic(bdev, &dax);
8259973c98eSRoss Zwisler 	if (ret < 0)
8269973c98eSRoss Zwisler 		return ret;
8279973c98eSRoss Zwisler 
8289973c98eSRoss Zwisler 	if (WARN_ON_ONCE(ret < dax.size)) {
8299973c98eSRoss Zwisler 		ret = -EIO;
8309973c98eSRoss Zwisler 		goto unmap;
8319973c98eSRoss Zwisler 	}
8329973c98eSRoss Zwisler 
8339973c98eSRoss Zwisler 	wb_cache_pmem(dax.addr, dax.size);
8349973c98eSRoss Zwisler 
8359973c98eSRoss Zwisler 	spin_lock_irq(&mapping->tree_lock);
8369973c98eSRoss Zwisler 	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
8379973c98eSRoss Zwisler 	spin_unlock_irq(&mapping->tree_lock);
8389973c98eSRoss Zwisler  unmap:
8399973c98eSRoss Zwisler 	dax_unmap_atomic(bdev, &dax);
8409973c98eSRoss Zwisler 	return ret;
8419973c98eSRoss Zwisler 
8429973c98eSRoss Zwisler  unlock:
8439973c98eSRoss Zwisler 	spin_unlock_irq(&mapping->tree_lock);
8449973c98eSRoss Zwisler 	return ret;
8459973c98eSRoss Zwisler }
8469973c98eSRoss Zwisler 
8479973c98eSRoss Zwisler /*
8489973c98eSRoss Zwisler  * Flush the mapping to the persistent domain within the byte range of [start,
8499973c98eSRoss Zwisler  * end]. This is required by data integrity operations to ensure file data is
8509973c98eSRoss Zwisler  * on persistent storage prior to completion of the operation.
8519973c98eSRoss Zwisler  */
8527f6d5b52SRoss Zwisler int dax_writeback_mapping_range(struct address_space *mapping,
8537f6d5b52SRoss Zwisler 		struct block_device *bdev, struct writeback_control *wbc)
8549973c98eSRoss Zwisler {
8559973c98eSRoss Zwisler 	struct inode *inode = mapping->host;
856642261acSRoss Zwisler 	pgoff_t start_index, end_index;
8579973c98eSRoss Zwisler 	pgoff_t indices[PAGEVEC_SIZE];
8589973c98eSRoss Zwisler 	struct pagevec pvec;
8599973c98eSRoss Zwisler 	bool done = false;
8609973c98eSRoss Zwisler 	int i, ret = 0;
8619973c98eSRoss Zwisler 
8629973c98eSRoss Zwisler 	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
8639973c98eSRoss Zwisler 		return -EIO;
8649973c98eSRoss Zwisler 
8657f6d5b52SRoss Zwisler 	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
8667f6d5b52SRoss Zwisler 		return 0;
8677f6d5b52SRoss Zwisler 
86809cbfeafSKirill A. Shutemov 	start_index = wbc->range_start >> PAGE_SHIFT;
86909cbfeafSKirill A. Shutemov 	end_index = wbc->range_end >> PAGE_SHIFT;
8709973c98eSRoss Zwisler 
8719973c98eSRoss Zwisler 	tag_pages_for_writeback(mapping, start_index, end_index);
8729973c98eSRoss Zwisler 
8739973c98eSRoss Zwisler 	pagevec_init(&pvec, 0);
8749973c98eSRoss Zwisler 	while (!done) {
8759973c98eSRoss Zwisler 		pvec.nr = find_get_entries_tag(mapping, start_index,
8769973c98eSRoss Zwisler 				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
8779973c98eSRoss Zwisler 				pvec.pages, indices);
8789973c98eSRoss Zwisler 
8799973c98eSRoss Zwisler 		if (pvec.nr == 0)
8809973c98eSRoss Zwisler 			break;
8819973c98eSRoss Zwisler 
8829973c98eSRoss Zwisler 		for (i = 0; i < pvec.nr; i++) {
8839973c98eSRoss Zwisler 			if (indices[i] > end_index) {
8849973c98eSRoss Zwisler 				done = true;
8859973c98eSRoss Zwisler 				break;
8869973c98eSRoss Zwisler 			}
8879973c98eSRoss Zwisler 
8889973c98eSRoss Zwisler 			ret = dax_writeback_one(bdev, mapping, indices[i],
8899973c98eSRoss Zwisler 					pvec.pages[i]);
8909973c98eSRoss Zwisler 			if (ret < 0)
8919973c98eSRoss Zwisler 				return ret;
8929973c98eSRoss Zwisler 		}
8939973c98eSRoss Zwisler 	}
8949973c98eSRoss Zwisler 	return 0;
8959973c98eSRoss Zwisler }
8969973c98eSRoss Zwisler EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
8979973c98eSRoss Zwisler 
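/*
 * Editorial sketch (not part of the original file): filesystems call
 * dax_writeback_mapping_range() from their ->writepages() method when the
 * mapping is DAX backed, roughly ("example_dax_writepages" is hypothetical):
 *
 *	static int example_dax_writepages(struct address_space *mapping,
 *			struct writeback_control *wbc)
 *	{
 *		return dax_writeback_mapping_range(mapping,
 *				mapping->host->i_sb->s_bdev, wbc);
 *	}
 */
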
898ac401cc7SJan Kara static int dax_insert_mapping(struct address_space *mapping,
8991aaba095SChristoph Hellwig 		struct block_device *bdev, sector_t sector, size_t size,
9001aaba095SChristoph Hellwig 		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
901f7ca90b1SMatthew Wilcox {
902f7ca90b1SMatthew Wilcox 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
903b2e0d162SDan Williams 	struct blk_dax_ctl dax = {
9041aaba095SChristoph Hellwig 		.sector = sector,
9051aaba095SChristoph Hellwig 		.size = size,
906b2e0d162SDan Williams 	};
907ac401cc7SJan Kara 	void *ret;
908ac401cc7SJan Kara 	void *entry = *entryp;
909f7ca90b1SMatthew Wilcox 
9104d9a2c87SJan Kara 	if (dax_map_atomic(bdev, &dax) < 0)
9114d9a2c87SJan Kara 		return PTR_ERR(dax.addr);
912b2e0d162SDan Williams 	dax_unmap_atomic(bdev, &dax);
913f7ca90b1SMatthew Wilcox 
914642261acSRoss Zwisler 	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
9154d9a2c87SJan Kara 	if (IS_ERR(ret))
9164d9a2c87SJan Kara 		return PTR_ERR(ret);
917ac401cc7SJan Kara 	*entryp = ret;
9189973c98eSRoss Zwisler 
9194d9a2c87SJan Kara 	return vm_insert_mixed(vma, vaddr, dax.pfn);
920f7ca90b1SMatthew Wilcox }
921f7ca90b1SMatthew Wilcox 
922ce5c5d55SDave Chinner /**
9236b524995SRoss Zwisler  * dax_fault - handle a page fault on a DAX file
924ce5c5d55SDave Chinner  * @vma: The virtual memory area where the fault occurred
925ce5c5d55SDave Chinner  * @vmf: The description of the fault
926ce5c5d55SDave Chinner  * @get_block: The filesystem method used to translate file offsets to blocks
927ce5c5d55SDave Chinner  *
928ce5c5d55SDave Chinner  * When a page fault occurs, filesystems may call this helper in their
9296b524995SRoss Zwisler  * fault handler for DAX files. dax_fault() assumes the caller has done all
930ce5c5d55SDave Chinner  * the necessary locking for the page fault to proceed successfully.
931ce5c5d55SDave Chinner  */
9326b524995SRoss Zwisler int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
93302fbd139SJan Kara 			get_block_t get_block)
934f7ca90b1SMatthew Wilcox {
935f7ca90b1SMatthew Wilcox 	struct file *file = vma->vm_file;
936f7ca90b1SMatthew Wilcox 	struct address_space *mapping = file->f_mapping;
937f7ca90b1SMatthew Wilcox 	struct inode *inode = mapping->host;
938ac401cc7SJan Kara 	void *entry;
939f7ca90b1SMatthew Wilcox 	struct buffer_head bh;
940f7ca90b1SMatthew Wilcox 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
941f7ca90b1SMatthew Wilcox 	unsigned blkbits = inode->i_blkbits;
942f7ca90b1SMatthew Wilcox 	sector_t block;
943f7ca90b1SMatthew Wilcox 	pgoff_t size;
944f7ca90b1SMatthew Wilcox 	int error;
945f7ca90b1SMatthew Wilcox 	int major = 0;
946f7ca90b1SMatthew Wilcox 
947ac401cc7SJan Kara 	/*
948ac401cc7SJan Kara 	 * Check whether the offset isn't beyond the end of the file now. The
949ac401cc7SJan Kara 	 * caller is supposed to hold locks serializing us with truncate /
950ac401cc7SJan Kara 	 * punch hole, so this is a reliable test.
951ac401cc7SJan Kara 	 */
952f7ca90b1SMatthew Wilcox 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
953f7ca90b1SMatthew Wilcox 	if (vmf->pgoff >= size)
954f7ca90b1SMatthew Wilcox 		return VM_FAULT_SIGBUS;
955f7ca90b1SMatthew Wilcox 
956f7ca90b1SMatthew Wilcox 	memset(&bh, 0, sizeof(bh));
957f7ca90b1SMatthew Wilcox 	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
958eab95db6SRoss Zwisler 	bh.b_bdev = inode->i_sb->s_bdev;
959f7ca90b1SMatthew Wilcox 	bh.b_size = PAGE_SIZE;
960f7ca90b1SMatthew Wilcox 
961642261acSRoss Zwisler 	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
962ac401cc7SJan Kara 	if (IS_ERR(entry)) {
963ac401cc7SJan Kara 		error = PTR_ERR(entry);
964ac401cc7SJan Kara 		goto out;
965f7ca90b1SMatthew Wilcox 	}
966f7ca90b1SMatthew Wilcox 
967f7ca90b1SMatthew Wilcox 	error = get_block(inode, block, &bh, 0);
968f7ca90b1SMatthew Wilcox 	if (!error && (bh.b_size < PAGE_SIZE))
969f7ca90b1SMatthew Wilcox 		error = -EIO;		/* fs corruption? */
970f7ca90b1SMatthew Wilcox 	if (error)
971ac401cc7SJan Kara 		goto unlock_entry;
972f7ca90b1SMatthew Wilcox 
973f7ca90b1SMatthew Wilcox 	if (vmf->cow_page) {
974f7ca90b1SMatthew Wilcox 		struct page *new_page = vmf->cow_page;
975f7ca90b1SMatthew Wilcox 		if (buffer_written(&bh))
976b0d5e82fSChristoph Hellwig 			error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
977b0d5e82fSChristoph Hellwig 					bh.b_size, new_page, vaddr);
978f7ca90b1SMatthew Wilcox 		else
979f7ca90b1SMatthew Wilcox 			clear_user_highpage(new_page, vaddr);
980f7ca90b1SMatthew Wilcox 		if (error)
981ac401cc7SJan Kara 			goto unlock_entry;
982ac401cc7SJan Kara 		if (!radix_tree_exceptional_entry(entry)) {
983ac401cc7SJan Kara 			vmf->page = entry;
984f7ca90b1SMatthew Wilcox 			return VM_FAULT_LOCKED;
985f7ca90b1SMatthew Wilcox 		}
986bc2466e4SJan Kara 		vmf->entry = entry;
987bc2466e4SJan Kara 		return VM_FAULT_DAX_LOCKED;
988bc2466e4SJan Kara 	}
989f7ca90b1SMatthew Wilcox 
990ac401cc7SJan Kara 	if (!buffer_mapped(&bh)) {
991ac401cc7SJan Kara 		if (vmf->flags & FAULT_FLAG_WRITE) {
992ac401cc7SJan Kara 			error = get_block(inode, block, &bh, 1);
993ac401cc7SJan Kara 			count_vm_event(PGMAJFAULT);
994ac401cc7SJan Kara 			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
995ac401cc7SJan Kara 			major = VM_FAULT_MAJOR;
996ac401cc7SJan Kara 			if (!error && (bh.b_size < PAGE_SIZE))
997ac401cc7SJan Kara 				error = -EIO;
998ac401cc7SJan Kara 			if (error)
999ac401cc7SJan Kara 				goto unlock_entry;
1000ac401cc7SJan Kara 		} else {
1001ac401cc7SJan Kara 			return dax_load_hole(mapping, entry, vmf);
1002ac401cc7SJan Kara 		}
1003f7ca90b1SMatthew Wilcox 	}
1004f7ca90b1SMatthew Wilcox 
100502fbd139SJan Kara 	/* Filesystem should not return unwritten buffers to us! */
10062b10945cSJan Kara 	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
10071aaba095SChristoph Hellwig 	error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
10081aaba095SChristoph Hellwig 			bh.b_size, &entry, vma, vmf);
1009ac401cc7SJan Kara  unlock_entry:
1010ac401cc7SJan Kara 	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
1011f7ca90b1SMatthew Wilcox  out:
1012f7ca90b1SMatthew Wilcox 	if (error == -ENOMEM)
1013f7ca90b1SMatthew Wilcox 		return VM_FAULT_OOM | major;
1014f7ca90b1SMatthew Wilcox 	/* -EBUSY is fine, somebody else faulted on the same PTE */
1015f7ca90b1SMatthew Wilcox 	if ((error < 0) && (error != -EBUSY))
1016f7ca90b1SMatthew Wilcox 		return VM_FAULT_SIGBUS | major;
1017f7ca90b1SMatthew Wilcox 	return VM_FAULT_NOPAGE | major;
1018f7ca90b1SMatthew Wilcox }
1019f7ca90b1SMatthew Wilcox EXPORT_SYMBOL_GPL(dax_fault);
10204c0ccfefSMatthew Wilcox 
10214c0ccfefSMatthew Wilcox /**
10220e3b210cSBoaz Harrosh  * dax_pfn_mkwrite - handle first write to DAX page
10230e3b210cSBoaz Harrosh  * @vma: The virtual memory area where the fault occurred
10240e3b210cSBoaz Harrosh  * @vmf: The description of the fault
10250e3b210cSBoaz Harrosh  */
10260e3b210cSBoaz Harrosh int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
10270e3b210cSBoaz Harrosh {
10289973c98eSRoss Zwisler 	struct file *file = vma->vm_file;
1029ac401cc7SJan Kara 	struct address_space *mapping = file->f_mapping;
1030ac401cc7SJan Kara 	void *entry;
1031ac401cc7SJan Kara 	pgoff_t index = vmf->pgoff;
10320e3b210cSBoaz Harrosh 
1033ac401cc7SJan Kara 	spin_lock_irq(&mapping->tree_lock);
1034ac401cc7SJan Kara 	entry = get_unlocked_mapping_entry(mapping, index, NULL);
1035ac401cc7SJan Kara 	if (!entry || !radix_tree_exceptional_entry(entry))
1036ac401cc7SJan Kara 		goto out;
1037ac401cc7SJan Kara 	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
1038ac401cc7SJan Kara 	put_unlocked_mapping_entry(mapping, index, entry);
1039ac401cc7SJan Kara out:
1040ac401cc7SJan Kara 	spin_unlock_irq(&mapping->tree_lock);
10410e3b210cSBoaz Harrosh 	return VM_FAULT_NOPAGE;
10420e3b210cSBoaz Harrosh }
10430e3b210cSBoaz Harrosh EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
10440e3b210cSBoaz Harrosh 
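/*
 * Editorial sketch (not part of the original file): a filesystem wires
 * dax_fault() and dax_pfn_mkwrite() into its vm_operations_struct, usually
 * through small wrappers that take whatever fs-private lock orders faults
 * against truncate.  "example_get_block" and the wrapper are hypothetical:
 *
 *	static int example_dax_fault(struct vm_area_struct *vma,
 *			struct vm_fault *vmf)
 *	{
 *		return dax_fault(vma, vmf, example_get_block);
 *	}
 *
 *	static const struct vm_operations_struct example_dax_vm_ops = {
 *		.fault		= example_dax_fault,
 *		.page_mkwrite	= example_dax_fault,
 *		.pfn_mkwrite	= dax_pfn_mkwrite,
 *	};
 */
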
10454b0228faSVishal Verma static bool dax_range_is_aligned(struct block_device *bdev,
10464b0228faSVishal Verma 				 unsigned int offset, unsigned int length)
10474b0228faSVishal Verma {
10484b0228faSVishal Verma 	unsigned short sector_size = bdev_logical_block_size(bdev);
10494b0228faSVishal Verma 
10504b0228faSVishal Verma 	if (!IS_ALIGNED(offset, sector_size))
10514b0228faSVishal Verma 		return false;
10524b0228faSVishal Verma 	if (!IS_ALIGNED(length, sector_size))
10534b0228faSVishal Verma 		return false;
10544b0228faSVishal Verma 
10554b0228faSVishal Verma 	return true;
10564b0228faSVishal Verma }
10574b0228faSVishal Verma 
1058679c8bd3SChristoph Hellwig int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
1059679c8bd3SChristoph Hellwig 		unsigned int offset, unsigned int length)
1060679c8bd3SChristoph Hellwig {
1061679c8bd3SChristoph Hellwig 	struct blk_dax_ctl dax = {
1062679c8bd3SChristoph Hellwig 		.sector		= sector,
1063679c8bd3SChristoph Hellwig 		.size		= PAGE_SIZE,
1064679c8bd3SChristoph Hellwig 	};
1065679c8bd3SChristoph Hellwig 
10664b0228faSVishal Verma 	if (dax_range_is_aligned(bdev, offset, length)) {
10674b0228faSVishal Verma 		sector_t start_sector = dax.sector + (offset >> 9);
10684b0228faSVishal Verma 
10694b0228faSVishal Verma 		return blkdev_issue_zeroout(bdev, start_sector,
10704b0228faSVishal Verma 				length >> 9, GFP_NOFS, true);
10714b0228faSVishal Verma 	} else {
1072679c8bd3SChristoph Hellwig 		if (dax_map_atomic(bdev, &dax) < 0)
1073679c8bd3SChristoph Hellwig 			return PTR_ERR(dax.addr);
1074679c8bd3SChristoph Hellwig 		clear_pmem(dax.addr + offset, length);
1075679c8bd3SChristoph Hellwig 		dax_unmap_atomic(bdev, &dax);
10764b0228faSVishal Verma 	}
1077679c8bd3SChristoph Hellwig 	return 0;
1078679c8bd3SChristoph Hellwig }
1079679c8bd3SChristoph Hellwig EXPORT_SYMBOL_GPL(__dax_zero_page_range);
1080679c8bd3SChristoph Hellwig 
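/*
 * Illustrative note: with a 512-byte logical block size, a request such as
 * offset = 1024, length = 512 is sector-aligned and is handed to
 * blkdev_issue_zeroout() (one sector, starting two sectors into the page),
 * while offset = 100, length = 50 is not aligned and is zeroed through the
 * direct-access mapping with clear_pmem() instead.
 */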
10810e3b210cSBoaz Harrosh /**
108225726bc1SMatthew Wilcox  * dax_zero_page_range - zero a range within a page of a DAX file
10834c0ccfefSMatthew Wilcox  * @inode: The file in which a range is being zeroed
10844c0ccfefSMatthew Wilcox  * @from: The file offset at which zeroing starts
108525726bc1SMatthew Wilcox  * @length: The number of bytes to zero
10864c0ccfefSMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
10874c0ccfefSMatthew Wilcox  *
108825726bc1SMatthew Wilcox  * This function can be called by a filesystem when it is zeroing part of a
108925726bc1SMatthew Wilcox  * page in a DAX file.  This is intended for hole-punch operations.  If
109025726bc1SMatthew Wilcox  * you are truncating a file, the helper function dax_truncate_page() may be
109125726bc1SMatthew Wilcox  * more convenient.
10924c0ccfefSMatthew Wilcox  */
109325726bc1SMatthew Wilcox int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
109425726bc1SMatthew Wilcox 							get_block_t get_block)
10954c0ccfefSMatthew Wilcox {
10964c0ccfefSMatthew Wilcox 	struct buffer_head bh;
109709cbfeafSKirill A. Shutemov 	pgoff_t index = from >> PAGE_SHIFT;
109809cbfeafSKirill A. Shutemov 	unsigned offset = from & (PAGE_SIZE-1);
10994c0ccfefSMatthew Wilcox 	int err;
11004c0ccfefSMatthew Wilcox 
11014c0ccfefSMatthew Wilcox 	/* Block boundary? Nothing to do */
11024c0ccfefSMatthew Wilcox 	if (!length)
11034c0ccfefSMatthew Wilcox 		return 0;
1104aada54f9SRoss Zwisler 	if (WARN_ON_ONCE((offset + length) > PAGE_SIZE))
1105aada54f9SRoss Zwisler 		return -EINVAL;
11064c0ccfefSMatthew Wilcox 
11074c0ccfefSMatthew Wilcox 	memset(&bh, 0, sizeof(bh));
1108eab95db6SRoss Zwisler 	bh.b_bdev = inode->i_sb->s_bdev;
110909cbfeafSKirill A. Shutemov 	bh.b_size = PAGE_SIZE;
11104c0ccfefSMatthew Wilcox 	err = get_block(inode, index, &bh, 0);
1111679c8bd3SChristoph Hellwig 	if (err < 0 || !buffer_written(&bh))
11124c0ccfefSMatthew Wilcox 		return err;
1113b2e0d162SDan Williams 
1114679c8bd3SChristoph Hellwig 	return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
1115679c8bd3SChristoph Hellwig 			offset, length);
11164c0ccfefSMatthew Wilcox }
111725726bc1SMatthew Wilcox EXPORT_SYMBOL_GPL(dax_zero_page_range);
111825726bc1SMatthew Wilcox 
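/*
 * Illustrative sketch, not part of fs/dax.c: a hole-punch path would zero
 * the partial pages at either end of the hole before freeing blocks, e.g.
 * (assuming start and end fall in different pages; example_get_block stands
 * in for the filesystem's real get_block_t callback):
 *
 *	unsigned partial_start = start & (PAGE_SIZE - 1);
 *	unsigned partial_end = end & (PAGE_SIZE - 1);
 *	int err = 0;
 *
 *	if (partial_start)
 *		err = dax_zero_page_range(inode, start,
 *				PAGE_SIZE - partial_start, example_get_block);
 *	if (!err && partial_end)
 *		err = dax_zero_page_range(inode, end - partial_end,
 *				partial_end, example_get_block);
 */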
111925726bc1SMatthew Wilcox /**
112025726bc1SMatthew Wilcox  * dax_truncate_page - handle a partial page being truncated in a DAX file
112125726bc1SMatthew Wilcox  * @inode: The file being truncated
112225726bc1SMatthew Wilcox  * @from: The file offset that is being truncated to
112325726bc1SMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
112425726bc1SMatthew Wilcox  *
112525726bc1SMatthew Wilcox  * Similar to block_truncate_page(), this function can be called by a
112625726bc1SMatthew Wilcox  * filesystem when it is truncating a DAX file to handle the partial page.
112725726bc1SMatthew Wilcox  */
112825726bc1SMatthew Wilcox int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
112925726bc1SMatthew Wilcox {
113009cbfeafSKirill A. Shutemov 	unsigned length = PAGE_ALIGN(from) - from;
113125726bc1SMatthew Wilcox 	return dax_zero_page_range(inode, from, length, get_block);
113225726bc1SMatthew Wilcox }
11334c0ccfefSMatthew Wilcox EXPORT_SYMBOL_GPL(dax_truncate_page);
1134a254e568SChristoph Hellwig 
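/*
 * Illustrative sketch, not part of fs/dax.c: a truncate path typically zeroes
 * the new partial tail page before shrinking i_size, picking the DAX helper
 * when the inode is DAX-mapped (modeled on ext2; example_get_block stands in
 * for the filesystem's get_block_t callback):
 *
 *	if (IS_DAX(inode))
 *		error = dax_truncate_page(inode, newsize, example_get_block);
 *	else
 *		error = block_truncate_page(inode->i_mapping, newsize,
 *					    example_get_block);
 */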
1135a254e568SChristoph Hellwig #ifdef CONFIG_FS_IOMAP
1136333ccc97SRoss Zwisler static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
1137333ccc97SRoss Zwisler {
1138333ccc97SRoss Zwisler 	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
1139333ccc97SRoss Zwisler }
1140333ccc97SRoss Zwisler 
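/*
 * Illustrative note: iomap->blkno is expressed in 512-byte sectors, so for an
 * extent with iomap->offset == 0 and iomap->blkno == 8, a fault at
 * pos == 12288 (the fourth page) resolves to sector 8 + (12288 >> 9) == 32.
 */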
1141a254e568SChristoph Hellwig static loff_t
114211c59c92SRoss Zwisler dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1143a254e568SChristoph Hellwig 		struct iomap *iomap)
1144a254e568SChristoph Hellwig {
1145a254e568SChristoph Hellwig 	struct iov_iter *iter = data;
1146a254e568SChristoph Hellwig 	loff_t end = pos + length, done = 0;
1147a254e568SChristoph Hellwig 	ssize_t ret = 0;
1148a254e568SChristoph Hellwig 
1149a254e568SChristoph Hellwig 	if (iov_iter_rw(iter) == READ) {
1150a254e568SChristoph Hellwig 		end = min(end, i_size_read(inode));
1151a254e568SChristoph Hellwig 		if (pos >= end)
1152a254e568SChristoph Hellwig 			return 0;
1153a254e568SChristoph Hellwig 
1154a254e568SChristoph Hellwig 		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
1155a254e568SChristoph Hellwig 			return iov_iter_zero(min(length, end - pos), iter);
1156a254e568SChristoph Hellwig 	}
1157a254e568SChristoph Hellwig 
1158a254e568SChristoph Hellwig 	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
1159a254e568SChristoph Hellwig 		return -EIO;
1160a254e568SChristoph Hellwig 
1161a254e568SChristoph Hellwig 	while (pos < end) {
1162a254e568SChristoph Hellwig 		unsigned offset = pos & (PAGE_SIZE - 1);
1163a254e568SChristoph Hellwig 		struct blk_dax_ctl dax = { 0 };
1164a254e568SChristoph Hellwig 		ssize_t map_len;
1165a254e568SChristoph Hellwig 
1166333ccc97SRoss Zwisler 		dax.sector = dax_iomap_sector(iomap, pos);
1167a254e568SChristoph Hellwig 		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
1168a254e568SChristoph Hellwig 		map_len = dax_map_atomic(iomap->bdev, &dax);
1169a254e568SChristoph Hellwig 		if (map_len < 0) {
1170a254e568SChristoph Hellwig 			ret = map_len;
1171a254e568SChristoph Hellwig 			break;
1172a254e568SChristoph Hellwig 		}
1173a254e568SChristoph Hellwig 
1174a254e568SChristoph Hellwig 		dax.addr += offset;
1175a254e568SChristoph Hellwig 		map_len -= offset;
1176a254e568SChristoph Hellwig 		if (map_len > end - pos)
1177a254e568SChristoph Hellwig 			map_len = end - pos;
1178a254e568SChristoph Hellwig 
1179a254e568SChristoph Hellwig 		if (iov_iter_rw(iter) == WRITE)
1180a254e568SChristoph Hellwig 			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
1181a254e568SChristoph Hellwig 		else
1182a254e568SChristoph Hellwig 			map_len = copy_to_iter(dax.addr, map_len, iter);
1183a254e568SChristoph Hellwig 		dax_unmap_atomic(iomap->bdev, &dax);
1184a254e568SChristoph Hellwig 		if (map_len <= 0) {
1185a254e568SChristoph Hellwig 			ret = map_len ? map_len : -EFAULT;
1186a254e568SChristoph Hellwig 			break;
1187a254e568SChristoph Hellwig 		}
1188a254e568SChristoph Hellwig 
1189a254e568SChristoph Hellwig 		pos += map_len;
1190a254e568SChristoph Hellwig 		length -= map_len;
1191a254e568SChristoph Hellwig 		done += map_len;
1192a254e568SChristoph Hellwig 	}
1193a254e568SChristoph Hellwig 
1194a254e568SChristoph Hellwig 	return done ? done : ret;
1195a254e568SChristoph Hellwig }
1196a254e568SChristoph Hellwig 
1197a254e568SChristoph Hellwig /**
119811c59c92SRoss Zwisler  * dax_iomap_rw - Perform I/O to a DAX file
1199a254e568SChristoph Hellwig  * @iocb:	The control block for this I/O
1200a254e568SChristoph Hellwig  * @iter:	The addresses to do I/O from or to
1201a254e568SChristoph Hellwig  * @ops:	iomap ops passed from the file system
1202a254e568SChristoph Hellwig  *
1203a254e568SChristoph Hellwig  * This function performs read and write operations to directly mapped
1204a254e568SChristoph Hellwig  * persistent memory.  The caller needs to take care of read/write exclusion
1205a254e568SChristoph Hellwig  * and evicting any page cache pages in the region under I/O.
1206a254e568SChristoph Hellwig  */
1207a254e568SChristoph Hellwig ssize_t
120811c59c92SRoss Zwisler dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
1209a254e568SChristoph Hellwig 		struct iomap_ops *ops)
1210a254e568SChristoph Hellwig {
1211a254e568SChristoph Hellwig 	struct address_space *mapping = iocb->ki_filp->f_mapping;
1212a254e568SChristoph Hellwig 	struct inode *inode = mapping->host;
1213a254e568SChristoph Hellwig 	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
1214a254e568SChristoph Hellwig 	unsigned flags = 0;
1215a254e568SChristoph Hellwig 
1216a254e568SChristoph Hellwig 	if (iov_iter_rw(iter) == WRITE)
1217a254e568SChristoph Hellwig 		flags |= IOMAP_WRITE;
1218a254e568SChristoph Hellwig 
1219a254e568SChristoph Hellwig 	/*
1220a254e568SChristoph Hellwig 	 * Yes, even DAX files can have page cache attached to them:  A zeroed
1221a254e568SChristoph Hellwig 	 * page is inserted into the pagecache when we have to serve a write
1222a254e568SChristoph Hellwig 	 * fault on a hole.  It should never be dirtied and can simply be
1223a254e568SChristoph Hellwig 	 * dropped from the pagecache once we get real data for the page.
1224a254e568SChristoph Hellwig 	 *
1225a254e568SChristoph Hellwig 	 * XXX: This is racy against mmap, and there's nothing we can do about
1226a254e568SChristoph Hellwig 	 * it. We'll eventually need to shift this down even further so that
1227a254e568SChristoph Hellwig 	 * we can check if we allocated blocks over a hole first.
1228a254e568SChristoph Hellwig 	 */
1229a254e568SChristoph Hellwig 	if (mapping->nrpages) {
1230a254e568SChristoph Hellwig 		ret = invalidate_inode_pages2_range(mapping,
1231a254e568SChristoph Hellwig 				pos >> PAGE_SHIFT,
1232a254e568SChristoph Hellwig 				(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
1233a254e568SChristoph Hellwig 		WARN_ON_ONCE(ret);
1234a254e568SChristoph Hellwig 	}
1235a254e568SChristoph Hellwig 
1236a254e568SChristoph Hellwig 	while (iov_iter_count(iter)) {
1237a254e568SChristoph Hellwig 		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
123811c59c92SRoss Zwisler 				iter, dax_iomap_actor);
1239a254e568SChristoph Hellwig 		if (ret <= 0)
1240a254e568SChristoph Hellwig 			break;
1241a254e568SChristoph Hellwig 		pos += ret;
1242a254e568SChristoph Hellwig 		done += ret;
1243a254e568SChristoph Hellwig 	}
1244a254e568SChristoph Hellwig 
1245a254e568SChristoph Hellwig 	iocb->ki_pos += done;
1246a254e568SChristoph Hellwig 	return done ? done : ret;
1247a254e568SChristoph Hellwig }
124811c59c92SRoss Zwisler EXPORT_SYMBOL_GPL(dax_iomap_rw);
1249a7d73fe6SChristoph Hellwig 
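/*
 * Illustrative sketch, not part of fs/dax.c: a filesystem ->read_iter for DAX
 * files can be little more than a locked call into dax_iomap_rw(), mirroring
 * existing iomap-based callers.  example_iomap_ops is an assumed,
 * filesystem-provided struct iomap_ops; it is only declared here.
 */
extern struct iomap_ops example_iomap_ops;

static ssize_t example_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (!iov_iter_count(to))
		return 0;	/* skip atime update for zero-length reads */

	inode_lock_shared(inode);	/* read/write exclusion is the caller's job */
	ret = dax_iomap_rw(iocb, to, &example_iomap_ops);
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}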
1250a7d73fe6SChristoph Hellwig /**
125111c59c92SRoss Zwisler  * dax_iomap_fault - handle a page fault on a DAX file
1252a7d73fe6SChristoph Hellwig  * @vma: The virtual memory area where the fault occurred
1253a7d73fe6SChristoph Hellwig  * @vmf: The description of the fault
1254a7d73fe6SChristoph Hellwig  * @ops: iomap ops passed from the file system
1255a7d73fe6SChristoph Hellwig  *
1256a7d73fe6SChristoph Hellwig  * When a page fault occurs, filesystems may call this helper in their fault
1257a7d73fe6SChristoph Hellwig  * or mkwrite handler for DAX files. Assumes the caller has done all the
1258a7d73fe6SChristoph Hellwig  * necessary locking for the page fault to proceed successfully.
1259a7d73fe6SChristoph Hellwig  */
126011c59c92SRoss Zwisler int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1261a7d73fe6SChristoph Hellwig 			struct iomap_ops *ops)
1262a7d73fe6SChristoph Hellwig {
1263a7d73fe6SChristoph Hellwig 	struct address_space *mapping = vma->vm_file->f_mapping;
1264a7d73fe6SChristoph Hellwig 	struct inode *inode = mapping->host;
1265a7d73fe6SChristoph Hellwig 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
1266a7d73fe6SChristoph Hellwig 	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
1267a7d73fe6SChristoph Hellwig 	sector_t sector;
1268a7d73fe6SChristoph Hellwig 	struct iomap iomap = { 0 };
1269a7d73fe6SChristoph Hellwig 	unsigned flags = 0;
1270a7d73fe6SChristoph Hellwig 	int error, major = 0;
12711550290bSRoss Zwisler 	int locked_status = 0;
1272a7d73fe6SChristoph Hellwig 	void *entry;
1273a7d73fe6SChristoph Hellwig 
1274a7d73fe6SChristoph Hellwig 	/*
1275a7d73fe6SChristoph Hellwig 	 * Check that the offset isn't beyond the end of the file now.  The
1276a7d73fe6SChristoph Hellwig 	 * caller is supposed to hold locks serializing us with truncate /
1277a7d73fe6SChristoph Hellwig 	 * punch hole, so this is a reliable test.
1278a7d73fe6SChristoph Hellwig 	 */
1279a7d73fe6SChristoph Hellwig 	if (pos >= i_size_read(inode))
1280a7d73fe6SChristoph Hellwig 		return VM_FAULT_SIGBUS;
1281a7d73fe6SChristoph Hellwig 
1282642261acSRoss Zwisler 	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
1283a7d73fe6SChristoph Hellwig 	if (IS_ERR(entry)) {
1284a7d73fe6SChristoph Hellwig 		error = PTR_ERR(entry);
1285a7d73fe6SChristoph Hellwig 		goto out;
1286a7d73fe6SChristoph Hellwig 	}
1287a7d73fe6SChristoph Hellwig 
1288a7d73fe6SChristoph Hellwig 	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
1289a7d73fe6SChristoph Hellwig 		flags |= IOMAP_WRITE;
1290a7d73fe6SChristoph Hellwig 
1291a7d73fe6SChristoph Hellwig 	/*
1292a7d73fe6SChristoph Hellwig 	 * Note that we don't bother to use iomap_apply here: DAX requires
1293a7d73fe6SChristoph Hellwig 	 * the file system block size to equal the page size, which means
1294a7d73fe6SChristoph Hellwig 	 * that we never have to deal with more than a single extent here.
1295a7d73fe6SChristoph Hellwig 	 */
1296a7d73fe6SChristoph Hellwig 	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
1297a7d73fe6SChristoph Hellwig 	if (error)
1298a7d73fe6SChristoph Hellwig 		goto unlock_entry;
1299a7d73fe6SChristoph Hellwig 	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
1300a7d73fe6SChristoph Hellwig 		error = -EIO;		/* fs corruption? */
13011550290bSRoss Zwisler 		goto finish_iomap;
1302a7d73fe6SChristoph Hellwig 	}
1303a7d73fe6SChristoph Hellwig 
1304333ccc97SRoss Zwisler 	sector = dax_iomap_sector(&iomap, pos);
1305a7d73fe6SChristoph Hellwig 
1306a7d73fe6SChristoph Hellwig 	if (vmf->cow_page) {
1307a7d73fe6SChristoph Hellwig 		switch (iomap.type) {
1308a7d73fe6SChristoph Hellwig 		case IOMAP_HOLE:
1309a7d73fe6SChristoph Hellwig 		case IOMAP_UNWRITTEN:
1310a7d73fe6SChristoph Hellwig 			clear_user_highpage(vmf->cow_page, vaddr);
1311a7d73fe6SChristoph Hellwig 			break;
1312a7d73fe6SChristoph Hellwig 		case IOMAP_MAPPED:
1313a7d73fe6SChristoph Hellwig 			error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
1314a7d73fe6SChristoph Hellwig 					vmf->cow_page, vaddr);
1315a7d73fe6SChristoph Hellwig 			break;
1316a7d73fe6SChristoph Hellwig 		default:
1317a7d73fe6SChristoph Hellwig 			WARN_ON_ONCE(1);
1318a7d73fe6SChristoph Hellwig 			error = -EIO;
1319a7d73fe6SChristoph Hellwig 			break;
1320a7d73fe6SChristoph Hellwig 		}
1321a7d73fe6SChristoph Hellwig 
1322a7d73fe6SChristoph Hellwig 		if (error)
13231550290bSRoss Zwisler 			goto finish_iomap;
1324a7d73fe6SChristoph Hellwig 		if (!radix_tree_exceptional_entry(entry)) {
1325a7d73fe6SChristoph Hellwig 			vmf->page = entry;
13261550290bSRoss Zwisler 			locked_status = VM_FAULT_LOCKED;
13271550290bSRoss Zwisler 		} else {
1328a7d73fe6SChristoph Hellwig 			vmf->entry = entry;
13291550290bSRoss Zwisler 			locked_status = VM_FAULT_DAX_LOCKED;
13301550290bSRoss Zwisler 		}
13311550290bSRoss Zwisler 		goto finish_iomap;
1332a7d73fe6SChristoph Hellwig 	}
1333a7d73fe6SChristoph Hellwig 
1334a7d73fe6SChristoph Hellwig 	switch (iomap.type) {
1335a7d73fe6SChristoph Hellwig 	case IOMAP_MAPPED:
1336a7d73fe6SChristoph Hellwig 		if (iomap.flags & IOMAP_F_NEW) {
1337a7d73fe6SChristoph Hellwig 			count_vm_event(PGMAJFAULT);
1338a7d73fe6SChristoph Hellwig 			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1339a7d73fe6SChristoph Hellwig 			major = VM_FAULT_MAJOR;
1340a7d73fe6SChristoph Hellwig 		}
1341a7d73fe6SChristoph Hellwig 		error = dax_insert_mapping(mapping, iomap.bdev, sector,
1342a7d73fe6SChristoph Hellwig 				PAGE_SIZE, &entry, vma, vmf);
1343a7d73fe6SChristoph Hellwig 		break;
1344a7d73fe6SChristoph Hellwig 	case IOMAP_UNWRITTEN:
1345a7d73fe6SChristoph Hellwig 	case IOMAP_HOLE:
13461550290bSRoss Zwisler 		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
13471550290bSRoss Zwisler 			locked_status = dax_load_hole(mapping, entry, vmf);
13481550290bSRoss Zwisler 			break;
13491550290bSRoss Zwisler 		}
1350a7d73fe6SChristoph Hellwig 		/*FALLTHRU*/
1351a7d73fe6SChristoph Hellwig 	default:
1352a7d73fe6SChristoph Hellwig 		WARN_ON_ONCE(1);
1353a7d73fe6SChristoph Hellwig 		error = -EIO;
1354a7d73fe6SChristoph Hellwig 		break;
1355a7d73fe6SChristoph Hellwig 	}
1356a7d73fe6SChristoph Hellwig 
13571550290bSRoss Zwisler  finish_iomap:
13581550290bSRoss Zwisler 	if (ops->iomap_end) {
13591550290bSRoss Zwisler 		if (error) {
13601550290bSRoss Zwisler 			/* keep previous error */
13611550290bSRoss Zwisler 			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
13621550290bSRoss Zwisler 					&iomap);
13631550290bSRoss Zwisler 		} else {
13641550290bSRoss Zwisler 			error = ops->iomap_end(inode, pos, PAGE_SIZE,
13651550290bSRoss Zwisler 					PAGE_SIZE, flags, &iomap);
13661550290bSRoss Zwisler 		}
13671550290bSRoss Zwisler 	}
1368a7d73fe6SChristoph Hellwig  unlock_entry:
13691550290bSRoss Zwisler 	if (!locked_status || error)
1370a7d73fe6SChristoph Hellwig 		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
1371a7d73fe6SChristoph Hellwig  out:
1372a7d73fe6SChristoph Hellwig 	if (error == -ENOMEM)
1373a7d73fe6SChristoph Hellwig 		return VM_FAULT_OOM | major;
1374a7d73fe6SChristoph Hellwig 	/* -EBUSY is fine, somebody else faulted on the same PTE */
1375a7d73fe6SChristoph Hellwig 	if (error < 0 && error != -EBUSY)
1376a7d73fe6SChristoph Hellwig 		return VM_FAULT_SIGBUS | major;
13771550290bSRoss Zwisler 	if (locked_status) {
13781550290bSRoss Zwisler 		WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
13791550290bSRoss Zwisler 		return locked_status;
13801550290bSRoss Zwisler 	}
1381a7d73fe6SChristoph Hellwig 	return VM_FAULT_NOPAGE | major;
1382a7d73fe6SChristoph Hellwig }
138311c59c92SRoss Zwisler EXPORT_SYMBOL_GPL(dax_iomap_fault);
1384642261acSRoss Zwisler 
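/*
 * Illustrative sketch, not part of fs/dax.c: a filesystem ->fault (and
 * ->page_mkwrite) handler built on dax_iomap_fault(), following the pattern
 * of existing callers.  example_iomap_ops is the assumed filesystem iomap_ops
 * declared for the read_iter sketch above; any fs-private fault-vs-truncate
 * locking would also go around the call.
 */
static int example_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vma->vm_file);
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	int ret;

	if (write) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vma->vm_file);
	}

	ret = dax_iomap_fault(vma, vmf, &example_iomap_ops);

	if (write)
		sb_end_pagefault(inode->i_sb);
	return ret;
}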
1385642261acSRoss Zwisler #ifdef CONFIG_FS_DAX_PMD
1386642261acSRoss Zwisler /*
1387642261acSRoss Zwisler  * The 'colour' (i.e. the low bits) of a page offset within a PMD.  This comes
1388642261acSRoss Zwisler  * up more often than one might expect in the functions below.
1389642261acSRoss Zwisler  */
1390642261acSRoss Zwisler #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
1391642261acSRoss Zwisler 
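/*
 * Illustrative note: with 4k pages and 2M PMDs this is 0x1ff, so a page
 * offset can be mapped by a PMD entry only when (pgoff & PG_PMD_COLOUR) == 0
 * and the backing pfn has the same alignment within the PMD.
 */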
1392642261acSRoss Zwisler static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
1393642261acSRoss Zwisler 		struct vm_fault *vmf, unsigned long address,
1394642261acSRoss Zwisler 		struct iomap *iomap, loff_t pos, bool write, void **entryp)
1395642261acSRoss Zwisler {
1396642261acSRoss Zwisler 	struct address_space *mapping = vma->vm_file->f_mapping;
1397642261acSRoss Zwisler 	struct block_device *bdev = iomap->bdev;
1398642261acSRoss Zwisler 	struct blk_dax_ctl dax = {
1399642261acSRoss Zwisler 		.sector = dax_iomap_sector(iomap, pos),
1400642261acSRoss Zwisler 		.size = PMD_SIZE,
1401642261acSRoss Zwisler 	};
1402642261acSRoss Zwisler 	long length = dax_map_atomic(bdev, &dax);
1403642261acSRoss Zwisler 	void *ret;
1404642261acSRoss Zwisler 
1405642261acSRoss Zwisler 	if (length < 0) /* dax_map_atomic() failed */
1406642261acSRoss Zwisler 		return VM_FAULT_FALLBACK;
1407642261acSRoss Zwisler 	if (length < PMD_SIZE)
1408642261acSRoss Zwisler 		goto unmap_fallback;
1409642261acSRoss Zwisler 	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
1410642261acSRoss Zwisler 		goto unmap_fallback;
1411642261acSRoss Zwisler 	if (!pfn_t_devmap(dax.pfn))
1412642261acSRoss Zwisler 		goto unmap_fallback;
1413642261acSRoss Zwisler 
1414642261acSRoss Zwisler 	dax_unmap_atomic(bdev, &dax);
1415642261acSRoss Zwisler 
1416642261acSRoss Zwisler 	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
1417642261acSRoss Zwisler 			RADIX_DAX_PMD);
1418642261acSRoss Zwisler 	if (IS_ERR(ret))
1419642261acSRoss Zwisler 		return VM_FAULT_FALLBACK;
1420642261acSRoss Zwisler 	*entryp = ret;
1421642261acSRoss Zwisler 
1422642261acSRoss Zwisler 	return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
1423642261acSRoss Zwisler 
1424642261acSRoss Zwisler  unmap_fallback:
1425642261acSRoss Zwisler 	dax_unmap_atomic(bdev, &dax);
1426642261acSRoss Zwisler 	return VM_FAULT_FALLBACK;
1427642261acSRoss Zwisler }
1428642261acSRoss Zwisler 
1429642261acSRoss Zwisler static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
1430642261acSRoss Zwisler 		struct vm_fault *vmf, unsigned long address,
1431642261acSRoss Zwisler 		struct iomap *iomap, void **entryp)
1432642261acSRoss Zwisler {
1433642261acSRoss Zwisler 	struct address_space *mapping = vma->vm_file->f_mapping;
1434642261acSRoss Zwisler 	unsigned long pmd_addr = address & PMD_MASK;
1435642261acSRoss Zwisler 	struct page *zero_page;
1436642261acSRoss Zwisler 	spinlock_t *ptl;
1437642261acSRoss Zwisler 	pmd_t pmd_entry;
1438642261acSRoss Zwisler 	void *ret;
1439642261acSRoss Zwisler 
1440642261acSRoss Zwisler 	zero_page = mm_get_huge_zero_page(vma->vm_mm);
1441642261acSRoss Zwisler 
1442642261acSRoss Zwisler 	if (unlikely(!zero_page))
1443642261acSRoss Zwisler 		return VM_FAULT_FALLBACK;
1444642261acSRoss Zwisler 
1445642261acSRoss Zwisler 	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
1446642261acSRoss Zwisler 			RADIX_DAX_PMD | RADIX_DAX_HZP);
1447642261acSRoss Zwisler 	if (IS_ERR(ret))
1448642261acSRoss Zwisler 		return VM_FAULT_FALLBACK;
1449642261acSRoss Zwisler 	*entryp = ret;
1450642261acSRoss Zwisler 
1451642261acSRoss Zwisler 	ptl = pmd_lock(vma->vm_mm, pmd);
1452642261acSRoss Zwisler 	if (!pmd_none(*pmd)) {
1453642261acSRoss Zwisler 		spin_unlock(ptl);
1454642261acSRoss Zwisler 		return VM_FAULT_FALLBACK;
1455642261acSRoss Zwisler 	}
1456642261acSRoss Zwisler 
1457642261acSRoss Zwisler 	pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
1458642261acSRoss Zwisler 	pmd_entry = pmd_mkhuge(pmd_entry);
1459642261acSRoss Zwisler 	set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
1460642261acSRoss Zwisler 	spin_unlock(ptl);
1461642261acSRoss Zwisler 	return VM_FAULT_NOPAGE;
1462642261acSRoss Zwisler }
1463642261acSRoss Zwisler 
1464642261acSRoss Zwisler int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
1465642261acSRoss Zwisler 		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
1466642261acSRoss Zwisler {
1467642261acSRoss Zwisler 	struct address_space *mapping = vma->vm_file->f_mapping;
1468642261acSRoss Zwisler 	unsigned long pmd_addr = address & PMD_MASK;
1469642261acSRoss Zwisler 	bool write = flags & FAULT_FLAG_WRITE;
1470642261acSRoss Zwisler 	unsigned int iomap_flags = write ? IOMAP_WRITE : 0;
1471642261acSRoss Zwisler 	struct inode *inode = mapping->host;
1472642261acSRoss Zwisler 	int result = VM_FAULT_FALLBACK;
1473642261acSRoss Zwisler 	struct iomap iomap = { 0 };
1474642261acSRoss Zwisler 	pgoff_t max_pgoff, pgoff;
1475642261acSRoss Zwisler 	struct vm_fault vmf;
1476642261acSRoss Zwisler 	void *entry;
1477642261acSRoss Zwisler 	loff_t pos;
1478642261acSRoss Zwisler 	int error;
1479642261acSRoss Zwisler 
1480642261acSRoss Zwisler 	/* Fall back to PTEs if we're going to COW */
1481642261acSRoss Zwisler 	if (write && !(vma->vm_flags & VM_SHARED))
1482642261acSRoss Zwisler 		goto fallback;
1483642261acSRoss Zwisler 
1484642261acSRoss Zwisler 	/* If the PMD would extend outside the VMA */
1485642261acSRoss Zwisler 	if (pmd_addr < vma->vm_start)
1486642261acSRoss Zwisler 		goto fallback;
1487642261acSRoss Zwisler 	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
1488642261acSRoss Zwisler 		goto fallback;
1489642261acSRoss Zwisler 
1490642261acSRoss Zwisler 	/*
1491642261acSRoss Zwisler 	 * Check that the offset isn't beyond the end of the file now.  The
1492642261acSRoss Zwisler 	 * caller is supposed to hold locks serializing us with truncate /
1493642261acSRoss Zwisler 	 * punch hole, so this is a reliable test.
1494642261acSRoss Zwisler 	 */
1495642261acSRoss Zwisler 	pgoff = linear_page_index(vma, pmd_addr);
1496642261acSRoss Zwisler 	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
1497642261acSRoss Zwisler 
1498642261acSRoss Zwisler 	if (pgoff > max_pgoff)
1499642261acSRoss Zwisler 		return VM_FAULT_SIGBUS;
1500642261acSRoss Zwisler 
1501642261acSRoss Zwisler 	/* If the PMD would extend beyond the file size */
1502642261acSRoss Zwisler 	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
1503642261acSRoss Zwisler 		goto fallback;
1504642261acSRoss Zwisler 
1505642261acSRoss Zwisler 	/*
1506642261acSRoss Zwisler 	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
1507642261acSRoss Zwisler 	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
1508642261acSRoss Zwisler 	 * the tree, for instance), it will return -EEXIST and we just fall
1509642261acSRoss Zwisler 	 * back to 4k entries.
1510642261acSRoss Zwisler 	 */
1511642261acSRoss Zwisler 	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
1512642261acSRoss Zwisler 	if (IS_ERR(entry))
1513642261acSRoss Zwisler 		goto fallback;
1514642261acSRoss Zwisler 
1515642261acSRoss Zwisler 	/*
1516642261acSRoss Zwisler 	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
1517642261acSRoss Zwisler 	 * setting up a mapping, so really we're using iomap_begin() as a way
1518642261acSRoss Zwisler 	 * to look up our filesystem block.
1519642261acSRoss Zwisler 	 */
1520642261acSRoss Zwisler 	pos = (loff_t)pgoff << PAGE_SHIFT;
1521642261acSRoss Zwisler 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
1522642261acSRoss Zwisler 	if (error)
1523642261acSRoss Zwisler 		goto unlock_entry;
1524642261acSRoss Zwisler 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
1525642261acSRoss Zwisler 		goto finish_iomap;
1526642261acSRoss Zwisler 
1527642261acSRoss Zwisler 	vmf.pgoff = pgoff;
1528642261acSRoss Zwisler 	vmf.flags = flags;
1529642261acSRoss Zwisler 	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
1530642261acSRoss Zwisler 
1531642261acSRoss Zwisler 	switch (iomap.type) {
1532642261acSRoss Zwisler 	case IOMAP_MAPPED:
1533642261acSRoss Zwisler 		result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
1534642261acSRoss Zwisler 				&iomap, pos, write, &entry);
1535642261acSRoss Zwisler 		break;
1536642261acSRoss Zwisler 	case IOMAP_UNWRITTEN:
1537642261acSRoss Zwisler 	case IOMAP_HOLE:
1538642261acSRoss Zwisler 		if (WARN_ON_ONCE(write))
1539642261acSRoss Zwisler 			goto finish_iomap;
1540642261acSRoss Zwisler 		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
1541642261acSRoss Zwisler 				&entry);
1542642261acSRoss Zwisler 		break;
1543642261acSRoss Zwisler 	default:
1544642261acSRoss Zwisler 		WARN_ON_ONCE(1);
1545642261acSRoss Zwisler 		break;
1546642261acSRoss Zwisler 	}
1547642261acSRoss Zwisler 
1548642261acSRoss Zwisler  finish_iomap:
1549642261acSRoss Zwisler 	if (ops->iomap_end) {
1550642261acSRoss Zwisler 		if (result == VM_FAULT_FALLBACK) {
1551642261acSRoss Zwisler 			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
1552642261acSRoss Zwisler 					&iomap);
1553642261acSRoss Zwisler 		} else {
1554642261acSRoss Zwisler 			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
1555642261acSRoss Zwisler 					iomap_flags, &iomap);
1556642261acSRoss Zwisler 			if (error)
1557642261acSRoss Zwisler 				result = VM_FAULT_FALLBACK;
1558642261acSRoss Zwisler 		}
1559642261acSRoss Zwisler 	}
1560642261acSRoss Zwisler  unlock_entry:
1561642261acSRoss Zwisler 	put_locked_mapping_entry(mapping, pgoff, entry);
1562642261acSRoss Zwisler  fallback:
1563642261acSRoss Zwisler 	if (result == VM_FAULT_FALLBACK) {
1564642261acSRoss Zwisler 		split_huge_pmd(vma, pmd, address);
1565642261acSRoss Zwisler 		count_vm_event(THP_FAULT_FALLBACK);
1566642261acSRoss Zwisler 	}
1567642261acSRoss Zwisler 	return result;
1568642261acSRoss Zwisler }
1569642261acSRoss Zwisler EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
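/*
 * Illustrative sketch, not part of fs/dax.c: a PMD fault handler built on
 * dax_iomap_pmd_fault(), plus the vm_operations_struct wiring that ties the
 * helpers in this file together.  example_dax_fault(),
 * example_dax_pfn_mkwrite() and example_iomap_ops refer to the hypothetical
 * sketches above.
 */
static int example_dax_pmd_fault(struct vm_area_struct *vma,
		unsigned long addr, pmd_t *pmd, unsigned int flags)
{
	struct inode *inode = file_inode(vma->vm_file);
	bool write = flags & FAULT_FLAG_WRITE;
	int ret;

	if (write) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vma->vm_file);
	}

	ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &example_iomap_ops);

	if (write)
		sb_end_pagefault(inode->i_sb);
	return ret;
}

static const struct vm_operations_struct example_dax_vm_ops = {
	.fault		= example_dax_fault,
	.pmd_fault	= example_dax_pmd_fault,
	.page_mkwrite	= example_dax_fault,
	.pfn_mkwrite	= example_dax_pfn_mkwrite,
};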
1570642261acSRoss Zwisler #endif /* CONFIG_FS_DAX_PMD */
1571a254e568SChristoph Hellwig #endif /* CONFIG_FS_IOMAP */
1572