xref: /openbmc/linux/fs/dax.c (revision d8a849e1)
1d475c634SMatthew Wilcox /*
2d475c634SMatthew Wilcox  * fs/dax.c - Direct Access filesystem code
3d475c634SMatthew Wilcox  * Copyright (c) 2013-2014 Intel Corporation
4d475c634SMatthew Wilcox  * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
5d475c634SMatthew Wilcox  * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
6d475c634SMatthew Wilcox  *
7d475c634SMatthew Wilcox  * This program is free software; you can redistribute it and/or modify it
8d475c634SMatthew Wilcox  * under the terms and conditions of the GNU General Public License,
9d475c634SMatthew Wilcox  * version 2, as published by the Free Software Foundation.
10d475c634SMatthew Wilcox  *
11d475c634SMatthew Wilcox  * This program is distributed in the hope it will be useful, but WITHOUT
12d475c634SMatthew Wilcox  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13d475c634SMatthew Wilcox  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
14d475c634SMatthew Wilcox  * more details.
15d475c634SMatthew Wilcox  */
16d475c634SMatthew Wilcox 
17d475c634SMatthew Wilcox #include <linux/atomic.h>
18d475c634SMatthew Wilcox #include <linux/blkdev.h>
19d475c634SMatthew Wilcox #include <linux/buffer_head.h>
20d77e92e2SRoss Zwisler #include <linux/dax.h>
21d475c634SMatthew Wilcox #include <linux/fs.h>
22d475c634SMatthew Wilcox #include <linux/genhd.h>
23f7ca90b1SMatthew Wilcox #include <linux/highmem.h>
24f7ca90b1SMatthew Wilcox #include <linux/memcontrol.h>
25f7ca90b1SMatthew Wilcox #include <linux/mm.h>
26d475c634SMatthew Wilcox #include <linux/mutex.h>
279973c98eSRoss Zwisler #include <linux/pagevec.h>
282765cfbbSRoss Zwisler #include <linux/pmem.h>
29289c6aedSMatthew Wilcox #include <linux/sched.h>
30d475c634SMatthew Wilcox #include <linux/uio.h>
31f7ca90b1SMatthew Wilcox #include <linux/vmstat.h>
3234c0fd54SDan Williams #include <linux/pfn_t.h>
330e749e54SDan Williams #include <linux/sizes.h>
344b4bb46dSJan Kara #include <linux/mmu_notifier.h>
35a254e568SChristoph Hellwig #include <linux/iomap.h>
36a254e568SChristoph Hellwig #include "internal.h"
37d475c634SMatthew Wilcox 
38282a8e03SRoss Zwisler #define CREATE_TRACE_POINTS
39282a8e03SRoss Zwisler #include <trace/events/fs_dax.h>
40282a8e03SRoss Zwisler 
41ac401cc7SJan Kara /* We choose 4096 entries - same as per-zone page wait tables */
42ac401cc7SJan Kara #define DAX_WAIT_TABLE_BITS 12
43ac401cc7SJan Kara #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
44ac401cc7SJan Kara 
45ce95ab0fSRoss Zwisler static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
46ac401cc7SJan Kara 
47ac401cc7SJan Kara static int __init init_dax_wait_table(void)
48ac401cc7SJan Kara {
49ac401cc7SJan Kara 	int i;
50ac401cc7SJan Kara 
51ac401cc7SJan Kara 	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
52ac401cc7SJan Kara 		init_waitqueue_head(wait_table + i);
53ac401cc7SJan Kara 	return 0;
54ac401cc7SJan Kara }
55ac401cc7SJan Kara fs_initcall(init_dax_wait_table);
56ac401cc7SJan Kara 
57b2e0d162SDan Williams static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
58b2e0d162SDan Williams {
59b2e0d162SDan Williams 	struct request_queue *q = bdev->bd_queue;
60b2e0d162SDan Williams 	long rc = -EIO;
61b2e0d162SDan Williams 
627a9eb206SDan Williams 	dax->addr = ERR_PTR(-EIO);
63b2e0d162SDan Williams 	if (blk_queue_enter(q, true) != 0)
64b2e0d162SDan Williams 		return rc;
65b2e0d162SDan Williams 
66b2e0d162SDan Williams 	rc = bdev_direct_access(bdev, dax);
67b2e0d162SDan Williams 	if (rc < 0) {
687a9eb206SDan Williams 		dax->addr = ERR_PTR(rc);
69b2e0d162SDan Williams 		blk_queue_exit(q);
70b2e0d162SDan Williams 		return rc;
71b2e0d162SDan Williams 	}
72b2e0d162SDan Williams 	return rc;
73b2e0d162SDan Williams }
74b2e0d162SDan Williams 
75b2e0d162SDan Williams static void dax_unmap_atomic(struct block_device *bdev,
76b2e0d162SDan Williams 		const struct blk_dax_ctl *dax)
77b2e0d162SDan Williams {
78b2e0d162SDan Williams 	if (IS_ERR(dax->addr))
79b2e0d162SDan Williams 		return;
80b2e0d162SDan Williams 	blk_queue_exit(bdev->bd_queue);
81b2e0d162SDan Williams }
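
/*
 * Usage sketch for the map/unmap pair above (illustration only; error
 * handling abbreviated, and 'bdev'/'sector' stand for whatever the caller
 * already holds). dax_map_atomic() fills dax->addr and dax->pfn and, on
 * success, keeps a queue reference that dax_unmap_atomic() drops:
 *
 *	struct blk_dax_ctl dax = { .sector = sector, .size = PAGE_SIZE };
 *	long len = dax_map_atomic(bdev, &dax);
 *
 *	if (len < 0)
 *		return len;
 *	// ... use dax.addr / dax.pfn for up to 'len' bytes ...
 *	dax_unmap_atomic(bdev, &dax);
 */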
82b2e0d162SDan Williams 
83642261acSRoss Zwisler static int dax_is_pmd_entry(void *entry)
84642261acSRoss Zwisler {
85642261acSRoss Zwisler 	return (unsigned long)entry & RADIX_DAX_PMD;
86642261acSRoss Zwisler }
87642261acSRoss Zwisler 
88642261acSRoss Zwisler static int dax_is_pte_entry(void *entry)
89642261acSRoss Zwisler {
90642261acSRoss Zwisler 	return !((unsigned long)entry & RADIX_DAX_PMD);
91642261acSRoss Zwisler }
92642261acSRoss Zwisler 
93642261acSRoss Zwisler static int dax_is_zero_entry(void *entry)
94642261acSRoss Zwisler {
95642261acSRoss Zwisler 	return (unsigned long)entry & RADIX_DAX_HZP;
96642261acSRoss Zwisler }
97642261acSRoss Zwisler 
98642261acSRoss Zwisler static int dax_is_empty_entry(void *entry)
99642261acSRoss Zwisler {
100642261acSRoss Zwisler 	return (unsigned long)entry & RADIX_DAX_EMPTY;
101642261acSRoss Zwisler }
102642261acSRoss Zwisler 
103d1a5f2b4SDan Williams struct page *read_dax_sector(struct block_device *bdev, sector_t n)
104d1a5f2b4SDan Williams {
105d1a5f2b4SDan Williams 	struct page *page = alloc_pages(GFP_KERNEL, 0);
106d1a5f2b4SDan Williams 	struct blk_dax_ctl dax = {
107d1a5f2b4SDan Williams 		.size = PAGE_SIZE,
108d1a5f2b4SDan Williams 		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
109d1a5f2b4SDan Williams 	};
110d1a5f2b4SDan Williams 	long rc;
111d1a5f2b4SDan Williams 
112d1a5f2b4SDan Williams 	if (!page)
113d1a5f2b4SDan Williams 		return ERR_PTR(-ENOMEM);
114d1a5f2b4SDan Williams 
115d1a5f2b4SDan Williams 	rc = dax_map_atomic(bdev, &dax);
116d1a5f2b4SDan Williams 	if (rc < 0)
117d1a5f2b4SDan Williams 		return ERR_PTR(rc);
118d1a5f2b4SDan Williams 	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
119d1a5f2b4SDan Williams 	dax_unmap_atomic(bdev, &dax);
120d1a5f2b4SDan Williams 	return page;
121d1a5f2b4SDan Williams }
122d1a5f2b4SDan Williams 
123f7ca90b1SMatthew Wilcox /*
124ac401cc7SJan Kara  * DAX radix tree locking
125ac401cc7SJan Kara  */
126ac401cc7SJan Kara struct exceptional_entry_key {
127ac401cc7SJan Kara 	struct address_space *mapping;
12863e95b5cSRoss Zwisler 	pgoff_t entry_start;
129ac401cc7SJan Kara };
130ac401cc7SJan Kara 
131ac401cc7SJan Kara struct wait_exceptional_entry_queue {
132ac401cc7SJan Kara 	wait_queue_t wait;
133ac401cc7SJan Kara 	struct exceptional_entry_key key;
134ac401cc7SJan Kara };
135ac401cc7SJan Kara 
13663e95b5cSRoss Zwisler static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
13763e95b5cSRoss Zwisler 		pgoff_t index, void *entry, struct exceptional_entry_key *key)
13863e95b5cSRoss Zwisler {
13963e95b5cSRoss Zwisler 	unsigned long hash;
14063e95b5cSRoss Zwisler 
14163e95b5cSRoss Zwisler 	/*
14263e95b5cSRoss Zwisler 	 * If 'entry' is a PMD, align the 'index' that we use for the wait
14363e95b5cSRoss Zwisler 	 * queue to the start of that PMD.  This ensures that all offsets in
14463e95b5cSRoss Zwisler 	 * the range covered by the PMD map to the same bit lock.
14563e95b5cSRoss Zwisler 	 */
146642261acSRoss Zwisler 	if (dax_is_pmd_entry(entry))
14763e95b5cSRoss Zwisler 		index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
14863e95b5cSRoss Zwisler 
14963e95b5cSRoss Zwisler 	key->mapping = mapping;
15063e95b5cSRoss Zwisler 	key->entry_start = index;
15163e95b5cSRoss Zwisler 
15263e95b5cSRoss Zwisler 	hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
15363e95b5cSRoss Zwisler 	return wait_table + hash;
15463e95b5cSRoss Zwisler }
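
/*
 * Example of the alignment above (a sketch, assuming 4k pages and 2MiB PMDs,
 * i.e. PMD_SHIFT - PAGE_SHIFT == 9): for a PMD entry, a fault at page index
 * 0x203 uses entry_start 0x200, so every index in 0x200-0x3ff hashes to the
 * same waitqueue as the PMD entry itself.
 */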
15563e95b5cSRoss Zwisler 
156ac401cc7SJan Kara static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
157ac401cc7SJan Kara 				       int sync, void *keyp)
158ac401cc7SJan Kara {
159ac401cc7SJan Kara 	struct exceptional_entry_key *key = keyp;
160ac401cc7SJan Kara 	struct wait_exceptional_entry_queue *ewait =
161ac401cc7SJan Kara 		container_of(wait, struct wait_exceptional_entry_queue, wait);
162ac401cc7SJan Kara 
163ac401cc7SJan Kara 	if (key->mapping != ewait->key.mapping ||
16463e95b5cSRoss Zwisler 	    key->entry_start != ewait->key.entry_start)
165ac401cc7SJan Kara 		return 0;
166ac401cc7SJan Kara 	return autoremove_wake_function(wait, mode, sync, NULL);
167ac401cc7SJan Kara }
168ac401cc7SJan Kara 
169ac401cc7SJan Kara /*
170ac401cc7SJan Kara  * Check whether the given slot is locked. The function must be called with
171ac401cc7SJan Kara  * mapping->tree_lock held
172ac401cc7SJan Kara  */
173ac401cc7SJan Kara static inline int slot_locked(struct address_space *mapping, void **slot)
174ac401cc7SJan Kara {
175ac401cc7SJan Kara 	unsigned long entry = (unsigned long)
176ac401cc7SJan Kara 		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
177ac401cc7SJan Kara 	return entry & RADIX_DAX_ENTRY_LOCK;
178ac401cc7SJan Kara }
179ac401cc7SJan Kara 
180ac401cc7SJan Kara /*
181ac401cc7SJan Kara  * Mark the given slot as locked. The function must be called with
182ac401cc7SJan Kara  * mapping->tree_lock held
183ac401cc7SJan Kara  */
184ac401cc7SJan Kara static inline void *lock_slot(struct address_space *mapping, void **slot)
185ac401cc7SJan Kara {
186ac401cc7SJan Kara 	unsigned long entry = (unsigned long)
187ac401cc7SJan Kara 		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
188ac401cc7SJan Kara 
189ac401cc7SJan Kara 	entry |= RADIX_DAX_ENTRY_LOCK;
1906d75f366SJohannes Weiner 	radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
191ac401cc7SJan Kara 	return (void *)entry;
192ac401cc7SJan Kara }
193ac401cc7SJan Kara 
194ac401cc7SJan Kara /*
195ac401cc7SJan Kara  * Mark the given slot as unlocked. The function must be called with
196ac401cc7SJan Kara  * mapping->tree_lock held
197ac401cc7SJan Kara  */
198ac401cc7SJan Kara static inline void *unlock_slot(struct address_space *mapping, void **slot)
199ac401cc7SJan Kara {
200ac401cc7SJan Kara 	unsigned long entry = (unsigned long)
201ac401cc7SJan Kara 		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
202ac401cc7SJan Kara 
203ac401cc7SJan Kara 	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
2046d75f366SJohannes Weiner 	radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
205ac401cc7SJan Kara 	return (void *)entry;
206ac401cc7SJan Kara }
207ac401cc7SJan Kara 
208ac401cc7SJan Kara /*
209ac401cc7SJan Kara  * Look up the entry in the radix tree and, if it is an exceptional entry,
210ac401cc7SJan Kara  * wait for it to become unlocked before returning it. The caller must call
211ac401cc7SJan Kara  * put_unlocked_mapping_entry() if it decides not to lock the entry, or
212ac401cc7SJan Kara  * put_locked_mapping_entry() once it has locked the entry and wants to
213ac401cc7SJan Kara  * unlock it again.
214ac401cc7SJan Kara  *
215ac401cc7SJan Kara  * The function must be called with mapping->tree_lock held.
216ac401cc7SJan Kara  */
217ac401cc7SJan Kara static void *get_unlocked_mapping_entry(struct address_space *mapping,
218ac401cc7SJan Kara 					pgoff_t index, void ***slotp)
219ac401cc7SJan Kara {
220e3ad61c6SRoss Zwisler 	void *entry, **slot;
221ac401cc7SJan Kara 	struct wait_exceptional_entry_queue ewait;
22263e95b5cSRoss Zwisler 	wait_queue_head_t *wq;
223ac401cc7SJan Kara 
224ac401cc7SJan Kara 	init_wait(&ewait.wait);
225ac401cc7SJan Kara 	ewait.wait.func = wake_exceptional_entry_func;
226ac401cc7SJan Kara 
227ac401cc7SJan Kara 	for (;;) {
228e3ad61c6SRoss Zwisler 		entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
229ac401cc7SJan Kara 					  &slot);
230e3ad61c6SRoss Zwisler 		if (!entry || !radix_tree_exceptional_entry(entry) ||
231ac401cc7SJan Kara 		    !slot_locked(mapping, slot)) {
232ac401cc7SJan Kara 			if (slotp)
233ac401cc7SJan Kara 				*slotp = slot;
234e3ad61c6SRoss Zwisler 			return entry;
235ac401cc7SJan Kara 		}
23663e95b5cSRoss Zwisler 
23763e95b5cSRoss Zwisler 		wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
238ac401cc7SJan Kara 		prepare_to_wait_exclusive(wq, &ewait.wait,
239ac401cc7SJan Kara 					  TASK_UNINTERRUPTIBLE);
240ac401cc7SJan Kara 		spin_unlock_irq(&mapping->tree_lock);
241ac401cc7SJan Kara 		schedule();
242ac401cc7SJan Kara 		finish_wait(wq, &ewait.wait);
243ac401cc7SJan Kara 		spin_lock_irq(&mapping->tree_lock);
244ac401cc7SJan Kara 	}
245ac401cc7SJan Kara }
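
/*
 * Typical calling pattern for the lookup above (an illustrative sketch;
 * want_to_lock() is a placeholder for whatever policy the caller applies):
 *
 *	spin_lock_irq(&mapping->tree_lock);
 *	entry = get_unlocked_mapping_entry(mapping, index, &slot);
 *	if (!entry || !want_to_lock(entry)) {
 *		put_unlocked_mapping_entry(mapping, index, entry);
 *		spin_unlock_irq(&mapping->tree_lock);
 *		return;
 *	}
 *	entry = lock_slot(mapping, slot);	// sets RADIX_DAX_ENTRY_LOCK
 *	spin_unlock_irq(&mapping->tree_lock);
 *	// ... work on the locked entry without tree_lock held ...
 *	put_locked_mapping_entry(mapping, index, entry);
 */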
246ac401cc7SJan Kara 
247b1aa812bSJan Kara static void dax_unlock_mapping_entry(struct address_space *mapping,
248b1aa812bSJan Kara 				     pgoff_t index)
249b1aa812bSJan Kara {
250b1aa812bSJan Kara 	void *entry, **slot;
251b1aa812bSJan Kara 
252b1aa812bSJan Kara 	spin_lock_irq(&mapping->tree_lock);
253b1aa812bSJan Kara 	entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
254b1aa812bSJan Kara 	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
255b1aa812bSJan Kara 			 !slot_locked(mapping, slot))) {
256b1aa812bSJan Kara 		spin_unlock_irq(&mapping->tree_lock);
257b1aa812bSJan Kara 		return;
258b1aa812bSJan Kara 	}
259b1aa812bSJan Kara 	unlock_slot(mapping, slot);
260b1aa812bSJan Kara 	spin_unlock_irq(&mapping->tree_lock);
261b1aa812bSJan Kara 	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
262b1aa812bSJan Kara }
263b1aa812bSJan Kara 
264ac401cc7SJan Kara static void put_locked_mapping_entry(struct address_space *mapping,
265ac401cc7SJan Kara 				     pgoff_t index, void *entry)
266ac401cc7SJan Kara {
267ac401cc7SJan Kara 	if (!radix_tree_exceptional_entry(entry)) {
268ac401cc7SJan Kara 		unlock_page(entry);
269ac401cc7SJan Kara 		put_page(entry);
270ac401cc7SJan Kara 	} else {
271bc2466e4SJan Kara 		dax_unlock_mapping_entry(mapping, index);
272ac401cc7SJan Kara 	}
273ac401cc7SJan Kara }
274ac401cc7SJan Kara 
275ac401cc7SJan Kara /*
276ac401cc7SJan Kara  * Called when we are done with a radix tree entry we looked up via
277ac401cc7SJan Kara  * get_unlocked_mapping_entry() and did not lock in the end.
278ac401cc7SJan Kara  */
279ac401cc7SJan Kara static void put_unlocked_mapping_entry(struct address_space *mapping,
280ac401cc7SJan Kara 				       pgoff_t index, void *entry)
281ac401cc7SJan Kara {
282ac401cc7SJan Kara 	if (!radix_tree_exceptional_entry(entry))
283ac401cc7SJan Kara 		return;
284ac401cc7SJan Kara 
285ac401cc7SJan Kara 	/* We have to wake up next waiter for the radix tree entry lock */
286422476c4SRoss Zwisler 	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
287422476c4SRoss Zwisler }
288422476c4SRoss Zwisler 
289ac401cc7SJan Kara /*
290ac401cc7SJan Kara  * Find the radix tree entry at the given index. If it points to a page,
291ac401cc7SJan Kara  * return with the page locked. If it points to an exceptional entry, return
292ac401cc7SJan Kara  * with the radix tree entry locked. If the radix tree doesn't contain the
293ac401cc7SJan Kara  * given index, create an empty exceptional entry for it and return it locked.
294ac401cc7SJan Kara  *
295642261acSRoss Zwisler  * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
296642261acSRoss Zwisler  * either return that locked entry or will return an error.  This error will
297642261acSRoss Zwisler  * happen if there are any 4k entries (either zero pages or DAX entries)
298642261acSRoss Zwisler  * within the 2MiB range that we are requesting.
299642261acSRoss Zwisler  *
300642261acSRoss Zwisler  * We always favor 4k entries over 2MiB entries. There isn't a flow where we
301642261acSRoss Zwisler  * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
302642261acSRoss Zwisler  * insertion will fail if it finds any 4k entries already in the tree, and a
303642261acSRoss Zwisler  * 4k insertion will cause an existing 2MiB entry to be unmapped and
304642261acSRoss Zwisler  * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
305642261acSRoss Zwisler  * well as 2MiB empty entries.
306642261acSRoss Zwisler  *
307642261acSRoss Zwisler  * The exception to this downgrade path is for 2MiB DAX PMD entries that have
308642261acSRoss Zwisler  * real storage backing them.  We will leave these real 2MiB DAX entries in
309642261acSRoss Zwisler  * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
310642261acSRoss Zwisler  *
311ac401cc7SJan Kara  * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
312ac401cc7SJan Kara  * persistent memory the benefit is doubtful. We can add that later if we can
313ac401cc7SJan Kara  * show it helps.
314ac401cc7SJan Kara  */
315642261acSRoss Zwisler static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
316642261acSRoss Zwisler 		unsigned long size_flag)
317ac401cc7SJan Kara {
318642261acSRoss Zwisler 	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
319e3ad61c6SRoss Zwisler 	void *entry, **slot;
320ac401cc7SJan Kara 
321ac401cc7SJan Kara restart:
322ac401cc7SJan Kara 	spin_lock_irq(&mapping->tree_lock);
323e3ad61c6SRoss Zwisler 	entry = get_unlocked_mapping_entry(mapping, index, &slot);
324642261acSRoss Zwisler 
325642261acSRoss Zwisler 	if (entry) {
326642261acSRoss Zwisler 		if (size_flag & RADIX_DAX_PMD) {
327642261acSRoss Zwisler 			if (!radix_tree_exceptional_entry(entry) ||
328642261acSRoss Zwisler 			    dax_is_pte_entry(entry)) {
329642261acSRoss Zwisler 				put_unlocked_mapping_entry(mapping, index,
330642261acSRoss Zwisler 						entry);
331642261acSRoss Zwisler 				entry = ERR_PTR(-EEXIST);
332642261acSRoss Zwisler 				goto out_unlock;
333642261acSRoss Zwisler 			}
334642261acSRoss Zwisler 		} else { /* trying to grab a PTE entry */
335642261acSRoss Zwisler 			if (radix_tree_exceptional_entry(entry) &&
336642261acSRoss Zwisler 			    dax_is_pmd_entry(entry) &&
337642261acSRoss Zwisler 			    (dax_is_zero_entry(entry) ||
338642261acSRoss Zwisler 			     dax_is_empty_entry(entry))) {
339642261acSRoss Zwisler 				pmd_downgrade = true;
340642261acSRoss Zwisler 			}
341642261acSRoss Zwisler 		}
342642261acSRoss Zwisler 	}
343642261acSRoss Zwisler 
344ac401cc7SJan Kara 	/* No entry for given index? Make sure radix tree is big enough. */
345642261acSRoss Zwisler 	if (!entry || pmd_downgrade) {
346ac401cc7SJan Kara 		int err;
347ac401cc7SJan Kara 
348642261acSRoss Zwisler 		if (pmd_downgrade) {
349642261acSRoss Zwisler 			/*
350642261acSRoss Zwisler 			 * Make sure 'entry' remains valid while we drop
351642261acSRoss Zwisler 			 * mapping->tree_lock.
352642261acSRoss Zwisler 			 */
353642261acSRoss Zwisler 			entry = lock_slot(mapping, slot);
354642261acSRoss Zwisler 		}
355642261acSRoss Zwisler 
356ac401cc7SJan Kara 		spin_unlock_irq(&mapping->tree_lock);
357642261acSRoss Zwisler 		/*
358642261acSRoss Zwisler 		 * Besides huge zero pages, the only other things that get
359642261acSRoss Zwisler 		 * downgraded are empty entries, which don't need to be
360642261acSRoss Zwisler 		 * unmapped.
361642261acSRoss Zwisler 		 */
362642261acSRoss Zwisler 		if (pmd_downgrade && dax_is_zero_entry(entry))
363642261acSRoss Zwisler 			unmap_mapping_range(mapping,
364642261acSRoss Zwisler 				(index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
365642261acSRoss Zwisler 
3660cb80b48SJan Kara 		err = radix_tree_preload(
3670cb80b48SJan Kara 				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
3680cb80b48SJan Kara 		if (err) {
3690cb80b48SJan Kara 			if (pmd_downgrade)
3700cb80b48SJan Kara 				put_locked_mapping_entry(mapping, index, entry);
3710cb80b48SJan Kara 			return ERR_PTR(err);
3720cb80b48SJan Kara 		}
373ac401cc7SJan Kara 		spin_lock_irq(&mapping->tree_lock);
374642261acSRoss Zwisler 
375642261acSRoss Zwisler 		if (pmd_downgrade) {
376642261acSRoss Zwisler 			radix_tree_delete(&mapping->page_tree, index);
377642261acSRoss Zwisler 			mapping->nrexceptional--;
378642261acSRoss Zwisler 			dax_wake_mapping_entry_waiter(mapping, index, entry,
379642261acSRoss Zwisler 					true);
380642261acSRoss Zwisler 		}
381642261acSRoss Zwisler 
382642261acSRoss Zwisler 		entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
383642261acSRoss Zwisler 
384642261acSRoss Zwisler 		err = __radix_tree_insert(&mapping->page_tree, index,
385642261acSRoss Zwisler 				dax_radix_order(entry), entry);
386ac401cc7SJan Kara 		radix_tree_preload_end();
387ac401cc7SJan Kara 		if (err) {
388ac401cc7SJan Kara 			spin_unlock_irq(&mapping->tree_lock);
389642261acSRoss Zwisler 			/*
390642261acSRoss Zwisler 			 * Someone already created the entry?  This is a
391642261acSRoss Zwisler 			 * normal failure when inserting PMDs in a range
392642261acSRoss Zwisler 			 * that already contains PTEs.  In that case we want
393642261acSRoss Zwisler 			 * to return -EEXIST immediately.
394642261acSRoss Zwisler 			 */
395642261acSRoss Zwisler 			if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD))
396ac401cc7SJan Kara 				goto restart;
397642261acSRoss Zwisler 			/*
398642261acSRoss Zwisler 			 * Our insertion of a DAX PMD entry failed, most
399642261acSRoss Zwisler 			 * likely because it collided with a PTE sized entry
400642261acSRoss Zwisler 			 * at a different index in the PMD range.  We haven't
401642261acSRoss Zwisler 			 * inserted anything into the radix tree and have no
402642261acSRoss Zwisler 			 * waiters to wake.
403642261acSRoss Zwisler 			 */
404ac401cc7SJan Kara 			return ERR_PTR(err);
405ac401cc7SJan Kara 		}
406ac401cc7SJan Kara 		/* Good, we have inserted empty locked entry into the tree. */
407ac401cc7SJan Kara 		mapping->nrexceptional++;
408ac401cc7SJan Kara 		spin_unlock_irq(&mapping->tree_lock);
409e3ad61c6SRoss Zwisler 		return entry;
410ac401cc7SJan Kara 	}
411ac401cc7SJan Kara 	/* Normal page in radix tree? */
412e3ad61c6SRoss Zwisler 	if (!radix_tree_exceptional_entry(entry)) {
413e3ad61c6SRoss Zwisler 		struct page *page = entry;
414ac401cc7SJan Kara 
415ac401cc7SJan Kara 		get_page(page);
416ac401cc7SJan Kara 		spin_unlock_irq(&mapping->tree_lock);
417ac401cc7SJan Kara 		lock_page(page);
418ac401cc7SJan Kara 		/* Page got truncated? Retry... */
419ac401cc7SJan Kara 		if (unlikely(page->mapping != mapping)) {
420ac401cc7SJan Kara 			unlock_page(page);
421ac401cc7SJan Kara 			put_page(page);
422ac401cc7SJan Kara 			goto restart;
423ac401cc7SJan Kara 		}
424ac401cc7SJan Kara 		return page;
425ac401cc7SJan Kara 	}
426e3ad61c6SRoss Zwisler 	entry = lock_slot(mapping, slot);
427642261acSRoss Zwisler  out_unlock:
428ac401cc7SJan Kara 	spin_unlock_irq(&mapping->tree_lock);
429e3ad61c6SRoss Zwisler 	return entry;
430ac401cc7SJan Kara }
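
/*
 * Concrete examples of the rules documented above grab_mapping_entry()
 * (a sketch, assuming 4k PTE entries and 2MiB PMD entries):
 *
 *  - grab_mapping_entry(mapping, index, RADIX_DAX_PMD) returns
 *    ERR_PTR(-EEXIST) if any 4k entry already exists in the 2MiB range,
 *    so the caller must fall back to PTE handling.
 *  - grab_mapping_entry(mapping, index, 0) against an existing huge zero
 *    page or empty PMD entry unmaps and deletes that entry, then inserts a
 *    fresh locked empty PTE-sized entry (the downgrade path).
 *  - grab_mapping_entry(mapping, index, 0) against a real block-backed PMD
 *    entry simply returns that PMD entry locked; it is never split.
 */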
431ac401cc7SJan Kara 
43263e95b5cSRoss Zwisler /*
43363e95b5cSRoss Zwisler  * We do not necessarily hold the mapping->tree_lock when we call this
43463e95b5cSRoss Zwisler  * function so it is possible that 'entry' is no longer a valid item in the
435642261acSRoss Zwisler  * radix tree.  This is okay because all we really need to do is to find the
436642261acSRoss Zwisler  * correct waitqueue where tasks might be waiting for that old 'entry' and
437642261acSRoss Zwisler  * wake them.
43863e95b5cSRoss Zwisler  */
439ac401cc7SJan Kara void dax_wake_mapping_entry_waiter(struct address_space *mapping,
44063e95b5cSRoss Zwisler 		pgoff_t index, void *entry, bool wake_all)
441ac401cc7SJan Kara {
44263e95b5cSRoss Zwisler 	struct exceptional_entry_key key;
44363e95b5cSRoss Zwisler 	wait_queue_head_t *wq;
44463e95b5cSRoss Zwisler 
44563e95b5cSRoss Zwisler 	wq = dax_entry_waitqueue(mapping, index, entry, &key);
446ac401cc7SJan Kara 
447ac401cc7SJan Kara 	/*
448ac401cc7SJan Kara 	 * Checking for locked entry and prepare_to_wait_exclusive() happens
449ac401cc7SJan Kara 	 * under mapping->tree_lock, ditto for entry handling in our callers.
450ac401cc7SJan Kara 	 * So at this point all tasks that could have seen our entry locked
451ac401cc7SJan Kara 	 * must be in the waitqueue and the following check will see them.
452ac401cc7SJan Kara 	 */
45363e95b5cSRoss Zwisler 	if (waitqueue_active(wq))
454ac401cc7SJan Kara 		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
455ac401cc7SJan Kara }
456ac401cc7SJan Kara 
457c6dcf52cSJan Kara static int __dax_invalidate_mapping_entry(struct address_space *mapping,
458c6dcf52cSJan Kara 					  pgoff_t index, bool trunc)
459c6dcf52cSJan Kara {
460c6dcf52cSJan Kara 	int ret = 0;
461c6dcf52cSJan Kara 	void *entry;
462c6dcf52cSJan Kara 	struct radix_tree_root *page_tree = &mapping->page_tree;
463c6dcf52cSJan Kara 
464c6dcf52cSJan Kara 	spin_lock_irq(&mapping->tree_lock);
465c6dcf52cSJan Kara 	entry = get_unlocked_mapping_entry(mapping, index, NULL);
466c6dcf52cSJan Kara 	if (!entry || !radix_tree_exceptional_entry(entry))
467c6dcf52cSJan Kara 		goto out;
468c6dcf52cSJan Kara 	if (!trunc &&
469c6dcf52cSJan Kara 	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
470c6dcf52cSJan Kara 	     radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
471c6dcf52cSJan Kara 		goto out;
472c6dcf52cSJan Kara 	radix_tree_delete(page_tree, index);
473c6dcf52cSJan Kara 	mapping->nrexceptional--;
474c6dcf52cSJan Kara 	ret = 1;
475c6dcf52cSJan Kara out:
476c6dcf52cSJan Kara 	put_unlocked_mapping_entry(mapping, index, entry);
477c6dcf52cSJan Kara 	spin_unlock_irq(&mapping->tree_lock);
478c6dcf52cSJan Kara 	return ret;
479c6dcf52cSJan Kara }
480ac401cc7SJan Kara /*
481ac401cc7SJan Kara  * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
482ac401cc7SJan Kara  * entry to get unlocked before deleting it.
483ac401cc7SJan Kara  */
484ac401cc7SJan Kara int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
485ac401cc7SJan Kara {
486c6dcf52cSJan Kara 	int ret = __dax_invalidate_mapping_entry(mapping, index, true);
487ac401cc7SJan Kara 
488ac401cc7SJan Kara 	/*
489ac401cc7SJan Kara 	 * This gets called from the truncate / punch_hole path. As such, the
490ac401cc7SJan Kara 	 * caller must hold locks protecting against concurrent modifications
491ac401cc7SJan Kara 	 * of the radix tree (usually the fs-private i_mmap_sem held for
492ac401cc7SJan Kara 	 * writing). Since the caller has seen an exceptional entry for this
493ac401cc7SJan Kara 	 * index, we had better find it at that index as well...
494ac401cc7SJan Kara 	 */
495c6dcf52cSJan Kara 	WARN_ON_ONCE(!ret);
496c6dcf52cSJan Kara 	return ret;
497ac401cc7SJan Kara }
498ac401cc7SJan Kara 
499c6dcf52cSJan Kara /*
500c6dcf52cSJan Kara  * Invalidate exceptional DAX entry if easily possible. This handles DAX
501c6dcf52cSJan Kara  * entries for invalidate_inode_pages() so we evict the entry only if we can
502c6dcf52cSJan Kara  * do so without blocking.
503c6dcf52cSJan Kara  */
504c6dcf52cSJan Kara int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
505c6dcf52cSJan Kara {
506c6dcf52cSJan Kara 	int ret = 0;
507c6dcf52cSJan Kara 	void *entry, **slot;
508c6dcf52cSJan Kara 	struct radix_tree_root *page_tree = &mapping->page_tree;
509c6dcf52cSJan Kara 
510c6dcf52cSJan Kara 	spin_lock_irq(&mapping->tree_lock);
511c6dcf52cSJan Kara 	entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
512c6dcf52cSJan Kara 	if (!entry || !radix_tree_exceptional_entry(entry) ||
513c6dcf52cSJan Kara 	    slot_locked(mapping, slot))
514c6dcf52cSJan Kara 		goto out;
515c6dcf52cSJan Kara 	if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
516c6dcf52cSJan Kara 	    radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
517c6dcf52cSJan Kara 		goto out;
518c6dcf52cSJan Kara 	radix_tree_delete(page_tree, index);
519c6dcf52cSJan Kara 	mapping->nrexceptional--;
520c6dcf52cSJan Kara 	ret = 1;
521c6dcf52cSJan Kara out:
522c6dcf52cSJan Kara 	spin_unlock_irq(&mapping->tree_lock);
523c6dcf52cSJan Kara 	if (ret)
524c6dcf52cSJan Kara 		dax_wake_mapping_entry_waiter(mapping, index, entry, true);
525c6dcf52cSJan Kara 	return ret;
526c6dcf52cSJan Kara }
527c6dcf52cSJan Kara 
528c6dcf52cSJan Kara /*
529c6dcf52cSJan Kara  * Invalidate exceptional DAX entry if it is clean.
530c6dcf52cSJan Kara  */
531c6dcf52cSJan Kara int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
532c6dcf52cSJan Kara 				      pgoff_t index)
533c6dcf52cSJan Kara {
534c6dcf52cSJan Kara 	return __dax_invalidate_mapping_entry(mapping, index, false);
535ac401cc7SJan Kara }
536ac401cc7SJan Kara 
537ac401cc7SJan Kara /*
538f7ca90b1SMatthew Wilcox  * The user has performed a load from a hole in the file.  Allocating
539f7ca90b1SMatthew Wilcox  * a new page in the file would cause excessive storage usage for
540f7ca90b1SMatthew Wilcox  * workloads with sparse files.  We allocate a page cache page instead.
541f7ca90b1SMatthew Wilcox  * We'll kick it out of the page cache if it's ever written to,
542f7ca90b1SMatthew Wilcox  * otherwise it will simply fall out of the page cache under memory
543f7ca90b1SMatthew Wilcox  * pressure without ever having been dirtied.
544f7ca90b1SMatthew Wilcox  */
545f449b936SJan Kara static int dax_load_hole(struct address_space *mapping, void **entry,
546f7ca90b1SMatthew Wilcox 			 struct vm_fault *vmf)
547f7ca90b1SMatthew Wilcox {
548ac401cc7SJan Kara 	struct page *page;
549f449b936SJan Kara 	int ret;
550f7ca90b1SMatthew Wilcox 
551ac401cc7SJan Kara 	/* Hole page already exists? Return it...  */
552f449b936SJan Kara 	if (!radix_tree_exceptional_entry(*entry)) {
553f449b936SJan Kara 		page = *entry;
554f449b936SJan Kara 		goto out;
555ac401cc7SJan Kara 	}
556ac401cc7SJan Kara 
557ac401cc7SJan Kara 	/* This will replace locked radix tree entry with a hole page */
558ac401cc7SJan Kara 	page = find_or_create_page(mapping, vmf->pgoff,
559ac401cc7SJan Kara 				   vmf->gfp_mask | __GFP_ZERO);
560b1aa812bSJan Kara 	if (!page)
561ac401cc7SJan Kara 		return VM_FAULT_OOM;
562f449b936SJan Kara  out:
563f7ca90b1SMatthew Wilcox 	vmf->page = page;
564f449b936SJan Kara 	ret = finish_fault(vmf);
565f449b936SJan Kara 	vmf->page = NULL;
566f449b936SJan Kara 	*entry = page;
567f449b936SJan Kara 	if (!ret) {
568f449b936SJan Kara 		/* Grab reference for PTE that is now referencing the page */
569f449b936SJan Kara 		get_page(page);
570f449b936SJan Kara 		return VM_FAULT_NOPAGE;
571f449b936SJan Kara 	}
572f449b936SJan Kara 	return ret;
573f7ca90b1SMatthew Wilcox }
574f7ca90b1SMatthew Wilcox 
575b0d5e82fSChristoph Hellwig static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
576b0d5e82fSChristoph Hellwig 		struct page *to, unsigned long vaddr)
577f7ca90b1SMatthew Wilcox {
578b2e0d162SDan Williams 	struct blk_dax_ctl dax = {
579b0d5e82fSChristoph Hellwig 		.sector = sector,
580b0d5e82fSChristoph Hellwig 		.size = size,
581b2e0d162SDan Williams 	};
582e2e05394SRoss Zwisler 	void *vto;
583e2e05394SRoss Zwisler 
584b2e0d162SDan Williams 	if (dax_map_atomic(bdev, &dax) < 0)
585b2e0d162SDan Williams 		return PTR_ERR(dax.addr);
586f7ca90b1SMatthew Wilcox 	vto = kmap_atomic(to);
587b2e0d162SDan Williams 	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
588f7ca90b1SMatthew Wilcox 	kunmap_atomic(vto);
589b2e0d162SDan Williams 	dax_unmap_atomic(bdev, &dax);
590f7ca90b1SMatthew Wilcox 	return 0;
591f7ca90b1SMatthew Wilcox }
592f7ca90b1SMatthew Wilcox 
593642261acSRoss Zwisler /*
594642261acSRoss Zwisler  * By this point grab_mapping_entry() has ensured that we have a locked entry
595642261acSRoss Zwisler  * of the appropriate size so we don't have to worry about downgrading PMDs to
596642261acSRoss Zwisler  * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
597642261acSRoss Zwisler  * already in the tree, we will skip the insertion and just dirty the PMD as
598642261acSRoss Zwisler  * appropriate.
599642261acSRoss Zwisler  */
600ac401cc7SJan Kara static void *dax_insert_mapping_entry(struct address_space *mapping,
601ac401cc7SJan Kara 				      struct vm_fault *vmf,
602642261acSRoss Zwisler 				      void *entry, sector_t sector,
603642261acSRoss Zwisler 				      unsigned long flags)
6049973c98eSRoss Zwisler {
6059973c98eSRoss Zwisler 	struct radix_tree_root *page_tree = &mapping->page_tree;
606ac401cc7SJan Kara 	int error = 0;
607ac401cc7SJan Kara 	bool hole_fill = false;
608ac401cc7SJan Kara 	void *new_entry;
609ac401cc7SJan Kara 	pgoff_t index = vmf->pgoff;
6109973c98eSRoss Zwisler 
611ac401cc7SJan Kara 	if (vmf->flags & FAULT_FLAG_WRITE)
6129973c98eSRoss Zwisler 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
6139973c98eSRoss Zwisler 
614ac401cc7SJan Kara 	/* Replacing hole page with block mapping? */
615ac401cc7SJan Kara 	if (!radix_tree_exceptional_entry(entry)) {
616ac401cc7SJan Kara 		hole_fill = true;
6179973c98eSRoss Zwisler 		/*
618ac401cc7SJan Kara 		 * Unmap the page now before we remove it from page cache below.
619ac401cc7SJan Kara 		 * The page is locked so it cannot be faulted in again.
6209973c98eSRoss Zwisler 		 */
621ac401cc7SJan Kara 		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
622ac401cc7SJan Kara 				    PAGE_SIZE, 0);
623ac401cc7SJan Kara 		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
6249973c98eSRoss Zwisler 		if (error)
625ac401cc7SJan Kara 			return ERR_PTR(error);
626642261acSRoss Zwisler 	} else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
627642261acSRoss Zwisler 		/* replacing huge zero page with PMD block mapping */
628642261acSRoss Zwisler 		unmap_mapping_range(mapping,
629642261acSRoss Zwisler 			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
630ac401cc7SJan Kara 	}
6319973c98eSRoss Zwisler 
632ac401cc7SJan Kara 	spin_lock_irq(&mapping->tree_lock);
633642261acSRoss Zwisler 	new_entry = dax_radix_locked_entry(sector, flags);
634642261acSRoss Zwisler 
635ac401cc7SJan Kara 	if (hole_fill) {
636ac401cc7SJan Kara 		__delete_from_page_cache(entry, NULL);
637ac401cc7SJan Kara 		/* Drop pagecache reference */
638ac401cc7SJan Kara 		put_page(entry);
639642261acSRoss Zwisler 		error = __radix_tree_insert(page_tree, index,
640642261acSRoss Zwisler 				dax_radix_order(new_entry), new_entry);
641ac401cc7SJan Kara 		if (error) {
642ac401cc7SJan Kara 			new_entry = ERR_PTR(error);
643ac401cc7SJan Kara 			goto unlock;
644ac401cc7SJan Kara 		}
6459973c98eSRoss Zwisler 		mapping->nrexceptional++;
646642261acSRoss Zwisler 	} else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
647642261acSRoss Zwisler 		/*
648642261acSRoss Zwisler 		 * Only swap our new entry into the radix tree if the current
649642261acSRoss Zwisler 		 * entry is a zero page or an empty entry.  If a normal PTE or
650642261acSRoss Zwisler 		 * PMD entry is already in the tree, we leave it alone.  This
651642261acSRoss Zwisler 		 * means that if we are trying to insert a PTE and the
652642261acSRoss Zwisler 		 * existing entry is a PMD, we will just leave the PMD in the
653642261acSRoss Zwisler 		 * tree and dirty it if necessary.
654642261acSRoss Zwisler 		 */
655f7942430SJohannes Weiner 		struct radix_tree_node *node;
656ac401cc7SJan Kara 		void **slot;
657ac401cc7SJan Kara 		void *ret;
658ac401cc7SJan Kara 
659f7942430SJohannes Weiner 		ret = __radix_tree_lookup(page_tree, index, &node, &slot);
660ac401cc7SJan Kara 		WARN_ON_ONCE(ret != entry);
6614d693d08SJohannes Weiner 		__radix_tree_replace(page_tree, node, slot,
6624d693d08SJohannes Weiner 				     new_entry, NULL, NULL);
663ac401cc7SJan Kara 	}
664ac401cc7SJan Kara 	if (vmf->flags & FAULT_FLAG_WRITE)
6659973c98eSRoss Zwisler 		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
6669973c98eSRoss Zwisler  unlock:
6679973c98eSRoss Zwisler 	spin_unlock_irq(&mapping->tree_lock);
668ac401cc7SJan Kara 	if (hole_fill) {
669ac401cc7SJan Kara 		radix_tree_preload_end();
670ac401cc7SJan Kara 		/*
671ac401cc7SJan Kara 		 * We don't need the hole page anymore; it has been replaced
672ac401cc7SJan Kara 		 * with a locked radix tree entry now.
673ac401cc7SJan Kara 		 */
674ac401cc7SJan Kara 		if (mapping->a_ops->freepage)
675ac401cc7SJan Kara 			mapping->a_ops->freepage(entry);
676ac401cc7SJan Kara 		unlock_page(entry);
677ac401cc7SJan Kara 		put_page(entry);
678ac401cc7SJan Kara 	}
679ac401cc7SJan Kara 	return new_entry;
6809973c98eSRoss Zwisler }
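
/*
 * For reference, the exceptional entries managed above pack everything into
 * a single unsigned long (a sketch; the authoritative bit layout is the
 * RADIX_DAX_* definitions in dax.h): the low bits carry the radix tree
 * exceptional-entry marker, the entry lock bit and the type flags
 * (PMD / HZP / EMPTY), while the remaining high bits hold the sector that
 * dax_radix_sector() recovers.
 */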
6819973c98eSRoss Zwisler 
6824b4bb46dSJan Kara static inline unsigned long
6834b4bb46dSJan Kara pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
6844b4bb46dSJan Kara {
6854b4bb46dSJan Kara 	unsigned long address;
6864b4bb46dSJan Kara 
6874b4bb46dSJan Kara 	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
6884b4bb46dSJan Kara 	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
6894b4bb46dSJan Kara 	return address;
6904b4bb46dSJan Kara }
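
/*
 * Worked example for pgoff_address() (sketch, 4k pages): a VMA with
 * vm_start = 0x7f0000000000 and vm_pgoff = 0x10 maps file page 0x13 at
 * 0x7f0000000000 + ((0x13 - 0x10) << 12) = 0x7f0000003000.
 */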
6914b4bb46dSJan Kara 
6924b4bb46dSJan Kara /* Walk all mappings of a given index of a file and writeprotect them */
6934b4bb46dSJan Kara static void dax_mapping_entry_mkclean(struct address_space *mapping,
6944b4bb46dSJan Kara 				      pgoff_t index, unsigned long pfn)
6954b4bb46dSJan Kara {
6964b4bb46dSJan Kara 	struct vm_area_struct *vma;
697f729c8c9SRoss Zwisler 	pte_t pte, *ptep = NULL;
698f729c8c9SRoss Zwisler 	pmd_t *pmdp = NULL;
6994b4bb46dSJan Kara 	spinlock_t *ptl;
7004b4bb46dSJan Kara 	bool changed;
7014b4bb46dSJan Kara 
7024b4bb46dSJan Kara 	i_mmap_lock_read(mapping);
7034b4bb46dSJan Kara 	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
7044b4bb46dSJan Kara 		unsigned long address;
7054b4bb46dSJan Kara 
7064b4bb46dSJan Kara 		cond_resched();
7074b4bb46dSJan Kara 
7084b4bb46dSJan Kara 		if (!(vma->vm_flags & VM_SHARED))
7094b4bb46dSJan Kara 			continue;
7104b4bb46dSJan Kara 
7114b4bb46dSJan Kara 		address = pgoff_address(index, vma);
7124b4bb46dSJan Kara 		changed = false;
713f729c8c9SRoss Zwisler 		if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl))
7144b4bb46dSJan Kara 			continue;
715f729c8c9SRoss Zwisler 
716f729c8c9SRoss Zwisler 		if (pmdp) {
717f729c8c9SRoss Zwisler #ifdef CONFIG_FS_DAX_PMD
718f729c8c9SRoss Zwisler 			pmd_t pmd;
719f729c8c9SRoss Zwisler 
720f729c8c9SRoss Zwisler 			if (pfn != pmd_pfn(*pmdp))
721f729c8c9SRoss Zwisler 				goto unlock_pmd;
722f729c8c9SRoss Zwisler 			if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
723f729c8c9SRoss Zwisler 				goto unlock_pmd;
724f729c8c9SRoss Zwisler 
725f729c8c9SRoss Zwisler 			flush_cache_page(vma, address, pfn);
726f729c8c9SRoss Zwisler 			pmd = pmdp_huge_clear_flush(vma, address, pmdp);
727f729c8c9SRoss Zwisler 			pmd = pmd_wrprotect(pmd);
728f729c8c9SRoss Zwisler 			pmd = pmd_mkclean(pmd);
729f729c8c9SRoss Zwisler 			set_pmd_at(vma->vm_mm, address, pmdp, pmd);
730f729c8c9SRoss Zwisler 			changed = true;
731f729c8c9SRoss Zwisler unlock_pmd:
732f729c8c9SRoss Zwisler 			spin_unlock(ptl);
733f729c8c9SRoss Zwisler #endif
734f729c8c9SRoss Zwisler 		} else {
7354b4bb46dSJan Kara 			if (pfn != pte_pfn(*ptep))
736f729c8c9SRoss Zwisler 				goto unlock_pte;
7374b4bb46dSJan Kara 			if (!pte_dirty(*ptep) && !pte_write(*ptep))
738f729c8c9SRoss Zwisler 				goto unlock_pte;
7394b4bb46dSJan Kara 
7404b4bb46dSJan Kara 			flush_cache_page(vma, address, pfn);
7414b4bb46dSJan Kara 			pte = ptep_clear_flush(vma, address, ptep);
7424b4bb46dSJan Kara 			pte = pte_wrprotect(pte);
7434b4bb46dSJan Kara 			pte = pte_mkclean(pte);
7444b4bb46dSJan Kara 			set_pte_at(vma->vm_mm, address, ptep, pte);
7454b4bb46dSJan Kara 			changed = true;
746f729c8c9SRoss Zwisler unlock_pte:
7474b4bb46dSJan Kara 			pte_unmap_unlock(ptep, ptl);
748f729c8c9SRoss Zwisler 		}
7494b4bb46dSJan Kara 
7504b4bb46dSJan Kara 		if (changed)
7514b4bb46dSJan Kara 			mmu_notifier_invalidate_page(vma->vm_mm, address);
7524b4bb46dSJan Kara 	}
7534b4bb46dSJan Kara 	i_mmap_unlock_read(mapping);
7544b4bb46dSJan Kara }
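
/*
 * The net effect of dax_mapping_entry_mkclean() is that every PTE or PMD in
 * any VM_SHARED mapping of the given file offset becomes clean and
 * write-protected, so the next store faults again and re-dirties the radix
 * tree entry. That is what allows dax_writeback_one() below to clear the
 * dirty tag safely once the flush has completed.
 */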
7554b4bb46dSJan Kara 
7569973c98eSRoss Zwisler static int dax_writeback_one(struct block_device *bdev,
7579973c98eSRoss Zwisler 		struct address_space *mapping, pgoff_t index, void *entry)
7589973c98eSRoss Zwisler {
7599973c98eSRoss Zwisler 	struct radix_tree_root *page_tree = &mapping->page_tree;
7609973c98eSRoss Zwisler 	struct blk_dax_ctl dax;
761a6abc2c0SJan Kara 	void *entry2, **slot;
7629973c98eSRoss Zwisler 	int ret = 0;
7639973c98eSRoss Zwisler 
7649973c98eSRoss Zwisler 	/*
765a6abc2c0SJan Kara 	 * A page got tagged dirty in DAX mapping? Something is seriously
766a6abc2c0SJan Kara 	 * wrong.
7679973c98eSRoss Zwisler 	 */
768a6abc2c0SJan Kara 	if (WARN_ON(!radix_tree_exceptional_entry(entry)))
769a6abc2c0SJan Kara 		return -EIO;
7709973c98eSRoss Zwisler 
771a6abc2c0SJan Kara 	spin_lock_irq(&mapping->tree_lock);
772a6abc2c0SJan Kara 	entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
773a6abc2c0SJan Kara 	/* Entry got punched out / reallocated? */
774a6abc2c0SJan Kara 	if (!entry2 || !radix_tree_exceptional_entry(entry2))
775a6abc2c0SJan Kara 		goto put_unlocked;
776a6abc2c0SJan Kara 	/*
777a6abc2c0SJan Kara 	 * Entry got reallocated elsewhere? No need to write it back. We have
778a6abc2c0SJan Kara 	 * to compare sectors, as we must not bail out due to a difference in
779a6abc2c0SJan Kara 	 * the lock bit or the entry type.
780a6abc2c0SJan Kara 	 */
781a6abc2c0SJan Kara 	if (dax_radix_sector(entry2) != dax_radix_sector(entry))
782a6abc2c0SJan Kara 		goto put_unlocked;
783642261acSRoss Zwisler 	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
784642261acSRoss Zwisler 				dax_is_zero_entry(entry))) {
7859973c98eSRoss Zwisler 		ret = -EIO;
786a6abc2c0SJan Kara 		goto put_unlocked;
7879973c98eSRoss Zwisler 	}
7889973c98eSRoss Zwisler 
789a6abc2c0SJan Kara 	/* Another fsync thread may have already written back this entry */
790a6abc2c0SJan Kara 	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
791a6abc2c0SJan Kara 		goto put_unlocked;
792a6abc2c0SJan Kara 	/* Lock the entry to serialize with page faults */
793a6abc2c0SJan Kara 	entry = lock_slot(mapping, slot);
794a6abc2c0SJan Kara 	/*
795a6abc2c0SJan Kara 	 * We can clear the tag now but we have to be careful so that concurrent
796a6abc2c0SJan Kara 	 * dax_writeback_one() calls for the same index cannot finish before we
797a6abc2c0SJan Kara 	 * actually flush the caches. This works because such calls inspect
798a6abc2c0SJan Kara 	 * the entry only under tree_lock, and once they do so they will see
799a6abc2c0SJan Kara 	 * the entry locked and wait for it to unlock.
800a6abc2c0SJan Kara 	 */
801a6abc2c0SJan Kara 	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
802a6abc2c0SJan Kara 	spin_unlock_irq(&mapping->tree_lock);
803a6abc2c0SJan Kara 
804642261acSRoss Zwisler 	/*
805642261acSRoss Zwisler 	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
806642261acSRoss Zwisler 	 * in the middle of a PMD, the 'index' we are given will be aligned to
807642261acSRoss Zwisler 	 * the start index of the PMD, as will the sector we pull from
808642261acSRoss Zwisler 	 * 'entry'.  This allows us to flush for PMD_SIZE and not have to
809642261acSRoss Zwisler 	 * worry about partial PMD writebacks.
810642261acSRoss Zwisler 	 */
811642261acSRoss Zwisler 	dax.sector = dax_radix_sector(entry);
812642261acSRoss Zwisler 	dax.size = PAGE_SIZE << dax_radix_order(entry);
8139973c98eSRoss Zwisler 
8149973c98eSRoss Zwisler 	/*
8159973c98eSRoss Zwisler 	 * We cannot hold tree_lock while calling dax_map_atomic() because it
8169973c98eSRoss Zwisler 	 * eventually calls cond_resched().
8179973c98eSRoss Zwisler 	 */
8189973c98eSRoss Zwisler 	ret = dax_map_atomic(bdev, &dax);
819a6abc2c0SJan Kara 	if (ret < 0) {
820a6abc2c0SJan Kara 		put_locked_mapping_entry(mapping, index, entry);
8219973c98eSRoss Zwisler 		return ret;
822a6abc2c0SJan Kara 	}
8239973c98eSRoss Zwisler 
8249973c98eSRoss Zwisler 	if (WARN_ON_ONCE(ret < dax.size)) {
8259973c98eSRoss Zwisler 		ret = -EIO;
8269973c98eSRoss Zwisler 		goto unmap;
8279973c98eSRoss Zwisler 	}
8289973c98eSRoss Zwisler 
8294b4bb46dSJan Kara 	dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
8309973c98eSRoss Zwisler 	wb_cache_pmem(dax.addr, dax.size);
8314b4bb46dSJan Kara 	/*
8324b4bb46dSJan Kara 	 * After we have flushed the cache, we can clear the dirty tag. There
8334b4bb46dSJan Kara 	 * cannot be new dirty data in the pfn after the flush has completed as
8344b4bb46dSJan Kara 	 * the pfn mappings are writeprotected and fault waits for mapping
8354b4bb46dSJan Kara 	 * entry lock.
8364b4bb46dSJan Kara 	 */
8374b4bb46dSJan Kara 	spin_lock_irq(&mapping->tree_lock);
8384b4bb46dSJan Kara 	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
8394b4bb46dSJan Kara 	spin_unlock_irq(&mapping->tree_lock);
8409973c98eSRoss Zwisler  unmap:
8419973c98eSRoss Zwisler 	dax_unmap_atomic(bdev, &dax);
842a6abc2c0SJan Kara 	put_locked_mapping_entry(mapping, index, entry);
8439973c98eSRoss Zwisler 	return ret;
8449973c98eSRoss Zwisler 
845a6abc2c0SJan Kara  put_unlocked:
846a6abc2c0SJan Kara 	put_unlocked_mapping_entry(mapping, index, entry2);
8479973c98eSRoss Zwisler 	spin_unlock_irq(&mapping->tree_lock);
8489973c98eSRoss Zwisler 	return ret;
8499973c98eSRoss Zwisler }
8509973c98eSRoss Zwisler 
8519973c98eSRoss Zwisler /*
8529973c98eSRoss Zwisler  * Flush the mapping to the persistent domain within the byte range of [start,
8539973c98eSRoss Zwisler  * end]. This is required by data integrity operations to ensure file data is
8549973c98eSRoss Zwisler  * on persistent storage prior to completion of the operation.
8559973c98eSRoss Zwisler  */
8567f6d5b52SRoss Zwisler int dax_writeback_mapping_range(struct address_space *mapping,
8577f6d5b52SRoss Zwisler 		struct block_device *bdev, struct writeback_control *wbc)
8589973c98eSRoss Zwisler {
8599973c98eSRoss Zwisler 	struct inode *inode = mapping->host;
860642261acSRoss Zwisler 	pgoff_t start_index, end_index;
8619973c98eSRoss Zwisler 	pgoff_t indices[PAGEVEC_SIZE];
8629973c98eSRoss Zwisler 	struct pagevec pvec;
8639973c98eSRoss Zwisler 	bool done = false;
8649973c98eSRoss Zwisler 	int i, ret = 0;
8659973c98eSRoss Zwisler 
8669973c98eSRoss Zwisler 	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
8679973c98eSRoss Zwisler 		return -EIO;
8689973c98eSRoss Zwisler 
8697f6d5b52SRoss Zwisler 	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
8707f6d5b52SRoss Zwisler 		return 0;
8717f6d5b52SRoss Zwisler 
87209cbfeafSKirill A. Shutemov 	start_index = wbc->range_start >> PAGE_SHIFT;
87309cbfeafSKirill A. Shutemov 	end_index = wbc->range_end >> PAGE_SHIFT;
8749973c98eSRoss Zwisler 
8759973c98eSRoss Zwisler 	tag_pages_for_writeback(mapping, start_index, end_index);
8769973c98eSRoss Zwisler 
8779973c98eSRoss Zwisler 	pagevec_init(&pvec, 0);
8789973c98eSRoss Zwisler 	while (!done) {
8799973c98eSRoss Zwisler 		pvec.nr = find_get_entries_tag(mapping, start_index,
8809973c98eSRoss Zwisler 				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
8819973c98eSRoss Zwisler 				pvec.pages, indices);
8829973c98eSRoss Zwisler 
8839973c98eSRoss Zwisler 		if (pvec.nr == 0)
8849973c98eSRoss Zwisler 			break;
8859973c98eSRoss Zwisler 
8869973c98eSRoss Zwisler 		for (i = 0; i < pvec.nr; i++) {
8879973c98eSRoss Zwisler 			if (indices[i] > end_index) {
8889973c98eSRoss Zwisler 				done = true;
8899973c98eSRoss Zwisler 				break;
8909973c98eSRoss Zwisler 			}
8919973c98eSRoss Zwisler 
8929973c98eSRoss Zwisler 			ret = dax_writeback_one(bdev, mapping, indices[i],
8939973c98eSRoss Zwisler 					pvec.pages[i]);
8949973c98eSRoss Zwisler 			if (ret < 0)
8959973c98eSRoss Zwisler 				return ret;
8969973c98eSRoss Zwisler 		}
8979973c98eSRoss Zwisler 	}
8989973c98eSRoss Zwisler 	return 0;
8999973c98eSRoss Zwisler }
9009973c98eSRoss Zwisler EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
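
/*
 * Usage sketch for the export above: a filesystem would typically call this
 * from its ->writepages() method when the mapping is DAX, e.g.
 *
 *	static int example_writepages(struct address_space *mapping,
 *				      struct writeback_control *wbc)
 *	{
 *		// example_sb_bdev() is a placeholder for however the
 *		// filesystem looks up its backing block device.
 *		return dax_writeback_mapping_range(mapping,
 *				example_sb_bdev(mapping->host->i_sb), wbc);
 *	}
 *
 * Names prefixed with example_ are placeholders, not real kernel API.
 */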
9019973c98eSRoss Zwisler 
902ac401cc7SJan Kara static int dax_insert_mapping(struct address_space *mapping,
9031aaba095SChristoph Hellwig 		struct block_device *bdev, sector_t sector, size_t size,
9041aaba095SChristoph Hellwig 		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
905f7ca90b1SMatthew Wilcox {
9061a29d85eSJan Kara 	unsigned long vaddr = vmf->address;
907b2e0d162SDan Williams 	struct blk_dax_ctl dax = {
9081aaba095SChristoph Hellwig 		.sector = sector,
9091aaba095SChristoph Hellwig 		.size = size,
910b2e0d162SDan Williams 	};
911ac401cc7SJan Kara 	void *ret;
912ac401cc7SJan Kara 	void *entry = *entryp;
913f7ca90b1SMatthew Wilcox 
9144d9a2c87SJan Kara 	if (dax_map_atomic(bdev, &dax) < 0)
9154d9a2c87SJan Kara 		return PTR_ERR(dax.addr);
916b2e0d162SDan Williams 	dax_unmap_atomic(bdev, &dax);
917f7ca90b1SMatthew Wilcox 
918642261acSRoss Zwisler 	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
9194d9a2c87SJan Kara 	if (IS_ERR(ret))
9204d9a2c87SJan Kara 		return PTR_ERR(ret);
921ac401cc7SJan Kara 	*entryp = ret;
9229973c98eSRoss Zwisler 
9234d9a2c87SJan Kara 	return vm_insert_mixed(vma, vaddr, dax.pfn);
924f7ca90b1SMatthew Wilcox }
925f7ca90b1SMatthew Wilcox 
926ce5c5d55SDave Chinner /**
9270e3b210cSBoaz Harrosh  * dax_pfn_mkwrite - handle first write to DAX page
9280e3b210cSBoaz Harrosh  * @vma: The virtual memory area where the fault occurred
9290e3b210cSBoaz Harrosh  * @vmf: The description of the fault
9300e3b210cSBoaz Harrosh  */
9310e3b210cSBoaz Harrosh int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
9320e3b210cSBoaz Harrosh {
9339973c98eSRoss Zwisler 	struct file *file = vma->vm_file;
934ac401cc7SJan Kara 	struct address_space *mapping = file->f_mapping;
9352f89dc12SJan Kara 	void *entry, **slot;
936ac401cc7SJan Kara 	pgoff_t index = vmf->pgoff;
9370e3b210cSBoaz Harrosh 
938ac401cc7SJan Kara 	spin_lock_irq(&mapping->tree_lock);
9392f89dc12SJan Kara 	entry = get_unlocked_mapping_entry(mapping, index, &slot);
9402f89dc12SJan Kara 	if (!entry || !radix_tree_exceptional_entry(entry)) {
9412f89dc12SJan Kara 		if (entry)
942ac401cc7SJan Kara 			put_unlocked_mapping_entry(mapping, index, entry);
943ac401cc7SJan Kara 		spin_unlock_irq(&mapping->tree_lock);
9440e3b210cSBoaz Harrosh 		return VM_FAULT_NOPAGE;
9450e3b210cSBoaz Harrosh 	}
9462f89dc12SJan Kara 	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
9472f89dc12SJan Kara 	entry = lock_slot(mapping, slot);
9482f89dc12SJan Kara 	spin_unlock_irq(&mapping->tree_lock);
9492f89dc12SJan Kara 	/*
9502f89dc12SJan Kara 	 * If we race with somebody updating the PTE and finish_mkwrite_fault()
9512f89dc12SJan Kara 	 * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
9522f89dc12SJan Kara 	 * the fault in either case.
9532f89dc12SJan Kara 	 */
9542f89dc12SJan Kara 	finish_mkwrite_fault(vmf);
9552f89dc12SJan Kara 	put_locked_mapping_entry(mapping, index, entry);
9562f89dc12SJan Kara 	return VM_FAULT_NOPAGE;
9572f89dc12SJan Kara }
9580e3b210cSBoaz Harrosh EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
9590e3b210cSBoaz Harrosh 
9604b0228faSVishal Verma static bool dax_range_is_aligned(struct block_device *bdev,
9614b0228faSVishal Verma 				 unsigned int offset, unsigned int length)
9624b0228faSVishal Verma {
9634b0228faSVishal Verma 	unsigned short sector_size = bdev_logical_block_size(bdev);
9644b0228faSVishal Verma 
9654b0228faSVishal Verma 	if (!IS_ALIGNED(offset, sector_size))
9664b0228faSVishal Verma 		return false;
9674b0228faSVishal Verma 	if (!IS_ALIGNED(length, sector_size))
9684b0228faSVishal Verma 		return false;
9694b0228faSVishal Verma 
9704b0228faSVishal Verma 	return true;
9714b0228faSVishal Verma }
9724b0228faSVishal Verma 
973679c8bd3SChristoph Hellwig int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
974679c8bd3SChristoph Hellwig 		unsigned int offset, unsigned int length)
975679c8bd3SChristoph Hellwig {
976679c8bd3SChristoph Hellwig 	struct blk_dax_ctl dax = {
977679c8bd3SChristoph Hellwig 		.sector		= sector,
978679c8bd3SChristoph Hellwig 		.size		= PAGE_SIZE,
979679c8bd3SChristoph Hellwig 	};
980679c8bd3SChristoph Hellwig 
9814b0228faSVishal Verma 	if (dax_range_is_aligned(bdev, offset, length)) {
9824b0228faSVishal Verma 		sector_t start_sector = dax.sector + (offset >> 9);
9834b0228faSVishal Verma 
9844b0228faSVishal Verma 		return blkdev_issue_zeroout(bdev, start_sector,
9854b0228faSVishal Verma 				length >> 9, GFP_NOFS, true);
9864b0228faSVishal Verma 	} else {
987679c8bd3SChristoph Hellwig 		if (dax_map_atomic(bdev, &dax) < 0)
988679c8bd3SChristoph Hellwig 			return PTR_ERR(dax.addr);
989679c8bd3SChristoph Hellwig 		clear_pmem(dax.addr + offset, length);
990679c8bd3SChristoph Hellwig 		dax_unmap_atomic(bdev, &dax);
9914b0228faSVishal Verma 	}
992679c8bd3SChristoph Hellwig 	return 0;
993679c8bd3SChristoph Hellwig }
994679c8bd3SChristoph Hellwig EXPORT_SYMBOL_GPL(__dax_zero_page_range);
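
/*
 * Example of the alignment decision above (sketch, 512-byte logical blocks):
 * zeroing offset 0, length 4096 is block aligned and is handed to
 * blkdev_issue_zeroout(), while zeroing offset 100, length 200 is not, so it
 * goes through dax_map_atomic() + clear_pmem() instead.
 */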
995679c8bd3SChristoph Hellwig 
996333ccc97SRoss Zwisler static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
997333ccc97SRoss Zwisler {
998333ccc97SRoss Zwisler 	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
999333ccc97SRoss Zwisler }
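
/*
 * Worked example for dax_iomap_sector() (sketch, 4k pages, 512-byte
 * sectors): for an iomap with blkno = 1000 and offset = 0x10000, a request
 * at pos = 0x13200 yields ((0x13000 - 0x10000) >> 9) = 24 sectors past
 * blkno, i.e. sector 1024.
 */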
1000333ccc97SRoss Zwisler 
1001a254e568SChristoph Hellwig static loff_t
100211c59c92SRoss Zwisler dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1003a254e568SChristoph Hellwig 		struct iomap *iomap)
1004a254e568SChristoph Hellwig {
1005a254e568SChristoph Hellwig 	struct iov_iter *iter = data;
1006a254e568SChristoph Hellwig 	loff_t end = pos + length, done = 0;
1007a254e568SChristoph Hellwig 	ssize_t ret = 0;
1008a254e568SChristoph Hellwig 
1009a254e568SChristoph Hellwig 	if (iov_iter_rw(iter) == READ) {
1010a254e568SChristoph Hellwig 		end = min(end, i_size_read(inode));
1011a254e568SChristoph Hellwig 		if (pos >= end)
1012a254e568SChristoph Hellwig 			return 0;
1013a254e568SChristoph Hellwig 
1014a254e568SChristoph Hellwig 		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
1015a254e568SChristoph Hellwig 			return iov_iter_zero(min(length, end - pos), iter);
1016a254e568SChristoph Hellwig 	}
1017a254e568SChristoph Hellwig 
1018a254e568SChristoph Hellwig 	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
1019a254e568SChristoph Hellwig 		return -EIO;
1020a254e568SChristoph Hellwig 
1021e3fce68cSJan Kara 	/*
1022e3fce68cSJan Kara 	 * A write can allocate a block for an area which has a hole page mapped
1023e3fce68cSJan Kara 	 * into page tables. We have to tear down these mappings so that data
1024e3fce68cSJan Kara 	 * written by write(2) is visible in mmap.
1025e3fce68cSJan Kara 	 */
1026e3fce68cSJan Kara 	if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
1027e3fce68cSJan Kara 		invalidate_inode_pages2_range(inode->i_mapping,
1028e3fce68cSJan Kara 					      pos >> PAGE_SHIFT,
1029e3fce68cSJan Kara 					      (end - 1) >> PAGE_SHIFT);
1030e3fce68cSJan Kara 	}
1031e3fce68cSJan Kara 
1032a254e568SChristoph Hellwig 	while (pos < end) {
1033a254e568SChristoph Hellwig 		unsigned offset = pos & (PAGE_SIZE - 1);
1034a254e568SChristoph Hellwig 		struct blk_dax_ctl dax = { 0 };
1035a254e568SChristoph Hellwig 		ssize_t map_len;
1036a254e568SChristoph Hellwig 
1037d1908f52SMichal Hocko 		if (fatal_signal_pending(current)) {
1038d1908f52SMichal Hocko 			ret = -EINTR;
1039d1908f52SMichal Hocko 			break;
1040d1908f52SMichal Hocko 		}
1041d1908f52SMichal Hocko 
1042333ccc97SRoss Zwisler 		dax.sector = dax_iomap_sector(iomap, pos);
1043a254e568SChristoph Hellwig 		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
1044a254e568SChristoph Hellwig 		map_len = dax_map_atomic(iomap->bdev, &dax);
1045a254e568SChristoph Hellwig 		if (map_len < 0) {
1046a254e568SChristoph Hellwig 			ret = map_len;
1047a254e568SChristoph Hellwig 			break;
1048a254e568SChristoph Hellwig 		}
1049a254e568SChristoph Hellwig 
1050a254e568SChristoph Hellwig 		dax.addr += offset;
1051a254e568SChristoph Hellwig 		map_len -= offset;
1052a254e568SChristoph Hellwig 		if (map_len > end - pos)
1053a254e568SChristoph Hellwig 			map_len = end - pos;
1054a254e568SChristoph Hellwig 
1055a254e568SChristoph Hellwig 		if (iov_iter_rw(iter) == WRITE)
1056a254e568SChristoph Hellwig 			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
1057a254e568SChristoph Hellwig 		else
1058a254e568SChristoph Hellwig 			map_len = copy_to_iter(dax.addr, map_len, iter);
1059a254e568SChristoph Hellwig 		dax_unmap_atomic(iomap->bdev, &dax);
1060a254e568SChristoph Hellwig 		if (map_len <= 0) {
1061a254e568SChristoph Hellwig 			ret = map_len ? map_len : -EFAULT;
1062a254e568SChristoph Hellwig 			break;
1063a254e568SChristoph Hellwig 		}
1064a254e568SChristoph Hellwig 
1065a254e568SChristoph Hellwig 		pos += map_len;
1066a254e568SChristoph Hellwig 		length -= map_len;
1067a254e568SChristoph Hellwig 		done += map_len;
1068a254e568SChristoph Hellwig 	}
1069a254e568SChristoph Hellwig 
1070a254e568SChristoph Hellwig 	return done ? done : ret;
1071a254e568SChristoph Hellwig }
1072a254e568SChristoph Hellwig 
1073a254e568SChristoph Hellwig /**
107411c59c92SRoss Zwisler  * dax_iomap_rw - Perform I/O to a DAX file
1075a254e568SChristoph Hellwig  * @iocb:	The control block for this I/O
1076a254e568SChristoph Hellwig  * @iter:	The addresses to do I/O from or to
1077a254e568SChristoph Hellwig  * @ops:	iomap ops passed from the file system
1078a254e568SChristoph Hellwig  *
1079a254e568SChristoph Hellwig  * This function performs read and write operations to directly mapped
1080a254e568SChristoph Hellwig  * persistent memory.  The caller needs to take care of read/write exclusion
1081a254e568SChristoph Hellwig  * and evicting any page cache pages in the region under I/O.
1082a254e568SChristoph Hellwig  */
1083a254e568SChristoph Hellwig ssize_t
108411c59c92SRoss Zwisler dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
1085a254e568SChristoph Hellwig 		struct iomap_ops *ops)
1086a254e568SChristoph Hellwig {
1087a254e568SChristoph Hellwig 	struct address_space *mapping = iocb->ki_filp->f_mapping;
1088a254e568SChristoph Hellwig 	struct inode *inode = mapping->host;
1089a254e568SChristoph Hellwig 	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
1090a254e568SChristoph Hellwig 	unsigned flags = 0;
1091a254e568SChristoph Hellwig 
1092168316dbSChristoph Hellwig 	if (iov_iter_rw(iter) == WRITE) {
1093168316dbSChristoph Hellwig 		lockdep_assert_held_exclusive(&inode->i_rwsem);
1094a254e568SChristoph Hellwig 		flags |= IOMAP_WRITE;
1095168316dbSChristoph Hellwig 	} else {
1096168316dbSChristoph Hellwig 		lockdep_assert_held(&inode->i_rwsem);
1097168316dbSChristoph Hellwig 	}
1098a254e568SChristoph Hellwig 
1099a254e568SChristoph Hellwig 	while (iov_iter_count(iter)) {
1100a254e568SChristoph Hellwig 		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
110111c59c92SRoss Zwisler 				iter, dax_iomap_actor);
1102a254e568SChristoph Hellwig 		if (ret <= 0)
1103a254e568SChristoph Hellwig 			break;
1104a254e568SChristoph Hellwig 		pos += ret;
1105a254e568SChristoph Hellwig 		done += ret;
1106a254e568SChristoph Hellwig 	}
1107a254e568SChristoph Hellwig 
1108a254e568SChristoph Hellwig 	iocb->ki_pos += done;
1109a254e568SChristoph Hellwig 	return done ? done : ret;
1110a254e568SChristoph Hellwig }
111111c59c92SRoss Zwisler EXPORT_SYMBOL_GPL(dax_iomap_rw);
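
/*
 * Example (sketch): how a filesystem's ->read_iter could drive
 * dax_iomap_rw().  "myfs_iomap_ops" and "myfs_file_read_iter" are
 * hypothetical placeholders, not kernel symbols; a real filesystem supplies
 * its own iomap_ops and also handles the non-DAX path.  Writes would take
 * inode_lock() instead, matching the lockdep assertions above.
 */
static struct iomap_ops myfs_iomap_ops;	/* hypothetical: fs fills in ->iomap_begin/->iomap_end */

static ssize_t myfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (!iov_iter_count(to))
		return 0;		/* nothing to transfer */

	/* Reads only need the shared inode lock. */
	inode_lock_shared(inode);
	ret = dax_iomap_rw(iocb, to, &myfs_iomap_ops);
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}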
1112a7d73fe6SChristoph Hellwig 
11139f141d6eSJan Kara static int dax_fault_return(int error)
11149f141d6eSJan Kara {
11159f141d6eSJan Kara 	if (error == 0)
11169f141d6eSJan Kara 		return VM_FAULT_NOPAGE;
11179f141d6eSJan Kara 	if (error == -ENOMEM)
11189f141d6eSJan Kara 		return VM_FAULT_OOM;
11199f141d6eSJan Kara 	return VM_FAULT_SIGBUS;
11209f141d6eSJan Kara }
11219f141d6eSJan Kara 
1122a7d73fe6SChristoph Hellwig /**
112311c59c92SRoss Zwisler  * dax_iomap_fault - handle a page fault on a DAX file
1124a7d73fe6SChristoph Hellwig  * @vma: The virtual memory area where the fault occurred
1125a7d73fe6SChristoph Hellwig  * @vmf: The description of the fault
1126a7d73fe6SChristoph Hellwig  * @ops: iomap ops passed from the file system
1127a7d73fe6SChristoph Hellwig  *
1128a7d73fe6SChristoph Hellwig  * When a page fault occurs, filesystems may call this helper in their fault
1129a7d73fe6SChristoph Hellwig  * or mkwrite handler for DAX files. Assumes the caller has done all the
1130a7d73fe6SChristoph Hellwig  * necessary locking for the page fault to proceed successfully.
1131a7d73fe6SChristoph Hellwig  */
113211c59c92SRoss Zwisler int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1133a7d73fe6SChristoph Hellwig 			struct iomap_ops *ops)
1134a7d73fe6SChristoph Hellwig {
1135a7d73fe6SChristoph Hellwig 	struct address_space *mapping = vma->vm_file->f_mapping;
1136a7d73fe6SChristoph Hellwig 	struct inode *inode = mapping->host;
11371a29d85eSJan Kara 	unsigned long vaddr = vmf->address;
1138a7d73fe6SChristoph Hellwig 	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
1139a7d73fe6SChristoph Hellwig 	sector_t sector;
1140a7d73fe6SChristoph Hellwig 	struct iomap iomap = { 0 };
11419484ab1bSJan Kara 	unsigned flags = IOMAP_FAULT;
1142a7d73fe6SChristoph Hellwig 	int error, major = 0;
1143b1aa812bSJan Kara 	int vmf_ret = 0;
1144a7d73fe6SChristoph Hellwig 	void *entry;
1145a7d73fe6SChristoph Hellwig 
1146a7d73fe6SChristoph Hellwig 	/*
1147a7d73fe6SChristoph Hellwig 	 * Check whether the offset isn't beyond the end of the file now. The
1148a7d73fe6SChristoph Hellwig 	 * caller is supposed to hold locks serializing us with truncate / punch
1149a7d73fe6SChristoph Hellwig 	 * hole, so this is a reliable test.
1150a7d73fe6SChristoph Hellwig 	 */
1151a7d73fe6SChristoph Hellwig 	if (pos >= i_size_read(inode))
1152a7d73fe6SChristoph Hellwig 		return VM_FAULT_SIGBUS;
1153a7d73fe6SChristoph Hellwig 
1154a7d73fe6SChristoph Hellwig 	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
1155a7d73fe6SChristoph Hellwig 		flags |= IOMAP_WRITE;
1156a7d73fe6SChristoph Hellwig 
1157a7d73fe6SChristoph Hellwig 	/*
1158a7d73fe6SChristoph Hellwig 	 * Note that we don't bother to use iomap_apply here: DAX requires
1159a7d73fe6SChristoph Hellwig 	 * the file system block size to be equal to the page size, which means
1160a7d73fe6SChristoph Hellwig 	 * that we never have to deal with more than a single extent here.
1161a7d73fe6SChristoph Hellwig 	 */
1162a7d73fe6SChristoph Hellwig 	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
1163a7d73fe6SChristoph Hellwig 	if (error)
11649f141d6eSJan Kara 		return dax_fault_return(error);
1165a7d73fe6SChristoph Hellwig 	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
11669f141d6eSJan Kara 		vmf_ret = dax_fault_return(-EIO);	/* fs corruption? */
11679f141d6eSJan Kara 		goto finish_iomap;
11689f141d6eSJan Kara 	}
11699f141d6eSJan Kara 
11709f141d6eSJan Kara 	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
11719f141d6eSJan Kara 	if (IS_ERR(entry)) {
11729f141d6eSJan Kara 		vmf_ret = dax_fault_return(PTR_ERR(entry));
11731550290bSRoss Zwisler 		goto finish_iomap;
1174a7d73fe6SChristoph Hellwig 	}
1175a7d73fe6SChristoph Hellwig 
1176333ccc97SRoss Zwisler 	sector = dax_iomap_sector(&iomap, pos);
1177a7d73fe6SChristoph Hellwig 
1178a7d73fe6SChristoph Hellwig 	if (vmf->cow_page) {
1179a7d73fe6SChristoph Hellwig 		switch (iomap.type) {
1180a7d73fe6SChristoph Hellwig 		case IOMAP_HOLE:
1181a7d73fe6SChristoph Hellwig 		case IOMAP_UNWRITTEN:
1182a7d73fe6SChristoph Hellwig 			clear_user_highpage(vmf->cow_page, vaddr);
1183a7d73fe6SChristoph Hellwig 			break;
1184a7d73fe6SChristoph Hellwig 		case IOMAP_MAPPED:
1185a7d73fe6SChristoph Hellwig 			error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
1186a7d73fe6SChristoph Hellwig 					vmf->cow_page, vaddr);
1187a7d73fe6SChristoph Hellwig 			break;
1188a7d73fe6SChristoph Hellwig 		default:
1189a7d73fe6SChristoph Hellwig 			WARN_ON_ONCE(1);
1190a7d73fe6SChristoph Hellwig 			error = -EIO;
1191a7d73fe6SChristoph Hellwig 			break;
1192a7d73fe6SChristoph Hellwig 		}
1193a7d73fe6SChristoph Hellwig 
1194a7d73fe6SChristoph Hellwig 		if (error)
11959f141d6eSJan Kara 			goto error_unlock_entry;
1196b1aa812bSJan Kara 
1197b1aa812bSJan Kara 		__SetPageUptodate(vmf->cow_page);
1198b1aa812bSJan Kara 		vmf_ret = finish_fault(vmf);
1199b1aa812bSJan Kara 		if (!vmf_ret)
1200b1aa812bSJan Kara 			vmf_ret = VM_FAULT_DONE_COW;
12019f141d6eSJan Kara 		goto unlock_entry;
1202a7d73fe6SChristoph Hellwig 	}
1203a7d73fe6SChristoph Hellwig 
1204a7d73fe6SChristoph Hellwig 	switch (iomap.type) {
1205a7d73fe6SChristoph Hellwig 	case IOMAP_MAPPED:
1206a7d73fe6SChristoph Hellwig 		if (iomap.flags & IOMAP_F_NEW) {
1207a7d73fe6SChristoph Hellwig 			count_vm_event(PGMAJFAULT);
1208a7d73fe6SChristoph Hellwig 			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1209a7d73fe6SChristoph Hellwig 			major = VM_FAULT_MAJOR;
1210a7d73fe6SChristoph Hellwig 		}
1211a7d73fe6SChristoph Hellwig 		error = dax_insert_mapping(mapping, iomap.bdev, sector,
1212a7d73fe6SChristoph Hellwig 				PAGE_SIZE, &entry, vma, vmf);
12139f141d6eSJan Kara 		/* -EBUSY is fine, somebody else faulted on the same PTE */
12149f141d6eSJan Kara 		if (error == -EBUSY)
12159f141d6eSJan Kara 			error = 0;
1216a7d73fe6SChristoph Hellwig 		break;
1217a7d73fe6SChristoph Hellwig 	case IOMAP_UNWRITTEN:
1218a7d73fe6SChristoph Hellwig 	case IOMAP_HOLE:
12191550290bSRoss Zwisler 		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
1220f449b936SJan Kara 			vmf_ret = dax_load_hole(mapping, &entry, vmf);
12219f141d6eSJan Kara 			goto unlock_entry;
12221550290bSRoss Zwisler 		}
1223a7d73fe6SChristoph Hellwig 		/*FALLTHRU*/
1224a7d73fe6SChristoph Hellwig 	default:
1225a7d73fe6SChristoph Hellwig 		WARN_ON_ONCE(1);
1226a7d73fe6SChristoph Hellwig 		error = -EIO;
1227a7d73fe6SChristoph Hellwig 		break;
1228a7d73fe6SChristoph Hellwig 	}
1229a7d73fe6SChristoph Hellwig 
12309f141d6eSJan Kara  error_unlock_entry:
12319f141d6eSJan Kara 	vmf_ret = dax_fault_return(error) | major;
1232a7d73fe6SChristoph Hellwig  unlock_entry:
1233a7d73fe6SChristoph Hellwig 	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
12349f141d6eSJan Kara  finish_iomap:
12359f141d6eSJan Kara 	if (ops->iomap_end) {
12369f141d6eSJan Kara 		int copied = PAGE_SIZE;
12379f141d6eSJan Kara 
12389f141d6eSJan Kara 		if (vmf_ret & VM_FAULT_ERROR)
12399f141d6eSJan Kara 			copied = 0;
12409f141d6eSJan Kara 		/*
12419f141d6eSJan Kara 		 * The fault is done by now and there's no way back (another
12429f141d6eSJan Kara 		 * thread may already be happily using the PTE we have installed).
12439f141d6eSJan Kara 		 * Just ignore error from ->iomap_end since we cannot do much
12449f141d6eSJan Kara 		 * with it.
12459f141d6eSJan Kara 		 */
12469f141d6eSJan Kara 		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
12471550290bSRoss Zwisler 	}
12489f141d6eSJan Kara 	return vmf_ret;
1249a7d73fe6SChristoph Hellwig }
125011c59c92SRoss Zwisler EXPORT_SYMBOL_GPL(dax_iomap_fault);
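
/*
 * Example (sketch): a filesystem ->fault handler built on dax_iomap_fault().
 * "myfs_dax_fault" is a hypothetical name and reuses the "myfs_iomap_ops"
 * placeholder declared in the read example above.  Write faults are
 * bracketed by sb_start_pagefault()/sb_end_pagefault() so they cooperate
 * with filesystem freezing, similar to what the in-tree DAX users do.
 */
static int myfs_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vma->vm_file);
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	int ret;

	if (write) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vma->vm_file);
	}

	ret = dax_iomap_fault(vma, vmf, &myfs_iomap_ops);

	if (write)
		sb_end_pagefault(inode->i_sb);
	return ret;
}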
1251642261acSRoss Zwisler 
1252642261acSRoss Zwisler #ifdef CONFIG_FS_DAX_PMD
1253642261acSRoss Zwisler /*
1254642261acSRoss Zwisler  * The 'colour' (i.e. the low bits) of a page offset within a PMD.  This comes
1255642261acSRoss Zwisler  * up more often than one might expect in the functions below.
1256642261acSRoss Zwisler  */
1257642261acSRoss Zwisler #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
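/*
 * For example, with 4KiB pages and 2MiB PMDs (the common x86_64 layout),
 * PG_PMD_COLOUR is (PMD_SIZE >> PAGE_SHIFT) - 1 = 512 - 1 = 0x1ff, so a page
 * offset or pfn is PMD-aligned exactly when its low nine bits
 * (value & PG_PMD_COLOUR) are zero.
 */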
1258642261acSRoss Zwisler 
1259642261acSRoss Zwisler static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
1260642261acSRoss Zwisler 		struct vm_fault *vmf, unsigned long address,
1261642261acSRoss Zwisler 		struct iomap *iomap, loff_t pos, bool write, void **entryp)
1262642261acSRoss Zwisler {
1263642261acSRoss Zwisler 	struct address_space *mapping = vma->vm_file->f_mapping;
1264642261acSRoss Zwisler 	struct block_device *bdev = iomap->bdev;
126527a7ffacSRoss Zwisler 	struct inode *inode = mapping->host;
1266642261acSRoss Zwisler 	struct blk_dax_ctl dax = {
1267642261acSRoss Zwisler 		.sector = dax_iomap_sector(iomap, pos),
1268642261acSRoss Zwisler 		.size = PMD_SIZE,
1269642261acSRoss Zwisler 	};
1270642261acSRoss Zwisler 	long length = dax_map_atomic(bdev, &dax);
127127a7ffacSRoss Zwisler 	void *ret = NULL;
1272642261acSRoss Zwisler 
1273642261acSRoss Zwisler 	if (length < 0) /* dax_map_atomic() failed */
127427a7ffacSRoss Zwisler 		goto fallback;
1275642261acSRoss Zwisler 	if (length < PMD_SIZE)
1276642261acSRoss Zwisler 		goto unmap_fallback;
1277642261acSRoss Zwisler 	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
1278642261acSRoss Zwisler 		goto unmap_fallback;
1279642261acSRoss Zwisler 	if (!pfn_t_devmap(dax.pfn))
1280642261acSRoss Zwisler 		goto unmap_fallback;
1281642261acSRoss Zwisler 
1282642261acSRoss Zwisler 	dax_unmap_atomic(bdev, &dax);
1283642261acSRoss Zwisler 
1284642261acSRoss Zwisler 	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
1285642261acSRoss Zwisler 			RADIX_DAX_PMD);
1286642261acSRoss Zwisler 	if (IS_ERR(ret))
128727a7ffacSRoss Zwisler 		goto fallback;
1288642261acSRoss Zwisler 	*entryp = ret;
1289642261acSRoss Zwisler 
129027a7ffacSRoss Zwisler 	trace_dax_pmd_insert_mapping(inode, vma, address, write, length,
129127a7ffacSRoss Zwisler 			dax.pfn, ret);
1292642261acSRoss Zwisler 	return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
1293642261acSRoss Zwisler 
1294642261acSRoss Zwisler  unmap_fallback:
1295642261acSRoss Zwisler 	dax_unmap_atomic(bdev, &dax);
129627a7ffacSRoss Zwisler fallback:
129727a7ffacSRoss Zwisler 	trace_dax_pmd_insert_mapping_fallback(inode, vma, address, write,
129827a7ffacSRoss Zwisler 			length, dax.pfn, ret);
1299642261acSRoss Zwisler 	return VM_FAULT_FALLBACK;
1300642261acSRoss Zwisler }
1301642261acSRoss Zwisler 
1302642261acSRoss Zwisler static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
1303642261acSRoss Zwisler 		struct vm_fault *vmf, unsigned long address,
1304642261acSRoss Zwisler 		struct iomap *iomap, void **entryp)
1305642261acSRoss Zwisler {
1306642261acSRoss Zwisler 	struct address_space *mapping = vma->vm_file->f_mapping;
1307642261acSRoss Zwisler 	unsigned long pmd_addr = address & PMD_MASK;
1308653b2ea3SRoss Zwisler 	struct inode *inode = mapping->host;
1309642261acSRoss Zwisler 	struct page *zero_page;
1310653b2ea3SRoss Zwisler 	void *ret = NULL;
1311642261acSRoss Zwisler 	spinlock_t *ptl;
1312642261acSRoss Zwisler 	pmd_t pmd_entry;
1313642261acSRoss Zwisler 
1314642261acSRoss Zwisler 	zero_page = mm_get_huge_zero_page(vma->vm_mm);
1315642261acSRoss Zwisler 
1316642261acSRoss Zwisler 	if (unlikely(!zero_page))
1317653b2ea3SRoss Zwisler 		goto fallback;
1318642261acSRoss Zwisler 
1319642261acSRoss Zwisler 	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
1320642261acSRoss Zwisler 			RADIX_DAX_PMD | RADIX_DAX_HZP);
1321642261acSRoss Zwisler 	if (IS_ERR(ret))
1322653b2ea3SRoss Zwisler 		goto fallback;
1323642261acSRoss Zwisler 	*entryp = ret;
1324642261acSRoss Zwisler 
1325642261acSRoss Zwisler 	ptl = pmd_lock(vma->vm_mm, pmd);
1326642261acSRoss Zwisler 	if (!pmd_none(*pmd)) {
1327642261acSRoss Zwisler 		spin_unlock(ptl);
1328653b2ea3SRoss Zwisler 		goto fallback;
1329642261acSRoss Zwisler 	}
1330642261acSRoss Zwisler 
1331642261acSRoss Zwisler 	pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
1332642261acSRoss Zwisler 	pmd_entry = pmd_mkhuge(pmd_entry);
1333642261acSRoss Zwisler 	set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
1334642261acSRoss Zwisler 	spin_unlock(ptl);
1335653b2ea3SRoss Zwisler 	trace_dax_pmd_load_hole(inode, vma, address, zero_page, ret);
1336642261acSRoss Zwisler 	return VM_FAULT_NOPAGE;
1337653b2ea3SRoss Zwisler 
1338653b2ea3SRoss Zwisler fallback:
1339653b2ea3SRoss Zwisler 	trace_dax_pmd_load_hole_fallback(inode, vma, address, zero_page, ret);
1340653b2ea3SRoss Zwisler 	return VM_FAULT_FALLBACK;
1341642261acSRoss Zwisler }
1342642261acSRoss Zwisler 
1343d8a849e1SDave Jiang int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1344d8a849e1SDave Jiang 		struct iomap_ops *ops)
1345642261acSRoss Zwisler {
1346642261acSRoss Zwisler 	struct address_space *mapping = vma->vm_file->f_mapping;
1347d8a849e1SDave Jiang 	unsigned long pmd_addr = vmf->address & PMD_MASK;
1348d8a849e1SDave Jiang 	bool write = vmf->flags & FAULT_FLAG_WRITE;
13499484ab1bSJan Kara 	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
1350642261acSRoss Zwisler 	struct inode *inode = mapping->host;
1351642261acSRoss Zwisler 	int result = VM_FAULT_FALLBACK;
1352642261acSRoss Zwisler 	struct iomap iomap = { 0 };
1353642261acSRoss Zwisler 	pgoff_t max_pgoff, pgoff;
1354642261acSRoss Zwisler 	void *entry;
1355642261acSRoss Zwisler 	loff_t pos;
1356642261acSRoss Zwisler 	int error;
1357642261acSRoss Zwisler 
1358282a8e03SRoss Zwisler 	/*
1359282a8e03SRoss Zwisler 	 * Check whether the offset isn't beyond the end of the file now. The
1360282a8e03SRoss Zwisler 	 * caller is supposed to hold locks serializing us with truncate / punch
1361282a8e03SRoss Zwisler 	 * hole, so this is a reliable test.
1362282a8e03SRoss Zwisler 	 */
1363282a8e03SRoss Zwisler 	pgoff = linear_page_index(vma, pmd_addr);
1364282a8e03SRoss Zwisler 	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
1365282a8e03SRoss Zwisler 
1366d8a849e1SDave Jiang 	trace_dax_pmd_fault(inode, vma, vmf, max_pgoff, 0);
1367282a8e03SRoss Zwisler 
1368642261acSRoss Zwisler 	/* Fall back to PTEs if we're going to COW */
1369642261acSRoss Zwisler 	if (write && !(vma->vm_flags & VM_SHARED))
1370642261acSRoss Zwisler 		goto fallback;
1371642261acSRoss Zwisler 
1372642261acSRoss Zwisler 	/* If the PMD would extend outside the VMA */
1373642261acSRoss Zwisler 	if (pmd_addr < vma->vm_start)
1374642261acSRoss Zwisler 		goto fallback;
1375642261acSRoss Zwisler 	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
1376642261acSRoss Zwisler 		goto fallback;
1377642261acSRoss Zwisler 
1378282a8e03SRoss Zwisler 	if (pgoff > max_pgoff) {
1379282a8e03SRoss Zwisler 		result = VM_FAULT_SIGBUS;
1380282a8e03SRoss Zwisler 		goto out;
1381282a8e03SRoss Zwisler 	}
1382642261acSRoss Zwisler 
1383642261acSRoss Zwisler 	/* If the PMD would extend beyond the file size */
1384642261acSRoss Zwisler 	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
1385642261acSRoss Zwisler 		goto fallback;
1386642261acSRoss Zwisler 
1387642261acSRoss Zwisler 	/*
1388642261acSRoss Zwisler 	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
1389642261acSRoss Zwisler 	 * setting up a mapping, so really we're using iomap_begin() as a way
1390642261acSRoss Zwisler 	 * to look up our filesystem block.
1391642261acSRoss Zwisler 	 */
1392642261acSRoss Zwisler 	pos = (loff_t)pgoff << PAGE_SHIFT;
1393642261acSRoss Zwisler 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
1394642261acSRoss Zwisler 	if (error)
13959f141d6eSJan Kara 		goto fallback;
13969f141d6eSJan Kara 
1397642261acSRoss Zwisler 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
1398642261acSRoss Zwisler 		goto finish_iomap;
1399642261acSRoss Zwisler 
14009f141d6eSJan Kara 	/*
14019f141d6eSJan Kara 	 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
14029f141d6eSJan Kara 	 * DAX PMD entry or a huge zero page (HZP) entry.  If it can't (because
14039f141d6eSJan Kara 	 * a 4k page is already in the tree, for instance), it will return
14049f141d6eSJan Kara 	 * -EEXIST and we just fall back to 4k entries.
14059f141d6eSJan Kara 	 */
14069f141d6eSJan Kara 	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
14079f141d6eSJan Kara 	if (IS_ERR(entry))
14089f141d6eSJan Kara 		goto finish_iomap;
14099f141d6eSJan Kara 
1410642261acSRoss Zwisler 	switch (iomap.type) {
1411642261acSRoss Zwisler 	case IOMAP_MAPPED:
1412d8a849e1SDave Jiang 		result = dax_pmd_insert_mapping(vma, vmf->pmd, vmf,
1413d8a849e1SDave Jiang 				vmf->address, &iomap, pos, write, &entry);
1414642261acSRoss Zwisler 		break;
1415642261acSRoss Zwisler 	case IOMAP_UNWRITTEN:
1416642261acSRoss Zwisler 	case IOMAP_HOLE:
1417642261acSRoss Zwisler 		if (WARN_ON_ONCE(write))
14189f141d6eSJan Kara 			goto unlock_entry;
1419d8a849e1SDave Jiang 		result = dax_pmd_load_hole(vma, vmf->pmd, vmf, vmf->address,
1420d8a849e1SDave Jiang 				&iomap, &entry);
1421642261acSRoss Zwisler 		break;
1422642261acSRoss Zwisler 	default:
1423642261acSRoss Zwisler 		WARN_ON_ONCE(1);
1424642261acSRoss Zwisler 		break;
1425642261acSRoss Zwisler 	}
1426642261acSRoss Zwisler 
1427642261acSRoss Zwisler  unlock_entry:
1428642261acSRoss Zwisler 	put_locked_mapping_entry(mapping, pgoff, entry);
14299f141d6eSJan Kara  finish_iomap:
14309f141d6eSJan Kara 	if (ops->iomap_end) {
14319f141d6eSJan Kara 		int copied = PMD_SIZE;
14329f141d6eSJan Kara 
14339f141d6eSJan Kara 		if (result == VM_FAULT_FALLBACK)
14349f141d6eSJan Kara 			copied = 0;
14359f141d6eSJan Kara 		/*
14369f141d6eSJan Kara 		 * The fault is done by now and there's no way back (another
14379f141d6eSJan Kara 		 * thread may already be happily using the PMD we have installed).
14389f141d6eSJan Kara 		 * Just ignore error from ->iomap_end since we cannot do much
14399f141d6eSJan Kara 		 * with it.
14409f141d6eSJan Kara 		 */
14419f141d6eSJan Kara 		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
14429f141d6eSJan Kara 				&iomap);
14439f141d6eSJan Kara 	}
1444642261acSRoss Zwisler  fallback:
1445642261acSRoss Zwisler 	if (result == VM_FAULT_FALLBACK) {
1446d8a849e1SDave Jiang 		split_huge_pmd(vma, vmf->pmd, vmf->address);
1447642261acSRoss Zwisler 		count_vm_event(THP_FAULT_FALLBACK);
1448642261acSRoss Zwisler 	}
1449282a8e03SRoss Zwisler out:
1450d8a849e1SDave Jiang 	trace_dax_pmd_fault_done(inode, vma, vmf, max_pgoff, result);
1451642261acSRoss Zwisler 	return result;
1452642261acSRoss Zwisler }
1453642261acSRoss Zwisler EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
1454642261acSRoss Zwisler #endif /* CONFIG_FS_DAX_PMD */
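
/*
 * Example (sketch): wiring both fault paths into a vm_operations_struct,
 * assuming the ->fault/->pmd_fault hooks of this kernel revision.
 * "myfs_dax_pmd_fault" and "myfs_dax_vm_ops" are hypothetical and reuse the
 * "myfs_dax_fault" and "myfs_iomap_ops" placeholders from the earlier
 * examples; the PMD handler is guarded by CONFIG_FS_DAX_PMD just like the
 * code above.  In-tree users typically also set ->pfn_mkwrite, e.g. to a
 * wrapper around dax_pfn_mkwrite().
 */
#ifdef CONFIG_FS_DAX_PMD
static int myfs_dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vma->vm_file);
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	int ret;

	if (write) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vma->vm_file);
	}

	ret = dax_iomap_pmd_fault(vma, vmf, &myfs_iomap_ops);

	if (write)
		sb_end_pagefault(inode->i_sb);
	return ret;
}
#endif

static const struct vm_operations_struct myfs_dax_vm_ops = {
	.fault		= myfs_dax_fault,
#ifdef CONFIG_FS_DAX_PMD
	.pmd_fault	= myfs_dax_pmd_fault,
#endif
	.page_mkwrite	= myfs_dax_fault,
};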
1455