xref: /openbmc/linux/fs/dax.c (revision ec8f24b7faaf3d4799a7c3f4c1b87f6b02778ad1)
1  /*
2   * fs/dax.c - Direct Access filesystem code
3   * Copyright (c) 2013-2014 Intel Corporation
4   * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
5   * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
6   *
7   * This program is free software; you can redistribute it and/or modify it
8   * under the terms and conditions of the GNU General Public License,
9   * version 2, as published by the Free Software Foundation.
10   *
11   * This program is distributed in the hope it will be useful, but WITHOUT
12   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
14   * more details.
15   */
16  
17  #include <linux/atomic.h>
18  #include <linux/blkdev.h>
19  #include <linux/buffer_head.h>
20  #include <linux/dax.h>
21  #include <linux/fs.h>
22  #include <linux/genhd.h>
23  #include <linux/highmem.h>
24  #include <linux/memcontrol.h>
25  #include <linux/mm.h>
26  #include <linux/mutex.h>
27  #include <linux/pagevec.h>
28  #include <linux/sched.h>
29  #include <linux/sched/signal.h>
30  #include <linux/uio.h>
31  #include <linux/vmstat.h>
32  #include <linux/pfn_t.h>
33  #include <linux/sizes.h>
34  #include <linux/mmu_notifier.h>
35  #include <linux/iomap.h>
36  #include <asm/pgalloc.h>
37  #include "internal.h"
38  
39  #define CREATE_TRACE_POINTS
40  #include <trace/events/fs_dax.h>
41  
42  static inline unsigned int pe_order(enum page_entry_size pe_size)
43  {
44  	if (pe_size == PE_SIZE_PTE)
45  		return PAGE_SHIFT - PAGE_SHIFT;
46  	if (pe_size == PE_SIZE_PMD)
47  		return PMD_SHIFT - PAGE_SHIFT;
48  	if (pe_size == PE_SIZE_PUD)
49  		return PUD_SHIFT - PAGE_SHIFT;
50  	return ~0;
51  }
52  
53  /* We choose 4096 entries - same as per-zone page wait tables */
54  #define DAX_WAIT_TABLE_BITS 12
55  #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
56  
57  /* The 'colour' (ie low bits) within a PMD of a page offset.  */
58  #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
59  #define PG_PMD_NR	(PMD_SIZE >> PAGE_SHIFT)
60  
61  /* The order of a PMD entry */
62  #define PMD_ORDER	(PMD_SHIFT - PAGE_SHIFT)
63  
64  static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
65  
66  static int __init init_dax_wait_table(void)
67  {
68  	int i;
69  
70  	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
71  		init_waitqueue_head(wait_table + i);
72  	return 0;
73  }
74  fs_initcall(init_dax_wait_table);
75  
76  /*
77   * DAX pagecache entries use XArray value entries so they can't be mistaken
78   * for pages.  We use one bit for locking, one bit for the entry size (PMD)
79   * and two more to tell us if the entry is a zero page or an empty entry that
80   * is just used for locking.  In total four special bits.
81   *
82   * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
83   * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
84   * block allocation.
85   */
86  #define DAX_SHIFT	(4)
87  #define DAX_LOCKED	(1UL << 0)
88  #define DAX_PMD		(1UL << 1)
89  #define DAX_ZERO_PAGE	(1UL << 2)
90  #define DAX_EMPTY	(1UL << 3)
91  
92  static unsigned long dax_to_pfn(void *entry)
93  {
94  	return xa_to_value(entry) >> DAX_SHIFT;
95  }
96  
97  static void *dax_make_entry(pfn_t pfn, unsigned long flags)
98  {
99  	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
100  }
101  
102  static bool dax_is_locked(void *entry)
103  {
104  	return xa_to_value(entry) & DAX_LOCKED;
105  }
106  
107  static unsigned int dax_entry_order(void *entry)
108  {
109  	if (xa_to_value(entry) & DAX_PMD)
110  		return PMD_ORDER;
111  	return 0;
112  }
113  
114  static unsigned long dax_is_pmd_entry(void *entry)
115  {
116  	return xa_to_value(entry) & DAX_PMD;
117  }
118  
119  static bool dax_is_pte_entry(void *entry)
120  {
121  	return !(xa_to_value(entry) & DAX_PMD);
122  }
123  
124  static int dax_is_zero_entry(void *entry)
125  {
126  	return xa_to_value(entry) & DAX_ZERO_PAGE;
127  }
128  
129  static int dax_is_empty_entry(void *entry)
130  {
131  	return xa_to_value(entry) & DAX_EMPTY;
132  }
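
/*
 * Illustrative sketch only (not used by the code below): a round trip
 * through the helpers above, packing an arbitrary pfn together with the
 * DAX_PMD flag into an XArray value entry and reading it back.  The
 * function name and the pfn value are made up for the example.
 */
static void __maybe_unused dax_entry_encoding_example(void)
{
	void *entry = dax_make_entry(pfn_to_pfn_t(0x1234UL), DAX_PMD);

	WARN_ON(dax_to_pfn(entry) != 0x1234UL);
	WARN_ON(dax_entry_order(entry) != PMD_ORDER);
	WARN_ON(!dax_is_pmd_entry(entry));
	WARN_ON(dax_is_locked(entry));	/* DAX_LOCKED is only set via dax_lock_entry() */
}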
133  
134  /*
135   * DAX page cache entry locking
136   */
137  struct exceptional_entry_key {
138  	struct xarray *xa;
139  	pgoff_t entry_start;
140  };
141  
142  struct wait_exceptional_entry_queue {
143  	wait_queue_entry_t wait;
144  	struct exceptional_entry_key key;
145  };
146  
147  static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
148  		void *entry, struct exceptional_entry_key *key)
149  {
150  	unsigned long hash;
151  	unsigned long index = xas->xa_index;
152  
153  	/*
154  	 * If 'entry' is a PMD, align the 'index' that we use for the wait
155  	 * queue to the start of that PMD.  This ensures that all offsets in
156  	 * the range covered by the PMD map to the same bit lock.
157  	 */
158  	if (dax_is_pmd_entry(entry))
159  		index &= ~PG_PMD_COLOUR;
160  	key->xa = xas->xa;
161  	key->entry_start = index;
162  
163  	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
164  	return wait_table + hash;
165  }
166  
167  static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
168  		unsigned int mode, int sync, void *keyp)
169  {
170  	struct exceptional_entry_key *key = keyp;
171  	struct wait_exceptional_entry_queue *ewait =
172  		container_of(wait, struct wait_exceptional_entry_queue, wait);
173  
174  	if (key->xa != ewait->key.xa ||
175  	    key->entry_start != ewait->key.entry_start)
176  		return 0;
177  	return autoremove_wake_function(wait, mode, sync, NULL);
178  }
179  
180  /*
181   * @entry may no longer be the entry at the index in the mapping.
182   * The important information it's conveying is whether the entry at
183   * this index used to be a PMD entry.
184   */
185  static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
186  {
187  	struct exceptional_entry_key key;
188  	wait_queue_head_t *wq;
189  
190  	wq = dax_entry_waitqueue(xas, entry, &key);
191  
192  	/*
193  	 * Checking for locked entry and prepare_to_wait_exclusive() happens
194  	 * under the i_pages lock, ditto for entry handling in our callers.
195  	 * So at this point all tasks that could have seen our entry locked
196  	 * must be in the waitqueue and the following check will see them.
197  	 */
198  	if (waitqueue_active(wq))
199  		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
200  }
201  
202  /*
203   * Look up entry in page cache, wait for it to become unlocked if it
204   * is a DAX entry and return it.  The caller must subsequently call
205   * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
206   * if it did.
207   *
208   * Must be called with the i_pages lock held.
209   */
210  static void *get_unlocked_entry(struct xa_state *xas)
211  {
212  	void *entry;
213  	struct wait_exceptional_entry_queue ewait;
214  	wait_queue_head_t *wq;
215  
216  	init_wait(&ewait.wait);
217  	ewait.wait.func = wake_exceptional_entry_func;
218  
219  	for (;;) {
220  		entry = xas_find_conflict(xas);
221  		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) ||
222  				!dax_is_locked(entry))
223  			return entry;
224  
225  		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
226  		prepare_to_wait_exclusive(wq, &ewait.wait,
227  					  TASK_UNINTERRUPTIBLE);
228  		xas_unlock_irq(xas);
229  		xas_reset(xas);
230  		schedule();
231  		finish_wait(wq, &ewait.wait);
232  		xas_lock_irq(xas);
233  	}
234  }
235  
236  /*
237   * The only thing keeping the address space around is the i_pages lock
238   * (it's cycled in clear_inode() after removing the entries from i_pages).
239   * After we call xas_unlock_irq(), we cannot touch xas->xa.
240   */
241  static void wait_entry_unlocked(struct xa_state *xas, void *entry)
242  {
243  	struct wait_exceptional_entry_queue ewait;
244  	wait_queue_head_t *wq;
245  
246  	init_wait(&ewait.wait);
247  	ewait.wait.func = wake_exceptional_entry_func;
248  
249  	wq = dax_entry_waitqueue(xas, entry, &ewait.key);
250  	/*
251  	 * Unlike get_unlocked_entry() there is no guarantee that this
252  	 * path ever successfully retrieves an unlocked entry before an
253  	 * inode dies. Perform a non-exclusive wait in case this path
254  	 * never successfully performs its own wake up.
255  	 */
256  	prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
257  	xas_unlock_irq(xas);
258  	schedule();
259  	finish_wait(wq, &ewait.wait);
260  }
261  
262  static void put_unlocked_entry(struct xa_state *xas, void *entry)
263  {
264  	/* If we were the only waiter woken, wake the next one */
265  	if (entry)
266  		dax_wake_entry(xas, entry, false);
267  }
268  
269  /*
270   * We used the xa_state to get the entry, but then we locked the entry and
271   * dropped the xa_lock, so we know the xa_state is stale and must be reset
272   * before use.
273   */
274  static void dax_unlock_entry(struct xa_state *xas, void *entry)
275  {
276  	void *old;
277  
278  	BUG_ON(dax_is_locked(entry));
279  	xas_reset(xas);
280  	xas_lock_irq(xas);
281  	old = xas_store(xas, entry);
282  	xas_unlock_irq(xas);
283  	BUG_ON(!dax_is_locked(old));
284  	dax_wake_entry(xas, entry, false);
285  }
286  
287  /*
288   * Return: The entry stored at this location before it was locked.
289   */
290  static void *dax_lock_entry(struct xa_state *xas, void *entry)
291  {
292  	unsigned long v = xa_to_value(entry);
293  	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
294  }
295  
296  static unsigned long dax_entry_size(void *entry)
297  {
298  	if (dax_is_zero_entry(entry))
299  		return 0;
300  	else if (dax_is_empty_entry(entry))
301  		return 0;
302  	else if (dax_is_pmd_entry(entry))
303  		return PMD_SIZE;
304  	else
305  		return PAGE_SIZE;
306  }
307  
308  static unsigned long dax_end_pfn(void *entry)
309  {
310  	return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
311  }
312  
313  /*
314   * Iterate through all mapped pfns represented by an entry, i.e. skip
315   * 'empty' and 'zero' entries.
316   */
317  #define for_each_mapped_pfn(entry, pfn) \
318  	for (pfn = dax_to_pfn(entry); \
319  			pfn < dax_end_pfn(entry); pfn++)
320  
321  /*
322   * TODO: for reflink+dax we need a way to associate a single page with
323   * multiple address_space instances at different linear_page_index()
324   * offsets.
325   */
326  static void dax_associate_entry(void *entry, struct address_space *mapping,
327  		struct vm_area_struct *vma, unsigned long address)
328  {
329  	unsigned long size = dax_entry_size(entry), pfn, index;
330  	int i = 0;
331  
332  	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
333  		return;
334  
335  	index = linear_page_index(vma, address & ~(size - 1));
336  	for_each_mapped_pfn(entry, pfn) {
337  		struct page *page = pfn_to_page(pfn);
338  
339  		WARN_ON_ONCE(page->mapping);
340  		page->mapping = mapping;
341  		page->index = index + i++;
342  	}
343  }
344  
345  static void dax_disassociate_entry(void *entry, struct address_space *mapping,
346  		bool trunc)
347  {
348  	unsigned long pfn;
349  
350  	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
351  		return;
352  
353  	for_each_mapped_pfn(entry, pfn) {
354  		struct page *page = pfn_to_page(pfn);
355  
356  		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
357  		WARN_ON_ONCE(page->mapping && page->mapping != mapping);
358  		page->mapping = NULL;
359  		page->index = 0;
360  	}
361  }
362  
363  static struct page *dax_busy_page(void *entry)
364  {
365  	unsigned long pfn;
366  
367  	for_each_mapped_pfn(entry, pfn) {
368  		struct page *page = pfn_to_page(pfn);
369  
370  		if (page_ref_count(page) > 1)
371  			return page;
372  	}
373  	return NULL;
374  }
375  
376  /*
377   * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
378   * @page: The page whose entry we want to lock
379   *
380   * Context: Process context.
381   * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
382   * not be locked.
383   */
384  dax_entry_t dax_lock_page(struct page *page)
385  {
386  	XA_STATE(xas, NULL, 0);
387  	void *entry;
388  
389  	/* Ensure page->mapping isn't freed while we look at it */
390  	rcu_read_lock();
391  	for (;;) {
392  		struct address_space *mapping = READ_ONCE(page->mapping);
393  
394  		entry = NULL;
395  		if (!mapping || !dax_mapping(mapping))
396  			break;
397  
398  		/*
399  		 * In the device-dax case there's no need to lock, a
400  		 * struct dev_pagemap pin is sufficient to keep the
401  		 * inode alive, and we assume we have dev_pagemap pin
402  		 * otherwise we would not have a valid pfn_to_page()
403  		 * translation.
404  		 */
405  		entry = (void *)~0UL;
406  		if (S_ISCHR(mapping->host->i_mode))
407  			break;
408  
409  		xas.xa = &mapping->i_pages;
410  		xas_lock_irq(&xas);
411  		if (mapping != page->mapping) {
412  			xas_unlock_irq(&xas);
413  			continue;
414  		}
415  		xas_set(&xas, page->index);
416  		entry = xas_load(&xas);
417  		if (dax_is_locked(entry)) {
418  			rcu_read_unlock();
419  			wait_entry_unlocked(&xas, entry);
420  			rcu_read_lock();
421  			continue;
422  		}
423  		dax_lock_entry(&xas, entry);
424  		xas_unlock_irq(&xas);
425  		break;
426  	}
427  	rcu_read_unlock();
428  	return (dax_entry_t)entry;
429  }
430  
431  void dax_unlock_page(struct page *page, dax_entry_t cookie)
432  {
433  	struct address_space *mapping = page->mapping;
434  	XA_STATE(xas, &mapping->i_pages, page->index);
435  
436  	if (S_ISCHR(mapping->host->i_mode))
437  		return;
438  
439  	dax_unlock_entry(&xas, (void *)cookie);
440  }
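
/*
 * Hedged usage sketch (not part of this file's logic): a caller such as
 * the memory-failure path takes the entry lock so that page->mapping and
 * page->index stay stable while it inspects them.  The function name and
 * the work done under the lock are illustrative only.
 */
static int __maybe_unused example_inspect_dax_page(struct page *page)
{
	dax_entry_t cookie = dax_lock_page(page);

	if (!cookie)
		return -EBUSY;	/* entry was truncated or mapping is not DAX */

	/* ... page->mapping and page->index are stable here ... */

	dax_unlock_page(page, cookie);
	return 0;
}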
441  
442  /*
443   * Find page cache entry at given index. If it is a DAX entry, return it
444   * with the entry locked. If the page cache doesn't contain an entry at
445   * that index, add a locked empty entry.
446   *
447   * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
448   * either return that locked entry or will return VM_FAULT_FALLBACK.
449   * This will happen if there are any PTE entries within the PMD range
450   * that we are requesting.
451   *
452   * We always favor PTE entries over PMD entries. There isn't a flow where we
453   * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
454   * insertion will fail if it finds any PTE entries already in the tree, and a
455   * PTE insertion will cause an existing PMD entry to be unmapped and
456   * downgraded to PTE entries.  This happens for both PMD zero pages as
457   * well as PMD empty entries.
458   *
459   * The exception to this downgrade path is for PMD entries that have
460   * real storage backing them.  We will leave these real PMD entries in
461   * the tree, and PTE writes will simply dirty the entire PMD entry.
462   *
463   * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
464   * persistent memory the benefit is doubtful. We can add that later if we can
465   * show it helps.
466   *
467   * On error, this function does not return an ERR_PTR.  Instead it returns
468   * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values
469   * overlap with xarray value entries.
470   */
471  static void *grab_mapping_entry(struct xa_state *xas,
472  		struct address_space *mapping, unsigned long size_flag)
473  {
474  	unsigned long index = xas->xa_index;
475  	bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */
476  	void *entry;
477  
478  retry:
479  	xas_lock_irq(xas);
480  	entry = get_unlocked_entry(xas);
481  
482  	if (entry) {
483  		if (!xa_is_value(entry)) {
484  			xas_set_err(xas, EIO);
485  			goto out_unlock;
486  		}
487  
488  		if (size_flag & DAX_PMD) {
489  			if (dax_is_pte_entry(entry)) {
490  				put_unlocked_entry(xas, entry);
491  				goto fallback;
492  			}
493  		} else { /* trying to grab a PTE entry */
494  			if (dax_is_pmd_entry(entry) &&
495  			    (dax_is_zero_entry(entry) ||
496  			     dax_is_empty_entry(entry))) {
497  				pmd_downgrade = true;
498  			}
499  		}
500  	}
501  
502  	if (pmd_downgrade) {
503  		/*
504  		 * Make sure 'entry' remains valid while we drop
505  		 * the i_pages lock.
506  		 */
507  		dax_lock_entry(xas, entry);
508  
509  		/*
510  		 * Besides huge zero pages the only other thing that gets
511  		 * downgraded are empty entries which don't need to be
512  		 * unmapped.
513  		 */
514  		if (dax_is_zero_entry(entry)) {
515  			xas_unlock_irq(xas);
516  			unmap_mapping_pages(mapping,
517  					xas->xa_index & ~PG_PMD_COLOUR,
518  					PG_PMD_NR, false);
519  			xas_reset(xas);
520  			xas_lock_irq(xas);
521  		}
522  
523  		dax_disassociate_entry(entry, mapping, false);
524  		xas_store(xas, NULL);	/* undo the PMD join */
525  		dax_wake_entry(xas, entry, true);
526  		mapping->nrexceptional--;
527  		entry = NULL;
528  		xas_set(xas, index);
529  	}
530  
531  	if (entry) {
532  		dax_lock_entry(xas, entry);
533  	} else {
534  		entry = dax_make_entry(pfn_to_pfn_t(0), size_flag | DAX_EMPTY);
535  		dax_lock_entry(xas, entry);
536  		if (xas_error(xas))
537  			goto out_unlock;
538  		mapping->nrexceptional++;
539  	}
540  
541  out_unlock:
542  	xas_unlock_irq(xas);
543  	if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
544  		goto retry;
545  	if (xas->xa_node == XA_ERROR(-ENOMEM))
546  		return xa_mk_internal(VM_FAULT_OOM);
547  	if (xas_error(xas))
548  		return xa_mk_internal(VM_FAULT_SIGBUS);
549  	return entry;
550  fallback:
551  	xas_unlock_irq(xas);
552  	return xa_mk_internal(VM_FAULT_FALLBACK);
553  }
554  
555  /**
556   * dax_layout_busy_page - find first pinned page in @mapping
557   * @mapping: address space to scan for a page with ref count > 1
558   *
559   * DAX requires ZONE_DEVICE mapped pages. These pages are never
560   * 'onlined' to the page allocator so they are considered idle when
561   * page->count == 1. A filesystem uses this interface to determine if
562   * any page in the mapping is busy, i.e. for DMA, or other
563   * get_user_pages() usages.
564   *
565   * It is expected that the filesystem is holding locks to block the
566   * establishment of new mappings in this address_space. I.e. it expects
567   * to be able to run unmap_mapping_range() and subsequently not race
568   * mapping_mapped() becoming true.
569   */
570  struct page *dax_layout_busy_page(struct address_space *mapping)
571  {
572  	XA_STATE(xas, &mapping->i_pages, 0);
573  	void *entry;
574  	unsigned int scanned = 0;
575  	struct page *page = NULL;
576  
577  	/*
578  	 * In the 'limited' case get_user_pages() for dax is disabled.
579  	 */
580  	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
581  		return NULL;
582  
583  	if (!dax_mapping(mapping) || !mapping_mapped(mapping))
584  		return NULL;
585  
586  	/*
587  	 * If we race get_user_pages_fast() here either we'll see the
588  	 * elevated page count in the iteration and wait, or
589  	 * get_user_pages_fast() will see that the page it took a reference
590  	 * against is no longer mapped in the page tables and bail to the
591  	 * get_user_pages() slow path.  The slow path is protected by
592  	 * pte_lock() and pmd_lock(). New references are not taken without
593  	 * holding those locks, and unmap_mapping_range() will not zero the
594  	 * pte or pmd without holding the respective lock, so we are
595  	 * guaranteed to either see new references or prevent new
596  	 * references from being established.
597  	 */
598  	unmap_mapping_range(mapping, 0, 0, 1);
599  
600  	xas_lock_irq(&xas);
601  	xas_for_each(&xas, entry, ULONG_MAX) {
602  		if (WARN_ON_ONCE(!xa_is_value(entry)))
603  			continue;
604  		if (unlikely(dax_is_locked(entry)))
605  			entry = get_unlocked_entry(&xas);
606  		if (entry)
607  			page = dax_busy_page(entry);
608  		put_unlocked_entry(&xas, entry);
609  		if (page)
610  			break;
611  		if (++scanned % XA_CHECK_SCHED)
612  			continue;
613  
614  		xas_pause(&xas);
615  		xas_unlock_irq(&xas);
616  		cond_resched();
617  		xas_lock_irq(&xas);
618  	}
619  	xas_unlock_irq(&xas);
620  	return page;
621  }
622  EXPORT_SYMBOL_GPL(dax_layout_busy_page);
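
/*
 * Hedged sketch of the intended caller pattern, loosely modelled on the
 * XFS "break layouts" loop: with new mappings blocked by filesystem locks,
 * keep asking for a busy page and wait for its extra references (e.g.
 * in-flight DMA) to drain.  ___wait_var_event() lives in
 * <linux/wait_bit.h>; the function name and the bare schedule() are
 * illustrative, a real filesystem drops and retakes its locks here.
 */
static int __maybe_unused example_break_dax_layouts(struct address_space *mapping)
{
	struct page *page;

	while ((page = dax_layout_busy_page(mapping)) != NULL) {
		int ret = ___wait_var_event(&page->_refcount,
				page_ref_count(page) == 1,
				TASK_INTERRUPTIBLE, 0, 0, schedule());

		if (ret)
			return ret;
	}
	return 0;
}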
623  
624  static int __dax_invalidate_entry(struct address_space *mapping,
625  					  pgoff_t index, bool trunc)
626  {
627  	XA_STATE(xas, &mapping->i_pages, index);
628  	int ret = 0;
629  	void *entry;
630  
631  	xas_lock_irq(&xas);
632  	entry = get_unlocked_entry(&xas);
633  	if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
634  		goto out;
635  	if (!trunc &&
636  	    (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
637  	     xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
638  		goto out;
639  	dax_disassociate_entry(entry, mapping, trunc);
640  	xas_store(&xas, NULL);
641  	mapping->nrexceptional--;
642  	ret = 1;
643  out:
644  	put_unlocked_entry(&xas, entry);
645  	xas_unlock_irq(&xas);
646  	return ret;
647  }
648  
649  /*
650   * Delete DAX entry at @index from @mapping.  Wait for it
651   * to be unlocked before deleting it.
652   */
653  int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
654  {
655  	int ret = __dax_invalidate_entry(mapping, index, true);
656  
657  	/*
658  	 * This gets called from truncate / punch_hole path. As such, the caller
659  	 * must hold locks protecting against concurrent modifications of the
660  	 * page cache (usually fs-private i_mmap_sem for writing). Since the
661  	 * caller has seen a DAX entry for this index, we better find it
662  	 * at that index as well...
663  	 */
664  	WARN_ON_ONCE(!ret);
665  	return ret;
666  }
667  
668  /*
669   * Invalidate DAX entry if it is clean.
670   */
671  int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
672  				      pgoff_t index)
673  {
674  	return __dax_invalidate_entry(mapping, index, false);
675  }
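
/*
 * Hedged sketch of how the truncation paths in mm/truncate.c consume the
 * two helpers above: truncate and hole punch delete the entry outright,
 * while invalidate_inode_pages2()-style callers must leave dirty entries
 * alone so a later fsync can still write them back.  The wrapper name is
 * illustrative.
 */
static int __maybe_unused example_remove_dax_entry(struct address_space *mapping,
		pgoff_t index, bool truncating)
{
	if (truncating)
		return dax_delete_mapping_entry(mapping, index);
	return dax_invalidate_mapping_entry_sync(mapping, index);
}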
676  
677  static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
678  		sector_t sector, size_t size, struct page *to,
679  		unsigned long vaddr)
680  {
681  	void *vto, *kaddr;
682  	pgoff_t pgoff;
683  	long rc;
684  	int id;
685  
686  	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
687  	if (rc)
688  		return rc;
689  
690  	id = dax_read_lock();
691  	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, NULL);
692  	if (rc < 0) {
693  		dax_read_unlock(id);
694  		return rc;
695  	}
696  	vto = kmap_atomic(to);
697  	copy_user_page(vto, (void __force *)kaddr, vaddr, to);
698  	kunmap_atomic(vto);
699  	dax_read_unlock(id);
700  	return 0;
701  }
702  
703  /*
704   * By this point grab_mapping_entry() has ensured that we have a locked entry
705   * of the appropriate size so we don't have to worry about downgrading PMDs to
706   * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
707   * already in the tree, we will skip the insertion and just dirty the PMD as
708   * appropriate.
709   */
710  static void *dax_insert_entry(struct xa_state *xas,
711  		struct address_space *mapping, struct vm_fault *vmf,
712  		void *entry, pfn_t pfn, unsigned long flags, bool dirty)
713  {
714  	void *new_entry = dax_make_entry(pfn, flags);
715  
716  	if (dirty)
717  		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
718  
719  	if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
720  		unsigned long index = xas->xa_index;
721  		/* we are replacing a zero page with a block mapping */
722  		if (dax_is_pmd_entry(entry))
723  			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
724  					PG_PMD_NR, false);
725  		else /* pte entry */
726  			unmap_mapping_pages(mapping, index, 1, false);
727  	}
728  
729  	xas_reset(xas);
730  	xas_lock_irq(xas);
731  	if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
732  		dax_disassociate_entry(entry, mapping, false);
733  		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
734  	}
735  
736  	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
737  		/*
738  		 * Only swap our new entry into the page cache if the current
739  		 * entry is a zero page or an empty entry.  If a normal PTE or
740  		 * PMD entry is already in the cache, we leave it alone.  This
741  		 * means that if we are trying to insert a PTE and the
742  		 * existing entry is a PMD, we will just leave the PMD in the
743  		 * tree and dirty it if necessary.
744  		 */
745  		void *old = dax_lock_entry(xas, new_entry);
746  		WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
747  					DAX_LOCKED));
748  		entry = new_entry;
749  	} else {
750  		xas_load(xas);	/* Walk the xa_state */
751  	}
752  
753  	if (dirty)
754  		xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
755  
756  	xas_unlock_irq(xas);
757  	return entry;
758  }
759  
760  static inline
761  unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
762  {
763  	unsigned long address;
764  
765  	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
766  	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
767  	return address;
768  }
769  
770  /* Walk all mappings of a given index of a file and writeprotect them */
771  static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
772  		unsigned long pfn)
773  {
774  	struct vm_area_struct *vma;
775  	pte_t pte, *ptep = NULL;
776  	pmd_t *pmdp = NULL;
777  	spinlock_t *ptl;
778  
779  	i_mmap_lock_read(mapping);
780  	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
781  		struct mmu_notifier_range range;
782  		unsigned long address;
783  
784  		cond_resched();
785  
786  		if (!(vma->vm_flags & VM_SHARED))
787  			continue;
788  
789  		address = pgoff_address(index, vma);
790  
791  		/*
792  		 * Note because we provide range to follow_pte_pmd it will
793  		 * call mmu_notifier_invalidate_range_start() on our behalf
794  		 * before taking any lock.
795  		 */
796  		if (follow_pte_pmd(vma->vm_mm, address, &range,
797  				   &ptep, &pmdp, &ptl))
798  			continue;
799  
800  		/*
801  		 * No need to call mmu_notifier_invalidate_range() as we are
802  		 * downgrading page table protection not changing it to point
803  		 * to a new page.
804  		 *
805  		 * See Documentation/vm/mmu_notifier.rst
806  		 */
807  		if (pmdp) {
808  #ifdef CONFIG_FS_DAX_PMD
809  			pmd_t pmd;
810  
811  			if (pfn != pmd_pfn(*pmdp))
812  				goto unlock_pmd;
813  			if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
814  				goto unlock_pmd;
815  
816  			flush_cache_page(vma, address, pfn);
817  			pmd = pmdp_invalidate(vma, address, pmdp);
818  			pmd = pmd_wrprotect(pmd);
819  			pmd = pmd_mkclean(pmd);
820  			set_pmd_at(vma->vm_mm, address, pmdp, pmd);
821  unlock_pmd:
822  #endif
823  			spin_unlock(ptl);
824  		} else {
825  			if (pfn != pte_pfn(*ptep))
826  				goto unlock_pte;
827  			if (!pte_dirty(*ptep) && !pte_write(*ptep))
828  				goto unlock_pte;
829  
830  			flush_cache_page(vma, address, pfn);
831  			pte = ptep_clear_flush(vma, address, ptep);
832  			pte = pte_wrprotect(pte);
833  			pte = pte_mkclean(pte);
834  			set_pte_at(vma->vm_mm, address, ptep, pte);
835  unlock_pte:
836  			pte_unmap_unlock(ptep, ptl);
837  		}
838  
839  		mmu_notifier_invalidate_range_end(&range);
840  	}
841  	i_mmap_unlock_read(mapping);
842  }
843  
844  static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
845  		struct address_space *mapping, void *entry)
846  {
847  	unsigned long pfn, index, count;
848  	long ret = 0;
849  
850  	/*
851  	 * A page got tagged dirty in DAX mapping? Something is seriously
852  	 * wrong.
853  	 */
854  	if (WARN_ON(!xa_is_value(entry)))
855  		return -EIO;
856  
857  	if (unlikely(dax_is_locked(entry))) {
858  		void *old_entry = entry;
859  
860  		entry = get_unlocked_entry(xas);
861  
862  		/* Entry got punched out / reallocated? */
863  		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
864  			goto put_unlocked;
865  		/*
866  		 * Entry got reallocated elsewhere? No need to writeback.
867  		 * We have to compare pfns as we must not bail out due to
868  		 * difference in lockbit or entry type.
869  		 */
870  		if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
871  			goto put_unlocked;
872  		if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
873  					dax_is_zero_entry(entry))) {
874  			ret = -EIO;
875  			goto put_unlocked;
876  		}
877  
878  		/* Another fsync thread may have already done this entry */
879  		if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
880  			goto put_unlocked;
881  	}
882  
883  	/* Lock the entry to serialize with page faults */
884  	dax_lock_entry(xas, entry);
885  
886  	/*
887  	 * We can clear the tag now but we have to be careful so that concurrent
888  	 * dax_writeback_one() calls for the same index cannot finish before we
889  	 * actually flush the caches. This is achieved as the calls will look
890  	 * at the entry only under the i_pages lock and once they do that
891  	 * they will see the entry locked and wait for it to unlock.
892  	 */
893  	xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
894  	xas_unlock_irq(xas);
895  
896  	/*
897  	 * If dax_writeback_mapping_range() was given a wbc->range_start
898  	 * in the middle of a PMD, the 'index' we use needs to be
899  	 * aligned to the start of the PMD.
900  	 * This allows us to flush for PMD_SIZE and not have to worry about
901  	 * partial PMD writebacks.
902  	 */
903  	pfn = dax_to_pfn(entry);
904  	count = 1UL << dax_entry_order(entry);
905  	index = xas->xa_index & ~(count - 1);
906  
907  	dax_entry_mkclean(mapping, index, pfn);
908  	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
909  	/*
910  	 * After we have flushed the cache, we can clear the dirty tag. There
911  	 * cannot be new dirty data in the pfn after the flush has completed as
912  	 * the pfn mappings are writeprotected and fault waits for mapping
913  	 * entry lock.
914  	 */
915  	xas_reset(xas);
916  	xas_lock_irq(xas);
917  	xas_store(xas, entry);
918  	xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
919  	dax_wake_entry(xas, entry, false);
920  
921  	trace_dax_writeback_one(mapping->host, index, count);
922  	return ret;
923  
924   put_unlocked:
925  	put_unlocked_entry(xas, entry);
926  	return ret;
927  }
928  
929  /*
930   * Flush the mapping to the persistent domain within the byte range of [start,
931   * end]. This is required by data integrity operations to ensure file data is
932   * on persistent storage prior to completion of the operation.
933   */
934  int dax_writeback_mapping_range(struct address_space *mapping,
935  		struct block_device *bdev, struct writeback_control *wbc)
936  {
937  	XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
938  	struct inode *inode = mapping->host;
939  	pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
940  	struct dax_device *dax_dev;
941  	void *entry;
942  	int ret = 0;
943  	unsigned int scanned = 0;
944  
945  	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
946  		return -EIO;
947  
948  	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
949  		return 0;
950  
951  	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
952  	if (!dax_dev)
953  		return -EIO;
954  
955  	trace_dax_writeback_range(inode, xas.xa_index, end_index);
956  
957  	tag_pages_for_writeback(mapping, xas.xa_index, end_index);
958  
959  	xas_lock_irq(&xas);
960  	xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
961  		ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
962  		if (ret < 0) {
963  			mapping_set_error(mapping, ret);
964  			break;
965  		}
966  		if (++scanned % XA_CHECK_SCHED)
967  			continue;
968  
969  		xas_pause(&xas);
970  		xas_unlock_irq(&xas);
971  		cond_resched();
972  		xas_lock_irq(&xas);
973  	}
974  	xas_unlock_irq(&xas);
975  	put_dax(dax_dev);
976  	trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
977  	return ret;
978  }
979  EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
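
/*
 * Hedged sketch: at this revision the filesystem's ->writepages hook is
 * the usual caller, passing its backing block device so that the matching
 * dax_device can be looked up.  This mirrors what ext2/ext4/xfs do; the
 * function name is illustrative.
 */
static int __maybe_unused example_dax_writepages(struct address_space *mapping,
		struct writeback_control *wbc)
{
	return dax_writeback_mapping_range(mapping,
			mapping->host->i_sb->s_bdev, wbc);
}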
980  
981  static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
982  {
983  	return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
984  }
985  
986  static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
987  			 pfn_t *pfnp)
988  {
989  	const sector_t sector = dax_iomap_sector(iomap, pos);
990  	pgoff_t pgoff;
991  	int id, rc;
992  	long length;
993  
994  	rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
995  	if (rc)
996  		return rc;
997  	id = dax_read_lock();
998  	length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
999  				   NULL, pfnp);
1000  	if (length < 0) {
1001  		rc = length;
1002  		goto out;
1003  	}
1004  	rc = -EINVAL;
1005  	if (PFN_PHYS(length) < size)
1006  		goto out;
1007  	if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
1008  		goto out;
1009  	/* For larger pages we need devmap */
1010  	if (length > 1 && !pfn_t_devmap(*pfnp))
1011  		goto out;
1012  	rc = 0;
1013  out:
1014  	dax_read_unlock(id);
1015  	return rc;
1016  }
1017  
1018  /*
1019   * The user has performed a load from a hole in the file.  Allocating a new
1020   * page in the file would cause excessive storage usage for workloads with
1021   * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
1022   * If this page is ever written to we will re-fault and change the mapping to
1023   * point to real DAX storage instead.
1024   */
1025  static vm_fault_t dax_load_hole(struct xa_state *xas,
1026  		struct address_space *mapping, void **entry,
1027  		struct vm_fault *vmf)
1028  {
1029  	struct inode *inode = mapping->host;
1030  	unsigned long vaddr = vmf->address;
1031  	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
1032  	vm_fault_t ret;
1033  
1034  	*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
1035  			DAX_ZERO_PAGE, false);
1036  
1037  	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
1038  	trace_dax_load_hole(inode, vmf, ret);
1039  	return ret;
1040  }
1041  
1042  static bool dax_range_is_aligned(struct block_device *bdev,
1043  				 unsigned int offset, unsigned int length)
1044  {
1045  	unsigned short sector_size = bdev_logical_block_size(bdev);
1046  
1047  	if (!IS_ALIGNED(offset, sector_size))
1048  		return false;
1049  	if (!IS_ALIGNED(length, sector_size))
1050  		return false;
1051  
1052  	return true;
1053  }
1054  
1055  int __dax_zero_page_range(struct block_device *bdev,
1056  		struct dax_device *dax_dev, sector_t sector,
1057  		unsigned int offset, unsigned int size)
1058  {
1059  	if (dax_range_is_aligned(bdev, offset, size)) {
1060  		sector_t start_sector = sector + (offset >> 9);
1061  
1062  		return blkdev_issue_zeroout(bdev, start_sector,
1063  				size >> 9, GFP_NOFS, 0);
1064  	} else {
1065  		pgoff_t pgoff;
1066  		long rc, id;
1067  		void *kaddr;
1068  
1069  		rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
1070  		if (rc)
1071  			return rc;
1072  
1073  		id = dax_read_lock();
1074  		rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
1075  		if (rc < 0) {
1076  			dax_read_unlock(id);
1077  			return rc;
1078  		}
1079  		memset(kaddr + offset, 0, size);
1080  		dax_flush(dax_dev, kaddr + offset, size);
1081  		dax_read_unlock(id);
1082  	}
1083  	return 0;
1084  }
1085  EXPORT_SYMBOL_GPL(__dax_zero_page_range);
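
/*
 * Hedged sketch of the typical caller: the iomap zeroing path hands this
 * helper a sub-page range.  The sector math reuses dax_iomap_sector()
 * above; the wrapper name is illustrative and @iomap comes from the
 * filesystem's ->iomap_begin.
 */
static int __maybe_unused example_dax_zero_partial(struct iomap *iomap,
		loff_t pos, unsigned int bytes)
{
	unsigned int offset = pos & (PAGE_SIZE - 1);

	return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
			dax_iomap_sector(iomap, pos), offset, bytes);
}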
1086  
1087  static loff_t
1088  dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1089  		struct iomap *iomap)
1090  {
1091  	struct block_device *bdev = iomap->bdev;
1092  	struct dax_device *dax_dev = iomap->dax_dev;
1093  	struct iov_iter *iter = data;
1094  	loff_t end = pos + length, done = 0;
1095  	ssize_t ret = 0;
1096  	size_t xfer;
1097  	int id;
1098  
1099  	if (iov_iter_rw(iter) == READ) {
1100  		end = min(end, i_size_read(inode));
1101  		if (pos >= end)
1102  			return 0;
1103  
1104  		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
1105  			return iov_iter_zero(min(length, end - pos), iter);
1106  	}
1107  
1108  	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
1109  		return -EIO;
1110  
1111  	/*
1112  	 * A write can allocate blocks for an area which has a hole page mapped
1113  	 * into page tables. We have to tear down these mappings so that data
1114  	 * written by write(2) is visible in mmap.
1115  	 */
1116  	if (iomap->flags & IOMAP_F_NEW) {
1117  		invalidate_inode_pages2_range(inode->i_mapping,
1118  					      pos >> PAGE_SHIFT,
1119  					      (end - 1) >> PAGE_SHIFT);
1120  	}
1121  
1122  	id = dax_read_lock();
1123  	while (pos < end) {
1124  		unsigned offset = pos & (PAGE_SIZE - 1);
1125  		const size_t size = ALIGN(length + offset, PAGE_SIZE);
1126  		const sector_t sector = dax_iomap_sector(iomap, pos);
1127  		ssize_t map_len;
1128  		pgoff_t pgoff;
1129  		void *kaddr;
1130  
1131  		if (fatal_signal_pending(current)) {
1132  			ret = -EINTR;
1133  			break;
1134  		}
1135  
1136  		ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
1137  		if (ret)
1138  			break;
1139  
1140  		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
1141  				&kaddr, NULL);
1142  		if (map_len < 0) {
1143  			ret = map_len;
1144  			break;
1145  		}
1146  
1147  		map_len = PFN_PHYS(map_len);
1148  		kaddr += offset;
1149  		map_len -= offset;
1150  		if (map_len > end - pos)
1151  			map_len = end - pos;
1152  
1153  		/*
1154  		 * The userspace address for the memory copy has already been
1155  		 * validated via access_ok() in either vfs_read() or
1156  		 * vfs_write(), depending on which operation we are doing.
1157  		 */
1158  		if (iov_iter_rw(iter) == WRITE)
1159  			xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
1160  					map_len, iter);
1161  		else
1162  			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
1163  					map_len, iter);
1164  
1165  		pos += xfer;
1166  		length -= xfer;
1167  		done += xfer;
1168  
1169  		if (xfer == 0)
1170  			ret = -EFAULT;
1171  		if (xfer < map_len)
1172  			break;
1173  	}
1174  	dax_read_unlock(id);
1175  
1176  	return done ? done : ret;
1177  }
1178  
1179  /**
1180   * dax_iomap_rw - Perform I/O to a DAX file
1181   * @iocb:	The control block for this I/O
1182   * @iter:	The addresses to do I/O from or to
1183   * @ops:	iomap ops passed from the file system
1184   *
1185   * This function performs read and write operations to directly mapped
1186   * persistent memory.  The caller needs to take care of read/write exclusion
1187   * and evicting any page cache pages in the region under I/O.
1188   */
1189  ssize_t
1190  dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
1191  		const struct iomap_ops *ops)
1192  {
1193  	struct address_space *mapping = iocb->ki_filp->f_mapping;
1194  	struct inode *inode = mapping->host;
1195  	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
1196  	unsigned flags = 0;
1197  
1198  	if (iov_iter_rw(iter) == WRITE) {
1199  		lockdep_assert_held_exclusive(&inode->i_rwsem);
1200  		flags |= IOMAP_WRITE;
1201  	} else {
1202  		lockdep_assert_held(&inode->i_rwsem);
1203  	}
1204  
1205  	while (iov_iter_count(iter)) {
1206  		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
1207  				iter, dax_iomap_actor);
1208  		if (ret <= 0)
1209  			break;
1210  		pos += ret;
1211  		done += ret;
1212  	}
1213  
1214  	iocb->ki_pos += done;
1215  	return done ? done : ret;
1216  }
1217  EXPORT_SYMBOL_GPL(dax_iomap_rw);
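
/*
 * Hedged usage sketch: a DAX-aware ->read_iter typically takes the inode
 * lock shared and forwards to dax_iomap_rw() with the filesystem's iomap
 * ops.  'example_iomap_ops' is an empty placeholder for those fs-provided
 * ops and the function name is illustrative.
 */
static const struct iomap_ops example_iomap_ops;

static ssize_t __maybe_unused example_dax_read_iter(struct kiocb *iocb,
		struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (!iov_iter_count(to))
		return 0;

	inode_lock_shared(inode);
	ret = dax_iomap_rw(iocb, to, &example_iomap_ops);
	inode_unlock_shared(inode);

	return ret;
}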
1218  
1219  static vm_fault_t dax_fault_return(int error)
1220  {
1221  	if (error == 0)
1222  		return VM_FAULT_NOPAGE;
1223  	return vmf_error(error);
1224  }
1225  
1226  /*
1227   * MAP_SYNC on a dax mapping guarantees dirty metadata is
1228   * flushed on write-faults (non-cow), but not read-faults.
1229   */
1230  static bool dax_fault_is_synchronous(unsigned long flags,
1231  		struct vm_area_struct *vma, struct iomap *iomap)
1232  {
1233  	return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
1234  		&& (iomap->flags & IOMAP_F_DIRTY);
1235  }
1236  
1237  static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
1238  			       int *iomap_errp, const struct iomap_ops *ops)
1239  {
1240  	struct vm_area_struct *vma = vmf->vma;
1241  	struct address_space *mapping = vma->vm_file->f_mapping;
1242  	XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
1243  	struct inode *inode = mapping->host;
1244  	unsigned long vaddr = vmf->address;
1245  	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
1246  	struct iomap iomap = { 0 };
1247  	unsigned flags = IOMAP_FAULT;
1248  	int error, major = 0;
1249  	bool write = vmf->flags & FAULT_FLAG_WRITE;
1250  	bool sync;
1251  	vm_fault_t ret = 0;
1252  	void *entry;
1253  	pfn_t pfn;
1254  
1255  	trace_dax_pte_fault(inode, vmf, ret);
1256  	/*
1257  	 * Check whether offset isn't beyond end of file now. Caller is supposed
1258  	 * to hold locks serializing us with truncate / punch hole so this is
1259  	 * a reliable test.
1260  	 */
1261  	if (pos >= i_size_read(inode)) {
1262  		ret = VM_FAULT_SIGBUS;
1263  		goto out;
1264  	}
1265  
1266  	if (write && !vmf->cow_page)
1267  		flags |= IOMAP_WRITE;
1268  
1269  	entry = grab_mapping_entry(&xas, mapping, 0);
1270  	if (xa_is_internal(entry)) {
1271  		ret = xa_to_internal(entry);
1272  		goto out;
1273  	}
1274  
1275  	/*
1276  	 * It is possible, particularly with mixed reads & writes to private
1277  	 * mappings, that we have raced with a PMD fault that overlaps with
1278  	 * the PTE we need to set up.  If so just return and the fault will be
1279  	 * retried.
1280  	 */
1281  	if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
1282  		ret = VM_FAULT_NOPAGE;
1283  		goto unlock_entry;
1284  	}
1285  
1286  	/*
1287  	 * Note that we don't bother to use iomap_apply here: DAX requires
1288  	 * the filesystem block size to be equal to the page size, which means
1289  	 * that we never have to deal with more than a single extent here.
1290  	 */
1291  	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
1292  	if (iomap_errp)
1293  		*iomap_errp = error;
1294  	if (error) {
1295  		ret = dax_fault_return(error);
1296  		goto unlock_entry;
1297  	}
1298  	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
1299  		error = -EIO;	/* fs corruption? */
1300  		goto error_finish_iomap;
1301  	}
1302  
1303  	if (vmf->cow_page) {
1304  		sector_t sector = dax_iomap_sector(&iomap, pos);
1305  
1306  		switch (iomap.type) {
1307  		case IOMAP_HOLE:
1308  		case IOMAP_UNWRITTEN:
1309  			clear_user_highpage(vmf->cow_page, vaddr);
1310  			break;
1311  		case IOMAP_MAPPED:
1312  			error = copy_user_dax(iomap.bdev, iomap.dax_dev,
1313  					sector, PAGE_SIZE, vmf->cow_page, vaddr);
1314  			break;
1315  		default:
1316  			WARN_ON_ONCE(1);
1317  			error = -EIO;
1318  			break;
1319  		}
1320  
1321  		if (error)
1322  			goto error_finish_iomap;
1323  
1324  		__SetPageUptodate(vmf->cow_page);
1325  		ret = finish_fault(vmf);
1326  		if (!ret)
1327  			ret = VM_FAULT_DONE_COW;
1328  		goto finish_iomap;
1329  	}
1330  
1331  	sync = dax_fault_is_synchronous(flags, vma, &iomap);
1332  
1333  	switch (iomap.type) {
1334  	case IOMAP_MAPPED:
1335  		if (iomap.flags & IOMAP_F_NEW) {
1336  			count_vm_event(PGMAJFAULT);
1337  			count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
1338  			major = VM_FAULT_MAJOR;
1339  		}
1340  		error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
1341  		if (error < 0)
1342  			goto error_finish_iomap;
1343  
1344  		entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
1345  						 0, write && !sync);
1346  
1347  		/*
1348  		 * If we are doing a synchronous page fault and the inode needs fsync,
1349  		 * we can insert PTE into page tables only after that happens.
1350  		 * Skip insertion for now and return the pfn so that caller can
1351  		 * insert it after fsync is done.
1352  		 */
1353  		if (sync) {
1354  			if (WARN_ON_ONCE(!pfnp)) {
1355  				error = -EIO;
1356  				goto error_finish_iomap;
1357  			}
1358  			*pfnp = pfn;
1359  			ret = VM_FAULT_NEEDDSYNC | major;
1360  			goto finish_iomap;
1361  		}
1362  		trace_dax_insert_mapping(inode, vmf, entry);
1363  		if (write)
1364  			ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn);
1365  		else
1366  			ret = vmf_insert_mixed(vma, vaddr, pfn);
1367  
1368  		goto finish_iomap;
1369  	case IOMAP_UNWRITTEN:
1370  	case IOMAP_HOLE:
1371  		if (!write) {
1372  			ret = dax_load_hole(&xas, mapping, &entry, vmf);
1373  			goto finish_iomap;
1374  		}
1375  		/*FALLTHRU*/
1376  	default:
1377  		WARN_ON_ONCE(1);
1378  		error = -EIO;
1379  		break;
1380  	}
1381  
1382   error_finish_iomap:
1383  	ret = dax_fault_return(error);
1384   finish_iomap:
1385  	if (ops->iomap_end) {
1386  		int copied = PAGE_SIZE;
1387  
1388  		if (ret & VM_FAULT_ERROR)
1389  			copied = 0;
1390  		/*
1391  		 * The fault is done by now and there's no way back (other
1392  		 * thread may be already happily using PTE we have installed).
1393  		 * Just ignore error from ->iomap_end since we cannot do much
1394  		 * with it.
1395  		 */
1396  		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
1397  	}
1398   unlock_entry:
1399  	dax_unlock_entry(&xas, entry);
1400   out:
1401  	trace_dax_pte_fault_done(inode, vmf, ret);
1402  	return ret | major;
1403  }
1404  
1405  #ifdef CONFIG_FS_DAX_PMD
1406  static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
1407  		struct iomap *iomap, void **entry)
1408  {
1409  	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1410  	unsigned long pmd_addr = vmf->address & PMD_MASK;
1411  	struct vm_area_struct *vma = vmf->vma;
1412  	struct inode *inode = mapping->host;
1413  	pgtable_t pgtable = NULL;
1414  	struct page *zero_page;
1415  	spinlock_t *ptl;
1416  	pmd_t pmd_entry;
1417  	pfn_t pfn;
1418  
1419  	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
1420  
1421  	if (unlikely(!zero_page))
1422  		goto fallback;
1423  
1424  	pfn = page_to_pfn_t(zero_page);
1425  	*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
1426  			DAX_PMD | DAX_ZERO_PAGE, false);
1427  
1428  	if (arch_needs_pgtable_deposit()) {
1429  		pgtable = pte_alloc_one(vma->vm_mm);
1430  		if (!pgtable)
1431  			return VM_FAULT_OOM;
1432  	}
1433  
1434  	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1435  	if (!pmd_none(*(vmf->pmd))) {
1436  		spin_unlock(ptl);
1437  		goto fallback;
1438  	}
1439  
1440  	if (pgtable) {
1441  		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
1442  		mm_inc_nr_ptes(vma->vm_mm);
1443  	}
1444  	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
1445  	pmd_entry = pmd_mkhuge(pmd_entry);
1446  	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
1447  	spin_unlock(ptl);
1448  	trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
1449  	return VM_FAULT_NOPAGE;
1450  
1451  fallback:
1452  	if (pgtable)
1453  		pte_free(vma->vm_mm, pgtable);
1454  	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
1455  	return VM_FAULT_FALLBACK;
1456  }
1457  
1458  static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1459  			       const struct iomap_ops *ops)
1460  {
1461  	struct vm_area_struct *vma = vmf->vma;
1462  	struct address_space *mapping = vma->vm_file->f_mapping;
1463  	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
1464  	unsigned long pmd_addr = vmf->address & PMD_MASK;
1465  	bool write = vmf->flags & FAULT_FLAG_WRITE;
1466  	bool sync;
1467  	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
1468  	struct inode *inode = mapping->host;
1469  	vm_fault_t result = VM_FAULT_FALLBACK;
1470  	struct iomap iomap = { 0 };
1471  	pgoff_t max_pgoff;
1472  	void *entry;
1473  	loff_t pos;
1474  	int error;
1475  	pfn_t pfn;
1476  
1477  	/*
1478  	 * Check whether offset isn't beyond end of file now. Caller is
1479  	 * supposed to hold locks serializing us with truncate / punch hole so
1480  	 * this is a reliable test.
1481  	 */
1482  	max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
1483  
1484  	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
1485  
1486  	/*
1487  	 * Make sure that the faulting address's PMD offset (color) matches
1488  	 * the PMD offset from the start of the file.  This is necessary so
1489  	 * that a PMD range in the page table overlaps exactly with a PMD
1490  	 * range in the page cache.
1491  	 */
1492  	if ((vmf->pgoff & PG_PMD_COLOUR) !=
1493  	    ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
1494  		goto fallback;
1495  
1496  	/* Fall back to PTEs if we're going to COW */
1497  	if (write && !(vma->vm_flags & VM_SHARED))
1498  		goto fallback;
1499  
1500  	/* If the PMD would extend outside the VMA */
1501  	if (pmd_addr < vma->vm_start)
1502  		goto fallback;
1503  	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
1504  		goto fallback;
1505  
1506  	if (xas.xa_index >= max_pgoff) {
1507  		result = VM_FAULT_SIGBUS;
1508  		goto out;
1509  	}
1510  
1511  	/* If the PMD would extend beyond the file size */
1512  	if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
1513  		goto fallback;
1514  
1515  	/*
1516  	 * grab_mapping_entry() will make sure we get an empty PMD entry,
1517  	 * a zero PMD entry or a DAX PMD.  If it can't (because a PTE
1518  	 * entry is already in the array, for instance), it will return
1519  	 * VM_FAULT_FALLBACK.
1520  	 */
1521  	entry = grab_mapping_entry(&xas, mapping, DAX_PMD);
1522  	if (xa_is_internal(entry)) {
1523  		result = xa_to_internal(entry);
1524  		goto fallback;
1525  	}
1526  
1527  	/*
1528  	 * It is possible, particularly with mixed reads & writes to private
1529  	 * mappings, that we have raced with a PTE fault that overlaps with
1530  	 * the PMD we need to set up.  If so just return and the fault will be
1531  	 * retried.
1532  	 */
1533  	if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
1534  			!pmd_devmap(*vmf->pmd)) {
1535  		result = 0;
1536  		goto unlock_entry;
1537  	}
1538  
1539  	/*
1540  	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
1541  	 * setting up a mapping, so really we're using iomap_begin() as a way
1542  	 * to look up our filesystem block.
1543  	 */
1544  	pos = (loff_t)xas.xa_index << PAGE_SHIFT;
1545  	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
1546  	if (error)
1547  		goto unlock_entry;
1548  
1549  	if (iomap.offset + iomap.length < pos + PMD_SIZE)
1550  		goto finish_iomap;
1551  
1552  	sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
1553  
1554  	switch (iomap.type) {
1555  	case IOMAP_MAPPED:
1556  		error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
1557  		if (error < 0)
1558  			goto finish_iomap;
1559  
1560  		entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
1561  						DAX_PMD, write && !sync);
1562  
1563  		/*
1564  		 * If we are doing a synchronous page fault and the inode needs fsync,
1565  		 * we can insert PMD into page tables only after that happens.
1566  		 * Skip insertion for now and return the pfn so that caller can
1567  		 * insert it after fsync is done.
1568  		 */
1569  		if (sync) {
1570  			if (WARN_ON_ONCE(!pfnp))
1571  				goto finish_iomap;
1572  			*pfnp = pfn;
1573  			result = VM_FAULT_NEEDDSYNC;
1574  			goto finish_iomap;
1575  		}
1576  
1577  		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
1578  		result = vmf_insert_pfn_pmd(vmf, pfn, write);
1579  		break;
1580  	case IOMAP_UNWRITTEN:
1581  	case IOMAP_HOLE:
1582  		if (WARN_ON_ONCE(write))
1583  			break;
1584  		result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
1585  		break;
1586  	default:
1587  		WARN_ON_ONCE(1);
1588  		break;
1589  	}
1590  
1591   finish_iomap:
1592  	if (ops->iomap_end) {
1593  		int copied = PMD_SIZE;
1594  
1595  		if (result == VM_FAULT_FALLBACK)
1596  			copied = 0;
1597  		/*
1598  		 * The fault is done by now and there's no way back (other
1599  		 * thread may be already happily using PMD we have installed).
1600  		 * Just ignore error from ->iomap_end since we cannot do much
1601  		 * with it.
1602  		 */
1603  		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
1604  				&iomap);
1605  	}
1606   unlock_entry:
1607  	dax_unlock_entry(&xas, entry);
1608   fallback:
1609  	if (result == VM_FAULT_FALLBACK) {
1610  		split_huge_pmd(vma, vmf->pmd, vmf->address);
1611  		count_vm_event(THP_FAULT_FALLBACK);
1612  	}
1613  out:
1614  	trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
1615  	return result;
1616  }
1617  #else
1618  static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1619  			       const struct iomap_ops *ops)
1620  {
1621  	return VM_FAULT_FALLBACK;
1622  }
1623  #endif /* CONFIG_FS_DAX_PMD */
1624  
1625  /**
1626   * dax_iomap_fault - handle a page fault on a DAX file
1627   * @vmf: The description of the fault
1628   * @pe_size: Size of the page to fault in
1629   * @pfnp: PFN to insert for synchronous faults if fsync is required
1630   * @iomap_errp: Storage for detailed error code in case of error
1631   * @ops: Iomap ops passed from the file system
1632   *
1633   * When a page fault occurs, filesystems may call this helper in
1634   * their fault handler for DAX files. dax_iomap_fault() assumes the caller
1635   * has done all the necessary locking for page fault to proceed
1636   * successfully.
1637   */
1638  vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
1639  		    pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
1640  {
1641  	switch (pe_size) {
1642  	case PE_SIZE_PTE:
1643  		return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
1644  	case PE_SIZE_PMD:
1645  		return dax_iomap_pmd_fault(vmf, pfnp, ops);
1646  	default:
1647  		return VM_FAULT_FALLBACK;
1648  	}
1649  }
1650  EXPORT_SYMBOL_GPL(dax_iomap_fault);
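
/*
 * Hedged sketch of a filesystem fault handler wired to the helper above.
 * The sb_start_pagefault() pairing and the VM_FAULT_NEEDDSYNC handling
 * follow the usual ext4/xfs pattern; 'example_fault_iomap_ops' is an empty
 * placeholder for the fs-provided iomap ops and fs-private locking is
 * omitted.
 */
static const struct iomap_ops example_fault_iomap_ops;

static vm_fault_t __maybe_unused example_dax_huge_fault(struct vm_fault *vmf,
		enum page_entry_size pe_size)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	vm_fault_t ret;
	pfn_t pfn;

	if (write) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vmf->vma->vm_file);
	}

	ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &example_fault_iomap_ops);
	if (ret & VM_FAULT_NEEDDSYNC) {
		/* MAP_SYNC write fault: persist metadata, then map the pfn */
		ret = dax_finish_sync_fault(vmf, pe_size, pfn);
	}

	if (write)
		sb_end_pagefault(inode->i_sb);
	return ret;
}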
1651  
1652  /*
1653   * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
1654   * @vmf: The description of the fault
1655   * @pfn: PFN to insert
1656   * @order: Order of entry to insert.
1657   *
1658   * This function inserts a writeable PTE or PMD entry into the page tables
1659   * for an mmapped DAX file.  It also marks the page cache entry as dirty.
1660   */
1661  static vm_fault_t
1662  dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
1663  {
1664  	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1665  	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
1666  	void *entry;
1667  	vm_fault_t ret;
1668  
1669  	xas_lock_irq(&xas);
1670  	entry = get_unlocked_entry(&xas);
1671  	/* Did we race with someone splitting entry or so? */
1672  	if (!entry ||
1673  	    (order == 0 && !dax_is_pte_entry(entry)) ||
1674  	    (order == PMD_ORDER && !dax_is_pmd_entry(entry))) {
1675  		put_unlocked_entry(&xas, entry);
1676  		xas_unlock_irq(&xas);
1677  		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
1678  						      VM_FAULT_NOPAGE);
1679  		return VM_FAULT_NOPAGE;
1680  	}
1681  	xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
1682  	dax_lock_entry(&xas, entry);
1683  	xas_unlock_irq(&xas);
1684  	if (order == 0)
1685  		ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
1686  #ifdef CONFIG_FS_DAX_PMD
1687  	else if (order == PMD_ORDER)
1688  		ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
1689  #endif
1690  	else
1691  		ret = VM_FAULT_FALLBACK;
1692  	dax_unlock_entry(&xas, entry);
1693  	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
1694  	return ret;
1695  }
1696  
1697  /**
1698   * dax_finish_sync_fault - finish synchronous page fault
1699   * @vmf: The description of the fault
1700   * @pe_size: Size of entry to be inserted
1701   * @pfn: PFN to insert
1702   *
1703   * This function ensures that the file range touched by the page fault is
1704   * stored persistently on the media and handles inserting the appropriate page
1705   * table entry.
1706   */
1707  vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
1708  		enum page_entry_size pe_size, pfn_t pfn)
1709  {
1710  	int err;
1711  	loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
1712  	unsigned int order = pe_order(pe_size);
1713  	size_t len = PAGE_SIZE << order;
1714  
1715  	err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
1716  	if (err)
1717  		return VM_FAULT_SIGBUS;
1718  	return dax_insert_pfn_mkwrite(vmf, pfn, order);
1719  }
1720  EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
1721