xref: /openbmc/linux/fs/dax.c (revision 96c63fa7393d0a346acfe5a91e0c7d4c7782641b)
1  /*
2   * fs/dax.c - Direct Access filesystem code
3   * Copyright (c) 2013-2014 Intel Corporation
4   * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
5   * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
6   *
7   * This program is free software; you can redistribute it and/or modify it
8   * under the terms and conditions of the GNU General Public License,
9   * version 2, as published by the Free Software Foundation.
10   *
11   * This program is distributed in the hope it will be useful, but WITHOUT
12   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
14   * more details.
15   */
16  
17  #include <linux/atomic.h>
18  #include <linux/blkdev.h>
19  #include <linux/buffer_head.h>
20  #include <linux/dax.h>
21  #include <linux/fs.h>
22  #include <linux/genhd.h>
23  #include <linux/highmem.h>
24  #include <linux/memcontrol.h>
25  #include <linux/mm.h>
26  #include <linux/mutex.h>
27  #include <linux/pagevec.h>
28  #include <linux/pmem.h>
29  #include <linux/sched.h>
30  #include <linux/uio.h>
31  #include <linux/vmstat.h>
32  #include <linux/pfn_t.h>
33  #include <linux/sizes.h>
34  
35  /*
36   * We use the lowest available bit in an exceptional entry for locking and the
37   * other two bits to determine the entry type, for a total of three special bits.
38   */
39  #define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
40  #define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
41  #define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
42  #define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
43  #define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
44  #define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
45  #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
46  		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
47  		RADIX_TREE_EXCEPTIONAL_ENTRY))
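
/*
 * Illustrative entry layout implied by the macros above and the comment on
 * the lock bit (RADIX_DAX_ENTRY_LOCK, defined in the dax.h header):
 *
 *   bit RADIX_TREE_EXCEPTIONAL_SHIFT:     RADIX_DAX_ENTRY_LOCK (lock bit)
 *   bit RADIX_TREE_EXCEPTIONAL_SHIFT + 1: RADIX_DAX_PTE
 *   bit RADIX_TREE_EXCEPTIONAL_SHIFT + 2: RADIX_DAX_PMD
 *   bits RADIX_DAX_SHIFT and above:       the sector number
 */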
48  
49  /* We choose 4096 entries - same as per-zone page wait tables */
50  #define DAX_WAIT_TABLE_BITS 12
51  #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
52  
53  wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
54  
55  static int __init init_dax_wait_table(void)
56  {
57  	int i;
58  
59  	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
60  		init_waitqueue_head(wait_table + i);
61  	return 0;
62  }
63  fs_initcall(init_dax_wait_table);
64  
65  static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
66  					      pgoff_t index)
67  {
68  	unsigned long hash = hash_long((unsigned long)mapping ^ index,
69  				       DAX_WAIT_TABLE_BITS);
70  	return wait_table + hash;
71  }
72  
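/*
 * Map dax->sector of @bdev for direct access: pin the request queue with
 * blk_queue_enter() and have bdev_direct_access() fill in dax->addr and
 * dax->pfn.  Returns the number of mappable bytes, or a negative errno (in
 * which case dax->addr holds the matching ERR_PTR()).  A successful call
 * must be paired with dax_unmap_atomic() to drop the queue reference.
 */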
73  static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
74  {
75  	struct request_queue *q = bdev->bd_queue;
76  	long rc = -EIO;
77  
78  	dax->addr = (void __pmem *) ERR_PTR(-EIO);
79  	if (blk_queue_enter(q, true) != 0)
80  		return rc;
81  
82  	rc = bdev_direct_access(bdev, dax);
83  	if (rc < 0) {
84  		dax->addr = (void __pmem *) ERR_PTR(rc);
85  		blk_queue_exit(q);
86  		return rc;
87  	}
88  	return rc;
89  }
90  
91  static void dax_unmap_atomic(struct block_device *bdev,
92  		const struct blk_dax_ctl *dax)
93  {
94  	if (IS_ERR(dax->addr))
95  		return;
96  	blk_queue_exit(bdev->bd_queue);
97  }
98  
99  struct page *read_dax_sector(struct block_device *bdev, sector_t n)
100  {
101  	struct page *page = alloc_pages(GFP_KERNEL, 0);
102  	struct blk_dax_ctl dax = {
103  		.size = PAGE_SIZE,
104  		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
105  	};
106  	long rc;
107  
108  	if (!page)
109  		return ERR_PTR(-ENOMEM);
110  
111  	rc = dax_map_atomic(bdev, &dax);
112  	if (rc < 0)
113  		return ERR_PTR(rc);
114  	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
115  	dax_unmap_atomic(bdev, &dax);
116  	return page;
117  }
118  
119  static bool buffer_written(struct buffer_head *bh)
120  {
121  	return buffer_mapped(bh) && !buffer_unwritten(bh);
122  }
123  
124  /*
125   * When ext4 encounters a hole, it returns without modifying the buffer_head
126   * which means that we can't trust b_size.  To cope with this, we set b_state
127   * to 0 before calling get_block and, if any bit is set, we know we can trust
128   * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
129   * and would save us time calling get_block repeatedly.
130   */
131  static bool buffer_size_valid(struct buffer_head *bh)
132  {
133  	return bh->b_state != 0;
134  }
135  
136  
137  static sector_t to_sector(const struct buffer_head *bh,
138  		const struct inode *inode)
139  {
140  	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
141  
142  	return sector;
143  }
144  
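
/*
 * Core loop for dax_do_io(): walk the byte range [start, end), calling
 * get_block() to map each extent and copying data directly to or from the
 * kernel mapping returned by dax_map_atomic().  Reads from holes are
 * satisfied with iov_iter_zero().  Returns the number of bytes transferred,
 * or an error if nothing was copied.
 */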
145  static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
146  		      loff_t start, loff_t end, get_block_t get_block,
147  		      struct buffer_head *bh)
148  {
149  	loff_t pos = start, max = start, bh_max = start;
150  	bool hole = false, need_wmb = false;
151  	struct block_device *bdev = NULL;
152  	int rw = iov_iter_rw(iter), rc = 0;
153  	long map_len = 0;
154  	struct blk_dax_ctl dax = {
155  		.addr = (void __pmem *) ERR_PTR(-EIO),
156  	};
157  	unsigned blkbits = inode->i_blkbits;
158  	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
159  								>> blkbits;
160  
161  	if (rw == READ)
162  		end = min(end, i_size_read(inode));
163  
164  	while (pos < end) {
165  		size_t len;
166  		if (pos == max) {
167  			long page = pos >> PAGE_SHIFT;
168  			sector_t block = page << (PAGE_SHIFT - blkbits);
169  			unsigned first = pos - (block << blkbits);
170  			long size;
171  
172  			if (pos == bh_max) {
173  				bh->b_size = PAGE_ALIGN(end - pos);
174  				bh->b_state = 0;
175  				rc = get_block(inode, block, bh, rw == WRITE);
176  				if (rc)
177  					break;
178  				if (!buffer_size_valid(bh))
179  					bh->b_size = 1 << blkbits;
180  				bh_max = pos - first + bh->b_size;
181  				bdev = bh->b_bdev;
182  				/*
183  				 * We allow uninitialized buffers for writes
184  				 * beyond EOF as those cannot race with faults
185  				 */
186  				WARN_ON_ONCE(
187  					(buffer_new(bh) && block < file_blks) ||
188  					(rw == WRITE && buffer_unwritten(bh)));
189  			} else {
190  				unsigned done = bh->b_size -
191  						(bh_max - (pos - first));
192  				bh->b_blocknr += done >> blkbits;
193  				bh->b_size -= done;
194  			}
195  
196  			hole = rw == READ && !buffer_written(bh);
197  			if (hole) {
198  				size = bh->b_size - first;
199  			} else {
200  				dax_unmap_atomic(bdev, &dax);
201  				dax.sector = to_sector(bh, inode);
202  				dax.size = bh->b_size;
203  				map_len = dax_map_atomic(bdev, &dax);
204  				if (map_len < 0) {
205  					rc = map_len;
206  					break;
207  				}
208  				dax.addr += first;
209  				size = map_len - first;
210  			}
211  			max = min(pos + size, end);
212  		}
213  
214  		if (iov_iter_rw(iter) == WRITE) {
215  			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
216  			need_wmb = true;
217  		} else if (!hole)
218  			len = copy_to_iter((void __force *) dax.addr, max - pos,
219  					iter);
220  		else
221  			len = iov_iter_zero(max - pos, iter);
222  
223  		if (!len) {
224  			rc = -EFAULT;
225  			break;
226  		}
227  
228  		pos += len;
229  		if (!IS_ERR(dax.addr))
230  			dax.addr += len;
231  	}
232  
233  	if (need_wmb)
234  		wmb_pmem();
235  	dax_unmap_atomic(bdev, &dax);
236  
237  	return (pos == start) ? rc : pos - start;
238  }
239  
240  /**
241   * dax_do_io - Perform I/O to a DAX file
242   * @iocb: The control block for this I/O
243   * @inode: The file which the I/O is directed at
244   * @iter: The addresses to do I/O from or to
245   * @get_block: The filesystem method used to translate file offsets to blocks
246   * @end_io: A filesystem callback for I/O completion
247   * @flags: See below
248   *
249   * This function uses the same locking scheme as do_blockdev_direct_IO:
250   * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
251   * caller for writes.  For reads, we take and release the i_mutex ourselves.
252   * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
253   * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
254   * is in progress.
255   */
256  ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
257  		  struct iov_iter *iter, get_block_t get_block,
258  		  dio_iodone_t end_io, int flags)
259  {
260  	struct buffer_head bh;
261  	ssize_t retval = -EINVAL;
262  	loff_t pos = iocb->ki_pos;
263  	loff_t end = pos + iov_iter_count(iter);
264  
265  	memset(&bh, 0, sizeof(bh));
266  	bh.b_bdev = inode->i_sb->s_bdev;
267  
268  	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
269  		inode_lock(inode);
270  
271  	/* Protects against truncate */
272  	if (!(flags & DIO_SKIP_DIO_COUNT))
273  		inode_dio_begin(inode);
274  
275  	retval = dax_io(inode, iter, pos, end, get_block, &bh);
276  
277  	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
278  		inode_unlock(inode);
279  
280  	if (end_io) {
281  		int err;
282  
283  		err = end_io(iocb, pos, retval, bh.b_private);
284  		if (err)
285  			retval = err;
286  	}
287  
288  	if (!(flags & DIO_SKIP_DIO_COUNT))
289  		inode_dio_end(inode);
290  	return retval;
291  }
292  EXPORT_SYMBOL_GPL(dax_do_io);
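
/*
 * A minimal sketch (not part of this file) of how a filesystem might route
 * its ->direct_IO() method to dax_do_io().  example_get_block stands in for
 * the filesystem's real get_block_t callback.
 */
static ssize_t example_dax_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	/* DIO_LOCKING: dax_do_io() takes i_mutex itself for reads. */
	return dax_do_io(iocb, inode, iter, example_get_block, NULL,
			 DIO_LOCKING);
}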
293  
294  /*
295   * DAX radix tree locking
296   */
297  struct exceptional_entry_key {
298  	struct address_space *mapping;
299  	unsigned long index;
300  };
301  
302  struct wait_exceptional_entry_queue {
303  	wait_queue_t wait;
304  	struct exceptional_entry_key key;
305  };
306  
307  static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
308  				       int sync, void *keyp)
309  {
310  	struct exceptional_entry_key *key = keyp;
311  	struct wait_exceptional_entry_queue *ewait =
312  		container_of(wait, struct wait_exceptional_entry_queue, wait);
313  
314  	if (key->mapping != ewait->key.mapping ||
315  	    key->index != ewait->key.index)
316  		return 0;
317  	return autoremove_wake_function(wait, mode, sync, NULL);
318  }
319  
320  /*
321   * Check whether the given slot is locked. The function must be called with
322   * mapping->tree_lock held
323   */
324  static inline int slot_locked(struct address_space *mapping, void **slot)
325  {
326  	unsigned long entry = (unsigned long)
327  		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
328  	return entry & RADIX_DAX_ENTRY_LOCK;
329  }
330  
331  /*
332   * Mark the given slot as locked. The function must be called with
333   * mapping->tree_lock held
334   */
335  static inline void *lock_slot(struct address_space *mapping, void **slot)
336  {
337  	unsigned long entry = (unsigned long)
338  		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
339  
340  	entry |= RADIX_DAX_ENTRY_LOCK;
341  	radix_tree_replace_slot(slot, (void *)entry);
342  	return (void *)entry;
343  }
344  
345  /*
346   * Mark the given slot as unlocked. The function must be called with
347   * mapping->tree_lock held
348   */
349  static inline void *unlock_slot(struct address_space *mapping, void **slot)
350  {
351  	unsigned long entry = (unsigned long)
352  		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
353  
354  	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
355  	radix_tree_replace_slot(slot, (void *)entry);
356  	return (void *)entry;
357  }
358  
359  /*
360   * Look up the entry in the radix tree, wait for it to become unlocked if it
361   * is an exceptional entry, and return it. The caller must call
362   * put_unlocked_mapping_entry() if it decides not to lock the entry, or
363   * put_locked_mapping_entry() once it has locked the entry and wants to
364   * unlock it.
365   *
366   * The function must be called with mapping->tree_lock held.
367   */
368  static void *get_unlocked_mapping_entry(struct address_space *mapping,
369  					pgoff_t index, void ***slotp)
370  {
371  	void *ret, **slot;
372  	struct wait_exceptional_entry_queue ewait;
373  	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
374  
375  	init_wait(&ewait.wait);
376  	ewait.wait.func = wake_exceptional_entry_func;
377  	ewait.key.mapping = mapping;
378  	ewait.key.index = index;
379  
380  	for (;;) {
381  		ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
382  					  &slot);
383  		if (!ret || !radix_tree_exceptional_entry(ret) ||
384  		    !slot_locked(mapping, slot)) {
385  			if (slotp)
386  				*slotp = slot;
387  			return ret;
388  		}
389  		prepare_to_wait_exclusive(wq, &ewait.wait,
390  					  TASK_UNINTERRUPTIBLE);
391  		spin_unlock_irq(&mapping->tree_lock);
392  		schedule();
393  		finish_wait(wq, &ewait.wait);
394  		spin_lock_irq(&mapping->tree_lock);
395  	}
396  }
397  
398  /*
399   * Find the radix tree entry at the given index. If it points to a page, return
400   * with the page locked. If it points to an exceptional entry, return with the
401   * radix tree entry locked. If the radix tree doesn't contain the given index,
402   * create an empty exceptional entry for the index and return with it locked.
403   *
404   * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
405   * persistent memory the benefit is doubtful. We can add that later if we can
406   * show it helps.
407   */
408  static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
409  {
410  	void *ret, **slot;
411  
412  restart:
413  	spin_lock_irq(&mapping->tree_lock);
414  	ret = get_unlocked_mapping_entry(mapping, index, &slot);
415  	/* No entry for given index? Make sure radix tree is big enough. */
416  	if (!ret) {
417  		int err;
418  
419  		spin_unlock_irq(&mapping->tree_lock);
420  		err = radix_tree_preload(
421  				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
422  		if (err)
423  			return ERR_PTR(err);
424  		ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
425  			       RADIX_DAX_ENTRY_LOCK);
426  		spin_lock_irq(&mapping->tree_lock);
427  		err = radix_tree_insert(&mapping->page_tree, index, ret);
428  		radix_tree_preload_end();
429  		if (err) {
430  			spin_unlock_irq(&mapping->tree_lock);
431  			/* Someone already created the entry? */
432  			if (err == -EEXIST)
433  				goto restart;
434  			return ERR_PTR(err);
435  		}
436  		/* Good, we have inserted empty locked entry into the tree. */
437  		mapping->nrexceptional++;
438  		spin_unlock_irq(&mapping->tree_lock);
439  		return ret;
440  	}
441  	/* Normal page in radix tree? */
442  	if (!radix_tree_exceptional_entry(ret)) {
443  		struct page *page = ret;
444  
445  		get_page(page);
446  		spin_unlock_irq(&mapping->tree_lock);
447  		lock_page(page);
448  		/* Page got truncated? Retry... */
449  		if (unlikely(page->mapping != mapping)) {
450  			unlock_page(page);
451  			put_page(page);
452  			goto restart;
453  		}
454  		return page;
455  	}
456  	ret = lock_slot(mapping, slot);
457  	spin_unlock_irq(&mapping->tree_lock);
458  	return ret;
459  }
460  
461  void dax_wake_mapping_entry_waiter(struct address_space *mapping,
462  				   pgoff_t index, bool wake_all)
463  {
464  	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
465  
466  	/*
467  	 * Checking for locked entry and prepare_to_wait_exclusive() happens
468  	 * under mapping->tree_lock, ditto for entry handling in our callers.
469  	 * So at this point all tasks that could have seen our entry locked
470  	 * must be in the waitqueue and the following check will see them.
471  	 */
472  	if (waitqueue_active(wq)) {
473  		struct exceptional_entry_key key;
474  
475  		key.mapping = mapping;
476  		key.index = index;
477  		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
478  	}
479  }
480  
481  void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
482  {
483  	void *ret, **slot;
484  
485  	spin_lock_irq(&mapping->tree_lock);
486  	ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
487  	if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
488  			 !slot_locked(mapping, slot))) {
489  		spin_unlock_irq(&mapping->tree_lock);
490  		return;
491  	}
492  	unlock_slot(mapping, slot);
493  	spin_unlock_irq(&mapping->tree_lock);
494  	dax_wake_mapping_entry_waiter(mapping, index, false);
495  }
496  
497  static void put_locked_mapping_entry(struct address_space *mapping,
498  				     pgoff_t index, void *entry)
499  {
500  	if (!radix_tree_exceptional_entry(entry)) {
501  		unlock_page(entry);
502  		put_page(entry);
503  	} else {
504  		dax_unlock_mapping_entry(mapping, index);
505  	}
506  }
507  
508  /*
509   * Called when we are done with radix tree entry we looked up via
510   * get_unlocked_mapping_entry() and which we didn't lock in the end.
511   */
512  static void put_unlocked_mapping_entry(struct address_space *mapping,
513  				       pgoff_t index, void *entry)
514  {
515  	if (!radix_tree_exceptional_entry(entry))
516  		return;
517  
518  	/* We have to wake up next waiter for the radix tree entry lock */
519  	dax_wake_mapping_entry_waiter(mapping, index, false);
520  }
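
/*
 * Summary of the entry-lock protocol implemented above: grab_mapping_entry()
 * returns an entry that is locked (either a locked hole page or an
 * exceptional entry with RADIX_DAX_ENTRY_LOCK set) and must be released with
 * put_locked_mapping_entry().  get_unlocked_mapping_entry() only waits for
 * the lock to clear; if the caller then decides not to take the lock, it must
 * call put_unlocked_mapping_entry() so that the next waiter is woken.
 */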
521  
522  /*
523   * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
524   * entry to get unlocked before deleting it.
525   */
526  int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
527  {
528  	void *entry;
529  
530  	spin_lock_irq(&mapping->tree_lock);
531  	entry = get_unlocked_mapping_entry(mapping, index, NULL);
532  	/*
533  	 * This gets called from the truncate / punch_hole path. As such, the
534  	 * caller must hold locks protecting against concurrent modifications of
535  	 * the radix tree (usually the fs-private i_mmap_sem for writing). Since
536  	 * the caller has seen an exceptional entry for this index, we had better
537  	 * find it at that index as well...
538  	 */
539  	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
540  		spin_unlock_irq(&mapping->tree_lock);
541  		return 0;
542  	}
543  	radix_tree_delete(&mapping->page_tree, index);
544  	mapping->nrexceptional--;
545  	spin_unlock_irq(&mapping->tree_lock);
546  	dax_wake_mapping_entry_waiter(mapping, index, true);
547  
548  	return 1;
549  }
550  
551  /*
552   * The user has performed a load from a hole in the file.  Allocating
553   * a new page in the file would cause excessive storage usage for
554   * workloads with sparse files.  We allocate a page cache page instead.
555   * We'll kick it out of the page cache if it's ever written to,
556   * otherwise it will simply fall out of the page cache under memory
557   * pressure without ever having been dirtied.
558   */
559  static int dax_load_hole(struct address_space *mapping, void *entry,
560  			 struct vm_fault *vmf)
561  {
562  	struct page *page;
563  
564  	/* Hole page already exists? Return it...  */
565  	if (!radix_tree_exceptional_entry(entry)) {
566  		vmf->page = entry;
567  		return VM_FAULT_LOCKED;
568  	}
569  
570  	/* This will replace locked radix tree entry with a hole page */
571  	page = find_or_create_page(mapping, vmf->pgoff,
572  				   vmf->gfp_mask | __GFP_ZERO);
573  	if (!page) {
574  		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
575  		return VM_FAULT_OOM;
576  	}
577  	vmf->page = page;
578  	return VM_FAULT_LOCKED;
579  }
580  
581  static int copy_user_bh(struct page *to, struct inode *inode,
582  		struct buffer_head *bh, unsigned long vaddr)
583  {
584  	struct blk_dax_ctl dax = {
585  		.sector = to_sector(bh, inode),
586  		.size = bh->b_size,
587  	};
588  	struct block_device *bdev = bh->b_bdev;
589  	void *vto;
590  
591  	if (dax_map_atomic(bdev, &dax) < 0)
592  		return PTR_ERR(dax.addr);
593  	vto = kmap_atomic(to);
594  	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
595  	kunmap_atomic(vto);
596  	dax_unmap_atomic(bdev, &dax);
597  	return 0;
598  }
599  
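/* Round a page index down to the first index covered by its PMD entry. */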
600  #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
601  
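/*
 * Install a sector-based exceptional entry for @vmf->pgoff, either by
 * replacing a hole page (which is unmapped and deleted from the page cache)
 * or by overwriting an existing exceptional entry in place.  The new entry
 * is returned locked; on failure an ERR_PTR() is returned instead.
 */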
602  static void *dax_insert_mapping_entry(struct address_space *mapping,
603  				      struct vm_fault *vmf,
604  				      void *entry, sector_t sector)
605  {
606  	struct radix_tree_root *page_tree = &mapping->page_tree;
607  	int error = 0;
608  	bool hole_fill = false;
609  	void *new_entry;
610  	pgoff_t index = vmf->pgoff;
611  
612  	if (vmf->flags & FAULT_FLAG_WRITE)
613  		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
614  
615  	/* Replacing hole page with block mapping? */
616  	if (!radix_tree_exceptional_entry(entry)) {
617  		hole_fill = true;
618  		/*
619  		 * Unmap the page now before we remove it from page cache below.
620  		 * The page is locked so it cannot be faulted in again.
621  		 */
622  		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
623  				    PAGE_SIZE, 0);
624  		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
625  		if (error)
626  			return ERR_PTR(error);
627  	}
628  
629  	spin_lock_irq(&mapping->tree_lock);
630  	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
631  		       RADIX_DAX_ENTRY_LOCK);
632  	if (hole_fill) {
633  		__delete_from_page_cache(entry, NULL);
634  		/* Drop pagecache reference */
635  		put_page(entry);
636  		error = radix_tree_insert(page_tree, index, new_entry);
637  		if (error) {
638  			new_entry = ERR_PTR(error);
639  			goto unlock;
640  		}
641  		mapping->nrexceptional++;
642  	} else {
643  		void **slot;
644  		void *ret;
645  
646  		ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
647  		WARN_ON_ONCE(ret != entry);
648  		radix_tree_replace_slot(slot, new_entry);
649  	}
650  	if (vmf->flags & FAULT_FLAG_WRITE)
651  		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
652   unlock:
653  	spin_unlock_irq(&mapping->tree_lock);
654  	if (hole_fill) {
655  		radix_tree_preload_end();
656  		/*
657  		 * We don't need the hole page anymore; it has been replaced
658  		 * with a locked radix tree entry now.
659  		 */
660  		if (mapping->a_ops->freepage)
661  			mapping->a_ops->freepage(entry);
662  		unlock_page(entry);
663  		put_page(entry);
664  	}
665  	return new_entry;
666  }
667  
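/*
 * Write back a single dirty radix tree entry: revalidate it under
 * tree_lock, map the backing device range and flush the CPU cache over the
 * whole PTE- or PMD-sized extent with wb_cache_pmem(), then clear
 * PAGECACHE_TAG_TOWRITE.
 */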
668  static int dax_writeback_one(struct block_device *bdev,
669  		struct address_space *mapping, pgoff_t index, void *entry)
670  {
671  	struct radix_tree_root *page_tree = &mapping->page_tree;
672  	int type = RADIX_DAX_TYPE(entry);
673  	struct radix_tree_node *node;
674  	struct blk_dax_ctl dax;
675  	void **slot;
676  	int ret = 0;
677  
678  	spin_lock_irq(&mapping->tree_lock);
679  	/*
680  	 * Regular page slots are stabilized by the page lock even
681  	 * without the tree itself locked.  These unlocked entries
682  	 * need verification under the tree lock.
683  	 */
684  	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
685  		goto unlock;
686  	if (*slot != entry)
687  		goto unlock;
688  
689  	/* another fsync thread may have already written back this entry */
690  	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
691  		goto unlock;
692  
693  	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
694  		ret = -EIO;
695  		goto unlock;
696  	}
697  
698  	dax.sector = RADIX_DAX_SECTOR(entry);
699  	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
700  	spin_unlock_irq(&mapping->tree_lock);
701  
702  	/*
703  	 * We cannot hold tree_lock while calling dax_map_atomic() because it
704  	 * eventually calls cond_resched().
705  	 */
706  	ret = dax_map_atomic(bdev, &dax);
707  	if (ret < 0)
708  		return ret;
709  
710  	if (WARN_ON_ONCE(ret < dax.size)) {
711  		ret = -EIO;
712  		goto unmap;
713  	}
714  
715  	wb_cache_pmem(dax.addr, dax.size);
716  
717  	spin_lock_irq(&mapping->tree_lock);
718  	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
719  	spin_unlock_irq(&mapping->tree_lock);
720   unmap:
721  	dax_unmap_atomic(bdev, &dax);
722  	return ret;
723  
724   unlock:
725  	spin_unlock_irq(&mapping->tree_lock);
726  	return ret;
727  }
728  
729  /*
730   * Flush the mapping to the persistent domain within the byte range of [start,
731   * end]. This is required by data integrity operations to ensure file data is
732   * on persistent storage prior to completion of the operation.
733   */
734  int dax_writeback_mapping_range(struct address_space *mapping,
735  		struct block_device *bdev, struct writeback_control *wbc)
736  {
737  	struct inode *inode = mapping->host;
738  	pgoff_t start_index, end_index, pmd_index;
739  	pgoff_t indices[PAGEVEC_SIZE];
740  	struct pagevec pvec;
741  	bool done = false;
742  	int i, ret = 0;
743  	void *entry;
744  
745  	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
746  		return -EIO;
747  
748  	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
749  		return 0;
750  
751  	start_index = wbc->range_start >> PAGE_SHIFT;
752  	end_index = wbc->range_end >> PAGE_SHIFT;
753  	pmd_index = DAX_PMD_INDEX(start_index);
754  
755  	rcu_read_lock();
756  	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
757  	rcu_read_unlock();
758  
759  	/* see if the start of our range is covered by a PMD entry */
760  	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
761  		start_index = pmd_index;
762  
763  	tag_pages_for_writeback(mapping, start_index, end_index);
764  
765  	pagevec_init(&pvec, 0);
766  	while (!done) {
767  		pvec.nr = find_get_entries_tag(mapping, start_index,
768  				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
769  				pvec.pages, indices);
770  
771  		if (pvec.nr == 0)
772  			break;
773  
774  		for (i = 0; i < pvec.nr; i++) {
775  			if (indices[i] > end_index) {
776  				done = true;
777  				break;
778  			}
779  
780  			ret = dax_writeback_one(bdev, mapping, indices[i],
781  					pvec.pages[i]);
782  			if (ret < 0)
783  				return ret;
784  		}
785  	}
786  	wmb_pmem();
787  	return 0;
788  }
789  EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
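
/*
 * A minimal sketch (not part of this file) of how a filesystem's
 * ->writepages() might hand a DAX mapping to dax_writeback_mapping_range();
 * the "example_" name is illustrative.
 */
static int example_dax_writepages(struct address_space *mapping,
				  struct writeback_control *wbc)
{
	if (dax_mapping(mapping))
		return dax_writeback_mapping_range(mapping,
				mapping->host->i_sb->s_bdev, wbc);
	/* non-DAX mappings would go through the regular page cache path */
	return 0;
}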
790  
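/*
 * Map the block described by @bh to find its pfn, record the sector in a
 * DAX radix tree entry via dax_insert_mapping_entry(), and insert the pfn
 * into the process page tables with vm_insert_mixed().
 */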
791  static int dax_insert_mapping(struct address_space *mapping,
792  			struct buffer_head *bh, void **entryp,
793  			struct vm_area_struct *vma, struct vm_fault *vmf)
794  {
795  	unsigned long vaddr = (unsigned long)vmf->virtual_address;
796  	struct block_device *bdev = bh->b_bdev;
797  	struct blk_dax_ctl dax = {
798  		.sector = to_sector(bh, mapping->host),
799  		.size = bh->b_size,
800  	};
801  	void *ret;
802  	void *entry = *entryp;
803  
804  	if (dax_map_atomic(bdev, &dax) < 0)
805  		return PTR_ERR(dax.addr);
806  	dax_unmap_atomic(bdev, &dax);
807  
808  	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
809  	if (IS_ERR(ret))
810  		return PTR_ERR(ret);
811  	*entryp = ret;
812  
813  	return vm_insert_mixed(vma, vaddr, dax.pfn);
814  }
815  
816  /**
817   * __dax_fault - handle a page fault on a DAX file
818   * @vma: The virtual memory area where the fault occurred
819   * @vmf: The description of the fault
820   * @get_block: The filesystem method used to translate file offsets to blocks
821   *
822   * When a page fault occurs, filesystems may call this helper in their
823   * fault handler for DAX files. __dax_fault() assumes the caller has done all
824   * the necessary locking for the page fault to proceed successfully.
825   */
826  int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
827  			get_block_t get_block)
828  {
829  	struct file *file = vma->vm_file;
830  	struct address_space *mapping = file->f_mapping;
831  	struct inode *inode = mapping->host;
832  	void *entry;
833  	struct buffer_head bh;
834  	unsigned long vaddr = (unsigned long)vmf->virtual_address;
835  	unsigned blkbits = inode->i_blkbits;
836  	sector_t block;
837  	pgoff_t size;
838  	int error;
839  	int major = 0;
840  
841  	/*
842  	 * Check whether offset isn't beyond end of file now. Caller is supposed
843  	 * to hold locks serializing us with truncate / punch hole so this is
844  	 * a reliable test.
845  	 */
846  	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
847  	if (vmf->pgoff >= size)
848  		return VM_FAULT_SIGBUS;
849  
850  	memset(&bh, 0, sizeof(bh));
851  	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
852  	bh.b_bdev = inode->i_sb->s_bdev;
853  	bh.b_size = PAGE_SIZE;
854  
855  	entry = grab_mapping_entry(mapping, vmf->pgoff);
856  	if (IS_ERR(entry)) {
857  		error = PTR_ERR(entry);
858  		goto out;
859  	}
860  
861  	error = get_block(inode, block, &bh, 0);
862  	if (!error && (bh.b_size < PAGE_SIZE))
863  		error = -EIO;		/* fs corruption? */
864  	if (error)
865  		goto unlock_entry;
866  
867  	if (vmf->cow_page) {
868  		struct page *new_page = vmf->cow_page;
869  		if (buffer_written(&bh))
870  			error = copy_user_bh(new_page, inode, &bh, vaddr);
871  		else
872  			clear_user_highpage(new_page, vaddr);
873  		if (error)
874  			goto unlock_entry;
875  		if (!radix_tree_exceptional_entry(entry)) {
876  			vmf->page = entry;
877  			return VM_FAULT_LOCKED;
878  		}
879  		vmf->entry = entry;
880  		return VM_FAULT_DAX_LOCKED;
881  	}
882  
883  	if (!buffer_mapped(&bh)) {
884  		if (vmf->flags & FAULT_FLAG_WRITE) {
885  			error = get_block(inode, block, &bh, 1);
886  			count_vm_event(PGMAJFAULT);
887  			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
888  			major = VM_FAULT_MAJOR;
889  			if (!error && (bh.b_size < PAGE_SIZE))
890  				error = -EIO;
891  			if (error)
892  				goto unlock_entry;
893  		} else {
894  			return dax_load_hole(mapping, entry, vmf);
895  		}
896  	}
897  
898  	/* Filesystem should not return unwritten buffers to us! */
899  	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
900  	error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
901   unlock_entry:
902  	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
903   out:
904  	if (error == -ENOMEM)
905  		return VM_FAULT_OOM | major;
906  	/* -EBUSY is fine, somebody else faulted on the same PTE */
907  	if ((error < 0) && (error != -EBUSY))
908  		return VM_FAULT_SIGBUS | major;
909  	return VM_FAULT_NOPAGE | major;
910  }
911  EXPORT_SYMBOL(__dax_fault);
912  
913  /**
914   * dax_fault - handle a page fault on a DAX file
915   * @vma: The virtual memory area where the fault occurred
916   * @vmf: The description of the fault
917   * @get_block: The filesystem method used to translate file offsets to blocks
918   *
919   * When a page fault occurs, filesystems may call this helper in their
920   * fault handler for DAX files.
921   */
922  int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
923  	      get_block_t get_block)
924  {
925  	int result;
926  	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
927  
928  	if (vmf->flags & FAULT_FLAG_WRITE) {
929  		sb_start_pagefault(sb);
930  		file_update_time(vma->vm_file);
931  	}
932  	result = __dax_fault(vma, vmf, get_block);
933  	if (vmf->flags & FAULT_FLAG_WRITE)
934  		sb_end_pagefault(sb);
935  
936  	return result;
937  }
938  EXPORT_SYMBOL_GPL(dax_fault);
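
/*
 * A minimal sketch (not part of this file) of a vm_operations_struct ->fault
 * handler that forwards to dax_fault(); example_get_block stands in for the
 * filesystem's real block-mapping callback.
 */
static int example_dax_vm_fault(struct vm_area_struct *vma,
				struct vm_fault *vmf)
{
	return dax_fault(vma, vmf, example_get_block);
}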
939  
940  #if defined(CONFIG_TRANSPARENT_HUGEPAGE)
941  /*
942   * The 'colour' (i.e. the low bits) within a PMD of a page offset.  This comes up
943   * more often than one might expect in the below function.
944   */
945  #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
946  
947  static void __dax_dbg(struct buffer_head *bh, unsigned long address,
948  		const char *reason, const char *fn)
949  {
950  	if (bh) {
951  		char bname[BDEVNAME_SIZE];
952  		bdevname(bh->b_bdev, bname);
953  		pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
954  			"length %zd fallback: %s\n", fn, current->comm,
955  			address, bname, bh->b_state, (u64)bh->b_blocknr,
956  			bh->b_size, reason);
957  	} else {
958  		pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
959  			current->comm, address, reason);
960  	}
961  }
962  
963  #define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")
964  
965  int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
966  		pmd_t *pmd, unsigned int flags, get_block_t get_block)
967  {
968  	struct file *file = vma->vm_file;
969  	struct address_space *mapping = file->f_mapping;
970  	struct inode *inode = mapping->host;
971  	struct buffer_head bh;
972  	unsigned blkbits = inode->i_blkbits;
973  	unsigned long pmd_addr = address & PMD_MASK;
974  	bool write = flags & FAULT_FLAG_WRITE;
975  	struct block_device *bdev;
976  	pgoff_t size, pgoff;
977  	sector_t block;
978  	int result = 0;
979  	bool alloc = false;
980  
981  	/* dax pmd mappings require pfn_t_devmap() */
982  	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
983  		return VM_FAULT_FALLBACK;
984  
985  	/* Fall back to PTEs if we're going to COW */
986  	if (write && !(vma->vm_flags & VM_SHARED)) {
987  		split_huge_pmd(vma, pmd, address);
988  		dax_pmd_dbg(NULL, address, "cow write");
989  		return VM_FAULT_FALLBACK;
990  	}
991  	/* If the PMD would extend outside the VMA */
992  	if (pmd_addr < vma->vm_start) {
993  		dax_pmd_dbg(NULL, address, "vma start unaligned");
994  		return VM_FAULT_FALLBACK;
995  	}
996  	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
997  		dax_pmd_dbg(NULL, address, "vma end unaligned");
998  		return VM_FAULT_FALLBACK;
999  	}
1000  
1001  	pgoff = linear_page_index(vma, pmd_addr);
1002  	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
1003  	if (pgoff >= size)
1004  		return VM_FAULT_SIGBUS;
1005  	/* If the PMD would cover blocks out of the file */
1006  	if ((pgoff | PG_PMD_COLOUR) >= size) {
1007  		dax_pmd_dbg(NULL, address,
1008  				"offset + huge page size > file size");
1009  		return VM_FAULT_FALLBACK;
1010  	}
1011  
1012  	memset(&bh, 0, sizeof(bh));
1013  	bh.b_bdev = inode->i_sb->s_bdev;
1014  	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
1015  
1016  	bh.b_size = PMD_SIZE;
1017  
1018  	if (get_block(inode, block, &bh, 0) != 0)
1019  		return VM_FAULT_SIGBUS;
1020  
1021  	if (!buffer_mapped(&bh) && write) {
1022  		if (get_block(inode, block, &bh, 1) != 0)
1023  			return VM_FAULT_SIGBUS;
1024  		alloc = true;
1025  		WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
1026  	}
1027  
1028  	bdev = bh.b_bdev;
1029  
1030  	/*
1031  	 * If the filesystem isn't willing to tell us the length of a hole,
1032  	 * just fall back to PTEs.  Calling get_block 512 times in a loop
1033  	 * would be silly.
1034  	 */
1035  	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
1036  		dax_pmd_dbg(&bh, address, "allocated block too small");
1037  		return VM_FAULT_FALLBACK;
1038  	}
1039  
1040  	/*
1041  	 * If we allocated new storage, make sure no process has any
1042  	 * zero pages covering this hole
1043  	 */
1044  	if (alloc) {
1045  		loff_t lstart = pgoff << PAGE_SHIFT;
1046  		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
1047  
1048  		truncate_pagecache_range(inode, lstart, lend);
1049  	}
1050  
1051  	if (!write && !buffer_mapped(&bh)) {
1052  		spinlock_t *ptl;
1053  		pmd_t entry;
1054  		struct page *zero_page = get_huge_zero_page();
1055  
1056  		if (unlikely(!zero_page)) {
1057  			dax_pmd_dbg(&bh, address, "no zero page");
1058  			goto fallback;
1059  		}
1060  
1061  		ptl = pmd_lock(vma->vm_mm, pmd);
1062  		if (!pmd_none(*pmd)) {
1063  			spin_unlock(ptl);
1064  			dax_pmd_dbg(&bh, address, "pmd already present");
1065  			goto fallback;
1066  		}
1067  
1068  		dev_dbg(part_to_dev(bdev->bd_part),
1069  				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
1070  				__func__, current->comm, address,
1071  				(unsigned long long) to_sector(&bh, inode));
1072  
1073  		entry = mk_pmd(zero_page, vma->vm_page_prot);
1074  		entry = pmd_mkhuge(entry);
1075  		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
1076  		result = VM_FAULT_NOPAGE;
1077  		spin_unlock(ptl);
1078  	} else {
1079  		struct blk_dax_ctl dax = {
1080  			.sector = to_sector(&bh, inode),
1081  			.size = PMD_SIZE,
1082  		};
1083  		long length = dax_map_atomic(bdev, &dax);
1084  
1085  		if (length < 0) {
1086  			dax_pmd_dbg(&bh, address, "dax-error fallback");
1087  			goto fallback;
1088  		}
1089  		if (length < PMD_SIZE) {
1090  			dax_pmd_dbg(&bh, address, "dax-length too small");
1091  			dax_unmap_atomic(bdev, &dax);
1092  			goto fallback;
1093  		}
1094  		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
1095  			dax_pmd_dbg(&bh, address, "pfn unaligned");
1096  			dax_unmap_atomic(bdev, &dax);
1097  			goto fallback;
1098  		}
1099  
1100  		if (!pfn_t_devmap(dax.pfn)) {
1101  			dax_unmap_atomic(bdev, &dax);
1102  			dax_pmd_dbg(&bh, address, "pfn not in memmap");
1103  			goto fallback;
1104  		}
1105  		dax_unmap_atomic(bdev, &dax);
1106  
1107  		/*
1108  		 * For PTE faults we insert a radix tree entry for reads, and
1109  		 * leave it clean.  Then on the first write we dirty the radix
1110  		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
1111  		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
1112  		 * call into get_block() to translate the pgoff to a sector in
1113  		 * order to be able to create a new radix tree entry.
1114  		 *
1115  		 * The PMD path doesn't have an equivalent to
1116  		 * dax_pfn_mkwrite(), though, so for a read followed by a
1117  		 * write we traverse all the way through __dax_pmd_fault()
1118  		 * twice.  This means we can just skip inserting a radix tree
1119  		 * entry completely on the initial read and just wait until
1120  		 * the write to insert a dirty entry.
1121  		 */
1122  		if (write) {
1123  			/*
1124  			 * We should insert radix-tree entry and dirty it here.
1125  			 * For now this is broken...
1126  			 */
1127  		}
1128  
1129  		dev_dbg(part_to_dev(bdev->bd_part),
1130  				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
1131  				__func__, current->comm, address,
1132  				pfn_t_to_pfn(dax.pfn),
1133  				(unsigned long long) dax.sector);
1134  		result |= vmf_insert_pfn_pmd(vma, address, pmd,
1135  				dax.pfn, write);
1136  	}
1137  
1138   out:
1139  	return result;
1140  
1141   fallback:
1142  	count_vm_event(THP_FAULT_FALLBACK);
1143  	result = VM_FAULT_FALLBACK;
1144  	goto out;
1145  }
1146  EXPORT_SYMBOL_GPL(__dax_pmd_fault);
1147  
1148  /**
1149   * dax_pmd_fault - handle a PMD fault on a DAX file
1150   * @vma: The virtual memory area where the fault occurred
1151   * @address, @pmd, @flags: The faulting address, PMD slot, and fault flags
1152   * @get_block: The filesystem method used to translate file offsets to blocks
1153   *
1154   * When a page fault occurs, filesystems may call this helper in their
1155   * pmd_fault handler for DAX files.
1156   */
1157  int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
1158  			pmd_t *pmd, unsigned int flags, get_block_t get_block)
1159  {
1160  	int result;
1161  	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
1162  
1163  	if (flags & FAULT_FLAG_WRITE) {
1164  		sb_start_pagefault(sb);
1165  		file_update_time(vma->vm_file);
1166  	}
1167  	result = __dax_pmd_fault(vma, address, pmd, flags, get_block);
1168  	if (flags & FAULT_FLAG_WRITE)
1169  		sb_end_pagefault(sb);
1170  
1171  	return result;
1172  }
1173  EXPORT_SYMBOL_GPL(dax_pmd_fault);
1174  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1175  
1176  /**
1177   * dax_pfn_mkwrite - handle first write to DAX page
1178   * @vma: The virtual memory area where the fault occurred
1179   * @vmf: The description of the fault
1180   */
1181  int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1182  {
1183  	struct file *file = vma->vm_file;
1184  	struct address_space *mapping = file->f_mapping;
1185  	void *entry;
1186  	pgoff_t index = vmf->pgoff;
1187  
1188  	spin_lock_irq(&mapping->tree_lock);
1189  	entry = get_unlocked_mapping_entry(mapping, index, NULL);
1190  	if (!entry || !radix_tree_exceptional_entry(entry))
1191  		goto out;
1192  	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
1193  	put_unlocked_mapping_entry(mapping, index, entry);
1194  out:
1195  	spin_unlock_irq(&mapping->tree_lock);
1196  	return VM_FAULT_NOPAGE;
1197  }
1198  EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
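
/*
 * A minimal sketch (not part of this file) of how the DAX fault helpers are
 * typically wired up in a filesystem's vm_operations_struct; the handler
 * names are illustrative wrappers around dax_fault(), dax_pmd_fault() and
 * dax_pfn_mkwrite().
 *
 *	static const struct vm_operations_struct example_dax_vm_ops = {
 *		.fault		= example_dax_vm_fault,
 *		.pmd_fault	= example_dax_vm_pmd_fault,
 *		.page_mkwrite	= example_dax_vm_fault,
 *		.pfn_mkwrite	= example_dax_pfn_mkwrite,
 *	};
 */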
1199  
1200  static bool dax_range_is_aligned(struct block_device *bdev,
1201  				 unsigned int offset, unsigned int length)
1202  {
1203  	unsigned short sector_size = bdev_logical_block_size(bdev);
1204  
1205  	if (!IS_ALIGNED(offset, sector_size))
1206  		return false;
1207  	if (!IS_ALIGNED(length, sector_size))
1208  		return false;
1209  
1210  	return true;
1211  }
1212  
1213  int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
1214  		unsigned int offset, unsigned int length)
1215  {
1216  	struct blk_dax_ctl dax = {
1217  		.sector		= sector,
1218  		.size		= PAGE_SIZE,
1219  	};
1220  
1221  	if (dax_range_is_aligned(bdev, offset, length)) {
1222  		sector_t start_sector = dax.sector + (offset >> 9);
1223  
1224  		return blkdev_issue_zeroout(bdev, start_sector,
1225  				length >> 9, GFP_NOFS, true);
1226  	} else {
1227  		if (dax_map_atomic(bdev, &dax) < 0)
1228  			return PTR_ERR(dax.addr);
1229  		clear_pmem(dax.addr + offset, length);
1230  		wmb_pmem();
1231  		dax_unmap_atomic(bdev, &dax);
1232  	}
1233  	return 0;
1234  }
1235  EXPORT_SYMBOL_GPL(__dax_zero_page_range);
1236  
1237  /**
1238   * dax_zero_page_range - zero a range within a page of a DAX file
1239   * @inode: The file being truncated
1240   * @from: The file offset that is being truncated to
1241   * @length: The number of bytes to zero
1242   * @get_block: The filesystem method used to translate file offsets to blocks
1243   *
1244   * This function can be called by a filesystem when it is zeroing part of a
1245   * page in a DAX file.  This is intended for hole-punch operations.  If
1246   * you are truncating a file, the helper function dax_truncate_page() may be
1247   * more convenient.
1248   */
1249  int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
1250  							get_block_t get_block)
1251  {
1252  	struct buffer_head bh;
1253  	pgoff_t index = from >> PAGE_SHIFT;
1254  	unsigned offset = from & (PAGE_SIZE-1);
1255  	int err;
1256  
1257  	/* Block boundary? Nothing to do */
1258  	if (!length)
1259  		return 0;
1260  	BUG_ON((offset + length) > PAGE_SIZE);
1261  
1262  	memset(&bh, 0, sizeof(bh));
1263  	bh.b_bdev = inode->i_sb->s_bdev;
1264  	bh.b_size = PAGE_SIZE;
1265  	err = get_block(inode, index, &bh, 0);
1266  	if (err < 0 || !buffer_written(&bh))
1267  		return err;
1268  
1269  	return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
1270  			offset, length);
1271  }
1272  EXPORT_SYMBOL_GPL(dax_zero_page_range);
1273  
1274  /**
1275   * dax_truncate_page - handle a partial page being truncated in a DAX file
1276   * @inode: The file being truncated
1277   * @from: The file offset that is being truncated to
1278   * @get_block: The filesystem method used to translate file offsets to blocks
1279   *
1280   * Similar to block_truncate_page(), this function can be called by a
1281   * filesystem when it is truncating a DAX file to handle the partial page.
1282   */
1283  int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
1284  {
1285  	unsigned length = PAGE_ALIGN(from) - from;
1286  	return dax_zero_page_range(inode, from, length, get_block);
1287  }
1288  EXPORT_SYMBOL_GPL(dax_truncate_page);
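
/*
 * Worked example (assuming 4 KiB pages): truncating to from = 10752 gives
 * length = PAGE_ALIGN(10752) - 10752 = 12288 - 10752 = 1536, so
 * dax_zero_page_range() zeroes bytes 2560..4095 of the page at index 2,
 * i.e. the partial tail of the new last page.
 */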
1289