/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/iomap.h>
#include "internal.h"

/*
 * We use the lowest available bit in the exceptional entry for locking and
 * the other two bits to determine the entry type.  In total, three special
 * bits.
 */
#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
#define RADIX_DAX_PTE	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
#define RADIX_DAX_PMD	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
#define RADIX_DAX_TYPE_MASK	(RADIX_DAX_PTE | RADIX_DAX_PMD)
#define RADIX_DAX_TYPE(entry)	((unsigned long)entry & RADIX_DAX_TYPE_MASK)
#define RADIX_DAX_SECTOR(entry)	(((unsigned long)entry >> RADIX_DAX_SHIFT))
#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
		RADIX_TREE_EXCEPTIONAL_ENTRY))
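
/*
 * Worked example (assuming RADIX_TREE_EXCEPTIONAL_SHIFT == 2, so that
 * RADIX_DAX_SHIFT == 5): a PTE entry for sector 2 encodes as
 *
 *	RADIX_DAX_ENTRY(2, false)
 *		== (2 << 5) | RADIX_DAX_PTE | RADIX_TREE_EXCEPTIONAL_ENTRY
 *		== 0x40 | 0x8 | 0x2 == 0x4a
 *
 * and decodes back via RADIX_DAX_SECTOR(0x4a) == 2 and
 * RADIX_DAX_TYPE(0x4a) == RADIX_DAX_PTE.  Bit 2 (RADIX_DAX_ENTRY_LOCK,
 * from dax.h) remains free and is used below as the per-entry lock bit.
 */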
/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
					      pgoff_t index)
{
	unsigned long hash = hash_long((unsigned long)mapping ^ index,
				       DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}

static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	struct request_queue *q = bdev->bd_queue;
	long rc = -EIO;

	dax->addr = ERR_PTR(-EIO);
	if (blk_queue_enter(q, true) != 0)
		return rc;

	rc = bdev_direct_access(bdev, dax);
	if (rc < 0) {
		dax->addr = ERR_PTR(rc);
		blk_queue_exit(q);
		return rc;
	}
	return rc;
}

static void dax_unmap_atomic(struct block_device *bdev,
			     const struct blk_dax_ctl *dax)
{
	if (IS_ERR(dax->addr))
		return;
	blk_queue_exit(bdev->bd_queue);
}

struct page *read_dax_sector(struct block_device *bdev, sector_t n)
{
	struct page *page = alloc_pages(GFP_KERNEL, 0);
	struct blk_dax_ctl dax = {
		.size = PAGE_SIZE,
		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
	};
	long rc;

	if (!page)
		return ERR_PTR(-ENOMEM);

	rc = dax_map_atomic(bdev, &dax);
	if (rc < 0)
		return ERR_PTR(rc);
	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
	dax_unmap_atomic(bdev, &dax);
	return page;
}

static bool buffer_written(struct buffer_head *bh)
{
	return buffer_mapped(bh) && !buffer_unwritten(bh);
}

static sector_t to_sector(const struct buffer_head *bh,
			  const struct inode *inode)
{
	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);

	return sector;
}
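
/*
 * dax_io() walks the byte range [start, end): for each position it asks the
 * filesystem via get_block() for a buffer_head describing the mapping,
 * translates that mapping to a (sector, size) pair, maps the backing
 * persistent memory with dax_map_atomic(), and copies data between it and
 * the iov_iter.  Holes are only legal for reads and are satisfied by
 * zeroing the iterator instead of mapping anything.
 */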
static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = ERR_PTR(-EIO),
	};
	unsigned blkbits = inode->i_blkbits;
	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
								>> blkbits;

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
				/*
				 * We allow uninitialized buffers for writes
				 * beyond EOF as those cannot race with faults
				 */
				WARN_ON_ONCE(
					(buffer_new(bh) && block < file_blks) ||
					(rw == WRITE && buffer_unwritten(bh)));
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				dax.addr += first;
				size = map_len - first;
			}
			/*
			 * pos + size is one past the last offset for IO,
			 * so pos + size can overflow loff_t at extreme offsets.
			 * Cast to u64 to catch this and get the true minimum.
			 */
			max = min_t(u64, pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr, max - pos,
					iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}
/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
		  struct iov_iter *iter, get_block_t get_block,
		  dio_iodone_t end_io, int flags)
{
	struct buffer_head bh;
	ssize_t retval = -EINVAL;
	loff_t pos = iocb->ki_pos;
	loff_t end = pos + iov_iter_count(iter);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_lock(inode);

	/* Protects against truncate */
	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = dax_io(inode, iter, pos, end, get_block, &bh);

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_unlock(inode);

	if (end_io) {
		int err;

		err = end_io(iocb, pos, retval, bh.b_private);
		if (err)
			retval = err;
	}

	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(inode);
	return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);
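
/*
 * A sketch of a typical caller, modelled on ext2's ->direct_IO method (the
 * "myfs" names are illustrative, not a real API):
 *
 *	static ssize_t myfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 *	{
 *		struct inode *inode = iocb->ki_filp->f_mapping->host;
 *
 *		if (IS_DAX(inode))
 *			return dax_do_io(iocb, inode, iter, myfs_get_block,
 *					 NULL, DIO_LOCKING);
 *		return ...;	(ordinary direct I/O path)
 *	}
 */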
/*
 * DAX radix tree locking
 */
struct exceptional_entry_key {
	struct address_space *mapping;
	unsigned long index;
};

struct wait_exceptional_entry_queue {
	wait_queue_t wait;
	struct exceptional_entry_key key;
};

static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
				       int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->mapping != ewait->key.mapping ||
	    key->index != ewait->key.index)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * Check whether the given slot is locked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline int slot_locked(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
	return entry & RADIX_DAX_ENTRY_LOCK;
}

/*
 * Mark the given slot as locked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry |= RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(slot, (void *)entry);
	return (void *)entry;
}

/*
 * Mark the given slot as unlocked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(slot, (void *)entry);
	return (void *)entry;
}
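
/*
 * The slot helpers above implement a sleeping per-entry lock on top of
 * RADIX_DAX_ENTRY_LOCK.  The intended lifecycle, with mapping->tree_lock
 * held except where it is explicitly dropped:
 *
 *	entry = get_unlocked_mapping_entry(mapping, index, &slot);
 *	...
 *	entry = lock_slot(mapping, slot);	(tree_lock may now be dropped)
 *	...
 *	put_locked_mapping_entry(mapping, index, entry);
 *
 * or, if the caller decides not to take the entry lock:
 *
 *	put_unlocked_mapping_entry(mapping, index, entry);
 */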
/*
 * Look up an entry in the radix tree, wait for it to become unlocked if it
 * is an exceptional entry, and return it.  The caller must call
 * put_unlocked_mapping_entry() when it decided not to lock the entry or
 * put_locked_mapping_entry() when it locked the entry and now wants to
 * unlock it.
 *
 * The function must be called with mapping->tree_lock held.
 */
static void *get_unlocked_mapping_entry(struct address_space *mapping,
					pgoff_t index, void ***slotp)
{
	void *entry, **slot;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;
	ewait.key.mapping = mapping;
	ewait.key.index = index;

	for (;;) {
		entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
					    &slot);
		if (!entry || !radix_tree_exceptional_entry(entry) ||
		    !slot_locked(mapping, slot)) {
			if (slotp)
				*slotp = slot;
			return entry;
		}
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&mapping->tree_lock);
		schedule();
		finish_wait(wq, &ewait.wait);
		spin_lock_irq(&mapping->tree_lock);
	}
}

/*
 * Find the radix tree entry at the given index.  If it points to a page,
 * return with the page locked.  If it points to an exceptional entry,
 * return with the radix tree entry locked.  If the radix tree doesn't
 * contain the given index, create an empty exceptional entry for the index
 * and return with it locked.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags.  For
 * persistent memory the benefit is doubtful.  We can add that later if we
 * can show it helps.
 */
static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *entry, **slot;

restart:
	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, &slot);
	/* No entry for given index? Make sure radix tree is big enough. */
	if (!entry) {
		int err;

		spin_unlock_irq(&mapping->tree_lock);
		err = radix_tree_preload(
				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
		if (err)
			return ERR_PTR(err);
		entry = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
				 RADIX_DAX_ENTRY_LOCK);
		spin_lock_irq(&mapping->tree_lock);
		err = radix_tree_insert(&mapping->page_tree, index, entry);
		radix_tree_preload_end();
		if (err) {
			spin_unlock_irq(&mapping->tree_lock);
			/* Someone already created the entry? */
			if (err == -EEXIST)
				goto restart;
			return ERR_PTR(err);
		}
		/* Good, we have inserted empty locked entry into the tree. */
		mapping->nrexceptional++;
		spin_unlock_irq(&mapping->tree_lock);
		return entry;
	}
	/* Normal page in radix tree? */
	if (!radix_tree_exceptional_entry(entry)) {
		struct page *page = entry;

		get_page(page);
		spin_unlock_irq(&mapping->tree_lock);
		lock_page(page);
		/* Page got truncated? Retry... */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto restart;
		}
		return page;
	}
	entry = lock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	return entry;
}
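
/*
 * To summarize, grab_mapping_entry() returns one of three things: an
 * ERR_PTR() on failure, a locked struct page (with a reference held) if a
 * hole page already occupies the index, or a locked exceptional entry
 * otherwise.  Callers distinguish the latter two cases with
 * radix_tree_exceptional_entry(), as dax_fault() does below.
 */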
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
				   pgoff_t index, bool wake_all)
{
	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under mapping->tree_lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq)) {
		struct exceptional_entry_key key;

		key.mapping = mapping;
		key.index = index;
		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
	}
}

void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *entry, **slot;

	spin_lock_irq(&mapping->tree_lock);
	entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
			 !slot_locked(mapping, slot))) {
		spin_unlock_irq(&mapping->tree_lock);
		return;
	}
	unlock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, false);
}

static void put_locked_mapping_entry(struct address_space *mapping,
				     pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry)) {
		unlock_page(entry);
		put_page(entry);
	} else {
		dax_unlock_mapping_entry(mapping, index);
	}
}

/*
 * Called when we are done with radix tree entry we looked up via
 * get_unlocked_mapping_entry() and which we didn't lock in the end.
 */
static void put_unlocked_mapping_entry(struct address_space *mapping,
				       pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry))
		return;

	/* We have to wake up next waiter for the radix tree entry lock */
	dax_wake_mapping_entry_waiter(mapping, index, false);
}
/*
 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
 * entry to get unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *entry;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	/*
	 * This gets called from truncate / punch_hole path. As such, the caller
	 * must hold locks protecting against concurrent modifications of the
	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
	 * caller has seen exceptional entry for this index, we better find it
	 * at that index as well...
	 */
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
		spin_unlock_irq(&mapping->tree_lock);
		return 0;
	}
	radix_tree_delete(&mapping->page_tree, index);
	mapping->nrexceptional--;
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, true);

	return 1;
}

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, void *entry,
			 struct vm_fault *vmf)
{
	struct page *page;

	/* Hole page already exists? Return it... */
	if (!radix_tree_exceptional_entry(entry)) {
		vmf->page = entry;
		return VM_FAULT_LOCKED;
	}

	/* This will replace locked radix tree entry with a hole page */
	page = find_or_create_page(mapping, vmf->pgoff,
				   vmf->gfp_mask | __GFP_ZERO);
	if (!page) {
		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
		return VM_FAULT_OOM;
	}
	vmf->page = page;
	return VM_FAULT_LOCKED;
}

static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
			 struct page *to, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = size,
	};
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}
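
/*
 * DAX_PMD_INDEX() rounds a page index down to the first index covered by
 * the same PMD.  For example, on x86-64 with 4k pages and 2M PMDs,
 * PMD_MASK >> PAGE_SHIFT clears the low 9 bits of the index, so
 * DAX_PMD_INDEX(0x3ff) == 0x200.
 */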
#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))

static void *dax_insert_mapping_entry(struct address_space *mapping,
				      struct vm_fault *vmf,
				      void *entry, sector_t sector)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int error = 0;
	bool hole_fill = false;
	void *new_entry;
	pgoff_t index = vmf->pgoff;

	if (vmf->flags & FAULT_FLAG_WRITE)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	/* Replacing hole page with block mapping? */
	if (!radix_tree_exceptional_entry(entry)) {
		hole_fill = true;
		/*
		 * Unmap the page now before we remove it from page cache below.
		 * The page is locked so it cannot be faulted in again.
		 */
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
				    PAGE_SIZE, 0);
		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
		if (error)
			return ERR_PTR(error);
	}

	spin_lock_irq(&mapping->tree_lock);
	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
			     RADIX_DAX_ENTRY_LOCK);
	if (hole_fill) {
		__delete_from_page_cache(entry, NULL);
		/* Drop pagecache reference */
		put_page(entry);
		error = radix_tree_insert(page_tree, index, new_entry);
		if (error) {
			new_entry = ERR_PTR(error);
			goto unlock;
		}
		mapping->nrexceptional++;
	} else {
		void **slot;
		void *ret;

		ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
		WARN_ON_ONCE(ret != entry);
		radix_tree_replace_slot(slot, new_entry);
	}
	if (vmf->flags & FAULT_FLAG_WRITE)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	if (hole_fill) {
		radix_tree_preload_end();
		/*
		 * We don't need hole page anymore, it has been replaced with
		 * locked radix tree entry now.
		 */
		if (mapping->a_ops->freepage)
			mapping->a_ops->freepage(entry);
		unlock_page(entry);
		put_page(entry);
	}
	return new_entry;
}

static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int type = RADIX_DAX_TYPE(entry);
	struct radix_tree_node *node;
	struct blk_dax_ctl dax;
	void **slot;
	int ret = 0;

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * Regular page slots are stabilized by the page lock even
	 * without the tree itself locked.  These unlocked entries
	 * need verification under the tree lock.
	 */
	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
		goto unlock;
	if (*slot != entry)
		goto unlock;

	/* another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto unlock;

	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
		ret = -EIO;
		goto unlock;
	}

	dax.sector = RADIX_DAX_SECTOR(entry);
	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0)
		return ret;

	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	wb_cache_pmem(dax.addr, dax.size);

	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);
 unmap:
	dax_unmap_atomic(bdev, &dax);
	return ret;

 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}
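
/*
 * dax_writeback_one() relies on the tag protocol used elsewhere in this
 * file: dax_insert_mapping_entry() tags a written-to entry
 * PAGECACHE_TAG_DIRTY, dax_writeback_mapping_range() below promotes dirty
 * tags to PAGECACHE_TAG_TOWRITE via tag_pages_for_writeback(), and a
 * successful flush clears the TOWRITE tag again.
 */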
/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end].  This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct block_device *bdev, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	pgoff_t start_index, end_index, pmd_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;
	void *entry;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	start_index = wbc->range_start >> PAGE_SHIFT;
	end_index = wbc->range_end >> PAGE_SHIFT;
	pmd_index = DAX_PMD_INDEX(start_index);

	rcu_read_lock();
	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
	rcu_read_unlock();

	/* see if the start of our range is covered by a PMD entry */
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
		start_index = pmd_index;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
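
/*
 * A sketch of how a filesystem reaches dax_writeback_mapping_range() from
 * its ->writepages method, modelled on ext2 ("myfs" is illustrative):
 *
 *	static int myfs_writepages(struct address_space *mapping,
 *				   struct writeback_control *wbc)
 *	{
 *		if (dax_mapping(mapping))
 *			return dax_writeback_mapping_range(mapping,
 *					mapping->host->i_sb->s_bdev, wbc);
 *		return ...;	(ordinary page cache writeback)
 *	}
 */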
static int dax_insert_mapping(struct address_space *mapping,
		struct block_device *bdev, sector_t sector, size_t size,
		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = size,
	};
	void *ret;
	void *entry = *entryp;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	dax_unmap_atomic(bdev, &dax);

	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
	if (IS_ERR(ret))
		return PTR_ERR(ret);
	*entryp = ret;

	return vm_insert_mixed(vma, vaddr, dax.pfn);
}

/**
 * dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files. dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
	      get_block_t get_block)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	void *entry;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is supposed
	 * to hold locks serializing us with truncate / punch hole so this is
	 * a reliable test.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;

	entry = grab_mapping_entry(mapping, vmf->pgoff);
	if (IS_ERR(entry)) {
		error = PTR_ERR(entry);
		goto out;
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock_entry;

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;
		if (buffer_written(&bh))
			error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
					bh.b_size, new_page, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock_entry;
		if (!radix_tree_exceptional_entry(entry)) {
			vmf->page = entry;
			return VM_FAULT_LOCKED;
		}
		vmf->entry = entry;
		return VM_FAULT_DAX_LOCKED;
	}

	if (!buffer_mapped(&bh)) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock_entry;
		} else {
			return dax_load_hole(mapping, entry, vmf);
		}
	}

	/* Filesystem should not return unwritten buffers to us! */
	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
	error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
			bh.b_size, &entry, vma, vmf);
 unlock_entry:
	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;
}
EXPORT_SYMBOL_GPL(dax_fault);
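
/*
 * A sketch of a typical ->fault handler, modelled on ext2 (the "myfs"
 * names and the dax_sem rwsem are illustrative): the filesystem supplies
 * the lock that serializes faults against truncate, as required above.
 *
 *	static int myfs_dax_fault(struct vm_area_struct *vma,
 *				  struct vm_fault *vmf)
 *	{
 *		struct inode *inode = file_inode(vma->vm_file);
 *		int ret;
 *
 *		down_read(&MYFS_I(inode)->dax_sem);
 *		ret = dax_fault(vma, vmf, myfs_get_block);
 *		up_read(&MYFS_I(inode)->dax_sem);
 *		return ret;
 *	}
 */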
#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
 * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below function.
 */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)

static void __dax_dbg(struct buffer_head *bh, unsigned long address,
		      const char *reason, const char *fn)
{
	if (bh) {
		char bname[BDEVNAME_SIZE];
		bdevname(bh->b_bdev, bname);
		pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
			"length %zd fallback: %s\n", fn, current->comm,
			address, bname, bh->b_state, (u64)bh->b_blocknr,
			bh->b_size, reason);
	} else {
		pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
			current->comm, address, reason);
	}
}

#define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")

/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @address: The faulting address
 * @pmd: The PMD to install a mapping into
 * @flags: The fault flags (FAULT_FLAG_*)
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, get_block_t get_block)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct buffer_head bh;
	unsigned blkbits = inode->i_blkbits;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	struct block_device *bdev;
	pgoff_t size, pgoff;
	sector_t block;
	int result = 0;
	bool alloc = false;

	/* dax pmd mappings require pfn_t_devmap() */
	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
		return VM_FAULT_FALLBACK;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		split_huge_pmd(vma, pmd, address);
		dax_pmd_dbg(NULL, address, "cow write");
		return VM_FAULT_FALLBACK;
	}
	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start) {
		dax_pmd_dbg(NULL, address, "vma start unaligned");
		return VM_FAULT_FALLBACK;
	}
	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
		dax_pmd_dbg(NULL, address, "vma end unaligned");
		return VM_FAULT_FALLBACK;
	}
	pgoff = linear_page_index(vma, pmd_addr);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size)
		return VM_FAULT_SIGBUS;
	/* If the PMD would cover blocks out of the file */
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(NULL, address,
				"offset + huge page size > file size");
		return VM_FAULT_FALLBACK;
	}

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

	bh.b_size = PMD_SIZE;

	if (get_block(inode, block, &bh, 0) != 0)
		return VM_FAULT_SIGBUS;

	if (!buffer_mapped(&bh) && write) {
		if (get_block(inode, block, &bh, 1) != 0)
			return VM_FAULT_SIGBUS;
		alloc = true;
		WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
	}

	bdev = bh.b_bdev;

	if (bh.b_size < PMD_SIZE) {
		dax_pmd_dbg(&bh, address, "allocated block too small");
		return VM_FAULT_FALLBACK;
	}

	/*
	 * If we allocated new storage, make sure no process has any
	 * zero pages covering this hole
	 */
	if (alloc) {
		loff_t lstart = pgoff << PAGE_SHIFT;
		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */

		truncate_pagecache_range(inode, lstart, lend);
	}

	if (!write && !buffer_mapped(&bh)) {
		spinlock_t *ptl;
		pmd_t entry;
		struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm);

		if (unlikely(!zero_page)) {
			dax_pmd_dbg(&bh, address, "no zero page");
			goto fallback;
		}

		ptl = pmd_lock(vma->vm_mm, pmd);
		if (!pmd_none(*pmd)) {
			spin_unlock(ptl);
			dax_pmd_dbg(&bh, address, "pmd already present");
			goto fallback;
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
				__func__, current->comm, address,
				(unsigned long long) to_sector(&bh, inode));
		entry = mk_pmd(zero_page, vma->vm_page_prot);
		entry = pmd_mkhuge(entry);
		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
		result = VM_FAULT_NOPAGE;
		spin_unlock(ptl);
	} else {
		struct blk_dax_ctl dax = {
			.sector = to_sector(&bh, inode),
			.size = PMD_SIZE,
		};
		long length = dax_map_atomic(bdev, &dax);

		if (length < 0) {
			dax_pmd_dbg(&bh, address, "dax-error fallback");
			goto fallback;
		}
		if (length < PMD_SIZE) {
			dax_pmd_dbg(&bh, address, "dax-length too small");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}
		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
			dax_pmd_dbg(&bh, address, "pfn unaligned");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}

		if (!pfn_t_devmap(dax.pfn)) {
			dax_unmap_atomic(bdev, &dax);
			dax_pmd_dbg(&bh, address, "pfn not in memmap");
			goto fallback;
		}
		dax_unmap_atomic(bdev, &dax);

		/*
		 * For PTE faults we insert a radix tree entry for reads, and
		 * leave it clean.  Then on the first write we dirty the radix
		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
		 * call into get_block() to translate the pgoff to a sector in
		 * order to be able to create a new radix tree entry.
		 *
		 * The PMD path doesn't have an equivalent to
		 * dax_pfn_mkwrite(), though, so for a read followed by a
		 * write we traverse all the way through dax_pmd_fault()
		 * twice.  This means we can just skip inserting a radix tree
		 * entry completely on the initial read and just wait until
		 * the write to insert a dirty entry.
		 */
		if (write) {
			/*
			 * We should insert radix-tree entry and dirty it here.
			 * For now this is broken...
			 */
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
				__func__, current->comm, address,
				pfn_t_to_pfn(dax.pfn),
				(unsigned long long) dax.sector);
		result |= vmf_insert_pfn_pmd(vma, address, pmd,
				dax.pfn, write);
	}

 out:
	return result;

 fallback:
	count_vm_event(THP_FAULT_FALLBACK);
	result = VM_FAULT_FALLBACK;
	goto out;
}
EXPORT_SYMBOL_GPL(dax_pmd_fault);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	void *entry;
	pgoff_t index = vmf->pgoff;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	if (!entry || !radix_tree_exceptional_entry(entry))
		goto out;
	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
	put_unlocked_mapping_entry(mapping, index, entry);
out:
	spin_unlock_irq(&mapping->tree_lock);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
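
/*
 * dax_pfn_mkwrite() is meant to be wired up (possibly through a thin
 * wrapper taking the filesystem's fault lock) in the vm_operations_struct
 * next to the fault handlers above, roughly ("myfs" names illustrative;
 * ext2 and ext4 follow this shape):
 *
 *	static const struct vm_operations_struct myfs_dax_vm_ops = {
 *		.fault		= myfs_dax_fault,
 *		.pmd_fault	= myfs_dax_pmd_fault,
 *		.page_mkwrite	= myfs_dax_fault,
 *		.pfn_mkwrite	= myfs_dax_pfn_mkwrite,
 *	};
 */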
11740e3b210cSBoaz Harrosh /**
117525726bc1SMatthew Wilcox  * dax_zero_page_range - zero a range within a page of a DAX file
11764c0ccfefSMatthew Wilcox  * @inode: The file in which the range is being zeroed
11774c0ccfefSMatthew Wilcox  * @from: The file offset at which the zeroed range starts
117825726bc1SMatthew Wilcox  * @length: The number of bytes to zero
11794c0ccfefSMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
11804c0ccfefSMatthew Wilcox  *
118125726bc1SMatthew Wilcox  * This function can be called by a filesystem when it is zeroing part of a
118225726bc1SMatthew Wilcox  * page in a DAX file.  This is intended for hole-punch operations.  If
118325726bc1SMatthew Wilcox  * you are truncating a file, the helper function dax_truncate_page() may be
118425726bc1SMatthew Wilcox  * more convenient.
11854c0ccfefSMatthew Wilcox  */
118625726bc1SMatthew Wilcox int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
118725726bc1SMatthew Wilcox 							get_block_t get_block)
11884c0ccfefSMatthew Wilcox {
11894c0ccfefSMatthew Wilcox 	struct buffer_head bh;
119009cbfeafSKirill A. Shutemov 	pgoff_t index = from >> PAGE_SHIFT;
119109cbfeafSKirill A. Shutemov 	unsigned offset = from & (PAGE_SIZE-1);
11924c0ccfefSMatthew Wilcox 	int err;
11934c0ccfefSMatthew Wilcox 
11944c0ccfefSMatthew Wilcox 	/* Block boundary? Nothing to do */
11954c0ccfefSMatthew Wilcox 	if (!length)
11964c0ccfefSMatthew Wilcox 		return 0;
1197aada54f9SRoss Zwisler 	if (WARN_ON_ONCE((offset + length) > PAGE_SIZE))
1198aada54f9SRoss Zwisler 		return -EINVAL;
11994c0ccfefSMatthew Wilcox 
12004c0ccfefSMatthew Wilcox 	memset(&bh, 0, sizeof(bh));
1201eab95db6SRoss Zwisler 	bh.b_bdev = inode->i_sb->s_bdev;
120209cbfeafSKirill A. Shutemov 	bh.b_size = PAGE_SIZE;
12034c0ccfefSMatthew Wilcox 	err = get_block(inode, index, &bh, 0);
1204679c8bd3SChristoph Hellwig 	if (err < 0 || !buffer_written(&bh))
12054c0ccfefSMatthew Wilcox 		return err;
1206b2e0d162SDan Williams 
1207679c8bd3SChristoph Hellwig 	return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
1208679c8bd3SChristoph Hellwig 			offset, length);
12094c0ccfefSMatthew Wilcox }
121025726bc1SMatthew Wilcox EXPORT_SYMBOL_GPL(dax_zero_page_range);
121125726bc1SMatthew Wilcox 
121225726bc1SMatthew Wilcox /**
121325726bc1SMatthew Wilcox  * dax_truncate_page - handle a partial page being truncated in a DAX file
121425726bc1SMatthew Wilcox  * @inode: The file being truncated
121525726bc1SMatthew Wilcox  * @from: The file offset that is being truncated to
121625726bc1SMatthew Wilcox  * @get_block: The filesystem method used to translate file offsets to blocks
121725726bc1SMatthew Wilcox  *
121825726bc1SMatthew Wilcox  * Similar to block_truncate_page(), this function can be called by a
121925726bc1SMatthew Wilcox  * filesystem when it is truncating a DAX file to handle the partial page.
122025726bc1SMatthew Wilcox  */
122125726bc1SMatthew Wilcox int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
122225726bc1SMatthew Wilcox {
122309cbfeafSKirill A. Shutemov 	unsigned length = PAGE_ALIGN(from) - from;
122425726bc1SMatthew Wilcox 	return dax_zero_page_range(inode, from, length, get_block);
122525726bc1SMatthew Wilcox }
12264c0ccfefSMatthew Wilcox EXPORT_SYMBOL_GPL(dax_truncate_page);
1227a254e568SChristoph Hellwig 
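/*
 * Illustrative sketch (editor's addition): how a filesystem's truncate
 * path might use dax_truncate_page() to zero the tail of the last page
 * before shrinking i_size, in the spirit of ext2/ext4.  example_setsize
 * and example_get_block are hypothetical names.
 */
static int example_setsize(struct inode *inode, loff_t newsize)
{
	int error;

	/* Zero the partial page that will remain beyond the new EOF. */
	error = dax_truncate_page(inode, newsize, example_get_block);
	if (error)
		return error;
	truncate_setsize(inode, newsize);
	return 0;
}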
1228a254e568SChristoph Hellwig #ifdef CONFIG_FS_IOMAP
1229a254e568SChristoph Hellwig static loff_t
1230a254e568SChristoph Hellwig iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1231a254e568SChristoph Hellwig 		struct iomap *iomap)
1232a254e568SChristoph Hellwig {
1233a254e568SChristoph Hellwig 	struct iov_iter *iter = data;
1234a254e568SChristoph Hellwig 	loff_t end = pos + length, done = 0;
1235a254e568SChristoph Hellwig 	ssize_t ret = 0;
1236a254e568SChristoph Hellwig 
1237a254e568SChristoph Hellwig 	if (iov_iter_rw(iter) == READ) {
1238a254e568SChristoph Hellwig 		end = min(end, i_size_read(inode));
1239a254e568SChristoph Hellwig 		if (pos >= end)
1240a254e568SChristoph Hellwig 			return 0;
1241a254e568SChristoph Hellwig 
1242a254e568SChristoph Hellwig 		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
1243a254e568SChristoph Hellwig 			return iov_iter_zero(min(length, end - pos), iter);
1244a254e568SChristoph Hellwig 	}
1245a254e568SChristoph Hellwig 
1246a254e568SChristoph Hellwig 	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
1247a254e568SChristoph Hellwig 		return -EIO;
1248a254e568SChristoph Hellwig 
1249a254e568SChristoph Hellwig 	while (pos < end) {
1250a254e568SChristoph Hellwig 		unsigned offset = pos & (PAGE_SIZE - 1);
1251a254e568SChristoph Hellwig 		struct blk_dax_ctl dax = { 0 };
1252a254e568SChristoph Hellwig 		ssize_t map_len;
1253a254e568SChristoph Hellwig 
1254a254e568SChristoph Hellwig 		dax.sector = iomap->blkno +
1255a254e568SChristoph Hellwig 			(((pos & PAGE_MASK) - iomap->offset) >> 9);
1256a254e568SChristoph Hellwig 		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
1257a254e568SChristoph Hellwig 		map_len = dax_map_atomic(iomap->bdev, &dax);
1258a254e568SChristoph Hellwig 		if (map_len < 0) {
1259a254e568SChristoph Hellwig 			ret = map_len;
1260a254e568SChristoph Hellwig 			break;
1261a254e568SChristoph Hellwig 		}
1262a254e568SChristoph Hellwig 
1263a254e568SChristoph Hellwig 		dax.addr += offset;
1264a254e568SChristoph Hellwig 		map_len -= offset;
1265a254e568SChristoph Hellwig 		if (map_len > end - pos)
1266a254e568SChristoph Hellwig 			map_len = end - pos;
1267a254e568SChristoph Hellwig 
1268a254e568SChristoph Hellwig 		if (iov_iter_rw(iter) == WRITE)
1269a254e568SChristoph Hellwig 			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
1270a254e568SChristoph Hellwig 		else
1271a254e568SChristoph Hellwig 			map_len = copy_to_iter(dax.addr, map_len, iter);
1272a254e568SChristoph Hellwig 		dax_unmap_atomic(iomap->bdev, &dax);
1273a254e568SChristoph Hellwig 		if (map_len <= 0) {
1274a254e568SChristoph Hellwig 			ret = map_len ? map_len : -EFAULT;
1275a254e568SChristoph Hellwig 			break;
1276a254e568SChristoph Hellwig 		}
1277a254e568SChristoph Hellwig 
1278a254e568SChristoph Hellwig 		pos += map_len;
1279a254e568SChristoph Hellwig 		length -= map_len;
1280a254e568SChristoph Hellwig 		done += map_len;
1281a254e568SChristoph Hellwig 	}
1282a254e568SChristoph Hellwig 
1283a254e568SChristoph Hellwig 	return done ? done : ret;
1284a254e568SChristoph Hellwig }
1285a254e568SChristoph Hellwig 
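/*
 * Editor's note, a worked example of the sector/size arithmetic in
 * iomap_dax_actor() above (assuming 4k pages): for an extent with
 * iomap->offset = 0x200000 and iomap->blkno = 8000, a read at
 * pos = 0x201100 of length = 512 gives offset = 0x100, so
 * dax.sector = 8000 + ((0x201000 - 0x200000) >> 9) = 8008 and
 * dax.size = (512 + 0x100 + 4095) & PAGE_MASK = 4096; dax.addr is then
 * advanced by the 0x100 in-page offset before the copy.
 */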
1286a254e568SChristoph Hellwig /**
1287a254e568SChristoph Hellwig  * iomap_dax_rw - Perform I/O to a DAX file
1288a254e568SChristoph Hellwig  * @iocb: The control block for this I/O
1289a254e568SChristoph Hellwig  * @iter: The addresses to do I/O from or to
1290a254e568SChristoph Hellwig  * @ops: iomap ops passed from the file system
1291a254e568SChristoph Hellwig  *
1292a254e568SChristoph Hellwig  * This function performs read and write operations to directly mapped
1293a254e568SChristoph Hellwig  * persistent memory.  The caller needs to take care of read/write exclusion
1294a254e568SChristoph Hellwig  * and evicting any page cache pages in the region under I/O.
1295a254e568SChristoph Hellwig  */
1296a254e568SChristoph Hellwig ssize_t
1297a254e568SChristoph Hellwig iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
1298a254e568SChristoph Hellwig 		struct iomap_ops *ops)
1299a254e568SChristoph Hellwig {
1300a254e568SChristoph Hellwig 	struct address_space *mapping = iocb->ki_filp->f_mapping;
1301a254e568SChristoph Hellwig 	struct inode *inode = mapping->host;
1302a254e568SChristoph Hellwig 	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
1303a254e568SChristoph Hellwig 	unsigned flags = 0;
1304a254e568SChristoph Hellwig 
1305a254e568SChristoph Hellwig 	if (iov_iter_rw(iter) == WRITE)
1306a254e568SChristoph Hellwig 		flags |= IOMAP_WRITE;
1307a254e568SChristoph Hellwig 
1308a254e568SChristoph Hellwig 	/*
1309a254e568SChristoph Hellwig 	 * Yes, even DAX files can have page cache attached to them:  A zeroed
1310a254e568SChristoph Hellwig 	 * page is inserted into the pagecache when we have to serve a write
1311a254e568SChristoph Hellwig 	 * fault on a hole.  It should never be dirtied and can simply be
1312a254e568SChristoph Hellwig 	 * dropped from the pagecache once we get real data for the page.
1313a254e568SChristoph Hellwig 	 *
1314a254e568SChristoph Hellwig 	 * XXX: This is racy against mmap, and there's nothing we can do about
1315a254e568SChristoph Hellwig 	 * it. We'll eventually need to shift this down even further so that
1316a254e568SChristoph Hellwig 	 * we can check if we allocated blocks over a hole first.
1317a254e568SChristoph Hellwig 	 */
1318a254e568SChristoph Hellwig 	if (mapping->nrpages) {
1319a254e568SChristoph Hellwig 		ret = invalidate_inode_pages2_range(mapping,
1320a254e568SChristoph Hellwig 				pos >> PAGE_SHIFT,
1321a254e568SChristoph Hellwig 				(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
1322a254e568SChristoph Hellwig 		WARN_ON_ONCE(ret);
1323a254e568SChristoph Hellwig 	}
1324a254e568SChristoph Hellwig 
1325a254e568SChristoph Hellwig 	while (iov_iter_count(iter)) {
1326a254e568SChristoph Hellwig 		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
1327a254e568SChristoph Hellwig 				iter, iomap_dax_actor);
1328a254e568SChristoph Hellwig 		if (ret <= 0)
1329a254e568SChristoph Hellwig 			break;
1330a254e568SChristoph Hellwig 		pos += ret;
1331a254e568SChristoph Hellwig 		done += ret;
1332a254e568SChristoph Hellwig 	}
1333a254e568SChristoph Hellwig 
1334a254e568SChristoph Hellwig 	iocb->ki_pos += done;
1335a254e568SChristoph Hellwig 	return done ? done : ret;
1336a254e568SChristoph Hellwig }
1337a254e568SChristoph Hellwig EXPORT_SYMBOL_GPL(iomap_dax_rw);
1338a7d73fe6SChristoph Hellwig 
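/*
 * Illustrative sketch (editor's addition): a filesystem read_iter for a
 * DAX file built on iomap_dax_rw(), loosely modeled on ext2's DAX path.
 * example_iomap_ops is the fs's struct iomap_ops, assumed defined
 * elsewhere.
 */
static ssize_t example_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	ssize_t ret;

	if (!iov_iter_count(to))
		return 0;	/* skip atime update for zero-length reads */

	inode_lock_shared(inode);
	ret = iomap_dax_rw(iocb, to, &example_iomap_ops);
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}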
1339a7d73fe6SChristoph Hellwig /**
1340a7d73fe6SChristoph Hellwig  * iomap_dax_fault - handle a page fault on a DAX file
1341a7d73fe6SChristoph Hellwig  * @vma: The virtual memory area where the fault occurred
1342a7d73fe6SChristoph Hellwig  * @vmf: The description of the fault
1343a7d73fe6SChristoph Hellwig  * @ops: iomap ops passed from the file system
1344a7d73fe6SChristoph Hellwig  *
1345a7d73fe6SChristoph Hellwig  * When a page fault occurs, filesystems may call this helper in their fault
1346a7d73fe6SChristoph Hellwig  * or mkwrite handler for DAX files. Assumes the caller has done all the
1347a7d73fe6SChristoph Hellwig  * necessary locking for the page fault to proceed successfully.
1348a7d73fe6SChristoph Hellwig  */
1349a7d73fe6SChristoph Hellwig int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1350a7d73fe6SChristoph Hellwig 			struct iomap_ops *ops)
1351a7d73fe6SChristoph Hellwig {
1352a7d73fe6SChristoph Hellwig 	struct address_space *mapping = vma->vm_file->f_mapping;
1353a7d73fe6SChristoph Hellwig 	struct inode *inode = mapping->host;
1354a7d73fe6SChristoph Hellwig 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
1355a7d73fe6SChristoph Hellwig 	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
1356a7d73fe6SChristoph Hellwig 	sector_t sector;
1357a7d73fe6SChristoph Hellwig 	struct iomap iomap = { 0 };
1358a7d73fe6SChristoph Hellwig 	unsigned flags = 0;
1359a7d73fe6SChristoph Hellwig 	int error, major = 0;
1360a7d73fe6SChristoph Hellwig 	void *entry;
1361a7d73fe6SChristoph Hellwig 
1362a7d73fe6SChristoph Hellwig 	/*
1363a7d73fe6SChristoph Hellwig 	 * Check whether the offset is beyond the end of the file now.  The
1364a7d73fe6SChristoph Hellwig 	 * caller is supposed to hold locks serializing us with truncate /
1365a7d73fe6SChristoph Hellwig 	 * punch hole, so this is a reliable test.
1366a7d73fe6SChristoph Hellwig 	 */
1367a7d73fe6SChristoph Hellwig 	if (pos >= i_size_read(inode))
1368a7d73fe6SChristoph Hellwig 		return VM_FAULT_SIGBUS;
1369a7d73fe6SChristoph Hellwig 
1370a7d73fe6SChristoph Hellwig 	entry = grab_mapping_entry(mapping, vmf->pgoff);
1371a7d73fe6SChristoph Hellwig 	if (IS_ERR(entry)) {
1372a7d73fe6SChristoph Hellwig 		error = PTR_ERR(entry);
1373a7d73fe6SChristoph Hellwig 		goto out;
1374a7d73fe6SChristoph Hellwig 	}
1375a7d73fe6SChristoph Hellwig 
1376a7d73fe6SChristoph Hellwig 	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
1377a7d73fe6SChristoph Hellwig 		flags |= IOMAP_WRITE;
1378a7d73fe6SChristoph Hellwig 
1379a7d73fe6SChristoph Hellwig 	/*
1380a7d73fe6SChristoph Hellwig 	 * Note that we don't bother to use iomap_apply here: DAX requires
1381a7d73fe6SChristoph Hellwig 	 * the file system block size to be equal to the page size, which
1382a7d73fe6SChristoph Hellwig 	 * means that we never have to deal with more than a single extent
1383a7d73fe6SChristoph Hellwig 	 * here.
1384a7d73fe6SChristoph Hellwig 	 */
1385a7d73fe6SChristoph Hellwig 	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
1386a7d73fe6SChristoph Hellwig 	if (error)
1387a7d73fe6SChristoph Hellwig 		goto unlock_entry;
1388a7d73fe6SChristoph Hellwig 	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
1389a7d73fe6SChristoph Hellwig 		error = -EIO;	/* fs corruption? */
1390a7d73fe6SChristoph Hellwig 		goto unlock_entry;
1391a7d73fe6SChristoph Hellwig 	}
1392a7d73fe6SChristoph Hellwig 
1393a7d73fe6SChristoph Hellwig 	sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9);
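	/*
	 * Editor's note: from here the fault is dispatched on two axes.  A
	 * copy-on-write fault (vmf->cow_page set: a write to a private
	 * mapping) copies the on-media block, or zeroes for holes and
	 * unwritten extents, into the COW page and lets the generic fault
	 * code install it.  A shared fault maps the pfn directly for
	 * IOMAP_MAPPED, or services a read of a hole with a zero page via
	 * dax_load_hole().
	 */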
1394a7d73fe6SChristoph Hellwig 	if (vmf->cow_page) {
1395a7d73fe6SChristoph Hellwig 		switch (iomap.type) {
1396a7d73fe6SChristoph Hellwig 		case IOMAP_HOLE:
1397a7d73fe6SChristoph Hellwig 		case IOMAP_UNWRITTEN:
1398a7d73fe6SChristoph Hellwig 			clear_user_highpage(vmf->cow_page, vaddr);
1399a7d73fe6SChristoph Hellwig 			break;
1400a7d73fe6SChristoph Hellwig 		case IOMAP_MAPPED:
1401a7d73fe6SChristoph Hellwig 			error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
1402a7d73fe6SChristoph Hellwig 					vmf->cow_page, vaddr);
1403a7d73fe6SChristoph Hellwig 			break;
1404a7d73fe6SChristoph Hellwig 		default:
1405a7d73fe6SChristoph Hellwig 			WARN_ON_ONCE(1);
1406a7d73fe6SChristoph Hellwig 			error = -EIO;
1407a7d73fe6SChristoph Hellwig 			break;
1408a7d73fe6SChristoph Hellwig 		}
1409a7d73fe6SChristoph Hellwig 
1410a7d73fe6SChristoph Hellwig 		if (error)
1411a7d73fe6SChristoph Hellwig 			goto unlock_entry;
1412a7d73fe6SChristoph Hellwig 		if (!radix_tree_exceptional_entry(entry)) {
1413a7d73fe6SChristoph Hellwig 			vmf->page = entry;
1414a7d73fe6SChristoph Hellwig 			return VM_FAULT_LOCKED;
1415a7d73fe6SChristoph Hellwig 		}
1416a7d73fe6SChristoph Hellwig 		vmf->entry = entry;
1417a7d73fe6SChristoph Hellwig 		return VM_FAULT_DAX_LOCKED;
1418a7d73fe6SChristoph Hellwig 	}
1419a7d73fe6SChristoph Hellwig 
1420a7d73fe6SChristoph Hellwig 	switch (iomap.type) {
1421a7d73fe6SChristoph Hellwig 	case IOMAP_MAPPED:
1422a7d73fe6SChristoph Hellwig 		if (iomap.flags & IOMAP_F_NEW) {
1423a7d73fe6SChristoph Hellwig 			count_vm_event(PGMAJFAULT);
1424a7d73fe6SChristoph Hellwig 			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1425a7d73fe6SChristoph Hellwig 			major = VM_FAULT_MAJOR;
1426a7d73fe6SChristoph Hellwig 		}
1427a7d73fe6SChristoph Hellwig 		error = dax_insert_mapping(mapping, iomap.bdev, sector,
1428a7d73fe6SChristoph Hellwig 				PAGE_SIZE, &entry, vma, vmf);
1429a7d73fe6SChristoph Hellwig 		break;
1430a7d73fe6SChristoph Hellwig 	case IOMAP_UNWRITTEN:
1431a7d73fe6SChristoph Hellwig 	case IOMAP_HOLE:
1432a7d73fe6SChristoph Hellwig 		if (!(vmf->flags & FAULT_FLAG_WRITE))
1433a7d73fe6SChristoph Hellwig 			return dax_load_hole(mapping, entry, vmf);
1434a7d73fe6SChristoph Hellwig 		/*FALLTHRU*/
1435a7d73fe6SChristoph Hellwig 	default:
1436a7d73fe6SChristoph Hellwig 		WARN_ON_ONCE(1);
1437a7d73fe6SChristoph Hellwig 		error = -EIO;
1438a7d73fe6SChristoph Hellwig 		break;
1439a7d73fe6SChristoph Hellwig 	}
1440a7d73fe6SChristoph Hellwig 
1441a7d73fe6SChristoph Hellwig  unlock_entry:
1442a7d73fe6SChristoph Hellwig 	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
1443a7d73fe6SChristoph Hellwig  out:
1444a7d73fe6SChristoph Hellwig 	if (error == -ENOMEM)
1445a7d73fe6SChristoph Hellwig 		return VM_FAULT_OOM | major;
1446a7d73fe6SChristoph Hellwig 	/* -EBUSY is fine, somebody else faulted on the same PTE */
1447a7d73fe6SChristoph Hellwig 	if (error < 0 && error != -EBUSY)
1448a7d73fe6SChristoph Hellwig 		return VM_FAULT_SIGBUS | major;
1449a7d73fe6SChristoph Hellwig 	return VM_FAULT_NOPAGE | major;
1450a7d73fe6SChristoph Hellwig }
1451a7d73fe6SChristoph Hellwig EXPORT_SYMBOL_GPL(iomap_dax_fault);
1452a254e568SChristoph Hellwig #endif /* CONFIG_FS_IOMAP */
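/*
 * Illustrative sketch (editor's addition): wiring iomap_dax_fault() into
 * a fault handler, with the usual write-fault bookkeeping.  All
 * "example_" names are hypothetical; the filesystem must also select
 * CONFIG_FS_IOMAP for this path to exist.
 */
static int example_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vma->vm_file);
	int ret;

	if (vmf->flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vma->vm_file);
	}
	ret = iomap_dax_fault(vma, vmf, &example_iomap_ops);
	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(inode->i_sb);

	return ret;
}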