/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>

/*
 * We use the lowest available bit in the exceptional entry for locking, and
 * the other two bits to determine the entry type.  In total, 3 special bits.
 */
#define RADIX_DAX_SHIFT		(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
#define RADIX_DAX_PTE		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
#define RADIX_DAX_PMD		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
#define RADIX_DAX_TYPE_MASK	(RADIX_DAX_PTE | RADIX_DAX_PMD)
#define RADIX_DAX_TYPE(entry)	((unsigned long)entry & RADIX_DAX_TYPE_MASK)
#define RADIX_DAX_SECTOR(entry)	(((unsigned long)entry >> RADIX_DAX_SHIFT))
#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
		RADIX_TREE_EXCEPTIONAL_ENTRY))
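/*
 * Illustration (not part of the original file): with the encoding above, an
 * unlocked PTE entry for 512-byte sector 8 is built as
 *
 *	void *entry = RADIX_DAX_ENTRY(8, false);
 *
 * so that RADIX_DAX_TYPE(entry) == RADIX_DAX_PTE and
 * RADIX_DAX_SECTOR(entry) == 8.  The sector always survives the round trip
 * because it only occupies the bits above RADIX_DAX_SHIFT; the type, lock
 * and exceptional bits all live below it.
 */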
/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
					      pgoff_t index)
{
	unsigned long hash = hash_long((unsigned long)mapping ^ index,
				       DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}

static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	struct request_queue *q = bdev->bd_queue;
	long rc = -EIO;

	dax->addr = (void __pmem *) ERR_PTR(-EIO);
	if (blk_queue_enter(q, true) != 0)
		return rc;

	rc = bdev_direct_access(bdev, dax);
	if (rc < 0) {
		dax->addr = (void __pmem *) ERR_PTR(rc);
		blk_queue_exit(q);
		return rc;
	}
	return rc;
}

static void dax_unmap_atomic(struct block_device *bdev,
			     const struct blk_dax_ctl *dax)
{
	if (IS_ERR(dax->addr))
		return;
	blk_queue_exit(bdev->bd_queue);
}
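/*
 * Illustration (not part of the original file): every successful
 * dax_map_atomic() must be paired with a dax_unmap_atomic() to release the
 * queue reference, e.g.
 *
 *	struct blk_dax_ctl dax = { .sector = sector, .size = PAGE_SIZE };
 *	long rc = dax_map_atomic(bdev, &dax);
 *	if (rc < 0)
 *		return rc;
 *	...use dax.addr and dax.pfn...
 *	dax_unmap_atomic(bdev, &dax);
 *
 * Since dax_unmap_atomic() checks IS_ERR(dax->addr), it is also safe to call
 * after a failed mapping; read_dax_sector() below is the canonical example.
 */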
struct page *read_dax_sector(struct block_device *bdev, sector_t n)
{
	struct page *page = alloc_pages(GFP_KERNEL, 0);
	struct blk_dax_ctl dax = {
		.size = PAGE_SIZE,
		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
	};
	long rc;

	if (!page)
		return ERR_PTR(-ENOMEM);

	rc = dax_map_atomic(bdev, &dax);
	if (rc < 0)
		return ERR_PTR(rc);
	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
	dax_unmap_atomic(bdev, &dax);
	return page;
}

static bool buffer_written(struct buffer_head *bh)
{
	return buffer_mapped(bh) && !buffer_unwritten(bh);
}

/*
 * When ext4 encounters a hole, it returns without modifying the buffer_head
 * which means that we can't trust b_size.  To cope with this, we set b_state
 * to 0 before calling get_block and, if any bit is set, we know we can trust
 * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
 * and would save us time calling get_block repeatedly.
 */
static bool buffer_size_valid(struct buffer_head *bh)
{
	return bh->b_state != 0;
}


static sector_t to_sector(const struct buffer_head *bh,
		const struct inode *inode)
{
	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);

	return sector;
}
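/*
 * Illustration (not part of the original file): to_sector() converts a
 * filesystem block number into a 512-byte sector number.  With 4K blocks
 * (i_blkbits == 12), block 3 becomes sector 3 << (12 - 9) == 24.
 */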
static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false, need_wmb = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = (void __pmem *) ERR_PTR(-EIO),
	};
	unsigned blkbits = inode->i_blkbits;
	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
								>> blkbits;

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
				/*
				 * We allow uninitialized buffers for writes
				 * beyond EOF as those cannot race with faults
				 */
				WARN_ON_ONCE(
					(buffer_new(bh) && block < file_blks) ||
					(rw == WRITE && buffer_unwritten(bh)));
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				dax.addr += first;
				size = map_len - first;
			}
			max = min(pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
			need_wmb = true;
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr, max - pos,
					iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	if (need_wmb)
		wmb_pmem();
	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}

/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @pos: The file offset where the I/O starts
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
		  struct iov_iter *iter, loff_t pos, get_block_t get_block,
		  dio_iodone_t end_io, int flags)
{
	struct buffer_head bh;
	ssize_t retval = -EINVAL;
	loff_t end = pos + iov_iter_count(iter);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_lock(inode);

	/* Protects against truncate */
	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = dax_io(inode, iter, pos, end, get_block, &bh);

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_unlock(inode);

	if (end_io) {
		int err;

		err = end_io(iocb, pos, retval, bh.b_private);
		if (err)
			retval = err;
	}

	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(inode);
	return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);
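/*
 * Sketch of a typical caller (illustrative only; "myfs" and myfs_get_block
 * are hypothetical).  A filesystem's ->direct_IO() method for a DAX inode
 * boils down to:
 *
 *	static ssize_t myfs_direct_IO(struct kiocb *iocb,
 *			struct iov_iter *iter)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *
 *		return dax_do_io(iocb, inode, iter, iocb->ki_pos,
 *				 myfs_get_block, NULL, DIO_LOCKING);
 *	}
 */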
/*
 * DAX radix tree locking
 */
struct exceptional_entry_key {
	struct address_space *mapping;
	unsigned long index;
};

struct wait_exceptional_entry_queue {
	wait_queue_t wait;
	struct exceptional_entry_key key;
};

static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
				       int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->mapping != ewait->key.mapping ||
	    key->index != ewait->key.index)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}
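/*
 * Note (added for clarity): many (mapping, index) pairs hash to the same
 * waitqueue head in wait_table, so wake_exceptional_entry_func() compares
 * the key and only wakes up waiters for the matching radix tree entry.
 */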
/*
 * Check whether the given slot is locked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline int slot_locked(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
	return entry & RADIX_DAX_ENTRY_LOCK;
}

/*
 * Mark the given slot as locked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry |= RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(slot, (void *)entry);
	return (void *)entry;
}

/*
 * Mark the given slot as unlocked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(slot, (void *)entry);
	return (void *)entry;
}
/*
 * Look up an entry in the radix tree and, if it is an exceptional entry, wait
 * for it to become unlocked before returning it.  The caller must call
 * put_unlocked_mapping_entry() when it decides not to lock the entry, or
 * put_locked_mapping_entry() when it has locked the entry and now wants to
 * unlock it.
 *
 * The function must be called with mapping->tree_lock held.
 */
static void *get_unlocked_mapping_entry(struct address_space *mapping,
					pgoff_t index, void ***slotp)
{
	void *ret, **slot;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;
	ewait.key.mapping = mapping;
	ewait.key.index = index;

	for (;;) {
		ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
					  &slot);
		if (!ret || !radix_tree_exceptional_entry(ret) ||
		    !slot_locked(mapping, slot)) {
			if (slotp)
				*slotp = slot;
			return ret;
		}
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&mapping->tree_lock);
		schedule();
		finish_wait(wq, &ewait.wait);
		spin_lock_irq(&mapping->tree_lock);
	}
}
/*
 * Find the radix tree entry at the given index.  If it points to a page,
 * return with the page locked.  If it points to an exceptional entry, return
 * with the radix tree entry locked.  If the radix tree doesn't contain the
 * given index, create an empty exceptional entry for the index and return
 * with it locked.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags.  For
 * persistent memory the benefit is doubtful.  We can add that later if we can
 * show it helps.
 */
static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *ret, **slot;

restart:
	spin_lock_irq(&mapping->tree_lock);
	ret = get_unlocked_mapping_entry(mapping, index, &slot);
	/* No entry for given index? Make sure radix tree is big enough. */
	if (!ret) {
		int err;

		spin_unlock_irq(&mapping->tree_lock);
		err = radix_tree_preload(
				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
		if (err)
			return ERR_PTR(err);
		ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
			       RADIX_DAX_ENTRY_LOCK);
		spin_lock_irq(&mapping->tree_lock);
		err = radix_tree_insert(&mapping->page_tree, index, ret);
		radix_tree_preload_end();
		if (err) {
			spin_unlock_irq(&mapping->tree_lock);
			/* Someone already created the entry? */
			if (err == -EEXIST)
				goto restart;
			return ERR_PTR(err);
		}
		/* Good, we have inserted empty locked entry into the tree. */
		mapping->nrexceptional++;
		spin_unlock_irq(&mapping->tree_lock);
		return ret;
	}
	/* Normal page in radix tree? */
	if (!radix_tree_exceptional_entry(ret)) {
		struct page *page = ret;

		get_page(page);
		spin_unlock_irq(&mapping->tree_lock);
		lock_page(page);
		/* Page got truncated? Retry... */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto restart;
		}
		return page;
	}
	ret = lock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

void dax_wake_mapping_entry_waiter(struct address_space *mapping,
				   pgoff_t index, bool wake_all)
{
	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under mapping->tree_lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq)) {
		struct exceptional_entry_key key;

		key.mapping = mapping;
		key.index = index;
		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
	}
}

static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *ret, **slot;

	spin_lock_irq(&mapping->tree_lock);
	ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
	if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
			 !slot_locked(mapping, slot))) {
		spin_unlock_irq(&mapping->tree_lock);
		return;
	}
	unlock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, false);
}

static void put_locked_mapping_entry(struct address_space *mapping,
				     pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry)) {
		unlock_page(entry);
		put_page(entry);
	} else {
		unlock_mapping_entry(mapping, index);
	}
}
/*
 * Called when we are done with a radix tree entry we looked up via
 * get_unlocked_mapping_entry() and which we didn't lock in the end.
 */
static void put_unlocked_mapping_entry(struct address_space *mapping,
				       pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry))
		return;

	/* We have to wake up next waiter for the radix tree entry lock */
	dax_wake_mapping_entry_waiter(mapping, index, false);
}
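/*
 * Sketch of the locking protocol built from the helpers above (added for
 * clarity, mirroring what dax_pfn_mkwrite() and grab_mapping_entry() do).
 * A reader that only inspects an entry does:
 *
 *	spin_lock_irq(&mapping->tree_lock);
 *	entry = get_unlocked_mapping_entry(mapping, index, NULL);
 *	...inspect entry...
 *	put_unlocked_mapping_entry(mapping, index, entry);
 *	spin_unlock_irq(&mapping->tree_lock);
 *
 * A path that needs exclusive ownership instead locks the entry (as
 * grab_mapping_entry() does via lock_slot()), drops tree_lock while it
 * works, and finally calls put_locked_mapping_entry(), which unlocks the
 * entry and wakes the next waiter.
 */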
/*
 * Delete the exceptional DAX entry at @index from @mapping.  Wait for the
 * radix tree entry to get unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *entry;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	/*
	 * This gets called from the truncate / punch_hole path.  As such, the
	 * caller must hold locks protecting against concurrent modifications
	 * of the radix tree (usually fs-private i_mmap_sem for writing).
	 * Since the caller has seen an exceptional entry for this index, we
	 * better find it at that index as well...
	 */
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
		spin_unlock_irq(&mapping->tree_lock);
		return 0;
	}
	radix_tree_delete(&mapping->page_tree, index);
	mapping->nrexceptional--;
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, true);

	return 1;
}

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, void *entry,
			 struct vm_fault *vmf)
{
	struct page *page;

	/* Hole page already exists? Return it... */
	if (!radix_tree_exceptional_entry(entry)) {
		vmf->page = entry;
		return VM_FAULT_LOCKED;
	}

	/* This will replace locked radix tree entry with a hole page */
	page = find_or_create_page(mapping, vmf->pgoff,
				   vmf->gfp_mask | __GFP_ZERO);
	if (!page) {
		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
		return VM_FAULT_OOM;
	}
	vmf->page = page;
	return VM_FAULT_LOCKED;
}

static int copy_user_bh(struct page *to, struct inode *inode,
		struct buffer_head *bh, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	struct block_device *bdev = bh->b_bdev;
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
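/*
 * Illustration (not part of the original file): DAX_PMD_INDEX() rounds a
 * page index down to the start of the PMD that covers it.  On x86_64 with
 * 2MB PMDs and 4K pages, PMD_MASK >> PAGE_SHIFT clears the low 9 index
 * bits, so DAX_PMD_INDEX(517) == 512.
 */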
static void *dax_insert_mapping_entry(struct address_space *mapping,
				      struct vm_fault *vmf,
				      void *entry, sector_t sector)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int error = 0;
	bool hole_fill = false;
	void *new_entry;
	pgoff_t index = vmf->pgoff;

	if (vmf->flags & FAULT_FLAG_WRITE)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	/* Replacing hole page with block mapping? */
	if (!radix_tree_exceptional_entry(entry)) {
		hole_fill = true;
		/*
		 * Unmap the page now before we remove it from page cache below.
		 * The page is locked so it cannot be faulted in again.
		 */
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
				    PAGE_SIZE, 0);
		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
		if (error)
			return ERR_PTR(error);
	}

	spin_lock_irq(&mapping->tree_lock);
	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
			     RADIX_DAX_ENTRY_LOCK);
	if (hole_fill) {
		__delete_from_page_cache(entry, NULL);
		/* Drop pagecache reference */
		put_page(entry);
		error = radix_tree_insert(page_tree, index, new_entry);
		if (error) {
			new_entry = ERR_PTR(error);
			goto unlock;
		}
		mapping->nrexceptional++;
	} else {
		void **slot;
		void *ret;

		ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
		WARN_ON_ONCE(ret != entry);
		radix_tree_replace_slot(slot, new_entry);
	}
	if (vmf->flags & FAULT_FLAG_WRITE)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
unlock:
	spin_unlock_irq(&mapping->tree_lock);
	if (hole_fill) {
		radix_tree_preload_end();
		/*
		 * We don't need hole page anymore, it has been replaced with
		 * locked radix tree entry now.
		 */
		if (mapping->a_ops->freepage)
			mapping->a_ops->freepage(entry);
		unlock_page(entry);
		put_page(entry);
	}
	return new_entry;
}
static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int type = RADIX_DAX_TYPE(entry);
	struct radix_tree_node *node;
	struct blk_dax_ctl dax;
	void **slot;
	int ret = 0;

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * Regular page slots are stabilized by the page lock even
	 * without the tree itself locked.  These unlocked entries
	 * need verification under the tree lock.
	 */
	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
		goto unlock;
	if (*slot != entry)
		goto unlock;

	/* another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto unlock;

	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
		ret = -EIO;
		goto unlock;
	}

	dax.sector = RADIX_DAX_SECTOR(entry);
	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0)
		return ret;

	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	wb_cache_pmem(dax.addr, dax.size);

	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);
unmap:
	dax_unmap_atomic(bdev, &dax);
	return ret;

unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}
/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end].  This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct block_device *bdev, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	pgoff_t start_index, end_index, pmd_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;
	void *entry;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	start_index = wbc->range_start >> PAGE_SHIFT;
	end_index = wbc->range_end >> PAGE_SHIFT;
	pmd_index = DAX_PMD_INDEX(start_index);

	rcu_read_lock();
	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
	rcu_read_unlock();

	/* see if the start of our range is covered by a PMD entry */
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
		start_index = pmd_index;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	wmb_pmem();
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
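/*
 * Sketch of a typical caller (illustrative only; "myfs" is hypothetical).
 * Filesystems normally call this from their ->writepages() method so that
 * fsync()/msync() flush dirty DAX entries:
 *
 *	static int myfs_writepages(struct address_space *mapping,
 *			struct writeback_control *wbc)
 *	{
 *		return dax_writeback_mapping_range(mapping,
 *				mapping->host->i_sb->s_bdev, wbc);
 *	}
 */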
static int dax_insert_mapping(struct address_space *mapping,
			struct buffer_head *bh, void **entryp,
			struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	struct block_device *bdev = bh->b_bdev;
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, mapping->host),
		.size = bh->b_size,
	};
	int error;
	void *ret;
	void *entry = *entryp;

	i_mmap_lock_read(mapping);

	if (dax_map_atomic(bdev, &dax) < 0) {
		error = PTR_ERR(dax.addr);
		goto out;
	}
	dax_unmap_atomic(bdev, &dax);

	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
	if (IS_ERR(ret)) {
		error = PTR_ERR(ret);
		goto out;
	}
	*entryp = ret;

	error = vm_insert_mixed(vma, vaddr, dax.pfn);
 out:
	i_mmap_unlock_read(mapping);
	return error;
}

/**
 * __dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files. __dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			get_block_t get_block)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	void *entry;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	/*
	 * Check that the offset isn't beyond the end of the file.  The caller
	 * is supposed to hold locks serializing us with truncate / punch hole,
	 * so this is a reliable test.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;

	entry = grab_mapping_entry(mapping, vmf->pgoff);
	if (IS_ERR(entry)) {
		error = PTR_ERR(entry);
		goto out;
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock_entry;

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;
		if (buffer_written(&bh))
			error = copy_user_bh(new_page, inode, &bh, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock_entry;
		if (!radix_tree_exceptional_entry(entry)) {
			vmf->page = entry;
		} else {
			unlock_mapping_entry(mapping, vmf->pgoff);
			i_mmap_lock_read(mapping);
			vmf->page = NULL;
		}
		return VM_FAULT_LOCKED;
	}

	if (!buffer_mapped(&bh)) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock_entry;
		} else {
			return dax_load_hole(mapping, entry, vmf);
		}
	}

	/* Filesystem should not return unwritten buffers to us! */
	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
	error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
 unlock_entry:
	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;
}
EXPORT_SYMBOL(__dax_fault);
/**
 * dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files.
 */
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
	      get_block_t get_block)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (vmf->flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_fault(vma, vmf, get_block);
	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
EXPORT_SYMBOL_GPL(dax_fault);
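/*
 * Sketch (illustrative only; the "myfs" names are hypothetical) of how a
 * filesystem wires the DAX fault helpers into its vm_operations_struct:
 *
 *	static int myfs_dax_fault(struct vm_area_struct *vma,
 *			struct vm_fault *vmf)
 *	{
 *		return dax_fault(vma, vmf, myfs_get_block);
 *	}
 *
 *	static const struct vm_operations_struct myfs_dax_vm_ops = {
 *		.fault		= myfs_dax_fault,
 *		.pfn_mkwrite	= dax_pfn_mkwrite,
 *	};
 */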
#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
 * The 'colour' (i.e. low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below function.
 */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
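/*
 * Illustration (not part of the original file): on x86_64 with 2MB PMDs and
 * 4K pages, PG_PMD_COLOUR == 511 (0x1ff).  A page offset is PMD-aligned iff
 * (pgoff & PG_PMD_COLOUR) == 0, and (pgoff | PG_PMD_COLOUR) is the last page
 * index covered by the same PMD.
 */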
static void __dax_dbg(struct buffer_head *bh, unsigned long address,
		const char *reason, const char *fn)
{
	if (bh) {
		char bname[BDEVNAME_SIZE];
		bdevname(bh->b_bdev, bname);
		pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
			"length %zd fallback: %s\n", fn, current->comm,
			address, bname, bh->b_state, (u64)bh->b_blocknr,
			bh->b_size, reason);
	} else {
		pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
			current->comm, address, reason);
	}
}

#define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")

int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, get_block_t get_block)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct buffer_head bh;
	unsigned blkbits = inode->i_blkbits;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	struct block_device *bdev;
	pgoff_t size, pgoff;
	sector_t block;
	int result = 0;
	bool alloc = false;

	/* dax pmd mappings require pfn_t_devmap() */
	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
		return VM_FAULT_FALLBACK;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		split_huge_pmd(vma, pmd, address);
		dax_pmd_dbg(NULL, address, "cow write");
		return VM_FAULT_FALLBACK;
	}
	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start) {
		dax_pmd_dbg(NULL, address, "vma start unaligned");
		return VM_FAULT_FALLBACK;
	}
	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
		dax_pmd_dbg(NULL, address, "vma end unaligned");
		return VM_FAULT_FALLBACK;
	}

	pgoff = linear_page_index(vma, pmd_addr);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size)
		return VM_FAULT_SIGBUS;
	/* If the PMD would cover blocks out of the file */
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(NULL, address,
				"offset + huge page size > file size");
		return VM_FAULT_FALLBACK;
	}

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

	bh.b_size = PMD_SIZE;

	if (get_block(inode, block, &bh, 0) != 0)
		return VM_FAULT_SIGBUS;

	if (!buffer_mapped(&bh) && write) {
		if (get_block(inode, block, &bh, 1) != 0)
			return VM_FAULT_SIGBUS;
		alloc = true;
		WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
	}

	bdev = bh.b_bdev;

	/*
	 * If the filesystem isn't willing to tell us the length of a hole,
	 * just fall back to PTEs.  Calling get_block 512 times in a loop
	 * would be silly.
	 */
	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
		dax_pmd_dbg(&bh, address, "allocated block too small");
		return VM_FAULT_FALLBACK;
	}

	/*
	 * If we allocated new storage, make sure no process has any
	 * zero pages covering this hole
	 */
	if (alloc) {
		loff_t lstart = pgoff << PAGE_SHIFT;
		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */

		truncate_pagecache_range(inode, lstart, lend);
	}

	i_mmap_lock_read(mapping);

	if (!write && !buffer_mapped(&bh)) {
		spinlock_t *ptl;
		pmd_t entry;
		struct page *zero_page = get_huge_zero_page();

		if (unlikely(!zero_page)) {
			dax_pmd_dbg(&bh, address, "no zero page");
			goto fallback;
		}

		ptl = pmd_lock(vma->vm_mm, pmd);
		if (!pmd_none(*pmd)) {
			spin_unlock(ptl);
			dax_pmd_dbg(&bh, address, "pmd already present");
			goto fallback;
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
				__func__, current->comm, address,
				(unsigned long long) to_sector(&bh, inode));

		entry = mk_pmd(zero_page, vma->vm_page_prot);
		entry = pmd_mkhuge(entry);
		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
		result = VM_FAULT_NOPAGE;
		spin_unlock(ptl);
	} else {
		struct blk_dax_ctl dax = {
			.sector = to_sector(&bh, inode),
			.size = PMD_SIZE,
		};
		long length = dax_map_atomic(bdev, &dax);

		if (length < 0) {
			dax_pmd_dbg(&bh, address, "dax-error fallback");
			goto fallback;
		}
		if (length < PMD_SIZE) {
			dax_pmd_dbg(&bh, address, "dax-length too small");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}
		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
			dax_pmd_dbg(&bh, address, "pfn unaligned");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}

		if (!pfn_t_devmap(dax.pfn)) {
			dax_unmap_atomic(bdev, &dax);
			dax_pmd_dbg(&bh, address, "pfn not in memmap");
			goto fallback;
		}
		dax_unmap_atomic(bdev, &dax);

		/*
		 * For PTE faults we insert a radix tree entry for reads, and
		 * leave it clean.  Then on the first write we dirty the radix
		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
		 * call into get_block() to translate the pgoff to a sector in
		 * order to be able to create a new radix tree entry.
                 *
                 * The PMD path doesn't have an equivalent to
                 * dax_pfn_mkwrite(), though, so for a read followed by a
                 * write we traverse all the way through __dax_pmd_fault()
                 * twice.  This means we can just skip inserting a radix tree
                 * entry completely on the initial read and just wait until
                 * the write to insert a dirty entry.
                 */
                if (write) {
                        /*
                         * We should insert radix-tree entry and dirty it here.
                         * For now this is broken...
                         */
                }

                dev_dbg(part_to_dev(bdev->bd_part),
                                "%s: %s addr: %lx pfn: %lx sect: %llx\n",
                                __func__, current->comm, address,
                                pfn_t_to_pfn(dax.pfn),
                                (unsigned long long) dax.sector);
                result |= vmf_insert_pfn_pmd(vma, address, pmd,
                                dax.pfn, write);
        }

 out:
        i_mmap_unlock_read(mapping);

        return result;

 fallback:
        count_vm_event(THP_FAULT_FALLBACK);
        result = VM_FAULT_FALLBACK;
        goto out;
}
EXPORT_SYMBOL_GPL(__dax_pmd_fault);
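
/*
 * __dax_pmd_fault() is exported for filesystems that take their own
 * locks and do their own pagefault accounting around the call.  A
 * minimal sketch of such a caller, loosely modeled on XFS-style
 * wrappers of this era; the example_* lock and get_block names are
 * illustrative, not part of this file:
 *
 *      static int example_filemap_pmd_fault(struct vm_area_struct *vma,
 *                      unsigned long addr, pmd_t *pmd, unsigned int flags)
 *      {
 *              struct inode *inode = file_inode(vma->vm_file);
 *              int ret;
 *
 *              if (flags & FAULT_FLAG_WRITE) {
 *                      sb_start_pagefault(inode->i_sb);
 *                      file_update_time(vma->vm_file);
 *              }
 *              example_ilock_shared(inode);    // fs-private mmap lock
 *              ret = __dax_pmd_fault(vma, addr, pmd, flags,
 *                              example_get_block);
 *              example_iunlock_shared(inode);
 *              if (flags & FAULT_FLAG_WRITE)
 *                      sb_end_pagefault(inode->i_sb);
 *
 *              return ret;
 *      }
 */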

/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @address: The virtual address that faulted
 * @pmd: The PMD in the VMA's page table to fill in
 * @flags: The fault flags (FAULT_FLAG_WRITE indicates a write fault)
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                pmd_t *pmd, unsigned int flags, get_block_t get_block)
{
        int result;
        struct super_block *sb = file_inode(vma->vm_file)->i_sb;

        if (flags & FAULT_FLAG_WRITE) {
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
        }
        result = __dax_pmd_fault(vma, address, pmd, flags, get_block);
        if (flags & FAULT_FLAG_WRITE)
                sb_end_pagefault(sb);

        return result;
}
EXPORT_SYMBOL_GPL(dax_pmd_fault);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 *
 * We tag the radix tree entry for the page dirty so that
 * dax_writeback_mapping_range() will later flush it to media.
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        void *entry;
        pgoff_t index = vmf->pgoff;

        spin_lock_irq(&mapping->tree_lock);
        entry = get_unlocked_mapping_entry(mapping, index, NULL);
        if (!entry || !radix_tree_exceptional_entry(entry))
                goto out;
        radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
        put_unlocked_mapping_entry(mapping, index, entry);
out:
        spin_unlock_irq(&mapping->tree_lock);
        return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
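
/*
 * A minimal sketch of how a filesystem might wire these fault helpers
 * into its mmap path, modeled on the ext2/ext4-style callers of this
 * era; the example_* names are illustrative wrappers that take the
 * filesystem's own locks before calling dax_fault(), dax_pmd_fault()
 * and dax_pfn_mkwrite():
 *
 *      static const struct vm_operations_struct example_dax_vm_ops = {
 *              .fault          = example_dax_fault,
 *              .pmd_fault      = example_dax_pmd_fault,
 *              .page_mkwrite   = example_dax_fault,
 *              .pfn_mkwrite    = example_dax_pfn_mkwrite,
 *      };
 *
 *      static int example_file_mmap(struct file *file,
 *                      struct vm_area_struct *vma)
 *      {
 *              if (!IS_DAX(file_inode(file)))
 *                      return generic_file_mmap(file, vma);
 *
 *              file_accessed(file);
 *              vma->vm_ops = &example_dax_vm_ops;
 *              vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
 *              return 0;
 *      }
 */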

static bool dax_range_is_aligned(struct block_device *bdev,
                                 unsigned int offset, unsigned int length)
{
        unsigned short sector_size = bdev_logical_block_size(bdev);

        if (!IS_ALIGNED(offset, sector_size))
                return false;
        if (!IS_ALIGNED(length, sector_size))
                return false;

        return true;
}

int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
                unsigned int offset, unsigned int length)
{
        struct blk_dax_ctl dax = {
                .sector = sector,
                .size = PAGE_SIZE,
        };

        if (dax_range_is_aligned(bdev, offset, length)) {
                sector_t start_sector = dax.sector + (offset >> 9);

                return blkdev_issue_zeroout(bdev, start_sector,
                                length >> 9, GFP_NOFS, true);
        } else {
                if (dax_map_atomic(bdev, &dax) < 0)
                        return PTR_ERR(dax.addr);
                clear_pmem(dax.addr + offset, length);
                wmb_pmem();
                dax_unmap_atomic(bdev, &dax);
        }
        return 0;
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);

/**
 * dax_zero_page_range - zero a range within a page of a DAX file
 * @inode: The file in which the range is being zeroed
 * @from: The file offset where zeroing begins
 * @length: The number of bytes to zero
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * This function can be called by a filesystem when it is zeroing part of a
 * page in a DAX file.  This is intended for hole-punch operations.  If
 * you are truncating a file, the helper function dax_truncate_page() may be
 * more convenient.
 */
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
                get_block_t get_block)
{
        struct buffer_head bh;
        pgoff_t index = from >> PAGE_SHIFT;
        unsigned offset = from & (PAGE_SIZE-1);
        int err;

        /* Block boundary? Nothing to do */
        if (!length)
                return 0;
        BUG_ON((offset + length) > PAGE_SIZE);

        memset(&bh, 0, sizeof(bh));
        bh.b_bdev = inode->i_sb->s_bdev;
        bh.b_size = PAGE_SIZE;
        err = get_block(inode, index, &bh, 0);
        if (err < 0 || !buffer_written(&bh))
                return err;

        return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
                        offset, length);
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);
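
/*
 * A minimal usage sketch: a filesystem's hole-punch path might zero the
 * partial page at each end of the punched range with this helper,
 * modeled on pre-iomap ext4-style callers; example_get_block is
 * illustrative:
 *
 *      if (IS_DAX(inode))
 *              err = dax_zero_page_range(inode, from, length,
 *                              example_get_block);
 */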

/**
 * dax_truncate_page - handle a partial page being truncated in a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * Similar to block_truncate_page(), this function can be called by a
 * filesystem when it is truncating a DAX file to handle the partial page.
 */
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
        unsigned length = PAGE_ALIGN(from) - from;
        return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
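
/*
 * A minimal usage sketch: a filesystem's setattr/truncate path might
 * zero the tail of the final partial page before shrinking i_size,
 * modeled on ext2-style callers; example_setsize and example_get_block
 * are illustrative:
 *
 *      static int example_setsize(struct inode *inode, loff_t newsize)
 *      {
 *              int error;
 *
 *              if (IS_DAX(inode))
 *                      error = dax_truncate_page(inode, newsize,
 *                                      example_get_block);
 *              else
 *                      error = block_truncate_page(inode->i_mapping,
 *                                      newsize, example_get_block);
 *              if (error)
 *                      return error;
 *              truncate_setsize(inode, newsize);
 *              return 0;
 *      }
 */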