/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
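
/*
 * dax_map_atomic() translates dax->sector/dax->size into a kernel virtual
 * address and pfn via bdev_direct_access(), taking a reference on the
 * request queue with blk_queue_enter() so the device cannot go away while
 * the mapping is in use.  Every successful call must be paired with
 * dax_unmap_atomic(), which drops that reference; on failure dax->addr is
 * left as an ERR_PTR() and dax_unmap_atomic() becomes a no-op.
 */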
static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
        struct request_queue *q = bdev->bd_queue;
        long rc = -EIO;

        dax->addr = (void __pmem *) ERR_PTR(-EIO);
        if (blk_queue_enter(q, true) != 0)
                return rc;

        rc = bdev_direct_access(bdev, dax);
        if (rc < 0) {
                dax->addr = (void __pmem *) ERR_PTR(rc);
                blk_queue_exit(q);
                return rc;
        }
        return rc;
}

static void dax_unmap_atomic(struct block_device *bdev,
                const struct blk_dax_ctl *dax)
{
        if (IS_ERR(dax->addr))
                return;
        blk_queue_exit(bdev->bd_queue);
}

/*
 * dax_clear_blocks() is called from within transaction context from XFS,
 * and hence this means the stack from this point must follow GFP_NOFS
 * semantics for all operations.
 */
int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
{
        struct block_device *bdev = inode->i_sb->s_bdev;
        struct blk_dax_ctl dax = {
                .sector = block << (inode->i_blkbits - 9),
                .size = _size,
        };

        might_sleep();
        do {
                long count, sz;

                count = dax_map_atomic(bdev, &dax);
                if (count < 0)
                        return count;
                sz = min_t(long, count, SZ_128K);
                clear_pmem(dax.addr, sz);
                dax.size -= sz;
                dax.sector += sz / 512;
                dax_unmap_atomic(bdev, &dax);
                cond_resched();
        } while (dax.size);

        wmb_pmem();
        return 0;
}
EXPORT_SYMBOL_GPL(dax_clear_blocks);

/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
                loff_t pos, loff_t end)
{
        loff_t final = end - pos + first; /* The final byte of the buffer */

        if (first > 0)
                clear_pmem(addr, first);
        if (final < size)
                clear_pmem(addr + final, size - final);
}

static bool buffer_written(struct buffer_head *bh)
{
        return buffer_mapped(bh) && !buffer_unwritten(bh);
}

/*
 * When ext4 encounters a hole, it returns without modifying the buffer_head
 * which means that we can't trust b_size.  To cope with this, we set b_state
 * to 0 before calling get_block and, if any bit is set, we know we can trust
 * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
 * and would save us time calling get_block repeatedly.
 */
static bool buffer_size_valid(struct buffer_head *bh)
{
        return bh->b_state != 0;
}

static sector_t to_sector(const struct buffer_head *bh,
                const struct inode *inode)
{
        sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);

        return sector;
}
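
/*
 * The core I/O loop: walk [start, end), calling get_block() to map each
 * extent and dax_map_atomic() to obtain a direct pointer to the media.
 * Writes go through copy_from_iter_pmem() and are made durable with a
 * single wmb_pmem() at the end; reads from holes are satisfied with
 * iov_iter_zero() without touching the device.  Newly allocated or
 * unwritten blocks are zeroed around the I/O range by dax_new_buf().
 */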
static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
                loff_t start, loff_t end, get_block_t get_block,
                struct buffer_head *bh)
{
        loff_t pos = start, max = start, bh_max = start;
        bool hole = false, need_wmb = false;
        struct block_device *bdev = NULL;
        int rw = iov_iter_rw(iter), rc;
        long map_len = 0;
        struct blk_dax_ctl dax = {
                .addr = (void __pmem *) ERR_PTR(-EIO),
        };

        if (rw == READ)
                end = min(end, i_size_read(inode));

        while (pos < end) {
                size_t len;
                if (pos == max) {
                        unsigned blkbits = inode->i_blkbits;
                        long page = pos >> PAGE_SHIFT;
                        sector_t block = page << (PAGE_SHIFT - blkbits);
                        unsigned first = pos - (block << blkbits);
                        long size;

                        if (pos == bh_max) {
                                bh->b_size = PAGE_ALIGN(end - pos);
                                bh->b_state = 0;
                                rc = get_block(inode, block, bh, rw == WRITE);
                                if (rc)
                                        break;
                                if (!buffer_size_valid(bh))
                                        bh->b_size = 1 << blkbits;
                                bh_max = pos - first + bh->b_size;
                                bdev = bh->b_bdev;
                        } else {
                                unsigned done = bh->b_size -
                                                (bh_max - (pos - first));
                                bh->b_blocknr += done >> blkbits;
                                bh->b_size -= done;
                        }

                        hole = rw == READ && !buffer_written(bh);
                        if (hole) {
                                size = bh->b_size - first;
                        } else {
                                dax_unmap_atomic(bdev, &dax);
                                dax.sector = to_sector(bh, inode);
                                dax.size = bh->b_size;
                                map_len = dax_map_atomic(bdev, &dax);
                                if (map_len < 0) {
                                        rc = map_len;
                                        break;
                                }
                                if (buffer_unwritten(bh) || buffer_new(bh)) {
                                        dax_new_buf(dax.addr, map_len, first,
                                                        pos, end);
                                        need_wmb = true;
                                }
                                dax.addr += first;
                                size = map_len - first;
                        }
                        max = min(pos + size, end);
                }

                if (iov_iter_rw(iter) == WRITE) {
                        len = copy_from_iter_pmem(dax.addr, max - pos, iter);
                        need_wmb = true;
                } else if (!hole)
                        len = copy_to_iter((void __force *) dax.addr, max - pos,
                                        iter);
                else
                        len = iov_iter_zero(max - pos, iter);

                if (!len) {
                        rc = -EFAULT;
                        break;
                }

                pos += len;
                if (!IS_ERR(dax.addr))
                        dax.addr += len;
        }

        if (need_wmb)
                wmb_pmem();
        dax_unmap_atomic(bdev, &dax);

        return (pos == start) ? rc : pos - start;
}

/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @pos: The file offset where the I/O starts
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
                struct iov_iter *iter, loff_t pos, get_block_t get_block,
                dio_iodone_t end_io, int flags)
{
        struct buffer_head bh;
        ssize_t retval = -EINVAL;
        loff_t end = pos + iov_iter_count(iter);

        memset(&bh, 0, sizeof(bh));

        if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
                struct address_space *mapping = inode->i_mapping;
                mutex_lock(&inode->i_mutex);
                retval = filemap_write_and_wait_range(mapping, pos, end - 1);
                if (retval) {
                        mutex_unlock(&inode->i_mutex);
                        goto out;
                }
        }

        /* Protects against truncate */
        if (!(flags & DIO_SKIP_DIO_COUNT))
                inode_dio_begin(inode);

        retval = dax_io(inode, iter, pos, end, get_block, &bh);

        if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
                mutex_unlock(&inode->i_mutex);

        if ((retval > 0) && end_io)
                end_io(iocb, pos, retval, bh.b_private);

        if (!(flags & DIO_SKIP_DIO_COUNT))
                inode_dio_end(inode);
 out:
        return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);
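
/*
 * Illustrative caller (a sketch only, not part of this file): a
 * filesystem's ->direct_IO method can simply forward to dax_do_io() once
 * it has a get_block callback; ext2, for instance, follows this pattern.
 * fs_dax_direct_IO() and fs_get_block() below are hypothetical names:
 *
 *	static ssize_t fs_dax_direct_IO(struct kiocb *iocb,
 *			struct iov_iter *iter, loff_t offset)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *
 *		return dax_do_io(iocb, inode, iter, offset, fs_get_block,
 *				 NULL, DIO_LOCKING);
 *	}
 */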

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, struct page *page,
                struct vm_fault *vmf)
{
        unsigned long size;
        struct inode *inode = mapping->host;
        if (!page)
                page = find_or_create_page(mapping, vmf->pgoff,
                                GFP_KERNEL | __GFP_ZERO);
        if (!page)
                return VM_FAULT_OOM;
        /* Recheck i_size under page lock to avoid truncate race */
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (vmf->pgoff >= size) {
                unlock_page(page);
                page_cache_release(page);
                return VM_FAULT_SIGBUS;
        }

        vmf->page = page;
        return VM_FAULT_LOCKED;
}
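
/*
 * Copy one page of data from the DAX media backing @bh into @to, for use
 * when a COW fault needs a private copy of the file contents.
 */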
static int copy_user_bh(struct page *to, struct inode *inode,
                struct buffer_head *bh, unsigned long vaddr)
{
        struct blk_dax_ctl dax = {
                .sector = to_sector(bh, inode),
                .size = bh->b_size,
        };
        struct block_device *bdev = bh->b_bdev;
        void *vto;

        if (dax_map_atomic(bdev, &dax) < 0)
                return PTR_ERR(dax.addr);
        vto = kmap_atomic(to);
        copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
        kunmap_atomic(vto);
        dax_unmap_atomic(bdev, &dax);
        return 0;
}
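
/*
 * Install a PTE for the pfn backing @bh at the faulting address.  New and
 * unwritten blocks are zeroed (and the zeroes flushed with wmb_pmem())
 * before they become visible to userspace, and i_mmap_lock is held across
 * the i_size recheck and the insert to synchronise against truncate.
 */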
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
                struct vm_area_struct *vma, struct vm_fault *vmf)
{
        unsigned long vaddr = (unsigned long)vmf->virtual_address;
        struct address_space *mapping = inode->i_mapping;
        struct block_device *bdev = bh->b_bdev;
        struct blk_dax_ctl dax = {
                .sector = to_sector(bh, inode),
                .size = bh->b_size,
        };
        pgoff_t size;
        int error;

        i_mmap_lock_read(mapping);

        /*
         * Check truncate didn't happen while we were allocating a block.
         * If it did, this block may or may not be still allocated to the
         * file.  We can't tell the filesystem to free it because we can't
         * take i_mutex here.  In the worst case, the file still has blocks
         * allocated past the end of the file.
         */
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (unlikely(vmf->pgoff >= size)) {
                error = -EIO;
                goto out;
        }

        if (dax_map_atomic(bdev, &dax) < 0) {
                error = PTR_ERR(dax.addr);
                goto out;
        }

        if (buffer_unwritten(bh) || buffer_new(bh)) {
                clear_pmem(dax.addr, PAGE_SIZE);
                wmb_pmem();
        }
        dax_unmap_atomic(bdev, &dax);

        error = vm_insert_mixed(vma, vaddr, pfn_t_to_pfn(dax.pfn));

 out:
        i_mmap_unlock_read(mapping);

        return error;
}

/**
 * __dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written so the data written to them is exposed.  This is required
 *	by write faults for filesystems that will return unwritten extent
 *	mappings from @get_block, but it is optional for reads as
 *	dax_insert_mapping() will always zero unwritten blocks.  If the fs
 *	does not support unwritten extents, then it should pass NULL.
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files. __dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                get_block_t get_block, dax_iodone_t complete_unwritten)
{
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        struct page *page;
        struct buffer_head bh;
        unsigned long vaddr = (unsigned long)vmf->virtual_address;
        unsigned blkbits = inode->i_blkbits;
        sector_t block;
        pgoff_t size;
        int error;
        int major = 0;

        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (vmf->pgoff >= size)
                return VM_FAULT_SIGBUS;

        memset(&bh, 0, sizeof(bh));
        block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
        bh.b_size = PAGE_SIZE;

 repeat:
        page = find_get_page(mapping, vmf->pgoff);
        if (page) {
                if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
                        page_cache_release(page);
                        return VM_FAULT_RETRY;
                }
                if (unlikely(page->mapping != mapping)) {
                        unlock_page(page);
                        page_cache_release(page);
                        goto repeat;
                }
                size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
                if (unlikely(vmf->pgoff >= size)) {
                        /*
                         * We have a struct page covering a hole in the file
                         * from a read fault and we've raced with a truncate
                         */
                        error = -EIO;
                        goto unlock_page;
                }
        }

        error = get_block(inode, block, &bh, 0);
        if (!error && (bh.b_size < PAGE_SIZE))
                error = -EIO;		/* fs corruption? */
        if (error)
                goto unlock_page;

        if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
                if (vmf->flags & FAULT_FLAG_WRITE) {
                        error = get_block(inode, block, &bh, 1);
                        count_vm_event(PGMAJFAULT);
                        mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
                        major = VM_FAULT_MAJOR;
                        if (!error && (bh.b_size < PAGE_SIZE))
                                error = -EIO;
                        if (error)
                                goto unlock_page;
                } else {
                        return dax_load_hole(mapping, page, vmf);
                }
        }

        if (vmf->cow_page) {
                struct page *new_page = vmf->cow_page;
                if (buffer_written(&bh))
                        error = copy_user_bh(new_page, inode, &bh, vaddr);
                else
                        clear_user_highpage(new_page, vaddr);
                if (error)
                        goto unlock_page;
                vmf->page = page;
                if (!page) {
                        i_mmap_lock_read(mapping);
                        /* Check we didn't race with truncate */
                        size = (i_size_read(inode) + PAGE_SIZE - 1) >>
                                                                PAGE_SHIFT;
                        if (vmf->pgoff >= size) {
                                i_mmap_unlock_read(mapping);
                                error = -EIO;
                                goto out;
                        }
                }
                return VM_FAULT_LOCKED;
        }

        /* Check we didn't race with a read fault installing a new page */
        if (!page && major)
                page = find_lock_page(mapping, vmf->pgoff);

        if (page) {
                unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
                                                        PAGE_CACHE_SIZE, 0);
                delete_from_page_cache(page);
                unlock_page(page);
                page_cache_release(page);
        }

        /*
         * If we successfully insert the new mapping over an unwritten extent,
         * we need to ensure we convert the unwritten extent. If there is an
         * error inserting the mapping, the filesystem needs to leave it as
         * unwritten to prevent exposure of the stale underlying data to
         * userspace, but we still need to call the completion function so
         * the private resources on the mapping buffer can be released. We
         * indicate what the callback should do via the uptodate variable, same
         * as for normal BH based IO completions.
         */
        error = dax_insert_mapping(inode, &bh, vma, vmf);
        if (buffer_unwritten(&bh)) {
                if (complete_unwritten)
                        complete_unwritten(&bh, !error);
                else
                        WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
        }

 out:
        if (error == -ENOMEM)
                return VM_FAULT_OOM | major;
        /* -EBUSY is fine, somebody else faulted on the same PTE */
        if ((error < 0) && (error != -EBUSY))
                return VM_FAULT_SIGBUS | major;
        return VM_FAULT_NOPAGE | major;

 unlock_page:
        if (page) {
                unlock_page(page);
                page_cache_release(page);
        }
        goto out;
}
EXPORT_SYMBOL(__dax_fault);

/**
 * dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten
 *	extents to written; see __dax_fault()
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files.
 */
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                get_block_t get_block, dax_iodone_t complete_unwritten)
{
        int result;
        struct super_block *sb = file_inode(vma->vm_file)->i_sb;

        if (vmf->flags & FAULT_FLAG_WRITE) {
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
        }
        result = __dax_fault(vma, vmf, get_block, complete_unwritten);
        if (vmf->flags & FAULT_FLAG_WRITE)
                sb_end_pagefault(sb);

        return result;
}
EXPORT_SYMBOL_GPL(dax_fault);
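
/*
 * Illustrative wiring (a sketch only, not part of this file): a filesystem
 * typically exposes these helpers through its vm_operations_struct.
 * fs_dax_fault() and fs_get_block() are hypothetical names, and passing
 * NULL for complete_unwritten assumes the filesystem never returns
 * unwritten extents from its get_block callback:
 *
 *	static int fs_dax_fault(struct vm_area_struct *vma,
 *			struct vm_fault *vmf)
 *	{
 *		return dax_fault(vma, vmf, fs_get_block, NULL);
 *	}
 *
 *	static const struct vm_operations_struct fs_dax_vm_ops = {
 *		.fault		= fs_dax_fault,
 *		.page_mkwrite	= fs_dax_fault,
 *		.pfn_mkwrite	= dax_pfn_mkwrite,
 *	};
 */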

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below function.
 */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
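
/*
 * For example, on a configuration with 4K pages and 2M PMDs (the x86-64
 * defaults), PMD_SIZE >> PAGE_SHIFT == 512, so PG_PMD_COLOUR == 511 and a
 * page offset can only sit at the start of a PMD mapping if its low nine
 * bits are clear.
 */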

int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                pmd_t *pmd, unsigned int flags, get_block_t get_block,
                dax_iodone_t complete_unwritten)
{
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        struct buffer_head bh;
        unsigned blkbits = inode->i_blkbits;
        unsigned long pmd_addr = address & PMD_MASK;
        bool write = flags & FAULT_FLAG_WRITE;
        struct block_device *bdev;
        pgoff_t size, pgoff;
        sector_t block;
        int result = 0;

        /* dax pmd mappings are broken wrt gup and fork */
        if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
                return VM_FAULT_FALLBACK;

        /* Fall back to PTEs if we're going to COW */
        if (write && !(vma->vm_flags & VM_SHARED)) {
                split_huge_pmd(vma, pmd, address);
                return VM_FAULT_FALLBACK;
        }
        /* If the PMD would extend outside the VMA */
        if (pmd_addr < vma->vm_start)
                return VM_FAULT_FALLBACK;
        if ((pmd_addr + PMD_SIZE) > vma->vm_end)
                return VM_FAULT_FALLBACK;

        pgoff = linear_page_index(vma, pmd_addr);
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (pgoff >= size)
                return VM_FAULT_SIGBUS;
        /* If the PMD would cover blocks out of the file */
        if ((pgoff | PG_PMD_COLOUR) >= size)
                return VM_FAULT_FALLBACK;

        memset(&bh, 0, sizeof(bh));
        block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

        bh.b_size = PMD_SIZE;
        if (get_block(inode, block, &bh, write) != 0)
                return VM_FAULT_SIGBUS;
        bdev = bh.b_bdev;
        i_mmap_lock_read(mapping);

        /*
         * If the filesystem isn't willing to tell us the length of a hole,
         * just fall back to PTEs.  Calling get_block 512 times in a loop
         * would be silly.
         */
        if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
                goto fallback;

        /*
         * If we allocated new storage, make sure no process has any
         * zero pages covering this hole
         */
        if (buffer_new(&bh)) {
                i_mmap_unlock_read(mapping);
                unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
                i_mmap_lock_read(mapping);
        }

        /*
         * If a truncate happened while we were allocating blocks, we may
         * leave blocks allocated to the file that are beyond EOF.  We can't
         * take i_mutex here, so just leave them hanging; they'll be freed
         * when the file is deleted.
         */
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (pgoff >= size) {
                result = VM_FAULT_SIGBUS;
                goto out;
        }
        if ((pgoff | PG_PMD_COLOUR) >= size)
                goto fallback;

        if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
                spinlock_t *ptl;
                pmd_t entry;
                struct page *zero_page = get_huge_zero_page();

                if (unlikely(!zero_page))
                        goto fallback;

                ptl = pmd_lock(vma->vm_mm, pmd);
                if (!pmd_none(*pmd)) {
                        spin_unlock(ptl);
                        goto fallback;
                }

                entry = mk_pmd(zero_page, vma->vm_page_prot);
                entry = pmd_mkhuge(entry);
                set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
                result = VM_FAULT_NOPAGE;
                spin_unlock(ptl);
        } else {
                struct blk_dax_ctl dax = {
                        .sector = to_sector(&bh, inode),
                        .size = PMD_SIZE,
                };
                long length = dax_map_atomic(bdev, &dax);

                if (length < 0) {
                        result = VM_FAULT_SIGBUS;
                        goto out;
                }
                if (length < PMD_SIZE
                                || (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)) {
                        dax_unmap_atomic(bdev, &dax);
                        goto fallback;
                }

                /*
                 * TODO: teach vmf_insert_pfn_pmd() to support
                 * 'pte_special' for pmds
                 */
                if (pfn_t_has_page(dax.pfn)) {
                        dax_unmap_atomic(bdev, &dax);
                        goto fallback;
                }

                if (buffer_unwritten(&bh) || buffer_new(&bh)) {
                        clear_pmem(dax.addr, PMD_SIZE);
                        wmb_pmem();
                        count_vm_event(PGMAJFAULT);
                        mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
                        result |= VM_FAULT_MAJOR;
                }
                dax_unmap_atomic(bdev, &dax);

                result |= vmf_insert_pfn_pmd(vma, address, pmd,
                                pfn_t_to_pfn(dax.pfn), write);
        }

 out:
        i_mmap_unlock_read(mapping);

        if (buffer_unwritten(&bh))
                complete_unwritten(&bh, !(result & VM_FAULT_ERROR));

        return result;

 fallback:
        count_vm_event(THP_FAULT_FALLBACK);
        result = VM_FAULT_FALLBACK;
        goto out;
}
EXPORT_SYMBOL_GPL(__dax_pmd_fault);

/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @address: The virtual address that faulted
 * @pmd: Pointer to the PMD entry to be populated
 * @flags: Fault flags (FAULT_FLAG_WRITE etc.)
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten
 *	extents to written; see __dax_fault()
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                pmd_t *pmd, unsigned int flags, get_block_t get_block,
                dax_iodone_t complete_unwritten)
{
        int result;
        struct super_block *sb = file_inode(vma->vm_file)->i_sb;

        if (flags & FAULT_FLAG_WRITE) {
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
        }
        result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
                                complete_unwritten);
        if (flags & FAULT_FLAG_WRITE)
                sb_end_pagefault(sb);

        return result;
}
EXPORT_SYMBOL_GPL(dax_pmd_fault);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct super_block *sb = file_inode(vma->vm_file)->i_sb;

        sb_start_pagefault(sb);
        file_update_time(vma->vm_file);
        sb_end_pagefault(sb);
        return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);

/**
 * dax_zero_page_range - zero a range within a page of a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @length: The number of bytes to zero
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * This function can be called by a filesystem when it is zeroing part of a
 * page in a DAX file.  This is intended for hole-punch operations.  If
 * you are truncating a file, the helper function dax_truncate_page() may be
 * more convenient.
 *
 * We work in terms of PAGE_CACHE_SIZE here for commonality with
 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
 * took care of disposing of the unnecessary blocks.  Even if the filesystem
 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
 * since the file might be mmapped.
 */
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
                get_block_t get_block)
{
        struct buffer_head bh;
        pgoff_t index = from >> PAGE_CACHE_SHIFT;
        unsigned offset = from & (PAGE_CACHE_SIZE-1);
        int err;

        /* Block boundary? Nothing to do */
        if (!length)
                return 0;
        BUG_ON((offset + length) > PAGE_CACHE_SIZE);

        memset(&bh, 0, sizeof(bh));
        bh.b_size = PAGE_CACHE_SIZE;
        err = get_block(inode, index, &bh, 0);
        if (err < 0)
                return err;
        if (buffer_written(&bh)) {
                struct block_device *bdev = bh.b_bdev;
                struct blk_dax_ctl dax = {
                        .sector = to_sector(&bh, inode),
                        .size = PAGE_CACHE_SIZE,
                };

                if (dax_map_atomic(bdev, &dax) < 0)
                        return PTR_ERR(dax.addr);
                clear_pmem(dax.addr + offset, length);
                wmb_pmem();
                dax_unmap_atomic(bdev, &dax);
        }

        return 0;
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

/**
 * dax_truncate_page - handle a partial page being truncated in a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * Similar to block_truncate_page(), this function can be called by a
 * filesystem when it is truncating a DAX file to handle the partial page.
 *
 * We work in terms of PAGE_CACHE_SIZE here for commonality with
 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
 * took care of disposing of the unnecessary blocks.  Even if the filesystem
 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
 * since the file might be mmapped.
 */
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
        unsigned length = PAGE_CACHE_ALIGN(from) - from;
        return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
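
/*
 * Worked example (illustrative, assuming PAGE_CACHE_SIZE == 4096):
 * truncating to from == 0x11234 gives
 * length = PAGE_CACHE_ALIGN(0x11234) - 0x11234 = 0x12000 - 0x11234 = 0xdcc,
 * so dax_zero_page_range() zeroes the tail of the page at index 0x11 from
 * in-page offset 0x234 through the end of that page.
 */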