1d475c634SMatthew Wilcox /* 2d475c634SMatthew Wilcox * fs/dax.c - Direct Access filesystem code 3d475c634SMatthew Wilcox * Copyright (c) 2013-2014 Intel Corporation 4d475c634SMatthew Wilcox * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> 5d475c634SMatthew Wilcox * Author: Ross Zwisler <ross.zwisler@linux.intel.com> 6d475c634SMatthew Wilcox * 7d475c634SMatthew Wilcox * This program is free software; you can redistribute it and/or modify it 8d475c634SMatthew Wilcox * under the terms and conditions of the GNU General Public License, 9d475c634SMatthew Wilcox * version 2, as published by the Free Software Foundation. 10d475c634SMatthew Wilcox * 11d475c634SMatthew Wilcox * This program is distributed in the hope it will be useful, but WITHOUT 12d475c634SMatthew Wilcox * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13d475c634SMatthew Wilcox * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 14d475c634SMatthew Wilcox * more details. 15d475c634SMatthew Wilcox */ 16d475c634SMatthew Wilcox 17d475c634SMatthew Wilcox #include <linux/atomic.h> 18d475c634SMatthew Wilcox #include <linux/blkdev.h> 19d475c634SMatthew Wilcox #include <linux/buffer_head.h> 20d475c634SMatthew Wilcox #include <linux/fs.h> 21d475c634SMatthew Wilcox #include <linux/genhd.h> 22f7ca90b1SMatthew Wilcox #include <linux/highmem.h> 23f7ca90b1SMatthew Wilcox #include <linux/memcontrol.h> 24f7ca90b1SMatthew Wilcox #include <linux/mm.h> 25d475c634SMatthew Wilcox #include <linux/mutex.h> 26289c6aedSMatthew Wilcox #include <linux/sched.h> 27d475c634SMatthew Wilcox #include <linux/uio.h> 28f7ca90b1SMatthew Wilcox #include <linux/vmstat.h> 29d475c634SMatthew Wilcox 30289c6aedSMatthew Wilcox int dax_clear_blocks(struct inode *inode, sector_t block, long size) 31289c6aedSMatthew Wilcox { 32289c6aedSMatthew Wilcox struct block_device *bdev = inode->i_sb->s_bdev; 33289c6aedSMatthew Wilcox sector_t sector = block << (inode->i_blkbits - 9); 34289c6aedSMatthew Wilcox 35289c6aedSMatthew Wilcox might_sleep(); 36289c6aedSMatthew Wilcox do { 37289c6aedSMatthew Wilcox void *addr; 38289c6aedSMatthew Wilcox unsigned long pfn; 39289c6aedSMatthew Wilcox long count; 40289c6aedSMatthew Wilcox 41289c6aedSMatthew Wilcox count = bdev_direct_access(bdev, sector, &addr, &pfn, size); 42289c6aedSMatthew Wilcox if (count < 0) 43289c6aedSMatthew Wilcox return count; 44289c6aedSMatthew Wilcox BUG_ON(size < count); 45289c6aedSMatthew Wilcox while (count > 0) { 46289c6aedSMatthew Wilcox unsigned pgsz = PAGE_SIZE - offset_in_page(addr); 47289c6aedSMatthew Wilcox if (pgsz > count) 48289c6aedSMatthew Wilcox pgsz = count; 49289c6aedSMatthew Wilcox if (pgsz < PAGE_SIZE) 50289c6aedSMatthew Wilcox memset(addr, 0, pgsz); 51289c6aedSMatthew Wilcox else 52289c6aedSMatthew Wilcox clear_page(addr); 53289c6aedSMatthew Wilcox addr += pgsz; 54289c6aedSMatthew Wilcox size -= pgsz; 55289c6aedSMatthew Wilcox count -= pgsz; 56289c6aedSMatthew Wilcox BUG_ON(pgsz & 511); 57289c6aedSMatthew Wilcox sector += pgsz / 512; 58289c6aedSMatthew Wilcox cond_resched(); 59289c6aedSMatthew Wilcox } 60289c6aedSMatthew Wilcox } while (size); 61289c6aedSMatthew Wilcox 62289c6aedSMatthew Wilcox return 0; 63289c6aedSMatthew Wilcox } 64289c6aedSMatthew Wilcox EXPORT_SYMBOL_GPL(dax_clear_blocks); 65289c6aedSMatthew Wilcox 66d475c634SMatthew Wilcox static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) 67d475c634SMatthew Wilcox { 68d475c634SMatthew Wilcox unsigned long pfn; 69d475c634SMatthew Wilcox sector_t sector = bh->b_blocknr << (blkbits - 9); 70d475c634SMatthew Wilcox return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); 71d475c634SMatthew Wilcox } 72d475c634SMatthew Wilcox 73d475c634SMatthew Wilcox static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos, 74d475c634SMatthew Wilcox loff_t end) 75d475c634SMatthew Wilcox { 76d475c634SMatthew Wilcox loff_t final = end - pos + first; /* The final byte of the buffer */ 77d475c634SMatthew Wilcox 78d475c634SMatthew Wilcox if (first > 0) 79d475c634SMatthew Wilcox memset(addr, 0, first); 80d475c634SMatthew Wilcox if (final < size) 81d475c634SMatthew Wilcox memset(addr + final, 0, size - final); 82d475c634SMatthew Wilcox } 83d475c634SMatthew Wilcox 84d475c634SMatthew Wilcox static bool buffer_written(struct buffer_head *bh) 85d475c634SMatthew Wilcox { 86d475c634SMatthew Wilcox return buffer_mapped(bh) && !buffer_unwritten(bh); 87d475c634SMatthew Wilcox } 88d475c634SMatthew Wilcox 89d475c634SMatthew Wilcox /* 90d475c634SMatthew Wilcox * When ext4 encounters a hole, it returns without modifying the buffer_head 91d475c634SMatthew Wilcox * which means that we can't trust b_size. To cope with this, we set b_state 92d475c634SMatthew Wilcox * to 0 before calling get_block and, if any bit is set, we know we can trust 93d475c634SMatthew Wilcox * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is 94d475c634SMatthew Wilcox * and would save us time calling get_block repeatedly. 95d475c634SMatthew Wilcox */ 96d475c634SMatthew Wilcox static bool buffer_size_valid(struct buffer_head *bh) 97d475c634SMatthew Wilcox { 98d475c634SMatthew Wilcox return bh->b_state != 0; 99d475c634SMatthew Wilcox } 100d475c634SMatthew Wilcox 101d475c634SMatthew Wilcox static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter, 102d475c634SMatthew Wilcox loff_t start, loff_t end, get_block_t get_block, 103d475c634SMatthew Wilcox struct buffer_head *bh) 104d475c634SMatthew Wilcox { 105d475c634SMatthew Wilcox ssize_t retval = 0; 106d475c634SMatthew Wilcox loff_t pos = start; 107d475c634SMatthew Wilcox loff_t max = start; 108d475c634SMatthew Wilcox loff_t bh_max = start; 109d475c634SMatthew Wilcox void *addr; 110d475c634SMatthew Wilcox bool hole = false; 111d475c634SMatthew Wilcox 112d475c634SMatthew Wilcox if (rw != WRITE) 113d475c634SMatthew Wilcox end = min(end, i_size_read(inode)); 114d475c634SMatthew Wilcox 115d475c634SMatthew Wilcox while (pos < end) { 116d475c634SMatthew Wilcox unsigned len; 117d475c634SMatthew Wilcox if (pos == max) { 118d475c634SMatthew Wilcox unsigned blkbits = inode->i_blkbits; 119d475c634SMatthew Wilcox sector_t block = pos >> blkbits; 120d475c634SMatthew Wilcox unsigned first = pos - (block << blkbits); 121d475c634SMatthew Wilcox long size; 122d475c634SMatthew Wilcox 123d475c634SMatthew Wilcox if (pos == bh_max) { 124d475c634SMatthew Wilcox bh->b_size = PAGE_ALIGN(end - pos); 125d475c634SMatthew Wilcox bh->b_state = 0; 126d475c634SMatthew Wilcox retval = get_block(inode, block, bh, 127d475c634SMatthew Wilcox rw == WRITE); 128d475c634SMatthew Wilcox if (retval) 129d475c634SMatthew Wilcox break; 130d475c634SMatthew Wilcox if (!buffer_size_valid(bh)) 131d475c634SMatthew Wilcox bh->b_size = 1 << blkbits; 132d475c634SMatthew Wilcox bh_max = pos - first + bh->b_size; 133d475c634SMatthew Wilcox } else { 134d475c634SMatthew Wilcox unsigned done = bh->b_size - 135d475c634SMatthew Wilcox (bh_max - (pos - first)); 136d475c634SMatthew Wilcox bh->b_blocknr += done >> blkbits; 137d475c634SMatthew Wilcox bh->b_size -= done; 138d475c634SMatthew Wilcox } 139d475c634SMatthew Wilcox 140d475c634SMatthew Wilcox hole = (rw != WRITE) && !buffer_written(bh); 141d475c634SMatthew Wilcox if (hole) { 142d475c634SMatthew Wilcox addr = NULL; 143d475c634SMatthew Wilcox size = bh->b_size - first; 144d475c634SMatthew Wilcox } else { 145d475c634SMatthew Wilcox retval = dax_get_addr(bh, &addr, blkbits); 146d475c634SMatthew Wilcox if (retval < 0) 147d475c634SMatthew Wilcox break; 148d475c634SMatthew Wilcox if (buffer_unwritten(bh) || buffer_new(bh)) 149d475c634SMatthew Wilcox dax_new_buf(addr, retval, first, pos, 150d475c634SMatthew Wilcox end); 151d475c634SMatthew Wilcox addr += first; 152d475c634SMatthew Wilcox size = retval - first; 153d475c634SMatthew Wilcox } 154d475c634SMatthew Wilcox max = min(pos + size, end); 155d475c634SMatthew Wilcox } 156d475c634SMatthew Wilcox 157d475c634SMatthew Wilcox if (rw == WRITE) 158d475c634SMatthew Wilcox len = copy_from_iter(addr, max - pos, iter); 159d475c634SMatthew Wilcox else if (!hole) 160d475c634SMatthew Wilcox len = copy_to_iter(addr, max - pos, iter); 161d475c634SMatthew Wilcox else 162d475c634SMatthew Wilcox len = iov_iter_zero(max - pos, iter); 163d475c634SMatthew Wilcox 164d475c634SMatthew Wilcox if (!len) 165d475c634SMatthew Wilcox break; 166d475c634SMatthew Wilcox 167d475c634SMatthew Wilcox pos += len; 168d475c634SMatthew Wilcox addr += len; 169d475c634SMatthew Wilcox } 170d475c634SMatthew Wilcox 171d475c634SMatthew Wilcox return (pos == start) ? retval : pos - start; 172d475c634SMatthew Wilcox } 173d475c634SMatthew Wilcox 174d475c634SMatthew Wilcox /** 175d475c634SMatthew Wilcox * dax_do_io - Perform I/O to a DAX file 176d475c634SMatthew Wilcox * @rw: READ to read or WRITE to write 177d475c634SMatthew Wilcox * @iocb: The control block for this I/O 178d475c634SMatthew Wilcox * @inode: The file which the I/O is directed at 179d475c634SMatthew Wilcox * @iter: The addresses to do I/O from or to 180d475c634SMatthew Wilcox * @pos: The file offset where the I/O starts 181d475c634SMatthew Wilcox * @get_block: The filesystem method used to translate file offsets to blocks 182d475c634SMatthew Wilcox * @end_io: A filesystem callback for I/O completion 183d475c634SMatthew Wilcox * @flags: See below 184d475c634SMatthew Wilcox * 185d475c634SMatthew Wilcox * This function uses the same locking scheme as do_blockdev_direct_IO: 186d475c634SMatthew Wilcox * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the 187d475c634SMatthew Wilcox * caller for writes. For reads, we take and release the i_mutex ourselves. 188d475c634SMatthew Wilcox * If DIO_LOCKING is not set, the filesystem takes care of its own locking. 189d475c634SMatthew Wilcox * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O 190d475c634SMatthew Wilcox * is in progress. 191d475c634SMatthew Wilcox */ 192d475c634SMatthew Wilcox ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode, 193d475c634SMatthew Wilcox struct iov_iter *iter, loff_t pos, 194d475c634SMatthew Wilcox get_block_t get_block, dio_iodone_t end_io, int flags) 195d475c634SMatthew Wilcox { 196d475c634SMatthew Wilcox struct buffer_head bh; 197d475c634SMatthew Wilcox ssize_t retval = -EINVAL; 198d475c634SMatthew Wilcox loff_t end = pos + iov_iter_count(iter); 199d475c634SMatthew Wilcox 200d475c634SMatthew Wilcox memset(&bh, 0, sizeof(bh)); 201d475c634SMatthew Wilcox 202d475c634SMatthew Wilcox if ((flags & DIO_LOCKING) && (rw == READ)) { 203d475c634SMatthew Wilcox struct address_space *mapping = inode->i_mapping; 204d475c634SMatthew Wilcox mutex_lock(&inode->i_mutex); 205d475c634SMatthew Wilcox retval = filemap_write_and_wait_range(mapping, pos, end - 1); 206d475c634SMatthew Wilcox if (retval) { 207d475c634SMatthew Wilcox mutex_unlock(&inode->i_mutex); 208d475c634SMatthew Wilcox goto out; 209d475c634SMatthew Wilcox } 210d475c634SMatthew Wilcox } 211d475c634SMatthew Wilcox 212d475c634SMatthew Wilcox /* Protects against truncate */ 213d475c634SMatthew Wilcox atomic_inc(&inode->i_dio_count); 214d475c634SMatthew Wilcox 215d475c634SMatthew Wilcox retval = dax_io(rw, inode, iter, pos, end, get_block, &bh); 216d475c634SMatthew Wilcox 217d475c634SMatthew Wilcox if ((flags & DIO_LOCKING) && (rw == READ)) 218d475c634SMatthew Wilcox mutex_unlock(&inode->i_mutex); 219d475c634SMatthew Wilcox 220d475c634SMatthew Wilcox if ((retval > 0) && end_io) 221d475c634SMatthew Wilcox end_io(iocb, pos, retval, bh.b_private); 222d475c634SMatthew Wilcox 223d475c634SMatthew Wilcox inode_dio_done(inode); 224d475c634SMatthew Wilcox out: 225d475c634SMatthew Wilcox return retval; 226d475c634SMatthew Wilcox } 227d475c634SMatthew Wilcox EXPORT_SYMBOL_GPL(dax_do_io); 228f7ca90b1SMatthew Wilcox 229f7ca90b1SMatthew Wilcox /* 230f7ca90b1SMatthew Wilcox * The user has performed a load from a hole in the file. Allocating 231f7ca90b1SMatthew Wilcox * a new page in the file would cause excessive storage usage for 232f7ca90b1SMatthew Wilcox * workloads with sparse files. We allocate a page cache page instead. 233f7ca90b1SMatthew Wilcox * We'll kick it out of the page cache if it's ever written to, 234f7ca90b1SMatthew Wilcox * otherwise it will simply fall out of the page cache under memory 235f7ca90b1SMatthew Wilcox * pressure without ever having been dirtied. 236f7ca90b1SMatthew Wilcox */ 237f7ca90b1SMatthew Wilcox static int dax_load_hole(struct address_space *mapping, struct page *page, 238f7ca90b1SMatthew Wilcox struct vm_fault *vmf) 239f7ca90b1SMatthew Wilcox { 240f7ca90b1SMatthew Wilcox unsigned long size; 241f7ca90b1SMatthew Wilcox struct inode *inode = mapping->host; 242f7ca90b1SMatthew Wilcox if (!page) 243f7ca90b1SMatthew Wilcox page = find_or_create_page(mapping, vmf->pgoff, 244f7ca90b1SMatthew Wilcox GFP_KERNEL | __GFP_ZERO); 245f7ca90b1SMatthew Wilcox if (!page) 246f7ca90b1SMatthew Wilcox return VM_FAULT_OOM; 247f7ca90b1SMatthew Wilcox /* Recheck i_size under page lock to avoid truncate race */ 248f7ca90b1SMatthew Wilcox size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 249f7ca90b1SMatthew Wilcox if (vmf->pgoff >= size) { 250f7ca90b1SMatthew Wilcox unlock_page(page); 251f7ca90b1SMatthew Wilcox page_cache_release(page); 252f7ca90b1SMatthew Wilcox return VM_FAULT_SIGBUS; 253f7ca90b1SMatthew Wilcox } 254f7ca90b1SMatthew Wilcox 255f7ca90b1SMatthew Wilcox vmf->page = page; 256f7ca90b1SMatthew Wilcox return VM_FAULT_LOCKED; 257f7ca90b1SMatthew Wilcox } 258f7ca90b1SMatthew Wilcox 259f7ca90b1SMatthew Wilcox static int copy_user_bh(struct page *to, struct buffer_head *bh, 260f7ca90b1SMatthew Wilcox unsigned blkbits, unsigned long vaddr) 261f7ca90b1SMatthew Wilcox { 262f7ca90b1SMatthew Wilcox void *vfrom, *vto; 263f7ca90b1SMatthew Wilcox if (dax_get_addr(bh, &vfrom, blkbits) < 0) 264f7ca90b1SMatthew Wilcox return -EIO; 265f7ca90b1SMatthew Wilcox vto = kmap_atomic(to); 266f7ca90b1SMatthew Wilcox copy_user_page(vto, vfrom, vaddr, to); 267f7ca90b1SMatthew Wilcox kunmap_atomic(vto); 268f7ca90b1SMatthew Wilcox return 0; 269f7ca90b1SMatthew Wilcox } 270f7ca90b1SMatthew Wilcox 271f7ca90b1SMatthew Wilcox static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, 272f7ca90b1SMatthew Wilcox struct vm_area_struct *vma, struct vm_fault *vmf) 273f7ca90b1SMatthew Wilcox { 274f7ca90b1SMatthew Wilcox struct address_space *mapping = inode->i_mapping; 275f7ca90b1SMatthew Wilcox sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); 276f7ca90b1SMatthew Wilcox unsigned long vaddr = (unsigned long)vmf->virtual_address; 277f7ca90b1SMatthew Wilcox void *addr; 278f7ca90b1SMatthew Wilcox unsigned long pfn; 279f7ca90b1SMatthew Wilcox pgoff_t size; 280f7ca90b1SMatthew Wilcox int error; 281f7ca90b1SMatthew Wilcox 282f7ca90b1SMatthew Wilcox i_mmap_lock_read(mapping); 283f7ca90b1SMatthew Wilcox 284f7ca90b1SMatthew Wilcox /* 285f7ca90b1SMatthew Wilcox * Check truncate didn't happen while we were allocating a block. 286f7ca90b1SMatthew Wilcox * If it did, this block may or may not be still allocated to the 287f7ca90b1SMatthew Wilcox * file. We can't tell the filesystem to free it because we can't 288f7ca90b1SMatthew Wilcox * take i_mutex here. In the worst case, the file still has blocks 289f7ca90b1SMatthew Wilcox * allocated past the end of the file. 290f7ca90b1SMatthew Wilcox */ 291f7ca90b1SMatthew Wilcox size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 292f7ca90b1SMatthew Wilcox if (unlikely(vmf->pgoff >= size)) { 293f7ca90b1SMatthew Wilcox error = -EIO; 294f7ca90b1SMatthew Wilcox goto out; 295f7ca90b1SMatthew Wilcox } 296f7ca90b1SMatthew Wilcox 297f7ca90b1SMatthew Wilcox error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size); 298f7ca90b1SMatthew Wilcox if (error < 0) 299f7ca90b1SMatthew Wilcox goto out; 300f7ca90b1SMatthew Wilcox if (error < PAGE_SIZE) { 301f7ca90b1SMatthew Wilcox error = -EIO; 302f7ca90b1SMatthew Wilcox goto out; 303f7ca90b1SMatthew Wilcox } 304f7ca90b1SMatthew Wilcox 305f7ca90b1SMatthew Wilcox if (buffer_unwritten(bh) || buffer_new(bh)) 306f7ca90b1SMatthew Wilcox clear_page(addr); 307f7ca90b1SMatthew Wilcox 308f7ca90b1SMatthew Wilcox error = vm_insert_mixed(vma, vaddr, pfn); 309f7ca90b1SMatthew Wilcox 310f7ca90b1SMatthew Wilcox out: 311f7ca90b1SMatthew Wilcox i_mmap_unlock_read(mapping); 312f7ca90b1SMatthew Wilcox 313f7ca90b1SMatthew Wilcox if (bh->b_end_io) 314f7ca90b1SMatthew Wilcox bh->b_end_io(bh, 1); 315f7ca90b1SMatthew Wilcox 316f7ca90b1SMatthew Wilcox return error; 317f7ca90b1SMatthew Wilcox } 318f7ca90b1SMatthew Wilcox 319f7ca90b1SMatthew Wilcox static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 320f7ca90b1SMatthew Wilcox get_block_t get_block) 321f7ca90b1SMatthew Wilcox { 322f7ca90b1SMatthew Wilcox struct file *file = vma->vm_file; 323f7ca90b1SMatthew Wilcox struct address_space *mapping = file->f_mapping; 324f7ca90b1SMatthew Wilcox struct inode *inode = mapping->host; 325f7ca90b1SMatthew Wilcox struct page *page; 326f7ca90b1SMatthew Wilcox struct buffer_head bh; 327f7ca90b1SMatthew Wilcox unsigned long vaddr = (unsigned long)vmf->virtual_address; 328f7ca90b1SMatthew Wilcox unsigned blkbits = inode->i_blkbits; 329f7ca90b1SMatthew Wilcox sector_t block; 330f7ca90b1SMatthew Wilcox pgoff_t size; 331f7ca90b1SMatthew Wilcox int error; 332f7ca90b1SMatthew Wilcox int major = 0; 333f7ca90b1SMatthew Wilcox 334f7ca90b1SMatthew Wilcox size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 335f7ca90b1SMatthew Wilcox if (vmf->pgoff >= size) 336f7ca90b1SMatthew Wilcox return VM_FAULT_SIGBUS; 337f7ca90b1SMatthew Wilcox 338f7ca90b1SMatthew Wilcox memset(&bh, 0, sizeof(bh)); 339f7ca90b1SMatthew Wilcox block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); 340f7ca90b1SMatthew Wilcox bh.b_size = PAGE_SIZE; 341f7ca90b1SMatthew Wilcox 342f7ca90b1SMatthew Wilcox repeat: 343f7ca90b1SMatthew Wilcox page = find_get_page(mapping, vmf->pgoff); 344f7ca90b1SMatthew Wilcox if (page) { 345f7ca90b1SMatthew Wilcox if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { 346f7ca90b1SMatthew Wilcox page_cache_release(page); 347f7ca90b1SMatthew Wilcox return VM_FAULT_RETRY; 348f7ca90b1SMatthew Wilcox } 349f7ca90b1SMatthew Wilcox if (unlikely(page->mapping != mapping)) { 350f7ca90b1SMatthew Wilcox unlock_page(page); 351f7ca90b1SMatthew Wilcox page_cache_release(page); 352f7ca90b1SMatthew Wilcox goto repeat; 353f7ca90b1SMatthew Wilcox } 354f7ca90b1SMatthew Wilcox size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 355f7ca90b1SMatthew Wilcox if (unlikely(vmf->pgoff >= size)) { 356f7ca90b1SMatthew Wilcox /* 357f7ca90b1SMatthew Wilcox * We have a struct page covering a hole in the file 358f7ca90b1SMatthew Wilcox * from a read fault and we've raced with a truncate 359f7ca90b1SMatthew Wilcox */ 360f7ca90b1SMatthew Wilcox error = -EIO; 361f7ca90b1SMatthew Wilcox goto unlock_page; 362f7ca90b1SMatthew Wilcox } 363f7ca90b1SMatthew Wilcox } 364f7ca90b1SMatthew Wilcox 365f7ca90b1SMatthew Wilcox error = get_block(inode, block, &bh, 0); 366f7ca90b1SMatthew Wilcox if (!error && (bh.b_size < PAGE_SIZE)) 367f7ca90b1SMatthew Wilcox error = -EIO; /* fs corruption? */ 368f7ca90b1SMatthew Wilcox if (error) 369f7ca90b1SMatthew Wilcox goto unlock_page; 370f7ca90b1SMatthew Wilcox 371f7ca90b1SMatthew Wilcox if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { 372f7ca90b1SMatthew Wilcox if (vmf->flags & FAULT_FLAG_WRITE) { 373f7ca90b1SMatthew Wilcox error = get_block(inode, block, &bh, 1); 374f7ca90b1SMatthew Wilcox count_vm_event(PGMAJFAULT); 375f7ca90b1SMatthew Wilcox mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 376f7ca90b1SMatthew Wilcox major = VM_FAULT_MAJOR; 377f7ca90b1SMatthew Wilcox if (!error && (bh.b_size < PAGE_SIZE)) 378f7ca90b1SMatthew Wilcox error = -EIO; 379f7ca90b1SMatthew Wilcox if (error) 380f7ca90b1SMatthew Wilcox goto unlock_page; 381f7ca90b1SMatthew Wilcox } else { 382f7ca90b1SMatthew Wilcox return dax_load_hole(mapping, page, vmf); 383f7ca90b1SMatthew Wilcox } 384f7ca90b1SMatthew Wilcox } 385f7ca90b1SMatthew Wilcox 386f7ca90b1SMatthew Wilcox if (vmf->cow_page) { 387f7ca90b1SMatthew Wilcox struct page *new_page = vmf->cow_page; 388f7ca90b1SMatthew Wilcox if (buffer_written(&bh)) 389f7ca90b1SMatthew Wilcox error = copy_user_bh(new_page, &bh, blkbits, vaddr); 390f7ca90b1SMatthew Wilcox else 391f7ca90b1SMatthew Wilcox clear_user_highpage(new_page, vaddr); 392f7ca90b1SMatthew Wilcox if (error) 393f7ca90b1SMatthew Wilcox goto unlock_page; 394f7ca90b1SMatthew Wilcox vmf->page = page; 395f7ca90b1SMatthew Wilcox if (!page) { 396f7ca90b1SMatthew Wilcox i_mmap_lock_read(mapping); 397f7ca90b1SMatthew Wilcox /* Check we didn't race with truncate */ 398f7ca90b1SMatthew Wilcox size = (i_size_read(inode) + PAGE_SIZE - 1) >> 399f7ca90b1SMatthew Wilcox PAGE_SHIFT; 400f7ca90b1SMatthew Wilcox if (vmf->pgoff >= size) { 401f7ca90b1SMatthew Wilcox i_mmap_unlock_read(mapping); 402f7ca90b1SMatthew Wilcox error = -EIO; 403f7ca90b1SMatthew Wilcox goto out; 404f7ca90b1SMatthew Wilcox } 405f7ca90b1SMatthew Wilcox } 406f7ca90b1SMatthew Wilcox return VM_FAULT_LOCKED; 407f7ca90b1SMatthew Wilcox } 408f7ca90b1SMatthew Wilcox 409f7ca90b1SMatthew Wilcox /* Check we didn't race with a read fault installing a new page */ 410f7ca90b1SMatthew Wilcox if (!page && major) 411f7ca90b1SMatthew Wilcox page = find_lock_page(mapping, vmf->pgoff); 412f7ca90b1SMatthew Wilcox 413f7ca90b1SMatthew Wilcox if (page) { 414f7ca90b1SMatthew Wilcox unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, 415f7ca90b1SMatthew Wilcox PAGE_CACHE_SIZE, 0); 416f7ca90b1SMatthew Wilcox delete_from_page_cache(page); 417f7ca90b1SMatthew Wilcox unlock_page(page); 418f7ca90b1SMatthew Wilcox page_cache_release(page); 419f7ca90b1SMatthew Wilcox } 420f7ca90b1SMatthew Wilcox 421f7ca90b1SMatthew Wilcox error = dax_insert_mapping(inode, &bh, vma, vmf); 422f7ca90b1SMatthew Wilcox 423f7ca90b1SMatthew Wilcox out: 424f7ca90b1SMatthew Wilcox if (error == -ENOMEM) 425f7ca90b1SMatthew Wilcox return VM_FAULT_OOM | major; 426f7ca90b1SMatthew Wilcox /* -EBUSY is fine, somebody else faulted on the same PTE */ 427f7ca90b1SMatthew Wilcox if ((error < 0) && (error != -EBUSY)) 428f7ca90b1SMatthew Wilcox return VM_FAULT_SIGBUS | major; 429f7ca90b1SMatthew Wilcox return VM_FAULT_NOPAGE | major; 430f7ca90b1SMatthew Wilcox 431f7ca90b1SMatthew Wilcox unlock_page: 432f7ca90b1SMatthew Wilcox if (page) { 433f7ca90b1SMatthew Wilcox unlock_page(page); 434f7ca90b1SMatthew Wilcox page_cache_release(page); 435f7ca90b1SMatthew Wilcox } 436f7ca90b1SMatthew Wilcox goto out; 437f7ca90b1SMatthew Wilcox } 438f7ca90b1SMatthew Wilcox 439f7ca90b1SMatthew Wilcox /** 440f7ca90b1SMatthew Wilcox * dax_fault - handle a page fault on a DAX file 441f7ca90b1SMatthew Wilcox * @vma: The virtual memory area where the fault occurred 442f7ca90b1SMatthew Wilcox * @vmf: The description of the fault 443f7ca90b1SMatthew Wilcox * @get_block: The filesystem method used to translate file offsets to blocks 444f7ca90b1SMatthew Wilcox * 445f7ca90b1SMatthew Wilcox * When a page fault occurs, filesystems may call this helper in their 446f7ca90b1SMatthew Wilcox * fault handler for DAX files. 447f7ca90b1SMatthew Wilcox */ 448f7ca90b1SMatthew Wilcox int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 449f7ca90b1SMatthew Wilcox get_block_t get_block) 450f7ca90b1SMatthew Wilcox { 451f7ca90b1SMatthew Wilcox int result; 452f7ca90b1SMatthew Wilcox struct super_block *sb = file_inode(vma->vm_file)->i_sb; 453f7ca90b1SMatthew Wilcox 454f7ca90b1SMatthew Wilcox if (vmf->flags & FAULT_FLAG_WRITE) { 455f7ca90b1SMatthew Wilcox sb_start_pagefault(sb); 456f7ca90b1SMatthew Wilcox file_update_time(vma->vm_file); 457f7ca90b1SMatthew Wilcox } 458f7ca90b1SMatthew Wilcox result = do_dax_fault(vma, vmf, get_block); 459f7ca90b1SMatthew Wilcox if (vmf->flags & FAULT_FLAG_WRITE) 460f7ca90b1SMatthew Wilcox sb_end_pagefault(sb); 461f7ca90b1SMatthew Wilcox 462f7ca90b1SMatthew Wilcox return result; 463f7ca90b1SMatthew Wilcox } 464f7ca90b1SMatthew Wilcox EXPORT_SYMBOL_GPL(dax_fault); 465