/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>

#define RADIX_DAX_MASK	0xf
#define RADIX_DAX_SHIFT	4
#define RADIX_DAX_PTE  (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
#define RADIX_DAX_PMD  (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
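
/*
 * For illustration, assuming RADIX_TREE_EXCEPTIONAL_ENTRY is 0x2 (bit 1) as
 * in this kernel's radix tree code, a PTE-sized entry for sector 0x1234 is
 * encoded as:
 *
 *	RADIX_DAX_ENTRY(0x1234, false)
 *		== (void *)((0x1234UL << RADIX_DAX_SHIFT) | RADIX_DAX_PTE)
 *		== (void *)0x12346
 *
 * The low four bits carry the entry type plus the exceptional-entry bit (so
 * the value is never mistaken for a struct page pointer), and
 * RADIX_DAX_SECTOR() shifts them back out to recover the sector.
 */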

static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	struct request_queue *q = bdev->bd_queue;
	long rc = -EIO;

	dax->addr = (void __pmem *) ERR_PTR(-EIO);
	if (blk_queue_enter(q, true) != 0)
		return rc;

	rc = bdev_direct_access(bdev, dax);
	if (rc < 0) {
		dax->addr = (void __pmem *) ERR_PTR(rc);
		blk_queue_exit(q);
		return rc;
	}
	return rc;
}

static void dax_unmap_atomic(struct block_device *bdev,
		const struct blk_dax_ctl *dax)
{
	if (IS_ERR(dax->addr))
		return;
	blk_queue_exit(bdev->bd_queue);
}

struct page *read_dax_sector(struct block_device *bdev, sector_t n)
{
	struct page *page = alloc_pages(GFP_KERNEL, 0);
	struct blk_dax_ctl dax = {
		.size = PAGE_SIZE,
		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
	};
	long rc;

	if (!page)
		return ERR_PTR(-ENOMEM);

	rc = dax_map_atomic(bdev, &dax);
	if (rc < 0)
		return ERR_PTR(rc);
	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
	dax_unmap_atomic(bdev, &dax);
	return page;
}

static bool buffer_written(struct buffer_head *bh)
{
	return buffer_mapped(bh) && !buffer_unwritten(bh);
}
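
/*
 * A sketch of the dax_map_atomic()/dax_unmap_atomic() contract as the rest
 * of this file uses it (bdev, sector and buf below are placeholders, not
 * code from any caller above): the caller fills in dax.sector and dax.size,
 * dax_map_atomic() takes a request_queue reference and asks the driver for
 * the kernel address and pfn backing that range, and dax_unmap_atomic()
 * drops the reference again.  Calling dax_unmap_atomic() after a failed map
 * is safe because it checks IS_ERR(dax->addr), and dax_map_atomic() has
 * already released the queue on its error paths.
 *
 *	struct blk_dax_ctl dax = { .sector = sector, .size = PAGE_SIZE };
 *	long avail = dax_map_atomic(bdev, &dax);
 *
 *	if (avail < 0)
 *		return avail;
 *	memcpy_from_pmem(buf, dax.addr, min_t(long, avail, PAGE_SIZE));
 *	dax_unmap_atomic(bdev, &dax);
 */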

/*
 * When ext4 encounters a hole, it returns without modifying the buffer_head
 * which means that we can't trust b_size.  To cope with this, we set b_state
 * to 0 before calling get_block and, if any bit is set, we know we can trust
 * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
 * and would save us time calling get_block repeatedly.
 */
static bool buffer_size_valid(struct buffer_head *bh)
{
	return bh->b_state != 0;
}


static sector_t to_sector(const struct buffer_head *bh,
		const struct inode *inode)
{
	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);

	return sector;
}

static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false, need_wmb = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = (void __pmem *) ERR_PTR(-EIO),
	};
	unsigned blkbits = inode->i_blkbits;
	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
								>> blkbits;

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
				/*
				 * We allow uninitialized buffers for writes
				 * beyond EOF as those cannot race with faults
				 */
				WARN_ON_ONCE(
					(buffer_new(bh) && block < file_blks) ||
					(rw == WRITE && buffer_unwritten(bh)));
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				dax.addr += first;
				size = map_len - first;
			}
			max = min(pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
			need_wmb = true;
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr, max - pos,
					iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	if (need_wmb)
		wmb_pmem();
	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}
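
/*
 * Sketch of a typical dax_do_io() caller, loosely modelled on how a simple
 * filesystem wires it into its ->direct_IO method.  The example_* names are
 * illustrative only and not taken from any filesystem in this tree:
 *
 *	static ssize_t example_direct_IO(struct kiocb *iocb,
 *			struct iov_iter *iter, loff_t offset)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *
 *		if (IS_DAX(inode))
 *			return dax_do_io(iocb, inode, iter, offset,
 *					example_get_block, NULL, DIO_LOCKING);
 *		return blockdev_direct_IO(iocb, inode, iter, offset,
 *					example_get_block);
 *	}
 */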

/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @pos: The file offset where the I/O starts
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
		  struct iov_iter *iter, loff_t pos, get_block_t get_block,
		  dio_iodone_t end_io, int flags)
{
	struct buffer_head bh;
	ssize_t retval = -EINVAL;
	loff_t end = pos + iov_iter_count(iter);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_lock(inode);

	/* Protects against truncate */
	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = dax_io(inode, iter, pos, end, get_block, &bh);

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_unlock(inode);

	if (end_io) {
		int err;

		err = end_io(iocb, pos, retval, bh.b_private);
		if (err)
			retval = err;
	}

	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(inode);
	return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, struct page *page,
							struct vm_fault *vmf)
{
	if (!page)
		page = find_or_create_page(mapping, vmf->pgoff,
						GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return VM_FAULT_OOM;

	vmf->page = page;
	return VM_FAULT_LOCKED;
}

static int copy_user_bh(struct page *to, struct inode *inode,
		struct buffer_head *bh, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	struct block_device *bdev = bh->b_bdev;
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

#define NO_SECTOR -1
#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))

static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
		sector_t sector, bool pmd_entry, bool dirty)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	pgoff_t pmd_index = DAX_PMD_INDEX(index);
	int type, error = 0;
	void *entry;

	WARN_ON_ONCE(pmd_entry && !dirty);
	if (dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	spin_lock_irq(&mapping->tree_lock);

	entry = radix_tree_lookup(page_tree, pmd_index);
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
		index = pmd_index;
		goto dirty;
	}

	entry = radix_tree_lookup(page_tree, index);
	if (entry) {
		type = RADIX_DAX_TYPE(entry);
		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
					type != RADIX_DAX_PMD)) {
			error = -EIO;
			goto unlock;
		}

		if (!pmd_entry || type == RADIX_DAX_PMD)
			goto dirty;

		/*
		 * We only insert dirty PMD entries into the radix tree.  This
		 * means we don't need to worry about removing a dirty PTE
		 * entry and inserting a clean PMD entry, thus reducing the
		 * range we would flush with a follow-up fsync/msync call.
		 */
		radix_tree_delete(&mapping->page_tree, index);
		mapping->nrexceptional--;
	}

	if (sector == NO_SECTOR) {
		/*
		 * This can happen during correct operation if our pfn_mkwrite
		 * fault raced against a hole punch operation.  If this
		 * happens the pte that was hole punched will have been
		 * unmapped and the radix tree entry will have been removed by
		 * the time we are called, but the call will still happen.  We
		 * will return all the way up to wp_pfn_shared(), where the
		 * pte_same() check will fail, eventually causing page fault
		 * to be retried by the CPU.
		 */
		goto unlock;
	}

	error = radix_tree_insert(page_tree, index,
			RADIX_DAX_ENTRY(sector, pmd_entry));
	if (error)
		goto unlock;

	mapping->nrexceptional++;
 dirty:
	if (dirty)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return error;
}
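
/*
 * To put dax_radix_entry() in context, here is the lifecycle the rest of
 * this file drives (a descriptive outline, not additional code a caller
 * must write): a read fault inserts a clean RADIX_DAX_PTE entry for the
 * faulting file offset via dax_insert_mapping(); the first write through
 * that mapping reaches dax_pfn_mkwrite(), which calls back in here with
 * NO_SECTOR purely to set PAGECACHE_TAG_DIRTY on the existing entry.  A
 * write PMD fault instead inserts a single dirty RADIX_DAX_PMD entry at
 * the index produced by DAX_PMD_INDEX() (with 4k pages and a 2MB PMD that
 * is index & ~511), replacing any PTE entry found there, so a later
 * fsync/msync can flush the whole huge page from one radix tree slot.
 */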

static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int type = RADIX_DAX_TYPE(entry);
	struct radix_tree_node *node;
	struct blk_dax_ctl dax;
	void **slot;
	int ret = 0;

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * Regular page slots are stabilized by the page lock even
	 * without the tree itself locked.  These unlocked entries
	 * need verification under the tree lock.
	 */
	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
		goto unlock;
	if (*slot != entry)
		goto unlock;

	/* another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto unlock;

	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
		ret = -EIO;
		goto unlock;
	}

	dax.sector = RADIX_DAX_SECTOR(entry);
	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0)
		return ret;

	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	wb_cache_pmem(dax.addr, dax.size);

	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);
 unmap:
	dax_unmap_atomic(bdev, &dax);
	return ret;

 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct block_device *bdev, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	pgoff_t start_index, end_index, pmd_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;
	void *entry;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	start_index = wbc->range_start >> PAGE_SHIFT;
	end_index = wbc->range_end >> PAGE_SHIFT;
	pmd_index = DAX_PMD_INDEX(start_index);

	rcu_read_lock();
	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
	rcu_read_unlock();

	/* see if the start of our range is covered by a PMD entry */
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
		start_index = pmd_index;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	wmb_pmem();
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
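
/*
 * Sketch of how a filesystem is expected to reach
 * dax_writeback_mapping_range() from its writeback/fsync path.  The
 * example_* name is illustrative, and the bdev argument matters: the caller
 * must pass the block device that actually backs the file's data, which may
 * differ from the superblock default on multi-device filesystems.
 *
 *	static int example_writepages(struct address_space *mapping,
 *			struct writeback_control *wbc)
 *	{
 *		return dax_writeback_mapping_range(mapping,
 *				mapping->host->i_sb->s_bdev, wbc);
 *	}
 */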

static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
			struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	struct address_space *mapping = inode->i_mapping;
	struct block_device *bdev = bh->b_bdev;
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	int error;

	i_mmap_lock_read(mapping);

	if (dax_map_atomic(bdev, &dax) < 0) {
		error = PTR_ERR(dax.addr);
		goto out;
	}
	dax_unmap_atomic(bdev, &dax);

	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
			vmf->flags & FAULT_FLAG_WRITE);
	if (error)
		goto out;

	error = vm_insert_mixed(vma, vaddr, dax.pfn);

 out:
	i_mmap_unlock_read(mapping);

	return error;
}

/**
 * __dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files. __dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			get_block_t get_block)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;

 repeat:
	page = find_get_page(mapping, vmf->pgoff);
	if (page) {
		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
			put_page(page);
			return VM_FAULT_RETRY;
		}
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto repeat;
		}
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock_page;

	if (!buffer_mapped(&bh) && !vmf->cow_page) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock_page;
		} else {
			return dax_load_hole(mapping, page, vmf);
		}
	}

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;
		if (buffer_written(&bh))
			error = copy_user_bh(new_page, inode, &bh, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock_page;
		vmf->page = page;
		if (!page)
			i_mmap_lock_read(mapping);
		return VM_FAULT_LOCKED;
	}

	/* Check we didn't race with a read fault installing a new page */
	if (!page && major)
		page = find_lock_page(mapping, vmf->pgoff);

	if (page) {
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
							PAGE_SIZE, 0);
		delete_from_page_cache(page);
		unlock_page(page);
		put_page(page);
		page = NULL;
	}

	/* Filesystem should not return unwritten buffers to us! */
	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
	error = dax_insert_mapping(inode, &bh, vma, vmf);

 out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;

 unlock_page:
	if (page) {
		unlock_page(page);
		put_page(page);
	}
	goto out;
}
EXPORT_SYMBOL(__dax_fault);

/**
 * dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files.
 */
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
	      get_block_t get_block)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (vmf->flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_fault(vma, vmf, get_block);
	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
EXPORT_SYMBOL_GPL(dax_fault);
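
/*
 * How the fault entry points in this file are typically consumed, as an
 * illustrative sketch only (the example_* handlers are hypothetical; a real
 * filesystem wraps dax_fault(), dax_pmd_fault() and dax_pfn_mkwrite() with
 * its own checks before delegating here):
 *
 *	static const struct vm_operations_struct example_dax_vm_ops = {
 *		.fault		= example_dax_fault,
 *		.pmd_fault	= example_dax_pmd_fault,
 *		.page_mkwrite	= example_dax_fault,
 *		.pfn_mkwrite	= example_dax_pfn_mkwrite,
 *	};
 *
 * The filesystem's ->mmap then installs these ops and sets
 * vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE so that vm_insert_mixed() and
 * the PMD path below are permitted on the VMA.
 */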

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below function.
 */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)

static void __dax_dbg(struct buffer_head *bh, unsigned long address,
		const char *reason, const char *fn)
{
	if (bh) {
		char bname[BDEVNAME_SIZE];
		bdevname(bh->b_bdev, bname);
		pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
			"length %zd fallback: %s\n", fn, current->comm,
			address, bname, bh->b_state, (u64)bh->b_blocknr,
			bh->b_size, reason);
	} else {
		pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
			current->comm, address, reason);
	}
}

#define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")

int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, get_block_t get_block)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct buffer_head bh;
	unsigned blkbits = inode->i_blkbits;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	struct block_device *bdev;
	pgoff_t size, pgoff;
	sector_t block;
	int error, result = 0;
	bool alloc = false;

	/* dax pmd mappings require pfn_t_devmap() */
	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
		return VM_FAULT_FALLBACK;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		split_huge_pmd(vma, pmd, address);
		dax_pmd_dbg(NULL, address, "cow write");
		return VM_FAULT_FALLBACK;
	}
	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start) {
		dax_pmd_dbg(NULL, address, "vma start unaligned");
		return VM_FAULT_FALLBACK;
	}
	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
		dax_pmd_dbg(NULL, address, "vma end unaligned");
		return VM_FAULT_FALLBACK;
	}

	pgoff = linear_page_index(vma, pmd_addr);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size)
		return VM_FAULT_SIGBUS;
	/* If the PMD would cover blocks out of the file */
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(NULL, address,
				"offset + huge page size > file size");
		return VM_FAULT_FALLBACK;
	}

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

	bh.b_size = PMD_SIZE;

	if (get_block(inode, block, &bh, 0) != 0)
		return VM_FAULT_SIGBUS;

	if (!buffer_mapped(&bh) && write) {
		if (get_block(inode, block, &bh, 1) != 0)
			return VM_FAULT_SIGBUS;
		alloc = true;
		WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
	}

	bdev = bh.b_bdev;

	/*
	 * If the filesystem isn't willing to tell us the length of a hole,
	 * just fall back to PTEs.  Calling get_block 512 times in a loop
	 * would be silly.
	 */
	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
		dax_pmd_dbg(&bh, address, "allocated block too small");
		return VM_FAULT_FALLBACK;
	}

	/*
	 * If we allocated new storage, make sure no process has any
	 * zero pages covering this hole
	 */
	if (alloc) {
		loff_t lstart = pgoff << PAGE_SHIFT;
		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */

		truncate_pagecache_range(inode, lstart, lend);
	}

	i_mmap_lock_read(mapping);

	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
		spinlock_t *ptl;
		pmd_t entry;
		struct page *zero_page = get_huge_zero_page();

		if (unlikely(!zero_page)) {
			dax_pmd_dbg(&bh, address, "no zero page");
			goto fallback;
		}

		ptl = pmd_lock(vma->vm_mm, pmd);
		if (!pmd_none(*pmd)) {
			spin_unlock(ptl);
			dax_pmd_dbg(&bh, address, "pmd already present");
			goto fallback;
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
				__func__, current->comm, address,
				(unsigned long long) to_sector(&bh, inode));

		entry = mk_pmd(zero_page, vma->vm_page_prot);
		entry = pmd_mkhuge(entry);
		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
		result = VM_FAULT_NOPAGE;
		spin_unlock(ptl);
	} else {
		struct blk_dax_ctl dax = {
			.sector = to_sector(&bh, inode),
			.size = PMD_SIZE,
		};
		long length = dax_map_atomic(bdev, &dax);

		if (length < 0) {
			dax_pmd_dbg(&bh, address, "dax-error fallback");
			goto fallback;
		}
		if (length < PMD_SIZE) {
			dax_pmd_dbg(&bh, address, "dax-length too small");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}
		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
			dax_pmd_dbg(&bh, address, "pfn unaligned");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}

		if (!pfn_t_devmap(dax.pfn)) {
			dax_unmap_atomic(bdev, &dax);
			dax_pmd_dbg(&bh, address, "pfn not in memmap");
			goto fallback;
		}
		dax_unmap_atomic(bdev, &dax);

		/*
		 * For PTE faults we insert a radix tree entry for reads, and
		 * leave it clean.  Then on the first write we dirty the radix
		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
		 * call into get_block() to translate the pgoff to a sector in
		 * order to be able to create a new radix tree entry.
		 *
		 * The PMD path doesn't have an equivalent to
		 * dax_pfn_mkwrite(), though, so for a read followed by a
		 * write we traverse all the way through __dax_pmd_fault()
		 * twice.  This means we can just skip inserting a radix tree
		 * entry completely on the initial read and just wait until
		 * the write to insert a dirty entry.
		 */
		if (write) {
			error = dax_radix_entry(mapping, pgoff, dax.sector,
					true, true);
			if (error) {
				dax_pmd_dbg(&bh, address,
						"PMD radix insertion failed");
				goto fallback;
			}
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
				__func__, current->comm, address,
				pfn_t_to_pfn(dax.pfn),
				(unsigned long long) dax.sector);
		result |= vmf_insert_pfn_pmd(vma, address, pmd,
				dax.pfn, write);
	}

 out:
	i_mmap_unlock_read(mapping);

	return result;

 fallback:
	count_vm_event(THP_FAULT_FALLBACK);
	result = VM_FAULT_FALLBACK;
	goto out;
}
EXPORT_SYMBOL_GPL(__dax_pmd_fault);
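
/*
 * To make the fallback checks above concrete: with 4k pages and a 2MB PMD
 * (the common x86_64 configuration), PG_PMD_COLOUR is 511.  A huge mapping
 * is only attempted when the 2MB-aligned range around the faulting address
 * lies entirely inside the VMA, when (pgoff | PG_PMD_COLOUR) is still below
 * the file size in pages, when get_block() returns a mapped extent of at
 * least PMD_SIZE, and when the pfn from dax_map_atomic() is itself
 * 2MB-aligned, i.e. (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) == 0.  Any miss
 * simply falls back to 4k PTE faults.
 */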

/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @address: The virtual address that faulted
 * @pmd: Pointer to the PMD entry in the page tables that covers @address
 * @flags: The fault flags (e.g. FAULT_FLAG_WRITE)
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
			pmd_t *pmd, unsigned int flags, get_block_t get_block)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_pmd_fault(vma, address, pmd, flags, get_block);
	if (flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
EXPORT_SYMBOL_GPL(dax_pmd_fault);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
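
/*
 * The sequence that leads into dax_pfn_mkwrite() below, in outline: a read
 * fault maps the pfn through __dax_fault() and leaves a clean radix tree
 * entry, and because the VMA has a pfn_mkwrite handler the PTE stays
 * write-protected.  The first store through the mapping therefore takes a
 * write-protection fault, the VM calls ->pfn_mkwrite, and the filesystem's
 * wrapper (which typically brackets the call with sb_start_pagefault(),
 * file_update_time() and sb_end_pagefault()) ends up here so the entry can
 * be tagged dirty for a later fsync/msync.
 */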

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct file *file = vma->vm_file;
	int error;

	/*
	 * We pass NO_SECTOR to dax_radix_entry() because we expect that a
	 * RADIX_DAX_PTE entry already exists in the radix tree from a
	 * previous call to __dax_fault().  We just want to look up that PTE
	 * entry using vmf->pgoff and make sure the dirty tag is set.  This
	 * saves us from having to make a call to get_block() here to look
	 * up the sector.
	 */
	error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
			true);

	if (error == -ENOMEM)
		return VM_FAULT_OOM;
	if (error)
		return VM_FAULT_SIGBUS;
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);

int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
		unsigned int offset, unsigned int length)
{
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = PAGE_SIZE,
	};

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	clear_pmem(dax.addr + offset, length);
	wmb_pmem();
	dax_unmap_atomic(bdev, &dax);
	return 0;
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);
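
/*
 * A worked example for the two zeroing helpers below, assuming 4k pages
 * (the numbers are made up): truncating a DAX file down to 5000 bytes
 * leaves stale data in the tail of the final page, so
 * dax_truncate_page(inode, 5000, get_block) computes
 * length = PAGE_ALIGN(5000) - 5000 = 3192 and calls
 * dax_zero_page_range(inode, 5000, 3192, get_block), which maps the block
 * at page index 1 and zeroes bytes 904..4095 of that page directly in
 * persistent memory via __dax_zero_page_range().
 */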

/**
 * dax_zero_page_range - zero a range within a page of a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @length: The number of bytes to zero
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * This function can be called by a filesystem when it is zeroing part of a
 * page in a DAX file.  This is intended for hole-punch operations.  If
 * you are truncating a file, the helper function dax_truncate_page() may be
 * more convenient.
 *
 * We work in terms of PAGE_SIZE here for commonality with
 * block_truncate_page(), but we could go down to the filesystem block size
 * if the filesystem took care of disposing of the unnecessary blocks.  Even
 * if the filesystem block size is smaller than PAGE_SIZE, we have to zero
 * the rest of the page since the file might be mmapped.
 */
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
							get_block_t get_block)
{
	struct buffer_head bh;
	pgoff_t index = from >> PAGE_SHIFT;
	unsigned offset = from & (PAGE_SIZE-1);
	int err;

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;
	BUG_ON((offset + length) > PAGE_SIZE);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;
	err = get_block(inode, index, &bh, 0);
	if (err < 0 || !buffer_written(&bh))
		return err;

	return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
			offset, length);
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

/**
 * dax_truncate_page - handle a partial page being truncated in a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * Similar to block_truncate_page(), this function can be called by a
 * filesystem when it is truncating a DAX file to handle the partial page.
 *
 * We work in terms of PAGE_SIZE here for commonality with
 * block_truncate_page(), but we could go down to the filesystem block size
 * if the filesystem took care of disposing of the unnecessary blocks.  Even
 * if the filesystem block size is smaller than PAGE_SIZE, we have to zero
 * the rest of the page since the file might be mmapped.
 */
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
	unsigned length = PAGE_ALIGN(from) - from;
	return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);