/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>

/*
 * We use the lowest available bit in an exceptional entry for locking and the
 * next two bits to determine the entry type.  In total, three special bits.
 */
#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
#define RADIX_DAX_PTE	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
#define RADIX_DAX_PMD	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
#define RADIX_DAX_TYPE_MASK	(RADIX_DAX_PTE | RADIX_DAX_PMD)
#define RADIX_DAX_TYPE(entry)	((unsigned long)entry & RADIX_DAX_TYPE_MASK)
#define RADIX_DAX_SECTOR(entry)	(((unsigned long)entry >> RADIX_DAX_SHIFT))
#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
		RADIX_TREE_EXCEPTIONAL_ENTRY))
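/*
 * Illustrative sketch only (not used by this file): how a PTE-sized entry
 * for sector 2048 round-trips through the helpers above.
 *
 *	void *entry = RADIX_DAX_ENTRY(2048, false);
 *
 *	BUG_ON(RADIX_DAX_TYPE(entry) != RADIX_DAX_PTE);
 *	BUG_ON(RADIX_DAX_SECTOR(entry) != 2048);
 *	BUG_ON(!radix_tree_exceptional_entry(entry));
 */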
/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
					      pgoff_t index)
{
	unsigned long hash = hash_long((unsigned long)mapping ^ index,
				       DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}

static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	struct request_queue *q = bdev->bd_queue;
	long rc = -EIO;

	dax->addr = (void __pmem *) ERR_PTR(-EIO);
	if (blk_queue_enter(q, true) != 0)
		return rc;

	rc = bdev_direct_access(bdev, dax);
	if (rc < 0) {
		dax->addr = (void __pmem *) ERR_PTR(rc);
		blk_queue_exit(q);
		return rc;
	}
	return rc;
}

static void dax_unmap_atomic(struct block_device *bdev,
		const struct blk_dax_ctl *dax)
{
	if (IS_ERR(dax->addr))
		return;
	blk_queue_exit(bdev->bd_queue);
}

struct page *read_dax_sector(struct block_device *bdev, sector_t n)
{
	struct page *page = alloc_pages(GFP_KERNEL, 0);
	struct blk_dax_ctl dax = {
		.size = PAGE_SIZE,
		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
	};
	long rc;

	if (!page)
		return ERR_PTR(-ENOMEM);

	rc = dax_map_atomic(bdev, &dax);
	if (rc < 0)
		return ERR_PTR(rc);
	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
	dax_unmap_atomic(bdev, &dax);
	return page;
}

static bool buffer_written(struct buffer_head *bh)
{
	return buffer_mapped(bh) && !buffer_unwritten(bh);
}

/*
 * When ext4 encounters a hole, it returns without modifying the buffer_head
 * which means that we can't trust b_size.  To cope with this, we set b_state
 * to 0 before calling get_block and, if any bit is set, we know we can trust
 * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
 * and would save us time calling get_block repeatedly.
 */
static bool buffer_size_valid(struct buffer_head *bh)
{
	return bh->b_state != 0;
}


static sector_t to_sector(const struct buffer_head *bh,
		const struct inode *inode)
{
	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);

	return sector;
}

static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false, need_wmb = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = (void __pmem *) ERR_PTR(-EIO),
	};
	unsigned blkbits = inode->i_blkbits;
	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
								>> blkbits;

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
				/*
				 * We allow uninitialized buffers for writes
				 * beyond EOF as those cannot race with faults
				 */
				WARN_ON_ONCE(
					(buffer_new(bh) && block < file_blks) ||
					(rw == WRITE && buffer_unwritten(bh)));
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				dax.addr += first;
				size = map_len - first;
			}
			max = min(pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
			need_wmb = true;
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr, max - pos,
					iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	if (need_wmb)
		wmb_pmem();
	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}

/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
		  struct iov_iter *iter, get_block_t get_block,
		  dio_iodone_t end_io, int flags)
{
	struct buffer_head bh;
	ssize_t retval = -EINVAL;
	loff_t pos = iocb->ki_pos;
	loff_t end = pos + iov_iter_count(iter);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_lock(inode);

	/* Protects against truncate */
	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = dax_io(inode, iter, pos, end, get_block, &bh);

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_unlock(inode);

	if (end_io) {
		int err;

		err = end_io(iocb, pos, retval, bh.b_private);
		if (err)
			retval = err;
	}

	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(inode);
	return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);
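/*
 * Sketch of a typical caller (assumed xxx_* names, not defined here): a
 * filesystem's ->direct_IO method usually hands the whole request to
 * dax_do_io() with its own get_block callback, e.g.:
 *
 *	static ssize_t xxx_dax_direct_IO(struct kiocb *iocb,
 *					 struct iov_iter *iter)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *
 *		return dax_do_io(iocb, inode, iter, xxx_get_block,
 *				 NULL, DIO_LOCKING);
 *	}
 *
 * xxx_get_block is the filesystem's get_block_t; passing DIO_LOCKING asks
 * dax_do_io() to take i_mutex itself for reads, as described above.
 */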
/*
 * DAX radix tree locking
 */
struct exceptional_entry_key {
	struct address_space *mapping;
	unsigned long index;
};

struct wait_exceptional_entry_queue {
	wait_queue_t wait;
	struct exceptional_entry_key key;
};

static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
				       int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->mapping != ewait->key.mapping ||
	    key->index != ewait->key.index)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * Check whether the given slot is locked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline int slot_locked(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
	return entry & RADIX_DAX_ENTRY_LOCK;
}

/*
 * Mark the given slot as locked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry |= RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(slot, (void *)entry);
	return (void *)entry;
}

/*
 * Mark the given slot as unlocked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(slot, (void *)entry);
	return (void *)entry;
}

/*
 * Look up an entry in the radix tree and, if it is an exceptional entry, wait
 * for it to become unlocked before returning it.  The caller must call
 * put_unlocked_mapping_entry() if it decides not to lock the entry, or
 * put_locked_mapping_entry() once it has locked the entry and later wants to
 * unlock it.
 *
 * The function must be called with mapping->tree_lock held.
 */
static void *get_unlocked_mapping_entry(struct address_space *mapping,
					pgoff_t index, void ***slotp)
{
	void *ret, **slot;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;
	ewait.key.mapping = mapping;
	ewait.key.index = index;

	for (;;) {
		ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
					  &slot);
		if (!ret || !radix_tree_exceptional_entry(ret) ||
		    !slot_locked(mapping, slot)) {
			if (slotp)
				*slotp = slot;
			return ret;
		}
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&mapping->tree_lock);
		schedule();
		finish_wait(wq, &ewait.wait);
		spin_lock_irq(&mapping->tree_lock);
	}
}

/*
 * Find the radix tree entry at the given index.  If it points to a page,
 * return with the page locked.  If it points to an exceptional entry, return
 * with the radix tree entry locked.  If the radix tree doesn't contain the
 * given index, create an empty exceptional entry for the index and return
 * with it locked.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags.  For
 * persistent memory the benefit is doubtful.  We can add that later if we can
 * show it helps.
 */
static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *ret, **slot;

restart:
	spin_lock_irq(&mapping->tree_lock);
	ret = get_unlocked_mapping_entry(mapping, index, &slot);
	/* No entry for given index?  Make sure the radix tree is big enough. */
	if (!ret) {
		int err;

		spin_unlock_irq(&mapping->tree_lock);
		err = radix_tree_preload(
				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
		if (err)
			return ERR_PTR(err);
		ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
			       RADIX_DAX_ENTRY_LOCK);
		spin_lock_irq(&mapping->tree_lock);
		err = radix_tree_insert(&mapping->page_tree, index, ret);
		radix_tree_preload_end();
		if (err) {
			spin_unlock_irq(&mapping->tree_lock);
			/* Someone already created the entry? */
			if (err == -EEXIST)
				goto restart;
			return ERR_PTR(err);
		}
		/* Good, we have inserted empty locked entry into the tree. */
		mapping->nrexceptional++;
		spin_unlock_irq(&mapping->tree_lock);
		return ret;
	}
	/* Normal page in radix tree? */
	if (!radix_tree_exceptional_entry(ret)) {
		struct page *page = ret;

		get_page(page);
		spin_unlock_irq(&mapping->tree_lock);
		lock_page(page);
		/* Page got truncated?  Retry... */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto restart;
		}
		return page;
	}
	ret = lock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

void dax_wake_mapping_entry_waiter(struct address_space *mapping,
				   pgoff_t index, bool wake_all)
{
	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under mapping->tree_lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq)) {
		struct exceptional_entry_key key;

		key.mapping = mapping;
		key.index = index;
		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
	}
}

void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *ret, **slot;

	spin_lock_irq(&mapping->tree_lock);
	ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
	if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
			 !slot_locked(mapping, slot))) {
		spin_unlock_irq(&mapping->tree_lock);
		return;
	}
	unlock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, false);
}

static void put_locked_mapping_entry(struct address_space *mapping,
				     pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry)) {
		unlock_page(entry);
		put_page(entry);
	} else {
		dax_unlock_mapping_entry(mapping, index);
	}
}

/*
 * Called when we are done with the radix tree entry we looked up via
 * get_unlocked_mapping_entry() and which we didn't lock in the end.
 */
static void put_unlocked_mapping_entry(struct address_space *mapping,
				       pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry))
		return;

	/* We have to wake up next waiter for the radix tree entry lock */
	dax_wake_mapping_entry_waiter(mapping, index, false);
}

/*
 * Delete the exceptional DAX entry at @index from @mapping.  Wait for the
 * radix tree entry to get unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *entry;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	/*
	 * This gets called from the truncate / punch_hole path.  As such, the
	 * caller must hold locks protecting against concurrent modifications
	 * of the radix tree (usually fs-private i_mmap_sem for writing).
	 * Since the caller has seen an exceptional entry for this index, we
	 * better find it at that index as well...
	 */
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
		spin_unlock_irq(&mapping->tree_lock);
		return 0;
	}
	radix_tree_delete(&mapping->page_tree, index);
	mapping->nrexceptional--;
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, true);

	return 1;
}

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, void *entry,
			 struct vm_fault *vmf)
{
	struct page *page;

	/* Hole page already exists?  Return it... */
	if (!radix_tree_exceptional_entry(entry)) {
		vmf->page = entry;
		return VM_FAULT_LOCKED;
	}

	/* This will replace locked radix tree entry with a hole page */
	page = find_or_create_page(mapping, vmf->pgoff,
				   vmf->gfp_mask | __GFP_ZERO);
	if (!page) {
		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
		return VM_FAULT_OOM;
	}
	vmf->page = page;
	return VM_FAULT_LOCKED;
}

static int copy_user_bh(struct page *to, struct inode *inode,
		struct buffer_head *bh, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	struct block_device *bdev = bh->b_bdev;
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))

static void *dax_insert_mapping_entry(struct address_space *mapping,
				      struct vm_fault *vmf,
				      void *entry, sector_t sector)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int error = 0;
	bool hole_fill = false;
	void *new_entry;
	pgoff_t index = vmf->pgoff;

	if (vmf->flags & FAULT_FLAG_WRITE)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	/* Replacing hole page with block mapping? */
	if (!radix_tree_exceptional_entry(entry)) {
		hole_fill = true;
		/*
		 * Unmap the page now before we remove it from the page cache
		 * below.  The page is locked so it cannot be faulted in again.
		 */
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
				    PAGE_SIZE, 0);
		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
		if (error)
			return ERR_PTR(error);
	}

	spin_lock_irq(&mapping->tree_lock);
	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
			     RADIX_DAX_ENTRY_LOCK);
	if (hole_fill) {
		__delete_from_page_cache(entry, NULL);
		/* Drop pagecache reference */
		put_page(entry);
		error = radix_tree_insert(page_tree, index, new_entry);
		if (error) {
			new_entry = ERR_PTR(error);
			goto unlock;
		}
		mapping->nrexceptional++;
	} else {
		void **slot;
		void *ret;

		ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
		WARN_ON_ONCE(ret != entry);
		radix_tree_replace_slot(slot, new_entry);
	}
	if (vmf->flags & FAULT_FLAG_WRITE)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	if (hole_fill) {
		radix_tree_preload_end();
		/*
		 * We don't need the hole page anymore, it has been replaced
		 * with a locked radix tree entry now.
		 */
		if (mapping->a_ops->freepage)
			mapping->a_ops->freepage(entry);
		unlock_page(entry);
		put_page(entry);
	}
	return new_entry;
}

static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int type = RADIX_DAX_TYPE(entry);
	struct radix_tree_node *node;
	struct blk_dax_ctl dax;
	void **slot;
	int ret = 0;

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * Regular page slots are stabilized by the page lock even
	 * without the tree itself locked.  These unlocked entries
	 * need verification under the tree lock.
	 */
	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
		goto unlock;
	if (*slot != entry)
		goto unlock;

	/* another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto unlock;

	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
		ret = -EIO;
		goto unlock;
	}

	dax.sector = RADIX_DAX_SECTOR(entry);
	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0)
		return ret;

	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	wb_cache_pmem(dax.addr, dax.size);

	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);
 unmap:
	dax_unmap_atomic(bdev, &dax);
	return ret;

 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end].  This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct block_device *bdev, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	pgoff_t start_index, end_index, pmd_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;
	void *entry;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	start_index = wbc->range_start >> PAGE_SHIFT;
	end_index = wbc->range_end >> PAGE_SHIFT;
	pmd_index = DAX_PMD_INDEX(start_index);

	rcu_read_lock();
	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
	rcu_read_unlock();

	/* see if the start of our range is covered by a PMD entry */
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
		start_index = pmd_index;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	wmb_pmem();
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
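/*
 * Sketch of the expected caller (assumed xxx_* names): filesystems route
 * ->writepages for DAX mappings here so that fsync/msync flushes CPU caches
 * for all dirty radix tree entries, e.g.:
 *
 *	static int xxx_writepages(struct address_space *mapping,
 *				  struct writeback_control *wbc)
 *	{
 *		if (dax_mapping(mapping))
 *			return dax_writeback_mapping_range(mapping,
 *					mapping->host->i_sb->s_bdev, wbc);
 *		...
 *	}
 */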
static int dax_insert_mapping(struct address_space *mapping,
		struct buffer_head *bh, void **entryp,
		struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	struct block_device *bdev = bh->b_bdev;
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, mapping->host),
		.size = bh->b_size,
	};
	void *ret;
	void *entry = *entryp;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	dax_unmap_atomic(bdev, &dax);

	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
	if (IS_ERR(ret))
		return PTR_ERR(ret);
	*entryp = ret;

	return vm_insert_mixed(vma, vaddr, dax.pfn);
}

/**
 * __dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files.  __dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
		get_block_t get_block)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	void *entry;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	/*
	 * Check whether offset isn't beyond end of file now.  Caller is
	 * supposed to hold locks serializing us with truncate / punch hole so
	 * this is a reliable test.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;

	entry = grab_mapping_entry(mapping, vmf->pgoff);
	if (IS_ERR(entry)) {
		error = PTR_ERR(entry);
		goto out;
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock_entry;

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;

		if (buffer_written(&bh))
			error = copy_user_bh(new_page, inode, &bh, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock_entry;
		if (!radix_tree_exceptional_entry(entry)) {
			vmf->page = entry;
			return VM_FAULT_LOCKED;
		}
		vmf->entry = entry;
		return VM_FAULT_DAX_LOCKED;
	}

	if (!buffer_mapped(&bh)) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock_entry;
		} else {
			return dax_load_hole(mapping, entry, vmf);
		}
	}

	/* Filesystem should not return unwritten buffers to us! */
	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
	error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
 unlock_entry:
	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;
}
EXPORT_SYMBOL(__dax_fault);

/**
 * dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files.
 */
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
	      get_block_t get_block)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (vmf->flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_fault(vma, vmf, get_block);
	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
EXPORT_SYMBOL_GPL(dax_fault);
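/*
 * Sketch of how a filesystem typically wires the DAX fault helpers into its
 * vm_operations_struct (the xxx_* wrappers are assumed names for filesystem
 * methods that take the relevant locks and then call the helpers in this
 * file):
 *
 *	static const struct vm_operations_struct xxx_dax_vm_ops = {
 *		.fault		= xxx_dax_fault,	// calls dax_fault()
 *		.pmd_fault	= xxx_dax_pmd_fault,	// calls dax_pmd_fault()
 *		.page_mkwrite	= xxx_dax_fault,
 *		.pfn_mkwrite	= xxx_dax_pfn_mkwrite,	// calls dax_pfn_mkwrite()
 *	};
 */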
#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
 * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below function.
 */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)

static void __dax_dbg(struct buffer_head *bh, unsigned long address,
		const char *reason, const char *fn)
{
	if (bh) {
		char bname[BDEVNAME_SIZE];
		bdevname(bh->b_bdev, bname);
		pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
			"length %zd fallback: %s\n", fn, current->comm,
			address, bname, bh->b_state, (u64)bh->b_blocknr,
			bh->b_size, reason);
	} else {
		pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
			current->comm, address, reason);
	}
}

#define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")

int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, get_block_t get_block)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct buffer_head bh;
	unsigned blkbits = inode->i_blkbits;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	struct block_device *bdev;
	pgoff_t size, pgoff;
	sector_t block;
	int result = 0;
	bool alloc = false;

	/* dax pmd mappings require pfn_t_devmap() */
	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
		return VM_FAULT_FALLBACK;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		split_huge_pmd(vma, pmd, address);
		dax_pmd_dbg(NULL, address, "cow write");
		return VM_FAULT_FALLBACK;
	}
	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start) {
		dax_pmd_dbg(NULL, address, "vma start unaligned");
		return VM_FAULT_FALLBACK;
	}
	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
		dax_pmd_dbg(NULL, address, "vma end unaligned");
		return VM_FAULT_FALLBACK;
	}

	pgoff = linear_page_index(vma, pmd_addr);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size)
		return VM_FAULT_SIGBUS;
	/* If the PMD would cover blocks out of the file */
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(NULL, address,
				"offset + huge page size > file size");
		return VM_FAULT_FALLBACK;
	}

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

	bh.b_size = PMD_SIZE;

	if (get_block(inode, block, &bh, 0) != 0)
		return VM_FAULT_SIGBUS;

	if (!buffer_mapped(&bh) && write) {
		if (get_block(inode, block, &bh, 1) != 0)
			return VM_FAULT_SIGBUS;
		alloc = true;
		WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
	}

	bdev = bh.b_bdev;

	/*
	 * If the filesystem isn't willing to tell us the length of a hole,
	 * just fall back to PTEs.  Calling get_block 512 times in a loop
	 * would be silly.
	 */
	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
		dax_pmd_dbg(&bh, address, "allocated block too small");
		return VM_FAULT_FALLBACK;
	}

	/*
	 * If we allocated new storage, make sure no process has any
	 * zero pages covering this hole
	 */
	if (alloc) {
		loff_t lstart = pgoff << PAGE_SHIFT;
		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */

		truncate_pagecache_range(inode, lstart, lend);
	}

	if (!write && !buffer_mapped(&bh)) {
		spinlock_t *ptl;
		pmd_t entry;
		struct page *zero_page = get_huge_zero_page();

		if (unlikely(!zero_page)) {
			dax_pmd_dbg(&bh, address, "no zero page");
			goto fallback;
		}

		ptl = pmd_lock(vma->vm_mm, pmd);
		if (!pmd_none(*pmd)) {
			spin_unlock(ptl);
			dax_pmd_dbg(&bh, address, "pmd already present");
			goto fallback;
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
				__func__, current->comm, address,
				(unsigned long long) to_sector(&bh, inode));

		entry = mk_pmd(zero_page, vma->vm_page_prot);
		entry = pmd_mkhuge(entry);
		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
		result = VM_FAULT_NOPAGE;
		spin_unlock(ptl);
	} else {
		struct blk_dax_ctl dax = {
			.sector = to_sector(&bh, inode),
			.size = PMD_SIZE,
		};
		long length = dax_map_atomic(bdev, &dax);

		if (length < 0) {
			dax_pmd_dbg(&bh, address, "dax-error fallback");
			goto fallback;
		}
		if (length < PMD_SIZE) {
			dax_pmd_dbg(&bh, address, "dax-length too small");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}
		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
			dax_pmd_dbg(&bh, address, "pfn unaligned");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}

		if (!pfn_t_devmap(dax.pfn)) {
			dax_unmap_atomic(bdev, &dax);
			dax_pmd_dbg(&bh, address, "pfn not in memmap");
			goto fallback;
		}
		dax_unmap_atomic(bdev, &dax);

		/*
		 * For PTE faults we insert a radix tree entry for reads, and
		 * leave it clean.  Then on the first write we dirty the radix
		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
		 * call into get_block() to translate the pgoff to a sector in
		 * order to be able to create a new radix tree entry.
		 *
		 * The PMD path doesn't have an equivalent to
		 * dax_pfn_mkwrite(), though, so for a read followed by a
		 * write we traverse all the way through __dax_pmd_fault()
		 * twice.  This means we can just skip inserting a radix tree
		 * entry completely on the initial read and just wait until
		 * the write to insert a dirty entry.
		 */
		if (write) {
			/*
			 * We should insert radix-tree entry and dirty it here.
			 * For now this is broken...
			 */
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
				__func__, current->comm, address,
				pfn_t_to_pfn(dax.pfn),
				(unsigned long long) dax.sector);
		result |= vmf_insert_pfn_pmd(vma, address, pmd,
				dax.pfn, write);
	}

 out:
	return result;

 fallback:
	count_vm_event(THP_FAULT_FALLBACK);
	result = VM_FAULT_FALLBACK;
	goto out;
}
EXPORT_SYMBOL_GPL(__dax_pmd_fault);

/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @address: The virtual address where the fault occurred
 * @pmd: The PMD entry to install a huge mapping into
 * @flags: The fault flags (FAULT_FLAG_*)
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
			pmd_t *pmd, unsigned int flags, get_block_t get_block)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_pmd_fault(vma, address, pmd, flags, get_block);
	if (flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
EXPORT_SYMBOL_GPL(dax_pmd_fault);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	void *entry;
	pgoff_t index = vmf->pgoff;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	if (!entry || !radix_tree_exceptional_entry(entry))
		goto out;
	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
	put_unlocked_mapping_entry(mapping, index, entry);
 out:
	spin_unlock_irq(&mapping->tree_lock);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);

static bool dax_range_is_aligned(struct block_device *bdev,
				 unsigned int offset, unsigned int length)
{
	unsigned short sector_size = bdev_logical_block_size(bdev);

	if (!IS_ALIGNED(offset, sector_size))
		return false;
	if (!IS_ALIGNED(length, sector_size))
		return false;

	return true;
}

int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
		unsigned int offset, unsigned int length)
{
	struct blk_dax_ctl dax = {
		.sector		= sector,
		.size		= PAGE_SIZE,
	};

	if (dax_range_is_aligned(bdev, offset, length)) {
		sector_t start_sector = dax.sector + (offset >> 9);

		return blkdev_issue_zeroout(bdev, start_sector,
				length >> 9, GFP_NOFS, true);
	} else {
		if (dax_map_atomic(bdev, &dax) < 0)
			return PTR_ERR(dax.addr);
		clear_pmem(dax.addr + offset, length);
		wmb_pmem();
		dax_unmap_atomic(bdev, &dax);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);

/**
 * dax_zero_page_range - zero a range within a page of a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @length: The number of bytes to zero
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * This function can be called by a filesystem when it is zeroing part of a
 * page in a DAX file.  This is intended for hole-punch operations.  If
 * you are truncating a file, the helper function dax_truncate_page() may be
 * more convenient.
 */
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
							get_block_t get_block)
{
	struct buffer_head bh;
	pgoff_t index = from >> PAGE_SHIFT;
	unsigned offset = from & (PAGE_SIZE-1);
	int err;

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;
	BUG_ON((offset + length) > PAGE_SIZE);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;
	err = get_block(inode, index, &bh, 0);
	if (err < 0 || !buffer_written(&bh))
		return err;

	return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
			offset, length);
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

/**
 * dax_truncate_page - handle a partial page being truncated in a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * Similar to block_truncate_page(), this function can be called by a
 * filesystem when it is truncating a DAX file to handle the partial page.
 */
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
	unsigned length = PAGE_ALIGN(from) - from;
	return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
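/*
 * Usage sketch (assumed names): a filesystem truncating an inode down to
 * 'newsize' typically zeroes the tail of the last partial page with
 * dax_truncate_page() before shrinking i_size, e.g.:
 *
 *	if (IS_DAX(inode))
 *		error = dax_truncate_page(inode, newsize, xxx_get_block);
 *	else
 *		error = block_truncate_page(inode->i_mapping,
 *					    newsize, xxx_get_block);
 */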