/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	struct request_queue *q = bdev->bd_queue;
	long rc = -EIO;

	dax->addr = ERR_PTR(-EIO);
	if (blk_queue_enter(q, true) != 0)
		return rc;

	rc = bdev_direct_access(bdev, dax);
	if (rc < 0) {
		dax->addr = ERR_PTR(rc);
		blk_queue_exit(q);
		return rc;
	}
	return rc;
}

static void dax_unmap_atomic(struct block_device *bdev,
		const struct blk_dax_ctl *dax)
{
	if (IS_ERR(dax->addr))
		return;
	blk_queue_exit(bdev->bd_queue);
}

static int dax_is_pmd_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_PMD;
}

static int dax_is_pte_entry(void *entry)
{
	return !((unsigned long)entry & RADIX_DAX_PMD);
}

static int dax_is_zero_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_HZP;
}

static int dax_is_empty_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_EMPTY;
}

struct page *read_dax_sector(struct block_device *bdev, sector_t n)
{
	struct page *page = alloc_pages(GFP_KERNEL, 0);
	struct blk_dax_ctl dax = {
		.size = PAGE_SIZE,
		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
	};
	long rc;

	if (!page)
		return ERR_PTR(-ENOMEM);

	rc = dax_map_atomic(bdev, &dax);
	if (rc < 0)
		return ERR_PTR(rc);
	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
	dax_unmap_atomic(bdev, &dax);
	return page;
}

/*
 * DAX radix tree locking
 */
struct exceptional_entry_key {
	struct address_space *mapping;
	pgoff_t entry_start;
};

struct wait_exceptional_entry_queue {
	wait_queue_t wait;
	struct exceptional_entry_key key;
};
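
/*
 * Locking overview: DAX keeps exceptional entries in mapping->page_tree and
 * uses the RADIX_DAX_ENTRY_LOCK bit of the entry value as a per-entry lock.
 * A task that finds an entry locked keys a wait queue off (mapping, index)
 * using the structures above, hashes into wait_table[] via
 * dax_entry_waitqueue() and sleeps until the lock holder wakes it through
 * dax_wake_mapping_entry_waiter().
 */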

static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
		pgoff_t index, void *entry, struct exceptional_entry_key *key)
{
	unsigned long hash;

	/*
	 * If 'entry' is a PMD, align the 'index' that we use for the wait
	 * queue to the start of that PMD.  This ensures that all offsets in
	 * the range covered by the PMD map to the same bit lock.
	 */
	if (dax_is_pmd_entry(entry))
		index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);

	key->mapping = mapping;
	key->entry_start = index;

	hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}

static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
				       int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->mapping != ewait->key.mapping ||
	    key->entry_start != ewait->key.entry_start)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * Check whether the given slot is locked. The function must be called with
 * mapping->tree_lock held.
 */
static inline int slot_locked(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
	return entry & RADIX_DAX_ENTRY_LOCK;
}

/*
 * Mark the given slot as locked. The function must be called with
 * mapping->tree_lock held.
 */
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry |= RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
	return (void *)entry;
}

/*
 * Mark the given slot as unlocked. The function must be called with
 * mapping->tree_lock held.
 */
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
	return (void *)entry;
}

/*
 * Lookup entry in radix tree, wait for it to become unlocked if it is
 * exceptional entry and return it. The caller must call
 * put_unlocked_mapping_entry() when he decided not to lock the entry or
 * put_locked_mapping_entry() when he locked the entry and now wants to
 * unlock it.
 *
 * The function must be called with mapping->tree_lock held.
 */
static void *get_unlocked_mapping_entry(struct address_space *mapping,
					pgoff_t index, void ***slotp)
{
	void *entry, **slot;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	for (;;) {
		entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
					    &slot);
		if (!entry || !radix_tree_exceptional_entry(entry) ||
		    !slot_locked(mapping, slot)) {
			if (slotp)
				*slotp = slot;
			return entry;
		}

		wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&mapping->tree_lock);
		schedule();
		finish_wait(wq, &ewait.wait);
		spin_lock_irq(&mapping->tree_lock);
	}
}

static void dax_unlock_mapping_entry(struct address_space *mapping,
				     pgoff_t index)
{
	void *entry, **slot;

	spin_lock_irq(&mapping->tree_lock);
	entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
			 !slot_locked(mapping, slot))) {
		spin_unlock_irq(&mapping->tree_lock);
		return;
	}
	unlock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}

static void put_locked_mapping_entry(struct address_space *mapping,
				     pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry)) {
		unlock_page(entry);
		put_page(entry);
	} else {
		dax_unlock_mapping_entry(mapping, index);
	}
}

/*
 * Called when we are done with radix tree entry we looked up via
 * get_unlocked_mapping_entry() and which we didn't lock in the end.
 */
static void put_unlocked_mapping_entry(struct address_space *mapping,
				       pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry))
		return;

	/* We have to wake up next waiter for the radix tree entry lock */
	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}
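
/*
 * Illustrative pairing of the helpers above (sketch only, not a helper in
 * this file); the fault and writeback paths below follow this pattern:
 *
 *	spin_lock_irq(&mapping->tree_lock);
 *	entry = get_unlocked_mapping_entry(mapping, index, &slot);
 *	if (<need to modify the entry>)
 *		entry = lock_slot(mapping, slot);
 *	else
 *		put_unlocked_mapping_entry(mapping, index, entry);
 *	spin_unlock_irq(&mapping->tree_lock);
 *
 *	<work on the locked entry with tree_lock dropped>
 *	put_locked_mapping_entry(mapping, index, entry);
 */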

/*
 * Find radix tree entry at given index. If it points to a page, return with
 * the page locked. If it points to the exceptional entry, return with the
 * radix tree entry locked. If the radix tree doesn't contain given index,
 * create empty exceptional entry for the index and return with it locked.
 *
 * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return an error.  This error will
 * happen if there are any 4k entries (either zero pages or DAX entries)
 * within the 2MiB range that we are requesting.
 *
 * We always favor 4k entries over 2MiB entries. There isn't a flow where we
 * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
 * insertion will fail if it finds any 4k entries already in the tree, and a
 * 4k insertion will cause an existing 2MiB entry to be unmapped and
 * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
 * well as 2MiB empty entries.
 *
 * The exception to this downgrade path is for 2MiB DAX PMD entries that have
 * real storage backing them.  We will leave these real 2MiB DAX entries in
 * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 */
static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
		unsigned long size_flag)
{
	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
	void *entry, **slot;

restart:
	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, &slot);

	if (entry) {
		if (size_flag & RADIX_DAX_PMD) {
			if (!radix_tree_exceptional_entry(entry) ||
			    dax_is_pte_entry(entry)) {
				put_unlocked_mapping_entry(mapping, index,
						entry);
				entry = ERR_PTR(-EEXIST);
				goto out_unlock;
			}
		} else { /* trying to grab a PTE entry */
			if (radix_tree_exceptional_entry(entry) &&
			    dax_is_pmd_entry(entry) &&
			    (dax_is_zero_entry(entry) ||
			     dax_is_empty_entry(entry))) {
				pmd_downgrade = true;
			}
		}
	}

	/* No entry for given index? Make sure radix tree is big enough. */
	if (!entry || pmd_downgrade) {
		int err;

		if (pmd_downgrade) {
			/*
			 * Make sure 'entry' remains valid while we drop
			 * mapping->tree_lock.
			 */
			entry = lock_slot(mapping, slot);
		}

		spin_unlock_irq(&mapping->tree_lock);
		/*
		 * Besides huge zero pages the only other thing that gets
		 * downgraded are empty entries which don't need to be
		 * unmapped.
		 */
		if (pmd_downgrade && dax_is_zero_entry(entry))
			unmap_mapping_range(mapping,
				(index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);

		err = radix_tree_preload(
				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
		if (err) {
			if (pmd_downgrade)
				put_locked_mapping_entry(mapping, index, entry);
			return ERR_PTR(err);
		}
		spin_lock_irq(&mapping->tree_lock);

		if (!entry) {
			/*
			 * We needed to drop the page_tree lock while calling
			 * radix_tree_preload() and we didn't have an entry to
			 * lock.  See if another thread inserted an entry at
			 * our index during this time.
			 */
			entry = __radix_tree_lookup(&mapping->page_tree, index,
					NULL, &slot);
			if (entry) {
				radix_tree_preload_end();
				spin_unlock_irq(&mapping->tree_lock);
				goto restart;
			}
		}

		if (pmd_downgrade) {
			radix_tree_delete(&mapping->page_tree, index);
			mapping->nrexceptional--;
			dax_wake_mapping_entry_waiter(mapping, index, entry,
					true);
		}

		entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);

		err = __radix_tree_insert(&mapping->page_tree, index,
				dax_radix_order(entry), entry);
		radix_tree_preload_end();
		if (err) {
			spin_unlock_irq(&mapping->tree_lock);
			/*
			 * Our insertion of a DAX entry failed, most likely
			 * because we were inserting a PMD entry and it
			 * collided with a PTE sized entry at a different
			 * index in the PMD range.  We haven't inserted
			 * anything into the radix tree and have no waiters to
			 * wake.
			 */
			return ERR_PTR(err);
		}
		/* Good, we have inserted empty locked entry into the tree. */
		mapping->nrexceptional++;
		spin_unlock_irq(&mapping->tree_lock);
		return entry;
	}
	/* Normal page in radix tree? */
	if (!radix_tree_exceptional_entry(entry)) {
		struct page *page = entry;

		get_page(page);
		spin_unlock_irq(&mapping->tree_lock);
		lock_page(page);
		/* Page got truncated? Retry... */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto restart;
		}
		return page;
	}
	entry = lock_slot(mapping, slot);
 out_unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return entry;
}

/*
 * We do not necessarily hold the mapping->tree_lock when we call this
 * function so it is possible that 'entry' is no longer a valid item in the
 * radix tree.  This is okay because all we really need to do is to find the
 * correct waitqueue where tasks might be waiting for that old 'entry' and
 * wake them.
 */
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
		pgoff_t index, void *entry, bool wake_all)
{
	struct exceptional_entry_key key;
	wait_queue_head_t *wq;

	wq = dax_entry_waitqueue(mapping, index, entry, &key);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under mapping->tree_lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq))
		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
}

static int __dax_invalidate_mapping_entry(struct address_space *mapping,
					  pgoff_t index, bool trunc)
{
	int ret = 0;
	void *entry;
	struct radix_tree_root *page_tree = &mapping->page_tree;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	if (!entry || !radix_tree_exceptional_entry(entry))
		goto out;
	if (!trunc &&
	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
	     radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
		goto out;
	radix_tree_delete(page_tree, index);
	mapping->nrexceptional--;
	ret = 1;
out:
	put_unlocked_mapping_entry(mapping, index, entry);
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
 * entry to get unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = __dax_invalidate_mapping_entry(mapping, index, true);

	/*
	 * This gets called from truncate / punch_hole path. As such, the caller
	 * must hold locks protecting against concurrent modifications of the
	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
	 * caller has seen exceptional entry for this index, we better find it
	 * at that index as well...
	 */
	WARN_ON_ONCE(!ret);
	return ret;
}
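
/*
 * Note on the invalidation entry points in this file (expected callers, as
 * of this kernel version): dax_delete_mapping_entry() above is meant for the
 * truncate / hole-punch path and always removes the entry,
 * dax_invalidate_mapping_entry() below is the non-blocking variant used for
 * invalidate_inode_pages(), and dax_invalidate_mapping_entry_sync() only
 * removes clean entries and is intended for invalidate_inode_pages2() style
 * callers.  All but the non-blocking one funnel through
 * __dax_invalidate_mapping_entry().
 */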

/*
 * Invalidate exceptional DAX entry if easily possible. This handles DAX
 * entries for invalidate_inode_pages() so we evict the entry only if we can
 * do so without blocking.
 */
int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = 0;
	void *entry, **slot;
	struct radix_tree_root *page_tree = &mapping->page_tree;

	spin_lock_irq(&mapping->tree_lock);
	entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
	if (!entry || !radix_tree_exceptional_entry(entry) ||
	    slot_locked(mapping, slot))
		goto out;
	if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
	    radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto out;
	radix_tree_delete(page_tree, index);
	mapping->nrexceptional--;
	ret = 1;
out:
	spin_unlock_irq(&mapping->tree_lock);
	if (ret)
		dax_wake_mapping_entry_waiter(mapping, index, entry, true);
	return ret;
}

/*
 * Invalidate exceptional DAX entry if it is clean.
 */
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{
	return __dax_invalidate_mapping_entry(mapping, index, false);
}

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, void **entry,
			 struct vm_fault *vmf)
{
	struct page *page;
	int ret;

	/* Hole page already exists? Return it...  */
	if (!radix_tree_exceptional_entry(*entry)) {
		page = *entry;
		goto out;
	}

	/* This will replace locked radix tree entry with a hole page */
	page = find_or_create_page(mapping, vmf->pgoff,
				   vmf->gfp_mask | __GFP_ZERO);
	if (!page)
		return VM_FAULT_OOM;
 out:
	vmf->page = page;
	ret = finish_fault(vmf);
	vmf->page = NULL;
	*entry = page;
	if (!ret) {
		/* Grab reference for PTE that is now referencing the page */
		get_page(page);
		return VM_FAULT_NOPAGE;
	}
	return ret;
}

static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
		struct page *to, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = size,
	};
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

/*
 * By this point grab_mapping_entry() has ensured that we have a locked entry
 * of the appropriate size so we don't have to worry about downgrading PMDs to
 * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
 * already in the tree, we will skip the insertion and just dirty the PMD as
 * appropriate.
 */
static void *dax_insert_mapping_entry(struct address_space *mapping,
				      struct vm_fault *vmf,
				      void *entry, sector_t sector,
				      unsigned long flags)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int error = 0;
	bool hole_fill = false;
	void *new_entry;
	pgoff_t index = vmf->pgoff;

	if (vmf->flags & FAULT_FLAG_WRITE)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	/* Replacing hole page with block mapping? */
	if (!radix_tree_exceptional_entry(entry)) {
		hole_fill = true;
		/*
		 * Unmap the page now before we remove it from page cache below.
		 * The page is locked so it cannot be faulted in again.
		 */
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
				    PAGE_SIZE, 0);
		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
		if (error)
			return ERR_PTR(error);
	} else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
		/* replacing huge zero page with PMD block mapping */
		unmap_mapping_range(mapping,
			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
	}

	spin_lock_irq(&mapping->tree_lock);
	new_entry = dax_radix_locked_entry(sector, flags);

	if (hole_fill) {
		__delete_from_page_cache(entry, NULL);
		/* Drop pagecache reference */
		put_page(entry);
		error = __radix_tree_insert(page_tree, index,
				dax_radix_order(new_entry), new_entry);
		if (error) {
			new_entry = ERR_PTR(error);
			goto unlock;
		}
		mapping->nrexceptional++;
	} else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
		/*
		 * Only swap our new entry into the radix tree if the current
		 * entry is a zero page or an empty entry.  If a normal PTE or
		 * PMD entry is already in the tree, we leave it alone.  This
		 * means that if we are trying to insert a PTE and the
		 * existing entry is a PMD, we will just leave the PMD in the
		 * tree and dirty it if necessary.
		 */
		struct radix_tree_node *node;
		void **slot;
		void *ret;

		ret = __radix_tree_lookup(page_tree, index, &node, &slot);
		WARN_ON_ONCE(ret != entry);
		__radix_tree_replace(page_tree, node, slot,
				     new_entry, NULL, NULL);
	}
	if (vmf->flags & FAULT_FLAG_WRITE)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	if (hole_fill) {
		radix_tree_preload_end();
		/*
		 * We don't need hole page anymore, it has been replaced with
		 * locked radix tree entry now.
		 */
		if (mapping->a_ops->freepage)
			mapping->a_ops->freepage(entry);
		unlock_page(entry);
		put_page(entry);
	}
	return new_entry;
}

static inline unsigned long
pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
{
	unsigned long address;

	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
	return address;
}

/* Walk all mappings of a given index of a file and writeprotect them */
static void dax_mapping_entry_mkclean(struct address_space *mapping,
				      pgoff_t index, unsigned long pfn)
{
	struct vm_area_struct *vma;
	pte_t pte, *ptep = NULL;
	pmd_t *pmdp = NULL;
	spinlock_t *ptl;
	bool changed;

	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
		unsigned long address;

		cond_resched();

		if (!(vma->vm_flags & VM_SHARED))
			continue;

		address = pgoff_address(index, vma);
		changed = false;
		if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl))
			continue;

		if (pmdp) {
#ifdef CONFIG_FS_DAX_PMD
			pmd_t pmd;

			if (pfn != pmd_pfn(*pmdp))
				goto unlock_pmd;
			if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
				goto unlock_pmd;

			flush_cache_page(vma, address, pfn);
			pmd = pmdp_huge_clear_flush(vma, address, pmdp);
			pmd = pmd_wrprotect(pmd);
			pmd = pmd_mkclean(pmd);
			set_pmd_at(vma->vm_mm, address, pmdp, pmd);
			changed = true;
unlock_pmd:
			spin_unlock(ptl);
#endif
		} else {
			if (pfn != pte_pfn(*ptep))
				goto unlock_pte;
			if (!pte_dirty(*ptep) && !pte_write(*ptep))
				goto unlock_pte;

			flush_cache_page(vma, address, pfn);
			pte = ptep_clear_flush(vma, address, ptep);
			pte = pte_wrprotect(pte);
			pte = pte_mkclean(pte);
			set_pte_at(vma->vm_mm, address, ptep, pte);
			changed = true;
unlock_pte:
			pte_unmap_unlock(ptep, ptl);
		}

		if (changed)
			mmu_notifier_invalidate_page(vma->vm_mm, address);
	}
	i_mmap_unlock_read(mapping);
}

static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	struct blk_dax_ctl dax;
	void *entry2, **slot;
	int ret = 0;

	/*
	 * A page got tagged dirty in DAX mapping? Something is seriously
	 * wrong.
	 */
	if (WARN_ON(!radix_tree_exceptional_entry(entry)))
		return -EIO;

	spin_lock_irq(&mapping->tree_lock);
	entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
	/* Entry got punched out / reallocated? */
	if (!entry2 || !radix_tree_exceptional_entry(entry2))
		goto put_unlocked;
	/*
	 * Entry got reallocated elsewhere? No need to writeback. We have to
	 * compare sectors as we must not bail out due to difference in lockbit
	 * or entry type.
	 */
	if (dax_radix_sector(entry2) != dax_radix_sector(entry))
		goto put_unlocked;
	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
				dax_is_zero_entry(entry))) {
		ret = -EIO;
		goto put_unlocked;
	}

	/* Another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto put_unlocked;
	/* Lock the entry to serialize with page faults */
	entry = lock_slot(mapping, slot);
	/*
	 * We can clear the tag now but we have to be careful so that concurrent
	 * dax_writeback_one() calls for the same index cannot finish before we
	 * actually flush the caches. This is achieved as the calls will look
	 * at the entry only under tree_lock and once they do that they will
	 * see the entry locked and wait for it to unlock.
	 */
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
	 * in the middle of a PMD, the 'index' we are given will be aligned to
	 * the start index of the PMD, as will the sector we pull from
	 * 'entry'.  This allows us to flush for PMD_SIZE and not have to
	 * worry about partial PMD writebacks.
	 */
	dax.sector = dax_radix_sector(entry);
	dax.size = PAGE_SIZE << dax_radix_order(entry);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0) {
		put_locked_mapping_entry(mapping, index, entry);
		return ret;
	}

	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
	wb_cache_pmem(dax.addr, dax.size);
	/*
	 * After we have flushed the cache, we can clear the dirty tag. There
	 * cannot be new dirty data in the pfn after the flush has completed as
	 * the pfn mappings are writeprotected and fault waits for mapping
	 * entry lock.
	 */
	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
	spin_unlock_irq(&mapping->tree_lock);
 unmap:
	dax_unmap_atomic(bdev, &dax);
	put_locked_mapping_entry(mapping, index, entry);
	return ret;

 put_unlocked:
	put_unlocked_mapping_entry(mapping, index, entry2);
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}
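
/*
 * Illustrative caller (sketch only, with hypothetical names): a DAX-capable
 * filesystem typically invokes dax_writeback_mapping_range(), defined below,
 * from its ->writepages() address_space operation, e.g.:
 *
 *	static int foo_dax_writepages(struct address_space *mapping,
 *				      struct writeback_control *wbc)
 *	{
 *		return dax_writeback_mapping_range(mapping,
 *				mapping->host->i_sb->s_bdev, wbc);
 *	}
 */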

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct block_device *bdev, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	pgoff_t start_index, end_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	start_index = wbc->range_start >> PAGE_SHIFT;
	end_index = wbc->range_end >> PAGE_SHIFT;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

static int dax_insert_mapping(struct address_space *mapping,
		struct block_device *bdev, sector_t sector, size_t size,
		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = vmf->address;
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = size,
	};
	void *ret;
	void *entry = *entryp;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	dax_unmap_atomic(bdev, &dax);

	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
	if (IS_ERR(ret))
		return PTR_ERR(ret);
	*entryp = ret;

	return vm_insert_mixed(vma, vaddr, dax.pfn);
}

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vmf: The description of the fault
 */
int dax_pfn_mkwrite(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	void *entry, **slot;
	pgoff_t index = vmf->pgoff;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, &slot);
	if (!entry || !radix_tree_exceptional_entry(entry)) {
		if (entry)
			put_unlocked_mapping_entry(mapping, index, entry);
		spin_unlock_irq(&mapping->tree_lock);
		return VM_FAULT_NOPAGE;
	}
	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
	entry = lock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	/*
	 * If we race with somebody updating the PTE and finish_mkwrite_fault()
	 * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
	 * the fault in either case.
	 */
	finish_mkwrite_fault(vmf);
	put_locked_mapping_entry(mapping, index, entry);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);

static bool dax_range_is_aligned(struct block_device *bdev,
				 unsigned int offset, unsigned int length)
{
	unsigned short sector_size = bdev_logical_block_size(bdev);

	if (!IS_ALIGNED(offset, sector_size))
		return false;
	if (!IS_ALIGNED(length, sector_size))
		return false;

	return true;
}

int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
		unsigned int offset, unsigned int length)
{
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = PAGE_SIZE,
	};

	if (dax_range_is_aligned(bdev, offset, length)) {
		sector_t start_sector = dax.sector + (offset >> 9);

		return blkdev_issue_zeroout(bdev, start_sector,
				length >> 9, GFP_NOFS, 0);
	} else {
		if (dax_map_atomic(bdev, &dax) < 0)
			return PTR_ERR(dax.addr);
		clear_pmem(dax.addr + offset, length);
		dax_unmap_atomic(bdev, &dax);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);

static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
}

static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct iov_iter *iter = data;
	loff_t end = pos + length, done = 0;
	ssize_t ret = 0;

	if (iov_iter_rw(iter) == READ) {
		end = min(end, i_size_read(inode));
		if (pos >= end)
			return 0;

		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
			return iov_iter_zero(min(length, end - pos), iter);
	}

	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
		return -EIO;

	/*
	 * Write can allocate block for an area which has a hole page mapped
	 * into page tables. We have to tear down these mappings so that data
	 * written by write(2) is visible in mmap.
	 */
	if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
		invalidate_inode_pages2_range(inode->i_mapping,
					      pos >> PAGE_SHIFT,
					      (end - 1) >> PAGE_SHIFT);
	}

	while (pos < end) {
		unsigned offset = pos & (PAGE_SIZE - 1);
		struct blk_dax_ctl dax = { 0 };
		ssize_t map_len;

		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		dax.sector = dax_iomap_sector(iomap, pos);
		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
		map_len = dax_map_atomic(iomap->bdev, &dax);
		if (map_len < 0) {
			ret = map_len;
			break;
		}

		dax.addr += offset;
		map_len -= offset;
		if (map_len > end - pos)
			map_len = end - pos;

		if (iov_iter_rw(iter) == WRITE)
			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
		else
			map_len = copy_to_iter(dax.addr, map_len, iter);
		dax_unmap_atomic(iomap->bdev, &dax);
		if (map_len <= 0) {
			ret = map_len ? map_len : -EFAULT;
			break;
		}

		pos += map_len;
		length -= map_len;
		done += map_len;
	}

	return done ? done : ret;
}
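
/*
 * Illustrative caller (sketch only, with hypothetical names): filesystems
 * drive the actor above through dax_iomap_rw(), defined next, from their
 * ->read_iter()/->write_iter() methods while holding the inode lock, e.g.:
 *
 *	static ssize_t foo_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
 *	{
 *		ssize_t ret;
 *
 *		inode_lock_shared(file_inode(iocb->ki_filp));
 *		ret = dax_iomap_rw(iocb, to, &foo_iomap_ops);
 *		inode_unlock_shared(file_inode(iocb->ki_filp));
 *		return ret;
 *	}
 */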

/**
 * dax_iomap_rw - Perform I/O to a DAX file
 * @iocb:	The control block for this I/O
 * @iter:	The addresses to do I/O from or to
 * @ops:	iomap ops passed from the file system
 *
 * This function performs read and write operations to directly mapped
 * persistent memory.  The caller needs to take care of read/write exclusion
 * and evicting any page cache pages in the region under I/O.
 */
ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = mapping->host;
	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
	unsigned flags = 0;

	if (iov_iter_rw(iter) == WRITE) {
		lockdep_assert_held_exclusive(&inode->i_rwsem);
		flags |= IOMAP_WRITE;
	} else {
		lockdep_assert_held(&inode->i_rwsem);
	}

	while (iov_iter_count(iter)) {
		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
				iter, dax_iomap_actor);
		if (ret <= 0)
			break;
		pos += ret;
		done += ret;
	}

	iocb->ki_pos += done;
	return done ? done : ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);

static int dax_fault_return(int error)
{
	if (error == 0)
		return VM_FAULT_NOPAGE;
	if (error == -ENOMEM)
		return VM_FAULT_OOM;
	return VM_FAULT_SIGBUS;
}

static int dax_iomap_pte_fault(struct vm_fault *vmf,
			       const struct iomap_ops *ops)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned long vaddr = vmf->address;
	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
	sector_t sector;
	struct iomap iomap = { 0 };
	unsigned flags = IOMAP_FAULT;
	int error, major = 0;
	int vmf_ret = 0;
	void *entry;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is supposed
	 * to hold locks serializing us with truncate / punch hole so this is
	 * a reliable test.
	 */
	if (pos >= i_size_read(inode))
		return VM_FAULT_SIGBUS;

	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
		flags |= IOMAP_WRITE;

	/*
	 * Note that we don't bother to use iomap_apply here: DAX requires
	 * the file system block size to be equal to the page size, which means
	 * that we never have to deal with more than a single extent here.
	 */
	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
	if (error)
		return dax_fault_return(error);
	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
		vmf_ret = dax_fault_return(-EIO);	/* fs corruption? */
		goto finish_iomap;
	}

	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
	if (IS_ERR(entry)) {
		vmf_ret = dax_fault_return(PTR_ERR(entry));
		goto finish_iomap;
	}

	sector = dax_iomap_sector(&iomap, pos);

	if (vmf->cow_page) {
		switch (iomap.type) {
		case IOMAP_HOLE:
		case IOMAP_UNWRITTEN:
			clear_user_highpage(vmf->cow_page, vaddr);
			break;
		case IOMAP_MAPPED:
			error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
					vmf->cow_page, vaddr);
			break;
		default:
			WARN_ON_ONCE(1);
			error = -EIO;
			break;
		}

		if (error)
			goto error_unlock_entry;

		__SetPageUptodate(vmf->cow_page);
		vmf_ret = finish_fault(vmf);
		if (!vmf_ret)
			vmf_ret = VM_FAULT_DONE_COW;
		goto unlock_entry;
	}

	switch (iomap.type) {
	case IOMAP_MAPPED:
		if (iomap.flags & IOMAP_F_NEW) {
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
		}
		error = dax_insert_mapping(mapping, iomap.bdev, sector,
				PAGE_SIZE, &entry, vmf->vma, vmf);
		/* -EBUSY is fine, somebody else faulted on the same PTE */
		if (error == -EBUSY)
			error = 0;
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
			vmf_ret = dax_load_hole(mapping, &entry, vmf);
			goto unlock_entry;
		}
		/*FALLTHRU*/
	default:
		WARN_ON_ONCE(1);
		error = -EIO;
		break;
	}

 error_unlock_entry:
	vmf_ret = dax_fault_return(error) | major;
 unlock_entry:
	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 finish_iomap:
	if (ops->iomap_end) {
		int copied = PAGE_SIZE;

		if (vmf_ret & VM_FAULT_ERROR)
			copied = 0;
		/*
		 * The fault is done by now and there's no way back (other
		 * thread may be already happily using PTE we have installed).
		 * Just ignore error from ->iomap_end since we cannot do much
		 * with it.
		 */
		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
	}
	return vmf_ret;
}

#ifdef CONFIG_FS_DAX_PMD
/*
 * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below functions.
 */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)

static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
		loff_t pos, void **entryp)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	struct block_device *bdev = iomap->bdev;
	struct inode *inode = mapping->host;
	struct blk_dax_ctl dax = {
		.sector = dax_iomap_sector(iomap, pos),
		.size = PMD_SIZE,
	};
	long length = dax_map_atomic(bdev, &dax);
	void *ret = NULL;

	if (length < 0) /* dax_map_atomic() failed */
		goto fallback;
	if (length < PMD_SIZE)
		goto unmap_fallback;
	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
		goto unmap_fallback;
	if (!pfn_t_devmap(dax.pfn))
		goto unmap_fallback;

	dax_unmap_atomic(bdev, &dax);

	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
			RADIX_DAX_PMD);
	if (IS_ERR(ret))
		goto fallback;
	*entryp = ret;

	trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret);
	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
			dax.pfn, vmf->flags & FAULT_FLAG_WRITE);

 unmap_fallback:
	dax_unmap_atomic(bdev, &dax);
 fallback:
	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length,
			dax.pfn, ret);
	return VM_FAULT_FALLBACK;
}

static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
		void **entryp)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	struct inode *inode = mapping->host;
	struct page *zero_page;
	void *ret = NULL;
	spinlock_t *ptl;
	pmd_t pmd_entry;

	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);

	if (unlikely(!zero_page))
		goto fallback;

	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
			RADIX_DAX_PMD | RADIX_DAX_HZP);
	if (IS_ERR(ret))
		goto fallback;
	*entryp = ret;

	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	if (!pmd_none(*(vmf->pmd))) {
		spin_unlock(ptl);
		goto fallback;
	}

	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
	pmd_entry = pmd_mkhuge(pmd_entry);
	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
	spin_unlock(ptl);
	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
	return VM_FAULT_NOPAGE;

fallback:
	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
	return VM_FAULT_FALLBACK;
}

static int dax_iomap_pmd_fault(struct vm_fault *vmf,
			       const struct iomap_ops *ops)
{
	struct vm_area_struct *vma = vmf->vma;
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
	struct inode *inode = mapping->host;
	int result = VM_FAULT_FALLBACK;
	struct iomap iomap = { 0 };
	pgoff_t max_pgoff, pgoff;
	void *entry;
	loff_t pos;
	int error;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is
	 * supposed to hold locks serializing us with truncate / punch hole so
	 * this is a reliable test.
	 */
	pgoff = linear_page_index(vma, pmd_addr);
	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED))
		goto fallback;

	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start)
		goto fallback;
	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
		goto fallback;

	if (pgoff > max_pgoff) {
		result = VM_FAULT_SIGBUS;
		goto out;
	}

	/* If the PMD would extend beyond the file size */
	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
		goto fallback;

	/*
	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
	 * setting up a mapping, so really we're using iomap_begin() as a way
	 * to look up our filesystem block.
	 */
	pos = (loff_t)pgoff << PAGE_SHIFT;
	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
	if (error)
		goto fallback;

	if (iomap.offset + iomap.length < pos + PMD_SIZE)
		goto finish_iomap;

	/*
	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
	 * the tree, for instance), it will return -EEXIST and we just fall
	 * back to 4k entries.
	 */
	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
	if (IS_ERR(entry))
		goto finish_iomap;

	switch (iomap.type) {
	case IOMAP_MAPPED:
		result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(write))
			goto unlock_entry;
		result = dax_pmd_load_hole(vmf, &iomap, &entry);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

 unlock_entry:
	put_locked_mapping_entry(mapping, pgoff, entry);
 finish_iomap:
	if (ops->iomap_end) {
		int copied = PMD_SIZE;

		if (result == VM_FAULT_FALLBACK)
			copied = 0;
		/*
		 * The fault is done by now and there's no way back (other
		 * thread may be already happily using PMD we have installed).
		 * Just ignore error from ->iomap_end since we cannot do much
		 * with it.
		 */
		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
				&iomap);
	}
 fallback:
	if (result == VM_FAULT_FALLBACK) {
		split_huge_pmd(vma, vmf->pmd, vmf->address);
		count_vm_event(THP_FAULT_FALLBACK);
	}
 out:
	trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
	return result;
}
#else
static int dax_iomap_pmd_fault(struct vm_fault *vmf,
			       const struct iomap_ops *ops)
{
	return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vmf: The description of the fault
 * @pe_size: Size of the page to fault in
 * @ops: iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in
 * their fault handler for DAX files. dax_iomap_fault() assumes the caller
 * has done all the necessary locking for page fault to proceed
 * successfully.
 */
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
		    const struct iomap_ops *ops)
{
	switch (pe_size) {
	case PE_SIZE_PTE:
		return dax_iomap_pte_fault(vmf, ops);
	case PE_SIZE_PMD:
		return dax_iomap_pmd_fault(vmf, ops);
	default:
		return VM_FAULT_FALLBACK;
	}
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
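
/*
 * Illustrative wiring (sketch only, with hypothetical names) of the fault
 * entry points exported above into a filesystem's vm_operations_struct.
 * The caller remains responsible for the locking documented at each helper:
 *
 *	static int foo_dax_huge_fault(struct vm_fault *vmf,
 *				      enum page_entry_size pe_size)
 *	{
 *		return dax_iomap_fault(vmf, pe_size, &foo_iomap_ops);
 *	}
 *
 *	static int foo_dax_fault(struct vm_fault *vmf)
 *	{
 *		return foo_dax_huge_fault(vmf, PE_SIZE_PTE);
 *	}
 *
 *	static const struct vm_operations_struct foo_dax_vm_ops = {
 *		.fault		= foo_dax_fault,
 *		.huge_fault	= foo_dax_huge_fault,
 *		.page_mkwrite	= foo_dax_fault,
 *		.pfn_mkwrite	= dax_pfn_mkwrite,
 *	};
 */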