1 /* 2 * fs/dax.c - Direct Access filesystem code 3 * Copyright (c) 2013-2014 Intel Corporation 4 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> 5 * Author: Ross Zwisler <ross.zwisler@linux.intel.com> 6 * 7 * This program is free software; you can redistribute it and/or modify it 8 * under the terms and conditions of the GNU General Public License, 9 * version 2, as published by the Free Software Foundation. 10 * 11 * This program is distributed in the hope it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 14 * more details. 15 */ 16 17 #include <linux/atomic.h> 18 #include <linux/blkdev.h> 19 #include <linux/buffer_head.h> 20 #include <linux/dax.h> 21 #include <linux/fs.h> 22 #include <linux/genhd.h> 23 #include <linux/highmem.h> 24 #include <linux/memcontrol.h> 25 #include <linux/mm.h> 26 #include <linux/mutex.h> 27 #include <linux/pagevec.h> 28 #include <linux/sched.h> 29 #include <linux/sched/signal.h> 30 #include <linux/uio.h> 31 #include <linux/vmstat.h> 32 #include <linux/pfn_t.h> 33 #include <linux/sizes.h> 34 #include <linux/mmu_notifier.h> 35 #include <linux/iomap.h> 36 #include "internal.h" 37 38 #define CREATE_TRACE_POINTS 39 #include <trace/events/fs_dax.h> 40 41 /* We choose 4096 entries - same as per-zone page wait tables */ 42 #define DAX_WAIT_TABLE_BITS 12 43 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) 44 45 static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; 46 47 static int __init init_dax_wait_table(void) 48 { 49 int i; 50 51 for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++) 52 init_waitqueue_head(wait_table + i); 53 return 0; 54 } 55 fs_initcall(init_dax_wait_table); 56 57 static int dax_is_pmd_entry(void *entry) 58 { 59 return (unsigned long)entry & RADIX_DAX_PMD; 60 } 61 62 static int dax_is_pte_entry(void *entry) 63 { 64 return !((unsigned long)entry & RADIX_DAX_PMD); 65 } 66 67 static int dax_is_zero_entry(void *entry) 68 { 69 return (unsigned long)entry & RADIX_DAX_HZP; 70 } 71 72 static int dax_is_empty_entry(void *entry) 73 { 74 return (unsigned long)entry & RADIX_DAX_EMPTY; 75 } 76 77 /* 78 * DAX radix tree locking 79 */ 80 struct exceptional_entry_key { 81 struct address_space *mapping; 82 pgoff_t entry_start; 83 }; 84 85 struct wait_exceptional_entry_queue { 86 wait_queue_entry_t wait; 87 struct exceptional_entry_key key; 88 }; 89 90 static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, 91 pgoff_t index, void *entry, struct exceptional_entry_key *key) 92 { 93 unsigned long hash; 94 95 /* 96 * If 'entry' is a PMD, align the 'index' that we use for the wait 97 * queue to the start of that PMD. This ensures that all offsets in 98 * the range covered by the PMD map to the same bit lock. 99 */ 100 if (dax_is_pmd_entry(entry)) 101 index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1); 102 103 key->mapping = mapping; 104 key->entry_start = index; 105 106 hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS); 107 return wait_table + hash; 108 } 109 110 static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, 111 int sync, void *keyp) 112 { 113 struct exceptional_entry_key *key = keyp; 114 struct wait_exceptional_entry_queue *ewait = 115 container_of(wait, struct wait_exceptional_entry_queue, wait); 116 117 if (key->mapping != ewait->key.mapping || 118 key->entry_start != ewait->key.entry_start) 119 return 0; 120 return autoremove_wake_function(wait, mode, sync, NULL); 121 } 122 123 /* 124 * Check whether the given slot is locked. The function must be called with 125 * mapping->tree_lock held 126 */ 127 static inline int slot_locked(struct address_space *mapping, void **slot) 128 { 129 unsigned long entry = (unsigned long) 130 radix_tree_deref_slot_protected(slot, &mapping->tree_lock); 131 return entry & RADIX_DAX_ENTRY_LOCK; 132 } 133 134 /* 135 * Mark the given slot is locked. The function must be called with 136 * mapping->tree_lock held 137 */ 138 static inline void *lock_slot(struct address_space *mapping, void **slot) 139 { 140 unsigned long entry = (unsigned long) 141 radix_tree_deref_slot_protected(slot, &mapping->tree_lock); 142 143 entry |= RADIX_DAX_ENTRY_LOCK; 144 radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); 145 return (void *)entry; 146 } 147 148 /* 149 * Mark the given slot is unlocked. The function must be called with 150 * mapping->tree_lock held 151 */ 152 static inline void *unlock_slot(struct address_space *mapping, void **slot) 153 { 154 unsigned long entry = (unsigned long) 155 radix_tree_deref_slot_protected(slot, &mapping->tree_lock); 156 157 entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; 158 radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); 159 return (void *)entry; 160 } 161 162 /* 163 * Lookup entry in radix tree, wait for it to become unlocked if it is 164 * exceptional entry and return it. The caller must call 165 * put_unlocked_mapping_entry() when he decided not to lock the entry or 166 * put_locked_mapping_entry() when he locked the entry and now wants to 167 * unlock it. 168 * 169 * The function must be called with mapping->tree_lock held. 170 */ 171 static void *get_unlocked_mapping_entry(struct address_space *mapping, 172 pgoff_t index, void ***slotp) 173 { 174 void *entry, **slot; 175 struct wait_exceptional_entry_queue ewait; 176 wait_queue_head_t *wq; 177 178 init_wait(&ewait.wait); 179 ewait.wait.func = wake_exceptional_entry_func; 180 181 for (;;) { 182 entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, 183 &slot); 184 if (!entry || !radix_tree_exceptional_entry(entry) || 185 !slot_locked(mapping, slot)) { 186 if (slotp) 187 *slotp = slot; 188 return entry; 189 } 190 191 wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); 192 prepare_to_wait_exclusive(wq, &ewait.wait, 193 TASK_UNINTERRUPTIBLE); 194 spin_unlock_irq(&mapping->tree_lock); 195 schedule(); 196 finish_wait(wq, &ewait.wait); 197 spin_lock_irq(&mapping->tree_lock); 198 } 199 } 200 201 static void dax_unlock_mapping_entry(struct address_space *mapping, 202 pgoff_t index) 203 { 204 void *entry, **slot; 205 206 spin_lock_irq(&mapping->tree_lock); 207 entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); 208 if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || 209 !slot_locked(mapping, slot))) { 210 spin_unlock_irq(&mapping->tree_lock); 211 return; 212 } 213 unlock_slot(mapping, slot); 214 spin_unlock_irq(&mapping->tree_lock); 215 dax_wake_mapping_entry_waiter(mapping, index, entry, false); 216 } 217 218 static void put_locked_mapping_entry(struct address_space *mapping, 219 pgoff_t index, void *entry) 220 { 221 if (!radix_tree_exceptional_entry(entry)) { 222 unlock_page(entry); 223 put_page(entry); 224 } else { 225 dax_unlock_mapping_entry(mapping, index); 226 } 227 } 228 229 /* 230 * Called when we are done with radix tree entry we looked up via 231 * get_unlocked_mapping_entry() and which we didn't lock in the end. 232 */ 233 static void put_unlocked_mapping_entry(struct address_space *mapping, 234 pgoff_t index, void *entry) 235 { 236 if (!radix_tree_exceptional_entry(entry)) 237 return; 238 239 /* We have to wake up next waiter for the radix tree entry lock */ 240 dax_wake_mapping_entry_waiter(mapping, index, entry, false); 241 } 242 243 /* 244 * Find radix tree entry at given index. If it points to a page, return with 245 * the page locked. If it points to the exceptional entry, return with the 246 * radix tree entry locked. If the radix tree doesn't contain given index, 247 * create empty exceptional entry for the index and return with it locked. 248 * 249 * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will 250 * either return that locked entry or will return an error. This error will 251 * happen if there are any 4k entries (either zero pages or DAX entries) 252 * within the 2MiB range that we are requesting. 253 * 254 * We always favor 4k entries over 2MiB entries. There isn't a flow where we 255 * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB 256 * insertion will fail if it finds any 4k entries already in the tree, and a 257 * 4k insertion will cause an existing 2MiB entry to be unmapped and 258 * downgraded to 4k entries. This happens for both 2MiB huge zero pages as 259 * well as 2MiB empty entries. 260 * 261 * The exception to this downgrade path is for 2MiB DAX PMD entries that have 262 * real storage backing them. We will leave these real 2MiB DAX entries in 263 * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry. 264 * 265 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For 266 * persistent memory the benefit is doubtful. We can add that later if we can 267 * show it helps. 268 */ 269 static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, 270 unsigned long size_flag) 271 { 272 bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */ 273 void *entry, **slot; 274 275 restart: 276 spin_lock_irq(&mapping->tree_lock); 277 entry = get_unlocked_mapping_entry(mapping, index, &slot); 278 279 if (entry) { 280 if (size_flag & RADIX_DAX_PMD) { 281 if (!radix_tree_exceptional_entry(entry) || 282 dax_is_pte_entry(entry)) { 283 put_unlocked_mapping_entry(mapping, index, 284 entry); 285 entry = ERR_PTR(-EEXIST); 286 goto out_unlock; 287 } 288 } else { /* trying to grab a PTE entry */ 289 if (radix_tree_exceptional_entry(entry) && 290 dax_is_pmd_entry(entry) && 291 (dax_is_zero_entry(entry) || 292 dax_is_empty_entry(entry))) { 293 pmd_downgrade = true; 294 } 295 } 296 } 297 298 /* No entry for given index? Make sure radix tree is big enough. */ 299 if (!entry || pmd_downgrade) { 300 int err; 301 302 if (pmd_downgrade) { 303 /* 304 * Make sure 'entry' remains valid while we drop 305 * mapping->tree_lock. 306 */ 307 entry = lock_slot(mapping, slot); 308 } 309 310 spin_unlock_irq(&mapping->tree_lock); 311 /* 312 * Besides huge zero pages the only other thing that gets 313 * downgraded are empty entries which don't need to be 314 * unmapped. 315 */ 316 if (pmd_downgrade && dax_is_zero_entry(entry)) 317 unmap_mapping_range(mapping, 318 (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); 319 320 err = radix_tree_preload( 321 mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); 322 if (err) { 323 if (pmd_downgrade) 324 put_locked_mapping_entry(mapping, index, entry); 325 return ERR_PTR(err); 326 } 327 spin_lock_irq(&mapping->tree_lock); 328 329 if (!entry) { 330 /* 331 * We needed to drop the page_tree lock while calling 332 * radix_tree_preload() and we didn't have an entry to 333 * lock. See if another thread inserted an entry at 334 * our index during this time. 335 */ 336 entry = __radix_tree_lookup(&mapping->page_tree, index, 337 NULL, &slot); 338 if (entry) { 339 radix_tree_preload_end(); 340 spin_unlock_irq(&mapping->tree_lock); 341 goto restart; 342 } 343 } 344 345 if (pmd_downgrade) { 346 radix_tree_delete(&mapping->page_tree, index); 347 mapping->nrexceptional--; 348 dax_wake_mapping_entry_waiter(mapping, index, entry, 349 true); 350 } 351 352 entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); 353 354 err = __radix_tree_insert(&mapping->page_tree, index, 355 dax_radix_order(entry), entry); 356 radix_tree_preload_end(); 357 if (err) { 358 spin_unlock_irq(&mapping->tree_lock); 359 /* 360 * Our insertion of a DAX entry failed, most likely 361 * because we were inserting a PMD entry and it 362 * collided with a PTE sized entry at a different 363 * index in the PMD range. We haven't inserted 364 * anything into the radix tree and have no waiters to 365 * wake. 366 */ 367 return ERR_PTR(err); 368 } 369 /* Good, we have inserted empty locked entry into the tree. */ 370 mapping->nrexceptional++; 371 spin_unlock_irq(&mapping->tree_lock); 372 return entry; 373 } 374 /* Normal page in radix tree? */ 375 if (!radix_tree_exceptional_entry(entry)) { 376 struct page *page = entry; 377 378 get_page(page); 379 spin_unlock_irq(&mapping->tree_lock); 380 lock_page(page); 381 /* Page got truncated? Retry... */ 382 if (unlikely(page->mapping != mapping)) { 383 unlock_page(page); 384 put_page(page); 385 goto restart; 386 } 387 return page; 388 } 389 entry = lock_slot(mapping, slot); 390 out_unlock: 391 spin_unlock_irq(&mapping->tree_lock); 392 return entry; 393 } 394 395 /* 396 * We do not necessarily hold the mapping->tree_lock when we call this 397 * function so it is possible that 'entry' is no longer a valid item in the 398 * radix tree. This is okay because all we really need to do is to find the 399 * correct waitqueue where tasks might be waiting for that old 'entry' and 400 * wake them. 401 */ 402 void dax_wake_mapping_entry_waiter(struct address_space *mapping, 403 pgoff_t index, void *entry, bool wake_all) 404 { 405 struct exceptional_entry_key key; 406 wait_queue_head_t *wq; 407 408 wq = dax_entry_waitqueue(mapping, index, entry, &key); 409 410 /* 411 * Checking for locked entry and prepare_to_wait_exclusive() happens 412 * under mapping->tree_lock, ditto for entry handling in our callers. 413 * So at this point all tasks that could have seen our entry locked 414 * must be in the waitqueue and the following check will see them. 415 */ 416 if (waitqueue_active(wq)) 417 __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); 418 } 419 420 static int __dax_invalidate_mapping_entry(struct address_space *mapping, 421 pgoff_t index, bool trunc) 422 { 423 int ret = 0; 424 void *entry; 425 struct radix_tree_root *page_tree = &mapping->page_tree; 426 427 spin_lock_irq(&mapping->tree_lock); 428 entry = get_unlocked_mapping_entry(mapping, index, NULL); 429 if (!entry || !radix_tree_exceptional_entry(entry)) 430 goto out; 431 if (!trunc && 432 (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || 433 radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))) 434 goto out; 435 radix_tree_delete(page_tree, index); 436 mapping->nrexceptional--; 437 ret = 1; 438 out: 439 put_unlocked_mapping_entry(mapping, index, entry); 440 spin_unlock_irq(&mapping->tree_lock); 441 return ret; 442 } 443 /* 444 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree 445 * entry to get unlocked before deleting it. 446 */ 447 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) 448 { 449 int ret = __dax_invalidate_mapping_entry(mapping, index, true); 450 451 /* 452 * This gets called from truncate / punch_hole path. As such, the caller 453 * must hold locks protecting against concurrent modifications of the 454 * radix tree (usually fs-private i_mmap_sem for writing). Since the 455 * caller has seen exceptional entry for this index, we better find it 456 * at that index as well... 457 */ 458 WARN_ON_ONCE(!ret); 459 return ret; 460 } 461 462 /* 463 * Invalidate exceptional DAX entry if it is clean. 464 */ 465 int dax_invalidate_mapping_entry_sync(struct address_space *mapping, 466 pgoff_t index) 467 { 468 return __dax_invalidate_mapping_entry(mapping, index, false); 469 } 470 471 /* 472 * The user has performed a load from a hole in the file. Allocating 473 * a new page in the file would cause excessive storage usage for 474 * workloads with sparse files. We allocate a page cache page instead. 475 * We'll kick it out of the page cache if it's ever written to, 476 * otherwise it will simply fall out of the page cache under memory 477 * pressure without ever having been dirtied. 478 */ 479 static int dax_load_hole(struct address_space *mapping, void **entry, 480 struct vm_fault *vmf) 481 { 482 struct inode *inode = mapping->host; 483 struct page *page; 484 int ret; 485 486 /* Hole page already exists? Return it... */ 487 if (!radix_tree_exceptional_entry(*entry)) { 488 page = *entry; 489 goto finish_fault; 490 } 491 492 /* This will replace locked radix tree entry with a hole page */ 493 page = find_or_create_page(mapping, vmf->pgoff, 494 vmf->gfp_mask | __GFP_ZERO); 495 if (!page) { 496 ret = VM_FAULT_OOM; 497 goto out; 498 } 499 500 finish_fault: 501 vmf->page = page; 502 ret = finish_fault(vmf); 503 vmf->page = NULL; 504 *entry = page; 505 if (!ret) { 506 /* Grab reference for PTE that is now referencing the page */ 507 get_page(page); 508 ret = VM_FAULT_NOPAGE; 509 } 510 out: 511 trace_dax_load_hole(inode, vmf, ret); 512 return ret; 513 } 514 515 static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, 516 sector_t sector, size_t size, struct page *to, 517 unsigned long vaddr) 518 { 519 void *vto, *kaddr; 520 pgoff_t pgoff; 521 pfn_t pfn; 522 long rc; 523 int id; 524 525 rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); 526 if (rc) 527 return rc; 528 529 id = dax_read_lock(); 530 rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); 531 if (rc < 0) { 532 dax_read_unlock(id); 533 return rc; 534 } 535 vto = kmap_atomic(to); 536 copy_user_page(vto, (void __force *)kaddr, vaddr, to); 537 kunmap_atomic(vto); 538 dax_read_unlock(id); 539 return 0; 540 } 541 542 /* 543 * By this point grab_mapping_entry() has ensured that we have a locked entry 544 * of the appropriate size so we don't have to worry about downgrading PMDs to 545 * PTEs. If we happen to be trying to insert a PTE and there is a PMD 546 * already in the tree, we will skip the insertion and just dirty the PMD as 547 * appropriate. 548 */ 549 static void *dax_insert_mapping_entry(struct address_space *mapping, 550 struct vm_fault *vmf, 551 void *entry, sector_t sector, 552 unsigned long flags) 553 { 554 struct radix_tree_root *page_tree = &mapping->page_tree; 555 int error = 0; 556 bool hole_fill = false; 557 void *new_entry; 558 pgoff_t index = vmf->pgoff; 559 560 if (vmf->flags & FAULT_FLAG_WRITE) 561 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 562 563 /* Replacing hole page with block mapping? */ 564 if (!radix_tree_exceptional_entry(entry)) { 565 hole_fill = true; 566 /* 567 * Unmap the page now before we remove it from page cache below. 568 * The page is locked so it cannot be faulted in again. 569 */ 570 unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, 571 PAGE_SIZE, 0); 572 error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); 573 if (error) 574 return ERR_PTR(error); 575 } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) { 576 /* replacing huge zero page with PMD block mapping */ 577 unmap_mapping_range(mapping, 578 (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); 579 } 580 581 spin_lock_irq(&mapping->tree_lock); 582 new_entry = dax_radix_locked_entry(sector, flags); 583 584 if (hole_fill) { 585 __delete_from_page_cache(entry, NULL); 586 /* Drop pagecache reference */ 587 put_page(entry); 588 error = __radix_tree_insert(page_tree, index, 589 dax_radix_order(new_entry), new_entry); 590 if (error) { 591 new_entry = ERR_PTR(error); 592 goto unlock; 593 } 594 mapping->nrexceptional++; 595 } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { 596 /* 597 * Only swap our new entry into the radix tree if the current 598 * entry is a zero page or an empty entry. If a normal PTE or 599 * PMD entry is already in the tree, we leave it alone. This 600 * means that if we are trying to insert a PTE and the 601 * existing entry is a PMD, we will just leave the PMD in the 602 * tree and dirty it if necessary. 603 */ 604 struct radix_tree_node *node; 605 void **slot; 606 void *ret; 607 608 ret = __radix_tree_lookup(page_tree, index, &node, &slot); 609 WARN_ON_ONCE(ret != entry); 610 __radix_tree_replace(page_tree, node, slot, 611 new_entry, NULL, NULL); 612 } 613 if (vmf->flags & FAULT_FLAG_WRITE) 614 radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); 615 unlock: 616 spin_unlock_irq(&mapping->tree_lock); 617 if (hole_fill) { 618 radix_tree_preload_end(); 619 /* 620 * We don't need hole page anymore, it has been replaced with 621 * locked radix tree entry now. 622 */ 623 if (mapping->a_ops->freepage) 624 mapping->a_ops->freepage(entry); 625 unlock_page(entry); 626 put_page(entry); 627 } 628 return new_entry; 629 } 630 631 static inline unsigned long 632 pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) 633 { 634 unsigned long address; 635 636 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 637 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); 638 return address; 639 } 640 641 /* Walk all mappings of a given index of a file and writeprotect them */ 642 static void dax_mapping_entry_mkclean(struct address_space *mapping, 643 pgoff_t index, unsigned long pfn) 644 { 645 struct vm_area_struct *vma; 646 pte_t pte, *ptep = NULL; 647 pmd_t *pmdp = NULL; 648 spinlock_t *ptl; 649 bool changed; 650 651 i_mmap_lock_read(mapping); 652 vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { 653 unsigned long address; 654 655 cond_resched(); 656 657 if (!(vma->vm_flags & VM_SHARED)) 658 continue; 659 660 address = pgoff_address(index, vma); 661 changed = false; 662 if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl)) 663 continue; 664 665 if (pmdp) { 666 #ifdef CONFIG_FS_DAX_PMD 667 pmd_t pmd; 668 669 if (pfn != pmd_pfn(*pmdp)) 670 goto unlock_pmd; 671 if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) 672 goto unlock_pmd; 673 674 flush_cache_page(vma, address, pfn); 675 pmd = pmdp_huge_clear_flush(vma, address, pmdp); 676 pmd = pmd_wrprotect(pmd); 677 pmd = pmd_mkclean(pmd); 678 set_pmd_at(vma->vm_mm, address, pmdp, pmd); 679 changed = true; 680 unlock_pmd: 681 spin_unlock(ptl); 682 #endif 683 } else { 684 if (pfn != pte_pfn(*ptep)) 685 goto unlock_pte; 686 if (!pte_dirty(*ptep) && !pte_write(*ptep)) 687 goto unlock_pte; 688 689 flush_cache_page(vma, address, pfn); 690 pte = ptep_clear_flush(vma, address, ptep); 691 pte = pte_wrprotect(pte); 692 pte = pte_mkclean(pte); 693 set_pte_at(vma->vm_mm, address, ptep, pte); 694 changed = true; 695 unlock_pte: 696 pte_unmap_unlock(ptep, ptl); 697 } 698 699 if (changed) 700 mmu_notifier_invalidate_page(vma->vm_mm, address); 701 } 702 i_mmap_unlock_read(mapping); 703 } 704 705 static int dax_writeback_one(struct block_device *bdev, 706 struct dax_device *dax_dev, struct address_space *mapping, 707 pgoff_t index, void *entry) 708 { 709 struct radix_tree_root *page_tree = &mapping->page_tree; 710 void *entry2, **slot, *kaddr; 711 long ret = 0, id; 712 sector_t sector; 713 pgoff_t pgoff; 714 size_t size; 715 pfn_t pfn; 716 717 /* 718 * A page got tagged dirty in DAX mapping? Something is seriously 719 * wrong. 720 */ 721 if (WARN_ON(!radix_tree_exceptional_entry(entry))) 722 return -EIO; 723 724 spin_lock_irq(&mapping->tree_lock); 725 entry2 = get_unlocked_mapping_entry(mapping, index, &slot); 726 /* Entry got punched out / reallocated? */ 727 if (!entry2 || !radix_tree_exceptional_entry(entry2)) 728 goto put_unlocked; 729 /* 730 * Entry got reallocated elsewhere? No need to writeback. We have to 731 * compare sectors as we must not bail out due to difference in lockbit 732 * or entry type. 733 */ 734 if (dax_radix_sector(entry2) != dax_radix_sector(entry)) 735 goto put_unlocked; 736 if (WARN_ON_ONCE(dax_is_empty_entry(entry) || 737 dax_is_zero_entry(entry))) { 738 ret = -EIO; 739 goto put_unlocked; 740 } 741 742 /* Another fsync thread may have already written back this entry */ 743 if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) 744 goto put_unlocked; 745 /* Lock the entry to serialize with page faults */ 746 entry = lock_slot(mapping, slot); 747 /* 748 * We can clear the tag now but we have to be careful so that concurrent 749 * dax_writeback_one() calls for the same index cannot finish before we 750 * actually flush the caches. This is achieved as the calls will look 751 * at the entry only under tree_lock and once they do that they will 752 * see the entry locked and wait for it to unlock. 753 */ 754 radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); 755 spin_unlock_irq(&mapping->tree_lock); 756 757 /* 758 * Even if dax_writeback_mapping_range() was given a wbc->range_start 759 * in the middle of a PMD, the 'index' we are given will be aligned to 760 * the start index of the PMD, as will the sector we pull from 761 * 'entry'. This allows us to flush for PMD_SIZE and not have to 762 * worry about partial PMD writebacks. 763 */ 764 sector = dax_radix_sector(entry); 765 size = PAGE_SIZE << dax_radix_order(entry); 766 767 id = dax_read_lock(); 768 ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); 769 if (ret) 770 goto dax_unlock; 771 772 /* 773 * dax_direct_access() may sleep, so cannot hold tree_lock over 774 * its invocation. 775 */ 776 ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn); 777 if (ret < 0) 778 goto dax_unlock; 779 780 if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) { 781 ret = -EIO; 782 goto dax_unlock; 783 } 784 785 dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn)); 786 dax_flush(dax_dev, pgoff, kaddr, size); 787 /* 788 * After we have flushed the cache, we can clear the dirty tag. There 789 * cannot be new dirty data in the pfn after the flush has completed as 790 * the pfn mappings are writeprotected and fault waits for mapping 791 * entry lock. 792 */ 793 spin_lock_irq(&mapping->tree_lock); 794 radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); 795 spin_unlock_irq(&mapping->tree_lock); 796 trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); 797 dax_unlock: 798 dax_read_unlock(id); 799 put_locked_mapping_entry(mapping, index, entry); 800 return ret; 801 802 put_unlocked: 803 put_unlocked_mapping_entry(mapping, index, entry2); 804 spin_unlock_irq(&mapping->tree_lock); 805 return ret; 806 } 807 808 /* 809 * Flush the mapping to the persistent domain within the byte range of [start, 810 * end]. This is required by data integrity operations to ensure file data is 811 * on persistent storage prior to completion of the operation. 812 */ 813 int dax_writeback_mapping_range(struct address_space *mapping, 814 struct block_device *bdev, struct writeback_control *wbc) 815 { 816 struct inode *inode = mapping->host; 817 pgoff_t start_index, end_index; 818 pgoff_t indices[PAGEVEC_SIZE]; 819 struct dax_device *dax_dev; 820 struct pagevec pvec; 821 bool done = false; 822 int i, ret = 0; 823 824 if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) 825 return -EIO; 826 827 if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) 828 return 0; 829 830 dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); 831 if (!dax_dev) 832 return -EIO; 833 834 start_index = wbc->range_start >> PAGE_SHIFT; 835 end_index = wbc->range_end >> PAGE_SHIFT; 836 837 trace_dax_writeback_range(inode, start_index, end_index); 838 839 tag_pages_for_writeback(mapping, start_index, end_index); 840 841 pagevec_init(&pvec, 0); 842 while (!done) { 843 pvec.nr = find_get_entries_tag(mapping, start_index, 844 PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, 845 pvec.pages, indices); 846 847 if (pvec.nr == 0) 848 break; 849 850 for (i = 0; i < pvec.nr; i++) { 851 if (indices[i] > end_index) { 852 done = true; 853 break; 854 } 855 856 ret = dax_writeback_one(bdev, dax_dev, mapping, 857 indices[i], pvec.pages[i]); 858 if (ret < 0) { 859 mapping_set_error(mapping, ret); 860 goto out; 861 } 862 } 863 start_index = indices[pvec.nr - 1] + 1; 864 } 865 out: 866 put_dax(dax_dev); 867 trace_dax_writeback_range_done(inode, start_index, end_index); 868 return (ret < 0 ? ret : 0); 869 } 870 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); 871 872 static int dax_insert_mapping(struct address_space *mapping, 873 struct block_device *bdev, struct dax_device *dax_dev, 874 sector_t sector, size_t size, void **entryp, 875 struct vm_area_struct *vma, struct vm_fault *vmf) 876 { 877 unsigned long vaddr = vmf->address; 878 void *entry = *entryp; 879 void *ret, *kaddr; 880 pgoff_t pgoff; 881 int id, rc; 882 pfn_t pfn; 883 884 rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); 885 if (rc) 886 return rc; 887 888 id = dax_read_lock(); 889 rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); 890 if (rc < 0) { 891 dax_read_unlock(id); 892 return rc; 893 } 894 dax_read_unlock(id); 895 896 ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); 897 if (IS_ERR(ret)) 898 return PTR_ERR(ret); 899 *entryp = ret; 900 901 trace_dax_insert_mapping(mapping->host, vmf, ret); 902 return vm_insert_mixed(vma, vaddr, pfn); 903 } 904 905 /** 906 * dax_pfn_mkwrite - handle first write to DAX page 907 * @vmf: The description of the fault 908 */ 909 int dax_pfn_mkwrite(struct vm_fault *vmf) 910 { 911 struct file *file = vmf->vma->vm_file; 912 struct address_space *mapping = file->f_mapping; 913 struct inode *inode = mapping->host; 914 void *entry, **slot; 915 pgoff_t index = vmf->pgoff; 916 917 spin_lock_irq(&mapping->tree_lock); 918 entry = get_unlocked_mapping_entry(mapping, index, &slot); 919 if (!entry || !radix_tree_exceptional_entry(entry)) { 920 if (entry) 921 put_unlocked_mapping_entry(mapping, index, entry); 922 spin_unlock_irq(&mapping->tree_lock); 923 trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); 924 return VM_FAULT_NOPAGE; 925 } 926 radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); 927 entry = lock_slot(mapping, slot); 928 spin_unlock_irq(&mapping->tree_lock); 929 /* 930 * If we race with somebody updating the PTE and finish_mkwrite_fault() 931 * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry 932 * the fault in either case. 933 */ 934 finish_mkwrite_fault(vmf); 935 put_locked_mapping_entry(mapping, index, entry); 936 trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); 937 return VM_FAULT_NOPAGE; 938 } 939 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); 940 941 static bool dax_range_is_aligned(struct block_device *bdev, 942 unsigned int offset, unsigned int length) 943 { 944 unsigned short sector_size = bdev_logical_block_size(bdev); 945 946 if (!IS_ALIGNED(offset, sector_size)) 947 return false; 948 if (!IS_ALIGNED(length, sector_size)) 949 return false; 950 951 return true; 952 } 953 954 int __dax_zero_page_range(struct block_device *bdev, 955 struct dax_device *dax_dev, sector_t sector, 956 unsigned int offset, unsigned int size) 957 { 958 if (dax_range_is_aligned(bdev, offset, size)) { 959 sector_t start_sector = sector + (offset >> 9); 960 961 return blkdev_issue_zeroout(bdev, start_sector, 962 size >> 9, GFP_NOFS, 0); 963 } else { 964 pgoff_t pgoff; 965 long rc, id; 966 void *kaddr; 967 pfn_t pfn; 968 969 rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff); 970 if (rc) 971 return rc; 972 973 id = dax_read_lock(); 974 rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, 975 &pfn); 976 if (rc < 0) { 977 dax_read_unlock(id); 978 return rc; 979 } 980 memset(kaddr + offset, 0, size); 981 dax_flush(dax_dev, pgoff, kaddr + offset, size); 982 dax_read_unlock(id); 983 } 984 return 0; 985 } 986 EXPORT_SYMBOL_GPL(__dax_zero_page_range); 987 988 static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) 989 { 990 return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); 991 } 992 993 static loff_t 994 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 995 struct iomap *iomap) 996 { 997 struct block_device *bdev = iomap->bdev; 998 struct dax_device *dax_dev = iomap->dax_dev; 999 struct iov_iter *iter = data; 1000 loff_t end = pos + length, done = 0; 1001 ssize_t ret = 0; 1002 int id; 1003 1004 if (iov_iter_rw(iter) == READ) { 1005 end = min(end, i_size_read(inode)); 1006 if (pos >= end) 1007 return 0; 1008 1009 if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) 1010 return iov_iter_zero(min(length, end - pos), iter); 1011 } 1012 1013 if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED)) 1014 return -EIO; 1015 1016 /* 1017 * Write can allocate block for an area which has a hole page mapped 1018 * into page tables. We have to tear down these mappings so that data 1019 * written by write(2) is visible in mmap. 1020 */ 1021 if (iomap->flags & IOMAP_F_NEW) { 1022 invalidate_inode_pages2_range(inode->i_mapping, 1023 pos >> PAGE_SHIFT, 1024 (end - 1) >> PAGE_SHIFT); 1025 } 1026 1027 id = dax_read_lock(); 1028 while (pos < end) { 1029 unsigned offset = pos & (PAGE_SIZE - 1); 1030 const size_t size = ALIGN(length + offset, PAGE_SIZE); 1031 const sector_t sector = dax_iomap_sector(iomap, pos); 1032 ssize_t map_len; 1033 pgoff_t pgoff; 1034 void *kaddr; 1035 pfn_t pfn; 1036 1037 if (fatal_signal_pending(current)) { 1038 ret = -EINTR; 1039 break; 1040 } 1041 1042 ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); 1043 if (ret) 1044 break; 1045 1046 map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), 1047 &kaddr, &pfn); 1048 if (map_len < 0) { 1049 ret = map_len; 1050 break; 1051 } 1052 1053 map_len = PFN_PHYS(map_len); 1054 kaddr += offset; 1055 map_len -= offset; 1056 if (map_len > end - pos) 1057 map_len = end - pos; 1058 1059 if (iov_iter_rw(iter) == WRITE) 1060 map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, 1061 map_len, iter); 1062 else 1063 map_len = copy_to_iter(kaddr, map_len, iter); 1064 if (map_len <= 0) { 1065 ret = map_len ? map_len : -EFAULT; 1066 break; 1067 } 1068 1069 pos += map_len; 1070 length -= map_len; 1071 done += map_len; 1072 } 1073 dax_read_unlock(id); 1074 1075 return done ? done : ret; 1076 } 1077 1078 /** 1079 * dax_iomap_rw - Perform I/O to a DAX file 1080 * @iocb: The control block for this I/O 1081 * @iter: The addresses to do I/O from or to 1082 * @ops: iomap ops passed from the file system 1083 * 1084 * This function performs read and write operations to directly mapped 1085 * persistent memory. The callers needs to take care of read/write exclusion 1086 * and evicting any page cache pages in the region under I/O. 1087 */ 1088 ssize_t 1089 dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, 1090 const struct iomap_ops *ops) 1091 { 1092 struct address_space *mapping = iocb->ki_filp->f_mapping; 1093 struct inode *inode = mapping->host; 1094 loff_t pos = iocb->ki_pos, ret = 0, done = 0; 1095 unsigned flags = 0; 1096 1097 if (iov_iter_rw(iter) == WRITE) { 1098 lockdep_assert_held_exclusive(&inode->i_rwsem); 1099 flags |= IOMAP_WRITE; 1100 } else { 1101 lockdep_assert_held(&inode->i_rwsem); 1102 } 1103 1104 while (iov_iter_count(iter)) { 1105 ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, 1106 iter, dax_iomap_actor); 1107 if (ret <= 0) 1108 break; 1109 pos += ret; 1110 done += ret; 1111 } 1112 1113 iocb->ki_pos += done; 1114 return done ? done : ret; 1115 } 1116 EXPORT_SYMBOL_GPL(dax_iomap_rw); 1117 1118 static int dax_fault_return(int error) 1119 { 1120 if (error == 0) 1121 return VM_FAULT_NOPAGE; 1122 if (error == -ENOMEM) 1123 return VM_FAULT_OOM; 1124 return VM_FAULT_SIGBUS; 1125 } 1126 1127 static int dax_iomap_pte_fault(struct vm_fault *vmf, 1128 const struct iomap_ops *ops) 1129 { 1130 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1131 struct inode *inode = mapping->host; 1132 unsigned long vaddr = vmf->address; 1133 loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; 1134 sector_t sector; 1135 struct iomap iomap = { 0 }; 1136 unsigned flags = IOMAP_FAULT; 1137 int error, major = 0; 1138 int vmf_ret = 0; 1139 void *entry; 1140 1141 trace_dax_pte_fault(inode, vmf, vmf_ret); 1142 /* 1143 * Check whether offset isn't beyond end of file now. Caller is supposed 1144 * to hold locks serializing us with truncate / punch hole so this is 1145 * a reliable test. 1146 */ 1147 if (pos >= i_size_read(inode)) { 1148 vmf_ret = VM_FAULT_SIGBUS; 1149 goto out; 1150 } 1151 1152 if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) 1153 flags |= IOMAP_WRITE; 1154 1155 entry = grab_mapping_entry(mapping, vmf->pgoff, 0); 1156 if (IS_ERR(entry)) { 1157 vmf_ret = dax_fault_return(PTR_ERR(entry)); 1158 goto out; 1159 } 1160 1161 /* 1162 * It is possible, particularly with mixed reads & writes to private 1163 * mappings, that we have raced with a PMD fault that overlaps with 1164 * the PTE we need to set up. If so just return and the fault will be 1165 * retried. 1166 */ 1167 if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) { 1168 vmf_ret = VM_FAULT_NOPAGE; 1169 goto unlock_entry; 1170 } 1171 1172 /* 1173 * Note that we don't bother to use iomap_apply here: DAX required 1174 * the file system block size to be equal the page size, which means 1175 * that we never have to deal with more than a single extent here. 1176 */ 1177 error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); 1178 if (error) { 1179 vmf_ret = dax_fault_return(error); 1180 goto unlock_entry; 1181 } 1182 if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { 1183 error = -EIO; /* fs corruption? */ 1184 goto error_finish_iomap; 1185 } 1186 1187 sector = dax_iomap_sector(&iomap, pos); 1188 1189 if (vmf->cow_page) { 1190 switch (iomap.type) { 1191 case IOMAP_HOLE: 1192 case IOMAP_UNWRITTEN: 1193 clear_user_highpage(vmf->cow_page, vaddr); 1194 break; 1195 case IOMAP_MAPPED: 1196 error = copy_user_dax(iomap.bdev, iomap.dax_dev, 1197 sector, PAGE_SIZE, vmf->cow_page, vaddr); 1198 break; 1199 default: 1200 WARN_ON_ONCE(1); 1201 error = -EIO; 1202 break; 1203 } 1204 1205 if (error) 1206 goto error_finish_iomap; 1207 1208 __SetPageUptodate(vmf->cow_page); 1209 vmf_ret = finish_fault(vmf); 1210 if (!vmf_ret) 1211 vmf_ret = VM_FAULT_DONE_COW; 1212 goto finish_iomap; 1213 } 1214 1215 switch (iomap.type) { 1216 case IOMAP_MAPPED: 1217 if (iomap.flags & IOMAP_F_NEW) { 1218 count_vm_event(PGMAJFAULT); 1219 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); 1220 major = VM_FAULT_MAJOR; 1221 } 1222 error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, 1223 sector, PAGE_SIZE, &entry, vmf->vma, vmf); 1224 /* -EBUSY is fine, somebody else faulted on the same PTE */ 1225 if (error == -EBUSY) 1226 error = 0; 1227 break; 1228 case IOMAP_UNWRITTEN: 1229 case IOMAP_HOLE: 1230 if (!(vmf->flags & FAULT_FLAG_WRITE)) { 1231 vmf_ret = dax_load_hole(mapping, &entry, vmf); 1232 goto finish_iomap; 1233 } 1234 /*FALLTHRU*/ 1235 default: 1236 WARN_ON_ONCE(1); 1237 error = -EIO; 1238 break; 1239 } 1240 1241 error_finish_iomap: 1242 vmf_ret = dax_fault_return(error) | major; 1243 finish_iomap: 1244 if (ops->iomap_end) { 1245 int copied = PAGE_SIZE; 1246 1247 if (vmf_ret & VM_FAULT_ERROR) 1248 copied = 0; 1249 /* 1250 * The fault is done by now and there's no way back (other 1251 * thread may be already happily using PTE we have installed). 1252 * Just ignore error from ->iomap_end since we cannot do much 1253 * with it. 1254 */ 1255 ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); 1256 } 1257 unlock_entry: 1258 put_locked_mapping_entry(mapping, vmf->pgoff, entry); 1259 out: 1260 trace_dax_pte_fault_done(inode, vmf, vmf_ret); 1261 return vmf_ret; 1262 } 1263 1264 #ifdef CONFIG_FS_DAX_PMD 1265 /* 1266 * The 'colour' (ie low bits) within a PMD of a page offset. This comes up 1267 * more often than one might expect in the below functions. 1268 */ 1269 #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) 1270 1271 static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, 1272 loff_t pos, void **entryp) 1273 { 1274 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1275 const sector_t sector = dax_iomap_sector(iomap, pos); 1276 struct dax_device *dax_dev = iomap->dax_dev; 1277 struct block_device *bdev = iomap->bdev; 1278 struct inode *inode = mapping->host; 1279 const size_t size = PMD_SIZE; 1280 void *ret = NULL, *kaddr; 1281 long length = 0; 1282 pgoff_t pgoff; 1283 pfn_t pfn; 1284 int id; 1285 1286 if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) 1287 goto fallback; 1288 1289 id = dax_read_lock(); 1290 length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); 1291 if (length < 0) 1292 goto unlock_fallback; 1293 length = PFN_PHYS(length); 1294 1295 if (length < size) 1296 goto unlock_fallback; 1297 if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR) 1298 goto unlock_fallback; 1299 if (!pfn_t_devmap(pfn)) 1300 goto unlock_fallback; 1301 dax_read_unlock(id); 1302 1303 ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector, 1304 RADIX_DAX_PMD); 1305 if (IS_ERR(ret)) 1306 goto fallback; 1307 *entryp = ret; 1308 1309 trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); 1310 return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, 1311 pfn, vmf->flags & FAULT_FLAG_WRITE); 1312 1313 unlock_fallback: 1314 dax_read_unlock(id); 1315 fallback: 1316 trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret); 1317 return VM_FAULT_FALLBACK; 1318 } 1319 1320 static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, 1321 void **entryp) 1322 { 1323 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1324 unsigned long pmd_addr = vmf->address & PMD_MASK; 1325 struct inode *inode = mapping->host; 1326 struct page *zero_page; 1327 void *ret = NULL; 1328 spinlock_t *ptl; 1329 pmd_t pmd_entry; 1330 1331 zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); 1332 1333 if (unlikely(!zero_page)) 1334 goto fallback; 1335 1336 ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, 1337 RADIX_DAX_PMD | RADIX_DAX_HZP); 1338 if (IS_ERR(ret)) 1339 goto fallback; 1340 *entryp = ret; 1341 1342 ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); 1343 if (!pmd_none(*(vmf->pmd))) { 1344 spin_unlock(ptl); 1345 goto fallback; 1346 } 1347 1348 pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); 1349 pmd_entry = pmd_mkhuge(pmd_entry); 1350 set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); 1351 spin_unlock(ptl); 1352 trace_dax_pmd_load_hole(inode, vmf, zero_page, ret); 1353 return VM_FAULT_NOPAGE; 1354 1355 fallback: 1356 trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret); 1357 return VM_FAULT_FALLBACK; 1358 } 1359 1360 static int dax_iomap_pmd_fault(struct vm_fault *vmf, 1361 const struct iomap_ops *ops) 1362 { 1363 struct vm_area_struct *vma = vmf->vma; 1364 struct address_space *mapping = vma->vm_file->f_mapping; 1365 unsigned long pmd_addr = vmf->address & PMD_MASK; 1366 bool write = vmf->flags & FAULT_FLAG_WRITE; 1367 unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; 1368 struct inode *inode = mapping->host; 1369 int result = VM_FAULT_FALLBACK; 1370 struct iomap iomap = { 0 }; 1371 pgoff_t max_pgoff, pgoff; 1372 void *entry; 1373 loff_t pos; 1374 int error; 1375 1376 /* 1377 * Check whether offset isn't beyond end of file now. Caller is 1378 * supposed to hold locks serializing us with truncate / punch hole so 1379 * this is a reliable test. 1380 */ 1381 pgoff = linear_page_index(vma, pmd_addr); 1382 max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; 1383 1384 trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); 1385 1386 /* 1387 * Make sure that the faulting address's PMD offset (color) matches 1388 * the PMD offset from the start of the file. This is necessary so 1389 * that a PMD range in the page table overlaps exactly with a PMD 1390 * range in the radix tree. 1391 */ 1392 if ((vmf->pgoff & PG_PMD_COLOUR) != 1393 ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) 1394 goto fallback; 1395 1396 /* Fall back to PTEs if we're going to COW */ 1397 if (write && !(vma->vm_flags & VM_SHARED)) 1398 goto fallback; 1399 1400 /* If the PMD would extend outside the VMA */ 1401 if (pmd_addr < vma->vm_start) 1402 goto fallback; 1403 if ((pmd_addr + PMD_SIZE) > vma->vm_end) 1404 goto fallback; 1405 1406 if (pgoff > max_pgoff) { 1407 result = VM_FAULT_SIGBUS; 1408 goto out; 1409 } 1410 1411 /* If the PMD would extend beyond the file size */ 1412 if ((pgoff | PG_PMD_COLOUR) > max_pgoff) 1413 goto fallback; 1414 1415 /* 1416 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX 1417 * PMD or a HZP entry. If it can't (because a 4k page is already in 1418 * the tree, for instance), it will return -EEXIST and we just fall 1419 * back to 4k entries. 1420 */ 1421 entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); 1422 if (IS_ERR(entry)) 1423 goto fallback; 1424 1425 /* 1426 * It is possible, particularly with mixed reads & writes to private 1427 * mappings, that we have raced with a PTE fault that overlaps with 1428 * the PMD we need to set up. If so just return and the fault will be 1429 * retried. 1430 */ 1431 if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) && 1432 !pmd_devmap(*vmf->pmd)) { 1433 result = 0; 1434 goto unlock_entry; 1435 } 1436 1437 /* 1438 * Note that we don't use iomap_apply here. We aren't doing I/O, only 1439 * setting up a mapping, so really we're using iomap_begin() as a way 1440 * to look up our filesystem block. 1441 */ 1442 pos = (loff_t)pgoff << PAGE_SHIFT; 1443 error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); 1444 if (error) 1445 goto unlock_entry; 1446 1447 if (iomap.offset + iomap.length < pos + PMD_SIZE) 1448 goto finish_iomap; 1449 1450 switch (iomap.type) { 1451 case IOMAP_MAPPED: 1452 result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); 1453 break; 1454 case IOMAP_UNWRITTEN: 1455 case IOMAP_HOLE: 1456 if (WARN_ON_ONCE(write)) 1457 break; 1458 result = dax_pmd_load_hole(vmf, &iomap, &entry); 1459 break; 1460 default: 1461 WARN_ON_ONCE(1); 1462 break; 1463 } 1464 1465 finish_iomap: 1466 if (ops->iomap_end) { 1467 int copied = PMD_SIZE; 1468 1469 if (result == VM_FAULT_FALLBACK) 1470 copied = 0; 1471 /* 1472 * The fault is done by now and there's no way back (other 1473 * thread may be already happily using PMD we have installed). 1474 * Just ignore error from ->iomap_end since we cannot do much 1475 * with it. 1476 */ 1477 ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags, 1478 &iomap); 1479 } 1480 unlock_entry: 1481 put_locked_mapping_entry(mapping, pgoff, entry); 1482 fallback: 1483 if (result == VM_FAULT_FALLBACK) { 1484 split_huge_pmd(vma, vmf->pmd, vmf->address); 1485 count_vm_event(THP_FAULT_FALLBACK); 1486 } 1487 out: 1488 trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result); 1489 return result; 1490 } 1491 #else 1492 static int dax_iomap_pmd_fault(struct vm_fault *vmf, 1493 const struct iomap_ops *ops) 1494 { 1495 return VM_FAULT_FALLBACK; 1496 } 1497 #endif /* CONFIG_FS_DAX_PMD */ 1498 1499 /** 1500 * dax_iomap_fault - handle a page fault on a DAX file 1501 * @vmf: The description of the fault 1502 * @ops: iomap ops passed from the file system 1503 * 1504 * When a page fault occurs, filesystems may call this helper in 1505 * their fault handler for DAX files. dax_iomap_fault() assumes the caller 1506 * has done all the necessary locking for page fault to proceed 1507 * successfully. 1508 */ 1509 int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, 1510 const struct iomap_ops *ops) 1511 { 1512 switch (pe_size) { 1513 case PE_SIZE_PTE: 1514 return dax_iomap_pte_fault(vmf, ops); 1515 case PE_SIZE_PMD: 1516 return dax_iomap_pmd_fault(vmf, ops); 1517 default: 1518 return VM_FAULT_FALLBACK; 1519 } 1520 } 1521 EXPORT_SYMBOL_GPL(dax_iomap_fault); 1522