/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include "internal.h"

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);
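
/*
 * dax_map_atomic() pins the block device's request queue and asks the
 * driver, via bdev_direct_access(), to translate dax->sector/dax->size
 * into a kernel virtual address and pfn.  On success the return value is
 * the number of contiguous bytes available at dax->addr; every successful
 * call must be paired with dax_unmap_atomic() to drop the queue reference.
 */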
static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	struct request_queue *q = bdev->bd_queue;
	long rc = -EIO;

	dax->addr = ERR_PTR(-EIO);
	if (blk_queue_enter(q, true) != 0)
		return rc;

	rc = bdev_direct_access(bdev, dax);
	if (rc < 0) {
		dax->addr = ERR_PTR(rc);
		blk_queue_exit(q);
		return rc;
	}
	return rc;
}

static void dax_unmap_atomic(struct block_device *bdev,
		const struct blk_dax_ctl *dax)
{
	if (IS_ERR(dax->addr))
		return;
	blk_queue_exit(bdev->bd_queue);
}

static int dax_is_pmd_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_PMD;
}

static int dax_is_pte_entry(void *entry)
{
	return !((unsigned long)entry & RADIX_DAX_PMD);
}

static int dax_is_zero_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_HZP;
}

static int dax_is_empty_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_EMPTY;
}
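
/*
 * Read a page-aligned, PAGE_SIZE chunk of the DAX device containing
 * sector @n into a newly allocated page.  The caller is responsible for
 * releasing the returned page.
 */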
struct page *read_dax_sector(struct block_device *bdev, sector_t n)
{
	struct page *page = alloc_pages(GFP_KERNEL, 0);
	struct blk_dax_ctl dax = {
		.size = PAGE_SIZE,
		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
	};
	long rc;

	if (!page)
		return ERR_PTR(-ENOMEM);

	rc = dax_map_atomic(bdev, &dax);
	if (rc < 0)
		return ERR_PTR(rc);
	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
	dax_unmap_atomic(bdev, &dax);
	return page;
}

/*
 * DAX radix tree locking
 */
struct exceptional_entry_key {
	struct address_space *mapping;
	pgoff_t entry_start;
};

struct wait_exceptional_entry_queue {
	wait_queue_t wait;
	struct exceptional_entry_key key;
};
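
/*
 * Pick the wait queue used for a given radix tree entry and fill in the
 * wake-up key that identifies it.  All waiters for one entry hash to the
 * same queue, so wake_exceptional_entry_func() can filter on the key.
 */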
static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
		pgoff_t index, void *entry, struct exceptional_entry_key *key)
{
	unsigned long hash;

	/*
	 * If 'entry' is a PMD, align the 'index' that we use for the wait
	 * queue to the start of that PMD.  This ensures that all offsets in
	 * the range covered by the PMD map to the same bit lock.
	 */
	if (dax_is_pmd_entry(entry))
		index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);

	key->mapping = mapping;
	key->entry_start = index;

	hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}

static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
				       int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->mapping != ewait->key.mapping ||
	    key->entry_start != ewait->key.entry_start)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * Check whether the given slot is locked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline int slot_locked(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
	return entry & RADIX_DAX_ENTRY_LOCK;
}

/*
 * Mark the given slot as locked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry |= RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
	return (void *)entry;
}

/*
 * Mark the given slot as unlocked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
	return (void *)entry;
}

/*
 * Look up an entry in the radix tree, wait for it to become unlocked if it
 * is an exceptional entry, and return it.  The caller must call
 * put_unlocked_mapping_entry() if it decides not to lock the entry, or
 * put_locked_mapping_entry() once it has locked the entry and later wants
 * to unlock it.
 *
 * The function must be called with mapping->tree_lock held.
 */
static void *get_unlocked_mapping_entry(struct address_space *mapping,
					pgoff_t index, void ***slotp)
{
	void *entry, **slot;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	for (;;) {
		entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
					    &slot);
		if (!entry || !radix_tree_exceptional_entry(entry) ||
		    !slot_locked(mapping, slot)) {
			if (slotp)
				*slotp = slot;
			return entry;
		}

		wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&mapping->tree_lock);
		schedule();
		finish_wait(wq, &ewait.wait);
		spin_lock_irq(&mapping->tree_lock);
	}
}
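
/*
 * Unlock the radix tree entry at @index and wake up one waiter that may be
 * blocked on it in get_unlocked_mapping_entry().
 */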
static void dax_unlock_mapping_entry(struct address_space *mapping,
				     pgoff_t index)
{
	void *entry, **slot;

	spin_lock_irq(&mapping->tree_lock);
	entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
			 !slot_locked(mapping, slot))) {
		spin_unlock_irq(&mapping->tree_lock);
		return;
	}
	unlock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}

static void put_locked_mapping_entry(struct address_space *mapping,
				     pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry)) {
		unlock_page(entry);
		put_page(entry);
	} else {
		dax_unlock_mapping_entry(mapping, index);
	}
}

/*
 * Called when we are done with radix tree entry we looked up via
 * get_unlocked_mapping_entry() and which we didn't lock in the end.
 */
static void put_unlocked_mapping_entry(struct address_space *mapping,
				       pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry))
		return;

	/* We have to wake up next waiter for the radix tree entry lock */
	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}

/*
 * Find radix tree entry at given index. If it points to a page, return with
 * the page locked. If it points to the exceptional entry, return with the
 * radix tree entry locked. If the radix tree doesn't contain given index,
 * create empty exceptional entry for the index and return with it locked.
 *
 * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return an error.  This error will
 * happen if there are any 4k entries (either zero pages or DAX entries)
 * within the 2MiB range that we are requesting.
 *
 * We always favor 4k entries over 2MiB entries. There isn't a flow where we
 * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
 * insertion will fail if it finds any 4k entries already in the tree, and a
 * 4k insertion will cause an existing 2MiB entry to be unmapped and
 * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
 * well as 2MiB empty entries.
 *
 * The exception to this downgrade path is for 2MiB DAX PMD entries that have
 * real storage backing them.  We will leave these real 2MiB DAX entries in
 * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 */
static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
		unsigned long size_flag)
{
	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
	void *entry, **slot;

restart:
	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, &slot);

	if (entry) {
		if (size_flag & RADIX_DAX_PMD) {
			if (!radix_tree_exceptional_entry(entry) ||
			    dax_is_pte_entry(entry)) {
				put_unlocked_mapping_entry(mapping, index,
						entry);
				entry = ERR_PTR(-EEXIST);
				goto out_unlock;
			}
		} else { /* trying to grab a PTE entry */
			if (radix_tree_exceptional_entry(entry) &&
			    dax_is_pmd_entry(entry) &&
			    (dax_is_zero_entry(entry) ||
			     dax_is_empty_entry(entry))) {
				pmd_downgrade = true;
			}
		}
	}

	/* No entry for given index? Make sure radix tree is big enough. */
	if (!entry || pmd_downgrade) {
		int err;

		if (pmd_downgrade) {
			/*
			 * Make sure 'entry' remains valid while we drop
			 * mapping->tree_lock.
			 */
			entry = lock_slot(mapping, slot);
		}

		spin_unlock_irq(&mapping->tree_lock);
		/*
		 * Besides huge zero pages the only other thing that gets
		 * downgraded are empty entries which don't need to be
		 * unmapped.
		 */
		if (pmd_downgrade && dax_is_zero_entry(entry))
			unmap_mapping_range(mapping,
				(index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);

		err = radix_tree_preload(
				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
		if (err) {
			if (pmd_downgrade)
				put_locked_mapping_entry(mapping, index, entry);
			return ERR_PTR(err);
		}
		spin_lock_irq(&mapping->tree_lock);

		if (pmd_downgrade) {
			radix_tree_delete(&mapping->page_tree, index);
			mapping->nrexceptional--;
			dax_wake_mapping_entry_waiter(mapping, index, entry,
					true);
		}

		entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);

		err = __radix_tree_insert(&mapping->page_tree, index,
				dax_radix_order(entry), entry);
		radix_tree_preload_end();
		if (err) {
			spin_unlock_irq(&mapping->tree_lock);
			/*
			 * Someone already created the entry?  This is a
			 * normal failure when inserting PMDs in a range
			 * that already contains PTEs.  In that case we want
			 * to return -EEXIST immediately.
			 */
			if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD))
				goto restart;
			/*
			 * Our insertion of a DAX PMD entry failed, most
			 * likely because it collided with a PTE sized entry
			 * at a different index in the PMD range.  We haven't
			 * inserted anything into the radix tree and have no
			 * waiters to wake.
			 */
			return ERR_PTR(err);
		}
		/* Good, we have inserted empty locked entry into the tree. */
		mapping->nrexceptional++;
		spin_unlock_irq(&mapping->tree_lock);
		return entry;
	}
	/* Normal page in radix tree? */
	if (!radix_tree_exceptional_entry(entry)) {
		struct page *page = entry;

		get_page(page);
		spin_unlock_irq(&mapping->tree_lock);
		lock_page(page);
		/* Page got truncated? Retry... */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto restart;
		}
		return page;
	}
	entry = lock_slot(mapping, slot);
out_unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return entry;
}

/*
 * We do not necessarily hold the mapping->tree_lock when we call this
 * function so it is possible that 'entry' is no longer a valid item in the
 * radix tree.  This is okay because all we really need to do is to find the
 * correct waitqueue where tasks might be waiting for that old 'entry' and
 * wake them.
 */
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
		pgoff_t index, void *entry, bool wake_all)
{
	struct exceptional_entry_key key;
	wait_queue_head_t *wq;

	wq = dax_entry_waitqueue(mapping, index, entry, &key);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under mapping->tree_lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq))
		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
}
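
/*
 * Remove the exceptional DAX entry at @index if it is safe to do so: when
 * @trunc is false, dirty or towrite entries are left alone so that a
 * pending writeback is not lost.  Returns 1 if an entry was deleted.
 */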
static int __dax_invalidate_mapping_entry(struct address_space *mapping,
					  pgoff_t index, bool trunc)
{
	int ret = 0;
	void *entry;
	struct radix_tree_root *page_tree = &mapping->page_tree;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	if (!entry || !radix_tree_exceptional_entry(entry))
		goto out;
	if (!trunc &&
	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
	     radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
		goto out;
	radix_tree_delete(page_tree, index);
	mapping->nrexceptional--;
	ret = 1;
out:
	put_unlocked_mapping_entry(mapping, index, entry);
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
 * entry to get unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = __dax_invalidate_mapping_entry(mapping, index, true);

	/*
	 * This gets called from truncate / punch_hole path. As such, the caller
	 * must hold locks protecting against concurrent modifications of the
	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
	 * caller has seen exceptional entry for this index, we better find it
	 * at that index as well...
	 */
	WARN_ON_ONCE(!ret);
	return ret;
}

/*
 * Invalidate exceptional DAX entry if easily possible. This handles DAX
 * entries for invalidate_inode_pages() so we evict the entry only if we can
 * do so without blocking.
 */
int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = 0;
	void *entry, **slot;
	struct radix_tree_root *page_tree = &mapping->page_tree;

	spin_lock_irq(&mapping->tree_lock);
	entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
	if (!entry || !radix_tree_exceptional_entry(entry) ||
	    slot_locked(mapping, slot))
		goto out;
	if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
	    radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto out;
	radix_tree_delete(page_tree, index);
	mapping->nrexceptional--;
	ret = 1;
out:
	spin_unlock_irq(&mapping->tree_lock);
	if (ret)
		dax_wake_mapping_entry_waiter(mapping, index, entry, true);
	return ret;
}

/*
 * Invalidate exceptional DAX entry if it is clean.
 */
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{
	return __dax_invalidate_mapping_entry(mapping, index, false);
}

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, void **entry,
			 struct vm_fault *vmf)
{
	struct page *page;
	int ret;

	/* Hole page already exists? Return it...  */
	if (!radix_tree_exceptional_entry(*entry)) {
		page = *entry;
		goto out;
	}

	/* This will replace locked radix tree entry with a hole page */
	page = find_or_create_page(mapping, vmf->pgoff,
				   vmf->gfp_mask | __GFP_ZERO);
	if (!page)
		return VM_FAULT_OOM;
out:
	vmf->page = page;
	ret = finish_fault(vmf);
	vmf->page = NULL;
	*entry = page;
	if (!ret) {
		/* Grab reference for PTE that is now referencing the page */
		get_page(page);
		return VM_FAULT_NOPAGE;
	}
	return ret;
}
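
/*
 * Copy one block worth of data from the DAX device at @sector into the
 * page @to.  Used by the fault path to populate vmf->cow_page for private
 * (COW) mappings.
 */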
static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
		struct page *to, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = size,
	};
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

/*
 * By this point grab_mapping_entry() has ensured that we have a locked entry
 * of the appropriate size so we don't have to worry about downgrading PMDs to
 * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
 * already in the tree, we will skip the insertion and just dirty the PMD as
 * appropriate.
 */
static void *dax_insert_mapping_entry(struct address_space *mapping,
				      struct vm_fault *vmf,
				      void *entry, sector_t sector,
				      unsigned long flags)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int error = 0;
	bool hole_fill = false;
	void *new_entry;
	pgoff_t index = vmf->pgoff;

	if (vmf->flags & FAULT_FLAG_WRITE)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	/* Replacing hole page with block mapping? */
	if (!radix_tree_exceptional_entry(entry)) {
		hole_fill = true;
		/*
		 * Unmap the page now before we remove it from page cache below.
		 * The page is locked so it cannot be faulted in again.
		 */
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
				    PAGE_SIZE, 0);
		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
		if (error)
			return ERR_PTR(error);
	} else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
		/* replacing huge zero page with PMD block mapping */
		unmap_mapping_range(mapping,
			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
	}

	spin_lock_irq(&mapping->tree_lock);
	new_entry = dax_radix_locked_entry(sector, flags);

	if (hole_fill) {
		__delete_from_page_cache(entry, NULL);
		/* Drop pagecache reference */
		put_page(entry);
		error = __radix_tree_insert(page_tree, index,
				dax_radix_order(new_entry), new_entry);
		if (error) {
			new_entry = ERR_PTR(error);
			goto unlock;
		}
		mapping->nrexceptional++;
	} else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
		/*
		 * Only swap our new entry into the radix tree if the current
		 * entry is a zero page or an empty entry.  If a normal PTE or
		 * PMD entry is already in the tree, we leave it alone.  This
		 * means that if we are trying to insert a PTE and the
		 * existing entry is a PMD, we will just leave the PMD in the
		 * tree and dirty it if necessary.
		 */
		struct radix_tree_node *node;
		void **slot;
		void *ret;

		ret = __radix_tree_lookup(page_tree, index, &node, &slot);
		WARN_ON_ONCE(ret != entry);
		__radix_tree_replace(page_tree, node, slot,
				     new_entry, NULL, NULL);
	}
	if (vmf->flags & FAULT_FLAG_WRITE)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
unlock:
	spin_unlock_irq(&mapping->tree_lock);
	if (hole_fill) {
		radix_tree_preload_end();
		/*
		 * We don't need hole page anymore, it has been replaced with
		 * locked radix tree entry now.
		 */
		if (mapping->a_ops->freepage)
			mapping->a_ops->freepage(entry);
		unlock_page(entry);
		put_page(entry);
	}
	return new_entry;
}

static inline unsigned long
pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
{
	unsigned long address;

	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
	return address;
}

/* Walk all mappings of a given index of a file and writeprotect them */
static void dax_mapping_entry_mkclean(struct address_space *mapping,
				      pgoff_t index, unsigned long pfn)
{
	struct vm_area_struct *vma;
	pte_t pte, *ptep = NULL;
	pmd_t *pmdp = NULL;
	spinlock_t *ptl;
	bool changed;

	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
		unsigned long address;

		cond_resched();

		if (!(vma->vm_flags & VM_SHARED))
			continue;

		address = pgoff_address(index, vma);
		changed = false;
		if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl))
			continue;

		if (pmdp) {
#ifdef CONFIG_FS_DAX_PMD
			pmd_t pmd;

			if (pfn != pmd_pfn(*pmdp))
				goto unlock_pmd;
			if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
				goto unlock_pmd;

			flush_cache_page(vma, address, pfn);
			pmd = pmdp_huge_clear_flush(vma, address, pmdp);
			pmd = pmd_wrprotect(pmd);
			pmd = pmd_mkclean(pmd);
			set_pmd_at(vma->vm_mm, address, pmdp, pmd);
			changed = true;
unlock_pmd:
			spin_unlock(ptl);
#endif
		} else {
			if (pfn != pte_pfn(*ptep))
				goto unlock_pte;
			if (!pte_dirty(*ptep) && !pte_write(*ptep))
				goto unlock_pte;

			flush_cache_page(vma, address, pfn);
			pte = ptep_clear_flush(vma, address, ptep);
			pte = pte_wrprotect(pte);
			pte = pte_mkclean(pte);
			set_pte_at(vma->vm_mm, address, ptep, pte);
			changed = true;
unlock_pte:
			pte_unmap_unlock(ptep, ptl);
		}

		if (changed)
			mmu_notifier_invalidate_page(vma->vm_mm, address);
	}
	i_mmap_unlock_read(mapping);
}
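
/*
 * Flush a single dirty radix tree entry: write-protect all userspace
 * mappings of the pfn(s), flush the CPU caches for the whole entry, and
 * finally clear the dirty tag.  The entry lock serializes us against
 * concurrent page faults and other writeback threads.
 */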
static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	struct blk_dax_ctl dax;
	void *entry2, **slot;
	int ret = 0;

	/*
	 * A page got tagged dirty in DAX mapping? Something is seriously
	 * wrong.
	 */
	if (WARN_ON(!radix_tree_exceptional_entry(entry)))
		return -EIO;

	spin_lock_irq(&mapping->tree_lock);
	entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
	/* Entry got punched out / reallocated? */
	if (!entry2 || !radix_tree_exceptional_entry(entry2))
		goto put_unlocked;
	/*
	 * Entry got reallocated elsewhere? No need to writeback. We have to
	 * compare sectors as we must not bail out due to difference in lockbit
	 * or entry type.
	 */
	if (dax_radix_sector(entry2) != dax_radix_sector(entry))
		goto put_unlocked;
	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
				dax_is_zero_entry(entry))) {
		ret = -EIO;
		goto put_unlocked;
	}

	/* Another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto put_unlocked;
	/* Lock the entry to serialize with page faults */
	entry = lock_slot(mapping, slot);
	/*
	 * We can clear the tag now but we have to be careful so that concurrent
	 * dax_writeback_one() calls for the same index cannot finish before we
	 * actually flush the caches. This is achieved as the calls will look
	 * at the entry only under tree_lock and once they do that they will
	 * see the entry locked and wait for it to unlock.
	 */
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
	 * in the middle of a PMD, the 'index' we are given will be aligned to
	 * the start index of the PMD, as will the sector we pull from
	 * 'entry'.  This allows us to flush for PMD_SIZE and not have to
	 * worry about partial PMD writebacks.
	 */
	dax.sector = dax_radix_sector(entry);
	dax.size = PAGE_SIZE << dax_radix_order(entry);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0) {
		put_locked_mapping_entry(mapping, index, entry);
		return ret;
	}

	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
	wb_cache_pmem(dax.addr, dax.size);
	/*
	 * After we have flushed the cache, we can clear the dirty tag. There
	 * cannot be new dirty data in the pfn after the flush has completed as
	 * the pfn mappings are writeprotected and fault waits for mapping
	 * entry lock.
	 */
	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
	spin_unlock_irq(&mapping->tree_lock);
unmap:
	dax_unmap_atomic(bdev, &dax);
	put_locked_mapping_entry(mapping, index, entry);
	return ret;

put_unlocked:
	put_unlocked_mapping_entry(mapping, index, entry2);
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct block_device *bdev, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	pgoff_t start_index, end_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	start_index = wbc->range_start >> PAGE_SHIFT;
	end_index = wbc->range_end >> PAGE_SHIFT;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
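
/*
 * Install a PTE for a block-backed page: resolve the sector to a pfn,
 * record the (locked) DAX entry in the radix tree and map the pfn into
 * the faulting address space with vm_insert_mixed().
 */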
static int dax_insert_mapping(struct address_space *mapping,
		struct block_device *bdev, sector_t sector, size_t size,
		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = vmf->address;
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = size,
	};
	void *ret;
	void *entry = *entryp;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	dax_unmap_atomic(bdev, &dax);

	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
	if (IS_ERR(ret))
		return PTR_ERR(ret);
	*entryp = ret;

	return vm_insert_mixed(vma, vaddr, dax.pfn);
}

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	void *entry, **slot;
	pgoff_t index = vmf->pgoff;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, &slot);
	if (!entry || !radix_tree_exceptional_entry(entry)) {
		if (entry)
			put_unlocked_mapping_entry(mapping, index, entry);
		spin_unlock_irq(&mapping->tree_lock);
		return VM_FAULT_NOPAGE;
	}
	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
	entry = lock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	/*
	 * If we race with somebody updating the PTE and finish_mkwrite_fault()
	 * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
	 * the fault in either case.
	 */
	finish_mkwrite_fault(vmf);
	put_locked_mapping_entry(mapping, index, entry);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
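
/*
 * Zero a sub-page range.  If the range covers whole logical blocks we can
 * punt to blkdev_issue_zeroout(); otherwise we map the page through DAX
 * and clear the bytes directly with clear_pmem().
 */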
static bool dax_range_is_aligned(struct block_device *bdev,
				 unsigned int offset, unsigned int length)
{
	unsigned short sector_size = bdev_logical_block_size(bdev);

	if (!IS_ALIGNED(offset, sector_size))
		return false;
	if (!IS_ALIGNED(length, sector_size))
		return false;

	return true;
}

int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
		unsigned int offset, unsigned int length)
{
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = PAGE_SIZE,
	};

	if (dax_range_is_aligned(bdev, offset, length)) {
		sector_t start_sector = dax.sector + (offset >> 9);

		return blkdev_issue_zeroout(bdev, start_sector,
					    length >> 9, GFP_NOFS, true);
	} else {
		if (dax_map_atomic(bdev, &dax) < 0)
			return PTR_ERR(dax.addr);
		clear_pmem(dax.addr + offset, length);
		dax_unmap_atomic(bdev, &dax);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);

#ifdef CONFIG_FS_IOMAP
static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
}
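
/*
 * The read/write worker called back from iomap_apply(): for each extent it
 * maps the backing device with dax_map_atomic() and copies data between
 * the iov_iter and persistent memory, zero-filling reads from holes and
 * unwritten extents.
 */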
static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct iov_iter *iter = data;
	loff_t end = pos + length, done = 0;
	ssize_t ret = 0;

	if (iov_iter_rw(iter) == READ) {
		end = min(end, i_size_read(inode));
		if (pos >= end)
			return 0;

		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
			return iov_iter_zero(min(length, end - pos), iter);
	}

	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
		return -EIO;

	/*
	 * Write can allocate block for an area which has a hole page mapped
	 * into page tables. We have to tear down these mappings so that data
	 * written by write(2) is visible in mmap.
	 */
	if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
		invalidate_inode_pages2_range(inode->i_mapping,
					      pos >> PAGE_SHIFT,
					      (end - 1) >> PAGE_SHIFT);
	}

	while (pos < end) {
		unsigned offset = pos & (PAGE_SIZE - 1);
		struct blk_dax_ctl dax = { 0 };
		ssize_t map_len;

		dax.sector = dax_iomap_sector(iomap, pos);
		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
		map_len = dax_map_atomic(iomap->bdev, &dax);
		if (map_len < 0) {
			ret = map_len;
			break;
		}

		dax.addr += offset;
		map_len -= offset;
		if (map_len > end - pos)
			map_len = end - pos;

		if (iov_iter_rw(iter) == WRITE)
			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
		else
			map_len = copy_to_iter(dax.addr, map_len, iter);
		dax_unmap_atomic(iomap->bdev, &dax);
		if (map_len <= 0) {
			ret = map_len ? map_len : -EFAULT;
			break;
		}

		pos += map_len;
		length -= map_len;
		done += map_len;
	}

	return done ? done : ret;
}

/**
 * dax_iomap_rw - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @iter: The addresses to do I/O from or to
 * @ops: iomap ops passed from the file system
 *
 * This function performs read and write operations to directly mapped
 * persistent memory.  The caller needs to take care of read/write exclusion
 * and evicting any page cache pages in the region under I/O.
 */
ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
		struct iomap_ops *ops)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = mapping->host;
	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
	unsigned flags = 0;

	if (iov_iter_rw(iter) == WRITE)
		flags |= IOMAP_WRITE;

	while (iov_iter_count(iter)) {
		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
				iter, dax_iomap_actor);
		if (ret <= 0)
			break;
		pos += ret;
		done += ret;
	}

	iocb->ki_pos += done;
	return done ? done : ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);
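
/*
 * Translate a negative errno from the iomap/fault machinery into the
 * VM_FAULT_* code expected by the page fault handler.
 */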
static int dax_fault_return(int error)
{
	if (error == 0)
		return VM_FAULT_NOPAGE;
	if (error == -ENOMEM)
		return VM_FAULT_OOM;
	return VM_FAULT_SIGBUS;
}

/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @ops: iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in their fault
 * or mkwrite handler for DAX files. Assumes the caller has done all the
 * necessary locking for the page fault to proceed successfully.
 */
int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			struct iomap_ops *ops)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned long vaddr = vmf->address;
	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
	sector_t sector;
	struct iomap iomap = { 0 };
	unsigned flags = IOMAP_FAULT;
	int error, major = 0;
	int vmf_ret = 0;
	void *entry;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is supposed
	 * to hold locks serializing us with truncate / punch hole so this is
	 * a reliable test.
	 */
	if (pos >= i_size_read(inode))
		return VM_FAULT_SIGBUS;

	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
		flags |= IOMAP_WRITE;

	/*
	 * Note that we don't bother to use iomap_apply here: DAX requires
	 * the file system block size to be equal to the page size, which means
	 * that we never have to deal with more than a single extent here.
	 */
	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
	if (error)
		return dax_fault_return(error);
	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
		vmf_ret = dax_fault_return(-EIO);	/* fs corruption? */
		goto finish_iomap;
	}

	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
	if (IS_ERR(entry)) {
		vmf_ret = dax_fault_return(PTR_ERR(entry));
		goto finish_iomap;
	}

	sector = dax_iomap_sector(&iomap, pos);

	if (vmf->cow_page) {
		switch (iomap.type) {
		case IOMAP_HOLE:
		case IOMAP_UNWRITTEN:
			clear_user_highpage(vmf->cow_page, vaddr);
			break;
		case IOMAP_MAPPED:
			error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
					vmf->cow_page, vaddr);
			break;
		default:
			WARN_ON_ONCE(1);
			error = -EIO;
			break;
		}

		if (error)
			goto error_unlock_entry;

		__SetPageUptodate(vmf->cow_page);
		vmf_ret = finish_fault(vmf);
		if (!vmf_ret)
			vmf_ret = VM_FAULT_DONE_COW;
		goto unlock_entry;
	}

	switch (iomap.type) {
	case IOMAP_MAPPED:
		if (iomap.flags & IOMAP_F_NEW) {
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
		}
		error = dax_insert_mapping(mapping, iomap.bdev, sector,
				PAGE_SIZE, &entry, vma, vmf);
		/* -EBUSY is fine, somebody else faulted on the same PTE */
		if (error == -EBUSY)
			error = 0;
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
			vmf_ret = dax_load_hole(mapping, &entry, vmf);
			goto unlock_entry;
		}
		/*FALLTHRU*/
	default:
		WARN_ON_ONCE(1);
		error = -EIO;
		break;
	}

error_unlock_entry:
	vmf_ret = dax_fault_return(error) | major;
unlock_entry:
	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
finish_iomap:
	if (ops->iomap_end) {
		int copied = PAGE_SIZE;

		if (vmf_ret & VM_FAULT_ERROR)
			copied = 0;
		/*
		 * The fault is done by now and there's no way back (other
		 * thread may be already happily using PTE we have installed).
		 * Just ignore error from ->iomap_end since we cannot do much
		 * with it.
		 */
		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
	}
	return vmf_ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);

#ifdef CONFIG_FS_DAX_PMD
/*
 * The 'colour' (i.e. low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below functions.
 */
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
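
/*
 * Try to install a PMD-sized mapping for a block-backed extent.  We fall
 * back to PTEs unless the device gives us a 2MiB-aligned, contiguous,
 * devmap pfn range covering the whole PMD.
 */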
static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
		struct vm_fault *vmf, unsigned long address,
		struct iomap *iomap, loff_t pos, bool write, void **entryp)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct block_device *bdev = iomap->bdev;
	struct blk_dax_ctl dax = {
		.sector = dax_iomap_sector(iomap, pos),
		.size = PMD_SIZE,
	};
	long length = dax_map_atomic(bdev, &dax);
	void *ret;

	if (length < 0) /* dax_map_atomic() failed */
		return VM_FAULT_FALLBACK;
	if (length < PMD_SIZE)
		goto unmap_fallback;
	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
		goto unmap_fallback;
	if (!pfn_t_devmap(dax.pfn))
		goto unmap_fallback;

	dax_unmap_atomic(bdev, &dax);

	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
			RADIX_DAX_PMD);
	if (IS_ERR(ret))
		return VM_FAULT_FALLBACK;
	*entryp = ret;

	return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);

unmap_fallback:
	dax_unmap_atomic(bdev, &dax);
	return VM_FAULT_FALLBACK;
}
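
/*
 * Handle a read fault from a hole with a PMD-sized mapping by inserting
 * the huge zero page and recording a RADIX_DAX_HZP entry in the radix
 * tree.  Falls back to PTEs if the PMD slot is already populated.
 */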
static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
		struct vm_fault *vmf, unsigned long address,
		struct iomap *iomap, void **entryp)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long pmd_addr = address & PMD_MASK;
	struct page *zero_page;
	spinlock_t *ptl;
	pmd_t pmd_entry;
	void *ret;

	zero_page = mm_get_huge_zero_page(vma->vm_mm);

	if (unlikely(!zero_page))
		return VM_FAULT_FALLBACK;

	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
			RADIX_DAX_PMD | RADIX_DAX_HZP);
	if (IS_ERR(ret))
		return VM_FAULT_FALLBACK;
	*entryp = ret;

	ptl = pmd_lock(vma->vm_mm, pmd);
	if (!pmd_none(*pmd)) {
		spin_unlock(ptl);
		return VM_FAULT_FALLBACK;
	}

	pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
	pmd_entry = pmd_mkhuge(pmd_entry);
	set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
	spin_unlock(ptl);
	return VM_FAULT_NOPAGE;
}
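
/*
 * Handle a PMD-sized page fault on a DAX file.  Mirrors dax_iomap_fault()
 * but works in PMD_SIZE units and returns VM_FAULT_FALLBACK whenever a huge
 * mapping cannot be used, so the caller can retry with PTEs.
 */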
int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
	struct inode *inode = mapping->host;
	int result = VM_FAULT_FALLBACK;
	struct iomap iomap = { 0 };
	pgoff_t max_pgoff, pgoff;
	struct vm_fault vmf;
	void *entry;
	loff_t pos;
	int error;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED))
		goto fallback;

	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start)
		goto fallback;
	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
		goto fallback;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is
	 * supposed to hold locks serializing us with truncate / punch hole so
	 * this is a reliable test.
	 */
	pgoff = linear_page_index(vma, pmd_addr);
	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	if (pgoff > max_pgoff)
		return VM_FAULT_SIGBUS;

	/* If the PMD would extend beyond the file size */
	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
		goto fallback;

	/*
	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
	 * setting up a mapping, so really we're using iomap_begin() as a way
	 * to look up our filesystem block.
	 */
	pos = (loff_t)pgoff << PAGE_SHIFT;
	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
	if (error)
		goto fallback;

	if (iomap.offset + iomap.length < pos + PMD_SIZE)
		goto finish_iomap;

	/*
	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
	 * the tree, for instance), it will return -EEXIST and we just fall
	 * back to 4k entries.
	 */
	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
	if (IS_ERR(entry))
		goto finish_iomap;

	vmf.pgoff = pgoff;
	vmf.flags = flags;
	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;

	switch (iomap.type) {
	case IOMAP_MAPPED:
		result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
				&iomap, pos, write, &entry);
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(write))
			goto unlock_entry;
		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
				&entry);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

unlock_entry:
	put_locked_mapping_entry(mapping, pgoff, entry);
finish_iomap:
	if (ops->iomap_end) {
		int copied = PMD_SIZE;

		if (result == VM_FAULT_FALLBACK)
			copied = 0;
		/*
		 * The fault is done by now and there's no way back (other
		 * thread may be already happily using PMD we have installed).
		 * Just ignore error from ->iomap_end since we cannot do much
		 * with it.
		 */
		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
				&iomap);
	}
fallback:
	if (result == VM_FAULT_FALLBACK) {
		split_huge_pmd(vma, pmd, address);
		count_vm_event(THP_FAULT_FALLBACK);
	}
	return result;
}
EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
#endif /* CONFIG_FS_DAX_PMD */
#endif /* CONFIG_FS_IOMAP */