/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>

/*
 * We use the lowest available bit in an exceptional entry for locking and
 * the other two bits to determine the entry type: three special bits in
 * total.
 */
#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
#define RADIX_DAX_PTE	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
#define RADIX_DAX_PMD	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
#define RADIX_DAX_TYPE_MASK	(RADIX_DAX_PTE | RADIX_DAX_PMD)
#define RADIX_DAX_TYPE(entry)	((unsigned long)entry & RADIX_DAX_TYPE_MASK)
#define RADIX_DAX_SECTOR(entry)	(((unsigned long)entry >> RADIX_DAX_SHIFT))
#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
		RADIX_TREE_EXCEPTIONAL_ENTRY))
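
/*
 * Illustrative example (editorial note, not from the original source):
 * with the macros above, a PTE-sized entry for sector S is stored as
 *
 *	(S << RADIX_DAX_SHIFT) | RADIX_DAX_PTE | RADIX_TREE_EXCEPTIONAL_ENTRY
 *
 * so RADIX_DAX_SECTOR() recovers S by shifting the special bits back out,
 * and RADIX_DAX_TYPE() isolates the PTE/PMD type bits.  The lock bit
 * (RADIX_DAX_ENTRY_LOCK, defined in <linux/dax.h>) sits below
 * RADIX_DAX_SHIFT and therefore never disturbs the stored sector.
 */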

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
					      pgoff_t index)
{
	unsigned long hash = hash_long((unsigned long)mapping ^ index,
				       DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}

static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	struct request_queue *q = bdev->bd_queue;
	long rc = -EIO;

	dax->addr = (void __pmem *) ERR_PTR(-EIO);
	if (blk_queue_enter(q, true) != 0)
		return rc;

	rc = bdev_direct_access(bdev, dax);
	if (rc < 0) {
		dax->addr = (void __pmem *) ERR_PTR(rc);
		blk_queue_exit(q);
		return rc;
	}
	return rc;
}

static void dax_unmap_atomic(struct block_device *bdev,
			     const struct blk_dax_ctl *dax)
{
	if (IS_ERR(dax->addr))
		return;
	blk_queue_exit(bdev->bd_queue);
}

struct page *read_dax_sector(struct block_device *bdev, sector_t n)
{
	struct page *page = alloc_pages(GFP_KERNEL, 0);
	struct blk_dax_ctl dax = {
		.size = PAGE_SIZE,
		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
	};
	long rc;

	if (!page)
		return ERR_PTR(-ENOMEM);

	rc = dax_map_atomic(bdev, &dax);
	if (rc < 0) {
		__free_page(page);	/* don't leak the page on error */
		return ERR_PTR(rc);
	}
	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
	dax_unmap_atomic(bdev, &dax);
	return page;
}

static bool buffer_written(struct buffer_head *bh)
{
	return buffer_mapped(bh) && !buffer_unwritten(bh);
}

/*
 * When ext4 encounters a hole, it returns without modifying the buffer_head
 * which means that we can't trust b_size.  To cope with this, we set b_state
 * to 0 before calling get_block and, if any bit is set, we know we can trust
 * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
 * and would save us time calling get_block repeatedly.
 */
static bool buffer_size_valid(struct buffer_head *bh)
{
	return bh->b_state != 0;
}

static sector_t to_sector(const struct buffer_head *bh,
			  const struct inode *inode)
{
	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);

	return sector;
}

static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false, need_wmb = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc = 0;	/* rc = 0 for zero-length I/O */
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = (void __pmem *) ERR_PTR(-EIO),
	};
	unsigned blkbits = inode->i_blkbits;
	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
								>> blkbits;

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
				/*
				 * We allow uninitialized buffers for writes
				 * beyond EOF as those cannot race with faults
				 */
				WARN_ON_ONCE(
					(buffer_new(bh) && block < file_blks) ||
					(rw == WRITE && buffer_unwritten(bh)));
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				dax.addr += first;
				size = map_len - first;
			}
			/*
			 * pos + size is one past the last offset for IO,
			 * so pos + size can overflow loff_t at extreme offsets.
			 * Cast to u64 to catch this and get the true minimum.
			 */
			max = min_t(u64, pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
			need_wmb = true;
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr, max - pos,
					iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	if (need_wmb)
		wmb_pmem();
	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}
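
/*
 * Worked example for the dax_io() arithmetic above (illustrative only,
 * assuming 4KB pages and 1KB filesystem blocks, i.e. blkbits == 10):
 * for pos == 0x1400, page == 1, block == 1 << 2 == 4 and
 * first == 0x1400 - (4 << 10) == 0x400, so the copy starts 0x400 bytes
 * into the extent that get_block()/bdev_direct_access() mapped.
 */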

/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
		  struct iov_iter *iter, get_block_t get_block,
		  dio_iodone_t end_io, int flags)
{
	struct buffer_head bh;
	ssize_t retval = -EINVAL;
	loff_t pos = iocb->ki_pos;
	loff_t end = pos + iov_iter_count(iter);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_lock(inode);

	/* Protects against truncate */
	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = dax_io(inode, iter, pos, end, get_block, &bh);

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_unlock(inode);

	if (end_io) {
		int err;

		err = end_io(iocb, pos, retval, bh.b_private);
		if (err)
			retval = err;
	}

	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(inode);
	return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);

/*
 * DAX radix tree locking
 */
struct exceptional_entry_key {
	struct address_space *mapping;
	unsigned long index;
};

struct wait_exceptional_entry_queue {
	wait_queue_t wait;
	struct exceptional_entry_key key;
};

static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
				       int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->mapping != ewait->key.mapping ||
	    key->index != ewait->key.index)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * Check whether the given slot is locked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline int slot_locked(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
	return entry & RADIX_DAX_ENTRY_LOCK;
}

/*
 * Mark the given slot as locked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry |= RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(slot, (void *)entry);
	return (void *)entry;
}

/*
 * Mark the given slot as unlocked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(slot, (void *)entry);
	return (void *)entry;
}
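
/*
 * Sketch of the lock-bit protocol built from the helpers above (editorial
 * summary, not from the original source): a thread that wants exclusive
 * access to an entry takes mapping->tree_lock, waits in
 * get_unlocked_mapping_entry() below until RADIX_DAX_ENTRY_LOCK is clear,
 * sets the bit with lock_slot(), and drops tree_lock.  When it is done it
 * clears the bit and wakes the next exclusive waiter via
 * dax_unlock_mapping_entry().
 */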

/*
 * Look up an entry in the radix tree, wait for it to become unlocked if it
 * is an exceptional entry, and return it.  The caller must call
 * put_unlocked_mapping_entry() if it decides not to lock the entry, or
 * put_locked_mapping_entry() once it has locked the entry and later wants
 * to unlock it.
 *
 * The function must be called with mapping->tree_lock held.
 */
static void *get_unlocked_mapping_entry(struct address_space *mapping,
					pgoff_t index, void ***slotp)
{
	void *ret, **slot;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;
	ewait.key.mapping = mapping;
	ewait.key.index = index;

	for (;;) {
		ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
					  &slot);
		if (!ret || !radix_tree_exceptional_entry(ret) ||
		    !slot_locked(mapping, slot)) {
			if (slotp)
				*slotp = slot;
			return ret;
		}
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&mapping->tree_lock);
		schedule();
		finish_wait(wq, &ewait.wait);
		spin_lock_irq(&mapping->tree_lock);
	}
}

/*
 * Find the radix tree entry at the given index.  If it points to a page,
 * return with the page locked.  If it points to an exceptional entry,
 * return with the radix tree entry locked.  If the radix tree doesn't
 * contain the given index, create an empty exceptional entry for the index
 * and return with it locked.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags.  For
 * persistent memory the benefit is doubtful.  We can add that later if we can
 * show it helps.
 */
static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *ret, **slot;

restart:
	spin_lock_irq(&mapping->tree_lock);
	ret = get_unlocked_mapping_entry(mapping, index, &slot);
	/* No entry for given index? Make sure radix tree is big enough. */
	if (!ret) {
		int err;

		spin_unlock_irq(&mapping->tree_lock);
		err = radix_tree_preload(
				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
		if (err)
			return ERR_PTR(err);
		ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
			       RADIX_DAX_ENTRY_LOCK);
		spin_lock_irq(&mapping->tree_lock);
		err = radix_tree_insert(&mapping->page_tree, index, ret);
		radix_tree_preload_end();
		if (err) {
			spin_unlock_irq(&mapping->tree_lock);
			/* Someone already created the entry? */
			if (err == -EEXIST)
				goto restart;
			return ERR_PTR(err);
		}
		/* Good, we have inserted empty locked entry into the tree. */
		mapping->nrexceptional++;
		spin_unlock_irq(&mapping->tree_lock);
		return ret;
	}
	/* Normal page in radix tree? */
	if (!radix_tree_exceptional_entry(ret)) {
		struct page *page = ret;

		get_page(page);
		spin_unlock_irq(&mapping->tree_lock);
		lock_page(page);
		/* Page got truncated? Retry... */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto restart;
		}
		return page;
	}
	ret = lock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}
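
/*
 * Typical caller pattern (illustrative only; __dax_fault() below is the
 * real example):
 *
 *	entry = grab_mapping_entry(mapping, index);
 *	if (IS_ERR(entry))
 *		return PTR_ERR(entry);
 *	... operate on the locked entry or page ...
 *	put_locked_mapping_entry(mapping, index, entry);
 */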

void dax_wake_mapping_entry_waiter(struct address_space *mapping,
				   pgoff_t index, bool wake_all)
{
	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under mapping->tree_lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq)) {
		struct exceptional_entry_key key;

		key.mapping = mapping;
		key.index = index;
		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
	}
}

void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *ret, **slot;

	spin_lock_irq(&mapping->tree_lock);
	ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
	if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
			 !slot_locked(mapping, slot))) {
		spin_unlock_irq(&mapping->tree_lock);
		return;
	}
	unlock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, false);
}

static void put_locked_mapping_entry(struct address_space *mapping,
				     pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry)) {
		unlock_page(entry);
		put_page(entry);
	} else {
		dax_unlock_mapping_entry(mapping, index);
	}
}

/*
 * Called when we are done with the radix tree entry we looked up via
 * get_unlocked_mapping_entry() and decided not to lock in the end.
 */
static void put_unlocked_mapping_entry(struct address_space *mapping,
				       pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry))
		return;

	/* We have to wake up next waiter for the radix tree entry lock */
	dax_wake_mapping_entry_waiter(mapping, index, false);
}

/*
 * Delete the exceptional DAX entry at @index from @mapping.  Wait for the
 * radix tree entry to get unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *entry;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	/*
	 * This gets called from truncate / punch_hole path. As such, the caller
	 * must hold locks protecting against concurrent modifications of the
	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
	 * caller has seen exceptional entry for this index, we better find it
	 * at that index as well...
	 */
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
		spin_unlock_irq(&mapping->tree_lock);
		return 0;
	}
	radix_tree_delete(&mapping->page_tree, index);
	mapping->nrexceptional--;
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, true);

	return 1;
}

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, void *entry,
			 struct vm_fault *vmf)
{
	struct page *page;

	/* Hole page already exists? Return it... */
	if (!radix_tree_exceptional_entry(entry)) {
		vmf->page = entry;
		return VM_FAULT_LOCKED;
	}

	/* This will replace locked radix tree entry with a hole page */
	page = find_or_create_page(mapping, vmf->pgoff,
				   vmf->gfp_mask | __GFP_ZERO);
	if (!page) {
		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
		return VM_FAULT_OOM;
	}
	vmf->page = page;
	return VM_FAULT_LOCKED;
}

static int copy_user_bh(struct page *to, struct inode *inode,
			struct buffer_head *bh, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	struct block_device *bdev = bh->b_bdev;
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))

static void *dax_insert_mapping_entry(struct address_space *mapping,
				      struct vm_fault *vmf,
				      void *entry, sector_t sector)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int error = 0;
	bool hole_fill = false;
	void *new_entry;
	pgoff_t index = vmf->pgoff;

	if (vmf->flags & FAULT_FLAG_WRITE)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	/* Replacing hole page with block mapping? */
	if (!radix_tree_exceptional_entry(entry)) {
		hole_fill = true;
		/*
		 * Unmap the page now before we remove it from page cache below.
		 * The page is locked so it cannot be faulted in again.
		 */
		unmap_mapping_range(mapping, (loff_t)vmf->pgoff << PAGE_SHIFT,
				    PAGE_SIZE, 0);
		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
		if (error)
			return ERR_PTR(error);
	}

	spin_lock_irq(&mapping->tree_lock);
	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
			     RADIX_DAX_ENTRY_LOCK);
	if (hole_fill) {
		__delete_from_page_cache(entry, NULL);
		/* Drop pagecache reference */
		put_page(entry);
		error = radix_tree_insert(page_tree, index, new_entry);
		if (error) {
			new_entry = ERR_PTR(error);
			goto unlock;
		}
		mapping->nrexceptional++;
	} else {
		void **slot;
		void *ret;

		ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
		WARN_ON_ONCE(ret != entry);
		radix_tree_replace_slot(slot, new_entry);
	}
	if (vmf->flags & FAULT_FLAG_WRITE)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
unlock:
	spin_unlock_irq(&mapping->tree_lock);
	if (hole_fill) {
		radix_tree_preload_end();
		/*
		 * We don't need hole page anymore, it has been replaced with
		 * locked radix tree entry now.
		 */
		if (mapping->a_ops->freepage)
			mapping->a_ops->freepage(entry);
		unlock_page(entry);
		put_page(entry);
	}
	return new_entry;
}
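
/*
 * State transitions handled by dax_insert_mapping_entry() above (editorial
 * summary, not from the original source): a locked hole page is unmapped,
 * dropped from the page cache, and replaced by a locked exceptional entry;
 * an existing exceptional entry simply has its slot overwritten in place.
 * Write faults additionally tag the new entry PAGECACHE_TAG_DIRTY so that
 * dax_writeback_mapping_range() can find it later.
 */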

static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int type = RADIX_DAX_TYPE(entry);
	struct radix_tree_node *node;
	struct blk_dax_ctl dax;
	void **slot;
	int ret = 0;

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * Regular page slots are stabilized by the page lock even
	 * without the tree itself locked.  These unlocked entries
	 * need verification under the tree lock.
	 */
	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
		goto unlock;
	if (*slot != entry)
		goto unlock;

	/* another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto unlock;

	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
		ret = -EIO;
		goto unlock;
	}

	dax.sector = RADIX_DAX_SECTOR(entry);
	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0)
		return ret;

	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	wb_cache_pmem(dax.addr, dax.size);

	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);
unmap:
	dax_unmap_atomic(bdev, &dax);
	return ret;

unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end].  This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct block_device *bdev, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	pgoff_t start_index, end_index, pmd_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;
	void *entry;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	start_index = wbc->range_start >> PAGE_SHIFT;
	end_index = wbc->range_end >> PAGE_SHIFT;
	pmd_index = DAX_PMD_INDEX(start_index);

	rcu_read_lock();
	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
	rcu_read_unlock();

	/* see if the start of our range is covered by a PMD entry */
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
		start_index = pmd_index;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	wmb_pmem();
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
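
/*
 * Overview of the fsync/msync flow above (editorial summary, not from the
 * original source): dax_writeback_mapping_range() tags every dirty entry
 * in the range PAGECACHE_TAG_TOWRITE, walks them with
 * find_get_entries_tag(), and calls dax_writeback_one() on each, which
 * flushes the backing cachelines with wb_cache_pmem() and clears the tag.
 * The final wmb_pmem() makes the flushed data durable.
 */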

static int dax_insert_mapping(struct address_space *mapping,
		struct buffer_head *bh, void **entryp,
		struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	struct block_device *bdev = bh->b_bdev;
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, mapping->host),
		.size = bh->b_size,
	};
	void *ret;
	void *entry = *entryp;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	dax_unmap_atomic(bdev, &dax);

	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
	if (IS_ERR(ret))
		return PTR_ERR(ret);
	*entryp = ret;

	return vm_insert_mixed(vma, vaddr, dax.pfn);
}

/**
 * __dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files. __dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			get_block_t get_block)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	void *entry;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	/*
	 * Check whether the offset is beyond the end of the file.  The caller
	 * is supposed to hold locks serializing us with truncate / punch hole,
	 * so this is a reliable test.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;

	entry = grab_mapping_entry(mapping, vmf->pgoff);
	if (IS_ERR(entry)) {
		error = PTR_ERR(entry);
		goto out;
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock_entry;

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;

		if (buffer_written(&bh))
			error = copy_user_bh(new_page, inode, &bh, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock_entry;
		if (!radix_tree_exceptional_entry(entry)) {
			vmf->page = entry;
			return VM_FAULT_LOCKED;
		}
		vmf->entry = entry;
		return VM_FAULT_DAX_LOCKED;
	}

	if (!buffer_mapped(&bh)) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock_entry;
		} else {
			return dax_load_hole(mapping, entry, vmf);
		}
	}

	/* Filesystem should not return unwritten buffers to us! */
	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
	error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
unlock_entry:
	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;
}
EXPORT_SYMBOL(__dax_fault);

/**
 * dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files.
 */
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
	      get_block_t get_block)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (vmf->flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_fault(vma, vmf, get_block);
	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
EXPORT_SYMBOL_GPL(dax_fault);

#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
 * The 'colour' (i.e. low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below function.
 */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)

static void __dax_dbg(struct buffer_head *bh, unsigned long address,
		const char *reason, const char *fn)
{
	if (bh) {
		char bname[BDEVNAME_SIZE];

		bdevname(bh->b_bdev, bname);
		pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
			"length %zd fallback: %s\n", fn, current->comm,
			address, bname, bh->b_state, (u64)bh->b_blocknr,
			bh->b_size, reason);
	} else {
		pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
			current->comm, address, reason);
	}
}

#define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")
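
/*
 * Example for PG_PMD_COLOUR above (illustrative, not from the original
 * source): on x86-64 with 4KB pages and 2MB PMDs, PG_PMD_COLOUR is
 * (PMD_SIZE >> PAGE_SHIFT) - 1 == 511 (0x1ff), the offset of a page
 * within its PMD.  A pgoff or pfn is PMD-aligned iff these bits are zero,
 * which is what the alignment checks below rely on.
 */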

int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, get_block_t get_block)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct buffer_head bh;
	unsigned blkbits = inode->i_blkbits;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	struct block_device *bdev;
	pgoff_t size, pgoff;
	sector_t block;
	int result = 0;
	bool alloc = false;

	/* dax pmd mappings require pfn_t_devmap() */
	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
		return VM_FAULT_FALLBACK;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		split_huge_pmd(vma, pmd, address);
		dax_pmd_dbg(NULL, address, "cow write");
		return VM_FAULT_FALLBACK;
	}
	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start) {
		dax_pmd_dbg(NULL, address, "vma start unaligned");
		return VM_FAULT_FALLBACK;
	}
	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
		dax_pmd_dbg(NULL, address, "vma end unaligned");
		return VM_FAULT_FALLBACK;
	}

	pgoff = linear_page_index(vma, pmd_addr);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size)
		return VM_FAULT_SIGBUS;
	/* If the PMD would cover blocks out of the file */
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(NULL, address,
				"offset + huge page size > file size");
		return VM_FAULT_FALLBACK;
	}

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

	bh.b_size = PMD_SIZE;

	if (get_block(inode, block, &bh, 0) != 0)
		return VM_FAULT_SIGBUS;

	if (!buffer_mapped(&bh) && write) {
		if (get_block(inode, block, &bh, 1) != 0)
			return VM_FAULT_SIGBUS;
		alloc = true;
		WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
	}

	bdev = bh.b_bdev;

	/*
	 * If the filesystem isn't willing to tell us the length of a hole,
	 * just fall back to PTEs.  Calling get_block 512 times in a loop
	 * would be silly.
	 */
	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
		dax_pmd_dbg(&bh, address, "allocated block too small");
		return VM_FAULT_FALLBACK;
	}

	/*
	 * If we allocated new storage, make sure no process has any
	 * zero pages covering this hole
	 */
	if (alloc) {
		loff_t lstart = pgoff << PAGE_SHIFT;
		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */

		truncate_pagecache_range(inode, lstart, lend);
	}

	if (!write && !buffer_mapped(&bh)) {
		spinlock_t *ptl;
		pmd_t entry;
		struct page *zero_page = get_huge_zero_page();

		if (unlikely(!zero_page)) {
			dax_pmd_dbg(&bh, address, "no zero page");
			goto fallback;
		}

		ptl = pmd_lock(vma->vm_mm, pmd);
		if (!pmd_none(*pmd)) {
			spin_unlock(ptl);
			dax_pmd_dbg(&bh, address, "pmd already present");
			goto fallback;
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
				__func__, current->comm, address,
				(unsigned long long) to_sector(&bh, inode));

		entry = mk_pmd(zero_page, vma->vm_page_prot);
		entry = pmd_mkhuge(entry);
		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
		result = VM_FAULT_NOPAGE;
		spin_unlock(ptl);
	} else {
		struct blk_dax_ctl dax = {
			.sector = to_sector(&bh, inode),
			.size = PMD_SIZE,
		};
		long length = dax_map_atomic(bdev, &dax);

		if (length < 0) {
			dax_pmd_dbg(&bh, address, "dax-error fallback");
			goto fallback;
		}
		if (length < PMD_SIZE) {
			dax_pmd_dbg(&bh, address, "dax-length too small");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}
		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
			dax_pmd_dbg(&bh, address, "pfn unaligned");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}

		if (!pfn_t_devmap(dax.pfn)) {
			dax_unmap_atomic(bdev, &dax);
			dax_pmd_dbg(&bh, address, "pfn not in memmap");
			goto fallback;
		}
		dax_unmap_atomic(bdev, &dax);

		/*
		 * For PTE faults we insert a radix tree entry for reads, and
		 * leave it clean.  Then on the first write we dirty the radix
		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
		 * call into get_block() to translate the pgoff to a sector in
		 * order to be able to create a new radix tree entry.
		 *
		 * The PMD path doesn't have an equivalent to
		 * dax_pfn_mkwrite(), though, so for a read followed by a
		 * write we traverse all the way through __dax_pmd_fault()
		 * twice.  This means we can just skip inserting a radix tree
		 * entry completely on the initial read and just wait until
		 * the write to insert a dirty entry.
		 */
		if (write) {
			/*
			 * We should insert radix-tree entry and dirty it here.
			 * For now this is broken...
			 */
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
				__func__, current->comm, address,
				pfn_t_to_pfn(dax.pfn),
				(unsigned long long) dax.sector);
		result |= vmf_insert_pfn_pmd(vma, address, pmd,
				dax.pfn, write);
	}

out:
	return result;

fallback:
	count_vm_event(THP_FAULT_FALLBACK);
	result = VM_FAULT_FALLBACK;
	goto out;
}
EXPORT_SYMBOL_GPL(__dax_pmd_fault);

/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @address: The faulting address
 * @pmd: The PMD entry to fill in
 * @flags: Fault flags (FAULT_FLAG_*)
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
			pmd_t *pmd, unsigned int flags, get_block_t get_block)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_pmd_fault(vma, address, pmd, flags, get_block);
	if (flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
EXPORT_SYMBOL_GPL(dax_pmd_fault);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	void *entry;
	pgoff_t index = vmf->pgoff;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	if (!entry || !radix_tree_exceptional_entry(entry))
		goto out;
	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
	put_unlocked_mapping_entry(mapping, index, entry);
out:
	spin_unlock_irq(&mapping->tree_lock);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);

static bool dax_range_is_aligned(struct block_device *bdev,
				 unsigned int offset, unsigned int length)
{
	unsigned short sector_size = bdev_logical_block_size(bdev);

	if (!IS_ALIGNED(offset, sector_size))
		return false;
	if (!IS_ALIGNED(length, sector_size))
		return false;

	return true;
}
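
/*
 * Example (illustrative, not from the original source): with 512-byte
 * logical blocks, a request to zero offset 512, length 1024 within a page
 * is sector-aligned and can be handed to the block driver via
 * blkdev_issue_zeroout(); offset 100, length 200 is not, so
 * __dax_zero_page_range() below falls back to clear_pmem().
 */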

int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
		unsigned int offset, unsigned int length)
{
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = PAGE_SIZE,
	};

	if (dax_range_is_aligned(bdev, offset, length)) {
		sector_t start_sector = dax.sector + (offset >> 9);

		return blkdev_issue_zeroout(bdev, start_sector,
				length >> 9, GFP_NOFS, true);
	} else {
		if (dax_map_atomic(bdev, &dax) < 0)
			return PTR_ERR(dax.addr);
		clear_pmem(dax.addr + offset, length);
		wmb_pmem();
		dax_unmap_atomic(bdev, &dax);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);

/**
 * dax_zero_page_range - zero a range within a page of a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @length: The number of bytes to zero
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * This function can be called by a filesystem when it is zeroing part of a
 * page in a DAX file.  This is intended for hole-punch operations.  If
 * you are truncating a file, the helper function dax_truncate_page() may be
 * more convenient.
 */
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
			get_block_t get_block)
{
	struct buffer_head bh;
	pgoff_t index = from >> PAGE_SHIFT;
	unsigned offset = from & (PAGE_SIZE-1);
	int err;

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;
	BUG_ON((offset + length) > PAGE_SIZE);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;
	err = get_block(inode, index, &bh, 0);
	if (err < 0 || !buffer_written(&bh))
		return err;

	return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
			offset, length);
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

/**
 * dax_truncate_page - handle a partial page being truncated in a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * Similar to block_truncate_page(), this function can be called by a
 * filesystem when it is truncating a DAX file to handle the partial page.
 */
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
	unsigned length = PAGE_ALIGN(from) - from;
	return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);