/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>

/*
 * We use the lowest available bit in an exceptional entry for locking and
 * the other two bits to determine the entry type.  In total there are
 * 3 special bits.
 */
#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
#define RADIX_DAX_PTE	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
#define RADIX_DAX_PMD	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
#define RADIX_DAX_TYPE_MASK	(RADIX_DAX_PTE | RADIX_DAX_PMD)
#define RADIX_DAX_TYPE(entry)	((unsigned long)entry & RADIX_DAX_TYPE_MASK)
#define RADIX_DAX_SECTOR(entry)	(((unsigned long)entry >> RADIX_DAX_SHIFT))
#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
		RADIX_TREE_EXCEPTIONAL_ENTRY))
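
/*
 * Illustrative sketch (editor's example, not part of the original file):
 * how the macros above pack and unpack a PTE-sized radix tree entry.  The
 * sector value is arbitrary and chosen only for illustration.
 */
static inline void dax_radix_entry_example(void)
{
	void *entry = RADIX_DAX_ENTRY((sector_t)42, false);

	WARN_ON(RADIX_DAX_TYPE(entry) != RADIX_DAX_PTE);
	WARN_ON(RADIX_DAX_SECTOR(entry) != 42);
}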

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
					      pgoff_t index)
{
	unsigned long hash = hash_long((unsigned long)mapping ^ index,
				       DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}

static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	struct request_queue *q = bdev->bd_queue;
	long rc = -EIO;

	dax->addr = ERR_PTR(-EIO);
	if (blk_queue_enter(q, true) != 0)
		return rc;

	rc = bdev_direct_access(bdev, dax);
	if (rc < 0) {
		dax->addr = ERR_PTR(rc);
		blk_queue_exit(q);
		return rc;
	}
	return rc;
}

static void dax_unmap_atomic(struct block_device *bdev,
			     const struct blk_dax_ctl *dax)
{
	if (IS_ERR(dax->addr))
		return;
	blk_queue_exit(bdev->bd_queue);
}

struct page *read_dax_sector(struct block_device *bdev, sector_t n)
{
	struct page *page = alloc_pages(GFP_KERNEL, 0);
	struct blk_dax_ctl dax = {
		.size = PAGE_SIZE,
		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
	};
	long rc;

	if (!page)
		return ERR_PTR(-ENOMEM);

	rc = dax_map_atomic(bdev, &dax);
	if (rc < 0)
		return ERR_PTR(rc);
	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
	dax_unmap_atomic(bdev, &dax);
	return page;
}

static bool buffer_written(struct buffer_head *bh)
{
	return buffer_mapped(bh) && !buffer_unwritten(bh);
}

/*
 * When ext4 encounters a hole, it returns without modifying the buffer_head
 * which means that we can't trust b_size.  To cope with this, we set b_state
 * to 0 before calling get_block and, if any bit is set, we know we can trust
 * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
 * and would save us time calling get_block repeatedly.
 */
static bool buffer_size_valid(struct buffer_head *bh)
{
	return bh->b_state != 0;
}
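
/*
 * Illustrative sketch (editor's example, not part of the original file):
 * the calling convention described above, spelled out for a single block.
 * 'inode', 'block' and 'get_block' stand in for whatever the caller has.
 */
static inline int dax_get_block_example(struct inode *inode, sector_t block,
					get_block_t get_block,
					struct buffer_head *bh)
{
	int rc;

	bh->b_state = 0;		/* so buffer_size_valid() is meaningful */
	bh->b_size = PAGE_SIZE;		/* upper bound on the mapping we want */
	rc = get_block(inode, block, bh, 0);
	if (rc)
		return rc;
	if (!buffer_size_valid(bh))	/* fs left bh untouched: a hole */
		bh->b_size = 1 << inode->i_blkbits;
	return 0;
}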

static sector_t to_sector(const struct buffer_head *bh,
			  const struct inode *inode)
{
	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);

	return sector;
}

static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = ERR_PTR(-EIO),
	};
	unsigned blkbits = inode->i_blkbits;
	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
								>> blkbits;

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
				/*
				 * We allow uninitialized buffers for writes
				 * beyond EOF as those cannot race with faults
				 */
				WARN_ON_ONCE(
					(buffer_new(bh) && block < file_blks) ||
					(rw == WRITE && buffer_unwritten(bh)));
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				dax.addr += first;
				size = map_len - first;
			}
			/*
			 * pos + size is one past the last offset for IO,
			 * so pos + size can overflow loff_t at extreme offsets.
			 * Cast to u64 to catch this and get the true minimum.
			 */
			max = min_t(u64, pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr, max - pos,
					iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}

/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
		  struct iov_iter *iter, get_block_t get_block,
		  dio_iodone_t end_io, int flags)
{
	struct buffer_head bh;
	ssize_t retval = -EINVAL;
	loff_t pos = iocb->ki_pos;
	loff_t end = pos + iov_iter_count(iter);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_lock(inode);

	/* Protects against truncate */
	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = dax_io(inode, iter, pos, end, get_block, &bh);

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_unlock(inode);

	if (end_io) {
		int err;

		err = end_io(iocb, pos, retval, bh.b_private);
		if (err)
			retval = err;
	}

	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(inode);
	return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);
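
/*
 * Illustrative sketch (editor's example, not from this file): how a
 * filesystem's ->direct_IO method might forward to dax_do_io().  The
 * 'example_get_block' callback is a stand-in for the filesystem's real
 * get_block_t and the flag choice is an assumption.
 */
static inline ssize_t example_dax_direct_IO(struct kiocb *iocb,
					    struct iov_iter *iter,
					    get_block_t example_get_block)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	/* NULL end_io: nothing extra to do after the copy finishes */
	return dax_do_io(iocb, inode, iter, example_get_block, NULL,
			 DIO_LOCKING);
}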

/*
 * DAX radix tree locking
 */
struct exceptional_entry_key {
	struct address_space *mapping;
	unsigned long index;
};

struct wait_exceptional_entry_queue {
	wait_queue_t wait;
	struct exceptional_entry_key key;
};

static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
				       int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->mapping != ewait->key.mapping ||
	    key->index != ewait->key.index)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * Check whether the given slot is locked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline int slot_locked(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
	return entry & RADIX_DAX_ENTRY_LOCK;
}

/*
 * Mark the given slot as locked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry |= RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(slot, (void *)entry);
	return (void *)entry;
}

/*
 * Mark the given slot as unlocked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(slot, (void *)entry);
	return (void *)entry;
}

/*
 * Look up an entry in the radix tree and, if it is an exceptional entry,
 * wait for it to become unlocked before returning it.  The caller must
 * call put_unlocked_mapping_entry() if it decides not to lock the entry,
 * or put_locked_mapping_entry() once it has locked the entry and wants to
 * unlock it again.
 *
 * The function must be called with mapping->tree_lock held.
 */
static void *get_unlocked_mapping_entry(struct address_space *mapping,
					pgoff_t index, void ***slotp)
{
	void *ret, **slot;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;
	ewait.key.mapping = mapping;
	ewait.key.index = index;

	for (;;) {
		ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
					  &slot);
		if (!ret || !radix_tree_exceptional_entry(ret) ||
		    !slot_locked(mapping, slot)) {
			if (slotp)
				*slotp = slot;
			return ret;
		}
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&mapping->tree_lock);
		schedule();
		finish_wait(wq, &ewait.wait);
		spin_lock_irq(&mapping->tree_lock);
	}
}

/*
 * Find the radix tree entry at the given index.  If it points to a page,
 * return with the page locked.  If it points to an exceptional entry,
 * return with the radix tree entry locked.  If the radix tree doesn't
 * contain the given index, create an empty exceptional entry for the index
 * and return with it locked.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 */
static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *ret, **slot;

restart:
	spin_lock_irq(&mapping->tree_lock);
	ret = get_unlocked_mapping_entry(mapping, index, &slot);
	/* No entry for given index? Make sure radix tree is big enough. */
	if (!ret) {
		int err;

		spin_unlock_irq(&mapping->tree_lock);
		err = radix_tree_preload(
				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
		if (err)
			return ERR_PTR(err);
		ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
			       RADIX_DAX_ENTRY_LOCK);
		spin_lock_irq(&mapping->tree_lock);
		err = radix_tree_insert(&mapping->page_tree, index, ret);
		radix_tree_preload_end();
		if (err) {
			spin_unlock_irq(&mapping->tree_lock);
			/* Someone already created the entry? */
			if (err == -EEXIST)
				goto restart;
			return ERR_PTR(err);
		}
		/* Good, we have inserted empty locked entry into the tree. */
		mapping->nrexceptional++;
		spin_unlock_irq(&mapping->tree_lock);
		return ret;
	}
	/* Normal page in radix tree? */
	if (!radix_tree_exceptional_entry(ret)) {
		struct page *page = ret;

		get_page(page);
		spin_unlock_irq(&mapping->tree_lock);
		lock_page(page);
		/* Page got truncated? Retry... */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto restart;
		}
		return page;
	}
	ret = lock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

void dax_wake_mapping_entry_waiter(struct address_space *mapping,
				   pgoff_t index, bool wake_all)
{
	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under mapping->tree_lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq)) {
		struct exceptional_entry_key key;

		key.mapping = mapping;
		key.index = index;
		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
	}
}

void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *ret, **slot;

	spin_lock_irq(&mapping->tree_lock);
	ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
	if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
			 !slot_locked(mapping, slot))) {
		spin_unlock_irq(&mapping->tree_lock);
		return;
	}
	unlock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, false);
}

static void put_locked_mapping_entry(struct address_space *mapping,
				     pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry)) {
		unlock_page(entry);
		put_page(entry);
	} else {
		dax_unlock_mapping_entry(mapping, index);
	}
}

/*
 * Called when we are done with radix tree entry we looked up via
 * get_unlocked_mapping_entry() and which we didn't lock in the end.
 */
static void put_unlocked_mapping_entry(struct address_space *mapping,
				       pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry))
		return;

	/* We have to wake up next waiter for the radix tree entry lock */
	dax_wake_mapping_entry_waiter(mapping, index, false);
}

/*
 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
 * entry to get unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *entry;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	/*
	 * This gets called from truncate / punch_hole path. As such, the caller
	 * must hold locks protecting against concurrent modifications of the
	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
	 * caller has seen exceptional entry for this index, we better find it
	 * at that index as well...
	 */
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
		spin_unlock_irq(&mapping->tree_lock);
		return 0;
	}
	radix_tree_delete(&mapping->page_tree, index);
	mapping->nrexceptional--;
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, true);

	return 1;
}

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, void *entry,
			 struct vm_fault *vmf)
{
	struct page *page;

	/* Hole page already exists? Return it... */
	if (!radix_tree_exceptional_entry(entry)) {
		vmf->page = entry;
		return VM_FAULT_LOCKED;
	}

	/* This will replace locked radix tree entry with a hole page */
	page = find_or_create_page(mapping, vmf->pgoff,
				   vmf->gfp_mask | __GFP_ZERO);
	if (!page) {
		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
		return VM_FAULT_OOM;
	}
	vmf->page = page;
	return VM_FAULT_LOCKED;
}

static int copy_user_bh(struct page *to, struct inode *inode,
			struct buffer_head *bh, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	struct block_device *bdev = bh->b_bdev;
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))

static void *dax_insert_mapping_entry(struct address_space *mapping,
				      struct vm_fault *vmf,
				      void *entry, sector_t sector)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int error = 0;
	bool hole_fill = false;
	void *new_entry;
	pgoff_t index = vmf->pgoff;

	if (vmf->flags & FAULT_FLAG_WRITE)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	/* Replacing hole page with block mapping? */
	if (!radix_tree_exceptional_entry(entry)) {
		hole_fill = true;
		/*
		 * Unmap the page now before we remove it from page cache below.
		 * The page is locked so it cannot be faulted in again.
		 */
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
				    PAGE_SIZE, 0);
		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
		if (error)
			return ERR_PTR(error);
	}

	spin_lock_irq(&mapping->tree_lock);
	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
			     RADIX_DAX_ENTRY_LOCK);
	if (hole_fill) {
		__delete_from_page_cache(entry, NULL);
		/* Drop pagecache reference */
		put_page(entry);
		error = radix_tree_insert(page_tree, index, new_entry);
		if (error) {
			new_entry = ERR_PTR(error);
			goto unlock;
		}
		mapping->nrexceptional++;
	} else {
		void **slot;
		void *ret;

		ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
		WARN_ON_ONCE(ret != entry);
		radix_tree_replace_slot(slot, new_entry);
	}
	if (vmf->flags & FAULT_FLAG_WRITE)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	if (hole_fill) {
		radix_tree_preload_end();
		/*
		 * We don't need hole page anymore, it has been replaced with
		 * locked radix tree entry now.
		 */
		if (mapping->a_ops->freepage)
			mapping->a_ops->freepage(entry);
		unlock_page(entry);
		put_page(entry);
	}
	return new_entry;
}

static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int type = RADIX_DAX_TYPE(entry);
	struct radix_tree_node *node;
	struct blk_dax_ctl dax;
	void **slot;
	int ret = 0;

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * Regular page slots are stabilized by the page lock even
	 * without the tree itself locked.  These unlocked entries
	 * need verification under the tree lock.
	 */
	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
		goto unlock;
	if (*slot != entry)
		goto unlock;

	/* another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto unlock;

	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
		ret = -EIO;
		goto unlock;
	}

	dax.sector = RADIX_DAX_SECTOR(entry);
	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0)
		return ret;

	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	wb_cache_pmem(dax.addr, dax.size);

	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);
 unmap:
	dax_unmap_atomic(bdev, &dax);
	return ret;

 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct block_device *bdev, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	pgoff_t start_index, end_index, pmd_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;
	void *entry;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	start_index = wbc->range_start >> PAGE_SHIFT;
	end_index = wbc->range_end >> PAGE_SHIFT;
	pmd_index = DAX_PMD_INDEX(start_index);

	rcu_read_lock();
	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
	rcu_read_unlock();

	/* see if the start of our range is covered by a PMD entry */
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
		start_index = pmd_index;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
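
/*
 * Illustrative sketch (editor's example, not from this file): a filesystem
 * whose ->writepages sees a DAX mapping might simply forward to
 * dax_writeback_mapping_range() so that fsync()/msync() flush CPU caches
 * to the persistent domain.  The function name is a placeholder.
 */
static inline int example_dax_writepages(struct address_space *mapping,
					 struct writeback_control *wbc)
{
	return dax_writeback_mapping_range(mapping,
			mapping->host->i_sb->s_bdev, wbc);
}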

static int dax_insert_mapping(struct address_space *mapping,
		struct buffer_head *bh, void **entryp,
		struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	struct block_device *bdev = bh->b_bdev;
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, mapping->host),
		.size = bh->b_size,
	};
	void *ret;
	void *entry = *entryp;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	dax_unmap_atomic(bdev, &dax);

	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
	if (IS_ERR(ret))
		return PTR_ERR(ret);
	*entryp = ret;

	return vm_insert_mixed(vma, vaddr, dax.pfn);
}

/**
 * dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files. dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
	      get_block_t get_block)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	void *entry;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is supposed
	 * to hold locks serializing us with truncate / punch hole so this is
	 * a reliable test.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;

	entry = grab_mapping_entry(mapping, vmf->pgoff);
	if (IS_ERR(entry)) {
		error = PTR_ERR(entry);
		goto out;
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock_entry;

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;
		if (buffer_written(&bh))
			error = copy_user_bh(new_page, inode, &bh, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock_entry;
		if (!radix_tree_exceptional_entry(entry)) {
			vmf->page = entry;
			return VM_FAULT_LOCKED;
		}
		vmf->entry = entry;
		return VM_FAULT_DAX_LOCKED;
	}

	if (!buffer_mapped(&bh)) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock_entry;
		} else {
			return dax_load_hole(mapping, entry, vmf);
		}
	}

	/* Filesystem should not return unwritten buffers to us! */
	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
	error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
 unlock_entry:
	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;
}
EXPORT_SYMBOL_GPL(dax_fault);
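
/*
 * Illustrative sketch (editor's example, not from this file): a minimal
 * ->fault handler built on dax_fault().  'example_get_block' stands in for
 * the filesystem's real get_block_t; the lock serializing against truncate
 * that dax_fault() expects is only hinted at.
 */
static inline int example_dax_vm_fault(struct vm_area_struct *vma,
				       struct vm_fault *vmf,
				       get_block_t example_get_block)
{
	struct inode *inode = file_inode(vma->vm_file);
	int ret;

	if (vmf->flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vma->vm_file);
	}
	/* take the fs-private lock that excludes truncate here ... */
	ret = dax_fault(vma, vmf, example_get_block);
	/* ... and drop it here */
	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(inode->i_sb);
	return ret;
}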

#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
 * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below function.
 */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)

static void __dax_dbg(struct buffer_head *bh, unsigned long address,
		const char *reason, const char *fn)
{
	if (bh) {
		char bname[BDEVNAME_SIZE];
		bdevname(bh->b_bdev, bname);
		pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
			"length %zd fallback: %s\n", fn, current->comm,
			address, bname, bh->b_state, (u64)bh->b_blocknr,
			bh->b_size, reason);
	} else {
		pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
			current->comm, address, reason);
	}
}

#define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")

/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @address: The virtual address that faulted
 * @pmd: The PMD entry to install the mapping into
 * @flags: The fault flags (FAULT_FLAG_*)
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, get_block_t get_block)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct buffer_head bh;
	unsigned blkbits = inode->i_blkbits;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	struct block_device *bdev;
	pgoff_t size, pgoff;
	sector_t block;
	int result = 0;
	bool alloc = false;

	/* dax pmd mappings require pfn_t_devmap() */
	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
		return VM_FAULT_FALLBACK;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		split_huge_pmd(vma, pmd, address);
		dax_pmd_dbg(NULL, address, "cow write");
		return VM_FAULT_FALLBACK;
	}
	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start) {
		dax_pmd_dbg(NULL, address, "vma start unaligned");
		return VM_FAULT_FALLBACK;
	}
	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
		dax_pmd_dbg(NULL, address, "vma end unaligned");
		return VM_FAULT_FALLBACK;
	}

	pgoff = linear_page_index(vma, pmd_addr);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size)
		return VM_FAULT_SIGBUS;
	/* If the PMD would cover blocks out of the file */
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(NULL, address,
				"offset + huge page size > file size");
		return VM_FAULT_FALLBACK;
	}

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

	bh.b_size = PMD_SIZE;

	if (get_block(inode, block, &bh, 0) != 0)
		return VM_FAULT_SIGBUS;

	if (!buffer_mapped(&bh) && write) {
		if (get_block(inode, block, &bh, 1) != 0)
			return VM_FAULT_SIGBUS;
		alloc = true;
		WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
	}

	bdev = bh.b_bdev;

	/*
	 * If the filesystem isn't willing to tell us the length of a hole,
	 * just fall back to PTEs.  Calling get_block 512 times in a loop
	 * would be silly.
	 */
	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
		dax_pmd_dbg(&bh, address, "allocated block too small");
		return VM_FAULT_FALLBACK;
	}

	/*
	 * If we allocated new storage, make sure no process has any
	 * zero pages covering this hole
	 */
	if (alloc) {
		loff_t lstart = pgoff << PAGE_SHIFT;
		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */

		truncate_pagecache_range(inode, lstart, lend);
	}

	if (!write && !buffer_mapped(&bh)) {
		spinlock_t *ptl;
		pmd_t entry;
		struct page *zero_page = get_huge_zero_page();

		if (unlikely(!zero_page)) {
			dax_pmd_dbg(&bh, address, "no zero page");
			goto fallback;
		}

		ptl = pmd_lock(vma->vm_mm, pmd);
		if (!pmd_none(*pmd)) {
			spin_unlock(ptl);
			dax_pmd_dbg(&bh, address, "pmd already present");
			goto fallback;
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
				__func__, current->comm, address,
				(unsigned long long) to_sector(&bh, inode));

		entry = mk_pmd(zero_page, vma->vm_page_prot);
		entry = pmd_mkhuge(entry);
		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
		result = VM_FAULT_NOPAGE;
		spin_unlock(ptl);
	} else {
		struct blk_dax_ctl dax = {
			.sector = to_sector(&bh, inode),
			.size = PMD_SIZE,
		};
		long length = dax_map_atomic(bdev, &dax);

		if (length < 0) {
			dax_pmd_dbg(&bh, address, "dax-error fallback");
			goto fallback;
		}
		if (length < PMD_SIZE) {
			dax_pmd_dbg(&bh, address, "dax-length too small");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}
		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
			dax_pmd_dbg(&bh, address, "pfn unaligned");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}

		if (!pfn_t_devmap(dax.pfn)) {
			dax_unmap_atomic(bdev, &dax);
			dax_pmd_dbg(&bh, address, "pfn not in memmap");
			goto fallback;
		}
		dax_unmap_atomic(bdev, &dax);

		/*
		 * For PTE faults we insert a radix tree entry for reads, and
		 * leave it clean.  Then on the first write we dirty the radix
		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
		 * call into get_block() to translate the pgoff to a sector in
		 * order to be able to create a new radix tree entry.
		 *
		 * The PMD path doesn't have an equivalent to
		 * dax_pfn_mkwrite(), though, so for a read followed by a
		 * write we traverse all the way through dax_pmd_fault()
		 * twice.  This means we can just skip inserting a radix tree
		 * entry completely on the initial read and just wait until
		 * the write to insert a dirty entry.
		 */
		if (write) {
			/*
			 * We should insert radix-tree entry and dirty it here.
			 * For now this is broken...
			 */
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
				__func__, current->comm, address,
				pfn_t_to_pfn(dax.pfn),
				(unsigned long long) dax.sector);
		result |= vmf_insert_pfn_pmd(vma, address, pmd,
				dax.pfn, write);
	}

 out:
	return result;

 fallback:
	count_vm_event(THP_FAULT_FALLBACK);
	result = VM_FAULT_FALLBACK;
	goto out;
}
EXPORT_SYMBOL_GPL(dax_pmd_fault);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	void *entry;
	pgoff_t index = vmf->pgoff;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	if (!entry || !radix_tree_exceptional_entry(entry))
		goto out;
	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
	put_unlocked_mapping_entry(mapping, index, entry);
 out:
	spin_unlock_irq(&mapping->tree_lock);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
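
/*
 * Illustrative sketch (editor's example, not from this file): how the fault
 * helpers above are typically wired into a vm_operations_struct.  The
 * example_fs_* handlers are placeholders for filesystem wrappers that add
 * their own locking around dax_fault(), dax_pmd_fault() and
 * dax_pfn_mkwrite(); the snippet is compiled out on purpose.
 */
#if 0
static const struct vm_operations_struct example_dax_vm_ops = {
	.fault		= example_fs_dax_fault,		/* wraps dax_fault() */
	.pmd_fault	= example_fs_dax_pmd_fault,	/* wraps dax_pmd_fault() */
	.page_mkwrite	= example_fs_dax_fault,		/* write faults take the same path */
	.pfn_mkwrite	= example_fs_dax_pfn_mkwrite,	/* wraps dax_pfn_mkwrite() */
};
#endif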

static bool dax_range_is_aligned(struct block_device *bdev,
				 unsigned int offset, unsigned int length)
{
	unsigned short sector_size = bdev_logical_block_size(bdev);

	if (!IS_ALIGNED(offset, sector_size))
		return false;
	if (!IS_ALIGNED(length, sector_size))
		return false;

	return true;
}

int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
		unsigned int offset, unsigned int length)
{
	struct blk_dax_ctl dax = {
		.sector		= sector,
		.size		= PAGE_SIZE,
	};

	if (dax_range_is_aligned(bdev, offset, length)) {
		sector_t start_sector = dax.sector + (offset >> 9);

		return blkdev_issue_zeroout(bdev, start_sector,
				length >> 9, GFP_NOFS, true);
	} else {
		if (dax_map_atomic(bdev, &dax) < 0)
			return PTR_ERR(dax.addr);
		clear_pmem(dax.addr + offset, length);
		dax_unmap_atomic(bdev, &dax);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);

/**
 * dax_zero_page_range - zero a range within a page of a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @length: The number of bytes to zero
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * This function can be called by a filesystem when it is zeroing part of a
 * page in a DAX file.  This is intended for hole-punch operations.  If
 * you are truncating a file, the helper function dax_truncate_page() may be
 * more convenient.
 */
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
			get_block_t get_block)
{
	struct buffer_head bh;
	pgoff_t index = from >> PAGE_SHIFT;
	unsigned offset = from & (PAGE_SIZE-1);
	int err;

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;
	BUG_ON((offset + length) > PAGE_SIZE);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;
	err = get_block(inode, index, &bh, 0);
	if (err < 0 || !buffer_written(&bh))
		return err;

	return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
			offset, length);
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

/**
 * dax_truncate_page - handle a partial page being truncated in a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * Similar to block_truncate_page(), this function can be called by a
 * filesystem when it is truncating a DAX file to handle the partial page.
 */
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
	unsigned length = PAGE_ALIGN(from) - from;
	return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
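
/*
 * Illustrative sketch (editor's example, not from this file): a DAX-aware
 * truncate path would zero the now-partial tail page with
 * dax_truncate_page() before shrinking i_size.  'example_get_block' is a
 * stand-in for the filesystem's real get_block_t.
 */
static inline int example_dax_setsize(struct inode *inode, loff_t newsize,
				      get_block_t example_get_block)
{
	int error = dax_truncate_page(inode, newsize, example_get_block);

	if (error)
		return error;
	truncate_setsize(inode, newsize);
	return 0;
}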