/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>

int dax_clear_blocks(struct inode *inode, sector_t block, long size)
{
        struct block_device *bdev = inode->i_sb->s_bdev;
        sector_t sector = block << (inode->i_blkbits - 9);

        might_sleep();
        do {
                void __pmem *addr;
                unsigned long pfn;
                long count;

                count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
                if (count < 0)
                        return count;
                BUG_ON(size < count);
                while (count > 0) {
                        unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
                        if (pgsz > count)
                                pgsz = count;
                        clear_pmem(addr, pgsz);
                        addr += pgsz;
                        size -= pgsz;
                        count -= pgsz;
                        BUG_ON(pgsz & 511);
                        sector += pgsz / 512;
                        cond_resched();
                }
        } while (size);

        wmb_pmem();
        return 0;
}
EXPORT_SYMBOL_GPL(dax_clear_blocks);
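
/*
 * Usage note (illustrative sketch, not part of this file): a filesystem is
 * expected to zero freshly allocated blocks before exposing them via DAX,
 * typically from its get_block_t callback.  With a hypothetical allocator
 * that just allocated "nr_blocks" filesystem blocks starting at
 * "first_block", the call would look roughly like:
 *
 *      err = dax_clear_blocks(inode, first_block,
 *                             nr_blocks << inode->i_blkbits);
 */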

static long dax_get_addr(struct buffer_head *bh, void __pmem **addr,
                unsigned blkbits)
{
        unsigned long pfn;
        sector_t sector = bh->b_blocknr << (blkbits - 9);
        return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
}

/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
                loff_t pos, loff_t end)
{
        loff_t final = end - pos + first; /* The final byte of the buffer */

        if (first > 0)
                clear_pmem(addr, first);
        if (final < size)
                clear_pmem(addr + final, size - final);
}

static bool buffer_written(struct buffer_head *bh)
{
        return buffer_mapped(bh) && !buffer_unwritten(bh);
}

/*
 * When ext4 encounters a hole, it returns without modifying the buffer_head
 * which means that we can't trust b_size.  To cope with this, we set b_state
 * to 0 before calling get_block and, if any bit is set, we know we can trust
 * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
 * and would save us time calling get_block repeatedly.
 */
static bool buffer_size_valid(struct buffer_head *bh)
{
        return bh->b_state != 0;
}

static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
                loff_t start, loff_t end, get_block_t get_block,
                struct buffer_head *bh)
{
        ssize_t retval = 0;
        loff_t pos = start;
        loff_t max = start;
        loff_t bh_max = start;
        void __pmem *addr;
        bool hole = false;
        bool need_wmb = false;

        if (iov_iter_rw(iter) != WRITE)
                end = min(end, i_size_read(inode));

        while (pos < end) {
                size_t len;
                if (pos == max) {
                        unsigned blkbits = inode->i_blkbits;
                        long page = pos >> PAGE_SHIFT;
                        sector_t block = page << (PAGE_SHIFT - blkbits);
                        unsigned first = pos - (block << blkbits);
                        long size;

                        if (pos == bh_max) {
                                bh->b_size = PAGE_ALIGN(end - pos);
                                bh->b_state = 0;
                                retval = get_block(inode, block, bh,
                                                iov_iter_rw(iter) == WRITE);
                                if (retval)
                                        break;
                                if (!buffer_size_valid(bh))
                                        bh->b_size = 1 << blkbits;
                                bh_max = pos - first + bh->b_size;
                        } else {
                                unsigned done = bh->b_size -
                                                (bh_max - (pos - first));
                                bh->b_blocknr += done >> blkbits;
                                bh->b_size -= done;
                        }

                        hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh);
                        if (hole) {
                                addr = NULL;
                                size = bh->b_size - first;
                        } else {
                                retval = dax_get_addr(bh, &addr, blkbits);
                                if (retval < 0)
                                        break;
                                if (buffer_unwritten(bh) || buffer_new(bh)) {
                                        dax_new_buf(addr, retval, first, pos,
                                                                        end);
                                        need_wmb = true;
                                }
                                addr += first;
                                size = retval - first;
                        }
                        max = min(pos + size, end);
                }

                if (iov_iter_rw(iter) == WRITE) {
                        len = copy_from_iter_pmem(addr, max - pos, iter);
                        need_wmb = true;
                } else if (!hole)
                        len = copy_to_iter((void __force *)addr, max - pos,
                                        iter);
                else
                        len = iov_iter_zero(max - pos, iter);

                if (!len)
                        break;

                pos += len;
                addr += len;
        }

        if (need_wmb)
                wmb_pmem();

        return (pos == start) ? retval : pos - start;
}
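
/*
 * Worked example for the index arithmetic in dax_io() above (illustrative
 * only; assumes PAGE_SHIFT == 12 and a 1KiB block size, i.e. blkbits == 10):
 * for pos == 5120 we get page == 1, block == 1 << 2 == 4 and
 * first == 5120 - (4 << 10) == 1024.  If get_block() then maps 8KiB starting
 * at block 4 (bh->b_size == 8192), bh_max becomes pos - first + 8192 == 12288,
 * so the same buffer_head is reused (with b_blocknr/b_size advanced) until
 * the I/O cursor passes byte 12288.
 */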

/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @pos: The file offset where the I/O starts
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
                struct iov_iter *iter, loff_t pos, get_block_t get_block,
                dio_iodone_t end_io, int flags)
{
        struct buffer_head bh;
        ssize_t retval = -EINVAL;
        loff_t end = pos + iov_iter_count(iter);

        memset(&bh, 0, sizeof(bh));

        if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
                struct address_space *mapping = inode->i_mapping;
                mutex_lock(&inode->i_mutex);
                retval = filemap_write_and_wait_range(mapping, pos, end - 1);
                if (retval) {
                        mutex_unlock(&inode->i_mutex);
                        goto out;
                }
        }

        /* Protects against truncate */
        if (!(flags & DIO_SKIP_DIO_COUNT))
                inode_dio_begin(inode);

        retval = dax_io(inode, iter, pos, end, get_block, &bh);

        if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
                mutex_unlock(&inode->i_mutex);

        if ((retval > 0) && end_io)
                end_io(iocb, pos, retval, bh.b_private);

        if (!(flags & DIO_SKIP_DIO_COUNT))
                inode_dio_end(inode);
out:
        return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);
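
/*
 * Example (illustrative sketch, not part of this file): a filesystem's
 * ->direct_IO method can hand DAX I/O off to dax_do_io().
 * "example_get_block" is a hypothetical get_block_t callback:
 *
 *      static ssize_t example_direct_IO(struct kiocb *iocb,
 *                      struct iov_iter *iter, loff_t offset)
 *      {
 *              struct inode *inode = file_inode(iocb->ki_filp);
 *
 *              return dax_do_io(iocb, inode, iter, offset,
 *                               example_get_block, NULL, DIO_LOCKING);
 *      }
 */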

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, struct page *page,
                                                        struct vm_fault *vmf)
{
        unsigned long size;
        struct inode *inode = mapping->host;
        if (!page)
                page = find_or_create_page(mapping, vmf->pgoff,
                                                GFP_KERNEL | __GFP_ZERO);
        if (!page)
                return VM_FAULT_OOM;
        /* Recheck i_size under page lock to avoid truncate race */
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (vmf->pgoff >= size) {
                unlock_page(page);
                page_cache_release(page);
                return VM_FAULT_SIGBUS;
        }

        vmf->page = page;
        return VM_FAULT_LOCKED;
}

static int copy_user_bh(struct page *to, struct buffer_head *bh,
                        unsigned blkbits, unsigned long vaddr)
{
        void __pmem *vfrom;
        void *vto;

        if (dax_get_addr(bh, &vfrom, blkbits) < 0)
                return -EIO;
        vto = kmap_atomic(to);
        copy_user_page(vto, (void __force *)vfrom, vaddr, to);
        kunmap_atomic(vto);
        return 0;
}

static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
                        struct vm_area_struct *vma, struct vm_fault *vmf)
{
        sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
        unsigned long vaddr = (unsigned long)vmf->virtual_address;
        void __pmem *addr;
        unsigned long pfn;
        pgoff_t size;
        int error;

        /*
         * Check truncate didn't happen while we were allocating a block.
         * If it did, this block may or may not be still allocated to the
         * file.  We can't tell the filesystem to free it because we can't
         * take i_mutex here.  In the worst case, the file still has blocks
         * allocated past the end of the file.
         */
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (unlikely(vmf->pgoff >= size)) {
                error = -EIO;
                goto out;
        }

        error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
        if (error < 0)
                goto out;
        if (error < PAGE_SIZE) {
                error = -EIO;
                goto out;
        }

        if (buffer_unwritten(bh) || buffer_new(bh)) {
                clear_pmem(addr, PAGE_SIZE);
                wmb_pmem();
        }

        error = vm_insert_mixed(vma, vaddr, pfn);

out:
        return error;
}

/**
 * __dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written so the data written to them is exposed.  This is required
 *	by write faults for filesystems that will return unwritten extent
 *	mappings from @get_block, but it is optional for reads as
 *	dax_insert_mapping() will always zero unwritten blocks.  If the fs
 *	does not support unwritten extents, then it should pass NULL.
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files.  __dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                        get_block_t get_block, dax_iodone_t complete_unwritten)
{
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        struct page *page;
        struct buffer_head bh;
        unsigned long vaddr = (unsigned long)vmf->virtual_address;
        unsigned blkbits = inode->i_blkbits;
        sector_t block;
        pgoff_t size;
        int error;
        int major = 0;

        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (vmf->pgoff >= size)
                return VM_FAULT_SIGBUS;

        memset(&bh, 0, sizeof(bh));
        block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
        bh.b_size = PAGE_SIZE;

repeat:
        page = find_get_page(mapping, vmf->pgoff);
        if (page) {
                if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
                        page_cache_release(page);
                        return VM_FAULT_RETRY;
                }
                if (unlikely(page->mapping != mapping)) {
                        unlock_page(page);
                        page_cache_release(page);
                        goto repeat;
                }
                size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
                if (unlikely(vmf->pgoff >= size)) {
                        /*
                         * We have a struct page covering a hole in the file
                         * from a read fault and we've raced with a truncate
                         */
                        error = -EIO;
                        goto unlock;
                }
        } else {
                i_mmap_lock_write(mapping);
        }

        error = get_block(inode, block, &bh, 0);
        if (!error && (bh.b_size < PAGE_SIZE))
                error = -EIO;           /* fs corruption? */
        if (error)
                goto unlock;

        if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
                if (vmf->flags & FAULT_FLAG_WRITE) {
                        error = get_block(inode, block, &bh, 1);
                        count_vm_event(PGMAJFAULT);
                        mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
                        major = VM_FAULT_MAJOR;
                        if (!error && (bh.b_size < PAGE_SIZE))
                                error = -EIO;
                        if (error)
                                goto unlock;
                } else {
                        i_mmap_unlock_write(mapping);
                        return dax_load_hole(mapping, page, vmf);
                }
        }

        if (vmf->cow_page) {
                struct page *new_page = vmf->cow_page;
                if (buffer_written(&bh))
                        error = copy_user_bh(new_page, &bh, blkbits, vaddr);
                else
                        clear_user_highpage(new_page, vaddr);
                if (error)
                        goto unlock;
                vmf->page = page;
                if (!page) {
                        /* Check we didn't race with truncate */
                        size = (i_size_read(inode) + PAGE_SIZE - 1) >>
                                                                PAGE_SHIFT;
                        if (vmf->pgoff >= size) {
                                error = -EIO;
                                goto unlock;
                        }
                }
                return VM_FAULT_LOCKED;
        }

        /* Check we didn't race with a read fault installing a new page */
        if (!page && major)
                page = find_lock_page(mapping, vmf->pgoff);

        if (page) {
                unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
                                                        PAGE_CACHE_SIZE, 0);
                delete_from_page_cache(page);
                unlock_page(page);
                page_cache_release(page);
        }

        /*
         * If we successfully insert the new mapping over an unwritten extent,
         * we need to ensure we convert the unwritten extent.  If there is an
         * error inserting the mapping, the filesystem needs to leave it as
         * unwritten to prevent exposure of the stale underlying data to
         * userspace, but we still need to call the completion function so
         * the private resources on the mapping buffer can be released.  We
         * indicate what the callback should do via the uptodate variable,
         * same as for normal BH based IO completions.
         */
        error = dax_insert_mapping(inode, &bh, vma, vmf);
        if (buffer_unwritten(&bh)) {
                if (complete_unwritten)
                        complete_unwritten(&bh, !error);
                else
                        WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
        }

        if (!page)
                i_mmap_unlock_write(mapping);
out:
        if (error == -ENOMEM)
                return VM_FAULT_OOM | major;
        /* -EBUSY is fine, somebody else faulted on the same PTE */
        if ((error < 0) && (error != -EBUSY))
                return VM_FAULT_SIGBUS | major;
        return VM_FAULT_NOPAGE | major;

unlock:
        if (page) {
                unlock_page(page);
                page_cache_release(page);
        } else {
                i_mmap_unlock_write(mapping);
        }

        goto out;
}
EXPORT_SYMBOL(__dax_fault);
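
/*
 * Example (illustrative sketch, not from this file): a filesystem that
 * manages its own mmap locking can call __dax_fault() directly from its
 * ->fault handler.  "example_lock"/"example_unlock" and "example_get_block"
 * are hypothetical; a real handler would also take care of the freeze and
 * timestamp handling that dax_fault() below does for write faults:
 *
 *      static int example_filemap_fault(struct vm_area_struct *vma,
 *                                       struct vm_fault *vmf)
 *      {
 *              struct inode *inode = file_inode(vma->vm_file);
 *              int ret;
 *
 *              example_lock(inode);
 *              ret = __dax_fault(vma, vmf, example_get_block, NULL);
 *              example_unlock(inode);
 *
 *              return ret;
 *      }
 */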

/**
 * dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten
 *	extents to written ones after a write fault (may be NULL if the
 *	filesystem never returns unwritten mappings from @get_block)
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files.
 */
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
              get_block_t get_block, dax_iodone_t complete_unwritten)
{
        int result;
        struct super_block *sb = file_inode(vma->vm_file)->i_sb;

        if (vmf->flags & FAULT_FLAG_WRITE) {
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
        }
        result = __dax_fault(vma, vmf, get_block, complete_unwritten);
        if (vmf->flags & FAULT_FLAG_WRITE)
                sb_end_pagefault(sb);

        return result;
}
EXPORT_SYMBOL_GPL(dax_fault);
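
/*
 * Example (illustrative sketch, not from this file): a filesystem that is
 * happy with the locking done here can forward its ->fault handler straight
 * to dax_fault().  "example_get_block", "example_dax_vm_ops" and the other
 * example_* names are hypothetical; the ->mmap method is expected to set
 * VM_MIXEDMAP so that vm_insert_mixed() (used by dax_insert_mapping()) is
 * allowed on the VMA:
 *
 *      static int example_dax_vm_fault(struct vm_area_struct *vma,
 *                                      struct vm_fault *vmf)
 *      {
 *              return dax_fault(vma, vmf, example_get_block, NULL);
 *      }
 *
 *      static const struct vm_operations_struct example_dax_vm_ops = {
 *              .fault  = example_dax_vm_fault,
 *      };
 *
 *      static int example_file_mmap(struct file *file,
 *                                   struct vm_area_struct *vma)
 *      {
 *              if (!IS_DAX(file_inode(file)))
 *                      return generic_file_mmap(file, vma);
 *              file_accessed(file);
 *              vma->vm_ops = &example_dax_vm_ops;
 *              vma->vm_flags |= VM_MIXEDMAP;
 *              return 0;
 *      }
 */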

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below function.
 */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)

int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                pmd_t *pmd, unsigned int flags, get_block_t get_block,
                dax_iodone_t complete_unwritten)
{
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        struct buffer_head bh;
        unsigned blkbits = inode->i_blkbits;
        unsigned long pmd_addr = address & PMD_MASK;
        bool write = flags & FAULT_FLAG_WRITE;
        long length;
        void __pmem *kaddr;
        pgoff_t size, pgoff;
        sector_t block, sector;
        unsigned long pfn;
        int result = 0;

        /* Fall back to PTEs if we're going to COW */
        if (write && !(vma->vm_flags & VM_SHARED))
                return VM_FAULT_FALLBACK;
        /* If the PMD would extend outside the VMA */
        if (pmd_addr < vma->vm_start)
                return VM_FAULT_FALLBACK;
        if ((pmd_addr + PMD_SIZE) > vma->vm_end)
                return VM_FAULT_FALLBACK;

        pgoff = linear_page_index(vma, pmd_addr);
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (pgoff >= size)
                return VM_FAULT_SIGBUS;
        /* If the PMD would cover blocks out of the file */
        if ((pgoff | PG_PMD_COLOUR) >= size)
                return VM_FAULT_FALLBACK;

        memset(&bh, 0, sizeof(bh));
        block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

        bh.b_size = PMD_SIZE;
        i_mmap_lock_write(mapping);
        length = get_block(inode, block, &bh, write);
        if (length) {
                /* Don't leak the i_mmap lock on the error path */
                result = VM_FAULT_SIGBUS;
                goto out;
        }

        /*
         * If the filesystem isn't willing to tell us the length of a hole,
         * just fall back to PTEs.  Calling get_block 512 times in a loop
         * would be silly.
         */
        if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
                goto fallback;

        /*
         * If we allocated new storage, make sure no process has any
         * zero pages covering this hole
         */
        if (buffer_new(&bh)) {
                i_mmap_unlock_write(mapping);
                unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
                i_mmap_lock_write(mapping);
        }

        /*
         * If a truncate happened while we were allocating blocks, we may
         * leave blocks allocated to the file that are beyond EOF.  We can't
         * take i_mutex here, so just leave them hanging; they'll be freed
         * when the file is deleted.
         */
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (pgoff >= size) {
                result = VM_FAULT_SIGBUS;
                goto out;
        }
        if ((pgoff | PG_PMD_COLOUR) >= size)
                goto fallback;

        if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
                spinlock_t *ptl;
                pmd_t entry;
                struct page *zero_page = get_huge_zero_page();

                if (unlikely(!zero_page))
                        goto fallback;

                ptl = pmd_lock(vma->vm_mm, pmd);
                if (!pmd_none(*pmd)) {
                        spin_unlock(ptl);
                        goto fallback;
                }

                entry = mk_pmd(zero_page, vma->vm_page_prot);
                entry = pmd_mkhuge(entry);
                set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
                result = VM_FAULT_NOPAGE;
                spin_unlock(ptl);
        } else {
                sector = bh.b_blocknr << (blkbits - 9);
                length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
                                                bh.b_size);
                if (length < 0) {
                        result = VM_FAULT_SIGBUS;
                        goto out;
                }
                if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
                        goto fallback;

                /*
                 * Zero new or unwritten blocks only once kaddr is valid,
                 * i.e. after bdev_direct_access() has succeeded.
                 */
                if (buffer_unwritten(&bh) || buffer_new(&bh)) {
                        clear_pmem(kaddr, PMD_SIZE);
                        wmb_pmem();
                        count_vm_event(PGMAJFAULT);
                        mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
                        result |= VM_FAULT_MAJOR;
                }

                result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
        }

out:
        if (buffer_unwritten(&bh))
                complete_unwritten(&bh, !(result & VM_FAULT_ERROR));

        i_mmap_unlock_write(mapping);

        return result;

fallback:
        count_vm_event(THP_FAULT_FALLBACK);
        result = VM_FAULT_FALLBACK;
        goto out;
}
EXPORT_SYMBOL_GPL(__dax_pmd_fault);

/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @address: The virtual address where the fault occurred
 * @pmd: The PMD entry that needs to be filled in
 * @flags: The fault flags, as passed to the pmd_fault handler
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten
 *	extents to written ones (may be NULL, as for dax_fault())
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                        pmd_t *pmd, unsigned int flags, get_block_t get_block,
                        dax_iodone_t complete_unwritten)
{
        int result;
        struct super_block *sb = file_inode(vma->vm_file)->i_sb;

        if (flags & FAULT_FLAG_WRITE) {
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
        }
        result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
                                complete_unwritten);
        if (flags & FAULT_FLAG_WRITE)
                sb_end_pagefault(sb);

        return result;
}
EXPORT_SYMBOL_GPL(dax_pmd_fault);
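
/*
 * Example (illustrative sketch, not from this file): a filesystem that wants
 * huge page mappings can forward its ->pmd_fault handler to dax_pmd_fault().
 * "example_get_block" is hypothetical:
 *
 *      static int example_dax_pmd_fault(struct vm_area_struct *vma,
 *                      unsigned long addr, pmd_t *pmd, unsigned int flags)
 *      {
 *              return dax_pmd_fault(vma, addr, pmd, flags,
 *                                   example_get_block, NULL);
 *      }
 */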
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct super_block *sb = file_inode(vma->vm_file)->i_sb;

        sb_start_pagefault(sb);
        file_update_time(vma->vm_file);
        sb_end_pagefault(sb);
        return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
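
/*
 * Example (illustrative sketch, not from this file): dax_pfn_mkwrite() is
 * meant to be wired up as (or called from) a ->pfn_mkwrite handler alongside
 * a DAX-aware ->fault handler, e.g.:
 *
 *      static const struct vm_operations_struct example_dax_vm_ops = {
 *              .fault          = example_dax_vm_fault,
 *              .pfn_mkwrite    = dax_pfn_mkwrite,
 *      };
 *
 * where "example_dax_vm_fault" is the hypothetical wrapper sketched after
 * dax_fault() above.
 */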

/**
 * dax_zero_page_range - zero a range within a page of a DAX file
 * @inode: The file being truncated
 * @from: The file offset where the range to zero begins
 * @length: The number of bytes to zero
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * This function can be called by a filesystem when it is zeroing part of a
 * page in a DAX file.  This is intended for hole-punch operations.  If
 * you are truncating a file, the helper function dax_truncate_page() may be
 * more convenient.
 *
 * We work in terms of PAGE_CACHE_SIZE here for commonality with
 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
 * took care of disposing of the unnecessary blocks.  Even if the filesystem
 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
 * since the file might be mmapped.
 */
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
                                                        get_block_t get_block)
{
        struct buffer_head bh;
        pgoff_t index = from >> PAGE_CACHE_SHIFT;
        unsigned offset = from & (PAGE_CACHE_SIZE-1);
        int err;

        /* Block boundary? Nothing to do */
        if (!length)
                return 0;
        BUG_ON((offset + length) > PAGE_CACHE_SIZE);

        memset(&bh, 0, sizeof(bh));
        bh.b_size = PAGE_CACHE_SIZE;
        err = get_block(inode, index, &bh, 0);
        if (err < 0)
                return err;
        if (buffer_written(&bh)) {
                void __pmem *addr;
                err = dax_get_addr(&bh, &addr, inode->i_blkbits);
                if (err < 0)
                        return err;
                clear_pmem(addr + offset, length);
                wmb_pmem();
        }

        return 0;
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

/**
 * dax_truncate_page - handle a partial page being truncated in a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * Similar to block_truncate_page(), this function can be called by a
 * filesystem when it is truncating a DAX file to handle the partial page.
 *
 * We work in terms of PAGE_CACHE_SIZE here for commonality with
 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
 * took care of disposing of the unnecessary blocks.  Even if the filesystem
 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
 * since the file might be mmapped.
 */
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
        unsigned length = PAGE_CACHE_ALIGN(from) - from;
        return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
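
/*
 * Example (illustrative sketch, not from this file): a filesystem's truncate
 * path typically zeroes the tail of the last page before shrinking i_size,
 * roughly like the fragment below; "example_get_block" and "newsize" are
 * hypothetical:
 *
 *	if (IS_DAX(inode))
 *		error = dax_truncate_page(inode, newsize, example_get_block);
 *	else
 *		error = block_truncate_page(inode->i_mapping, newsize,
 *					    example_get_block);
 */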