/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>

int dax_clear_blocks(struct inode *inode, sector_t block, long size)
{
        struct block_device *bdev = inode->i_sb->s_bdev;
        sector_t sector = block << (inode->i_blkbits - 9);

        might_sleep();
        do {
                void __pmem *addr;
                unsigned long pfn;
                long count;

                count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
                if (count < 0)
                        return count;
                BUG_ON(size < count);
                while (count > 0) {
                        unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
                        if (pgsz > count)
                                pgsz = count;
                        clear_pmem(addr, pgsz);
                        addr += pgsz;
                        size -= pgsz;
                        count -= pgsz;
                        BUG_ON(pgsz & 511);
                        sector += pgsz / 512;
                        cond_resched();
                }
        } while (size);

        wmb_pmem();
        return 0;
}
EXPORT_SYMBOL_GPL(dax_clear_blocks);
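/*
 * Illustrative sketch (not part of this file): dax_clear_blocks() is meant
 * to be called from a filesystem's block-allocation path so that freshly
 * allocated blocks read back as zeroes before they become visible to
 * userspace.  The "myfs" names and the myfs_alloc_block() helper below are
 * hypothetical, used only to show the shape of such a caller:
 *
 *      static int myfs_get_block(struct inode *inode, sector_t iblock,
 *                      struct buffer_head *bh_result, int create)
 *      {
 *              sector_t blocknr;
 *              ...
 *              blocknr = myfs_alloc_block(inode, iblock);
 *              if (IS_DAX(inode)) {
 *                      int err = dax_clear_blocks(inode, blocknr,
 *                                                 1 << inode->i_blkbits);
 *                      if (err)
 *                              return err;
 *              }
 *              map_bh(bh_result, inode->i_sb, blocknr);
 *              ...
 *      }
 */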
static long dax_get_addr(struct buffer_head *bh, void __pmem **addr,
                unsigned blkbits)
{
        unsigned long pfn;
        sector_t sector = bh->b_blocknr << (blkbits - 9);
        return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
}

/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
                loff_t pos, loff_t end)
{
        loff_t final = end - pos + first; /* The final byte of the buffer */

        if (first > 0)
                clear_pmem(addr, first);
        if (final < size)
                clear_pmem(addr + final, size - final);
}

static bool buffer_written(struct buffer_head *bh)
{
        return buffer_mapped(bh) && !buffer_unwritten(bh);
}

/*
 * When ext4 encounters a hole, it returns without modifying the buffer_head
 * which means that we can't trust b_size.  To cope with this, we set b_state
 * to 0 before calling get_block and, if any bit is set, we know we can trust
 * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
 * and would save us time calling get_block repeatedly.
 */
static bool buffer_size_valid(struct buffer_head *bh)
{
        return bh->b_state != 0;
}

static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
                loff_t start, loff_t end, get_block_t get_block,
                struct buffer_head *bh)
{
        ssize_t retval = 0;
        loff_t pos = start;
        loff_t max = start;
        loff_t bh_max = start;
        void __pmem *addr;
        bool hole = false;
        bool need_wmb = false;

        if (iov_iter_rw(iter) != WRITE)
                end = min(end, i_size_read(inode));

        while (pos < end) {
                size_t len;
                if (pos == max) {
                        unsigned blkbits = inode->i_blkbits;
                        long page = pos >> PAGE_SHIFT;
                        sector_t block = page << (PAGE_SHIFT - blkbits);
                        unsigned first = pos - (block << blkbits);
                        long size;

                        if (pos == bh_max) {
                                bh->b_size = PAGE_ALIGN(end - pos);
                                bh->b_state = 0;
                                retval = get_block(inode, block, bh,
                                                   iov_iter_rw(iter) == WRITE);
                                if (retval)
                                        break;
                                if (!buffer_size_valid(bh))
                                        bh->b_size = 1 << blkbits;
                                bh_max = pos - first + bh->b_size;
                        } else {
                                unsigned done = bh->b_size -
                                                (bh_max - (pos - first));
                                bh->b_blocknr += done >> blkbits;
                                bh->b_size -= done;
                        }

                        hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh);
                        if (hole) {
                                addr = NULL;
                                size = bh->b_size - first;
                        } else {
                                retval = dax_get_addr(bh, &addr, blkbits);
                                if (retval < 0)
                                        break;
                                if (buffer_unwritten(bh) || buffer_new(bh)) {
                                        dax_new_buf(addr, retval, first, pos,
                                                        end);
                                        need_wmb = true;
                                }
                                addr += first;
                                size = retval - first;
                        }
                        max = min(pos + size, end);
                }

                if (iov_iter_rw(iter) == WRITE) {
                        len = copy_from_iter_pmem(addr, max - pos, iter);
                        need_wmb = true;
                } else if (!hole)
                        len = copy_to_iter((void __force *)addr, max - pos,
                                        iter);
                else
                        len = iov_iter_zero(max - pos, iter);

                if (!len)
                        break;

                pos += len;
                addr += len;
        }

        if (need_wmb)
                wmb_pmem();

        return (pos == start) ? retval : pos - start;
}

/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @pos: The file offset where the I/O starts
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
                struct iov_iter *iter, loff_t pos, get_block_t get_block,
                dio_iodone_t end_io, int flags)
{
        struct buffer_head bh;
        ssize_t retval = -EINVAL;
        loff_t end = pos + iov_iter_count(iter);

        memset(&bh, 0, sizeof(bh));

        if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
                struct address_space *mapping = inode->i_mapping;
                mutex_lock(&inode->i_mutex);
                retval = filemap_write_and_wait_range(mapping, pos, end - 1);
                if (retval) {
                        mutex_unlock(&inode->i_mutex);
                        goto out;
                }
        }

        /* Protects against truncate */
        if (!(flags & DIO_SKIP_DIO_COUNT))
                inode_dio_begin(inode);

        retval = dax_io(inode, iter, pos, end, get_block, &bh);

        if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
                mutex_unlock(&inode->i_mutex);

        if ((retval > 0) && end_io)
                end_io(iocb, pos, retval, bh.b_private);

        if (!(flags & DIO_SKIP_DIO_COUNT))
                inode_dio_end(inode);
 out:
        return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);
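/*
 * Illustrative sketch (not part of this file): a filesystem's ->direct_IO
 * method can route I/O through dax_do_io() when the inode is DAX-enabled.
 * The "myfs" names below are hypothetical; the fallback branch shows the
 * conventional blockdev_direct_IO() path for non-DAX inodes:
 *
 *      static ssize_t myfs_direct_IO(struct kiocb *iocb,
 *                      struct iov_iter *iter, loff_t offset)
 *      {
 *              struct inode *inode = file_inode(iocb->ki_filp);
 *
 *              if (IS_DAX(inode))
 *                      return dax_do_io(iocb, inode, iter, offset,
 *                                       myfs_get_block, NULL, DIO_LOCKING);
 *              return blockdev_direct_IO(iocb, inode, iter, offset,
 *                                        myfs_get_block);
 *      }
 */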
/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, struct page *page,
                        struct vm_fault *vmf)
{
        unsigned long size;
        struct inode *inode = mapping->host;
        if (!page)
                page = find_or_create_page(mapping, vmf->pgoff,
                                                GFP_KERNEL | __GFP_ZERO);
        if (!page)
                return VM_FAULT_OOM;
        /* Recheck i_size under page lock to avoid truncate race */
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (vmf->pgoff >= size) {
                unlock_page(page);
                page_cache_release(page);
                return VM_FAULT_SIGBUS;
        }

        vmf->page = page;
        return VM_FAULT_LOCKED;
}

static int copy_user_bh(struct page *to, struct buffer_head *bh,
                        unsigned blkbits, unsigned long vaddr)
{
        void __pmem *vfrom;
        void *vto;

        if (dax_get_addr(bh, &vfrom, blkbits) < 0)
                return -EIO;
        vto = kmap_atomic(to);
        copy_user_page(vto, (void __force *)vfrom, vaddr, to);
        kunmap_atomic(vto);
        return 0;
}

static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
                        struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct address_space *mapping = inode->i_mapping;
        sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
        unsigned long vaddr = (unsigned long)vmf->virtual_address;
        void __pmem *addr;
        unsigned long pfn;
        pgoff_t size;
        int error;

        i_mmap_lock_read(mapping);

        /*
         * Check truncate didn't happen while we were allocating a block.
         * If it did, this block may or may not be still allocated to the
         * file.  We can't tell the filesystem to free it because we can't
         * take i_mutex here.  In the worst case, the file still has blocks
         * allocated past the end of the file.
         */
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (unlikely(vmf->pgoff >= size)) {
                error = -EIO;
                goto out;
        }

        error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
        if (error < 0)
                goto out;
        if (error < PAGE_SIZE) {
                error = -EIO;
                goto out;
        }

        if (buffer_unwritten(bh) || buffer_new(bh)) {
                clear_pmem(addr, PAGE_SIZE);
                wmb_pmem();
        }

        error = vm_insert_mixed(vma, vaddr, pfn);

 out:
        i_mmap_unlock_read(mapping);

        return error;
}
/**
 * __dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written so the data written to them is exposed.  This is required
 *	by write faults for filesystems that will return unwritten extent
 *	mappings from @get_block, but it is optional for reads as
 *	dax_insert_mapping() will always zero unwritten blocks.  If the fs
 *	does not support unwritten extents, then it should pass NULL.
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files.  __dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                        get_block_t get_block, dax_iodone_t complete_unwritten)
{
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        struct page *page;
        struct buffer_head bh;
        unsigned long vaddr = (unsigned long)vmf->virtual_address;
        unsigned blkbits = inode->i_blkbits;
        sector_t block;
        pgoff_t size;
        int error;
        int major = 0;

        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (vmf->pgoff >= size)
                return VM_FAULT_SIGBUS;

        memset(&bh, 0, sizeof(bh));
        block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
        bh.b_size = PAGE_SIZE;

 repeat:
        page = find_get_page(mapping, vmf->pgoff);
        if (page) {
                if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
                        page_cache_release(page);
                        return VM_FAULT_RETRY;
                }
                if (unlikely(page->mapping != mapping)) {
                        unlock_page(page);
                        page_cache_release(page);
                        goto repeat;
                }
                size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
                if (unlikely(vmf->pgoff >= size)) {
                        /*
                         * We have a struct page covering a hole in the file
                         * from a read fault and we've raced with a truncate
                         */
                        error = -EIO;
                        goto unlock_page;
                }
        }

        error = get_block(inode, block, &bh, 0);
        if (!error && (bh.b_size < PAGE_SIZE))
                error = -EIO;           /* fs corruption? */
        if (error)
                goto unlock_page;

        if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
                if (vmf->flags & FAULT_FLAG_WRITE) {
                        error = get_block(inode, block, &bh, 1);
                        count_vm_event(PGMAJFAULT);
                        mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
                        major = VM_FAULT_MAJOR;
                        if (!error && (bh.b_size < PAGE_SIZE))
                                error = -EIO;
                        if (error)
                                goto unlock_page;
                } else {
                        return dax_load_hole(mapping, page, vmf);
                }
        }

        if (vmf->cow_page) {
                struct page *new_page = vmf->cow_page;
                if (buffer_written(&bh))
                        error = copy_user_bh(new_page, &bh, blkbits, vaddr);
                else
                        clear_user_highpage(new_page, vaddr);
                if (error)
                        goto unlock_page;
                vmf->page = page;
                if (!page) {
                        i_mmap_lock_read(mapping);
                        /* Check we didn't race with truncate */
                        size = (i_size_read(inode) + PAGE_SIZE - 1) >>
                                                                PAGE_SHIFT;
                        if (vmf->pgoff >= size) {
                                i_mmap_unlock_read(mapping);
                                error = -EIO;
                                goto out;
                        }
                }
                return VM_FAULT_LOCKED;
        }

        /* Check we didn't race with a read fault installing a new page */
        if (!page && major)
                page = find_lock_page(mapping, vmf->pgoff);

        if (page) {
                unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
                                                        PAGE_CACHE_SIZE, 0);
                delete_from_page_cache(page);
                unlock_page(page);
                page_cache_release(page);
        }

        /*
         * If we successfully insert the new mapping over an unwritten extent,
         * we need to ensure we convert the unwritten extent.  If there is an
         * error inserting the mapping, the filesystem needs to leave it as
         * unwritten to prevent exposure of the stale underlying data to
         * userspace, but we still need to call the completion function so
         * the private resources on the mapping buffer can be released.  We
         * indicate what the callback should do via the uptodate variable, same
         * as for normal BH based IO completions.
         */
        error = dax_insert_mapping(inode, &bh, vma, vmf);
        if (buffer_unwritten(&bh)) {
                if (complete_unwritten)
                        complete_unwritten(&bh, !error);
                else
                        WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
        }

 out:
        if (error == -ENOMEM)
                return VM_FAULT_OOM | major;
        /* -EBUSY is fine, somebody else faulted on the same PTE */
        if ((error < 0) && (error != -EBUSY))
                return VM_FAULT_SIGBUS | major;
        return VM_FAULT_NOPAGE | major;

 unlock_page:
        if (page) {
                unlock_page(page);
                page_cache_release(page);
        }
        goto out;
}
EXPORT_SYMBOL(__dax_fault);
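/*
 * Illustrative sketch (not part of this file): a filesystem that serialises
 * page faults against truncate with its own lock can call __dax_fault()
 * directly from its ->fault handler instead of using dax_fault().  The
 * "myfs" names, the MYFS_I(inode)->mmap_lock rwsem and the
 * myfs_end_io_unwritten callback below are all hypothetical:
 *
 *      static int myfs_filemap_fault(struct vm_area_struct *vma,
 *                      struct vm_fault *vmf)
 *      {
 *              struct inode *inode = file_inode(vma->vm_file);
 *              int ret;
 *
 *              if (vmf->flags & FAULT_FLAG_WRITE) {
 *                      sb_start_pagefault(inode->i_sb);
 *                      file_update_time(vma->vm_file);
 *              }
 *              down_read(&MYFS_I(inode)->mmap_lock);
 *              if (IS_DAX(inode))
 *                      ret = __dax_fault(vma, vmf, myfs_get_block,
 *                                        myfs_end_io_unwritten);
 *              else
 *                      ret = filemap_fault(vma, vmf);
 *              up_read(&MYFS_I(inode)->mmap_lock);
 *              if (vmf->flags & FAULT_FLAG_WRITE)
 *                      sb_end_pagefault(inode->i_sb);
 *
 *              return ret;
 *      }
 */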
/**
 * dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written so the data written to them is exposed.  If the fs does
 *	not support unwritten extents, then it should pass NULL.
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files.
 */
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
              get_block_t get_block, dax_iodone_t complete_unwritten)
{
        int result;
        struct super_block *sb = file_inode(vma->vm_file)->i_sb;

        if (vmf->flags & FAULT_FLAG_WRITE) {
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
        }
        result = __dax_fault(vma, vmf, get_block, complete_unwritten);
        if (vmf->flags & FAULT_FLAG_WRITE)
                sb_end_pagefault(sb);

        return result;
}
EXPORT_SYMBOL_GPL(dax_fault);
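/*
 * Illustrative sketch (not part of this file): wiring dax_fault() into a
 * vm_operations_struct.  Because @get_block must be bound, a thin wrapper
 * is used for the ->fault slot; the "myfs" names are hypothetical:
 *
 *      static int myfs_dax_fault(struct vm_area_struct *vma,
 *                      struct vm_fault *vmf)
 *      {
 *              return dax_fault(vma, vmf, myfs_get_block, NULL);
 *      }
 *
 *      static const struct vm_operations_struct myfs_dax_vm_ops = {
 *              .fault          = myfs_dax_fault,
 *      };
 *
 *      static int myfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 *      {
 *              if (!IS_DAX(file_inode(file)))
 *                      return generic_file_mmap(file, vma);
 *
 *              file_accessed(file);
 *              vma->vm_ops = &myfs_dax_vm_ops;
 *              vma->vm_flags |= VM_MIXEDMAP;
 *              return 0;
 *      }
 */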
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below function.
 */
#define PG_PMD_COLOUR   ((PMD_SIZE >> PAGE_SHIFT) - 1)

int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                pmd_t *pmd, unsigned int flags, get_block_t get_block,
                dax_iodone_t complete_unwritten)
{
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        struct buffer_head bh;
        unsigned blkbits = inode->i_blkbits;
        unsigned long pmd_addr = address & PMD_MASK;
        bool write = flags & FAULT_FLAG_WRITE;
        long length;
        void __pmem *kaddr;
        pgoff_t size, pgoff;
        sector_t block, sector;
        unsigned long pfn;
        int result = 0;

        /* Fall back to PTEs if we're going to COW */
        if (write && !(vma->vm_flags & VM_SHARED))
                return VM_FAULT_FALLBACK;
        /* If the PMD would extend outside the VMA */
        if (pmd_addr < vma->vm_start)
                return VM_FAULT_FALLBACK;
        if ((pmd_addr + PMD_SIZE) > vma->vm_end)
                return VM_FAULT_FALLBACK;

        pgoff = linear_page_index(vma, pmd_addr);
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (pgoff >= size)
                return VM_FAULT_SIGBUS;
        /* If the PMD would cover blocks out of the file */
        if ((pgoff | PG_PMD_COLOUR) >= size)
                return VM_FAULT_FALLBACK;

        memset(&bh, 0, sizeof(bh));
        block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

        bh.b_size = PMD_SIZE;
        length = get_block(inode, block, &bh, write);
        if (length)
                return VM_FAULT_SIGBUS;
        i_mmap_lock_read(mapping);

        /*
         * If the filesystem isn't willing to tell us the length of a hole,
         * just fall back to PTEs.  Calling get_block 512 times in a loop
         * would be silly.
         */
        if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
                goto fallback;

        /*
         * If we allocated new storage, make sure no process has any
         * zero pages covering this hole
         */
        if (buffer_new(&bh)) {
                i_mmap_unlock_read(mapping);
                unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
                i_mmap_lock_read(mapping);
        }

        /*
         * If a truncate happened while we were allocating blocks, we may
         * leave blocks allocated to the file that are beyond EOF.  We can't
         * take i_mutex here, so just leave them hanging; they'll be freed
         * when the file is deleted.
         */
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (pgoff >= size) {
                result = VM_FAULT_SIGBUS;
                goto out;
        }
        if ((pgoff | PG_PMD_COLOUR) >= size)
                goto fallback;

        if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
                spinlock_t *ptl;
                pmd_t entry;
                struct page *zero_page = get_huge_zero_page();

                if (unlikely(!zero_page))
                        goto fallback;

                ptl = pmd_lock(vma->vm_mm, pmd);
                if (!pmd_none(*pmd)) {
                        spin_unlock(ptl);
                        goto fallback;
                }

                entry = mk_pmd(zero_page, vma->vm_page_prot);
                entry = pmd_mkhuge(entry);
                set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
                result = VM_FAULT_NOPAGE;
                spin_unlock(ptl);
        } else {
                sector = bh.b_blocknr << (blkbits - 9);
                length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
                                                bh.b_size);
                if (length < 0) {
                        result = VM_FAULT_SIGBUS;
                        goto out;
                }
                if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
                        goto fallback;

                if (buffer_unwritten(&bh) || buffer_new(&bh)) {
                        int i;
                        for (i = 0; i < PTRS_PER_PMD; i++)
                                clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
                        wmb_pmem();
                        count_vm_event(PGMAJFAULT);
                        mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
                        result |= VM_FAULT_MAJOR;
                }

                result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
        }

 out:
        i_mmap_unlock_read(mapping);

        if (buffer_unwritten(&bh))
                complete_unwritten(&bh, !(result & VM_FAULT_ERROR));

        return result;

 fallback:
        count_vm_event(THP_FAULT_FALLBACK);
        result = VM_FAULT_FALLBACK;
        goto out;
}
EXPORT_SYMBOL_GPL(__dax_pmd_fault);
/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @address: The virtual address that faulted
 * @pmd: Pointer to the faulting PMD entry
 * @flags: The fault flags (FAULT_FLAG_*)
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written, as described for __dax_fault()
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                pmd_t *pmd, unsigned int flags, get_block_t get_block,
                dax_iodone_t complete_unwritten)
{
        int result;
        struct super_block *sb = file_inode(vma->vm_file)->i_sb;

        if (flags & FAULT_FLAG_WRITE) {
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
        }
        result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
                                complete_unwritten);
        if (flags & FAULT_FLAG_WRITE)
                sb_end_pagefault(sb);

        return result;
}
EXPORT_SYMBOL_GPL(dax_pmd_fault);
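/*
 * Illustrative sketch (not part of this file): with
 * CONFIG_TRANSPARENT_HUGEPAGE, a filesystem exposes PMD faults through the
 * ->pmd_fault method of its vm_operations_struct; a thin wrapper binds
 * @get_block and @complete_unwritten.  The "myfs" names are hypothetical:
 *
 *      static int myfs_dax_pmd_fault(struct vm_area_struct *vma,
 *                      unsigned long addr, pmd_t *pmd, unsigned int flags)
 *      {
 *              return dax_pmd_fault(vma, addr, pmd, flags, myfs_get_block,
 *                                   myfs_end_io_unwritten);
 *      }
 */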
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct super_block *sb = file_inode(vma->vm_file)->i_sb;

        sb_start_pagefault(sb);
        file_update_time(vma->vm_file);
        sb_end_pagefault(sb);
        return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
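/*
 * Illustrative sketch (not part of this file): dax_pfn_mkwrite() already has
 * the ->pfn_mkwrite signature, so it can be installed directly in a DAX
 * vm_operations_struct alongside the fault handlers sketched above.  The
 * "myfs" names are hypothetical, and ->pmd_fault is only available under
 * CONFIG_TRANSPARENT_HUGEPAGE:
 *
 *      static const struct vm_operations_struct myfs_dax_vm_ops = {
 *              .fault          = myfs_dax_fault,
 *              .pmd_fault      = myfs_dax_pmd_fault,
 *              .pfn_mkwrite    = dax_pfn_mkwrite,
 *      };
 */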
/**
 * dax_zero_page_range - zero a range within a page of a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @length: The number of bytes to zero
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * This function can be called by a filesystem when it is zeroing part of a
 * page in a DAX file.  This is intended for hole-punch operations.  If
 * you are truncating a file, the helper function dax_truncate_page() may be
 * more convenient.
 *
 * We work in terms of PAGE_CACHE_SIZE here for commonality with
 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
 * took care of disposing of the unnecessary blocks.  Even if the filesystem
 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
 * since the file might be mmapped.
 */
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
                                                        get_block_t get_block)
{
        struct buffer_head bh;
        pgoff_t index = from >> PAGE_CACHE_SHIFT;
        unsigned offset = from & (PAGE_CACHE_SIZE-1);
        int err;

        /* Block boundary? Nothing to do */
        if (!length)
                return 0;
        BUG_ON((offset + length) > PAGE_CACHE_SIZE);

        memset(&bh, 0, sizeof(bh));
        bh.b_size = PAGE_CACHE_SIZE;
        err = get_block(inode, index, &bh, 0);
        if (err < 0)
                return err;
        if (buffer_written(&bh)) {
                void __pmem *addr;
                err = dax_get_addr(&bh, &addr, inode->i_blkbits);
                if (err < 0)
                        return err;
                clear_pmem(addr + offset, length);
                wmb_pmem();
        }

        return 0;
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

/**
 * dax_truncate_page - handle a partial page being truncated in a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * Similar to block_truncate_page(), this function can be called by a
 * filesystem when it is truncating a DAX file to handle the partial page.
 *
 * We work in terms of PAGE_CACHE_SIZE here for commonality with
 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
 * took care of disposing of the unnecessary blocks.  Even if the filesystem
 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
 * since the file might be mmapped.
 */
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
        unsigned length = PAGE_CACHE_ALIGN(from) - from;
        return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
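/*
 * Illustrative sketch (not part of this file): a filesystem's truncate or
 * setattr path can use dax_truncate_page() in place of block_truncate_page()
 * for DAX inodes.  The "myfs" names below are hypothetical:
 *
 *      static int myfs_setsize(struct inode *inode, loff_t newsize)
 *      {
 *              int error;
 *
 *              if (IS_DAX(inode))
 *                      error = dax_truncate_page(inode, newsize,
 *                                                myfs_get_block);
 *              else
 *                      error = block_truncate_page(inode->i_mapping,
 *                                                  newsize, myfs_get_block);
 *              if (error)
 *                      return error;
 *              truncate_setsize(inode, newsize);
 *              ...
 *              return 0;
 *      }
 */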