1 /* 2 * fs/dax.c - Direct Access filesystem code 3 * Copyright (c) 2013-2014 Intel Corporation 4 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> 5 * Author: Ross Zwisler <ross.zwisler@linux.intel.com> 6 * 7 * This program is free software; you can redistribute it and/or modify it 8 * under the terms and conditions of the GNU General Public License, 9 * version 2, as published by the Free Software Foundation. 10 * 11 * This program is distributed in the hope it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 14 * more details. 15 */ 16 17 #include <linux/atomic.h> 18 #include <linux/blkdev.h> 19 #include <linux/buffer_head.h> 20 #include <linux/dax.h> 21 #include <linux/fs.h> 22 #include <linux/genhd.h> 23 #include <linux/highmem.h> 24 #include <linux/memcontrol.h> 25 #include <linux/mm.h> 26 #include <linux/mutex.h> 27 #include <linux/pmem.h> 28 #include <linux/sched.h> 29 #include <linux/uio.h> 30 #include <linux/vmstat.h> 31 32 int dax_clear_blocks(struct inode *inode, sector_t block, long size) 33 { 34 struct block_device *bdev = inode->i_sb->s_bdev; 35 sector_t sector = block << (inode->i_blkbits - 9); 36 37 might_sleep(); 38 do { 39 void __pmem *addr; 40 unsigned long pfn; 41 long count; 42 43 count = bdev_direct_access(bdev, sector, &addr, &pfn, size); 44 if (count < 0) 45 return count; 46 BUG_ON(size < count); 47 while (count > 0) { 48 unsigned pgsz = PAGE_SIZE - offset_in_page(addr); 49 if (pgsz > count) 50 pgsz = count; 51 clear_pmem(addr, pgsz); 52 addr += pgsz; 53 size -= pgsz; 54 count -= pgsz; 55 BUG_ON(pgsz & 511); 56 sector += pgsz / 512; 57 cond_resched(); 58 } 59 } while (size); 60 61 wmb_pmem(); 62 return 0; 63 } 64 EXPORT_SYMBOL_GPL(dax_clear_blocks); 65 66 static long dax_get_addr(struct buffer_head *bh, void __pmem **addr, 67 unsigned blkbits) 68 { 69 unsigned long pfn; 70 sector_t sector = bh->b_blocknr << (blkbits - 9); 71 return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); 72 } 73 74 /* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ 75 static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, 76 loff_t pos, loff_t end) 77 { 78 loff_t final = end - pos + first; /* The final byte of the buffer */ 79 80 if (first > 0) 81 clear_pmem(addr, first); 82 if (final < size) 83 clear_pmem(addr + final, size - final); 84 } 85 86 static bool buffer_written(struct buffer_head *bh) 87 { 88 return buffer_mapped(bh) && !buffer_unwritten(bh); 89 } 90 91 /* 92 * When ext4 encounters a hole, it returns without modifying the buffer_head 93 * which means that we can't trust b_size. To cope with this, we set b_state 94 * to 0 before calling get_block and, if any bit is set, we know we can trust 95 * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is 96 * and would save us time calling get_block repeatedly. 97 */ 98 static bool buffer_size_valid(struct buffer_head *bh) 99 { 100 return bh->b_state != 0; 101 } 102 103 static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, 104 loff_t start, loff_t end, get_block_t get_block, 105 struct buffer_head *bh) 106 { 107 ssize_t retval = 0; 108 loff_t pos = start; 109 loff_t max = start; 110 loff_t bh_max = start; 111 void __pmem *addr; 112 bool hole = false; 113 bool need_wmb = false; 114 115 if (iov_iter_rw(iter) != WRITE) 116 end = min(end, i_size_read(inode)); 117 118 while (pos < end) { 119 size_t len; 120 if (pos == max) { 121 unsigned blkbits = inode->i_blkbits; 122 sector_t block = pos >> blkbits; 123 unsigned first = pos - (block << blkbits); 124 long size; 125 126 if (pos == bh_max) { 127 bh->b_size = PAGE_ALIGN(end - pos); 128 bh->b_state = 0; 129 retval = get_block(inode, block, bh, 130 iov_iter_rw(iter) == WRITE); 131 if (retval) 132 break; 133 if (!buffer_size_valid(bh)) 134 bh->b_size = 1 << blkbits; 135 bh_max = pos - first + bh->b_size; 136 } else { 137 unsigned done = bh->b_size - 138 (bh_max - (pos - first)); 139 bh->b_blocknr += done >> blkbits; 140 bh->b_size -= done; 141 } 142 143 hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh); 144 if (hole) { 145 addr = NULL; 146 size = bh->b_size - first; 147 } else { 148 retval = dax_get_addr(bh, &addr, blkbits); 149 if (retval < 0) 150 break; 151 if (buffer_unwritten(bh) || buffer_new(bh)) { 152 dax_new_buf(addr, retval, first, pos, 153 end); 154 need_wmb = true; 155 } 156 addr += first; 157 size = retval - first; 158 } 159 max = min(pos + size, end); 160 } 161 162 if (iov_iter_rw(iter) == WRITE) { 163 len = copy_from_iter_pmem(addr, max - pos, iter); 164 need_wmb = true; 165 } else if (!hole) 166 len = copy_to_iter((void __force *)addr, max - pos, 167 iter); 168 else 169 len = iov_iter_zero(max - pos, iter); 170 171 if (!len) 172 break; 173 174 pos += len; 175 addr += len; 176 } 177 178 if (need_wmb) 179 wmb_pmem(); 180 181 return (pos == start) ? retval : pos - start; 182 } 183 184 /** 185 * dax_do_io - Perform I/O to a DAX file 186 * @iocb: The control block for this I/O 187 * @inode: The file which the I/O is directed at 188 * @iter: The addresses to do I/O from or to 189 * @pos: The file offset where the I/O starts 190 * @get_block: The filesystem method used to translate file offsets to blocks 191 * @end_io: A filesystem callback for I/O completion 192 * @flags: See below 193 * 194 * This function uses the same locking scheme as do_blockdev_direct_IO: 195 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the 196 * caller for writes. For reads, we take and release the i_mutex ourselves. 197 * If DIO_LOCKING is not set, the filesystem takes care of its own locking. 198 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O 199 * is in progress. 200 */ 201 ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, 202 struct iov_iter *iter, loff_t pos, get_block_t get_block, 203 dio_iodone_t end_io, int flags) 204 { 205 struct buffer_head bh; 206 ssize_t retval = -EINVAL; 207 loff_t end = pos + iov_iter_count(iter); 208 209 memset(&bh, 0, sizeof(bh)); 210 211 if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { 212 struct address_space *mapping = inode->i_mapping; 213 mutex_lock(&inode->i_mutex); 214 retval = filemap_write_and_wait_range(mapping, pos, end - 1); 215 if (retval) { 216 mutex_unlock(&inode->i_mutex); 217 goto out; 218 } 219 } 220 221 /* Protects against truncate */ 222 if (!(flags & DIO_SKIP_DIO_COUNT)) 223 inode_dio_begin(inode); 224 225 retval = dax_io(inode, iter, pos, end, get_block, &bh); 226 227 if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) 228 mutex_unlock(&inode->i_mutex); 229 230 if ((retval > 0) && end_io) 231 end_io(iocb, pos, retval, bh.b_private); 232 233 if (!(flags & DIO_SKIP_DIO_COUNT)) 234 inode_dio_end(inode); 235 out: 236 return retval; 237 } 238 EXPORT_SYMBOL_GPL(dax_do_io); 239 240 /* 241 * The user has performed a load from a hole in the file. Allocating 242 * a new page in the file would cause excessive storage usage for 243 * workloads with sparse files. We allocate a page cache page instead. 244 * We'll kick it out of the page cache if it's ever written to, 245 * otherwise it will simply fall out of the page cache under memory 246 * pressure without ever having been dirtied. 247 */ 248 static int dax_load_hole(struct address_space *mapping, struct page *page, 249 struct vm_fault *vmf) 250 { 251 unsigned long size; 252 struct inode *inode = mapping->host; 253 if (!page) 254 page = find_or_create_page(mapping, vmf->pgoff, 255 GFP_KERNEL | __GFP_ZERO); 256 if (!page) 257 return VM_FAULT_OOM; 258 /* Recheck i_size under page lock to avoid truncate race */ 259 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 260 if (vmf->pgoff >= size) { 261 unlock_page(page); 262 page_cache_release(page); 263 return VM_FAULT_SIGBUS; 264 } 265 266 vmf->page = page; 267 return VM_FAULT_LOCKED; 268 } 269 270 static int copy_user_bh(struct page *to, struct buffer_head *bh, 271 unsigned blkbits, unsigned long vaddr) 272 { 273 void __pmem *vfrom; 274 void *vto; 275 276 if (dax_get_addr(bh, &vfrom, blkbits) < 0) 277 return -EIO; 278 vto = kmap_atomic(to); 279 copy_user_page(vto, (void __force *)vfrom, vaddr, to); 280 kunmap_atomic(vto); 281 return 0; 282 } 283 284 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, 285 struct vm_area_struct *vma, struct vm_fault *vmf) 286 { 287 sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); 288 unsigned long vaddr = (unsigned long)vmf->virtual_address; 289 void __pmem *addr; 290 unsigned long pfn; 291 pgoff_t size; 292 int error; 293 294 /* 295 * Check truncate didn't happen while we were allocating a block. 296 * If it did, this block may or may not be still allocated to the 297 * file. We can't tell the filesystem to free it because we can't 298 * take i_mutex here. In the worst case, the file still has blocks 299 * allocated past the end of the file. 300 */ 301 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 302 if (unlikely(vmf->pgoff >= size)) { 303 error = -EIO; 304 goto out; 305 } 306 307 error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size); 308 if (error < 0) 309 goto out; 310 if (error < PAGE_SIZE) { 311 error = -EIO; 312 goto out; 313 } 314 315 if (buffer_unwritten(bh) || buffer_new(bh)) { 316 clear_pmem(addr, PAGE_SIZE); 317 wmb_pmem(); 318 } 319 320 error = vm_insert_mixed(vma, vaddr, pfn); 321 322 out: 323 return error; 324 } 325 326 /** 327 * __dax_fault - handle a page fault on a DAX file 328 * @vma: The virtual memory area where the fault occurred 329 * @vmf: The description of the fault 330 * @get_block: The filesystem method used to translate file offsets to blocks 331 * @complete_unwritten: The filesystem method used to convert unwritten blocks 332 * to written so the data written to them is exposed. This is required for 333 * required by write faults for filesystems that will return unwritten 334 * extent mappings from @get_block, but it is optional for reads as 335 * dax_insert_mapping() will always zero unwritten blocks. If the fs does 336 * not support unwritten extents, the it should pass NULL. 337 * 338 * When a page fault occurs, filesystems may call this helper in their 339 * fault handler for DAX files. __dax_fault() assumes the caller has done all 340 * the necessary locking for the page fault to proceed successfully. 341 */ 342 int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 343 get_block_t get_block, dax_iodone_t complete_unwritten) 344 { 345 struct file *file = vma->vm_file; 346 struct address_space *mapping = file->f_mapping; 347 struct inode *inode = mapping->host; 348 struct page *page; 349 struct buffer_head bh; 350 unsigned long vaddr = (unsigned long)vmf->virtual_address; 351 unsigned blkbits = inode->i_blkbits; 352 sector_t block; 353 pgoff_t size; 354 int error; 355 int major = 0; 356 357 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 358 if (vmf->pgoff >= size) 359 return VM_FAULT_SIGBUS; 360 361 memset(&bh, 0, sizeof(bh)); 362 block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); 363 bh.b_size = PAGE_SIZE; 364 365 repeat: 366 page = find_get_page(mapping, vmf->pgoff); 367 if (page) { 368 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { 369 page_cache_release(page); 370 return VM_FAULT_RETRY; 371 } 372 if (unlikely(page->mapping != mapping)) { 373 unlock_page(page); 374 page_cache_release(page); 375 goto repeat; 376 } 377 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 378 if (unlikely(vmf->pgoff >= size)) { 379 /* 380 * We have a struct page covering a hole in the file 381 * from a read fault and we've raced with a truncate 382 */ 383 error = -EIO; 384 goto unlock; 385 } 386 } else { 387 i_mmap_lock_write(mapping); 388 } 389 390 error = get_block(inode, block, &bh, 0); 391 if (!error && (bh.b_size < PAGE_SIZE)) 392 error = -EIO; /* fs corruption? */ 393 if (error) 394 goto unlock; 395 396 if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { 397 if (vmf->flags & FAULT_FLAG_WRITE) { 398 error = get_block(inode, block, &bh, 1); 399 count_vm_event(PGMAJFAULT); 400 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 401 major = VM_FAULT_MAJOR; 402 if (!error && (bh.b_size < PAGE_SIZE)) 403 error = -EIO; 404 if (error) 405 goto unlock; 406 } else { 407 i_mmap_unlock_write(mapping); 408 return dax_load_hole(mapping, page, vmf); 409 } 410 } 411 412 if (vmf->cow_page) { 413 struct page *new_page = vmf->cow_page; 414 if (buffer_written(&bh)) 415 error = copy_user_bh(new_page, &bh, blkbits, vaddr); 416 else 417 clear_user_highpage(new_page, vaddr); 418 if (error) 419 goto unlock; 420 vmf->page = page; 421 if (!page) { 422 /* Check we didn't race with truncate */ 423 size = (i_size_read(inode) + PAGE_SIZE - 1) >> 424 PAGE_SHIFT; 425 if (vmf->pgoff >= size) { 426 error = -EIO; 427 goto unlock; 428 } 429 } 430 return VM_FAULT_LOCKED; 431 } 432 433 /* Check we didn't race with a read fault installing a new page */ 434 if (!page && major) 435 page = find_lock_page(mapping, vmf->pgoff); 436 437 if (page) { 438 unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, 439 PAGE_CACHE_SIZE, 0); 440 delete_from_page_cache(page); 441 unlock_page(page); 442 page_cache_release(page); 443 } 444 445 /* 446 * If we successfully insert the new mapping over an unwritten extent, 447 * we need to ensure we convert the unwritten extent. If there is an 448 * error inserting the mapping, the filesystem needs to leave it as 449 * unwritten to prevent exposure of the stale underlying data to 450 * userspace, but we still need to call the completion function so 451 * the private resources on the mapping buffer can be released. We 452 * indicate what the callback should do via the uptodate variable, same 453 * as for normal BH based IO completions. 454 */ 455 error = dax_insert_mapping(inode, &bh, vma, vmf); 456 if (buffer_unwritten(&bh)) { 457 if (complete_unwritten) 458 complete_unwritten(&bh, !error); 459 else 460 WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); 461 } 462 463 if (!page) 464 i_mmap_unlock_write(mapping); 465 out: 466 if (error == -ENOMEM) 467 return VM_FAULT_OOM | major; 468 /* -EBUSY is fine, somebody else faulted on the same PTE */ 469 if ((error < 0) && (error != -EBUSY)) 470 return VM_FAULT_SIGBUS | major; 471 return VM_FAULT_NOPAGE | major; 472 473 unlock: 474 if (page) { 475 unlock_page(page); 476 page_cache_release(page); 477 } else { 478 i_mmap_unlock_write(mapping); 479 } 480 481 goto out; 482 } 483 EXPORT_SYMBOL(__dax_fault); 484 485 /** 486 * dax_fault - handle a page fault on a DAX file 487 * @vma: The virtual memory area where the fault occurred 488 * @vmf: The description of the fault 489 * @get_block: The filesystem method used to translate file offsets to blocks 490 * 491 * When a page fault occurs, filesystems may call this helper in their 492 * fault handler for DAX files. 493 */ 494 int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 495 get_block_t get_block, dax_iodone_t complete_unwritten) 496 { 497 int result; 498 struct super_block *sb = file_inode(vma->vm_file)->i_sb; 499 500 if (vmf->flags & FAULT_FLAG_WRITE) { 501 sb_start_pagefault(sb); 502 file_update_time(vma->vm_file); 503 } 504 result = __dax_fault(vma, vmf, get_block, complete_unwritten); 505 if (vmf->flags & FAULT_FLAG_WRITE) 506 sb_end_pagefault(sb); 507 508 return result; 509 } 510 EXPORT_SYMBOL_GPL(dax_fault); 511 512 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 513 /* 514 * The 'colour' (ie low bits) within a PMD of a page offset. This comes up 515 * more often than one might expect in the below function. 516 */ 517 #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) 518 519 int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, 520 pmd_t *pmd, unsigned int flags, get_block_t get_block, 521 dax_iodone_t complete_unwritten) 522 { 523 struct file *file = vma->vm_file; 524 struct address_space *mapping = file->f_mapping; 525 struct inode *inode = mapping->host; 526 struct buffer_head bh; 527 unsigned blkbits = inode->i_blkbits; 528 unsigned long pmd_addr = address & PMD_MASK; 529 bool write = flags & FAULT_FLAG_WRITE; 530 long length; 531 void __pmem *kaddr; 532 pgoff_t size, pgoff; 533 sector_t block, sector; 534 unsigned long pfn; 535 int result = 0; 536 537 /* Fall back to PTEs if we're going to COW */ 538 if (write && !(vma->vm_flags & VM_SHARED)) 539 return VM_FAULT_FALLBACK; 540 /* If the PMD would extend outside the VMA */ 541 if (pmd_addr < vma->vm_start) 542 return VM_FAULT_FALLBACK; 543 if ((pmd_addr + PMD_SIZE) > vma->vm_end) 544 return VM_FAULT_FALLBACK; 545 546 pgoff = linear_page_index(vma, pmd_addr); 547 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 548 if (pgoff >= size) 549 return VM_FAULT_SIGBUS; 550 /* If the PMD would cover blocks out of the file */ 551 if ((pgoff | PG_PMD_COLOUR) >= size) 552 return VM_FAULT_FALLBACK; 553 554 memset(&bh, 0, sizeof(bh)); 555 block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); 556 557 bh.b_size = PMD_SIZE; 558 i_mmap_lock_write(mapping); 559 length = get_block(inode, block, &bh, write); 560 if (length) 561 return VM_FAULT_SIGBUS; 562 563 /* 564 * If the filesystem isn't willing to tell us the length of a hole, 565 * just fall back to PTEs. Calling get_block 512 times in a loop 566 * would be silly. 567 */ 568 if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) 569 goto fallback; 570 571 if (buffer_unwritten(&bh) || buffer_new(&bh)) { 572 int i; 573 for (i = 0; i < PTRS_PER_PMD; i++) 574 clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE); 575 wmb_pmem(); 576 count_vm_event(PGMAJFAULT); 577 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 578 result |= VM_FAULT_MAJOR; 579 } 580 581 /* 582 * If we allocated new storage, make sure no process has any 583 * zero pages covering this hole 584 */ 585 if (buffer_new(&bh)) { 586 i_mmap_unlock_write(mapping); 587 unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0); 588 i_mmap_lock_write(mapping); 589 } 590 591 /* 592 * If a truncate happened while we were allocating blocks, we may 593 * leave blocks allocated to the file that are beyond EOF. We can't 594 * take i_mutex here, so just leave them hanging; they'll be freed 595 * when the file is deleted. 596 */ 597 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 598 if (pgoff >= size) { 599 result = VM_FAULT_SIGBUS; 600 goto out; 601 } 602 if ((pgoff | PG_PMD_COLOUR) >= size) 603 goto fallback; 604 605 if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { 606 spinlock_t *ptl; 607 pmd_t entry; 608 struct page *zero_page = get_huge_zero_page(); 609 610 if (unlikely(!zero_page)) 611 goto fallback; 612 613 ptl = pmd_lock(vma->vm_mm, pmd); 614 if (!pmd_none(*pmd)) { 615 spin_unlock(ptl); 616 goto fallback; 617 } 618 619 entry = mk_pmd(zero_page, vma->vm_page_prot); 620 entry = pmd_mkhuge(entry); 621 set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); 622 result = VM_FAULT_NOPAGE; 623 spin_unlock(ptl); 624 } else { 625 sector = bh.b_blocknr << (blkbits - 9); 626 length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn, 627 bh.b_size); 628 if (length < 0) { 629 result = VM_FAULT_SIGBUS; 630 goto out; 631 } 632 if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR)) 633 goto fallback; 634 635 result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write); 636 } 637 638 out: 639 if (buffer_unwritten(&bh)) 640 complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); 641 642 i_mmap_unlock_write(mapping); 643 644 return result; 645 646 fallback: 647 count_vm_event(THP_FAULT_FALLBACK); 648 result = VM_FAULT_FALLBACK; 649 goto out; 650 } 651 EXPORT_SYMBOL_GPL(__dax_pmd_fault); 652 653 /** 654 * dax_pmd_fault - handle a PMD fault on a DAX file 655 * @vma: The virtual memory area where the fault occurred 656 * @vmf: The description of the fault 657 * @get_block: The filesystem method used to translate file offsets to blocks 658 * 659 * When a page fault occurs, filesystems may call this helper in their 660 * pmd_fault handler for DAX files. 661 */ 662 int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, 663 pmd_t *pmd, unsigned int flags, get_block_t get_block, 664 dax_iodone_t complete_unwritten) 665 { 666 int result; 667 struct super_block *sb = file_inode(vma->vm_file)->i_sb; 668 669 if (flags & FAULT_FLAG_WRITE) { 670 sb_start_pagefault(sb); 671 file_update_time(vma->vm_file); 672 } 673 result = __dax_pmd_fault(vma, address, pmd, flags, get_block, 674 complete_unwritten); 675 if (flags & FAULT_FLAG_WRITE) 676 sb_end_pagefault(sb); 677 678 return result; 679 } 680 EXPORT_SYMBOL_GPL(dax_pmd_fault); 681 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 682 683 /** 684 * dax_pfn_mkwrite - handle first write to DAX page 685 * @vma: The virtual memory area where the fault occurred 686 * @vmf: The description of the fault 687 * 688 */ 689 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 690 { 691 struct super_block *sb = file_inode(vma->vm_file)->i_sb; 692 693 sb_start_pagefault(sb); 694 file_update_time(vma->vm_file); 695 sb_end_pagefault(sb); 696 return VM_FAULT_NOPAGE; 697 } 698 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); 699 700 /** 701 * dax_zero_page_range - zero a range within a page of a DAX file 702 * @inode: The file being truncated 703 * @from: The file offset that is being truncated to 704 * @length: The number of bytes to zero 705 * @get_block: The filesystem method used to translate file offsets to blocks 706 * 707 * This function can be called by a filesystem when it is zeroing part of a 708 * page in a DAX file. This is intended for hole-punch operations. If 709 * you are truncating a file, the helper function dax_truncate_page() may be 710 * more convenient. 711 * 712 * We work in terms of PAGE_CACHE_SIZE here for commonality with 713 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem 714 * took care of disposing of the unnecessary blocks. Even if the filesystem 715 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page 716 * since the file might be mmapped. 717 */ 718 int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, 719 get_block_t get_block) 720 { 721 struct buffer_head bh; 722 pgoff_t index = from >> PAGE_CACHE_SHIFT; 723 unsigned offset = from & (PAGE_CACHE_SIZE-1); 724 int err; 725 726 /* Block boundary? Nothing to do */ 727 if (!length) 728 return 0; 729 BUG_ON((offset + length) > PAGE_CACHE_SIZE); 730 731 memset(&bh, 0, sizeof(bh)); 732 bh.b_size = PAGE_CACHE_SIZE; 733 err = get_block(inode, index, &bh, 0); 734 if (err < 0) 735 return err; 736 if (buffer_written(&bh)) { 737 void __pmem *addr; 738 err = dax_get_addr(&bh, &addr, inode->i_blkbits); 739 if (err < 0) 740 return err; 741 clear_pmem(addr + offset, length); 742 wmb_pmem(); 743 } 744 745 return 0; 746 } 747 EXPORT_SYMBOL_GPL(dax_zero_page_range); 748 749 /** 750 * dax_truncate_page - handle a partial page being truncated in a DAX file 751 * @inode: The file being truncated 752 * @from: The file offset that is being truncated to 753 * @get_block: The filesystem method used to translate file offsets to blocks 754 * 755 * Similar to block_truncate_page(), this function can be called by a 756 * filesystem when it is truncating a DAX file to handle the partial page. 757 * 758 * We work in terms of PAGE_CACHE_SIZE here for commonality with 759 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem 760 * took care of disposing of the unnecessary blocks. Even if the filesystem 761 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page 762 * since the file might be mmapped. 763 */ 764 int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) 765 { 766 unsigned length = PAGE_CACHE_ALIGN(from) - from; 767 return dax_zero_page_range(inode, from, length, get_block); 768 } 769 EXPORT_SYMBOL_GPL(dax_truncate_page); 770