/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_error.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"

#include <linux/dcache.h>
#include <linux/falloc.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>

static const struct vm_operations_struct xfs_file_vm_ops;

/*
 * Locking primitives for read and write IO paths to ensure we consistently use
 * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
 */
static inline void
xfs_rw_ilock(
	struct xfs_inode	*ip,
	int			type)
{
	if (type & XFS_IOLOCK_EXCL)
		inode_lock(VFS_I(ip));
	xfs_ilock(ip, type);
}

static inline void
xfs_rw_iunlock(
	struct xfs_inode	*ip,
	int			type)
{
	xfs_iunlock(ip, type);
	if (type & XFS_IOLOCK_EXCL)
		inode_unlock(VFS_I(ip));
}

static inline void
xfs_rw_ilock_demote(
	struct xfs_inode	*ip,
	int			type)
{
	xfs_ilock_demote(ip, type);
	if (type & XFS_IOLOCK_EXCL)
		inode_unlock(VFS_I(ip));
}
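/*
 * Usage sketch (editorial illustration, not part of the original file): a
 * typical shared-IO-lock read section built on the helpers above, mirroring
 * what xfs_file_buffered_aio_read() does further down:
 *
 *	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 *	ret = generic_file_read_iter(iocb, to);
 *	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 *
 * Exclusive users pass XFS_IOLOCK_EXCL instead, which additionally takes
 * inode_lock()/inode_unlock() on the VFS inode in the order documented above.
 */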
/*
 * Clear the specified ranges to zero through either the pagecache or DAX.
 * Holes and unwritten extents will be left as-is as they are already zeroed.
 */
int
xfs_zero_range(
	struct xfs_inode	*ip,
	xfs_off_t		pos,
	xfs_off_t		count,
	bool			*did_zero)
{
	return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
}

int
xfs_update_prealloc_flags(
	struct xfs_inode	*ip,
	enum xfs_prealloc_flags	flags)
{
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
			0, 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	if (!(flags & XFS_PREALLOC_INVISIBLE)) {
		VFS_I(ip)->i_mode &= ~S_ISUID;
		if (VFS_I(ip)->i_mode & S_IXGRP)
			VFS_I(ip)->i_mode &= ~S_ISGID;
		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	}

	if (flags & XFS_PREALLOC_SET)
		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
	if (flags & XFS_PREALLOC_CLEAR)
		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	if (flags & XFS_PREALLOC_SYNC)
		xfs_trans_set_sync(tp);
	return xfs_trans_commit(tp);
}

/*
 * Fsync operations on directories are much simpler than on regular files,
 * as there is no file data to flush, and thus also no need for explicit
 * cache flush operations, and there are no non-transaction metadata updates
 * on directories either.
 */
STATIC int
xfs_dir_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_lsn_t		lsn = 0;

	trace_xfs_dir_fsync(ip);

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_ipincount(ip))
		lsn = ip->i_itemp->ili_last_lsn;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (!lsn)
		return 0;
	return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
}

STATIC int
xfs_file_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	int			error = 0;
	int			log_flushed = 0;
	xfs_lsn_t		lsn = 0;

	trace_xfs_file_fsync(ip);

	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (error)
		return error;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	if (mp->m_flags & XFS_MOUNT_BARRIER) {
		/*
		 * If we have an RT and/or log subvolume we need to make sure
		 * to flush the write cache of the device used for file data
		 * first.  This is to ensure newly written file data makes it
		 * to disk before logging the new inode size in case of an
		 * extending write.
		 */
		if (XFS_IS_REALTIME_INODE(ip))
			xfs_blkdev_issue_flush(mp->m_rtdev_targp);
		else if (mp->m_logdev_targp != mp->m_ddev_targp)
			xfs_blkdev_issue_flush(mp->m_ddev_targp);
	}

	/*
	 * All metadata updates are logged, which means that we just have to
	 * flush the log up to the latest LSN that touched the inode. If we
	 * have concurrent fsync/fdatasync() calls, we need them to all block
	 * on the log force before we clear the ili_fsync_fields field. This
	 * ensures that we don't get a racing sync operation that does not
	 * wait for the metadata to hit the journal before returning.
	 * If we race with clearing the ili_fsync_fields, then all that will
	 * happen is the log force will do nothing as the lsn will already be
	 * on disk.  We can't race with setting ili_fsync_fields because that
	 * is done under XFS_ILOCK_EXCL, and that can't happen because we hold
	 * the lock shared until after the ili_fsync_fields is cleared.
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_ipincount(ip)) {
		if (!datasync ||
		    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
			lsn = ip->i_itemp->ili_last_lsn;
	}

	if (lsn) {
		error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
		ip->i_itemp->ili_fsync_fields = 0;
	}
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	/*
	 * If we only have a single device, and the log force above was
	 * a no-op we might have to flush the data device cache here.
	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
	 * an already allocated file and thus do not have any metadata to
	 * commit.
	 */
	if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
	    mp->m_logdev_targp == mp->m_ddev_targp &&
	    !XFS_IS_REALTIME_INODE(ip) &&
	    !log_flushed)
		xfs_blkdev_issue_flush(mp->m_ddev_targp);

	return error;
}

STATIC ssize_t
xfs_file_dio_aio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct address_space	*mapping = iocb->ki_filp->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	loff_t			isize = i_size_read(inode);
	size_t			count = iov_iter_count(to);
	loff_t			end = iocb->ki_pos + count - 1;
	struct iov_iter		data;
	struct xfs_buftarg	*target;
	ssize_t			ret = 0;

	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);

	if (!count)
		return 0; /* skip atime */

	if (XFS_IS_REALTIME_INODE(ip))
		target = ip->i_mount->m_rtdev_targp;
	else
		target = ip->i_mount->m_ddev_targp;

	/* DIO must be aligned to device logical sector size */
	if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
		if (iocb->ki_pos == isize)
			return 0;
		return -EINVAL;
	}

	file_accessed(iocb->ki_filp);

	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
	if (mapping->nrpages) {
		ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
		if (ret)
			goto out_unlock;

		/*
		 * Invalidate whole pages. This can return an error if we fail
		 * to invalidate a page, but this should never happen on XFS.
		 * Warn if it does fail.
		 */
		ret = invalidate_inode_pages2_range(mapping,
				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
		WARN_ON_ONCE(ret);
		ret = 0;
	}

	data = *to;
	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
			xfs_get_blocks_direct, NULL, NULL, 0);
	if (ret >= 0) {
		iocb->ki_pos += ret;
		iov_iter_advance(to, ret);
	}

out_unlock:
	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
	return ret;
}

static noinline ssize_t
xfs_file_dax_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	size_t			count = iov_iter_count(to);
	ssize_t			ret = 0;

	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);

	if (!count)
		return 0; /* skip atime */

	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
	ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);

	file_accessed(iocb->ki_filp);
	return ret;
}

STATIC ssize_t
xfs_file_buffered_aio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);

	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
	ret = generic_file_read_iter(iocb, to);
	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (IS_DAX(inode))
		ret = xfs_file_dax_read(iocb, to);
	else if (iocb->ki_flags & IOCB_DIRECT)
		ret = xfs_file_dio_aio_read(iocb, to);
	else
		ret = xfs_file_buffered_aio_read(iocb, to);

	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}

/*
 * Zero any on disk space between the current EOF and the new, larger EOF.
 *
 * This handles the normal case of zeroing the remainder of the last block in
 * the file and the unusual case of zeroing blocks out beyond the size of the
 * file.  This second case only happens with fixed size extents and when the
 * system crashes before the inode size was updated but after blocks were
 * allocated.
 *
 * Expects the iolock to be held exclusive, and will take the ilock internally.
 */
int					/* error (positive) */
xfs_zero_eof(
	struct xfs_inode	*ip,
	xfs_off_t		offset,		/* starting I/O offset */
	xfs_fsize_t		isize,		/* current inode size */
	bool			*did_zeroing)
{
	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
	ASSERT(offset > isize);

	trace_xfs_zero_eof(ip, isize, offset - isize);
	return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
}

/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_aio_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	int			*iolock)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			error = 0;
	size_t			count = iov_iter_count(from);
	bool			drained_dio = false;

restart:
	error = generic_write_checks(iocb, from);
	if (error <= 0)
		return error;

	error = xfs_break_layouts(inode, iolock, true);
	if (error)
		return error;

	/* For changing security info in file_remove_privs() we need i_mutex */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_rw_iunlock(ip, *iolock);
		*iolock = XFS_IOLOCK_EXCL;
		xfs_rw_ilock(ip, *iolock);
		goto restart;
	}
	/*
	 * If the offset is beyond the size of the file, we need to zero any
	 * blocks that fall between the existing EOF and the start of this
	 * write.  If zeroing is needed and we are currently holding the
	 * iolock shared, we need to update it to exclusive, which implies
	 * having to redo all the checks made before.
	 *
	 * We need to serialise against EOF updates that occur in IO
	 * completions here. We want to make sure that nobody is changing the
	 * size while we do this check until we have placed an IO barrier (i.e.
	 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
	 * The spinlock effectively forms a memory barrier once we have the
	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
	 * and hence be able to correctly determine if we need to run zeroing.
	 */
	spin_lock(&ip->i_flags_lock);
	if (iocb->ki_pos > i_size_read(inode)) {
		bool	zero = false;

		spin_unlock(&ip->i_flags_lock);
		if (!drained_dio) {
			if (*iolock == XFS_IOLOCK_SHARED) {
				xfs_rw_iunlock(ip, *iolock);
				*iolock = XFS_IOLOCK_EXCL;
				xfs_rw_ilock(ip, *iolock);
				iov_iter_reexpand(from, count);
			}
			/*
			 * We now have an IO submission barrier in place, but
			 * AIO can do EOF updates during IO completion and hence
			 * we now need to wait for all of them to drain. Non-AIO
			 * DIO will have drained before we are given the
			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
			 * no-op.
			 */
			inode_dio_wait(inode);
			drained_dio = true;
			goto restart;
		}
		error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
		if (error)
			return error;
	} else
		spin_unlock(&ip->i_flags_lock);

	/*
	 * Updating the timestamps will grab the ilock again from
	 * xfs_fs_dirty_inode, so we have to call it after dropping the
	 * lock above.  Eventually we should look into a way to avoid
	 * the pointless lock roundtrip.
	 */
	if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
		error = file_update_time(file);
		if (error)
			return error;
	}

	/*
	 * If we're writing the file then make sure to clear the setuid and
	 * setgid bits if the process is not being run by root.  This keeps
	 * people from modifying setuid and setgid binaries.
	 */
	if (!IS_NOSEC(inode))
		return file_remove_privs(file);
	return 0;
}

/*
 * xfs_file_dio_aio_write - handle direct IO writes
 *
 * Lock the inode appropriately to prepare for and issue a direct IO write.
 * By separating it from the buffered write path we remove all the
 * tricky-to-follow locking changes and looping.
 *
 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
 * pages are flushed out.
 *
 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
 * allowing them to be done in parallel with reads and other direct IO writes.
 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
 * needs to do sub-block zeroing and that requires serialisation against other
 * direct IOs to the same block. In this case we need to serialise the
 * submission of the unaligned IOs so that we don't get racing block zeroing in
 * the dio layer.  To avoid the problem with aio, we also need to wait for
 * outstanding IOs to complete so that unwritten extent conversion is completed
 * before we try to map the overlapping block. This is currently implemented by
 * hitting it with a big hammer (i.e. inode_dio_wait()).
 *
 * Returns with locks held indicated by @iolock and errors indicated by
 * negative return values.
 */
STATIC ssize_t
xfs_file_dio_aio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0;
	int			unaligned_io = 0;
	int			iolock;
	size_t			count = iov_iter_count(from);
	loff_t			end;
	struct iov_iter		data;
	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
					mp->m_rtdev_targp : mp->m_ddev_targp;

	/* DIO must be aligned to device logical sector size */
	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
		return -EINVAL;

	/*
	 * Don't take the exclusive iolock here unless the I/O is unaligned to
	 * the file system block size.  We don't need to consider the EOF
	 * extension case here because xfs_file_aio_write_checks() will relock
	 * the inode as necessary for EOF zeroing cases and fill out the new
	 * inode size as appropriate.
	 */
	if ((iocb->ki_pos & mp->m_blockmask) ||
	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
		unaligned_io = 1;
		iolock = XFS_IOLOCK_EXCL;
	} else {
		iolock = XFS_IOLOCK_SHARED;
	}

	xfs_rw_ilock(ip, iolock);

	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;
	count = iov_iter_count(from);
	end = iocb->ki_pos + count - 1;

	if (mapping->nrpages) {
		ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
		if (ret)
			goto out;

		/*
		 * Invalidate whole pages. This can return an error if we fail
		 * to invalidate a page, but this should never happen on XFS.
		 * Warn if it does fail.
		 */
		ret = invalidate_inode_pages2_range(mapping,
				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
		WARN_ON_ONCE(ret);
		ret = 0;
	}

	/*
	 * If we are doing unaligned IO, wait for all other IO to drain,
	 * otherwise demote the lock if we had to take the exclusive lock
	 * for other reasons in xfs_file_aio_write_checks.
	 */
	if (unaligned_io)
		inode_dio_wait(inode);
	else if (iolock == XFS_IOLOCK_EXCL) {
		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);

	/* If this is a block-aligned directio CoW, remap immediately. */
	if (xfs_is_reflink_inode(ip) && !unaligned_io) {
		ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count);
		if (ret)
			goto out;
	}

	data = *from;
	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
			xfs_get_blocks_direct, xfs_end_io_direct_write,
			NULL, DIO_ASYNC_EXTEND);

	/* see generic_file_direct_write() for why this is necessary */
	if (mapping->nrpages) {
		invalidate_inode_pages2_range(mapping,
					      iocb->ki_pos >> PAGE_SHIFT,
					      end >> PAGE_SHIFT);
	}

	if (ret > 0) {
		iocb->ki_pos += ret;
		iov_iter_advance(from, ret);
	}
out:
	xfs_rw_iunlock(ip, iolock);

	/*
	 * No fallback to buffered IO on errors for XFS, direct IO will either
	 * complete fully or fail.
	 */
	ASSERT(ret < 0 || ret == count);
	return ret;
}

static noinline ssize_t
xfs_file_dax_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	int			iolock = XFS_IOLOCK_EXCL;
	ssize_t			ret, error = 0;
	size_t			count;
	loff_t			pos;

	xfs_rw_ilock(ip, iolock);
	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

	pos = iocb->ki_pos;
	count = iov_iter_count(from);

	trace_xfs_file_dax_write(ip, count, pos);

	ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
		i_size_write(inode, iocb->ki_pos);
		error = xfs_setfilesize(ip, pos, ret);
	}

out:
	xfs_rw_iunlock(ip, iolock);
	return error ? error : ret;
}

STATIC ssize_t
xfs_file_buffered_aio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	int			enospc = 0;
	int			iolock = XFS_IOLOCK_EXCL;

	xfs_rw_ilock(ip, iolock);

	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);

write_retry:
	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
	ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
	if (likely(ret >= 0))
		iocb->ki_pos += ret;

	/*
	 * If we hit a space limit, try to free up some lingering preallocated
	 * space before returning an error. In the case of ENOSPC, first try to
	 * write back all dirty inodes to free up some of the excess reserved
	 * metadata space. This reduces the chances that the eofblocks scan
	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
	 * also behaves as a filter to prevent too many eofblocks scans from
	 * running at the same time.
	 */
	if (ret == -EDQUOT && !enospc) {
		enospc = xfs_inode_free_quota_eofblocks(ip);
		if (enospc)
			goto write_retry;
		enospc = xfs_inode_free_quota_cowblocks(ip);
		if (enospc)
			goto write_retry;
	} else if (ret == -ENOSPC && !enospc) {
		struct xfs_eofblocks eofb = {0};

		enospc = 1;
		xfs_flush_inodes(ip->i_mount);
		eofb.eof_scan_owner = ip->i_ino; /* for locking */
		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
		goto write_retry;
	}

	current->backing_dev_info = NULL;
out:
	xfs_rw_iunlock(ip, iolock);
	return ret;
}

STATIC ssize_t
xfs_file_write_iter(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	size_t			ocount = iov_iter_count(from);

	XFS_STATS_INC(ip->i_mount, xs_write_calls);

	if (ocount == 0)
		return 0;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	if (IS_DAX(inode))
		ret = xfs_file_dax_write(iocb, from);
	else if (iocb->ki_flags & IOCB_DIRECT) {
		/*
		 * Allow a directio write to fall back to a buffered
		 * write *only* in the case that we're doing a reflink
		 * CoW.  In all other directio scenarios we do not
		 * allow an operation to fall back to buffered mode.
		 */
		ret = xfs_file_dio_aio_write(iocb, from);
		if (ret == -EREMCHG)
			goto buffered;
	} else {
buffered:
		ret = xfs_file_buffered_aio_write(iocb, from);
	}

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

#define XFS_FALLOC_FL_SUPPORTED						\
		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)

STATIC long
xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	long			error;
	enum xfs_prealloc_flags	flags = 0;
	uint			iolock = XFS_IOLOCK_EXCL;
	loff_t			new_size = 0;
	bool			do_file_insert = false;

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;
	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(inode, &iolock, false);
	if (error)
		goto out_unlock;

	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
	iolock |= XFS_MMAPLOCK_EXCL;

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		error = xfs_free_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;

		if (offset & blksize_mask || len & blksize_mask) {
			error = -EINVAL;
			goto out_unlock;
		}

		/*
		 * There is no need for a collapse range to overlap EOF;
		 * in that case it is effectively a truncate operation.
		 */
		if (offset + len >= i_size_read(inode)) {
			error = -EINVAL;
			goto out_unlock;
		}

		new_size = i_size_read(inode) - len;

		error = xfs_collapse_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	} else if (mode & FALLOC_FL_INSERT_RANGE) {
		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;

		new_size = i_size_read(inode) + len;
		if (offset & blksize_mask || len & blksize_mask) {
			error = -EINVAL;
			goto out_unlock;
		}

		/* check the new inode size does not wrap through zero */
		if (new_size > inode->i_sb->s_maxbytes) {
			error = -EFBIG;
			goto out_unlock;
		}

		/* Offset should be less than i_size */
		if (offset >= i_size_read(inode)) {
			error = -EINVAL;
			goto out_unlock;
		}
		do_file_insert = true;
	} else {
		flags |= XFS_PREALLOC_SET;

		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    offset + len > i_size_read(inode)) {
			new_size = offset + len;
			error = inode_newsize_ok(inode, new_size);
			if (error)
				goto out_unlock;
		}

		if (mode & FALLOC_FL_ZERO_RANGE)
			error = xfs_zero_file_space(ip, offset, len);
		else {
			if (mode & FALLOC_FL_UNSHARE_RANGE) {
				error = xfs_reflink_unshare(ip, offset, len);
				if (error)
					goto out_unlock;
			}
			error = xfs_alloc_file_space(ip, offset, len,
						     XFS_BMAPI_PREALLOC);
		}
		if (error)
			goto out_unlock;
	}

	if (file->f_flags & O_DSYNC)
		flags |= XFS_PREALLOC_SYNC;

	error = xfs_update_prealloc_flags(ip, flags);
	if (error)
		goto out_unlock;

	/* Change file size if needed */
	if (new_size) {
		struct iattr iattr;

		iattr.ia_valid = ATTR_SIZE;
		iattr.ia_size = new_size;
		error = xfs_vn_setattr_size(file_dentry(file), &iattr);
		if (error)
			goto out_unlock;
	}

	/*
	 * Perform hole insertion now that the file size has been updated so
	 * that if we crash during the operation we don't leave shifted
	 * extents past EOF and hence lose access to the data that is
	 * contained within them.
	 */
	if (do_file_insert)
		error = xfs_insert_file_space(ip, offset, len);

out_unlock:
	xfs_iunlock(ip, iolock);
	return error;
}

STATIC ssize_t
xfs_file_copy_range(
	struct file	*file_in,
	loff_t		pos_in,
	struct file	*file_out,
	loff_t		pos_out,
	size_t		len,
	unsigned int	flags)
{
	int		error;

	error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
					len, false);
	if (error)
		return error;
	return len;
}

STATIC int
xfs_file_clone_range(
	struct file	*file_in,
	loff_t		pos_in,
	struct file	*file_out,
	loff_t		pos_out,
	u64		len)
{
	return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
				       len, false);
}
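/*
 * Illustrative note (editorial addition, not part of the original file): the
 * copy_file_range(2) and FICLONERANGE paths above both funnel into
 * xfs_reflink_remap_range() with is_dedupe == false, so a userspace sketch
 * such as the following ends up in xfs_file_clone_range():
 *
 *	struct file_clone_range args = {
 *		.src_fd		= src_fd,
 *		.src_offset	= 0,
 *		.src_length	= len,
 *		.dest_offset	= 0,
 *	};
 *	int ret = ioctl(dst_fd, FICLONERANGE, &args);
 *
 * The dedupe entry point below differs only in passing is_dedupe == true and
 * in clamping each request to XFS_MAX_DEDUPE_LEN.
 */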
#define XFS_MAX_DEDUPE_LEN	(16 * 1024 * 1024)
STATIC ssize_t
xfs_file_dedupe_range(
	struct file	*src_file,
	u64		loff,
	u64		len,
	struct file	*dst_file,
	u64		dst_loff)
{
	int		error;

	/*
	 * Limit the total length we will dedupe for each operation.
	 * This is intended to bound the total time spent in this
	 * ioctl to something sane.
	 */
	if (len > XFS_MAX_DEDUPE_LEN)
		len = XFS_MAX_DEDUPE_LEN;

	error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff,
					len, true);
	if (error)
		return error;
	return len;
}

STATIC int
xfs_file_open(
	struct inode	*inode,
	struct file	*file)
{
	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
		return -EFBIG;
	if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
		return -EIO;
	return 0;
}

STATIC int
xfs_dir_open(
	struct inode	*inode,
	struct file	*file)
{
	struct xfs_inode *ip = XFS_I(inode);
	int		mode;
	int		error;

	error = xfs_file_open(inode, file);
	if (error)
		return error;

	/*
	 * If there are any blocks, read-ahead block 0 as we're almost
	 * certain to have the next operation be a read there.
	 */
	mode = xfs_ilock_data_map_shared(ip);
	if (ip->i_d.di_nextents > 0)
		xfs_dir3_data_readahead(ip, 0, -1);
	xfs_iunlock(ip, mode);
	return 0;
}

STATIC int
xfs_file_release(
	struct inode	*inode,
	struct file	*filp)
{
	return xfs_release(XFS_I(inode));
}

STATIC int
xfs_file_readdir(
	struct file	*file,
	struct dir_context *ctx)
{
	struct inode	*inode = file_inode(file);
	xfs_inode_t	*ip = XFS_I(inode);
	size_t		bufsize;

	/*
	 * The Linux API doesn't pass the total size of the buffer we read
	 * into down to the filesystem.  With the filldir concept it's not
	 * needed for correct information, but the XFS dir2 leaf code wants
	 * an estimate of the buffer size to calculate its readahead window
	 * and size the buffers used for mapping to physical blocks.
	 *
	 * Try to give it an estimate that's good enough, maybe at some
	 * point we can change the ->readdir prototype to include the
	 * buffer size.  For now we use the current glibc buffer size.
	 */
	bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);

	return xfs_readdir(ip, ctx, bufsize);
}

/*
 * This type is designed to indicate the type of offset we would like
 * to search from page cache for xfs_seek_hole_data().
 */
enum {
	HOLE_OFF = 0,
	DATA_OFF,
};

/*
 * Lookup the desired type of offset from the given page.
 *
 * On success, return true and the offset argument will point to the
 * start of the region that was found.  Otherwise this function will
 * return false and keep the offset argument unchanged.
 */
STATIC bool
xfs_lookup_buffer_offset(
	struct page		*page,
	loff_t			*offset,
	unsigned int		type)
{
	loff_t			lastoff = page_offset(page);
	bool			found = false;
	struct buffer_head	*bh, *head;

	bh = head = page_buffers(page);
	do {
		/*
		 * Unwritten extents that have data in the page cache covering
		 * them can be identified by the BH_Unwritten state flag.
		 * Pages with multiple buffers might have a mix of holes, data
		 * and unwritten extents - any buffer with valid data in it
		 * should have the BH_Uptodate flag set on it.
		 */
		if (buffer_unwritten(bh) ||
		    buffer_uptodate(bh)) {
			if (type == DATA_OFF)
				found = true;
		} else {
			if (type == HOLE_OFF)
				found = true;
		}

		if (found) {
			*offset = lastoff;
			break;
		}
		lastoff += bh->b_size;
	} while ((bh = bh->b_this_page) != head);

	return found;
}

/*
 * This routine is called to find out and return a data or hole offset
 * from the page cache for unwritten extents according to the desired
 * type for xfs_seek_hole_data().
 *
 * The argument offset is used to tell where we start to search from the
 * page cache.  Map is used to figure out the end points of the range to
 * lookup pages.
 *
 * Return true if the desired type of offset was found, and the argument
 * offset is filled with that address.  Otherwise, return false and keep
 * offset unchanged.
 */
STATIC bool
xfs_find_get_desired_pgoff(
	struct inode		*inode,
	struct xfs_bmbt_irec	*map,
	unsigned int		type,
	loff_t			*offset)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	struct pagevec		pvec;
	pgoff_t			index;
	pgoff_t			end;
	loff_t			endoff;
	loff_t			startoff = *offset;
	loff_t			lastoff = startoff;
	bool			found = false;

	pagevec_init(&pvec, 0);

	index = startoff >> PAGE_SHIFT;
	endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount);
	end = endoff >> PAGE_SHIFT;
	do {
		int		want;
		unsigned	nr_pages;
		unsigned int	i;

		want = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
					  want);
		/*
		 * No page is mapped into the given range.  If we are
		 * searching for holes and this is the first pass through the
		 * loop, the given offset landed in a hole, so return it.
		 *
		 * If we have already stepped through some block buffers to
		 * find holes but they all contained data, the last offset has
		 * been updated to point at the end of the last mapped page.
		 * If that does not reach the end of the search range, there
		 * must be a hole between them.
		 */
		if (nr_pages == 0) {
			/* Data search found nothing */
			if (type == DATA_OFF)
				break;

			ASSERT(type == HOLE_OFF);
			if (lastoff == startoff || lastoff < endoff) {
				found = true;
				*offset = lastoff;
			}
			break;
		}

		/*
		 * At least we found one page.  If this is the first time we
		 * step into the loop, and if the first page index offset is
		 * greater than the given search offset, a hole was found.
		 */
		if (type == HOLE_OFF && lastoff == startoff &&
		    lastoff < page_offset(pvec.pages[0])) {
			found = true;
			break;
		}

		for (i = 0; i < nr_pages; i++) {
			struct page	*page = pvec.pages[i];
			loff_t		b_offset;

			/*
			 * At this point, the page may be truncated or
			 * invalidated (changing page->mapping to NULL),
			 * or even swizzled back from swapper_space to tmpfs
			 * file mapping. However, page->index will not change
			 * because we have a reference on the page.
			 *
			 * Searching is done if the page index is out of
			 * range.  If the current offset has not reached the
			 * end of the specified search range, there should be
			 * a hole between them.
			 */
			if (page->index > end) {
				if (type == HOLE_OFF && lastoff < endoff) {
					*offset = lastoff;
					found = true;
				}
				goto out;
			}

			lock_page(page);
			/*
			 * Page truncated or invalidated (page->mapping is
			 * NULL).  We can freely skip it and proceed to check
			 * the next page.
			 */
			if (unlikely(page->mapping != inode->i_mapping)) {
				unlock_page(page);
				continue;
			}

			if (!page_has_buffers(page)) {
				unlock_page(page);
				continue;
			}

			found = xfs_lookup_buffer_offset(page, &b_offset, type);
			if (found) {
				/*
				 * The found offset may be less than the start
				 * point to search if this is the first time to
				 * come here.
				 */
				*offset = max_t(loff_t, startoff, b_offset);
				unlock_page(page);
				goto out;
			}

			/*
			 * We were either searching for data but found nothing,
			 * or searching for a hole but found a data buffer.  In
			 * either case, the next page probably contains what we
			 * are looking for, so update the last offset to point
			 * at it.
			 */
			lastoff = page_offset(page) + PAGE_SIZE;
			unlock_page(page);
		}

		/*
		 * Fewer pages were returned than we wanted, so the search is
		 * done.  In this case nothing was found for a data search,
		 * but for a hole search we found a hole behind the last
		 * offset.
		 */
		if (nr_pages < want) {
			if (type == HOLE_OFF) {
				*offset = lastoff;
				found = true;
			}
			break;
		}

		index = pvec.pages[i - 1]->index + 1;
		pagevec_release(&pvec);
	} while (index <= end);

out:
	pagevec_release(&pvec);
	return found;
}

/*
 * caller must lock inode with xfs_ilock_data_map_shared,
 * can we craft an appropriate ASSERT?
 *
 * end is because the VFS-level lseek interface is defined such that any
 * offset past i_size shall return -ENXIO, but we use this for quota code
 * which does not maintain i_size, and we want to SEEK_DATA past i_size.
 */
loff_t
__xfs_seek_hole_data(
	struct inode		*inode,
	loff_t			start,
	loff_t			end,
	int			whence)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	loff_t			uninitialized_var(offset);
	xfs_fileoff_t		fsbno;
	xfs_filblks_t		lastbno;
	int			error;

	if (start >= end) {
		error = -ENXIO;
		goto out_error;
	}

	/*
	 * Try to read extents from the first block indicated
	 * by fsbno to the end block of the file.
	 */
	fsbno = XFS_B_TO_FSBT(mp, start);
	lastbno = XFS_B_TO_FSB(mp, end);

	for (;;) {
		struct xfs_bmbt_irec	map[2];
		int			nmap = 2;
		unsigned int		i;

		error = xfs_bmapi_read(ip, fsbno, lastbno - fsbno, map, &nmap,
				       XFS_BMAPI_ENTIRE);
		if (error)
			goto out_error;

		/* No extents at given offset, must be beyond EOF */
		if (nmap == 0) {
			error = -ENXIO;
			goto out_error;
		}

		for (i = 0; i < nmap; i++) {
			offset = max_t(loff_t, start,
				       XFS_FSB_TO_B(mp, map[i].br_startoff));

			/* Landed in the hole we wanted? */
			if (whence == SEEK_HOLE &&
			    map[i].br_startblock == HOLESTARTBLOCK)
				goto out;

			/* Landed in the data extent we wanted? */
			if (whence == SEEK_DATA &&
			    (map[i].br_startblock == DELAYSTARTBLOCK ||
			     (map[i].br_state == XFS_EXT_NORM &&
			      !isnullstartblock(map[i].br_startblock))))
				goto out;

			/*
			 * Landed in an unwritten extent, try to search
			 * for hole or data from page cache.
			 */
			if (map[i].br_state == XFS_EXT_UNWRITTEN) {
				if (xfs_find_get_desired_pgoff(inode, &map[i],
				      whence == SEEK_HOLE ? HOLE_OFF : DATA_OFF,
				      &offset))
					goto out;
			}
		}

		/*
		 * We only received one extent out of the two requested. This
		 * means we've hit EOF and didn't find what we are looking for.
		 */
		if (nmap == 1) {
			/*
			 * If we were looking for a hole, set offset to
			 * the end of the file (i.e., there is an implicit
			 * hole at the end of any file).
			 */
			if (whence == SEEK_HOLE) {
				offset = end;
				break;
			}
			/*
			 * If we were looking for data, it's nowhere to be found
			 */
			ASSERT(whence == SEEK_DATA);
			error = -ENXIO;
			goto out_error;
		}

		ASSERT(i > 1);

		/*
		 * Nothing was found, proceed to the next round of search
		 * if the next reading offset is not at or beyond EOF.
		 */
		fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
		start = XFS_FSB_TO_B(mp, fsbno);
		if (start >= end) {
			if (whence == SEEK_HOLE) {
				offset = end;
				break;
			}
			ASSERT(whence == SEEK_DATA);
			error = -ENXIO;
			goto out_error;
		}
	}

out:
	/*
	 * If at this point we have found the hole we wanted, the returned
	 * offset may be bigger than the file size as it may be aligned to
	 * page boundary for unwritten extents.  We need to deal with this
	 * situation in particular.
	 */
	if (whence == SEEK_HOLE)
		offset = min_t(loff_t, offset, end);

	return offset;

out_error:
	return error;
}

STATIC loff_t
xfs_seek_hole_data(
	struct file		*file,
	loff_t			start,
	int			whence)
{
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	uint			lock;
	loff_t			offset, end;
	int			error = 0;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	lock = xfs_ilock_data_map_shared(ip);

	end = i_size_read(inode);
	offset = __xfs_seek_hole_data(inode, start, end, whence);
	if (offset < 0) {
		error = offset;
		goto out_unlock;
	}

	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);

out_unlock:
	xfs_iunlock(ip, lock);

	if (error)
		return error;
	return offset;
}

STATIC loff_t
xfs_file_llseek(
	struct file	*file,
	loff_t		offset,
	int		whence)
{
	switch (whence) {
	case SEEK_END:
	case SEEK_CUR:
	case SEEK_SET:
		return generic_file_llseek(file, offset, whence);
	case SEEK_HOLE:
	case SEEK_DATA:
		return xfs_seek_hole_data(file, offset, whence);
	default:
		return -EINVAL;
	}
}

/*
 * Locking for serialisation of IO during page faults. This results in a lock
 * ordering of:
 *
 * mmap_sem (MM)
 *   sb_start_pagefault(vfs, freeze)
 *     i_mmaplock (XFS - truncate serialisation)
 *       page_lock (MM)
 *         i_lock (XFS - extent map serialisation)
 */

/*
 * mmap()d file has taken write protection fault and is being made writable.
 * We can set the page state up correctly for a writable page, which means we
 * can do correct delalloc accounting (ENOSPC checking!) and unwritten extent
 * mapping.
 */
STATIC int
xfs_filemap_page_mkwrite(
	struct vm_area_struct	*vma,
	struct vm_fault		*vmf)
{
	struct inode		*inode = file_inode(vma->vm_file);
	int			ret;

	trace_xfs_filemap_page_mkwrite(XFS_I(inode));

	sb_start_pagefault(inode->i_sb);
	file_update_time(vma->vm_file);
	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

	if (IS_DAX(inode)) {
		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
	} else {
		ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
		ret = block_page_mkwrite_return(ret);
	}

	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	sb_end_pagefault(inode->i_sb);

	return ret;
}

STATIC int
xfs_filemap_fault(
	struct vm_area_struct	*vma,
	struct vm_fault		*vmf)
{
	struct inode		*inode = file_inode(vma->vm_file);
	int			ret;

	trace_xfs_filemap_fault(XFS_I(inode));

	/* DAX can shortcut the normal fault path on write faults! */
	if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode))
		return xfs_filemap_page_mkwrite(vma, vmf);

	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	if (IS_DAX(inode)) {
		/*
		 * We do not want to trigger unwritten extent conversion on
		 * read faults - that is unnecessary overhead and would also
		 * require changes to xfs_get_blocks_direct() to map unwritten
		 * extent ioends for conversion on read-only mappings.
		 */
		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
	} else
		ret = filemap_fault(vma, vmf);
	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

	return ret;
}

/*
 * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
 * both read and write faults, so we need to handle both cases.  There is no
 * ->pmd_mkwrite callout for huge pages, so we have a single function here to
 * handle both cases.  @flags carries the information on the type of fault
 * occurring.
 */
STATIC int
xfs_filemap_pmd_fault(
	struct vm_area_struct	*vma,
	unsigned long		addr,
	pmd_t			*pmd,
	unsigned int		flags)
{
	struct inode		*inode = file_inode(vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	int			ret;

	if (!IS_DAX(inode))
		return VM_FAULT_FALLBACK;

	trace_xfs_filemap_pmd_fault(ip);

	if (flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vma->vm_file);
	}

	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

	if (flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(inode->i_sb);

	return ret;
}

/*
 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
 * on write faults.  In reality, it is needed to serialise against truncate,
 * similar to page_mkwrite.  Hence we cycle the XFS_MMAPLOCK_SHARED to ensure
 * we serialise the fault barrier in place.
 */
static int
xfs_filemap_pfn_mkwrite(
	struct vm_area_struct	*vma,
	struct vm_fault		*vmf)
{
	struct inode		*inode = file_inode(vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	int			ret = VM_FAULT_NOPAGE;
	loff_t			size;

	trace_xfs_filemap_pfn_mkwrite(ip);

	sb_start_pagefault(inode->i_sb);
	file_update_time(vma->vm_file);

	/* check if the faulting page hasn't raced with truncate */
	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		ret = VM_FAULT_SIGBUS;
	else if (IS_DAX(inode))
		ret = dax_pfn_mkwrite(vma, vmf);
	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
	sb_end_pagefault(inode->i_sb);
	return ret;
}

static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= xfs_filemap_fault,
	.pmd_fault	= xfs_filemap_pmd_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= xfs_filemap_page_mkwrite,
	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
};

STATIC int
xfs_file_mmap(
	struct file	*filp,
	struct vm_area_struct *vma)
{
	file_accessed(filp);
	vma->vm_ops = &xfs_file_vm_ops;
	if (IS_DAX(file_inode(filp)))
		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
	return 0;
}

const struct file_operations xfs_file_operations = {
	.llseek		= xfs_file_llseek,
	.read_iter	= xfs_file_read_iter,
	.write_iter	= xfs_file_write_iter,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.mmap		= xfs_file_mmap,
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
	.get_unmapped_area = thp_get_unmapped_area,
	.fallocate	= xfs_file_fallocate,
	.copy_file_range = xfs_file_copy_range,
	.clone_file_range = xfs_file_clone_range,
	.dedupe_file_range = xfs_file_dedupe_range,
};

const struct file_operations xfs_dir_file_operations = {
	.open		= xfs_dir_open,
	.read		= generic_read_dir,
	.iterate_shared	= xfs_file_readdir,
	.llseek		= generic_file_llseek,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.fsync		= xfs_dir_fsync,
};
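/*
 * Illustrative usage note (editorial addition, not part of the original
 * file): the SEEK_HOLE/SEEK_DATA support wired up through xfs_file_llseek()
 * above is exercised from userspace with plain lseek(2), e.g.:
 *
 *	off_t hole = lseek(fd, 0, SEEK_HOLE);
 *	off_t data = lseek(fd, hole, SEEK_DATA);
 *
 * A return of -1 with errno == ENXIO means no further data (or hole) exists
 * past the given offset, matching the -ENXIO returns in
 * __xfs_seek_hole_data().
 */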