1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * file.c 5 * 6 * File open, close, extend, truncate 7 * 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public 12 * License as published by the Free Software Foundation; either 13 * version 2 of the License, or (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * General Public License for more details. 19 * 20 * You should have received a copy of the GNU General Public 21 * License along with this program; if not, write to the 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 * Boston, MA 021110-1307, USA. 24 */ 25 26 #include <linux/capability.h> 27 #include <linux/fs.h> 28 #include <linux/types.h> 29 #include <linux/slab.h> 30 #include <linux/highmem.h> 31 #include <linux/pagemap.h> 32 #include <linux/uio.h> 33 #include <linux/sched.h> 34 #include <linux/splice.h> 35 #include <linux/mount.h> 36 #include <linux/writeback.h> 37 #include <linux/falloc.h> 38 #include <linux/quotaops.h> 39 40 #define MLOG_MASK_PREFIX ML_INODE 41 #include <cluster/masklog.h> 42 43 #include "ocfs2.h" 44 45 #include "alloc.h" 46 #include "aops.h" 47 #include "dir.h" 48 #include "dlmglue.h" 49 #include "extent_map.h" 50 #include "file.h" 51 #include "sysfile.h" 52 #include "inode.h" 53 #include "ioctl.h" 54 #include "journal.h" 55 #include "locks.h" 56 #include "mmap.h" 57 #include "suballoc.h" 58 #include "super.h" 59 #include "xattr.h" 60 #include "acl.h" 61 #include "quota.h" 62 #include "refcounttree.h" 63 64 #include "buffer_head_io.h" 65 66 static int ocfs2_sync_inode(struct inode *inode) 67 { 68 filemap_fdatawrite(inode->i_mapping); 69 return sync_mapping_buffers(inode->i_mapping); 70 } 71 72 static int ocfs2_init_file_private(struct inode *inode, struct file *file) 73 { 74 struct ocfs2_file_private *fp; 75 76 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL); 77 if (!fp) 78 return -ENOMEM; 79 80 fp->fp_file = file; 81 mutex_init(&fp->fp_mutex); 82 ocfs2_file_lock_res_init(&fp->fp_flock, fp); 83 file->private_data = fp; 84 85 return 0; 86 } 87 88 static void ocfs2_free_file_private(struct inode *inode, struct file *file) 89 { 90 struct ocfs2_file_private *fp = file->private_data; 91 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 92 93 if (fp) { 94 ocfs2_simple_drop_lockres(osb, &fp->fp_flock); 95 ocfs2_lock_res_free(&fp->fp_flock); 96 kfree(fp); 97 file->private_data = NULL; 98 } 99 } 100 101 static int ocfs2_file_open(struct inode *inode, struct file *file) 102 { 103 int status; 104 int mode = file->f_flags; 105 struct ocfs2_inode_info *oi = OCFS2_I(inode); 106 107 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 108 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); 109 110 spin_lock(&oi->ip_lock); 111 112 /* Check that the inode hasn't been wiped from disk by another 113 * node. If it hasn't then we're safe as long as we hold the 114 * spin lock until our increment of open count. */ 115 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 116 spin_unlock(&oi->ip_lock); 117 118 status = -ENOENT; 119 goto leave; 120 } 121 122 if (mode & O_DIRECT) 123 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; 124 125 oi->ip_open_count++; 126 spin_unlock(&oi->ip_lock); 127 128 status = ocfs2_init_file_private(inode, file); 129 if (status) { 130 /* 131 * We want to set open count back if we're failing the 132 * open. 133 */ 134 spin_lock(&oi->ip_lock); 135 oi->ip_open_count--; 136 spin_unlock(&oi->ip_lock); 137 } 138 139 leave: 140 mlog_exit(status); 141 return status; 142 } 143 144 static int ocfs2_file_release(struct inode *inode, struct file *file) 145 { 146 struct ocfs2_inode_info *oi = OCFS2_I(inode); 147 148 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 149 file->f_path.dentry->d_name.len, 150 file->f_path.dentry->d_name.name); 151 152 spin_lock(&oi->ip_lock); 153 if (!--oi->ip_open_count) 154 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 155 spin_unlock(&oi->ip_lock); 156 157 ocfs2_free_file_private(inode, file); 158 159 mlog_exit(0); 160 161 return 0; 162 } 163 164 static int ocfs2_dir_open(struct inode *inode, struct file *file) 165 { 166 return ocfs2_init_file_private(inode, file); 167 } 168 169 static int ocfs2_dir_release(struct inode *inode, struct file *file) 170 { 171 ocfs2_free_file_private(inode, file); 172 return 0; 173 } 174 175 static int ocfs2_sync_file(struct file *file, 176 struct dentry *dentry, 177 int datasync) 178 { 179 int err = 0; 180 journal_t *journal; 181 struct inode *inode = dentry->d_inode; 182 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 183 184 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 185 dentry->d_name.len, dentry->d_name.name); 186 187 err = ocfs2_sync_inode(dentry->d_inode); 188 if (err) 189 goto bail; 190 191 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 192 goto bail; 193 194 journal = osb->journal->j_journal; 195 err = jbd2_journal_force_commit(journal); 196 197 bail: 198 mlog_exit(err); 199 200 return (err < 0) ? -EIO : 0; 201 } 202 203 int ocfs2_should_update_atime(struct inode *inode, 204 struct vfsmount *vfsmnt) 205 { 206 struct timespec now; 207 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 208 209 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 210 return 0; 211 212 if ((inode->i_flags & S_NOATIME) || 213 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) 214 return 0; 215 216 /* 217 * We can be called with no vfsmnt structure - NFSD will 218 * sometimes do this. 219 * 220 * Note that our action here is different than touch_atime() - 221 * if we can't tell whether this is a noatime mount, then we 222 * don't know whether to trust the value of s_atime_quantum. 223 */ 224 if (vfsmnt == NULL) 225 return 0; 226 227 if ((vfsmnt->mnt_flags & MNT_NOATIME) || 228 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) 229 return 0; 230 231 if (vfsmnt->mnt_flags & MNT_RELATIME) { 232 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || 233 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) 234 return 1; 235 236 return 0; 237 } 238 239 now = CURRENT_TIME; 240 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) 241 return 0; 242 else 243 return 1; 244 } 245 246 int ocfs2_update_inode_atime(struct inode *inode, 247 struct buffer_head *bh) 248 { 249 int ret; 250 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 251 handle_t *handle; 252 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data; 253 254 mlog_entry_void(); 255 256 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 257 if (IS_ERR(handle)) { 258 ret = PTR_ERR(handle); 259 mlog_errno(ret); 260 goto out; 261 } 262 263 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 264 OCFS2_JOURNAL_ACCESS_WRITE); 265 if (ret) { 266 mlog_errno(ret); 267 goto out_commit; 268 } 269 270 /* 271 * Don't use ocfs2_mark_inode_dirty() here as we don't always 272 * have i_mutex to guard against concurrent changes to other 273 * inode fields. 274 */ 275 inode->i_atime = CURRENT_TIME; 276 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 277 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); 278 279 ret = ocfs2_journal_dirty(handle, bh); 280 if (ret < 0) 281 mlog_errno(ret); 282 283 out_commit: 284 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 285 out: 286 mlog_exit(ret); 287 return ret; 288 } 289 290 static int ocfs2_set_inode_size(handle_t *handle, 291 struct inode *inode, 292 struct buffer_head *fe_bh, 293 u64 new_i_size) 294 { 295 int status; 296 297 mlog_entry_void(); 298 i_size_write(inode, new_i_size); 299 inode->i_blocks = ocfs2_inode_sector_count(inode); 300 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 301 302 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 303 if (status < 0) { 304 mlog_errno(status); 305 goto bail; 306 } 307 308 bail: 309 mlog_exit(status); 310 return status; 311 } 312 313 int ocfs2_simple_size_update(struct inode *inode, 314 struct buffer_head *di_bh, 315 u64 new_i_size) 316 { 317 int ret; 318 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 319 handle_t *handle = NULL; 320 321 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 322 if (IS_ERR(handle)) { 323 ret = PTR_ERR(handle); 324 mlog_errno(ret); 325 goto out; 326 } 327 328 ret = ocfs2_set_inode_size(handle, inode, di_bh, 329 new_i_size); 330 if (ret < 0) 331 mlog_errno(ret); 332 333 ocfs2_commit_trans(osb, handle); 334 out: 335 return ret; 336 } 337 338 static int ocfs2_cow_file_pos(struct inode *inode, 339 struct buffer_head *fe_bh, 340 u64 offset) 341 { 342 int status; 343 u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 344 unsigned int num_clusters = 0; 345 unsigned int ext_flags = 0; 346 347 /* 348 * If the new offset is aligned to the range of the cluster, there is 349 * no space for ocfs2_zero_range_for_truncate to fill, so no need to 350 * CoW either. 351 */ 352 if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0) 353 return 0; 354 355 status = ocfs2_get_clusters(inode, cpos, &phys, 356 &num_clusters, &ext_flags); 357 if (status) { 358 mlog_errno(status); 359 goto out; 360 } 361 362 if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) 363 goto out; 364 365 return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); 366 367 out: 368 return status; 369 } 370 371 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 372 struct inode *inode, 373 struct buffer_head *fe_bh, 374 u64 new_i_size) 375 { 376 int status; 377 handle_t *handle; 378 struct ocfs2_dinode *di; 379 u64 cluster_bytes; 380 381 mlog_entry_void(); 382 383 /* 384 * We need to CoW the cluster contains the offset if it is reflinked 385 * since we will call ocfs2_zero_range_for_truncate later which will 386 * write "0" from offset to the end of the cluster. 387 */ 388 status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size); 389 if (status) { 390 mlog_errno(status); 391 return status; 392 } 393 394 /* TODO: This needs to actually orphan the inode in this 395 * transaction. */ 396 397 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 398 if (IS_ERR(handle)) { 399 status = PTR_ERR(handle); 400 mlog_errno(status); 401 goto out; 402 } 403 404 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, 405 OCFS2_JOURNAL_ACCESS_WRITE); 406 if (status < 0) { 407 mlog_errno(status); 408 goto out_commit; 409 } 410 411 /* 412 * Do this before setting i_size. 413 */ 414 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 415 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size, 416 cluster_bytes); 417 if (status) { 418 mlog_errno(status); 419 goto out_commit; 420 } 421 422 i_size_write(inode, new_i_size); 423 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 424 425 di = (struct ocfs2_dinode *) fe_bh->b_data; 426 di->i_size = cpu_to_le64(new_i_size); 427 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 428 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 429 430 status = ocfs2_journal_dirty(handle, fe_bh); 431 if (status < 0) 432 mlog_errno(status); 433 434 out_commit: 435 ocfs2_commit_trans(osb, handle); 436 out: 437 438 mlog_exit(status); 439 return status; 440 } 441 442 static int ocfs2_truncate_file(struct inode *inode, 443 struct buffer_head *di_bh, 444 u64 new_i_size) 445 { 446 int status = 0; 447 struct ocfs2_dinode *fe = NULL; 448 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 449 struct ocfs2_truncate_context *tc = NULL; 450 451 mlog_entry("(inode = %llu, new_i_size = %llu\n", 452 (unsigned long long)OCFS2_I(inode)->ip_blkno, 453 (unsigned long long)new_i_size); 454 455 /* We trust di_bh because it comes from ocfs2_inode_lock(), which 456 * already validated it */ 457 fe = (struct ocfs2_dinode *) di_bh->b_data; 458 459 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 460 "Inode %llu, inode i_size = %lld != di " 461 "i_size = %llu, i_flags = 0x%x\n", 462 (unsigned long long)OCFS2_I(inode)->ip_blkno, 463 i_size_read(inode), 464 (unsigned long long)le64_to_cpu(fe->i_size), 465 le32_to_cpu(fe->i_flags)); 466 467 if (new_i_size > le64_to_cpu(fe->i_size)) { 468 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", 469 (unsigned long long)le64_to_cpu(fe->i_size), 470 (unsigned long long)new_i_size); 471 status = -EINVAL; 472 mlog_errno(status); 473 goto bail; 474 } 475 476 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", 477 (unsigned long long)le64_to_cpu(fe->i_blkno), 478 (unsigned long long)le64_to_cpu(fe->i_size), 479 (unsigned long long)new_i_size); 480 481 /* lets handle the simple truncate cases before doing any more 482 * cluster locking. */ 483 if (new_i_size == le64_to_cpu(fe->i_size)) 484 goto bail; 485 486 down_write(&OCFS2_I(inode)->ip_alloc_sem); 487 488 /* 489 * The inode lock forced other nodes to sync and drop their 490 * pages, which (correctly) happens even if we have a truncate 491 * without allocation change - ocfs2 cluster sizes can be much 492 * greater than page size, so we have to truncate them 493 * anyway. 494 */ 495 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 496 truncate_inode_pages(inode->i_mapping, new_i_size); 497 498 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 499 status = ocfs2_truncate_inline(inode, di_bh, new_i_size, 500 i_size_read(inode), 1); 501 if (status) 502 mlog_errno(status); 503 504 goto bail_unlock_sem; 505 } 506 507 /* alright, we're going to need to do a full blown alloc size 508 * change. Orphan the inode so that recovery can complete the 509 * truncate if necessary. This does the task of marking 510 * i_size. */ 511 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 512 if (status < 0) { 513 mlog_errno(status); 514 goto bail_unlock_sem; 515 } 516 517 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 518 if (status < 0) { 519 mlog_errno(status); 520 goto bail_unlock_sem; 521 } 522 523 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 524 if (status < 0) { 525 mlog_errno(status); 526 goto bail_unlock_sem; 527 } 528 529 /* TODO: orphan dir cleanup here. */ 530 bail_unlock_sem: 531 up_write(&OCFS2_I(inode)->ip_alloc_sem); 532 533 bail: 534 if (!status && OCFS2_I(inode)->ip_clusters == 0) 535 status = ocfs2_try_remove_refcount_tree(inode, di_bh); 536 537 mlog_exit(status); 538 return status; 539 } 540 541 /* 542 * extend file allocation only here. 543 * we'll update all the disk stuff, and oip->alloc_size 544 * 545 * expect stuff to be locked, a transaction started and enough data / 546 * metadata reservations in the contexts. 547 * 548 * Will return -EAGAIN, and a reason if a restart is needed. 549 * If passed in, *reason will always be set, even in error. 550 */ 551 int ocfs2_add_inode_data(struct ocfs2_super *osb, 552 struct inode *inode, 553 u32 *logical_offset, 554 u32 clusters_to_add, 555 int mark_unwritten, 556 struct buffer_head *fe_bh, 557 handle_t *handle, 558 struct ocfs2_alloc_context *data_ac, 559 struct ocfs2_alloc_context *meta_ac, 560 enum ocfs2_alloc_restarted *reason_ret) 561 { 562 int ret; 563 struct ocfs2_extent_tree et; 564 565 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh); 566 ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset, 567 clusters_to_add, mark_unwritten, 568 data_ac, meta_ac, reason_ret); 569 570 return ret; 571 } 572 573 static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, 574 u32 clusters_to_add, int mark_unwritten) 575 { 576 int status = 0; 577 int restart_func = 0; 578 int credits; 579 u32 prev_clusters; 580 struct buffer_head *bh = NULL; 581 struct ocfs2_dinode *fe = NULL; 582 handle_t *handle = NULL; 583 struct ocfs2_alloc_context *data_ac = NULL; 584 struct ocfs2_alloc_context *meta_ac = NULL; 585 enum ocfs2_alloc_restarted why; 586 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 587 struct ocfs2_extent_tree et; 588 int did_quota = 0; 589 590 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 591 592 /* 593 * This function only exists for file systems which don't 594 * support holes. 595 */ 596 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); 597 598 status = ocfs2_read_inode_block(inode, &bh); 599 if (status < 0) { 600 mlog_errno(status); 601 goto leave; 602 } 603 fe = (struct ocfs2_dinode *) bh->b_data; 604 605 restart_all: 606 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 607 608 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " 609 "clusters_to_add = %u\n", 610 (unsigned long long)OCFS2_I(inode)->ip_blkno, 611 (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters), 612 clusters_to_add); 613 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh); 614 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, 615 &data_ac, &meta_ac); 616 if (status) { 617 mlog_errno(status); 618 goto leave; 619 } 620 621 credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list, 622 clusters_to_add); 623 handle = ocfs2_start_trans(osb, credits); 624 if (IS_ERR(handle)) { 625 status = PTR_ERR(handle); 626 handle = NULL; 627 mlog_errno(status); 628 goto leave; 629 } 630 631 restarted_transaction: 632 if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 633 clusters_to_add))) { 634 status = -EDQUOT; 635 goto leave; 636 } 637 did_quota = 1; 638 639 /* reserve a write to the file entry early on - that we if we 640 * run out of credits in the allocation path, we can still 641 * update i_size. */ 642 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 643 OCFS2_JOURNAL_ACCESS_WRITE); 644 if (status < 0) { 645 mlog_errno(status); 646 goto leave; 647 } 648 649 prev_clusters = OCFS2_I(inode)->ip_clusters; 650 651 status = ocfs2_add_inode_data(osb, 652 inode, 653 &logical_start, 654 clusters_to_add, 655 mark_unwritten, 656 bh, 657 handle, 658 data_ac, 659 meta_ac, 660 &why); 661 if ((status < 0) && (status != -EAGAIN)) { 662 if (status != -ENOSPC) 663 mlog_errno(status); 664 goto leave; 665 } 666 667 status = ocfs2_journal_dirty(handle, bh); 668 if (status < 0) { 669 mlog_errno(status); 670 goto leave; 671 } 672 673 spin_lock(&OCFS2_I(inode)->ip_lock); 674 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 675 spin_unlock(&OCFS2_I(inode)->ip_lock); 676 /* Release unused quota reservation */ 677 vfs_dq_free_space(inode, 678 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 679 did_quota = 0; 680 681 if (why != RESTART_NONE && clusters_to_add) { 682 if (why == RESTART_META) { 683 mlog(0, "restarting function.\n"); 684 restart_func = 1; 685 } else { 686 BUG_ON(why != RESTART_TRANS); 687 688 mlog(0, "restarting transaction.\n"); 689 /* TODO: This can be more intelligent. */ 690 credits = ocfs2_calc_extend_credits(osb->sb, 691 &fe->id2.i_list, 692 clusters_to_add); 693 status = ocfs2_extend_trans(handle, credits); 694 if (status < 0) { 695 /* handle still has to be committed at 696 * this point. */ 697 status = -ENOMEM; 698 mlog_errno(status); 699 goto leave; 700 } 701 goto restarted_transaction; 702 } 703 } 704 705 mlog(0, "fe: i_clusters = %u, i_size=%llu\n", 706 le32_to_cpu(fe->i_clusters), 707 (unsigned long long)le64_to_cpu(fe->i_size)); 708 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 709 OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode)); 710 711 leave: 712 if (status < 0 && did_quota) 713 vfs_dq_free_space(inode, 714 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 715 if (handle) { 716 ocfs2_commit_trans(osb, handle); 717 handle = NULL; 718 } 719 if (data_ac) { 720 ocfs2_free_alloc_context(data_ac); 721 data_ac = NULL; 722 } 723 if (meta_ac) { 724 ocfs2_free_alloc_context(meta_ac); 725 meta_ac = NULL; 726 } 727 if ((!status) && restart_func) { 728 restart_func = 0; 729 goto restart_all; 730 } 731 brelse(bh); 732 bh = NULL; 733 734 mlog_exit(status); 735 return status; 736 } 737 738 /* Some parts of this taken from generic_cont_expand, which turned out 739 * to be too fragile to do exactly what we need without us having to 740 * worry about recursive locking in ->write_begin() and ->write_end(). */ 741 static int ocfs2_write_zero_page(struct inode *inode, 742 u64 size) 743 { 744 struct address_space *mapping = inode->i_mapping; 745 struct page *page; 746 unsigned long index; 747 unsigned int offset; 748 handle_t *handle = NULL; 749 int ret; 750 751 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ 752 /* ugh. in prepare/commit_write, if from==to==start of block, we 753 ** skip the prepare. make sure we never send an offset for the start 754 ** of a block 755 */ 756 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { 757 offset++; 758 } 759 index = size >> PAGE_CACHE_SHIFT; 760 761 page = grab_cache_page(mapping, index); 762 if (!page) { 763 ret = -ENOMEM; 764 mlog_errno(ret); 765 goto out; 766 } 767 768 ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); 769 if (ret < 0) { 770 mlog_errno(ret); 771 goto out_unlock; 772 } 773 774 if (ocfs2_should_order_data(inode)) { 775 handle = ocfs2_start_walk_page_trans(inode, page, offset, 776 offset); 777 if (IS_ERR(handle)) { 778 ret = PTR_ERR(handle); 779 handle = NULL; 780 goto out_unlock; 781 } 782 } 783 784 /* must not update i_size! */ 785 ret = block_commit_write(page, offset, offset); 786 if (ret < 0) 787 mlog_errno(ret); 788 else 789 ret = 0; 790 791 if (handle) 792 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 793 out_unlock: 794 unlock_page(page); 795 page_cache_release(page); 796 out: 797 return ret; 798 } 799 800 static int ocfs2_zero_extend(struct inode *inode, 801 u64 zero_to_size) 802 { 803 int ret = 0; 804 u64 start_off; 805 struct super_block *sb = inode->i_sb; 806 807 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 808 while (start_off < zero_to_size) { 809 ret = ocfs2_write_zero_page(inode, start_off); 810 if (ret < 0) { 811 mlog_errno(ret); 812 goto out; 813 } 814 815 start_off += sb->s_blocksize; 816 817 /* 818 * Very large extends have the potential to lock up 819 * the cpu for extended periods of time. 820 */ 821 cond_resched(); 822 } 823 824 out: 825 return ret; 826 } 827 828 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) 829 { 830 int ret; 831 u32 clusters_to_add; 832 struct ocfs2_inode_info *oi = OCFS2_I(inode); 833 834 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); 835 if (clusters_to_add < oi->ip_clusters) 836 clusters_to_add = 0; 837 else 838 clusters_to_add -= oi->ip_clusters; 839 840 if (clusters_to_add) { 841 ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, 842 clusters_to_add, 0); 843 if (ret) { 844 mlog_errno(ret); 845 goto out; 846 } 847 } 848 849 /* 850 * Call this even if we don't add any clusters to the tree. We 851 * still need to zero the area between the old i_size and the 852 * new i_size. 853 */ 854 ret = ocfs2_zero_extend(inode, zero_to); 855 if (ret < 0) 856 mlog_errno(ret); 857 858 out: 859 return ret; 860 } 861 862 static int ocfs2_extend_file(struct inode *inode, 863 struct buffer_head *di_bh, 864 u64 new_i_size) 865 { 866 int ret = 0; 867 struct ocfs2_inode_info *oi = OCFS2_I(inode); 868 869 BUG_ON(!di_bh); 870 871 /* setattr sometimes calls us like this. */ 872 if (new_i_size == 0) 873 goto out; 874 875 if (i_size_read(inode) == new_i_size) 876 goto out; 877 BUG_ON(new_i_size < i_size_read(inode)); 878 879 /* 880 * Fall through for converting inline data, even if the fs 881 * supports sparse files. 882 * 883 * The check for inline data here is legal - nobody can add 884 * the feature since we have i_mutex. We must check it again 885 * after acquiring ip_alloc_sem though, as paths like mmap 886 * might have raced us to converting the inode to extents. 887 */ 888 if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) 889 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 890 goto out_update_size; 891 892 /* 893 * The alloc sem blocks people in read/write from reading our 894 * allocation until we're done changing it. We depend on 895 * i_mutex to block other extend/truncate calls while we're 896 * here. 897 */ 898 down_write(&oi->ip_alloc_sem); 899 900 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 901 /* 902 * We can optimize small extends by keeping the inodes 903 * inline data. 904 */ 905 if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) { 906 up_write(&oi->ip_alloc_sem); 907 goto out_update_size; 908 } 909 910 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); 911 if (ret) { 912 up_write(&oi->ip_alloc_sem); 913 914 mlog_errno(ret); 915 goto out; 916 } 917 } 918 919 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 920 ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size); 921 922 up_write(&oi->ip_alloc_sem); 923 924 if (ret < 0) { 925 mlog_errno(ret); 926 goto out; 927 } 928 929 out_update_size: 930 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); 931 if (ret < 0) 932 mlog_errno(ret); 933 934 out: 935 return ret; 936 } 937 938 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) 939 { 940 int status = 0, size_change; 941 struct inode *inode = dentry->d_inode; 942 struct super_block *sb = inode->i_sb; 943 struct ocfs2_super *osb = OCFS2_SB(sb); 944 struct buffer_head *bh = NULL; 945 handle_t *handle = NULL; 946 int qtype; 947 struct dquot *transfer_from[MAXQUOTAS] = { }; 948 struct dquot *transfer_to[MAXQUOTAS] = { }; 949 950 mlog_entry("(0x%p, '%.*s')\n", dentry, 951 dentry->d_name.len, dentry->d_name.name); 952 953 /* ensuring we don't even attempt to truncate a symlink */ 954 if (S_ISLNK(inode->i_mode)) 955 attr->ia_valid &= ~ATTR_SIZE; 956 957 if (attr->ia_valid & ATTR_MODE) 958 mlog(0, "mode change: %d\n", attr->ia_mode); 959 if (attr->ia_valid & ATTR_UID) 960 mlog(0, "uid change: %d\n", attr->ia_uid); 961 if (attr->ia_valid & ATTR_GID) 962 mlog(0, "gid change: %d\n", attr->ia_gid); 963 if (attr->ia_valid & ATTR_SIZE) 964 mlog(0, "size change...\n"); 965 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) 966 mlog(0, "time change...\n"); 967 968 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ 969 | ATTR_GID | ATTR_UID | ATTR_MODE) 970 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { 971 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); 972 return 0; 973 } 974 975 status = inode_change_ok(inode, attr); 976 if (status) 977 return status; 978 979 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 980 if (size_change) { 981 status = ocfs2_rw_lock(inode, 1); 982 if (status < 0) { 983 mlog_errno(status); 984 goto bail; 985 } 986 } 987 988 status = ocfs2_inode_lock(inode, &bh, 1); 989 if (status < 0) { 990 if (status != -ENOENT) 991 mlog_errno(status); 992 goto bail_unlock_rw; 993 } 994 995 if (size_change && attr->ia_size != i_size_read(inode)) { 996 if (attr->ia_size > sb->s_maxbytes) { 997 status = -EFBIG; 998 goto bail_unlock; 999 } 1000 1001 if (i_size_read(inode) > attr->ia_size) { 1002 if (ocfs2_should_order_data(inode)) { 1003 status = ocfs2_begin_ordered_truncate(inode, 1004 attr->ia_size); 1005 if (status) 1006 goto bail_unlock; 1007 } 1008 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 1009 } else 1010 status = ocfs2_extend_file(inode, bh, attr->ia_size); 1011 if (status < 0) { 1012 if (status != -ENOSPC) 1013 mlog_errno(status); 1014 status = -ENOSPC; 1015 goto bail_unlock; 1016 } 1017 } 1018 1019 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 1020 (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 1021 /* 1022 * Gather pointers to quota structures so that allocation / 1023 * freeing of quota structures happens here and not inside 1024 * vfs_dq_transfer() where we have problems with lock ordering 1025 */ 1026 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid 1027 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1028 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 1029 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, 1030 USRQUOTA); 1031 transfer_from[USRQUOTA] = dqget(sb, inode->i_uid, 1032 USRQUOTA); 1033 if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) { 1034 status = -ESRCH; 1035 goto bail_unlock; 1036 } 1037 } 1038 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid 1039 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1040 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 1041 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, 1042 GRPQUOTA); 1043 transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid, 1044 GRPQUOTA); 1045 if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) { 1046 status = -ESRCH; 1047 goto bail_unlock; 1048 } 1049 } 1050 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS + 1051 2 * ocfs2_quota_trans_credits(sb)); 1052 if (IS_ERR(handle)) { 1053 status = PTR_ERR(handle); 1054 mlog_errno(status); 1055 goto bail_unlock; 1056 } 1057 status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; 1058 if (status < 0) 1059 goto bail_commit; 1060 } else { 1061 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1062 if (IS_ERR(handle)) { 1063 status = PTR_ERR(handle); 1064 mlog_errno(status); 1065 goto bail_unlock; 1066 } 1067 } 1068 1069 /* 1070 * This will intentionally not wind up calling vmtruncate(), 1071 * since all the work for a size change has been done above. 1072 * Otherwise, we could get into problems with truncate as 1073 * ip_alloc_sem is used there to protect against i_size 1074 * changes. 1075 */ 1076 status = inode_setattr(inode, attr); 1077 if (status < 0) { 1078 mlog_errno(status); 1079 goto bail_commit; 1080 } 1081 1082 status = ocfs2_mark_inode_dirty(handle, inode, bh); 1083 if (status < 0) 1084 mlog_errno(status); 1085 1086 bail_commit: 1087 ocfs2_commit_trans(osb, handle); 1088 bail_unlock: 1089 ocfs2_inode_unlock(inode, 1); 1090 bail_unlock_rw: 1091 if (size_change) 1092 ocfs2_rw_unlock(inode, 1); 1093 bail: 1094 brelse(bh); 1095 1096 /* Release quota pointers in case we acquired them */ 1097 for (qtype = 0; qtype < MAXQUOTAS; qtype++) { 1098 dqput(transfer_to[qtype]); 1099 dqput(transfer_from[qtype]); 1100 } 1101 1102 if (!status && attr->ia_valid & ATTR_MODE) { 1103 status = ocfs2_acl_chmod(inode); 1104 if (status < 0) 1105 mlog_errno(status); 1106 } 1107 1108 mlog_exit(status); 1109 return status; 1110 } 1111 1112 int ocfs2_getattr(struct vfsmount *mnt, 1113 struct dentry *dentry, 1114 struct kstat *stat) 1115 { 1116 struct inode *inode = dentry->d_inode; 1117 struct super_block *sb = dentry->d_inode->i_sb; 1118 struct ocfs2_super *osb = sb->s_fs_info; 1119 int err; 1120 1121 mlog_entry_void(); 1122 1123 err = ocfs2_inode_revalidate(dentry); 1124 if (err) { 1125 if (err != -ENOENT) 1126 mlog_errno(err); 1127 goto bail; 1128 } 1129 1130 generic_fillattr(inode, stat); 1131 1132 /* We set the blksize from the cluster size for performance */ 1133 stat->blksize = osb->s_clustersize; 1134 1135 bail: 1136 mlog_exit(err); 1137 1138 return err; 1139 } 1140 1141 int ocfs2_permission(struct inode *inode, int mask) 1142 { 1143 int ret; 1144 1145 mlog_entry_void(); 1146 1147 ret = ocfs2_inode_lock(inode, NULL, 0); 1148 if (ret) { 1149 if (ret != -ENOENT) 1150 mlog_errno(ret); 1151 goto out; 1152 } 1153 1154 ret = generic_permission(inode, mask, ocfs2_check_acl); 1155 1156 ocfs2_inode_unlock(inode, 0); 1157 out: 1158 mlog_exit(ret); 1159 return ret; 1160 } 1161 1162 static int __ocfs2_write_remove_suid(struct inode *inode, 1163 struct buffer_head *bh) 1164 { 1165 int ret; 1166 handle_t *handle; 1167 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1168 struct ocfs2_dinode *di; 1169 1170 mlog_entry("(Inode %llu, mode 0%o)\n", 1171 (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode); 1172 1173 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1174 if (IS_ERR(handle)) { 1175 ret = PTR_ERR(handle); 1176 mlog_errno(ret); 1177 goto out; 1178 } 1179 1180 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 1181 OCFS2_JOURNAL_ACCESS_WRITE); 1182 if (ret < 0) { 1183 mlog_errno(ret); 1184 goto out_trans; 1185 } 1186 1187 inode->i_mode &= ~S_ISUID; 1188 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) 1189 inode->i_mode &= ~S_ISGID; 1190 1191 di = (struct ocfs2_dinode *) bh->b_data; 1192 di->i_mode = cpu_to_le16(inode->i_mode); 1193 1194 ret = ocfs2_journal_dirty(handle, bh); 1195 if (ret < 0) 1196 mlog_errno(ret); 1197 1198 out_trans: 1199 ocfs2_commit_trans(osb, handle); 1200 out: 1201 mlog_exit(ret); 1202 return ret; 1203 } 1204 1205 /* 1206 * Will look for holes and unwritten extents in the range starting at 1207 * pos for count bytes (inclusive). 1208 */ 1209 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, 1210 size_t count) 1211 { 1212 int ret = 0; 1213 unsigned int extent_flags; 1214 u32 cpos, clusters, extent_len, phys_cpos; 1215 struct super_block *sb = inode->i_sb; 1216 1217 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 1218 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 1219 1220 while (clusters) { 1221 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 1222 &extent_flags); 1223 if (ret < 0) { 1224 mlog_errno(ret); 1225 goto out; 1226 } 1227 1228 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { 1229 ret = 1; 1230 break; 1231 } 1232 1233 if (extent_len > clusters) 1234 extent_len = clusters; 1235 1236 clusters -= extent_len; 1237 cpos += extent_len; 1238 } 1239 out: 1240 return ret; 1241 } 1242 1243 static int ocfs2_write_remove_suid(struct inode *inode) 1244 { 1245 int ret; 1246 struct buffer_head *bh = NULL; 1247 1248 ret = ocfs2_read_inode_block(inode, &bh); 1249 if (ret < 0) { 1250 mlog_errno(ret); 1251 goto out; 1252 } 1253 1254 ret = __ocfs2_write_remove_suid(inode, bh); 1255 out: 1256 brelse(bh); 1257 return ret; 1258 } 1259 1260 /* 1261 * Allocate enough extents to cover the region starting at byte offset 1262 * start for len bytes. Existing extents are skipped, any extents 1263 * added are marked as "unwritten". 1264 */ 1265 static int ocfs2_allocate_unwritten_extents(struct inode *inode, 1266 u64 start, u64 len) 1267 { 1268 int ret; 1269 u32 cpos, phys_cpos, clusters, alloc_size; 1270 u64 end = start + len; 1271 struct buffer_head *di_bh = NULL; 1272 1273 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1274 ret = ocfs2_read_inode_block(inode, &di_bh); 1275 if (ret) { 1276 mlog_errno(ret); 1277 goto out; 1278 } 1279 1280 /* 1281 * Nothing to do if the requested reservation range 1282 * fits within the inode. 1283 */ 1284 if (ocfs2_size_fits_inline_data(di_bh, end)) 1285 goto out; 1286 1287 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); 1288 if (ret) { 1289 mlog_errno(ret); 1290 goto out; 1291 } 1292 } 1293 1294 /* 1295 * We consider both start and len to be inclusive. 1296 */ 1297 cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 1298 clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len); 1299 clusters -= cpos; 1300 1301 while (clusters) { 1302 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, 1303 &alloc_size, NULL); 1304 if (ret) { 1305 mlog_errno(ret); 1306 goto out; 1307 } 1308 1309 /* 1310 * Hole or existing extent len can be arbitrary, so 1311 * cap it to our own allocation request. 1312 */ 1313 if (alloc_size > clusters) 1314 alloc_size = clusters; 1315 1316 if (phys_cpos) { 1317 /* 1318 * We already have an allocation at this 1319 * region so we can safely skip it. 1320 */ 1321 goto next; 1322 } 1323 1324 ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); 1325 if (ret) { 1326 if (ret != -ENOSPC) 1327 mlog_errno(ret); 1328 goto out; 1329 } 1330 1331 next: 1332 cpos += alloc_size; 1333 clusters -= alloc_size; 1334 } 1335 1336 ret = 0; 1337 out: 1338 1339 brelse(di_bh); 1340 return ret; 1341 } 1342 1343 /* 1344 * Truncate a byte range, avoiding pages within partial clusters. This 1345 * preserves those pages for the zeroing code to write to. 1346 */ 1347 static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, 1348 u64 byte_len) 1349 { 1350 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1351 loff_t start, end; 1352 struct address_space *mapping = inode->i_mapping; 1353 1354 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start); 1355 end = byte_start + byte_len; 1356 end = end & ~(osb->s_clustersize - 1); 1357 1358 if (start < end) { 1359 unmap_mapping_range(mapping, start, end - start, 0); 1360 truncate_inode_pages_range(mapping, start, end - 1); 1361 } 1362 } 1363 1364 static int ocfs2_zero_partial_clusters(struct inode *inode, 1365 u64 start, u64 len) 1366 { 1367 int ret = 0; 1368 u64 tmpend, end = start + len; 1369 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1370 unsigned int csize = osb->s_clustersize; 1371 handle_t *handle; 1372 1373 /* 1374 * The "start" and "end" values are NOT necessarily part of 1375 * the range whose allocation is being deleted. Rather, this 1376 * is what the user passed in with the request. We must zero 1377 * partial clusters here. There's no need to worry about 1378 * physical allocation - the zeroing code knows to skip holes. 1379 */ 1380 mlog(0, "byte start: %llu, end: %llu\n", 1381 (unsigned long long)start, (unsigned long long)end); 1382 1383 /* 1384 * If both edges are on a cluster boundary then there's no 1385 * zeroing required as the region is part of the allocation to 1386 * be truncated. 1387 */ 1388 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) 1389 goto out; 1390 1391 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1392 if (IS_ERR(handle)) { 1393 ret = PTR_ERR(handle); 1394 mlog_errno(ret); 1395 goto out; 1396 } 1397 1398 /* 1399 * We want to get the byte offset of the end of the 1st cluster. 1400 */ 1401 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); 1402 if (tmpend > end) 1403 tmpend = end; 1404 1405 mlog(0, "1st range: start: %llu, tmpend: %llu\n", 1406 (unsigned long long)start, (unsigned long long)tmpend); 1407 1408 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); 1409 if (ret) 1410 mlog_errno(ret); 1411 1412 if (tmpend < end) { 1413 /* 1414 * This may make start and end equal, but the zeroing 1415 * code will skip any work in that case so there's no 1416 * need to catch it up here. 1417 */ 1418 start = end & ~(osb->s_clustersize - 1); 1419 1420 mlog(0, "2nd range: start: %llu, end: %llu\n", 1421 (unsigned long long)start, (unsigned long long)end); 1422 1423 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); 1424 if (ret) 1425 mlog_errno(ret); 1426 } 1427 1428 ocfs2_commit_trans(osb, handle); 1429 out: 1430 return ret; 1431 } 1432 1433 static int ocfs2_remove_inode_range(struct inode *inode, 1434 struct buffer_head *di_bh, u64 byte_start, 1435 u64 byte_len) 1436 { 1437 int ret = 0; 1438 u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; 1439 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1440 struct ocfs2_cached_dealloc_ctxt dealloc; 1441 struct address_space *mapping = inode->i_mapping; 1442 struct ocfs2_extent_tree et; 1443 1444 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); 1445 ocfs2_init_dealloc_ctxt(&dealloc); 1446 1447 if (byte_len == 0) 1448 return 0; 1449 1450 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1451 ret = ocfs2_truncate_inline(inode, di_bh, byte_start, 1452 byte_start + byte_len, 0); 1453 if (ret) { 1454 mlog_errno(ret); 1455 goto out; 1456 } 1457 /* 1458 * There's no need to get fancy with the page cache 1459 * truncate of an inline-data inode. We're talking 1460 * about less than a page here, which will be cached 1461 * in the dinode buffer anyway. 1462 */ 1463 unmap_mapping_range(mapping, 0, 0, 0); 1464 truncate_inode_pages(mapping, 0); 1465 goto out; 1466 } 1467 1468 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); 1469 trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; 1470 if (trunc_len >= trunc_start) 1471 trunc_len -= trunc_start; 1472 else 1473 trunc_len = 0; 1474 1475 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n", 1476 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1477 (unsigned long long)byte_start, 1478 (unsigned long long)byte_len, trunc_start, trunc_len); 1479 1480 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); 1481 if (ret) { 1482 mlog_errno(ret); 1483 goto out; 1484 } 1485 1486 cpos = trunc_start; 1487 while (trunc_len) { 1488 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, 1489 &alloc_size, NULL); 1490 if (ret) { 1491 mlog_errno(ret); 1492 goto out; 1493 } 1494 1495 if (alloc_size > trunc_len) 1496 alloc_size = trunc_len; 1497 1498 /* Only do work for non-holes */ 1499 if (phys_cpos != 0) { 1500 ret = ocfs2_remove_btree_range(inode, &et, cpos, 1501 phys_cpos, alloc_size, 1502 &dealloc); 1503 if (ret) { 1504 mlog_errno(ret); 1505 goto out; 1506 } 1507 } 1508 1509 cpos += alloc_size; 1510 trunc_len -= alloc_size; 1511 } 1512 1513 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); 1514 1515 out: 1516 ocfs2_schedule_truncate_log_flush(osb, 1); 1517 ocfs2_run_deallocs(osb, &dealloc); 1518 1519 return ret; 1520 } 1521 1522 /* 1523 * Parts of this function taken from xfs_change_file_space() 1524 */ 1525 static int __ocfs2_change_file_space(struct file *file, struct inode *inode, 1526 loff_t f_pos, unsigned int cmd, 1527 struct ocfs2_space_resv *sr, 1528 int change_size) 1529 { 1530 int ret; 1531 s64 llen; 1532 loff_t size; 1533 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1534 struct buffer_head *di_bh = NULL; 1535 handle_t *handle; 1536 unsigned long long max_off = inode->i_sb->s_maxbytes; 1537 1538 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 1539 return -EROFS; 1540 1541 mutex_lock(&inode->i_mutex); 1542 1543 /* 1544 * This prevents concurrent writes on other nodes 1545 */ 1546 ret = ocfs2_rw_lock(inode, 1); 1547 if (ret) { 1548 mlog_errno(ret); 1549 goto out; 1550 } 1551 1552 ret = ocfs2_inode_lock(inode, &di_bh, 1); 1553 if (ret) { 1554 mlog_errno(ret); 1555 goto out_rw_unlock; 1556 } 1557 1558 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { 1559 ret = -EPERM; 1560 goto out_inode_unlock; 1561 } 1562 1563 switch (sr->l_whence) { 1564 case 0: /*SEEK_SET*/ 1565 break; 1566 case 1: /*SEEK_CUR*/ 1567 sr->l_start += f_pos; 1568 break; 1569 case 2: /*SEEK_END*/ 1570 sr->l_start += i_size_read(inode); 1571 break; 1572 default: 1573 ret = -EINVAL; 1574 goto out_inode_unlock; 1575 } 1576 sr->l_whence = 0; 1577 1578 llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len; 1579 1580 if (sr->l_start < 0 1581 || sr->l_start > max_off 1582 || (sr->l_start + llen) < 0 1583 || (sr->l_start + llen) > max_off) { 1584 ret = -EINVAL; 1585 goto out_inode_unlock; 1586 } 1587 size = sr->l_start + sr->l_len; 1588 1589 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { 1590 if (sr->l_len <= 0) { 1591 ret = -EINVAL; 1592 goto out_inode_unlock; 1593 } 1594 } 1595 1596 if (file && should_remove_suid(file->f_path.dentry)) { 1597 ret = __ocfs2_write_remove_suid(inode, di_bh); 1598 if (ret) { 1599 mlog_errno(ret); 1600 goto out_inode_unlock; 1601 } 1602 } 1603 1604 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1605 switch (cmd) { 1606 case OCFS2_IOC_RESVSP: 1607 case OCFS2_IOC_RESVSP64: 1608 /* 1609 * This takes unsigned offsets, but the signed ones we 1610 * pass have been checked against overflow above. 1611 */ 1612 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start, 1613 sr->l_len); 1614 break; 1615 case OCFS2_IOC_UNRESVSP: 1616 case OCFS2_IOC_UNRESVSP64: 1617 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start, 1618 sr->l_len); 1619 break; 1620 default: 1621 ret = -EINVAL; 1622 } 1623 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1624 if (ret) { 1625 mlog_errno(ret); 1626 goto out_inode_unlock; 1627 } 1628 1629 /* 1630 * We update c/mtime for these changes 1631 */ 1632 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1633 if (IS_ERR(handle)) { 1634 ret = PTR_ERR(handle); 1635 mlog_errno(ret); 1636 goto out_inode_unlock; 1637 } 1638 1639 if (change_size && i_size_read(inode) < size) 1640 i_size_write(inode, size); 1641 1642 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 1643 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); 1644 if (ret < 0) 1645 mlog_errno(ret); 1646 1647 ocfs2_commit_trans(osb, handle); 1648 1649 out_inode_unlock: 1650 brelse(di_bh); 1651 ocfs2_inode_unlock(inode, 1); 1652 out_rw_unlock: 1653 ocfs2_rw_unlock(inode, 1); 1654 1655 out: 1656 mutex_unlock(&inode->i_mutex); 1657 return ret; 1658 } 1659 1660 int ocfs2_change_file_space(struct file *file, unsigned int cmd, 1661 struct ocfs2_space_resv *sr) 1662 { 1663 struct inode *inode = file->f_path.dentry->d_inode; 1664 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1665 1666 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && 1667 !ocfs2_writes_unwritten_extents(osb)) 1668 return -ENOTTY; 1669 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) && 1670 !ocfs2_sparse_alloc(osb)) 1671 return -ENOTTY; 1672 1673 if (!S_ISREG(inode->i_mode)) 1674 return -EINVAL; 1675 1676 if (!(file->f_mode & FMODE_WRITE)) 1677 return -EBADF; 1678 1679 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1680 } 1681 1682 static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, 1683 loff_t len) 1684 { 1685 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1686 struct ocfs2_space_resv sr; 1687 int change_size = 1; 1688 1689 if (!ocfs2_writes_unwritten_extents(osb)) 1690 return -EOPNOTSUPP; 1691 1692 if (S_ISDIR(inode->i_mode)) 1693 return -ENODEV; 1694 1695 if (mode & FALLOC_FL_KEEP_SIZE) 1696 change_size = 0; 1697 1698 sr.l_whence = 0; 1699 sr.l_start = (s64)offset; 1700 sr.l_len = (s64)len; 1701 1702 return __ocfs2_change_file_space(NULL, inode, offset, 1703 OCFS2_IOC_RESVSP64, &sr, change_size); 1704 } 1705 1706 int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, 1707 size_t count) 1708 { 1709 int ret = 0; 1710 unsigned int extent_flags; 1711 u32 cpos, clusters, extent_len, phys_cpos; 1712 struct super_block *sb = inode->i_sb; 1713 1714 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || 1715 !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) || 1716 OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1717 return 0; 1718 1719 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 1720 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 1721 1722 while (clusters) { 1723 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 1724 &extent_flags); 1725 if (ret < 0) { 1726 mlog_errno(ret); 1727 goto out; 1728 } 1729 1730 if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) { 1731 ret = 1; 1732 break; 1733 } 1734 1735 if (extent_len > clusters) 1736 extent_len = clusters; 1737 1738 clusters -= extent_len; 1739 cpos += extent_len; 1740 } 1741 out: 1742 return ret; 1743 } 1744 1745 static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 1746 loff_t pos, size_t count, 1747 int *meta_level) 1748 { 1749 int ret; 1750 struct buffer_head *di_bh = NULL; 1751 u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 1752 u32 clusters = 1753 ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; 1754 1755 ret = ocfs2_inode_lock(inode, &di_bh, 1); 1756 if (ret) { 1757 mlog_errno(ret); 1758 goto out; 1759 } 1760 1761 *meta_level = 1; 1762 1763 ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); 1764 if (ret) 1765 mlog_errno(ret); 1766 out: 1767 brelse(di_bh); 1768 return ret; 1769 } 1770 1771 static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1772 loff_t *ppos, 1773 size_t count, 1774 int appending, 1775 int *direct_io, 1776 int *has_refcount) 1777 { 1778 int ret = 0, meta_level = 0; 1779 struct inode *inode = dentry->d_inode; 1780 loff_t saved_pos, end; 1781 1782 /* 1783 * We start with a read level meta lock and only jump to an ex 1784 * if we need to make modifications here. 1785 */ 1786 for(;;) { 1787 ret = ocfs2_inode_lock(inode, NULL, meta_level); 1788 if (ret < 0) { 1789 meta_level = -1; 1790 mlog_errno(ret); 1791 goto out; 1792 } 1793 1794 /* Clear suid / sgid if necessary. We do this here 1795 * instead of later in the write path because 1796 * remove_suid() calls ->setattr without any hint that 1797 * we may have already done our cluster locking. Since 1798 * ocfs2_setattr() *must* take cluster locks to 1799 * proceeed, this will lead us to recursively lock the 1800 * inode. There's also the dinode i_size state which 1801 * can be lost via setattr during extending writes (we 1802 * set inode->i_size at the end of a write. */ 1803 if (should_remove_suid(dentry)) { 1804 if (meta_level == 0) { 1805 ocfs2_inode_unlock(inode, meta_level); 1806 meta_level = 1; 1807 continue; 1808 } 1809 1810 ret = ocfs2_write_remove_suid(inode); 1811 if (ret < 0) { 1812 mlog_errno(ret); 1813 goto out_unlock; 1814 } 1815 } 1816 1817 /* work on a copy of ppos until we're sure that we won't have 1818 * to recalculate it due to relocking. */ 1819 if (appending) { 1820 saved_pos = i_size_read(inode); 1821 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); 1822 } else { 1823 saved_pos = *ppos; 1824 } 1825 1826 end = saved_pos + count; 1827 1828 ret = ocfs2_check_range_for_refcount(inode, saved_pos, count); 1829 if (ret == 1) { 1830 ocfs2_inode_unlock(inode, meta_level); 1831 meta_level = -1; 1832 1833 ret = ocfs2_prepare_inode_for_refcount(inode, 1834 saved_pos, 1835 count, 1836 &meta_level); 1837 if (has_refcount) 1838 *has_refcount = 1; 1839 } 1840 1841 if (ret < 0) { 1842 mlog_errno(ret); 1843 goto out_unlock; 1844 } 1845 1846 /* 1847 * Skip the O_DIRECT checks if we don't need 1848 * them. 1849 */ 1850 if (!direct_io || !(*direct_io)) 1851 break; 1852 1853 /* 1854 * There's no sane way to do direct writes to an inode 1855 * with inline data. 1856 */ 1857 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1858 *direct_io = 0; 1859 break; 1860 } 1861 1862 if (has_refcount && *has_refcount == 1) { 1863 *direct_io = 0; 1864 break; 1865 } 1866 /* 1867 * Allowing concurrent direct writes means 1868 * i_size changes wouldn't be synchronized, so 1869 * one node could wind up truncating another 1870 * nodes writes. 1871 */ 1872 if (end > i_size_read(inode)) { 1873 *direct_io = 0; 1874 break; 1875 } 1876 1877 /* 1878 * We don't fill holes during direct io, so 1879 * check for them here. If any are found, the 1880 * caller will have to retake some cluster 1881 * locks and initiate the io as buffered. 1882 */ 1883 ret = ocfs2_check_range_for_holes(inode, saved_pos, count); 1884 if (ret == 1) { 1885 *direct_io = 0; 1886 ret = 0; 1887 } else if (ret < 0) 1888 mlog_errno(ret); 1889 break; 1890 } 1891 1892 if (appending) 1893 *ppos = saved_pos; 1894 1895 out_unlock: 1896 if (meta_level >= 0) 1897 ocfs2_inode_unlock(inode, meta_level); 1898 1899 out: 1900 return ret; 1901 } 1902 1903 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 1904 const struct iovec *iov, 1905 unsigned long nr_segs, 1906 loff_t pos) 1907 { 1908 int ret, direct_io, appending, rw_level, have_alloc_sem = 0; 1909 int can_do_direct, has_refcount = 0; 1910 ssize_t written = 0; 1911 size_t ocount; /* original count */ 1912 size_t count; /* after file limit checks */ 1913 loff_t old_size, *ppos = &iocb->ki_pos; 1914 u32 old_clusters; 1915 struct file *file = iocb->ki_filp; 1916 struct inode *inode = file->f_path.dentry->d_inode; 1917 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1918 1919 mlog_entry("(0x%p, %u, '%.*s')\n", file, 1920 (unsigned int)nr_segs, 1921 file->f_path.dentry->d_name.len, 1922 file->f_path.dentry->d_name.name); 1923 1924 if (iocb->ki_left == 0) 1925 return 0; 1926 1927 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 1928 1929 appending = file->f_flags & O_APPEND ? 1 : 0; 1930 direct_io = file->f_flags & O_DIRECT ? 1 : 0; 1931 1932 mutex_lock(&inode->i_mutex); 1933 1934 relock: 1935 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 1936 if (direct_io) { 1937 down_read(&inode->i_alloc_sem); 1938 have_alloc_sem = 1; 1939 } 1940 1941 /* concurrent O_DIRECT writes are allowed */ 1942 rw_level = !direct_io; 1943 ret = ocfs2_rw_lock(inode, rw_level); 1944 if (ret < 0) { 1945 mlog_errno(ret); 1946 goto out_sems; 1947 } 1948 1949 can_do_direct = direct_io; 1950 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, 1951 iocb->ki_left, appending, 1952 &can_do_direct, &has_refcount); 1953 if (ret < 0) { 1954 mlog_errno(ret); 1955 goto out; 1956 } 1957 1958 /* 1959 * We can't complete the direct I/O as requested, fall back to 1960 * buffered I/O. 1961 */ 1962 if (direct_io && !can_do_direct) { 1963 ocfs2_rw_unlock(inode, rw_level); 1964 up_read(&inode->i_alloc_sem); 1965 1966 have_alloc_sem = 0; 1967 rw_level = -1; 1968 1969 direct_io = 0; 1970 goto relock; 1971 } 1972 1973 /* 1974 * To later detect whether a journal commit for sync writes is 1975 * necessary, we sample i_size, and cluster count here. 1976 */ 1977 old_size = i_size_read(inode); 1978 old_clusters = OCFS2_I(inode)->ip_clusters; 1979 1980 /* communicate with ocfs2_dio_end_io */ 1981 ocfs2_iocb_set_rw_locked(iocb, rw_level); 1982 1983 if (direct_io) { 1984 ret = generic_segment_checks(iov, &nr_segs, &ocount, 1985 VERIFY_READ); 1986 if (ret) 1987 goto out_dio; 1988 1989 count = ocount; 1990 ret = generic_write_checks(file, ppos, &count, 1991 S_ISBLK(inode->i_mode)); 1992 if (ret) 1993 goto out_dio; 1994 1995 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 1996 ppos, count, ocount); 1997 if (written < 0) { 1998 /* 1999 * direct write may have instantiated a few 2000 * blocks outside i_size. Trim these off again. 2001 * Don't need i_size_read because we hold i_mutex. 2002 */ 2003 if (*ppos + count > inode->i_size) 2004 vmtruncate(inode, inode->i_size); 2005 ret = written; 2006 goto out_dio; 2007 } 2008 } else { 2009 written = __generic_file_aio_write(iocb, iov, nr_segs, ppos); 2010 } 2011 2012 out_dio: 2013 /* buffered aio wouldn't have proper lock coverage today */ 2014 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2015 2016 if ((file->f_flags & O_DSYNC && !direct_io) || IS_SYNC(inode) || 2017 (file->f_flags & O_DIRECT && has_refcount)) { 2018 ret = filemap_fdatawrite_range(file->f_mapping, pos, 2019 pos + count - 1); 2020 if (ret < 0) 2021 written = ret; 2022 2023 if (!ret && (old_size != i_size_read(inode) || 2024 old_clusters != OCFS2_I(inode)->ip_clusters || 2025 has_refcount)) { 2026 ret = jbd2_journal_force_commit(osb->journal->j_journal); 2027 if (ret < 0) 2028 written = ret; 2029 } 2030 2031 if (!ret) 2032 ret = filemap_fdatawait_range(file->f_mapping, pos, 2033 pos + count - 1); 2034 } 2035 2036 /* 2037 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 2038 * function pointer which is called when o_direct io completes so that 2039 * it can unlock our rw lock. (it's the clustered equivalent of 2040 * i_alloc_sem; protects truncate from racing with pending ios). 2041 * Unfortunately there are error cases which call end_io and others 2042 * that don't. so we don't have to unlock the rw_lock if either an 2043 * async dio is going to do it in the future or an end_io after an 2044 * error has already done it. 2045 */ 2046 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2047 rw_level = -1; 2048 have_alloc_sem = 0; 2049 } 2050 2051 out: 2052 if (rw_level != -1) 2053 ocfs2_rw_unlock(inode, rw_level); 2054 2055 out_sems: 2056 if (have_alloc_sem) 2057 up_read(&inode->i_alloc_sem); 2058 2059 mutex_unlock(&inode->i_mutex); 2060 2061 if (written) 2062 ret = written; 2063 mlog_exit(ret); 2064 return ret; 2065 } 2066 2067 static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, 2068 struct file *out, 2069 struct splice_desc *sd) 2070 { 2071 int ret; 2072 2073 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, 2074 sd->total_len, 0, NULL, NULL); 2075 if (ret < 0) { 2076 mlog_errno(ret); 2077 return ret; 2078 } 2079 2080 return splice_from_pipe_feed(pipe, sd, pipe_to_file); 2081 } 2082 2083 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, 2084 struct file *out, 2085 loff_t *ppos, 2086 size_t len, 2087 unsigned int flags) 2088 { 2089 int ret; 2090 struct address_space *mapping = out->f_mapping; 2091 struct inode *inode = mapping->host; 2092 struct splice_desc sd = { 2093 .total_len = len, 2094 .flags = flags, 2095 .pos = *ppos, 2096 .u.file = out, 2097 }; 2098 2099 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, 2100 (unsigned int)len, 2101 out->f_path.dentry->d_name.len, 2102 out->f_path.dentry->d_name.name); 2103 2104 if (pipe->inode) 2105 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); 2106 2107 splice_from_pipe_begin(&sd); 2108 do { 2109 ret = splice_from_pipe_next(pipe, &sd); 2110 if (ret <= 0) 2111 break; 2112 2113 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 2114 ret = ocfs2_rw_lock(inode, 1); 2115 if (ret < 0) 2116 mlog_errno(ret); 2117 else { 2118 ret = ocfs2_splice_to_file(pipe, out, &sd); 2119 ocfs2_rw_unlock(inode, 1); 2120 } 2121 mutex_unlock(&inode->i_mutex); 2122 } while (ret > 0); 2123 splice_from_pipe_end(pipe, &sd); 2124 2125 if (pipe->inode) 2126 mutex_unlock(&pipe->inode->i_mutex); 2127 2128 if (sd.num_spliced) 2129 ret = sd.num_spliced; 2130 2131 if (ret > 0) { 2132 unsigned long nr_pages; 2133 int err; 2134 2135 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 2136 2137 err = generic_write_sync(out, *ppos, ret); 2138 if (err) 2139 ret = err; 2140 else 2141 *ppos += ret; 2142 2143 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 2144 } 2145 2146 mlog_exit(ret); 2147 return ret; 2148 } 2149 2150 static ssize_t ocfs2_file_splice_read(struct file *in, 2151 loff_t *ppos, 2152 struct pipe_inode_info *pipe, 2153 size_t len, 2154 unsigned int flags) 2155 { 2156 int ret = 0, lock_level = 0; 2157 struct inode *inode = in->f_path.dentry->d_inode; 2158 2159 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, 2160 (unsigned int)len, 2161 in->f_path.dentry->d_name.len, 2162 in->f_path.dentry->d_name.name); 2163 2164 /* 2165 * See the comment in ocfs2_file_aio_read() 2166 */ 2167 ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level); 2168 if (ret < 0) { 2169 mlog_errno(ret); 2170 goto bail; 2171 } 2172 ocfs2_inode_unlock(inode, lock_level); 2173 2174 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 2175 2176 bail: 2177 mlog_exit(ret); 2178 return ret; 2179 } 2180 2181 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, 2182 const struct iovec *iov, 2183 unsigned long nr_segs, 2184 loff_t pos) 2185 { 2186 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; 2187 struct file *filp = iocb->ki_filp; 2188 struct inode *inode = filp->f_path.dentry->d_inode; 2189 2190 mlog_entry("(0x%p, %u, '%.*s')\n", filp, 2191 (unsigned int)nr_segs, 2192 filp->f_path.dentry->d_name.len, 2193 filp->f_path.dentry->d_name.name); 2194 2195 if (!inode) { 2196 ret = -EINVAL; 2197 mlog_errno(ret); 2198 goto bail; 2199 } 2200 2201 /* 2202 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 2203 * need locks to protect pending reads from racing with truncate. 2204 */ 2205 if (filp->f_flags & O_DIRECT) { 2206 down_read(&inode->i_alloc_sem); 2207 have_alloc_sem = 1; 2208 2209 ret = ocfs2_rw_lock(inode, 0); 2210 if (ret < 0) { 2211 mlog_errno(ret); 2212 goto bail; 2213 } 2214 rw_level = 0; 2215 /* communicate with ocfs2_dio_end_io */ 2216 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2217 } 2218 2219 /* 2220 * We're fine letting folks race truncates and extending 2221 * writes with read across the cluster, just like they can 2222 * locally. Hence no rw_lock during read. 2223 * 2224 * Take and drop the meta data lock to update inode fields 2225 * like i_size. This allows the checks down below 2226 * generic_file_aio_read() a chance of actually working. 2227 */ 2228 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); 2229 if (ret < 0) { 2230 mlog_errno(ret); 2231 goto bail; 2232 } 2233 ocfs2_inode_unlock(inode, lock_level); 2234 2235 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 2236 if (ret == -EINVAL) 2237 mlog(0, "generic_file_aio_read returned -EINVAL\n"); 2238 2239 /* buffered aio wouldn't have proper lock coverage today */ 2240 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 2241 2242 /* see ocfs2_file_aio_write */ 2243 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2244 rw_level = -1; 2245 have_alloc_sem = 0; 2246 } 2247 2248 bail: 2249 if (have_alloc_sem) 2250 up_read(&inode->i_alloc_sem); 2251 if (rw_level != -1) 2252 ocfs2_rw_unlock(inode, rw_level); 2253 mlog_exit(ret); 2254 2255 return ret; 2256 } 2257 2258 const struct inode_operations ocfs2_file_iops = { 2259 .setattr = ocfs2_setattr, 2260 .getattr = ocfs2_getattr, 2261 .permission = ocfs2_permission, 2262 .setxattr = generic_setxattr, 2263 .getxattr = generic_getxattr, 2264 .listxattr = ocfs2_listxattr, 2265 .removexattr = generic_removexattr, 2266 .fallocate = ocfs2_fallocate, 2267 .fiemap = ocfs2_fiemap, 2268 }; 2269 2270 const struct inode_operations ocfs2_special_file_iops = { 2271 .setattr = ocfs2_setattr, 2272 .getattr = ocfs2_getattr, 2273 .permission = ocfs2_permission, 2274 }; 2275 2276 /* 2277 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with 2278 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! 2279 */ 2280 const struct file_operations ocfs2_fops = { 2281 .llseek = generic_file_llseek, 2282 .read = do_sync_read, 2283 .write = do_sync_write, 2284 .mmap = ocfs2_mmap, 2285 .fsync = ocfs2_sync_file, 2286 .release = ocfs2_file_release, 2287 .open = ocfs2_file_open, 2288 .aio_read = ocfs2_file_aio_read, 2289 .aio_write = ocfs2_file_aio_write, 2290 .unlocked_ioctl = ocfs2_ioctl, 2291 #ifdef CONFIG_COMPAT 2292 .compat_ioctl = ocfs2_compat_ioctl, 2293 #endif 2294 .lock = ocfs2_lock, 2295 .flock = ocfs2_flock, 2296 .splice_read = ocfs2_file_splice_read, 2297 .splice_write = ocfs2_file_splice_write, 2298 }; 2299 2300 const struct file_operations ocfs2_dops = { 2301 .llseek = generic_file_llseek, 2302 .read = generic_read_dir, 2303 .readdir = ocfs2_readdir, 2304 .fsync = ocfs2_sync_file, 2305 .release = ocfs2_dir_release, 2306 .open = ocfs2_dir_open, 2307 .unlocked_ioctl = ocfs2_ioctl, 2308 #ifdef CONFIG_COMPAT 2309 .compat_ioctl = ocfs2_compat_ioctl, 2310 #endif 2311 .lock = ocfs2_lock, 2312 .flock = ocfs2_flock, 2313 }; 2314 2315 /* 2316 * POSIX-lockless variants of our file_operations. 2317 * 2318 * These will be used if the underlying cluster stack does not support 2319 * posix file locking, if the user passes the "localflocks" mount 2320 * option, or if we have a local-only fs. 2321 * 2322 * ocfs2_flock is in here because all stacks handle UNIX file locks, 2323 * so we still want it in the case of no stack support for 2324 * plocks. Internally, it will do the right thing when asked to ignore 2325 * the cluster. 2326 */ 2327 const struct file_operations ocfs2_fops_no_plocks = { 2328 .llseek = generic_file_llseek, 2329 .read = do_sync_read, 2330 .write = do_sync_write, 2331 .mmap = ocfs2_mmap, 2332 .fsync = ocfs2_sync_file, 2333 .release = ocfs2_file_release, 2334 .open = ocfs2_file_open, 2335 .aio_read = ocfs2_file_aio_read, 2336 .aio_write = ocfs2_file_aio_write, 2337 .unlocked_ioctl = ocfs2_ioctl, 2338 #ifdef CONFIG_COMPAT 2339 .compat_ioctl = ocfs2_compat_ioctl, 2340 #endif 2341 .flock = ocfs2_flock, 2342 .splice_read = ocfs2_file_splice_read, 2343 .splice_write = ocfs2_file_splice_write, 2344 }; 2345 2346 const struct file_operations ocfs2_dops_no_plocks = { 2347 .llseek = generic_file_llseek, 2348 .read = generic_read_dir, 2349 .readdir = ocfs2_readdir, 2350 .fsync = ocfs2_sync_file, 2351 .release = ocfs2_dir_release, 2352 .open = ocfs2_dir_open, 2353 .unlocked_ioctl = ocfs2_ioctl, 2354 #ifdef CONFIG_COMPAT 2355 .compat_ioctl = ocfs2_compat_ioctl, 2356 #endif 2357 .flock = ocfs2_flock, 2358 }; 2359