1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * file.c 5 * 6 * File open, close, extend, truncate 7 * 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public 12 * License as published by the Free Software Foundation; either 13 * version 2 of the License, or (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * General Public License for more details. 19 * 20 * You should have received a copy of the GNU General Public 21 * License along with this program; if not, write to the 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 * Boston, MA 021110-1307, USA. 24 */ 25 26 #include <linux/capability.h> 27 #include <linux/fs.h> 28 #include <linux/types.h> 29 #include <linux/slab.h> 30 #include <linux/highmem.h> 31 #include <linux/pagemap.h> 32 #include <linux/uio.h> 33 #include <linux/sched.h> 34 #include <linux/splice.h> 35 #include <linux/mount.h> 36 #include <linux/writeback.h> 37 #include <linux/falloc.h> 38 39 #define MLOG_MASK_PREFIX ML_INODE 40 #include <cluster/masklog.h> 41 42 #include "ocfs2.h" 43 44 #include "alloc.h" 45 #include "aops.h" 46 #include "dir.h" 47 #include "dlmglue.h" 48 #include "extent_map.h" 49 #include "file.h" 50 #include "sysfile.h" 51 #include "inode.h" 52 #include "ioctl.h" 53 #include "journal.h" 54 #include "mmap.h" 55 #include "suballoc.h" 56 #include "super.h" 57 58 #include "buffer_head_io.h" 59 60 static int ocfs2_sync_inode(struct inode *inode) 61 { 62 filemap_fdatawrite(inode->i_mapping); 63 return sync_mapping_buffers(inode->i_mapping); 64 } 65 66 static int ocfs2_file_open(struct inode *inode, struct file *file) 67 { 68 int status; 69 int mode = file->f_flags; 70 struct ocfs2_inode_info *oi = OCFS2_I(inode); 71 72 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 73 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); 74 75 spin_lock(&oi->ip_lock); 76 77 /* Check that the inode hasn't been wiped from disk by another 78 * node. If it hasn't then we're safe as long as we hold the 79 * spin lock until our increment of open count. */ 80 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 81 spin_unlock(&oi->ip_lock); 82 83 status = -ENOENT; 84 goto leave; 85 } 86 87 if (mode & O_DIRECT) 88 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; 89 90 oi->ip_open_count++; 91 spin_unlock(&oi->ip_lock); 92 status = 0; 93 leave: 94 mlog_exit(status); 95 return status; 96 } 97 98 static int ocfs2_file_release(struct inode *inode, struct file *file) 99 { 100 struct ocfs2_inode_info *oi = OCFS2_I(inode); 101 102 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 103 file->f_path.dentry->d_name.len, 104 file->f_path.dentry->d_name.name); 105 106 spin_lock(&oi->ip_lock); 107 if (!--oi->ip_open_count) 108 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 109 spin_unlock(&oi->ip_lock); 110 111 mlog_exit(0); 112 113 return 0; 114 } 115 116 static int ocfs2_sync_file(struct file *file, 117 struct dentry *dentry, 118 int datasync) 119 { 120 int err = 0; 121 journal_t *journal; 122 struct inode *inode = dentry->d_inode; 123 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 124 125 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 126 dentry->d_name.len, dentry->d_name.name); 127 128 err = ocfs2_sync_inode(dentry->d_inode); 129 if (err) 130 goto bail; 131 132 journal = osb->journal->j_journal; 133 err = journal_force_commit(journal); 134 135 bail: 136 mlog_exit(err); 137 138 return (err < 0) ? -EIO : 0; 139 } 140 141 int ocfs2_should_update_atime(struct inode *inode, 142 struct vfsmount *vfsmnt) 143 { 144 struct timespec now; 145 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 146 147 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 148 return 0; 149 150 if ((inode->i_flags & S_NOATIME) || 151 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) 152 return 0; 153 154 /* 155 * We can be called with no vfsmnt structure - NFSD will 156 * sometimes do this. 157 * 158 * Note that our action here is different than touch_atime() - 159 * if we can't tell whether this is a noatime mount, then we 160 * don't know whether to trust the value of s_atime_quantum. 161 */ 162 if (vfsmnt == NULL) 163 return 0; 164 165 if ((vfsmnt->mnt_flags & MNT_NOATIME) || 166 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) 167 return 0; 168 169 if (vfsmnt->mnt_flags & MNT_RELATIME) { 170 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || 171 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) 172 return 1; 173 174 return 0; 175 } 176 177 now = CURRENT_TIME; 178 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) 179 return 0; 180 else 181 return 1; 182 } 183 184 int ocfs2_update_inode_atime(struct inode *inode, 185 struct buffer_head *bh) 186 { 187 int ret; 188 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 189 handle_t *handle; 190 191 mlog_entry_void(); 192 193 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 194 if (handle == NULL) { 195 ret = -ENOMEM; 196 mlog_errno(ret); 197 goto out; 198 } 199 200 inode->i_atime = CURRENT_TIME; 201 ret = ocfs2_mark_inode_dirty(handle, inode, bh); 202 if (ret < 0) 203 mlog_errno(ret); 204 205 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 206 out: 207 mlog_exit(ret); 208 return ret; 209 } 210 211 static int ocfs2_set_inode_size(handle_t *handle, 212 struct inode *inode, 213 struct buffer_head *fe_bh, 214 u64 new_i_size) 215 { 216 int status; 217 218 mlog_entry_void(); 219 i_size_write(inode, new_i_size); 220 inode->i_blocks = ocfs2_inode_sector_count(inode); 221 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 222 223 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 224 if (status < 0) { 225 mlog_errno(status); 226 goto bail; 227 } 228 229 bail: 230 mlog_exit(status); 231 return status; 232 } 233 234 static int ocfs2_simple_size_update(struct inode *inode, 235 struct buffer_head *di_bh, 236 u64 new_i_size) 237 { 238 int ret; 239 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 240 handle_t *handle = NULL; 241 242 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 243 if (handle == NULL) { 244 ret = -ENOMEM; 245 mlog_errno(ret); 246 goto out; 247 } 248 249 ret = ocfs2_set_inode_size(handle, inode, di_bh, 250 new_i_size); 251 if (ret < 0) 252 mlog_errno(ret); 253 254 ocfs2_commit_trans(osb, handle); 255 out: 256 return ret; 257 } 258 259 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 260 struct inode *inode, 261 struct buffer_head *fe_bh, 262 u64 new_i_size) 263 { 264 int status; 265 handle_t *handle; 266 struct ocfs2_dinode *di; 267 u64 cluster_bytes; 268 269 mlog_entry_void(); 270 271 /* TODO: This needs to actually orphan the inode in this 272 * transaction. */ 273 274 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 275 if (IS_ERR(handle)) { 276 status = PTR_ERR(handle); 277 mlog_errno(status); 278 goto out; 279 } 280 281 status = ocfs2_journal_access(handle, inode, fe_bh, 282 OCFS2_JOURNAL_ACCESS_WRITE); 283 if (status < 0) { 284 mlog_errno(status); 285 goto out_commit; 286 } 287 288 /* 289 * Do this before setting i_size. 290 */ 291 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 292 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size, 293 cluster_bytes); 294 if (status) { 295 mlog_errno(status); 296 goto out_commit; 297 } 298 299 i_size_write(inode, new_i_size); 300 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); 301 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 302 303 di = (struct ocfs2_dinode *) fe_bh->b_data; 304 di->i_size = cpu_to_le64(new_i_size); 305 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 306 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 307 308 status = ocfs2_journal_dirty(handle, fe_bh); 309 if (status < 0) 310 mlog_errno(status); 311 312 out_commit: 313 ocfs2_commit_trans(osb, handle); 314 out: 315 316 mlog_exit(status); 317 return status; 318 } 319 320 static int ocfs2_truncate_file(struct inode *inode, 321 struct buffer_head *di_bh, 322 u64 new_i_size) 323 { 324 int status = 0; 325 struct ocfs2_dinode *fe = NULL; 326 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 327 struct ocfs2_truncate_context *tc = NULL; 328 329 mlog_entry("(inode = %llu, new_i_size = %llu\n", 330 (unsigned long long)OCFS2_I(inode)->ip_blkno, 331 (unsigned long long)new_i_size); 332 333 fe = (struct ocfs2_dinode *) di_bh->b_data; 334 if (!OCFS2_IS_VALID_DINODE(fe)) { 335 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 336 status = -EIO; 337 goto bail; 338 } 339 340 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 341 "Inode %llu, inode i_size = %lld != di " 342 "i_size = %llu, i_flags = 0x%x\n", 343 (unsigned long long)OCFS2_I(inode)->ip_blkno, 344 i_size_read(inode), 345 (unsigned long long)le64_to_cpu(fe->i_size), 346 le32_to_cpu(fe->i_flags)); 347 348 if (new_i_size > le64_to_cpu(fe->i_size)) { 349 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", 350 (unsigned long long)le64_to_cpu(fe->i_size), 351 (unsigned long long)new_i_size); 352 status = -EINVAL; 353 mlog_errno(status); 354 goto bail; 355 } 356 357 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", 358 (unsigned long long)le64_to_cpu(fe->i_blkno), 359 (unsigned long long)le64_to_cpu(fe->i_size), 360 (unsigned long long)new_i_size); 361 362 /* lets handle the simple truncate cases before doing any more 363 * cluster locking. */ 364 if (new_i_size == le64_to_cpu(fe->i_size)) 365 goto bail; 366 367 down_write(&OCFS2_I(inode)->ip_alloc_sem); 368 369 /* This forces other nodes to sync and drop their pages. Do 370 * this even if we have a truncate without allocation change - 371 * ocfs2 cluster sizes can be much greater than page size, so 372 * we have to truncate them anyway. */ 373 status = ocfs2_data_lock(inode, 1); 374 if (status < 0) { 375 up_write(&OCFS2_I(inode)->ip_alloc_sem); 376 377 mlog_errno(status); 378 goto bail; 379 } 380 381 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 382 truncate_inode_pages(inode->i_mapping, new_i_size); 383 384 /* alright, we're going to need to do a full blown alloc size 385 * change. Orphan the inode so that recovery can complete the 386 * truncate if necessary. This does the task of marking 387 * i_size. */ 388 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 389 if (status < 0) { 390 mlog_errno(status); 391 goto bail_unlock_data; 392 } 393 394 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 395 if (status < 0) { 396 mlog_errno(status); 397 goto bail_unlock_data; 398 } 399 400 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 401 if (status < 0) { 402 mlog_errno(status); 403 goto bail_unlock_data; 404 } 405 406 /* TODO: orphan dir cleanup here. */ 407 bail_unlock_data: 408 ocfs2_data_unlock(inode, 1); 409 410 up_write(&OCFS2_I(inode)->ip_alloc_sem); 411 412 bail: 413 414 mlog_exit(status); 415 return status; 416 } 417 418 /* 419 * extend allocation only here. 420 * we'll update all the disk stuff, and oip->alloc_size 421 * 422 * expect stuff to be locked, a transaction started and enough data / 423 * metadata reservations in the contexts. 424 * 425 * Will return -EAGAIN, and a reason if a restart is needed. 426 * If passed in, *reason will always be set, even in error. 427 */ 428 int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 429 struct inode *inode, 430 u32 *logical_offset, 431 u32 clusters_to_add, 432 int mark_unwritten, 433 struct buffer_head *fe_bh, 434 handle_t *handle, 435 struct ocfs2_alloc_context *data_ac, 436 struct ocfs2_alloc_context *meta_ac, 437 enum ocfs2_alloc_restarted *reason_ret) 438 { 439 int status = 0; 440 int free_extents; 441 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 442 enum ocfs2_alloc_restarted reason = RESTART_NONE; 443 u32 bit_off, num_bits; 444 u64 block; 445 u8 flags = 0; 446 447 BUG_ON(!clusters_to_add); 448 449 if (mark_unwritten) 450 flags = OCFS2_EXT_UNWRITTEN; 451 452 free_extents = ocfs2_num_free_extents(osb, inode, fe); 453 if (free_extents < 0) { 454 status = free_extents; 455 mlog_errno(status); 456 goto leave; 457 } 458 459 /* there are two cases which could cause us to EAGAIN in the 460 * we-need-more-metadata case: 461 * 1) we haven't reserved *any* 462 * 2) we are so fragmented, we've needed to add metadata too 463 * many times. */ 464 if (!free_extents && !meta_ac) { 465 mlog(0, "we haven't reserved any metadata!\n"); 466 status = -EAGAIN; 467 reason = RESTART_META; 468 goto leave; 469 } else if ((!free_extents) 470 && (ocfs2_alloc_context_bits_left(meta_ac) 471 < ocfs2_extend_meta_needed(fe))) { 472 mlog(0, "filesystem is really fragmented...\n"); 473 status = -EAGAIN; 474 reason = RESTART_META; 475 goto leave; 476 } 477 478 status = ocfs2_claim_clusters(osb, handle, data_ac, 1, 479 &bit_off, &num_bits); 480 if (status < 0) { 481 if (status != -ENOSPC) 482 mlog_errno(status); 483 goto leave; 484 } 485 486 BUG_ON(num_bits > clusters_to_add); 487 488 /* reserve our write early -- insert_extent may update the inode */ 489 status = ocfs2_journal_access(handle, inode, fe_bh, 490 OCFS2_JOURNAL_ACCESS_WRITE); 491 if (status < 0) { 492 mlog_errno(status); 493 goto leave; 494 } 495 496 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 497 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 498 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 499 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, 500 *logical_offset, block, num_bits, 501 flags, meta_ac); 502 if (status < 0) { 503 mlog_errno(status); 504 goto leave; 505 } 506 507 status = ocfs2_journal_dirty(handle, fe_bh); 508 if (status < 0) { 509 mlog_errno(status); 510 goto leave; 511 } 512 513 clusters_to_add -= num_bits; 514 *logical_offset += num_bits; 515 516 if (clusters_to_add) { 517 mlog(0, "need to alloc once more, clusters = %u, wanted = " 518 "%u\n", fe->i_clusters, clusters_to_add); 519 status = -EAGAIN; 520 reason = RESTART_TRANS; 521 } 522 523 leave: 524 mlog_exit(status); 525 if (reason_ret) 526 *reason_ret = reason; 527 return status; 528 } 529 530 /* 531 * For a given allocation, determine which allocators will need to be 532 * accessed, and lock them, reserving the appropriate number of bits. 533 * 534 * Sparse file systems call this from ocfs2_write_begin_nolock() 535 * and ocfs2_allocate_unwritten_extents(). 536 * 537 * File systems which don't support holes call this from 538 * ocfs2_extend_allocation(). 539 */ 540 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, 541 u32 clusters_to_add, u32 extents_to_split, 542 struct ocfs2_alloc_context **data_ac, 543 struct ocfs2_alloc_context **meta_ac) 544 { 545 int ret = 0, num_free_extents; 546 unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; 547 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 548 549 *meta_ac = NULL; 550 if (data_ac) 551 *data_ac = NULL; 552 553 BUG_ON(clusters_to_add != 0 && data_ac == NULL); 554 555 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " 556 "clusters_to_add = %u, extents_to_split = %u\n", 557 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), 558 le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split); 559 560 num_free_extents = ocfs2_num_free_extents(osb, inode, di); 561 if (num_free_extents < 0) { 562 ret = num_free_extents; 563 mlog_errno(ret); 564 goto out; 565 } 566 567 /* 568 * Sparse allocation file systems need to be more conservative 569 * with reserving room for expansion - the actual allocation 570 * happens while we've got a journal handle open so re-taking 571 * a cluster lock (because we ran out of room for another 572 * extent) will violate ordering rules. 573 * 574 * Most of the time we'll only be seeing this 1 cluster at a time 575 * anyway. 576 * 577 * Always lock for any unwritten extents - we might want to 578 * add blocks during a split. 579 */ 580 if (!num_free_extents || 581 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { 582 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); 583 if (ret < 0) { 584 if (ret != -ENOSPC) 585 mlog_errno(ret); 586 goto out; 587 } 588 } 589 590 if (clusters_to_add == 0) 591 goto out; 592 593 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); 594 if (ret < 0) { 595 if (ret != -ENOSPC) 596 mlog_errno(ret); 597 goto out; 598 } 599 600 out: 601 if (ret) { 602 if (*meta_ac) { 603 ocfs2_free_alloc_context(*meta_ac); 604 *meta_ac = NULL; 605 } 606 607 /* 608 * We cannot have an error and a non null *data_ac. 609 */ 610 } 611 612 return ret; 613 } 614 615 static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, 616 u32 clusters_to_add, int mark_unwritten) 617 { 618 int status = 0; 619 int restart_func = 0; 620 int credits; 621 u32 prev_clusters; 622 struct buffer_head *bh = NULL; 623 struct ocfs2_dinode *fe = NULL; 624 handle_t *handle = NULL; 625 struct ocfs2_alloc_context *data_ac = NULL; 626 struct ocfs2_alloc_context *meta_ac = NULL; 627 enum ocfs2_alloc_restarted why; 628 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 629 630 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 631 632 /* 633 * This function only exists for file systems which don't 634 * support holes. 635 */ 636 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); 637 638 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 639 OCFS2_BH_CACHED, inode); 640 if (status < 0) { 641 mlog_errno(status); 642 goto leave; 643 } 644 645 fe = (struct ocfs2_dinode *) bh->b_data; 646 if (!OCFS2_IS_VALID_DINODE(fe)) { 647 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 648 status = -EIO; 649 goto leave; 650 } 651 652 restart_all: 653 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 654 655 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac, 656 &meta_ac); 657 if (status) { 658 mlog_errno(status); 659 goto leave; 660 } 661 662 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); 663 handle = ocfs2_start_trans(osb, credits); 664 if (IS_ERR(handle)) { 665 status = PTR_ERR(handle); 666 handle = NULL; 667 mlog_errno(status); 668 goto leave; 669 } 670 671 restarted_transaction: 672 /* reserve a write to the file entry early on - that we if we 673 * run out of credits in the allocation path, we can still 674 * update i_size. */ 675 status = ocfs2_journal_access(handle, inode, bh, 676 OCFS2_JOURNAL_ACCESS_WRITE); 677 if (status < 0) { 678 mlog_errno(status); 679 goto leave; 680 } 681 682 prev_clusters = OCFS2_I(inode)->ip_clusters; 683 684 status = ocfs2_do_extend_allocation(osb, 685 inode, 686 &logical_start, 687 clusters_to_add, 688 mark_unwritten, 689 bh, 690 handle, 691 data_ac, 692 meta_ac, 693 &why); 694 if ((status < 0) && (status != -EAGAIN)) { 695 if (status != -ENOSPC) 696 mlog_errno(status); 697 goto leave; 698 } 699 700 status = ocfs2_journal_dirty(handle, bh); 701 if (status < 0) { 702 mlog_errno(status); 703 goto leave; 704 } 705 706 spin_lock(&OCFS2_I(inode)->ip_lock); 707 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 708 spin_unlock(&OCFS2_I(inode)->ip_lock); 709 710 if (why != RESTART_NONE && clusters_to_add) { 711 if (why == RESTART_META) { 712 mlog(0, "restarting function.\n"); 713 restart_func = 1; 714 } else { 715 BUG_ON(why != RESTART_TRANS); 716 717 mlog(0, "restarting transaction.\n"); 718 /* TODO: This can be more intelligent. */ 719 credits = ocfs2_calc_extend_credits(osb->sb, 720 fe, 721 clusters_to_add); 722 status = ocfs2_extend_trans(handle, credits); 723 if (status < 0) { 724 /* handle still has to be committed at 725 * this point. */ 726 status = -ENOMEM; 727 mlog_errno(status); 728 goto leave; 729 } 730 goto restarted_transaction; 731 } 732 } 733 734 mlog(0, "fe: i_clusters = %u, i_size=%llu\n", 735 le32_to_cpu(fe->i_clusters), 736 (unsigned long long)le64_to_cpu(fe->i_size)); 737 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 738 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 739 740 leave: 741 if (handle) { 742 ocfs2_commit_trans(osb, handle); 743 handle = NULL; 744 } 745 if (data_ac) { 746 ocfs2_free_alloc_context(data_ac); 747 data_ac = NULL; 748 } 749 if (meta_ac) { 750 ocfs2_free_alloc_context(meta_ac); 751 meta_ac = NULL; 752 } 753 if ((!status) && restart_func) { 754 restart_func = 0; 755 goto restart_all; 756 } 757 if (bh) { 758 brelse(bh); 759 bh = NULL; 760 } 761 762 mlog_exit(status); 763 return status; 764 } 765 766 static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, 767 u32 clusters_to_add, int mark_unwritten) 768 { 769 int ret; 770 771 /* 772 * The alloc sem blocks peope in read/write from reading our 773 * allocation until we're done changing it. We depend on 774 * i_mutex to block other extend/truncate calls while we're 775 * here. 776 */ 777 down_write(&OCFS2_I(inode)->ip_alloc_sem); 778 ret = __ocfs2_extend_allocation(inode, logical_start, clusters_to_add, 779 mark_unwritten); 780 up_write(&OCFS2_I(inode)->ip_alloc_sem); 781 782 return ret; 783 } 784 785 /* Some parts of this taken from generic_cont_expand, which turned out 786 * to be too fragile to do exactly what we need without us having to 787 * worry about recursive locking in ->prepare_write() and 788 * ->commit_write(). */ 789 static int ocfs2_write_zero_page(struct inode *inode, 790 u64 size) 791 { 792 struct address_space *mapping = inode->i_mapping; 793 struct page *page; 794 unsigned long index; 795 unsigned int offset; 796 handle_t *handle = NULL; 797 int ret; 798 799 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ 800 /* ugh. in prepare/commit_write, if from==to==start of block, we 801 ** skip the prepare. make sure we never send an offset for the start 802 ** of a block 803 */ 804 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { 805 offset++; 806 } 807 index = size >> PAGE_CACHE_SHIFT; 808 809 page = grab_cache_page(mapping, index); 810 if (!page) { 811 ret = -ENOMEM; 812 mlog_errno(ret); 813 goto out; 814 } 815 816 ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); 817 if (ret < 0) { 818 mlog_errno(ret); 819 goto out_unlock; 820 } 821 822 if (ocfs2_should_order_data(inode)) { 823 handle = ocfs2_start_walk_page_trans(inode, page, offset, 824 offset); 825 if (IS_ERR(handle)) { 826 ret = PTR_ERR(handle); 827 handle = NULL; 828 goto out_unlock; 829 } 830 } 831 832 /* must not update i_size! */ 833 ret = block_commit_write(page, offset, offset); 834 if (ret < 0) 835 mlog_errno(ret); 836 else 837 ret = 0; 838 839 if (handle) 840 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 841 out_unlock: 842 unlock_page(page); 843 page_cache_release(page); 844 out: 845 return ret; 846 } 847 848 static int ocfs2_zero_extend(struct inode *inode, 849 u64 zero_to_size) 850 { 851 int ret = 0; 852 u64 start_off; 853 struct super_block *sb = inode->i_sb; 854 855 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 856 while (start_off < zero_to_size) { 857 ret = ocfs2_write_zero_page(inode, start_off); 858 if (ret < 0) { 859 mlog_errno(ret); 860 goto out; 861 } 862 863 start_off += sb->s_blocksize; 864 865 /* 866 * Very large extends have the potential to lock up 867 * the cpu for extended periods of time. 868 */ 869 cond_resched(); 870 } 871 872 out: 873 return ret; 874 } 875 876 /* 877 * A tail_to_skip value > 0 indicates that we're being called from 878 * ocfs2_file_aio_write(). This has the following implications: 879 * 880 * - we don't want to update i_size 881 * - di_bh will be NULL, which is fine because it's only used in the 882 * case where we want to update i_size. 883 * - ocfs2_zero_extend() will then only be filling the hole created 884 * between i_size and the start of the write. 885 */ 886 static int ocfs2_extend_file(struct inode *inode, 887 struct buffer_head *di_bh, 888 u64 new_i_size, 889 size_t tail_to_skip) 890 { 891 int ret = 0; 892 u32 clusters_to_add = 0; 893 894 BUG_ON(!tail_to_skip && !di_bh); 895 896 /* setattr sometimes calls us like this. */ 897 if (new_i_size == 0) 898 goto out; 899 900 if (i_size_read(inode) == new_i_size) 901 goto out; 902 BUG_ON(new_i_size < i_size_read(inode)); 903 904 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 905 BUG_ON(tail_to_skip != 0); 906 goto out_update_size; 907 } 908 909 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 910 OCFS2_I(inode)->ip_clusters; 911 912 /* 913 * protect the pages that ocfs2_zero_extend is going to be 914 * pulling into the page cache.. we do this before the 915 * metadata extend so that we don't get into the situation 916 * where we've extended the metadata but can't get the data 917 * lock to zero. 918 */ 919 ret = ocfs2_data_lock(inode, 1); 920 if (ret < 0) { 921 mlog_errno(ret); 922 goto out; 923 } 924 925 if (clusters_to_add) { 926 ret = ocfs2_extend_allocation(inode, 927 OCFS2_I(inode)->ip_clusters, 928 clusters_to_add, 0); 929 if (ret < 0) { 930 mlog_errno(ret); 931 goto out_unlock; 932 } 933 } 934 935 /* 936 * Call this even if we don't add any clusters to the tree. We 937 * still need to zero the area between the old i_size and the 938 * new i_size. 939 */ 940 ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip); 941 if (ret < 0) { 942 mlog_errno(ret); 943 goto out_unlock; 944 } 945 946 out_update_size: 947 if (!tail_to_skip) { 948 /* We're being called from ocfs2_setattr() which wants 949 * us to update i_size */ 950 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); 951 if (ret < 0) 952 mlog_errno(ret); 953 } 954 955 out_unlock: 956 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 957 ocfs2_data_unlock(inode, 1); 958 959 out: 960 return ret; 961 } 962 963 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) 964 { 965 int status = 0, size_change; 966 struct inode *inode = dentry->d_inode; 967 struct super_block *sb = inode->i_sb; 968 struct ocfs2_super *osb = OCFS2_SB(sb); 969 struct buffer_head *bh = NULL; 970 handle_t *handle = NULL; 971 972 mlog_entry("(0x%p, '%.*s')\n", dentry, 973 dentry->d_name.len, dentry->d_name.name); 974 975 if (attr->ia_valid & ATTR_MODE) 976 mlog(0, "mode change: %d\n", attr->ia_mode); 977 if (attr->ia_valid & ATTR_UID) 978 mlog(0, "uid change: %d\n", attr->ia_uid); 979 if (attr->ia_valid & ATTR_GID) 980 mlog(0, "gid change: %d\n", attr->ia_gid); 981 if (attr->ia_valid & ATTR_SIZE) 982 mlog(0, "size change...\n"); 983 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) 984 mlog(0, "time change...\n"); 985 986 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ 987 | ATTR_GID | ATTR_UID | ATTR_MODE) 988 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { 989 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); 990 return 0; 991 } 992 993 status = inode_change_ok(inode, attr); 994 if (status) 995 return status; 996 997 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 998 if (size_change) { 999 status = ocfs2_rw_lock(inode, 1); 1000 if (status < 0) { 1001 mlog_errno(status); 1002 goto bail; 1003 } 1004 } 1005 1006 status = ocfs2_meta_lock(inode, &bh, 1); 1007 if (status < 0) { 1008 if (status != -ENOENT) 1009 mlog_errno(status); 1010 goto bail_unlock_rw; 1011 } 1012 1013 if (size_change && attr->ia_size != i_size_read(inode)) { 1014 if (i_size_read(inode) > attr->ia_size) 1015 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 1016 else 1017 status = ocfs2_extend_file(inode, bh, attr->ia_size, 0); 1018 if (status < 0) { 1019 if (status != -ENOSPC) 1020 mlog_errno(status); 1021 status = -ENOSPC; 1022 goto bail_unlock; 1023 } 1024 } 1025 1026 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1027 if (IS_ERR(handle)) { 1028 status = PTR_ERR(handle); 1029 mlog_errno(status); 1030 goto bail_unlock; 1031 } 1032 1033 /* 1034 * This will intentionally not wind up calling vmtruncate(), 1035 * since all the work for a size change has been done above. 1036 * Otherwise, we could get into problems with truncate as 1037 * ip_alloc_sem is used there to protect against i_size 1038 * changes. 1039 */ 1040 status = inode_setattr(inode, attr); 1041 if (status < 0) { 1042 mlog_errno(status); 1043 goto bail_commit; 1044 } 1045 1046 status = ocfs2_mark_inode_dirty(handle, inode, bh); 1047 if (status < 0) 1048 mlog_errno(status); 1049 1050 bail_commit: 1051 ocfs2_commit_trans(osb, handle); 1052 bail_unlock: 1053 ocfs2_meta_unlock(inode, 1); 1054 bail_unlock_rw: 1055 if (size_change) 1056 ocfs2_rw_unlock(inode, 1); 1057 bail: 1058 if (bh) 1059 brelse(bh); 1060 1061 mlog_exit(status); 1062 return status; 1063 } 1064 1065 int ocfs2_getattr(struct vfsmount *mnt, 1066 struct dentry *dentry, 1067 struct kstat *stat) 1068 { 1069 struct inode *inode = dentry->d_inode; 1070 struct super_block *sb = dentry->d_inode->i_sb; 1071 struct ocfs2_super *osb = sb->s_fs_info; 1072 int err; 1073 1074 mlog_entry_void(); 1075 1076 err = ocfs2_inode_revalidate(dentry); 1077 if (err) { 1078 if (err != -ENOENT) 1079 mlog_errno(err); 1080 goto bail; 1081 } 1082 1083 generic_fillattr(inode, stat); 1084 1085 /* We set the blksize from the cluster size for performance */ 1086 stat->blksize = osb->s_clustersize; 1087 1088 bail: 1089 mlog_exit(err); 1090 1091 return err; 1092 } 1093 1094 int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) 1095 { 1096 int ret; 1097 1098 mlog_entry_void(); 1099 1100 ret = ocfs2_meta_lock(inode, NULL, 0); 1101 if (ret) { 1102 if (ret != -ENOENT) 1103 mlog_errno(ret); 1104 goto out; 1105 } 1106 1107 ret = generic_permission(inode, mask, NULL); 1108 1109 ocfs2_meta_unlock(inode, 0); 1110 out: 1111 mlog_exit(ret); 1112 return ret; 1113 } 1114 1115 static int __ocfs2_write_remove_suid(struct inode *inode, 1116 struct buffer_head *bh) 1117 { 1118 int ret; 1119 handle_t *handle; 1120 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1121 struct ocfs2_dinode *di; 1122 1123 mlog_entry("(Inode %llu, mode 0%o)\n", 1124 (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode); 1125 1126 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1127 if (handle == NULL) { 1128 ret = -ENOMEM; 1129 mlog_errno(ret); 1130 goto out; 1131 } 1132 1133 ret = ocfs2_journal_access(handle, inode, bh, 1134 OCFS2_JOURNAL_ACCESS_WRITE); 1135 if (ret < 0) { 1136 mlog_errno(ret); 1137 goto out_trans; 1138 } 1139 1140 inode->i_mode &= ~S_ISUID; 1141 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) 1142 inode->i_mode &= ~S_ISGID; 1143 1144 di = (struct ocfs2_dinode *) bh->b_data; 1145 di->i_mode = cpu_to_le16(inode->i_mode); 1146 1147 ret = ocfs2_journal_dirty(handle, bh); 1148 if (ret < 0) 1149 mlog_errno(ret); 1150 1151 out_trans: 1152 ocfs2_commit_trans(osb, handle); 1153 out: 1154 mlog_exit(ret); 1155 return ret; 1156 } 1157 1158 /* 1159 * Will look for holes and unwritten extents in the range starting at 1160 * pos for count bytes (inclusive). 1161 */ 1162 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, 1163 size_t count) 1164 { 1165 int ret = 0; 1166 unsigned int extent_flags; 1167 u32 cpos, clusters, extent_len, phys_cpos; 1168 struct super_block *sb = inode->i_sb; 1169 1170 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 1171 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 1172 1173 while (clusters) { 1174 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 1175 &extent_flags); 1176 if (ret < 0) { 1177 mlog_errno(ret); 1178 goto out; 1179 } 1180 1181 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { 1182 ret = 1; 1183 break; 1184 } 1185 1186 if (extent_len > clusters) 1187 extent_len = clusters; 1188 1189 clusters -= extent_len; 1190 cpos += extent_len; 1191 } 1192 out: 1193 return ret; 1194 } 1195 1196 static int ocfs2_write_remove_suid(struct inode *inode) 1197 { 1198 int ret; 1199 struct buffer_head *bh = NULL; 1200 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1201 1202 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 1203 oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); 1204 if (ret < 0) { 1205 mlog_errno(ret); 1206 goto out; 1207 } 1208 1209 ret = __ocfs2_write_remove_suid(inode, bh); 1210 out: 1211 brelse(bh); 1212 return ret; 1213 } 1214 1215 /* 1216 * Allocate enough extents to cover the region starting at byte offset 1217 * start for len bytes. Existing extents are skipped, any extents 1218 * added are marked as "unwritten". 1219 */ 1220 static int ocfs2_allocate_unwritten_extents(struct inode *inode, 1221 u64 start, u64 len) 1222 { 1223 int ret; 1224 u32 cpos, phys_cpos, clusters, alloc_size; 1225 1226 /* 1227 * We consider both start and len to be inclusive. 1228 */ 1229 cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 1230 clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len); 1231 clusters -= cpos; 1232 1233 while (clusters) { 1234 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, 1235 &alloc_size, NULL); 1236 if (ret) { 1237 mlog_errno(ret); 1238 goto out; 1239 } 1240 1241 /* 1242 * Hole or existing extent len can be arbitrary, so 1243 * cap it to our own allocation request. 1244 */ 1245 if (alloc_size > clusters) 1246 alloc_size = clusters; 1247 1248 if (phys_cpos) { 1249 /* 1250 * We already have an allocation at this 1251 * region so we can safely skip it. 1252 */ 1253 goto next; 1254 } 1255 1256 ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); 1257 if (ret) { 1258 if (ret != -ENOSPC) 1259 mlog_errno(ret); 1260 goto out; 1261 } 1262 1263 next: 1264 cpos += alloc_size; 1265 clusters -= alloc_size; 1266 } 1267 1268 ret = 0; 1269 out: 1270 return ret; 1271 } 1272 1273 static int __ocfs2_remove_inode_range(struct inode *inode, 1274 struct buffer_head *di_bh, 1275 u32 cpos, u32 phys_cpos, u32 len, 1276 struct ocfs2_cached_dealloc_ctxt *dealloc) 1277 { 1278 int ret; 1279 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 1280 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1281 struct inode *tl_inode = osb->osb_tl_inode; 1282 handle_t *handle; 1283 struct ocfs2_alloc_context *meta_ac = NULL; 1284 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1285 1286 ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac); 1287 if (ret) { 1288 mlog_errno(ret); 1289 return ret; 1290 } 1291 1292 mutex_lock(&tl_inode->i_mutex); 1293 1294 if (ocfs2_truncate_log_needs_flush(osb)) { 1295 ret = __ocfs2_flush_truncate_log(osb); 1296 if (ret < 0) { 1297 mlog_errno(ret); 1298 goto out; 1299 } 1300 } 1301 1302 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); 1303 if (handle == NULL) { 1304 ret = -ENOMEM; 1305 mlog_errno(ret); 1306 goto out; 1307 } 1308 1309 ret = ocfs2_journal_access(handle, inode, di_bh, 1310 OCFS2_JOURNAL_ACCESS_WRITE); 1311 if (ret) { 1312 mlog_errno(ret); 1313 goto out; 1314 } 1315 1316 ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac, 1317 dealloc); 1318 if (ret) { 1319 mlog_errno(ret); 1320 goto out_commit; 1321 } 1322 1323 OCFS2_I(inode)->ip_clusters -= len; 1324 di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); 1325 1326 ret = ocfs2_journal_dirty(handle, di_bh); 1327 if (ret) { 1328 mlog_errno(ret); 1329 goto out_commit; 1330 } 1331 1332 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); 1333 if (ret) 1334 mlog_errno(ret); 1335 1336 out_commit: 1337 ocfs2_commit_trans(osb, handle); 1338 out: 1339 mutex_unlock(&tl_inode->i_mutex); 1340 1341 if (meta_ac) 1342 ocfs2_free_alloc_context(meta_ac); 1343 1344 return ret; 1345 } 1346 1347 /* 1348 * Truncate a byte range, avoiding pages within partial clusters. This 1349 * preserves those pages for the zeroing code to write to. 1350 */ 1351 static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, 1352 u64 byte_len) 1353 { 1354 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1355 loff_t start, end; 1356 struct address_space *mapping = inode->i_mapping; 1357 1358 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start); 1359 end = byte_start + byte_len; 1360 end = end & ~(osb->s_clustersize - 1); 1361 1362 if (start < end) { 1363 unmap_mapping_range(mapping, start, end - start, 0); 1364 truncate_inode_pages_range(mapping, start, end - 1); 1365 } 1366 } 1367 1368 static int ocfs2_zero_partial_clusters(struct inode *inode, 1369 u64 start, u64 len) 1370 { 1371 int ret = 0; 1372 u64 tmpend, end = start + len; 1373 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1374 unsigned int csize = osb->s_clustersize; 1375 handle_t *handle; 1376 1377 /* 1378 * The "start" and "end" values are NOT necessarily part of 1379 * the range whose allocation is being deleted. Rather, this 1380 * is what the user passed in with the request. We must zero 1381 * partial clusters here. There's no need to worry about 1382 * physical allocation - the zeroing code knows to skip holes. 1383 */ 1384 mlog(0, "byte start: %llu, end: %llu\n", 1385 (unsigned long long)start, (unsigned long long)end); 1386 1387 /* 1388 * If both edges are on a cluster boundary then there's no 1389 * zeroing required as the region is part of the allocation to 1390 * be truncated. 1391 */ 1392 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) 1393 goto out; 1394 1395 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1396 if (handle == NULL) { 1397 ret = -ENOMEM; 1398 mlog_errno(ret); 1399 goto out; 1400 } 1401 1402 /* 1403 * We want to get the byte offset of the end of the 1st cluster. 1404 */ 1405 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); 1406 if (tmpend > end) 1407 tmpend = end; 1408 1409 mlog(0, "1st range: start: %llu, tmpend: %llu\n", 1410 (unsigned long long)start, (unsigned long long)tmpend); 1411 1412 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); 1413 if (ret) 1414 mlog_errno(ret); 1415 1416 if (tmpend < end) { 1417 /* 1418 * This may make start and end equal, but the zeroing 1419 * code will skip any work in that case so there's no 1420 * need to catch it up here. 1421 */ 1422 start = end & ~(osb->s_clustersize - 1); 1423 1424 mlog(0, "2nd range: start: %llu, end: %llu\n", 1425 (unsigned long long)start, (unsigned long long)end); 1426 1427 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); 1428 if (ret) 1429 mlog_errno(ret); 1430 } 1431 1432 ocfs2_commit_trans(osb, handle); 1433 out: 1434 return ret; 1435 } 1436 1437 static int ocfs2_remove_inode_range(struct inode *inode, 1438 struct buffer_head *di_bh, u64 byte_start, 1439 u64 byte_len) 1440 { 1441 int ret = 0; 1442 u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; 1443 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1444 struct ocfs2_cached_dealloc_ctxt dealloc; 1445 1446 ocfs2_init_dealloc_ctxt(&dealloc); 1447 1448 if (byte_len == 0) 1449 return 0; 1450 1451 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); 1452 trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; 1453 if (trunc_len >= trunc_start) 1454 trunc_len -= trunc_start; 1455 else 1456 trunc_len = 0; 1457 1458 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n", 1459 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1460 (unsigned long long)byte_start, 1461 (unsigned long long)byte_len, trunc_start, trunc_len); 1462 1463 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); 1464 if (ret) { 1465 mlog_errno(ret); 1466 goto out; 1467 } 1468 1469 cpos = trunc_start; 1470 while (trunc_len) { 1471 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, 1472 &alloc_size, NULL); 1473 if (ret) { 1474 mlog_errno(ret); 1475 goto out; 1476 } 1477 1478 if (alloc_size > trunc_len) 1479 alloc_size = trunc_len; 1480 1481 /* Only do work for non-holes */ 1482 if (phys_cpos != 0) { 1483 ret = __ocfs2_remove_inode_range(inode, di_bh, cpos, 1484 phys_cpos, alloc_size, 1485 &dealloc); 1486 if (ret) { 1487 mlog_errno(ret); 1488 goto out; 1489 } 1490 } 1491 1492 cpos += alloc_size; 1493 trunc_len -= alloc_size; 1494 } 1495 1496 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); 1497 1498 out: 1499 ocfs2_schedule_truncate_log_flush(osb, 1); 1500 ocfs2_run_deallocs(osb, &dealloc); 1501 1502 return ret; 1503 } 1504 1505 /* 1506 * Parts of this function taken from xfs_change_file_space() 1507 */ 1508 static int __ocfs2_change_file_space(struct file *file, struct inode *inode, 1509 loff_t f_pos, unsigned int cmd, 1510 struct ocfs2_space_resv *sr, 1511 int change_size) 1512 { 1513 int ret; 1514 s64 llen; 1515 loff_t size; 1516 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1517 struct buffer_head *di_bh = NULL; 1518 handle_t *handle; 1519 unsigned long long max_off = ocfs2_max_file_offset(inode->i_sb->s_blocksize_bits); 1520 1521 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 1522 return -EROFS; 1523 1524 mutex_lock(&inode->i_mutex); 1525 1526 /* 1527 * This prevents concurrent writes on other nodes 1528 */ 1529 ret = ocfs2_rw_lock(inode, 1); 1530 if (ret) { 1531 mlog_errno(ret); 1532 goto out; 1533 } 1534 1535 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1536 if (ret) { 1537 mlog_errno(ret); 1538 goto out_rw_unlock; 1539 } 1540 1541 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { 1542 ret = -EPERM; 1543 goto out_meta_unlock; 1544 } 1545 1546 switch (sr->l_whence) { 1547 case 0: /*SEEK_SET*/ 1548 break; 1549 case 1: /*SEEK_CUR*/ 1550 sr->l_start += f_pos; 1551 break; 1552 case 2: /*SEEK_END*/ 1553 sr->l_start += i_size_read(inode); 1554 break; 1555 default: 1556 ret = -EINVAL; 1557 goto out_meta_unlock; 1558 } 1559 sr->l_whence = 0; 1560 1561 llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len; 1562 1563 if (sr->l_start < 0 1564 || sr->l_start > max_off 1565 || (sr->l_start + llen) < 0 1566 || (sr->l_start + llen) > max_off) { 1567 ret = -EINVAL; 1568 goto out_meta_unlock; 1569 } 1570 size = sr->l_start + sr->l_len; 1571 1572 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { 1573 if (sr->l_len <= 0) { 1574 ret = -EINVAL; 1575 goto out_meta_unlock; 1576 } 1577 } 1578 1579 if (file && should_remove_suid(file->f_path.dentry)) { 1580 ret = __ocfs2_write_remove_suid(inode, di_bh); 1581 if (ret) { 1582 mlog_errno(ret); 1583 goto out_meta_unlock; 1584 } 1585 } 1586 1587 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1588 switch (cmd) { 1589 case OCFS2_IOC_RESVSP: 1590 case OCFS2_IOC_RESVSP64: 1591 /* 1592 * This takes unsigned offsets, but the signed ones we 1593 * pass have been checked against overflow above. 1594 */ 1595 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start, 1596 sr->l_len); 1597 break; 1598 case OCFS2_IOC_UNRESVSP: 1599 case OCFS2_IOC_UNRESVSP64: 1600 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start, 1601 sr->l_len); 1602 break; 1603 default: 1604 ret = -EINVAL; 1605 } 1606 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1607 if (ret) { 1608 mlog_errno(ret); 1609 goto out_meta_unlock; 1610 } 1611 1612 /* 1613 * We update c/mtime for these changes 1614 */ 1615 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1616 if (IS_ERR(handle)) { 1617 ret = PTR_ERR(handle); 1618 mlog_errno(ret); 1619 goto out_meta_unlock; 1620 } 1621 1622 if (change_size && i_size_read(inode) < size) 1623 i_size_write(inode, size); 1624 1625 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 1626 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); 1627 if (ret < 0) 1628 mlog_errno(ret); 1629 1630 ocfs2_commit_trans(osb, handle); 1631 1632 out_meta_unlock: 1633 brelse(di_bh); 1634 ocfs2_meta_unlock(inode, 1); 1635 out_rw_unlock: 1636 ocfs2_rw_unlock(inode, 1); 1637 1638 mutex_unlock(&inode->i_mutex); 1639 out: 1640 return ret; 1641 } 1642 1643 int ocfs2_change_file_space(struct file *file, unsigned int cmd, 1644 struct ocfs2_space_resv *sr) 1645 { 1646 struct inode *inode = file->f_path.dentry->d_inode; 1647 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);; 1648 1649 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && 1650 !ocfs2_writes_unwritten_extents(osb)) 1651 return -ENOTTY; 1652 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) && 1653 !ocfs2_sparse_alloc(osb)) 1654 return -ENOTTY; 1655 1656 if (!S_ISREG(inode->i_mode)) 1657 return -EINVAL; 1658 1659 if (!(file->f_mode & FMODE_WRITE)) 1660 return -EBADF; 1661 1662 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1663 } 1664 1665 static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, 1666 loff_t len) 1667 { 1668 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1669 struct ocfs2_space_resv sr; 1670 int change_size = 1; 1671 1672 if (!ocfs2_writes_unwritten_extents(osb)) 1673 return -EOPNOTSUPP; 1674 1675 if (S_ISDIR(inode->i_mode)) 1676 return -ENODEV; 1677 1678 if (mode & FALLOC_FL_KEEP_SIZE) 1679 change_size = 0; 1680 1681 sr.l_whence = 0; 1682 sr.l_start = (s64)offset; 1683 sr.l_len = (s64)len; 1684 1685 return __ocfs2_change_file_space(NULL, inode, offset, 1686 OCFS2_IOC_RESVSP64, &sr, change_size); 1687 } 1688 1689 static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1690 loff_t *ppos, 1691 size_t count, 1692 int appending, 1693 int *direct_io) 1694 { 1695 int ret = 0, meta_level = appending; 1696 struct inode *inode = dentry->d_inode; 1697 u32 clusters; 1698 loff_t newsize, saved_pos; 1699 1700 /* 1701 * We sample i_size under a read level meta lock to see if our write 1702 * is extending the file, if it is we back off and get a write level 1703 * meta lock. 1704 */ 1705 for(;;) { 1706 ret = ocfs2_meta_lock(inode, NULL, meta_level); 1707 if (ret < 0) { 1708 meta_level = -1; 1709 mlog_errno(ret); 1710 goto out; 1711 } 1712 1713 /* Clear suid / sgid if necessary. We do this here 1714 * instead of later in the write path because 1715 * remove_suid() calls ->setattr without any hint that 1716 * we may have already done our cluster locking. Since 1717 * ocfs2_setattr() *must* take cluster locks to 1718 * proceeed, this will lead us to recursively lock the 1719 * inode. There's also the dinode i_size state which 1720 * can be lost via setattr during extending writes (we 1721 * set inode->i_size at the end of a write. */ 1722 if (should_remove_suid(dentry)) { 1723 if (meta_level == 0) { 1724 ocfs2_meta_unlock(inode, meta_level); 1725 meta_level = 1; 1726 continue; 1727 } 1728 1729 ret = ocfs2_write_remove_suid(inode); 1730 if (ret < 0) { 1731 mlog_errno(ret); 1732 goto out_unlock; 1733 } 1734 } 1735 1736 /* work on a copy of ppos until we're sure that we won't have 1737 * to recalculate it due to relocking. */ 1738 if (appending) { 1739 saved_pos = i_size_read(inode); 1740 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); 1741 } else { 1742 saved_pos = *ppos; 1743 } 1744 1745 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 1746 loff_t end = saved_pos + count; 1747 1748 /* 1749 * Skip the O_DIRECT checks if we don't need 1750 * them. 1751 */ 1752 if (!direct_io || !(*direct_io)) 1753 break; 1754 1755 /* 1756 * Allowing concurrent direct writes means 1757 * i_size changes wouldn't be synchronized, so 1758 * one node could wind up truncating another 1759 * nodes writes. 1760 */ 1761 if (end > i_size_read(inode)) { 1762 *direct_io = 0; 1763 break; 1764 } 1765 1766 /* 1767 * We don't fill holes during direct io, so 1768 * check for them here. If any are found, the 1769 * caller will have to retake some cluster 1770 * locks and initiate the io as buffered. 1771 */ 1772 ret = ocfs2_check_range_for_holes(inode, saved_pos, 1773 count); 1774 if (ret == 1) { 1775 *direct_io = 0; 1776 ret = 0; 1777 } else if (ret < 0) 1778 mlog_errno(ret); 1779 break; 1780 } 1781 1782 /* 1783 * The rest of this loop is concerned with legacy file 1784 * systems which don't support sparse files. 1785 */ 1786 1787 newsize = count + saved_pos; 1788 1789 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", 1790 (long long) saved_pos, (long long) newsize, 1791 (long long) i_size_read(inode)); 1792 1793 /* No need for a higher level metadata lock if we're 1794 * never going past i_size. */ 1795 if (newsize <= i_size_read(inode)) 1796 break; 1797 1798 if (meta_level == 0) { 1799 ocfs2_meta_unlock(inode, meta_level); 1800 meta_level = 1; 1801 continue; 1802 } 1803 1804 spin_lock(&OCFS2_I(inode)->ip_lock); 1805 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - 1806 OCFS2_I(inode)->ip_clusters; 1807 spin_unlock(&OCFS2_I(inode)->ip_lock); 1808 1809 mlog(0, "Writing at EOF, may need more allocation: " 1810 "i_size = %lld, newsize = %lld, need %u clusters\n", 1811 (long long) i_size_read(inode), (long long) newsize, 1812 clusters); 1813 1814 /* We only want to continue the rest of this loop if 1815 * our extend will actually require more 1816 * allocation. */ 1817 if (!clusters) 1818 break; 1819 1820 ret = ocfs2_extend_file(inode, NULL, newsize, count); 1821 if (ret < 0) { 1822 if (ret != -ENOSPC) 1823 mlog_errno(ret); 1824 goto out_unlock; 1825 } 1826 break; 1827 } 1828 1829 if (appending) 1830 *ppos = saved_pos; 1831 1832 out_unlock: 1833 ocfs2_meta_unlock(inode, meta_level); 1834 1835 out: 1836 return ret; 1837 } 1838 1839 static inline void 1840 ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) 1841 { 1842 const struct iovec *iov = *iovp; 1843 size_t base = *basep; 1844 1845 do { 1846 int copy = min(bytes, iov->iov_len - base); 1847 1848 bytes -= copy; 1849 base += copy; 1850 if (iov->iov_len == base) { 1851 iov++; 1852 base = 0; 1853 } 1854 } while (bytes); 1855 *iovp = iov; 1856 *basep = base; 1857 } 1858 1859 static struct page * ocfs2_get_write_source(char **ret_src_buf, 1860 const struct iovec *cur_iov, 1861 size_t iov_offset) 1862 { 1863 int ret; 1864 char *buf = cur_iov->iov_base + iov_offset; 1865 struct page *src_page = NULL; 1866 unsigned long off; 1867 1868 off = (unsigned long)(buf) & ~PAGE_CACHE_MASK; 1869 1870 if (!segment_eq(get_fs(), KERNEL_DS)) { 1871 /* 1872 * Pull in the user page. We want to do this outside 1873 * of the meta data locks in order to preserve locking 1874 * order in case of page fault. 1875 */ 1876 ret = get_user_pages(current, current->mm, 1877 (unsigned long)buf & PAGE_CACHE_MASK, 1, 1878 0, 0, &src_page, NULL); 1879 if (ret == 1) 1880 *ret_src_buf = kmap(src_page) + off; 1881 else 1882 src_page = ERR_PTR(-EFAULT); 1883 } else { 1884 *ret_src_buf = buf; 1885 } 1886 1887 return src_page; 1888 } 1889 1890 static void ocfs2_put_write_source(struct page *page) 1891 { 1892 if (page) { 1893 kunmap(page); 1894 page_cache_release(page); 1895 } 1896 } 1897 1898 static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, 1899 const struct iovec *iov, 1900 unsigned long nr_segs, 1901 size_t count, 1902 ssize_t o_direct_written) 1903 { 1904 int ret = 0; 1905 ssize_t copied, total = 0; 1906 size_t iov_offset = 0, bytes; 1907 loff_t pos; 1908 const struct iovec *cur_iov = iov; 1909 struct page *user_page, *page; 1910 char * uninitialized_var(buf); 1911 char *dst; 1912 void *fsdata; 1913 1914 /* 1915 * handle partial DIO write. Adjust cur_iov if needed. 1916 */ 1917 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); 1918 1919 do { 1920 pos = *ppos; 1921 1922 user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset); 1923 if (IS_ERR(user_page)) { 1924 ret = PTR_ERR(user_page); 1925 goto out; 1926 } 1927 1928 /* Stay within our page boundaries */ 1929 bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)), 1930 (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK))); 1931 /* Stay within the vector boundary */ 1932 bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset); 1933 /* Stay within count */ 1934 bytes = min(bytes, count); 1935 1936 page = NULL; 1937 ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0, 1938 &page, &fsdata); 1939 if (ret) { 1940 mlog_errno(ret); 1941 goto out; 1942 } 1943 1944 dst = kmap_atomic(page, KM_USER0); 1945 memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes); 1946 kunmap_atomic(dst, KM_USER0); 1947 flush_dcache_page(page); 1948 ocfs2_put_write_source(user_page); 1949 1950 copied = ocfs2_write_end(file, file->f_mapping, pos, bytes, 1951 bytes, page, fsdata); 1952 if (copied < 0) { 1953 mlog_errno(copied); 1954 ret = copied; 1955 goto out; 1956 } 1957 1958 total += copied; 1959 *ppos = pos + copied; 1960 count -= copied; 1961 1962 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); 1963 } while(count); 1964 1965 out: 1966 return total ? total : ret; 1967 } 1968 1969 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 1970 const struct iovec *iov, 1971 unsigned long nr_segs, 1972 loff_t pos) 1973 { 1974 int ret, direct_io, appending, rw_level, have_alloc_sem = 0; 1975 int can_do_direct, sync = 0; 1976 ssize_t written = 0; 1977 size_t ocount; /* original count */ 1978 size_t count; /* after file limit checks */ 1979 loff_t *ppos = &iocb->ki_pos; 1980 struct file *file = iocb->ki_filp; 1981 struct inode *inode = file->f_path.dentry->d_inode; 1982 1983 mlog_entry("(0x%p, %u, '%.*s')\n", file, 1984 (unsigned int)nr_segs, 1985 file->f_path.dentry->d_name.len, 1986 file->f_path.dentry->d_name.name); 1987 1988 if (iocb->ki_left == 0) 1989 return 0; 1990 1991 ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); 1992 if (ret) 1993 return ret; 1994 1995 count = ocount; 1996 1997 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 1998 1999 appending = file->f_flags & O_APPEND ? 1 : 0; 2000 direct_io = file->f_flags & O_DIRECT ? 1 : 0; 2001 2002 mutex_lock(&inode->i_mutex); 2003 2004 relock: 2005 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 2006 if (direct_io) { 2007 down_read(&inode->i_alloc_sem); 2008 have_alloc_sem = 1; 2009 } 2010 2011 /* concurrent O_DIRECT writes are allowed */ 2012 rw_level = !direct_io; 2013 ret = ocfs2_rw_lock(inode, rw_level); 2014 if (ret < 0) { 2015 mlog_errno(ret); 2016 goto out_sems; 2017 } 2018 2019 can_do_direct = direct_io; 2020 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, 2021 iocb->ki_left, appending, 2022 &can_do_direct); 2023 if (ret < 0) { 2024 mlog_errno(ret); 2025 goto out; 2026 } 2027 2028 /* 2029 * We can't complete the direct I/O as requested, fall back to 2030 * buffered I/O. 2031 */ 2032 if (direct_io && !can_do_direct) { 2033 ocfs2_rw_unlock(inode, rw_level); 2034 up_read(&inode->i_alloc_sem); 2035 2036 have_alloc_sem = 0; 2037 rw_level = -1; 2038 2039 direct_io = 0; 2040 sync = 1; 2041 goto relock; 2042 } 2043 2044 if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) 2045 sync = 1; 2046 2047 /* 2048 * XXX: Is it ok to execute these checks a second time? 2049 */ 2050 ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); 2051 if (ret) 2052 goto out; 2053 2054 /* 2055 * Set pos so that sync_page_range_nolock() below understands 2056 * where to start from. We might've moved it around via the 2057 * calls above. The range we want to actually sync starts from 2058 * *ppos here. 2059 * 2060 */ 2061 pos = *ppos; 2062 2063 /* communicate with ocfs2_dio_end_io */ 2064 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2065 2066 if (direct_io) { 2067 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 2068 ppos, count, ocount); 2069 if (written < 0) { 2070 ret = written; 2071 goto out_dio; 2072 } 2073 } else { 2074 written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs, 2075 count, written); 2076 if (written < 0) { 2077 ret = written; 2078 if (ret != -EFAULT || ret != -ENOSPC) 2079 mlog_errno(ret); 2080 goto out; 2081 } 2082 } 2083 2084 out_dio: 2085 /* buffered aio wouldn't have proper lock coverage today */ 2086 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2087 2088 /* 2089 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 2090 * function pointer which is called when o_direct io completes so that 2091 * it can unlock our rw lock. (it's the clustered equivalent of 2092 * i_alloc_sem; protects truncate from racing with pending ios). 2093 * Unfortunately there are error cases which call end_io and others 2094 * that don't. so we don't have to unlock the rw_lock if either an 2095 * async dio is going to do it in the future or an end_io after an 2096 * error has already done it. 2097 */ 2098 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2099 rw_level = -1; 2100 have_alloc_sem = 0; 2101 } 2102 2103 out: 2104 if (rw_level != -1) 2105 ocfs2_rw_unlock(inode, rw_level); 2106 2107 out_sems: 2108 if (have_alloc_sem) 2109 up_read(&inode->i_alloc_sem); 2110 2111 if (written > 0 && sync) { 2112 ssize_t err; 2113 2114 err = sync_page_range_nolock(inode, file->f_mapping, pos, count); 2115 if (err < 0) 2116 written = err; 2117 } 2118 2119 mutex_unlock(&inode->i_mutex); 2120 2121 mlog_exit(ret); 2122 return written ? written : ret; 2123 } 2124 2125 static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, 2126 struct pipe_buffer *buf, 2127 struct splice_desc *sd) 2128 { 2129 int ret, count; 2130 ssize_t copied = 0; 2131 struct file *file = sd->u.file; 2132 unsigned int offset; 2133 struct page *page = NULL; 2134 void *fsdata; 2135 char *src, *dst; 2136 2137 ret = buf->ops->confirm(pipe, buf); 2138 if (ret) 2139 goto out; 2140 2141 offset = sd->pos & ~PAGE_CACHE_MASK; 2142 count = sd->len; 2143 if (count + offset > PAGE_CACHE_SIZE) 2144 count = PAGE_CACHE_SIZE - offset; 2145 2146 ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0, 2147 &page, &fsdata); 2148 if (ret) { 2149 mlog_errno(ret); 2150 goto out; 2151 } 2152 2153 src = buf->ops->map(pipe, buf, 1); 2154 dst = kmap_atomic(page, KM_USER1); 2155 memcpy(dst + offset, src + buf->offset, count); 2156 kunmap_atomic(dst, KM_USER1); 2157 buf->ops->unmap(pipe, buf, src); 2158 2159 copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count, 2160 page, fsdata); 2161 if (copied < 0) { 2162 mlog_errno(copied); 2163 ret = copied; 2164 goto out; 2165 } 2166 out: 2167 2168 return copied ? copied : ret; 2169 } 2170 2171 static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, 2172 struct file *out, 2173 loff_t *ppos, 2174 size_t len, 2175 unsigned int flags) 2176 { 2177 int ret, err; 2178 struct address_space *mapping = out->f_mapping; 2179 struct inode *inode = mapping->host; 2180 struct splice_desc sd = { 2181 .total_len = len, 2182 .flags = flags, 2183 .pos = *ppos, 2184 .u.file = out, 2185 }; 2186 2187 ret = __splice_from_pipe(pipe, &sd, ocfs2_splice_write_actor); 2188 if (ret > 0) { 2189 *ppos += ret; 2190 2191 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 2192 err = generic_osync_inode(inode, mapping, 2193 OSYNC_METADATA|OSYNC_DATA); 2194 if (err) 2195 ret = err; 2196 } 2197 } 2198 2199 return ret; 2200 } 2201 2202 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, 2203 struct file *out, 2204 loff_t *ppos, 2205 size_t len, 2206 unsigned int flags) 2207 { 2208 int ret; 2209 struct inode *inode = out->f_path.dentry->d_inode; 2210 2211 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, 2212 (unsigned int)len, 2213 out->f_path.dentry->d_name.len, 2214 out->f_path.dentry->d_name.name); 2215 2216 inode_double_lock(inode, pipe->inode); 2217 2218 ret = ocfs2_rw_lock(inode, 1); 2219 if (ret < 0) { 2220 mlog_errno(ret); 2221 goto out; 2222 } 2223 2224 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, 2225 NULL); 2226 if (ret < 0) { 2227 mlog_errno(ret); 2228 goto out_unlock; 2229 } 2230 2231 /* ok, we're done with i_size and alloc work */ 2232 ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags); 2233 2234 out_unlock: 2235 ocfs2_rw_unlock(inode, 1); 2236 out: 2237 inode_double_unlock(inode, pipe->inode); 2238 2239 mlog_exit(ret); 2240 return ret; 2241 } 2242 2243 static ssize_t ocfs2_file_splice_read(struct file *in, 2244 loff_t *ppos, 2245 struct pipe_inode_info *pipe, 2246 size_t len, 2247 unsigned int flags) 2248 { 2249 int ret = 0; 2250 struct inode *inode = in->f_path.dentry->d_inode; 2251 2252 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, 2253 (unsigned int)len, 2254 in->f_path.dentry->d_name.len, 2255 in->f_path.dentry->d_name.name); 2256 2257 /* 2258 * See the comment in ocfs2_file_aio_read() 2259 */ 2260 ret = ocfs2_meta_lock(inode, NULL, 0); 2261 if (ret < 0) { 2262 mlog_errno(ret); 2263 goto bail; 2264 } 2265 ocfs2_meta_unlock(inode, 0); 2266 2267 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 2268 2269 bail: 2270 mlog_exit(ret); 2271 return ret; 2272 } 2273 2274 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, 2275 const struct iovec *iov, 2276 unsigned long nr_segs, 2277 loff_t pos) 2278 { 2279 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; 2280 struct file *filp = iocb->ki_filp; 2281 struct inode *inode = filp->f_path.dentry->d_inode; 2282 2283 mlog_entry("(0x%p, %u, '%.*s')\n", filp, 2284 (unsigned int)nr_segs, 2285 filp->f_path.dentry->d_name.len, 2286 filp->f_path.dentry->d_name.name); 2287 2288 if (!inode) { 2289 ret = -EINVAL; 2290 mlog_errno(ret); 2291 goto bail; 2292 } 2293 2294 /* 2295 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 2296 * need locks to protect pending reads from racing with truncate. 2297 */ 2298 if (filp->f_flags & O_DIRECT) { 2299 down_read(&inode->i_alloc_sem); 2300 have_alloc_sem = 1; 2301 2302 ret = ocfs2_rw_lock(inode, 0); 2303 if (ret < 0) { 2304 mlog_errno(ret); 2305 goto bail; 2306 } 2307 rw_level = 0; 2308 /* communicate with ocfs2_dio_end_io */ 2309 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2310 } 2311 2312 /* 2313 * We're fine letting folks race truncates and extending 2314 * writes with read across the cluster, just like they can 2315 * locally. Hence no rw_lock during read. 2316 * 2317 * Take and drop the meta data lock to update inode fields 2318 * like i_size. This allows the checks down below 2319 * generic_file_aio_read() a chance of actually working. 2320 */ 2321 ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); 2322 if (ret < 0) { 2323 mlog_errno(ret); 2324 goto bail; 2325 } 2326 ocfs2_meta_unlock(inode, lock_level); 2327 2328 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 2329 if (ret == -EINVAL) 2330 mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); 2331 2332 /* buffered aio wouldn't have proper lock coverage today */ 2333 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 2334 2335 /* see ocfs2_file_aio_write */ 2336 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2337 rw_level = -1; 2338 have_alloc_sem = 0; 2339 } 2340 2341 bail: 2342 if (have_alloc_sem) 2343 up_read(&inode->i_alloc_sem); 2344 if (rw_level != -1) 2345 ocfs2_rw_unlock(inode, rw_level); 2346 mlog_exit(ret); 2347 2348 return ret; 2349 } 2350 2351 const struct inode_operations ocfs2_file_iops = { 2352 .setattr = ocfs2_setattr, 2353 .getattr = ocfs2_getattr, 2354 .permission = ocfs2_permission, 2355 .fallocate = ocfs2_fallocate, 2356 }; 2357 2358 const struct inode_operations ocfs2_special_file_iops = { 2359 .setattr = ocfs2_setattr, 2360 .getattr = ocfs2_getattr, 2361 .permission = ocfs2_permission, 2362 }; 2363 2364 const struct file_operations ocfs2_fops = { 2365 .read = do_sync_read, 2366 .write = do_sync_write, 2367 .mmap = ocfs2_mmap, 2368 .fsync = ocfs2_sync_file, 2369 .release = ocfs2_file_release, 2370 .open = ocfs2_file_open, 2371 .aio_read = ocfs2_file_aio_read, 2372 .aio_write = ocfs2_file_aio_write, 2373 .ioctl = ocfs2_ioctl, 2374 #ifdef CONFIG_COMPAT 2375 .compat_ioctl = ocfs2_compat_ioctl, 2376 #endif 2377 .splice_read = ocfs2_file_splice_read, 2378 .splice_write = ocfs2_file_splice_write, 2379 }; 2380 2381 const struct file_operations ocfs2_dops = { 2382 .read = generic_read_dir, 2383 .readdir = ocfs2_readdir, 2384 .fsync = ocfs2_sync_file, 2385 .ioctl = ocfs2_ioctl, 2386 #ifdef CONFIG_COMPAT 2387 .compat_ioctl = ocfs2_compat_ioctl, 2388 #endif 2389 }; 2390