1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * file.c 5 * 6 * File open, close, extend, truncate 7 * 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public 12 * License as published by the Free Software Foundation; either 13 * version 2 of the License, or (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * General Public License for more details. 19 * 20 * You should have received a copy of the GNU General Public 21 * License along with this program; if not, write to the 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 * Boston, MA 021110-1307, USA. 24 */ 25 26 #include <linux/fs.h> 27 #include <linux/types.h> 28 #include <linux/slab.h> 29 #include <linux/highmem.h> 30 #include <linux/pagemap.h> 31 #include <linux/uio.h> 32 33 #define MLOG_MASK_PREFIX ML_INODE 34 #include <cluster/masklog.h> 35 36 #include "ocfs2.h" 37 38 #include "alloc.h" 39 #include "aops.h" 40 #include "dir.h" 41 #include "dlmglue.h" 42 #include "extent_map.h" 43 #include "file.h" 44 #include "sysfile.h" 45 #include "inode.h" 46 #include "journal.h" 47 #include "mmap.h" 48 #include "suballoc.h" 49 #include "super.h" 50 51 #include "buffer_head_io.h" 52 53 static int ocfs2_sync_inode(struct inode *inode) 54 { 55 filemap_fdatawrite(inode->i_mapping); 56 return sync_mapping_buffers(inode->i_mapping); 57 } 58 59 static int ocfs2_file_open(struct inode *inode, struct file *file) 60 { 61 int status; 62 int mode = file->f_flags; 63 struct ocfs2_inode_info *oi = OCFS2_I(inode); 64 65 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 66 file->f_dentry->d_name.len, file->f_dentry->d_name.name); 67 68 spin_lock(&oi->ip_lock); 69 70 /* Check that the inode hasn't been wiped from disk by another 71 * node. If it hasn't then we're safe as long as we hold the 72 * spin lock until our increment of open count. */ 73 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 74 spin_unlock(&oi->ip_lock); 75 76 status = -ENOENT; 77 goto leave; 78 } 79 80 if (mode & O_DIRECT) 81 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; 82 83 oi->ip_open_count++; 84 spin_unlock(&oi->ip_lock); 85 status = 0; 86 leave: 87 mlog_exit(status); 88 return status; 89 } 90 91 static int ocfs2_file_release(struct inode *inode, struct file *file) 92 { 93 struct ocfs2_inode_info *oi = OCFS2_I(inode); 94 95 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 96 file->f_dentry->d_name.len, 97 file->f_dentry->d_name.name); 98 99 spin_lock(&oi->ip_lock); 100 if (!--oi->ip_open_count) 101 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 102 spin_unlock(&oi->ip_lock); 103 104 mlog_exit(0); 105 106 return 0; 107 } 108 109 static int ocfs2_sync_file(struct file *file, 110 struct dentry *dentry, 111 int datasync) 112 { 113 int err = 0; 114 journal_t *journal; 115 struct inode *inode = dentry->d_inode; 116 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 117 118 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 119 dentry->d_name.len, dentry->d_name.name); 120 121 err = ocfs2_sync_inode(dentry->d_inode); 122 if (err) 123 goto bail; 124 125 journal = osb->journal->j_journal; 126 err = journal_force_commit(journal); 127 128 bail: 129 mlog_exit(err); 130 131 return (err < 0) ? -EIO : 0; 132 } 133 134 int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle, 135 struct inode *inode, 136 struct buffer_head *fe_bh, 137 u64 new_i_size) 138 { 139 int status; 140 141 mlog_entry_void(); 142 i_size_write(inode, new_i_size); 143 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); 144 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 145 146 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 147 if (status < 0) { 148 mlog_errno(status); 149 goto bail; 150 } 151 152 bail: 153 mlog_exit(status); 154 return status; 155 } 156 157 static int ocfs2_simple_size_update(struct inode *inode, 158 struct buffer_head *di_bh, 159 u64 new_i_size) 160 { 161 int ret; 162 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 163 struct ocfs2_journal_handle *handle = NULL; 164 165 handle = ocfs2_start_trans(osb, NULL, 166 OCFS2_INODE_UPDATE_CREDITS); 167 if (handle == NULL) { 168 ret = -ENOMEM; 169 mlog_errno(ret); 170 goto out; 171 } 172 173 ret = ocfs2_set_inode_size(handle, inode, di_bh, 174 new_i_size); 175 if (ret < 0) 176 mlog_errno(ret); 177 178 ocfs2_commit_trans(handle); 179 out: 180 return ret; 181 } 182 183 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 184 struct inode *inode, 185 struct buffer_head *fe_bh, 186 u64 new_i_size) 187 { 188 int status; 189 struct ocfs2_journal_handle *handle; 190 191 mlog_entry_void(); 192 193 /* TODO: This needs to actually orphan the inode in this 194 * transaction. */ 195 196 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); 197 if (IS_ERR(handle)) { 198 status = PTR_ERR(handle); 199 mlog_errno(status); 200 goto out; 201 } 202 203 status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); 204 if (status < 0) 205 mlog_errno(status); 206 207 ocfs2_commit_trans(handle); 208 out: 209 mlog_exit(status); 210 return status; 211 } 212 213 static int ocfs2_truncate_file(struct inode *inode, 214 struct buffer_head *di_bh, 215 u64 new_i_size) 216 { 217 int status = 0; 218 struct ocfs2_dinode *fe = NULL; 219 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 220 struct ocfs2_truncate_context *tc = NULL; 221 222 mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64"\n", 223 OCFS2_I(inode)->ip_blkno, new_i_size); 224 225 truncate_inode_pages(inode->i_mapping, new_i_size); 226 227 fe = (struct ocfs2_dinode *) di_bh->b_data; 228 if (!OCFS2_IS_VALID_DINODE(fe)) { 229 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 230 status = -EIO; 231 goto bail; 232 } 233 234 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 235 "Inode %"MLFu64", inode i_size = %lld != di " 236 "i_size = %"MLFu64", i_flags = 0x%x\n", 237 OCFS2_I(inode)->ip_blkno, 238 i_size_read(inode), 239 le64_to_cpu(fe->i_size), le32_to_cpu(fe->i_flags)); 240 241 if (new_i_size > le64_to_cpu(fe->i_size)) { 242 mlog(0, "asked to truncate file with size (%"MLFu64") " 243 "to size (%"MLFu64")!\n", 244 le64_to_cpu(fe->i_size), new_i_size); 245 status = -EINVAL; 246 mlog_errno(status); 247 goto bail; 248 } 249 250 mlog(0, "inode %"MLFu64", i_size = %"MLFu64", new_i_size = %"MLFu64"\n", 251 le64_to_cpu(fe->i_blkno), le64_to_cpu(fe->i_size), new_i_size); 252 253 /* lets handle the simple truncate cases before doing any more 254 * cluster locking. */ 255 if (new_i_size == le64_to_cpu(fe->i_size)) 256 goto bail; 257 258 if (le32_to_cpu(fe->i_clusters) == 259 ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { 260 mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", 261 fe->i_clusters); 262 /* No allocation change is required, so lets fast path 263 * this truncate. */ 264 status = ocfs2_simple_size_update(inode, di_bh, new_i_size); 265 if (status < 0) 266 mlog_errno(status); 267 goto bail; 268 } 269 270 /* This forces other nodes to sync and drop their pages */ 271 status = ocfs2_data_lock(inode, 1); 272 if (status < 0) { 273 mlog_errno(status); 274 goto bail; 275 } 276 ocfs2_data_unlock(inode, 1); 277 278 /* alright, we're going to need to do a full blown alloc size 279 * change. Orphan the inode so that recovery can complete the 280 * truncate if necessary. This does the task of marking 281 * i_size. */ 282 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 283 if (status < 0) { 284 mlog_errno(status); 285 goto bail; 286 } 287 288 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 289 if (status < 0) { 290 mlog_errno(status); 291 goto bail; 292 } 293 294 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 295 if (status < 0) { 296 mlog_errno(status); 297 goto bail; 298 } 299 300 /* TODO: orphan dir cleanup here. */ 301 bail: 302 303 mlog_exit(status); 304 return status; 305 } 306 307 /* 308 * extend allocation only here. 309 * we'll update all the disk stuff, and oip->alloc_size 310 * 311 * expect stuff to be locked, a transaction started and enough data / 312 * metadata reservations in the contexts. 313 * 314 * Will return -EAGAIN, and a reason if a restart is needed. 315 * If passed in, *reason will always be set, even in error. 316 */ 317 int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 318 struct inode *inode, 319 u32 clusters_to_add, 320 struct buffer_head *fe_bh, 321 struct ocfs2_journal_handle *handle, 322 struct ocfs2_alloc_context *data_ac, 323 struct ocfs2_alloc_context *meta_ac, 324 enum ocfs2_alloc_restarted *reason_ret) 325 { 326 int status = 0; 327 int free_extents; 328 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 329 enum ocfs2_alloc_restarted reason = RESTART_NONE; 330 u32 bit_off, num_bits; 331 u64 block; 332 333 BUG_ON(!clusters_to_add); 334 335 free_extents = ocfs2_num_free_extents(osb, inode, fe); 336 if (free_extents < 0) { 337 status = free_extents; 338 mlog_errno(status); 339 goto leave; 340 } 341 342 /* there are two cases which could cause us to EAGAIN in the 343 * we-need-more-metadata case: 344 * 1) we haven't reserved *any* 345 * 2) we are so fragmented, we've needed to add metadata too 346 * many times. */ 347 if (!free_extents && !meta_ac) { 348 mlog(0, "we haven't reserved any metadata!\n"); 349 status = -EAGAIN; 350 reason = RESTART_META; 351 goto leave; 352 } else if ((!free_extents) 353 && (ocfs2_alloc_context_bits_left(meta_ac) 354 < ocfs2_extend_meta_needed(fe))) { 355 mlog(0, "filesystem is really fragmented...\n"); 356 status = -EAGAIN; 357 reason = RESTART_META; 358 goto leave; 359 } 360 361 status = ocfs2_claim_clusters(osb, handle, data_ac, 1, 362 &bit_off, &num_bits); 363 if (status < 0) { 364 if (status != -ENOSPC) 365 mlog_errno(status); 366 goto leave; 367 } 368 369 BUG_ON(num_bits > clusters_to_add); 370 371 /* reserve our write early -- insert_extent may update the inode */ 372 status = ocfs2_journal_access(handle, inode, fe_bh, 373 OCFS2_JOURNAL_ACCESS_WRITE); 374 if (status < 0) { 375 mlog_errno(status); 376 goto leave; 377 } 378 379 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 380 mlog(0, "Allocating %u clusters at block %u for inode %"MLFu64"\n", 381 num_bits, bit_off, OCFS2_I(inode)->ip_blkno); 382 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, 383 num_bits, meta_ac); 384 if (status < 0) { 385 mlog_errno(status); 386 goto leave; 387 } 388 389 le32_add_cpu(&fe->i_clusters, num_bits); 390 spin_lock(&OCFS2_I(inode)->ip_lock); 391 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 392 spin_unlock(&OCFS2_I(inode)->ip_lock); 393 394 status = ocfs2_journal_dirty(handle, fe_bh); 395 if (status < 0) { 396 mlog_errno(status); 397 goto leave; 398 } 399 400 clusters_to_add -= num_bits; 401 402 if (clusters_to_add) { 403 mlog(0, "need to alloc once more, clusters = %u, wanted = " 404 "%u\n", fe->i_clusters, clusters_to_add); 405 status = -EAGAIN; 406 reason = RESTART_TRANS; 407 } 408 409 leave: 410 mlog_exit(status); 411 if (reason_ret) 412 *reason_ret = reason; 413 return status; 414 } 415 416 static int ocfs2_extend_allocation(struct inode *inode, 417 u32 clusters_to_add) 418 { 419 int status = 0; 420 int restart_func = 0; 421 int drop_alloc_sem = 0; 422 int credits, num_free_extents; 423 u32 prev_clusters; 424 struct buffer_head *bh = NULL; 425 struct ocfs2_dinode *fe = NULL; 426 struct ocfs2_journal_handle *handle = NULL; 427 struct ocfs2_alloc_context *data_ac = NULL; 428 struct ocfs2_alloc_context *meta_ac = NULL; 429 enum ocfs2_alloc_restarted why; 430 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 431 432 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 433 434 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 435 OCFS2_BH_CACHED, inode); 436 if (status < 0) { 437 mlog_errno(status); 438 goto leave; 439 } 440 441 fe = (struct ocfs2_dinode *) bh->b_data; 442 if (!OCFS2_IS_VALID_DINODE(fe)) { 443 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 444 status = -EIO; 445 goto leave; 446 } 447 448 restart_all: 449 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 450 451 mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, " 452 "clusters_to_add = %u\n", 453 OCFS2_I(inode)->ip_blkno, i_size_read(inode), 454 fe->i_clusters, clusters_to_add); 455 456 handle = ocfs2_alloc_handle(osb); 457 if (handle == NULL) { 458 status = -ENOMEM; 459 mlog_errno(status); 460 goto leave; 461 } 462 463 num_free_extents = ocfs2_num_free_extents(osb, 464 inode, 465 fe); 466 if (num_free_extents < 0) { 467 status = num_free_extents; 468 mlog_errno(status); 469 goto leave; 470 } 471 472 if (!num_free_extents) { 473 status = ocfs2_reserve_new_metadata(osb, 474 handle, 475 fe, 476 &meta_ac); 477 if (status < 0) { 478 if (status != -ENOSPC) 479 mlog_errno(status); 480 goto leave; 481 } 482 } 483 484 status = ocfs2_reserve_clusters(osb, 485 handle, 486 clusters_to_add, 487 &data_ac); 488 if (status < 0) { 489 if (status != -ENOSPC) 490 mlog_errno(status); 491 goto leave; 492 } 493 494 /* blocks peope in read/write from reading our allocation 495 * until we're done changing it. We depend on i_mutex to block 496 * other extend/truncate calls while we're here. Ordering wrt 497 * start_trans is important here -- always do it before! */ 498 down_write(&OCFS2_I(inode)->ip_alloc_sem); 499 drop_alloc_sem = 1; 500 501 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); 502 handle = ocfs2_start_trans(osb, handle, credits); 503 if (IS_ERR(handle)) { 504 status = PTR_ERR(handle); 505 handle = NULL; 506 mlog_errno(status); 507 goto leave; 508 } 509 510 restarted_transaction: 511 /* reserve a write to the file entry early on - that we if we 512 * run out of credits in the allocation path, we can still 513 * update i_size. */ 514 status = ocfs2_journal_access(handle, inode, bh, 515 OCFS2_JOURNAL_ACCESS_WRITE); 516 if (status < 0) { 517 mlog_errno(status); 518 goto leave; 519 } 520 521 prev_clusters = OCFS2_I(inode)->ip_clusters; 522 523 status = ocfs2_do_extend_allocation(osb, 524 inode, 525 clusters_to_add, 526 bh, 527 handle, 528 data_ac, 529 meta_ac, 530 &why); 531 if ((status < 0) && (status != -EAGAIN)) { 532 if (status != -ENOSPC) 533 mlog_errno(status); 534 goto leave; 535 } 536 537 status = ocfs2_journal_dirty(handle, bh); 538 if (status < 0) { 539 mlog_errno(status); 540 goto leave; 541 } 542 543 spin_lock(&OCFS2_I(inode)->ip_lock); 544 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 545 spin_unlock(&OCFS2_I(inode)->ip_lock); 546 547 if (why != RESTART_NONE && clusters_to_add) { 548 if (why == RESTART_META) { 549 mlog(0, "restarting function.\n"); 550 restart_func = 1; 551 } else { 552 BUG_ON(why != RESTART_TRANS); 553 554 mlog(0, "restarting transaction.\n"); 555 /* TODO: This can be more intelligent. */ 556 credits = ocfs2_calc_extend_credits(osb->sb, 557 fe, 558 clusters_to_add); 559 status = ocfs2_extend_trans(handle, credits); 560 if (status < 0) { 561 /* handle still has to be committed at 562 * this point. */ 563 status = -ENOMEM; 564 mlog_errno(status); 565 goto leave; 566 } 567 goto restarted_transaction; 568 } 569 } 570 571 mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n", 572 fe->i_clusters, fe->i_size); 573 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 574 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 575 576 leave: 577 if (drop_alloc_sem) { 578 up_write(&OCFS2_I(inode)->ip_alloc_sem); 579 drop_alloc_sem = 0; 580 } 581 if (handle) { 582 ocfs2_commit_trans(handle); 583 handle = NULL; 584 } 585 if (data_ac) { 586 ocfs2_free_alloc_context(data_ac); 587 data_ac = NULL; 588 } 589 if (meta_ac) { 590 ocfs2_free_alloc_context(meta_ac); 591 meta_ac = NULL; 592 } 593 if ((!status) && restart_func) { 594 restart_func = 0; 595 goto restart_all; 596 } 597 if (bh) { 598 brelse(bh); 599 bh = NULL; 600 } 601 602 mlog_exit(status); 603 return status; 604 } 605 606 /* Some parts of this taken from generic_cont_expand, which turned out 607 * to be too fragile to do exactly what we need without us having to 608 * worry about recursive locking in ->commit_write(). */ 609 static int ocfs2_write_zero_page(struct inode *inode, 610 u64 size) 611 { 612 struct address_space *mapping = inode->i_mapping; 613 struct page *page; 614 unsigned long index; 615 unsigned int offset; 616 struct ocfs2_journal_handle *handle = NULL; 617 int ret; 618 619 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ 620 /* ugh. in prepare/commit_write, if from==to==start of block, we 621 ** skip the prepare. make sure we never send an offset for the start 622 ** of a block 623 */ 624 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { 625 offset++; 626 } 627 index = size >> PAGE_CACHE_SHIFT; 628 629 page = grab_cache_page(mapping, index); 630 if (!page) { 631 ret = -ENOMEM; 632 mlog_errno(ret); 633 goto out; 634 } 635 636 ret = ocfs2_prepare_write(NULL, page, offset, offset); 637 if (ret < 0) { 638 mlog_errno(ret); 639 goto out_unlock; 640 } 641 642 if (ocfs2_should_order_data(inode)) { 643 handle = ocfs2_start_walk_page_trans(inode, page, offset, 644 offset); 645 if (IS_ERR(handle)) { 646 ret = PTR_ERR(handle); 647 handle = NULL; 648 goto out_unlock; 649 } 650 } 651 652 /* must not update i_size! */ 653 ret = block_commit_write(page, offset, offset); 654 if (ret < 0) 655 mlog_errno(ret); 656 else 657 ret = 0; 658 659 if (handle) 660 ocfs2_commit_trans(handle); 661 out_unlock: 662 unlock_page(page); 663 page_cache_release(page); 664 out: 665 return ret; 666 } 667 668 static int ocfs2_zero_extend(struct inode *inode, 669 u64 zero_to_size) 670 { 671 int ret = 0; 672 u64 start_off; 673 struct super_block *sb = inode->i_sb; 674 675 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 676 while (start_off < zero_to_size) { 677 ret = ocfs2_write_zero_page(inode, start_off); 678 if (ret < 0) { 679 mlog_errno(ret); 680 goto out; 681 } 682 683 start_off += sb->s_blocksize; 684 } 685 686 out: 687 return ret; 688 } 689 690 static int ocfs2_extend_file(struct inode *inode, 691 struct buffer_head *di_bh, 692 u64 new_i_size) 693 { 694 int ret = 0; 695 u32 clusters_to_add; 696 697 /* setattr sometimes calls us like this. */ 698 if (new_i_size == 0) 699 goto out; 700 701 if (i_size_read(inode) == new_i_size) 702 goto out; 703 BUG_ON(new_i_size < i_size_read(inode)); 704 705 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 706 OCFS2_I(inode)->ip_clusters; 707 708 if (clusters_to_add) { 709 ret = ocfs2_extend_allocation(inode, clusters_to_add); 710 if (ret < 0) { 711 mlog_errno(ret); 712 goto out; 713 } 714 715 ret = ocfs2_zero_extend(inode, new_i_size); 716 if (ret < 0) { 717 mlog_errno(ret); 718 goto out; 719 } 720 } 721 722 /* No allocation required, we just use this helper to 723 * do a trivial update of i_size. */ 724 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); 725 if (ret < 0) { 726 mlog_errno(ret); 727 goto out; 728 } 729 730 out: 731 return ret; 732 } 733 734 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) 735 { 736 int status = 0, size_change; 737 struct inode *inode = dentry->d_inode; 738 struct super_block *sb = inode->i_sb; 739 struct ocfs2_super *osb = OCFS2_SB(sb); 740 struct buffer_head *bh = NULL; 741 struct ocfs2_journal_handle *handle = NULL; 742 743 mlog_entry("(0x%p, '%.*s')\n", dentry, 744 dentry->d_name.len, dentry->d_name.name); 745 746 if (attr->ia_valid & ATTR_MODE) 747 mlog(0, "mode change: %d\n", attr->ia_mode); 748 if (attr->ia_valid & ATTR_UID) 749 mlog(0, "uid change: %d\n", attr->ia_uid); 750 if (attr->ia_valid & ATTR_GID) 751 mlog(0, "gid change: %d\n", attr->ia_gid); 752 if (attr->ia_valid & ATTR_SIZE) 753 mlog(0, "size change...\n"); 754 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) 755 mlog(0, "time change...\n"); 756 757 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ 758 | ATTR_GID | ATTR_UID | ATTR_MODE) 759 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { 760 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); 761 return 0; 762 } 763 764 status = inode_change_ok(inode, attr); 765 if (status) 766 return status; 767 768 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 769 if (size_change) { 770 status = ocfs2_rw_lock(inode, 1); 771 if (status < 0) { 772 mlog_errno(status); 773 goto bail; 774 } 775 } 776 777 status = ocfs2_meta_lock(inode, NULL, &bh, 1); 778 if (status < 0) { 779 if (status != -ENOENT) 780 mlog_errno(status); 781 goto bail_unlock_rw; 782 } 783 784 if (size_change && attr->ia_size != i_size_read(inode)) { 785 if (i_size_read(inode) > attr->ia_size) 786 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 787 else 788 status = ocfs2_extend_file(inode, bh, attr->ia_size); 789 if (status < 0) { 790 if (status != -ENOSPC) 791 mlog_errno(status); 792 status = -ENOSPC; 793 goto bail_unlock; 794 } 795 } 796 797 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); 798 if (IS_ERR(handle)) { 799 status = PTR_ERR(handle); 800 mlog_errno(status); 801 goto bail_unlock; 802 } 803 804 status = inode_setattr(inode, attr); 805 if (status < 0) { 806 mlog_errno(status); 807 goto bail_commit; 808 } 809 810 status = ocfs2_mark_inode_dirty(handle, inode, bh); 811 if (status < 0) 812 mlog_errno(status); 813 814 bail_commit: 815 ocfs2_commit_trans(handle); 816 bail_unlock: 817 ocfs2_meta_unlock(inode, 1); 818 bail_unlock_rw: 819 if (size_change) 820 ocfs2_rw_unlock(inode, 1); 821 bail: 822 if (bh) 823 brelse(bh); 824 825 mlog_exit(status); 826 return status; 827 } 828 829 int ocfs2_getattr(struct vfsmount *mnt, 830 struct dentry *dentry, 831 struct kstat *stat) 832 { 833 struct inode *inode = dentry->d_inode; 834 struct super_block *sb = dentry->d_inode->i_sb; 835 struct ocfs2_super *osb = sb->s_fs_info; 836 int err; 837 838 mlog_entry_void(); 839 840 err = ocfs2_inode_revalidate(dentry); 841 if (err) { 842 if (err != -ENOENT) 843 mlog_errno(err); 844 goto bail; 845 } 846 847 generic_fillattr(inode, stat); 848 849 /* We set the blksize from the cluster size for performance */ 850 stat->blksize = osb->s_clustersize; 851 852 bail: 853 mlog_exit(err); 854 855 return err; 856 } 857 858 static int ocfs2_write_remove_suid(struct inode *inode) 859 { 860 int ret; 861 struct buffer_head *bh = NULL; 862 struct ocfs2_inode_info *oi = OCFS2_I(inode); 863 struct ocfs2_journal_handle *handle; 864 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 865 struct ocfs2_dinode *di; 866 867 mlog_entry("(Inode %"MLFu64", mode 0%o)\n", oi->ip_blkno, 868 inode->i_mode); 869 870 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); 871 if (handle == NULL) { 872 ret = -ENOMEM; 873 mlog_errno(ret); 874 goto out; 875 } 876 877 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); 878 if (ret < 0) { 879 mlog_errno(ret); 880 goto out_trans; 881 } 882 883 ret = ocfs2_journal_access(handle, inode, bh, 884 OCFS2_JOURNAL_ACCESS_WRITE); 885 if (ret < 0) { 886 mlog_errno(ret); 887 goto out_bh; 888 } 889 890 inode->i_mode &= ~S_ISUID; 891 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) 892 inode->i_mode &= ~S_ISGID; 893 894 di = (struct ocfs2_dinode *) bh->b_data; 895 di->i_mode = cpu_to_le16(inode->i_mode); 896 897 ret = ocfs2_journal_dirty(handle, bh); 898 if (ret < 0) 899 mlog_errno(ret); 900 out_bh: 901 brelse(bh); 902 out_trans: 903 ocfs2_commit_trans(handle); 904 out: 905 mlog_exit(ret); 906 return ret; 907 } 908 909 static inline int ocfs2_write_should_remove_suid(struct inode *inode) 910 { 911 mode_t mode = inode->i_mode; 912 913 if (!capable(CAP_FSETID)) { 914 if (unlikely(mode & S_ISUID)) 915 return 1; 916 917 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) 918 return 1; 919 } 920 return 0; 921 } 922 923 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 924 const char __user *buf, 925 size_t count, 926 loff_t pos) 927 { 928 struct iovec local_iov = { .iov_base = (void __user *)buf, 929 .iov_len = count }; 930 int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0; 931 u32 clusters; 932 struct file *filp = iocb->ki_filp; 933 struct inode *inode = filp->f_dentry->d_inode; 934 loff_t newsize, saved_pos; 935 #ifdef OCFS2_ORACORE_WORKAROUNDS 936 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 937 #endif 938 939 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, 940 (unsigned int)count, 941 filp->f_dentry->d_name.len, 942 filp->f_dentry->d_name.name); 943 944 /* happy write of zero bytes */ 945 if (count == 0) 946 return 0; 947 948 if (!inode) { 949 mlog(0, "bad inode\n"); 950 return -EIO; 951 } 952 953 #ifdef OCFS2_ORACORE_WORKAROUNDS 954 /* ugh, work around some applications which open everything O_DIRECT + 955 * O_APPEND and really don't mean to use O_DIRECT. */ 956 if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS && 957 (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT)) 958 filp->f_flags &= ~O_DIRECT; 959 #endif 960 961 mutex_lock(&inode->i_mutex); 962 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 963 if (filp->f_flags & O_DIRECT) { 964 have_alloc_sem = 1; 965 down_read(&inode->i_alloc_sem); 966 } 967 968 /* concurrent O_DIRECT writes are allowed */ 969 rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; 970 ret = ocfs2_rw_lock(inode, rw_level); 971 if (ret < 0) { 972 rw_level = -1; 973 mlog_errno(ret); 974 goto out; 975 } 976 977 /* 978 * We sample i_size under a read level meta lock to see if our write 979 * is extending the file, if it is we back off and get a write level 980 * meta lock. 981 */ 982 meta_level = (filp->f_flags & O_APPEND) ? 1 : 0; 983 for(;;) { 984 ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level); 985 if (ret < 0) { 986 meta_level = -1; 987 mlog_errno(ret); 988 goto out; 989 } 990 991 /* Clear suid / sgid if necessary. We do this here 992 * instead of later in the write path because 993 * remove_suid() calls ->setattr without any hint that 994 * we may have already done our cluster locking. Since 995 * ocfs2_setattr() *must* take cluster locks to 996 * proceeed, this will lead us to recursively lock the 997 * inode. There's also the dinode i_size state which 998 * can be lost via setattr during extending writes (we 999 * set inode->i_size at the end of a write. */ 1000 if (ocfs2_write_should_remove_suid(inode)) { 1001 if (meta_level == 0) { 1002 ocfs2_meta_unlock(inode, meta_level); 1003 meta_level = 1; 1004 continue; 1005 } 1006 1007 ret = ocfs2_write_remove_suid(inode); 1008 if (ret < 0) { 1009 mlog_errno(ret); 1010 goto out; 1011 } 1012 } 1013 1014 /* work on a copy of ppos until we're sure that we won't have 1015 * to recalculate it due to relocking. */ 1016 if (filp->f_flags & O_APPEND) { 1017 saved_pos = i_size_read(inode); 1018 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); 1019 } else { 1020 saved_pos = iocb->ki_pos; 1021 } 1022 newsize = count + saved_pos; 1023 1024 mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n", 1025 saved_pos, newsize, i_size_read(inode)); 1026 1027 /* No need for a higher level metadata lock if we're 1028 * never going past i_size. */ 1029 if (newsize <= i_size_read(inode)) 1030 break; 1031 1032 if (meta_level == 0) { 1033 ocfs2_meta_unlock(inode, meta_level); 1034 meta_level = 1; 1035 continue; 1036 } 1037 1038 spin_lock(&OCFS2_I(inode)->ip_lock); 1039 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - 1040 OCFS2_I(inode)->ip_clusters; 1041 spin_unlock(&OCFS2_I(inode)->ip_lock); 1042 1043 mlog(0, "Writing at EOF, may need more allocation: " 1044 "i_size = %lld, newsize = %"MLFu64", need %u clusters\n", 1045 i_size_read(inode), newsize, clusters); 1046 1047 /* We only want to continue the rest of this loop if 1048 * our extend will actually require more 1049 * allocation. */ 1050 if (!clusters) 1051 break; 1052 1053 ret = ocfs2_extend_allocation(inode, clusters); 1054 if (ret < 0) { 1055 if (ret != -ENOSPC) 1056 mlog_errno(ret); 1057 goto out; 1058 } 1059 1060 /* Fill any holes which would've been created by this 1061 * write. If we're O_APPEND, this will wind up 1062 * (correctly) being a noop. */ 1063 ret = ocfs2_zero_extend(inode, (u64) newsize - count); 1064 if (ret < 0) { 1065 mlog_errno(ret); 1066 goto out; 1067 } 1068 break; 1069 } 1070 1071 /* ok, we're done with i_size and alloc work */ 1072 iocb->ki_pos = saved_pos; 1073 ocfs2_meta_unlock(inode, meta_level); 1074 meta_level = -1; 1075 1076 /* communicate with ocfs2_dio_end_io */ 1077 ocfs2_iocb_set_rw_locked(iocb); 1078 1079 #ifdef OCFS2_ORACORE_WORKAROUNDS 1080 if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS && 1081 filp->f_flags & O_DIRECT) { 1082 unsigned int saved_flags = filp->f_flags; 1083 int sector_size = 1 << osb->s_sectsize_bits; 1084 1085 if ((saved_pos & (sector_size - 1)) || 1086 (count & (sector_size - 1)) || 1087 ((unsigned long)buf & (sector_size - 1))) { 1088 filp->f_flags |= O_SYNC; 1089 filp->f_flags &= ~O_DIRECT; 1090 } 1091 1092 ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, 1093 &iocb->ki_pos); 1094 1095 filp->f_flags = saved_flags; 1096 } else 1097 #endif 1098 ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, 1099 &iocb->ki_pos); 1100 1101 /* buffered aio wouldn't have proper lock coverage today */ 1102 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1103 1104 /* 1105 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 1106 * function pointer which is called when o_direct io completes so that 1107 * it can unlock our rw lock. (it's the clustered equivalent of 1108 * i_alloc_sem; protects truncate from racing with pending ios). 1109 * Unfortunately there are error cases which call end_io and others 1110 * that don't. so we don't have to unlock the rw_lock if either an 1111 * async dio is going to do it in the future or an end_io after an 1112 * error has already done it. 1113 */ 1114 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 1115 rw_level = -1; 1116 have_alloc_sem = 0; 1117 } 1118 1119 out: 1120 if (meta_level != -1) 1121 ocfs2_meta_unlock(inode, meta_level); 1122 if (have_alloc_sem) 1123 up_read(&inode->i_alloc_sem); 1124 if (rw_level != -1) 1125 ocfs2_rw_unlock(inode, rw_level); 1126 mutex_unlock(&inode->i_mutex); 1127 1128 mlog_exit(ret); 1129 return ret; 1130 } 1131 1132 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, 1133 char __user *buf, 1134 size_t count, 1135 loff_t pos) 1136 { 1137 int ret = 0, rw_level = -1, have_alloc_sem = 0; 1138 struct file *filp = iocb->ki_filp; 1139 struct inode *inode = filp->f_dentry->d_inode; 1140 #ifdef OCFS2_ORACORE_WORKAROUNDS 1141 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1142 #endif 1143 1144 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, 1145 (unsigned int)count, 1146 filp->f_dentry->d_name.len, 1147 filp->f_dentry->d_name.name); 1148 1149 if (!inode) { 1150 ret = -EINVAL; 1151 mlog_errno(ret); 1152 goto bail; 1153 } 1154 1155 #ifdef OCFS2_ORACORE_WORKAROUNDS 1156 if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) { 1157 if (filp->f_flags & O_DIRECT) { 1158 int sector_size = 1 << osb->s_sectsize_bits; 1159 1160 if ((pos & (sector_size - 1)) || 1161 (count & (sector_size - 1)) || 1162 ((unsigned long)buf & (sector_size - 1)) || 1163 (i_size_read(inode) & (sector_size -1))) { 1164 filp->f_flags &= ~O_DIRECT; 1165 } 1166 } 1167 } 1168 #endif 1169 1170 /* 1171 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 1172 * need locks to protect pending reads from racing with truncate. 1173 */ 1174 if (filp->f_flags & O_DIRECT) { 1175 down_read(&inode->i_alloc_sem); 1176 have_alloc_sem = 1; 1177 1178 ret = ocfs2_rw_lock(inode, 0); 1179 if (ret < 0) { 1180 mlog_errno(ret); 1181 goto bail; 1182 } 1183 rw_level = 0; 1184 /* communicate with ocfs2_dio_end_io */ 1185 ocfs2_iocb_set_rw_locked(iocb); 1186 } 1187 1188 ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos); 1189 if (ret == -EINVAL) 1190 mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); 1191 1192 /* buffered aio wouldn't have proper lock coverage today */ 1193 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1194 1195 /* see ocfs2_file_aio_write */ 1196 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 1197 rw_level = -1; 1198 have_alloc_sem = 0; 1199 } 1200 1201 bail: 1202 if (have_alloc_sem) 1203 up_read(&inode->i_alloc_sem); 1204 if (rw_level != -1) 1205 ocfs2_rw_unlock(inode, rw_level); 1206 mlog_exit(ret); 1207 1208 return ret; 1209 } 1210 1211 struct inode_operations ocfs2_file_iops = { 1212 .setattr = ocfs2_setattr, 1213 .getattr = ocfs2_getattr, 1214 }; 1215 1216 struct inode_operations ocfs2_special_file_iops = { 1217 .setattr = ocfs2_setattr, 1218 .getattr = ocfs2_getattr, 1219 }; 1220 1221 struct file_operations ocfs2_fops = { 1222 .read = do_sync_read, 1223 .write = do_sync_write, 1224 .sendfile = generic_file_sendfile, 1225 .mmap = ocfs2_mmap, 1226 .fsync = ocfs2_sync_file, 1227 .release = ocfs2_file_release, 1228 .open = ocfs2_file_open, 1229 .aio_read = ocfs2_file_aio_read, 1230 .aio_write = ocfs2_file_aio_write, 1231 }; 1232 1233 struct file_operations ocfs2_dops = { 1234 .read = generic_read_dir, 1235 .readdir = ocfs2_readdir, 1236 .fsync = ocfs2_sync_file, 1237 }; 1238