1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * inode.c 5 * 6 * vfs' aops, fops, dops and iops 7 * 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public 12 * License as published by the Free Software Foundation; either 13 * version 2 of the License, or (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * General Public License for more details. 19 * 20 * You should have received a copy of the GNU General Public 21 * License along with this program; if not, write to the 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 * Boston, MA 021110-1307, USA. 24 */ 25 26 #include <linux/fs.h> 27 #include <linux/types.h> 28 #include <linux/highmem.h> 29 #include <linux/pagemap.h> 30 #include <linux/quotaops.h> 31 #include <linux/iversion.h> 32 33 #include <asm/byteorder.h> 34 35 #include <cluster/masklog.h> 36 37 #include "ocfs2.h" 38 39 #include "alloc.h" 40 #include "dir.h" 41 #include "blockcheck.h" 42 #include "dlmglue.h" 43 #include "extent_map.h" 44 #include "file.h" 45 #include "heartbeat.h" 46 #include "inode.h" 47 #include "journal.h" 48 #include "namei.h" 49 #include "suballoc.h" 50 #include "super.h" 51 #include "symlink.h" 52 #include "sysfile.h" 53 #include "uptodate.h" 54 #include "xattr.h" 55 #include "refcounttree.h" 56 #include "ocfs2_trace.h" 57 #include "filecheck.h" 58 59 #include "buffer_head_io.h" 60 61 struct ocfs2_find_inode_args 62 { 63 u64 fi_blkno; 64 unsigned long fi_ino; 65 unsigned int fi_flags; 66 unsigned int fi_sysfile_type; 67 }; 68 69 static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES]; 70 71 static int ocfs2_read_locked_inode(struct inode *inode, 72 struct ocfs2_find_inode_args *args); 73 static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); 74 static int ocfs2_find_actor(struct inode *inode, void *opaque); 75 static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, 76 struct inode *inode, 77 struct buffer_head *fe_bh); 78 79 static int ocfs2_filecheck_read_inode_block_full(struct inode *inode, 80 struct buffer_head **bh, 81 int flags, int type); 82 static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, 83 struct buffer_head *bh); 84 static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, 85 struct buffer_head *bh); 86 87 void ocfs2_set_inode_flags(struct inode *inode) 88 { 89 unsigned int flags = OCFS2_I(inode)->ip_attr; 90 91 inode->i_flags &= ~(S_IMMUTABLE | 92 S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); 93 94 if (flags & OCFS2_IMMUTABLE_FL) 95 inode->i_flags |= S_IMMUTABLE; 96 97 if (flags & OCFS2_SYNC_FL) 98 inode->i_flags |= S_SYNC; 99 if (flags & OCFS2_APPEND_FL) 100 inode->i_flags |= S_APPEND; 101 if (flags & OCFS2_NOATIME_FL) 102 inode->i_flags |= S_NOATIME; 103 if (flags & OCFS2_DIRSYNC_FL) 104 inode->i_flags |= S_DIRSYNC; 105 } 106 107 /* Propagate flags from i_flags to OCFS2_I(inode)->ip_attr */ 108 void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi) 109 { 110 unsigned int flags = oi->vfs_inode.i_flags; 111 112 oi->ip_attr &= ~(OCFS2_SYNC_FL|OCFS2_APPEND_FL| 113 OCFS2_IMMUTABLE_FL|OCFS2_NOATIME_FL|OCFS2_DIRSYNC_FL); 114 if (flags & S_SYNC) 115 oi->ip_attr |= OCFS2_SYNC_FL; 116 if (flags & S_APPEND) 117 oi->ip_attr |= OCFS2_APPEND_FL; 118 if (flags & S_IMMUTABLE) 119 oi->ip_attr |= OCFS2_IMMUTABLE_FL; 120 if (flags & S_NOATIME) 121 oi->ip_attr |= OCFS2_NOATIME_FL; 122 if (flags & S_DIRSYNC) 123 oi->ip_attr |= OCFS2_DIRSYNC_FL; 124 } 125 126 struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) 127 { 128 struct ocfs2_find_inode_args args; 129 130 args.fi_blkno = blkno; 131 args.fi_flags = 0; 132 args.fi_ino = ino_from_blkno(sb, blkno); 133 args.fi_sysfile_type = 0; 134 135 return ilookup5(sb, blkno, ocfs2_find_actor, &args); 136 } 137 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, 138 int sysfile_type) 139 { 140 int rc = -ESTALE; 141 struct inode *inode = NULL; 142 struct super_block *sb = osb->sb; 143 struct ocfs2_find_inode_args args; 144 journal_t *journal = OCFS2_SB(sb)->journal->j_journal; 145 146 trace_ocfs2_iget_begin((unsigned long long)blkno, flags, 147 sysfile_type); 148 149 /* Ok. By now we've either got the offsets passed to us by the 150 * caller, or we just pulled them off the bh. Lets do some 151 * sanity checks to make sure they're OK. */ 152 if (blkno == 0) { 153 inode = ERR_PTR(-EINVAL); 154 mlog_errno(PTR_ERR(inode)); 155 goto bail; 156 } 157 158 args.fi_blkno = blkno; 159 args.fi_flags = flags; 160 args.fi_ino = ino_from_blkno(sb, blkno); 161 args.fi_sysfile_type = sysfile_type; 162 163 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, 164 ocfs2_init_locked_inode, &args); 165 /* inode was *not* in the inode cache. 2.6.x requires 166 * us to do our own read_inode call and unlock it 167 * afterwards. */ 168 if (inode == NULL) { 169 inode = ERR_PTR(-ENOMEM); 170 mlog_errno(PTR_ERR(inode)); 171 goto bail; 172 } 173 trace_ocfs2_iget5_locked(inode->i_state); 174 if (inode->i_state & I_NEW) { 175 rc = ocfs2_read_locked_inode(inode, &args); 176 unlock_new_inode(inode); 177 } 178 if (is_bad_inode(inode)) { 179 iput(inode); 180 inode = ERR_PTR(rc); 181 goto bail; 182 } 183 184 /* 185 * Set transaction id's of transactions that have to be committed 186 * to finish f[data]sync. We set them to currently running transaction 187 * as we cannot be sure that the inode or some of its metadata isn't 188 * part of the transaction - the inode could have been reclaimed and 189 * now it is reread from disk. 190 */ 191 if (journal) { 192 transaction_t *transaction; 193 tid_t tid; 194 struct ocfs2_inode_info *oi = OCFS2_I(inode); 195 196 read_lock(&journal->j_state_lock); 197 if (journal->j_running_transaction) 198 transaction = journal->j_running_transaction; 199 else 200 transaction = journal->j_committing_transaction; 201 if (transaction) 202 tid = transaction->t_tid; 203 else 204 tid = journal->j_commit_sequence; 205 read_unlock(&journal->j_state_lock); 206 oi->i_sync_tid = tid; 207 oi->i_datasync_tid = tid; 208 } 209 210 bail: 211 if (!IS_ERR(inode)) { 212 trace_ocfs2_iget_end(inode, 213 (unsigned long long)OCFS2_I(inode)->ip_blkno); 214 } 215 216 return inode; 217 } 218 219 220 /* 221 * here's how inodes get read from disk: 222 * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR 223 * found? : return the in-memory inode 224 * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE 225 */ 226 227 static int ocfs2_find_actor(struct inode *inode, void *opaque) 228 { 229 struct ocfs2_find_inode_args *args = NULL; 230 struct ocfs2_inode_info *oi = OCFS2_I(inode); 231 int ret = 0; 232 233 args = opaque; 234 235 mlog_bug_on_msg(!inode, "No inode in find actor!\n"); 236 237 trace_ocfs2_find_actor(inode, inode->i_ino, opaque, args->fi_blkno); 238 239 if (oi->ip_blkno != args->fi_blkno) 240 goto bail; 241 242 ret = 1; 243 bail: 244 return ret; 245 } 246 247 /* 248 * initialize the new inode, but don't do anything that would cause 249 * us to sleep. 250 * return 0 on success, 1 on failure 251 */ 252 static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) 253 { 254 struct ocfs2_find_inode_args *args = opaque; 255 static struct lock_class_key ocfs2_quota_ip_alloc_sem_key, 256 ocfs2_file_ip_alloc_sem_key; 257 258 inode->i_ino = args->fi_ino; 259 OCFS2_I(inode)->ip_blkno = args->fi_blkno; 260 if (args->fi_sysfile_type != 0) 261 lockdep_set_class(&inode->i_rwsem, 262 &ocfs2_sysfile_lock_key[args->fi_sysfile_type]); 263 if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE || 264 args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE || 265 args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE || 266 args->fi_sysfile_type == LOCAL_GROUP_QUOTA_SYSTEM_INODE) 267 lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, 268 &ocfs2_quota_ip_alloc_sem_key); 269 else 270 lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, 271 &ocfs2_file_ip_alloc_sem_key); 272 273 return 0; 274 } 275 276 void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 277 int create_ino) 278 { 279 struct super_block *sb; 280 struct ocfs2_super *osb; 281 int use_plocks = 1; 282 283 sb = inode->i_sb; 284 osb = OCFS2_SB(sb); 285 286 if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || 287 ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) 288 use_plocks = 0; 289 290 /* 291 * These have all been checked by ocfs2_read_inode_block() or set 292 * by ocfs2_mknod_locked(), so a failure is a code bug. 293 */ 294 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); /* This means that read_inode 295 cannot create a superblock 296 inode today. change if 297 that is needed. */ 298 BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))); 299 BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation); 300 301 302 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 303 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); 304 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); 305 306 inode_set_iversion(inode, 1); 307 inode->i_generation = le32_to_cpu(fe->i_generation); 308 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); 309 inode->i_mode = le16_to_cpu(fe->i_mode); 310 i_uid_write(inode, le32_to_cpu(fe->i_uid)); 311 i_gid_write(inode, le32_to_cpu(fe->i_gid)); 312 313 /* Fast symlinks will have i_size but no allocated clusters. */ 314 if (S_ISLNK(inode->i_mode) && !fe->i_clusters) { 315 inode->i_blocks = 0; 316 inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops; 317 } else { 318 inode->i_blocks = ocfs2_inode_sector_count(inode); 319 inode->i_mapping->a_ops = &ocfs2_aops; 320 } 321 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); 322 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); 323 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); 324 inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); 325 inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); 326 inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); 327 328 if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno)) 329 mlog(ML_ERROR, 330 "ip_blkno %llu != i_blkno %llu!\n", 331 (unsigned long long)OCFS2_I(inode)->ip_blkno, 332 (unsigned long long)le64_to_cpu(fe->i_blkno)); 333 334 set_nlink(inode, ocfs2_read_links_count(fe)); 335 336 trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno, 337 le32_to_cpu(fe->i_flags)); 338 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { 339 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; 340 inode->i_flags |= S_NOQUOTA; 341 } 342 343 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { 344 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 345 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { 346 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 347 } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) { 348 inode->i_flags |= S_NOQUOTA; 349 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { 350 /* we can't actually hit this as read_inode can't 351 * handle superblocks today ;-) */ 352 BUG(); 353 } 354 355 switch (inode->i_mode & S_IFMT) { 356 case S_IFREG: 357 if (use_plocks) 358 inode->i_fop = &ocfs2_fops; 359 else 360 inode->i_fop = &ocfs2_fops_no_plocks; 361 inode->i_op = &ocfs2_file_iops; 362 i_size_write(inode, le64_to_cpu(fe->i_size)); 363 break; 364 case S_IFDIR: 365 inode->i_op = &ocfs2_dir_iops; 366 if (use_plocks) 367 inode->i_fop = &ocfs2_dops; 368 else 369 inode->i_fop = &ocfs2_dops_no_plocks; 370 i_size_write(inode, le64_to_cpu(fe->i_size)); 371 OCFS2_I(inode)->ip_dir_lock_gen = 1; 372 break; 373 case S_IFLNK: 374 inode->i_op = &ocfs2_symlink_inode_operations; 375 inode_nohighmem(inode); 376 i_size_write(inode, le64_to_cpu(fe->i_size)); 377 break; 378 default: 379 inode->i_op = &ocfs2_special_file_iops; 380 init_special_inode(inode, inode->i_mode, 381 inode->i_rdev); 382 break; 383 } 384 385 if (create_ino) { 386 inode->i_ino = ino_from_blkno(inode->i_sb, 387 le64_to_cpu(fe->i_blkno)); 388 389 /* 390 * If we ever want to create system files from kernel, 391 * the generation argument to 392 * ocfs2_inode_lock_res_init() will have to change. 393 */ 394 BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL); 395 396 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres, 397 OCFS2_LOCK_TYPE_META, 0, inode); 398 399 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, 400 OCFS2_LOCK_TYPE_OPEN, 0, inode); 401 } 402 403 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, 404 OCFS2_LOCK_TYPE_RW, inode->i_generation, 405 inode); 406 407 ocfs2_set_inode_flags(inode); 408 409 OCFS2_I(inode)->ip_last_used_slot = 0; 410 OCFS2_I(inode)->ip_last_used_group = 0; 411 412 if (S_ISDIR(inode->i_mode)) 413 ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv, 414 OCFS2_RESV_FLAG_DIR); 415 } 416 417 static int ocfs2_read_locked_inode(struct inode *inode, 418 struct ocfs2_find_inode_args *args) 419 { 420 struct super_block *sb; 421 struct ocfs2_super *osb; 422 struct ocfs2_dinode *fe; 423 struct buffer_head *bh = NULL; 424 int status, can_lock, lock_level = 0; 425 u32 generation = 0; 426 427 status = -EINVAL; 428 sb = inode->i_sb; 429 osb = OCFS2_SB(sb); 430 431 /* 432 * To improve performance of cold-cache inode stats, we take 433 * the cluster lock here if possible. 434 * 435 * Generally, OCFS2 never trusts the contents of an inode 436 * unless it's holding a cluster lock, so taking it here isn't 437 * a correctness issue as much as it is a performance 438 * improvement. 439 * 440 * There are three times when taking the lock is not a good idea: 441 * 442 * 1) During startup, before we have initialized the DLM. 443 * 444 * 2) If we are reading certain system files which never get 445 * cluster locks (local alloc, truncate log). 446 * 447 * 3) If the process doing the iget() is responsible for 448 * orphan dir recovery. We're holding the orphan dir lock and 449 * can get into a deadlock with another process on another 450 * node in ->delete_inode(). 451 * 452 * #1 and #2 can be simply solved by never taking the lock 453 * here for system files (which are the only type we read 454 * during mount). It's a heavier approach, but our main 455 * concern is user-accessible files anyway. 456 * 457 * #3 works itself out because we'll eventually take the 458 * cluster lock before trusting anything anyway. 459 */ 460 can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) 461 && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) 462 && !ocfs2_mount_local(osb); 463 464 trace_ocfs2_read_locked_inode( 465 (unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock); 466 467 /* 468 * To maintain backwards compatibility with older versions of 469 * ocfs2-tools, we still store the generation value for system 470 * files. The only ones that actually matter to userspace are 471 * the journals, but it's easier and inexpensive to just flag 472 * all system files similarly. 473 */ 474 if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE) 475 generation = osb->fs_generation; 476 477 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres, 478 OCFS2_LOCK_TYPE_META, 479 generation, inode); 480 481 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, 482 OCFS2_LOCK_TYPE_OPEN, 483 0, inode); 484 485 if (can_lock) { 486 status = ocfs2_open_lock(inode); 487 if (status) { 488 make_bad_inode(inode); 489 mlog_errno(status); 490 return status; 491 } 492 status = ocfs2_inode_lock(inode, NULL, lock_level); 493 if (status) { 494 make_bad_inode(inode); 495 mlog_errno(status); 496 return status; 497 } 498 } 499 500 if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { 501 status = ocfs2_try_open_lock(inode, 0); 502 if (status) { 503 make_bad_inode(inode); 504 return status; 505 } 506 } 507 508 if (can_lock) { 509 if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) 510 status = ocfs2_filecheck_read_inode_block_full(inode, 511 &bh, OCFS2_BH_IGNORE_CACHE, 0); 512 else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) 513 status = ocfs2_filecheck_read_inode_block_full(inode, 514 &bh, OCFS2_BH_IGNORE_CACHE, 1); 515 else 516 status = ocfs2_read_inode_block_full(inode, 517 &bh, OCFS2_BH_IGNORE_CACHE); 518 } else { 519 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); 520 /* 521 * If buffer is in jbd, then its checksum may not have been 522 * computed as yet. 523 */ 524 if (!status && !buffer_jbd(bh)) { 525 if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) 526 status = ocfs2_filecheck_validate_inode_block( 527 osb->sb, bh); 528 else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) 529 status = ocfs2_filecheck_repair_inode_block( 530 osb->sb, bh); 531 else 532 status = ocfs2_validate_inode_block( 533 osb->sb, bh); 534 } 535 } 536 if (status < 0) { 537 mlog_errno(status); 538 goto bail; 539 } 540 541 status = -EINVAL; 542 fe = (struct ocfs2_dinode *) bh->b_data; 543 544 /* 545 * This is a code bug. Right now the caller needs to 546 * understand whether it is asking for a system file inode or 547 * not so the proper lock names can be built. 548 */ 549 mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != 550 !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), 551 "Inode %llu: system file state is ambigous\n", 552 (unsigned long long)args->fi_blkno); 553 554 if (S_ISCHR(le16_to_cpu(fe->i_mode)) || 555 S_ISBLK(le16_to_cpu(fe->i_mode))) 556 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); 557 558 ocfs2_populate_inode(inode, fe, 0); 559 560 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); 561 562 if (buffer_dirty(bh) && !buffer_jbd(bh)) { 563 if (can_lock) { 564 ocfs2_inode_unlock(inode, lock_level); 565 lock_level = 1; 566 ocfs2_inode_lock(inode, NULL, lock_level); 567 } 568 status = ocfs2_write_block(osb, bh, INODE_CACHE(inode)); 569 if (status < 0) { 570 mlog_errno(status); 571 goto bail; 572 } 573 } 574 575 status = 0; 576 577 bail: 578 if (can_lock) 579 ocfs2_inode_unlock(inode, lock_level); 580 581 if (status < 0) 582 make_bad_inode(inode); 583 584 brelse(bh); 585 586 return status; 587 } 588 589 void ocfs2_sync_blockdev(struct super_block *sb) 590 { 591 sync_blockdev(sb->s_bdev); 592 } 593 594 static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, 595 struct inode *inode, 596 struct buffer_head *fe_bh) 597 { 598 int status = 0; 599 struct ocfs2_dinode *fe; 600 handle_t *handle = NULL; 601 602 fe = (struct ocfs2_dinode *) fe_bh->b_data; 603 604 /* 605 * This check will also skip truncate of inodes with inline 606 * data and fast symlinks. 607 */ 608 if (fe->i_clusters) { 609 if (ocfs2_should_order_data(inode)) 610 ocfs2_begin_ordered_truncate(inode, 0); 611 612 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 613 if (IS_ERR(handle)) { 614 status = PTR_ERR(handle); 615 handle = NULL; 616 mlog_errno(status); 617 goto out; 618 } 619 620 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), 621 fe_bh, 622 OCFS2_JOURNAL_ACCESS_WRITE); 623 if (status < 0) { 624 mlog_errno(status); 625 goto out; 626 } 627 628 i_size_write(inode, 0); 629 630 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 631 if (status < 0) { 632 mlog_errno(status); 633 goto out; 634 } 635 636 ocfs2_commit_trans(osb, handle); 637 handle = NULL; 638 639 status = ocfs2_commit_truncate(osb, inode, fe_bh); 640 if (status < 0) 641 mlog_errno(status); 642 } 643 644 out: 645 if (handle) 646 ocfs2_commit_trans(osb, handle); 647 return status; 648 } 649 650 static int ocfs2_remove_inode(struct inode *inode, 651 struct buffer_head *di_bh, 652 struct inode *orphan_dir_inode, 653 struct buffer_head *orphan_dir_bh) 654 { 655 int status; 656 struct inode *inode_alloc_inode = NULL; 657 struct buffer_head *inode_alloc_bh = NULL; 658 handle_t *handle; 659 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 660 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 661 662 inode_alloc_inode = 663 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, 664 le16_to_cpu(di->i_suballoc_slot)); 665 if (!inode_alloc_inode) { 666 status = -ENOENT; 667 mlog_errno(status); 668 goto bail; 669 } 670 671 inode_lock(inode_alloc_inode); 672 status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1); 673 if (status < 0) { 674 inode_unlock(inode_alloc_inode); 675 676 mlog_errno(status); 677 goto bail; 678 } 679 680 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS + 681 ocfs2_quota_trans_credits(inode->i_sb)); 682 if (IS_ERR(handle)) { 683 status = PTR_ERR(handle); 684 mlog_errno(status); 685 goto bail_unlock; 686 } 687 688 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { 689 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, 690 orphan_dir_bh, false); 691 if (status < 0) { 692 mlog_errno(status); 693 goto bail_commit; 694 } 695 } 696 697 /* set the inodes dtime */ 698 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 699 OCFS2_JOURNAL_ACCESS_WRITE); 700 if (status < 0) { 701 mlog_errno(status); 702 goto bail_commit; 703 } 704 705 di->i_dtime = cpu_to_le64(ktime_get_real_seconds()); 706 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); 707 ocfs2_journal_dirty(handle, di_bh); 708 709 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); 710 dquot_free_inode(inode); 711 712 status = ocfs2_free_dinode(handle, inode_alloc_inode, 713 inode_alloc_bh, di); 714 if (status < 0) 715 mlog_errno(status); 716 717 bail_commit: 718 ocfs2_commit_trans(osb, handle); 719 bail_unlock: 720 ocfs2_inode_unlock(inode_alloc_inode, 1); 721 inode_unlock(inode_alloc_inode); 722 brelse(inode_alloc_bh); 723 bail: 724 iput(inode_alloc_inode); 725 726 return status; 727 } 728 729 /* 730 * Serialize with orphan dir recovery. If the process doing 731 * recovery on this orphan dir does an iget() with the dir 732 * i_mutex held, we'll deadlock here. Instead we detect this 733 * and exit early - recovery will wipe this inode for us. 734 */ 735 static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb, 736 int slot) 737 { 738 int ret = 0; 739 740 spin_lock(&osb->osb_lock); 741 if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) { 742 ret = -EDEADLK; 743 goto out; 744 } 745 /* This signals to the orphan recovery process that it should 746 * wait for us to handle the wipe. */ 747 osb->osb_orphan_wipes[slot]++; 748 out: 749 spin_unlock(&osb->osb_lock); 750 trace_ocfs2_check_orphan_recovery_state(slot, ret); 751 return ret; 752 } 753 754 static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb, 755 int slot) 756 { 757 spin_lock(&osb->osb_lock); 758 osb->osb_orphan_wipes[slot]--; 759 spin_unlock(&osb->osb_lock); 760 761 wake_up(&osb->osb_wipe_event); 762 } 763 764 static int ocfs2_wipe_inode(struct inode *inode, 765 struct buffer_head *di_bh) 766 { 767 int status, orphaned_slot = -1; 768 struct inode *orphan_dir_inode = NULL; 769 struct buffer_head *orphan_dir_bh = NULL; 770 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 771 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 772 773 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { 774 orphaned_slot = le16_to_cpu(di->i_orphaned_slot); 775 776 status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); 777 if (status) 778 return status; 779 780 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 781 ORPHAN_DIR_SYSTEM_INODE, 782 orphaned_slot); 783 if (!orphan_dir_inode) { 784 status = -ENOENT; 785 mlog_errno(status); 786 goto bail; 787 } 788 789 /* Lock the orphan dir. The lock will be held for the entire 790 * delete_inode operation. We do this now to avoid races with 791 * recovery completion on other nodes. */ 792 inode_lock(orphan_dir_inode); 793 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); 794 if (status < 0) { 795 inode_unlock(orphan_dir_inode); 796 797 mlog_errno(status); 798 goto bail; 799 } 800 } 801 802 /* we do this while holding the orphan dir lock because we 803 * don't want recovery being run from another node to try an 804 * inode delete underneath us -- this will result in two nodes 805 * truncating the same file! */ 806 status = ocfs2_truncate_for_delete(osb, inode, di_bh); 807 if (status < 0) { 808 mlog_errno(status); 809 goto bail_unlock_dir; 810 } 811 812 /* Remove any dir index tree */ 813 if (S_ISDIR(inode->i_mode)) { 814 status = ocfs2_dx_dir_truncate(inode, di_bh); 815 if (status) { 816 mlog_errno(status); 817 goto bail_unlock_dir; 818 } 819 } 820 821 /*Free extended attribute resources associated with this inode.*/ 822 status = ocfs2_xattr_remove(inode, di_bh); 823 if (status < 0) { 824 mlog_errno(status); 825 goto bail_unlock_dir; 826 } 827 828 status = ocfs2_remove_refcount_tree(inode, di_bh); 829 if (status < 0) { 830 mlog_errno(status); 831 goto bail_unlock_dir; 832 } 833 834 status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, 835 orphan_dir_bh); 836 if (status < 0) 837 mlog_errno(status); 838 839 bail_unlock_dir: 840 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR) 841 return status; 842 843 ocfs2_inode_unlock(orphan_dir_inode, 1); 844 inode_unlock(orphan_dir_inode); 845 brelse(orphan_dir_bh); 846 bail: 847 iput(orphan_dir_inode); 848 ocfs2_signal_wipe_completion(osb, orphaned_slot); 849 850 return status; 851 } 852 853 /* There is a series of simple checks that should be done before a 854 * trylock is even considered. Encapsulate those in this function. */ 855 static int ocfs2_inode_is_valid_to_delete(struct inode *inode) 856 { 857 int ret = 0; 858 struct ocfs2_inode_info *oi = OCFS2_I(inode); 859 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 860 861 trace_ocfs2_inode_is_valid_to_delete(current, osb->dc_task, 862 (unsigned long long)oi->ip_blkno, 863 oi->ip_flags); 864 865 /* We shouldn't be getting here for the root directory 866 * inode.. */ 867 if (inode == osb->root_inode) { 868 mlog(ML_ERROR, "Skipping delete of root inode.\n"); 869 goto bail; 870 } 871 872 /* 873 * If we're coming from downconvert_thread we can't go into our own 874 * voting [hello, deadlock city!] so we cannot delete the inode. But 875 * since we dropped last inode ref when downconverting dentry lock, 876 * we cannot have the file open and thus the node doing unlink will 877 * take care of deleting the inode. 878 */ 879 if (current == osb->dc_task) 880 goto bail; 881 882 spin_lock(&oi->ip_lock); 883 /* OCFS2 *never* deletes system files. This should technically 884 * never get here as system file inodes should always have a 885 * positive link count. */ 886 if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) { 887 mlog(ML_ERROR, "Skipping delete of system file %llu\n", 888 (unsigned long long)oi->ip_blkno); 889 goto bail_unlock; 890 } 891 892 ret = 1; 893 bail_unlock: 894 spin_unlock(&oi->ip_lock); 895 bail: 896 return ret; 897 } 898 899 /* Query the cluster to determine whether we should wipe an inode from 900 * disk or not. 901 * 902 * Requires the inode to have the cluster lock. */ 903 static int ocfs2_query_inode_wipe(struct inode *inode, 904 struct buffer_head *di_bh, 905 int *wipe) 906 { 907 int status = 0, reason = 0; 908 struct ocfs2_inode_info *oi = OCFS2_I(inode); 909 struct ocfs2_dinode *di; 910 911 *wipe = 0; 912 913 trace_ocfs2_query_inode_wipe_begin((unsigned long long)oi->ip_blkno, 914 inode->i_nlink); 915 916 /* While we were waiting for the cluster lock in 917 * ocfs2_delete_inode, another node might have asked to delete 918 * the inode. Recheck our flags to catch this. */ 919 if (!ocfs2_inode_is_valid_to_delete(inode)) { 920 reason = 1; 921 goto bail; 922 } 923 924 /* Now that we have an up to date inode, we can double check 925 * the link count. */ 926 if (inode->i_nlink) 927 goto bail; 928 929 /* Do some basic inode verification... */ 930 di = (struct ocfs2_dinode *) di_bh->b_data; 931 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) && 932 !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { 933 /* 934 * Inodes in the orphan dir must have ORPHANED_FL. The only 935 * inodes that come back out of the orphan dir are reflink 936 * targets. A reflink target may be moved out of the orphan 937 * dir between the time we scan the directory and the time we 938 * process it. This would lead to HAS_REFCOUNT_FL being set but 939 * ORPHANED_FL not. 940 */ 941 if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) { 942 reason = 2; 943 goto bail; 944 } 945 946 /* for lack of a better error? */ 947 status = -EEXIST; 948 mlog(ML_ERROR, 949 "Inode %llu (on-disk %llu) not orphaned! " 950 "Disk flags 0x%x, inode flags 0x%x\n", 951 (unsigned long long)oi->ip_blkno, 952 (unsigned long long)le64_to_cpu(di->i_blkno), 953 le32_to_cpu(di->i_flags), oi->ip_flags); 954 goto bail; 955 } 956 957 /* has someone already deleted us?! baaad... */ 958 if (di->i_dtime) { 959 status = -EEXIST; 960 mlog_errno(status); 961 goto bail; 962 } 963 964 /* 965 * This is how ocfs2 determines whether an inode is still live 966 * within the cluster. Every node takes a shared read lock on 967 * the inode open lock in ocfs2_read_locked_inode(). When we 968 * get to ->delete_inode(), each node tries to convert it's 969 * lock to an exclusive. Trylocks are serialized by the inode 970 * meta data lock. If the upconvert succeeds, we know the inode 971 * is no longer live and can be deleted. 972 * 973 * Though we call this with the meta data lock held, the 974 * trylock keeps us from ABBA deadlock. 975 */ 976 status = ocfs2_try_open_lock(inode, 1); 977 if (status == -EAGAIN) { 978 status = 0; 979 reason = 3; 980 goto bail; 981 } 982 if (status < 0) { 983 mlog_errno(status); 984 goto bail; 985 } 986 987 *wipe = 1; 988 trace_ocfs2_query_inode_wipe_succ(le16_to_cpu(di->i_orphaned_slot)); 989 990 bail: 991 trace_ocfs2_query_inode_wipe_end(status, reason); 992 return status; 993 } 994 995 /* Support function for ocfs2_delete_inode. Will help us keep the 996 * inode data in a consistent state for clear_inode. Always truncates 997 * pages, optionally sync's them first. */ 998 static void ocfs2_cleanup_delete_inode(struct inode *inode, 999 int sync_data) 1000 { 1001 trace_ocfs2_cleanup_delete_inode( 1002 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); 1003 if (sync_data) 1004 filemap_write_and_wait(inode->i_mapping); 1005 truncate_inode_pages_final(&inode->i_data); 1006 } 1007 1008 static void ocfs2_delete_inode(struct inode *inode) 1009 { 1010 int wipe, status; 1011 sigset_t oldset; 1012 struct buffer_head *di_bh = NULL; 1013 struct ocfs2_dinode *di = NULL; 1014 1015 trace_ocfs2_delete_inode(inode->i_ino, 1016 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1017 is_bad_inode(inode)); 1018 1019 /* When we fail in read_inode() we mark inode as bad. The second test 1020 * catches the case when inode allocation fails before allocating 1021 * a block for inode. */ 1022 if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) 1023 goto bail; 1024 1025 if (!ocfs2_inode_is_valid_to_delete(inode)) { 1026 /* It's probably not necessary to truncate_inode_pages 1027 * here but we do it for safety anyway (it will most 1028 * likely be a no-op anyway) */ 1029 ocfs2_cleanup_delete_inode(inode, 0); 1030 goto bail; 1031 } 1032 1033 dquot_initialize(inode); 1034 1035 /* We want to block signals in delete_inode as the lock and 1036 * messaging paths may return us -ERESTARTSYS. Which would 1037 * cause us to exit early, resulting in inodes being orphaned 1038 * forever. */ 1039 ocfs2_block_signals(&oldset); 1040 1041 /* 1042 * Synchronize us against ocfs2_get_dentry. We take this in 1043 * shared mode so that all nodes can still concurrently 1044 * process deletes. 1045 */ 1046 status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0); 1047 if (status < 0) { 1048 mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status); 1049 ocfs2_cleanup_delete_inode(inode, 0); 1050 goto bail_unblock; 1051 } 1052 /* Lock down the inode. This gives us an up to date view of 1053 * it's metadata (for verification), and allows us to 1054 * serialize delete_inode on multiple nodes. 1055 * 1056 * Even though we might be doing a truncate, we don't take the 1057 * allocation lock here as it won't be needed - nobody will 1058 * have the file open. 1059 */ 1060 status = ocfs2_inode_lock(inode, &di_bh, 1); 1061 if (status < 0) { 1062 if (status != -ENOENT) 1063 mlog_errno(status); 1064 ocfs2_cleanup_delete_inode(inode, 0); 1065 goto bail_unlock_nfs_sync; 1066 } 1067 1068 di = (struct ocfs2_dinode *)di_bh->b_data; 1069 /* Skip inode deletion and wait for dio orphan entry recovered 1070 * first */ 1071 if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { 1072 ocfs2_cleanup_delete_inode(inode, 0); 1073 goto bail_unlock_inode; 1074 } 1075 1076 /* Query the cluster. This will be the final decision made 1077 * before we go ahead and wipe the inode. */ 1078 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); 1079 if (!wipe || status < 0) { 1080 /* Error and remote inode busy both mean we won't be 1081 * removing the inode, so they take almost the same 1082 * path. */ 1083 if (status < 0) 1084 mlog_errno(status); 1085 1086 /* Someone in the cluster has disallowed a wipe of 1087 * this inode, or it was never completely 1088 * orphaned. Write out the pages and exit now. */ 1089 ocfs2_cleanup_delete_inode(inode, 1); 1090 goto bail_unlock_inode; 1091 } 1092 1093 ocfs2_cleanup_delete_inode(inode, 0); 1094 1095 status = ocfs2_wipe_inode(inode, di_bh); 1096 if (status < 0) { 1097 if (status != -EDEADLK) 1098 mlog_errno(status); 1099 goto bail_unlock_inode; 1100 } 1101 1102 /* 1103 * Mark the inode as successfully deleted. 1104 * 1105 * This is important for ocfs2_clear_inode() as it will check 1106 * this flag and skip any checkpointing work 1107 * 1108 * ocfs2_stuff_meta_lvb() also uses this flag to invalidate 1109 * the LVB for other nodes. 1110 */ 1111 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; 1112 1113 bail_unlock_inode: 1114 ocfs2_inode_unlock(inode, 1); 1115 brelse(di_bh); 1116 1117 bail_unlock_nfs_sync: 1118 ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0); 1119 1120 bail_unblock: 1121 ocfs2_unblock_signals(&oldset); 1122 bail: 1123 return; 1124 } 1125 1126 static void ocfs2_clear_inode(struct inode *inode) 1127 { 1128 int status; 1129 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1130 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1131 1132 clear_inode(inode); 1133 trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, 1134 inode->i_nlink); 1135 1136 mlog_bug_on_msg(osb == NULL, 1137 "Inode=%lu\n", inode->i_ino); 1138 1139 dquot_drop(inode); 1140 1141 /* To preven remote deletes we hold open lock before, now it 1142 * is time to unlock PR and EX open locks. */ 1143 ocfs2_open_unlock(inode); 1144 1145 /* Do these before all the other work so that we don't bounce 1146 * the downconvert thread while waiting to destroy the locks. */ 1147 ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres); 1148 ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres); 1149 ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres); 1150 1151 ocfs2_resv_discard(&osb->osb_la_resmap, 1152 &oi->ip_la_data_resv); 1153 ocfs2_resv_init_once(&oi->ip_la_data_resv); 1154 1155 /* We very well may get a clear_inode before all an inodes 1156 * metadata has hit disk. Of course, we can't drop any cluster 1157 * locks until the journal has finished with it. The only 1158 * exception here are successfully wiped inodes - their 1159 * metadata can now be considered to be part of the system 1160 * inodes from which it came. */ 1161 if (!(oi->ip_flags & OCFS2_INODE_DELETED)) 1162 ocfs2_checkpoint_inode(inode); 1163 1164 mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), 1165 "Clear inode of %llu, inode has io markers\n", 1166 (unsigned long long)oi->ip_blkno); 1167 mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list), 1168 "Clear inode of %llu, inode has unwritten extents\n", 1169 (unsigned long long)oi->ip_blkno); 1170 1171 ocfs2_extent_map_trunc(inode, 0); 1172 1173 status = ocfs2_drop_inode_locks(inode); 1174 if (status < 0) 1175 mlog_errno(status); 1176 1177 ocfs2_lock_res_free(&oi->ip_rw_lockres); 1178 ocfs2_lock_res_free(&oi->ip_inode_lockres); 1179 ocfs2_lock_res_free(&oi->ip_open_lockres); 1180 1181 ocfs2_metadata_cache_exit(INODE_CACHE(inode)); 1182 1183 mlog_bug_on_msg(INODE_CACHE(inode)->ci_num_cached, 1184 "Clear inode of %llu, inode has %u cache items\n", 1185 (unsigned long long)oi->ip_blkno, 1186 INODE_CACHE(inode)->ci_num_cached); 1187 1188 mlog_bug_on_msg(!(INODE_CACHE(inode)->ci_flags & OCFS2_CACHE_FL_INLINE), 1189 "Clear inode of %llu, inode has a bad flag\n", 1190 (unsigned long long)oi->ip_blkno); 1191 1192 mlog_bug_on_msg(spin_is_locked(&oi->ip_lock), 1193 "Clear inode of %llu, inode is locked\n", 1194 (unsigned long long)oi->ip_blkno); 1195 1196 mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex), 1197 "Clear inode of %llu, io_mutex is locked\n", 1198 (unsigned long long)oi->ip_blkno); 1199 mutex_unlock(&oi->ip_io_mutex); 1200 1201 /* 1202 * down_trylock() returns 0, down_write_trylock() returns 1 1203 * kernel 1, world 0 1204 */ 1205 mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem), 1206 "Clear inode of %llu, alloc_sem is locked\n", 1207 (unsigned long long)oi->ip_blkno); 1208 up_write(&oi->ip_alloc_sem); 1209 1210 mlog_bug_on_msg(oi->ip_open_count, 1211 "Clear inode of %llu has open count %d\n", 1212 (unsigned long long)oi->ip_blkno, oi->ip_open_count); 1213 1214 /* Clear all other flags. */ 1215 oi->ip_flags = 0; 1216 oi->ip_dir_start_lookup = 0; 1217 oi->ip_blkno = 0ULL; 1218 1219 /* 1220 * ip_jinode is used to track txns against this inode. We ensure that 1221 * the journal is flushed before journal shutdown. Thus it is safe to 1222 * have inodes get cleaned up after journal shutdown. 1223 */ 1224 jbd2_journal_release_jbd_inode(osb->journal->j_journal, 1225 &oi->ip_jinode); 1226 } 1227 1228 void ocfs2_evict_inode(struct inode *inode) 1229 { 1230 if (!inode->i_nlink || 1231 (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) { 1232 ocfs2_delete_inode(inode); 1233 } else { 1234 truncate_inode_pages_final(&inode->i_data); 1235 } 1236 ocfs2_clear_inode(inode); 1237 } 1238 1239 /* Called under inode_lock, with no more references on the 1240 * struct inode, so it's safe here to check the flags field 1241 * and to manipulate i_nlink without any other locks. */ 1242 int ocfs2_drop_inode(struct inode *inode) 1243 { 1244 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1245 1246 trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, 1247 inode->i_nlink, oi->ip_flags); 1248 1249 assert_spin_locked(&inode->i_lock); 1250 inode->i_state |= I_WILL_FREE; 1251 spin_unlock(&inode->i_lock); 1252 write_inode_now(inode, 1); 1253 spin_lock(&inode->i_lock); 1254 WARN_ON(inode->i_state & I_NEW); 1255 inode->i_state &= ~I_WILL_FREE; 1256 1257 return 1; 1258 } 1259 1260 /* 1261 * This is called from our getattr. 1262 */ 1263 int ocfs2_inode_revalidate(struct dentry *dentry) 1264 { 1265 struct inode *inode = d_inode(dentry); 1266 int status = 0; 1267 1268 trace_ocfs2_inode_revalidate(inode, 1269 inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL, 1270 inode ? (unsigned long long)OCFS2_I(inode)->ip_flags : 0); 1271 1272 if (!inode) { 1273 status = -ENOENT; 1274 goto bail; 1275 } 1276 1277 spin_lock(&OCFS2_I(inode)->ip_lock); 1278 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 1279 spin_unlock(&OCFS2_I(inode)->ip_lock); 1280 status = -ENOENT; 1281 goto bail; 1282 } 1283 spin_unlock(&OCFS2_I(inode)->ip_lock); 1284 1285 /* Let ocfs2_inode_lock do the work of updating our struct 1286 * inode for us. */ 1287 status = ocfs2_inode_lock(inode, NULL, 0); 1288 if (status < 0) { 1289 if (status != -ENOENT) 1290 mlog_errno(status); 1291 goto bail; 1292 } 1293 ocfs2_inode_unlock(inode, 0); 1294 bail: 1295 return status; 1296 } 1297 1298 /* 1299 * Updates a disk inode from a 1300 * struct inode. 1301 * Only takes ip_lock. 1302 */ 1303 int ocfs2_mark_inode_dirty(handle_t *handle, 1304 struct inode *inode, 1305 struct buffer_head *bh) 1306 { 1307 int status; 1308 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; 1309 1310 trace_ocfs2_mark_inode_dirty((unsigned long long)OCFS2_I(inode)->ip_blkno); 1311 1312 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 1313 OCFS2_JOURNAL_ACCESS_WRITE); 1314 if (status < 0) { 1315 mlog_errno(status); 1316 goto leave; 1317 } 1318 1319 spin_lock(&OCFS2_I(inode)->ip_lock); 1320 fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); 1321 ocfs2_get_inode_flags(OCFS2_I(inode)); 1322 fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr); 1323 fe->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features); 1324 spin_unlock(&OCFS2_I(inode)->ip_lock); 1325 1326 fe->i_size = cpu_to_le64(i_size_read(inode)); 1327 ocfs2_set_links_count(fe, inode->i_nlink); 1328 fe->i_uid = cpu_to_le32(i_uid_read(inode)); 1329 fe->i_gid = cpu_to_le32(i_gid_read(inode)); 1330 fe->i_mode = cpu_to_le16(inode->i_mode); 1331 fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 1332 fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); 1333 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 1334 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 1335 fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); 1336 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1337 1338 ocfs2_journal_dirty(handle, bh); 1339 ocfs2_update_inode_fsync_trans(handle, inode, 1); 1340 leave: 1341 return status; 1342 } 1343 1344 /* 1345 * 1346 * Updates a struct inode from a disk inode. 1347 * does no i/o, only takes ip_lock. 1348 */ 1349 void ocfs2_refresh_inode(struct inode *inode, 1350 struct ocfs2_dinode *fe) 1351 { 1352 spin_lock(&OCFS2_I(inode)->ip_lock); 1353 1354 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 1355 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); 1356 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); 1357 ocfs2_set_inode_flags(inode); 1358 i_size_write(inode, le64_to_cpu(fe->i_size)); 1359 set_nlink(inode, ocfs2_read_links_count(fe)); 1360 i_uid_write(inode, le32_to_cpu(fe->i_uid)); 1361 i_gid_write(inode, le32_to_cpu(fe->i_gid)); 1362 inode->i_mode = le16_to_cpu(fe->i_mode); 1363 if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) 1364 inode->i_blocks = 0; 1365 else 1366 inode->i_blocks = ocfs2_inode_sector_count(inode); 1367 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); 1368 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); 1369 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); 1370 inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); 1371 inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); 1372 inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); 1373 1374 spin_unlock(&OCFS2_I(inode)->ip_lock); 1375 } 1376 1377 int ocfs2_validate_inode_block(struct super_block *sb, 1378 struct buffer_head *bh) 1379 { 1380 int rc; 1381 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 1382 1383 trace_ocfs2_validate_inode_block((unsigned long long)bh->b_blocknr); 1384 1385 BUG_ON(!buffer_uptodate(bh)); 1386 1387 /* 1388 * If the ecc fails, we return the error but otherwise 1389 * leave the filesystem running. We know any error is 1390 * local to this block. 1391 */ 1392 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); 1393 if (rc) { 1394 mlog(ML_ERROR, "Checksum failed for dinode %llu\n", 1395 (unsigned long long)bh->b_blocknr); 1396 goto bail; 1397 } 1398 1399 /* 1400 * Errors after here are fatal. 1401 */ 1402 1403 rc = -EINVAL; 1404 1405 if (!OCFS2_IS_VALID_DINODE(di)) { 1406 rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", 1407 (unsigned long long)bh->b_blocknr, 7, 1408 di->i_signature); 1409 goto bail; 1410 } 1411 1412 if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { 1413 rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", 1414 (unsigned long long)bh->b_blocknr, 1415 (unsigned long long)le64_to_cpu(di->i_blkno)); 1416 goto bail; 1417 } 1418 1419 if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { 1420 rc = ocfs2_error(sb, 1421 "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", 1422 (unsigned long long)bh->b_blocknr); 1423 goto bail; 1424 } 1425 1426 if (le32_to_cpu(di->i_fs_generation) != 1427 OCFS2_SB(sb)->fs_generation) { 1428 rc = ocfs2_error(sb, 1429 "Invalid dinode #%llu: fs_generation is %u\n", 1430 (unsigned long long)bh->b_blocknr, 1431 le32_to_cpu(di->i_fs_generation)); 1432 goto bail; 1433 } 1434 1435 rc = 0; 1436 1437 bail: 1438 return rc; 1439 } 1440 1441 static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, 1442 struct buffer_head *bh) 1443 { 1444 int rc = 0; 1445 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 1446 1447 trace_ocfs2_filecheck_validate_inode_block( 1448 (unsigned long long)bh->b_blocknr); 1449 1450 BUG_ON(!buffer_uptodate(bh)); 1451 1452 /* 1453 * Call ocfs2_validate_meta_ecc() first since it has ecc repair 1454 * function, but we should not return error immediately when ecc 1455 * validation fails, because the reason is quite likely the invalid 1456 * inode number inputed. 1457 */ 1458 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); 1459 if (rc) { 1460 mlog(ML_ERROR, 1461 "Filecheck: checksum failed for dinode %llu\n", 1462 (unsigned long long)bh->b_blocknr); 1463 rc = -OCFS2_FILECHECK_ERR_BLOCKECC; 1464 } 1465 1466 if (!OCFS2_IS_VALID_DINODE(di)) { 1467 mlog(ML_ERROR, 1468 "Filecheck: invalid dinode #%llu: signature = %.*s\n", 1469 (unsigned long long)bh->b_blocknr, 7, di->i_signature); 1470 rc = -OCFS2_FILECHECK_ERR_INVALIDINO; 1471 goto bail; 1472 } else if (rc) 1473 goto bail; 1474 1475 if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { 1476 mlog(ML_ERROR, 1477 "Filecheck: invalid dinode #%llu: i_blkno is %llu\n", 1478 (unsigned long long)bh->b_blocknr, 1479 (unsigned long long)le64_to_cpu(di->i_blkno)); 1480 rc = -OCFS2_FILECHECK_ERR_BLOCKNO; 1481 goto bail; 1482 } 1483 1484 if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { 1485 mlog(ML_ERROR, 1486 "Filecheck: invalid dinode #%llu: OCFS2_VALID_FL " 1487 "not set\n", 1488 (unsigned long long)bh->b_blocknr); 1489 rc = -OCFS2_FILECHECK_ERR_VALIDFLAG; 1490 goto bail; 1491 } 1492 1493 if (le32_to_cpu(di->i_fs_generation) != 1494 OCFS2_SB(sb)->fs_generation) { 1495 mlog(ML_ERROR, 1496 "Filecheck: invalid dinode #%llu: fs_generation is %u\n", 1497 (unsigned long long)bh->b_blocknr, 1498 le32_to_cpu(di->i_fs_generation)); 1499 rc = -OCFS2_FILECHECK_ERR_GENERATION; 1500 } 1501 1502 bail: 1503 return rc; 1504 } 1505 1506 static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, 1507 struct buffer_head *bh) 1508 { 1509 int changed = 0; 1510 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 1511 1512 if (!ocfs2_filecheck_validate_inode_block(sb, bh)) 1513 return 0; 1514 1515 trace_ocfs2_filecheck_repair_inode_block( 1516 (unsigned long long)bh->b_blocknr); 1517 1518 if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || 1519 ocfs2_is_soft_readonly(OCFS2_SB(sb))) { 1520 mlog(ML_ERROR, 1521 "Filecheck: cannot repair dinode #%llu " 1522 "on readonly filesystem\n", 1523 (unsigned long long)bh->b_blocknr); 1524 return -OCFS2_FILECHECK_ERR_READONLY; 1525 } 1526 1527 if (buffer_jbd(bh)) { 1528 mlog(ML_ERROR, 1529 "Filecheck: cannot repair dinode #%llu, " 1530 "its buffer is in jbd\n", 1531 (unsigned long long)bh->b_blocknr); 1532 return -OCFS2_FILECHECK_ERR_INJBD; 1533 } 1534 1535 if (!OCFS2_IS_VALID_DINODE(di)) { 1536 /* Cannot fix invalid inode block */ 1537 return -OCFS2_FILECHECK_ERR_INVALIDINO; 1538 } 1539 1540 if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { 1541 /* Cannot just add VALID_FL flag back as a fix, 1542 * need more things to check here. 1543 */ 1544 return -OCFS2_FILECHECK_ERR_VALIDFLAG; 1545 } 1546 1547 if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { 1548 di->i_blkno = cpu_to_le64(bh->b_blocknr); 1549 changed = 1; 1550 mlog(ML_ERROR, 1551 "Filecheck: reset dinode #%llu: i_blkno to %llu\n", 1552 (unsigned long long)bh->b_blocknr, 1553 (unsigned long long)le64_to_cpu(di->i_blkno)); 1554 } 1555 1556 if (le32_to_cpu(di->i_fs_generation) != 1557 OCFS2_SB(sb)->fs_generation) { 1558 di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1559 changed = 1; 1560 mlog(ML_ERROR, 1561 "Filecheck: reset dinode #%llu: fs_generation to %u\n", 1562 (unsigned long long)bh->b_blocknr, 1563 le32_to_cpu(di->i_fs_generation)); 1564 } 1565 1566 if (changed || ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) { 1567 ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check); 1568 mark_buffer_dirty(bh); 1569 mlog(ML_ERROR, 1570 "Filecheck: reset dinode #%llu: compute meta ecc\n", 1571 (unsigned long long)bh->b_blocknr); 1572 } 1573 1574 return 0; 1575 } 1576 1577 static int 1578 ocfs2_filecheck_read_inode_block_full(struct inode *inode, 1579 struct buffer_head **bh, 1580 int flags, int type) 1581 { 1582 int rc; 1583 struct buffer_head *tmp = *bh; 1584 1585 if (!type) /* Check inode block */ 1586 rc = ocfs2_read_blocks(INODE_CACHE(inode), 1587 OCFS2_I(inode)->ip_blkno, 1588 1, &tmp, flags, 1589 ocfs2_filecheck_validate_inode_block); 1590 else /* Repair inode block */ 1591 rc = ocfs2_read_blocks(INODE_CACHE(inode), 1592 OCFS2_I(inode)->ip_blkno, 1593 1, &tmp, flags, 1594 ocfs2_filecheck_repair_inode_block); 1595 1596 /* If ocfs2_read_blocks() got us a new bh, pass it up. */ 1597 if (!rc && !*bh) 1598 *bh = tmp; 1599 1600 return rc; 1601 } 1602 1603 int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, 1604 int flags) 1605 { 1606 int rc; 1607 struct buffer_head *tmp = *bh; 1608 1609 rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno, 1610 1, &tmp, flags, ocfs2_validate_inode_block); 1611 1612 /* If ocfs2_read_blocks() got us a new bh, pass it up. */ 1613 if (!rc && !*bh) 1614 *bh = tmp; 1615 1616 return rc; 1617 } 1618 1619 int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh) 1620 { 1621 return ocfs2_read_inode_block_full(inode, bh, 0); 1622 } 1623 1624 1625 static u64 ocfs2_inode_cache_owner(struct ocfs2_caching_info *ci) 1626 { 1627 struct ocfs2_inode_info *oi = cache_info_to_inode(ci); 1628 1629 return oi->ip_blkno; 1630 } 1631 1632 static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info *ci) 1633 { 1634 struct ocfs2_inode_info *oi = cache_info_to_inode(ci); 1635 1636 return oi->vfs_inode.i_sb; 1637 } 1638 1639 static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci) 1640 { 1641 struct ocfs2_inode_info *oi = cache_info_to_inode(ci); 1642 1643 spin_lock(&oi->ip_lock); 1644 } 1645 1646 static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci) 1647 { 1648 struct ocfs2_inode_info *oi = cache_info_to_inode(ci); 1649 1650 spin_unlock(&oi->ip_lock); 1651 } 1652 1653 static void ocfs2_inode_cache_io_lock(struct ocfs2_caching_info *ci) 1654 { 1655 struct ocfs2_inode_info *oi = cache_info_to_inode(ci); 1656 1657 mutex_lock(&oi->ip_io_mutex); 1658 } 1659 1660 static void ocfs2_inode_cache_io_unlock(struct ocfs2_caching_info *ci) 1661 { 1662 struct ocfs2_inode_info *oi = cache_info_to_inode(ci); 1663 1664 mutex_unlock(&oi->ip_io_mutex); 1665 } 1666 1667 const struct ocfs2_caching_operations ocfs2_inode_caching_ops = { 1668 .co_owner = ocfs2_inode_cache_owner, 1669 .co_get_super = ocfs2_inode_cache_get_super, 1670 .co_cache_lock = ocfs2_inode_cache_lock, 1671 .co_cache_unlock = ocfs2_inode_cache_unlock, 1672 .co_io_lock = ocfs2_inode_cache_io_lock, 1673 .co_io_unlock = ocfs2_inode_cache_io_unlock, 1674 }; 1675 1676