/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include <linux/log2.h>

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_space.h"
#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_btree.h"
#include "xfs_dir2_format.h"
#include "xfs_dir2.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_attr_sf.h"
#include "xfs_attr.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_btree.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"

kmem_zone_t *xfs_inode_zone;

/*
 * Used in xfs_itruncate_extents().  This is the maximum number of extents
 * freed from a file in a single transaction.
 */
#define	XFS_ITRUNC_MAX_EXTENTS	2

STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);

/*
 * Helper function to extract the extent size hint from an inode.
 */
xfs_extlen_t
xfs_get_extsz_hint(
	struct xfs_inode	*ip)
{
	if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
		return ip->i_d.di_extsize;
	if (XFS_IS_REALTIME_INODE(ip))
		return ip->i_mount->m_sb.sb_rextsize;
	return 0;
}

/*
 * This is a wrapper routine around the xfs_ilock() routine used to centralize
 * some grungy code.  It is used in places that wish to lock the inode solely
 * for reading the extents.  The reason these places can't just call
 * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the
 * extents from disk for a file in b-tree format.  If the inode is in b-tree
 * format, then we need to lock the inode exclusively until the extents are
 * read in.  Locking it exclusively all the time would limit our parallelism
 * unnecessarily, though.  What we do instead is check to see if the extents
 * have been read in yet, and only lock the inode exclusively if they have not.
 *
 * The function returns a value which should be given to the corresponding
 * xfs_iunlock_map_shared().  This value is the mode in which the lock was
 * actually taken.
 */
uint
xfs_ilock_map_shared(
	xfs_inode_t	*ip)
{
	uint	lock_mode;

	if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
	    ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
		lock_mode = XFS_ILOCK_EXCL;
	} else {
		lock_mode = XFS_ILOCK_SHARED;
	}

	xfs_ilock(ip, lock_mode);

	return lock_mode;
}

/*
 * This is simply the unlock routine to go with xfs_ilock_map_shared().
 * All it does is call xfs_iunlock() with the given lock_mode.
 */
void
xfs_iunlock_map_shared(
	xfs_inode_t	*ip,
	unsigned int	lock_mode)
{
	xfs_iunlock(ip, lock_mode);
}

/*
 * The xfs inode contains 2 locks: a multi-reader lock called the
 * i_iolock and a multi-reader lock called the i_lock.  This routine
 * allows either or both of the locks to be obtained.
 *
 * The 2 locks should always be ordered so that the IO lock is
 * obtained first in order to prevent deadlock.
 *
 * ip -- the inode being locked
 * lock_flags -- this parameter indicates the inode's locks
 *       to be locked.  It can be:
 *		XFS_IOLOCK_SHARED,
 *		XFS_IOLOCK_EXCL,
 *		XFS_ILOCK_SHARED,
 *		XFS_ILOCK_EXCL,
 *		XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
 *		XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
 *		XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
 *		XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
 */
void
xfs_ilock(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	trace_xfs_ilock(ip, lock_flags, _RET_IP_);

	/*
	 * You can't set both SHARED and EXCL for the same lock,
	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);

	if (lock_flags & XFS_IOLOCK_EXCL)
		mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
	else if (lock_flags & XFS_IOLOCK_SHARED)
		mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));

	if (lock_flags & XFS_ILOCK_EXCL)
		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
	else if (lock_flags & XFS_ILOCK_SHARED)
		mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
}
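/*
 * Example (illustrative sketch, not part of the original code): a caller
 * that modifies in-core inode metadata takes the ilock exclusively around
 * the update and must release it with the same flags:
 *
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	...modify ip->i_d fields...
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 */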
/*
 * This is just like xfs_ilock(), except that the caller
 * is guaranteed not to sleep.  It returns 1 if it gets
 * the requested locks and 0 otherwise.  If the IO lock is
 * obtained but the inode lock cannot be, then the IO lock
 * is dropped before returning.
 *
 * ip -- the inode being locked
 * lock_flags -- this parameter indicates the inode's locks to be
 *       locked.  See the comment for xfs_ilock() for a list
 *	 of valid values.
 */
int
xfs_ilock_nowait(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);

	/*
	 * You can't set both SHARED and EXCL for the same lock,
	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);

	if (lock_flags & XFS_IOLOCK_EXCL) {
		if (!mrtryupdate(&ip->i_iolock))
			goto out;
	} else if (lock_flags & XFS_IOLOCK_SHARED) {
		if (!mrtryaccess(&ip->i_iolock))
			goto out;
	}
	if (lock_flags & XFS_ILOCK_EXCL) {
		if (!mrtryupdate(&ip->i_lock))
			goto out_undo_iolock;
	} else if (lock_flags & XFS_ILOCK_SHARED) {
		if (!mrtryaccess(&ip->i_lock))
			goto out_undo_iolock;
	}
	return 1;

 out_undo_iolock:
	if (lock_flags & XFS_IOLOCK_EXCL)
		mrunlock_excl(&ip->i_iolock);
	else if (lock_flags & XFS_IOLOCK_SHARED)
		mrunlock_shared(&ip->i_iolock);
 out:
	return 0;
}

/*
 * xfs_iunlock() is used to drop the inode locks acquired with
 * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 * that we know which locks to drop.
 *
 * ip -- the inode being unlocked
 * lock_flags -- this parameter indicates the inode's locks to be
 *       unlocked.  See the comment for xfs_ilock() for a list
 *	 of valid values for this parameter.
 */
void
xfs_iunlock(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	/*
	 * You can't set both SHARED and EXCL for the same lock,
	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
	ASSERT(lock_flags != 0);

	if (lock_flags & XFS_IOLOCK_EXCL)
		mrunlock_excl(&ip->i_iolock);
	else if (lock_flags & XFS_IOLOCK_SHARED)
		mrunlock_shared(&ip->i_iolock);

	if (lock_flags & XFS_ILOCK_EXCL)
		mrunlock_excl(&ip->i_lock);
	else if (lock_flags & XFS_ILOCK_SHARED)
		mrunlock_shared(&ip->i_lock);

	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}
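/*
 * Example (illustrative sketch): nonblocking callers back off and retry
 * when the locks cannot be taken immediately, e.g.:
 *
 *	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL))
 *		return EAGAIN;	(hypothetical fallback)
 *
 * Note that a failed attempt leaves no locks held; the IO lock is
 * dropped internally if only the inode lock could not be taken.
 */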
/*
 * Give up write locks.  The i/o lock cannot be held nested
 * if it is being demoted.
 */
void
xfs_ilock_demote(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);

	if (lock_flags & XFS_ILOCK_EXCL)
		mrdemote(&ip->i_lock);
	if (lock_flags & XFS_IOLOCK_EXCL)
		mrdemote(&ip->i_iolock);

	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}

#if defined(DEBUG) || defined(XFS_WARN)
int
xfs_isilocked(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
		if (!(lock_flags & XFS_ILOCK_SHARED))
			return !!ip->i_lock.mr_writer;
		return rwsem_is_locked(&ip->i_lock.mr_lock);
	}

	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
		if (!(lock_flags & XFS_IOLOCK_SHARED))
			return !!ip->i_iolock.mr_writer;
		return rwsem_is_locked(&ip->i_iolock.mr_lock);
	}

	ASSERT(0);
	return 0;
}
#endif

#ifdef DEBUG
int xfs_locked_n;
int xfs_small_retries;
int xfs_middle_retries;
int xfs_lots_retries;
int xfs_lock_delays;
#endif

/*
 * Bump the subclass so xfs_lock_inodes() acquires each lock with
 * a different value.
 */
static inline int
xfs_lock_inumorder(int lock_mode, int subclass)
{
	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
		lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
	if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
		lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;

	return lock_mode;
}
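/*
 * Example (illustrative): for the third inode in a set (subclass 2),
 * xfs_lock_inumorder(XFS_ILOCK_EXCL, 2) ORs (2 + XFS_LOCK_INUMORDER),
 * shifted by XFS_ILOCK_SHIFT, into the lock mode.  Each inode locked by
 * xfs_lock_inodes() therefore carries a distinct lockdep subclass, which
 * keeps lockdep from flagging the nested acquisitions as self-deadlock.
 */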
/*
 * The following routine will lock n inodes in exclusive mode.
 * We assume the caller calls us with the inodes in i_ino order.
 *
 * We need to detect deadlock where an inode that we lock
 * is in the AIL and we start waiting for another inode that is locked
 * by a thread in a long running transaction (such as truncate). This can
 * result in deadlock since the long running trans might need to wait
 * for the inode we just locked in order to push the tail and free space
 * in the log.
 */
void
xfs_lock_inodes(
	xfs_inode_t	**ips,
	int		inodes,
	uint		lock_mode)
{
	int		attempts = 0, i, j, try_lock;
	xfs_log_item_t	*lp;

	ASSERT(ips && (inodes >= 2)); /* we need at least two */

	try_lock = 0;
	i = 0;

again:
	for (; i < inodes; i++) {
		ASSERT(ips[i]);

		if (i && (ips[i] == ips[i-1]))	/* Already locked */
			continue;

		/*
		 * If try_lock is not set yet, make sure all locked inodes
		 * are not in the AIL.
		 * If any are, set try_lock to be used later.
		 */

		if (!try_lock) {
			for (j = (i - 1); j >= 0 && !try_lock; j--) {
				lp = (xfs_log_item_t *)ips[j]->i_itemp;
				if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
					try_lock++;
				}
			}
		}

		/*
		 * If any of the previous locks we have locked is in the AIL,
		 * we must TRY to get the second and subsequent locks. If
		 * we can't get any, we must release all we have
		 * and try again.
		 */

		if (try_lock) {
			/* try_lock must be 0 if i is 0. */
			/*
			 * try_lock means we have an inode locked
			 * that is in the AIL.
			 */
			ASSERT(i != 0);
			if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
				attempts++;

				/*
				 * Unlock all previous guys and try again.
				 * xfs_iunlock will try to push the tail
				 * if the inode is in the AIL.
				 */

				for (j = i - 1; j >= 0; j--) {

					/*
					 * Check to see if we've already
					 * unlocked this one.
					 * Not the first one going back,
					 * and the inode ptr is the same.
					 */
					if ((j != (i - 1)) && ips[j] ==
								ips[j+1])
						continue;

					xfs_iunlock(ips[j], lock_mode);
				}

				if ((attempts % 5) == 0) {
					delay(1); /* Don't just spin the CPU */
#ifdef DEBUG
					xfs_lock_delays++;
#endif
				}
				i = 0;
				try_lock = 0;
				goto again;
			}
		} else {
			xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
		}
	}

#ifdef DEBUG
	if (attempts) {
		if (attempts < 5) xfs_small_retries++;
		else if (attempts < 100) xfs_middle_retries++;
		else xfs_lots_retries++;
	} else {
		xfs_locked_n++;
	}
#endif
}

/*
 * xfs_lock_two_inodes() can only be used to lock one type of lock
 * at a time - the iolock or the ilock, but not both at once. If
 * we lock both at once, lockdep will report false positives saying
 * we have violated locking orders.
 */
void
xfs_lock_two_inodes(
	xfs_inode_t		*ip0,
	xfs_inode_t		*ip1,
	uint			lock_mode)
{
	xfs_inode_t		*temp;
	int			attempts = 0;
	xfs_log_item_t		*lp;

	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
		ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
	ASSERT(ip0->i_ino != ip1->i_ino);

	if (ip0->i_ino > ip1->i_ino) {
		temp = ip0;
		ip0 = ip1;
		ip1 = temp;
	}

again:
	xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));

	/*
	 * If the first lock we have locked is in the AIL, we must TRY to get
	 * the second lock. If we can't get it, we must release the first one
	 * and try again.
	 */
	lp = (xfs_log_item_t *)ip0->i_itemp;
	if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
		if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
			xfs_iunlock(ip0, lock_mode);
			if ((++attempts % 5) == 0)
				delay(1); /* Don't just spin the CPU */
			goto again;
		}
	} else {
		xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
	}
}


void
__xfs_iflock(
	struct xfs_inode	*ip)
{
	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);

	do {
		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
		if (xfs_isiflocked(ip))
			io_schedule();
	} while (!xfs_iflock_nowait(ip));

	finish_wait(wq, &wait.wait);
}

STATIC uint
_xfs_dic2xflags(
	__uint16_t		di_flags)
{
	uint			flags = 0;

	if (di_flags & XFS_DIFLAG_ANY) {
		if (di_flags & XFS_DIFLAG_REALTIME)
			flags |= XFS_XFLAG_REALTIME;
		if (di_flags & XFS_DIFLAG_PREALLOC)
			flags |= XFS_XFLAG_PREALLOC;
		if (di_flags & XFS_DIFLAG_IMMUTABLE)
			flags |= XFS_XFLAG_IMMUTABLE;
		if (di_flags & XFS_DIFLAG_APPEND)
			flags |= XFS_XFLAG_APPEND;
		if (di_flags & XFS_DIFLAG_SYNC)
			flags |= XFS_XFLAG_SYNC;
		if (di_flags & XFS_DIFLAG_NOATIME)
			flags |= XFS_XFLAG_NOATIME;
		if (di_flags & XFS_DIFLAG_NODUMP)
			flags |= XFS_XFLAG_NODUMP;
		if (di_flags & XFS_DIFLAG_RTINHERIT)
			flags |= XFS_XFLAG_RTINHERIT;
		if (di_flags & XFS_DIFLAG_PROJINHERIT)
			flags |= XFS_XFLAG_PROJINHERIT;
		if (di_flags & XFS_DIFLAG_NOSYMLINKS)
			flags |= XFS_XFLAG_NOSYMLINKS;
		if (di_flags & XFS_DIFLAG_EXTSIZE)
			flags |= XFS_XFLAG_EXTSIZE;
		if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
			flags |= XFS_XFLAG_EXTSZINHERIT;
		if (di_flags & XFS_DIFLAG_NODEFRAG)
			flags |= XFS_XFLAG_NODEFRAG;
		if (di_flags & XFS_DIFLAG_FILESTREAM)
			flags |= XFS_XFLAG_FILESTREAM;
	}

	return flags;
}

uint
xfs_ip2xflags(
	xfs_inode_t		*ip)
{
	xfs_icdinode_t		*dic = &ip->i_d;

	return _xfs_dic2xflags(dic->di_flags) |
				(XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
}

uint
xfs_dic2xflags(
	xfs_dinode_t		*dip)
{
	return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
				(XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
}

/*
 * Looks up an inode from "name". If ci_name is not NULL, then a CI match
 * is allowed, otherwise it has to be an exact match. If a CI match is found,
 * ci_name->name will point to the actual name (caller must free) or
 * will be set to NULL if an exact match is found.
 */
int
xfs_lookup(
	xfs_inode_t		*dp,
	struct xfs_name		*name,
	xfs_inode_t		**ipp,
	struct xfs_name		*ci_name)
{
	xfs_ino_t		inum;
	int			error;
	uint			lock_mode;

	trace_xfs_lookup(dp, name);

	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
		return XFS_ERROR(EIO);

	lock_mode = xfs_ilock_map_shared(dp);
	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
	xfs_iunlock_map_shared(dp, lock_mode);

	if (error)
		goto out;

	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
	if (error)
		goto out_free_name;

	return 0;

out_free_name:
	if (ci_name)
		kmem_free(ci_name->name);
out:
	*ipp = NULL;
	return error;
}
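/*
 * Example (illustrative sketch): a directory lookup by name, checking the
 * translated xflags of the resulting inode:
 *
 *	struct xfs_inode *ip;
 *	error = xfs_lookup(dp, &xname, &ip, NULL);
 *	if (!error && (xfs_ip2xflags(ip) & XFS_XFLAG_IMMUTABLE))
 *		...the file is immutable...
 */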
/*
 * Allocate an inode on disk and return a copy of its in-core version.
 * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
 * appropriately within the inode.  The uid and gid for the inode are
 * set according to the contents of the given cred structure.
 *
 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
 * has a free inode available, call xfs_iget() to obtain the in-core
 * version of the allocated inode.  Finally, fill in the inode and
 * log its initial contents.  In this case, ialloc_context would be
 * set to NULL.
 *
 * If xfs_dialloc() does not have an available inode, it will replenish
 * its supply by doing an allocation. Since we can only do one
 * allocation within a transaction without deadlocks, we must commit
 * the current transaction before returning the inode itself.
 * In this case, therefore, we will set ialloc_context and return.
 * The caller should then commit the current transaction, start a new
 * transaction, and call xfs_ialloc() again to actually get the inode.
 *
 * To ensure that some other process does not grab the inode that
 * was allocated during the first call to xfs_ialloc(), this routine
 * also returns the [locked] bp pointing to the head of the freelist
 * as ialloc_context.  The caller should hold this buffer across
 * the commit and pass it back into this routine on the second call.
 *
 * If we are allocating quota inodes, we do not have a parent inode
 * to attach to or associate with (i.e. pip == NULL) because they
 * are not linked into the directory structure - they are attached
 * directly to the superblock - and so have no parent.
 */
int
xfs_ialloc(
	xfs_trans_t	*tp,
	xfs_inode_t	*pip,
	umode_t		mode,
	xfs_nlink_t	nlink,
	xfs_dev_t	rdev,
	prid_t		prid,
	int		okalloc,
	xfs_buf_t	**ialloc_context,
	xfs_inode_t	**ipp)
{
	struct xfs_mount *mp = tp->t_mountp;
	xfs_ino_t	ino;
	xfs_inode_t	*ip;
	uint		flags;
	int		error;
	timespec_t	tv;
	int		filestreams = 0;

	/*
	 * Call the space management code to pick
	 * the on-disk inode to be allocated.
	 */
	error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
			    ialloc_context, &ino);
	if (error)
		return error;
	if (*ialloc_context || ino == NULLFSINO) {
		*ipp = NULL;
		return 0;
	}
	ASSERT(*ialloc_context == NULL);

	/*
	 * Get the in-core inode with the lock held exclusively.
	 * This is because we're setting fields here we need
	 * to prevent others from looking at until we're done.
	 */
	error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE,
			 XFS_ILOCK_EXCL, &ip);
	if (error)
		return error;
	ASSERT(ip != NULL);

	ip->i_d.di_mode = mode;
	ip->i_d.di_onlink = 0;
	ip->i_d.di_nlink = nlink;
	ASSERT(ip->i_d.di_nlink == nlink);
	ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
	ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
	xfs_set_projid(ip, prid);
	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));

	/*
	 * If the superblock version is up to where we support new format
	 * inodes and this is currently an old format inode, then change
	 * the inode version number now.  This way we only do the conversion
	 * here rather than here and in the flush/logging code.
	 */
	if (xfs_sb_version_hasnlink(&mp->m_sb) &&
	    ip->i_d.di_version == 1) {
		ip->i_d.di_version = 2;
		/*
		 * We've already zeroed the old link count, the projid field,
		 * and the pad field.
		 */
	}

	/*
	 * Project ids won't be stored on disk if we are using a version 1
	 * inode, so bump the version now if a project id was supplied.
	 */
	if ((prid != 0) && (ip->i_d.di_version == 1))
		xfs_bump_ino_vers2(tp, ip);

	if (pip && XFS_INHERIT_GID(pip)) {
		ip->i_d.di_gid = pip->i_d.di_gid;
		if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
			ip->i_d.di_mode |= S_ISGID;
		}
	}

	/*
	 * If the group ID of the new file does not match the effective group
	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
	 * (and only if the irix_sgid_inherit compatibility variable is set).
	 */
	if ((irix_sgid_inherit) &&
	    (ip->i_d.di_mode & S_ISGID) &&
	    (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) {
		ip->i_d.di_mode &= ~S_ISGID;
	}

	ip->i_d.di_size = 0;
	ip->i_d.di_nextents = 0;
	ASSERT(ip->i_d.di_nblocks == 0);

	nanotime(&tv);
	ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
	ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
	ip->i_d.di_atime = ip->i_d.di_mtime;
	ip->i_d.di_ctime = ip->i_d.di_mtime;

	/*
	 * di_gen will have been taken care of in xfs_iread.
	 */
	ip->i_d.di_extsize = 0;
	ip->i_d.di_dmevmask = 0;
	ip->i_d.di_dmstate = 0;
	ip->i_d.di_flags = 0;

	if (ip->i_d.di_version == 3) {
		ASSERT(ip->i_d.di_ino == ino);
		ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid));
		ip->i_d.di_crc = 0;
		ip->i_d.di_changecount = 1;
		ip->i_d.di_lsn = 0;
		ip->i_d.di_flags2 = 0;
		memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2));
		ip->i_d.di_crtime = ip->i_d.di_mtime;
	}


	flags = XFS_ILOG_CORE;
	switch (mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		ip->i_d.di_format = XFS_DINODE_FMT_DEV;
		ip->i_df.if_u2.if_rdev = rdev;
		ip->i_df.if_flags = 0;
		flags |= XFS_ILOG_DEV;
		break;
	case S_IFREG:
		/*
		 * we can't set up filestreams until after the VFS inode
		 * is set up properly.
		 */
		if (pip && xfs_inode_is_filestream(pip))
			filestreams = 1;
		/* fall through */
	case S_IFDIR:
		if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
			uint	di_flags = 0;

			if (S_ISDIR(mode)) {
				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
					di_flags |= XFS_DIFLAG_RTINHERIT;
				if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
					di_flags |= XFS_DIFLAG_EXTSZINHERIT;
					ip->i_d.di_extsize = pip->i_d.di_extsize;
				}
			} else if (S_ISREG(mode)) {
				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
					di_flags |= XFS_DIFLAG_REALTIME;
				if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
					di_flags |= XFS_DIFLAG_EXTSIZE;
					ip->i_d.di_extsize = pip->i_d.di_extsize;
				}
			}
			if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
			    xfs_inherit_noatime)
				di_flags |= XFS_DIFLAG_NOATIME;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
			    xfs_inherit_nodump)
				di_flags |= XFS_DIFLAG_NODUMP;
			if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
			    xfs_inherit_sync)
				di_flags |= XFS_DIFLAG_SYNC;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
			    xfs_inherit_nosymlinks)
				di_flags |= XFS_DIFLAG_NOSYMLINKS;
			if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
				di_flags |= XFS_DIFLAG_PROJINHERIT;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
			    xfs_inherit_nodefrag)
				di_flags |= XFS_DIFLAG_NODEFRAG;
			if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
				di_flags |= XFS_DIFLAG_FILESTREAM;
			ip->i_d.di_flags |= di_flags;
		}
		/* FALLTHROUGH */
	case S_IFLNK:
		ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
		ip->i_df.if_flags = XFS_IFEXTENTS;
		ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
		ip->i_df.if_u1.if_extents = NULL;
		break;
	default:
		ASSERT(0);
	}
	/*
	 * Attribute fork settings for new inode.
	 */
	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
	ip->i_d.di_anextents = 0;

	/*
	 * Log the new values stuffed into the inode.
	 */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, flags);

	/* now that we have an i_mode we can setup inode ops and unlock */
	xfs_setup_inode(ip);

	/* now we have set up the vfs inode we can associate the filestream */
	if (filestreams) {
		error = xfs_filestream_associate(pip, ip);
		if (error < 0)
			return -error;
		if (!error)
			xfs_iflags_set(ip, XFS_IFILESTREAM);
	}

	*ipp = ip;
	return 0;
}
/*
 * Allocates a new inode from disk and returns a pointer to the
 * incore copy.  This routine will internally commit the current
 * transaction and allocate a new one if the Space Manager needed
 * to do an allocation to replenish the inode free-list.
 *
 * This routine is designed to be called from xfs_create and
 * xfs_create_dir.
 */
int
xfs_dir_ialloc(
	xfs_trans_t	**tpp,		/* input: current transaction;
					   output: may be a new transaction. */
	xfs_inode_t	*dp,		/* directory within which to allocate
					   the inode. */
	umode_t		mode,
	xfs_nlink_t	nlink,
	xfs_dev_t	rdev,
	prid_t		prid,		/* project id */
	int		okalloc,	/* ok to allocate new space */
	xfs_inode_t	**ipp,		/* pointer to inode; it will be
					   locked. */
	int		*committed)

{
	xfs_trans_t	*tp;
	xfs_trans_t	*ntp;
	xfs_inode_t	*ip;
	xfs_buf_t	*ialloc_context = NULL;
	int		code;
	void		*dqinfo;
	uint		tflags;

	tp = *tpp;
	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);

	/*
	 * xfs_ialloc will return a pointer to an incore inode if
	 * the Space Manager has an available inode on the free
	 * list. Otherwise, it will do an allocation and replenish
	 * the freelist.  Since we can only do one allocation per
	 * transaction without deadlocks, we will need to commit the
	 * current transaction and start a new one.  We will then
	 * need to call xfs_ialloc again to get the inode.
	 *
	 * If xfs_ialloc did an allocation to replenish the freelist,
	 * it returns the bp containing the head of the freelist as
	 * ialloc_context. We will hold a lock on it across the
	 * transaction commit so that no other process can steal
	 * the inode(s) that we've just allocated.
	 */
	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
			  &ialloc_context, &ip);

	/*
	 * Return an error if we were unable to allocate a new inode.
	 * This should only happen if we run out of space on disk or
	 * encounter a disk error.
	 */
	if (code) {
		*ipp = NULL;
		return code;
	}
	if (!ialloc_context && !ip) {
		*ipp = NULL;
		return XFS_ERROR(ENOSPC);
	}

	/*
	 * If the AGI buffer is non-NULL, then we were unable to get an
	 * inode in one operation.  We need to commit the current
	 * transaction and call xfs_ialloc() again.  It is guaranteed
	 * to succeed the second time.
	 */
	if (ialloc_context) {
		struct xfs_trans_res tres;

		/*
		 * Normally, xfs_trans_commit releases all the locks.
		 * We call bhold to hang on to the ialloc_context across
		 * the commit.  Holding this buffer prevents any other
		 * processes from doing any allocations in this
		 * allocation group.
		 */
		xfs_trans_bhold(tp, ialloc_context);
		/*
		 * Save the log reservation so we can use
		 * it in the next transaction.
		 */
		tres.tr_logres = xfs_trans_get_log_res(tp);
		tres.tr_logcount = xfs_trans_get_log_count(tp);

		/*
		 * We want the quota changes to be associated with the next
		 * transaction, NOT this one. So, detach the dqinfo from this
		 * and attach it to the next transaction.
		 */
		dqinfo = NULL;
		tflags = 0;
		if (tp->t_dqinfo) {
			dqinfo = (void *)tp->t_dqinfo;
			tp->t_dqinfo = NULL;
			tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
			tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
		}

		ntp = xfs_trans_dup(tp);
		code = xfs_trans_commit(tp, 0);
		tp = ntp;
		if (committed != NULL) {
			*committed = 1;
		}
		/*
		 * If we get an error during the commit processing,
		 * release the buffer that is still held and return
		 * to the caller.
		 */
		if (code) {
			xfs_buf_relse(ialloc_context);
			if (dqinfo) {
				tp->t_dqinfo = dqinfo;
				xfs_trans_free_dqinfo(tp);
			}
			*tpp = ntp;
			*ipp = NULL;
			return code;
		}

		/*
		 * transaction commit worked ok so we can drop the extra ticket
		 * reference that we gained in xfs_trans_dup()
		 */
		xfs_log_ticket_put(tp->t_ticket);
		tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
		code = xfs_trans_reserve(tp, &tres, 0, 0);

		/*
		 * Re-attach the quota info that we detached from prev trx.
		 */
		if (dqinfo) {
			tp->t_dqinfo = dqinfo;
			tp->t_flags |= tflags;
		}

		if (code) {
			xfs_buf_relse(ialloc_context);
			*tpp = ntp;
			*ipp = NULL;
			return code;
		}
		xfs_trans_bjoin(tp, ialloc_context);

		/*
		 * Call ialloc again. Since we've locked out all
		 * other allocations in this allocation group,
		 * this call should always succeed.
		 */
		code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
				  okalloc, &ialloc_context, &ip);

		/*
		 * If we get an error at this point, return to the caller
		 * so that the current transaction can be aborted.
		 */
		if (code) {
			*tpp = tp;
			*ipp = NULL;
			return code;
		}
		ASSERT(!ialloc_context && ip);

	} else {
		if (committed != NULL)
			*committed = 0;
	}

	*ipp = ip;
	*tpp = tp;

	return 0;
}

/*
 * Decrement the link count on an inode & log the change.
 * If this causes the link count to go to zero, initiate the
 * logging activity required to truncate a file.
 */
int				/* error */
xfs_droplink(
	xfs_trans_t *tp,
	xfs_inode_t *ip)
{
	int	error;

	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

	ASSERT(ip->i_d.di_nlink > 0);
	ip->i_d.di_nlink--;
	drop_nlink(VFS_I(ip));
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	error = 0;
	if (ip->i_d.di_nlink == 0) {
		/*
		 * We're dropping the last link to this file.
		 * Move the on-disk inode to the AGI unlinked list.
		 * From xfs_inactive() we will pull the inode from
		 * the list and free it.
		 */
		error = xfs_iunlink(tp, ip);
	}
	return error;
}
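/*
 * Example (illustrative sketch): xfs_droplink() and xfs_bumplink() below
 * both expect a locked inode that has been joined to the transaction:
 *
 *	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 *	error = xfs_droplink(tp, ip);	(or xfs_bumplink(tp, ip))
 */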
/*
 * This gets called when the inode's version needs to be changed from 1 to 2.
 * Currently this happens when the nlink field overflows the old 16-bit value
 * or when chproj is called to change the project for the first time.
 * As a side effect the superblock version will also get rev'd
 * to contain the NLINK bit.
 */
void
xfs_bump_ino_vers2(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	ASSERT(ip->i_d.di_version == 1);

	ip->i_d.di_version = 2;
	ip->i_d.di_onlink = 0;
	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
	mp = tp->t_mountp;
	if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
		spin_lock(&mp->m_sb_lock);
		if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
			xfs_sb_version_addnlink(&mp->m_sb);
			spin_unlock(&mp->m_sb_lock);
			xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
		} else {
			spin_unlock(&mp->m_sb_lock);
		}
	}
	/* Caller must log the inode */
}

/*
 * Increment the link count on an inode & log the change.
 */
int
xfs_bumplink(
	xfs_trans_t *tp,
	xfs_inode_t *ip)
{
	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

	ASSERT(ip->i_d.di_nlink > 0);
	ip->i_d.di_nlink++;
	inc_nlink(VFS_I(ip));
	if ((ip->i_d.di_version == 1) &&
	    (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
		/*
		 * The inode has increased its number of links beyond
		 * what can fit in an old format inode.  It now needs
		 * to be converted to a version 2 inode with a 32 bit
		 * link count.  If this is the first inode in the file
		 * system to do this, then we need to bump the superblock
		 * version number as well.
		 */
		xfs_bump_ino_vers2(tp, ip);
	}

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	return 0;
}

int
xfs_create(
	xfs_inode_t		*dp,
	struct xfs_name		*name,
	umode_t			mode,
	xfs_dev_t		rdev,
	xfs_inode_t		**ipp)
{
	int			is_dir = S_ISDIR(mode);
	struct xfs_mount	*mp = dp->i_mount;
	struct xfs_inode	*ip = NULL;
	struct xfs_trans	*tp = NULL;
	int			error;
	xfs_bmap_free_t		free_list;
	xfs_fsblock_t		first_block;
	bool			unlock_dp_on_error = false;
	uint			cancel_flags;
	int			committed;
	prid_t			prid;
	struct xfs_dquot	*udqp = NULL;
	struct xfs_dquot	*gdqp = NULL;
	struct xfs_dquot	*pdqp = NULL;
	struct xfs_trans_res	tres;
	uint			resblks;

	trace_xfs_create(dp, name);

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
		prid = xfs_get_projid(dp);
	else
		prid = XFS_PROJID_DEFAULT;

	/*
	 * Make sure that we have allocated dquot(s) on disk.
	 */
	error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
				   xfs_kgid_to_gid(current_fsgid()), prid,
				   XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
				   &udqp, &gdqp, &pdqp);
	if (error)
		return error;

	if (is_dir) {
		rdev = 0;
		resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
		tres.tr_logres = M_RES(mp)->tr_mkdir.tr_logres;
		tres.tr_logcount = XFS_MKDIR_LOG_COUNT;
		tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
	} else {
		resblks = XFS_CREATE_SPACE_RES(mp, name->len);
		tres.tr_logres = M_RES(mp)->tr_create.tr_logres;
		tres.tr_logcount = XFS_CREATE_LOG_COUNT;
		tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
	}

	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;

	/*
	 * Initially assume that the file does not exist and
	 * reserve the resources for that case.  If that is not
	 * the case we'll drop the one we have and get a more
	 * appropriate transaction later.
	 */
	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
	error = xfs_trans_reserve(tp, &tres, resblks, 0);
	if (error == ENOSPC) {
		/* flush outstanding delalloc blocks and retry */
		xfs_flush_inodes(mp);
		error = xfs_trans_reserve(tp, &tres, resblks, 0);
	}
	if (error == ENOSPC) {
		/* No space at all so try a "no-allocation" reservation */
		resblks = 0;
		error = xfs_trans_reserve(tp, &tres, 0, 0);
	}
	if (error) {
		cancel_flags = 0;
		goto out_trans_cancel;
	}

	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
	unlock_dp_on_error = true;

	xfs_bmap_init(&free_list, &first_block);

	/*
	 * Reserve disk quota and the inode.
	 */
	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
						pdqp, resblks, 1, 0);
	if (error)
		goto out_trans_cancel;

	error = xfs_dir_canenter(tp, dp, name, resblks);
	if (error)
		goto out_trans_cancel;

	/*
	 * A newly created regular or special file just has one directory
	 * entry pointing to it, but a directory also has the "." entry
	 * pointing to itself.
	 */
	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
			       prid, resblks > 0, &ip, &committed);
	if (error) {
		if (error == ENOSPC)
			goto out_trans_cancel;
		goto out_trans_abort;
	}

	/*
	 * Now we join the directory inode to the transaction.  We do not do it
	 * earlier because xfs_dir_ialloc might commit the previous transaction
	 * (and release all the locks).  An error from here on will result in
	 * the transaction cancel unlocking dp so don't do it explicitly in the
	 * error path.
	 */
	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
	unlock_dp_on_error = false;

	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
					&first_block, &free_list, resblks ?
					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
	if (error) {
		ASSERT(error != ENOSPC);
		goto out_trans_abort;
	}
	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

	if (is_dir) {
		error = xfs_dir_init(tp, ip, dp);
		if (error)
			goto out_bmap_cancel;

		error = xfs_bumplink(tp, dp);
		if (error)
			goto out_bmap_cancel;
	}

	/*
	 * If this is a synchronous mount, make sure that the
	 * create transaction goes to disk before returning to
	 * the user.
	 */
	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
		xfs_trans_set_sync(tp);

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * These ids of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);

	error = xfs_bmap_finish(&tp, &free_list, &committed);
	if (error)
		goto out_bmap_cancel;

	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
	if (error)
		goto out_release_inode;

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	*ipp = ip;
	return 0;

 out_bmap_cancel:
	xfs_bmap_cancel(&free_list);
 out_trans_abort:
	cancel_flags |= XFS_TRANS_ABORT;
 out_trans_cancel:
	xfs_trans_cancel(tp, cancel_flags);
 out_release_inode:
	/*
	 * Wait until after the current transaction is aborted to
	 * release the inode.  This prevents recursive transactions
	 * and deadlocks from xfs_inactive.
	 */
	if (ip)
		IRELE(ip);

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	if (unlock_dp_on_error)
		xfs_iunlock(dp, XFS_ILOCK_EXCL);
	return error;
}

int
xfs_link(
	xfs_inode_t		*tdp,
	xfs_inode_t		*sip,
	struct xfs_name		*target_name)
{
	xfs_mount_t		*mp = tdp->i_mount;
	xfs_trans_t		*tp;
	int			error;
	xfs_bmap_free_t		free_list;
	xfs_fsblock_t		first_block;
	int			cancel_flags;
	int			committed;
	int			resblks;

	trace_xfs_link(tdp, target_name);

	ASSERT(!S_ISDIR(sip->i_d.di_mode));

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	error = xfs_qm_dqattach(sip, 0);
	if (error)
		goto std_return;

	error = xfs_qm_dqattach(tdp, 0);
	if (error)
		goto std_return;

	tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
	if (error == ENOSPC) {
		resblks = 0;
		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
	}
	if (error) {
		cancel_flags = 0;
		goto error_return;
	}

	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);

	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);

	/*
	 * If we are using project inheritance, we only allow hard link
	 * creation in our tree when the project IDs are the same; else
	 * the tree quota mechanism could be circumvented.
	 */
	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
		     (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
		error = XFS_ERROR(EXDEV);
		goto error_return;
	}

	error = xfs_dir_canenter(tp, tdp, target_name, resblks);
	if (error)
		goto error_return;

	xfs_bmap_init(&free_list, &first_block);

	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
					&first_block, &free_list, resblks);
	if (error)
		goto abort_return;
	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);

	error = xfs_bumplink(tp, sip);
	if (error)
		goto abort_return;

	/*
	 * If this is a synchronous mount, make sure that the
	 * link transaction goes to disk before returning to
	 * the user.
	 */
	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
		xfs_trans_set_sync(tp);
	}

	error = xfs_bmap_finish(&tp, &free_list, &committed);
	if (error) {
		xfs_bmap_cancel(&free_list);
		goto abort_return;
	}

	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);

 abort_return:
	cancel_flags |= XFS_TRANS_ABORT;
 error_return:
	xfs_trans_cancel(tp, cancel_flags);
 std_return:
	return error;
}
/*
 * Free up the underlying blocks past new_size.  The new size must be smaller
 * than the current size.  This routine can be used both for the attribute and
 * data fork, and does not modify the inode size, which is left to the caller.
 *
 * The transaction passed to this routine must have made a permanent log
 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
 * given transaction and start new ones, so make sure everything involved in
 * the transaction is tidy before calling here.  Some transaction will be
 * returned to the caller to be committed.  The incoming transaction must
 * already include the inode, and both inode locks must be held exclusively.
 * The inode must also be "held" within the transaction.  On return the inode
 * will be "held" within the returned transaction.  This routine does NOT
 * require any disk space to be reserved for it within the transaction.
 *
 * If we get an error, we must return with the inode locked and linked into the
 * current transaction. This keeps things simple for the higher level code,
 * because it always knows that the inode is locked and held in the transaction
 * that returns to it whether errors occur or not.  We don't mark the inode
 * dirty on error so that transactions can be easily aborted if possible.
 */
int
xfs_itruncate_extents(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_fsize_t		new_size)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp = *tpp;
	struct xfs_trans	*ntp;
	xfs_bmap_free_t		free_list;
	xfs_fsblock_t		first_block;
	xfs_fileoff_t		first_unmap_block;
	xfs_fileoff_t		last_block;
	xfs_filblks_t		unmap_len;
	int			committed;
	int			error = 0;
	int			done = 0;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
	       xfs_isilocked(ip, XFS_IOLOCK_EXCL));
	ASSERT(new_size <= XFS_ISIZE(ip));
	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
	ASSERT(ip->i_itemp != NULL);
	ASSERT(ip->i_itemp->ili_lock_flags == 0);
	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));

	trace_xfs_itruncate_extents_start(ip, new_size);

	/*
	 * Since it is possible for space to become allocated beyond
	 * the end of the file (in a crash where the space is allocated
	 * but the inode size is not yet updated), simply remove any
	 * blocks which show up between the new EOF and the maximum
	 * possible file size.  If the first block to be removed is
	 * beyond the maximum file size (ie it is the same as last_block),
	 * then there is nothing to do.
	 */
	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
	last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
	if (first_unmap_block == last_block)
		return 0;

	ASSERT(first_unmap_block < last_block);
	unmap_len = last_block - first_unmap_block + 1;
	while (!done) {
		xfs_bmap_init(&free_list, &first_block);
		error = xfs_bunmapi(tp, ip,
				    first_unmap_block, unmap_len,
				    xfs_bmapi_aflag(whichfork),
				    XFS_ITRUNC_MAX_EXTENTS,
				    &first_block, &free_list,
				    &done);
		if (error)
			goto out_bmap_cancel;

		/*
		 * Duplicate the transaction that has the permanent
		 * reservation and commit the old transaction.
		 */
		error = xfs_bmap_finish(&tp, &free_list, &committed);
		if (committed)
			xfs_trans_ijoin(tp, ip, 0);
		if (error)
			goto out_bmap_cancel;

		if (committed) {
			/*
			 * Mark the inode dirty so it will be logged and
			 * moved forward in the log as part of every commit.
			 */
			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		}

		ntp = xfs_trans_dup(tp);
		error = xfs_trans_commit(tp, 0);
		tp = ntp;

		xfs_trans_ijoin(tp, ip, 0);

		if (error)
			goto out;

		/*
		 * Transaction commit worked ok so we can drop the extra ticket
		 * reference that we gained in xfs_trans_dup()
		 */
		xfs_log_ticket_put(tp->t_ticket);
		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
		if (error)
			goto out;
	}

	/*
	 * Always re-log the inode so that our permanent transaction can keep
	 * on rolling it forward in the log.
	 */
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	trace_xfs_itruncate_extents_end(ip, new_size);

out:
	*tpp = tp;
	return error;
out_bmap_cancel:
	/*
	 * If the bunmapi call encounters an error, return to the caller where
	 * the transaction can be properly aborted.  We just need to make sure
	 * we're not holding any resources that we were not when we came in.
	 */
	xfs_bmap_cancel(&free_list);
	goto out;
}
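/*
 * Example (illustrative sketch): a caller rolls its permanent transaction
 * through this routine and commits whatever transaction comes back:
 *
 *	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, new_size);
 *	if (!error)
 *		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 *
 * On error the inode is still locked and joined to *tpp, so the caller
 * can cancel/abort cleanly.
 */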
int
xfs_release(
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp = ip->i_mount;
	int		error;

	if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
		return 0;

	/* If this is a read-only mount, don't do this (would generate I/O) */
	if (mp->m_flags & XFS_MOUNT_RDONLY)
		return 0;

	if (!XFS_FORCED_SHUTDOWN(mp)) {
		int truncated;

		/*
		 * If we are using filestreams, and we have an unlinked
		 * file that we are processing the last close on, then nothing
		 * will be able to reopen and write to this file.  Purge this
		 * inode from the filestreams cache so that it doesn't delay
		 * teardown of the inode.
		 */
		if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
			xfs_filestream_deassociate(ip);

		/*
		 * If we previously truncated this file and removed old data
		 * in the process, we want to initiate "early" writeout on
		 * the last close.  This is an attempt to combat the notorious
		 * NULL files problem which is particularly noticeable from a
		 * truncate down, buffered (re-)write (delalloc), followed by
		 * a crash.  What we are effectively doing here is
		 * significantly reducing the time window where we'd otherwise
		 * be exposed to that problem.
		 */
		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
		if (truncated) {
			xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
			if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
				error = -filemap_flush(VFS_I(ip)->i_mapping);
				if (error)
					return error;
			}
		}
	}

	if (ip->i_d.di_nlink == 0)
		return 0;

	if (xfs_can_free_eofblocks(ip, false)) {

		/*
		 * If we can't get the iolock just skip truncating the blocks
		 * past EOF because we could deadlock with the mmap_sem
		 * otherwise.  We'll get another chance to drop them once the
		 * last reference to the inode is dropped, so we'll never leak
		 * blocks permanently.
		 *
		 * Further, if the inode is being opened, written and closed
		 * frequently and we have delayed allocation blocks
		 * outstanding (e.g. streaming writes from the NFS server),
		 * truncating the blocks past EOF will cause fragmentation to
		 * occur.
		 *
		 * In this case don't do the truncation, either, but we have to
		 * be careful how we detect this case.  Blocks beyond EOF show
		 * up as i_delayed_blks even when the inode is clean, so we
		 * need to truncate them away first before checking for a dirty
		 * release.  Hence on the first dirty close we will still remove
		 * the speculative allocation, but after that we will leave it
		 * in place.
		 */
		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
			return 0;

		error = xfs_free_eofblocks(mp, ip, true);
		if (error && error != EAGAIN)
			return error;

		/* delalloc blocks after truncation means it really is dirty */
		if (ip->i_delayed_blks)
			xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
	}
	return 0;
}

/*
 * xfs_inactive
 *
 * This is called when the vnode reference count for the vnode
 * goes to zero.  If the file has been unlinked, then it must
 * now be truncated.  Also, we clear all of the read-ahead state
 * kept for the inode here since the file is now closed.
 */
int
xfs_inactive(
	xfs_inode_t	*ip)
{
	xfs_bmap_free_t		free_list;
	xfs_fsblock_t		first_block;
	int			committed;
	struct xfs_trans	*tp;
	struct xfs_mount	*mp;
	struct xfs_trans_res	*resp;
	int			error;
	int			truncate = 0;

	/*
	 * If the inode is already free, then there can be nothing
	 * to clean up here.
	 */
	if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
		ASSERT(ip->i_df.if_real_bytes == 0);
		ASSERT(ip->i_df.if_broot_bytes == 0);
		return VN_INACTIVE_CACHE;
	}

	mp = ip->i_mount;

	error = 0;

	/* If this is a read-only mount, don't do this (would generate I/O) */
	if (mp->m_flags & XFS_MOUNT_RDONLY)
		goto out;

	if (ip->i_d.di_nlink != 0) {
		/*
		 * force is true because we are evicting an inode from the
		 * cache. Post-eof blocks must be freed, lest we end up with
		 * broken free space accounting.
		 */
		if (xfs_can_free_eofblocks(ip, true)) {
			error = xfs_free_eofblocks(mp, ip, false);
			if (error)
				return VN_INACTIVE_CACHE;
		}
		goto out;
	}

	if (S_ISREG(ip->i_d.di_mode) &&
	    (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
	     ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
		truncate = 1;

	error = xfs_qm_dqattach(ip, 0);
	if (error)
		return VN_INACTIVE_CACHE;

	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
	resp = (truncate || S_ISLNK(ip->i_d.di_mode)) ?
		&M_RES(mp)->tr_itruncate : &M_RES(mp)->tr_ifree;

	error = xfs_trans_reserve(tp, resp, 0, 0);
	if (error) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		xfs_trans_cancel(tp, 0);
		return VN_INACTIVE_CACHE;
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	if (S_ISLNK(ip->i_d.di_mode)) {
		error = xfs_inactive_symlink(ip, &tp);
		if (error)
			goto out_cancel;
	} else if (truncate) {
		ip->i_d.di_size = 0;
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
		if (error)
			goto out_cancel;

		ASSERT(ip->i_d.di_nextents == 0);
	}

	/*
	 * If there are attributes associated with the file then blow them away
	 * now.  The code calls a routine that recursively deconstructs the
	 * attribute fork.  We need to just commit the current transaction
	 * because we can't use it for xfs_attr_inactive().
	 */
	if (ip->i_d.di_anextents > 0) {
		ASSERT(ip->i_d.di_forkoff != 0);

		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
		if (error)
			goto out_unlock;

		xfs_iunlock(ip, XFS_ILOCK_EXCL);

		error = xfs_attr_inactive(ip);
		if (error)
			goto out;

		tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 0, 0);
		if (error) {
			xfs_trans_cancel(tp, 0);
			goto out;
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip, 0);
	}

	if (ip->i_afp)
		xfs_idestroy_fork(ip, XFS_ATTR_FORK);

	ASSERT(ip->i_d.di_anextents == 0);

	/*
	 * Free the inode.
	 */
	xfs_bmap_init(&free_list, &first_block);
	error = xfs_ifree(tp, ip, &free_list);
	if (error) {
		/*
		 * If we fail to free the inode, shut down.  The cancel
		 * might do that, we need to make sure.  Otherwise the
		 * inode might be lost for a long time or forever.
		 */
		if (!XFS_FORCED_SHUTDOWN(mp)) {
			xfs_notice(mp, "%s: xfs_ifree returned error %d",
				__func__, error);
			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		}
		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
	} else {
		/*
		 * Credit the quota account(s). The inode is gone.
		 */
		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);

		/*
		 * Just ignore errors at this point.  There is nothing we can
		 * do except to try to keep going. Make sure it's not a silent
		 * error.
		 */
		error = xfs_bmap_finish(&tp, &free_list, &committed);
		if (error)
			xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
				__func__, error);
		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
		if (error)
			xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
				__func__, error);
	}

	/*
	 * Release the dquots held by inode, if any.
	 */
	xfs_qm_dqdetach(ip);
out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	return VN_INACTIVE_CACHE;
out_cancel:
	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
	goto out_unlock;
}
/*
 * This is called when the inode's link count goes to 0.
 * We place the on-disk inode on a list in the AGI.  It
 * will be pulled from this list when the inode is freed.
 */
int
xfs_iunlink(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp;
	xfs_agi_t	*agi;
	xfs_dinode_t	*dip;
	xfs_buf_t	*agibp;
	xfs_buf_t	*ibp;
	xfs_agino_t	agino;
	short		bucket_index;
	int		offset;
	int		error;

	ASSERT(ip->i_d.di_nlink == 0);
	ASSERT(ip->i_d.di_mode != 0);

	mp = tp->t_mountp;

	/*
	 * Get the agi buffer first.  It ensures lock ordering
	 * on the list.
	 */
	error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
	if (error)
		return error;
	agi = XFS_BUF_TO_AGI(agibp);

	/*
	 * Get the index into the agi hash table for the
	 * list this inode will go on.
	 */
	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
	ASSERT(agino != 0);
	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
	ASSERT(agi->agi_unlinked[bucket_index]);
	ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);

	if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
		/*
		 * There is already another inode in the bucket we need
		 * to add ourselves to.  Add us at the front of the list.
		 * Here we put the head pointer into our next pointer,
		 * and then we fall through to point the head at us.
		 */
		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
				       0, 0);
		if (error)
			return error;

		ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
		offset = ip->i_imap.im_boffset +
			offsetof(xfs_dinode_t, di_next_unlinked);

		/* need to recalc the inode CRC if appropriate */
		xfs_dinode_calc_crc(mp, dip);

		xfs_trans_inode_buf(tp, ibp);
		xfs_trans_log_buf(tp, ibp, offset,
				  (offset + sizeof(xfs_agino_t) - 1));
		xfs_inobp_check(mp, ibp);
	}

	/*
	 * Point the bucket head pointer at the inode being inserted.
	 */
	ASSERT(agino != 0);
	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
	offset = offsetof(xfs_agi_t, agi_unlinked) +
		(sizeof(xfs_agino_t) * bucket_index);
	xfs_trans_log_buf(tp, agibp, offset,
			  (offset + sizeof(xfs_agino_t) - 1));
	return 0;
}
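/*
 * Example (illustrative): XFS_AGI_UNLINKED_BUCKETS is 64, so an inode with
 * agino 130 hashes to bucket 130 % 64 == 2 and is spliced in at the head
 * of that bucket's singly linked list of unlinked on-disk inodes.
 */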
1977 */ 1978 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 1979 0, 0); 1980 if (error) { 1981 xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", 1982 __func__, error); 1983 return error; 1984 } 1985 next_agino = be32_to_cpu(dip->di_next_unlinked); 1986 ASSERT(next_agino != 0); 1987 if (next_agino != NULLAGINO) { 1988 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 1989 offset = ip->i_imap.im_boffset + 1990 offsetof(xfs_dinode_t, di_next_unlinked); 1991 1992 /* need to recalc the inode CRC if appropriate */ 1993 xfs_dinode_calc_crc(mp, dip); 1994 1995 xfs_trans_inode_buf(tp, ibp); 1996 xfs_trans_log_buf(tp, ibp, offset, 1997 (offset + sizeof(xfs_agino_t) - 1)); 1998 xfs_inobp_check(mp, ibp); 1999 } else { 2000 xfs_trans_brelse(tp, ibp); 2001 } 2002 /* 2003 * Point the bucket head pointer at the next inode. 2004 */ 2005 ASSERT(next_agino != 0); 2006 ASSERT(next_agino != agino); 2007 agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); 2008 offset = offsetof(xfs_agi_t, agi_unlinked) + 2009 (sizeof(xfs_agino_t) * bucket_index); 2010 xfs_trans_log_buf(tp, agibp, offset, 2011 (offset + sizeof(xfs_agino_t) - 1)); 2012 } else { 2013 /* 2014 * We need to search the list for the inode being freed. 2015 */ 2016 next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2017 last_ibp = NULL; 2018 while (next_agino != agino) { 2019 struct xfs_imap imap; 2020 2021 if (last_ibp) 2022 xfs_trans_brelse(tp, last_ibp); 2023 2024 imap.im_blkno = 0; 2025 next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); 2026 2027 error = xfs_imap(mp, tp, next_ino, &imap, 0); 2028 if (error) { 2029 xfs_warn(mp, 2030 "%s: xfs_imap returned error %d.", 2031 __func__, error); 2032 return error; 2033 } 2034 2035 error = xfs_imap_to_bp(mp, tp, &imap, &last_dip, 2036 &last_ibp, 0, 0); 2037 if (error) { 2038 xfs_warn(mp, 2039 "%s: xfs_imap_to_bp returned error %d.", 2040 __func__, error); 2041 return error; 2042 } 2043 2044 last_offset = imap.im_boffset; 2045 next_agino = be32_to_cpu(last_dip->di_next_unlinked); 2046 ASSERT(next_agino != NULLAGINO); 2047 ASSERT(next_agino != 0); 2048 } 2049 2050 /* 2051 * Now last_ibp points to the buffer previous to us on the 2052 * unlinked list. Pull us from the list. 2053 */ 2054 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 2055 0, 0); 2056 if (error) { 2057 xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.", 2058 __func__, error); 2059 return error; 2060 } 2061 next_agino = be32_to_cpu(dip->di_next_unlinked); 2062 ASSERT(next_agino != 0); 2063 ASSERT(next_agino != agino); 2064 if (next_agino != NULLAGINO) { 2065 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 2066 offset = ip->i_imap.im_boffset + 2067 offsetof(xfs_dinode_t, di_next_unlinked); 2068 2069 /* need to recalc the inode CRC if appropriate */ 2070 xfs_dinode_calc_crc(mp, dip); 2071 2072 xfs_trans_inode_buf(tp, ibp); 2073 xfs_trans_log_buf(tp, ibp, offset, 2074 (offset + sizeof(xfs_agino_t) - 1)); 2075 xfs_inobp_check(mp, ibp); 2076 } else { 2077 xfs_trans_brelse(tp, ibp); 2078 } 2079 /* 2080 * Point the previous inode on the list to the next inode. 
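		 * This is the classic singly-linked-list unlink step,
		 * prev->next = cur->next, except that both "pointers" are
		 * di_next_unlinked fields in on-disk inodes and every
		 * modification must be logged through the transaction.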
		 */
		last_dip->di_next_unlinked = cpu_to_be32(next_agino);
		ASSERT(next_agino != 0);
		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);

		/* need to recalc the inode CRC if appropriate */
		xfs_dinode_calc_crc(mp, last_dip);

		xfs_trans_inode_buf(tp, last_ibp);
		xfs_trans_log_buf(tp, last_ibp, offset,
				  (offset + sizeof(xfs_agino_t) - 1));
		xfs_inobp_check(mp, last_ibp);
	}
	return 0;
}

/*
 * A big issue when freeing the inode cluster is that we _cannot_ skip any
 * inodes that are in memory - they all must be marked stale and attached to
 * the cluster buffer.
 */
STATIC int
xfs_ifree_cluster(
	xfs_inode_t	*free_ip,
	xfs_trans_t	*tp,
	xfs_ino_t	inum)
{
	xfs_mount_t		*mp = free_ip->i_mount;
	int			blks_per_cluster;
	int			nbufs;
	int			ninodes;
	int			i, j;
	xfs_daddr_t		blkno;
	xfs_buf_t		*bp;
	xfs_inode_t		*ip;
	xfs_inode_log_item_t	*iip;
	xfs_log_item_t		*lip;
	struct xfs_perag	*pag;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
	if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
		blks_per_cluster = 1;
		ninodes = mp->m_sb.sb_inopblock;
		nbufs = XFS_IALLOC_BLOCKS(mp);
	} else {
		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
				   mp->m_sb.sb_blocksize;
		ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
		nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
	}

	for (j = 0; j < nbufs; j++, inum += ninodes) {
		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
					 XFS_INO_TO_AGBNO(mp, inum));

		/*
		 * We obtain and lock the backing buffer first in the process
		 * here, as we have to ensure that any dirty inode that we
		 * can't get the flush lock on is attached to the buffer.
		 * If we scan the in-memory inodes first, then buffer IO can
		 * complete before we get a lock on it, and hence we may fail
		 * to mark all the active inodes on the buffer stale.
		 */
		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
					mp->m_bsize * blks_per_cluster,
					XBF_UNMAPPED);

		if (!bp)
			return ENOMEM;

		/*
		 * This buffer may not have been correctly initialised as we
		 * didn't read it from disk.  That's not important because we
		 * are only using it to mark the buffer as stale in the log,
		 * and to attach stale cached inodes to it.  That means it
		 * will never be dispatched for IO.  If it is, we want to know
		 * about it, and we want it to fail.  We can achieve this by
		 * adding a write verifier to the buffer.
		 */
		bp->b_ops = &xfs_inode_buf_ops;

		/*
		 * Walk the inodes already attached to the buffer and mark them
		 * stale.  These will all have the flush locks held, so an
		 * in-memory inode walk can't lock them.  By marking them all
		 * stale first, we will not attempt to lock them in the loop
		 * below as the XFS_ISTALE flag will be set.
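		 *
		 * The log items hang off bp->b_fspriv and are chained
		 * through li_bio_list, so the walk below amounts to this
		 * sketch (mark_stale_if_inode_item() is a hypothetical
		 * name for the loop body):
		 *
		 *	for (lip = bp->b_fspriv; lip; lip = lip->li_bio_list)
		 *		mark_stale_if_inode_item(lip);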
2168 */ 2169 lip = bp->b_fspriv; 2170 while (lip) { 2171 if (lip->li_type == XFS_LI_INODE) { 2172 iip = (xfs_inode_log_item_t *)lip; 2173 ASSERT(iip->ili_logged == 1); 2174 lip->li_cb = xfs_istale_done; 2175 xfs_trans_ail_copy_lsn(mp->m_ail, 2176 &iip->ili_flush_lsn, 2177 &iip->ili_item.li_lsn); 2178 xfs_iflags_set(iip->ili_inode, XFS_ISTALE); 2179 } 2180 lip = lip->li_bio_list; 2181 } 2182 2183 2184 /* 2185 * For each inode in memory attempt to add it to the inode 2186 * buffer and set it up for being staled on buffer IO 2187 * completion. This is safe as we've locked out tail pushing 2188 * and flushing by locking the buffer. 2189 * 2190 * We have already marked every inode that was part of a 2191 * transaction stale above, which means there is no point in 2192 * even trying to lock them. 2193 */ 2194 for (i = 0; i < ninodes; i++) { 2195 retry: 2196 rcu_read_lock(); 2197 ip = radix_tree_lookup(&pag->pag_ici_root, 2198 XFS_INO_TO_AGINO(mp, (inum + i))); 2199 2200 /* Inode not in memory, nothing to do */ 2201 if (!ip) { 2202 rcu_read_unlock(); 2203 continue; 2204 } 2205 2206 /* 2207 * because this is an RCU protected lookup, we could 2208 * find a recently freed or even reallocated inode 2209 * during the lookup. We need to check under the 2210 * i_flags_lock for a valid inode here. Skip it if it 2211 * is not valid, the wrong inode or stale. 2212 */ 2213 spin_lock(&ip->i_flags_lock); 2214 if (ip->i_ino != inum + i || 2215 __xfs_iflags_test(ip, XFS_ISTALE)) { 2216 spin_unlock(&ip->i_flags_lock); 2217 rcu_read_unlock(); 2218 continue; 2219 } 2220 spin_unlock(&ip->i_flags_lock); 2221 2222 /* 2223 * Don't try to lock/unlock the current inode, but we 2224 * _cannot_ skip the other inodes that we did not find 2225 * in the list attached to the buffer and are not 2226 * already marked stale. If we can't lock it, back off 2227 * and retry. 2228 */ 2229 if (ip != free_ip && 2230 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2231 rcu_read_unlock(); 2232 delay(1); 2233 goto retry; 2234 } 2235 rcu_read_unlock(); 2236 2237 xfs_iflock(ip); 2238 xfs_iflags_set(ip, XFS_ISTALE); 2239 2240 /* 2241 * we don't need to attach clean inodes or those only 2242 * with unlogged changes (which we throw away, anyway). 2243 */ 2244 iip = ip->i_itemp; 2245 if (!iip || xfs_inode_clean(ip)) { 2246 ASSERT(ip != free_ip); 2247 xfs_ifunlock(ip); 2248 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2249 continue; 2250 } 2251 2252 iip->ili_last_fields = iip->ili_fields; 2253 iip->ili_fields = 0; 2254 iip->ili_logged = 1; 2255 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 2256 &iip->ili_item.li_lsn); 2257 2258 xfs_buf_attach_iodone(bp, xfs_istale_done, 2259 &iip->ili_item); 2260 2261 if (ip != free_ip) 2262 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2263 } 2264 2265 xfs_trans_stale_inode_buf(tp, bp); 2266 xfs_trans_binval(tp, bp); 2267 } 2268 2269 xfs_perag_put(pag); 2270 return 0; 2271 } 2272 2273 /* 2274 * This is called to return an inode to the inode free list. 2275 * The inode should already be truncated to 0 length and have 2276 * no pages associated with it. This routine also assumes that 2277 * the inode is already a part of the transaction. 2278 * 2279 * The on-disk copy of the inode will have been added to the list 2280 * of unlinked inodes in the AGI. We need to remove the inode from 2281 * that list atomically with respect to freeing it here. 
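 *
 * A minimal sketch of the expected calling pattern, modelled on
 * xfs_inactive() above (error handling and reservations elided):
 *
 *	xfs_bmap_init(&free_list, &first_block);
 *	error = xfs_ifree(tp, ip, &free_list);
 *	if (!error)
 *		error = xfs_bmap_finish(&tp, &free_list, &committed);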
2282 */ 2283 int 2284 xfs_ifree( 2285 xfs_trans_t *tp, 2286 xfs_inode_t *ip, 2287 xfs_bmap_free_t *flist) 2288 { 2289 int error; 2290 int delete; 2291 xfs_ino_t first_ino; 2292 2293 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2294 ASSERT(ip->i_d.di_nlink == 0); 2295 ASSERT(ip->i_d.di_nextents == 0); 2296 ASSERT(ip->i_d.di_anextents == 0); 2297 ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode)); 2298 ASSERT(ip->i_d.di_nblocks == 0); 2299 2300 /* 2301 * Pull the on-disk inode from the AGI unlinked list. 2302 */ 2303 error = xfs_iunlink_remove(tp, ip); 2304 if (error) 2305 return error; 2306 2307 error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); 2308 if (error) 2309 return error; 2310 2311 ip->i_d.di_mode = 0; /* mark incore inode as free */ 2312 ip->i_d.di_flags = 0; 2313 ip->i_d.di_dmevmask = 0; 2314 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ 2315 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; 2316 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; 2317 /* 2318 * Bump the generation count so no one will be confused 2319 * by reincarnations of this inode. 2320 */ 2321 ip->i_d.di_gen++; 2322 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2323 2324 if (delete) 2325 error = xfs_ifree_cluster(ip, tp, first_ino); 2326 2327 return error; 2328 } 2329 2330 /* 2331 * This is called to unpin an inode. The caller must have the inode locked 2332 * in at least shared mode so that the buffer cannot be subsequently pinned 2333 * once someone is waiting for it to be unpinned. 2334 */ 2335 static void 2336 xfs_iunpin( 2337 struct xfs_inode *ip) 2338 { 2339 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2340 2341 trace_xfs_inode_unpin_nowait(ip, _RET_IP_); 2342 2343 /* Give the log a push to start the unpinning I/O */ 2344 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); 2345 2346 } 2347 2348 static void 2349 __xfs_iunpin_wait( 2350 struct xfs_inode *ip) 2351 { 2352 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); 2353 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); 2354 2355 xfs_iunpin(ip); 2356 2357 do { 2358 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); 2359 if (xfs_ipincount(ip)) 2360 io_schedule(); 2361 } while (xfs_ipincount(ip)); 2362 finish_wait(wq, &wait.wait); 2363 } 2364 2365 void 2366 xfs_iunpin_wait( 2367 struct xfs_inode *ip) 2368 { 2369 if (xfs_ipincount(ip)) 2370 __xfs_iunpin_wait(ip); 2371 } 2372 2373 int 2374 xfs_remove( 2375 xfs_inode_t *dp, 2376 struct xfs_name *name, 2377 xfs_inode_t *ip) 2378 { 2379 xfs_mount_t *mp = dp->i_mount; 2380 xfs_trans_t *tp = NULL; 2381 int is_dir = S_ISDIR(ip->i_d.di_mode); 2382 int error = 0; 2383 xfs_bmap_free_t free_list; 2384 xfs_fsblock_t first_block; 2385 int cancel_flags; 2386 int committed; 2387 int link_zero; 2388 uint resblks; 2389 uint log_count; 2390 2391 trace_xfs_remove(dp, name); 2392 2393 if (XFS_FORCED_SHUTDOWN(mp)) 2394 return XFS_ERROR(EIO); 2395 2396 error = xfs_qm_dqattach(dp, 0); 2397 if (error) 2398 goto std_return; 2399 2400 error = xfs_qm_dqattach(ip, 0); 2401 if (error) 2402 goto std_return; 2403 2404 if (is_dir) { 2405 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); 2406 log_count = XFS_DEFAULT_LOG_COUNT; 2407 } else { 2408 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); 2409 log_count = XFS_REMOVE_LOG_COUNT; 2410 } 2411 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 2412 2413 /* 2414 * We try to get the real space reservation first, 2415 * allowing for directory btree deletion(s) implying 2416 * possible bmap insert(s). 
If we can't get the space
	 * reservation then we use 0 instead, and avoid the bmap btree
	 * insert(s) in the directory code: if a bmap insert would
	 * otherwise be needed, the directory code trims the LAST block
	 * from the directory instead.
	 */
	resblks = XFS_REMOVE_SPACE_RES(mp);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
	if (error == ENOSPC) {
		resblks = 0;
		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
	}
	if (error) {
		ASSERT(error != ENOSPC);
		cancel_flags = 0;
		goto out_trans_cancel;
	}

	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);

	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	/*
	 * If we're removing a directory, perform some additional validation.
	 */
	if (is_dir) {
		ASSERT(ip->i_d.di_nlink >= 2);
		if (ip->i_d.di_nlink != 2) {
			error = XFS_ERROR(ENOTEMPTY);
			goto out_trans_cancel;
		}
		if (!xfs_dir_isempty(ip)) {
			error = XFS_ERROR(ENOTEMPTY);
			goto out_trans_cancel;
		}
	}

	xfs_bmap_init(&free_list, &first_block);
	error = xfs_dir_removename(tp, dp, name, ip->i_ino,
					&first_block, &free_list, resblks);
	if (error) {
		ASSERT(error != ENOENT);
		goto out_bmap_cancel;
	}
	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

	if (is_dir) {
		/*
		 * Drop the link from ip's "..".
		 */
		error = xfs_droplink(tp, dp);
		if (error)
			goto out_bmap_cancel;

		/*
		 * Drop the "." link from ip to self.
		 */
		error = xfs_droplink(tp, ip);
		if (error)
			goto out_bmap_cancel;
	} else {
		/*
		 * When removing a non-directory we need to log the parent
		 * inode here.  For a directory this is done implicitly
		 * by the xfs_droplink call for the ".." entry.
		 */
		xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
	}

	/*
	 * Drop the link from dp to ip.
	 */
	error = xfs_droplink(tp, ip);
	if (error)
		goto out_bmap_cancel;

	/*
	 * Determine if this is the last link while
	 * we are in the transaction.
	 */
	link_zero = (ip->i_d.di_nlink == 0);

	/*
	 * If this is a synchronous mount, make sure that the
	 * remove transaction goes to disk before returning to
	 * the user.
	 */
	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
		xfs_trans_set_sync(tp);

	error = xfs_bmap_finish(&tp, &free_list, &committed);
	if (error)
		goto out_bmap_cancel;

	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
	if (error)
		goto std_return;

	/*
	 * If we are using filestreams, kill the stream association.
	 * If the file is still open it may get a new one but that
	 * will get killed on last close in xfs_close() so we don't
	 * have to worry about that.
	 */
	if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
		xfs_filestream_deassociate(ip);

	return 0;

out_bmap_cancel:
	xfs_bmap_cancel(&free_list);
	cancel_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
	xfs_trans_cancel(tp, cancel_flags);
std_return:
	return error;
}

/*
 * Enter all inodes for a rename transaction into a sorted array.
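 *
 * Taking the locks in ascending inode number order is what prevents two
 * concurrent renames from deadlocking against each other: for example, if
 * one rename involves inodes 42 and 57 and a concurrent one sees them as
 * 57 and 42, both tasks still lock 42 before 57 once their arrays are
 * sorted.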
2537 */ 2538 STATIC void 2539 xfs_sort_for_rename( 2540 xfs_inode_t *dp1, /* in: old (source) directory inode */ 2541 xfs_inode_t *dp2, /* in: new (target) directory inode */ 2542 xfs_inode_t *ip1, /* in: inode of old entry */ 2543 xfs_inode_t *ip2, /* in: inode of new entry, if it 2544 already exists, NULL otherwise. */ 2545 xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ 2546 int *num_inodes) /* out: number of inodes in array */ 2547 { 2548 xfs_inode_t *temp; 2549 int i, j; 2550 2551 /* 2552 * i_tab contains a list of pointers to inodes. We initialize 2553 * the table here & we'll sort it. We will then use it to 2554 * order the acquisition of the inode locks. 2555 * 2556 * Note that the table may contain duplicates. e.g., dp1 == dp2. 2557 */ 2558 i_tab[0] = dp1; 2559 i_tab[1] = dp2; 2560 i_tab[2] = ip1; 2561 if (ip2) { 2562 *num_inodes = 4; 2563 i_tab[3] = ip2; 2564 } else { 2565 *num_inodes = 3; 2566 i_tab[3] = NULL; 2567 } 2568 2569 /* 2570 * Sort the elements via bubble sort. (Remember, there are at 2571 * most 4 elements to sort, so this is adequate.) 2572 */ 2573 for (i = 0; i < *num_inodes; i++) { 2574 for (j = 1; j < *num_inodes; j++) { 2575 if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { 2576 temp = i_tab[j]; 2577 i_tab[j] = i_tab[j-1]; 2578 i_tab[j-1] = temp; 2579 } 2580 } 2581 } 2582 } 2583 2584 /* 2585 * xfs_rename 2586 */ 2587 int 2588 xfs_rename( 2589 xfs_inode_t *src_dp, 2590 struct xfs_name *src_name, 2591 xfs_inode_t *src_ip, 2592 xfs_inode_t *target_dp, 2593 struct xfs_name *target_name, 2594 xfs_inode_t *target_ip) 2595 { 2596 xfs_trans_t *tp = NULL; 2597 xfs_mount_t *mp = src_dp->i_mount; 2598 int new_parent; /* moving to a new dir */ 2599 int src_is_directory; /* src_name is a directory */ 2600 int error; 2601 xfs_bmap_free_t free_list; 2602 xfs_fsblock_t first_block; 2603 int cancel_flags; 2604 int committed; 2605 xfs_inode_t *inodes[4]; 2606 int spaceres; 2607 int num_inodes; 2608 2609 trace_xfs_rename(src_dp, target_dp, src_name, target_name); 2610 2611 new_parent = (src_dp != target_dp); 2612 src_is_directory = S_ISDIR(src_ip->i_d.di_mode); 2613 2614 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, 2615 inodes, &num_inodes); 2616 2617 xfs_bmap_init(&free_list, &first_block); 2618 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); 2619 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 2620 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); 2621 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); 2622 if (error == ENOSPC) { 2623 spaceres = 0; 2624 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); 2625 } 2626 if (error) { 2627 xfs_trans_cancel(tp, 0); 2628 goto std_return; 2629 } 2630 2631 /* 2632 * Attach the dquots to the inodes 2633 */ 2634 error = xfs_qm_vop_rename_dqattach(inodes); 2635 if (error) { 2636 xfs_trans_cancel(tp, cancel_flags); 2637 goto std_return; 2638 } 2639 2640 /* 2641 * Lock all the participating inodes. Depending upon whether 2642 * the target_name exists in the target directory, and 2643 * whether the target directory is the same as the source 2644 * directory, we can lock from 2 to 4 inodes. 2645 */ 2646 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); 2647 2648 /* 2649 * Join all the inodes to the transaction. From this point on, 2650 * we can rely on either trans_commit or trans_cancel to unlock 2651 * them. 
2652 */ 2653 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); 2654 if (new_parent) 2655 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); 2656 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); 2657 if (target_ip) 2658 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); 2659 2660 /* 2661 * If we are using project inheritance, we only allow renames 2662 * into our tree when the project IDs are the same; else the 2663 * tree quota mechanism would be circumvented. 2664 */ 2665 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 2666 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { 2667 error = XFS_ERROR(EXDEV); 2668 goto error_return; 2669 } 2670 2671 /* 2672 * Set up the target. 2673 */ 2674 if (target_ip == NULL) { 2675 /* 2676 * If there's no space reservation, check the entry will 2677 * fit before actually inserting it. 2678 */ 2679 error = xfs_dir_canenter(tp, target_dp, target_name, spaceres); 2680 if (error) 2681 goto error_return; 2682 /* 2683 * If target does not exist and the rename crosses 2684 * directories, adjust the target directory link count 2685 * to account for the ".." reference from the new entry. 2686 */ 2687 error = xfs_dir_createname(tp, target_dp, target_name, 2688 src_ip->i_ino, &first_block, 2689 &free_list, spaceres); 2690 if (error == ENOSPC) 2691 goto error_return; 2692 if (error) 2693 goto abort_return; 2694 2695 xfs_trans_ichgtime(tp, target_dp, 2696 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2697 2698 if (new_parent && src_is_directory) { 2699 error = xfs_bumplink(tp, target_dp); 2700 if (error) 2701 goto abort_return; 2702 } 2703 } else { /* target_ip != NULL */ 2704 /* 2705 * If target exists and it's a directory, check that both 2706 * target and source are directories and that target can be 2707 * destroyed, or that neither is a directory. 2708 */ 2709 if (S_ISDIR(target_ip->i_d.di_mode)) { 2710 /* 2711 * Make sure target dir is empty. 2712 */ 2713 if (!(xfs_dir_isempty(target_ip)) || 2714 (target_ip->i_d.di_nlink > 2)) { 2715 error = XFS_ERROR(EEXIST); 2716 goto error_return; 2717 } 2718 } 2719 2720 /* 2721 * Link the source inode under the target name. 2722 * If the source inode is a directory and we are moving 2723 * it across directories, its ".." entry will be 2724 * inconsistent until we replace that down below. 2725 * 2726 * In case there is already an entry with the same 2727 * name at the destination directory, remove it first. 2728 */ 2729 error = xfs_dir_replace(tp, target_dp, target_name, 2730 src_ip->i_ino, 2731 &first_block, &free_list, spaceres); 2732 if (error) 2733 goto abort_return; 2734 2735 xfs_trans_ichgtime(tp, target_dp, 2736 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2737 2738 /* 2739 * Decrement the link count on the target since the target 2740 * dir no longer points to it. 2741 */ 2742 error = xfs_droplink(tp, target_ip); 2743 if (error) 2744 goto abort_return; 2745 2746 if (src_is_directory) { 2747 /* 2748 * Drop the link from the old "." entry. 2749 */ 2750 error = xfs_droplink(tp, target_ip); 2751 if (error) 2752 goto abort_return; 2753 } 2754 } /* target_ip != NULL */ 2755 2756 /* 2757 * Remove the source. 2758 */ 2759 if (new_parent && src_is_directory) { 2760 /* 2761 * Rewrite the ".." entry to point to the new 2762 * directory. 2763 */ 2764 error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, 2765 target_dp->i_ino, 2766 &first_block, &free_list, spaceres); 2767 ASSERT(error != EEXIST); 2768 if (error) 2769 goto abort_return; 2770 } 2771 2772 /* 2773 * We always want to hit the ctime on the source inode. 
2774 * 2775 * This isn't strictly required by the standards since the source 2776 * inode isn't really being changed, but old unix file systems did 2777 * it and some incremental backup programs won't work without it. 2778 */ 2779 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); 2780 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); 2781 2782 /* 2783 * Adjust the link count on src_dp. This is necessary when 2784 * renaming a directory, either within one parent when 2785 * the target existed, or across two parent directories. 2786 */ 2787 if (src_is_directory && (new_parent || target_ip != NULL)) { 2788 2789 /* 2790 * Decrement link count on src_directory since the 2791 * entry that's moved no longer points to it. 2792 */ 2793 error = xfs_droplink(tp, src_dp); 2794 if (error) 2795 goto abort_return; 2796 } 2797 2798 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, 2799 &first_block, &free_list, spaceres); 2800 if (error) 2801 goto abort_return; 2802 2803 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2804 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 2805 if (new_parent) 2806 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 2807 2808 /* 2809 * If this is a synchronous mount, make sure that the 2810 * rename transaction goes to disk before returning to 2811 * the user. 2812 */ 2813 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { 2814 xfs_trans_set_sync(tp); 2815 } 2816 2817 error = xfs_bmap_finish(&tp, &free_list, &committed); 2818 if (error) { 2819 xfs_bmap_cancel(&free_list); 2820 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | 2821 XFS_TRANS_ABORT)); 2822 goto std_return; 2823 } 2824 2825 /* 2826 * trans_commit will unlock src_ip, target_ip & decrement 2827 * the vnode references. 2828 */ 2829 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 2830 2831 abort_return: 2832 cancel_flags |= XFS_TRANS_ABORT; 2833 error_return: 2834 xfs_bmap_cancel(&free_list); 2835 xfs_trans_cancel(tp, cancel_flags); 2836 std_return: 2837 return error; 2838 } 2839 2840 STATIC int 2841 xfs_iflush_cluster( 2842 xfs_inode_t *ip, 2843 xfs_buf_t *bp) 2844 { 2845 xfs_mount_t *mp = ip->i_mount; 2846 struct xfs_perag *pag; 2847 unsigned long first_index, mask; 2848 unsigned long inodes_per_cluster; 2849 int ilist_size; 2850 xfs_inode_t **ilist; 2851 xfs_inode_t *iq; 2852 int nr_found; 2853 int clcount = 0; 2854 int bufwasdelwri; 2855 int i; 2856 2857 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2858 2859 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; 2860 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2861 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); 2862 if (!ilist) 2863 goto out_put; 2864 2865 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2866 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2867 rcu_read_lock(); 2868 /* really need a gang lookup range call here */ 2869 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, 2870 first_index, inodes_per_cluster); 2871 if (nr_found == 0) 2872 goto out_free; 2873 2874 for (i = 0; i < nr_found; i++) { 2875 iq = ilist[i]; 2876 if (iq == ip) 2877 continue; 2878 2879 /* 2880 * because this is an RCU protected lookup, we could find a 2881 * recently freed or even reallocated inode during the lookup. 2882 * We need to check under the i_flags_lock for a valid inode 2883 * here. Skip it if it is not valid or the wrong inode. 
		 */
		spin_lock(&iq->i_flags_lock);
		if (!iq->i_ino ||
		    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
			spin_unlock(&iq->i_flags_lock);
			continue;
		}
		spin_unlock(&iq->i_flags_lock);

		/*
		 * Do an un-protected check to see if the inode is dirty and
		 * is a candidate for flushing.  These checks will be repeated
		 * later after the appropriate locks are acquired.
		 */
		if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
			continue;

		/*
		 * Try to get locks.  If any are unavailable or it is pinned,
		 * then this inode cannot be flushed and is skipped.
		 */

		if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
			continue;
		if (!xfs_iflock_nowait(iq)) {
			xfs_iunlock(iq, XFS_ILOCK_SHARED);
			continue;
		}
		if (xfs_ipincount(iq)) {
			xfs_ifunlock(iq);
			xfs_iunlock(iq, XFS_ILOCK_SHARED);
			continue;
		}

		/*
		 * Arriving here means that this inode can be flushed.  First
		 * re-check that it's dirty before flushing.
		 */
		if (!xfs_inode_clean(iq)) {
			int	error;

			error = xfs_iflush_int(iq, bp);
			if (error) {
				xfs_iunlock(iq, XFS_ILOCK_SHARED);
				goto cluster_corrupt_out;
			}
			clcount++;
		} else {
			xfs_ifunlock(iq);
		}
		xfs_iunlock(iq, XFS_ILOCK_SHARED);
	}

	if (clcount) {
		XFS_STATS_INC(xs_icluster_flushcnt);
		XFS_STATS_ADD(xs_icluster_flushinode, clcount);
	}

out_free:
	rcu_read_unlock();
	kmem_free(ilist);
out_put:
	xfs_perag_put(pag);
	return 0;


cluster_corrupt_out:
	/*
	 * Corruption detected in the clustering loop.  Invalidate the
	 * inode buffer and shut down the filesystem.
	 */
	rcu_read_unlock();
	/*
	 * Clean up the buffer.  If it was delwri, just release it --
	 * brelse can handle it with no problems.  If not, shut down the
	 * filesystem before releasing the buffer.
	 */
	bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
	if (bufwasdelwri)
		xfs_buf_relse(bp);

	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);

	if (!bufwasdelwri) {
		/*
		 * Just like incore_relse: if we have b_iodone functions,
		 * mark the buffer as an error and call them.  Otherwise
		 * mark it as stale and brelse.
		 */
		if (bp->b_iodone) {
			XFS_BUF_UNDONE(bp);
			xfs_buf_stale(bp);
			xfs_buf_ioerror(bp, EIO);
			xfs_buf_ioend(bp, 0);
		} else {
			xfs_buf_stale(bp);
			xfs_buf_relse(bp);
		}
	}

	/*
	 * Unlocks the flush lock
	 */
	xfs_iflush_abort(iq, false);
	kmem_free(ilist);
	xfs_perag_put(pag);
	return XFS_ERROR(EFSCORRUPTED);
}

/*
 * Flush dirty inode metadata into the backing buffer.
 *
 * The caller must have the inode lock and the inode flush lock held.  The
 * inode lock will still be held upon return to the caller, and the inode
 * flush lock will be released after the inode has reached the disk.
 *
 * The caller must write out the buffer returned in *bpp and release it.
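 *
 * A minimal sketch of the expected caller pattern, modelled on inode
 * reclaim (error handling elided):
 *
 *	error = xfs_iflush(ip, &bp);
 *	if (!error) {
 *		error = xfs_bwrite(bp);
 *		xfs_buf_relse(bp);
 *	}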
3000 */ 3001 int 3002 xfs_iflush( 3003 struct xfs_inode *ip, 3004 struct xfs_buf **bpp) 3005 { 3006 struct xfs_mount *mp = ip->i_mount; 3007 struct xfs_buf *bp; 3008 struct xfs_dinode *dip; 3009 int error; 3010 3011 XFS_STATS_INC(xs_iflush_count); 3012 3013 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 3014 ASSERT(xfs_isiflocked(ip)); 3015 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 3016 ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); 3017 3018 *bpp = NULL; 3019 3020 xfs_iunpin_wait(ip); 3021 3022 /* 3023 * For stale inodes we cannot rely on the backing buffer remaining 3024 * stale in cache for the remaining life of the stale inode and so 3025 * xfs_imap_to_bp() below may give us a buffer that no longer contains 3026 * inodes below. We have to check this after ensuring the inode is 3027 * unpinned so that it is safe to reclaim the stale inode after the 3028 * flush call. 3029 */ 3030 if (xfs_iflags_test(ip, XFS_ISTALE)) { 3031 xfs_ifunlock(ip); 3032 return 0; 3033 } 3034 3035 /* 3036 * This may have been unpinned because the filesystem is shutting 3037 * down forcibly. If that's the case we must not write this inode 3038 * to disk, because the log record didn't make it to disk. 3039 * 3040 * We also have to remove the log item from the AIL in this case, 3041 * as we wait for an empty AIL as part of the unmount process. 3042 */ 3043 if (XFS_FORCED_SHUTDOWN(mp)) { 3044 error = XFS_ERROR(EIO); 3045 goto abort_out; 3046 } 3047 3048 /* 3049 * Get the buffer containing the on-disk inode. 3050 */ 3051 error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, 3052 0); 3053 if (error || !bp) { 3054 xfs_ifunlock(ip); 3055 return error; 3056 } 3057 3058 /* 3059 * First flush out the inode that xfs_iflush was called with. 3060 */ 3061 error = xfs_iflush_int(ip, bp); 3062 if (error) 3063 goto corrupt_out; 3064 3065 /* 3066 * If the buffer is pinned then push on the log now so we won't 3067 * get stuck waiting in the write for too long. 
	 */
	if (xfs_buf_ispinned(bp))
		xfs_log_force(mp, 0);

	/*
	 * inode clustering:
	 * see if other inodes can be gathered into this write
	 */
	error = xfs_iflush_cluster(ip, bp);
	if (error)
		goto cluster_corrupt_out;

	*bpp = bp;
	return 0;

corrupt_out:
	xfs_buf_relse(bp);
	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
cluster_corrupt_out:
	error = XFS_ERROR(EFSCORRUPTED);
abort_out:
	/*
	 * Unlocks the flush lock
	 */
	xfs_iflush_abort(ip, false);
	return error;
}

STATIC int
xfs_iflush_int(
	struct xfs_inode	*ip,
	struct xfs_buf		*bp)
{
	struct xfs_inode_log_item *iip = ip->i_itemp;
	struct xfs_dinode	*dip;
	struct xfs_mount	*mp = ip->i_mount;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
	ASSERT(xfs_isiflocked(ip));
	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
	ASSERT(iip != NULL && iip->ili_fields != 0);

	/* set *dip = inode's place in the buffer */
	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);

	if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
			       mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
			"%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
			__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
		goto corrupt_out;
	}
	if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
				mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
			"%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
			__func__, ip->i_ino, ip, ip->i_d.di_magic);
		goto corrupt_out;
	}
	if (S_ISREG(ip->i_d.di_mode)) {
		if (XFS_TEST_ERROR(
		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
		    mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
			xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
				"%s: Bad regular inode %Lu, ptr 0x%p",
				__func__, ip->i_ino, ip);
			goto corrupt_out;
		}
	} else if (S_ISDIR(ip->i_d.di_mode)) {
		if (XFS_TEST_ERROR(
		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
		    (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
		    mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
			xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
				"%s: Bad directory inode %Lu, ptr 0x%p",
				__func__, ip->i_ino, ip);
			goto corrupt_out;
		}
	}
	if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
				ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
				XFS_RANDOM_IFLUSH_5)) {
		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
			"%s: detected corrupt incore inode %Lu, "
			"total extents = %d, nblocks = %Ld, ptr 0x%p",
			__func__, ip->i_ino,
			ip->i_d.di_nextents + ip->i_d.di_anextents,
			ip->i_d.di_nblocks, ip);
		goto corrupt_out;
	}
	if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
				mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
			"%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
			__func__, ip->i_ino, ip->i_d.di_forkoff, ip);
		goto corrupt_out;
	}

	/*
	 * Inode item log recovery for v1/v2 inodes is dependent on the
	 * di_flushiter count for correct sequencing.
We bump the flush 3172 * iteration count so we can detect flushes which postdate a log record 3173 * during recovery. This is redundant as we now log every change and 3174 * hence this can't happen but we need to still do it to ensure 3175 * backwards compatibility with old kernels that predate logging all 3176 * inode changes. 3177 */ 3178 if (ip->i_d.di_version < 3) 3179 ip->i_d.di_flushiter++; 3180 3181 /* 3182 * Copy the dirty parts of the inode into the on-disk 3183 * inode. We always copy out the core of the inode, 3184 * because if the inode is dirty at all the core must 3185 * be. 3186 */ 3187 xfs_dinode_to_disk(dip, &ip->i_d); 3188 3189 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 3190 if (ip->i_d.di_flushiter == DI_MAX_FLUSH) 3191 ip->i_d.di_flushiter = 0; 3192 3193 /* 3194 * If this is really an old format inode and the superblock version 3195 * has not been updated to support only new format inodes, then 3196 * convert back to the old inode format. If the superblock version 3197 * has been updated, then make the conversion permanent. 3198 */ 3199 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); 3200 if (ip->i_d.di_version == 1) { 3201 if (!xfs_sb_version_hasnlink(&mp->m_sb)) { 3202 /* 3203 * Convert it back. 3204 */ 3205 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); 3206 dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink); 3207 } else { 3208 /* 3209 * The superblock version has already been bumped, 3210 * so just make the conversion to the new inode 3211 * format permanent. 3212 */ 3213 ip->i_d.di_version = 2; 3214 dip->di_version = 2; 3215 ip->i_d.di_onlink = 0; 3216 dip->di_onlink = 0; 3217 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 3218 memset(&(dip->di_pad[0]), 0, 3219 sizeof(dip->di_pad)); 3220 ASSERT(xfs_get_projid(ip) == 0); 3221 } 3222 } 3223 3224 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp); 3225 if (XFS_IFORK_Q(ip)) 3226 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); 3227 xfs_inobp_check(mp, bp); 3228 3229 /* 3230 * We've recorded everything logged in the inode, so we'd like to clear 3231 * the ili_fields bits so we don't log and flush things unnecessarily. 3232 * However, we can't stop logging all this information until the data 3233 * we've copied into the disk buffer is written to disk. If we did we 3234 * might overwrite the copy of the inode in the log with all the data 3235 * after re-logging only part of it, and in the face of a crash we 3236 * wouldn't have all the data we need to recover. 3237 * 3238 * What we do is move the bits to the ili_last_fields field. When 3239 * logging the inode, these bits are moved back to the ili_fields field. 3240 * In the xfs_iflush_done() routine we clear ili_last_fields, since we 3241 * know that the information those bits represent is permanently on 3242 * disk. As long as the flush completes before the inode is logged 3243 * again, then both ili_fields and ili_last_fields will be cleared. 3244 * 3245 * We can play with the ili_fields bits here, because the inode lock 3246 * must be held exclusively in order to set bits there and the flush 3247 * lock protects the ili_last_fields bits. Set ili_logged so the flush 3248 * done routine can tell whether or not to look in the AIL. Also, store 3249 * the current LSN of the inode so that we can tell whether the item has 3250 * moved in the AIL from xfs_iflush_done(). In order to read the lsn we 3251 * need the AIL lock, because it is a 64 bit value that cannot be read 3252 * atomically. 
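	 *
	 * In short, the flush side of the protocol implemented below is:
	 *
	 *	ili_last_fields = ili_fields;
	 *	ili_fields = 0;
	 *	ili_logged = 1;
	 *	ili_flush_lsn = ili_item.li_lsn;	(copied under the AIL lock)
	 *
	 * and xfs_iflush_done() clears ili_last_fields again once the
	 * buffer IO has completed.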
3253 */ 3254 iip->ili_last_fields = iip->ili_fields; 3255 iip->ili_fields = 0; 3256 iip->ili_logged = 1; 3257 3258 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 3259 &iip->ili_item.li_lsn); 3260 3261 /* 3262 * Attach the function xfs_iflush_done to the inode's 3263 * buffer. This will remove the inode from the AIL 3264 * and unlock the inode's flush lock when the inode is 3265 * completely written to disk. 3266 */ 3267 xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); 3268 3269 /* update the lsn in the on disk inode if required */ 3270 if (ip->i_d.di_version == 3) 3271 dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn); 3272 3273 /* generate the checksum. */ 3274 xfs_dinode_calc_crc(mp, dip); 3275 3276 ASSERT(bp->b_fspriv != NULL); 3277 ASSERT(bp->b_iodone != NULL); 3278 return 0; 3279 3280 corrupt_out: 3281 return XFS_ERROR(EFSCORRUPTED); 3282 } 3283