// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include <linux/iversion.h>

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_attr.h"
#include "xfs_trans_space.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"

kmem_zone_t *xfs_inode_zone;

/*
 * Used in xfs_itruncate_extents().  This is the maximum number of extents
 * freed from a file in a single transaction.
 */
#define	XFS_ITRUNC_MAX_EXTENTS	2

STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);

/*
 * Helper function to extract the extent size hint from an inode.
 */
xfs_extlen_t
xfs_get_extsz_hint(
	struct xfs_inode	*ip)
{
	/*
	 * No point in aligning allocations if we need to COW to actually
	 * write to them.
	 */
	if (xfs_is_always_cow_inode(ip))
		return 0;
	if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
		return ip->i_extsize;
	if (XFS_IS_REALTIME_INODE(ip))
		return ip->i_mount->m_sb.sb_rextsize;
	return 0;
}

/*
 * Helper function to extract the CoW extent size hint from an inode.
 * Between the extent size hint and the CoW extent size hint, we
 * return the greater of the two.  If the value is zero (automatic),
 * use the default size.
 */
xfs_extlen_t
xfs_get_cowextsz_hint(
	struct xfs_inode	*ip)
{
	xfs_extlen_t		a, b;

	a = 0;
	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
		a = ip->i_cowextsize;
	b = xfs_get_extsz_hint(ip);

	a = max(a, b);
	if (a == 0)
		return XFS_DEFAULT_COWEXTSZ_HINT;
	return a;
}

/*
 * These two are wrapper routines around the xfs_ilock() routine used to
 * centralize some grungy code.  They are used in places that wish to lock the
 * inode solely for reading the extents.  The reason these places can't just
 * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
 * bringing in of the extents from disk for a file in b-tree format.  If the
 * inode is in b-tree format, then we need to lock the inode exclusively until
 * the extents are read in.  Locking it exclusively all the time would limit
 * our parallelism unnecessarily, though.  What we do instead is check to see
 * if the extents have been read in yet, and only lock the inode exclusively
 * if they have not.
 *
 * The functions return a value which should be given to the corresponding
 * xfs_iunlock() call.
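 *
 * An illustrative calling pattern (a sketch, not code taken verbatim from
 * this file) would be:
 *
 *	lock_mode = xfs_ilock_data_map_shared(ip);
 *	... read the data fork extents ...
 *	xfs_iunlock(ip, lock_mode);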
107 */ 108 uint 109 xfs_ilock_data_map_shared( 110 struct xfs_inode *ip) 111 { 112 uint lock_mode = XFS_ILOCK_SHARED; 113 114 if (xfs_need_iread_extents(&ip->i_df)) 115 lock_mode = XFS_ILOCK_EXCL; 116 xfs_ilock(ip, lock_mode); 117 return lock_mode; 118 } 119 120 uint 121 xfs_ilock_attr_map_shared( 122 struct xfs_inode *ip) 123 { 124 uint lock_mode = XFS_ILOCK_SHARED; 125 126 if (ip->i_afp && xfs_need_iread_extents(ip->i_afp)) 127 lock_mode = XFS_ILOCK_EXCL; 128 xfs_ilock(ip, lock_mode); 129 return lock_mode; 130 } 131 132 /* 133 * In addition to i_rwsem in the VFS inode, the xfs inode contains 2 134 * multi-reader locks: i_mmap_lock and the i_lock. This routine allows 135 * various combinations of the locks to be obtained. 136 * 137 * The 3 locks should always be ordered so that the IO lock is obtained first, 138 * the mmap lock second and the ilock last in order to prevent deadlock. 139 * 140 * Basic locking order: 141 * 142 * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock 143 * 144 * mmap_lock locking order: 145 * 146 * i_rwsem -> page lock -> mmap_lock 147 * mmap_lock -> i_mmap_lock -> page_lock 148 * 149 * The difference in mmap_lock locking order mean that we cannot hold the 150 * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can 151 * fault in pages during copy in/out (for buffered IO) or require the mmap_lock 152 * in get_user_pages() to map the user pages into the kernel address space for 153 * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because 154 * page faults already hold the mmap_lock. 155 * 156 * Hence to serialise fully against both syscall and mmap based IO, we need to 157 * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both 158 * taken in places where we need to invalidate the page cache in a race 159 * free manner (e.g. truncate, hole punch and other extent manipulation 160 * functions). 161 */ 162 void 163 xfs_ilock( 164 xfs_inode_t *ip, 165 uint lock_flags) 166 { 167 trace_xfs_ilock(ip, lock_flags, _RET_IP_); 168 169 /* 170 * You can't set both SHARED and EXCL for the same lock, 171 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, 172 * and XFS_ILOCK_EXCL are valid values to set in lock_flags. 173 */ 174 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != 175 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); 176 ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != 177 (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); 178 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != 179 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); 180 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); 181 182 if (lock_flags & XFS_IOLOCK_EXCL) { 183 down_write_nested(&VFS_I(ip)->i_rwsem, 184 XFS_IOLOCK_DEP(lock_flags)); 185 } else if (lock_flags & XFS_IOLOCK_SHARED) { 186 down_read_nested(&VFS_I(ip)->i_rwsem, 187 XFS_IOLOCK_DEP(lock_flags)); 188 } 189 190 if (lock_flags & XFS_MMAPLOCK_EXCL) 191 mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); 192 else if (lock_flags & XFS_MMAPLOCK_SHARED) 193 mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); 194 195 if (lock_flags & XFS_ILOCK_EXCL) 196 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); 197 else if (lock_flags & XFS_ILOCK_SHARED) 198 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); 199 } 200 201 /* 202 * This is just like xfs_ilock(), except that the caller 203 * is guaranteed not to sleep. It returns 1 if it gets 204 * the requested locks and 0 otherwise. 
 * If the IO lock is obtained but the inode lock cannot be, then the IO lock
 * is dropped before returning.
 *
 * ip -- the inode being locked
 * lock_flags -- this parameter indicates the inode's locks to be locked.
 *	See the comment for xfs_ilock() for a list of valid values.
 */
int
xfs_ilock_nowait(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);

	/*
	 * You can't set both SHARED and EXCL for the same lock,
	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
		(XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
		(XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
		(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);

	if (lock_flags & XFS_IOLOCK_EXCL) {
		if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
			goto out;
	} else if (lock_flags & XFS_IOLOCK_SHARED) {
		if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
			goto out;
	}

	if (lock_flags & XFS_MMAPLOCK_EXCL) {
		if (!mrtryupdate(&ip->i_mmaplock))
			goto out_undo_iolock;
	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
		if (!mrtryaccess(&ip->i_mmaplock))
			goto out_undo_iolock;
	}

	if (lock_flags & XFS_ILOCK_EXCL) {
		if (!mrtryupdate(&ip->i_lock))
			goto out_undo_mmaplock;
	} else if (lock_flags & XFS_ILOCK_SHARED) {
		if (!mrtryaccess(&ip->i_lock))
			goto out_undo_mmaplock;
	}
	return 1;

out_undo_mmaplock:
	if (lock_flags & XFS_MMAPLOCK_EXCL)
		mrunlock_excl(&ip->i_mmaplock);
	else if (lock_flags & XFS_MMAPLOCK_SHARED)
		mrunlock_shared(&ip->i_mmaplock);
out_undo_iolock:
	if (lock_flags & XFS_IOLOCK_EXCL)
		up_write(&VFS_I(ip)->i_rwsem);
	else if (lock_flags & XFS_IOLOCK_SHARED)
		up_read(&VFS_I(ip)->i_rwsem);
out:
	return 0;
}

/*
 * xfs_iunlock() is used to drop the inode locks acquired with
 * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 * that we know which locks to drop.
 *
 * ip -- the inode being unlocked
 * lock_flags -- this parameter indicates the inode's locks to be unlocked.
 *	See the comment for xfs_ilock() for a list of valid values for
 *	this parameter.
 *
 */
void
xfs_iunlock(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	/*
	 * You can't set both SHARED and EXCL for the same lock,
	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
293 */ 294 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != 295 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); 296 ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != 297 (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); 298 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != 299 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); 300 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); 301 ASSERT(lock_flags != 0); 302 303 if (lock_flags & XFS_IOLOCK_EXCL) 304 up_write(&VFS_I(ip)->i_rwsem); 305 else if (lock_flags & XFS_IOLOCK_SHARED) 306 up_read(&VFS_I(ip)->i_rwsem); 307 308 if (lock_flags & XFS_MMAPLOCK_EXCL) 309 mrunlock_excl(&ip->i_mmaplock); 310 else if (lock_flags & XFS_MMAPLOCK_SHARED) 311 mrunlock_shared(&ip->i_mmaplock); 312 313 if (lock_flags & XFS_ILOCK_EXCL) 314 mrunlock_excl(&ip->i_lock); 315 else if (lock_flags & XFS_ILOCK_SHARED) 316 mrunlock_shared(&ip->i_lock); 317 318 trace_xfs_iunlock(ip, lock_flags, _RET_IP_); 319 } 320 321 /* 322 * give up write locks. the i/o lock cannot be held nested 323 * if it is being demoted. 324 */ 325 void 326 xfs_ilock_demote( 327 xfs_inode_t *ip, 328 uint lock_flags) 329 { 330 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)); 331 ASSERT((lock_flags & 332 ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); 333 334 if (lock_flags & XFS_ILOCK_EXCL) 335 mrdemote(&ip->i_lock); 336 if (lock_flags & XFS_MMAPLOCK_EXCL) 337 mrdemote(&ip->i_mmaplock); 338 if (lock_flags & XFS_IOLOCK_EXCL) 339 downgrade_write(&VFS_I(ip)->i_rwsem); 340 341 trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); 342 } 343 344 #if defined(DEBUG) || defined(XFS_WARN) 345 int 346 xfs_isilocked( 347 xfs_inode_t *ip, 348 uint lock_flags) 349 { 350 if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { 351 if (!(lock_flags & XFS_ILOCK_SHARED)) 352 return !!ip->i_lock.mr_writer; 353 return rwsem_is_locked(&ip->i_lock.mr_lock); 354 } 355 356 if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { 357 if (!(lock_flags & XFS_MMAPLOCK_SHARED)) 358 return !!ip->i_mmaplock.mr_writer; 359 return rwsem_is_locked(&ip->i_mmaplock.mr_lock); 360 } 361 362 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { 363 if (!(lock_flags & XFS_IOLOCK_SHARED)) 364 return !debug_locks || 365 lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0); 366 return rwsem_is_locked(&VFS_I(ip)->i_rwsem); 367 } 368 369 ASSERT(0); 370 return 0; 371 } 372 #endif 373 374 /* 375 * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when 376 * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined 377 * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build 378 * errors and warnings. 379 */ 380 #if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP) 381 static bool 382 xfs_lockdep_subclass_ok( 383 int subclass) 384 { 385 return subclass < MAX_LOCKDEP_SUBCLASSES; 386 } 387 #else 388 #define xfs_lockdep_subclass_ok(subclass) (true) 389 #endif 390 391 /* 392 * Bump the subclass so xfs_lock_inodes() acquires each lock with a different 393 * value. This can be called for any type of inode lock combination, including 394 * parent locking. Care must be taken to ensure we don't overrun the subclass 395 * storage fields in the class mask we build. 
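 *
 * As an illustrative example (the exact bit positions depend on the
 * XFS_*_SHIFT definitions), xfs_lock_inumorder(XFS_ILOCK_EXCL, 2) ORs
 * (2 << XFS_ILOCK_SHIFT) into the lock mode, giving the third inode in a
 * set its own lockdep subclass.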
396 */ 397 static inline int 398 xfs_lock_inumorder(int lock_mode, int subclass) 399 { 400 int class = 0; 401 402 ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP | 403 XFS_ILOCK_RTSUM))); 404 ASSERT(xfs_lockdep_subclass_ok(subclass)); 405 406 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { 407 ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS); 408 class += subclass << XFS_IOLOCK_SHIFT; 409 } 410 411 if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { 412 ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS); 413 class += subclass << XFS_MMAPLOCK_SHIFT; 414 } 415 416 if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) { 417 ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS); 418 class += subclass << XFS_ILOCK_SHIFT; 419 } 420 421 return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class; 422 } 423 424 /* 425 * The following routine will lock n inodes in exclusive mode. We assume the 426 * caller calls us with the inodes in i_ino order. 427 * 428 * We need to detect deadlock where an inode that we lock is in the AIL and we 429 * start waiting for another inode that is locked by a thread in a long running 430 * transaction (such as truncate). This can result in deadlock since the long 431 * running trans might need to wait for the inode we just locked in order to 432 * push the tail and free space in the log. 433 * 434 * xfs_lock_inodes() can only be used to lock one type of lock at a time - 435 * the iolock, the mmaplock or the ilock, but not more than one at a time. If we 436 * lock more than one at a time, lockdep will report false positives saying we 437 * have violated locking orders. 438 */ 439 static void 440 xfs_lock_inodes( 441 struct xfs_inode **ips, 442 int inodes, 443 uint lock_mode) 444 { 445 int attempts = 0, i, j, try_lock; 446 struct xfs_log_item *lp; 447 448 /* 449 * Currently supports between 2 and 5 inodes with exclusive locking. We 450 * support an arbitrary depth of locking here, but absolute limits on 451 * inodes depend on the type of locking and the limits placed by 452 * lockdep annotations in xfs_lock_inumorder. These are all checked by 453 * the asserts. 454 */ 455 ASSERT(ips && inodes >= 2 && inodes <= 5); 456 ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL | 457 XFS_ILOCK_EXCL)); 458 ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED | 459 XFS_ILOCK_SHARED))); 460 ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) || 461 inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1); 462 ASSERT(!(lock_mode & XFS_ILOCK_EXCL) || 463 inodes <= XFS_ILOCK_MAX_SUBCLASS + 1); 464 465 if (lock_mode & XFS_IOLOCK_EXCL) { 466 ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL))); 467 } else if (lock_mode & XFS_MMAPLOCK_EXCL) 468 ASSERT(!(lock_mode & XFS_ILOCK_EXCL)); 469 470 try_lock = 0; 471 i = 0; 472 again: 473 for (; i < inodes; i++) { 474 ASSERT(ips[i]); 475 476 if (i && (ips[i] == ips[i - 1])) /* Already locked */ 477 continue; 478 479 /* 480 * If try_lock is not set yet, make sure all locked inodes are 481 * not in the AIL. If any are, set try_lock to be used later. 482 */ 483 if (!try_lock) { 484 for (j = (i - 1); j >= 0 && !try_lock; j--) { 485 lp = &ips[j]->i_itemp->ili_item; 486 if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) 487 try_lock++; 488 } 489 } 490 491 /* 492 * If any of the previous locks we have locked is in the AIL, 493 * we must TRY to get the second and subsequent locks. If 494 * we can't get any, we must release all we have 495 * and try again. 
496 */ 497 if (!try_lock) { 498 xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); 499 continue; 500 } 501 502 /* try_lock means we have an inode locked that is in the AIL. */ 503 ASSERT(i != 0); 504 if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) 505 continue; 506 507 /* 508 * Unlock all previous guys and try again. xfs_iunlock will try 509 * to push the tail if the inode is in the AIL. 510 */ 511 attempts++; 512 for (j = i - 1; j >= 0; j--) { 513 /* 514 * Check to see if we've already unlocked this one. Not 515 * the first one going back, and the inode ptr is the 516 * same. 517 */ 518 if (j != (i - 1) && ips[j] == ips[j + 1]) 519 continue; 520 521 xfs_iunlock(ips[j], lock_mode); 522 } 523 524 if ((attempts % 5) == 0) { 525 delay(1); /* Don't just spin the CPU */ 526 } 527 i = 0; 528 try_lock = 0; 529 goto again; 530 } 531 } 532 533 /* 534 * xfs_lock_two_inodes() can only be used to lock one type of lock at a time - 535 * the mmaplock or the ilock, but not more than one type at a time. If we lock 536 * more than one at a time, lockdep will report false positives saying we have 537 * violated locking orders. The iolock must be double-locked separately since 538 * we use i_rwsem for that. We now support taking one lock EXCL and the other 539 * SHARED. 540 */ 541 void 542 xfs_lock_two_inodes( 543 struct xfs_inode *ip0, 544 uint ip0_mode, 545 struct xfs_inode *ip1, 546 uint ip1_mode) 547 { 548 struct xfs_inode *temp; 549 uint mode_temp; 550 int attempts = 0; 551 struct xfs_log_item *lp; 552 553 ASSERT(hweight32(ip0_mode) == 1); 554 ASSERT(hweight32(ip1_mode) == 1); 555 ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); 556 ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); 557 ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || 558 !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); 559 ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || 560 !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); 561 ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || 562 !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); 563 ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || 564 !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); 565 566 ASSERT(ip0->i_ino != ip1->i_ino); 567 568 if (ip0->i_ino > ip1->i_ino) { 569 temp = ip0; 570 ip0 = ip1; 571 ip1 = temp; 572 mode_temp = ip0_mode; 573 ip0_mode = ip1_mode; 574 ip1_mode = mode_temp; 575 } 576 577 again: 578 xfs_ilock(ip0, xfs_lock_inumorder(ip0_mode, 0)); 579 580 /* 581 * If the first lock we have locked is in the AIL, we must TRY to get 582 * the second lock. If we can't get it, we must release the first one 583 * and try again. 
	 */
	lp = &ip0->i_itemp->ili_item;
	if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
		if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
			xfs_iunlock(ip0, ip0_mode);
			if ((++attempts % 5) == 0)
				delay(1); /* Don't just spin the CPU */
			goto again;
		}
	} else {
		xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1));
	}
}

uint
xfs_ip2xflags(
	struct xfs_inode	*ip)
{
	uint			flags = 0;

	if (ip->i_diflags & XFS_DIFLAG_ANY) {
		if (ip->i_diflags & XFS_DIFLAG_REALTIME)
			flags |= FS_XFLAG_REALTIME;
		if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
			flags |= FS_XFLAG_PREALLOC;
		if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
			flags |= FS_XFLAG_IMMUTABLE;
		if (ip->i_diflags & XFS_DIFLAG_APPEND)
			flags |= FS_XFLAG_APPEND;
		if (ip->i_diflags & XFS_DIFLAG_SYNC)
			flags |= FS_XFLAG_SYNC;
		if (ip->i_diflags & XFS_DIFLAG_NOATIME)
			flags |= FS_XFLAG_NOATIME;
		if (ip->i_diflags & XFS_DIFLAG_NODUMP)
			flags |= FS_XFLAG_NODUMP;
		if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
			flags |= FS_XFLAG_RTINHERIT;
		if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
			flags |= FS_XFLAG_PROJINHERIT;
		if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
			flags |= FS_XFLAG_NOSYMLINKS;
		if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
			flags |= FS_XFLAG_EXTSIZE;
		if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
			flags |= FS_XFLAG_EXTSZINHERIT;
		if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
			flags |= FS_XFLAG_NODEFRAG;
		if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
			flags |= FS_XFLAG_FILESTREAM;
	}

	if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
		if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
			flags |= FS_XFLAG_DAX;
		if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
			flags |= FS_XFLAG_COWEXTSIZE;
	}

	if (XFS_IFORK_Q(ip))
		flags |= FS_XFLAG_HASATTR;
	return flags;
}

/*
 * Looks up an inode from "name".  If ci_name is not NULL, then a CI match
 * is allowed, otherwise it has to be an exact match.  If a CI match is found,
 * ci_name->name will point to the actual name (caller must free) or
 * will be set to NULL if an exact match is found.
 */
int
xfs_lookup(
	xfs_inode_t		*dp,
	struct xfs_name		*name,
	xfs_inode_t		**ipp,
	struct xfs_name		*ci_name)
{
	xfs_ino_t		inum;
	int			error;

	trace_xfs_lookup(dp, name);

	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
		return -EIO;

	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
	if (error)
		goto out_unlock;

	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
	if (error)
		goto out_free_name;

	return 0;

out_free_name:
	if (ci_name)
		kmem_free(ci_name->name);
out_unlock:
	*ipp = NULL;
	return error;
}

/* Propagate di_flags from a parent inode to a child inode.
*/ 687 static void 688 xfs_inode_inherit_flags( 689 struct xfs_inode *ip, 690 const struct xfs_inode *pip) 691 { 692 unsigned int di_flags = 0; 693 umode_t mode = VFS_I(ip)->i_mode; 694 695 if (S_ISDIR(mode)) { 696 if (pip->i_diflags & XFS_DIFLAG_RTINHERIT) 697 di_flags |= XFS_DIFLAG_RTINHERIT; 698 if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { 699 di_flags |= XFS_DIFLAG_EXTSZINHERIT; 700 ip->i_extsize = pip->i_extsize; 701 } 702 if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT) 703 di_flags |= XFS_DIFLAG_PROJINHERIT; 704 } else if (S_ISREG(mode)) { 705 if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) && 706 xfs_sb_version_hasrealtime(&ip->i_mount->m_sb)) 707 di_flags |= XFS_DIFLAG_REALTIME; 708 if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { 709 di_flags |= XFS_DIFLAG_EXTSIZE; 710 ip->i_extsize = pip->i_extsize; 711 } 712 } 713 if ((pip->i_diflags & XFS_DIFLAG_NOATIME) && 714 xfs_inherit_noatime) 715 di_flags |= XFS_DIFLAG_NOATIME; 716 if ((pip->i_diflags & XFS_DIFLAG_NODUMP) && 717 xfs_inherit_nodump) 718 di_flags |= XFS_DIFLAG_NODUMP; 719 if ((pip->i_diflags & XFS_DIFLAG_SYNC) && 720 xfs_inherit_sync) 721 di_flags |= XFS_DIFLAG_SYNC; 722 if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) && 723 xfs_inherit_nosymlinks) 724 di_flags |= XFS_DIFLAG_NOSYMLINKS; 725 if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) && 726 xfs_inherit_nodefrag) 727 di_flags |= XFS_DIFLAG_NODEFRAG; 728 if (pip->i_diflags & XFS_DIFLAG_FILESTREAM) 729 di_flags |= XFS_DIFLAG_FILESTREAM; 730 731 ip->i_diflags |= di_flags; 732 } 733 734 /* Propagate di_flags2 from a parent inode to a child inode. */ 735 static void 736 xfs_inode_inherit_flags2( 737 struct xfs_inode *ip, 738 const struct xfs_inode *pip) 739 { 740 if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) { 741 ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE; 742 ip->i_cowextsize = pip->i_cowextsize; 743 } 744 if (pip->i_diflags2 & XFS_DIFLAG2_DAX) 745 ip->i_diflags2 |= XFS_DIFLAG2_DAX; 746 } 747 748 /* 749 * Initialise a newly allocated inode and return the in-core inode to the 750 * caller locked exclusively. 751 */ 752 static int 753 xfs_init_new_inode( 754 struct user_namespace *mnt_userns, 755 struct xfs_trans *tp, 756 struct xfs_inode *pip, 757 xfs_ino_t ino, 758 umode_t mode, 759 xfs_nlink_t nlink, 760 dev_t rdev, 761 prid_t prid, 762 bool init_xattrs, 763 struct xfs_inode **ipp) 764 { 765 struct inode *dir = pip ? VFS_I(pip) : NULL; 766 struct xfs_mount *mp = tp->t_mountp; 767 struct xfs_inode *ip; 768 unsigned int flags; 769 int error; 770 struct timespec64 tv; 771 struct inode *inode; 772 773 /* 774 * Protect against obviously corrupt allocation btree records. Later 775 * xfs_iget checks will catch re-allocation of other active in-memory 776 * and on-disk inodes. If we don't catch reallocating the parent inode 777 * here we will deadlock in xfs_iget() so we have to do these checks 778 * first. 779 */ 780 if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) { 781 xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino); 782 return -EFSCORRUPTED; 783 } 784 785 /* 786 * Get the in-core inode with the lock held exclusively to prevent 787 * others from looking at until we're done. 
788 */ 789 error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); 790 if (error) 791 return error; 792 793 ASSERT(ip != NULL); 794 inode = VFS_I(ip); 795 set_nlink(inode, nlink); 796 inode->i_rdev = rdev; 797 ip->i_projid = prid; 798 799 if (dir && !(dir->i_mode & S_ISGID) && 800 (mp->m_flags & XFS_MOUNT_GRPID)) { 801 inode_fsuid_set(inode, mnt_userns); 802 inode->i_gid = dir->i_gid; 803 inode->i_mode = mode; 804 } else { 805 inode_init_owner(mnt_userns, inode, dir, mode); 806 } 807 808 /* 809 * If the group ID of the new file does not match the effective group 810 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared 811 * (and only if the irix_sgid_inherit compatibility variable is set). 812 */ 813 if (irix_sgid_inherit && 814 (inode->i_mode & S_ISGID) && 815 !in_group_p(i_gid_into_mnt(mnt_userns, inode))) 816 inode->i_mode &= ~S_ISGID; 817 818 ip->i_disk_size = 0; 819 ip->i_df.if_nextents = 0; 820 ASSERT(ip->i_nblocks == 0); 821 822 tv = current_time(inode); 823 inode->i_mtime = tv; 824 inode->i_atime = tv; 825 inode->i_ctime = tv; 826 827 ip->i_extsize = 0; 828 ip->i_diflags = 0; 829 830 if (xfs_sb_version_has_v3inode(&mp->m_sb)) { 831 inode_set_iversion(inode, 1); 832 ip->i_cowextsize = 0; 833 ip->i_crtime = tv; 834 } 835 836 flags = XFS_ILOG_CORE; 837 switch (mode & S_IFMT) { 838 case S_IFIFO: 839 case S_IFCHR: 840 case S_IFBLK: 841 case S_IFSOCK: 842 ip->i_df.if_format = XFS_DINODE_FMT_DEV; 843 flags |= XFS_ILOG_DEV; 844 break; 845 case S_IFREG: 846 case S_IFDIR: 847 if (pip && (pip->i_diflags & XFS_DIFLAG_ANY)) 848 xfs_inode_inherit_flags(ip, pip); 849 if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY)) 850 xfs_inode_inherit_flags2(ip, pip); 851 /* FALLTHROUGH */ 852 case S_IFLNK: 853 ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; 854 ip->i_df.if_bytes = 0; 855 ip->i_df.if_u1.if_root = NULL; 856 break; 857 default: 858 ASSERT(0); 859 } 860 861 /* 862 * If we need to create attributes immediately after allocating the 863 * inode, initialise an empty attribute fork right now. We use the 864 * default fork offset for attributes here as we don't know exactly what 865 * size or how many attributes we might be adding. We can do this 866 * safely here because we know the data fork is completely empty and 867 * this saves us from needing to run a separate transaction to set the 868 * fork offset in the immediate future. 869 */ 870 if (init_xattrs && xfs_sb_version_hasattr(&mp->m_sb)) { 871 ip->i_forkoff = xfs_default_attroffset(ip) >> 3; 872 ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0); 873 } 874 875 /* 876 * Log the new values stuffed into the inode. 877 */ 878 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 879 xfs_trans_log_inode(tp, ip, flags); 880 881 /* now that we have an i_mode we can setup the inode structure */ 882 xfs_setup_inode(ip); 883 884 *ipp = ip; 885 return 0; 886 } 887 888 /* 889 * Allocates a new inode from disk and return a pointer to the incore copy. This 890 * routine will internally commit the current transaction and allocate a new one 891 * if we needed to allocate more on-disk free inodes to perform the requested 892 * operation. 893 * 894 * If we are allocating quota inodes, we do not have a parent inode to attach to 895 * or associate with (i.e. dp == NULL) because they are not linked into the 896 * directory structure - they are attached directly to the superblock - and so 897 * have no parent. 
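 *
 * Because the transaction may have been committed and replaced by the time
 * we return, the transaction is passed by reference.  A sketch of the caller
 * pattern (mirroring xfs_create() below):
 *
 *	error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, nlink, rdev,
 *			prid, init_xattrs, &ip);
 *	if (error)
 *		goto out_trans_cancel;
 *	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);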
898 */ 899 int 900 xfs_dir_ialloc( 901 struct user_namespace *mnt_userns, 902 struct xfs_trans **tpp, 903 struct xfs_inode *dp, 904 umode_t mode, 905 xfs_nlink_t nlink, 906 dev_t rdev, 907 prid_t prid, 908 bool init_xattrs, 909 struct xfs_inode **ipp) 910 { 911 struct xfs_buf *agibp; 912 xfs_ino_t parent_ino = dp ? dp->i_ino : 0; 913 xfs_ino_t ino; 914 int error; 915 916 ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); 917 918 /* 919 * Call the space management code to pick the on-disk inode to be 920 * allocated. 921 */ 922 error = xfs_dialloc_select_ag(tpp, parent_ino, mode, &agibp); 923 if (error) 924 return error; 925 926 if (!agibp) 927 return -ENOSPC; 928 929 /* Allocate an inode from the selected AG */ 930 error = xfs_dialloc_ag(*tpp, agibp, parent_ino, &ino); 931 if (error) 932 return error; 933 ASSERT(ino != NULLFSINO); 934 935 return xfs_init_new_inode(mnt_userns, *tpp, dp, ino, mode, nlink, rdev, 936 prid, init_xattrs, ipp); 937 } 938 939 /* 940 * Decrement the link count on an inode & log the change. If this causes the 941 * link count to go to zero, move the inode to AGI unlinked list so that it can 942 * be freed when the last active reference goes away via xfs_inactive(). 943 */ 944 static int /* error */ 945 xfs_droplink( 946 xfs_trans_t *tp, 947 xfs_inode_t *ip) 948 { 949 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); 950 951 drop_nlink(VFS_I(ip)); 952 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 953 954 if (VFS_I(ip)->i_nlink) 955 return 0; 956 957 return xfs_iunlink(tp, ip); 958 } 959 960 /* 961 * Increment the link count on an inode & log the change. 962 */ 963 static void 964 xfs_bumplink( 965 xfs_trans_t *tp, 966 xfs_inode_t *ip) 967 { 968 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); 969 970 inc_nlink(VFS_I(ip)); 971 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 972 } 973 974 int 975 xfs_create( 976 struct user_namespace *mnt_userns, 977 xfs_inode_t *dp, 978 struct xfs_name *name, 979 umode_t mode, 980 dev_t rdev, 981 bool init_xattrs, 982 xfs_inode_t **ipp) 983 { 984 int is_dir = S_ISDIR(mode); 985 struct xfs_mount *mp = dp->i_mount; 986 struct xfs_inode *ip = NULL; 987 struct xfs_trans *tp = NULL; 988 int error; 989 bool unlock_dp_on_error = false; 990 prid_t prid; 991 struct xfs_dquot *udqp = NULL; 992 struct xfs_dquot *gdqp = NULL; 993 struct xfs_dquot *pdqp = NULL; 994 struct xfs_trans_res *tres; 995 uint resblks; 996 997 trace_xfs_create(dp, name); 998 999 if (XFS_FORCED_SHUTDOWN(mp)) 1000 return -EIO; 1001 1002 prid = xfs_get_initial_prid(dp); 1003 1004 /* 1005 * Make sure that we have allocated dquot(s) on disk. 1006 */ 1007 error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns), 1008 mapped_fsgid(mnt_userns), prid, 1009 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, 1010 &udqp, &gdqp, &pdqp); 1011 if (error) 1012 return error; 1013 1014 if (is_dir) { 1015 resblks = XFS_MKDIR_SPACE_RES(mp, name->len); 1016 tres = &M_RES(mp)->tr_mkdir; 1017 } else { 1018 resblks = XFS_CREATE_SPACE_RES(mp, name->len); 1019 tres = &M_RES(mp)->tr_create; 1020 } 1021 1022 /* 1023 * Initially assume that the file does not exist and 1024 * reserve the resources for that case. If that is not 1025 * the case we'll drop the one we have and get a more 1026 * appropriate transaction later. 
	 */
	error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
			&tp);
	if (error == -ENOSPC) {
		/* flush outstanding delalloc blocks and retry */
		xfs_flush_inodes(mp);
		error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
				resblks, &tp);
	}
	if (error)
		goto out_release_dquots;

	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
	unlock_dp_on_error = true;

	error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
			XFS_IEXT_DIR_MANIP_CNT(mp));
	if (error)
		goto out_trans_cancel;

	/*
	 * A newly created regular or special file just has one directory
	 * entry pointing to them, but a directory also has the "." entry
	 * pointing to itself.
	 */
	error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, is_dir ? 2 : 1, rdev,
			prid, init_xattrs, &ip);
	if (error)
		goto out_trans_cancel;

	/*
	 * Now we join the directory inode to the transaction.  We do not do it
	 * earlier because xfs_dir_ialloc might commit the previous transaction
	 * (and release all the locks).  An error from here on will result in
	 * the transaction cancel unlocking dp so don't do it explicitly in the
	 * error path.
	 */
	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
	unlock_dp_on_error = false;

	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
					resblks - XFS_IALLOC_SPACE_RES(mp));
	if (error) {
		ASSERT(error != -ENOSPC);
		goto out_trans_cancel;
	}
	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

	if (is_dir) {
		error = xfs_dir_init(tp, ip, dp);
		if (error)
			goto out_trans_cancel;

		xfs_bumplink(tp, dp);
	}

	/*
	 * If this is a synchronous mount, make sure that the
	 * create transaction goes to disk before returning to
	 * the user.
	 */
	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
		xfs_trans_set_sync(tp);

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * These ids of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);

	error = xfs_trans_commit(tp);
	if (error)
		goto out_release_inode;

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	*ipp = ip;
	return 0;

 out_trans_cancel:
	xfs_trans_cancel(tp);
 out_release_inode:
	/*
	 * Wait until after the current transaction is aborted to finish the
	 * setup of the inode and release the inode.  This prevents recursive
	 * transactions and deadlocks from xfs_inactive.
1117 */ 1118 if (ip) { 1119 xfs_finish_inode_setup(ip); 1120 xfs_irele(ip); 1121 } 1122 out_release_dquots: 1123 xfs_qm_dqrele(udqp); 1124 xfs_qm_dqrele(gdqp); 1125 xfs_qm_dqrele(pdqp); 1126 1127 if (unlock_dp_on_error) 1128 xfs_iunlock(dp, XFS_ILOCK_EXCL); 1129 return error; 1130 } 1131 1132 int 1133 xfs_create_tmpfile( 1134 struct user_namespace *mnt_userns, 1135 struct xfs_inode *dp, 1136 umode_t mode, 1137 struct xfs_inode **ipp) 1138 { 1139 struct xfs_mount *mp = dp->i_mount; 1140 struct xfs_inode *ip = NULL; 1141 struct xfs_trans *tp = NULL; 1142 int error; 1143 prid_t prid; 1144 struct xfs_dquot *udqp = NULL; 1145 struct xfs_dquot *gdqp = NULL; 1146 struct xfs_dquot *pdqp = NULL; 1147 struct xfs_trans_res *tres; 1148 uint resblks; 1149 1150 if (XFS_FORCED_SHUTDOWN(mp)) 1151 return -EIO; 1152 1153 prid = xfs_get_initial_prid(dp); 1154 1155 /* 1156 * Make sure that we have allocated dquot(s) on disk. 1157 */ 1158 error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns), 1159 mapped_fsgid(mnt_userns), prid, 1160 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, 1161 &udqp, &gdqp, &pdqp); 1162 if (error) 1163 return error; 1164 1165 resblks = XFS_IALLOC_SPACE_RES(mp); 1166 tres = &M_RES(mp)->tr_create_tmpfile; 1167 1168 error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks, 1169 &tp); 1170 if (error) 1171 goto out_release_dquots; 1172 1173 error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, 0, 0, prid, 1174 false, &ip); 1175 if (error) 1176 goto out_trans_cancel; 1177 1178 if (mp->m_flags & XFS_MOUNT_WSYNC) 1179 xfs_trans_set_sync(tp); 1180 1181 /* 1182 * Attach the dquot(s) to the inodes and modify them incore. 1183 * These ids of the inode couldn't have changed since the new 1184 * inode has been locked ever since it was created. 1185 */ 1186 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); 1187 1188 error = xfs_iunlink(tp, ip); 1189 if (error) 1190 goto out_trans_cancel; 1191 1192 error = xfs_trans_commit(tp); 1193 if (error) 1194 goto out_release_inode; 1195 1196 xfs_qm_dqrele(udqp); 1197 xfs_qm_dqrele(gdqp); 1198 xfs_qm_dqrele(pdqp); 1199 1200 *ipp = ip; 1201 return 0; 1202 1203 out_trans_cancel: 1204 xfs_trans_cancel(tp); 1205 out_release_inode: 1206 /* 1207 * Wait until after the current transaction is aborted to finish the 1208 * setup of the inode and release the inode. This prevents recursive 1209 * transactions and deadlocks from xfs_inactive. 
1210 */ 1211 if (ip) { 1212 xfs_finish_inode_setup(ip); 1213 xfs_irele(ip); 1214 } 1215 out_release_dquots: 1216 xfs_qm_dqrele(udqp); 1217 xfs_qm_dqrele(gdqp); 1218 xfs_qm_dqrele(pdqp); 1219 1220 return error; 1221 } 1222 1223 int 1224 xfs_link( 1225 xfs_inode_t *tdp, 1226 xfs_inode_t *sip, 1227 struct xfs_name *target_name) 1228 { 1229 xfs_mount_t *mp = tdp->i_mount; 1230 xfs_trans_t *tp; 1231 int error; 1232 int resblks; 1233 1234 trace_xfs_link(tdp, target_name); 1235 1236 ASSERT(!S_ISDIR(VFS_I(sip)->i_mode)); 1237 1238 if (XFS_FORCED_SHUTDOWN(mp)) 1239 return -EIO; 1240 1241 error = xfs_qm_dqattach(sip); 1242 if (error) 1243 goto std_return; 1244 1245 error = xfs_qm_dqattach(tdp); 1246 if (error) 1247 goto std_return; 1248 1249 resblks = XFS_LINK_SPACE_RES(mp, target_name->len); 1250 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp); 1251 if (error == -ENOSPC) { 1252 resblks = 0; 1253 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp); 1254 } 1255 if (error) 1256 goto std_return; 1257 1258 xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL); 1259 1260 xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); 1261 xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); 1262 1263 error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK, 1264 XFS_IEXT_DIR_MANIP_CNT(mp)); 1265 if (error) 1266 goto error_return; 1267 1268 /* 1269 * If we are using project inheritance, we only allow hard link 1270 * creation in our tree when the project IDs are the same; else 1271 * the tree quota mechanism could be circumvented. 1272 */ 1273 if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && 1274 tdp->i_projid != sip->i_projid)) { 1275 error = -EXDEV; 1276 goto error_return; 1277 } 1278 1279 if (!resblks) { 1280 error = xfs_dir_canenter(tp, tdp, target_name); 1281 if (error) 1282 goto error_return; 1283 } 1284 1285 /* 1286 * Handle initial link state of O_TMPFILE inode 1287 */ 1288 if (VFS_I(sip)->i_nlink == 0) { 1289 error = xfs_iunlink_remove(tp, sip); 1290 if (error) 1291 goto error_return; 1292 } 1293 1294 error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, 1295 resblks); 1296 if (error) 1297 goto error_return; 1298 xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 1299 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); 1300 1301 xfs_bumplink(tp, sip); 1302 1303 /* 1304 * If this is a synchronous mount, make sure that the 1305 * link transaction goes to disk before returning to 1306 * the user. 1307 */ 1308 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) 1309 xfs_trans_set_sync(tp); 1310 1311 return xfs_trans_commit(tp); 1312 1313 error_return: 1314 xfs_trans_cancel(tp); 1315 std_return: 1316 return error; 1317 } 1318 1319 /* Clear the reflink flag and the cowblocks tag if possible. */ 1320 static void 1321 xfs_itruncate_clear_reflink_flags( 1322 struct xfs_inode *ip) 1323 { 1324 struct xfs_ifork *dfork; 1325 struct xfs_ifork *cfork; 1326 1327 if (!xfs_is_reflink_inode(ip)) 1328 return; 1329 dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 1330 cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK); 1331 if (dfork->if_bytes == 0 && cfork->if_bytes == 0) 1332 ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; 1333 if (cfork->if_bytes == 0) 1334 xfs_inode_clear_cowblocks_tag(ip); 1335 } 1336 1337 /* 1338 * Free up the underlying blocks past new_size. The new size must be smaller 1339 * than the current size. This routine can be used both for the attribute and 1340 * data fork, and does not modify the inode size, which is left to the caller. 
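 *
 * Callers typically reach this through the xfs_itruncate_extents() wrapper,
 * which passes no extra flags.  An illustrative sketch of that usage, as in
 * xfs_inactive_truncate() below:
 *
 *	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
 *	if (error)
 *		goto error_trans_cancel;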
1341 * 1342 * The transaction passed to this routine must have made a permanent log 1343 * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the 1344 * given transaction and start new ones, so make sure everything involved in 1345 * the transaction is tidy before calling here. Some transaction will be 1346 * returned to the caller to be committed. The incoming transaction must 1347 * already include the inode, and both inode locks must be held exclusively. 1348 * The inode must also be "held" within the transaction. On return the inode 1349 * will be "held" within the returned transaction. This routine does NOT 1350 * require any disk space to be reserved for it within the transaction. 1351 * 1352 * If we get an error, we must return with the inode locked and linked into the 1353 * current transaction. This keeps things simple for the higher level code, 1354 * because it always knows that the inode is locked and held in the transaction 1355 * that returns to it whether errors occur or not. We don't mark the inode 1356 * dirty on error so that transactions can be easily aborted if possible. 1357 */ 1358 int 1359 xfs_itruncate_extents_flags( 1360 struct xfs_trans **tpp, 1361 struct xfs_inode *ip, 1362 int whichfork, 1363 xfs_fsize_t new_size, 1364 int flags) 1365 { 1366 struct xfs_mount *mp = ip->i_mount; 1367 struct xfs_trans *tp = *tpp; 1368 xfs_fileoff_t first_unmap_block; 1369 xfs_filblks_t unmap_len; 1370 int error = 0; 1371 1372 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1373 ASSERT(!atomic_read(&VFS_I(ip)->i_count) || 1374 xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 1375 ASSERT(new_size <= XFS_ISIZE(ip)); 1376 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 1377 ASSERT(ip->i_itemp != NULL); 1378 ASSERT(ip->i_itemp->ili_lock_flags == 0); 1379 ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); 1380 1381 trace_xfs_itruncate_extents_start(ip, new_size); 1382 1383 flags |= xfs_bmapi_aflag(whichfork); 1384 1385 /* 1386 * Since it is possible for space to become allocated beyond 1387 * the end of the file (in a crash where the space is allocated 1388 * but the inode size is not yet updated), simply remove any 1389 * blocks which show up between the new EOF and the maximum 1390 * possible file size. 1391 * 1392 * We have to free all the blocks to the bmbt maximum offset, even if 1393 * the page cache can't scale that far. 1394 */ 1395 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); 1396 if (!xfs_verify_fileoff(mp, first_unmap_block)) { 1397 WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF); 1398 return 0; 1399 } 1400 1401 unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; 1402 while (unmap_len > 0) { 1403 ASSERT(tp->t_firstblock == NULLFSBLOCK); 1404 error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, 1405 flags, XFS_ITRUNC_MAX_EXTENTS); 1406 if (error) 1407 goto out; 1408 1409 /* free the just unmapped extents */ 1410 error = xfs_defer_finish(&tp); 1411 if (error) 1412 goto out; 1413 } 1414 1415 if (whichfork == XFS_DATA_FORK) { 1416 /* Remove all pending CoW reservations. */ 1417 error = xfs_reflink_cancel_cow_blocks(ip, &tp, 1418 first_unmap_block, XFS_MAX_FILEOFF, true); 1419 if (error) 1420 goto out; 1421 1422 xfs_itruncate_clear_reflink_flags(ip); 1423 } 1424 1425 /* 1426 * Always re-log the inode so that our permanent transaction can keep 1427 * on rolling it forward in the log. 
1428 */ 1429 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1430 1431 trace_xfs_itruncate_extents_end(ip, new_size); 1432 1433 out: 1434 *tpp = tp; 1435 return error; 1436 } 1437 1438 int 1439 xfs_release( 1440 xfs_inode_t *ip) 1441 { 1442 xfs_mount_t *mp = ip->i_mount; 1443 int error = 0; 1444 1445 if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0)) 1446 return 0; 1447 1448 /* If this is a read-only mount, don't do this (would generate I/O) */ 1449 if (mp->m_flags & XFS_MOUNT_RDONLY) 1450 return 0; 1451 1452 if (!XFS_FORCED_SHUTDOWN(mp)) { 1453 int truncated; 1454 1455 /* 1456 * If we previously truncated this file and removed old data 1457 * in the process, we want to initiate "early" writeout on 1458 * the last close. This is an attempt to combat the notorious 1459 * NULL files problem which is particularly noticeable from a 1460 * truncate down, buffered (re-)write (delalloc), followed by 1461 * a crash. What we are effectively doing here is 1462 * significantly reducing the time window where we'd otherwise 1463 * be exposed to that problem. 1464 */ 1465 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); 1466 if (truncated) { 1467 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); 1468 if (ip->i_delayed_blks > 0) { 1469 error = filemap_flush(VFS_I(ip)->i_mapping); 1470 if (error) 1471 return error; 1472 } 1473 } 1474 } 1475 1476 if (VFS_I(ip)->i_nlink == 0) 1477 return 0; 1478 1479 /* 1480 * If we can't get the iolock just skip truncating the blocks past EOF 1481 * because we could deadlock with the mmap_lock otherwise. We'll get 1482 * another chance to drop them once the last reference to the inode is 1483 * dropped, so we'll never leak blocks permanently. 1484 */ 1485 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) 1486 return 0; 1487 1488 if (xfs_can_free_eofblocks(ip, false)) { 1489 /* 1490 * Check if the inode is being opened, written and closed 1491 * frequently and we have delayed allocation blocks outstanding 1492 * (e.g. streaming writes from the NFS server), truncating the 1493 * blocks past EOF will cause fragmentation to occur. 1494 * 1495 * In this case don't do the truncation, but we have to be 1496 * careful how we detect this case. Blocks beyond EOF show up as 1497 * i_delayed_blks even when the inode is clean, so we need to 1498 * truncate them away first before checking for a dirty release. 1499 * Hence on the first dirty close we will still remove the 1500 * speculative allocation, but after that we will leave it in 1501 * place. 1502 */ 1503 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) 1504 goto out_unlock; 1505 1506 error = xfs_free_eofblocks(ip); 1507 if (error) 1508 goto out_unlock; 1509 1510 /* delalloc blocks after truncation means it really is dirty */ 1511 if (ip->i_delayed_blks) 1512 xfs_iflags_set(ip, XFS_IDIRTY_RELEASE); 1513 } 1514 1515 out_unlock: 1516 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 1517 return error; 1518 } 1519 1520 /* 1521 * xfs_inactive_truncate 1522 * 1523 * Called to perform a truncate when an inode becomes unlinked. 
1524 */ 1525 STATIC int 1526 xfs_inactive_truncate( 1527 struct xfs_inode *ip) 1528 { 1529 struct xfs_mount *mp = ip->i_mount; 1530 struct xfs_trans *tp; 1531 int error; 1532 1533 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); 1534 if (error) { 1535 ASSERT(XFS_FORCED_SHUTDOWN(mp)); 1536 return error; 1537 } 1538 xfs_ilock(ip, XFS_ILOCK_EXCL); 1539 xfs_trans_ijoin(tp, ip, 0); 1540 1541 /* 1542 * Log the inode size first to prevent stale data exposure in the event 1543 * of a system crash before the truncate completes. See the related 1544 * comment in xfs_vn_setattr_size() for details. 1545 */ 1546 ip->i_disk_size = 0; 1547 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1548 1549 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); 1550 if (error) 1551 goto error_trans_cancel; 1552 1553 ASSERT(ip->i_df.if_nextents == 0); 1554 1555 error = xfs_trans_commit(tp); 1556 if (error) 1557 goto error_unlock; 1558 1559 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1560 return 0; 1561 1562 error_trans_cancel: 1563 xfs_trans_cancel(tp); 1564 error_unlock: 1565 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1566 return error; 1567 } 1568 1569 /* 1570 * xfs_inactive_ifree() 1571 * 1572 * Perform the inode free when an inode is unlinked. 1573 */ 1574 STATIC int 1575 xfs_inactive_ifree( 1576 struct xfs_inode *ip) 1577 { 1578 struct xfs_mount *mp = ip->i_mount; 1579 struct xfs_trans *tp; 1580 int error; 1581 1582 /* 1583 * We try to use a per-AG reservation for any block needed by the finobt 1584 * tree, but as the finobt feature predates the per-AG reservation 1585 * support a degraded file system might not have enough space for the 1586 * reservation at mount time. In that case try to dip into the reserved 1587 * pool and pray. 1588 * 1589 * Send a warning if the reservation does happen to fail, as the inode 1590 * now remains allocated and sits on the unlinked list until the fs is 1591 * repaired. 1592 */ 1593 if (unlikely(mp->m_finobt_nores)) { 1594 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 1595 XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, 1596 &tp); 1597 } else { 1598 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp); 1599 } 1600 if (error) { 1601 if (error == -ENOSPC) { 1602 xfs_warn_ratelimited(mp, 1603 "Failed to remove inode(s) from unlinked list. " 1604 "Please free space, unmount and run xfs_repair."); 1605 } else { 1606 ASSERT(XFS_FORCED_SHUTDOWN(mp)); 1607 } 1608 return error; 1609 } 1610 1611 /* 1612 * We do not hold the inode locked across the entire rolling transaction 1613 * here. We only need to hold it for the first transaction that 1614 * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the 1615 * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode 1616 * here breaks the relationship between cluster buffer invalidation and 1617 * stale inode invalidation on cluster buffer item journal commit 1618 * completion, and can result in leaving dirty stale inodes hanging 1619 * around in memory. 1620 * 1621 * We have no need for serialising this inode operation against other 1622 * operations - we freed the inode and hence reallocation is required 1623 * and that will serialise on reallocating the space the deferops need 1624 * to free. Hence we can unlock the inode on the first commit of 1625 * the transaction rather than roll it right through the deferops. This 1626 * avoids relogging the XFS_ISTALE inode. 
	 *
	 * We check that xfs_ifree() hasn't grown an internal transaction roll
	 * by asserting that the inode is still locked when it returns.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	error = xfs_ifree(tp, ip);
	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	if (error) {
		/*
		 * If we fail to free the inode, shut down.  The cancel
		 * might do that, we need to make sure.  Otherwise the
		 * inode might be lost for a long time or forever.
		 */
		if (!XFS_FORCED_SHUTDOWN(mp)) {
			xfs_notice(mp, "%s: xfs_ifree returned error %d",
				__func__, error);
			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		}
		xfs_trans_cancel(tp);
		return error;
	}

	/*
	 * Credit the quota account(s). The inode is gone.
	 */
	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);

	/*
	 * Just ignore errors at this point.  There is nothing we can do except
	 * to try to keep going. Make sure it's not a silent error.
	 */
	error = xfs_trans_commit(tp);
	if (error)
		xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
			__func__, error);

	return 0;
}

/*
 * xfs_inactive
 *
 * This is called when the vnode reference count for the vnode
 * goes to zero.  If the file has been unlinked, then it must
 * now be truncated.  Also, we clear all of the read-ahead state
 * kept for the inode here since the file is now closed.
 */
void
xfs_inactive(
	xfs_inode_t	*ip)
{
	struct xfs_mount	*mp;
	int			error;
	int			truncate = 0;

	/*
	 * If the inode is already free, then there can be nothing
	 * to clean up here.
	 */
	if (VFS_I(ip)->i_mode == 0) {
		ASSERT(ip->i_df.if_broot_bytes == 0);
		return;
	}

	mp = ip->i_mount;
	ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));

	/* If this is a read-only mount, don't do this (would generate I/O) */
	if (mp->m_flags & XFS_MOUNT_RDONLY)
		return;

	/* Metadata inodes require explicit resource cleanup. */
	if (xfs_is_metadata_inode(ip))
		return;

	/* Try to clean out the cow blocks if there are any. */
	if (xfs_inode_has_cow_data(ip))
		xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);

	if (VFS_I(ip)->i_nlink != 0) {
		/*
		 * force is true because we are evicting an inode from the
		 * cache. Post-eof blocks must be freed, lest we end up with
		 * broken free space accounting.
		 *
		 * Note: don't bother with iolock here since lockdep complains
		 * about acquiring it in reclaim context. We have the only
		 * reference to the inode at this point anyways.
		 */
		if (xfs_can_free_eofblocks(ip, true))
			xfs_free_eofblocks(ip);

		return;
	}

	if (S_ISREG(VFS_I(ip)->i_mode) &&
	    (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 ||
	     ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
		truncate = 1;

	error = xfs_qm_dqattach(ip);
	if (error)
		return;

	if (S_ISLNK(VFS_I(ip)->i_mode))
		error = xfs_inactive_symlink(ip);
	else if (truncate)
		error = xfs_inactive_truncate(ip);
	if (error)
		return;

	/*
	 * If there are attributes associated with the file then blow them away
	 * now.  The code calls a routine that recursively deconstructs the
	 * attribute fork.  It also blows away the in-core attribute fork.
1744 */ 1745 if (XFS_IFORK_Q(ip)) { 1746 error = xfs_attr_inactive(ip); 1747 if (error) 1748 return; 1749 } 1750 1751 ASSERT(!ip->i_afp); 1752 ASSERT(ip->i_forkoff == 0); 1753 1754 /* 1755 * Free the inode. 1756 */ 1757 error = xfs_inactive_ifree(ip); 1758 if (error) 1759 return; 1760 1761 /* 1762 * Release the dquots held by inode, if any. 1763 */ 1764 xfs_qm_dqdetach(ip); 1765 } 1766 1767 /* 1768 * In-Core Unlinked List Lookups 1769 * ============================= 1770 * 1771 * Every inode is supposed to be reachable from some other piece of metadata 1772 * with the exception of the root directory. Inodes with a connection to a 1773 * file descriptor but not linked from anywhere in the on-disk directory tree 1774 * are collectively known as unlinked inodes, though the filesystem itself 1775 * maintains links to these inodes so that on-disk metadata are consistent. 1776 * 1777 * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI 1778 * header contains a number of buckets that point to an inode, and each inode 1779 * record has a pointer to the next inode in the hash chain. This 1780 * singly-linked list causes scaling problems in the iunlink remove function 1781 * because we must walk that list to find the inode that points to the inode 1782 * being removed from the unlinked hash bucket list. 1783 * 1784 * What if we modelled the unlinked list as a collection of records capturing 1785 * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd 1786 * have a fast way to look up unlinked list predecessors, which avoids the 1787 * slow list walk. That's exactly what we do here (in-core) with a per-AG 1788 * rhashtable. 1789 * 1790 * Because this is a backref cache, we ignore operational failures since the 1791 * iunlink code can fall back to the slow bucket walk. The only errors that 1792 * should bubble out are for obviously incorrect situations. 1793 * 1794 * All users of the backref cache MUST hold the AGI buffer lock to serialize 1795 * access or have otherwise provided for concurrency control. 1796 */ 1797 1798 /* Capture a "X.next_unlinked = Y" relationship. */ 1799 struct xfs_iunlink { 1800 struct rhash_head iu_rhash_head; 1801 xfs_agino_t iu_agino; /* X */ 1802 xfs_agino_t iu_next_unlinked; /* Y */ 1803 }; 1804 1805 /* Unlinked list predecessor lookup hashtable construction */ 1806 static int 1807 xfs_iunlink_obj_cmpfn( 1808 struct rhashtable_compare_arg *arg, 1809 const void *obj) 1810 { 1811 const xfs_agino_t *key = arg->key; 1812 const struct xfs_iunlink *iu = obj; 1813 1814 if (iu->iu_next_unlinked != *key) 1815 return 1; 1816 return 0; 1817 } 1818 1819 static const struct rhashtable_params xfs_iunlink_hash_params = { 1820 .min_size = XFS_AGI_UNLINKED_BUCKETS, 1821 .key_len = sizeof(xfs_agino_t), 1822 .key_offset = offsetof(struct xfs_iunlink, 1823 iu_next_unlinked), 1824 .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head), 1825 .automatic_shrinking = true, 1826 .obj_cmpfn = xfs_iunlink_obj_cmpfn, 1827 }; 1828 1829 /* 1830 * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such 1831 * relation is found. 1832 */ 1833 static xfs_agino_t 1834 xfs_iunlink_lookup_backref( 1835 struct xfs_perag *pag, 1836 xfs_agino_t agino) 1837 { 1838 struct xfs_iunlink *iu; 1839 1840 iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, 1841 xfs_iunlink_hash_params); 1842 return iu ? iu->iu_agino : NULLAGINO; 1843 } 1844 1845 /* 1846 * Take ownership of an iunlink cache entry and insert it into the hash table. 
1847 * If successful, the entry will be owned by the cache; if not, it is freed. 1848 * Either way, the caller does not own @iu after this call. 1849 */ 1850 static int 1851 xfs_iunlink_insert_backref( 1852 struct xfs_perag *pag, 1853 struct xfs_iunlink *iu) 1854 { 1855 int error; 1856 1857 error = rhashtable_insert_fast(&pag->pagi_unlinked_hash, 1858 &iu->iu_rhash_head, xfs_iunlink_hash_params); 1859 /* 1860 * Fail loudly if there already was an entry because that's a sign of 1861 * corruption of in-memory data. Also fail loudly if we see an error 1862 * code we didn't anticipate from the rhashtable code. Currently we 1863 * only anticipate ENOMEM. 1864 */ 1865 if (error) { 1866 WARN(error != -ENOMEM, "iunlink cache insert error %d", error); 1867 kmem_free(iu); 1868 } 1869 /* 1870 * Absorb any runtime errors that aren't a result of corruption because 1871 * this is a cache and we can always fall back to bucket list scanning. 1872 */ 1873 if (error != 0 && error != -EEXIST) 1874 error = 0; 1875 return error; 1876 } 1877 1878 /* Remember that @prev_agino.next_unlinked = @this_agino. */ 1879 static int 1880 xfs_iunlink_add_backref( 1881 struct xfs_perag *pag, 1882 xfs_agino_t prev_agino, 1883 xfs_agino_t this_agino) 1884 { 1885 struct xfs_iunlink *iu; 1886 1887 if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) 1888 return 0; 1889 1890 iu = kmem_zalloc(sizeof(*iu), KM_NOFS); 1891 iu->iu_agino = prev_agino; 1892 iu->iu_next_unlinked = this_agino; 1893 1894 return xfs_iunlink_insert_backref(pag, iu); 1895 } 1896 1897 /* 1898 * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked. 1899 * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there 1900 * wasn't any such entry then we don't bother. 1901 */ 1902 static int 1903 xfs_iunlink_change_backref( 1904 struct xfs_perag *pag, 1905 xfs_agino_t agino, 1906 xfs_agino_t next_unlinked) 1907 { 1908 struct xfs_iunlink *iu; 1909 int error; 1910 1911 /* Look up the old entry; if there wasn't one then exit. */ 1912 iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, 1913 xfs_iunlink_hash_params); 1914 if (!iu) 1915 return 0; 1916 1917 /* 1918 * Remove the entry. This shouldn't ever return an error, but if we 1919 * couldn't remove the old entry we don't want to add it again to the 1920 * hash table, and if the entry disappeared on us then someone's 1921 * violated the locking rules and we need to fail loudly. Either way 1922 * we cannot remove the inode because internal state is or would have 1923 * been corrupt. 1924 */ 1925 error = rhashtable_remove_fast(&pag->pagi_unlinked_hash, 1926 &iu->iu_rhash_head, xfs_iunlink_hash_params); 1927 if (error) 1928 return error; 1929 1930 /* If there is no new next entry just free our item and return. */ 1931 if (next_unlinked == NULLAGINO) { 1932 kmem_free(iu); 1933 return 0; 1934 } 1935 1936 /* Update the entry and re-add it to the hash table. */ 1937 iu->iu_next_unlinked = next_unlinked; 1938 return xfs_iunlink_insert_backref(pag, iu); 1939 } 1940 1941 /* Set up the in-core predecessor structures. */ 1942 int 1943 xfs_iunlink_init( 1944 struct xfs_perag *pag) 1945 { 1946 return rhashtable_init(&pag->pagi_unlinked_hash, 1947 &xfs_iunlink_hash_params); 1948 } 1949 1950 /* Free the in-core predecessor structures. 
*/ 1951 static void 1952 xfs_iunlink_free_item( 1953 void *ptr, 1954 void *arg) 1955 { 1956 struct xfs_iunlink *iu = ptr; 1957 bool *freed_anything = arg; 1958 1959 *freed_anything = true; 1960 kmem_free(iu); 1961 } 1962 1963 void 1964 xfs_iunlink_destroy( 1965 struct xfs_perag *pag) 1966 { 1967 bool freed_anything = false; 1968 1969 rhashtable_free_and_destroy(&pag->pagi_unlinked_hash, 1970 xfs_iunlink_free_item, &freed_anything); 1971 1972 ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount)); 1973 } 1974 1975 /* 1976 * Point the AGI unlinked bucket at an inode and log the results. The caller 1977 * is responsible for validating the old value. 1978 */ 1979 STATIC int 1980 xfs_iunlink_update_bucket( 1981 struct xfs_trans *tp, 1982 xfs_agnumber_t agno, 1983 struct xfs_buf *agibp, 1984 unsigned int bucket_index, 1985 xfs_agino_t new_agino) 1986 { 1987 struct xfs_agi *agi = agibp->b_addr; 1988 xfs_agino_t old_value; 1989 int offset; 1990 1991 ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino)); 1992 1993 old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); 1994 trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index, 1995 old_value, new_agino); 1996 1997 /* 1998 * We should never find the head of the list already set to the value 1999 * passed in because either we're adding or removing ourselves from the 2000 * head of the list. 2001 */ 2002 if (old_value == new_agino) { 2003 xfs_buf_mark_corrupt(agibp); 2004 return -EFSCORRUPTED; 2005 } 2006 2007 agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); 2008 offset = offsetof(struct xfs_agi, agi_unlinked) + 2009 (sizeof(xfs_agino_t) * bucket_index); 2010 xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); 2011 return 0; 2012 } 2013 2014 /* Set an on-disk inode's next_unlinked pointer. */ 2015 STATIC void 2016 xfs_iunlink_update_dinode( 2017 struct xfs_trans *tp, 2018 xfs_agnumber_t agno, 2019 xfs_agino_t agino, 2020 struct xfs_buf *ibp, 2021 struct xfs_dinode *dip, 2022 struct xfs_imap *imap, 2023 xfs_agino_t next_agino) 2024 { 2025 struct xfs_mount *mp = tp->t_mountp; 2026 int offset; 2027 2028 ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); 2029 2030 trace_xfs_iunlink_update_dinode(mp, agno, agino, 2031 be32_to_cpu(dip->di_next_unlinked), next_agino); 2032 2033 dip->di_next_unlinked = cpu_to_be32(next_agino); 2034 offset = imap->im_boffset + 2035 offsetof(struct xfs_dinode, di_next_unlinked); 2036 2037 /* need to recalc the inode CRC if appropriate */ 2038 xfs_dinode_calc_crc(mp, dip); 2039 xfs_trans_inode_buf(tp, ibp); 2040 xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); 2041 } 2042 2043 /* Set an in-core inode's unlinked pointer and return the old value. */ 2044 STATIC int 2045 xfs_iunlink_update_inode( 2046 struct xfs_trans *tp, 2047 struct xfs_inode *ip, 2048 xfs_agnumber_t agno, 2049 xfs_agino_t next_agino, 2050 xfs_agino_t *old_next_agino) 2051 { 2052 struct xfs_mount *mp = tp->t_mountp; 2053 struct xfs_dinode *dip; 2054 struct xfs_buf *ibp; 2055 xfs_agino_t old_value; 2056 int error; 2057 2058 ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); 2059 2060 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp); 2061 if (error) 2062 return error; 2063 dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset); 2064 2065 /* Make sure the old pointer isn't garbage. 
*/ 2066 old_value = be32_to_cpu(dip->di_next_unlinked); 2067 if (!xfs_verify_agino_or_null(mp, agno, old_value)) { 2068 xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, 2069 sizeof(*dip), __this_address); 2070 error = -EFSCORRUPTED; 2071 goto out; 2072 } 2073 2074 /* 2075 * Since we're updating a linked list, we should never find that the 2076 * current pointer is the same as the new value, unless we're 2077 * terminating the list. 2078 */ 2079 *old_next_agino = old_value; 2080 if (old_value == next_agino) { 2081 if (next_agino != NULLAGINO) { 2082 xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, 2083 dip, sizeof(*dip), __this_address); 2084 error = -EFSCORRUPTED; 2085 } 2086 goto out; 2087 } 2088 2089 /* Ok, update the new pointer. */ 2090 xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino), 2091 ibp, dip, &ip->i_imap, next_agino); 2092 return 0; 2093 out: 2094 xfs_trans_brelse(tp, ibp); 2095 return error; 2096 } 2097 2098 /* 2099 * This is called when the inode's link count has gone to 0 or we are creating 2100 * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. 2101 * 2102 * We place the on-disk inode on a list in the AGI. It will be pulled from this 2103 * list when the inode is freed. 2104 */ 2105 STATIC int 2106 xfs_iunlink( 2107 struct xfs_trans *tp, 2108 struct xfs_inode *ip) 2109 { 2110 struct xfs_mount *mp = tp->t_mountp; 2111 struct xfs_agi *agi; 2112 struct xfs_buf *agibp; 2113 xfs_agino_t next_agino; 2114 xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); 2115 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 2116 short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 2117 int error; 2118 2119 ASSERT(VFS_I(ip)->i_nlink == 0); 2120 ASSERT(VFS_I(ip)->i_mode != 0); 2121 trace_xfs_iunlink(ip); 2122 2123 /* Get the agi buffer first. It ensures lock ordering on the list. */ 2124 error = xfs_read_agi(mp, tp, agno, &agibp); 2125 if (error) 2126 return error; 2127 agi = agibp->b_addr; 2128 2129 /* 2130 * Get the index into the agi hash table for the list this inode will 2131 * go on. Make sure the pointer isn't garbage and that this inode 2132 * isn't already on the list. 2133 */ 2134 next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2135 if (next_agino == agino || 2136 !xfs_verify_agino_or_null(mp, agno, next_agino)) { 2137 xfs_buf_mark_corrupt(agibp); 2138 return -EFSCORRUPTED; 2139 } 2140 2141 if (next_agino != NULLAGINO) { 2142 xfs_agino_t old_agino; 2143 2144 /* 2145 * There is already another inode in the bucket, so point this 2146 * inode to the current head of the list. 2147 */ 2148 error = xfs_iunlink_update_inode(tp, ip, agno, next_agino, 2149 &old_agino); 2150 if (error) 2151 return error; 2152 ASSERT(old_agino == NULLAGINO); 2153 2154 /* 2155 * agino has been unlinked, add a backref from the next inode 2156 * back to agino. 2157 */ 2158 error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino); 2159 if (error) 2160 return error; 2161 } 2162 2163 /* Point the head of the list to point to this inode. */ 2164 return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino); 2165 } 2166 2167 /* Return the imap, dinode pointer, and buffer for an inode. 
*/ 2168 STATIC int 2169 xfs_iunlink_map_ino( 2170 struct xfs_trans *tp, 2171 xfs_agnumber_t agno, 2172 xfs_agino_t agino, 2173 struct xfs_imap *imap, 2174 struct xfs_dinode **dipp, 2175 struct xfs_buf **bpp) 2176 { 2177 struct xfs_mount *mp = tp->t_mountp; 2178 int error; 2179 2180 imap->im_blkno = 0; 2181 error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0); 2182 if (error) { 2183 xfs_warn(mp, "%s: xfs_imap returned error %d.", 2184 __func__, error); 2185 return error; 2186 } 2187 2188 error = xfs_imap_to_bp(mp, tp, imap, bpp); 2189 if (error) { 2190 xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", 2191 __func__, error); 2192 return error; 2193 } 2194 2195 *dipp = xfs_buf_offset(*bpp, imap->im_boffset); 2196 return 0; 2197 } 2198 2199 /* 2200 * Walk the unlinked chain from @head_agino until we find the inode that 2201 * points to @target_agino. Return the inode number, map, dinode pointer, 2202 * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp. 2203 * 2204 * @tp, @pag, @head_agino, and @target_agino are input parameters. 2205 * @agino, @imap, @dipp, and @bpp are all output parameters. 2206 * 2207 * Do not call this function if @target_agino is the head of the list. 2208 */ 2209 STATIC int 2210 xfs_iunlink_map_prev( 2211 struct xfs_trans *tp, 2212 xfs_agnumber_t agno, 2213 xfs_agino_t head_agino, 2214 xfs_agino_t target_agino, 2215 xfs_agino_t *agino, 2216 struct xfs_imap *imap, 2217 struct xfs_dinode **dipp, 2218 struct xfs_buf **bpp, 2219 struct xfs_perag *pag) 2220 { 2221 struct xfs_mount *mp = tp->t_mountp; 2222 xfs_agino_t next_agino; 2223 int error; 2224 2225 ASSERT(head_agino != target_agino); 2226 *bpp = NULL; 2227 2228 /* See if our backref cache can find it faster. */ 2229 *agino = xfs_iunlink_lookup_backref(pag, target_agino); 2230 if (*agino != NULLAGINO) { 2231 error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp); 2232 if (error) 2233 return error; 2234 2235 if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino) 2236 return 0; 2237 2238 /* 2239 * If we get here the cache contents were corrupt, so drop the 2240 * buffer and fall back to walking the bucket list. 2241 */ 2242 xfs_trans_brelse(tp, *bpp); 2243 *bpp = NULL; 2244 WARN_ON_ONCE(1); 2245 } 2246 2247 trace_xfs_iunlink_map_prev_fallback(mp, agno); 2248 2249 /* Otherwise, walk the entire bucket until we find it. */ 2250 next_agino = head_agino; 2251 while (next_agino != target_agino) { 2252 xfs_agino_t unlinked_agino; 2253 2254 if (*bpp) 2255 xfs_trans_brelse(tp, *bpp); 2256 2257 *agino = next_agino; 2258 error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp, 2259 bpp); 2260 if (error) 2261 return error; 2262 2263 unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked); 2264 /* 2265 * Make sure this pointer is valid and isn't an obvious 2266 * infinite loop. 2267 */ 2268 if (!xfs_verify_agino(mp, agno, unlinked_agino) || 2269 next_agino == unlinked_agino) { 2270 XFS_CORRUPTION_ERROR(__func__, 2271 XFS_ERRLEVEL_LOW, mp, 2272 *dipp, sizeof(**dipp)); 2273 error = -EFSCORRUPTED; 2274 return error; 2275 } 2276 next_agino = unlinked_agino; 2277 } 2278 2279 return 0; 2280 } 2281 2282 /* 2283 * Pull the on-disk inode from the AGI unlinked list. 
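 * * Illustrative sketch (the inode numbers here are invented for this comment, not taken from the surrounding code): if AGI bucket 0 holds the chain 7 -> 12 -> 5 and inode 12 is being removed, we first clear 12's next_unlinked (remembering the old value, 5), then locate its predecessor 7 - via the in-core backref record "7.next_unlinked = 12" when the cache has one, otherwise by walking the bucket from its head - and rewrite 7's next_unlinked to 5. If instead the head inode 7 is being removed, the bucket pointer itself is repointed at 12 and no predecessor lookup is needed.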
2284 */ 2285 STATIC int 2286 xfs_iunlink_remove( 2287 struct xfs_trans *tp, 2288 struct xfs_inode *ip) 2289 { 2290 struct xfs_mount *mp = tp->t_mountp; 2291 struct xfs_agi *agi; 2292 struct xfs_buf *agibp; 2293 struct xfs_buf *last_ibp; 2294 struct xfs_dinode *last_dip = NULL; 2295 xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); 2296 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 2297 xfs_agino_t next_agino; 2298 xfs_agino_t head_agino; 2299 short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 2300 int error; 2301 2302 trace_xfs_iunlink_remove(ip); 2303 2304 /* Get the agi buffer first. It ensures lock ordering on the list. */ 2305 error = xfs_read_agi(mp, tp, agno, &agibp); 2306 if (error) 2307 return error; 2308 agi = agibp->b_addr; 2309 2310 /* 2311 * Get the index into the agi hash table for the list this inode will 2312 * go on. Make sure the head pointer isn't garbage. 2313 */ 2314 head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2315 if (!xfs_verify_agino(mp, agno, head_agino)) { 2316 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, 2317 agi, sizeof(*agi)); 2318 return -EFSCORRUPTED; 2319 } 2320 2321 /* 2322 * Set our inode's next_unlinked pointer to NULL and then return 2323 * the old pointer value so that we can update whatever was previous 2324 * to us in the list to point to whatever was next in the list. 2325 */ 2326 error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino); 2327 if (error) 2328 return error; 2329 2330 /* 2331 * If there was a backref pointing from the next inode back to this 2332 * one, remove it because we've removed this inode from the list. 2333 * 2334 * Later, if this inode was in the middle of the list we'll update 2335 * this inode's backref to point from the next inode. 2336 */ 2337 if (next_agino != NULLAGINO) { 2338 error = xfs_iunlink_change_backref(agibp->b_pag, next_agino, 2339 NULLAGINO); 2340 if (error) 2341 return error; 2342 } 2343 2344 if (head_agino != agino) { 2345 struct xfs_imap imap; 2346 xfs_agino_t prev_agino; 2347 2348 /* We need to search the list for the inode being freed. */ 2349 error = xfs_iunlink_map_prev(tp, agno, head_agino, agino, 2350 &prev_agino, &imap, &last_dip, &last_ibp, 2351 agibp->b_pag); 2352 if (error) 2353 return error; 2354 2355 /* Point the previous inode on the list to the next inode. */ 2356 xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, 2357 last_dip, &imap, next_agino); 2358 2359 /* 2360 * Now we deal with the backref for this inode. If this inode 2361 * pointed at a real inode, change the backref that pointed to 2362 * us to point to our old next. If this inode was the end of 2363 * the list, delete the backref that pointed to us. Note that 2364 * change_backref takes care of deleting the backref if 2365 * next_agino is NULLAGINO. 2366 */ 2367 return xfs_iunlink_change_backref(agibp->b_pag, agino, 2368 next_agino); 2369 } 2370 2371 /* Point the head of the list to the next unlinked inode. */ 2372 return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, 2373 next_agino); 2374 } 2375 2376 /* 2377 * Look up the inode number specified and if it is not already marked XFS_ISTALE 2378 * mark it stale. We should only find clean inodes in this lookup that aren't 2379 * already stale. 
2380 */ 2381 static void 2382 xfs_ifree_mark_inode_stale( 2383 struct xfs_buf *bp, 2384 struct xfs_inode *free_ip, 2385 xfs_ino_t inum) 2386 { 2387 struct xfs_mount *mp = bp->b_mount; 2388 struct xfs_perag *pag = bp->b_pag; 2389 struct xfs_inode_log_item *iip; 2390 struct xfs_inode *ip; 2391 2392 retry: 2393 rcu_read_lock(); 2394 ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); 2395 2396 /* Inode not in memory, nothing to do */ 2397 if (!ip) { 2398 rcu_read_unlock(); 2399 return; 2400 } 2401 2402 /* 2403 * because this is an RCU protected lookup, we could find a recently 2404 * freed or even reallocated inode during the lookup. We need to check 2405 * under the i_flags_lock for a valid inode here. Skip it if it is not 2406 * valid, the wrong inode or stale. 2407 */ 2408 spin_lock(&ip->i_flags_lock); 2409 if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) 2410 goto out_iflags_unlock; 2411 2412 /* 2413 * Don't try to lock/unlock the current inode, but we _cannot_ skip the 2414 * other inodes that we did not find in the list attached to the buffer 2415 * and are not already marked stale. If we can't lock it, back off and 2416 * retry. 2417 */ 2418 if (ip != free_ip) { 2419 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2420 spin_unlock(&ip->i_flags_lock); 2421 rcu_read_unlock(); 2422 delay(1); 2423 goto retry; 2424 } 2425 } 2426 ip->i_flags |= XFS_ISTALE; 2427 2428 /* 2429 * If the inode is flushing, it is already attached to the buffer. All 2430 * we needed to do here is mark the inode stale so buffer IO completion 2431 * will remove it from the AIL. 2432 */ 2433 iip = ip->i_itemp; 2434 if (__xfs_iflags_test(ip, XFS_IFLUSHING)) { 2435 ASSERT(!list_empty(&iip->ili_item.li_bio_list)); 2436 ASSERT(iip->ili_last_fields); 2437 goto out_iunlock; 2438 } 2439 2440 /* 2441 * Inodes not attached to the buffer can be released immediately. 2442 * Everything else has to go through xfs_iflush_abort() on journal 2443 * commit as the flock synchronises removal of the inode from the 2444 * cluster buffer against inode reclaim. 2445 */ 2446 if (!iip || list_empty(&iip->ili_item.li_bio_list)) 2447 goto out_iunlock; 2448 2449 __xfs_iflags_set(ip, XFS_IFLUSHING); 2450 spin_unlock(&ip->i_flags_lock); 2451 rcu_read_unlock(); 2452 2453 /* we have a dirty inode in memory that has not yet been flushed. */ 2454 spin_lock(&iip->ili_lock); 2455 iip->ili_last_fields = iip->ili_fields; 2456 iip->ili_fields = 0; 2457 iip->ili_fsync_fields = 0; 2458 spin_unlock(&iip->ili_lock); 2459 ASSERT(iip->ili_last_fields); 2460 2461 if (ip != free_ip) 2462 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2463 return; 2464 2465 out_iunlock: 2466 if (ip != free_ip) 2467 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2468 out_iflags_unlock: 2469 spin_unlock(&ip->i_flags_lock); 2470 rcu_read_unlock(); 2471 } 2472 2473 /* 2474 * A big issue when freeing the inode cluster is that we _cannot_ skip any 2475 * inodes that are in memory - they all must be marked stale and attached to 2476 * the cluster buffer. 
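 * * For example (the figures are illustrative only, cluster geometry varies by filesystem): if a cluster buffer covers, say, 32 inodes and one of them is still dirty in core when the chunk is freed, skipping that inode would let a later flush write it back over blocks that have already been invalidated and possibly reallocated. Hence every in-memory inode in the cluster is looked up and marked XFS_ISTALE below before the buffer is invalidated.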
2477 */ 2478 STATIC int 2479 xfs_ifree_cluster( 2480 struct xfs_inode *free_ip, 2481 struct xfs_trans *tp, 2482 struct xfs_icluster *xic) 2483 { 2484 struct xfs_mount *mp = free_ip->i_mount; 2485 struct xfs_ino_geometry *igeo = M_IGEO(mp); 2486 struct xfs_buf *bp; 2487 xfs_daddr_t blkno; 2488 xfs_ino_t inum = xic->first_ino; 2489 int nbufs; 2490 int i, j; 2491 int ioffset; 2492 int error; 2493 2494 nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster; 2495 2496 for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) { 2497 /* 2498 * The allocation bitmap tells us which inodes of the chunk were 2499 * physically allocated. Skip the cluster if an inode falls into 2500 * a sparse region. 2501 */ 2502 ioffset = inum - xic->first_ino; 2503 if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { 2504 ASSERT(ioffset % igeo->inodes_per_cluster == 0); 2505 continue; 2506 } 2507 2508 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 2509 XFS_INO_TO_AGBNO(mp, inum)); 2510 2511 /* 2512 * We obtain and lock the backing buffer first in the process 2513 * here to ensure dirty inodes attached to the buffer remain in 2514 * the flushing state while we mark them stale. 2515 * 2516 * If we scan the in-memory inodes first, then buffer IO can 2517 * complete before we get a lock on it, and hence we may fail 2518 * to mark all the active inodes on the buffer stale. 2519 */ 2520 error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2521 mp->m_bsize * igeo->blocks_per_cluster, 2522 XBF_UNMAPPED, &bp); 2523 if (error) 2524 return error; 2525 2526 /* 2527 * This buffer may not have been correctly initialised as we 2528 * didn't read it from disk. That's not important because we are 2529 * only using it to mark the buffer as stale in the log, and to 2530 * attach stale cached inodes on it. That means it will never be 2531 * dispatched for IO. If it is, we want to know about it, and we 2532 * want it to fail. We can achieve this by adding a write 2533 * verifier to the buffer. 2534 */ 2535 bp->b_ops = &xfs_inode_buf_ops; 2536 2537 /* 2538 * Now we need to set all the cached clean inodes as XFS_ISTALE, 2539 * too. This requires lookups, and will skip inodes that we've 2540 * already marked XFS_ISTALE. 2541 */ 2542 for (i = 0; i < igeo->inodes_per_cluster; i++) 2543 xfs_ifree_mark_inode_stale(bp, free_ip, inum + i); 2544 2545 xfs_trans_stale_inode_buf(tp, bp); 2546 xfs_trans_binval(tp, bp); 2547 } 2548 return 0; 2549 } 2550 2551 /* 2552 * This is called to return an inode to the inode free list. 2553 * The inode should already be truncated to 0 length and have 2554 * no pages associated with it. This routine also assumes that 2555 * the inode is already a part of the transaction. 2556 * 2557 * The on-disk copy of the inode will have been added to the list 2558 * of unlinked inodes in the AGI. We need to remove the inode from 2559 * that list atomically with respect to freeing it here. 2560 */ 2561 int 2562 xfs_ifree( 2563 struct xfs_trans *tp, 2564 struct xfs_inode *ip) 2565 { 2566 int error; 2567 struct xfs_icluster xic = { 0 }; 2568 struct xfs_inode_log_item *iip = ip->i_itemp; 2569 2570 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2571 ASSERT(VFS_I(ip)->i_nlink == 0); 2572 ASSERT(ip->i_df.if_nextents == 0); 2573 ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); 2574 ASSERT(ip->i_nblocks == 0); 2575 2576 /* 2577 * Pull the on-disk inode from the AGI unlinked list.
2578 */ 2579 error = xfs_iunlink_remove(tp, ip); 2580 if (error) 2581 return error; 2582 2583 error = xfs_difree(tp, ip->i_ino, &xic); 2584 if (error) 2585 return error; 2586 2587 /* 2588 * Free any local-format data sitting around before we reset the 2589 * data fork to extents format. Note that the attr fork data has 2590 * already been freed by xfs_attr_inactive. 2591 */ 2592 if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { 2593 kmem_free(ip->i_df.if_u1.if_data); 2594 ip->i_df.if_u1.if_data = NULL; 2595 ip->i_df.if_bytes = 0; 2596 } 2597 2598 VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ 2599 ip->i_diflags = 0; 2600 ip->i_diflags2 = ip->i_mount->m_ino_geo.new_diflags2; 2601 ip->i_forkoff = 0; /* mark the attr fork not in use */ 2602 ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; 2603 if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS)) 2604 xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS); 2605 2606 /* Don't attempt to replay owner changes for a deleted inode */ 2607 spin_lock(&iip->ili_lock); 2608 iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER); 2609 spin_unlock(&iip->ili_lock); 2610 2611 /* 2612 * Bump the generation count so no one will be confused 2613 * by reincarnations of this inode. 2614 */ 2615 VFS_I(ip)->i_generation++; 2616 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2617 2618 if (xic.deleted) 2619 error = xfs_ifree_cluster(ip, tp, &xic); 2620 2621 return error; 2622 } 2623 2624 /* 2625 * This is called to unpin an inode. The caller must have the inode locked 2626 * in at least shared mode so that the buffer cannot be subsequently pinned 2627 * once someone is waiting for it to be unpinned. 2628 */ 2629 static void 2630 xfs_iunpin( 2631 struct xfs_inode *ip) 2632 { 2633 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2634 2635 trace_xfs_inode_unpin_nowait(ip, _RET_IP_); 2636 2637 /* Give the log a push to start the unpinning I/O */ 2638 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0, NULL); 2639 2640 } 2641 2642 static void 2643 __xfs_iunpin_wait( 2644 struct xfs_inode *ip) 2645 { 2646 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); 2647 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); 2648 2649 xfs_iunpin(ip); 2650 2651 do { 2652 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 2653 if (xfs_ipincount(ip)) 2654 io_schedule(); 2655 } while (xfs_ipincount(ip)); 2656 finish_wait(wq, &wait.wq_entry); 2657 } 2658 2659 void 2660 xfs_iunpin_wait( 2661 struct xfs_inode *ip) 2662 { 2663 if (xfs_ipincount(ip)) 2664 __xfs_iunpin_wait(ip); 2665 } 2666 2667 /* 2668 * Removing an inode from the namespace involves removing the directory entry 2669 * and dropping the link count on the inode. Removing the directory entry can 2670 * result in locking an AGF (directory blocks were freed) and removing a link 2671 * count can result in placing the inode on an unlinked list which results in 2672 * locking an AGI. 2673 * 2674 * The big problem here is that we have an ordering constraint on AGF and AGI 2675 * locking - inode allocation locks the AGI, then can allocate a new extent for 2676 * new inodes, locking the AGF after the AGI. Similarly, freeing the inode 2677 * removes the inode from the unlinked list, requiring that we lock the AGI 2678 * first, and then freeing the inode can result in an inode chunk being freed 2679 * and hence freeing disk space requiring that we lock an AGF. 2680 * 2681 * Hence the ordering that is imposed by other parts of the code is AGI before 2682 * AGF. 
This means we cannot remove the directory entry before we drop the inode 2683 * reference count and put it on the unlinked list as this results in a lock 2684 * order of AGF then AGI, and this can deadlock against inode allocation and 2685 * freeing. Therefore we must drop the link counts before we remove the 2686 * directory entry. 2687 * 2688 * This is still safe from a transactional point of view - it is not until we 2689 * get to xfs_defer_finish() that we have the possibility of multiple 2690 * transactions in this operation. Hence as long as we remove the directory 2691 * entry and drop the link count in the first transaction of the remove 2692 * operation, there are no transactional constraints on the ordering here. 2693 */ 2694 int 2695 xfs_remove( 2696 xfs_inode_t *dp, 2697 struct xfs_name *name, 2698 xfs_inode_t *ip) 2699 { 2700 xfs_mount_t *mp = dp->i_mount; 2701 xfs_trans_t *tp = NULL; 2702 int is_dir = S_ISDIR(VFS_I(ip)->i_mode); 2703 int error = 0; 2704 uint resblks; 2705 2706 trace_xfs_remove(dp, name); 2707 2708 if (XFS_FORCED_SHUTDOWN(mp)) 2709 return -EIO; 2710 2711 error = xfs_qm_dqattach(dp); 2712 if (error) 2713 goto std_return; 2714 2715 error = xfs_qm_dqattach(ip); 2716 if (error) 2717 goto std_return; 2718 2719 /* 2720 * We try to get the real space reservation first, 2721 * allowing for directory btree deletion(s) implying 2722 * possible bmap insert(s). If we can't get the space 2723 * reservation then we use 0 instead, and avoid the bmap 2724 * btree insert(s) in the directory code by, if the bmap 2725 * insert tries to happen, instead trimming the LAST 2726 * block from the directory. 2727 */ 2728 resblks = XFS_REMOVE_SPACE_RES(mp); 2729 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp); 2730 if (error == -ENOSPC) { 2731 resblks = 0; 2732 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0, 2733 &tp); 2734 } 2735 if (error) { 2736 ASSERT(error != -ENOSPC); 2737 goto std_return; 2738 } 2739 2740 xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL); 2741 2742 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); 2743 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 2744 2745 /* 2746 * If we're removing a directory perform some additional validation. 2747 */ 2748 if (is_dir) { 2749 ASSERT(VFS_I(ip)->i_nlink >= 2); 2750 if (VFS_I(ip)->i_nlink != 2) { 2751 error = -ENOTEMPTY; 2752 goto out_trans_cancel; 2753 } 2754 if (!xfs_dir_isempty(ip)) { 2755 error = -ENOTEMPTY; 2756 goto out_trans_cancel; 2757 } 2758 2759 /* Drop the link from ip's "..". */ 2760 error = xfs_droplink(tp, dp); 2761 if (error) 2762 goto out_trans_cancel; 2763 2764 /* Drop the "." link from ip to self. */ 2765 error = xfs_droplink(tp, ip); 2766 if (error) 2767 goto out_trans_cancel; 2768 } else { 2769 /* 2770 * When removing a non-directory we need to log the parent 2771 * inode here. For a directory this is done implicitly 2772 * by the xfs_droplink call for the ".." entry. 2773 */ 2774 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2775 } 2776 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2777 2778 /* Drop the link from dp to ip. */ 2779 error = xfs_droplink(tp, ip); 2780 if (error) 2781 goto out_trans_cancel; 2782 2783 error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks); 2784 if (error) { 2785 ASSERT(error != -ENOENT); 2786 goto out_trans_cancel; 2787 } 2788 2789 /* 2790 * If this is a synchronous mount, make sure that the 2791 * remove transaction goes to disk before returning to 2792 * the user. 
2793 */ 2794 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) 2795 xfs_trans_set_sync(tp); 2796 2797 error = xfs_trans_commit(tp); 2798 if (error) 2799 goto std_return; 2800 2801 if (is_dir && xfs_inode_is_filestream(ip)) 2802 xfs_filestream_deassociate(ip); 2803 2804 return 0; 2805 2806 out_trans_cancel: 2807 xfs_trans_cancel(tp); 2808 std_return: 2809 return error; 2810 } 2811 2812 /* 2813 * Enter all inodes for a rename transaction into a sorted array. 2814 */ 2815 #define __XFS_SORT_INODES 5 2816 STATIC void 2817 xfs_sort_for_rename( 2818 struct xfs_inode *dp1, /* in: old (source) directory inode */ 2819 struct xfs_inode *dp2, /* in: new (target) directory inode */ 2820 struct xfs_inode *ip1, /* in: inode of old entry */ 2821 struct xfs_inode *ip2, /* in: inode of new entry */ 2822 struct xfs_inode *wip, /* in: whiteout inode */ 2823 struct xfs_inode **i_tab,/* out: sorted array of inodes */ 2824 int *num_inodes) /* in/out: inodes in array */ 2825 { 2826 int i, j; 2827 2828 ASSERT(*num_inodes == __XFS_SORT_INODES); 2829 memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *)); 2830 2831 /* 2832 * i_tab contains a list of pointers to inodes. We initialize 2833 * the table here & we'll sort it. We will then use it to 2834 * order the acquisition of the inode locks. 2835 * 2836 * Note that the table may contain duplicates. e.g., dp1 == dp2. 2837 */ 2838 i = 0; 2839 i_tab[i++] = dp1; 2840 i_tab[i++] = dp2; 2841 i_tab[i++] = ip1; 2842 if (ip2) 2843 i_tab[i++] = ip2; 2844 if (wip) 2845 i_tab[i++] = wip; 2846 *num_inodes = i; 2847 2848 /* 2849 * Sort the elements via bubble sort. (Remember, there are at 2850 * most 5 elements to sort, so this is adequate.) 2851 */ 2852 for (i = 0; i < *num_inodes; i++) { 2853 for (j = 1; j < *num_inodes; j++) { 2854 if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { 2855 struct xfs_inode *temp = i_tab[j]; 2856 i_tab[j] = i_tab[j-1]; 2857 i_tab[j-1] = temp; 2858 } 2859 } 2860 } 2861 } 2862 2863 static int 2864 xfs_finish_rename( 2865 struct xfs_trans *tp) 2866 { 2867 /* 2868 * If this is a synchronous mount, make sure that the rename transaction 2869 * goes to disk before returning to the user. 2870 */ 2871 if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) 2872 xfs_trans_set_sync(tp); 2873 2874 return xfs_trans_commit(tp); 2875 } 2876 2877 /* 2878 * xfs_cross_rename() 2879 * 2880 * responsible for handling RENAME_EXCHANGE flag in renameat2() syscall 2881 */ 2882 STATIC int 2883 xfs_cross_rename( 2884 struct xfs_trans *tp, 2885 struct xfs_inode *dp1, 2886 struct xfs_name *name1, 2887 struct xfs_inode *ip1, 2888 struct xfs_inode *dp2, 2889 struct xfs_name *name2, 2890 struct xfs_inode *ip2, 2891 int spaceres) 2892 { 2893 int error = 0; 2894 int ip1_flags = 0; 2895 int ip2_flags = 0; 2896 int dp2_flags = 0; 2897 2898 /* Swap inode number for dirent in first parent */ 2899 error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres); 2900 if (error) 2901 goto out_trans_abort; 2902 2903 /* Swap inode number for dirent in second parent */ 2904 error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres); 2905 if (error) 2906 goto out_trans_abort; 2907 2908 /* 2909 * If we're renaming one or more directories across different parents, 2910 * update the respective ".." entries (and link counts) to match the new 2911 * parents. 
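 * * Illustrative case (the roles are invented for this comment): if ip2 is a directory being exchanged from dp2 into dp1 while ip1 is a regular file, ip2's ".." entry is rewritten to point at dp1, dp2 drops the link count that ip2's old ".." held, and dp1 gains one; the mirror-image fixup below does the same for ip1 when it is the directory moving into dp2.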
2912 */ 2913 if (dp1 != dp2) { 2914 dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2915 2916 if (S_ISDIR(VFS_I(ip2)->i_mode)) { 2917 error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, 2918 dp1->i_ino, spaceres); 2919 if (error) 2920 goto out_trans_abort; 2921 2922 /* transfer ip2 ".." reference to dp1 */ 2923 if (!S_ISDIR(VFS_I(ip1)->i_mode)) { 2924 error = xfs_droplink(tp, dp2); 2925 if (error) 2926 goto out_trans_abort; 2927 xfs_bumplink(tp, dp1); 2928 } 2929 2930 /* 2931 * Although ip1 isn't changed here, userspace needs 2932 * to be warned about the change, so that applications 2933 * relying on it (like backup ones), will properly 2934 * notify the change 2935 */ 2936 ip1_flags |= XFS_ICHGTIME_CHG; 2937 ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2938 } 2939 2940 if (S_ISDIR(VFS_I(ip1)->i_mode)) { 2941 error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, 2942 dp2->i_ino, spaceres); 2943 if (error) 2944 goto out_trans_abort; 2945 2946 /* transfer ip1 ".." reference to dp2 */ 2947 if (!S_ISDIR(VFS_I(ip2)->i_mode)) { 2948 error = xfs_droplink(tp, dp1); 2949 if (error) 2950 goto out_trans_abort; 2951 xfs_bumplink(tp, dp2); 2952 } 2953 2954 /* 2955 * Although ip2 isn't changed here, userspace needs 2956 * to be warned about the change, so that applications 2957 * relying on it (like backup ones), will properly 2958 * notify the change 2959 */ 2960 ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2961 ip2_flags |= XFS_ICHGTIME_CHG; 2962 } 2963 } 2964 2965 if (ip1_flags) { 2966 xfs_trans_ichgtime(tp, ip1, ip1_flags); 2967 xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); 2968 } 2969 if (ip2_flags) { 2970 xfs_trans_ichgtime(tp, ip2, ip2_flags); 2971 xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE); 2972 } 2973 if (dp2_flags) { 2974 xfs_trans_ichgtime(tp, dp2, dp2_flags); 2975 xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE); 2976 } 2977 xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2978 xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); 2979 return xfs_finish_rename(tp); 2980 2981 out_trans_abort: 2982 xfs_trans_cancel(tp); 2983 return error; 2984 } 2985 2986 /* 2987 * xfs_rename_alloc_whiteout() 2988 * 2989 * Return a referenced, unlinked, unlocked inode that can be used as a 2990 * whiteout in a rename transaction. We use a tmpfile inode here so that if we 2991 * crash between allocating the inode and linking it into the rename transaction 2992 * recovery will free the inode and we won't leak it. 2993 */ 2994 static int 2995 xfs_rename_alloc_whiteout( 2996 struct user_namespace *mnt_userns, 2997 struct xfs_inode *dp, 2998 struct xfs_inode **wip) 2999 { 3000 struct xfs_inode *tmpfile; 3001 int error; 3002 3003 error = xfs_create_tmpfile(mnt_userns, dp, S_IFCHR | WHITEOUT_MODE, 3004 &tmpfile); 3005 if (error) 3006 return error; 3007 3008 /* 3009 * Prepare the tmpfile inode as if it were created through the VFS. 3010 * Complete the inode setup and flag it as linkable. nlink is already 3011 * zero, so we can skip the drop_nlink. 
3012 */ 3013 xfs_setup_iops(tmpfile); 3014 xfs_finish_inode_setup(tmpfile); 3015 VFS_I(tmpfile)->i_state |= I_LINKABLE; 3016 3017 *wip = tmpfile; 3018 return 0; 3019 } 3020 3021 /* 3022 * xfs_rename 3023 */ 3024 int 3025 xfs_rename( 3026 struct user_namespace *mnt_userns, 3027 struct xfs_inode *src_dp, 3028 struct xfs_name *src_name, 3029 struct xfs_inode *src_ip, 3030 struct xfs_inode *target_dp, 3031 struct xfs_name *target_name, 3032 struct xfs_inode *target_ip, 3033 unsigned int flags) 3034 { 3035 struct xfs_mount *mp = src_dp->i_mount; 3036 struct xfs_trans *tp; 3037 struct xfs_inode *wip = NULL; /* whiteout inode */ 3038 struct xfs_inode *inodes[__XFS_SORT_INODES]; 3039 int i; 3040 int num_inodes = __XFS_SORT_INODES; 3041 bool new_parent = (src_dp != target_dp); 3042 bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); 3043 int spaceres; 3044 int error; 3045 3046 trace_xfs_rename(src_dp, target_dp, src_name, target_name); 3047 3048 if ((flags & RENAME_EXCHANGE) && !target_ip) 3049 return -EINVAL; 3050 3051 /* 3052 * If we are doing a whiteout operation, allocate the whiteout inode 3053 * we will be placing at the target and ensure the type is set 3054 * appropriately. 3055 */ 3056 if (flags & RENAME_WHITEOUT) { 3057 ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE))); 3058 error = xfs_rename_alloc_whiteout(mnt_userns, target_dp, &wip); 3059 if (error) 3060 return error; 3061 3062 /* setup target dirent info as whiteout */ 3063 src_name->type = XFS_DIR3_FT_CHRDEV; 3064 } 3065 3066 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, 3067 inodes, &num_inodes); 3068 3069 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); 3070 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp); 3071 if (error == -ENOSPC) { 3072 spaceres = 0; 3073 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0, 3074 &tp); 3075 } 3076 if (error) 3077 goto out_release_wip; 3078 3079 /* 3080 * Attach the dquots to the inodes 3081 */ 3082 error = xfs_qm_vop_rename_dqattach(inodes); 3083 if (error) 3084 goto out_trans_cancel; 3085 3086 /* 3087 * Lock all the participating inodes. Depending upon whether 3088 * the target_name exists in the target directory, and 3089 * whether the target directory is the same as the source 3090 * directory, we can lock from 2 to 4 inodes. 3091 */ 3092 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); 3093 3094 /* 3095 * Join all the inodes to the transaction. From this point on, 3096 * we can rely on either trans_commit or trans_cancel to unlock 3097 * them. 3098 */ 3099 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); 3100 if (new_parent) 3101 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); 3102 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); 3103 if (target_ip) 3104 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); 3105 if (wip) 3106 xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL); 3107 3108 /* 3109 * If we are using project inheritance, we only allow renames 3110 * into our tree when the project IDs are the same; else the 3111 * tree quota mechanism would be circumvented. 3112 */ 3113 if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) && 3114 target_dp->i_projid != src_ip->i_projid)) { 3115 error = -EXDEV; 3116 goto out_trans_cancel; 3117 } 3118 3119 /* RENAME_EXCHANGE is unique from here on. 
*/ 3120 if (flags & RENAME_EXCHANGE) 3121 return xfs_cross_rename(tp, src_dp, src_name, src_ip, 3122 target_dp, target_name, target_ip, 3123 spaceres); 3124 3125 /* 3126 * Check for expected errors before we dirty the transaction 3127 * so we can return an error without a transaction abort. 3128 * 3129 * Extent count overflow check: 3130 * 3131 * From the perspective of src_dp, a rename operation is essentially a 3132 * directory entry remove operation. Hence the only place where we check 3133 * for extent count overflow for src_dp is in 3134 * xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns 3135 * -ENOSPC when it detects a possible extent count overflow and in 3136 * response, the higher layers of directory handling code do the 3137 * following: 3138 * 1. Data/Free blocks: XFS lets these blocks linger until a 3139 * future remove operation removes them. 3140 * 2. Dabtree blocks: XFS swaps the blocks with the last block in the 3141 * Leaf space and unmaps the last block. 3142 * 3143 * For target_dp, there are two cases depending on whether the 3144 * destination directory entry exists or not. 3145 * 3146 * When the destination directory entry does not exist (i.e. target_ip == 3147 * NULL), the extent count overflow check is performed only when the transaction 3148 * has a non-zero sized space reservation associated with it. With a 3149 * zero-sized space reservation, XFS allows a rename operation to 3150 * continue only when the directory has sufficient free space in its 3151 * data/leaf/free space blocks to hold the new entry. 3152 * 3153 * When the destination directory entry exists (i.e. target_ip != NULL), all 3154 * we need to do is change the inode number associated with the already 3155 * existing entry. Hence there is no need to perform an extent count 3156 * overflow check. 3157 */ 3158 if (target_ip == NULL) { 3159 /* 3160 * If there's no space reservation, check that the entry will 3161 * fit before actually inserting it. 3162 */ 3163 if (!spaceres) { 3164 error = xfs_dir_canenter(tp, target_dp, target_name); 3165 if (error) 3166 goto out_trans_cancel; 3167 } else { 3168 error = xfs_iext_count_may_overflow(target_dp, 3169 XFS_DATA_FORK, 3170 XFS_IEXT_DIR_MANIP_CNT(mp)); 3171 if (error) 3172 goto out_trans_cancel; 3173 } 3174 } else { 3175 /* 3176 * If target exists and it's a directory, check whether 3177 * it can be destroyed. 3178 */ 3179 if (S_ISDIR(VFS_I(target_ip)->i_mode) && 3180 (!xfs_dir_isempty(target_ip) || 3181 (VFS_I(target_ip)->i_nlink > 2))) { 3182 error = -EEXIST; 3183 goto out_trans_cancel; 3184 } 3185 } 3186 3187 /* 3188 * Lock the AGI buffers we need to handle bumping the nlink of the 3189 * whiteout inode off the unlinked list and to handle dropping the 3190 * nlink of the target inode. Per locking order rules, do this in 3191 * increasing AG order and before directory block allocation tries to 3192 * grab AGFs because we grab AGIs before AGFs. 3193 * 3194 * The (vfs) caller must ensure that if src is a directory then 3195 * target_ip is either null or an empty directory. 3196 */ 3197 for (i = 0; i < num_inodes && inodes[i] != NULL; i++) { 3198 if (inodes[i] == wip || 3199 (inodes[i] == target_ip && 3200 (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) { 3201 struct xfs_buf *bp; 3202 xfs_agnumber_t agno; 3203 3204 agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino); 3205 error = xfs_read_agi(mp, tp, agno, &bp); 3206 if (error) 3207 goto out_trans_cancel; 3208 } 3209 } 3210 3211 /* 3212 * Directory entry creation below may acquire the AGF.
Remove 3213 * the whiteout from the unlinked list first to preserve correct 3214 * AGI/AGF locking order. This dirties the transaction so failures 3215 * after this point will abort and log recovery will clean up the 3216 * mess. 3217 * 3218 * For whiteouts, we need to bump the link count on the whiteout 3219 * inode. After this point, we have a real link, clear the tmpfile 3220 * state flag from the inode so it doesn't accidentally get misused 3221 * in future. 3222 */ 3223 if (wip) { 3224 ASSERT(VFS_I(wip)->i_nlink == 0); 3225 error = xfs_iunlink_remove(tp, wip); 3226 if (error) 3227 goto out_trans_cancel; 3228 3229 xfs_bumplink(tp, wip); 3230 VFS_I(wip)->i_state &= ~I_LINKABLE; 3231 } 3232 3233 /* 3234 * Set up the target. 3235 */ 3236 if (target_ip == NULL) { 3237 /* 3238 * If target does not exist and the rename crosses 3239 * directories, adjust the target directory link count 3240 * to account for the ".." reference from the new entry. 3241 */ 3242 error = xfs_dir_createname(tp, target_dp, target_name, 3243 src_ip->i_ino, spaceres); 3244 if (error) 3245 goto out_trans_cancel; 3246 3247 xfs_trans_ichgtime(tp, target_dp, 3248 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3249 3250 if (new_parent && src_is_directory) { 3251 xfs_bumplink(tp, target_dp); 3252 } 3253 } else { /* target_ip != NULL */ 3254 /* 3255 * Link the source inode under the target name. 3256 * If the source inode is a directory and we are moving 3257 * it across directories, its ".." entry will be 3258 * inconsistent until we replace that down below. 3259 * 3260 * In case there is already an entry with the same 3261 * name at the destination directory, remove it first. 3262 */ 3263 error = xfs_dir_replace(tp, target_dp, target_name, 3264 src_ip->i_ino, spaceres); 3265 if (error) 3266 goto out_trans_cancel; 3267 3268 xfs_trans_ichgtime(tp, target_dp, 3269 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3270 3271 /* 3272 * Decrement the link count on the target since the target 3273 * dir no longer points to it. 3274 */ 3275 error = xfs_droplink(tp, target_ip); 3276 if (error) 3277 goto out_trans_cancel; 3278 3279 if (src_is_directory) { 3280 /* 3281 * Drop the link from the old "." entry. 3282 */ 3283 error = xfs_droplink(tp, target_ip); 3284 if (error) 3285 goto out_trans_cancel; 3286 } 3287 } /* target_ip != NULL */ 3288 3289 /* 3290 * Remove the source. 3291 */ 3292 if (new_parent && src_is_directory) { 3293 /* 3294 * Rewrite the ".." entry to point to the new 3295 * directory. 3296 */ 3297 error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, 3298 target_dp->i_ino, spaceres); 3299 ASSERT(error != -EEXIST); 3300 if (error) 3301 goto out_trans_cancel; 3302 } 3303 3304 /* 3305 * We always want to hit the ctime on the source inode. 3306 * 3307 * This isn't strictly required by the standards since the source 3308 * inode isn't really being changed, but old unix file systems did 3309 * it and some incremental backup programs won't work without it. 3310 */ 3311 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); 3312 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); 3313 3314 /* 3315 * Adjust the link count on src_dp. This is necessary when 3316 * renaming a directory, either within one parent when 3317 * the target existed, or across two parent directories. 3318 */ 3319 if (src_is_directory && (new_parent || target_ip != NULL)) { 3320 3321 /* 3322 * Decrement link count on src_directory since the 3323 * entry that's moved no longer points to it. 
3324 */ 3325 error = xfs_droplink(tp, src_dp); 3326 if (error) 3327 goto out_trans_cancel; 3328 } 3329 3330 /* 3331 * For whiteouts, we only need to update the source dirent with the 3332 * inode number of the whiteout inode rather than removing it 3333 * altogether. 3334 */ 3335 if (wip) { 3336 error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, 3337 spaceres); 3338 } else { 3339 /* 3340 * NOTE: We don't need to check for extent count overflow here 3341 * because the dir remove name code will leave the dir block in 3342 * place if the extent count would overflow. 3343 */ 3344 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, 3345 spaceres); 3346 } 3347 3348 if (error) 3349 goto out_trans_cancel; 3350 3351 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3352 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 3353 if (new_parent) 3354 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 3355 3356 error = xfs_finish_rename(tp); 3357 if (wip) 3358 xfs_irele(wip); 3359 return error; 3360 3361 out_trans_cancel: 3362 xfs_trans_cancel(tp); 3363 out_release_wip: 3364 if (wip) 3365 xfs_irele(wip); 3366 return error; 3367 } 3368 3369 static int 3370 xfs_iflush( 3371 struct xfs_inode *ip, 3372 struct xfs_buf *bp) 3373 { 3374 struct xfs_inode_log_item *iip = ip->i_itemp; 3375 struct xfs_dinode *dip; 3376 struct xfs_mount *mp = ip->i_mount; 3377 int error; 3378 3379 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 3380 ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING)); 3381 ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || 3382 ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); 3383 ASSERT(iip->ili_item.li_buf == bp); 3384 3385 dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); 3386 3387 /* 3388 * We don't flush the inode if any of the following checks fail, but we 3389 * do still update the log item and attach to the backing buffer as if 3390 * the flush happened. This is a formality to facilitate predictable 3391 * error handling as the caller will shutdown and fail the buffer. 
3392 */ 3393 error = -EFSCORRUPTED; 3394 if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), 3395 mp, XFS_ERRTAG_IFLUSH_1)) { 3396 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3397 "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT, 3398 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); 3399 goto flush_out; 3400 } 3401 if (S_ISREG(VFS_I(ip)->i_mode)) { 3402 if (XFS_TEST_ERROR( 3403 ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && 3404 ip->i_df.if_format != XFS_DINODE_FMT_BTREE, 3405 mp, XFS_ERRTAG_IFLUSH_3)) { 3406 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3407 "%s: Bad regular inode %Lu, ptr "PTR_FMT, 3408 __func__, ip->i_ino, ip); 3409 goto flush_out; 3410 } 3411 } else if (S_ISDIR(VFS_I(ip)->i_mode)) { 3412 if (XFS_TEST_ERROR( 3413 ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && 3414 ip->i_df.if_format != XFS_DINODE_FMT_BTREE && 3415 ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, 3416 mp, XFS_ERRTAG_IFLUSH_4)) { 3417 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3418 "%s: Bad directory inode %Lu, ptr "PTR_FMT, 3419 __func__, ip->i_ino, ip); 3420 goto flush_out; 3421 } 3422 } 3423 if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) > 3424 ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { 3425 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3426 "%s: detected corrupt incore inode %Lu, " 3427 "total extents = %d, nblocks = %Ld, ptr "PTR_FMT, 3428 __func__, ip->i_ino, 3429 ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp), 3430 ip->i_nblocks, ip); 3431 goto flush_out; 3432 } 3433 if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize, 3434 mp, XFS_ERRTAG_IFLUSH_6)) { 3435 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3436 "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT, 3437 __func__, ip->i_ino, ip->i_forkoff, ip); 3438 goto flush_out; 3439 } 3440 3441 /* 3442 * Inode item log recovery for v2 inodes are dependent on the flushiter 3443 * count for correct sequencing. We bump the flush iteration count so 3444 * we can detect flushes which postdate a log record during recovery. 3445 * This is redundant as we now log every change and hence this can't 3446 * happen but we need to still do it to ensure backwards compatibility 3447 * with old kernels that predate logging all inode changes. 3448 */ 3449 if (!xfs_sb_version_has_v3inode(&mp->m_sb)) 3450 ip->i_flushiter++; 3451 3452 /* 3453 * If there are inline format data / attr forks attached to this inode, 3454 * make sure they are not corrupt. 3455 */ 3456 if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL && 3457 xfs_ifork_verify_local_data(ip)) 3458 goto flush_out; 3459 if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL && 3460 xfs_ifork_verify_local_attr(ip)) 3461 goto flush_out; 3462 3463 /* 3464 * Copy the dirty parts of the inode into the on-disk inode. We always 3465 * copy out the core of the inode, because if the inode is dirty at all 3466 * the core must be. 3467 */ 3468 xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn); 3469 3470 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 3471 if (!xfs_sb_version_has_v3inode(&mp->m_sb)) { 3472 if (ip->i_flushiter == DI_MAX_FLUSH) 3473 ip->i_flushiter = 0; 3474 } 3475 3476 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); 3477 if (XFS_IFORK_Q(ip)) 3478 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); 3479 3480 /* 3481 * We've recorded everything logged in the inode, so we'd like to clear 3482 * the ili_fields bits so we don't log and flush things unnecessarily. 3483 * However, we can't stop logging all this information until the data 3484 * we've copied into the disk buffer is written to disk. 
If we did we 3485 * might overwrite the copy of the inode in the log with all the data 3486 * after re-logging only part of it, and in the face of a crash we 3487 * wouldn't have all the data we need to recover. 3488 * 3489 * What we do is move the bits to the ili_last_fields field. When 3490 * logging the inode, these bits are moved back to the ili_fields field. 3491 * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since 3492 * we know that the information those bits represent is permanently on 3493 * disk. As long as the flush completes before the inode is logged 3494 * again, then both ili_fields and ili_last_fields will be cleared. 3495 */ 3496 error = 0; 3497 flush_out: 3498 spin_lock(&iip->ili_lock); 3499 iip->ili_last_fields = iip->ili_fields; 3500 iip->ili_fields = 0; 3501 iip->ili_fsync_fields = 0; 3502 spin_unlock(&iip->ili_lock); 3503 3504 /* 3505 * Store the current LSN of the inode so that we can tell whether the 3506 * item has moved in the AIL from xfs_buf_inode_iodone(). 3507 */ 3508 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 3509 &iip->ili_item.li_lsn); 3510 3511 /* generate the checksum. */ 3512 xfs_dinode_calc_crc(mp, dip); 3513 return error; 3514 } 3515 3516 /* 3517 * Non-blocking flush of dirty inode metadata into the backing buffer. 3518 * 3519 * The caller must have a reference to the inode and hold the cluster buffer 3520 * locked. The function will walk across all the inodes on the cluster buffer it 3521 * can find and lock without blocking, and flush them to the cluster buffer. 3522 * 3523 * On successful flushing of at least one inode, the caller must write out the 3524 * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and 3525 * the caller needs to release the buffer. On failure, the filesystem will be 3526 * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED 3527 * will be returned. 3528 */ 3529 int 3530 xfs_iflush_cluster( 3531 struct xfs_buf *bp) 3532 { 3533 struct xfs_mount *mp = bp->b_mount; 3534 struct xfs_log_item *lip, *n; 3535 struct xfs_inode *ip; 3536 struct xfs_inode_log_item *iip; 3537 int clcount = 0; 3538 int error = 0; 3539 3540 /* 3541 * We must use the safe variant here as on shutdown xfs_iflush_abort() 3542 * can remove itself from the list. 3543 */ 3544 list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { 3545 iip = (struct xfs_inode_log_item *)lip; 3546 ip = iip->ili_inode; 3547 3548 /* 3549 * Quick and dirty check to avoid locks if possible. 3550 */ 3551 if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) 3552 continue; 3553 if (xfs_ipincount(ip)) 3554 continue; 3555 3556 /* 3557 * The inode is still attached to the buffer, which means it is 3558 * dirty but reclaim might try to grab it. Check carefully for 3559 * that, and grab the ilock while still holding the i_flags_lock 3560 * to guarantee reclaim will not be able to reclaim this inode 3561 * once we drop the i_flags_lock. 3562 */ 3563 spin_lock(&ip->i_flags_lock); 3564 ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); 3565 if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) { 3566 spin_unlock(&ip->i_flags_lock); 3567 continue; 3568 } 3569 3570 /* 3571 * ILOCK will pin the inode against reclaim and prevent 3572 * concurrent transactions modifying the inode while we are 3573 * flushing the inode. If we get the lock, set the flushing 3574 * state before we drop the i_flags_lock. 
3575 */ 3576 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 3577 spin_unlock(&ip->i_flags_lock); 3578 continue; 3579 } 3580 __xfs_iflags_set(ip, XFS_IFLUSHING); 3581 spin_unlock(&ip->i_flags_lock); 3582 3583 /* 3584 * Abort flushing this inode if we are shut down because the 3585 * inode may not currently be in the AIL. This can occur when 3586 * a log I/O failure unpins the inode without inserting it into the 3587 * AIL, leaving a dirty/unpinned inode attached to the buffer 3588 * that otherwise looks like it should be flushed. 3589 */ 3590 if (XFS_FORCED_SHUTDOWN(mp)) { 3591 xfs_iunpin_wait(ip); 3592 xfs_iflush_abort(ip); 3593 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3594 error = -EIO; 3595 continue; 3596 } 3597 3598 /* don't block waiting on a log force to unpin dirty inodes */ 3599 if (xfs_ipincount(ip)) { 3600 xfs_iflags_clear(ip, XFS_IFLUSHING); 3601 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3602 continue; 3603 } 3604 3605 if (!xfs_inode_clean(ip)) 3606 error = xfs_iflush(ip, bp); 3607 else 3608 xfs_iflags_clear(ip, XFS_IFLUSHING); 3609 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3610 if (error) 3611 break; 3612 clcount++; 3613 } 3614 3615 if (error) { 3616 bp->b_flags |= XBF_ASYNC; 3617 xfs_buf_ioend_fail(bp); 3618 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 3619 return error; 3620 } 3621 3622 if (!clcount) 3623 return -EAGAIN; 3624 3625 XFS_STATS_INC(mp, xs_icluster_flushcnt); 3626 XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); 3627 return 0; 3628 3629 } 3630 3631 /* Release an inode. */ 3632 void 3633 xfs_irele( 3634 struct xfs_inode *ip) 3635 { 3636 trace_xfs_irele(ip, _RET_IP_); 3637 iput(VFS_I(ip)); 3638 } 3639 3640 /* 3641 * Ensure all committed transactions touching the inode are written to the log. 3642 */ 3643 int 3644 xfs_log_force_inode( 3645 struct xfs_inode *ip) 3646 { 3647 xfs_lsn_t lsn = 0; 3648 3649 xfs_ilock(ip, XFS_ILOCK_SHARED); 3650 if (xfs_ipincount(ip)) 3651 lsn = ip->i_itemp->ili_last_lsn; 3652 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3653 3654 if (!lsn) 3655 return 0; 3656 return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL); 3657 } 3658 3659 /* 3660 * Grab the exclusive iolock for a data copy from src to dest, making sure to 3661 * abide by the vfs locking order (lowest pointer value goes first) and to break the 3662 * layout leases before proceeding. The loop is needed because we cannot call 3663 * the blocking break_layout() with the iolocks held, and therefore have to 3664 * back out both locks. 3665 */ 3666 static int 3667 xfs_iolock_two_inodes_and_break_layout( 3668 struct inode *src, 3669 struct inode *dest) 3670 { 3671 int error; 3672 3673 if (src > dest) 3674 swap(src, dest); 3675 3676 retry: 3677 /* Wait to break both inodes' layouts before we start locking. */ 3678 error = break_layout(src, true); 3679 if (error) 3680 return error; 3681 if (src != dest) { 3682 error = break_layout(dest, true); 3683 if (error) 3684 return error; 3685 } 3686 3687 /* Lock one inode and make sure nobody got in and leased it. */ 3688 inode_lock(src); 3689 error = break_layout(src, false); 3690 if (error) { 3691 inode_unlock(src); 3692 if (error == -EWOULDBLOCK) 3693 goto retry; 3694 return error; 3695 } 3696 3697 if (src == dest) 3698 return 0; 3699 3700 /* Lock the other inode and make sure nobody got in and leased it.
*/ 3701 inode_lock_nested(dest, I_MUTEX_NONDIR2); 3702 error = break_layout(dest, false); 3703 if (error) { 3704 inode_unlock(src); 3705 inode_unlock(dest); 3706 if (error == -EWOULDBLOCK) 3707 goto retry; 3708 return error; 3709 } 3710 3711 return 0; 3712 } 3713 3714 /* 3715 * Lock two inodes so that userspace cannot initiate I/O via file syscalls or 3716 * mmap activity. 3717 */ 3718 int 3719 xfs_ilock2_io_mmap( 3720 struct xfs_inode *ip1, 3721 struct xfs_inode *ip2) 3722 { 3723 int ret; 3724 3725 ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); 3726 if (ret) 3727 return ret; 3728 if (ip1 == ip2) 3729 xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); 3730 else 3731 xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL, 3732 ip2, XFS_MMAPLOCK_EXCL); 3733 return 0; 3734 } 3735 3736 /* Unlock both inodes to allow IO and mmap activity. */ 3737 void 3738 xfs_iunlock2_io_mmap( 3739 struct xfs_inode *ip1, 3740 struct xfs_inode *ip2) 3741 { 3742 bool same_inode = (ip1 == ip2); 3743 3744 xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); 3745 if (!same_inode) 3746 xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); 3747 inode_unlock(VFS_I(ip2)); 3748 if (!same_inode) 3749 inode_unlock(VFS_I(ip1)); 3750 } 3751